Tighten asset linking and batch conversion tests

This commit is contained in:
welsberr 2026-04-22 21:23:10 -04:00
parent d0f2352341
commit 787e3e7330
4 changed files with 105 additions and 2 deletions

View File

@ -9,6 +9,7 @@ from .legacy_doc import (
extract_references,
extract_tables,
extract_title,
link_related_assets,
normalize_text_preserve_layout,
render_markdown,
run_catdoc,
@ -32,7 +33,7 @@ def convert_doc(source_path: Path, out_root: Path, figure_assets: list | None =
layout = build_layout_manifest(layout_body)
table_refs = extract_references(body, r"\bTable\s+\d+\b")
figure_refs = extract_references(body, r"\b(?:Fig\.?\s*[\d.]+|Figure\s+[\d.]+)\b")
related_assets = list(figure_assets or [])
related_assets = link_related_assets(figure_refs, list(figure_assets or []))
doc_out = _document_output_dir(out_root, source_path, title)
doc_out.mkdir(parents=True, exist_ok=True)
@ -93,6 +94,15 @@ def convert_directory(source_root: Path, out_root: Path, asset_root: Path | None
"summary": {
"documents_with_tables": sum(1 for bundle in bundles if bundle.table_count > 0),
"documents_with_figure_references": sum(1 for bundle in bundles if bundle.figure_reference_count > 0),
"documents": [
{
"document_id": bundle.document_id,
"title": bundle.title,
"table_count": bundle.table_count,
"figure_reference_count": bundle.figure_reference_count,
}
for bundle in bundles
],
}
},
)

View File

@ -221,6 +221,28 @@ def collect_figure_assets(root: Path) -> list[FigureAsset]:
return assets
def _figure_match_key(label: str) -> str:
    """Return the slug used to compare a figure reference with an asset name.

    The long spelling is collapsed first so that "Figure 5.1" and "Fig. 5.1"
    produce the same key. ``slugify`` is defined elsewhere in this module.
    """
    return slugify(label.replace("Figure", "Fig").replace("figure", "fig"))


def _contains_figure_key(asset_key: str, ref_key: str) -> bool:
    """Report whether ``ref_key`` occurs in ``asset_key`` as a complete number.

    ``ref_key`` must be non-empty. When it ends in a digit, an occurrence that
    is immediately followed by another digit is rejected, so a reference to
    figure 5.1 does not claim the asset for figure 5.10.
    """
    guard_digits = ref_key[-1].isdigit()
    start = asset_key.find(ref_key)
    while start != -1:
        end = start + len(ref_key)
        if not guard_digits or end == len(asset_key) or not asset_key[end].isdigit():
            return True
        # This occurrence ran into a longer number; look for a later one.
        start = asset_key.find(ref_key, start + 1)
    return False


def link_related_assets(figure_refs: list[str], figure_assets: list[FigureAsset]) -> list[FigureAsset]:
    """Select the assets whose file names correspond to a cited figure.

    Args:
        figure_refs: Figure reference strings found in the document text
            (for example ``"Fig. 5.1"``).
        figure_assets: Candidate assets; each must expose ``name`` and
            ``asset_id``.

    Returns:
        The matching assets in their original order, each ``asset_id`` at most
        once. An empty list when there are no usable references.

    Both the references and the asset names (minus the text after the last
    ".") go through the same key normalisation, so an asset named
    "Figure 5.1.bmp" matches a reference spelled either "Figure 5.1" or
    "Fig. 5.1".
    """
    # Drop empty keys up front: an empty string is a substring of everything.
    ref_keys = {key for key in map(_figure_match_key, figure_refs) if key}
    if not ref_keys:
        return []

    matched: list[FigureAsset] = []
    seen: set[str] = set()
    for asset in figure_assets:
        if asset.asset_id in seen:
            continue
        asset_key = _figure_match_key(asset.name.rsplit(".", 1)[0])
        if any(_contains_figure_key(asset_key, ref_key) for ref_key in ref_keys):
            seen.add(asset.asset_id)
            matched.append(asset)
    return matched
def build_layout_manifest(layout_body: str) -> list[dict]:
manifest: list[dict] = []
for line_no, line in enumerate(layout_body.splitlines(), start=1):

48
tests/test_convert.py Executable file
View File

@ -0,0 +1,48 @@
from __future__ import annotations
import json
from pathlib import Path
from doclift import convert as convert_module
def test_convert_directory_writes_manifest_and_conversion_report(tmp_path: Path, monkeypatch) -> None:
    """Run a one-document batch conversion and check the JSON files it writes.

    ``run_catdoc`` is replaced with a stub returning fixed text, so the test
    does not depend on the external extractor.
    """
    source_root, asset_root, out_root = (tmp_path / name for name in ("src", "assets", "out"))
    for directory in (source_root, asset_root):
        directory.mkdir()

    # Placeholder files: only their names and locations matter to the test.
    (source_root / "sample.doc").write_text("stub", encoding="utf-8")
    (asset_root / "Fig. 5.1.bmp").write_text("img", encoding="utf-8")

    body_lines = (
        "Lecture 1. Example legacy document",
        "",
        "See Fig. 5.1 and Table 1.",
        "",
        "Table 1. Example caption",
        "",
        "Metric\tRest\tSwim",
        "O2\t1.0\t2.0",
    )
    sample_text = "\n".join(body_lines)

    def fake_run_catdoc(path):
        return sample_text

    def load_json(*parts: str):
        return json.loads(out_root.joinpath(*parts).read_text(encoding="utf-8"))

    monkeypatch.setattr(convert_module, "run_catdoc", fake_run_catdoc)

    report = convert_module.convert_directory(source_root, out_root, asset_root=asset_root)
    assert report.document_count == 1

    manifest = load_json("manifest.json")
    conversion_report = load_json("conversion_report.json")
    figures_payload = load_json(
        "documents",
        "sample-lecture-1-example-legacy-document",
        "document.figures.json",
    )

    summary = conversion_report["summary"]
    assert manifest["document_count"] == 1
    assert summary["documents_with_tables"] == 1
    assert summary["documents_with_figure_references"] == 1
    assert figures_payload["figure_references"] == ["Fig. 5.1"]
    assert len(figures_payload["related_assets"]) == 1

View File

@ -1,4 +1,4 @@
from doclift.legacy_doc import extract_references, extract_tables
from doclift.legacy_doc import FigureAsset, extract_references, extract_tables, link_related_assets
def test_extract_references_dedupes() -> None:
@ -22,3 +22,26 @@ def test_extract_tables_parses_tabbed_rows() -> None:
assert tables[0].caption == "Table 1. Example caption"
assert tables[0].column_count_guess == 3
assert tables[0].parsed_rows[1] == ["O2", "1.0", "2.0"]
def test_link_related_assets_matches_explicit_figure_refs() -> None:
    """Only the asset named after the cited figure should be linked."""
    figure_asset = FigureAsset(
        asset_id="a1",
        path="/tmp/Fig. 5.1.bmp",
        relative_path="vol/Fig. 5.1.bmp",
        name="Fig. 5.1.bmp",
        container="vol",
        looks_like_figure=True,
    )
    # An unrelated asset that must not be picked up by the reference.
    slide_asset = FigureAsset(
        asset_id="a2",
        path="/tmp/Slide 1.jpg",
        relative_path="vol/Slide 1.jpg",
        name="Slide 1.jpg",
        container="vol",
        looks_like_figure=False,
    )

    linked = link_related_assets(["Fig. 5.1"], [figure_asset, slide_asset])

    linked_ids = [item.asset_id for item in linked]
    assert linked_ids == ["a1"]