diff --git a/src/doclift/convert.py b/src/doclift/convert.py
index 47252d3..b2489c6 100755
--- a/src/doclift/convert.py
+++ b/src/doclift/convert.py
@@ -9,6 +9,7 @@ from .legacy_doc import (
     extract_references,
     extract_tables,
     extract_title,
+    link_related_assets,
     normalize_text_preserve_layout,
     render_markdown,
     run_catdoc,
@@ -32,7 +33,7 @@ def convert_doc(source_path: Path, out_root: Path, figure_assets: list | None =
     layout = build_layout_manifest(layout_body)
     table_refs = extract_references(body, r"\bTable\s+\d+\b")
     figure_refs = extract_references(body, r"\b(?:Fig\.?\s*[\d.]+|Figure\s+[\d.]+)\b")
-    related_assets = list(figure_assets or [])
+    related_assets = link_related_assets(figure_refs, list(figure_assets or []))
 
     doc_out = _document_output_dir(out_root, source_path, title)
     doc_out.mkdir(parents=True, exist_ok=True)
@@ -93,6 +94,15 @@ def convert_directory(source_root: Path, out_root: Path, asset_root: Path | None
             "summary": {
                 "documents_with_tables": sum(1 for bundle in bundles if bundle.table_count > 0),
                 "documents_with_figure_references": sum(1 for bundle in bundles if bundle.figure_reference_count > 0),
+                "documents": [
+                    {
+                        "document_id": bundle.document_id,
+                        "title": bundle.title,
+                        "table_count": bundle.table_count,
+                        "figure_reference_count": bundle.figure_reference_count,
+                    }
+                    for bundle in bundles
+                ],
             }
         },
     )
diff --git a/src/doclift/legacy_doc.py b/src/doclift/legacy_doc.py
index 5482680..98e7ee9 100755
--- a/src/doclift/legacy_doc.py
+++ b/src/doclift/legacy_doc.py
@@ -221,6 +221,46 @@ def collect_figure_assets(root: Path) -> list[FigureAsset]:
     return assets
 
 
+def link_related_assets(figure_refs: list[str], figure_assets: list[FigureAsset]) -> list[FigureAsset]:
+    """Return the assets whose file name mentions one of the referenced figures.
+
+    References and asset file stems are reduced to the same key ("Figure" is
+    shortened to "Fig" before slugifying), so an asset spelled "Figure 5.1.bmp"
+    produces the same key as a "Figure 5.1" reference.  A reference key only
+    counts when it is not directly followed by another digit, which keeps
+    "Fig. 5.1" from also claiming "Fig. 5.10.bmp".  Assets are returned in
+    input order, at most once per asset_id; no references means no assets.
+    """
+
+    def figure_key(text: str) -> str:
+        return slugify(text.replace("Figure", "Fig").replace("figure", "fig"))
+
+    def mentions(asset_key: str, ref_key: str) -> bool:
+        # Substring search that rejects hits continuing into a longer number.
+        start = asset_key.find(ref_key)
+        while start != -1:
+            end = start + len(ref_key)
+            if end == len(asset_key) or not asset_key[end].isdigit():
+                return True
+            start = asset_key.find(ref_key, start + 1)
+        return False
+
+    ref_keys = {key for key in map(figure_key, figure_refs) if key}
+    if not ref_keys:
+        return []
+
+    matched: list[FigureAsset] = []
+    seen: set[str] = set()
+    for asset in figure_assets:
+        if asset.asset_id in seen:
+            continue
+        asset_key = figure_key(asset.name.rsplit(".", 1)[0])
+        if any(mentions(asset_key, ref_key) for ref_key in ref_keys):
+            seen.add(asset.asset_id)
+            matched.append(asset)
+    return matched
+
+
 def build_layout_manifest(layout_body: str) -> list[dict]:
     manifest: list[dict] = []
     for line_no, line in enumerate(layout_body.splitlines(), start=1):
diff --git a/tests/test_convert.py b/tests/test_convert.py
new file mode 100755
index 0000000..ef27a71
--- /dev/null
+++ b/tests/test_convert.py
@@ -0,0 +1,48 @@
+from __future__ import annotations
+
+import json
+from pathlib import Path
+
+from doclift import convert as convert_module
+
+
+def test_convert_directory_writes_manifest_and_conversion_report(tmp_path: Path, monkeypatch) -> None:
+    source_root = tmp_path / "src"
+    asset_root = tmp_path / "assets"
+    out_root = tmp_path / "out"
+    source_root.mkdir()
+    asset_root.mkdir()
+    (source_root / "sample.doc").write_text("stub", encoding="utf-8")
+    (asset_root / "Fig. 5.1.bmp").write_text("img", encoding="utf-8")
+
+    sample_text = "\n".join(
+        [
+            "Lecture 1. Example legacy document",
+            "",
+            "See Fig. 5.1 and Table 1.",
+            "",
+            "Table 1. Example caption",
+            "",
+            "Metric\tRest\tSwim",
+            "O2\t1.0\t2.0",
+        ]
+    )
+
+    monkeypatch.setattr(convert_module, "run_catdoc", lambda path: sample_text)
+
+    report = convert_module.convert_directory(source_root, out_root, asset_root=asset_root)
+
+    assert report.document_count == 1
+    manifest = json.loads((out_root / "manifest.json").read_text(encoding="utf-8"))
+    conversion_report = json.loads((out_root / "conversion_report.json").read_text(encoding="utf-8"))
+    figures_payload = json.loads(
+        (out_root / "documents" / "sample-lecture-1-example-legacy-document" / "document.figures.json").read_text(
+            encoding="utf-8"
+        )
+    )
+
+    assert manifest["document_count"] == 1
+    assert conversion_report["summary"]["documents_with_tables"] == 1
+    assert conversion_report["summary"]["documents_with_figure_references"] == 1
+    assert figures_payload["figure_references"] == ["Fig. 5.1"]
+    assert len(figures_payload["related_assets"]) == 1
diff --git a/tests/test_legacy_doc.py b/tests/test_legacy_doc.py
index c787c7a..468c7be 100755
--- a/tests/test_legacy_doc.py
+++ b/tests/test_legacy_doc.py
@@ -1,4 +1,4 @@
-from doclift.legacy_doc import extract_references, extract_tables
+from doclift.legacy_doc import FigureAsset, extract_references, extract_tables, link_related_assets
 
 
 def test_extract_references_dedupes() -> None:
@@ -22,3 +22,40 @@ def test_extract_tables_parses_tabbed_rows() -> None:
     assert tables[0].caption == "Table 1. Example caption"
     assert tables[0].column_count_guess == 3
     assert tables[0].parsed_rows[1] == ["O2", "1.0", "2.0"]
+
+
+def test_link_related_assets_matches_explicit_figure_refs() -> None:
+    assets = [
+        FigureAsset(
+            asset_id="a1",
+            path="/tmp/Fig. 5.1.bmp",
+            relative_path="vol/Fig. 5.1.bmp",
+            name="Fig. 5.1.bmp",
+            container="vol",
+            looks_like_figure=True,
+        ),
+        FigureAsset(
+            asset_id="a2",
+            path="/tmp/Slide 1.jpg",
+            relative_path="vol/Slide 1.jpg",
+            name="Slide 1.jpg",
+            container="vol",
+            looks_like_figure=False,
+        ),
+    ]
+    matched = link_related_assets(["Fig. 5.1"], assets)
+    assert [asset.asset_id for asset in matched] == ["a1"]
+
+
+def test_link_related_assets_ignores_longer_figure_numbers() -> None:
+    assets = [
+        FigureAsset(
+            asset_id="a1",
+            path="/tmp/Fig. 5.10.bmp",
+            relative_path="vol/Fig. 5.10.bmp",
+            name="Fig. 5.10.bmp",
+            container="vol",
+            looks_like_figure=True,
+        ),
+    ]
+    assert link_related_assets(["Fig. 5.1"], assets) == []