Tighten asset linking and batch conversion tests
This commit is contained in:
parent
d0f2352341
commit
787e3e7330
|
|
@ -9,6 +9,7 @@ from .legacy_doc import (
|
|||
extract_references,
|
||||
extract_tables,
|
||||
extract_title,
|
||||
link_related_assets,
|
||||
normalize_text_preserve_layout,
|
||||
render_markdown,
|
||||
run_catdoc,
|
||||
|
|
@ -32,7 +33,7 @@ def convert_doc(source_path: Path, out_root: Path, figure_assets: list | None =
|
|||
layout = build_layout_manifest(layout_body)
|
||||
table_refs = extract_references(body, r"\bTable\s+\d+\b")
|
||||
figure_refs = extract_references(body, r"\b(?:Fig\.?\s*[\d.]+|Figure\s+[\d.]+)\b")
|
||||
related_assets = list(figure_assets or [])
|
||||
related_assets = link_related_assets(figure_refs, list(figure_assets or []))
|
||||
|
||||
doc_out = _document_output_dir(out_root, source_path, title)
|
||||
doc_out.mkdir(parents=True, exist_ok=True)
|
||||
|
|
@ -93,6 +94,15 @@ def convert_directory(source_root: Path, out_root: Path, asset_root: Path | None
|
|||
"summary": {
|
||||
"documents_with_tables": sum(1 for bundle in bundles if bundle.table_count > 0),
|
||||
"documents_with_figure_references": sum(1 for bundle in bundles if bundle.figure_reference_count > 0),
|
||||
"documents": [
|
||||
{
|
||||
"document_id": bundle.document_id,
|
||||
"title": bundle.title,
|
||||
"table_count": bundle.table_count,
|
||||
"figure_reference_count": bundle.figure_reference_count,
|
||||
}
|
||||
for bundle in bundles
|
||||
],
|
||||
}
|
||||
},
|
||||
)
|
||||
|
|
|
|||
|
|
@ -221,6 +221,28 @@ def collect_figure_assets(root: Path) -> list[FigureAsset]:
|
|||
return assets
|
||||
|
||||
|
||||
def _figure_key(text: str) -> str:
    """Return the slug used to compare figure references with asset names.

    The long spelling "Figure"/"figure" is folded into "Fig"/"fig" before
    slugifying, so references and asset names are compared on equal terms
    regardless of which spelling each side uses.
    """
    return slugify(text.replace("Figure", "Fig").replace("figure", "fig"))


def _key_matches(ref_key: str, asset_key: str) -> bool:
    """Return True when *ref_key* occurs in *asset_key* on a clean boundary.

    An occurrence only counts when it is not immediately preceded by a letter
    or digit and not immediately followed by a digit. The trailing check keeps
    the key for figure 5.1 from matching an asset numbered 5.10; the leading
    check keeps the key from matching in the middle of a longer word.
    """
    start = asset_key.find(ref_key)
    while start != -1:
        end = start + len(ref_key)
        glued_before = start > 0 and asset_key[start - 1].isalnum()
        digit_after = end < len(asset_key) and asset_key[end].isdigit()
        if not glued_before and not digit_after:
            return True
        # This occurrence was rejected; look for a later one.
        start = asset_key.find(ref_key, start + 1)
    return False


def link_related_assets(figure_refs: list[str], figure_assets: list[FigureAsset]) -> list[FigureAsset]:
    """Return the assets whose file name matches one of the figure references.

    Each reference and each asset name (with its last extension removed) is
    turned into a slug key. An asset is linked when any reference key occurs
    in its key on a clean boundary (see ``_key_matches``).

    Args:
        figure_refs: Figure reference strings found in the document text,
            for example ``"Fig. 5.1"``.
        figure_assets: Candidate assets to link.

    Returns:
        The matching assets in their original order, at most once per
        ``asset_id``. Empty when there are no references.
    """
    if not figure_refs:
        return []

    # Empty keys are dropped up front: an empty string is contained in
    # every asset key and would link everything.
    ref_keys = {key for key in map(_figure_key, figure_refs) if key}

    matched: list[FigureAsset] = []
    seen: set[str] = set()
    for asset in figure_assets:
        if asset.asset_id in seen:
            continue
        asset_key = _figure_key(asset.name.rsplit(".", 1)[0])
        if any(_key_matches(ref_key, asset_key) for ref_key in ref_keys):
            seen.add(asset.asset_id)
            matched.append(asset)
    return matched
|
||||
|
||||
|
||||
def build_layout_manifest(layout_body: str) -> list[dict]:
|
||||
manifest: list[dict] = []
|
||||
for line_no, line in enumerate(layout_body.splitlines(), start=1):
|
||||
|
|
|
|||
|
|
@ -0,0 +1,48 @@
|
|||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
from pathlib import Path
|
||||
|
||||
from doclift import convert as convert_module
|
||||
|
||||
|
||||
def test_convert_directory_writes_manifest_and_conversion_report(tmp_path: Path, monkeypatch) -> None:
    """Convert a one-document directory and check the JSON files it writes.

    ``run_catdoc`` is replaced with a stub returning fixed text, so the test
    does not depend on the external extractor.
    """

    def read_json(target: Path):
        # All outputs checked here are UTF-8 encoded JSON files.
        return json.loads(target.read_text(encoding="utf-8"))

    def fake_catdoc(path):
        return sample_text

    # Lay out the input tree: one stub document and one figure asset.
    source_root, asset_root, out_root = (tmp_path / leaf for leaf in ("src", "assets", "out"))
    for folder in (source_root, asset_root):
        folder.mkdir()
    (source_root / "sample.doc").write_text("stub", encoding="utf-8")
    (asset_root / "Fig. 5.1.bmp").write_text("img", encoding="utf-8")

    text_lines = [
        "Lecture 1. Example legacy document",
        "",
        "See Fig. 5.1 and Table 1.",
        "",
        "Table 1. Example caption",
        "",
        "Metric\tRest\tSwim",
        "O2\t1.0\t2.0",
    ]
    sample_text = "\n".join(text_lines)

    monkeypatch.setattr(convert_module, "run_catdoc", fake_catdoc)

    report = convert_module.convert_directory(source_root, out_root, asset_root=asset_root)

    assert report.document_count == 1

    document_dir = out_root / "documents" / "sample-lecture-1-example-legacy-document"
    manifest = read_json(out_root / "manifest.json")
    conversion_report = read_json(out_root / "conversion_report.json")
    figures_payload = read_json(document_dir / "document.figures.json")

    summary = conversion_report["summary"]
    assert manifest["document_count"] == 1
    assert summary["documents_with_tables"] == 1
    assert summary["documents_with_figure_references"] == 1
    assert figures_payload["figure_references"] == ["Fig. 5.1"]
    assert len(figures_payload["related_assets"]) == 1
|
||||
|
|
@ -1,4 +1,4 @@
|
|||
from doclift.legacy_doc import extract_references, extract_tables
|
||||
from doclift.legacy_doc import FigureAsset, extract_references, extract_tables, link_related_assets
|
||||
|
||||
|
||||
def test_extract_references_dedupes() -> None:
|
||||
|
|
@ -22,3 +22,26 @@ def test_extract_tables_parses_tabbed_rows() -> None:
|
|||
assert tables[0].caption == "Table 1. Example caption"
|
||||
assert tables[0].column_count_guess == 3
|
||||
assert tables[0].parsed_rows[1] == ["O2", "1.0", "2.0"]
|
||||
|
||||
|
||||
def test_link_related_assets_matches_explicit_figure_refs() -> None:
    """Only the asset named after the referenced figure is linked."""
    figure_asset = FigureAsset(
        asset_id="a1",
        path="/tmp/Fig. 5.1.bmp",
        relative_path="vol/Fig. 5.1.bmp",
        name="Fig. 5.1.bmp",
        container="vol",
        looks_like_figure=True,
    )
    # A slide image whose name has nothing in common with the reference.
    slide_asset = FigureAsset(
        asset_id="a2",
        path="/tmp/Slide 1.jpg",
        relative_path="vol/Slide 1.jpg",
        name="Slide 1.jpg",
        container="vol",
        looks_like_figure=False,
    )

    linked = link_related_assets(["Fig. 5.1"], [figure_asset, slide_asset])

    linked_ids = [item.asset_id for item in linked]
    assert linked_ids == ["a1"]
|
||||
|
|
|
|||
Loading…
Reference in New Issue