Tighten asset linking and batch conversion tests
This commit is contained in:
parent
d0f2352341
commit
787e3e7330
|
|
@ -9,6 +9,7 @@ from .legacy_doc import (
|
||||||
extract_references,
|
extract_references,
|
||||||
extract_tables,
|
extract_tables,
|
||||||
extract_title,
|
extract_title,
|
||||||
|
link_related_assets,
|
||||||
normalize_text_preserve_layout,
|
normalize_text_preserve_layout,
|
||||||
render_markdown,
|
render_markdown,
|
||||||
run_catdoc,
|
run_catdoc,
|
||||||
|
|
@ -32,7 +33,7 @@ def convert_doc(source_path: Path, out_root: Path, figure_assets: list | None =
|
||||||
layout = build_layout_manifest(layout_body)
|
layout = build_layout_manifest(layout_body)
|
||||||
table_refs = extract_references(body, r"\bTable\s+\d+\b")
|
table_refs = extract_references(body, r"\bTable\s+\d+\b")
|
||||||
figure_refs = extract_references(body, r"\b(?:Fig\.?\s*[\d.]+|Figure\s+[\d.]+)\b")
|
figure_refs = extract_references(body, r"\b(?:Fig\.?\s*[\d.]+|Figure\s+[\d.]+)\b")
|
||||||
related_assets = list(figure_assets or [])
|
related_assets = link_related_assets(figure_refs, list(figure_assets or []))
|
||||||
|
|
||||||
doc_out = _document_output_dir(out_root, source_path, title)
|
doc_out = _document_output_dir(out_root, source_path, title)
|
||||||
doc_out.mkdir(parents=True, exist_ok=True)
|
doc_out.mkdir(parents=True, exist_ok=True)
|
||||||
|
|
@ -93,6 +94,15 @@ def convert_directory(source_root: Path, out_root: Path, asset_root: Path | None
|
||||||
"summary": {
|
"summary": {
|
||||||
"documents_with_tables": sum(1 for bundle in bundles if bundle.table_count > 0),
|
"documents_with_tables": sum(1 for bundle in bundles if bundle.table_count > 0),
|
||||||
"documents_with_figure_references": sum(1 for bundle in bundles if bundle.figure_reference_count > 0),
|
"documents_with_figure_references": sum(1 for bundle in bundles if bundle.figure_reference_count > 0),
|
||||||
|
"documents": [
|
||||||
|
{
|
||||||
|
"document_id": bundle.document_id,
|
||||||
|
"title": bundle.title,
|
||||||
|
"table_count": bundle.table_count,
|
||||||
|
"figure_reference_count": bundle.figure_reference_count,
|
||||||
|
}
|
||||||
|
for bundle in bundles
|
||||||
|
],
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
)
|
)
|
||||||
|
|
|
||||||
|
|
@ -221,6 +221,28 @@ def collect_figure_assets(root: Path) -> list[FigureAsset]:
|
||||||
return assets
|
return assets
|
||||||
|
|
||||||
|
|
||||||
|
def _slug_mentions_reference(asset_key: str, ref_key: str) -> bool:
    """Return True when ``ref_key`` occurs in ``asset_key`` as a complete figure number."""
    # Only guard the right-hand edge when the reference itself ends in a digit;
    # otherwise any occurrence counts, exactly like a plain substring test.
    guard_digits = ref_key[-1].isdigit()
    start = asset_key.find(ref_key)
    while start != -1:
        end = start + len(ref_key)
        # A digit directly after the match means the asset carries a longer
        # number (reference "5.1" vs. asset "5.10"), so keep scanning.
        if not guard_digits or end == len(asset_key) or not asset_key[end].isdigit():
            return True
        start = asset_key.find(ref_key, start + 1)
    return False


def link_related_assets(figure_refs: list[str], figure_assets: list[FigureAsset]) -> list[FigureAsset]:
    """Select the assets whose file name mentions one of the figure references.

    Each reference is rewritten from the "Figure"/"figure" spelling to
    "Fig"/"fig" and passed through ``slugify``; each asset name is slugified
    after dropping the text following its last ".". An asset is linked when a
    reference slug occurs inside its name slug and the occurrence is not
    immediately followed by another digit, so a reference to figure 5.1 does
    not pull in the asset for figure 5.10.
    NOTE(review): assumes ``slugify`` keeps digits and returns a ``str`` - confirm.

    Args:
        figure_refs: Figure reference strings found in the document text.
        figure_assets: Candidate assets to choose from.

    Returns:
        The matching assets in their original order, at most one entry per
        ``asset_id``. Empty when ``figure_refs`` is empty.
    """
    if not figure_refs:
        return []

    ref_keys = {
        slugify(ref.replace("Figure", "Fig").replace("figure", "fig"))
        for ref in figure_refs
    }
    # An empty slug would match every asset name, so drop it.
    ref_keys = {key for key in ref_keys if key}

    matched: list[FigureAsset] = []
    seen: set[str] = set()
    for asset in figure_assets:
        if asset.asset_id in seen:
            continue
        # Compare against the name without the part after its last "."
        asset_key = slugify(asset.name.rsplit(".", 1)[0])
        if any(_slug_mentions_reference(asset_key, ref_key) for ref_key in ref_keys):
            seen.add(asset.asset_id)
            matched.append(asset)
    return matched
|
||||||
|
|
||||||
|
|
||||||
def build_layout_manifest(layout_body: str) -> list[dict]:
|
def build_layout_manifest(layout_body: str) -> list[dict]:
|
||||||
manifest: list[dict] = []
|
manifest: list[dict] = []
|
||||||
for line_no, line in enumerate(layout_body.splitlines(), start=1):
|
for line_no, line in enumerate(layout_body.splitlines(), start=1):
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,48 @@
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import json
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
from doclift import convert as convert_module
|
||||||
|
|
||||||
|
|
||||||
|
def test_convert_directory_writes_manifest_and_conversion_report(tmp_path: Path, monkeypatch) -> None:
    """Convert a one-document tree and check the JSON artifacts written to the output root."""

    def load_json(path: Path):
        # Every artifact inspected below is UTF-8 encoded JSON.
        return json.loads(path.read_text(encoding="utf-8"))

    source_root = tmp_path / "src"
    asset_root = tmp_path / "assets"
    out_root = tmp_path / "out"
    source_root.mkdir()
    asset_root.mkdir()

    # Placeholder files: the document text comes from the run_catdoc stub below.
    (source_root / "sample.doc").write_text("stub", encoding="utf-8")
    (asset_root / "Fig. 5.1.bmp").write_text("img", encoding="utf-8")

    sample_lines = [
        "Lecture 1. Example legacy document",
        "",
        "See Fig. 5.1 and Table 1.",
        "",
        "Table 1. Example caption",
        "",
        "Metric\tRest\tSwim",
        "O2\t1.0\t2.0",
    ]
    sample_text = "\n".join(sample_lines)

    def fake_run_catdoc(path):
        return sample_text

    monkeypatch.setattr(convert_module, "run_catdoc", fake_run_catdoc)

    report = convert_module.convert_directory(source_root, out_root, asset_root=asset_root)

    assert report.document_count == 1

    manifest = load_json(out_root / "manifest.json")
    conversion_report = load_json(out_root / "conversion_report.json")
    document_dir = out_root / "documents" / "sample-lecture-1-example-legacy-document"
    figures_payload = load_json(document_dir / "document.figures.json")

    assert manifest["document_count"] == 1

    summary = conversion_report["summary"]
    assert summary["documents_with_tables"] == 1
    assert summary["documents_with_figure_references"] == 1

    assert figures_payload["figure_references"] == ["Fig. 5.1"]
    assert len(figures_payload["related_assets"]) == 1
|
||||||
|
|
@ -1,4 +1,4 @@
|
||||||
from doclift.legacy_doc import extract_references, extract_tables
|
from doclift.legacy_doc import FigureAsset, extract_references, extract_tables, link_related_assets
|
||||||
|
|
||||||
|
|
||||||
def test_extract_references_dedupes() -> None:
|
def test_extract_references_dedupes() -> None:
|
||||||
|
|
@ -22,3 +22,26 @@ def test_extract_tables_parses_tabbed_rows() -> None:
|
||||||
assert tables[0].caption == "Table 1. Example caption"
|
assert tables[0].caption == "Table 1. Example caption"
|
||||||
assert tables[0].column_count_guess == 3
|
assert tables[0].column_count_guess == 3
|
||||||
assert tables[0].parsed_rows[1] == ["O2", "1.0", "2.0"]
|
assert tables[0].parsed_rows[1] == ["O2", "1.0", "2.0"]
|
||||||
|
|
||||||
|
|
||||||
|
def test_link_related_assets_matches_explicit_figure_refs() -> None:
    """Only the asset named after the referenced figure is linked."""
    figure_asset = FigureAsset(
        asset_id="a1",
        path="/tmp/Fig. 5.1.bmp",
        relative_path="vol/Fig. 5.1.bmp",
        name="Fig. 5.1.bmp",
        container="vol",
        looks_like_figure=True,
    )
    # Unrelated file name: must not be linked to the reference.
    slide_asset = FigureAsset(
        asset_id="a2",
        path="/tmp/Slide 1.jpg",
        relative_path="vol/Slide 1.jpg",
        name="Slide 1.jpg",
        container="vol",
        looks_like_figure=False,
    )

    linked = link_related_assets(["Fig. 5.1"], [figure_asset, slide_asset])

    linked_ids = [asset.asset_id for asset in linked]
    assert linked_ids == ["a1"]
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue