From 28aea13192f91e7eed1318e6ac8e942cc07f53a0 Mon Sep 17 00:00:00 2001 From: welsberr Date: Thu, 23 Apr 2026 10:27:33 -0400 Subject: [PATCH] Make doclift bundle paths portable --- src/doclift/convert.py | 31 ++++++++++++++++++++----------- src/doclift/legacy_doc.py | 2 +- src/doclift/schemas.py | 3 +++ tests/test_convert.py | 6 ++++++ 4 files changed, 30 insertions(+), 12 deletions(-) diff --git a/src/doclift/convert.py b/src/doclift/convert.py index fe43260..1d9b7b9 100755 --- a/src/doclift/convert.py +++ b/src/doclift/convert.py @@ -24,7 +24,11 @@ def _document_output_dir(out_root: Path, source_path: Path, title: str) -> Path: return out_root / "documents" / f"{slugify(source_path.stem)}-{slugify(title)}" -def convert_doc(source_path: Path, out_root: Path, figure_assets: list | None = None) -> DocumentBundle: +def _relative_to_root(path: Path, root: Path) -> str: + return path.relative_to(root).as_posix() + + +def convert_doc(source_path: Path, source_root: Path, out_root: Path, figure_assets: list | None = None) -> DocumentBundle: raw = run_catdoc(source_path) cleaned = clean_text(raw) title = extract_title(cleaned, source_path.stem) @@ -49,7 +53,8 @@ def convert_doc(source_path: Path, out_root: Path, figure_assets: list | None = write_json( tables_path, { - "source_path": str(source_path), + "source_path": _relative_to_root(source_path, source_root), + "source_path_kind": "source_root_relative", "table_references": table_refs, "tables": [table.model_dump() for table in tables], }, @@ -57,7 +62,8 @@ def convert_doc(source_path: Path, out_root: Path, figure_assets: list | None = write_json( figures_path, { - "source_path": str(source_path), + "source_path": _relative_to_root(source_path, source_root), + "source_path_kind": "source_root_relative", "figure_references": figure_refs, "related_assets": [asset.model_dump() for asset in related_assets], }, @@ -67,12 +73,14 @@ def convert_doc(source_path: Path, out_root: Path, figure_assets: list | None = document_id=slugify(title), title=title, document_kind=document_kind, - source_path=str(source_path), - output_dir=str(doc_out), - markdown_path=str(markdown_path), - layout_path=str(layout_path), - tables_path=str(tables_path), - figures_path=str(figures_path), + source_path=_relative_to_root(source_path, source_root), + source_path_kind="source_root_relative", + output_dir=_relative_to_root(doc_out, out_root), + markdown_path=_relative_to_root(markdown_path, out_root), + layout_path=_relative_to_root(layout_path, out_root), + tables_path=_relative_to_root(tables_path, out_root), + figures_path=_relative_to_root(figures_path, out_root), + bundle_path_kind="bundle_root_relative", table_count=len(tables), figure_reference_count=len(figure_refs), ) @@ -81,9 +89,10 @@ def convert_doc(source_path: Path, out_root: Path, figure_assets: list | None = def convert_directory(source_root: Path, out_root: Path, asset_root: Path | None = None) -> ConversionReport: docs = sorted(path for path in source_root.rglob("*") if path.is_file() and path.suffix.lower() == ".doc") figure_assets = collect_figure_assets(asset_root) if asset_root is not None else [] - bundles = [convert_doc(path, out_root, figure_assets=figure_assets) for path in docs] + bundles = [convert_doc(path, source_root, out_root, figure_assets=figure_assets) for path in docs] report = ConversionReport( - source_root=str(source_root), + source_root=source_root.name, + source_root_kind="source_label", converter="catdoc_doc", document_count=len(bundles), documents=bundles, diff --git a/src/doclift/legacy_doc.py b/src/doclift/legacy_doc.py index 2391390..db0d945 100755 --- a/src/doclift/legacy_doc.py +++ b/src/doclift/legacy_doc.py @@ -268,7 +268,7 @@ def collect_figure_assets(root: Path) -> list[FigureAsset]: assets.append( FigureAsset( asset_id=slugify(relative), - path=str(path), + path=relative, relative_path=relative, name=path.name, container=path.parent.name, diff --git a/src/doclift/schemas.py b/src/doclift/schemas.py index a69ff09..f3957a0 100755 --- a/src/doclift/schemas.py +++ b/src/doclift/schemas.py @@ -36,17 +36,20 @@ class DocumentBundle(BaseModel): title: str document_kind: str = "document" source_path: str + source_path_kind: str = "source_root_relative" output_dir: str markdown_path: str layout_path: str tables_path: str figures_path: str + bundle_path_kind: str = "bundle_root_relative" table_count: int = 0 figure_reference_count: int = 0 class ConversionReport(BaseModel): source_root: str + source_root_kind: str = "source_label" converter: str document_count: int = 0 documents: list[DocumentBundle] = Field(default_factory=list) diff --git a/tests/test_convert.py b/tests/test_convert.py index ef27a71..56d2455 100755 --- a/tests/test_convert.py +++ b/tests/test_convert.py @@ -42,7 +42,13 @@ def test_convert_directory_writes_manifest_and_conversion_report(tmp_path: Path, ) assert manifest["document_count"] == 1 + assert manifest["source_root"] == "src" + assert manifest["documents"][0]["source_path"] == "sample.doc" + assert manifest["documents"][0]["markdown_path"] == "documents/sample-lecture-1-example-legacy-document/document.md" assert conversion_report["summary"]["documents_with_tables"] == 1 assert conversion_report["summary"]["documents_with_figure_references"] == 1 + assert figures_payload["source_path"] == "sample.doc" + assert figures_payload["source_path_kind"] == "source_root_relative" assert figures_payload["figure_references"] == ["Fig. 5.1"] assert len(figures_payload["related_assets"]) == 1 + assert figures_payload["related_assets"][0]["path"] == "Fig. 5.1.bmp"