Make doclift bundle paths portable

This commit is contained in:
welsberr 2026-04-23 10:27:33 -04:00
parent e725553562
commit 28aea13192
4 changed files with 30 additions and 12 deletions

View File

@@ -24,7 +24,11 @@ def _document_output_dir(out_root: Path, source_path: Path, title: str) -> Path:
return out_root / "documents" / f"{slugify(source_path.stem)}-{slugify(title)}" return out_root / "documents" / f"{slugify(source_path.stem)}-{slugify(title)}"
def convert_doc(source_path: Path, out_root: Path, figure_assets: list | None = None) -> DocumentBundle: def _relative_to_root(path: Path, root: Path) -> str:
return path.relative_to(root).as_posix()
def convert_doc(source_path: Path, source_root: Path, out_root: Path, figure_assets: list | None = None) -> DocumentBundle:
raw = run_catdoc(source_path) raw = run_catdoc(source_path)
cleaned = clean_text(raw) cleaned = clean_text(raw)
title = extract_title(cleaned, source_path.stem) title = extract_title(cleaned, source_path.stem)
@@ -49,7 +53,8 @@ def convert_doc(source_path: Path, out_root: Path, figure_assets: list | None =
write_json( write_json(
tables_path, tables_path,
{ {
"source_path": str(source_path), "source_path": _relative_to_root(source_path, source_root),
"source_path_kind": "source_root_relative",
"table_references": table_refs, "table_references": table_refs,
"tables": [table.model_dump() for table in tables], "tables": [table.model_dump() for table in tables],
}, },
@@ -57,7 +62,8 @@ def convert_doc(source_path: Path, out_root: Path, figure_assets: list | None =
write_json( write_json(
figures_path, figures_path,
{ {
"source_path": str(source_path), "source_path": _relative_to_root(source_path, source_root),
"source_path_kind": "source_root_relative",
"figure_references": figure_refs, "figure_references": figure_refs,
"related_assets": [asset.model_dump() for asset in related_assets], "related_assets": [asset.model_dump() for asset in related_assets],
}, },
@@ -67,12 +73,14 @@ def convert_doc(source_path: Path, out_root: Path, figure_assets: list | None =
document_id=slugify(title), document_id=slugify(title),
title=title, title=title,
document_kind=document_kind, document_kind=document_kind,
source_path=str(source_path), source_path=_relative_to_root(source_path, source_root),
output_dir=str(doc_out), source_path_kind="source_root_relative",
markdown_path=str(markdown_path), output_dir=_relative_to_root(doc_out, out_root),
layout_path=str(layout_path), markdown_path=_relative_to_root(markdown_path, out_root),
tables_path=str(tables_path), layout_path=_relative_to_root(layout_path, out_root),
figures_path=str(figures_path), tables_path=_relative_to_root(tables_path, out_root),
figures_path=_relative_to_root(figures_path, out_root),
bundle_path_kind="bundle_root_relative",
table_count=len(tables), table_count=len(tables),
figure_reference_count=len(figure_refs), figure_reference_count=len(figure_refs),
) )
@@ -81,9 +89,10 @@ def convert_doc(source_path: Path, out_root: Path, figure_assets: list | None =
def convert_directory(source_root: Path, out_root: Path, asset_root: Path | None = None) -> ConversionReport: def convert_directory(source_root: Path, out_root: Path, asset_root: Path | None = None) -> ConversionReport:
docs = sorted(path for path in source_root.rglob("*") if path.is_file() and path.suffix.lower() == ".doc") docs = sorted(path for path in source_root.rglob("*") if path.is_file() and path.suffix.lower() == ".doc")
figure_assets = collect_figure_assets(asset_root) if asset_root is not None else [] figure_assets = collect_figure_assets(asset_root) if asset_root is not None else []
bundles = [convert_doc(path, out_root, figure_assets=figure_assets) for path in docs] bundles = [convert_doc(path, source_root, out_root, figure_assets=figure_assets) for path in docs]
report = ConversionReport( report = ConversionReport(
source_root=str(source_root), source_root=source_root.name,
source_root_kind="source_label",
converter="catdoc_doc", converter="catdoc_doc",
document_count=len(bundles), document_count=len(bundles),
documents=bundles, documents=bundles,

View File

@@ -268,7 +268,7 @@ def collect_figure_assets(root: Path) -> list[FigureAsset]:
assets.append( assets.append(
FigureAsset( FigureAsset(
asset_id=slugify(relative), asset_id=slugify(relative),
path=str(path), path=relative,
relative_path=relative, relative_path=relative,
name=path.name, name=path.name,
container=path.parent.name, container=path.parent.name,

View File

@@ -36,17 +36,20 @@ class DocumentBundle(BaseModel):
title: str title: str
document_kind: str = "document" document_kind: str = "document"
source_path: str source_path: str
source_path_kind: str = "source_root_relative"
output_dir: str output_dir: str
markdown_path: str markdown_path: str
layout_path: str layout_path: str
tables_path: str tables_path: str
figures_path: str figures_path: str
bundle_path_kind: str = "bundle_root_relative"
table_count: int = 0 table_count: int = 0
figure_reference_count: int = 0 figure_reference_count: int = 0
class ConversionReport(BaseModel): class ConversionReport(BaseModel):
source_root: str source_root: str
source_root_kind: str = "source_label"
converter: str converter: str
document_count: int = 0 document_count: int = 0
documents: list[DocumentBundle] = Field(default_factory=list) documents: list[DocumentBundle] = Field(default_factory=list)

View File

@@ -42,7 +42,13 @@ def test_convert_directory_writes_manifest_and_conversion_report(tmp_path: Path,
) )
assert manifest["document_count"] == 1 assert manifest["document_count"] == 1
assert manifest["source_root"] == "src"
assert manifest["documents"][0]["source_path"] == "sample.doc"
assert manifest["documents"][0]["markdown_path"] == "documents/sample-lecture-1-example-legacy-document/document.md"
assert conversion_report["summary"]["documents_with_tables"] == 1 assert conversion_report["summary"]["documents_with_tables"] == 1
assert conversion_report["summary"]["documents_with_figure_references"] == 1 assert conversion_report["summary"]["documents_with_figure_references"] == 1
assert figures_payload["source_path"] == "sample.doc"
assert figures_payload["source_path_kind"] == "source_root_relative"
assert figures_payload["figure_references"] == ["Fig. 5.1"] assert figures_payload["figure_references"] == ["Fig. 5.1"]
assert len(figures_payload["related_assets"]) == 1 assert len(figures_payload["related_assets"]) == 1
assert figures_payload["related_assets"][0]["path"] == "Fig. 5.1.bmp"