Make doclift bundle paths portable
This commit is contained in:
parent
e725553562
commit
28aea13192
|
|
@ -24,7 +24,11 @@ def _document_output_dir(out_root: Path, source_path: Path, title: str) -> Path:
|
|||
return out_root / "documents" / f"{slugify(source_path.stem)}-{slugify(title)}"
|
||||
|
||||
|
||||
def convert_doc(source_path: Path, out_root: Path, figure_assets: list | None = None) -> DocumentBundle:
|
||||
def _relative_to_root(path: Path, root: Path) -> str:
|
||||
return path.relative_to(root).as_posix()
|
||||
|
||||
|
||||
def convert_doc(source_path: Path, source_root: Path, out_root: Path, figure_assets: list | None = None) -> DocumentBundle:
|
||||
raw = run_catdoc(source_path)
|
||||
cleaned = clean_text(raw)
|
||||
title = extract_title(cleaned, source_path.stem)
|
||||
|
|
@ -49,7 +53,8 @@ def convert_doc(source_path: Path, out_root: Path, figure_assets: list | None =
|
|||
write_json(
|
||||
tables_path,
|
||||
{
|
||||
"source_path": str(source_path),
|
||||
"source_path": _relative_to_root(source_path, source_root),
|
||||
"source_path_kind": "source_root_relative",
|
||||
"table_references": table_refs,
|
||||
"tables": [table.model_dump() for table in tables],
|
||||
},
|
||||
|
|
@ -57,7 +62,8 @@ def convert_doc(source_path: Path, out_root: Path, figure_assets: list | None =
|
|||
write_json(
|
||||
figures_path,
|
||||
{
|
||||
"source_path": str(source_path),
|
||||
"source_path": _relative_to_root(source_path, source_root),
|
||||
"source_path_kind": "source_root_relative",
|
||||
"figure_references": figure_refs,
|
||||
"related_assets": [asset.model_dump() for asset in related_assets],
|
||||
},
|
||||
|
|
@ -67,12 +73,14 @@ def convert_doc(source_path: Path, out_root: Path, figure_assets: list | None =
|
|||
document_id=slugify(title),
|
||||
title=title,
|
||||
document_kind=document_kind,
|
||||
source_path=str(source_path),
|
||||
output_dir=str(doc_out),
|
||||
markdown_path=str(markdown_path),
|
||||
layout_path=str(layout_path),
|
||||
tables_path=str(tables_path),
|
||||
figures_path=str(figures_path),
|
||||
source_path=_relative_to_root(source_path, source_root),
|
||||
source_path_kind="source_root_relative",
|
||||
output_dir=_relative_to_root(doc_out, out_root),
|
||||
markdown_path=_relative_to_root(markdown_path, out_root),
|
||||
layout_path=_relative_to_root(layout_path, out_root),
|
||||
tables_path=_relative_to_root(tables_path, out_root),
|
||||
figures_path=_relative_to_root(figures_path, out_root),
|
||||
bundle_path_kind="bundle_root_relative",
|
||||
table_count=len(tables),
|
||||
figure_reference_count=len(figure_refs),
|
||||
)
|
||||
|
|
@ -81,9 +89,10 @@ def convert_doc(source_path: Path, out_root: Path, figure_assets: list | None =
|
|||
def convert_directory(source_root: Path, out_root: Path, asset_root: Path | None = None) -> ConversionReport:
|
||||
docs = sorted(path for path in source_root.rglob("*") if path.is_file() and path.suffix.lower() == ".doc")
|
||||
figure_assets = collect_figure_assets(asset_root) if asset_root is not None else []
|
||||
bundles = [convert_doc(path, out_root, figure_assets=figure_assets) for path in docs]
|
||||
bundles = [convert_doc(path, source_root, out_root, figure_assets=figure_assets) for path in docs]
|
||||
report = ConversionReport(
|
||||
source_root=str(source_root),
|
||||
source_root=source_root.name,
|
||||
source_root_kind="source_label",
|
||||
converter="catdoc_doc",
|
||||
document_count=len(bundles),
|
||||
documents=bundles,
|
||||
|
|
|
|||
|
|
@ -268,7 +268,7 @@ def collect_figure_assets(root: Path) -> list[FigureAsset]:
|
|||
assets.append(
|
||||
FigureAsset(
|
||||
asset_id=slugify(relative),
|
||||
path=str(path),
|
||||
path=relative,
|
||||
relative_path=relative,
|
||||
name=path.name,
|
||||
container=path.parent.name,
|
||||
|
|
|
|||
|
|
@ -36,17 +36,20 @@ class DocumentBundle(BaseModel):
|
|||
title: str
|
||||
document_kind: str = "document"
|
||||
source_path: str
|
||||
source_path_kind: str = "source_root_relative"
|
||||
output_dir: str
|
||||
markdown_path: str
|
||||
layout_path: str
|
||||
tables_path: str
|
||||
figures_path: str
|
||||
bundle_path_kind: str = "bundle_root_relative"
|
||||
table_count: int = 0
|
||||
figure_reference_count: int = 0
|
||||
|
||||
|
||||
class ConversionReport(BaseModel):
|
||||
source_root: str
|
||||
source_root_kind: str = "source_label"
|
||||
converter: str
|
||||
document_count: int = 0
|
||||
documents: list[DocumentBundle] = Field(default_factory=list)
|
||||
|
|
|
|||
|
|
@ -42,7 +42,13 @@ def test_convert_directory_writes_manifest_and_conversion_report(tmp_path: Path,
|
|||
)
|
||||
|
||||
assert manifest["document_count"] == 1
|
||||
assert manifest["source_root"] == "src"
|
||||
assert manifest["documents"][0]["source_path"] == "sample.doc"
|
||||
assert manifest["documents"][0]["markdown_path"] == "documents/sample-lecture-1-example-legacy-document/document.md"
|
||||
assert conversion_report["summary"]["documents_with_tables"] == 1
|
||||
assert conversion_report["summary"]["documents_with_figure_references"] == 1
|
||||
assert figures_payload["source_path"] == "sample.doc"
|
||||
assert figures_payload["source_path_kind"] == "source_root_relative"
|
||||
assert figures_payload["figure_references"] == ["Fig. 5.1"]
|
||||
assert len(figures_payload["related_assets"]) == 1
|
||||
assert figures_payload["related_assets"][0]["path"] == "Fig. 5.1.bmp"
|
||||
|
|
|
|||
Loading…
Reference in New Issue