Make doclift bundle paths portable
This commit is contained in:
parent
e725553562
commit
28aea13192
|
|
@ -24,7 +24,11 @@ def _document_output_dir(out_root: Path, source_path: Path, title: str) -> Path:
|
||||||
return out_root / "documents" / f"{slugify(source_path.stem)}-{slugify(title)}"
|
return out_root / "documents" / f"{slugify(source_path.stem)}-{slugify(title)}"
|
||||||
|
|
||||||
|
|
||||||
def convert_doc(source_path: Path, out_root: Path, figure_assets: list | None = None) -> DocumentBundle:
|
def _relative_to_root(path: Path, root: Path) -> str:
|
||||||
|
return path.relative_to(root).as_posix()
|
||||||
|
|
||||||
|
|
||||||
|
def convert_doc(source_path: Path, source_root: Path, out_root: Path, figure_assets: list | None = None) -> DocumentBundle:
|
||||||
raw = run_catdoc(source_path)
|
raw = run_catdoc(source_path)
|
||||||
cleaned = clean_text(raw)
|
cleaned = clean_text(raw)
|
||||||
title = extract_title(cleaned, source_path.stem)
|
title = extract_title(cleaned, source_path.stem)
|
||||||
|
|
@ -49,7 +53,8 @@ def convert_doc(source_path: Path, out_root: Path, figure_assets: list | None =
|
||||||
write_json(
|
write_json(
|
||||||
tables_path,
|
tables_path,
|
||||||
{
|
{
|
||||||
"source_path": str(source_path),
|
"source_path": _relative_to_root(source_path, source_root),
|
||||||
|
"source_path_kind": "source_root_relative",
|
||||||
"table_references": table_refs,
|
"table_references": table_refs,
|
||||||
"tables": [table.model_dump() for table in tables],
|
"tables": [table.model_dump() for table in tables],
|
||||||
},
|
},
|
||||||
|
|
@ -57,7 +62,8 @@ def convert_doc(source_path: Path, out_root: Path, figure_assets: list | None =
|
||||||
write_json(
|
write_json(
|
||||||
figures_path,
|
figures_path,
|
||||||
{
|
{
|
||||||
"source_path": str(source_path),
|
"source_path": _relative_to_root(source_path, source_root),
|
||||||
|
"source_path_kind": "source_root_relative",
|
||||||
"figure_references": figure_refs,
|
"figure_references": figure_refs,
|
||||||
"related_assets": [asset.model_dump() for asset in related_assets],
|
"related_assets": [asset.model_dump() for asset in related_assets],
|
||||||
},
|
},
|
||||||
|
|
@ -67,12 +73,14 @@ def convert_doc(source_path: Path, out_root: Path, figure_assets: list | None =
|
||||||
document_id=slugify(title),
|
document_id=slugify(title),
|
||||||
title=title,
|
title=title,
|
||||||
document_kind=document_kind,
|
document_kind=document_kind,
|
||||||
source_path=str(source_path),
|
source_path=_relative_to_root(source_path, source_root),
|
||||||
output_dir=str(doc_out),
|
source_path_kind="source_root_relative",
|
||||||
markdown_path=str(markdown_path),
|
output_dir=_relative_to_root(doc_out, out_root),
|
||||||
layout_path=str(layout_path),
|
markdown_path=_relative_to_root(markdown_path, out_root),
|
||||||
tables_path=str(tables_path),
|
layout_path=_relative_to_root(layout_path, out_root),
|
||||||
figures_path=str(figures_path),
|
tables_path=_relative_to_root(tables_path, out_root),
|
||||||
|
figures_path=_relative_to_root(figures_path, out_root),
|
||||||
|
bundle_path_kind="bundle_root_relative",
|
||||||
table_count=len(tables),
|
table_count=len(tables),
|
||||||
figure_reference_count=len(figure_refs),
|
figure_reference_count=len(figure_refs),
|
||||||
)
|
)
|
||||||
|
|
@ -81,9 +89,10 @@ def convert_doc(source_path: Path, out_root: Path, figure_assets: list | None =
|
||||||
def convert_directory(source_root: Path, out_root: Path, asset_root: Path | None = None) -> ConversionReport:
|
def convert_directory(source_root: Path, out_root: Path, asset_root: Path | None = None) -> ConversionReport:
|
||||||
docs = sorted(path for path in source_root.rglob("*") if path.is_file() and path.suffix.lower() == ".doc")
|
docs = sorted(path for path in source_root.rglob("*") if path.is_file() and path.suffix.lower() == ".doc")
|
||||||
figure_assets = collect_figure_assets(asset_root) if asset_root is not None else []
|
figure_assets = collect_figure_assets(asset_root) if asset_root is not None else []
|
||||||
bundles = [convert_doc(path, out_root, figure_assets=figure_assets) for path in docs]
|
bundles = [convert_doc(path, source_root, out_root, figure_assets=figure_assets) for path in docs]
|
||||||
report = ConversionReport(
|
report = ConversionReport(
|
||||||
source_root=str(source_root),
|
source_root=source_root.name,
|
||||||
|
source_root_kind="source_label",
|
||||||
converter="catdoc_doc",
|
converter="catdoc_doc",
|
||||||
document_count=len(bundles),
|
document_count=len(bundles),
|
||||||
documents=bundles,
|
documents=bundles,
|
||||||
|
|
|
||||||
|
|
@ -268,7 +268,7 @@ def collect_figure_assets(root: Path) -> list[FigureAsset]:
|
||||||
assets.append(
|
assets.append(
|
||||||
FigureAsset(
|
FigureAsset(
|
||||||
asset_id=slugify(relative),
|
asset_id=slugify(relative),
|
||||||
path=str(path),
|
path=relative,
|
||||||
relative_path=relative,
|
relative_path=relative,
|
||||||
name=path.name,
|
name=path.name,
|
||||||
container=path.parent.name,
|
container=path.parent.name,
|
||||||
|
|
|
||||||
|
|
@ -36,17 +36,20 @@ class DocumentBundle(BaseModel):
|
||||||
title: str
|
title: str
|
||||||
document_kind: str = "document"
|
document_kind: str = "document"
|
||||||
source_path: str
|
source_path: str
|
||||||
|
source_path_kind: str = "source_root_relative"
|
||||||
output_dir: str
|
output_dir: str
|
||||||
markdown_path: str
|
markdown_path: str
|
||||||
layout_path: str
|
layout_path: str
|
||||||
tables_path: str
|
tables_path: str
|
||||||
figures_path: str
|
figures_path: str
|
||||||
|
bundle_path_kind: str = "bundle_root_relative"
|
||||||
table_count: int = 0
|
table_count: int = 0
|
||||||
figure_reference_count: int = 0
|
figure_reference_count: int = 0
|
||||||
|
|
||||||
|
|
||||||
class ConversionReport(BaseModel):
|
class ConversionReport(BaseModel):
|
||||||
source_root: str
|
source_root: str
|
||||||
|
source_root_kind: str = "source_label"
|
||||||
converter: str
|
converter: str
|
||||||
document_count: int = 0
|
document_count: int = 0
|
||||||
documents: list[DocumentBundle] = Field(default_factory=list)
|
documents: list[DocumentBundle] = Field(default_factory=list)
|
||||||
|
|
|
||||||
|
|
@ -42,7 +42,13 @@ def test_convert_directory_writes_manifest_and_conversion_report(tmp_path: Path,
|
||||||
)
|
)
|
||||||
|
|
||||||
assert manifest["document_count"] == 1
|
assert manifest["document_count"] == 1
|
||||||
|
assert manifest["source_root"] == "src"
|
||||||
|
assert manifest["documents"][0]["source_path"] == "sample.doc"
|
||||||
|
assert manifest["documents"][0]["markdown_path"] == "documents/sample-lecture-1-example-legacy-document/document.md"
|
||||||
assert conversion_report["summary"]["documents_with_tables"] == 1
|
assert conversion_report["summary"]["documents_with_tables"] == 1
|
||||||
assert conversion_report["summary"]["documents_with_figure_references"] == 1
|
assert conversion_report["summary"]["documents_with_figure_references"] == 1
|
||||||
|
assert figures_payload["source_path"] == "sample.doc"
|
||||||
|
assert figures_payload["source_path_kind"] == "source_root_relative"
|
||||||
assert figures_payload["figure_references"] == ["Fig. 5.1"]
|
assert figures_payload["figure_references"] == ["Fig. 5.1"]
|
||||||
assert len(figures_payload["related_assets"]) == 1
|
assert len(figures_payload["related_assets"]) == 1
|
||||||
|
assert figures_payload["related_assets"][0]["path"] == "Fig. 5.1.bmp"
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue