Preserve portable doclift source paths

This commit is contained in:
welsberr 2026-04-23 10:27:43 -04:00
parent b7e2f9f540
commit 3837bd2316
2 changed files with 33 additions and 16 deletions

View File

@ -38,6 +38,15 @@ def _safe_read_json(path: Path) -> dict:
return json.loads(path.read_text(encoding="utf-8"))
def _resolve_bundle_path(base: Path, value: str | Path | None, fallback: Path) -> Path:
if value is None or value == "":
return fallback
path = Path(value)
if path.is_absolute():
return path
return base / path
def adapt_markdown(path: str | Path) -> NormalizedDocument:
text = read_textish(path)
return NormalizedDocument(
@ -140,9 +149,14 @@ def adapt_doclift_bundle(path: str | Path) -> list[NormalizedDocument]:
text = markdown_path.read_text(encoding="utf-8")
sections = _simple_section_split(text)
bundle_meta = by_output_dir.get(doc_dir.name, {})
figures_payload = _safe_read_json(doc_dir / "document.figures.json")
tables_payload = _safe_read_json(doc_dir / "document.tables.json")
source_path = figures_payload.get("source_path") or tables_payload.get("source_path") or str(markdown_path)
layout_path = _resolve_bundle_path(base, bundle_meta.get("layout_path"), doc_dir / "document.layout.json")
tables_path = _resolve_bundle_path(base, bundle_meta.get("tables_path"), doc_dir / "document.tables.json")
figures_path = _resolve_bundle_path(base, bundle_meta.get("figures_path"), doc_dir / "document.figures.json")
figures_payload = _safe_read_json(figures_path)
tables_payload = _safe_read_json(tables_path)
source_path = figures_payload.get("source_path") or tables_payload.get("source_path") or markdown_path.relative_to(base).as_posix()
relative_doc_dir = doc_dir.relative_to(base).as_posix()
relative_markdown_path = markdown_path.relative_to(base).as_posix()
docs.append(
NormalizedDocument(
source_path=str(source_path),
@ -152,13 +166,14 @@ def adapt_doclift_bundle(path: str | Path) -> list[NormalizedDocument]:
sections=sections,
metadata={
"doclift_bundle": True,
"bundle_root": str(base),
"bundle_document_dir": str(doc_dir),
"bundle_markdown_path": str(markdown_path),
"bundle_root": ".",
"bundle_document_dir": relative_doc_dir,
"bundle_markdown_path": relative_markdown_path,
"document_kind": bundle_meta.get("document_kind", "document"),
"layout_path": bundle_meta.get("layout_path", str(doc_dir / "document.layout.json")),
"tables_path": bundle_meta.get("tables_path", str(doc_dir / "document.tables.json")),
"figures_path": bundle_meta.get("figures_path", str(doc_dir / "document.figures.json")),
"source_path_kind": figures_payload.get("source_path_kind") or tables_payload.get("source_path_kind") or bundle_meta.get("source_path_kind", "source_root_relative"),
"layout_path": bundle_meta.get("layout_path", layout_path.relative_to(base).as_posix()),
"tables_path": bundle_meta.get("tables_path", tables_path.relative_to(base).as_posix()),
"figures_path": bundle_meta.get("figures_path", figures_path.relative_to(base).as_posix()),
"table_count": bundle_meta.get("table_count", 0),
"figure_reference_count": bundle_meta.get("figure_reference_count", 0),
},

View File

@ -75,10 +75,10 @@ def test_adapt_documents_from_doclift_bundle(tmp_path: Path) -> None:
{
"title": "Lecture 1. Example",
"document_kind": "lecture",
"output_dir": str(doc_dir),
"layout_path": str(doc_dir / "document.layout.json"),
"tables_path": str(doc_dir / "document.tables.json"),
"figures_path": str(doc_dir / "document.figures.json"),
"output_dir": "documents/lesson-a",
"layout_path": "documents/lesson-a/document.layout.json",
"tables_path": "documents/lesson-a/document.tables.json",
"figures_path": "documents/lesson-a/document.figures.json",
"table_count": 1,
"figure_reference_count": 0,
}
@ -89,8 +89,8 @@ def test_adapt_documents_from_doclift_bundle(tmp_path: Path) -> None:
)
(doc_dir / "document.md").write_text("# Lecture 1. Example\n\n## Module\n### Lesson A\nBody.", encoding="utf-8")
(doc_dir / "document.layout.json").write_text("[]", encoding="utf-8")
(doc_dir / "document.tables.json").write_text(json.dumps({"source_path": "/tmp/source.doc", "tables": []}), encoding="utf-8")
(doc_dir / "document.figures.json").write_text(json.dumps({"source_path": "/tmp/source.doc", "figure_references": []}), encoding="utf-8")
(doc_dir / "document.tables.json").write_text(json.dumps({"source_path": "raw/source.doc", "source_path_kind": "source_root_relative", "tables": []}), encoding="utf-8")
(doc_dir / "document.figures.json").write_text(json.dumps({"source_path": "raw/source.doc", "source_path_kind": "source_root_relative", "figure_references": []}), encoding="utf-8")
docs = adapt_documents(bundle)
@ -99,4 +99,6 @@ def test_adapt_documents_from_doclift_bundle(tmp_path: Path) -> None:
assert docs[0].title == "Lecture 1. Example"
assert docs[0].metadata["document_kind"] == "lecture"
assert docs[0].metadata["doclift_bundle"] is True
assert docs[0].source_path == "/tmp/source.doc"
assert docs[0].source_path == "raw/source.doc"
assert docs[0].metadata["bundle_markdown_path"] == "documents/lesson-a/document.md"
assert docs[0].metadata["source_path_kind"] == "source_root_relative"