Preserve portable doclift source paths

This commit is contained in:
welsberr 2026-04-23 10:27:43 -04:00
parent b7e2f9f540
commit 3837bd2316
2 changed files with 33 additions and 16 deletions

View File

@ -38,6 +38,15 @@ def _safe_read_json(path: Path) -> dict:
return json.loads(path.read_text(encoding="utf-8")) return json.loads(path.read_text(encoding="utf-8"))
def _resolve_bundle_path(base: Path, value: str | Path | None, fallback: Path) -> Path:
if value is None or value == "":
return fallback
path = Path(value)
if path.is_absolute():
return path
return base / path
def adapt_markdown(path: str | Path) -> NormalizedDocument: def adapt_markdown(path: str | Path) -> NormalizedDocument:
text = read_textish(path) text = read_textish(path)
return NormalizedDocument( return NormalizedDocument(
@ -140,9 +149,14 @@ def adapt_doclift_bundle(path: str | Path) -> list[NormalizedDocument]:
text = markdown_path.read_text(encoding="utf-8") text = markdown_path.read_text(encoding="utf-8")
sections = _simple_section_split(text) sections = _simple_section_split(text)
bundle_meta = by_output_dir.get(doc_dir.name, {}) bundle_meta = by_output_dir.get(doc_dir.name, {})
figures_payload = _safe_read_json(doc_dir / "document.figures.json") layout_path = _resolve_bundle_path(base, bundle_meta.get("layout_path"), doc_dir / "document.layout.json")
tables_payload = _safe_read_json(doc_dir / "document.tables.json") tables_path = _resolve_bundle_path(base, bundle_meta.get("tables_path"), doc_dir / "document.tables.json")
source_path = figures_payload.get("source_path") or tables_payload.get("source_path") or str(markdown_path) figures_path = _resolve_bundle_path(base, bundle_meta.get("figures_path"), doc_dir / "document.figures.json")
figures_payload = _safe_read_json(figures_path)
tables_payload = _safe_read_json(tables_path)
source_path = figures_payload.get("source_path") or tables_payload.get("source_path") or markdown_path.relative_to(base).as_posix()
relative_doc_dir = doc_dir.relative_to(base).as_posix()
relative_markdown_path = markdown_path.relative_to(base).as_posix()
docs.append( docs.append(
NormalizedDocument( NormalizedDocument(
source_path=str(source_path), source_path=str(source_path),
@ -152,13 +166,14 @@ def adapt_doclift_bundle(path: str | Path) -> list[NormalizedDocument]:
sections=sections, sections=sections,
metadata={ metadata={
"doclift_bundle": True, "doclift_bundle": True,
"bundle_root": str(base), "bundle_root": ".",
"bundle_document_dir": str(doc_dir), "bundle_document_dir": relative_doc_dir,
"bundle_markdown_path": str(markdown_path), "bundle_markdown_path": relative_markdown_path,
"document_kind": bundle_meta.get("document_kind", "document"), "document_kind": bundle_meta.get("document_kind", "document"),
"layout_path": bundle_meta.get("layout_path", str(doc_dir / "document.layout.json")), "source_path_kind": figures_payload.get("source_path_kind") or tables_payload.get("source_path_kind") or bundle_meta.get("source_path_kind", "source_root_relative"),
"tables_path": bundle_meta.get("tables_path", str(doc_dir / "document.tables.json")), "layout_path": bundle_meta.get("layout_path", layout_path.relative_to(base).as_posix()),
"figures_path": bundle_meta.get("figures_path", str(doc_dir / "document.figures.json")), "tables_path": bundle_meta.get("tables_path", tables_path.relative_to(base).as_posix()),
"figures_path": bundle_meta.get("figures_path", figures_path.relative_to(base).as_posix()),
"table_count": bundle_meta.get("table_count", 0), "table_count": bundle_meta.get("table_count", 0),
"figure_reference_count": bundle_meta.get("figure_reference_count", 0), "figure_reference_count": bundle_meta.get("figure_reference_count", 0),
}, },

View File

@ -75,10 +75,10 @@ def test_adapt_documents_from_doclift_bundle(tmp_path: Path) -> None:
{ {
"title": "Lecture 1. Example", "title": "Lecture 1. Example",
"document_kind": "lecture", "document_kind": "lecture",
"output_dir": str(doc_dir), "output_dir": "documents/lesson-a",
"layout_path": str(doc_dir / "document.layout.json"), "layout_path": "documents/lesson-a/document.layout.json",
"tables_path": str(doc_dir / "document.tables.json"), "tables_path": "documents/lesson-a/document.tables.json",
"figures_path": str(doc_dir / "document.figures.json"), "figures_path": "documents/lesson-a/document.figures.json",
"table_count": 1, "table_count": 1,
"figure_reference_count": 0, "figure_reference_count": 0,
} }
@ -89,8 +89,8 @@ def test_adapt_documents_from_doclift_bundle(tmp_path: Path) -> None:
) )
(doc_dir / "document.md").write_text("# Lecture 1. Example\n\n## Module\n### Lesson A\nBody.", encoding="utf-8") (doc_dir / "document.md").write_text("# Lecture 1. Example\n\n## Module\n### Lesson A\nBody.", encoding="utf-8")
(doc_dir / "document.layout.json").write_text("[]", encoding="utf-8") (doc_dir / "document.layout.json").write_text("[]", encoding="utf-8")
(doc_dir / "document.tables.json").write_text(json.dumps({"source_path": "/tmp/source.doc", "tables": []}), encoding="utf-8") (doc_dir / "document.tables.json").write_text(json.dumps({"source_path": "raw/source.doc", "source_path_kind": "source_root_relative", "tables": []}), encoding="utf-8")
(doc_dir / "document.figures.json").write_text(json.dumps({"source_path": "/tmp/source.doc", "figure_references": []}), encoding="utf-8") (doc_dir / "document.figures.json").write_text(json.dumps({"source_path": "raw/source.doc", "source_path_kind": "source_root_relative", "figure_references": []}), encoding="utf-8")
docs = adapt_documents(bundle) docs = adapt_documents(bundle)
@ -99,4 +99,6 @@ def test_adapt_documents_from_doclift_bundle(tmp_path: Path) -> None:
assert docs[0].title == "Lecture 1. Example" assert docs[0].title == "Lecture 1. Example"
assert docs[0].metadata["document_kind"] == "lecture" assert docs[0].metadata["document_kind"] == "lecture"
assert docs[0].metadata["doclift_bundle"] is True assert docs[0].metadata["doclift_bundle"] is True
assert docs[0].source_path == "/tmp/source.doc" assert docs[0].source_path == "raw/source.doc"
assert docs[0].metadata["bundle_markdown_path"] == "documents/lesson-a/document.md"
assert docs[0].metadata["source_path_kind"] == "source_root_relative"