diff --git a/src/didactopus/document_adapters.py b/src/didactopus/document_adapters.py index 7138017..9cd4674 100644 --- a/src/didactopus/document_adapters.py +++ b/src/didactopus/document_adapters.py @@ -38,6 +38,15 @@ def _safe_read_json(path: Path) -> dict: return json.loads(path.read_text(encoding="utf-8")) +def _resolve_bundle_path(base: Path, value: str | Path | None, fallback: Path) -> Path: + if value is None or value == "": + return fallback + path = Path(value) + if path.is_absolute(): + return path + return base / path + + def adapt_markdown(path: str | Path) -> NormalizedDocument: text = read_textish(path) return NormalizedDocument( @@ -140,9 +149,14 @@ def adapt_doclift_bundle(path: str | Path) -> list[NormalizedDocument]: text = markdown_path.read_text(encoding="utf-8") sections = _simple_section_split(text) bundle_meta = by_output_dir.get(doc_dir.name, {}) - figures_payload = _safe_read_json(doc_dir / "document.figures.json") - tables_payload = _safe_read_json(doc_dir / "document.tables.json") - source_path = figures_payload.get("source_path") or tables_payload.get("source_path") or str(markdown_path) + layout_path = _resolve_bundle_path(base, bundle_meta.get("layout_path"), doc_dir / "document.layout.json") + tables_path = _resolve_bundle_path(base, bundle_meta.get("tables_path"), doc_dir / "document.tables.json") + figures_path = _resolve_bundle_path(base, bundle_meta.get("figures_path"), doc_dir / "document.figures.json") + figures_payload = _safe_read_json(figures_path) + tables_payload = _safe_read_json(tables_path) + source_path = figures_payload.get("source_path") or tables_payload.get("source_path") or markdown_path.relative_to(base).as_posix() + relative_doc_dir = doc_dir.relative_to(base).as_posix() + relative_markdown_path = markdown_path.relative_to(base).as_posix() docs.append( NormalizedDocument( source_path=str(source_path), @@ -152,13 +166,14 @@ def adapt_doclift_bundle(path: str | Path) -> list[NormalizedDocument]: sections=sections, metadata={ "doclift_bundle": True, - "bundle_root": str(base), - "bundle_document_dir": str(doc_dir), - "bundle_markdown_path": str(markdown_path), + "bundle_root": ".", + "bundle_document_dir": relative_doc_dir, + "bundle_markdown_path": relative_markdown_path, "document_kind": bundle_meta.get("document_kind", "document"), - "layout_path": bundle_meta.get("layout_path", str(doc_dir / "document.layout.json")), - "tables_path": bundle_meta.get("tables_path", str(doc_dir / "document.tables.json")), - "figures_path": bundle_meta.get("figures_path", str(doc_dir / "document.figures.json")), + "source_path_kind": figures_payload.get("source_path_kind") or tables_payload.get("source_path_kind") or bundle_meta.get("source_path_kind", "source_root_relative"), + "layout_path": bundle_meta.get("layout_path", layout_path.relative_to(base).as_posix()), + "tables_path": bundle_meta.get("tables_path", tables_path.relative_to(base).as_posix()), + "figures_path": bundle_meta.get("figures_path", figures_path.relative_to(base).as_posix()), "table_count": bundle_meta.get("table_count", 0), "figure_reference_count": bundle_meta.get("figure_reference_count", 0), }, diff --git a/tests/test_topic_ingest.py b/tests/test_topic_ingest.py index a2410ac..cc59907 100644 --- a/tests/test_topic_ingest.py +++ b/tests/test_topic_ingest.py @@ -75,10 +75,10 @@ def test_adapt_documents_from_doclift_bundle(tmp_path: Path) -> None: { "title": "Lecture 1. Example", "document_kind": "lecture", - "output_dir": str(doc_dir), - "layout_path": str(doc_dir / "document.layout.json"), - "tables_path": str(doc_dir / "document.tables.json"), - "figures_path": str(doc_dir / "document.figures.json"), + "output_dir": "documents/lesson-a", + "layout_path": "documents/lesson-a/document.layout.json", + "tables_path": "documents/lesson-a/document.tables.json", + "figures_path": "documents/lesson-a/document.figures.json", "table_count": 1, "figure_reference_count": 0, } @@ -89,8 +89,8 @@ def test_adapt_documents_from_doclift_bundle(tmp_path: Path) -> None: ) (doc_dir / "document.md").write_text("# Lecture 1. Example\n\n## Module\n### Lesson A\nBody.", encoding="utf-8") (doc_dir / "document.layout.json").write_text("[]", encoding="utf-8") - (doc_dir / "document.tables.json").write_text(json.dumps({"source_path": "/tmp/source.doc", "tables": []}), encoding="utf-8") - (doc_dir / "document.figures.json").write_text(json.dumps({"source_path": "/tmp/source.doc", "figure_references": []}), encoding="utf-8") + (doc_dir / "document.tables.json").write_text(json.dumps({"source_path": "raw/source.doc", "source_path_kind": "source_root_relative", "tables": []}), encoding="utf-8") + (doc_dir / "document.figures.json").write_text(json.dumps({"source_path": "raw/source.doc", "source_path_kind": "source_root_relative", "figure_references": []}), encoding="utf-8") docs = adapt_documents(bundle) @@ -99,4 +99,6 @@ def test_adapt_documents_from_doclift_bundle(tmp_path: Path) -> None: assert docs[0].title == "Lecture 1. Example" assert docs[0].metadata["document_kind"] == "lecture" assert docs[0].metadata["doclift_bundle"] is True - assert docs[0].source_path == "/tmp/source.doc" + assert docs[0].source_path == "raw/source.doc" + assert docs[0].metadata["bundle_markdown_path"] == "documents/lesson-a/document.md" + assert docs[0].metadata["source_path_kind"] == "source_root_relative"