Add doclift bundle document adapter

2026-04-22 21:30:44 -04:00 · 2026-04-22 21:30:44 -04:00 · bb64c01123
parent 9549961d10
commit bb64c01123
2 changed files with 109 additions and 1 deletions
--- a/src/didactopus/document_adapters.py
+++ b/src/didactopus/document_adapters.py
@ -1,5 +1,6 @@
 from __future__ import annotations

+import json
 from pathlib import Path
 import re
 from .course_schema import NormalizedDocument, Section
@ -31,6 +32,12 @@ def read_textish(path: str | Path) -> str:
    return Path(path).read_text(encoding="utf-8")


+def _safe_read_json(path: Path) -> dict:
+    """Load a JSON object from ``path``, falling back to an empty dict.
+
+    Returns ``{}`` when ``path`` is not an existing regular file, or when the
+    decoded top-level value is not a JSON object, so callers can always use
+    ``.get`` on the result. Malformed JSON still raises
+    ``json.JSONDecodeError`` so corrupted files are not silently hidden.
+    """
+    # is_file() also rejects directories, which exists() would let through
+    # to read_text() and fail there.
+    if not path.is_file():
+        return {}
+    payload = json.loads(path.read_text(encoding="utf-8"))
+    # A valid JSON document may be a list or scalar; honour the declared
+    # ``dict`` return type instead of handing callers a non-mapping.
+    return payload if isinstance(payload, dict) else {}
+
+
 def adapt_markdown(path: str | Path) -> NormalizedDocument:
    text = read_textish(path)
    return NormalizedDocument(
@ -108,8 +115,62 @@ def adapt_pptx(path: str | Path) -> NormalizedDocument:
    )


+def is_doclift_bundle(path: str | Path) -> bool:
+    """Return True when ``path`` is a directory laid out as a doclift bundle.
+
+    A bundle is recognised by a ``manifest.json`` file and a ``documents``
+    subdirectory located directly under ``path``.
+    """
+    base = Path(path)
+    if not base.is_dir():
+        return False
+    # Check the entry kinds, not mere existence: a regular file named
+    # "documents" would pass an exists() test and then fail when the bundle
+    # adapter iterates it as a directory.
+    return (base / "manifest.json").is_file() and (base / "documents").is_dir()
+
+
+def adapt_doclift_bundle(path: str | Path) -> list[NormalizedDocument]:
+    """Convert every document in a doclift bundle into a NormalizedDocument.
+
+    Each subdirectory of ``<path>/documents`` that contains a ``document.md``
+    becomes one document; subdirectories without that file are skipped.
+    Title and descriptive metadata come from the ``manifest.json`` entry
+    whose ``output_dir`` has the same final path component as the
+    subdirectory, with fallbacks derived from the subdirectory itself.
+    Documents are returned in sorted subdirectory order.
+    """
+    bundle_root = Path(path)
+    manifest = _safe_read_json(bundle_root / "manifest.json")
+
+    # Index manifest entries by the last component of their output_dir so
+    # they can be matched to on-disk folders wherever the bundle now lives.
+    manifest_entries = {}
+    for entry in manifest.get("documents", []):
+        if isinstance(entry, dict) and entry.get("output_dir"):
+            manifest_entries[Path(entry.get("output_dir", "")).name] = entry
+
+    document_dirs = sorted(
+        candidate for candidate in (bundle_root / "documents").iterdir() if candidate.is_dir()
+    )
+
+    normalized: list[NormalizedDocument] = []
+    for document_dir in document_dirs:
+        markdown_file = document_dir / "document.md"
+        if not markdown_file.exists():
+            continue
+        body = markdown_file.read_text(encoding="utf-8")
+        parsed_sections = _simple_section_split(body)
+        entry = manifest_entries.get(document_dir.name, {})
+        figures = _safe_read_json(document_dir / "document.figures.json")
+        tables = _safe_read_json(document_dir / "document.tables.json")
+
+        # Prefer the source path recorded in the sidecar files; fall back to
+        # the bundled markdown file when neither sidecar provides one.
+        origin = figures.get("source_path") or tables.get("source_path") or str(markdown_file)
+        heading = str(entry.get("title") or _title_from_path(document_dir.name))
+
+        details = {
+            "doclift_bundle": True,
+            "bundle_root": str(bundle_root),
+            "bundle_document_dir": str(document_dir),
+            "bundle_markdown_path": str(markdown_file),
+            "document_kind": entry.get("document_kind", "document"),
+            "layout_path": entry.get("layout_path", str(document_dir / "document.layout.json")),
+            "tables_path": entry.get("tables_path", str(document_dir / "document.tables.json")),
+            "figures_path": entry.get("figures_path", str(document_dir / "document.figures.json")),
+            "table_count": entry.get("table_count", 0),
+            "figure_reference_count": entry.get("figure_reference_count", 0),
+        }
+        document = NormalizedDocument(
+            source_path=str(origin),
+            source_type="doclift_bundle",
+            title=heading,
+            text=body,
+            sections=parsed_sections,
+            metadata=details,
+        )
+        normalized.append(document)
+    return normalized
+
+
 def detect_adapter(path: str | Path) -> str:
    p = Path(path)
+    if is_doclift_bundle(p):
+        return "doclift_bundle"
    suffix = p.suffix.lower()
    if suffix == ".md":
        return "markdown"
@ -128,11 +189,13 @@ def detect_adapter(path: str | Path) -> str:

 def is_supported_document(path: str | Path) -> bool:
+    """Return True when ``path`` can be handled by one of the adapters.
+
+    Regular files are supported when their detected adapter is one of the
+    file formats; directories are supported only when they are detected as
+    a doclift bundle.
+    """
    p = Path(path)
-    return p.is_file() and detect_adapter(p) in {"markdown", "text", "html", "pdf", "docx", "pptx"}
+    adapter = detect_adapter(p)
+    if adapter == "doclift_bundle":
+        return p.is_dir()
+    # Keep the is_file() guard for file adapters: a directory named e.g.
+    # "notes.md" is detected as "markdown" by suffix but cannot be read as one.
+    return p.is_file() and adapter in {"markdown", "text", "html", "pdf", "docx", "pptx"}


 def adapt_documents(path: str | Path) -> list[NormalizedDocument]:
    p = Path(path)
+    if is_doclift_bundle(p):
+        return adapt_doclift_bundle(p)
    if p.is_dir():
        docs = [adapt_document(child) for child in sorted(p.rglob("*")) if is_supported_document(child)]
        return docs
@ -141,6 +204,11 @@ def adapt_documents(path: str | Path) -> list[NormalizedDocument]:

 def adapt_document(path: str | Path) -> NormalizedDocument:
    adapter = detect_adapter(path)
+    if adapter == "doclift_bundle":
+        docs = adapt_doclift_bundle(path)
+        if not docs:
+            raise ValueError(f"No documents found in doclift bundle {path}")
+        return docs[0]
    if adapter == "markdown":
        return adapt_markdown(path)
    if adapter == "html":
--- a/tests/test_topic_ingest.py
+++ b/tests/test_topic_ingest.py
@ -1,5 +1,7 @@
+import json
 from pathlib import Path
 from didactopus.document_adapters import adapt_document
+from didactopus.document_adapters import adapt_documents
 from didactopus.topic_ingest import document_to_course, build_topic_bundle, merge_courses_into_topic_course, extract_concept_candidates


@ -60,3 +62,41 @@ def test_extract_concepts_retains_lessons_but_filters_generic_terms(tmp_path: Pa
    assert "MIT OCW 6.050J Information and Entropy: Syllabus" in titles
    assert "Explain" not in titles
    assert "Channel Capacity" in titles
+
+
+def test_adapt_documents_from_doclift_bundle(tmp_path: Path) -> None:
+    """adapt_documents turns a one-document doclift bundle into a single
+    NormalizedDocument carrying the manifest title, kind and source path."""
+    bundle_root = tmp_path / "bundle"
+    lesson_dir = bundle_root / "documents" / "lesson-a"
+    lesson_dir.mkdir(parents=True)
+
+    layout_file = lesson_dir / "document.layout.json"
+    tables_file = lesson_dir / "document.tables.json"
+    figures_file = lesson_dir / "document.figures.json"
+
+    manifest_entry = {
+        "title": "Lecture 1. Example",
+        "document_kind": "lecture",
+        "output_dir": str(lesson_dir),
+        "layout_path": str(layout_file),
+        "tables_path": str(tables_file),
+        "figures_path": str(figures_file),
+        "table_count": 1,
+        "figure_reference_count": 0,
+    }
+    manifest_text = json.dumps({"documents": [manifest_entry]})
+    (bundle_root / "manifest.json").write_text(manifest_text, encoding="utf-8")
+
+    markdown_text = "# Lecture 1. Example\n\n## Module\n### Lesson A\nBody."
+    (lesson_dir / "document.md").write_text(markdown_text, encoding="utf-8")
+    layout_file.write_text("[]", encoding="utf-8")
+    tables_payload = {"source_path": "/tmp/source.doc", "tables": []}
+    tables_file.write_text(json.dumps(tables_payload), encoding="utf-8")
+    figures_payload = {"source_path": "/tmp/source.doc", "figure_references": []}
+    figures_file.write_text(json.dumps(figures_payload), encoding="utf-8")
+
+    docs = adapt_documents(bundle_root)
+
+    assert len(docs) == 1
+    only_doc = docs[0]
+    assert only_doc.source_type == "doclift_bundle"
+    assert only_doc.title == "Lecture 1. Example"
+    assert only_doc.metadata["document_kind"] == "lecture"
+    assert only_doc.metadata["doclift_bundle"] is True
+    assert only_doc.source_path == "/tmp/source.doc"