diff --git a/src/didactopus/document_adapters.py b/src/didactopus/document_adapters.py
index 13c17c9..7138017 100644
--- a/src/didactopus/document_adapters.py
+++ b/src/didactopus/document_adapters.py
@@ -1,5 +1,6 @@
 from __future__ import annotations
 
+import json
 from pathlib import Path
 import re
 from .course_schema import NormalizedDocument, Section
@@ -31,6 +32,19 @@ def read_textish(path: str | Path) -> str:
     return Path(path).read_text(encoding="utf-8")
 
 
+def _safe_read_json(path: Path) -> dict:
+    """Load a JSON object from ``path``; return ``{}`` when none is available.
+
+    A path that is not a regular file and a top-level JSON value that is not
+    an object both yield ``{}``, so callers can always use ``.get``. Malformed
+    JSON still raises ``json.JSONDecodeError``.
+    """
+    if not path.is_file():
+        return {}
+    payload = json.loads(path.read_text(encoding="utf-8"))
+    return payload if isinstance(payload, dict) else {}
+
+
 def adapt_markdown(path: str | Path) -> NormalizedDocument:
     text = read_textish(path)
     return NormalizedDocument(
@@ -108,8 +122,82 @@ def adapt_pptx(path: str | Path) -> NormalizedDocument:
     )
 
 
+def is_doclift_bundle(path: str | Path) -> bool:
+    """Return True when ``path`` is a directory laid out as a doclift bundle.
+
+    A bundle directory holds a ``manifest.json`` file and a ``documents``
+    subdirectory.
+    """
+    base = Path(path)
+    if not base.is_dir():
+        return False
+    # Require the right entry kinds, not mere existence: the manifest is read
+    # as a file and ``documents`` is iterated as a directory.
+    return (base / "manifest.json").is_file() and (base / "documents").is_dir()
+
+
+def adapt_doclift_bundle(path: str | Path) -> list[NormalizedDocument]:
+    """Adapt every document of a doclift bundle into a NormalizedDocument.
+
+    Each subdirectory of ``<bundle>/documents`` that contains a
+    ``document.md`` yields one document, visited in sorted order. Manifest
+    entries are matched to a subdirectory by the final component of their
+    ``output_dir``; title and metadata fall back to path-derived defaults
+    when no entry matches.
+    """
+    base = Path(path)
+    manifest = _safe_read_json(base / "manifest.json")
+    entries = manifest.get("documents")
+    if not isinstance(entries, list):
+        # Tolerate a manifest whose "documents" value is missing or malformed.
+        entries = []
+    by_output_dir = {
+        Path(item["output_dir"]).name: item
+        for item in entries
+        if isinstance(item, dict)
+        and isinstance(item.get("output_dir"), str)
+        and item["output_dir"]
+    }
+    docs: list[NormalizedDocument] = []
+    for doc_dir in sorted(child for child in (base / "documents").iterdir() if child.is_dir()):
+        markdown_path = doc_dir / "document.md"
+        if not markdown_path.is_file():
+            continue
+        text = markdown_path.read_text(encoding="utf-8")
+        sections = _simple_section_split(text)
+        bundle_meta = by_output_dir.get(doc_dir.name, {})
+        figures_payload = _safe_read_json(doc_dir / "document.figures.json")
+        tables_payload = _safe_read_json(doc_dir / "document.tables.json")
+        # Prefer the source_path recorded in the figures/tables sidecars; fall back to the markdown path.
+        source_path = figures_payload.get("source_path") or tables_payload.get("source_path") or str(markdown_path)
+        docs.append(
+            NormalizedDocument(
+                source_path=str(source_path),
+                source_type="doclift_bundle",
+                title=str(bundle_meta.get("title") or _title_from_path(doc_dir.name)),
+                text=text,
+                sections=sections,
+                metadata={
+                    "doclift_bundle": True,
+                    "bundle_root": str(base),
+                    "bundle_document_dir": str(doc_dir),
+                    "bundle_markdown_path": str(markdown_path),
+                    "document_kind": bundle_meta.get("document_kind", "document"),
+                    "layout_path": bundle_meta.get("layout_path", str(doc_dir / "document.layout.json")),
+                    "tables_path": bundle_meta.get("tables_path", str(doc_dir / "document.tables.json")),
+                    "figures_path": bundle_meta.get("figures_path", str(doc_dir / "document.figures.json")),
+                    "table_count": bundle_meta.get("table_count", 0),
+                    "figure_reference_count": bundle_meta.get("figure_reference_count", 0),
+                },
+            )
+        )
+    return docs
+
+
 def detect_adapter(path: str | Path) -> str:
     p = Path(path)
+    if is_doclift_bundle(p):
+        return "doclift_bundle"
     suffix = p.suffix.lower()
     if suffix == ".md":
         return "markdown"
@@ -128,11 +216,34 @@ def detect_adapter(path: str | Path) -> str:
 
 def is_supported_document(path: str | Path) -> bool:
+    """Return True for a doclift bundle directory or a file with a known adapter."""
     p = Path(path)
-    return p.is_file() and detect_adapter(p) in {"markdown", "text", "html", "pdf", "docx", "pptx"}
+    adapter = detect_adapter(p)
+    if adapter == "doclift_bundle":
+        # A doclift bundle is the only supported directory form.
+        return True
+    # All other adapters are for files; a directory that merely carries a
+    # document suffix (e.g. "notes.md/") is not a supported document.
+    return p.is_file() and adapter in {"markdown", "text", "html", "pdf", "docx", "pptx"}
 
 
 def adapt_documents(path: str | Path) -> list[NormalizedDocument]:
     p = Path(path)
+    if is_doclift_bundle(p):
+        return adapt_doclift_bundle(p)
     if p.is_dir():
-        docs = [adapt_document(child) for child in sorted(p.rglob("*")) if is_supported_document(child)]
+        docs: list[NormalizedDocument] = []
+        bundle_roots: list[Path] = []
+        # Sorted order visits a directory before its contents, so a nested
+        # bundle is seen before the files inside it.
+        for child in sorted(p.rglob("*")):
+            if any(root in child.parents for root in bundle_roots):
+                # Already ingested as part of its bundle; do not re-read it as
+                # a standalone markdown file.
+                continue
+            if is_doclift_bundle(child):
+                bundle_roots.append(child)
+                # Take every document of the bundle, not only the first one.
+                docs.extend(adapt_doclift_bundle(child))
+            elif is_supported_document(child):
+                docs.append(adapt_document(child))
         return docs
@@ -141,6 +252,13 @@ def adapt_documents(path: str | Path) -> list[NormalizedDocument]:
 
 def adapt_document(path: str | Path) -> NormalizedDocument:
     adapter = detect_adapter(path)
+    if adapter == "doclift_bundle":
+        # This function returns a single document, so only the bundle's first
+        # document is returned; adapt_documents returns all of them.
+        docs = adapt_doclift_bundle(path)
+        if not docs:
+            raise ValueError(f"No documents found in doclift bundle {path}")
+        return docs[0]
     if adapter == "markdown":
         return adapt_markdown(path)
     if adapter == "html":
diff --git a/tests/test_topic_ingest.py b/tests/test_topic_ingest.py
index f43127e..a2410ac 100644
--- a/tests/test_topic_ingest.py
+++ b/tests/test_topic_ingest.py
@@ -1,5 +1,8 @@
+import json
 from pathlib import Path
 from didactopus.document_adapters import adapt_document
+from didactopus.document_adapters import adapt_documents
+from didactopus.document_adapters import is_supported_document
 from didactopus.topic_ingest import document_to_course, build_topic_bundle, merge_courses_into_topic_course, extract_concept_candidates
 
 
@@ -60,3 +63,64 @@ def test_extract_concepts_retains_lessons_but_filters_generic_terms(tmp_path: Pa
     assert "MIT OCW 6.050J Information and Entropy: Syllabus" in titles
     assert "Explain" not in titles
     assert "Channel Capacity" in titles
+
+
+def test_adapt_documents_from_doclift_bundle(tmp_path: Path) -> None:
+    bundle = tmp_path / "bundle"
+    doc_dir = bundle / "documents" / "lesson-a"
+    doc_dir.mkdir(parents=True)
+    (bundle / "manifest.json").write_text(
+        json.dumps(
+            {
+                "documents": [
+                    {
+                        "title": "Lecture 1. Example",
+                        "document_kind": "lecture",
+                        "output_dir": str(doc_dir),
+                        "layout_path": str(doc_dir / "document.layout.json"),
+                        "tables_path": str(doc_dir / "document.tables.json"),
+                        "figures_path": str(doc_dir / "document.figures.json"),
+                        "table_count": 1,
+                        "figure_reference_count": 0,
+                    }
+                ]
+            }
+        ),
+        encoding="utf-8",
+    )
+    (doc_dir / "document.md").write_text("# Lecture 1. Example\n\n## Module\n### Lesson A\nBody.", encoding="utf-8")
+    (doc_dir / "document.layout.json").write_text("[]", encoding="utf-8")
+    (doc_dir / "document.tables.json").write_text(json.dumps({"source_path": "/tmp/source.doc", "tables": []}), encoding="utf-8")
+    (doc_dir / "document.figures.json").write_text(json.dumps({"source_path": "/tmp/source.doc", "figure_references": []}), encoding="utf-8")
+
+    docs = adapt_documents(bundle)
+
+    assert len(docs) == 1
+    assert docs[0].source_type == "doclift_bundle"
+    assert docs[0].title == "Lecture 1. Example"
+    assert docs[0].metadata["document_kind"] == "lecture"
+    assert docs[0].metadata["doclift_bundle"] is True
+    assert docs[0].source_path == "/tmp/source.doc"
+
+
+def test_adapt_documents_reads_nested_doclift_bundle_once(tmp_path: Path) -> None:
+    bundle = tmp_path / "corpus" / "bundle"
+    entries = []
+    for name in ("lesson-a", "lesson-b"):
+        doc_dir = bundle / "documents" / name
+        doc_dir.mkdir(parents=True)
+        (doc_dir / "document.md").write_text(f"# {name}\n\nBody.", encoding="utf-8")
+        entries.append({"title": name, "output_dir": str(doc_dir)})
+    (bundle / "manifest.json").write_text(json.dumps({"documents": entries}), encoding="utf-8")
+
+    docs = adapt_documents(tmp_path / "corpus")
+
+    assert [doc.title for doc in docs] == ["lesson-a", "lesson-b"]
+    assert {doc.source_type for doc in docs} == {"doclift_bundle"}
+
+
+def test_directory_with_document_suffix_is_not_supported(tmp_path: Path) -> None:
+    fake_doc = tmp_path / "notes.md"
+    fake_doc.mkdir()
+
+    assert not is_supported_document(fake_doc)