Add stable doclift bundle fixture

Wire doclift bundle import into Didactopus CLI
Add doclift bundle pack demo
2026-04-23 07:23:34 -04:00 · 2026-04-23 07:18:15 -04:00 · 2026-04-22 21:37:09 -04:00 · 2026-04-22 21:30:44 -04:00
13 changed files with 403 additions and 4 deletions
--- a/README.md
+++ b/README.md
@ -140,6 +140,26 @@ For the fastest included example, use the MIT OCW Information and Entropy demo.
 - progress visualization
 - skill export

+## `doclift` Bundle Ingestion
+
+When your source material starts as legacy office documents, the intended
+division of responsibilities is:
+
+1. `doclift` normalizes the source tree into a bundle.
+2. `Didactopus` turns that bundle into a draft pack and learning path.
+3. `GroundRecall` can import the same bundle directly when you need canonical
+   knowledge storage instead of a learner pack.
+
+Example:
+
+```bash
+doclift convert-dir /path/to/legacy-course /tmp/doclift-bundle --asset-root /path/to/legacy-course
+didactopus doclift-bundle /tmp/doclift-bundle /tmp/didactopus-pack --course-title "Example Course"
+```
+
+The `didactopus doclift-bundle` command writes the normal draft-pack outputs plus a
+`doclift_bundle_summary.json` file that records the bundle-to-pack conversion.
+
 ## Didactopus As Pedagogy Support

 Didactopus is broader than a learner chat loop.
--- a/pyproject.toml
+++ b/pyproject.toml
@ -16,6 +16,7 @@ dependencies = [
 ]

 [project.scripts]
+didactopus = "didactopus.main:main"
 didactopus-api = "didactopus.api:main"

 [tool.setuptools.packages.find]
--- a/src/didactopus/doclift_bundle_demo.py
+++ b/src/didactopus/doclift_bundle_demo.py
@ -0,0 +1,86 @@
+from __future__ import annotations
+
+import argparse
+import json
+from pathlib import Path
+
+from .artifact_registry import validate_pack
+from .document_adapters import adapt_documents
+from .knowledge_graph import write_knowledge_graph
+from .pack_emitter import build_draft_pack, write_draft_pack, write_source_corpus
+from .rule_policy import RuleContext, build_default_rules, run_rules
+from .topic_ingest import build_topic_bundle, document_to_course, extract_concept_candidates, merge_courses_into_topic_course
+
+
+def run_doclift_bundle_demo(
+    bundle_dir: str | Path,
+    course_title: str,
+    pack_dir: str | Path,
+    author: str = "doclift bundle import",
+    license_name: str = "See source bundle metadata",
+) -> dict:
+    """Build a draft pack from a doclift bundle and write a conversion summary.
+
+    Args:
+        bundle_dir: Directory handed to ``adapt_documents``.
+        course_title: Title used for every per-document course and for the
+            merged topic course.
+        pack_dir: Directory that receives the pack outputs and
+            ``doclift_bundle_summary.json``.
+        author: Author value forwarded to ``build_draft_pack``.
+        license_name: License value forwarded to ``build_draft_pack``.
+
+    Returns:
+        The summary dictionary that is also written to
+        ``doclift_bundle_summary.json`` inside ``pack_dir``.
+
+    Raises:
+        ValueError: If no documents are found in ``bundle_dir`` or the
+            generated pack fails validation.
+    """
+    bundle_dir = Path(bundle_dir)
+    pack_dir = Path(pack_dir)
+
+    docs = adapt_documents(bundle_dir)
+    if not docs:
+        raise ValueError(f"No documents found in doclift bundle {bundle_dir}")
+
+    courses = [document_to_course(doc, course_title) for doc in docs]
+    merged = merge_courses_into_topic_course(build_topic_bundle(course_title, courses))
+    concepts = extract_concept_candidates(merged)
+    # Keep only concepts whose id belongs to a concept titled like a lesson.
+    # The title set is built once instead of once per concept.
+    lesson_titles = {lesson.title for module in merged.modules for lesson in module.lessons}
+    lesson_concept_ids = {concept.id for concept in concepts if concept.title in lesson_titles}
+    concepts = [concept for concept in concepts if concept.id in lesson_concept_ids]
+    ctx = RuleContext(course=merged, concepts=concepts)
+    run_rules(ctx, build_default_rules(enable_projects=False, enable_review=False))
+
+    draft = build_draft_pack(
+        merged,
+        ctx.concepts,
+        author=author,
+        license_name=license_name,
+        review_flags=ctx.review_flags,
+        conflicts=[],
+    )
+    write_draft_pack(draft, pack_dir)
+    write_source_corpus(merged, pack_dir)
+    write_knowledge_graph(merged, ctx.concepts, pack_dir)
+
+    # Validate before writing the summary so a summary file is only produced
+    # for a pack that passed validation.
+    validation = validate_pack(pack_dir)
+    if not validation.is_valid:
+        raise ValueError(f"Generated pack failed validation: {validation.errors}")
+
+    summary = {
+        "bundle_dir": str(bundle_dir),
+        "course_title": course_title,
+        "pack_dir": str(pack_dir),
+        "source_document_count": len(docs),
+        "module_count": len(merged.modules),
+        "concept_count": len(ctx.concepts),
+        "review_flags": list(ctx.review_flags),
+    }
+    (pack_dir / "doclift_bundle_summary.json").write_text(json.dumps(summary, indent=2), encoding="utf-8")
+    return summary
+
+
+def main() -> None:
+    """Parse the command line, run the bundle demo, and print its summary as JSON."""
+    cli = argparse.ArgumentParser(description="Generate a Didactopus draft pack from a doclift bundle.")
+    for positional in ("bundle_dir", "pack_dir"):
+        cli.add_argument(positional)
+    cli.add_argument("--course-title", required=True)
+    cli.add_argument("--author", default="doclift bundle import")
+    cli.add_argument("--license-name", default="See source bundle metadata")
+    options = cli.parse_args()
+
+    # Forward every parsed value by keyword, matching the demo's parameter names.
+    forwarded = ("bundle_dir", "course_title", "pack_dir", "author", "license_name")
+    result = run_doclift_bundle_demo(**{name: getattr(options, name) for name in forwarded})
+    rendered = json.dumps(result, indent=2)
+    print(rendered)
+
+
+if __name__ == "__main__":
+    main()
--- a/src/didactopus/document_adapters.py
+++ b/src/didactopus/document_adapters.py
@ -1,5 +1,6 @@
 from __future__ import annotations

+import json
 from pathlib import Path
 import re
 from .course_schema import NormalizedDocument, Section
@ -31,6 +32,12 @@ def read_textish(path: str | Path) -> str:
    return Path(path).read_text(encoding="utf-8")


+def _safe_read_json(path: Path) -> dict:
+    """Return the JSON object stored at ``path``, or ``{}`` when there is none.
+
+    An empty dict is returned when ``path`` is not a regular file or when the
+    file's top-level JSON value is not an object, so callers can always call
+    ``.get`` on the result. Malformed JSON still raises
+    ``json.JSONDecodeError``.
+    """
+    # ``is_file`` (not ``exists``) so a directory at this path is treated as
+    # "nothing to read" instead of failing inside ``read_text``.
+    if not path.is_file():
+        return {}
+    payload = json.loads(path.read_text(encoding="utf-8"))
+    # Honour the annotated return type: a list or scalar becomes ``{}``.
+    return payload if isinstance(payload, dict) else {}
+
+
 def adapt_markdown(path: str | Path) -> NormalizedDocument:
    text = read_textish(path)
    return NormalizedDocument(
@ -108,8 +115,62 @@ def adapt_pptx(path: str | Path) -> NormalizedDocument:
    )


+def is_doclift_bundle(path: str | Path) -> bool:
+    """Return True when ``path`` is a directory laid out like a doclift bundle.
+
+    A bundle is recognised by a ``manifest.json`` file and a ``documents``
+    directory located directly under ``path``.
+    """
+    base = Path(path)
+    if not base.is_dir():
+        return False
+    # Check the entry kinds, not mere existence: the bundle adapter reads the
+    # manifest as a file and iterates ``documents`` as a directory.
+    return (base / "manifest.json").is_file() and (base / "documents").is_dir()
+
+
+def adapt_doclift_bundle(path: str | Path) -> list[NormalizedDocument]:
+    """Convert the document directories of a doclift bundle into documents.
+
+    Every sub-directory of ``<path>/documents`` that contains a
+    ``document.md`` yields one ``NormalizedDocument``, in sorted directory
+    order. Manifest entries are matched to directories by the final path
+    component of their ``output_dir``.
+
+    Args:
+        path: Root directory of the bundle.
+
+    Returns:
+        One ``NormalizedDocument`` per document directory; directories
+        without a ``document.md`` are skipped.
+    """
+    base = Path(path)
+    manifest = _safe_read_json(base / "manifest.json")
+    # Tolerate a manifest whose top level is not an object, or whose
+    # "documents" value is null or otherwise not a list.
+    entries = manifest.get("documents") if isinstance(manifest, dict) else None
+    if not isinstance(entries, list):
+        entries = []
+    by_output_dir = {
+        Path(item.get("output_dir", "")).name: item
+        for item in entries
+        if isinstance(item, dict) and item.get("output_dir")
+    }
+    docs: list[NormalizedDocument] = []
+    for doc_dir in sorted(child for child in (base / "documents").iterdir() if child.is_dir()):
+        markdown_path = doc_dir / "document.md"
+        if not markdown_path.exists():
+            continue
+        text = markdown_path.read_text(encoding="utf-8")
+        sections = _simple_section_split(text)
+        bundle_meta = by_output_dir.get(doc_dir.name, {})
+        figures_payload = _safe_read_json(doc_dir / "document.figures.json")
+        tables_payload = _safe_read_json(doc_dir / "document.tables.json")
+        # Prefer the source path recorded in the figures sidecar, then the
+        # tables sidecar, then fall back to the markdown file itself. A
+        # sidecar whose top level is not an object contributes nothing.
+        recorded_paths = (
+            payload.get("source_path")
+            for payload in (figures_payload, tables_payload)
+            if isinstance(payload, dict)
+        )
+        source_path = next((value for value in recorded_paths if value), None) or str(markdown_path)
+        docs.append(
+            NormalizedDocument(
+                source_path=str(source_path),
+                source_type="doclift_bundle",
+                title=str(bundle_meta.get("title") or _title_from_path(doc_dir.name)),
+                text=text,
+                sections=sections,
+                metadata={
+                    "doclift_bundle": True,
+                    "bundle_root": str(base),
+                    "bundle_document_dir": str(doc_dir),
+                    "bundle_markdown_path": str(markdown_path),
+                    "document_kind": bundle_meta.get("document_kind", "document"),
+                    # Manifest values are passed through unchanged; the
+                    # fallbacks point at the conventional sidecar names.
+                    "layout_path": bundle_meta.get("layout_path", str(doc_dir / "document.layout.json")),
+                    "tables_path": bundle_meta.get("tables_path", str(doc_dir / "document.tables.json")),
+                    "figures_path": bundle_meta.get("figures_path", str(doc_dir / "document.figures.json")),
+                    "table_count": bundle_meta.get("table_count", 0),
+                    "figure_reference_count": bundle_meta.get("figure_reference_count", 0),
+                },
+            )
+        )
+    return docs
+
+
 def detect_adapter(path: str | Path) -> str:
    p = Path(path)
+    if is_doclift_bundle(p):
+        return "doclift_bundle"
    suffix = p.suffix.lower()
    if suffix == ".md":
        return "markdown"
@ -128,11 +189,13 @@ def detect_adapter(path: str | Path) -> str:

 def is_supported_document(path: str | Path) -> bool:
+    """Return True when ``path`` is a supported document file or a doclift bundle directory."""
     p = Path(path)
-    return p.is_file() and detect_adapter(p) in {"markdown", "text", "html", "pdf", "docx", "pptx"}
+    adapter = detect_adapter(p)
+    # Only doclift bundles may be directories. Every other adapter reads a
+    # single file, so a directory that merely has a matching suffix
+    # (for example one named "notes.md") must not be reported as supported.
+    if adapter == "doclift_bundle":
+        return True
+    return p.is_file() and adapter in {"markdown", "text", "html", "pdf", "docx", "pptx"}


 def adapt_documents(path: str | Path) -> list[NormalizedDocument]:
    p = Path(path)
+    if is_doclift_bundle(p):
+        return adapt_doclift_bundle(p)
    if p.is_dir():
        docs = [adapt_document(child) for child in sorted(p.rglob("*")) if is_supported_document(child)]
        return docs
@ -141,6 +204,11 @@ def adapt_documents(path: str | Path) -> list[NormalizedDocument]:

 def adapt_document(path: str | Path) -> NormalizedDocument:
    adapter = detect_adapter(path)
+    if adapter == "doclift_bundle":
+        docs = adapt_doclift_bundle(path)
+        if not docs:
+            raise ValueError(f"No documents found in doclift bundle {path}")
+        return docs[0]
    if adapter == "markdown":
        return adapt_markdown(path)
    if adapter == "html":
--- a/src/didactopus/main.py
+++ b/src/didactopus/main.py
@ -1,16 +1,18 @@
 from __future__ import annotations

 import argparse
+import sys
 from pathlib import Path

 from .config import load_config
+from .doclift_bundle_demo import run_doclift_bundle_demo
 from .review_loader import load_draft_pack
 from .review_schema import ReviewSession, ReviewAction
 from .review_actions import apply_action
 from .review_export import export_review_state_json, export_promoted_pack, export_review_ui_data


-def build_parser() -> argparse.ArgumentParser:
+def build_review_parser() -> argparse.ArgumentParser:
    parser = argparse.ArgumentParser(description="Didactopus interactive review workflow scaffold")
    parser.add_argument("--draft-pack", required=True, help="Path to draft pack directory")
    parser.add_argument("--output-dir", default="review-output")
@ -18,8 +20,25 @@ def build_parser() -> argparse.ArgumentParser:
    return parser


-def main() -> None:
-    args = build_parser().parse_args()
+def build_parser() -> argparse.ArgumentParser:
+    """Create the top-level parser with the ``review`` and ``doclift-bundle`` subcommands."""
+    root = argparse.ArgumentParser(description="Didactopus command-line tools")
+    commands = root.add_subparsers(dest="command")
+
+    # Subcommand: review
+    review = commands.add_parser("review", help="Run the interactive review workflow scaffold")
+    review.add_argument("--draft-pack", required=True, help="Path to draft pack directory")
+    review.add_argument("--output-dir", default="review-output")
+    review.add_argument("--config", default="configs/config.example.yaml")
+
+    # Subcommand: doclift-bundle
+    bundle = commands.add_parser("doclift-bundle", help="Generate a draft pack from a doclift bundle")
+    for positional in ("bundle_dir", "pack_dir"):
+        bundle.add_argument(positional)
+    bundle.add_argument("--course-title", required=True)
+    bundle.add_argument("--author", default="doclift bundle import")
+    bundle.add_argument("--license-name", default="See source bundle metadata")
+    return root
+
+
+def run_review_workflow(args: argparse.Namespace) -> None:
    config = load_config(Path(args.config))
    draft = load_draft_pack(args.draft_pack)
    session = ReviewSession(reviewer=config.review.default_reviewer, draft_pack=draft)
@ -53,3 +72,27 @@ def main() -> None:
    print(f"Concepts: {len(session.draft_pack.concepts)}")
    print(f"Ledger entries: {len(session.ledger)}")
    print(f"Output dir: {outdir}")
+
+
+def main() -> None:
+    """Dispatch the ``didactopus`` command line.
+
+    An empty argument list, or one whose first argument starts with ``-``,
+    is parsed by the flag-only review parser and runs the review workflow.
+    Otherwise the arguments are parsed by the subcommand parser and the
+    selected subcommand runs.
+    """
+    import json
+
+    argv = sys.argv[1:]
+    if not argv or argv[0].startswith("-"):
+        args = build_review_parser().parse_args(argv)
+        run_review_workflow(args)
+        return
+
+    # Build the subcommand parser once and reuse it for the help fallback.
+    parser = build_parser()
+    args = parser.parse_args(argv)
+    if args.command == "review":
+        run_review_workflow(args)
+        return
+    if args.command == "doclift-bundle":
+        summary = run_doclift_bundle_demo(
+            bundle_dir=args.bundle_dir,
+            course_title=args.course_title,
+            pack_dir=args.pack_dir,
+            author=args.author,
+            license_name=args.license_name,
+        )
+        # Print JSON rather than the dict repr so the output is
+        # machine-readable.
+        print(json.dumps(summary, indent=2))
+        return
+    parser.print_help()
--- a/tests/fixtures/doclift_bundle_minimal/documents/lecture-1/document.figures.json
+++ b/tests/fixtures/doclift_bundle_minimal/documents/lecture-1/document.figures.json
@ -0,0 +1,9 @@
+{
+  "source_path": "legacy/lecture-1.doc",
+  "figure_references": [
+    {
+      "label": "Figure 1",
+      "caption": "Example figure caption"
+    }
+  ]
+}
--- a/tests/fixtures/doclift_bundle_minimal/documents/lecture-1/document.layout.json
+++ b/tests/fixtures/doclift_bundle_minimal/documents/lecture-1/document.layout.json
@ -0,0 +1,8 @@
+[
+  {
+    "line_index": 0,
+    "text": "Lecture 1. Example",
+    "kind": "heading",
+    "indent": 0
+  }
+]
--- a/tests/fixtures/doclift_bundle_minimal/documents/lecture-1/document.md
+++ b/tests/fixtures/doclift_bundle_minimal/documents/lecture-1/document.md
@ -0,0 +1,9 @@
+# Lecture 1. Example
+
+## Module A
+
+### Lesson A
+
+- Objective: Explain lesson A.
+
+Body text that grounds the example lesson.
--- a/tests/fixtures/doclift_bundle_minimal/documents/lecture-1/document.tables.json
+++ b/tests/fixtures/doclift_bundle_minimal/documents/lecture-1/document.tables.json
@ -0,0 +1,13 @@
+{
+  "source_path": "legacy/lecture-1.doc",
+  "tables": [
+    {
+      "table_id": "table-1",
+      "caption": "Example table",
+      "rows": [
+        ["Column A", "Column B"],
+        ["1", "2"]
+      ]
+    }
+  ]
+}
--- a/tests/fixtures/doclift_bundle_minimal/manifest.json
+++ b/tests/fixtures/doclift_bundle_minimal/manifest.json
@ -0,0 +1,16 @@
+{
+  "documents": [
+    {
+      "document_id": "lecture-1",
+      "title": "Lecture 1. Example",
+      "document_kind": "lecture",
+      "output_dir": "documents/lecture-1",
+      "markdown_path": "documents/lecture-1/document.md",
+      "layout_path": "documents/lecture-1/document.layout.json",
+      "tables_path": "documents/lecture-1/document.tables.json",
+      "figures_path": "documents/lecture-1/document.figures.json",
+      "table_count": 1,
+      "figure_reference_count": 1
+    }
+  ]
+}
--- a/tests/test_doclift_bundle_demo.py
+++ b/tests/test_doclift_bundle_demo.py
@ -0,0 +1,19 @@
+from __future__ import annotations
+
+from pathlib import Path
+
+from didactopus.doclift_bundle_demo import run_doclift_bundle_demo
+
+
+def _fixture_bundle() -> Path:
+    """Return the path of the minimal doclift bundle fixture beside this test module."""
+    tests_dir = Path(__file__).parent
+    return tests_dir.joinpath("fixtures", "doclift_bundle_minimal")
+
+
+def test_doclift_bundle_demo_generates_pack(tmp_path: Path) -> None:
+    """The demo reports one source document and writes the expected pack files."""
+    pack_dir = tmp_path / "pack"
+    summary = run_doclift_bundle_demo(_fixture_bundle(), "Example Course", pack_dir)
+
+    assert summary["source_document_count"] == 1
+    expected_files = (
+        "pack.yaml",
+        "source_corpus.json",
+        "knowledge_graph.json",
+        "doclift_bundle_summary.json",
+    )
+    for filename in expected_files:
+        assert (pack_dir / filename).exists()
--- a/tests/test_main_cli.py
+++ b/tests/test_main_cli.py
@ -0,0 +1,67 @@
+from __future__ import annotations
+
+from pathlib import Path
+
+import didactopus.main as main_module
+
+
+def test_main_doclift_bundle_subcommand(monkeypatch, capsys, tmp_path: Path) -> None:
+    """The ``doclift-bundle`` subcommand forwards its arguments and prints the summary."""
+    seen: dict = {}
+
+    def _fake_run_doclift_bundle_demo(bundle_dir, course_title, pack_dir, author, license_name):
+        # Record what the CLI passed through so it can be asserted below.
+        seen["bundle_dir"] = str(bundle_dir)
+        seen["course_title"] = course_title
+        seen["pack_dir"] = str(pack_dir)
+        seen["author"] = author
+        seen["license_name"] = license_name
+        return {"pack_dir": str(pack_dir), "course_title": course_title}
+
+    cli_args = [
+        "didactopus",
+        "doclift-bundle",
+        str(tmp_path / "bundle"),
+        str(tmp_path / "pack"),
+        "--course-title",
+        "Example Course",
+    ]
+    monkeypatch.setattr(main_module, "run_doclift_bundle_demo", _fake_run_doclift_bundle_demo)
+    monkeypatch.setattr(main_module.sys, "argv", cli_args)
+
+    main_module.main()
+    printed = capsys.readouterr().out
+
+    assert seen["course_title"] == "Example Course"
+    assert "Example Course" in printed
+
+
+def test_main_legacy_review_mode_uses_review_parser(monkeypatch, tmp_path: Path) -> None:
+    """Flag-only invocations (no subcommand) still reach the review workflow."""
+    draft_dir = str(tmp_path / "draft")
+    out_dir = str(tmp_path / "out")
+    observed: dict = {}
+
+    def _fake_run_review_workflow(args):
+        # Capture the parsed namespace values instead of running the workflow.
+        observed["draft_pack"] = args.draft_pack
+        observed["output_dir"] = args.output_dir
+
+    monkeypatch.setattr(main_module, "run_review_workflow", _fake_run_review_workflow)
+    monkeypatch.setattr(
+        main_module.sys,
+        "argv",
+        ["didactopus", "--draft-pack", draft_dir, "--output-dir", out_dir],
+    )
+
+    main_module.main()
+
+    assert observed["draft_pack"] == draft_dir
+    assert observed["output_dir"] == out_dir
--- a/tests/test_topic_ingest.py
+++ b/tests/test_topic_ingest.py
@ -1,5 +1,7 @@
+import json
 from pathlib import Path
 from didactopus.document_adapters import adapt_document
+from didactopus.document_adapters import adapt_documents
 from didactopus.topic_ingest import document_to_course, build_topic_bundle, merge_courses_into_topic_course, extract_concept_candidates


@ -60,3 +62,41 @@ def test_extract_concepts_retains_lessons_but_filters_generic_terms(tmp_path: Pa
    assert "MIT OCW 6.050J Information and Entropy: Syllabus" in titles
    assert "Explain" not in titles
    assert "Channel Capacity" in titles
+
+
+def test_adapt_documents_from_doclift_bundle(tmp_path: Path) -> None:
+    """A directory with a manifest and a documents folder is adapted as a doclift bundle."""
+    bundle = tmp_path / "bundle"
+    doc_dir = bundle / "documents" / "lesson-a"
+    doc_dir.mkdir(parents=True)
+
+    manifest_entry = {
+        "title": "Lecture 1. Example",
+        "document_kind": "lecture",
+        "output_dir": str(doc_dir),
+        "layout_path": str(doc_dir / "document.layout.json"),
+        "tables_path": str(doc_dir / "document.tables.json"),
+        "figures_path": str(doc_dir / "document.figures.json"),
+        "table_count": 1,
+        "figure_reference_count": 0,
+    }
+    (bundle / "manifest.json").write_text(json.dumps({"documents": [manifest_entry]}), encoding="utf-8")
+
+    # Files that make up the single document directory of the bundle.
+    document_files = {
+        "document.md": "# Lecture 1. Example\n\n## Module\n### Lesson A\nBody.",
+        "document.layout.json": "[]",
+        "document.tables.json": json.dumps({"source_path": "/tmp/source.doc", "tables": []}),
+        "document.figures.json": json.dumps({"source_path": "/tmp/source.doc", "figure_references": []}),
+    }
+    for filename, content in document_files.items():
+        (doc_dir / filename).write_text(content, encoding="utf-8")
+
+    docs = adapt_documents(bundle)
+
+    assert len(docs) == 1
+    only_doc = docs[0]
+    assert only_doc.source_type == "doclift_bundle"
+    assert only_doc.title == "Lecture 1. Example"
+    assert only_doc.metadata["document_kind"] == "lecture"
+    assert only_doc.metadata["doclift_bundle"] is True
+    assert only_doc.source_path == "/tmp/source.doc"
Author	SHA1	Message	Date
welsberr	b7e2f9f540	Add stable doclift bundle fixture	2026-04-23 07:23:34 -04:00
welsberr	8e616f6bc6	Wire doclift bundle import into Didactopus CLI	2026-04-23 07:18:15 -04:00
welsberr	8d2b6928a8	Add doclift bundle pack demo	2026-04-22 21:37:09 -04:00
welsberr	bb64c01123	Add doclift bundle document adapter	2026-04-22 21:30:44 -04:00