Keep doclift fixture immutable in tests

Harden doclift bundle workflow and fixtures
2026-04-23 07:26:39 -04:00 · 2026-04-23 07:23:32 -04:00
9 changed files with 105 additions and 34 deletions
--- a/README.md
+++ b/README.md
@ -1,3 +1,8 @@
 # GroundRecall

-GroundRecall is a human-reviewable/AI usable knowledge layer with capabilities to meet or exceed 'llmwiki' with 'v2' specifications, plus an import path for existing llmwiki instances, and integration with Didactopus for review workflows and knowledge merging.
+GroundRecall is a human-reviewable/AI-usable knowledge layer with capabilities to meet or exceed 'llmwiki' with 'v2' specifications, plus an import path for existing llmwiki instances, and integration with Didactopus for review workflows and knowledge merging.
+
+`GroundRecall` can also import normalized `doclift` bundles directly when the
+source material began as legacy office documents and you want a provenance-aware
+knowledge import without going through a learner pack first. See
+`docs/quickstart.md` for the minimal `doclift -> GroundRecall` flow.
--- a/docs/quickstart.md
+++ b/docs/quickstart.md
@ -33,6 +33,21 @@ groundrecall import /path/to/llmwiki --mode grounded

 The importer writes normalized artifacts under `imports/<import-id>/`.

+Import from a normalized `doclift` bundle:
+
+```bash
+groundrecall import /path/to/doclift-bundle --mode quick
+```
+
+This path is intended for legacy-document corpora that were first normalized by
+`doclift`. If you also want a learner-facing pack, build it with Didactopus from the same bundle before importing the bundle:
+
+```bash
+doclift convert-dir /path/to/legacy-course /tmp/doclift-bundle --asset-root /path/to/legacy-course
+didactopus doclift-bundle /tmp/doclift-bundle /tmp/didactopus-pack --course-title "Example Course"
+groundrecall import /tmp/doclift-bundle --mode quick
+```
+
 ## Review And Promote

 Inspect the import outputs:
--- a/src/groundrecall/groundrecall_source_adapters/doclift_bundle.py
+++ b/src/groundrecall/groundrecall_source_adapters/doclift_bundle.py
@ -10,6 +10,14 @@ from .base import DiscoveredImportSource, StructuredImportRows, register_source_
 class DocliftBundleSourceAdapter:
    name = "doclift_bundle"

+    def _resolve_bundle_path(self, base: Path, value: str | Path | None) -> Path:
+        if value is None:
+            return Path()
+        path = Path(value)
+        if path.is_absolute():
+            return path
+        return base / path
+
    def detect(self, root: str | Path) -> bool:
        base = Path(root)
        return (base / "manifest.json").exists() and (base / "documents").exists()
@ -69,10 +77,13 @@ class DocliftBundleSourceAdapter:
        for index, document in enumerate(documents, start=1):
            title = str(document.get("title") or f"Document {index}")
            concept_id = f"concept::{document.get('document_id') or title.lower().replace(' ', '-')}"
-            markdown_path = Path(document.get("markdown_path", ""))
-            relative_markdown = markdown_path.relative_to(base).as_posix() if markdown_path.is_absolute() and markdown_path.exists() and markdown_path.is_relative_to(base) else document.get("markdown_path", "")
+            markdown_path = self._resolve_bundle_path(base, document.get("markdown_path", ""))
+            if markdown_path.exists():
+                relative_markdown = markdown_path.relative_to(base).as_posix()
+            else:
+                relative_markdown = str(document.get("markdown_path", ""))
            artifact_id = artifact_by_path.get(str(relative_markdown), "")
-            figures_path = Path(document.get("figures_path", ""))
+            figures_path = self._resolve_bundle_path(base, document.get("figures_path", ""))
            figure_payload = {}
            if figures_path.exists():
                figure_payload = json.loads(figures_path.read_text(encoding="utf-8"))
--- a/tests/fixtures/doclift_bundle_minimal/documents/lecture-1/document.figures.json
+++ b/tests/fixtures/doclift_bundle_minimal/documents/lecture-1/document.figures.json
@ -0,0 +1,9 @@
+{
+  "source_path": "legacy/lecture-1.doc",
+  "figure_references": [
+    {
+      "label": "Figure 1",
+      "caption": "Example figure caption"
+    }
+  ]
+}
--- a/tests/fixtures/doclift_bundle_minimal/documents/lecture-1/document.layout.json
+++ b/tests/fixtures/doclift_bundle_minimal/documents/lecture-1/document.layout.json
@ -0,0 +1,8 @@
+[
+  {
+    "line_index": 0,
+    "text": "Lecture 1. Example",
+    "kind": "heading",
+    "indent": 0
+  }
+]
--- a/tests/fixtures/doclift_bundle_minimal/documents/lecture-1/document.md
+++ b/tests/fixtures/doclift_bundle_minimal/documents/lecture-1/document.md
@ -0,0 +1,9 @@
+# Lecture 1. Example
+
+## Module A
+
+### Lesson A
+
+- Objective: Explain lesson A.
+
+Body text that grounds the example lesson.
--- a/tests/fixtures/doclift_bundle_minimal/documents/lecture-1/document.tables.json
+++ b/tests/fixtures/doclift_bundle_minimal/documents/lecture-1/document.tables.json
@ -0,0 +1,13 @@
+{
+  "source_path": "legacy/lecture-1.doc",
+  "tables": [
+    {
+      "table_id": "table-1",
+      "caption": "Example table",
+      "rows": [
+        ["Column A", "Column B"],
+        ["1", "2"]
+      ]
+    }
+  ]
+}
--- a/tests/fixtures/doclift_bundle_minimal/manifest.json
+++ b/tests/fixtures/doclift_bundle_minimal/manifest.json
@ -0,0 +1,16 @@
+{
+  "documents": [
+    {
+      "document_id": "lecture-1",
+      "title": "Lecture 1. Example",
+      "document_kind": "lecture",
+      "output_dir": "documents/lecture-1",
+      "markdown_path": "documents/lecture-1/document.md",
+      "layout_path": "documents/lecture-1/document.layout.json",
+      "tables_path": "documents/lecture-1/document.tables.json",
+      "figures_path": "documents/lecture-1/document.figures.json",
+      "table_count": 1,
+      "figure_reference_count": 1
+    }
+  ]
+}
--- a/tests/test_groundrecall_source_adapters.py
+++ b/tests/test_groundrecall_source_adapters.py
@ -1,6 +1,7 @@
 from __future__ import annotations

 from pathlib import Path
+import shutil

 import groundrecall.ingest as ingest_module
 import groundrecall.source_adapters  # noqa: F401
@ -8,6 +9,16 @@ from groundrecall.source_adapters.base import detect_source_adapter, list_source
 from groundrecall.ingest import run_groundrecall_import


+def _fixture_doclift_bundle() -> Path:
+    return Path(__file__).parent / "fixtures" / "doclift_bundle_minimal"
+
+
+def _copied_fixture_doclift_bundle(tmp_path: Path) -> Path:
+    target = tmp_path / "doclift_bundle_minimal"
+    shutil.copytree(_fixture_doclift_bundle(), target)
+    return target
+
+
 def test_groundrecall_source_adapter_registry_lists_expected_adapters() -> None:
    names = set(list_source_adapters())
    assert "llmwiki" in names
@ -33,10 +44,8 @@ def test_detect_didactopus_pack_adapter(tmp_path: Path) -> None:
    assert adapter.import_intent() == "both"


-def test_detect_doclift_bundle_adapter(tmp_path: Path) -> None:
-    (tmp_path / "documents").mkdir()
-    (tmp_path / "manifest.json").write_text('{"documents": []}\n', encoding="utf-8")
-    adapter = detect_source_adapter(tmp_path)
+def test_detect_doclift_bundle_adapter() -> None:
+    adapter = detect_source_adapter(_fixture_doclift_bundle())
    assert adapter.name == "doclift_bundle"
    assert adapter.import_intent() == "both"

@ -202,34 +211,10 @@ def test_didactopus_pack_import_generates_structured_concepts_and_relations(tmp_


 def test_doclift_bundle_import_generates_structured_concepts(tmp_path: Path) -> None:
-    doc_dir = tmp_path / "documents" / "lesson-a"
-    doc_dir.mkdir(parents=True)
-    (tmp_path / "manifest.json").write_text(
-        '\n'.join(
-            [
-                "{",
-                '  "documents": [',
-                "    {",
-                '      "document_id": "lesson-a",',
-                '      "title": "Lecture 1. Example",',
-                '      "document_kind": "lecture",',
-                f'      "output_dir": "{doc_dir}",',
-                f'      "markdown_path": "{doc_dir / "document.md"}",',
-                f'      "figures_path": "{doc_dir / "document.figures.json"}"',
-                "    }",
-                "  ]",
-                "}",
-            ]
-        ),
-        encoding="utf-8",
-    )
-    (doc_dir / "document.md").write_text("# Lecture 1. Example\n\nBody.\n", encoding="utf-8")
-    (doc_dir / "document.figures.json").write_text('{"source_path": "/tmp/source.doc"}\n', encoding="utf-8")
-
-    result = run_groundrecall_import(tmp_path, mode="quick", import_id="doclift-test")
+    result = run_groundrecall_import(_copied_fixture_doclift_bundle(tmp_path), mode="quick", import_id="doclift-test")
    assert result.manifest["source_adapter"] == "doclift_bundle"
    assert result.manifest["import_intent"] == "both"
    concept_ids = {item["concept_id"] for item in result.concepts}
-    assert "concept::lesson-a" in concept_ids
+    assert "concept::lecture-1" in concept_ids
    claim_ids = {item["claim_id"] for item in result.claims}
    assert "clm_doclift_1" in claim_ids
Author	SHA1	Message	Date
welsberr	1731e0006a	Keep doclift fixture immutable in tests	2026-04-23 07:26:39 -04:00
welsberr	76ca54327a	Harden doclift bundle workflow and fixtures	2026-04-23 07:23:32 -04:00