Compare commits

...

2 Commits

Author SHA1 Message Date
welsberr 1731e0006a Keep doclift fixture immutable in tests 2026-04-23 07:26:39 -04:00
welsberr 76ca54327a Harden doclift bundle workflow and fixtures 2026-04-23 07:23:32 -04:00
9 changed files with 105 additions and 34 deletions

View File

@ -1,3 +1,8 @@
# GroundRecall
GroundRecall is a human-reviewable/AI usable knowledge layer with capabilities to meet or exceed 'llmwiki' with 'v2' specifications, plus an import path for existing llmwiki instances, and integration with Didactopus for review workflows and knowledge merging.
GroundRecall is a human-reviewable, AI-usable knowledge layer with capabilities to meet or exceed 'llmwiki' with 'v2' specifications, plus an import path for existing llmwiki instances, and integration with Didactopus for review workflows and knowledge merging.
`GroundRecall` can also import normalized `doclift` bundles directly when the
source material began as legacy office documents and you want a provenance-aware
knowledge import without going through a learner pack first. See
`docs/quickstart.md` for the minimal `doclift -> GroundRecall` flow.

View File

@ -33,6 +33,21 @@ groundrecall import /path/to/llmwiki --mode grounded
The importer writes normalized artifacts under `imports/<import-id>/`.
Import from a normalized `doclift` bundle:
```bash
groundrecall import /path/to/doclift-bundle --mode quick
```
This path is intended for legacy-document corpora that were first normalized by
`doclift`. If you also want a learner-facing pack, generate it with Didactopus
from the same bundle; GroundRecall still imports the `doclift` bundle itself:
```bash
doclift convert-dir /path/to/legacy-course /tmp/doclift-bundle --asset-root /path/to/legacy-course
didactopus doclift-bundle /tmp/doclift-bundle /tmp/didactopus-pack --course-title "Example Course"
groundrecall import /tmp/doclift-bundle --mode quick
```
## Review And Promote
Inspect the import outputs:

View File

@ -10,6 +10,14 @@ from .base import DiscoveredImportSource, StructuredImportRows, register_source_
class DocliftBundleSourceAdapter:
name = "doclift_bundle"
def _resolve_bundle_path(self, base: Path, value: str | Path | None) -> Path:
if value is None:
return Path()
path = Path(value)
if path.is_absolute():
return path
return base / path
def detect(self, root: str | Path) -> bool:
    """Report whether ``root`` has the two entries this adapter looks for.

    Args:
        root: Candidate bundle directory.

    Returns:
        ``True`` only when both ``manifest.json`` and ``documents`` exist
        directly under ``root``.
    """
    bundle_root = Path(root)
    required_entries = ("manifest.json", "documents")
    return all((bundle_root / entry).exists() for entry in required_entries)
@ -69,10 +77,13 @@ class DocliftBundleSourceAdapter:
for index, document in enumerate(documents, start=1):
title = str(document.get("title") or f"Document {index}")
concept_id = f"concept::{document.get('document_id') or title.lower().replace(' ', '-')}"
markdown_path = Path(document.get("markdown_path", ""))
relative_markdown = markdown_path.relative_to(base).as_posix() if markdown_path.is_absolute() and markdown_path.exists() and markdown_path.is_relative_to(base) else document.get("markdown_path", "")
markdown_path = self._resolve_bundle_path(base, document.get("markdown_path", ""))
if markdown_path.exists():
relative_markdown = markdown_path.relative_to(base).as_posix()
else:
relative_markdown = str(document.get("markdown_path", ""))
artifact_id = artifact_by_path.get(str(relative_markdown), "")
figures_path = Path(document.get("figures_path", ""))
figures_path = self._resolve_bundle_path(base, document.get("figures_path", ""))
figure_payload = {}
if figures_path.exists():
figure_payload = json.loads(figures_path.read_text(encoding="utf-8"))

View File

@ -0,0 +1,9 @@
{
"source_path": "legacy/lecture-1.doc",
"figure_references": [
{
"label": "Figure 1",
"caption": "Example figure caption"
}
]
}

View File

@ -0,0 +1,8 @@
[
{
"line_index": 0,
"text": "Lecture 1. Example",
"kind": "heading",
"indent": 0
}
]

View File

@ -0,0 +1,9 @@
# Lecture 1. Example
## Module A
### Lesson A
- Objective: Explain lesson A.
Body text that grounds the example lesson.

View File

@ -0,0 +1,13 @@
{
"source_path": "legacy/lecture-1.doc",
"tables": [
{
"table_id": "table-1",
"caption": "Example table",
"rows": [
["Column A", "Column B"],
["1", "2"]
]
}
]
}

View File

@ -0,0 +1,16 @@
{
"documents": [
{
"document_id": "lecture-1",
"title": "Lecture 1. Example",
"document_kind": "lecture",
"output_dir": "documents/lecture-1",
"markdown_path": "documents/lecture-1/document.md",
"layout_path": "documents/lecture-1/document.layout.json",
"tables_path": "documents/lecture-1/document.tables.json",
"figures_path": "documents/lecture-1/document.figures.json",
"table_count": 1,
"figure_reference_count": 1
}
]
}

View File

@ -1,6 +1,7 @@
from __future__ import annotations
from pathlib import Path
import shutil
import groundrecall.ingest as ingest_module
import groundrecall.source_adapters # noqa: F401
@ -8,6 +9,16 @@ from groundrecall.source_adapters.base import detect_source_adapter, list_source
from groundrecall.ingest import run_groundrecall_import
def _fixture_doclift_bundle() -> Path:
return Path(__file__).parent / "fixtures" / "doclift_bundle_minimal"
def _copied_fixture_doclift_bundle(tmp_path: Path) -> Path:
    """Copy the minimal doclift fixture into ``tmp_path`` and return the copy.

    Tests that run an import work on this copy, leaving the fixture directory
    itself unmodified.
    """
    destination = tmp_path / "doclift_bundle_minimal"
    shutil.copytree(src=_fixture_doclift_bundle(), dst=destination)
    return destination
def test_groundrecall_source_adapter_registry_lists_expected_adapters() -> None:
names = set(list_source_adapters())
assert "llmwiki" in names
@ -33,10 +44,8 @@ def test_detect_didactopus_pack_adapter(tmp_path: Path) -> None:
assert adapter.import_intent() == "both"
def test_detect_doclift_bundle_adapter(tmp_path: Path) -> None:
(tmp_path / "documents").mkdir()
(tmp_path / "manifest.json").write_text('{"documents": []}\n', encoding="utf-8")
adapter = detect_source_adapter(tmp_path)
def test_detect_doclift_bundle_adapter() -> None:
    """Adapter detection picks the doclift adapter for the minimal fixture."""
    bundle_root = _fixture_doclift_bundle()
    detected = detect_source_adapter(bundle_root)
    assert detected.name == "doclift_bundle"
    assert detected.import_intent() == "both"
@ -202,34 +211,10 @@ def test_didactopus_pack_import_generates_structured_concepts_and_relations(tmp_
def test_doclift_bundle_import_generates_structured_concepts(tmp_path: Path) -> None:
doc_dir = tmp_path / "documents" / "lesson-a"
doc_dir.mkdir(parents=True)
(tmp_path / "manifest.json").write_text(
'\n'.join(
[
"{",
' "documents": [',
" {",
' "document_id": "lesson-a",',
' "title": "Lecture 1. Example",',
' "document_kind": "lecture",',
f' "output_dir": "{doc_dir}",',
f' "markdown_path": "{doc_dir / "document.md"}",',
f' "figures_path": "{doc_dir / "document.figures.json"}"',
" }",
" ]",
"}",
]
),
encoding="utf-8",
)
(doc_dir / "document.md").write_text("# Lecture 1. Example\n\nBody.\n", encoding="utf-8")
(doc_dir / "document.figures.json").write_text('{"source_path": "/tmp/source.doc"}\n', encoding="utf-8")
result = run_groundrecall_import(tmp_path, mode="quick", import_id="doclift-test")
result = run_groundrecall_import(_copied_fixture_doclift_bundle(tmp_path), mode="quick", import_id="doclift-test")
assert result.manifest["source_adapter"] == "doclift_bundle"
assert result.manifest["import_intent"] == "both"
concept_ids = {item["concept_id"] for item in result.concepts}
assert "concept::lesson-a" in concept_ids
assert "concept::lecture-1" in concept_ids
claim_ids = {item["claim_id"] for item in result.claims}
assert "clm_doclift_1" in claim_ids