Harden doclift bundle workflow and fixtures

This commit is contained in:
welsberr 2026-04-23 07:23:32 -04:00
parent e819f17607
commit 76ca54327a
19 changed files with 509 additions and 35 deletions

View File

@ -1,3 +1,8 @@
# GroundRecall # GroundRecall
GroundRecall is a human-reviewable/AI usable knowledge layer with capabilities to meet or exceed 'llmwiki' with 'v2' specifications, plus an import path for existing llmwiki instances, and integration with Didactopus for review workflows and knowledge merging. GroundRecall is a human-reviewable/AI usable knowledge layer with capabilities to meet or exceed 'llmwiki' with 'v2' specifications, plus an import path for existing llmwiki instances, and integration with Didactopus for review workflows and knowledge merging.
`GroundRecall` can also import normalized `doclift` bundles directly when the
source material began as legacy office documents and you want a provenance-aware
knowledge import without going through a learner pack first. See
`docs/quickstart.md` for the minimal `doclift -> GroundRecall` flow.

View File

@ -33,6 +33,21 @@ groundrecall import /path/to/llmwiki --mode grounded
The importer writes normalized artifacts under `imports/<import-id>/`. The importer writes normalized artifacts under `imports/<import-id>/`.
Import from a normalized `doclift` bundle:
```bash
groundrecall import /path/to/doclift-bundle --mode quick
```
This path is intended for legacy-document corpora that were first normalized by
`doclift`. If you also want a learner-facing pack, build it with Didactopus from the same bundle before running the GroundRecall import:
```bash
doclift convert-dir /path/to/legacy-course /tmp/doclift-bundle --asset-root /path/to/legacy-course
didactopus doclift-bundle /tmp/doclift-bundle /tmp/didactopus-pack --course-title "Example Course"
groundrecall import /tmp/doclift-bundle --mode quick
```
## Review And Promote ## Review And Promote
Inspect the import outputs: Inspect the import outputs:

View File

@ -10,6 +10,14 @@ from .base import DiscoveredImportSource, StructuredImportRows, register_source_
class DocliftBundleSourceAdapter: class DocliftBundleSourceAdapter:
name = "doclift_bundle" name = "doclift_bundle"
def _resolve_bundle_path(self, base: Path, value: str | Path | None) -> Path:
if value is None:
return Path()
path = Path(value)
if path.is_absolute():
return path
return base / path
def detect(self, root: str | Path) -> bool: def detect(self, root: str | Path) -> bool:
base = Path(root) base = Path(root)
return (base / "manifest.json").exists() and (base / "documents").exists() return (base / "manifest.json").exists() and (base / "documents").exists()
@ -69,10 +77,13 @@ class DocliftBundleSourceAdapter:
for index, document in enumerate(documents, start=1): for index, document in enumerate(documents, start=1):
title = str(document.get("title") or f"Document {index}") title = str(document.get("title") or f"Document {index}")
concept_id = f"concept::{document.get('document_id') or title.lower().replace(' ', '-')}" concept_id = f"concept::{document.get('document_id') or title.lower().replace(' ', '-')}"
markdown_path = Path(document.get("markdown_path", "")) markdown_path = self._resolve_bundle_path(base, document.get("markdown_path", ""))
relative_markdown = markdown_path.relative_to(base).as_posix() if markdown_path.is_absolute() and markdown_path.exists() and markdown_path.is_relative_to(base) else document.get("markdown_path", "") if markdown_path.exists():
relative_markdown = markdown_path.relative_to(base).as_posix()
else:
relative_markdown = str(document.get("markdown_path", ""))
artifact_id = artifact_by_path.get(str(relative_markdown), "") artifact_id = artifact_by_path.get(str(relative_markdown), "")
figures_path = Path(document.get("figures_path", "")) figures_path = self._resolve_bundle_path(base, document.get("figures_path", ""))
figure_payload = {} figure_payload = {}
if figures_path.exists(): if figures_path.exists():
figure_payload = json.loads(figures_path.read_text(encoding="utf-8")) figure_payload = json.loads(figures_path.read_text(encoding="utf-8"))

View File

@ -0,0 +1,9 @@
{
"source_path": "legacy/lecture-1.doc",
"figure_references": [
{
"label": "Figure 1",
"caption": "Example figure caption"
}
]
}

View File

@ -0,0 +1,8 @@
[
{
"line_index": 0,
"text": "Lecture 1. Example",
"kind": "heading",
"indent": 0
}
]

View File

@ -0,0 +1,9 @@
# Lecture 1. Example
## Module A
### Lesson A
- Objective: Explain lesson A.
Body text that grounds the example lesson.

View File

@ -0,0 +1,13 @@
{
"source_path": "legacy/lecture-1.doc",
"tables": [
{
"table_id": "table-1",
"caption": "Example table",
"rows": [
["Column A", "Column B"],
["1", "2"]
]
}
]
}

View File

@ -0,0 +1,5 @@
{"artifact_id": "ia_af72cb1641f3", "artifact_kind": "doclift_bundle_artifact", "created_at": "2026-04-23T11:22:39Z", "current_status": "draft", "import_id": "doclift-test", "metadata": {"source_kind": "doclift_bundle"}, "path": "documents/lecture-1/document.figures.json", "sha256": "f1c6970942981c53761360effdb5e5b590dcf7f0172839d37b636af96c19dadd", "title": "document.figures"}
{"artifact_id": "ia_6cc5265d52f6", "artifact_kind": "doclift_bundle_artifact", "created_at": "2026-04-23T11:22:39Z", "current_status": "draft", "import_id": "doclift-test", "metadata": {"source_kind": "doclift_bundle"}, "path": "documents/lecture-1/document.layout.json", "sha256": "9883a8c3bb6acae5295eaf51ae3308f83c8ec4452bb4279b7d370e0ebd5706b3", "title": "document.layout"}
{"artifact_id": "ia_51bdebab22e6", "artifact_kind": "doclift_bundle_artifact", "created_at": "2026-04-23T11:22:39Z", "current_status": "draft", "import_id": "doclift-test", "metadata": {"source_kind": "doclift_bundle"}, "path": "documents/lecture-1/document.md", "sha256": "bac0c576c657e5a79a484aa7ec1aee193742ff2627f8f7b100f62530ee1c991d", "title": "document"}
{"artifact_id": "ia_893c59d73929", "artifact_kind": "doclift_bundle_artifact", "created_at": "2026-04-23T11:22:39Z", "current_status": "draft", "import_id": "doclift-test", "metadata": {"source_kind": "doclift_bundle"}, "path": "documents/lecture-1/document.tables.json", "sha256": "a1eda53d353a7be08b3a1d55571c80f29be8fa157ab770fccc22fe3db6053fde", "title": "document.tables"}
{"artifact_id": "ia_ffa5b716b5a5", "artifact_kind": "doclift_bundle_artifact", "created_at": "2026-04-23T11:22:39Z", "current_status": "draft", "import_id": "doclift-test", "metadata": {"source_kind": "doclift_bundle"}, "path": "manifest.json", "sha256": "3810d72b9858e1eb69e981759a3901defb75776744ba50f73f426860f05b9b5a", "title": "manifest"}

View File

@ -0,0 +1 @@
{"claim_id": "clm_doclift_1", "claim_kind": "summary", "claim_text": "Lecture 1. Example is a lecture in the imported doclift bundle.", "concept_ids": ["concept::lecture-1"], "confidence_hint": 0.85, "contradicts_claim_ids": [], "current_status": "triaged", "grounding_status": "grounded", "import_id": "doclift-test", "source_observation_ids": ["obs_doclift_1"], "supersedes_claim_ids": [], "supporting_fragment_ids": []}

View File

@ -0,0 +1 @@
{"aliases": [], "concept_id": "concept::lecture-1", "current_status": "triaged", "description": "Imported from doclift bundle document kind 'lecture'.", "import_id": "doclift-test", "source_artifact_ids": ["ia_51bdebab22e6"], "title": "Lecture 1. Example"}

View File

@ -0,0 +1,14 @@
{
"import_id": "doclift-test",
"import_mode": "quick",
"summary": {
"artifact_count": 5,
"observation_count": 1,
"claim_count": 1,
"concept_count": 1,
"relation_count": 0,
"error_count": 0,
"warning_count": 0
},
"findings": []
}

View File

@ -0,0 +1,16 @@
{
"import_id": "doclift-test",
"import_mode": "quick",
"machine_id": "nerdanel",
"agent_id": "groundrecall.ingest",
"source_root": "/home/netuser/dev/GroundRecall/tests/fixtures/doclift_bundle_minimal",
"imported_at": "2026-04-23T11:22:39Z",
"source_repo_kind": "llmwiki",
"source_adapter": "doclift_bundle",
"import_intent": "both",
"artifact_count": 5,
"observation_count": 1,
"claim_count": 1,
"concept_count": 1,
"relation_count": 0
}

View File

@ -0,0 +1 @@
{"artifact_id": "ia_51bdebab22e6", "confidence_hint": 0.85, "current_status": "draft", "grounding_status": "grounded", "import_id": "doclift-test", "line_end": 0, "line_start": 0, "observation_id": "obs_doclift_1", "origin_path": "documents/lecture-1/document.md", "origin_section": "Lecture 1. Example", "role": "summary", "source_url": "legacy/lecture-1.doc", "support_kind": "direct_source", "text": "Lecture 1. Example"}

View File

@ -0,0 +1,316 @@
{
"reviewer": "GroundRecall Import",
"draft_pack": {
"pack": {
"name": "groundrecall-import-doclift-test",
"display_name": "GroundRecall Import doclift-test",
"version": "0.1.0-draft",
"source_import_id": "doclift-test",
"source_root": "/home/netuser/dev/GroundRecall/tests/fixtures/doclift_bundle_minimal"
},
"concepts": [
{
"concept_id": "lecture-1",
"title": "Lecture 1. Example",
"description": "Imported from doclift bundle document kind 'lecture'.",
"prerequisites": [],
"mastery_signals": [],
"status": "provisional",
"notes": [
"Claim: Lecture 1. Example is a lecture in the imported doclift bundle. [grounded]"
]
}
],
"conflicts": [],
"review_flags": [],
"attribution": {
"source_repo_kind": "llmwiki",
"source_root": "/home/netuser/dev/GroundRecall/tests/fixtures/doclift_bundle_minimal",
"imported_at": "2026-04-23T11:22:39Z",
"machine_id": "nerdanel",
"rights_note": "Imported llmwiki-style corpus requires review before promotion."
}
},
"citation_reviews": [],
"ledger": [],
"import_context": {
"manifest": {
"import_id": "doclift-test",
"import_mode": "quick",
"machine_id": "nerdanel",
"agent_id": "groundrecall.ingest",
"source_root": "/home/netuser/dev/GroundRecall/tests/fixtures/doclift_bundle_minimal",
"imported_at": "2026-04-23T11:22:39Z",
"source_repo_kind": "llmwiki",
"source_adapter": "doclift_bundle",
"import_intent": "both",
"artifact_count": 5,
"observation_count": 1,
"claim_count": 1,
"concept_count": 1,
"relation_count": 0
},
"lint_summary": {
"artifact_count": 5,
"observation_count": 1,
"claim_count": 1,
"concept_count": 1,
"relation_count": 0,
"error_count": 0,
"warning_count": 0
},
"queue_length": 1,
"source_adapter": "doclift_bundle"
},
"review_guidance": {
"overview": "Review concepts first, then inspect representative claims and their source observations before promotion.",
"priorities": [
"Focus reviewer effort on concepts with strong grounded claims and explicit citations first.",
"Downgrade or reject concepts whose claims are fragmented, duplicated, or missing meaningful support.",
"For academic material, citation-bearing claims deserve special scrutiny for fit, contradiction, and fabrication risk."
],
"citation_guidance": [
"A citation key or extracted reference is evidence of traceability, not correctness.",
"Check whether the cited work actually supports the claim and whether the claim overstates it.",
"Use the citation track to prioritize claims that can move into a separate citation-ingestion workflow."
]
},
"field_specs": [
{
"field": "status",
"label": "Review status",
"input": "select",
"required": true,
"options": [
{
"value": "trusted",
"label": "Trusted",
"help": "Promote this concept and its supported claims when the evidence and wording are ready."
},
{
"value": "provisional",
"label": "Provisional",
"help": "Keep this concept in reviewed state when it is promising but still needs citation or wording cleanup."
},
{
"value": "needs_review",
"label": "Needs Review",
"help": "Leave undecided when support, scope, or concept boundaries are still unclear."
},
{
"value": "rejected",
"label": "Rejected",
"help": "Exclude this concept when it is noise, unsupported, duplicated, or misleading."
}
]
},
{
"field": "description",
"label": "Concept description",
"input": "text",
"required": false,
"help": "Refine the concept summary to match the strongest supported interpretation."
},
{
"field": "notes",
"label": "Reviewer notes",
"input": "textarea",
"required": false,
"help": "Record why this concept is trusted, provisional, rejected, or still unclear."
},
{
"field": "prerequisites",
"label": "Prerequisites",
"input": "textarea",
"required": false,
"help": "List prerequisite concepts only when the manuscript support is explicit or defensible."
}
],
"citation_field_specs": [
{
"field": "status",
"label": "Citation review status",
"input": "select",
"required": true,
"options": [
{
"value": "unreviewed",
"label": "Unreviewed",
"help": "Keep this citation candidate in triage until fit and existence are checked."
},
{
"value": "verified",
"label": "Verified",
"help": "The cited work exists and materially supports the associated manuscript claim."
},
{
"value": "needs_source_check",
"label": "Needs Source Check",
"help": "The citation may be useful but still needs direct source inspection or metadata cleanup."
},
{
"value": "misleading",
"label": "Misleading",
"help": "The citation exists but overstates, contradicts, or poorly fits the claim."
},
{
"value": "irrelevant",
"label": "Irrelevant",
"help": "The citation does not materially support the concept or claim under review."
},
{
"value": "fabricated",
"label": "Fabricated",
"help": "The citation appears invented, malformed, or otherwise not real."
}
]
},
{
"field": "notes",
"label": "Citation notes",
"input": "textarea",
"required": false,
"help": "Record whether the cited work exists, fits the claim, or should move into a dedicated citation-ingestion lane."
}
],
"concept_reviews": [
{
"concept_id": "lecture-1",
"title": "Lecture 1. Example",
"status": "provisional",
"description": "Imported from doclift bundle document kind 'lecture'.",
"review_help": "Prefer `trusted` when claims are coherent and citation-bearing support is appropriate; prefer `provisional` when the concept is plausible but still needs citation or wording cleanup.",
"claim_count": 1,
"grounded_claim_count": 1,
"warning_count": 0,
"has_citation_support": false,
"top_claims": [
{
"claim_id": "clm_doclift_1",
"claim_text": "Lecture 1. Example is a lecture in the imported doclift bundle.",
"claim_kind": "summary",
"grounding_status": "grounded",
"supporting_observations": [
{
"observation_id": "obs_doclift_1",
"origin_path": "documents/lecture-1/document.md",
"origin_section": "Lecture 1. Example",
"text": "Lecture 1. Example",
"line_start": 0,
"line_end": 0
}
],
"citation_support": [
{
"citation_key_count": 0,
"extracted_reference_count": 0,
"has_citation_support": false
}
],
"artifact_paths": [
"documents/lecture-1/document.md"
],
"finding_messages": []
}
],
"notes": [
"Claim: Lecture 1. Example is a lecture in the imported doclift bundle. [grounded]"
]
}
],
"bibliography": {
"enabled": false,
"entry_count": 0,
"source_files": []
},
"citations": {
"enabled": true,
"provider": "citegeist",
"artifacts": [
{
"artifact_id": "ia_af72cb1641f3",
"path": "documents/lecture-1/document.figures.json",
"title": "document.figures",
"citation_keys": [],
"resolved_entries": [],
"citation_key_count": 0,
"extracted_references": [],
"extracted_reference_count": 0,
"citegeist_backends": [
"anystyle",
"grobid",
"heuristic"
]
},
{
"artifact_id": "ia_6cc5265d52f6",
"path": "documents/lecture-1/document.layout.json",
"title": "document.layout",
"citation_keys": [],
"resolved_entries": [],
"citation_key_count": 0,
"extracted_references": [],
"extracted_reference_count": 0,
"citegeist_backends": [
"anystyle",
"grobid",
"heuristic"
]
},
{
"artifact_id": "ia_51bdebab22e6",
"path": "documents/lecture-1/document.md",
"title": "document",
"citation_keys": [],
"resolved_entries": [],
"citation_key_count": 0,
"extracted_references": [],
"extracted_reference_count": 0,
"citegeist_backends": [
"anystyle",
"grobid",
"heuristic"
]
},
{
"artifact_id": "ia_893c59d73929",
"path": "documents/lecture-1/document.tables.json",
"title": "document.tables",
"citation_keys": [],
"resolved_entries": [],
"citation_key_count": 0,
"extracted_references": [],
"extracted_reference_count": 0,
"citegeist_backends": [
"anystyle",
"grobid",
"heuristic"
]
},
{
"artifact_id": "ia_ffa5b716b5a5",
"path": "manifest.json",
"title": "manifest",
"citation_keys": [],
"resolved_entries": [],
"citation_key_count": 0,
"extracted_references": [],
"extracted_reference_count": 0,
"citegeist_backends": [
"anystyle",
"grobid",
"heuristic"
]
}
],
"summary": {
"artifact_count_with_citations": 0,
"citation_key_total": 0,
"extracted_reference_total": 0
},
"next_actions": [
"Promote citation-bearing claims into a dedicated citation review lane.",
"Use CiteGeist extraction as a first pass, then verify support and metadata before trusting the citation."
]
}
}

View File

@ -0,0 +1,20 @@
{
"import_id": "doclift-test",
"queue_length": 1,
"items": [
{
"queue_id": "rq_clm_doclift_1",
"candidate_type": "claim",
"candidate_id": "clm_doclift_1",
"title": "Lecture 1. Example is a lecture in the imported doclift bundle.",
"triage_lane": "knowledge_capture",
"priority": 35,
"grounding_status": "grounded",
"status": "needs_review",
"finding_codes": [],
"concept_ids": [
"concept::lecture-1"
]
}
]
}

View File

@ -0,0 +1,36 @@
{
"reviewer": "GroundRecall Import",
"draft_pack": {
"pack": {
"name": "groundrecall-import-doclift-test",
"display_name": "GroundRecall Import doclift-test",
"version": "0.1.0-draft",
"source_import_id": "doclift-test",
"source_root": "/home/netuser/dev/GroundRecall/tests/fixtures/doclift_bundle_minimal"
},
"concepts": [
{
"concept_id": "lecture-1",
"title": "Lecture 1. Example",
"description": "Imported from doclift bundle document kind 'lecture'.",
"prerequisites": [],
"mastery_signals": [],
"status": "provisional",
"notes": [
"Claim: Lecture 1. Example is a lecture in the imported doclift bundle. [grounded]"
]
}
],
"conflicts": [],
"review_flags": [],
"attribution": {
"source_repo_kind": "llmwiki",
"source_root": "/home/netuser/dev/GroundRecall/tests/fixtures/doclift_bundle_minimal",
"imported_at": "2026-04-23T11:22:39Z",
"machine_id": "nerdanel",
"rights_note": "Imported llmwiki-style corpus requires review before promotion."
}
},
"citation_reviews": [],
"ledger": []
}

View File

@ -0,0 +1,16 @@
{
"documents": [
{
"document_id": "lecture-1",
"title": "Lecture 1. Example",
"document_kind": "lecture",
"output_dir": "documents/lecture-1",
"markdown_path": "documents/lecture-1/document.md",
"layout_path": "documents/lecture-1/document.layout.json",
"tables_path": "documents/lecture-1/document.tables.json",
"figures_path": "documents/lecture-1/document.figures.json",
"table_count": 1,
"figure_reference_count": 1
}
]
}

View File

@ -8,6 +8,10 @@ from groundrecall.source_adapters.base import detect_source_adapter, list_source
from groundrecall.ingest import run_groundrecall_import from groundrecall.ingest import run_groundrecall_import
def _fixture_doclift_bundle() -> Path:
return Path(__file__).parent / "fixtures" / "doclift_bundle_minimal"
def test_groundrecall_source_adapter_registry_lists_expected_adapters() -> None: def test_groundrecall_source_adapter_registry_lists_expected_adapters() -> None:
names = set(list_source_adapters()) names = set(list_source_adapters())
assert "llmwiki" in names assert "llmwiki" in names
@ -33,10 +37,8 @@ def test_detect_didactopus_pack_adapter(tmp_path: Path) -> None:
assert adapter.import_intent() == "both" assert adapter.import_intent() == "both"
def test_detect_doclift_bundle_adapter(tmp_path: Path) -> None: def test_detect_doclift_bundle_adapter() -> None:
(tmp_path / "documents").mkdir() adapter = detect_source_adapter(_fixture_doclift_bundle())
(tmp_path / "manifest.json").write_text('{"documents": []}\n', encoding="utf-8")
adapter = detect_source_adapter(tmp_path)
assert adapter.name == "doclift_bundle" assert adapter.name == "doclift_bundle"
assert adapter.import_intent() == "both" assert adapter.import_intent() == "both"
@ -201,35 +203,11 @@ def test_didactopus_pack_import_generates_structured_concepts_and_relations(tmp_
assert "clm_stage_stage1_basics" in claim_ids assert "clm_stage_stage1_basics" in claim_ids
def test_doclift_bundle_import_generates_structured_concepts(tmp_path: Path) -> None: def test_doclift_bundle_import_generates_structured_concepts() -> None:
doc_dir = tmp_path / "documents" / "lesson-a" result = run_groundrecall_import(_fixture_doclift_bundle(), mode="quick", import_id="doclift-test")
doc_dir.mkdir(parents=True)
(tmp_path / "manifest.json").write_text(
'\n'.join(
[
"{",
' "documents": [',
" {",
' "document_id": "lesson-a",',
' "title": "Lecture 1. Example",',
' "document_kind": "lecture",',
f' "output_dir": "{doc_dir}",',
f' "markdown_path": "{doc_dir / "document.md"}",',
f' "figures_path": "{doc_dir / "document.figures.json"}"',
" }",
" ]",
"}",
]
),
encoding="utf-8",
)
(doc_dir / "document.md").write_text("# Lecture 1. Example\n\nBody.\n", encoding="utf-8")
(doc_dir / "document.figures.json").write_text('{"source_path": "/tmp/source.doc"}\n', encoding="utf-8")
result = run_groundrecall_import(tmp_path, mode="quick", import_id="doclift-test")
assert result.manifest["source_adapter"] == "doclift_bundle" assert result.manifest["source_adapter"] == "doclift_bundle"
assert result.manifest["import_intent"] == "both" assert result.manifest["import_intent"] == "both"
concept_ids = {item["concept_id"] for item in result.concepts} concept_ids = {item["concept_id"] for item in result.concepts}
assert "concept::lesson-a" in concept_ids assert "concept::lecture-1" in concept_ids
claim_ids = {item["claim_id"] for item in result.claims} claim_ids = {item["claim_id"] for item in result.claims}
assert "clm_doclift_1" in claim_ids assert "clm_doclift_1" in claim_ids