diff --git a/README.md b/README.md index ece24eb..41850e2 100644 --- a/README.md +++ b/README.md @@ -1,3 +1,8 @@ # GroundRecall -GroundRecall is a human-reviewable/AI usable knowledge layer with capabilities to meet or exceed 'llmwiki' with 'v2' specifications, plus an import path for existing llmwiki instances, and integration with Didactopus for review workflows and knowledge merging. \ No newline at end of file +GroundRecall is a human-reviewable/AI usable knowledge layer with capabilities to meet or exceed 'llmwiki' with 'v2' specifications, plus an import path for existing llmwiki instances, and integration with Didactopus for review workflows and knowledge merging. + +`GroundRecall` can also import normalized `doclift` bundles directly when the +source material began as legacy office documents and you want a provenance-aware +knowledge import without going through a learner pack first. See +`docs/quickstart.md` for the minimal `doclift -> GroundRecall` flow. diff --git a/docs/quickstart.md b/docs/quickstart.md index 5641c33..b9a997a 100644 --- a/docs/quickstart.md +++ b/docs/quickstart.md @@ -33,6 +33,21 @@ groundrecall import /path/to/llmwiki --mode grounded The importer writes normalized artifacts under `imports/<import_id>/`. +Import from a normalized `doclift` bundle: + +```bash +groundrecall import /path/to/doclift-bundle --mode quick +``` + +This path is intended for legacy-document corpora that were first normalized by +`doclift`.
If you want a learner-facing pack first, use Didactopus in between and import the resulting pack: + +```bash +doclift convert-dir /path/to/legacy-course /tmp/doclift-bundle --asset-root /path/to/legacy-course +didactopus doclift-bundle /tmp/doclift-bundle /tmp/didactopus-pack --course-title "Example Course" +groundrecall import /tmp/didactopus-pack --mode quick +``` + ## Review And Promote Inspect the import outputs: diff --git a/src/groundrecall/groundrecall_source_adapters/doclift_bundle.py b/src/groundrecall/groundrecall_source_adapters/doclift_bundle.py index c15530f..04e6b91 100755 --- a/src/groundrecall/groundrecall_source_adapters/doclift_bundle.py +++ b/src/groundrecall/groundrecall_source_adapters/doclift_bundle.py @@ -10,6 +10,14 @@ from .base import DiscoveredImportSource, StructuredImportRows, register_source_ class DocliftBundleSourceAdapter: name = "doclift_bundle" + def _resolve_bundle_path(self, base: Path, value: str | Path | None) -> Path: + if value is None: + return Path() + path = Path(value) + if path.is_absolute(): + return path + return base / path + def detect(self, root: str | Path) -> bool: base = Path(root) return (base / "manifest.json").exists() and (base / "documents").exists() @@ -69,10 +77,13 @@ class DocliftBundleSourceAdapter: for index, document in enumerate(documents, start=1): title = str(document.get("title") or f"Document {index}") concept_id = f"concept::{document.get('document_id') or title.lower().replace(' ', '-')}" - markdown_path = Path(document.get("markdown_path", "")) - relative_markdown = markdown_path.relative_to(base).as_posix() if markdown_path.is_absolute() and markdown_path.exists() and markdown_path.is_relative_to(base) else document.get("markdown_path", "") + markdown_path = self._resolve_bundle_path(base, document.get("markdown_path", "")) + if markdown_path.exists(): + relative_markdown = markdown_path.relative_to(base).as_posix() + else: + relative_markdown = str(document.get("markdown_path", "")) artifact_id = 
artifact_by_path.get(str(relative_markdown), "") - figures_path = Path(document.get("figures_path", "")) + figures_path = self._resolve_bundle_path(base, document.get("figures_path", "")) figure_payload = {} if figures_path.exists(): figure_payload = json.loads(figures_path.read_text(encoding="utf-8")) diff --git a/tests/fixtures/doclift_bundle_minimal/documents/lecture-1/document.figures.json b/tests/fixtures/doclift_bundle_minimal/documents/lecture-1/document.figures.json new file mode 100755 index 0000000..c55670f --- /dev/null +++ b/tests/fixtures/doclift_bundle_minimal/documents/lecture-1/document.figures.json @@ -0,0 +1,9 @@ +{ + "source_path": "legacy/lecture-1.doc", + "figure_references": [ + { + "label": "Figure 1", + "caption": "Example figure caption" + } + ] +} diff --git a/tests/fixtures/doclift_bundle_minimal/documents/lecture-1/document.layout.json b/tests/fixtures/doclift_bundle_minimal/documents/lecture-1/document.layout.json new file mode 100755 index 0000000..e777268 --- /dev/null +++ b/tests/fixtures/doclift_bundle_minimal/documents/lecture-1/document.layout.json @@ -0,0 +1,8 @@ +[ + { + "line_index": 0, + "text": "Lecture 1. Example", + "kind": "heading", + "indent": 0 + } +] diff --git a/tests/fixtures/doclift_bundle_minimal/documents/lecture-1/document.md b/tests/fixtures/doclift_bundle_minimal/documents/lecture-1/document.md new file mode 100755 index 0000000..421282c --- /dev/null +++ b/tests/fixtures/doclift_bundle_minimal/documents/lecture-1/document.md @@ -0,0 +1,9 @@ +# Lecture 1. Example + +## Module A + +### Lesson A + +- Objective: Explain lesson A. + +Body text that grounds the example lesson. 
diff --git a/tests/fixtures/doclift_bundle_minimal/documents/lecture-1/document.tables.json b/tests/fixtures/doclift_bundle_minimal/documents/lecture-1/document.tables.json new file mode 100755 index 0000000..2fdf31d --- /dev/null +++ b/tests/fixtures/doclift_bundle_minimal/documents/lecture-1/document.tables.json @@ -0,0 +1,13 @@ +{ + "source_path": "legacy/lecture-1.doc", + "tables": [ + { + "table_id": "table-1", + "caption": "Example table", + "rows": [ + ["Column A", "Column B"], + ["1", "2"] + ] + } + ] +} diff --git a/tests/fixtures/doclift_bundle_minimal/imports/doclift-test/artifacts.jsonl b/tests/fixtures/doclift_bundle_minimal/imports/doclift-test/artifacts.jsonl new file mode 100644 index 0000000..033ad87 --- /dev/null +++ b/tests/fixtures/doclift_bundle_minimal/imports/doclift-test/artifacts.jsonl @@ -0,0 +1,5 @@ +{"artifact_id": "ia_af72cb1641f3", "artifact_kind": "doclift_bundle_artifact", "created_at": "2026-04-23T11:22:39Z", "current_status": "draft", "import_id": "doclift-test", "metadata": {"source_kind": "doclift_bundle"}, "path": "documents/lecture-1/document.figures.json", "sha256": "f1c6970942981c53761360effdb5e5b590dcf7f0172839d37b636af96c19dadd", "title": "document.figures"} +{"artifact_id": "ia_6cc5265d52f6", "artifact_kind": "doclift_bundle_artifact", "created_at": "2026-04-23T11:22:39Z", "current_status": "draft", "import_id": "doclift-test", "metadata": {"source_kind": "doclift_bundle"}, "path": "documents/lecture-1/document.layout.json", "sha256": "9883a8c3bb6acae5295eaf51ae3308f83c8ec4452bb4279b7d370e0ebd5706b3", "title": "document.layout"} +{"artifact_id": "ia_51bdebab22e6", "artifact_kind": "doclift_bundle_artifact", "created_at": "2026-04-23T11:22:39Z", "current_status": "draft", "import_id": "doclift-test", "metadata": {"source_kind": "doclift_bundle"}, "path": "documents/lecture-1/document.md", "sha256": "bac0c576c657e5a79a484aa7ec1aee193742ff2627f8f7b100f62530ee1c991d", "title": "document"} +{"artifact_id": "ia_893c59d73929", 
"artifact_kind": "doclift_bundle_artifact", "created_at": "2026-04-23T11:22:39Z", "current_status": "draft", "import_id": "doclift-test", "metadata": {"source_kind": "doclift_bundle"}, "path": "documents/lecture-1/document.tables.json", "sha256": "a1eda53d353a7be08b3a1d55571c80f29be8fa157ab770fccc22fe3db6053fde", "title": "document.tables"} +{"artifact_id": "ia_ffa5b716b5a5", "artifact_kind": "doclift_bundle_artifact", "created_at": "2026-04-23T11:22:39Z", "current_status": "draft", "import_id": "doclift-test", "metadata": {"source_kind": "doclift_bundle"}, "path": "manifest.json", "sha256": "3810d72b9858e1eb69e981759a3901defb75776744ba50f73f426860f05b9b5a", "title": "manifest"} diff --git a/tests/fixtures/doclift_bundle_minimal/imports/doclift-test/claims.jsonl b/tests/fixtures/doclift_bundle_minimal/imports/doclift-test/claims.jsonl new file mode 100644 index 0000000..76a4201 --- /dev/null +++ b/tests/fixtures/doclift_bundle_minimal/imports/doclift-test/claims.jsonl @@ -0,0 +1 @@ +{"claim_id": "clm_doclift_1", "claim_kind": "summary", "claim_text": "Lecture 1. Example is a lecture in the imported doclift bundle.", "concept_ids": ["concept::lecture-1"], "confidence_hint": 0.85, "contradicts_claim_ids": [], "current_status": "triaged", "grounding_status": "grounded", "import_id": "doclift-test", "source_observation_ids": ["obs_doclift_1"], "supersedes_claim_ids": [], "supporting_fragment_ids": []} diff --git a/tests/fixtures/doclift_bundle_minimal/imports/doclift-test/concepts.jsonl b/tests/fixtures/doclift_bundle_minimal/imports/doclift-test/concepts.jsonl new file mode 100644 index 0000000..e231be6 --- /dev/null +++ b/tests/fixtures/doclift_bundle_minimal/imports/doclift-test/concepts.jsonl @@ -0,0 +1 @@ +{"aliases": [], "concept_id": "concept::lecture-1", "current_status": "triaged", "description": "Imported from doclift bundle document kind 'lecture'.", "import_id": "doclift-test", "source_artifact_ids": ["ia_51bdebab22e6"], "title": "Lecture 1. 
Example"} diff --git a/tests/fixtures/doclift_bundle_minimal/imports/doclift-test/lint_findings.json b/tests/fixtures/doclift_bundle_minimal/imports/doclift-test/lint_findings.json new file mode 100644 index 0000000..9bb4b85 --- /dev/null +++ b/tests/fixtures/doclift_bundle_minimal/imports/doclift-test/lint_findings.json @@ -0,0 +1,14 @@ +{ + "import_id": "doclift-test", + "import_mode": "quick", + "summary": { + "artifact_count": 5, + "observation_count": 1, + "claim_count": 1, + "concept_count": 1, + "relation_count": 0, + "error_count": 0, + "warning_count": 0 + }, + "findings": [] +} \ No newline at end of file diff --git a/tests/fixtures/doclift_bundle_minimal/imports/doclift-test/manifest.json b/tests/fixtures/doclift_bundle_minimal/imports/doclift-test/manifest.json new file mode 100644 index 0000000..b312441 --- /dev/null +++ b/tests/fixtures/doclift_bundle_minimal/imports/doclift-test/manifest.json @@ -0,0 +1,16 @@ +{ + "import_id": "doclift-test", + "import_mode": "quick", + "machine_id": "nerdanel", + "agent_id": "groundrecall.ingest", + "source_root": "/home/netuser/dev/GroundRecall/tests/fixtures/doclift_bundle_minimal", + "imported_at": "2026-04-23T11:22:39Z", + "source_repo_kind": "llmwiki", + "source_adapter": "doclift_bundle", + "import_intent": "both", + "artifact_count": 5, + "observation_count": 1, + "claim_count": 1, + "concept_count": 1, + "relation_count": 0 +} \ No newline at end of file diff --git a/tests/fixtures/doclift_bundle_minimal/imports/doclift-test/observations.jsonl b/tests/fixtures/doclift_bundle_minimal/imports/doclift-test/observations.jsonl new file mode 100644 index 0000000..f4ff9f3 --- /dev/null +++ b/tests/fixtures/doclift_bundle_minimal/imports/doclift-test/observations.jsonl @@ -0,0 +1 @@ +{"artifact_id": "ia_51bdebab22e6", "confidence_hint": 0.85, "current_status": "draft", "grounding_status": "grounded", "import_id": "doclift-test", "line_end": 0, "line_start": 0, "observation_id": "obs_doclift_1", "origin_path": 
"documents/lecture-1/document.md", "origin_section": "Lecture 1. Example", "role": "summary", "source_url": "legacy/lecture-1.doc", "support_kind": "direct_source", "text": "Lecture 1. Example"} diff --git a/tests/fixtures/doclift_bundle_minimal/imports/doclift-test/relations.jsonl b/tests/fixtures/doclift_bundle_minimal/imports/doclift-test/relations.jsonl new file mode 100644 index 0000000..e69de29 diff --git a/tests/fixtures/doclift_bundle_minimal/imports/doclift-test/review_data.json b/tests/fixtures/doclift_bundle_minimal/imports/doclift-test/review_data.json new file mode 100644 index 0000000..daf96b0 --- /dev/null +++ b/tests/fixtures/doclift_bundle_minimal/imports/doclift-test/review_data.json @@ -0,0 +1,316 @@ +{ + "reviewer": "GroundRecall Import", + "draft_pack": { + "pack": { + "name": "groundrecall-import-doclift-test", + "display_name": "GroundRecall Import doclift-test", + "version": "0.1.0-draft", + "source_import_id": "doclift-test", + "source_root": "/home/netuser/dev/GroundRecall/tests/fixtures/doclift_bundle_minimal" + }, + "concepts": [ + { + "concept_id": "lecture-1", + "title": "Lecture 1. Example", + "description": "Imported from doclift bundle document kind 'lecture'.", + "prerequisites": [], + "mastery_signals": [], + "status": "provisional", + "notes": [ + "Claim: Lecture 1. Example is a lecture in the imported doclift bundle. [grounded]" + ] + } + ], + "conflicts": [], + "review_flags": [], + "attribution": { + "source_repo_kind": "llmwiki", + "source_root": "/home/netuser/dev/GroundRecall/tests/fixtures/doclift_bundle_minimal", + "imported_at": "2026-04-23T11:22:39Z", + "machine_id": "nerdanel", + "rights_note": "Imported llmwiki-style corpus requires review before promotion." 
+ } + }, + "citation_reviews": [], + "ledger": [], + "import_context": { + "manifest": { + "import_id": "doclift-test", + "import_mode": "quick", + "machine_id": "nerdanel", + "agent_id": "groundrecall.ingest", + "source_root": "/home/netuser/dev/GroundRecall/tests/fixtures/doclift_bundle_minimal", + "imported_at": "2026-04-23T11:22:39Z", + "source_repo_kind": "llmwiki", + "source_adapter": "doclift_bundle", + "import_intent": "both", + "artifact_count": 5, + "observation_count": 1, + "claim_count": 1, + "concept_count": 1, + "relation_count": 0 + }, + "lint_summary": { + "artifact_count": 5, + "observation_count": 1, + "claim_count": 1, + "concept_count": 1, + "relation_count": 0, + "error_count": 0, + "warning_count": 0 + }, + "queue_length": 1, + "source_adapter": "doclift_bundle" + }, + "review_guidance": { + "overview": "Review concepts first, then inspect representative claims and their source observations before promotion.", + "priorities": [ + "Focus reviewer effort on concepts with strong grounded claims and explicit citations first.", + "Downgrade or reject concepts whose claims are fragmented, duplicated, or missing meaningful support.", + "For academic material, citation-bearing claims deserve special scrutiny for fit, contradiction, and fabrication risk." + ], + "citation_guidance": [ + "A citation key or extracted reference is evidence of traceability, not correctness.", + "Check whether the cited work actually supports the claim and whether the claim overstates it.", + "Use the citation track to prioritize claims that can move into a separate citation-ingestion workflow." + ] + }, + "field_specs": [ + { + "field": "status", + "label": "Review status", + "input": "select", + "required": true, + "options": [ + { + "value": "trusted", + "label": "Trusted", + "help": "Promote this concept and its supported claims when the evidence and wording are ready." 
+ }, + { + "value": "provisional", + "label": "Provisional", + "help": "Keep this concept in reviewed state when it is promising but still needs citation or wording cleanup." + }, + { + "value": "needs_review", + "label": "Needs Review", + "help": "Leave undecided when support, scope, or concept boundaries are still unclear." + }, + { + "value": "rejected", + "label": "Rejected", + "help": "Exclude this concept when it is noise, unsupported, duplicated, or misleading." + } + ] + }, + { + "field": "description", + "label": "Concept description", + "input": "text", + "required": false, + "help": "Refine the concept summary to match the strongest supported interpretation." + }, + { + "field": "notes", + "label": "Reviewer notes", + "input": "textarea", + "required": false, + "help": "Record why this concept is trusted, provisional, rejected, or still unclear." + }, + { + "field": "prerequisites", + "label": "Prerequisites", + "input": "textarea", + "required": false, + "help": "List prerequisite concepts only when the manuscript support is explicit or defensible." + } + ], + "citation_field_specs": [ + { + "field": "status", + "label": "Citation review status", + "input": "select", + "required": true, + "options": [ + { + "value": "unreviewed", + "label": "Unreviewed", + "help": "Keep this citation candidate in triage until fit and existence are checked." + }, + { + "value": "verified", + "label": "Verified", + "help": "The cited work exists and materially supports the associated manuscript claim." + }, + { + "value": "needs_source_check", + "label": "Needs Source Check", + "help": "The citation may be useful but still needs direct source inspection or metadata cleanup." + }, + { + "value": "misleading", + "label": "Misleading", + "help": "The citation exists but overstates, contradicts, or poorly fits the claim." + }, + { + "value": "irrelevant", + "label": "Irrelevant", + "help": "The citation does not materially support the concept or claim under review." 
+ }, + { + "value": "fabricated", + "label": "Fabricated", + "help": "The citation appears invented, malformed, or otherwise not real." + } + ] + }, + { + "field": "notes", + "label": "Citation notes", + "input": "textarea", + "required": false, + "help": "Record whether the cited work exists, fits the claim, or should move into a dedicated citation-ingestion lane." + } + ], + "concept_reviews": [ + { + "concept_id": "lecture-1", + "title": "Lecture 1. Example", + "status": "provisional", + "description": "Imported from doclift bundle document kind 'lecture'.", + "review_help": "Prefer `trusted` when claims are coherent and citation-bearing support is appropriate; prefer `provisional` when the concept is plausible but still needs citation or wording cleanup.", + "claim_count": 1, + "grounded_claim_count": 1, + "warning_count": 0, + "has_citation_support": false, + "top_claims": [ + { + "claim_id": "clm_doclift_1", + "claim_text": "Lecture 1. Example is a lecture in the imported doclift bundle.", + "claim_kind": "summary", + "grounding_status": "grounded", + "supporting_observations": [ + { + "observation_id": "obs_doclift_1", + "origin_path": "documents/lecture-1/document.md", + "origin_section": "Lecture 1. Example", + "text": "Lecture 1. Example", + "line_start": 0, + "line_end": 0 + } + ], + "citation_support": [ + { + "citation_key_count": 0, + "extracted_reference_count": 0, + "has_citation_support": false + } + ], + "artifact_paths": [ + "documents/lecture-1/document.md" + ], + "finding_messages": [] + } + ], + "notes": [ + "Claim: Lecture 1. Example is a lecture in the imported doclift bundle. 
[grounded]" + ] + } + ], + "bibliography": { + "enabled": false, + "entry_count": 0, + "source_files": [] + }, + "citations": { + "enabled": true, + "provider": "citegeist", + "artifacts": [ + { + "artifact_id": "ia_af72cb1641f3", + "path": "documents/lecture-1/document.figures.json", + "title": "document.figures", + "citation_keys": [], + "resolved_entries": [], + "citation_key_count": 0, + "extracted_references": [], + "extracted_reference_count": 0, + "citegeist_backends": [ + "anystyle", + "grobid", + "heuristic" + ] + }, + { + "artifact_id": "ia_6cc5265d52f6", + "path": "documents/lecture-1/document.layout.json", + "title": "document.layout", + "citation_keys": [], + "resolved_entries": [], + "citation_key_count": 0, + "extracted_references": [], + "extracted_reference_count": 0, + "citegeist_backends": [ + "anystyle", + "grobid", + "heuristic" + ] + }, + { + "artifact_id": "ia_51bdebab22e6", + "path": "documents/lecture-1/document.md", + "title": "document", + "citation_keys": [], + "resolved_entries": [], + "citation_key_count": 0, + "extracted_references": [], + "extracted_reference_count": 0, + "citegeist_backends": [ + "anystyle", + "grobid", + "heuristic" + ] + }, + { + "artifact_id": "ia_893c59d73929", + "path": "documents/lecture-1/document.tables.json", + "title": "document.tables", + "citation_keys": [], + "resolved_entries": [], + "citation_key_count": 0, + "extracted_references": [], + "extracted_reference_count": 0, + "citegeist_backends": [ + "anystyle", + "grobid", + "heuristic" + ] + }, + { + "artifact_id": "ia_ffa5b716b5a5", + "path": "manifest.json", + "title": "manifest", + "citation_keys": [], + "resolved_entries": [], + "citation_key_count": 0, + "extracted_references": [], + "extracted_reference_count": 0, + "citegeist_backends": [ + "anystyle", + "grobid", + "heuristic" + ] + } + ], + "summary": { + "artifact_count_with_citations": 0, + "citation_key_total": 0, + "extracted_reference_total": 0 + }, + "next_actions": [ + "Promote 
citation-bearing claims into a dedicated citation review lane.", + "Use CiteGeist extraction as a first pass, then verify support and metadata before trusting the citation." + ] + } +} \ No newline at end of file diff --git a/tests/fixtures/doclift_bundle_minimal/imports/doclift-test/review_queue.json b/tests/fixtures/doclift_bundle_minimal/imports/doclift-test/review_queue.json new file mode 100644 index 0000000..02fa3cf --- /dev/null +++ b/tests/fixtures/doclift_bundle_minimal/imports/doclift-test/review_queue.json @@ -0,0 +1,20 @@ +{ + "import_id": "doclift-test", + "queue_length": 1, + "items": [ + { + "queue_id": "rq_clm_doclift_1", + "candidate_type": "claim", + "candidate_id": "clm_doclift_1", + "title": "Lecture 1. Example is a lecture in the imported doclift bundle.", + "triage_lane": "knowledge_capture", + "priority": 35, + "grounding_status": "grounded", + "status": "needs_review", + "finding_codes": [], + "concept_ids": [ + "concept::lecture-1" + ] + } + ] +} \ No newline at end of file diff --git a/tests/fixtures/doclift_bundle_minimal/imports/doclift-test/review_session.json b/tests/fixtures/doclift_bundle_minimal/imports/doclift-test/review_session.json new file mode 100644 index 0000000..6355ad6 --- /dev/null +++ b/tests/fixtures/doclift_bundle_minimal/imports/doclift-test/review_session.json @@ -0,0 +1,36 @@ +{ + "reviewer": "GroundRecall Import", + "draft_pack": { + "pack": { + "name": "groundrecall-import-doclift-test", + "display_name": "GroundRecall Import doclift-test", + "version": "0.1.0-draft", + "source_import_id": "doclift-test", + "source_root": "/home/netuser/dev/GroundRecall/tests/fixtures/doclift_bundle_minimal" + }, + "concepts": [ + { + "concept_id": "lecture-1", + "title": "Lecture 1. Example", + "description": "Imported from doclift bundle document kind 'lecture'.", + "prerequisites": [], + "mastery_signals": [], + "status": "provisional", + "notes": [ + "Claim: Lecture 1. Example is a lecture in the imported doclift bundle. 
[grounded]" + ] + } + ], + "conflicts": [], + "review_flags": [], + "attribution": { + "source_repo_kind": "llmwiki", + "source_root": "/home/netuser/dev/GroundRecall/tests/fixtures/doclift_bundle_minimal", + "imported_at": "2026-04-23T11:22:39Z", + "machine_id": "nerdanel", + "rights_note": "Imported llmwiki-style corpus requires review before promotion." + } + }, + "citation_reviews": [], + "ledger": [] +} \ No newline at end of file diff --git a/tests/fixtures/doclift_bundle_minimal/manifest.json b/tests/fixtures/doclift_bundle_minimal/manifest.json new file mode 100755 index 0000000..c473239 --- /dev/null +++ b/tests/fixtures/doclift_bundle_minimal/manifest.json @@ -0,0 +1,16 @@ +{ + "documents": [ + { + "document_id": "lecture-1", + "title": "Lecture 1. Example", + "document_kind": "lecture", + "output_dir": "documents/lecture-1", + "markdown_path": "documents/lecture-1/document.md", + "layout_path": "documents/lecture-1/document.layout.json", + "tables_path": "documents/lecture-1/document.tables.json", + "figures_path": "documents/lecture-1/document.figures.json", + "table_count": 1, + "figure_reference_count": 1 + } + ] +} diff --git a/tests/test_groundrecall_source_adapters.py b/tests/test_groundrecall_source_adapters.py index 01f4ca4..a9ef892 100644 --- a/tests/test_groundrecall_source_adapters.py +++ b/tests/test_groundrecall_source_adapters.py @@ -8,6 +8,10 @@ from groundrecall.source_adapters.base import detect_source_adapter, list_source from groundrecall.ingest import run_groundrecall_import +def _fixture_doclift_bundle() -> Path: + return Path(__file__).parent / "fixtures" / "doclift_bundle_minimal" + + def test_groundrecall_source_adapter_registry_lists_expected_adapters() -> None: names = set(list_source_adapters()) assert "llmwiki" in names @@ -33,10 +37,8 @@ def test_detect_didactopus_pack_adapter(tmp_path: Path) -> None: assert adapter.import_intent() == "both" -def test_detect_doclift_bundle_adapter(tmp_path: Path) -> None: - (tmp_path / 
"documents").mkdir() - (tmp_path / "manifest.json").write_text('{"documents": []}\n', encoding="utf-8") - adapter = detect_source_adapter(tmp_path) +def test_detect_doclift_bundle_adapter() -> None: + adapter = detect_source_adapter(_fixture_doclift_bundle()) assert adapter.name == "doclift_bundle" assert adapter.import_intent() == "both" @@ -201,35 +203,11 @@ def test_didactopus_pack_import_generates_structured_concepts_and_relations(tmp_ assert "clm_stage_stage1_basics" in claim_ids -def test_doclift_bundle_import_generates_structured_concepts(tmp_path: Path) -> None: - doc_dir = tmp_path / "documents" / "lesson-a" - doc_dir.mkdir(parents=True) - (tmp_path / "manifest.json").write_text( - '\n'.join( - [ - "{", - ' "documents": [', - " {", - ' "document_id": "lesson-a",', - ' "title": "Lecture 1. Example",', - ' "document_kind": "lecture",', - f' "output_dir": "{doc_dir}",', - f' "markdown_path": "{doc_dir / "document.md"}",', - f' "figures_path": "{doc_dir / "document.figures.json"}"', - " }", - " ]", - "}", - ] - ), - encoding="utf-8", - ) - (doc_dir / "document.md").write_text("# Lecture 1. Example\n\nBody.\n", encoding="utf-8") - (doc_dir / "document.figures.json").write_text('{"source_path": "/tmp/source.doc"}\n', encoding="utf-8") - - result = run_groundrecall_import(tmp_path, mode="quick", import_id="doclift-test") +def test_doclift_bundle_import_generates_structured_concepts() -> None: + result = run_groundrecall_import(_fixture_doclift_bundle(), mode="quick", import_id="doclift-test") assert result.manifest["source_adapter"] == "doclift_bundle" assert result.manifest["import_intent"] == "both" concept_ids = {item["concept_id"] for item in result.concepts} - assert "concept::lesson-a" in concept_ids + assert "concept::lecture-1" in concept_ids claim_ids = {item["claim_id"] for item in result.claims} assert "clm_doclift_1" in claim_ids