From a5efe0cccb8cd138f67f406521202a560b484ed3 Mon Sep 17 00:00:00 2001
From: welsberr
Date: Mon, 27 Apr 2026 10:29:58 -0400
Subject: [PATCH] Add chunk-backed GroundRecall import artifacts

---
 docs/ai-knowledge-graph-adoption-plan.md      | 464 ++++++++++++++++++
 src/groundrecall/groundrecall_normalizer.py   |  72 ++-
 .../groundrecall_source_adapters/base.py      |   3 +-
 .../didactopus_pack.py                        |   3 +-
 .../doclift_bundle.py                         |  88 +++-
 src/groundrecall/ingest.py                    |  37 +-
 src/groundrecall/lint.py                      |   2 +
 .../documents/lecture-1/document.chunks.json  |  20 +
 tests/test_groundrecall_import.py             |  49 ++
 tests/test_groundrecall_source_adapters.py    |   5 +
 10 files changed, 733 insertions(+), 10 deletions(-)
 create mode 100644 docs/ai-knowledge-graph-adoption-plan.md
 create mode 100644 tests/fixtures/doclift_bundle_minimal/documents/lecture-1/document.chunks.json

diff --git a/docs/ai-knowledge-graph-adoption-plan.md b/docs/ai-knowledge-graph-adoption-plan.md
new file mode 100644
index 0000000..a3e542f
--- /dev/null
+++ b/docs/ai-knowledge-graph-adoption-plan.md
@@ -0,0 +1,464 @@
+# AI Knowledge Graph Adoption Plan
+
+This document translates the feature set of
+[`robert-mcdermott/ai-knowledge-graph`](https://github.com/robert-mcdermott/ai-knowledge-graph)
+into concrete implementation tickets for the current local repositories:
+
+- `GroundRecall`
+- `Didactopus`
+- `doclift`
+
+The goal is not to copy that repository's data model directly.
+
+The useful imports are:
+
+- chunk-aware extraction
+- entity standardization
+- relation suggestion
+- graph inspection and review affordances
+
+The main thing to avoid is treating raw extracted SPO triples as canonical truth.
+
+## Design Rules
+
+1. Keep canonical storage typed and provenance-first.
+2. Treat extracted triples as candidate claims/relations, not promoted facts (see the sketch after this list).
+3. Keep LLM extraction optional and reviewable.
+4. Keep `doclift` deterministic by default.
+5. Put graph extraction in `GroundRecall` first, then expose downstream affordances in `Didactopus`.
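+
+As a rough illustration of rules 1 and 2, an extracted candidate would be held as a
+plain record whose provenance travels with it. This is a minimal sketch, not a
+committed schema; every field name and id here is illustrative only:
+
+```python
+# Hypothetical shape of a candidate relation produced by extraction.
+# Nothing in this record is promoted; review has to accept it first.
+candidate_relation = {
+    "relation_id": "rel_candidate_001",         # illustrative id scheme
+    "source_id": "concept::signal-processing",
+    "target_id": "concept::sampling-theorem",
+    "relation_kind": "prerequisite",
+    "support_kind": "inferred",                 # not "direct_source" unless grounded
+    "supporting_fragment_ids": ["frag_doclift_1_2"],  # chunk-level provenance
+    "current_status": "draft",                  # promotion is a separate review step
+}
+```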
+
+## Repo Roles
+
+### GroundRecall
+
+Primary fit for:
+
+- candidate claim extraction
+- concept alias normalization
+- candidate relation inference
+- graph diagnostics
+- review queue generation
+
+Key current modules:
+
+- [src/groundrecall/ingest.py](/home/netuser/bin/GroundRecall/src/groundrecall/ingest.py)
+- [src/groundrecall/models.py](/home/netuser/bin/GroundRecall/src/groundrecall/models.py)
+- [src/groundrecall/source_adapters](/home/netuser/bin/GroundRecall/src/groundrecall/source_adapters)
+- [src/groundrecall/groundrecall_source_adapters/doclift_bundle.py](/home/netuser/bin/GroundRecall/src/groundrecall/groundrecall_source_adapters/doclift_bundle.py)
+- [src/groundrecall/review_export.py](/home/netuser/bin/GroundRecall/src/groundrecall/review_export.py)
+
+### Didactopus
+
+Primary fit for:
+
+- graph workbench visualization
+- concept merge/split suggestions
+- graph-aware review overlays
+- learner-facing graph inspection built on grounded artifacts
+
+Key current modules:
+
+- [src/didactopus/knowledge_graph.py](/home/netuser/bin/Didactopus/src/didactopus/knowledge_graph.py)
+- [src/didactopus/graph_builder.py](/home/netuser/bin/Didactopus/src/didactopus/graph_builder.py)
+- [src/didactopus/graph_retrieval.py](/home/netuser/bin/Didactopus/src/didactopus/graph_retrieval.py)
+- [src/didactopus/learner_workbench.py](/home/netuser/bin/Didactopus/src/didactopus/learner_workbench.py)
+- [src/didactopus/review_export.py](/home/netuser/bin/Didactopus/src/didactopus/review_export.py)
+- [src/didactopus/main.py](/home/netuser/bin/Didactopus/src/didactopus/main.py)
+
+### doclift
+
+Primary fit for:
+
+- deterministic chunk metadata
+- optional extraction-friendly sidecars
+- optional graph preview artifacts
+
+Key current modules:
+
+- [src/doclift/convert.py](/home/netuser/bin/doclift/src/doclift/convert.py)
+- [src/doclift/schemas.py](/home/netuser/bin/doclift/src/doclift/schemas.py)
+- [src/doclift/cli.py](/home/netuser/bin/doclift/src/doclift/cli.py)
+
+## Phase 1: GroundRecall Candidate Graph Import
+
+### Ticket GR-1: Add chunk-aware candidate extraction layer
+
+Outcome:
+
+- ingest text artifacts into stable chunks
+- extract candidate observations/claims/concepts/relations per chunk
+- write reviewable import artifacts
+
+Suggested implementation:
+
+- add `src/groundrecall/candidate_graph.py`
+- add `src/groundrecall/extraction_chunks.py`
+
+Responsibilities:
+
+- split long text into bounded chunks with overlap
+- assign stable `chunk_id`
+- keep chunk-to-artifact provenance
+- emit candidate records with `support_kind="derived_from_page"` or `support_kind="inferred"`
+
+CLI:
+
+- extend `groundrecall import` with:
+  - `--extract-graph`
+  - `--chunk-size`
+  - `--chunk-overlap`
+  - `--extractor none|heuristic|llm`
+
+Acceptance criteria:
+
+- import still works without graph extraction
+- import artifacts include chunk-backed candidate claims and relations when enabled
+- all extracted candidates preserve artifact and chunk provenance
+
+### Ticket GR-2: Add deterministic entity/concept standardization
+
+Outcome:
+
+- alias clusters for near-duplicate concepts before review
+
+Suggested implementation:
+
+- add `src/groundrecall/entity_standardization.py`
+
+Responsibilities:
+
+- normalize punctuation/case
+- trim stopwords conservatively
+- group obvious aliases
+- emit alias-cluster review candidates when confidence is not high enough for direct merge
+
+Data shape:
+
+- enrich `ConceptRecord.aliases`
+- optionally emit a new review payload section such as `alias_clusters` (see the sketch below)
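+
+A minimal sketch of the deterministic normalization step. The normalization body
+mirrors the `_normalize_concept_title` helper this patch adds to
+`groundrecall_normalizer.py`; the cluster-building wrapper around it is illustrative:
+
+```python
+def normalize_title(value: str) -> str:
+    # Lowercase, strip punctuation, and drop a conservative stopword set,
+    # so "The Signal Processing" and "Signal Processing" collapse together.
+    normalized = "".join(ch.lower() if ch.isalnum() else " " for ch in value)
+    tokens = [token for token in normalized.split() if token not in {"a", "an", "the"}]
+    return " ".join(tokens)
+
+
+def alias_clusters(titles: list[str]) -> dict[str, list[str]]:
+    # Group raw titles by normalized form; multi-member groups become
+    # review candidates rather than automatic merges.
+    clusters: dict[str, list[str]] = {}
+    for title in titles:
+        clusters.setdefault(normalize_title(title), []).append(title)
+    return {key: values for key, values in clusters.items() if len(values) > 1}
+```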
+
+Acceptance criteria:
+
+- obvious duplicates like minor punctuation/case variants collapse deterministically
+- ambiguous clusters remain reviewable rather than auto-merged
+
+### Ticket GR-3: Add inferred relation candidates
+
+Outcome:
+
+- lexical and structural hints become review queue items
+
+Suggested implementation:
+
+- add `src/groundrecall/relation_inference.py`
+
+Inference types:
+
+- lexical co-occurrence hints
+- transitive prerequisite/support hints
+- repeated same-source concept pair hints
+
+Important restriction:
+
+- inferred relations stay `draft` or `triaged`
+- they are never silently promoted to canonical relations
+
+Acceptance criteria:
+
+- inferred relations appear in import artifacts with explicit provenance
+- review queue distinguishes grounded vs. inferred edges
+
+### Ticket GR-4: Add graph diagnostics and inspector output
+
+Outcome:
+
+- maintainers can inspect graph shape before promotion
+
+Suggested implementation:
+
+- add `src/groundrecall/graph_diagnostics.py`
+- extend [inspect.py](/home/netuser/bin/GroundRecall/src/groundrecall/inspect.py)
+
+Diagnostics:
+
+- disconnected components
+- orphan concepts
+- claims with no strong support
+- bridge concepts
+- dense noisy clusters
+
+CLI:
+
+- `groundrecall inspect ... --graph`
+- `groundrecall export ... --include-graph-diagnostics`
+
+Acceptance criteria:
+
+- graph diagnostics appear in machine-readable JSON
+- review operators can identify noisy imports quickly
+
+### Ticket GR-5: Add review export support for candidate graph artifacts
+
+Outcome:
+
+- current review flows can consume extracted graph candidates
+
+Suggested implementation:
+
+- extend [review_export.py](/home/netuser/bin/GroundRecall/src/groundrecall/review_export.py)
+- extend review app payloads under [review_app](/home/netuser/bin/GroundRecall/src/groundrecall/review_app)
+
+UI payload features:
+
+- candidate relation cards
+- alias-cluster cards
+- chunk evidence preview
+- inferred/grounded badges
+
+Acceptance criteria:
+
+- review bundle includes graph-candidate triage data
+- no assistant-specific assumptions leak into canonical records
+
+## Phase 2: Didactopus Graph Review And Workbench Improvements
+
+### Ticket DT-1: Add review-oriented graph overlays
+
+Outcome:
+
+- graph visualizations expose quality problems, not just structure
+
+Suggested implementation:
+
+- extend [knowledge_graph.py](/home/netuser/bin/Didactopus/src/didactopus/knowledge_graph.py)
+- extend [graph_retrieval.py](/home/netuser/bin/Didactopus/src/didactopus/graph_retrieval.py)
+
+Overlay ideas:
+
+- edge grounding status
+- concept confidence/review status
+- weakly grounded concept markers
+- disconnected concept islands
+
+Acceptance criteria:
+
+- exported graph JSON can distinguish grounded, heuristic, and inferred links
+- downstream visual layers can highlight fragile concepts
+
+### Ticket DT-2: Add concept consolidation suggestions
+
+Outcome:
+
+- reviewers get merge/split suggestions based on graph and text structure
+
+Suggested implementation:
+
+- extend [graph_builder.py](/home/netuser/bin/Didactopus/src/didactopus/graph_builder.py)
+- extend [review_export.py](/home/netuser/bin/Didactopus/src/didactopus/review_export.py)
+
+Input signals (combined in the sketch after this ticket):
+
+- title similarity
+- shared source lessons
+- overlapping prerequisite neighborhoods
+- overlapping mastery signals
+
+Acceptance criteria:
+
+- review exports include merge suggestions
+- suggested merges remain proposals, not automatic edits
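+
+A minimal sketch of how two of these signals could be combined into a merge
+suggestion score. Everything here is an assumption for illustration: the function
+name, the weights, and the inputs are not the `graph_builder.py` API:
+
+```python
+from difflib import SequenceMatcher
+
+
+def merge_suggestion_score(
+    title_a: str,
+    title_b: str,
+    lessons_a: set[str],
+    lessons_b: set[str],
+) -> float:
+    # Title similarity in [0, 1] via longest-matching-block ratio.
+    title_sim = SequenceMatcher(None, title_a.lower(), title_b.lower()).ratio()
+    # Jaccard overlap of the lessons that sourced each concept.
+    union = lessons_a | lessons_b
+    lesson_overlap = len(lessons_a & lessons_b) / len(union) if union else 0.0
+    # Illustrative weighting; real weights would be tuned against review outcomes.
+    return 0.6 * title_sim + 0.4 * lesson_overlap
+```
+
+Pairs scoring above some review-chosen threshold would emit a suggestion card;
+nothing is merged automatically.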
+
+### Ticket DT-3: Add learner-workbench graph inspection modes
+
+Outcome:
+
+- learner and reviewer can inspect why concepts exist and how they connect
+
+Suggested implementation:
+
+- extend [learner_workbench.py](/home/netuser/bin/Didactopus/src/didactopus/learner_workbench.py)
+- extend backend route [api.py](/home/netuser/bin/Didactopus/src/didactopus/api.py)
+
+Views:
+
+- concept neighborhood
+- source-fragment grounding trail
+- alternate supporting lessons
+- fragile or noisy concept warnings
+
+Acceptance criteria:
+
+- workbench can show source-grounded concept neighborhoods
+- concept provenance is inspectable without raw JSON digging
+
+### Ticket DT-4: Add graph diagnostics to `doclift-bundle` pack generation
+
+Outcome:
+
+- `doclift -> Didactopus` imports surface noisy graph structure early
+
+Suggested implementation:
+
+- extend [doclift_bundle_demo.py](/home/netuser/bin/Didactopus/src/didactopus/doclift_bundle_demo.py)
+- extend [main.py](/home/netuser/bin/Didactopus/src/didactopus/main.py) `doclift-bundle`
+
+Artifacts:
+
+- `graph_diagnostics.json`
+- `concept_merge_suggestions.json`
+
+Acceptance criteria:
+
+- importing a `doclift` bundle produces diagnostics alongside `knowledge_graph.json`
+- review workflow can consume those diagnostics
+
+## Phase 3: doclift Optional Extraction-Friendly Sidecars
+
+### Ticket DL-1: Emit stable chunk metadata
+
+Outcome:
+
+- downstream systems can import `doclift` bundles without re-segmenting blindly
+
+Suggested implementation:
+
+- extend [schemas.py](/home/netuser/bin/doclift/src/doclift/schemas.py)
+- extend [convert.py](/home/netuser/bin/doclift/src/doclift/convert.py)
+
+Artifacts:
+
+- `document.chunks.json`
+
+Fields:
+
+- `chunk_id`
+- `line_start`
+- `line_end`
+- `section_labels`
+- `text`
+
+Acceptance criteria:
+
+- bundle remains valid without downstream AI extraction
+- chunk metadata is deterministic across repeat runs
+
+### Ticket DL-2: Add optional graph-preview sidecars
+
+Outcome:
+
+- operators can inspect likely extracted structure at the bundle stage
+
+Suggested implementation:
+
+- add optional post-processing module such as `src/doclift/graph_preview.py`
+
+Artifacts:
+
+- `document.entities.json`
+- `document.relations.json`
+- optional `bundle_graph_preview.json`
+
+CLI:
+
+- extend `doclift convert`
+- extend `doclift convert-dir`
+- flags:
+  - `--graph-preview`
+  - `--graph-preview-mode heuristic|llm`
+
+Important restriction:
+
+- these are preview/debug artifacts only
+- they are not the bundle's canonical semantics
+
+Acceptance criteria:
+
+- graph preview can be disabled entirely
+- default conversion remains deterministic and lightweight
+
+### Ticket DL-3: Add HTML inspection output for graph previews
+
+Outcome:
+
+- maintainers can inspect extracted structure before import
+
+Suggested implementation:
+
+- add `doclift preview-graph /path/to/bundle`
+
+Acceptance criteria:
+
+- preview HTML references chunk ids and source lines
+- graph preview is visibly separate from conversion success reporting
+
+## Cross-Repo Integration Tickets
+
+### Ticket X-1: `doclift -> GroundRecall` candidate-graph import path
+
+Outcome:
+
+- `GroundRecall` can consume `doclift` chunk metadata directly
+
+Modules:
+
+- `doclift` emits `document.chunks.json`
+- `GroundRecall` `doclift_bundle` adapter imports it
+
+Acceptance criteria:
+
+- `groundrecall import /path/to/doclift-bundle --extract-graph` works end to end
+- uses `doclift` chunk ids instead of re-splitting markdown where available (see the sketch below)
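+
+A minimal sketch of the preference order on the GroundRecall side. The sidecar
+loading mirrors the `_load_chunks` helper this patch adds to the `doclift_bundle`
+adapter; the `split_markdown` fallback is a hypothetical stand-in for GR-1's chunker:
+
+```python
+import json
+from pathlib import Path
+
+
+def chunks_for_document(output_dir: Path, markdown_text: str) -> list[dict]:
+    # Prefer the deterministic chunks doclift emitted with the bundle.
+    chunk_path = output_dir / "document.chunks.json"
+    if chunk_path.exists():
+        payload = json.loads(chunk_path.read_text(encoding="utf-8"))
+        chunks = payload.get("chunks", []) if isinstance(payload, dict) else payload
+        return [chunk for chunk in chunks if isinstance(chunk, dict)]
+    # Fall back to re-splitting only when no sidecar is present.
+    return split_markdown(markdown_text)  # hypothetical GR-1 chunker
+```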
+
+### Ticket X-2: Shared graph diagnostics vocabulary
+
+Outcome:
+
+- the three repos use compatible terminology for quality signals
+
+Suggested shared diagnostic keys:
+
+- `orphan_concept`
+- `weak_grounding`
+- `inferred_relation`
+- `alias_cluster`
+- `disconnected_component`
+- `bridge_concept`
+- `high_fanout_noisy_concept`
+
+Acceptance criteria:
+
+- review and export layers can exchange diagnostics without brittle custom mapping
+
+## Recommended Build Order
+
+1. `GR-1`
+2. `GR-2`
+3. `GR-3`
+4. `GR-4`
+5. `X-1`
+6. `DT-1`
+7. `DT-2`
+8. `DL-1`
+9. `DL-2`
+10. `DT-4`
+
+## Non-Goals
+
+- replacing GroundRecall canonical models with freeform triples
+- forcing LLM extraction into `doclift` core conversion
+- auto-promoting inferred relations
+- making Didactopus depend on a graph preview layer to ingest ordinary packs
+
+## Immediate Next Step
+
+If only one milestone is funded first, build:
+
+- `GR-1`
+- `GR-2`
+- `X-1`
+
+That gives the highest-leverage path:
+
+- `doclift` stays deterministic
+- `GroundRecall` gains useful graph-candidate import
+- `Didactopus` can later consume cleaner grounded artifacts without architectural churn
diff --git a/src/groundrecall/groundrecall_normalizer.py b/src/groundrecall/groundrecall_normalizer.py
index fa0c872..a9917c3 100644
--- a/src/groundrecall/groundrecall_normalizer.py
+++ b/src/groundrecall/groundrecall_normalizer.py
@@ -71,12 +71,35 @@ def build_observation_record(
     }
 
 
+def build_fragment_record(
+    context: ImportContext,
+    artifact_record: dict[str, Any],
+    observation: SegmentedObservation,
+    index: int,
+) -> dict[str, Any]:
+    return {
+        "fragment_id": f"frag_{artifact_record['artifact_id']}_{index}",
+        "import_id": context.import_id,
+        "source_id": artifact_record["artifact_id"],
+        "text": observation.text,
+        "section": observation.section,
+        "line_start": observation.line_start,
+        "line_end": observation.line_end,
+        "metadata": {
+            "artifact_path": observation.artifact_relative_path,
+            "role": observation.role,
+        },
+        "current_status": "draft",
+    }
+
+
 def build_claim_record(
     context: ImportContext,
     observation_record: dict[str, Any],
     observation: SegmentedObservation,
     concept_ids: list[str],
     index: int,
+    fragment_ids: list[str] | None = None,
 ) -> dict[str, Any]:
     return {
         "claim_id": _claim_id_for_observation(observation_record, observation, index),
@@ -84,7 +107,7 @@ def build_claim_record(
         "claim_text": observation_record["text"],
         "claim_kind": "statement" if observation_record["role"] == "claim" else "summary",
         "source_observation_ids": [observation_record["observation_id"]],
-        "supporting_fragment_ids": [],
+        "supporting_fragment_ids": list(fragment_ids or []),
         "concept_ids": [f"concept::{concept_id}" for concept_id in concept_ids],
         "contradicts_claim_ids": [f"clm_{_sanitize_claim_key(value)}" for value in observation.contradict_keys],
         "supersedes_claim_ids": [f"clm_{_sanitize_claim_key(value)}" for value in observation.supersede_keys],
@@ -134,3 +157,50 @@ def build_relation_records(context: ImportContext, artifact_record: dict[str, An
 
 def manifest_record(context: ImportContext) -> dict[str, Any]:
     return asdict(context) | {"source_repo_kind": "llmwiki"}
+
+
+def standardize_concept_rows(
+    concept_rows: list[dict[str, Any]],
+    claim_rows: list[dict[str, Any]],
+    relation_rows: list[dict[str, Any]],
+) -> tuple[list[dict[str, Any]], list[dict[str, Any]], list[dict[str, Any]]]:
+    alias_map: dict[str, str] = {}
+    normalized_index: dict[str, dict[str, Any]] = {}
+    standardized_rows: list[dict[str, Any]] = []
+
+    for row in concept_rows:
+        normalized_title = _normalize_concept_title(str(row.get("title", "")))
+        if not normalized_title:
+            standardized_rows.append(row)
+            continue
+
+        canonical = normalized_index.get(normalized_title)
+        if canonical is None:
+            normalized_index[normalized_title] = row
+            standardized_rows.append(row)
+            continue
+
+        canonical["source_artifact_ids"] = sorted(
+            set(canonical.get("source_artifact_ids", [])) | set(row.get("source_artifact_ids", []))
+        )
+        aliases = set(canonical.get("aliases", []))
+        aliases.add(str(row.get("title", "")))
+        aliases.update(str(alias) for alias in row.get("aliases", []))
+        aliases.discard(str(canonical.get("title", "")))
+        canonical["aliases"] = sorted(alias for alias in aliases if alias)
+        alias_map[str(row["concept_id"])] = str(canonical["concept_id"])
+
+    if alias_map:
+        for row in claim_rows:
+            row["concept_ids"] = [alias_map.get(concept_id, concept_id) for concept_id in row.get("concept_ids", [])]
+        for row in relation_rows:
+            row["source_id"] = alias_map.get(str(row.get("source_id", "")), str(row.get("source_id", "")))
+            row["target_id"] = alias_map.get(str(row.get("target_id", "")), str(row.get("target_id", "")))
+
+    return standardized_rows, claim_rows, relation_rows
+
+
+def _normalize_concept_title(value: str) -> str:
+    normalized = "".join(ch.lower() if ch.isalnum() else " " for ch in value)
+    tokens = [token for token in normalized.split() if token not in {"a", "an", "the"}]
+    return " ".join(tokens)
diff --git a/src/groundrecall/groundrecall_source_adapters/base.py b/src/groundrecall/groundrecall_source_adapters/base.py
index 3ccc5d6..2ed4d04 100644
--- a/src/groundrecall/groundrecall_source_adapters/base.py
+++ b/src/groundrecall/groundrecall_source_adapters/base.py
@@ -28,6 +28,7 @@ class DiscoveredImportSource:
 @dataclass
 class StructuredImportRows:
     artifact_rows: list[dict]
+    fragment_rows: list[dict]
     observation_rows: list[dict]
     claim_rows: list[dict]
     concept_rows: list[dict]
@@ -46,7 +47,7 @@ class GroundRecallSourceAdapter(Protocol):
     def import_intent(self) -> ImportIntent:
         ...
 
-    def build_rows(self, context, sources: list[DiscoveredImportSource]) -> StructuredImportRows | None:
+    def build_rows(self, context, sources: list[DiscoveredImportSource], root: Path | None = None) -> StructuredImportRows | None:
         ...
diff --git a/src/groundrecall/groundrecall_source_adapters/didactopus_pack.py b/src/groundrecall/groundrecall_source_adapters/didactopus_pack.py
index c55a22f..45bf86a 100644
--- a/src/groundrecall/groundrecall_source_adapters/didactopus_pack.py
+++ b/src/groundrecall/groundrecall_source_adapters/didactopus_pack.py
@@ -38,7 +38,7 @@ class DidactopusPackSourceAdapter:
     def import_intent(self) -> str:
         return "both"
 
-    def build_rows(self, context, sources: list[DiscoveredImportSource]) -> StructuredImportRows | None:
+    def build_rows(self, context, sources: list[DiscoveredImportSource], root: Path | None = None) -> StructuredImportRows | None:
         by_name = {Path(item.relative_path).name: item for item in sources}
         concepts_src = by_name.get("concepts.yaml")
         if concepts_src is None:
@@ -224,6 +224,7 @@ class DidactopusPackSourceAdapter:
 
         return StructuredImportRows(
             artifact_rows=artifact_rows,
+            fragment_rows=[],
             observation_rows=observation_rows,
             claim_rows=claim_rows,
             concept_rows=concept_rows,
diff --git a/src/groundrecall/groundrecall_source_adapters/doclift_bundle.py b/src/groundrecall/groundrecall_source_adapters/doclift_bundle.py
index 7ee0926..fde221c 100755
--- a/src/groundrecall/groundrecall_source_adapters/doclift_bundle.py
+++ b/src/groundrecall/groundrecall_source_adapters/doclift_bundle.py
@@ -22,6 +22,23 @@ class DocliftBundleSourceAdapter:
         base = Path(root)
         return (base / "manifest.json").exists() and (base / "documents").exists()
 
+    def _load_chunks(self, base: Path, document: dict) -> list[dict]:
+        explicit_path = document.get("chunks_path")
+        if explicit_path:
+            chunk_path = self._resolve_bundle_path(base, explicit_path)
+        else:
+            output_dir = self._resolve_bundle_path(base, document.get("output_dir"))
+            chunk_path = output_dir / "document.chunks.json"
+        if not chunk_path.exists():
+            return []
+        payload = json.loads(chunk_path.read_text(encoding="utf-8"))
+        if isinstance(payload, dict):
+            chunks = payload.get("chunks", [])
+            return [chunk for chunk in chunks if isinstance(chunk, dict)]
+        if isinstance(payload, list):
+            return [chunk for chunk in payload if isinstance(chunk, dict)]
+        return []
+
     def discover(self, root: str | Path) -> list[DiscoveredImportSource]:
         base = Path(root)
         rows: list[DiscoveredImportSource] = []
@@ -41,8 +58,8 @@ class DocliftBundleSourceAdapter:
     def import_intent(self) -> str:
         return "both"
 
-    def build_rows(self, context, sources: list[DiscoveredImportSource]) -> StructuredImportRows | None:
-        base = Path(context.source_root)
+    def build_rows(self, context, sources: list[DiscoveredImportSource], root: Path | None = None) -> StructuredImportRows | None:
+        base = Path(root) if root is not None else Path(context.source_root)
         if not self.detect(base) and sources:
             for candidate in [sources[0].path.parent, *sources[0].path.parents]:
                 if self.detect(candidate):
@@ -54,6 +71,7 @@ class DocliftBundleSourceAdapter:
         manifest = json.loads(manifest_path.read_text(encoding="utf-8"))
 
         artifact_rows: list[dict] = []
+        fragment_rows: list[dict] = []
         observation_rows: list[dict] = []
         claim_rows: list[dict] = []
         concept_rows: list[dict] = []
@@ -142,6 +160,71 @@
                     "current_status": "triaged",
                 }
             )
+            for chunk_index, chunk in enumerate(self._load_chunks(base, document), start=1):
+                chunk_text = str(chunk.get("text") or "").strip()
+                if not chunk_text:
+                    continue
+                chunk_role = str(chunk.get("role") or "summary")
+                chunk_section = str(chunk.get("section") or title)
+                line_start = int(chunk.get("line_start") or 0)
+                line_end = int(chunk.get("line_end") or line_start)
+                fragment_id = f"frag_doclift_{index}_{chunk_index}"
+                observation_id = f"obs_doclift_{index}_{chunk_index}"
+                fragment_rows.append(
+                    {
+                        "fragment_id": fragment_id,
+                        "import_id": context.import_id,
+                        "source_id": artifact_id,
+                        "text": chunk_text,
+                        "section": chunk_section,
+                        "line_start": line_start,
+                        "line_end": line_end,
+                        "metadata": {
+                            "chunk_id": chunk.get("chunk_id", f"{document.get('document_id', index)}-{chunk_index}"),
+                            "source_kind": "doclift_chunk",
+                        },
+                        "current_status": "draft",
+                    }
+                )
+                observation_rows.append(
+                    {
+                        "observation_id": observation_id,
+                        "import_id": context.import_id,
+                        "artifact_id": artifact_id,
+                        "role": chunk_role,
+                        "text": chunk_text,
+                        "origin_path": relative_markdown,
+                        "origin_section": chunk_section,
+                        "line_start": line_start,
+                        "line_end": line_end,
+                        "source_url": source_path,
+                        "metadata": {
+                            "source_path_kind": source_path_kind,
+                            "chunk_id": chunk.get("chunk_id", f"{document.get('document_id', index)}-{chunk_index}"),
+                        },
+                        "grounding_status": "grounded",
+                        "support_kind": "direct_source",
+                        "confidence_hint": float(chunk.get("confidence_hint") or 0.75),
+                        "current_status": "draft",
+                    }
+                )
+                if chunk_role in {"claim", "summary"}:
+                    claim_rows.append(
+                        {
+                            "claim_id": f"clm_doclift_{index}_{chunk_index}",
+                            "import_id": context.import_id,
+                            "claim_text": chunk_text,
+                            "claim_kind": "statement" if chunk_role == "claim" else "summary",
+                            "source_observation_ids": [observation_id],
+                            "supporting_fragment_ids": [fragment_id],
+                            "concept_ids": [concept_id],
+                            "contradicts_claim_ids": [],
+                            "supersedes_claim_ids": [],
+                            "confidence_hint": float(chunk.get("confidence_hint") or 0.75),
+                            "grounding_status": "grounded",
+                            "current_status": "triaged",
+                        }
+                    )
             if previous_concept_id is not None:
                 relation_rows.append(
                     {
@@ -158,6 +241,7 @@ class DocliftBundleSourceAdapter:
 
         return StructuredImportRows(
             artifact_rows=artifact_rows,
+            fragment_rows=fragment_rows,
             observation_rows=observation_rows,
             claim_rows=claim_rows,
             concept_rows=concept_rows,
diff --git a/src/groundrecall/ingest.py b/src/groundrecall/ingest.py
index 942d508..0874775 100644
--- a/src/groundrecall/ingest.py
+++ b/src/groundrecall/ingest.py
@@ -1,6 +1,7 @@
 from __future__ import annotations
 
 import argparse
+import inspect
 import json
 import shutil
 import socket
@@ -18,9 +19,11 @@ from .groundrecall_normalizer import (
     build_artifact_record,
     build_claim_record,
     build_concept_records,
+    build_fragment_record,
     build_observation_record,
     build_relation_records,
     manifest_record,
+    standardize_concept_rows,
 )
 from .groundrecall_review_bridge import export_review_bundle_from_import
 from .groundrecall_review_queue import build_review_queue
@@ -36,6 +39,7 @@ VALID_MODES = {"archive", "quick", "grounded"}
 class ImportResult:
     manifest: dict[str, Any]
     artifacts: list[dict[str, Any]]
+    fragments: list[dict[str, Any]]
     observations: list[dict[str, Any]]
     claims: list[dict[str, Any]]
     concepts: list[dict[str, Any]]
@@ -56,9 +60,10 @@ def _default_import_id(source_root: Path) -> str:
 
 def _portable_source_root_ref(source_path: Path, output_root: Path) -> tuple[str, str]:
     anchor = output_root.resolve().parent
     if source_path.is_relative_to(anchor):
-        relative = source_path.relative_to(anchor).as_posix()
-        if relative != ".":
-            return relative, "output_root_parent_relative"
+        relative = source_path.relative_to(anchor)
+        if relative == Path("."):
+            return source_path.name, "source_label"
+        return relative.as_posix(), "output_root_parent_relative"
     return source_path.name, "source_label"
@@ -147,13 +152,19 @@
     )
 
     artifact_rows: list[dict[str, Any]] = []
+    fragment_rows: list[dict[str, Any]] = []
     observation_rows: list[dict[str, Any]] = []
     claim_rows: list[dict[str, Any]] = []
    concept_rows: list[dict[str, Any]] = []
     relation_rows: list[dict[str, Any]] = []
-    structured_rows = adapter.build_rows(context, discovered)
+    build_rows_params = inspect.signature(adapter.build_rows).parameters
+    if "root" in build_rows_params:
+        structured_rows = adapter.build_rows(context, discovered, root=source_path)
+    else:
+        structured_rows = adapter.build_rows(context, discovered)
     if structured_rows is not None:
         artifact_rows.extend(structured_rows.artifact_rows)
+        fragment_rows.extend(structured_rows.fragment_rows)
         observation_rows.extend(structured_rows.observation_rows)
         claim_rows.extend(structured_rows.claim_rows)
         concept_rows.extend(structured_rows.concept_rows)
@@ -170,14 +181,27 @@
         relation_rows.extend(build_relation_records(context, artifact_row, page.concepts, page.links))
 
         for index, observation in enumerate(page.observations, start=1):
+            fragment_row = build_fragment_record(context, artifact_row, observation, index)
+            fragment_rows.append(fragment_row)
             observation_row = build_observation_record(context, artifact_row, observation, index)
             observation_rows.append(observation_row)
             if mode == "archive":
                 continue
             if observation.role not in {"claim", "summary"}:
                 continue
-            claim_rows.append(build_claim_record(context, observation_row, observation, page.concepts[:3], index))
+            claim_rows.append(
+                build_claim_record(
+                    context,
+                    observation_row,
+                    observation,
+                    page.concepts[:3],
+                    index,
+                    fragment_ids=[fragment_row["fragment_id"]],
+                )
+            )
 
+    fragment_rows = _dedupe_by_key(fragment_rows, "fragment_id")
+    concept_rows, claim_rows, relation_rows = standardize_concept_rows(concept_rows, claim_rows, relation_rows)
     concept_rows = _dedupe_by_key(concept_rows, "concept_id")
     relation_rows = _dedupe_by_key(relation_rows, "relation_id")
     artifact_rows = _dedupe_by_key(artifact_rows, "artifact_id")
@@ -189,6 +213,7 @@
         "import_intent": adapter.import_intent(),
         "source_root_kind": source_root_kind,
         "artifact_count": len(artifact_rows),
+        "fragment_count": len(fragment_rows),
         "observation_count": len(observation_rows),
         "claim_count": len(claim_rows),
         "concept_count": len(concept_rows),
@@ -197,6 +222,7 @@
 
     _write_json(output_dir / "manifest.json", manifest)
     _write_jsonl(output_dir / "artifacts.jsonl", artifact_rows)
+    _write_jsonl(output_dir / "fragments.jsonl", fragment_rows)
     _write_jsonl(output_dir / "observations.jsonl", observation_rows)
     _write_jsonl(output_dir / "claims.jsonl", claim_rows)
     _write_jsonl(output_dir / "concepts.jsonl", concept_rows)
@@ -210,6 +236,7 @@
     return ImportResult(
         manifest=manifest,
         artifacts=artifact_rows,
+        fragments=fragment_rows,
         observations=observation_rows,
         claims=claim_rows,
         concepts=concept_rows,
diff --git a/src/groundrecall/lint.py b/src/groundrecall/lint.py
index dbda355..9919450 100644
--- a/src/groundrecall/lint.py
+++ b/src/groundrecall/lint.py
@@ -24,6 +24,7 @@ def lint_import_directory(import_dir: str | Path) -> dict[str, Any]:
     base = Path(import_dir)
     manifest = _read_json(base / "manifest.json")
     artifacts = _read_jsonl(base / "artifacts.jsonl")
+    fragments = _read_jsonl(base / "fragments.jsonl")
     observations = _read_jsonl(base / "observations.jsonl")
     claims = _read_jsonl(base / "claims.jsonl")
"claims.jsonl") concepts = _read_jsonl(base / "concepts.jsonl") @@ -166,6 +167,7 @@ def lint_import_directory(import_dir: str | Path) -> dict[str, Any]: summary = { "artifact_count": len(artifacts), + "fragment_count": len(fragments), "observation_count": len(observations), "claim_count": len(claims), "concept_count": len(concepts), diff --git a/tests/fixtures/doclift_bundle_minimal/documents/lecture-1/document.chunks.json b/tests/fixtures/doclift_bundle_minimal/documents/lecture-1/document.chunks.json new file mode 100644 index 0000000..9a85993 --- /dev/null +++ b/tests/fixtures/doclift_bundle_minimal/documents/lecture-1/document.chunks.json @@ -0,0 +1,20 @@ +{ + "chunks": [ + { + "chunk_id": "lecture-1-c1", + "role": "summary", + "section": "Module A", + "line_start": 1, + "line_end": 4, + "text": "Lecture 1 introduces Module A and frames the example lesson." + }, + { + "chunk_id": "lecture-1-c2", + "role": "claim", + "section": "Lesson A", + "line_start": 5, + "line_end": 7, + "text": "Objective: Explain lesson A." + } + ] +} diff --git a/tests/test_groundrecall_import.py b/tests/test_groundrecall_import.py index 2fe12cb..1d7d58d 100644 --- a/tests/test_groundrecall_import.py +++ b/tests/test_groundrecall_import.py @@ -3,6 +3,7 @@ from __future__ import annotations import json from pathlib import Path +from groundrecall.groundrecall_normalizer import standardize_concept_rows from groundrecall.ingest import run_groundrecall_import from groundrecall.lint import lint_import_directory @@ -46,8 +47,13 @@ def test_groundrecall_import_emits_normalized_artifacts(tmp_path: Path) -> None: artifacts = _read_jsonl(result.out_dir / "artifacts.jsonl") assert {item["artifact_kind"] for item in artifacts} == {"compiled_page", "raw_note", "session_log"} + fragments = _read_jsonl(result.out_dir / "fragments.jsonl") + assert len(fragments) >= 3 + assert all(item["source_id"].startswith("ia_") for item in fragments) + claims = _read_jsonl(result.out_dir / "claims.jsonl") assert any("Reliable rate upper bound" in item["claim_text"] for item in claims) + assert any(item["supporting_fragment_ids"] for item in claims) concepts = _read_jsonl(result.out_dir / "concepts.jsonl") concept_ids = {item["concept_id"] for item in concepts} @@ -78,6 +84,49 @@ def test_groundrecall_import_emits_normalized_artifacts(tmp_path: Path) -> None: assert "citation_reviews" in review_data +def test_concept_standardization_merges_duplicate_titles_into_aliases() -> None: + concept_rows = [ + { + "concept_id": "concept::signal-processing", + "title": "Signal Processing", + "aliases": [], + "description": "", + "source_artifact_ids": ["ia_one"], + "current_status": "triaged", + }, + { + "concept_id": "concept::signal-processing-variant", + "title": "The Signal Processing", + "aliases": ["DSP"], + "description": "", + "source_artifact_ids": ["ia_two"], + "current_status": "triaged", + }, + ] + claim_rows = [ + { + "claim_id": "clm_1", + "concept_ids": ["concept::signal-processing-variant"], + } + ] + relation_rows = [ + { + "relation_id": "rel_1", + "source_id": "concept::signal-processing-variant", + "target_id": "concept::signal-processing", + } + ] + + concepts, claims, relations = standardize_concept_rows(concept_rows, claim_rows, relation_rows) + + assert len(concepts) == 1 + assert concepts[0]["concept_id"] == "concept::signal-processing" + assert concepts[0]["aliases"] == ["DSP", "The Signal Processing"] + assert concepts[0]["source_artifact_ids"] == ["ia_one", "ia_two"] + assert claims[0]["concept_ids"] == 
["concept::signal-processing"] + assert relations[0]["source_id"] == "concept::signal-processing" + + def test_groundrecall_import_parses_explicit_claim_relations(tmp_path: Path) -> None: root = tmp_path / "llmwiki" (root / "wiki").mkdir(parents=True) diff --git a/tests/test_groundrecall_source_adapters.py b/tests/test_groundrecall_source_adapters.py index 31b4852..a3df40c 100644 --- a/tests/test_groundrecall_source_adapters.py +++ b/tests/test_groundrecall_source_adapters.py @@ -216,8 +216,13 @@ def test_doclift_bundle_import_generates_structured_concepts(tmp_path: Path) -> assert result.manifest["import_intent"] == "both" assert result.manifest["source_root"] == "doclift_bundle_minimal" assert result.manifest["source_root_kind"] == "source_label" + assert result.manifest["fragment_count"] == 2 concept_ids = {item["concept_id"] for item in result.concepts} assert "concept::lecture-1" in concept_ids claim_ids = {item["claim_id"] for item in result.claims} assert "clm_doclift_1" in claim_ids + assert "clm_doclift_1_1" in claim_ids assert result.observations[0]["source_url"] == "legacy/lecture-1.doc" + assert len(result.fragments) == 2 + assert result.fragments[0]["metadata"]["source_kind"] == "doclift_chunk" + assert result.claims[1]["supporting_fragment_ids"] == ["frag_doclift_1_1"]