Add import graph diagnostics to GroundRecall

This commit is contained in:
welsberr 2026-04-27 11:00:35 -04:00
parent a5efe0cccb
commit f8c760b735
5 changed files with 181 additions and 0 deletions

View File

@ -0,0 +1,149 @@
from __future__ import annotations
import json
from collections import defaultdict
from pathlib import Path
from typing import Any
def _read_jsonl(path: Path) -> list[dict[str, Any]]:
if not path.exists():
return []
text = path.read_text(encoding="utf-8").strip()
if not text:
return []
return [json.loads(line) for line in text.splitlines()]
def build_graph_diagnostics(
    concepts: list[dict[str, Any]],
    relations: list[dict[str, Any]],
) -> dict[str, Any]:
    """Compute connectivity diagnostics for a concept/relation graph.

    The graph is treated as undirected for component and bridge analysis,
    while inbound/outbound counts preserve relation direction.  Relations
    whose endpoints are not both known concepts are excluded from the
    graph, although ``relation_count`` in the summary still reports the
    raw total of relation rows received.
    """
    known_ids = {str(row["concept_id"]) for row in concepts}
    neighbors: dict[str, set[str]] = {cid: set() for cid in known_ids}
    inbound_counts: defaultdict[str, int] = defaultdict(int)
    outbound_counts: defaultdict[str, int] = defaultdict(int)
    for rel in relations:
        src = str(rel.get("source_id", ""))
        dst = str(rel.get("target_id", ""))
        # Skip dangling relations that reference unknown concepts.
        if src in known_ids and dst in known_ids:
            neighbors[src].add(dst)
            neighbors[dst].add(src)
            outbound_counts[src] += 1
            inbound_counts[dst] += 1
    components = _connected_components(neighbors)
    bridges = _bridge_concepts(neighbors, components)
    # Rank concepts by undirected degree, breaking ties on inbound count
    # (descending) and then concept id (ascending) for a stable order.
    ranked = [
        {
            "concept_id": cid,
            "degree": len(linked),
            "inbound_count": inbound_counts.get(cid, 0),
            "outbound_count": outbound_counts.get(cid, 0),
        }
        for cid, linked in neighbors.items()
    ]
    ranked.sort(key=lambda entry: (-entry["degree"], -entry["inbound_count"], entry["concept_id"]))
    # Largest components first; equally-sized components ordered by their ids.
    ordered_components = sorted(components, key=lambda ids: (-len(ids), ids))
    sizes = [len(ids) for ids in ordered_components]
    return {
        "summary": {
            "concept_count": len(concepts),
            "relation_count": len(relations),
            "connected_component_count": len(components),
            "largest_component_size": max(sizes, default=0),
            "isolated_concept_count": sizes.count(1),
            "bridge_concept_count": len(bridges),
        },
        "components": [
            {
                "component_id": f"component-{position}",
                "size": len(ids),
                "concept_ids": ids,
            }
            for position, ids in enumerate(ordered_components, start=1)
        ],
        "bridge_concepts": bridges,
        "top_connected_concepts": ranked[:10],
    }
def build_graph_diagnostics_from_import(import_dir: str | Path) -> dict[str, Any]:
    """Load an import directory's JSONL exports and build graph diagnostics.

    Reads ``concepts.jsonl`` and ``relations.jsonl`` (missing files yield
    empty lists).  When ``manifest.json`` exists, its ``import_id`` is
    attached to the returned payload (empty string if the key is absent).
    """
    root = Path(import_dir)
    payload = build_graph_diagnostics(
        _read_jsonl(root / "concepts.jsonl"),
        _read_jsonl(root / "relations.jsonl"),
    )
    manifest_file = root / "manifest.json"
    if manifest_file.exists():
        manifest_data = json.loads(manifest_file.read_text(encoding="utf-8"))
        payload["import_id"] = manifest_data.get("import_id", "")
    return payload
def _connected_components(adjacency: dict[str, set[str]]) -> list[list[str]]:
remaining = set(adjacency)
components: list[list[str]] = []
while remaining:
start = remaining.pop()
stack = [start]
component = {start}
while stack:
node = stack.pop()
for neighbor in adjacency.get(node, set()):
if neighbor in component:
continue
component.add(neighbor)
remaining.discard(neighbor)
stack.append(neighbor)
components.append(sorted(component))
return components
def _bridge_concepts(adjacency: dict[str, set[str]], components: list[list[str]]) -> list[dict[str, Any]]:
    """Find cut vertices: concepts whose removal disconnects their component.

    For each candidate the remainder of its component is walked from an
    arbitrary surviving node; if the walk cannot reach every survivor, the
    candidate is a bridge.  Components smaller than three nodes are skipped
    (removing a node from a pair cannot leave a disconnected remainder).
    Results are ordered largest-component-first, then by concept id.
    """
    findings: list[dict[str, Any]] = []
    for members in components:
        size = len(members)
        if size < 3:
            continue
        member_set = set(members)
        for candidate in members:
            survivors = member_set - {candidate}
            if not survivors:
                continue
            origin = next(iter(survivors))
            reached = _walk_component(origin, adjacency, blocked=candidate, allowed=survivors)
            if len(reached) < len(survivors):
                findings.append(
                    {
                        "concept_id": candidate,
                        "component_size": size,
                        "reachable_after_removal": len(reached),
                    }
                )
    return sorted(findings, key=lambda row: (-row["component_size"], row["concept_id"]))
def _walk_component(
start: str,
adjacency: dict[str, set[str]],
*,
blocked: str,
allowed: set[str],
) -> set[str]:
visited = {start}
stack = [start]
while stack:
node = stack.pop()
for neighbor in adjacency.get(node, set()):
if neighbor == blocked or neighbor not in allowed or neighbor in visited:
continue
visited.add(neighbor)
stack.append(neighbor)
return visited

View File

@ -13,6 +13,7 @@ from pathlib import Path
from typing import Any from typing import Any
from .groundrecall_discovery import DiscoveredArtifact from .groundrecall_discovery import DiscoveredArtifact
from .graph_diagnostics import build_graph_diagnostics
from .groundrecall_lint import lint_import_directory from .groundrecall_lint import lint_import_directory
from .groundrecall_normalizer import ( from .groundrecall_normalizer import (
ImportContext, ImportContext,
@ -227,6 +228,7 @@ def run_groundrecall_import(
_write_jsonl(output_dir / "claims.jsonl", claim_rows) _write_jsonl(output_dir / "claims.jsonl", claim_rows)
_write_jsonl(output_dir / "concepts.jsonl", concept_rows) _write_jsonl(output_dir / "concepts.jsonl", concept_rows)
_write_jsonl(output_dir / "relations.jsonl", relation_rows) _write_jsonl(output_dir / "relations.jsonl", relation_rows)
_write_json(output_dir / "graph_diagnostics.json", build_graph_diagnostics(concept_rows, relation_rows))
lint_payload = lint_import_directory(output_dir) lint_payload = lint_import_directory(output_dir)
_write_json(output_dir / "lint_findings.json", lint_payload) _write_json(output_dir / "lint_findings.json", lint_payload)
review_queue = build_review_queue(output_dir) review_queue = build_review_queue(output_dir)

View File

@ -312,6 +312,7 @@ def _build_import_review_payload(session: ReviewSession, import_dir: Path) -> di
resolved_source_root = _resolve_source_root(import_dir, manifest.get("source_root", "")) resolved_source_root = _resolve_source_root(import_dir, manifest.get("source_root", ""))
lint_payload = _read_json(import_dir / "lint_findings.json") lint_payload = _read_json(import_dir / "lint_findings.json")
queue_payload = _read_json(import_dir / "review_queue.json") queue_payload = _read_json(import_dir / "review_queue.json")
graph_payload = _read_json(import_dir / "graph_diagnostics.json")
artifacts = _read_jsonl(import_dir / "artifacts.jsonl") artifacts = _read_jsonl(import_dir / "artifacts.jsonl")
observations = _read_jsonl(import_dir / "observations.jsonl") observations = _read_jsonl(import_dir / "observations.jsonl")
claims = _read_jsonl(import_dir / "claims.jsonl") claims = _read_jsonl(import_dir / "claims.jsonl")
@ -390,6 +391,7 @@ def _build_import_review_payload(session: ReviewSession, import_dir: Path) -> di
"lint_summary": lint_payload.get("summary", {}), "lint_summary": lint_payload.get("summary", {}),
"queue_length": queue_payload.get("queue_length", 0), "queue_length": queue_payload.get("queue_length", 0),
"source_adapter": manifest.get("source_adapter", ""), "source_adapter": manifest.get("source_adapter", ""),
"graph_summary": graph_payload.get("summary", {}),
}, },
"review_guidance": { "review_guidance": {
"overview": ( "overview": (
@ -419,6 +421,7 @@ def _build_import_review_payload(session: ReviewSession, import_dir: Path) -> di
"concept_reviews": concept_reviews, "concept_reviews": concept_reviews,
"citation_reviews": [entry.model_dump() for entry in session.citation_reviews], "citation_reviews": [entry.model_dump() for entry in session.citation_reviews],
"bibliography": bibliography_summary_payload(resolved_source_root), "bibliography": bibliography_summary_payload(resolved_source_root),
"graph_diagnostics": graph_payload,
"citations": { "citations": {
"enabled": True, "enabled": True,
"provider": "citegeist" if artifact_citations and artifact_citations[0].get("citegeist_backends") else "none", "provider": "citegeist" if artifact_citations and artifact_citations[0].get("citegeist_backends") else "none",

View File

@ -5,6 +5,7 @@ from pathlib import Path
from groundrecall.groundrecall_normalizer import standardize_concept_rows from groundrecall.groundrecall_normalizer import standardize_concept_rows
from groundrecall.ingest import run_groundrecall_import from groundrecall.ingest import run_groundrecall_import
from groundrecall.graph_diagnostics import build_graph_diagnostics
from groundrecall.lint import lint_import_directory from groundrecall.lint import lint_import_directory
@ -62,6 +63,9 @@ def test_groundrecall_import_emits_normalized_artifacts(tmp_path: Path) -> None:
relations = _read_jsonl(result.out_dir / "relations.jsonl") relations = _read_jsonl(result.out_dir / "relations.jsonl")
assert any(item["target_id"] == "concept::shannon-entropy" for item in relations) assert any(item["target_id"] == "concept::shannon-entropy" for item in relations)
graph_diagnostics = json.loads((result.out_dir / "graph_diagnostics.json").read_text(encoding="utf-8"))
assert graph_diagnostics["summary"]["connected_component_count"] >= 1
assert graph_diagnostics["summary"]["concept_count"] == len(concepts)
lint_payload = json.loads((result.out_dir / "lint_findings.json").read_text(encoding="utf-8")) lint_payload = json.loads((result.out_dir / "lint_findings.json").read_text(encoding="utf-8"))
assert "summary" in lint_payload assert "summary" in lint_payload
@ -127,6 +131,26 @@ def test_concept_standardization_merges_duplicate_titles_into_aliases() -> None:
assert relations[0]["source_id"] == "concept::signal-processing" assert relations[0]["source_id"] == "concept::signal-processing"
def test_graph_diagnostics_detect_bridge_concepts() -> None:
diagnostics = build_graph_diagnostics(
concepts=[
{"concept_id": "concept::a"},
{"concept_id": "concept::b"},
{"concept_id": "concept::c"},
{"concept_id": "concept::d"},
],
relations=[
{"source_id": "concept::a", "target_id": "concept::b"},
{"source_id": "concept::b", "target_id": "concept::c"},
{"source_id": "concept::c", "target_id": "concept::d"},
],
)
assert diagnostics["summary"]["connected_component_count"] == 1
assert diagnostics["summary"]["bridge_concept_count"] == 2
assert [item["concept_id"] for item in diagnostics["bridge_concepts"]] == ["concept::b", "concept::c"]
def test_groundrecall_import_parses_explicit_claim_relations(tmp_path: Path) -> None: def test_groundrecall_import_parses_explicit_claim_relations(tmp_path: Path) -> None:
root = tmp_path / "llmwiki" root = tmp_path / "llmwiki"
(root / "wiki").mkdir(parents=True) (root / "wiki").mkdir(parents=True)

View File

@ -55,6 +55,9 @@ def test_review_workspace_populates_and_persists_citation_reviews(tmp_path: Path
review_data = json.loads((import_result.out_dir / "review_data.json").read_text(encoding="utf-8")) review_data = json.loads((import_result.out_dir / "review_data.json").read_text(encoding="utf-8"))
assert any(item["citation_review_id"] == citation_review_id for item in review_data["citation_reviews"]) assert any(item["citation_review_id"] == citation_review_id for item in review_data["citation_reviews"])
assert "graph_diagnostics" in review_data
assert "graph_summary" in review_data["import_context"]
assert review_data["graph_diagnostics"]["summary"]["concept_count"] >= 1
def test_review_workspace_resolves_citation_metadata_from_bibtex(tmp_path: Path) -> None: def test_review_workspace_resolves_citation_metadata_from_bibtex(tmp_path: Path) -> None: