diff --git a/src/groundrecall/graph_diagnostics.py b/src/groundrecall/graph_diagnostics.py new file mode 100644 index 0000000..5b30bd2 --- /dev/null +++ b/src/groundrecall/graph_diagnostics.py @@ -0,0 +1,149 @@ +from __future__ import annotations + +import json +from collections import defaultdict +from pathlib import Path +from typing import Any + + +def _read_jsonl(path: Path) -> list[dict[str, Any]]: + if not path.exists(): + return [] + text = path.read_text(encoding="utf-8").strip() + if not text: + return [] + return [json.loads(line) for line in text.splitlines() if line.strip()] + + +def build_graph_diagnostics( + concepts: list[dict[str, Any]], + relations: list[dict[str, Any]], +) -> dict[str, Any]: + concept_ids = {str(item["concept_id"]) for item in concepts} + adjacency: dict[str, set[str]] = {concept_id: set() for concept_id in concept_ids} + inbound: defaultdict[str, int] = defaultdict(int) + outbound: defaultdict[str, int] = defaultdict(int) + + for relation in relations: + source_id = str(relation.get("source_id", "")) + target_id = str(relation.get("target_id", "")) + if source_id not in concept_ids or target_id not in concept_ids: + continue + adjacency[source_id].add(target_id) + adjacency[target_id].add(source_id) + outbound[source_id] += 1 + inbound[target_id] += 1 + + components = _connected_components(adjacency) + bridges = _bridge_concepts(adjacency, components) + degree_ranked = sorted( + ( + { + "concept_id": concept_id, + "degree": len(neighbors), + "inbound_count": inbound.get(concept_id, 0), + "outbound_count": outbound.get(concept_id, 0), + } + for concept_id, neighbors in adjacency.items() + ), + key=lambda item: (-item["degree"], -item["inbound_count"], item["concept_id"]), + ) + + return { + "summary": { + "concept_count": len(concepts), + "relation_count": len(relations), + "connected_component_count": len(components), + "largest_component_size": max((len(component) for component in components), default=0), + "isolated_concept_count": sum(1 for 
component in components if len(component) == 1), + "bridge_concept_count": len(bridges), + }, + "components": [ + { + "component_id": f"component-{index}", + "size": len(component), + "concept_ids": component, + } + for index, component in enumerate( + sorted(components, key=lambda item: (-len(item), item)), + start=1, + ) + ], + "bridge_concepts": bridges, + "top_connected_concepts": degree_ranked[:10], + } + + +def build_graph_diagnostics_from_import(import_dir: str | Path) -> dict[str, Any]: + base = Path(import_dir) + concepts = _read_jsonl(base / "concepts.jsonl") + relations = _read_jsonl(base / "relations.jsonl") + diagnostics = build_graph_diagnostics(concepts, relations) + manifest_path = base / "manifest.json" + if manifest_path.exists(): + manifest = json.loads(manifest_path.read_text(encoding="utf-8")) + diagnostics["import_id"] = manifest.get("import_id", "") + return diagnostics + + +def _connected_components(adjacency: dict[str, set[str]]) -> list[list[str]]: + remaining = set(adjacency) + components: list[list[str]] = [] + while remaining: + start = remaining.pop() + stack = [start] + component = {start} + while stack: + node = stack.pop() + for neighbor in adjacency.get(node, set()): + if neighbor in component: + continue + component.add(neighbor) + remaining.discard(neighbor) + stack.append(neighbor) + components.append(sorted(component)) + return components + + +def _bridge_concepts(adjacency: dict[str, set[str]], components: list[list[str]]) -> list[dict[str, Any]]: + bridge_payloads: list[dict[str, Any]] = [] + for component in components: + if len(component) < 3: + continue + baseline_size = len(component) + component_set = set(component) + for concept_id in component: + remaining = component_set - {concept_id} + if not remaining: + continue + first = min(remaining) + visited = _walk_component(first, adjacency, blocked=concept_id, allowed=remaining) + if len(visited) == len(remaining): + continue + bridge_payloads.append( + { + 
"concept_id": concept_id, + "component_size": baseline_size, + "reachable_after_removal": len(visited), + } + ) + return sorted(bridge_payloads, key=lambda item: (-item["component_size"], item["concept_id"])) + + +def _walk_component( + start: str, + adjacency: dict[str, set[str]], + *, + blocked: str, + allowed: set[str], +) -> set[str]: + visited = {start} + stack = [start] + while stack: + node = stack.pop() + for neighbor in adjacency.get(node, set()): + if neighbor == blocked or neighbor not in allowed or neighbor in visited: + continue + visited.add(neighbor) + stack.append(neighbor) + return visited diff --git a/src/groundrecall/ingest.py b/src/groundrecall/ingest.py index 0874775..364fe39 100644 --- a/src/groundrecall/ingest.py +++ b/src/groundrecall/ingest.py @@ -13,6 +13,7 @@ from pathlib import Path from typing import Any from .groundrecall_discovery import DiscoveredArtifact +from .graph_diagnostics import build_graph_diagnostics from .groundrecall_lint import lint_import_directory from .groundrecall_normalizer import ( ImportContext, @@ -227,6 +228,7 @@ def run_groundrecall_import( _write_jsonl(output_dir / "claims.jsonl", claim_rows) _write_jsonl(output_dir / "concepts.jsonl", concept_rows) _write_jsonl(output_dir / "relations.jsonl", relation_rows) + _write_json(output_dir / "graph_diagnostics.json", build_graph_diagnostics(concept_rows, relation_rows)) lint_payload = lint_import_directory(output_dir) _write_json(output_dir / "lint_findings.json", lint_payload) review_queue = build_review_queue(output_dir) diff --git a/src/groundrecall/review_export.py b/src/groundrecall/review_export.py index 7cf6abd..f4fba09 100644 --- a/src/groundrecall/review_export.py +++ b/src/groundrecall/review_export.py @@ -312,6 +312,7 @@ def _build_import_review_payload(session: ReviewSession, import_dir: Path) -> di resolved_source_root = _resolve_source_root(import_dir, manifest.get("source_root", "")) lint_payload = _read_json(import_dir / "lint_findings.json") 
queue_payload = _read_json(import_dir / "review_queue.json") + graph_payload = _read_json(import_dir / "graph_diagnostics.json") artifacts = _read_jsonl(import_dir / "artifacts.jsonl") observations = _read_jsonl(import_dir / "observations.jsonl") claims = _read_jsonl(import_dir / "claims.jsonl") @@ -390,6 +391,7 @@ def _build_import_review_payload(session: ReviewSession, import_dir: Path) -> di "lint_summary": lint_payload.get("summary", {}), "queue_length": queue_payload.get("queue_length", 0), "source_adapter": manifest.get("source_adapter", ""), + "graph_summary": graph_payload.get("summary", {}), }, "review_guidance": { "overview": ( @@ -419,6 +421,7 @@ def _build_import_review_payload(session: ReviewSession, import_dir: Path) -> di "concept_reviews": concept_reviews, "citation_reviews": [entry.model_dump() for entry in session.citation_reviews], "bibliography": bibliography_summary_payload(resolved_source_root), + "graph_diagnostics": graph_payload, "citations": { "enabled": True, "provider": "citegeist" if artifact_citations and artifact_citations[0].get("citegeist_backends") else "none", diff --git a/tests/test_groundrecall_import.py b/tests/test_groundrecall_import.py index 1d7d58d..1e278d8 100644 --- a/tests/test_groundrecall_import.py +++ b/tests/test_groundrecall_import.py @@ -5,6 +5,7 @@ from pathlib import Path from groundrecall.groundrecall_normalizer import standardize_concept_rows from groundrecall.ingest import run_groundrecall_import +from groundrecall.graph_diagnostics import build_graph_diagnostics from groundrecall.lint import lint_import_directory @@ -62,6 +63,9 @@ def test_groundrecall_import_emits_normalized_artifacts(tmp_path: Path) -> None: relations = _read_jsonl(result.out_dir / "relations.jsonl") assert any(item["target_id"] == "concept::shannon-entropy" for item in relations) + graph_diagnostics = json.loads((result.out_dir / "graph_diagnostics.json").read_text(encoding="utf-8")) + assert 
graph_diagnostics["summary"]["connected_component_count"] >= 1 + assert graph_diagnostics["summary"]["concept_count"] == len(concepts) lint_payload = json.loads((result.out_dir / "lint_findings.json").read_text(encoding="utf-8")) assert "summary" in lint_payload @@ -127,6 +131,26 @@ def test_concept_standardization_merges_duplicate_titles_into_aliases() -> None: assert relations[0]["source_id"] == "concept::signal-processing" + +def test_graph_diagnostics_detect_bridge_concepts() -> None: + diagnostics = build_graph_diagnostics( + concepts=[ + {"concept_id": "concept::a"}, + {"concept_id": "concept::b"}, + {"concept_id": "concept::c"}, + {"concept_id": "concept::d"}, + ], + relations=[ + {"source_id": "concept::a", "target_id": "concept::b"}, + {"source_id": "concept::b", "target_id": "concept::c"}, + {"source_id": "concept::c", "target_id": "concept::d"}, + ], + ) + + assert diagnostics["summary"]["connected_component_count"] == 1 + assert diagnostics["summary"]["bridge_concept_count"] == 2 + assert [item["concept_id"] for item in diagnostics["bridge_concepts"]] == ["concept::b", "concept::c"] + + def test_groundrecall_import_parses_explicit_claim_relations(tmp_path: Path) -> None: root = tmp_path / "llmwiki" (root / "wiki").mkdir(parents=True) diff --git a/tests/test_groundrecall_review_workspace.py b/tests/test_groundrecall_review_workspace.py index 9cef3ca..ffea701 100644 --- a/tests/test_groundrecall_review_workspace.py +++ b/tests/test_groundrecall_review_workspace.py @@ -55,6 +55,9 @@ def test_review_workspace_populates_and_persists_citation_reviews(tmp_path: Path review_data = json.loads((import_result.out_dir / "review_data.json").read_text(encoding="utf-8")) assert any(item["citation_review_id"] == citation_review_id for item in review_data["citation_reviews"]) + assert "graph_diagnostics" in review_data + assert "graph_summary" in review_data["import_context"] + assert review_data["graph_diagnostics"]["summary"]["concept_count"] >= 1 def 
test_review_workspace_resolves_citation_metadata_from_bibtex(tmp_path: Path) -> None: