Add import graph diagnostics to GroundRecall
This commit is contained in:
parent
a5efe0cccb
commit
f8c760b735
|
|
@ -0,0 +1,149 @@
|
|||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
from collections import defaultdict
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
|
||||
def _read_jsonl(path: Path) -> list[dict[str, Any]]:
|
||||
if not path.exists():
|
||||
return []
|
||||
text = path.read_text(encoding="utf-8").strip()
|
||||
if not text:
|
||||
return []
|
||||
return [json.loads(line) for line in text.splitlines()]
|
||||
|
||||
|
||||
def build_graph_diagnostics(
    concepts: list[dict[str, Any]],
    relations: list[dict[str, Any]],
) -> dict[str, Any]:
    """Summarize the connectivity of the concept/relation graph.

    Builds an undirected adjacency view of *relations*, restricted to ids
    present in *concepts*, then reports connected-component structure,
    bridge (articulation) concepts, and the ten most-connected concepts.
    Relations referencing unknown concept ids are ignored for adjacency
    but still counted in ``summary.relation_count``.
    """
    known_ids = {str(row["concept_id"]) for row in concepts}
    neighbors_of: dict[str, set[str]] = {cid: set() for cid in known_ids}
    in_counts: defaultdict[str, int] = defaultdict(int)
    out_counts: defaultdict[str, int] = defaultdict(int)

    for edge in relations:
        src = str(edge.get("source_id", ""))
        dst = str(edge.get("target_id", ""))
        # Skip edges with either endpoint outside the known concept set.
        if src in known_ids and dst in known_ids:
            neighbors_of[src].add(dst)
            neighbors_of[dst].add(src)
            out_counts[src] += 1
            in_counts[dst] += 1

    components = _connected_components(neighbors_of)
    bridges = _bridge_concepts(neighbors_of, components)

    ranked = [
        {
            "concept_id": cid,
            "degree": len(linked),
            "inbound_count": in_counts.get(cid, 0),
            "outbound_count": out_counts.get(cid, 0),
        }
        for cid, linked in neighbors_of.items()
    ]
    ranked.sort(key=lambda row: (-row["degree"], -row["inbound_count"], row["concept_id"]))

    # Largest components first; ties broken lexicographically by member ids.
    ordered_components = sorted(components, key=lambda ids: (-len(ids), ids))

    return {
        "summary": {
            "concept_count": len(concepts),
            "relation_count": len(relations),
            "connected_component_count": len(components),
            "largest_component_size": max((len(ids) for ids in components), default=0),
            "isolated_concept_count": sum(1 for ids in components if len(ids) == 1),
            "bridge_concept_count": len(bridges),
        },
        "components": [
            {
                "component_id": f"component-{position}",
                "size": len(ids),
                "concept_ids": ids,
            }
            for position, ids in enumerate(ordered_components, start=1)
        ],
        "bridge_concepts": bridges,
        "top_connected_concepts": ranked[:10],
    }
|
||||
|
||||
|
||||
def build_graph_diagnostics_from_import(import_dir: str | Path) -> dict[str, Any]:
    """Compute graph diagnostics for an on-disk import directory.

    Reads ``concepts.jsonl`` and ``relations.jsonl`` under *import_dir*
    and, when a ``manifest.json`` is present alongside them, stamps the
    resulting payload with the manifest's ``import_id``.
    """
    root = Path(import_dir)
    payload = build_graph_diagnostics(
        _read_jsonl(root / "concepts.jsonl"),
        _read_jsonl(root / "relations.jsonl"),
    )
    manifest_file = root / "manifest.json"
    if manifest_file.exists():
        manifest = json.loads(manifest_file.read_text(encoding="utf-8"))
        payload["import_id"] = manifest.get("import_id", "")
    return payload
|
||||
|
||||
|
||||
def _connected_components(adjacency: dict[str, set[str]]) -> list[list[str]]:
|
||||
remaining = set(adjacency)
|
||||
components: list[list[str]] = []
|
||||
while remaining:
|
||||
start = remaining.pop()
|
||||
stack = [start]
|
||||
component = {start}
|
||||
while stack:
|
||||
node = stack.pop()
|
||||
for neighbor in adjacency.get(node, set()):
|
||||
if neighbor in component:
|
||||
continue
|
||||
component.add(neighbor)
|
||||
remaining.discard(neighbor)
|
||||
stack.append(neighbor)
|
||||
components.append(sorted(component))
|
||||
return components
|
||||
|
||||
|
||||
def _bridge_concepts(adjacency: dict[str, set[str]], components: list[list[str]]) -> list[dict[str, Any]]:
    """Identify articulation ("bridge") concepts within each component.

    A concept is a bridge when removing it leaves the rest of its
    component disconnected. Components with fewer than three nodes cannot
    contain one and are skipped. Results are ordered by descending
    component size, then by concept id.
    """
    found: list[dict[str, Any]] = []
    for members in components:
        if len(members) < 3:
            continue
        size = len(members)
        member_set = set(members)
        for candidate in members:
            others = member_set - {candidate}
            if not others:
                continue
            seed = next(iter(others))
            reached = _walk_component(seed, adjacency, blocked=candidate, allowed=others)
            # Reaching every remaining node means removal of the candidate
            # does not split the component — not a bridge.
            if len(reached) < len(others):
                found.append(
                    {
                        "concept_id": candidate,
                        "component_size": size,
                        "reachable_after_removal": len(reached),
                    }
                )
    return sorted(found, key=lambda row: (-row["component_size"], row["concept_id"]))
|
||||
|
||||
|
||||
def _walk_component(
|
||||
start: str,
|
||||
adjacency: dict[str, set[str]],
|
||||
*,
|
||||
blocked: str,
|
||||
allowed: set[str],
|
||||
) -> set[str]:
|
||||
visited = {start}
|
||||
stack = [start]
|
||||
while stack:
|
||||
node = stack.pop()
|
||||
for neighbor in adjacency.get(node, set()):
|
||||
if neighbor == blocked or neighbor not in allowed or neighbor in visited:
|
||||
continue
|
||||
visited.add(neighbor)
|
||||
stack.append(neighbor)
|
||||
return visited
|
||||
|
|
@ -13,6 +13,7 @@ from pathlib import Path
|
|||
from typing import Any
|
||||
|
||||
from .groundrecall_discovery import DiscoveredArtifact
|
||||
from .graph_diagnostics import build_graph_diagnostics
|
||||
from .groundrecall_lint import lint_import_directory
|
||||
from .groundrecall_normalizer import (
|
||||
ImportContext,
|
||||
|
|
@ -227,6 +228,7 @@ def run_groundrecall_import(
|
|||
_write_jsonl(output_dir / "claims.jsonl", claim_rows)
|
||||
_write_jsonl(output_dir / "concepts.jsonl", concept_rows)
|
||||
_write_jsonl(output_dir / "relations.jsonl", relation_rows)
|
||||
_write_json(output_dir / "graph_diagnostics.json", build_graph_diagnostics(concept_rows, relation_rows))
|
||||
lint_payload = lint_import_directory(output_dir)
|
||||
_write_json(output_dir / "lint_findings.json", lint_payload)
|
||||
review_queue = build_review_queue(output_dir)
|
||||
|
|
|
|||
|
|
@ -312,6 +312,7 @@ def _build_import_review_payload(session: ReviewSession, import_dir: Path) -> di
|
|||
resolved_source_root = _resolve_source_root(import_dir, manifest.get("source_root", ""))
|
||||
lint_payload = _read_json(import_dir / "lint_findings.json")
|
||||
queue_payload = _read_json(import_dir / "review_queue.json")
|
||||
graph_payload = _read_json(import_dir / "graph_diagnostics.json")
|
||||
artifacts = _read_jsonl(import_dir / "artifacts.jsonl")
|
||||
observations = _read_jsonl(import_dir / "observations.jsonl")
|
||||
claims = _read_jsonl(import_dir / "claims.jsonl")
|
||||
|
|
@ -390,6 +391,7 @@ def _build_import_review_payload(session: ReviewSession, import_dir: Path) -> di
|
|||
"lint_summary": lint_payload.get("summary", {}),
|
||||
"queue_length": queue_payload.get("queue_length", 0),
|
||||
"source_adapter": manifest.get("source_adapter", ""),
|
||||
"graph_summary": graph_payload.get("summary", {}),
|
||||
},
|
||||
"review_guidance": {
|
||||
"overview": (
|
||||
|
|
@ -419,6 +421,7 @@ def _build_import_review_payload(session: ReviewSession, import_dir: Path) -> di
|
|||
"concept_reviews": concept_reviews,
|
||||
"citation_reviews": [entry.model_dump() for entry in session.citation_reviews],
|
||||
"bibliography": bibliography_summary_payload(resolved_source_root),
|
||||
"graph_diagnostics": graph_payload,
|
||||
"citations": {
|
||||
"enabled": True,
|
||||
"provider": "citegeist" if artifact_citations and artifact_citations[0].get("citegeist_backends") else "none",
|
||||
|
|
|
|||
|
|
@ -5,6 +5,7 @@ from pathlib import Path
|
|||
|
||||
from groundrecall.groundrecall_normalizer import standardize_concept_rows
|
||||
from groundrecall.ingest import run_groundrecall_import
|
||||
from groundrecall.graph_diagnostics import build_graph_diagnostics
|
||||
from groundrecall.lint import lint_import_directory
|
||||
|
||||
|
||||
|
|
@ -62,6 +63,9 @@ def test_groundrecall_import_emits_normalized_artifacts(tmp_path: Path) -> None:
|
|||
|
||||
relations = _read_jsonl(result.out_dir / "relations.jsonl")
|
||||
assert any(item["target_id"] == "concept::shannon-entropy" for item in relations)
|
||||
graph_diagnostics = json.loads((result.out_dir / "graph_diagnostics.json").read_text(encoding="utf-8"))
|
||||
assert graph_diagnostics["summary"]["connected_component_count"] >= 1
|
||||
assert graph_diagnostics["summary"]["concept_count"] == len(concepts)
|
||||
|
||||
lint_payload = json.loads((result.out_dir / "lint_findings.json").read_text(encoding="utf-8"))
|
||||
assert "summary" in lint_payload
|
||||
|
|
@ -127,6 +131,26 @@ def test_concept_standardization_merges_duplicate_titles_into_aliases() -> None:
|
|||
assert relations[0]["source_id"] == "concept::signal-processing"
|
||||
|
||||
|
||||
def test_graph_diagnostics_detect_bridge_concepts() -> None:
|
||||
diagnostics = build_graph_diagnostics(
|
||||
concepts=[
|
||||
{"concept_id": "concept::a"},
|
||||
{"concept_id": "concept::b"},
|
||||
{"concept_id": "concept::c"},
|
||||
{"concept_id": "concept::d"},
|
||||
],
|
||||
relations=[
|
||||
{"source_id": "concept::a", "target_id": "concept::b"},
|
||||
{"source_id": "concept::b", "target_id": "concept::c"},
|
||||
{"source_id": "concept::c", "target_id": "concept::d"},
|
||||
],
|
||||
)
|
||||
|
||||
assert diagnostics["summary"]["connected_component_count"] == 1
|
||||
assert diagnostics["summary"]["bridge_concept_count"] == 2
|
||||
assert [item["concept_id"] for item in diagnostics["bridge_concepts"]] == ["concept::b", "concept::c"]
|
||||
|
||||
|
||||
def test_groundrecall_import_parses_explicit_claim_relations(tmp_path: Path) -> None:
|
||||
root = tmp_path / "llmwiki"
|
||||
(root / "wiki").mkdir(parents=True)
|
||||
|
|
|
|||
|
|
@ -55,6 +55,9 @@ def test_review_workspace_populates_and_persists_citation_reviews(tmp_path: Path
|
|||
|
||||
review_data = json.loads((import_result.out_dir / "review_data.json").read_text(encoding="utf-8"))
|
||||
assert any(item["citation_review_id"] == citation_review_id for item in review_data["citation_reviews"])
|
||||
assert "graph_diagnostics" in review_data
|
||||
assert "graph_summary" in review_data["import_context"]
|
||||
assert review_data["graph_diagnostics"]["summary"]["concept_count"] >= 1
|
||||
|
||||
|
||||
def test_review_workspace_resolves_citation_metadata_from_bibtex(tmp_path: Path) -> None:
|
||||
|
|
|
|||
Loading…
Reference in New Issue