GroundRecall/src/groundrecall/groundrecall_review_bridge.py

139 lines
5.6 KiB
Python

from __future__ import annotations
import argparse
import json
from collections import defaultdict
from pathlib import Path
from typing import Any
from .review_export import build_citation_review_entries_from_import, export_review_state_json, export_review_ui_data
from .review_schema import ConceptReviewEntry, DraftPackData, ReviewSession
def _read_json(path: Path) -> dict[str, Any]:
return json.loads(path.read_text(encoding="utf-8"))
def _read_jsonl(path: Path) -> list[dict[str, Any]]:
if not path.exists():
return []
text = path.read_text(encoding="utf-8").strip()
if not text:
return []
return [json.loads(line) for line in text.splitlines()]
def _claim_summary(claims: list[dict[str, Any]]) -> list[str]:
lines: list[str] = []
for claim in claims[:3]:
grounding = claim.get("grounding_status", "unknown")
lines.append(f"Claim: {claim.get('claim_text', '')} [{grounding}]")
if len(claims) > 3:
lines.append(f"{len(claims) - 3} additional claims omitted from notes summary.")
return lines
def build_review_session_from_import(import_dir: str | Path, reviewer: str = "GroundRecall Import") -> ReviewSession:
    """Assemble a ReviewSession from a GroundRecall import directory.

    Reads manifest.json, lint_findings.json, claims.jsonl and concepts.jsonl
    under *import_dir*, groups claims and lint findings per concept, and
    produces one ConceptReviewEntry per concept. A concept becomes
    "provisional" only when it has at least one claim, every linked claim is
    grounded, and no error-severity finding touches it; otherwise its status
    stays "needs_review". Error findings feed the pack's conflicts list,
    warnings feed its review flags.
    """
    root = Path(import_dir)
    manifest = _read_json(root / "manifest.json")
    lint_data = _read_json(root / "lint_findings.json")
    claim_rows = _read_jsonl(root / "claims.jsonl")
    concept_rows = _read_jsonl(root / "concepts.jsonl")

    # Index claims by every concept id they reference.
    concept_claims: defaultdict[str, list[dict[str, Any]]] = defaultdict(list)
    for row in claim_rows:
        for cid in row.get("concept_ids", []):
            concept_claims[cid].append(row)

    # Lint findings keyed by the id they target (a claim id or a concept id).
    all_findings = lint_data.get("findings", [])
    by_target: defaultdict[str, list[dict[str, Any]]] = defaultdict(list)
    for item in all_findings:
        by_target[item["target_id"]].append(item)

    # Roll claim-level findings up onto every concept the claim mentions,
    # then append findings that target each concept directly.
    per_concept: defaultdict[str, list[dict[str, Any]]] = defaultdict(list)
    for row in claim_rows:
        for cid in row.get("concept_ids", []):
            per_concept[cid].extend(by_target.get(row["claim_id"], []))
    for row in concept_rows:
        per_concept[row["concept_id"]].extend(by_target.get(row["concept_id"], []))

    entries: list[ConceptReviewEntry] = []
    for row in concept_rows:
        cid = row["concept_id"]
        linked_claims = concept_claims.get(cid, [])
        linked_findings = per_concept.get(cid, [])
        blocked = any(f["severity"] == "error" for f in linked_findings)
        grounded = bool(linked_claims) and all(
            c.get("grounding_status") == "grounded" for c in linked_claims
        )
        notes = _claim_summary(linked_claims)
        # Cap the finding messages carried into the notes at five.
        notes.extend(f["message"] for f in linked_findings[:5])
        entries.append(
            ConceptReviewEntry(
                concept_id=cid.replace("concept::", "", 1),
                title=row.get("title", cid),
                description=row.get("description", ""),
                prerequisites=[],
                mastery_signals=[],
                status="provisional" if grounded and not blocked else "needs_review",
                notes=notes,
            )
        )

    conflicts = [f["message"] for f in all_findings if f["severity"] == "error"]
    review_flags = [f["message"] for f in all_findings if f["severity"] == "warning"]

    pack = {
        "name": f"groundrecall-import-{manifest['import_id']}",
        "display_name": f"GroundRecall Import {manifest['import_id']}",
        "version": "0.1.0-draft",
        "source_import_id": manifest["import_id"],
        "source_root": manifest.get("source_root", ""),
    }
    attribution = {
        "source_repo_kind": manifest.get("source_repo_kind", "llmwiki"),
        "source_root": manifest.get("source_root", ""),
        "imported_at": manifest.get("imported_at", ""),
        "machine_id": manifest.get("machine_id", ""),
        "rights_note": "Imported llmwiki-style corpus requires review before promotion.",
    }
    return ReviewSession(
        reviewer=reviewer,
        draft_pack=DraftPackData(
            pack=pack,
            concepts=entries,
            conflicts=conflicts,
            review_flags=review_flags,
            attribution=attribution,
        ),
        citation_reviews=build_citation_review_entries_from_import(root),
    )
def export_review_bundle_from_import(import_dir: str | Path, out_dir: str | Path | None = None, reviewer: str = "GroundRecall Import") -> dict[str, str]:
    """Build and write the review artifacts for an import directory.

    Constructs a ReviewSession from *import_dir*, then writes
    review_session.json and the review UI data into *out_dir* (created if
    needed; defaults to the import directory itself). Returns the paths of
    the emitted JSON files keyed by artifact name.
    """
    source = Path(import_dir)
    destination = source if out_dir is None else Path(out_dir)
    destination.mkdir(parents=True, exist_ok=True)

    session = build_review_session_from_import(source, reviewer=reviewer)
    session_path = destination / "review_session.json"
    export_review_state_json(session, session_path)
    export_review_ui_data(session, destination, import_dir=source)

    return {
        "review_session_json": str(session_path),
        "review_data_json": str(destination / "review_data.json"),
    }
def build_parser() -> argparse.ArgumentParser:
    """Construct the CLI argument parser for the review-bridge command.

    Positional: import_dir (the GroundRecall import to convert).
    Options: --out-dir (defaults to the import dir) and --reviewer
    (defaults to "GroundRecall Import").
    """
    parser = argparse.ArgumentParser(
        description="Build Didactopus review artifacts from a GroundRecall import."
    )
    parser.add_argument("import_dir")
    parser.add_argument("--out-dir", default=None)
    parser.add_argument("--reviewer", default="GroundRecall Import")
    return parser
def main() -> None:
    """CLI entry point: parse arguments, export the review bundle, print paths."""
    ns = build_parser().parse_args()
    result = export_review_bundle_from_import(
        ns.import_dir, out_dir=ns.out_dir, reviewer=ns.reviewer
    )
    print(json.dumps(result, indent=2))