from __future__ import annotations

import argparse
import json
from collections import defaultdict
from pathlib import Path
from typing import Any

from .review_export import build_citation_review_entries_from_import, export_review_state_json, export_review_ui_data
from .review_schema import ConceptReviewEntry, DraftPackData, ReviewSession
def _read_json(path: Path) -> dict[str, Any]:
|
|
return json.loads(path.read_text(encoding="utf-8"))
|
|
|
|
|
|
def _read_jsonl(path: Path) -> list[dict[str, Any]]:
|
|
if not path.exists():
|
|
return []
|
|
text = path.read_text(encoding="utf-8").strip()
|
|
if not text:
|
|
return []
|
|
return [json.loads(line) for line in text.splitlines()]
|
|
|
|
|
|
def _claim_summary(claims: list[dict[str, Any]]) -> list[str]:
|
|
lines: list[str] = []
|
|
for claim in claims[:3]:
|
|
grounding = claim.get("grounding_status", "unknown")
|
|
lines.append(f"Claim: {claim.get('claim_text', '')} [{grounding}]")
|
|
if len(claims) > 3:
|
|
lines.append(f"{len(claims) - 3} additional claims omitted from notes summary.")
|
|
return lines
|
|
|
|
|
|
def _group_claims_by_concept(claims: list[dict[str, Any]]) -> defaultdict[str, list[dict[str, Any]]]:
    """Index claims by each concept id listed in their ``concept_ids``."""
    grouped: defaultdict[str, list[dict[str, Any]]] = defaultdict(list)
    for claim in claims:
        for concept_id in claim.get("concept_ids", []):
            grouped[concept_id].append(claim)
    return grouped


def _collect_concept_findings(
    claims: list[dict[str, Any]],
    concepts: list[dict[str, Any]],
    findings: list[dict[str, Any]],
) -> defaultdict[str, list[dict[str, Any]]]:
    """Attach lint findings to concepts: indirectly via claims, then directly.

    A finding whose ``target_id`` is a claim id reaches every concept that
    claim references; a finding targeting a concept id attaches directly.
    """
    by_target: defaultdict[str, list[dict[str, Any]]] = defaultdict(list)
    for finding in findings:
        by_target[finding["target_id"]].append(finding)

    per_concept: defaultdict[str, list[dict[str, Any]]] = defaultdict(list)
    for claim in claims:
        for concept_id in claim.get("concept_ids", []):
            per_concept[concept_id].extend(by_target.get(claim["claim_id"], []))
    for concept in concepts:
        per_concept[concept["concept_id"]].extend(by_target.get(concept["concept_id"], []))
    return per_concept


def _build_concept_entries(
    concepts: list[dict[str, Any]],
    claims_by_concept: dict[str, list[dict[str, Any]]],
    concept_findings: dict[str, list[dict[str, Any]]],
) -> list[ConceptReviewEntry]:
    """Build one review entry per concept with a derived status and notes.

    A concept is "provisional" only when it has at least one claim, every
    claim is grounded, and no attached finding is an error; otherwise it
    stays "needs_review".
    """
    entries: list[ConceptReviewEntry] = []
    for concept in concepts:
        concept_id = concept["concept_id"]
        related_claims = claims_by_concept.get(concept_id, [])
        related_findings = concept_findings.get(concept_id, [])

        has_errors = any(item["severity"] == "error" for item in related_findings)
        all_grounded = bool(related_claims) and all(
            item.get("grounding_status") == "grounded" for item in related_claims
        )
        status = "provisional" if (not has_errors and all_grounded) else "needs_review"

        # Notes: a short claim summary followed by at most five finding messages.
        notes = _claim_summary(related_claims)
        notes.extend(item["message"] for item in related_findings[:5])

        entries.append(
            ConceptReviewEntry(
                # Strip the "concept::" namespace prefix for display/export.
                concept_id=concept_id.replace("concept::", "", 1),
                title=concept.get("title", concept_id),
                description=concept.get("description", ""),
                prerequisites=[],
                mastery_signals=[],
                status=status,
                notes=notes,
            )
        )
    return entries


def build_review_session_from_import(import_dir: str | Path, reviewer: str = "GroundRecall Import") -> ReviewSession:
    """Assemble a :class:`ReviewSession` from a GroundRecall import directory.

    Reads ``manifest.json``, ``lint_findings.json``, ``claims.jsonl`` and
    ``concepts.jsonl`` under *import_dir*, derives per-concept review entries,
    and packages them with pack metadata, attribution, and citation reviews.

    Args:
        import_dir: Directory containing the import artifacts.
        reviewer: Name recorded on the resulting session.

    Raises:
        KeyError: If the manifest lacks ``import_id`` or a finding lacks
            ``target_id``/``severity``/``message``.
        FileNotFoundError: If ``manifest.json`` or ``lint_findings.json``
            is missing.
    """
    base = Path(import_dir)
    manifest = _read_json(base / "manifest.json")
    lint_payload = _read_json(base / "lint_findings.json")
    claims = _read_jsonl(base / "claims.jsonl")
    concepts = _read_jsonl(base / "concepts.jsonl")

    # Hoist the findings list once instead of re-fetching it per use.
    findings = lint_payload.get("findings", [])

    claims_by_concept = _group_claims_by_concept(claims)
    concept_findings = _collect_concept_findings(claims, concepts, findings)
    entries = _build_concept_entries(concepts, claims_by_concept, concept_findings)

    # Single pass: errors become conflicts, warnings become review flags.
    conflicts: list[str] = []
    review_flags: list[str] = []
    for item in findings:
        if item["severity"] == "error":
            conflicts.append(item["message"])
        elif item["severity"] == "warning":
            review_flags.append(item["message"])

    pack = {
        "name": f"groundrecall-import-{manifest['import_id']}",
        "display_name": f"GroundRecall Import {manifest['import_id']}",
        "version": "0.1.0-draft",
        "source_import_id": manifest["import_id"],
        "source_root": manifest.get("source_root", ""),
    }
    attribution = {
        "source_repo_kind": manifest.get("source_repo_kind", "llmwiki"),
        "source_root": manifest.get("source_root", ""),
        "imported_at": manifest.get("imported_at", ""),
        "machine_id": manifest.get("machine_id", ""),
        "rights_note": "Imported llmwiki-style corpus requires review before promotion.",
    }
    return ReviewSession(
        reviewer=reviewer,
        draft_pack=DraftPackData(
            pack=pack,
            concepts=entries,
            conflicts=conflicts,
            review_flags=review_flags,
            attribution=attribution,
        ),
        citation_reviews=build_citation_review_entries_from_import(base),
    )
def export_review_bundle_from_import(import_dir: str | Path, out_dir: str | Path | None = None, reviewer: str = "GroundRecall Import") -> dict[str, str]:
    """Build a review session from *import_dir* and write its artifacts.

    Writes ``review_session.json`` (and the review UI data files) into
    *out_dir*, defaulting to the import directory itself. Returns a mapping
    of artifact names to the output file paths as strings.
    """
    source = Path(import_dir)
    destination = source if out_dir is None else Path(out_dir)
    destination.mkdir(parents=True, exist_ok=True)

    session = build_review_session_from_import(source, reviewer=reviewer)
    session_path = destination / "review_session.json"
    export_review_state_json(session, session_path)
    export_review_ui_data(session, destination, import_dir=source)

    return {
        "review_session_json": str(session_path),
        "review_data_json": str(destination / "review_data.json"),
    }
def build_parser() -> argparse.ArgumentParser:
    """Create the CLI parser: a positional import directory plus optional
    output directory and reviewer name."""
    arg_parser = argparse.ArgumentParser(
        description="Build Didactopus review artifacts from a GroundRecall import.",
    )
    arg_parser.add_argument("import_dir")
    arg_parser.add_argument("--out-dir", default=None)
    arg_parser.add_argument("--reviewer", default="GroundRecall Import")
    return arg_parser
def main() -> None:
    """CLI entry point: export the review bundle and print output paths as JSON."""
    namespace = build_parser().parse_args()
    outputs = export_review_bundle_from_import(
        namespace.import_dir,
        out_dir=namespace.out_dir,
        reviewer=namespace.reviewer,
    )
    print(json.dumps(outputs, indent=2))