GroundRecall/src/groundrecall/promotion.py

329 lines
13 KiB
Python

from __future__ import annotations
import argparse
import json
import sys
from datetime import datetime, timezone
from pathlib import Path
from typing import Any
from .models import (
ArtifactRecord,
ClaimRecord,
ConceptRecord,
ObservationRecord,
PromotionRecord,
ProvenanceRecord,
RelationRecord,
ReviewCandidateRecord,
)
from .review_schema import ReviewSession
from .store import GroundRecallStore
class PromotionGateError(RuntimeError):
    """Raised when an import is not eligible for promotion.

    Attributes:
        payload: Gate details supplied by the raiser (the lint error summary
            when raised by the promotion gate), kept so callers can report
            why promotion was refused.
    """

    def __init__(self, message: str, payload: dict[str, Any]) -> None:
        """Store the human-readable *message* and the structured *payload*."""
        super().__init__(message)
        # Kept on the instance so handlers can serialize the gate details.
        self.payload: dict[str, Any] = payload
def _read_json(path: Path) -> dict[str, Any]:
    """Decode the UTF-8 JSON document stored at *path* and return it."""
    with path.open(encoding="utf-8") as handle:
        return json.load(handle)
def _read_jsonl(path: Path) -> list[dict[str, Any]]:
    """Load a JSON Lines file into a list of decoded records.

    Args:
        path: Location of the ``.jsonl`` file, read as UTF-8.

    Returns:
        One decoded value per non-blank line, in file order. An empty list is
        returned when the file does not exist or holds only whitespace.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    if not path.exists():
        return []
    text = path.read_text(encoding="utf-8")
    records: list[dict[str, Any]] = []
    # Split on "\n" only. str.splitlines() also breaks on U+2028, U+2029,
    # U+0085, form feed, etc., all of which may appear unescaped inside a JSON
    # string and would cut a record in half.
    for raw_line in text.split("\n"):
        line = raw_line.strip()
        # Blank separator lines (and a trailing newline) are not records.
        if line:
            records.append(json.loads(line))
    return records
def _now() -> str:
    """Return the current UTC time as ``YYYY-MM-DDTHH:MM:SSZ`` (whole seconds)."""
    current = datetime.now(timezone.utc)
    # Formatting only down to seconds drops the microseconds; the literal "Z"
    # stands in for the "+00:00" UTC offset.
    return current.strftime("%Y-%m-%dT%H:%M:%SZ")
def _review_status_map(status: str) -> str:
    """Translate a review-session status into a store lifecycle status.

    Any status not listed in the table falls back to ``"triaged"``.
    """
    lifecycle_by_review_status = {
        "trusted": "promoted",
        "provisional": "reviewed",
        "rejected": "rejected",
        "needs_review": "triaged",
    }
    return lifecycle_by_review_status.get(status, "triaged")
def _provenance_from_payload(payload: dict[str, Any]) -> ProvenanceRecord:
    """Build a ``ProvenanceRecord`` from the provenance keys of *payload*.

    Each field is read from the key of the same name; keys that are absent
    take the default listed in the table below.
    """
    field_defaults = {
        "origin_artifact_id": "",
        "origin_path": "",
        "origin_section": "",
        "source_url": "",
        "retrieval_date": "",
        "machine_id": "",
        "session_id": "",
        "support_kind": "unknown",
        "grounding_status": "ungrounded",
    }
    values = {name: payload.get(name, default) for name, default in field_defaults.items()}
    return ProvenanceRecord(**values)
def _review_candidate_rationale(item: dict[str, Any]) -> str:
    """Summarize a review-queue item as a single ``" | "``-joined line.

    The line holds, in order: the stripped title (when non-empty), the triage
    lane, the integer priority, and the comma-joined finding and graph codes
    (each only when at least one non-blank code is present).
    """
    segments: list[str] = []
    heading = str(item.get("title", "")).strip()
    if heading:
        segments.append(heading)
    lane = item.get("triage_lane", "knowledge_capture")
    segments.append(f"lane={lane}")
    priority = int(item.get("priority", 50))
    segments.append(f"priority={priority}")
    for label, key in (("findings", "finding_codes"), ("graph", "graph_codes")):
        # Codes that are blank once stringified carry no information.
        codes = [str(code) for code in item.get(key, []) if str(code).strip()]
        if codes:
            segments.append(f"{label}={','.join(codes)}")
    return " | ".join(segments)
def _load_lint_payload(import_dir: Path) -> dict[str, Any]:
    """Read ``lint_findings.json`` from *import_dir*.

    When the file exists, its decoded content is returned with ``"summary"``
    and ``"findings"`` filled in if they were absent. When it does not exist,
    a zero-count placeholder flagged with ``"missing_lint_file": True`` is
    returned instead.
    """
    lint_path = import_dir / "lint_findings.json"
    if lint_path.exists():
        loaded = _read_json(lint_path)
        for key, fallback in (("summary", {}), ("findings", [])):
            loaded.setdefault(key, fallback)
        return loaded
    return {
        "summary": {"error_count": 0, "warning_count": 0},
        "findings": [],
        "missing_lint_file": True,
    }
def _lint_error_payload(lint_payload: dict[str, Any]) -> dict[str, Any]:
    """Reduce a lint payload to the counts and error findings used by the gate.

    ``error_count`` comes from the summary when it provides one and otherwise
    from the number of findings whose severity is ``"error"``;
    ``warning_count`` defaults to 0.
    """
    error_findings = []
    for finding in lint_payload.get("findings", []):
        if finding.get("severity") == "error":
            error_findings.append(finding)
    summary = lint_payload.get("summary", {})
    error_count = int(summary.get("error_count", len(error_findings)))
    warning_count = int(summary.get("warning_count", 0))
    return {
        "error_count": error_count,
        "warning_count": warning_count,
        "errors": error_findings,
    }
def _enforce_promotion_gate(import_dir: Path, allow_lint_errors: bool) -> dict[str, Any]:
    """Check the lint findings of *import_dir* and refuse promotion on errors.

    Returns:
        The lint error summary (counts plus the error findings).

    Raises:
        PromotionGateError: If the summary reports at least one error and
            *allow_lint_errors* is false; the summary is attached as payload.
    """
    gate_payload = _lint_error_payload(_load_lint_payload(import_dir))
    blocked = gate_payload["error_count"] > 0 and not allow_lint_errors
    if not blocked:
        return gate_payload
    raise PromotionGateError(
        "Import has lint errors; review or repair the import before promotion, "
        "or pass allow_lint_errors=True / --allow-lint-errors to override.",
        gate_payload,
    )
def promote_import_to_store(
    import_dir: str | Path,
    store_dir: str | Path,
    reviewer: str | None = None,
    snapshot_id: str | None = None,
    allow_lint_errors: bool = False,
) -> dict[str, Any]:
    """Promote the records of a reviewed import into a ``GroundRecallStore``.

    The import directory must contain ``manifest.json``, ``review_session.json``
    and ``review_queue.json``. The ``*.jsonl`` record files (artifacts,
    observations, claims, concepts, relations) are treated as empty when
    absent.

    Status assignment:

    * Artifacts and observations are saved as ``"reviewed"``.
    * A concept takes the status of its review-session entry (matched on the
      concept id with the ``"concept::"`` marker removed), or its own
      ``current_status`` when no entry exists, mapped via
      ``_review_status_map``.
    * A claim is ``"rejected"`` when every linked concept is rejected,
      otherwise ``"promoted"`` / ``"reviewed"`` when any linked concept is,
      otherwise ``"triaged"``.
    * A relation is ``"promoted"`` only when both endpoints are concepts that
      were saved as promoted or reviewed.
    * A review-queue item becomes a ``"reviewed"`` review candidate when its
      candidate id is among the promoted objects, else ``"triaged"``.

    Finally a single ``PromotionRecord`` and a snapshot are saved.

    Args:
        import_dir: Directory holding the import files.
        store_dir: Directory handed to ``GroundRecallStore``.
        reviewer: Reviewer name; falls back to the review session's reviewer.
        snapshot_id: Snapshot identifier; defaults to ``snapshot-<import_id>``.
        allow_lint_errors: Promote even if the lint gate reports errors.

    Returns:
        A summary dict with the import id, store directory, promotion id,
        promoted concept/claim/relation counts, lint counts, whether lint
        errors were allowed, and the snapshot id.

    Raises:
        PromotionGateError: If lint errors are present and not allowed.
        KeyError: If the manifest has no ``import_id`` or a record lacks one
            of its required keys.
    """
    base = Path(import_dir)
    # Gate first so a blocked import never touches the store.
    gate_payload = _enforce_promotion_gate(base, allow_lint_errors=allow_lint_errors)

    manifest = _read_json(base / "manifest.json")
    # Read the id before any store write so a malformed manifest fails fast
    # instead of leaving a half-populated store behind.
    import_id = manifest["import_id"]
    review_session = ReviewSession.model_validate_json(
        (base / "review_session.json").read_text(encoding="utf-8")
    )
    queue_payload = _read_json(base / "review_queue.json")
    artifacts = _read_jsonl(base / "artifacts.jsonl")
    observations = _read_jsonl(base / "observations.jsonl")
    claims = _read_jsonl(base / "claims.jsonl")
    concepts = _read_jsonl(base / "concepts.jsonl")
    relations = _read_jsonl(base / "relations.jsonl")

    store = GroundRecallStore(store_dir)
    reviewed_by_concept = {entry.concept_id: entry for entry in review_session.draft_pack.concepts}
    promoted_claim_ids: list[str] = []
    promoted_concept_ids: list[str] = []
    promoted_relation_ids: list[str] = []

    for artifact in artifacts:
        store.save_artifact(
            ArtifactRecord(
                artifact_id=artifact["artifact_id"],
                artifact_kind=artifact["artifact_kind"],
                title=artifact.get("title", ""),
                path=artifact.get("path", ""),
                sha256=artifact.get("sha256", ""),
                created_at=artifact.get("created_at", ""),
                metadata=dict(artifact.get("metadata", {})),
                current_status="reviewed",
            )
        )

    for observation in observations:
        store.save_observation(
            ObservationRecord(
                observation_id=observation["observation_id"],
                artifact_id=observation.get("artifact_id", ""),
                role=observation.get("role", "summary"),
                text=observation.get("text", ""),
                provenance=_provenance_from_payload(observation),
                confidence_hint=float(observation.get("confidence_hint", 0.0)),
                current_status="reviewed",
            )
        )

    for concept in concepts:
        # Review entries are keyed by the id without the "concept::" marker.
        short_id = concept["concept_id"].replace("concept::", "", 1)
        review_entry = reviewed_by_concept.get(short_id)
        current_status = _review_status_map(
            review_entry.status if review_entry else concept.get("current_status", "triaged")
        )
        record = store.save_concept(
            ConceptRecord(
                concept_id=concept["concept_id"],
                title=review_entry.title if review_entry else concept.get("title", concept["concept_id"]),
                aliases=list(concept.get("aliases", [])),
                description=review_entry.description if review_entry else concept.get("description", ""),
                source_artifact_ids=list(concept.get("source_artifact_ids", [])),
                current_status=current_status,  # type: ignore[arg-type]
            )
        )
        if record.current_status in {"promoted", "reviewed"}:
            promoted_concept_ids.append(record.concept_id)
    reviewed_concept_ids = set(promoted_concept_ids)

    for claim in claims:
        concept_ids = list(claim.get("concept_ids", []))
        statuses = []
        for concept_id in concept_ids:
            short_id = concept_id.replace("concept::", "", 1)
            review_entry = reviewed_by_concept.get(short_id)
            statuses.append(_review_status_map(review_entry.status) if review_entry else "triaged")
        # A claim follows the strongest verdict among its concepts; it is
        # rejected only when every linked concept was rejected.
        if statuses and all(status == "rejected" for status in statuses):
            current_status = "rejected"
        elif statuses and any(status == "promoted" for status in statuses):
            current_status = "promoted"
        elif statuses and any(status == "reviewed" for status in statuses):
            current_status = "reviewed"
        else:
            current_status = "triaged"
        record = store.save_claim(
            ClaimRecord(
                claim_id=claim["claim_id"],
                claim_text=claim.get("claim_text", ""),
                claim_kind=claim.get("claim_kind", "statement"),
                metadata=dict(claim.get("metadata", {})),
                source_observation_ids=list(claim.get("source_observation_ids", [])),
                supporting_fragment_ids=list(claim.get("supporting_fragment_ids", [])),
                concept_ids=concept_ids,
                contradicts_claim_ids=list(claim.get("contradicts_claim_ids", [])),
                supersedes_claim_ids=list(claim.get("supersedes_claim_ids", [])),
                confidence_hint=float(claim.get("confidence_hint", 0.0)),
                review_confidence=float(claim.get("review_confidence", 0.0)),
                last_confirmed_at=claim.get("last_confirmed_at", ""),
                provenance=_provenance_from_payload(claim),
                current_status=current_status,  # type: ignore[arg-type]
            )
        )
        if record.current_status in {"promoted", "reviewed"}:
            promoted_claim_ids.append(record.claim_id)

    for relation in relations:
        # Both endpoints must be accepted concepts for the edge to be promoted.
        src_ok = relation.get("source_id") in reviewed_concept_ids
        tgt_ok = relation.get("target_id") in reviewed_concept_ids
        current_status = "promoted" if src_ok and tgt_ok else "triaged"
        record = store.save_relation(
            RelationRecord(
                relation_id=relation["relation_id"],
                source_id=relation.get("source_id", ""),
                target_id=relation.get("target_id", ""),
                relation_type=relation.get("relation_type", "references"),
                evidence_ids=list(relation.get("evidence_ids", [])),
                provenance=_provenance_from_payload(relation),
                current_status=current_status,  # type: ignore[arg-type]
            )
        )
        if record.current_status in {"promoted", "reviewed"}:
            promoted_relation_ids.append(record.relation_id)

    promoted_object_ids = promoted_concept_ids + promoted_claim_ids + promoted_relation_ids
    # Built once: the membership test below runs for every queue item.
    promoted_lookup = set(promoted_object_ids)

    for item in queue_payload.get("items", []):
        graph_codes = [str(code) for code in item.get("graph_codes", []) if str(code).strip()]
        finding_codes = [str(code) for code in item.get("finding_codes", []) if str(code).strip()]
        store.save_review_candidate(
            ReviewCandidateRecord(
                review_candidate_id=item["queue_id"],
                candidate_type=item["candidate_type"],
                candidate_id=item["candidate_id"],
                triage_lane=item.get("triage_lane", "knowledge_capture"),
                priority=int(item.get("priority", 50)),
                finding_codes=sorted(set(finding_codes + graph_codes)),
                rationale=_review_candidate_rationale(item),
                current_status="reviewed" if item["candidate_id"] in promoted_lookup else "triaged",
            )
        )

    effective_reviewer = reviewer or review_session.reviewer
    promotion = store.save_promotion(
        PromotionRecord(
            promotion_id=f"promotion-{import_id}",
            candidate_type="concept",
            candidate_id=import_id,
            promotion_target="groundrecall_store",
            verdict="approved",
            reviewer=effective_reviewer,
            promoted_object_ids=promoted_object_ids,
            notes=f"Promoted import {import_id} into GroundRecallStore.",
            promoted_at=_now(),
        )
    )
    built_snapshot = store.build_snapshot(
        snapshot_id=snapshot_id or f"snapshot-{import_id}",
        created_at=_now(),
        metadata={
            "source_import_id": import_id,
            "reviewer": effective_reviewer,
            "export_kind": "canonical",
        },
    )
    store.save_snapshot(built_snapshot)

    return {
        "import_id": import_id,
        "store_dir": str(Path(store_dir)),
        "promotion_id": promotion.promotion_id,
        "promoted_concept_count": len(promoted_concept_ids),
        "promoted_claim_count": len(promoted_claim_ids),
        "promoted_relation_count": len(promoted_relation_ids),
        "lint_error_count": gate_payload["error_count"],
        "lint_warning_count": gate_payload["warning_count"],
        "lint_errors_allowed": allow_lint_errors,
        "snapshot_id": built_snapshot.snapshot_id,
    }
def build_parser() -> argparse.ArgumentParser:
    """Create the command-line parser for the promotion entry point.

    Positionals are ``import_dir`` and ``store_dir``; the options are
    ``--reviewer``, ``--snapshot-id`` and the ``--allow-lint-errors`` flag.
    """
    cli = argparse.ArgumentParser(
        description="Promote a GroundRecall import into canonical store objects.",
    )
    for positional in ("import_dir", "store_dir"):
        cli.add_argument(positional)
    for option in ("--reviewer", "--snapshot-id"):
        cli.add_argument(option, default=None)
    cli.add_argument(
        "--allow-lint-errors",
        action="store_true",
        help="Promote even when lint_findings.json contains errors. Warnings do not block promotion.",
    )
    return cli
def main() -> None:
    """Run a promotion from the command line and print the result as JSON.

    On success the summary is printed to stdout. When the promotion gate
    refuses the import, a JSON error report is printed to stderr and the
    process exits with status 2.
    """
    cli_args = build_parser().parse_args()
    try:
        summary = promote_import_to_store(
            import_dir=cli_args.import_dir,
            store_dir=cli_args.store_dir,
            reviewer=cli_args.reviewer,
            snapshot_id=cli_args.snapshot_id,
            allow_lint_errors=cli_args.allow_lint_errors,
        )
    except PromotionGateError as exc:
        failure = {"ok": False, "error": str(exc), "gate": exc.payload}
        print(json.dumps(failure, indent=2), file=sys.stderr)
        raise SystemExit(2) from exc
    else:
        print(json.dumps(summary, indent=2))