diff --git a/src/groundrecall/query.py b/src/groundrecall/query.py index 0949a9b..f4e7ec7 100644 --- a/src/groundrecall/query.py +++ b/src/groundrecall/query.py @@ -3,6 +3,7 @@ from __future__ import annotations import argparse import json from pathlib import Path +import re from typing import Any from .store import GroundRecallStore @@ -17,6 +18,59 @@ def _matches(query: str, *values: str) -> bool: return any(needle in _normalize(value) for value in values if value) +_SOURCE_ROLE_ORDER = ["overview", "mechanism", "nuance", "controversy", "argumentation"] + + +def _infer_source_role(artifact) -> str: + metadata = artifact.metadata if isinstance(getattr(artifact, "metadata", None), dict) else {} + explicit = str(metadata.get("source_role", "") or metadata.get("source_role_hint", "")).strip().lower() + if explicit in _SOURCE_ROLE_ORDER: + return explicit + + title = str(getattr(artifact, "title", "") or "").lower() + path = str(getattr(artifact, "path", "") or "").lower() + corpus = str(metadata.get("corpus", "") or "").lower() + document_kind = str(metadata.get("document_kind", "") or "").lower() + joined = " ".join(part for part in (title, path, corpus, document_kind) if part) + + if any(token in joined for token in ("pandasthumb", "indexcc", "talkorigins", "evidence", "rebuttal", "argument", "critique")): + return "argumentation" + if any(token in joined for token in ("controvers", "debate", "dispute", "polemic")): + return "controversy" + if any(token in joined for token in ("introduction", "overview", "chapter", "textbook", "handbook", "evolutionary biology", "ecology")): + return "overview" + if any(token in joined for token in ("mechanism", "model", "testing", "test", "how", "rate", "process")): + return "mechanism" + if any(token in joined for token in ("nuance", "qualification", "constraint", "plasticity", "epigenetic", "drift")): + return "nuance" + return "overview" + + +def _claim_distinction_payload(claim: dict[str, Any]) -> dict[str, Any] | None: + text = str(claim.get("claim_text", "")).strip() + lowered = text.lower() + if not text: + return None + + patterns = [ + ("non_implication", r"\bdoes not imply\b", "does not imply"), + ("decoupling", r"\b(can|may)\s+occur\s+without\b", "can or may occur without"), + ("contrast", r"\bversus\b|\bvs\.\b|\bvs\b", "versus"), + ("contrast", r"\brather than\b", "rather than"), + ("contrast", r"\bdifferent from\b|\bdistinguish(?:ed)? from\b", "different from"), + ("contrast", r"\bnot\b.+\bbut\b", "not ... but"), + ] + for distinction_type, pattern, cue in patterns: + if re.search(pattern, lowered): + return { + "claim_id": claim.get("claim_id", ""), + "distinction_type": distinction_type, + "cue": cue, + "text": text, + } + return None + + def query_concept(store_dir: str | Path, concept_ref: str) -> dict[str, Any] | None: store = GroundRecallStore(store_dir) concepts = store.list_concepts() @@ -48,13 +102,16 @@ def query_concept(store_dir: str | Path, concept_ref: str) -> dict[str, Any] | N for observation_id in claim.source_observation_ids: observation = observations.get(observation_id) if observation is not None: + artifact = artifacts.get(observation.artifact_id) supporting_observations.append( { "observation_id": observation.observation_id, + "artifact_id": observation.artifact_id, "text": observation.text, "role": observation.role, "origin_path": observation.provenance.origin_path, "grounding_status": observation.provenance.grounding_status, + "source_role": _infer_source_role(artifact) if artifact is not None else "", } ) @@ -67,11 +124,30 @@ def query_concept(store_dir: str | Path, concept_ref: str) -> dict[str, Any] | N ) related_concepts = [item.model_dump() for item in concepts if item.concept_id in related_concept_ids] - source_artifacts = [ - artifact.model_dump() - for artifact in artifacts.values() - if artifact.artifact_id in set(concept.source_artifact_ids) - ] + source_artifacts = [] + for artifact in artifacts.values(): + if artifact.artifact_id not in set(concept.source_artifact_ids): + continue + payload = artifact.model_dump() + payload["source_role"] = _infer_source_role(artifact) + source_artifacts.append(payload) + + claim_payloads: list[dict[str, Any]] = [] + for claim in claims: + payload = claim.model_dump() + source_roles = sorted( + { + _infer_source_role(artifacts[observations[item].artifact_id]) + for item in claim.source_observation_ids + if item in observations and observations[item].artifact_id in artifacts + } + ) + if source_roles: + payload["source_roles"] = source_roles + distinction = _claim_distinction_payload(payload) + if distinction is not None: + payload["distinction"] = distinction + claim_payloads.append(payload) related_review_candidates = [ item.model_dump() for item in review_candidates @@ -82,7 +158,7 @@ def query_concept(store_dir: str | Path, concept_ref: str) -> dict[str, Any] | N return { "query_type": "concept", "concept": concept.model_dump(), - "claims": [item.model_dump() for item in claims], + "claims": claim_payloads, "relations": [item.model_dump() for item in relations], "related_concepts": related_concepts, "supporting_observations": supporting_observations, @@ -218,6 +294,12 @@ def build_query_bundle_for_concept(store_dir: str | Path, concept_ref: str) -> d relations = payload["relations"] contradictions = [item for item in claims if item.get("contradicts_claim_ids")] supersessions = [item for item in claims if item.get("supersedes_claim_ids")] + source_role_summary: dict[str, int] = {} + for artifact in payload["source_artifacts"]: + role = str(artifact.get("source_role", "")).strip() + if role: + source_role_summary[role] = source_role_summary.get(role, 0) + 1 + key_distinctions = [item["distinction"] for item in claims if isinstance(item.get("distinction"), dict)] return { "bundle_kind": "groundrecall_query_bundle", "query_type": "concept", @@ -226,6 +308,8 @@ def build_query_bundle_for_concept(store_dir: str | Path, concept_ref: str) -> d "relations": relations, "supporting_observations": payload["supporting_observations"], "source_artifacts": payload["source_artifacts"], + "source_role_summary": dict(sorted(source_role_summary.items())), + "key_distinctions": key_distinctions[:8], "related_concepts": payload["related_concepts"], "review_candidates": payload["review_candidates"], "contradictions": contradictions, diff --git a/tests/test_groundrecall_query.py b/tests/test_groundrecall_query.py index e235227..94aaeea 100644 --- a/tests/test_groundrecall_query.py +++ b/tests/test_groundrecall_query.py @@ -27,6 +27,7 @@ def _seed_store(store: GroundRecallStore) -> None: artifact_kind="compiled_page", title="Channel Capacity", path="wiki/channel-capacity.md", + metadata={"source_role": "mechanism"}, current_status="reviewed", ) ) @@ -82,6 +83,21 @@ def _seed_store(store: GroundRecallStore) -> None: store.save_claim( ClaimRecord( claim_id="clm_002", + claim_text="Channel capacity does not imply error-free transmission without coding.", + concept_ids=["concept::channel-capacity"], + source_observation_ids=["obs_001"], + provenance=ProvenanceRecord( + origin_artifact_id="ia_001", + origin_path="wiki/channel-capacity.md", + support_kind="derived_from_page", + grounding_status="partially_grounded", + ), + current_status="reviewed", + ) + ) + store.save_claim( + ClaimRecord( + claim_id="clm_003", claim_text="Shannon entropy can inform channel coding intuition.", concept_ids=["concept::shannon-entropy"], contradicts_claim_ids=["clm_999"], @@ -136,10 +152,11 @@ def test_query_concept_returns_neighborhood_and_support(tmp_path: Path) -> None: payload = query_concept(store.base_dir, "channel-capacity") assert payload is not None assert payload["concept"]["concept_id"] == "concept::channel-capacity" - assert len(payload["claims"]) == 1 + assert len(payload["claims"]) == 2 assert len(payload["relations"]) == 1 assert any(item["concept_id"] == "concept::shannon-entropy" for item in payload["related_concepts"]) assert payload["supporting_observations"][0]["origin_path"] == "wiki/channel-capacity.md" + assert payload["supporting_observations"][0]["source_role"] == "mechanism" assert len(payload["review_candidates"]) == 2 assert any(item["candidate_id"] == "concept::channel-capacity" for item in payload["review_candidates"]) assert any("graph=bridge_concept" in item["rationale"] for item in payload["review_candidates"]) @@ -151,7 +168,7 @@ def test_search_claims_matches_text_and_concept_titles(tmp_path: Path) -> None: payload = search_claims(store.base_dir, "entropy") assert payload["query_type"] == "claim_search" - assert any(match["claim"]["claim_id"] == "clm_002" for match in payload["matches"]) + assert any(match["claim"]["claim_id"] == "clm_003" for match in payload["matches"]) def test_query_provenance_filters_by_origin_path(tmp_path: Path) -> None: @@ -159,7 +176,7 @@ def test_query_provenance_filters_by_origin_path(tmp_path: Path) -> None: _seed_store(store) payload = query_provenance(store.base_dir, origin_path="wiki/channel-capacity.md") - assert len(payload["claims"]) == 2 + assert len(payload["claims"]) == 3 assert len(payload["observations"]) == 1 @@ -173,6 +190,10 @@ def test_build_query_bundle_for_concept_is_assistant_neutral(tmp_path: Path) -> assert payload["concept"]["concept_id"] == "concept::channel-capacity" assert len(payload["relations"]) == 1 assert payload["source_artifacts"][0]["artifact_id"] == "ia_001" + assert payload["source_artifacts"][0]["source_role"] == "mechanism" + assert payload["source_role_summary"]["mechanism"] == 1 + assert payload["key_distinctions"][0]["distinction_type"] == "non_implication" + assert payload["relevant_claims"][0]["source_roles"] == ["mechanism"] assert len(payload["review_candidates"]) == 2 assert isinstance(payload["suggested_next_actions"], list) forbidden = {"assistant", "codex", "claude", "prompt_text"} @@ -184,7 +205,7 @@ def test_query_bundle_surfaces_contradictions_and_supersessions(tmp_path: Path) _seed_store(store) store.save_claim( ClaimRecord( - claim_id="clm_003", + claim_id="clm_004", claim_text="Channel capacity is undefined in practice.", concept_ids=["concept::channel-capacity"], contradicts_claim_ids=["clm_001"], @@ -199,7 +220,7 @@ def test_query_bundle_surfaces_contradictions_and_supersessions(tmp_path: Path) ) store.save_claim( ClaimRecord( - claim_id="clm_004", + claim_id="clm_005", claim_text="Channel capacity should be interpreted relative to a specific channel model.", concept_ids=["concept::channel-capacity"], supersedes_claim_ids=["clm_001"], @@ -217,5 +238,5 @@ def test_query_bundle_surfaces_contradictions_and_supersessions(tmp_path: Path) assert payload is not None contradiction_ids = {item["claim_id"] for item in payload["contradictions"]} supersession_ids = {item["claim_id"] for item in payload["supersessions"]} - assert "clm_003" in contradiction_ids - assert "clm_004" in supersession_ids + assert "clm_004" in contradiction_ids + assert "clm_005" in supersession_ids