Add source roles and distinctions to query bundles

This commit is contained in:
welsberr 2026-05-08 14:03:57 -04:00
parent 2e82dfd5a7
commit ddbd5dbf2a
2 changed files with 118 additions and 13 deletions

View File

@ -3,6 +3,7 @@ from __future__ import annotations
import argparse import argparse
import json import json
from pathlib import Path from pathlib import Path
import re
from typing import Any from typing import Any
from .store import GroundRecallStore from .store import GroundRecallStore
@ -17,6 +18,59 @@ def _matches(query: str, *values: str) -> bool:
return any(needle in _normalize(value) for value in values if value) return any(needle in _normalize(value) for value in values if value)
_SOURCE_ROLE_ORDER = ["overview", "mechanism", "nuance", "controversy", "argumentation"]
def _infer_source_role(artifact) -> str:
metadata = artifact.metadata if isinstance(getattr(artifact, "metadata", None), dict) else {}
explicit = str(metadata.get("source_role", "") or metadata.get("source_role_hint", "")).strip().lower()
if explicit in _SOURCE_ROLE_ORDER:
return explicit
title = str(getattr(artifact, "title", "") or "").lower()
path = str(getattr(artifact, "path", "") or "").lower()
corpus = str(metadata.get("corpus", "") or "").lower()
document_kind = str(metadata.get("document_kind", "") or "").lower()
joined = " ".join(part for part in (title, path, corpus, document_kind) if part)
if any(token in joined for token in ("pandasthumb", "indexcc", "talkorigins", "evidence", "rebuttal", "argument", "critique")):
return "argumentation"
if any(token in joined for token in ("controvers", "debate", "dispute", "polemic")):
return "controversy"
if any(token in joined for token in ("introduction", "overview", "chapter", "textbook", "handbook", "evolutionary biology", "ecology")):
return "overview"
if any(token in joined for token in ("mechanism", "model", "testing", "test", "how", "rate", "process")):
return "mechanism"
if any(token in joined for token in ("nuance", "qualification", "constraint", "plasticity", "epigenetic", "drift")):
return "nuance"
return "overview"
def _claim_distinction_payload(claim: dict[str, Any]) -> dict[str, Any] | None:
text = str(claim.get("claim_text", "")).strip()
lowered = text.lower()
if not text:
return None
patterns = [
("non_implication", r"\bdoes not imply\b", "does not imply"),
("decoupling", r"\b(can|may)\s+occur\s+without\b", "can or may occur without"),
("contrast", r"\bversus\b|\bvs\.\b|\bvs\b", "versus"),
("contrast", r"\brather than\b", "rather than"),
("contrast", r"\bdifferent from\b|\bdistinguish(?:ed)? from\b", "different from"),
("contrast", r"\bnot\b.+\bbut\b", "not ... but"),
]
for distinction_type, pattern, cue in patterns:
if re.search(pattern, lowered):
return {
"claim_id": claim.get("claim_id", ""),
"distinction_type": distinction_type,
"cue": cue,
"text": text,
}
return None
def query_concept(store_dir: str | Path, concept_ref: str) -> dict[str, Any] | None: def query_concept(store_dir: str | Path, concept_ref: str) -> dict[str, Any] | None:
store = GroundRecallStore(store_dir) store = GroundRecallStore(store_dir)
concepts = store.list_concepts() concepts = store.list_concepts()
@ -48,13 +102,16 @@ def query_concept(store_dir: str | Path, concept_ref: str) -> dict[str, Any] | N
for observation_id in claim.source_observation_ids: for observation_id in claim.source_observation_ids:
observation = observations.get(observation_id) observation = observations.get(observation_id)
if observation is not None: if observation is not None:
artifact = artifacts.get(observation.artifact_id)
supporting_observations.append( supporting_observations.append(
{ {
"observation_id": observation.observation_id, "observation_id": observation.observation_id,
"artifact_id": observation.artifact_id,
"text": observation.text, "text": observation.text,
"role": observation.role, "role": observation.role,
"origin_path": observation.provenance.origin_path, "origin_path": observation.provenance.origin_path,
"grounding_status": observation.provenance.grounding_status, "grounding_status": observation.provenance.grounding_status,
"source_role": _infer_source_role(artifact) if artifact is not None else "",
} }
) )
@ -67,11 +124,30 @@ def query_concept(store_dir: str | Path, concept_ref: str) -> dict[str, Any] | N
) )
related_concepts = [item.model_dump() for item in concepts if item.concept_id in related_concept_ids] related_concepts = [item.model_dump() for item in concepts if item.concept_id in related_concept_ids]
source_artifacts = [ source_artifacts = []
artifact.model_dump() for artifact in artifacts.values():
for artifact in artifacts.values() if artifact.artifact_id not in set(concept.source_artifact_ids):
if artifact.artifact_id in set(concept.source_artifact_ids) continue
] payload = artifact.model_dump()
payload["source_role"] = _infer_source_role(artifact)
source_artifacts.append(payload)
claim_payloads: list[dict[str, Any]] = []
for claim in claims:
payload = claim.model_dump()
source_roles = sorted(
{
_infer_source_role(artifacts[observations[item].artifact_id])
for item in claim.source_observation_ids
if item in observations and observations[item].artifact_id in artifacts
}
)
if source_roles:
payload["source_roles"] = source_roles
distinction = _claim_distinction_payload(payload)
if distinction is not None:
payload["distinction"] = distinction
claim_payloads.append(payload)
related_review_candidates = [ related_review_candidates = [
item.model_dump() item.model_dump()
for item in review_candidates for item in review_candidates
@ -82,7 +158,7 @@ def query_concept(store_dir: str | Path, concept_ref: str) -> dict[str, Any] | N
return { return {
"query_type": "concept", "query_type": "concept",
"concept": concept.model_dump(), "concept": concept.model_dump(),
"claims": [item.model_dump() for item in claims], "claims": claim_payloads,
"relations": [item.model_dump() for item in relations], "relations": [item.model_dump() for item in relations],
"related_concepts": related_concepts, "related_concepts": related_concepts,
"supporting_observations": supporting_observations, "supporting_observations": supporting_observations,
@ -218,6 +294,12 @@ def build_query_bundle_for_concept(store_dir: str | Path, concept_ref: str) -> d
relations = payload["relations"] relations = payload["relations"]
contradictions = [item for item in claims if item.get("contradicts_claim_ids")] contradictions = [item for item in claims if item.get("contradicts_claim_ids")]
supersessions = [item for item in claims if item.get("supersedes_claim_ids")] supersessions = [item for item in claims if item.get("supersedes_claim_ids")]
source_role_summary: dict[str, int] = {}
for artifact in payload["source_artifacts"]:
role = str(artifact.get("source_role", "")).strip()
if role:
source_role_summary[role] = source_role_summary.get(role, 0) + 1
key_distinctions = [item["distinction"] for item in claims if isinstance(item.get("distinction"), dict)]
return { return {
"bundle_kind": "groundrecall_query_bundle", "bundle_kind": "groundrecall_query_bundle",
"query_type": "concept", "query_type": "concept",
@ -226,6 +308,8 @@ def build_query_bundle_for_concept(store_dir: str | Path, concept_ref: str) -> d
"relations": relations, "relations": relations,
"supporting_observations": payload["supporting_observations"], "supporting_observations": payload["supporting_observations"],
"source_artifacts": payload["source_artifacts"], "source_artifacts": payload["source_artifacts"],
"source_role_summary": dict(sorted(source_role_summary.items())),
"key_distinctions": key_distinctions[:8],
"related_concepts": payload["related_concepts"], "related_concepts": payload["related_concepts"],
"review_candidates": payload["review_candidates"], "review_candidates": payload["review_candidates"],
"contradictions": contradictions, "contradictions": contradictions,

View File

@ -27,6 +27,7 @@ def _seed_store(store: GroundRecallStore) -> None:
artifact_kind="compiled_page", artifact_kind="compiled_page",
title="Channel Capacity", title="Channel Capacity",
path="wiki/channel-capacity.md", path="wiki/channel-capacity.md",
metadata={"source_role": "mechanism"},
current_status="reviewed", current_status="reviewed",
) )
) )
@ -82,6 +83,21 @@ def _seed_store(store: GroundRecallStore) -> None:
store.save_claim( store.save_claim(
ClaimRecord( ClaimRecord(
claim_id="clm_002", claim_id="clm_002",
claim_text="Channel capacity does not imply error-free transmission without coding.",
concept_ids=["concept::channel-capacity"],
source_observation_ids=["obs_001"],
provenance=ProvenanceRecord(
origin_artifact_id="ia_001",
origin_path="wiki/channel-capacity.md",
support_kind="derived_from_page",
grounding_status="partially_grounded",
),
current_status="reviewed",
)
)
store.save_claim(
ClaimRecord(
claim_id="clm_003",
claim_text="Shannon entropy can inform channel coding intuition.", claim_text="Shannon entropy can inform channel coding intuition.",
concept_ids=["concept::shannon-entropy"], concept_ids=["concept::shannon-entropy"],
contradicts_claim_ids=["clm_999"], contradicts_claim_ids=["clm_999"],
@ -136,10 +152,11 @@ def test_query_concept_returns_neighborhood_and_support(tmp_path: Path) -> None:
payload = query_concept(store.base_dir, "channel-capacity") payload = query_concept(store.base_dir, "channel-capacity")
assert payload is not None assert payload is not None
assert payload["concept"]["concept_id"] == "concept::channel-capacity" assert payload["concept"]["concept_id"] == "concept::channel-capacity"
assert len(payload["claims"]) == 1 assert len(payload["claims"]) == 2
assert len(payload["relations"]) == 1 assert len(payload["relations"]) == 1
assert any(item["concept_id"] == "concept::shannon-entropy" for item in payload["related_concepts"]) assert any(item["concept_id"] == "concept::shannon-entropy" for item in payload["related_concepts"])
assert payload["supporting_observations"][0]["origin_path"] == "wiki/channel-capacity.md" assert payload["supporting_observations"][0]["origin_path"] == "wiki/channel-capacity.md"
assert payload["supporting_observations"][0]["source_role"] == "mechanism"
assert len(payload["review_candidates"]) == 2 assert len(payload["review_candidates"]) == 2
assert any(item["candidate_id"] == "concept::channel-capacity" for item in payload["review_candidates"]) assert any(item["candidate_id"] == "concept::channel-capacity" for item in payload["review_candidates"])
assert any("graph=bridge_concept" in item["rationale"] for item in payload["review_candidates"]) assert any("graph=bridge_concept" in item["rationale"] for item in payload["review_candidates"])
@ -151,7 +168,7 @@ def test_search_claims_matches_text_and_concept_titles(tmp_path: Path) -> None:
payload = search_claims(store.base_dir, "entropy") payload = search_claims(store.base_dir, "entropy")
assert payload["query_type"] == "claim_search" assert payload["query_type"] == "claim_search"
assert any(match["claim"]["claim_id"] == "clm_002" for match in payload["matches"]) assert any(match["claim"]["claim_id"] == "clm_003" for match in payload["matches"])
def test_query_provenance_filters_by_origin_path(tmp_path: Path) -> None: def test_query_provenance_filters_by_origin_path(tmp_path: Path) -> None:
@ -159,7 +176,7 @@ def test_query_provenance_filters_by_origin_path(tmp_path: Path) -> None:
_seed_store(store) _seed_store(store)
payload = query_provenance(store.base_dir, origin_path="wiki/channel-capacity.md") payload = query_provenance(store.base_dir, origin_path="wiki/channel-capacity.md")
assert len(payload["claims"]) == 2 assert len(payload["claims"]) == 3
assert len(payload["observations"]) == 1 assert len(payload["observations"]) == 1
@ -173,6 +190,10 @@ def test_build_query_bundle_for_concept_is_assistant_neutral(tmp_path: Path) ->
assert payload["concept"]["concept_id"] == "concept::channel-capacity" assert payload["concept"]["concept_id"] == "concept::channel-capacity"
assert len(payload["relations"]) == 1 assert len(payload["relations"]) == 1
assert payload["source_artifacts"][0]["artifact_id"] == "ia_001" assert payload["source_artifacts"][0]["artifact_id"] == "ia_001"
assert payload["source_artifacts"][0]["source_role"] == "mechanism"
assert payload["source_role_summary"]["mechanism"] == 1
assert payload["key_distinctions"][0]["distinction_type"] == "non_implication"
assert payload["relevant_claims"][0]["source_roles"] == ["mechanism"]
assert len(payload["review_candidates"]) == 2 assert len(payload["review_candidates"]) == 2
assert isinstance(payload["suggested_next_actions"], list) assert isinstance(payload["suggested_next_actions"], list)
forbidden = {"assistant", "codex", "claude", "prompt_text"} forbidden = {"assistant", "codex", "claude", "prompt_text"}
@ -184,7 +205,7 @@ def test_query_bundle_surfaces_contradictions_and_supersessions(tmp_path: Path)
_seed_store(store) _seed_store(store)
store.save_claim( store.save_claim(
ClaimRecord( ClaimRecord(
claim_id="clm_003", claim_id="clm_004",
claim_text="Channel capacity is undefined in practice.", claim_text="Channel capacity is undefined in practice.",
concept_ids=["concept::channel-capacity"], concept_ids=["concept::channel-capacity"],
contradicts_claim_ids=["clm_001"], contradicts_claim_ids=["clm_001"],
@ -199,7 +220,7 @@ def test_query_bundle_surfaces_contradictions_and_supersessions(tmp_path: Path)
) )
store.save_claim( store.save_claim(
ClaimRecord( ClaimRecord(
claim_id="clm_004", claim_id="clm_005",
claim_text="Channel capacity should be interpreted relative to a specific channel model.", claim_text="Channel capacity should be interpreted relative to a specific channel model.",
concept_ids=["concept::channel-capacity"], concept_ids=["concept::channel-capacity"],
supersedes_claim_ids=["clm_001"], supersedes_claim_ids=["clm_001"],
@ -217,5 +238,5 @@ def test_query_bundle_surfaces_contradictions_and_supersessions(tmp_path: Path)
assert payload is not None assert payload is not None
contradiction_ids = {item["claim_id"] for item in payload["contradictions"]} contradiction_ids = {item["claim_id"] for item in payload["contradictions"]}
supersession_ids = {item["claim_id"] for item in payload["supersessions"]} supersession_ids = {item["claim_id"] for item in payload["supersessions"]}
assert "clm_003" in contradiction_ids assert "clm_004" in contradiction_ids
assert "clm_004" in supersession_ids assert "clm_005" in supersession_ids