Add source roles and distinctions to query bundles
This commit is contained in:
parent
2e82dfd5a7
commit
ddbd5dbf2a
|
|
@ -3,6 +3,7 @@ from __future__ import annotations
|
||||||
import argparse
|
import argparse
|
||||||
import json
|
import json
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
import re
|
||||||
from typing import Any
|
from typing import Any
|
||||||
|
|
||||||
from .store import GroundRecallStore
|
from .store import GroundRecallStore
|
||||||
|
|
@ -17,6 +18,59 @@ def _matches(query: str, *values: str) -> bool:
|
||||||
return any(needle in _normalize(value) for value in values if value)
|
return any(needle in _normalize(value) for value in values if value)
|
||||||
|
|
||||||
|
|
||||||
|
_SOURCE_ROLE_ORDER = ["overview", "mechanism", "nuance", "controversy", "argumentation"]

# Heuristic cue table. Roles are tried in this order and the first role with a
# matching cue wins. Each cue must start at the beginning of a word (it may not
# be preceded by a letter or digit) but may be followed by anything, so
# "controvers" still covers "controversy"/"controversial" and "test" still
# covers "tests"/"testing", while "how" no longer fires on "show" and "rate"
# no longer fires on "strategy" or "generate".
_SOURCE_ROLE_CUES = tuple(
    (
        role,
        re.compile(r"(?<![^\W_])(?:" + "|".join(re.escape(cue) for cue in cues) + ")"),
    )
    for role, cues in (
        (
            "argumentation",
            ("pandasthumb", "indexcc", "talkorigins", "evidence", "rebuttal", "argument", "critique"),
        ),
        ("controversy", ("controvers", "debate", "dispute", "polemic")),
        (
            "overview",
            ("introduction", "overview", "chapter", "textbook", "handbook", "evolutionary biology", "ecology"),
        ),
        ("mechanism", ("mechanism", "model", "testing", "test", "how", "rate", "process")),
        ("nuance", ("nuance", "qualification", "constraint", "plasticity", "epigenetic", "drift")),
    )
)


def _infer_source_role(artifact: Any) -> str:
    """Return the source role for *artifact*, one of ``_SOURCE_ROLE_ORDER``.

    Resolution order:

    1. ``metadata["source_role"]``, then ``metadata["source_role_hint"]``;
       the first one that (stripped and lower-cased) names a known role wins.
       An unrecognised ``source_role`` no longer hides a valid hint.
    2. Keyword cues searched in the lower-cased title, path,
       ``metadata["corpus"]`` and ``metadata["document_kind"]``.
    3. ``"overview"`` when nothing matches.

    The artifact is read only through ``getattr``, so any object works; a
    missing or non-dict ``metadata`` attribute is treated as empty.
    """
    raw_metadata = getattr(artifact, "metadata", None)
    metadata = raw_metadata if isinstance(raw_metadata, dict) else {}

    # Validate each explicit field on its own so a bogus or blank
    # "source_role" falls through to "source_role_hint".
    for key in ("source_role", "source_role_hint"):
        explicit = str(metadata.get(key) or "").strip().lower()
        if explicit in _SOURCE_ROLE_ORDER:
            return explicit

    fields = (
        str(getattr(artifact, "title", "") or "").lower(),
        str(getattr(artifact, "path", "") or "").lower(),
        str(metadata.get("corpus", "") or "").lower(),
        str(metadata.get("document_kind", "") or "").lower(),
    )
    joined = " ".join(part for part in fields if part)

    for role, cue_pattern in _SOURCE_ROLE_CUES:
        if cue_pattern.search(joined):
            return role
    return "overview"
|
||||||
|
|
||||||
|
|
||||||
|
# Ordered (distinction_type, compiled pattern, cue label) rules; the first rule
# whose pattern is found in the normalised claim text wins. Compiled once at
# import time instead of on every call.
_DISTINCTION_RULES = tuple(
    (distinction_type, re.compile(pattern), cue)
    for distinction_type, pattern, cue in (
        ("non_implication", r"\bdoes not imply\b", "does not imply"),
        ("decoupling", r"\b(can|may)\s+occur\s+without\b", "can or may occur without"),
        ("contrast", r"\bversus\b|\bvs\b", "versus"),
        ("contrast", r"\brather than\b", "rather than"),
        ("contrast", r"\bdifferent from\b|\bdistinguish(?:ed)? from\b", "different from"),
        ("contrast", r"\bnot\b.+\bbut\b", "not ... but"),
    )
)


def _claim_distinction_payload(claim: dict[str, Any]) -> dict[str, Any] | None:
    """Describe the first distinction cue found in a claim's text.

    Args:
        claim: Mapping read for ``"claim_text"`` and ``"claim_id"``.

    Returns:
        ``None`` when the claim text is missing, ``None``, or blank, or when
        no cue matches. Otherwise a dict with ``claim_id`` (``""`` if absent),
        ``distinction_type``, ``cue`` and ``text`` (the stripped original
        claim text).
    """
    # `or ""` keeps a None claim_text from turning into the literal "None".
    text = str(claim.get("claim_text") or "").strip()
    if not text:
        return None

    # Match against a lower-cased copy with whitespace runs collapsed to one
    # space, so cues split across line breaks or padded with extra spaces are
    # still detected. The returned "text" keeps the original spacing.
    normalized = " ".join(text.lower().split())

    for distinction_type, pattern, cue in _DISTINCTION_RULES:
        if pattern.search(normalized):
            return {
                "claim_id": claim.get("claim_id", ""),
                "distinction_type": distinction_type,
                "cue": cue,
                "text": text,
            }
    return None
|
||||||
|
|
||||||
|
|
||||||
def query_concept(store_dir: str | Path, concept_ref: str) -> dict[str, Any] | None:
|
def query_concept(store_dir: str | Path, concept_ref: str) -> dict[str, Any] | None:
|
||||||
store = GroundRecallStore(store_dir)
|
store = GroundRecallStore(store_dir)
|
||||||
concepts = store.list_concepts()
|
concepts = store.list_concepts()
|
||||||
|
|
@ -48,13 +102,16 @@ def query_concept(store_dir: str | Path, concept_ref: str) -> dict[str, Any] | N
|
||||||
for observation_id in claim.source_observation_ids:
|
for observation_id in claim.source_observation_ids:
|
||||||
observation = observations.get(observation_id)
|
observation = observations.get(observation_id)
|
||||||
if observation is not None:
|
if observation is not None:
|
||||||
|
artifact = artifacts.get(observation.artifact_id)
|
||||||
supporting_observations.append(
|
supporting_observations.append(
|
||||||
{
|
{
|
||||||
"observation_id": observation.observation_id,
|
"observation_id": observation.observation_id,
|
||||||
|
"artifact_id": observation.artifact_id,
|
||||||
"text": observation.text,
|
"text": observation.text,
|
||||||
"role": observation.role,
|
"role": observation.role,
|
||||||
"origin_path": observation.provenance.origin_path,
|
"origin_path": observation.provenance.origin_path,
|
||||||
"grounding_status": observation.provenance.grounding_status,
|
"grounding_status": observation.provenance.grounding_status,
|
||||||
|
"source_role": _infer_source_role(artifact) if artifact is not None else "",
|
||||||
}
|
}
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
@ -67,11 +124,30 @@ def query_concept(store_dir: str | Path, concept_ref: str) -> dict[str, Any] | N
|
||||||
)
|
)
|
||||||
related_concepts = [item.model_dump() for item in concepts if item.concept_id in related_concept_ids]
|
related_concepts = [item.model_dump() for item in concepts if item.concept_id in related_concept_ids]
|
||||||
|
|
||||||
source_artifacts = [
|
source_artifacts = []
|
||||||
artifact.model_dump()
|
for artifact in artifacts.values():
|
||||||
for artifact in artifacts.values()
|
if artifact.artifact_id not in set(concept.source_artifact_ids):
|
||||||
if artifact.artifact_id in set(concept.source_artifact_ids)
|
continue
|
||||||
]
|
payload = artifact.model_dump()
|
||||||
|
payload["source_role"] = _infer_source_role(artifact)
|
||||||
|
source_artifacts.append(payload)
|
||||||
|
|
||||||
|
claim_payloads: list[dict[str, Any]] = []
|
||||||
|
for claim in claims:
|
||||||
|
payload = claim.model_dump()
|
||||||
|
source_roles = sorted(
|
||||||
|
{
|
||||||
|
_infer_source_role(artifacts[observations[item].artifact_id])
|
||||||
|
for item in claim.source_observation_ids
|
||||||
|
if item in observations and observations[item].artifact_id in artifacts
|
||||||
|
}
|
||||||
|
)
|
||||||
|
if source_roles:
|
||||||
|
payload["source_roles"] = source_roles
|
||||||
|
distinction = _claim_distinction_payload(payload)
|
||||||
|
if distinction is not None:
|
||||||
|
payload["distinction"] = distinction
|
||||||
|
claim_payloads.append(payload)
|
||||||
related_review_candidates = [
|
related_review_candidates = [
|
||||||
item.model_dump()
|
item.model_dump()
|
||||||
for item in review_candidates
|
for item in review_candidates
|
||||||
|
|
@ -82,7 +158,7 @@ def query_concept(store_dir: str | Path, concept_ref: str) -> dict[str, Any] | N
|
||||||
return {
|
return {
|
||||||
"query_type": "concept",
|
"query_type": "concept",
|
||||||
"concept": concept.model_dump(),
|
"concept": concept.model_dump(),
|
||||||
"claims": [item.model_dump() for item in claims],
|
"claims": claim_payloads,
|
||||||
"relations": [item.model_dump() for item in relations],
|
"relations": [item.model_dump() for item in relations],
|
||||||
"related_concepts": related_concepts,
|
"related_concepts": related_concepts,
|
||||||
"supporting_observations": supporting_observations,
|
"supporting_observations": supporting_observations,
|
||||||
|
|
@ -218,6 +294,12 @@ def build_query_bundle_for_concept(store_dir: str | Path, concept_ref: str) -> d
|
||||||
relations = payload["relations"]
|
relations = payload["relations"]
|
||||||
contradictions = [item for item in claims if item.get("contradicts_claim_ids")]
|
contradictions = [item for item in claims if item.get("contradicts_claim_ids")]
|
||||||
supersessions = [item for item in claims if item.get("supersedes_claim_ids")]
|
supersessions = [item for item in claims if item.get("supersedes_claim_ids")]
|
||||||
|
source_role_summary: dict[str, int] = {}
|
||||||
|
for artifact in payload["source_artifacts"]:
|
||||||
|
role = str(artifact.get("source_role", "")).strip()
|
||||||
|
if role:
|
||||||
|
source_role_summary[role] = source_role_summary.get(role, 0) + 1
|
||||||
|
key_distinctions = [item["distinction"] for item in claims if isinstance(item.get("distinction"), dict)]
|
||||||
return {
|
return {
|
||||||
"bundle_kind": "groundrecall_query_bundle",
|
"bundle_kind": "groundrecall_query_bundle",
|
||||||
"query_type": "concept",
|
"query_type": "concept",
|
||||||
|
|
@ -226,6 +308,8 @@ def build_query_bundle_for_concept(store_dir: str | Path, concept_ref: str) -> d
|
||||||
"relations": relations,
|
"relations": relations,
|
||||||
"supporting_observations": payload["supporting_observations"],
|
"supporting_observations": payload["supporting_observations"],
|
||||||
"source_artifacts": payload["source_artifacts"],
|
"source_artifacts": payload["source_artifacts"],
|
||||||
|
"source_role_summary": dict(sorted(source_role_summary.items())),
|
||||||
|
"key_distinctions": key_distinctions[:8],
|
||||||
"related_concepts": payload["related_concepts"],
|
"related_concepts": payload["related_concepts"],
|
||||||
"review_candidates": payload["review_candidates"],
|
"review_candidates": payload["review_candidates"],
|
||||||
"contradictions": contradictions,
|
"contradictions": contradictions,
|
||||||
|
|
|
||||||
|
|
@ -27,6 +27,7 @@ def _seed_store(store: GroundRecallStore) -> None:
|
||||||
artifact_kind="compiled_page",
|
artifact_kind="compiled_page",
|
||||||
title="Channel Capacity",
|
title="Channel Capacity",
|
||||||
path="wiki/channel-capacity.md",
|
path="wiki/channel-capacity.md",
|
||||||
|
metadata={"source_role": "mechanism"},
|
||||||
current_status="reviewed",
|
current_status="reviewed",
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
|
@ -82,6 +83,21 @@ def _seed_store(store: GroundRecallStore) -> None:
|
||||||
store.save_claim(
|
store.save_claim(
|
||||||
ClaimRecord(
|
ClaimRecord(
|
||||||
claim_id="clm_002",
|
claim_id="clm_002",
|
||||||
|
claim_text="Channel capacity does not imply error-free transmission without coding.",
|
||||||
|
concept_ids=["concept::channel-capacity"],
|
||||||
|
source_observation_ids=["obs_001"],
|
||||||
|
provenance=ProvenanceRecord(
|
||||||
|
origin_artifact_id="ia_001",
|
||||||
|
origin_path="wiki/channel-capacity.md",
|
||||||
|
support_kind="derived_from_page",
|
||||||
|
grounding_status="partially_grounded",
|
||||||
|
),
|
||||||
|
current_status="reviewed",
|
||||||
|
)
|
||||||
|
)
|
||||||
|
store.save_claim(
|
||||||
|
ClaimRecord(
|
||||||
|
claim_id="clm_003",
|
||||||
claim_text="Shannon entropy can inform channel coding intuition.",
|
claim_text="Shannon entropy can inform channel coding intuition.",
|
||||||
concept_ids=["concept::shannon-entropy"],
|
concept_ids=["concept::shannon-entropy"],
|
||||||
contradicts_claim_ids=["clm_999"],
|
contradicts_claim_ids=["clm_999"],
|
||||||
|
|
@ -136,10 +152,11 @@ def test_query_concept_returns_neighborhood_and_support(tmp_path: Path) -> None:
|
||||||
payload = query_concept(store.base_dir, "channel-capacity")
|
payload = query_concept(store.base_dir, "channel-capacity")
|
||||||
assert payload is not None
|
assert payload is not None
|
||||||
assert payload["concept"]["concept_id"] == "concept::channel-capacity"
|
assert payload["concept"]["concept_id"] == "concept::channel-capacity"
|
||||||
assert len(payload["claims"]) == 1
|
assert len(payload["claims"]) == 2
|
||||||
assert len(payload["relations"]) == 1
|
assert len(payload["relations"]) == 1
|
||||||
assert any(item["concept_id"] == "concept::shannon-entropy" for item in payload["related_concepts"])
|
assert any(item["concept_id"] == "concept::shannon-entropy" for item in payload["related_concepts"])
|
||||||
assert payload["supporting_observations"][0]["origin_path"] == "wiki/channel-capacity.md"
|
assert payload["supporting_observations"][0]["origin_path"] == "wiki/channel-capacity.md"
|
||||||
|
assert payload["supporting_observations"][0]["source_role"] == "mechanism"
|
||||||
assert len(payload["review_candidates"]) == 2
|
assert len(payload["review_candidates"]) == 2
|
||||||
assert any(item["candidate_id"] == "concept::channel-capacity" for item in payload["review_candidates"])
|
assert any(item["candidate_id"] == "concept::channel-capacity" for item in payload["review_candidates"])
|
||||||
assert any("graph=bridge_concept" in item["rationale"] for item in payload["review_candidates"])
|
assert any("graph=bridge_concept" in item["rationale"] for item in payload["review_candidates"])
|
||||||
|
|
@ -151,7 +168,7 @@ def test_search_claims_matches_text_and_concept_titles(tmp_path: Path) -> None:
|
||||||
|
|
||||||
payload = search_claims(store.base_dir, "entropy")
|
payload = search_claims(store.base_dir, "entropy")
|
||||||
assert payload["query_type"] == "claim_search"
|
assert payload["query_type"] == "claim_search"
|
||||||
assert any(match["claim"]["claim_id"] == "clm_002" for match in payload["matches"])
|
assert any(match["claim"]["claim_id"] == "clm_003" for match in payload["matches"])
|
||||||
|
|
||||||
|
|
||||||
def test_query_provenance_filters_by_origin_path(tmp_path: Path) -> None:
|
def test_query_provenance_filters_by_origin_path(tmp_path: Path) -> None:
|
||||||
|
|
@ -159,7 +176,7 @@ def test_query_provenance_filters_by_origin_path(tmp_path: Path) -> None:
|
||||||
_seed_store(store)
|
_seed_store(store)
|
||||||
|
|
||||||
payload = query_provenance(store.base_dir, origin_path="wiki/channel-capacity.md")
|
payload = query_provenance(store.base_dir, origin_path="wiki/channel-capacity.md")
|
||||||
assert len(payload["claims"]) == 2
|
assert len(payload["claims"]) == 3
|
||||||
assert len(payload["observations"]) == 1
|
assert len(payload["observations"]) == 1
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -173,6 +190,10 @@ def test_build_query_bundle_for_concept_is_assistant_neutral(tmp_path: Path) ->
|
||||||
assert payload["concept"]["concept_id"] == "concept::channel-capacity"
|
assert payload["concept"]["concept_id"] == "concept::channel-capacity"
|
||||||
assert len(payload["relations"]) == 1
|
assert len(payload["relations"]) == 1
|
||||||
assert payload["source_artifacts"][0]["artifact_id"] == "ia_001"
|
assert payload["source_artifacts"][0]["artifact_id"] == "ia_001"
|
||||||
|
assert payload["source_artifacts"][0]["source_role"] == "mechanism"
|
||||||
|
assert payload["source_role_summary"]["mechanism"] == 1
|
||||||
|
assert payload["key_distinctions"][0]["distinction_type"] == "non_implication"
|
||||||
|
assert payload["relevant_claims"][0]["source_roles"] == ["mechanism"]
|
||||||
assert len(payload["review_candidates"]) == 2
|
assert len(payload["review_candidates"]) == 2
|
||||||
assert isinstance(payload["suggested_next_actions"], list)
|
assert isinstance(payload["suggested_next_actions"], list)
|
||||||
forbidden = {"assistant", "codex", "claude", "prompt_text"}
|
forbidden = {"assistant", "codex", "claude", "prompt_text"}
|
||||||
|
|
@ -184,7 +205,7 @@ def test_query_bundle_surfaces_contradictions_and_supersessions(tmp_path: Path)
|
||||||
_seed_store(store)
|
_seed_store(store)
|
||||||
store.save_claim(
|
store.save_claim(
|
||||||
ClaimRecord(
|
ClaimRecord(
|
||||||
claim_id="clm_003",
|
claim_id="clm_004",
|
||||||
claim_text="Channel capacity is undefined in practice.",
|
claim_text="Channel capacity is undefined in practice.",
|
||||||
concept_ids=["concept::channel-capacity"],
|
concept_ids=["concept::channel-capacity"],
|
||||||
contradicts_claim_ids=["clm_001"],
|
contradicts_claim_ids=["clm_001"],
|
||||||
|
|
@ -199,7 +220,7 @@ def test_query_bundle_surfaces_contradictions_and_supersessions(tmp_path: Path)
|
||||||
)
|
)
|
||||||
store.save_claim(
|
store.save_claim(
|
||||||
ClaimRecord(
|
ClaimRecord(
|
||||||
claim_id="clm_004",
|
claim_id="clm_005",
|
||||||
claim_text="Channel capacity should be interpreted relative to a specific channel model.",
|
claim_text="Channel capacity should be interpreted relative to a specific channel model.",
|
||||||
concept_ids=["concept::channel-capacity"],
|
concept_ids=["concept::channel-capacity"],
|
||||||
supersedes_claim_ids=["clm_001"],
|
supersedes_claim_ids=["clm_001"],
|
||||||
|
|
@ -217,5 +238,5 @@ def test_query_bundle_surfaces_contradictions_and_supersessions(tmp_path: Path)
|
||||||
assert payload is not None
|
assert payload is not None
|
||||||
contradiction_ids = {item["claim_id"] for item in payload["contradictions"]}
|
contradiction_ids = {item["claim_id"] for item in payload["contradictions"]}
|
||||||
supersession_ids = {item["claim_id"] for item in payload["supersessions"]}
|
supersession_ids = {item["claim_id"] for item in payload["supersessions"]}
|
||||||
assert "clm_003" in contradiction_ids
|
assert "clm_004" in contradiction_ids
|
||||||
assert "clm_004" in supersession_ids
|
assert "clm_005" in supersession_ids
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue