diff --git a/src/groundrecall/artifact_schemas.py b/src/groundrecall/artifact_schemas.py index aedd100..a209701 100644 --- a/src/groundrecall/artifact_schemas.py +++ b/src/groundrecall/artifact_schemas.py @@ -47,6 +47,11 @@ class ConceptEntry(BaseModel): description: str = "" prerequisites: list[str] = Field(default_factory=list) mastery_signals: list[str] = Field(default_factory=list) + source_role: str = "" + distinctions: list[str] = Field(default_factory=list) + definition_candidates: list[str] = Field(default_factory=list) + qualification_candidates: list[str] = Field(default_factory=list) + constraint_candidates: list[str] = Field(default_factory=list) mastery_profile: MasteryProfileSpec = Field(default_factory=MasteryProfileSpec) diff --git a/src/groundrecall/groundrecall_source_adapters/didactopus_pack.py b/src/groundrecall/groundrecall_source_adapters/didactopus_pack.py index 45bf86a..c1fd876 100644 --- a/src/groundrecall/groundrecall_source_adapters/didactopus_pack.py +++ b/src/groundrecall/groundrecall_source_adapters/didactopus_pack.py @@ -64,6 +64,52 @@ class DidactopusPackSourceAdapter: concept_rows: list[dict] = [] relation_rows: list[dict] = [] + def append_claim( + *, + claim_id: str, + observation_id: str, + artifact_id: str, + origin_section: str, + text: str, + claim_kind: str, + concept_id: str, + confidence_hint: float, + role: str = "summary", + ) -> None: + observation_rows.append( + { + "observation_id": observation_id, + "import_id": context.import_id, + "artifact_id": artifact_id, + "role": role, + "text": text, + "origin_path": concepts_src.relative_path, + "origin_section": origin_section, + "line_start": 0, + "line_end": 0, + "grounding_status": "grounded", + "support_kind": "direct_source", + "confidence_hint": confidence_hint, + "current_status": "draft", + } + ) + claim_rows.append( + { + "claim_id": claim_id, + "import_id": context.import_id, + "claim_text": text, + "claim_kind": claim_kind, + "source_observation_ids": [observation_id], + "supporting_fragment_ids": [], + "concept_ids": [concept_id], + "contradicts_claim_ids": [], + "supersedes_claim_ids": [], + "confidence_hint": confidence_hint, + "grounding_status": "grounded", + "current_status": "triaged", + } + ) + for source in sources: artifact_rows.append( { @@ -98,40 +144,65 @@ class DidactopusPackSourceAdapter: "current_status": "triaged", } ) - observation_id = f"obs_pack_{concept.id}_{index}" - observation_rows.append( - { - "observation_id": observation_id, - "import_id": context.import_id, - "artifact_id": concepts_artifact_id, - "role": "summary", - "text": concept.description or concept.title, - "origin_path": concepts_src.relative_path, - "origin_section": concept.title, - "line_start": 0, - "line_end": 0, - "grounding_status": "grounded", - "support_kind": "direct_source", - "confidence_hint": 0.85, - "current_status": "draft", - } - ) - claim_rows.append( - { - "claim_id": f"clm_pack_{concept.id}", - "import_id": context.import_id, - "claim_text": concept.description or f"{concept.title} is a concept in pack {pack_name}.", - "claim_kind": "summary", - "source_observation_ids": [observation_id], - "supporting_fragment_ids": [], - "concept_ids": [concept_key], - "contradicts_claim_ids": [], - "supersedes_claim_ids": [], - "confidence_hint": 0.85, - "grounding_status": "grounded", - "current_status": "triaged", - } + append_claim( + claim_id=f"clm_pack_{concept.id}", + observation_id=f"obs_pack_{concept.id}_{index}", + artifact_id=concepts_artifact_id, + origin_section=concept.title, + text=concept.description or f"{concept.title} is a concept in pack {pack_name}.", + claim_kind="summary", + concept_id=concept_key, + confidence_hint=0.85, + role="summary", ) + for item_index, definition in enumerate(concept.definition_candidates, start=1): + append_claim( + claim_id=f"clm_def_{concept.id}_{item_index}", + observation_id=f"obs_def_{concept.id}_{item_index}", + artifact_id=concepts_artifact_id, + origin_section=f"{concept.title} definition", + text=definition, + claim_kind="definition", + concept_id=concept_key, + confidence_hint=0.84, + role="definition", + ) + for item_index, distinction in enumerate(concept.distinctions, start=1): + append_claim( + claim_id=f"clm_dist_{concept.id}_{item_index}", + observation_id=f"obs_dist_{concept.id}_{item_index}", + artifact_id=concepts_artifact_id, + origin_section=f"{concept.title} distinction", + text=distinction, + claim_kind="distinction", + concept_id=concept_key, + confidence_hint=0.82, + role="distinction", + ) + for item_index, qualification in enumerate(concept.qualification_candidates, start=1): + append_claim( + claim_id=f"clm_qual_{concept.id}_{item_index}", + observation_id=f"obs_qual_{concept.id}_{item_index}", + artifact_id=concepts_artifact_id, + origin_section=f"{concept.title} qualification", + text=qualification, + claim_kind="qualification", + concept_id=concept_key, + confidence_hint=0.8, + role="qualification", + ) + for item_index, constraint in enumerate(concept.constraint_candidates, start=1): + append_claim( + claim_id=f"clm_constraint_{concept.id}_{item_index}", + observation_id=f"obs_constraint_{concept.id}_{item_index}", + artifact_id=concepts_artifact_id, + origin_section=f"{concept.title} constraint", + text=constraint, + claim_kind="constraint", + concept_id=concept_key, + confidence_hint=0.81, + role="constraint", + ) for prereq in concept.prerequisites: relation_rows.append( { @@ -145,39 +216,16 @@ class DidactopusPackSourceAdapter: } ) for signal_idx, signal in enumerate(concept.mastery_signals, start=1): - signal_obs_id = f"obs_signal_{concept.id}_{signal_idx}" - observation_rows.append( - { - "observation_id": signal_obs_id, - "import_id": context.import_id, - "artifact_id": concepts_artifact_id, - "role": "summary", - "text": signal, - "origin_path": concepts_src.relative_path, - "origin_section": f"{concept.title} mastery signal", - "line_start": 0, - "line_end": 0, - "grounding_status": "grounded", - "support_kind": "direct_source", - "confidence_hint": 0.8, - "current_status": "draft", - } - ) - claim_rows.append( - { - "claim_id": f"clm_signal_{concept.id}_{signal_idx}", - "import_id": context.import_id, - "claim_text": signal, - "claim_kind": "mastery_signal", - "source_observation_ids": [signal_obs_id], - "supporting_fragment_ids": [], - "concept_ids": [concept_key], - "contradicts_claim_ids": [], - "supersedes_claim_ids": [], - "confidence_hint": 0.8, - "grounding_status": "grounded", - "current_status": "triaged", - } + append_claim( + claim_id=f"clm_signal_{concept.id}_{signal_idx}", + observation_id=f"obs_signal_{concept.id}_{signal_idx}", + artifact_id=concepts_artifact_id, + origin_section=f"{concept.title} mastery signal", + text=signal, + claim_kind="mastery_signal", + concept_id=concept_key, + confidence_hint=0.8, + role="mastery_signal", ) if roadmap_payload is not None and roadmap_src is not None: diff --git a/src/groundrecall/query.py b/src/groundrecall/query.py index f4e7ec7..10ea3e0 100644 --- a/src/groundrecall/query.py +++ b/src/groundrecall/query.py @@ -53,10 +53,12 @@ def _claim_distinction_payload(claim: dict[str, Any]) -> dict[str, Any] | None: return None patterns = [ + ("contrast", r"\bcompare\b", "compare"), ("non_implication", r"\bdoes not imply\b", "does not imply"), ("decoupling", r"\b(can|may)\s+occur\s+without\b", "can or may occur without"), ("contrast", r"\bversus\b|\bvs\.\b|\bvs\b", "versus"), ("contrast", r"\brather than\b", "rather than"), + ("contrast", r"\bdiffer(?:s|ed|ent)? from\b|\bdiffers?\b", "differs from"), ("contrast", r"\bdifferent from\b|\bdistinguish(?:ed)? from\b", "different from"), ("contrast", r"\bnot\b.+\bbut\b", "not ... but"), ] @@ -71,6 +73,20 @@ def _claim_distinction_payload(claim: dict[str, Any]) -> dict[str, Any] | None: return None +def _role_from_observation_or_claim(artifact_role: str, observation: Any | None, claim: Any | dict[str, Any] | None) -> str: + observation_role = str(getattr(observation, "role", "") or "").lower() if observation is not None else "" + claim_kind = str(getattr(claim, "claim_kind", "") or (claim.get("claim_kind", "") if isinstance(claim, dict) else "")).lower() + claim_text = str(getattr(claim, "claim_text", "") or (claim.get("claim_text", "") if isinstance(claim, dict) else "")).lower() + + if observation_role in {"distinction", "qualification", "constraint"} or claim_kind in {"distinction", "qualification", "constraint"}: + return "nuance" + if observation_role == "definition" or claim_kind == "definition": + return "overview" + if claim_kind == "mastery_signal" and re.search(r"\b(build|compute|derive|detect|protect|repair|compare|contrast|state why)\b", claim_text): + return "mechanism" + return artifact_role + + def query_concept(store_dir: str | Path, concept_ref: str) -> dict[str, Any] | None: store = GroundRecallStore(store_dir) concepts = store.list_concepts() @@ -111,7 +127,11 @@ def query_concept(store_dir: str | Path, concept_ref: str) -> dict[str, Any] | N "role": observation.role, "origin_path": observation.provenance.origin_path, "grounding_status": observation.provenance.grounding_status, - "source_role": _infer_source_role(artifact) if artifact is not None else "", + "source_role": _role_from_observation_or_claim( + _infer_source_role(artifact) if artifact is not None else "", + observation, + claim, + ), } ) @@ -137,7 +157,11 @@ def query_concept(store_dir: str | Path, concept_ref: str) -> dict[str, Any] | N payload = claim.model_dump() source_roles = sorted( { - _infer_source_role(artifacts[observations[item].artifact_id]) + _role_from_observation_or_claim( + _infer_source_role(artifacts[observations[item].artifact_id]), + observations[item], + claim, + ) for item in claim.source_observation_ids if item in observations and observations[item].artifact_id in artifacts } @@ -299,6 +323,14 @@ def build_query_bundle_for_concept(store_dir: str | Path, concept_ref: str) -> d role = str(artifact.get("source_role", "")).strip() if role: source_role_summary[role] = source_role_summary.get(role, 0) + 1 + claim_role_summary: dict[str, int] = {} + for claim in claims: + for role in claim.get("source_roles", []) or []: + role = str(role).strip() + if role: + claim_role_summary[role] = claim_role_summary.get(role, 0) + 1 + if claim_role_summary: + source_role_summary = dict(sorted(claim_role_summary.items())) key_distinctions = [item["distinction"] for item in claims if isinstance(item.get("distinction"), dict)] return { "bundle_kind": "groundrecall_query_bundle", diff --git a/src/groundrecall/review_export.py b/src/groundrecall/review_export.py index 9717b7f..c984442 100644 --- a/src/groundrecall/review_export.py +++ b/src/groundrecall/review_export.py @@ -368,10 +368,12 @@ def _claim_distinction_payload(claim: dict[str, Any]) -> dict[str, Any] | None: if not text: return None patterns = [ + ("contrast", r"\bcompare\b", "compare"), ("non_implication", r"\bdoes not imply\b", "does not imply"), ("decoupling", r"\b(can|may)\s+occur\s+without\b", "can or may occur without"), ("contrast", r"\bversus\b|\bvs\.\b|\bvs\b", "versus"), ("contrast", r"\brather than\b", "rather than"), + ("contrast", r"\bdiffer(?:s|ed|ent)? from\b|\bdiffers?\b", "differs from"), ("contrast", r"\bdifferent from\b|\bdistinguish(?:ed)? from\b", "different from"), ("contrast", r"\bnot\b.+\bbut\b", "not ... but"), ] @@ -386,6 +388,19 @@ def _claim_distinction_payload(claim: dict[str, Any]) -> dict[str, Any] | None: return None +def _role_from_observation_or_claim(artifact_role: str, observation: dict[str, Any] | None, claim: dict[str, Any] | None) -> str: + observation_role = str((observation or {}).get("role", "") or "").lower() + claim_kind = str((claim or {}).get("claim_kind", "") or "").lower() + claim_text = str((claim or {}).get("claim_text", "") or "").lower() + if observation_role in {"distinction", "qualification", "constraint"} or claim_kind in {"distinction", "qualification", "constraint"}: + return "nuance" + if observation_role == "definition" or claim_kind == "definition": + return "overview" + if claim_kind == "mastery_signal" and re.search(r"\b(build|compute|derive|detect|protect|repair|compare|contrast|state why)\b", claim_text): + return "mechanism" + return artifact_role + + def build_citation_review_entries_from_import(import_dir: str | Path) -> list[CitationReviewEntry]: base = Path(import_dir) manifest = _read_json(base / "manifest.json") @@ -512,7 +527,21 @@ def _build_import_review_payload(session: ReviewSession, import_dir: Path) -> di for claim in concept_claims[:25]: supporting_observations = [observations_by_id[item] for item in claim.get("source_observation_ids", []) if item in observations_by_id] artifact_ids = {item["artifact_id"] for item in supporting_observations} - source_roles = sorted({artifact_role_by_id.get(artifact_id, "") for artifact_id in artifact_ids if artifact_role_by_id.get(artifact_id, "")}) + source_roles = sorted( + { + _role_from_observation_or_claim( + artifact_role_by_id.get(obs.get("artifact_id", ""), ""), + obs, + claim, + ) + for obs in supporting_observations + if _role_from_observation_or_claim( + artifact_role_by_id.get(obs.get("artifact_id", ""), ""), + obs, + claim, + ) + } + ) citation_support = [artifact_citation_summary.get(artifact_id, {}) for artifact_id in artifact_ids] has_citation_support = has_citation_support or any(item.get("has_citation_support") for item in citation_support) analysis = _claim_analysis_metadata(claim) @@ -558,6 +587,11 @@ def _build_import_review_payload(session: ReviewSession, import_dir: Path) -> di "artifact_id": obs.get("artifact_id", ""), "origin_path": obs.get("origin_path", ""), "origin_section": obs.get("origin_section", ""), + "source_role": _role_from_observation_or_claim( + artifact_role_by_id.get(obs.get("artifact_id", ""), ""), + obs, + claim, + ), "text": obs.get("text", ""), "line_start": obs.get("line_start", 0), "line_end": obs.get("line_end", 0), diff --git a/tests/test_groundrecall_query.py b/tests/test_groundrecall_query.py index 94aaeea..6d39b84 100644 --- a/tests/test_groundrecall_query.py +++ b/tests/test_groundrecall_query.py @@ -191,7 +191,7 @@ def test_build_query_bundle_for_concept_is_assistant_neutral(tmp_path: Path) -> assert len(payload["relations"]) == 1 assert payload["source_artifacts"][0]["artifact_id"] == "ia_001" assert payload["source_artifacts"][0]["source_role"] == "mechanism" - assert payload["source_role_summary"]["mechanism"] == 1 + assert payload["source_role_summary"]["mechanism"] == 2 assert payload["key_distinctions"][0]["distinction_type"] == "non_implication" assert payload["relevant_claims"][0]["source_roles"] == ["mechanism"] assert len(payload["review_candidates"]) == 2 diff --git a/tests/test_groundrecall_source_adapters.py b/tests/test_groundrecall_source_adapters.py index 0988e46..3549a3c 100644 --- a/tests/test_groundrecall_source_adapters.py +++ b/tests/test_groundrecall_source_adapters.py @@ -248,6 +248,10 @@ def test_didactopus_pack_import_generates_structured_concepts_and_relations(tmp_ " title: Advanced", " description: Builds on basics.", " prerequisites: [basics]", + " distinctions: [Advanced differs from basics in scope.]", + " definition_candidates: [Advanced is a follow-on concept.]", + " qualification_candidates: [Advanced builds on basics but assumes more context.]", + " constraint_candidates: [Advanced cannot be understood without basics.]", ] ), encoding="utf-8", @@ -275,6 +279,10 @@ def test_didactopus_pack_import_generates_structured_concepts_and_relations(tmp_ claim_ids = {item["claim_id"] for item in result.claims} assert "clm_pack_basics" in claim_ids assert "clm_stage_stage1_basics" in claim_ids + assert "clm_dist_advanced_1" in claim_ids + assert "clm_def_advanced_1" in claim_ids + assert "clm_qual_advanced_1" in claim_ids + assert "clm_constraint_advanced_1" in claim_ids def test_doclift_bundle_import_generates_structured_concepts(tmp_path: Path) -> None: