Promote richer Didactopus pack signals into Notebook bundles

This commit is contained in:
welsberr 2026-05-08 17:16:55 -04:00
parent 97e27946da
commit ef37323016
6 changed files with 197 additions and 70 deletions

View File

@ -47,6 +47,11 @@ class ConceptEntry(BaseModel):
description: str = "" description: str = ""
prerequisites: list[str] = Field(default_factory=list) prerequisites: list[str] = Field(default_factory=list)
mastery_signals: list[str] = Field(default_factory=list) mastery_signals: list[str] = Field(default_factory=list)
source_role: str = ""
distinctions: list[str] = Field(default_factory=list)
definition_candidates: list[str] = Field(default_factory=list)
qualification_candidates: list[str] = Field(default_factory=list)
constraint_candidates: list[str] = Field(default_factory=list)
mastery_profile: MasteryProfileSpec = Field(default_factory=MasteryProfileSpec) mastery_profile: MasteryProfileSpec = Field(default_factory=MasteryProfileSpec)

View File

@ -64,6 +64,52 @@ class DidactopusPackSourceAdapter:
concept_rows: list[dict] = [] concept_rows: list[dict] = []
relation_rows: list[dict] = [] relation_rows: list[dict] = []
def append_claim(
    *,
    claim_id: str,
    observation_id: str,
    artifact_id: str,
    origin_section: str,
    text: str,
    claim_kind: str,
    concept_id: str,
    confidence_hint: float,
    role: str = "summary",
) -> None:
    """Append one observation row and the claim row that cites it.

    The observation is added to ``observation_rows`` with status ``draft``
    and the claim is added to ``claim_rows`` with status ``triaged``. Both
    rows carry the same ``text`` and ``confidence_hint``; the claim lists
    ``observation_id`` as its only source observation and ``concept_id`` as
    its only concept. ``role`` is stored on the observation row only, while
    ``claim_kind`` is stored on the claim row only.

    All arguments are keyword-only. ``context``, ``concepts_src``,
    ``observation_rows`` and ``claim_rows`` are taken from the enclosing
    scope.
    """
    # Observation first: the claim below refers back to it by id.
    observation_row = {
        "observation_id": observation_id,
        "import_id": context.import_id,
        "artifact_id": artifact_id,
        "role": role,
        "text": text,
        "origin_path": concepts_src.relative_path,
        "origin_section": origin_section,
        # Line positions are always written as 0 here.
        "line_start": 0,
        "line_end": 0,
        "grounding_status": "grounded",
        "support_kind": "direct_source",
        "confidence_hint": confidence_hint,
        "current_status": "draft",
    }
    observation_rows.append(observation_row)

    claim_row = {
        "claim_id": claim_id,
        "import_id": context.import_id,
        "claim_text": text,
        "claim_kind": claim_kind,
        "source_observation_ids": [observation_id],
        "supporting_fragment_ids": [],
        "concept_ids": [concept_id],
        "contradicts_claim_ids": [],
        "supersedes_claim_ids": [],
        "confidence_hint": confidence_hint,
        "grounding_status": "grounded",
        "current_status": "triaged",
    }
    claim_rows.append(claim_row)
for source in sources: for source in sources:
artifact_rows.append( artifact_rows.append(
{ {
@ -98,39 +144,64 @@ class DidactopusPackSourceAdapter:
"current_status": "triaged", "current_status": "triaged",
} }
) )
observation_id = f"obs_pack_{concept.id}_{index}" append_claim(
observation_rows.append( claim_id=f"clm_pack_{concept.id}",
{ observation_id=f"obs_pack_{concept.id}_{index}",
"observation_id": observation_id, artifact_id=concepts_artifact_id,
"import_id": context.import_id, origin_section=concept.title,
"artifact_id": concepts_artifact_id, text=concept.description or f"{concept.title} is a concept in pack {pack_name}.",
"role": "summary", claim_kind="summary",
"text": concept.description or concept.title, concept_id=concept_key,
"origin_path": concepts_src.relative_path, confidence_hint=0.85,
"origin_section": concept.title, role="summary",
"line_start": 0,
"line_end": 0,
"grounding_status": "grounded",
"support_kind": "direct_source",
"confidence_hint": 0.85,
"current_status": "draft",
}
) )
claim_rows.append( for item_index, definition in enumerate(concept.definition_candidates, start=1):
{ append_claim(
"claim_id": f"clm_pack_{concept.id}", claim_id=f"clm_def_{concept.id}_{item_index}",
"import_id": context.import_id, observation_id=f"obs_def_{concept.id}_{item_index}",
"claim_text": concept.description or f"{concept.title} is a concept in pack {pack_name}.", artifact_id=concepts_artifact_id,
"claim_kind": "summary", origin_section=f"{concept.title} definition",
"source_observation_ids": [observation_id], text=definition,
"supporting_fragment_ids": [], claim_kind="definition",
"concept_ids": [concept_key], concept_id=concept_key,
"contradicts_claim_ids": [], confidence_hint=0.84,
"supersedes_claim_ids": [], role="definition",
"confidence_hint": 0.85, )
"grounding_status": "grounded", for item_index, distinction in enumerate(concept.distinctions, start=1):
"current_status": "triaged", append_claim(
} claim_id=f"clm_dist_{concept.id}_{item_index}",
observation_id=f"obs_dist_{concept.id}_{item_index}",
artifact_id=concepts_artifact_id,
origin_section=f"{concept.title} distinction",
text=distinction,
claim_kind="distinction",
concept_id=concept_key,
confidence_hint=0.82,
role="distinction",
)
for item_index, qualification in enumerate(concept.qualification_candidates, start=1):
append_claim(
claim_id=f"clm_qual_{concept.id}_{item_index}",
observation_id=f"obs_qual_{concept.id}_{item_index}",
artifact_id=concepts_artifact_id,
origin_section=f"{concept.title} qualification",
text=qualification,
claim_kind="qualification",
concept_id=concept_key,
confidence_hint=0.8,
role="qualification",
)
for item_index, constraint in enumerate(concept.constraint_candidates, start=1):
append_claim(
claim_id=f"clm_constraint_{concept.id}_{item_index}",
observation_id=f"obs_constraint_{concept.id}_{item_index}",
artifact_id=concepts_artifact_id,
origin_section=f"{concept.title} constraint",
text=constraint,
claim_kind="constraint",
concept_id=concept_key,
confidence_hint=0.81,
role="constraint",
) )
for prereq in concept.prerequisites: for prereq in concept.prerequisites:
relation_rows.append( relation_rows.append(
@ -145,39 +216,16 @@ class DidactopusPackSourceAdapter:
} }
) )
for signal_idx, signal in enumerate(concept.mastery_signals, start=1): for signal_idx, signal in enumerate(concept.mastery_signals, start=1):
signal_obs_id = f"obs_signal_{concept.id}_{signal_idx}" append_claim(
observation_rows.append( claim_id=f"clm_signal_{concept.id}_{signal_idx}",
{ observation_id=f"obs_signal_{concept.id}_{signal_idx}",
"observation_id": signal_obs_id, artifact_id=concepts_artifact_id,
"import_id": context.import_id, origin_section=f"{concept.title} mastery signal",
"artifact_id": concepts_artifact_id, text=signal,
"role": "summary", claim_kind="mastery_signal",
"text": signal, concept_id=concept_key,
"origin_path": concepts_src.relative_path, confidence_hint=0.8,
"origin_section": f"{concept.title} mastery signal", role="mastery_signal",
"line_start": 0,
"line_end": 0,
"grounding_status": "grounded",
"support_kind": "direct_source",
"confidence_hint": 0.8,
"current_status": "draft",
}
)
claim_rows.append(
{
"claim_id": f"clm_signal_{concept.id}_{signal_idx}",
"import_id": context.import_id,
"claim_text": signal,
"claim_kind": "mastery_signal",
"source_observation_ids": [signal_obs_id],
"supporting_fragment_ids": [],
"concept_ids": [concept_key],
"contradicts_claim_ids": [],
"supersedes_claim_ids": [],
"confidence_hint": 0.8,
"grounding_status": "grounded",
"current_status": "triaged",
}
) )
if roadmap_payload is not None and roadmap_src is not None: if roadmap_payload is not None and roadmap_src is not None:

View File

@ -53,10 +53,12 @@ def _claim_distinction_payload(claim: dict[str, Any]) -> dict[str, Any] | None:
return None return None
patterns = [ patterns = [
("contrast", r"\bcompare\b", "compare"),
("non_implication", r"\bdoes not imply\b", "does not imply"), ("non_implication", r"\bdoes not imply\b", "does not imply"),
("decoupling", r"\b(can|may)\s+occur\s+without\b", "can or may occur without"), ("decoupling", r"\b(can|may)\s+occur\s+without\b", "can or may occur without"),
("contrast", r"\bversus\b|\bvs\.\b|\bvs\b", "versus"), ("contrast", r"\bversus\b|\bvs\.\b|\bvs\b", "versus"),
("contrast", r"\brather than\b", "rather than"), ("contrast", r"\brather than\b", "rather than"),
("contrast", r"\bdiffer(?:s|ed|ent)? from\b|\bdiffers?\b", "differs from"),
("contrast", r"\bdifferent from\b|\bdistinguish(?:ed)? from\b", "different from"), ("contrast", r"\bdifferent from\b|\bdistinguish(?:ed)? from\b", "different from"),
("contrast", r"\bnot\b.+\bbut\b", "not ... but"), ("contrast", r"\bnot\b.+\bbut\b", "not ... but"),
] ]
@ -71,6 +73,20 @@ def _claim_distinction_payload(claim: dict[str, Any]) -> dict[str, Any] | None:
return None return None
def _role_from_observation_or_claim(artifact_role: str, observation: Any | None, claim: Any | dict[str, Any] | None) -> str:
observation_role = str(getattr(observation, "role", "") or "").lower() if observation is not None else ""
claim_kind = str(getattr(claim, "claim_kind", "") or (claim.get("claim_kind", "") if isinstance(claim, dict) else "")).lower()
claim_text = str(getattr(claim, "claim_text", "") or (claim.get("claim_text", "") if isinstance(claim, dict) else "")).lower()
if observation_role in {"distinction", "qualification", "constraint"} or claim_kind in {"distinction", "qualification", "constraint"}:
return "nuance"
if observation_role == "definition" or claim_kind == "definition":
return "overview"
if claim_kind == "mastery_signal" and re.search(r"\b(build|compute|derive|detect|protect|repair|compare|contrast|state why)\b", claim_text):
return "mechanism"
return artifact_role
def query_concept(store_dir: str | Path, concept_ref: str) -> dict[str, Any] | None: def query_concept(store_dir: str | Path, concept_ref: str) -> dict[str, Any] | None:
store = GroundRecallStore(store_dir) store = GroundRecallStore(store_dir)
concepts = store.list_concepts() concepts = store.list_concepts()
@ -111,7 +127,11 @@ def query_concept(store_dir: str | Path, concept_ref: str) -> dict[str, Any] | N
"role": observation.role, "role": observation.role,
"origin_path": observation.provenance.origin_path, "origin_path": observation.provenance.origin_path,
"grounding_status": observation.provenance.grounding_status, "grounding_status": observation.provenance.grounding_status,
"source_role": _infer_source_role(artifact) if artifact is not None else "", "source_role": _role_from_observation_or_claim(
_infer_source_role(artifact) if artifact is not None else "",
observation,
claim,
),
} }
) )
@ -137,7 +157,11 @@ def query_concept(store_dir: str | Path, concept_ref: str) -> dict[str, Any] | N
payload = claim.model_dump() payload = claim.model_dump()
source_roles = sorted( source_roles = sorted(
{ {
_infer_source_role(artifacts[observations[item].artifact_id]) _role_from_observation_or_claim(
_infer_source_role(artifacts[observations[item].artifact_id]),
observations[item],
claim,
)
for item in claim.source_observation_ids for item in claim.source_observation_ids
if item in observations and observations[item].artifact_id in artifacts if item in observations and observations[item].artifact_id in artifacts
} }
@ -299,6 +323,14 @@ def build_query_bundle_for_concept(store_dir: str | Path, concept_ref: str) -> d
role = str(artifact.get("source_role", "")).strip() role = str(artifact.get("source_role", "")).strip()
if role: if role:
source_role_summary[role] = source_role_summary.get(role, 0) + 1 source_role_summary[role] = source_role_summary.get(role, 0) + 1
claim_role_summary: dict[str, int] = {}
for claim in claims:
for role in claim.get("source_roles", []) or []:
role = str(role).strip()
if role:
claim_role_summary[role] = claim_role_summary.get(role, 0) + 1
if claim_role_summary:
source_role_summary = dict(sorted(claim_role_summary.items()))
key_distinctions = [item["distinction"] for item in claims if isinstance(item.get("distinction"), dict)] key_distinctions = [item["distinction"] for item in claims if isinstance(item.get("distinction"), dict)]
return { return {
"bundle_kind": "groundrecall_query_bundle", "bundle_kind": "groundrecall_query_bundle",

View File

@ -368,10 +368,12 @@ def _claim_distinction_payload(claim: dict[str, Any]) -> dict[str, Any] | None:
if not text: if not text:
return None return None
patterns = [ patterns = [
("contrast", r"\bcompare\b", "compare"),
("non_implication", r"\bdoes not imply\b", "does not imply"), ("non_implication", r"\bdoes not imply\b", "does not imply"),
("decoupling", r"\b(can|may)\s+occur\s+without\b", "can or may occur without"), ("decoupling", r"\b(can|may)\s+occur\s+without\b", "can or may occur without"),
("contrast", r"\bversus\b|\bvs\.\b|\bvs\b", "versus"), ("contrast", r"\bversus\b|\bvs\.\b|\bvs\b", "versus"),
("contrast", r"\brather than\b", "rather than"), ("contrast", r"\brather than\b", "rather than"),
("contrast", r"\bdiffer(?:s|ed|ent)? from\b|\bdiffers?\b", "differs from"),
("contrast", r"\bdifferent from\b|\bdistinguish(?:ed)? from\b", "different from"), ("contrast", r"\bdifferent from\b|\bdistinguish(?:ed)? from\b", "different from"),
("contrast", r"\bnot\b.+\bbut\b", "not ... but"), ("contrast", r"\bnot\b.+\bbut\b", "not ... but"),
] ]
@ -386,6 +388,19 @@ def _claim_distinction_payload(claim: dict[str, Any]) -> dict[str, Any] | None:
return None return None
def _role_from_observation_or_claim(artifact_role: str, observation: dict[str, Any] | None, claim: dict[str, Any] | None) -> str:
observation_role = str((observation or {}).get("role", "") or "").lower()
claim_kind = str((claim or {}).get("claim_kind", "") or "").lower()
claim_text = str((claim or {}).get("claim_text", "") or "").lower()
if observation_role in {"distinction", "qualification", "constraint"} or claim_kind in {"distinction", "qualification", "constraint"}:
return "nuance"
if observation_role == "definition" or claim_kind == "definition":
return "overview"
if claim_kind == "mastery_signal" and re.search(r"\b(build|compute|derive|detect|protect|repair|compare|contrast|state why)\b", claim_text):
return "mechanism"
return artifact_role
def build_citation_review_entries_from_import(import_dir: str | Path) -> list[CitationReviewEntry]: def build_citation_review_entries_from_import(import_dir: str | Path) -> list[CitationReviewEntry]:
base = Path(import_dir) base = Path(import_dir)
manifest = _read_json(base / "manifest.json") manifest = _read_json(base / "manifest.json")
@ -512,7 +527,21 @@ def _build_import_review_payload(session: ReviewSession, import_dir: Path) -> di
for claim in concept_claims[:25]: for claim in concept_claims[:25]:
supporting_observations = [observations_by_id[item] for item in claim.get("source_observation_ids", []) if item in observations_by_id] supporting_observations = [observations_by_id[item] for item in claim.get("source_observation_ids", []) if item in observations_by_id]
artifact_ids = {item["artifact_id"] for item in supporting_observations} artifact_ids = {item["artifact_id"] for item in supporting_observations}
source_roles = sorted({artifact_role_by_id.get(artifact_id, "") for artifact_id in artifact_ids if artifact_role_by_id.get(artifact_id, "")}) source_roles = sorted(
{
_role_from_observation_or_claim(
artifact_role_by_id.get(obs.get("artifact_id", ""), ""),
obs,
claim,
)
for obs in supporting_observations
if _role_from_observation_or_claim(
artifact_role_by_id.get(obs.get("artifact_id", ""), ""),
obs,
claim,
)
}
)
citation_support = [artifact_citation_summary.get(artifact_id, {}) for artifact_id in artifact_ids] citation_support = [artifact_citation_summary.get(artifact_id, {}) for artifact_id in artifact_ids]
has_citation_support = has_citation_support or any(item.get("has_citation_support") for item in citation_support) has_citation_support = has_citation_support or any(item.get("has_citation_support") for item in citation_support)
analysis = _claim_analysis_metadata(claim) analysis = _claim_analysis_metadata(claim)
@ -558,6 +587,11 @@ def _build_import_review_payload(session: ReviewSession, import_dir: Path) -> di
"artifact_id": obs.get("artifact_id", ""), "artifact_id": obs.get("artifact_id", ""),
"origin_path": obs.get("origin_path", ""), "origin_path": obs.get("origin_path", ""),
"origin_section": obs.get("origin_section", ""), "origin_section": obs.get("origin_section", ""),
"source_role": _role_from_observation_or_claim(
artifact_role_by_id.get(obs.get("artifact_id", ""), ""),
obs,
claim,
),
"text": obs.get("text", ""), "text": obs.get("text", ""),
"line_start": obs.get("line_start", 0), "line_start": obs.get("line_start", 0),
"line_end": obs.get("line_end", 0), "line_end": obs.get("line_end", 0),

View File

@ -191,7 +191,7 @@ def test_build_query_bundle_for_concept_is_assistant_neutral(tmp_path: Path) ->
assert len(payload["relations"]) == 1 assert len(payload["relations"]) == 1
assert payload["source_artifacts"][0]["artifact_id"] == "ia_001" assert payload["source_artifacts"][0]["artifact_id"] == "ia_001"
assert payload["source_artifacts"][0]["source_role"] == "mechanism" assert payload["source_artifacts"][0]["source_role"] == "mechanism"
assert payload["source_role_summary"]["mechanism"] == 1 assert payload["source_role_summary"]["mechanism"] == 2
assert payload["key_distinctions"][0]["distinction_type"] == "non_implication" assert payload["key_distinctions"][0]["distinction_type"] == "non_implication"
assert payload["relevant_claims"][0]["source_roles"] == ["mechanism"] assert payload["relevant_claims"][0]["source_roles"] == ["mechanism"]
assert len(payload["review_candidates"]) == 2 assert len(payload["review_candidates"]) == 2

View File

@ -248,6 +248,10 @@ def test_didactopus_pack_import_generates_structured_concepts_and_relations(tmp_
" title: Advanced", " title: Advanced",
" description: Builds on basics.", " description: Builds on basics.",
" prerequisites: [basics]", " prerequisites: [basics]",
" distinctions: [Advanced differs from basics in scope.]",
" definition_candidates: [Advanced is a follow-on concept.]",
" qualification_candidates: [Advanced builds on basics but assumes more context.]",
" constraint_candidates: [Advanced cannot be understood without basics.]",
] ]
), ),
encoding="utf-8", encoding="utf-8",
@ -275,6 +279,10 @@ def test_didactopus_pack_import_generates_structured_concepts_and_relations(tmp_
claim_ids = {item["claim_id"] for item in result.claims} claim_ids = {item["claim_id"] for item in result.claims}
assert "clm_pack_basics" in claim_ids assert "clm_pack_basics" in claim_ids
assert "clm_stage_stage1_basics" in claim_ids assert "clm_stage_stage1_basics" in claim_ids
assert "clm_dist_advanced_1" in claim_ids
assert "clm_def_advanced_1" in claim_ids
assert "clm_qual_advanced_1" in claim_ids
assert "clm_constraint_advanced_1" in claim_ids
def test_doclift_bundle_import_generates_structured_concepts(tmp_path: Path) -> None: def test_doclift_bundle_import_generates_structured_concepts(tmp_path: Path) -> None: