# Didactopus/src/didactopus/coverage_alignment_qa.py
import re
from .pack_validator import load_pack_artifacts
def tokenize(text: str) -> set[str]:
    """Return the set of lowercase alphanumeric word tokens in *text*.

    Non-string input is coerced with ``str()`` first; runs of characters
    outside ``[a-z0-9]`` act as token separators.
    """
    return set(re.findall(r"[a-z0-9]+", str(text).lower()))
def _concept_title_tokens(title: str) -> set[str]:
    """Tokenize a concept title, discarding common English stop words."""
    stop_words = {"the", "of", "and", "to", "for", "in", "on", "a", "an"}
    return tokenize(title) - stop_words
def coverage_alignment_for_pack(source_dir):
    """Cross-check a pack's artifacts for coverage and alignment gaps.

    Loads the pack via ``load_pack_artifacts`` and performs lexical
    cross-checks between concepts, roadmap stages (and their checkpoints),
    project prerequisites/deliverables, and rubric criteria.

    Parameters
    ----------
    source_dir : path-like forwarded to ``load_pack_artifacts``.

    Returns
    -------
    dict with two keys:
      - ``"warnings"``: list of human-readable coverage warnings.
      - ``"summary"``: coverage counts derived from the checks.

    If the pack fails to load, returns an empty warning list instead of
    raising — load errors are assumed to be reported by the validator.
    """
    loaded = load_pack_artifacts(source_dir)
    if not loaded["ok"]:
        return {"warnings": [], "summary": {"coverage_warning_count": 0}}

    artifacts = loaded["artifacts"]
    concepts = artifacts["concepts"].get("concepts", []) or []
    roadmap = artifacts["roadmap"].get("stages", []) or []
    projects = artifacts["projects"].get("projects", []) or []
    rubrics = artifacts["rubrics"].get("rubrics", []) or []

    concept_by_id = {c.get("id"): c for c in concepts if c.get("id")}
    concept_ids = set(concept_by_id)

    # Concept ids referenced anywhere in the roadmap stages.
    roadmap_ids = {cid for stage in roadmap for cid in (stage.get("concepts", []) or [])}
    # Bag of words across all stage checkpoint items.
    checkpoint_tokens = tokenize(
        " ".join(str(item) for stage in roadmap for item in (stage.get("checkpoint", []) or []))
    )
    # Ids named as project prerequisites; may include dangling ids that
    # match no concept.
    project_ids = {cid for project in projects for cid in (project.get("prerequisites", []) or [])}
    # Bag of words across all project deliverables.
    deliverable_tokens = tokenize(
        " ".join(str(item) for project in projects for item in (project.get("deliverables", []) or []))
    )

    checkpoint_ids = set()
    # FIX: previously seeded with the raw prerequisite ids, so dangling
    # prerequisites inflated "assessed_concept_count" past "concept_count".
    # Only ids backed by an actual concept count as assessed.
    assessed_ids = set(project_ids & concept_ids)
    warnings = []

    for cid, concept in concept_by_id.items():
        title_tokens = _concept_title_tokens(concept.get("title", ""))
        if cid not in roadmap_ids:
            warnings.append(f"Concept '{cid}' does not appear in any roadmap stage.")
        # A concept counts as checkpoint-covered when any non-stopword title
        # token appears in checkpoint language.
        if title_tokens and (title_tokens & checkpoint_tokens):
            checkpoint_ids.add(cid)
        else:
            warnings.append(f"Concept '{cid}' is not reflected in checkpoint language.")
        if cid not in project_ids:
            warnings.append(f"Concept '{cid}' is not referenced by any project prerequisites.")
        if cid in project_ids or cid in checkpoint_ids:
            assessed_ids.add(cid)
        else:
            warnings.append(f"Concept '{cid}' is never covered by checkpoints or projects.")

    # Each mastery signal should surface in checkpoint or deliverable wording.
    for cid, concept in concept_by_id.items():
        for signal in concept.get("mastery_signals", []) or []:
            signal_tokens = tokenize(signal)
            if signal_tokens and not ((signal_tokens & checkpoint_tokens) or (signal_tokens & deliverable_tokens)):
                warnings.append(f"Mastery signal for concept '{cid}' is not reflected in checkpoints or project deliverables.")

    # Rubric criteria should lexically overlap with what is actually
    # assessed (deliverables + mastery signals).
    rubric_tokens = set()
    for rubric in rubrics:
        for criterion in rubric.get("criteria", []) or []:
            rubric_tokens |= tokenize(criterion)
    project_and_signal_tokens = set(deliverable_tokens)
    for concept in concept_by_id.values():
        for signal in concept.get("mastery_signals", []) or []:
            project_and_signal_tokens |= tokenize(signal)
    if rubric_tokens and not (rubric_tokens & project_and_signal_tokens):
        warnings.append("Rubric criteria show weak lexical overlap with mastery signals and project deliverables.")

    concept_count = max(1, len(concept_by_id))
    # FIX: measure project breadth against prerequisites that name real
    # concepts; dangling ids previously masked narrow coverage here.
    if projects and len(project_ids & concept_ids) <= max(1, concept_count // 4):
        warnings.append("Projects appear to cover only a narrow subset of the concept set.")

    return {
        "warnings": warnings,
        "summary": {
            "coverage_warning_count": len(warnings),
            "concept_count": len(concept_by_id),
            "roadmap_covered_count": len(roadmap_ids & concept_ids),
            "checkpoint_covered_count": len(checkpoint_ids),
            "project_covered_count": len(project_ids & concept_ids),
            "assessed_concept_count": len(assessed_ids),
        },
    }