from __future__ import annotations

import re
from difflib import SequenceMatcher
from itertools import combinations

from .pack_validator import load_pack_artifacts
# Title tokens that suggest a concept is scoped too broadly; matched against
# the token set of each concept title by semantic_qa_for_pack.
BROAD_HINTS = {"and", "overview", "foundations", "introduction", "basics", "advanced"}
def normalize_title(text: str) -> str:
    """Lowercase *text*, collapse non-alphanumeric runs to single spaces, and trim."""
    lowered = text.lower()
    collapsed = re.sub(r"[^a-z0-9]+", " ", lowered)
    return collapsed.strip()
def similarity(a: str, b: str) -> float:
    """Return a 0..1 fuzzy-match ratio between the normalized forms of *a* and *b*."""
    left = normalize_title(a)
    right = normalize_title(b)
    return SequenceMatcher(None, left, right).ratio()
def token_set(text: str) -> set[str]:
    """Return the set of word tokens in the normalized form of *text*."""
    # str.split() with no argument never produces empty strings, so no
    # extra filtering is needed.
    return set(normalize_title(text).split())
def _title_duplicate_warnings(concepts: list[dict]) -> list[str]:
    """Warn about distinct concepts whose titles are near-identical after normalization."""
    out: list[str] = []
    for a, b in combinations(concepts, 2):
        sim = similarity(a.get("title", ""), b.get("title", ""))
        if sim >= 0.86 and a.get("id") != b.get("id"):
            out.append(f"Near-duplicate concept titles: '{a.get('title')}' vs '{b.get('title')}'")
    return out


def _title_breadth_warnings(concepts: list[dict]) -> list[str]:
    """Warn about titles that look over-broad (hint tokens) or compound (' and ')."""
    out: list[str] = []
    for concept in concepts:
        title = concept.get("title", "")
        toks = token_set(title)
        # A long-ish title containing a breadth hint word suggests the concept
        # covers too much ground for a single unit.
        if len(toks) >= 3 and (BROAD_HINTS & toks):
            out.append(f"Concept '{title}' may be over-broad and may need splitting.")
        if " and " in title.lower():
            out.append(f"Concept '{title}' is compound and may combine multiple ideas.")
    return out


def _description_duplicate_warnings(concepts: list[dict]) -> list[str]:
    """Warn about pairs of concepts with very similar (non-trivial) descriptions."""
    # Hoist the string coercion and lowering out of the pairwise loop: the
    # original recomputed both for every (i, j) pair, O(n^2) redundant work.
    described = [
        (c, desc, desc.lower())
        for c, desc in ((c, str(c.get("description", "") or "")) for c in concepts)
    ]
    out: list[str] = []
    for (a, da, la), (b, db, lb) in combinations(described, 2):
        # Only compare substantive descriptions; tiny ones match trivially.
        if len(da) > 20 and len(db) > 20:
            if SequenceMatcher(None, la, lb).ratio() >= 0.82:
                out.append(
                    f"Concept descriptions are very similar: '{a.get('title')}' vs '{b.get('title')}'"
                )
    return out


def _prerequisite_warnings(concepts: list[dict]) -> list[str]:
    """Warn about concepts whose titles look advanced but declare no prerequisites."""
    advanced_markers = ("advanced", "posterior", "model", "inference", "analysis")
    out: list[str] = []
    for concept in concepts:
        title = normalize_title(concept.get("title", ""))
        prereqs = concept.get("prerequisites", []) or []
        if any(marker in title for marker in advanced_markers) and len(prereqs) == 0:
            out.append(f"Concept '{concept.get('title')}' looks advanced but has no prerequisites.")
    return out


def _roadmap_warnings(concepts: list[dict], roadmap: list[dict]) -> list[str]:
    """Warn about adjacent roadmap stages with no visible title-token continuity."""
    concept_by_id = {c.get("id"): c for c in concepts if c.get("id")}

    def stage_titles(stage: dict) -> list[str]:
        # Resolve a stage's concept ids to titles, silently skipping unknown ids.
        return [
            concept_by_id[cid].get("title", "")
            for cid in stage.get("concepts", [])
            if cid in concept_by_id
        ]

    out: list[str] = []
    for current_stage, next_stage in zip(roadmap, roadmap[1:]):
        current_titles = stage_titles(current_stage)
        next_titles = stage_titles(next_stage)
        current_tokens = set().union(*[token_set(t) for t in current_titles]) if current_titles else set()
        next_tokens = set().union(*[token_set(t) for t in next_titles]) if next_titles else set()
        overlap = current_tokens & next_tokens
        if current_titles and next_titles and len(overlap) == 0:
            out.append(
                f"Roadmap transition from stage '{current_stage.get('title')}' to '{next_stage.get('title')}' may lack a bridge concept."
            )
        if len(next_titles) == 1 and len(current_titles) >= 2 and len(overlap) == 0:
            out.append(
                f"Stage '{next_stage.get('title')}' contains a singleton concept with weak visible continuity from the prior stage."
            )
    return out


def semantic_qa_for_pack(source_dir) -> dict:
    """Run heuristic semantic QA over a pack's artifacts.

    Loads the pack via ``load_pack_artifacts(source_dir)`` and collects
    human-readable warnings, in order: near-duplicate titles, over-broad or
    compound titles, near-duplicate descriptions, advanced-looking concepts
    without prerequisites, and roadmap stage transitions lacking visible
    continuity.

    Returns a dict with ``warnings`` (list of strings) and ``summary``
    (``semantic_warning_count`` plus, when the pack loads, ``pack_name``).
    """
    loaded = load_pack_artifacts(source_dir)
    if not loaded["ok"]:
        # Structural validation failed; there is nothing coherent to inspect.
        return {"warnings": [], "summary": {"semantic_warning_count": 0}}

    pack = loaded["artifacts"]["pack"]
    concepts = loaded["artifacts"]["concepts"].get("concepts", []) or []
    roadmap = loaded["artifacts"]["roadmap"].get("stages", []) or []

    # Order matters: concatenation below preserves the original warning order.
    warnings: list[str] = []
    warnings.extend(_title_duplicate_warnings(concepts))
    warnings.extend(_title_breadth_warnings(concepts))
    warnings.extend(_description_duplicate_warnings(concepts))
    warnings.extend(_prerequisite_warnings(concepts))
    warnings.extend(_roadmap_warnings(concepts, roadmap))

    return {
        "warnings": warnings,
        "summary": {"semantic_warning_count": len(warnings), "pack_name": pack.get("name", "")},
    }
|