# Didactopus/src/didactopus/semantic_qa.py
from __future__ import annotations

import re
from difflib import SequenceMatcher
from itertools import combinations

from .pack_validator import load_pack_artifacts
# Tokens whose presence in a multi-word concept title suggests the concept
# covers too much ground and may need splitting (breadth check below).
BROAD_HINTS = {"and", "overview", "foundations", "introduction", "basics", "advanced"}
def normalize_title(text: str) -> str:
    """Lowercase *text*, collapse every non-alphanumeric run to one space, and trim."""
    lowered = text.lower()
    collapsed = re.sub(r"[^a-z0-9]+", " ", lowered)
    return collapsed.strip()
def similarity(a: str, b: str) -> float:
    """Return difflib's similarity ratio (0.0-1.0) between the normalized titles."""
    left = normalize_title(a)
    right = normalize_title(b)
    return SequenceMatcher(None, left, right).ratio()
def token_set(text: str) -> set[str]:
    """Return the set of normalized words in *text*.

    ``str.split()`` with no argument never produces empty strings, so no
    extra filtering is required.
    """
    return set(normalize_title(text).split())
def _near_duplicate_title_warnings(concepts: list[dict]) -> list[str]:
    """Warn about distinct concepts whose titles are nearly identical."""
    warnings: list[str] = []
    for a, b in combinations(concepts, 2):
        # Guard: two missing/empty titles would score ratio == 1.0 and
        # produce a meaningless "near-duplicate" warning — skip them.
        if not normalize_title(a.get("title", "")) or not normalize_title(b.get("title", "")):
            continue
        sim = similarity(a.get("title", ""), b.get("title", ""))
        if sim >= 0.86 and a.get("id") != b.get("id"):
            warnings.append(f"Near-duplicate concept titles: '{a.get('title')}' vs '{b.get('title')}'")
    return warnings


def _breadth_warnings(concepts: list[dict]) -> list[str]:
    """Warn about titles that look over-broad or compound ("X and Y")."""
    warnings: list[str] = []
    for concept in concepts:
        title = concept.get("title", "")
        toks = token_set(title)
        # Only multi-word titles (>= 3 tokens) containing a breadth hint.
        if len(toks) >= 3 and (BROAD_HINTS & toks):
            warnings.append(f"Concept '{title}' may be over-broad and may need splitting.")
        if " and " in title.lower():
            warnings.append(f"Concept '{title}' is compound and may combine multiple ideas.")
    return warnings


def _similar_description_warnings(concepts: list[dict]) -> list[str]:
    """Warn when two substantial (> 20 chars) descriptions are near-duplicates."""
    warnings: list[str] = []
    for a, b in combinations(concepts, 2):
        da = str(a.get("description", "") or "")
        db = str(b.get("description", "") or "")
        if len(da) > 20 and len(db) > 20:
            sim = SequenceMatcher(None, da.lower(), db.lower()).ratio()
            if sim >= 0.82:
                warnings.append(
                    f"Concept descriptions are very similar: '{a.get('title')}' vs '{b.get('title')}'"
                )
    return warnings


def _missing_prereq_warnings(concepts: list[dict]) -> list[str]:
    """Warn about advanced-looking concepts that declare no prerequisites.

    Uses whole-token matching (via ``token_set``) rather than substring
    matching, so e.g. "Modeling" no longer false-positives on "model".
    """
    advanced_hints = {"advanced", "posterior", "model", "inference", "analysis"}
    warnings: list[str] = []
    for concept in concepts:
        toks = token_set(concept.get("title", ""))
        prereqs = concept.get("prerequisites", []) or []
        if (advanced_hints & toks) and not prereqs:
            warnings.append(f"Concept '{concept.get('title')}' looks advanced but has no prerequisites.")
    return warnings


def _roadmap_transition_warnings(roadmap: list[dict], concept_by_id: dict) -> list[str]:
    """Warn about consecutive roadmap stages with no shared title vocabulary."""
    warnings: list[str] = []
    for current_stage, next_stage in zip(roadmap, roadmap[1:]):
        current_titles = [
            concept_by_id[cid].get("title", "")
            for cid in current_stage.get("concepts", []) if cid in concept_by_id
        ]
        next_titles = [
            concept_by_id[cid].get("title", "")
            for cid in next_stage.get("concepts", []) if cid in concept_by_id
        ]
        # set().union() with no args is just set(), so empty lists are safe.
        current_tokens = set().union(*map(token_set, current_titles))
        next_tokens = set().union(*map(token_set, next_titles))
        overlap = current_tokens & next_tokens
        if current_titles and next_titles and not overlap:
            warnings.append(
                f"Roadmap transition from stage '{current_stage.get('title')}' to '{next_stage.get('title')}' may lack a bridge concept."
            )
        if len(next_titles) == 1 and len(current_titles) >= 2 and not overlap:
            warnings.append(
                f"Stage '{next_stage.get('title')}' contains a singleton concept with weak visible continuity from the prior stage."
            )
    return warnings


def semantic_qa_for_pack(source_dir) -> dict:
    """Run semantic QA checks over a pack's artifacts and collect warnings.

    Checks, in report order: near-duplicate concept titles, over-broad or
    compound titles, near-duplicate descriptions, advanced concepts lacking
    prerequisites, and roadmap stage transitions without a visible bridge.

    Args:
        source_dir: Directory passed through to ``load_pack_artifacts``.

    Returns:
        ``{"warnings": [...], "summary": {"semantic_warning_count": int,
        "pack_name": str}}``. When structural validation fails, returns an
        empty warning list (semantic checks would be meaningless).
    """
    loaded = load_pack_artifacts(source_dir)
    if not loaded["ok"]:
        return {"warnings": [], "summary": {"semantic_warning_count": 0}}
    pack = loaded["artifacts"]["pack"]
    concepts = loaded["artifacts"]["concepts"].get("concepts", []) or []
    roadmap = loaded["artifacts"]["roadmap"].get("stages", []) or []
    # Only concepts with a truthy id can be referenced from roadmap stages.
    concept_by_id = {c.get("id"): c for c in concepts if c.get("id")}

    warnings: list[str] = []
    warnings += _near_duplicate_title_warnings(concepts)
    warnings += _breadth_warnings(concepts)
    warnings += _similar_description_warnings(concepts)
    warnings += _missing_prereq_warnings(concepts)
    warnings += _roadmap_transition_warnings(roadmap, concept_by_id)
    return {
        "warnings": warnings,
        "summary": {"semantic_warning_count": len(warnings), "pack_name": pack.get("name", "")},
    }