# Didactopus/src/didactopus/semantic_similarity.py

from collections import Counter
import math


def _tokenize(text: str) -> list[str]:
    cleaned = "".join(ch.lower() if ch.isalnum() else " " for ch in text)
    return [tok for tok in cleaned.split() if tok]


def token_cosine_similarity(text_a: str, text_b: str) -> float:
    """Return the cosine similarity of the token-count vectors of two texts.

    Both texts are tokenized with ``_tokenize`` and counted into
    bag-of-words vectors; the result is the cosine of the angle between
    those vectors.

    Args:
        text_a: First text to compare.
        text_b: Second text to compare.

    Returns:
        ``0.0`` when either text yields no tokens or the texts share no
        token; otherwise the dot product of the count vectors divided by
        the product of their Euclidean norms, clamped to at most ``1.0``.
    """
    counts_a = Counter(_tokenize(text_a))
    counts_b = Counter(_tokenize(text_b))
    if not counts_a or not counts_b:
        return 0.0

    # A Counter reports 0 for a missing key (without inserting it), so
    # tokens that appear on only one side contribute nothing to the sum.
    dot = sum(count * counts_b[token] for token, count in counts_a.items())
    if dot == 0:
        return 0.0

    squared_norm_a = sum(count * count for count in counts_a.values())
    squared_norm_b = sum(count * count for count in counts_b.values())
    # Multiply the exact integer squared norms and take a single square
    # root. Multiplying two separately rounded roots can land just below
    # the true product (sqrt(3) * sqrt(3) == 2.9999999999999996), which
    # made identical texts score slightly above 1.0.
    similarity = dot / math.sqrt(squared_norm_a * squared_norm_b)
    # Guard against any residual rounding so callers can rely on <= 1.0.
    return min(1.0, similarity)


def _concept_text(concept: dict) -> str:
    """Join a concept's title, description, and mastery signals into one string."""
    # The `or` fallbacks also cover keys that are present but set to None,
    # which would otherwise make str.join raise TypeError.
    title = concept.get("title") or ""
    description = concept.get("description") or ""
    signals = concept.get("mastery_signals") or []
    return " ".join([title, description, " ".join(signals)])


def concept_similarity(concept_a: dict, concept_b: dict) -> float:
    """Score how similar two concept dicts are by their textual content.

    For each concept the ``"title"``, ``"description"``, and the items of
    ``"mastery_signals"`` are concatenated with spaces, and the two
    resulting strings are compared with ``token_cosine_similarity``.
    Missing or ``None`` fields are treated as empty.

    Args:
        concept_a: First concept mapping.
        concept_b: Second concept mapping.

    Returns:
        The value returned by ``token_cosine_similarity`` for the two
        combined texts.
    """
    return token_cosine_similarity(_concept_text(concept_a), _concept_text(concept_b))