"""Token-based cosine-similarity utilities for comparing concept records."""
from collections import Counter
import math


def _tokenize(text: str) -> list[str]:
|
|
cleaned = "".join(ch.lower() if ch.isalnum() else " " for ch in text)
|
|
return [tok for tok in cleaned.split() if tok]
|
|
|
|
|
|
def token_cosine_similarity(text_a: str, text_b: str) -> float:
    """Return the cosine similarity between the token-count vectors of two texts.

    Tokens are lowercase alphanumeric runs; every other character is a
    separator.  Returns 0.0 when either text produces no tokens.
    """

    def split_tokens(text: str) -> list[str]:
        # Lowercase letters/digits; map everything else to a space, then split.
        normalized = "".join(c.lower() if c.isalnum() else " " for c in text)
        return normalized.split()

    counts_a = Counter(split_tokens(text_a))
    counts_b = Counter(split_tokens(text_b))
    if not counts_a or not counts_b:
        return 0.0
    # Dot product only needs the vocabulary shared by both texts.
    overlap = counts_a.keys() & counts_b.keys()
    dot_product = sum(counts_a[tok] * counts_b[tok] for tok in overlap)
    norm_a = math.sqrt(sum(c * c for c in counts_a.values()))
    norm_b = math.sqrt(sum(c * c for c in counts_b.values()))
    if norm_a == 0 or norm_b == 0:
        # Defensive guard; unreachable when both counters are non-empty.
        return 0.0
    return dot_product / (norm_a * norm_b)


def concept_similarity(concept_a: dict, concept_b: dict) -> float:
    """Return the token cosine similarity between two concept dicts.

    Each concept's comparison text is assembled from its ``title``,
    ``description``, and ``mastery_signals`` entries; missing keys are
    treated as empty.
    """

    def flatten(concept: dict) -> str:
        parts = [
            concept.get("title", ""),
            concept.get("description", ""),
            " ".join(concept.get("mastery_signals", [])),
        ]
        return " ".join(parts)

    return token_cosine_similarity(flatten(concept_a), flatten(concept_b))