Enrich OCW concept extraction for Notebook signals
This commit is contained in:
parent
628a9f050b
commit
3371d3f30b
|
|
@ -47,6 +47,11 @@ class ConceptEntry(BaseModel):
|
||||||
description: str = ""
|
description: str = ""
|
||||||
prerequisites: list[str] = Field(default_factory=list)
|
prerequisites: list[str] = Field(default_factory=list)
|
||||||
mastery_signals: list[str] = Field(default_factory=list)
|
mastery_signals: list[str] = Field(default_factory=list)
|
||||||
|
source_role: str = ""
|
||||||
|
distinctions: list[str] = Field(default_factory=list)
|
||||||
|
definition_candidates: list[str] = Field(default_factory=list)
|
||||||
|
qualification_candidates: list[str] = Field(default_factory=list)
|
||||||
|
constraint_candidates: list[str] = Field(default_factory=list)
|
||||||
mastery_profile: MasteryProfileSpec = Field(default_factory=MasteryProfileSpec)
|
mastery_profile: MasteryProfileSpec = Field(default_factory=MasteryProfileSpec)
|
||||||
|
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -6,6 +6,7 @@ from .course_schema import NormalizedCourse, NormalizedSourceRecord, Module, Les
|
||||||
|
|
||||||
HEADING_RE = re.compile(r"^(#{1,3})\s+(.*)$")
|
HEADING_RE = re.compile(r"^(#{1,3})\s+(.*)$")
|
||||||
BULLET_RE = re.compile(r"^\s*[-*+]\s+(.*)$")
|
BULLET_RE = re.compile(r"^\s*[-*+]\s+(.*)$")
|
||||||
|
SENTENCE_SPLIT_RE = re.compile(r"(?<=[.!?])\s+")
|
||||||
|
|
||||||
|
|
||||||
def slugify(text: str) -> str:
|
def slugify(text: str) -> str:
|
||||||
|
|
@ -43,6 +44,120 @@ def extract_key_terms(text: str, min_term_length: int = 4, max_terms: int = 8) -
|
||||||
return ordered
|
return ordered
|
||||||
|
|
||||||
|
|
||||||
|
def _lesson_sentences(lesson: Lesson) -> list[str]:
    """Split a lesson's body, objectives, and exercises into sentences.

    Each part is split on its own, so an objective or exercise written
    without terminal punctuation is never fused with the text that follows
    it. Blank or missing parts are ignored.

    Args:
        lesson: Lesson providing ``body``, ``objectives``, and ``exercises``.

    Returns:
        Sentences in source order (body first, then objectives, then
        exercises), whitespace-normalized, with leading/trailing spaces and
        dashes removed. Empty when the lesson has no usable text.
    """
    sentences: list[str] = []
    for part in (lesson.body, *lesson.objectives, *lesson.exercises):
        if not part or not part.strip():
            continue
        for chunk in SENTENCE_SPLIT_RE.split(part.strip()):
            # Collapse internal whitespace (wrapped lines included) and drop
            # stray list dashes left at either end of the fragment.
            text = " ".join(chunk.split()).strip(" -")
            if text:
                sentences.append(text)
    return sentences
|
||||||
|
|
||||||
|
|
||||||
|
def _compact_description(lesson: Lesson, max_chars: int = 320) -> str:
    """Build a short description from the lesson's first one or two sentences.

    The first sentence is always used. The second is appended only when both,
    joined by a single space, fit within ``max_chars``. A sentence without
    terminal punctuation gets a trailing period.

    Args:
        lesson: Lesson whose sentences are summarized.
        max_chars: Maximum length of the returned description.

    Returns:
        The lesson title when the lesson yields no sentences; otherwise the
        assembled text, cut to ``max_chars`` characters.
    """

    def _terminated(sentence: str) -> str:
        # Ensure the fragment reads as a complete sentence.
        if sentence.endswith((".", "!", "?")):
            return sentence
        return sentence + "."

    sentences = _lesson_sentences(lesson)
    if not sentences:
        return lesson.title

    lead = _terminated(sentences[0])
    description = lead
    if len(sentences) > 1:
        follow_up = _terminated(sentences[1])
        # +1 accounts for the space that joins the two sentences.
        if len(lead) + 1 + len(follow_up) <= max_chars:
            description = f"{lead} {follow_up}"
    return description.strip()[:max_chars]
|
||||||
|
|
||||||
|
|
||||||
|
def _extract_sentences_by_patterns(sentences: list[str], patterns: list[str], max_items: int = 3) -> list[str]:
|
||||||
|
compiled = [re.compile(pattern, re.IGNORECASE) for pattern in patterns]
|
||||||
|
out: list[str] = []
|
||||||
|
seen: set[str] = set()
|
||||||
|
for sentence in sentences:
|
||||||
|
lowered = sentence.lower()
|
||||||
|
if lowered in seen:
|
||||||
|
continue
|
||||||
|
if any(pattern.search(sentence) for pattern in compiled):
|
||||||
|
seen.add(lowered)
|
||||||
|
out.append(sentence)
|
||||||
|
if len(out) >= max_items:
|
||||||
|
break
|
||||||
|
return out
|
||||||
|
|
||||||
|
|
||||||
|
def _infer_source_role(module: Module, lesson: Lesson, distinctions: list[str], qualifications: list[str], constraints: list[str]) -> str:
|
||||||
|
title_blob = " ".join([module.title, lesson.title, lesson.body]).lower()
|
||||||
|
if distinctions or qualifications or constraints:
|
||||||
|
return "nuance"
|
||||||
|
if any(token in title_blob for token in ("foundation", "background", "course identity", "course description", "reading base", "learning norms")):
|
||||||
|
return "overview"
|
||||||
|
if any(token in title_blob for token in ("coding", "capacity", "compression", "error-correcting", "error correcting", "mutual information", "reversible", "quantum", "cryptography", "noise")):
|
||||||
|
return "mechanism"
|
||||||
|
return "overview"
|
||||||
|
|
||||||
|
|
||||||
|
def _concept_enrichment(module: Module, lesson: Lesson) -> dict[str, list[str] | str]:
    """Collect pattern-matched sentence candidates and a source role for a lesson.

    The lesson's sentences are scanned four times, once per cue family
    (distinctions, definitions, qualifications, constraints). The source role
    is then inferred from the module, the lesson, and the extracted lists.

    Args:
        module: Module that contains the lesson.
        lesson: Lesson whose text is analyzed.

    Returns:
        A dict with keys ``source_role`` (str) and ``distinctions``,
        ``definition_candidates``, ``qualification_candidates``,
        ``constraint_candidates`` (lists of sentences).
    """
    distinction_patterns = [
        r"\bcompare\b",
        r"\bcontrast\b",
        r"\bdistinguish\b",
        r"\bdiffer(?:ent|s)?\b",
        r"\brelat(?:e|es|ed)\b.+\band\b",
        r"\bnot\b.+\bbut\b",
        r"\bversus\b|\bvs\.?\b",
    ]
    definition_patterns = [
        r"\bis (?:a|an|the)\b",
        r"\bmeasure of\b",
        r"\brefers to\b",
        r"\bdefined as\b",
        r"\btreated as\b",
    ]
    qualification_patterns = [
        r"\bbut\b",
        r"\bhowever\b",
        r"\bwhile\b",
        r"\balthough\b",
        r"\bcareful\b",
        r"\bnot identical\b",
        r"\bdangerous\b",
    ]
    constraint_patterns = [
        r"\bimpossible\b",
        r"\blimit(?:s)?\b",
        r"\bfailure mode(?:s)?\b",
        r"\bcannot\b",
        r"\bonly up to\b",
        r"\bin the long run\b",
        r"\babove capacity\b",
    ]

    sentences = _lesson_sentences(lesson)
    distinctions = _extract_sentences_by_patterns(sentences, distinction_patterns)
    definitions = _extract_sentences_by_patterns(sentences, definition_patterns)
    qualifications = _extract_sentences_by_patterns(sentences, qualification_patterns)
    constraints = _extract_sentences_by_patterns(sentences, constraint_patterns)

    # Definitions are reported but do not influence the inferred role.
    role = _infer_source_role(module, lesson, distinctions, qualifications, constraints)
    return {
        "source_role": role,
        "distinctions": distinctions,
        "definition_candidates": definitions,
        "qualification_candidates": qualifications,
        "constraint_candidates": constraints,
    }
|
||||||
|
|
||||||
|
|
||||||
def parse_markdown_like(text: str, title: str, source_name: str, source_path: str) -> NormalizedSourceRecord:
|
def parse_markdown_like(text: str, title: str, source_name: str, source_path: str) -> NormalizedSourceRecord:
|
||||||
lines = text.splitlines()
|
lines = text.splitlines()
|
||||||
modules: list[Module] = []
|
modules: list[Module] = []
|
||||||
|
|
@ -167,6 +282,7 @@ def extract_concept_candidates(course: NormalizedCourse) -> list[ConceptCandidat
|
||||||
seen_ids: set[str] = set()
|
seen_ids: set[str] = set()
|
||||||
for module in course.modules:
|
for module in course.modules:
|
||||||
for lesson in module.lessons:
|
for lesson in module.lessons:
|
||||||
|
enrichment = _concept_enrichment(module, lesson)
|
||||||
title_id = slugify(lesson.title)
|
title_id = slugify(lesson.title)
|
||||||
if title_id not in seen_ids:
|
if title_id not in seen_ids:
|
||||||
seen_ids.add(title_id)
|
seen_ids.add(title_id)
|
||||||
|
|
@ -174,10 +290,16 @@ def extract_concept_candidates(course: NormalizedCourse) -> list[ConceptCandidat
|
||||||
ConceptCandidate(
|
ConceptCandidate(
|
||||||
id=title_id,
|
id=title_id,
|
||||||
title=lesson.title,
|
title=lesson.title,
|
||||||
description=lesson.body[:240].strip(),
|
description=_compact_description(lesson),
|
||||||
source_modules=[module.title],
|
source_modules=[module.title],
|
||||||
source_lessons=[lesson.title],
|
source_lessons=[lesson.title],
|
||||||
|
source_courses=list(lesson.source_refs),
|
||||||
mastery_signals=list(lesson.objectives[:3] or lesson.exercises[:2]),
|
mastery_signals=list(lesson.objectives[:3] or lesson.exercises[:2]),
|
||||||
|
source_role=str(enrichment["source_role"]),
|
||||||
|
distinctions=list(enrichment["distinctions"]),
|
||||||
|
definition_candidates=list(enrichment["definition_candidates"]),
|
||||||
|
qualification_candidates=list(enrichment["qualification_candidates"]),
|
||||||
|
constraint_candidates=list(enrichment["constraint_candidates"]),
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
for term in lesson.key_terms:
|
for term in lesson.key_terms:
|
||||||
|
|
@ -192,7 +314,13 @@ def extract_concept_candidates(course: NormalizedCourse) -> list[ConceptCandidat
|
||||||
description=f"Candidate concept extracted from lesson '{lesson.title}'.",
|
description=f"Candidate concept extracted from lesson '{lesson.title}'.",
|
||||||
source_modules=[module.title],
|
source_modules=[module.title],
|
||||||
source_lessons=[lesson.title],
|
source_lessons=[lesson.title],
|
||||||
|
source_courses=list(lesson.source_refs),
|
||||||
mastery_signals=list(lesson.objectives[:2]),
|
mastery_signals=list(lesson.objectives[:2]),
|
||||||
|
source_role=str(enrichment["source_role"]),
|
||||||
|
distinctions=list(enrichment["distinctions"][:1]),
|
||||||
|
definition_candidates=list(enrichment["definition_candidates"][:1]),
|
||||||
|
qualification_candidates=list(enrichment["qualification_candidates"][:1]),
|
||||||
|
constraint_candidates=list(enrichment["constraint_candidates"][:1]),
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
return concepts
|
return concepts
|
||||||
|
|
|
||||||
|
|
@ -58,6 +58,11 @@ class ConceptCandidate(BaseModel):
|
||||||
source_courses: list[str] = Field(default_factory=list)
|
source_courses: list[str] = Field(default_factory=list)
|
||||||
prerequisites: list[str] = Field(default_factory=list)
|
prerequisites: list[str] = Field(default_factory=list)
|
||||||
mastery_signals: list[str] = Field(default_factory=list)
|
mastery_signals: list[str] = Field(default_factory=list)
|
||||||
|
source_role: str = ""
|
||||||
|
distinctions: list[str] = Field(default_factory=list)
|
||||||
|
definition_candidates: list[str] = Field(default_factory=list)
|
||||||
|
qualification_candidates: list[str] = Field(default_factory=list)
|
||||||
|
constraint_candidates: list[str] = Field(default_factory=list)
|
||||||
|
|
||||||
|
|
||||||
class DraftPack(BaseModel):
|
class DraftPack(BaseModel):
|
||||||
|
|
|
||||||
|
|
@ -102,6 +102,11 @@ def build_draft_pack(
|
||||||
"description": c.description,
|
"description": c.description,
|
||||||
"prerequisites": c.prerequisites,
|
"prerequisites": c.prerequisites,
|
||||||
"mastery_signals": c.mastery_signals,
|
"mastery_signals": c.mastery_signals,
|
||||||
|
"source_role": c.source_role,
|
||||||
|
"distinctions": c.distinctions,
|
||||||
|
"definition_candidates": c.definition_candidates,
|
||||||
|
"qualification_candidates": c.qualification_candidates,
|
||||||
|
"constraint_candidates": c.constraint_candidates,
|
||||||
"mastery_profile": {},
|
"mastery_profile": {},
|
||||||
}
|
}
|
||||||
for c in concepts
|
for c in concepts
|
||||||
|
|
|
||||||
|
|
@ -24,6 +24,11 @@ def export_promoted_pack(session: ReviewSession, outdir: str | Path) -> None:
|
||||||
"description": concept.description,
|
"description": concept.description,
|
||||||
"prerequisites": concept.prerequisites,
|
"prerequisites": concept.prerequisites,
|
||||||
"mastery_signals": concept.mastery_signals,
|
"mastery_signals": concept.mastery_signals,
|
||||||
|
"source_role": concept.source_role,
|
||||||
|
"distinctions": concept.distinctions,
|
||||||
|
"definition_candidates": concept.definition_candidates,
|
||||||
|
"qualification_candidates": concept.qualification_candidates,
|
||||||
|
"constraint_candidates": concept.constraint_candidates,
|
||||||
"status": concept.status,
|
"status": concept.status,
|
||||||
"notes": concept.notes,
|
"notes": concept.notes,
|
||||||
"mastery_profile": {},
|
"mastery_profile": {},
|
||||||
|
|
|
||||||
|
|
@ -15,6 +15,11 @@ def load_draft_pack(pack_dir: str | Path) -> DraftPackData:
|
||||||
description=item.get("description", ""),
|
description=item.get("description", ""),
|
||||||
prerequisites=list(item.get("prerequisites", [])),
|
prerequisites=list(item.get("prerequisites", [])),
|
||||||
mastery_signals=list(item.get("mastery_signals", [])),
|
mastery_signals=list(item.get("mastery_signals", [])),
|
||||||
|
source_role=item.get("source_role", ""),
|
||||||
|
distinctions=list(item.get("distinctions", [])),
|
||||||
|
definition_candidates=list(item.get("definition_candidates", [])),
|
||||||
|
qualification_candidates=list(item.get("qualification_candidates", [])),
|
||||||
|
constraint_candidates=list(item.get("constraint_candidates", [])),
|
||||||
status=item.get("status", "needs_review"),
|
status=item.get("status", "needs_review"),
|
||||||
notes=list(item.get("notes", [])),
|
notes=list(item.get("notes", [])),
|
||||||
)
|
)
|
||||||
|
|
|
||||||
|
|
@ -10,6 +10,11 @@ class ConceptReviewEntry(BaseModel):
|
||||||
description: str = ""
|
description: str = ""
|
||||||
prerequisites: list[str] = Field(default_factory=list)
|
prerequisites: list[str] = Field(default_factory=list)
|
||||||
mastery_signals: list[str] = Field(default_factory=list)
|
mastery_signals: list[str] = Field(default_factory=list)
|
||||||
|
source_role: str = ""
|
||||||
|
distinctions: list[str] = Field(default_factory=list)
|
||||||
|
definition_candidates: list[str] = Field(default_factory=list)
|
||||||
|
qualification_candidates: list[str] = Field(default_factory=list)
|
||||||
|
constraint_candidates: list[str] = Field(default_factory=list)
|
||||||
status: TrustStatus = "needs_review"
|
status: TrustStatus = "needs_review"
|
||||||
notes: list[str] = Field(default_factory=list)
|
notes: list[str] = Field(default_factory=list)
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -3,6 +3,7 @@ from __future__ import annotations
|
||||||
import re
|
import re
|
||||||
from collections import defaultdict
|
from collections import defaultdict
|
||||||
from .course_schema import NormalizedDocument, NormalizedCourse, Module, Lesson, TopicBundle, ConceptCandidate
|
from .course_schema import NormalizedDocument, NormalizedCourse, Module, Lesson, TopicBundle, ConceptCandidate
|
||||||
|
SENTENCE_SPLIT_RE = re.compile(r"(?<=[.!?])\s+")
|
||||||
|
|
||||||
GENERIC_TERM_STOPWORDS = {
|
GENERIC_TERM_STOPWORDS = {
|
||||||
"attribution",
|
"attribution",
|
||||||
|
|
@ -77,6 +78,120 @@ def _parse_signal_line(line: str) -> tuple[str | None, str]:
|
||||||
if lowered.startswith("exercise:"):
|
if lowered.startswith("exercise:"):
|
||||||
return "exercise", stripped.split(":", 1)[1].strip()
|
return "exercise", stripped.split(":", 1)[1].strip()
|
||||||
return None, stripped
|
return None, stripped
|
||||||
|
|
||||||
|
|
||||||
|
def _lesson_sentences(lesson: Lesson) -> list[str]:
    """Split a lesson's body, objectives, and exercises into sentences.

    Each part is split on its own, so an objective or exercise written
    without terminal punctuation is never fused with the text that follows
    it. Blank or missing parts are ignored.

    Args:
        lesson: Lesson providing ``body``, ``objectives``, and ``exercises``.

    Returns:
        Sentences in source order (body first, then objectives, then
        exercises), whitespace-normalized, with leading/trailing spaces and
        dashes removed. Empty when the lesson has no usable text.
    """
    sentences: list[str] = []
    for part in (lesson.body, *lesson.objectives, *lesson.exercises):
        if not part or not part.strip():
            continue
        for chunk in SENTENCE_SPLIT_RE.split(part.strip()):
            # Collapse internal whitespace (wrapped lines included) and drop
            # stray list dashes left at either end of the fragment.
            text = " ".join(chunk.split()).strip(" -")
            if text:
                sentences.append(text)
    return sentences
|
||||||
|
|
||||||
|
|
||||||
|
def _compact_description(lesson: Lesson, max_chars: int = 320) -> str:
    """Build a short description from the lesson's first one or two sentences.

    The first sentence is always used. The second is appended only when both,
    joined by a single space, fit within ``max_chars``. A sentence without
    terminal punctuation gets a trailing period.

    Args:
        lesson: Lesson whose sentences are summarized.
        max_chars: Maximum length of the returned description.

    Returns:
        The lesson title when the lesson yields no sentences; otherwise the
        assembled text, cut to ``max_chars`` characters.
    """

    def _terminated(sentence: str) -> str:
        # Ensure the fragment reads as a complete sentence.
        if sentence.endswith((".", "!", "?")):
            return sentence
        return sentence + "."

    sentences = _lesson_sentences(lesson)
    if not sentences:
        return lesson.title

    lead = _terminated(sentences[0])
    description = lead
    if len(sentences) > 1:
        follow_up = _terminated(sentences[1])
        # +1 accounts for the space that joins the two sentences.
        if len(lead) + 1 + len(follow_up) <= max_chars:
            description = f"{lead} {follow_up}"
    return description.strip()[:max_chars]
|
||||||
|
|
||||||
|
|
||||||
|
def _extract_sentences_by_patterns(sentences: list[str], patterns: list[str], max_items: int = 3) -> list[str]:
|
||||||
|
compiled = [re.compile(pattern, re.IGNORECASE) for pattern in patterns]
|
||||||
|
out: list[str] = []
|
||||||
|
seen: set[str] = set()
|
||||||
|
for sentence in sentences:
|
||||||
|
lowered = sentence.lower()
|
||||||
|
if lowered in seen:
|
||||||
|
continue
|
||||||
|
if any(pattern.search(sentence) for pattern in compiled):
|
||||||
|
seen.add(lowered)
|
||||||
|
out.append(sentence)
|
||||||
|
if len(out) >= max_items:
|
||||||
|
break
|
||||||
|
return out
|
||||||
|
|
||||||
|
|
||||||
|
def _infer_source_role(module: Module, lesson: Lesson, distinctions: list[str], qualifications: list[str], constraints: list[str]) -> str:
|
||||||
|
title_blob = " ".join([module.title, lesson.title, lesson.body]).lower()
|
||||||
|
if distinctions or qualifications or constraints:
|
||||||
|
return "nuance"
|
||||||
|
if any(token in title_blob for token in ("foundation", "overview", "introduction", "background")):
|
||||||
|
return "overview"
|
||||||
|
if any(token in title_blob for token in ("method", "model", "test", "mechanism", "process", "coding", "capacity")):
|
||||||
|
return "mechanism"
|
||||||
|
return "overview"
|
||||||
|
|
||||||
|
|
||||||
|
def _concept_enrichment(module: Module, lesson: Lesson) -> dict[str, list[str] | str]:
    """Collect pattern-matched sentence candidates and a source role for a lesson.

    The lesson's sentences are scanned four times, once per cue family
    (distinctions, definitions, qualifications, constraints). The source role
    is then inferred from the module, the lesson, and the extracted lists.

    Args:
        module: Module that contains the lesson.
        lesson: Lesson whose text is analyzed.

    Returns:
        A dict with keys ``source_role`` (str) and ``distinctions``,
        ``definition_candidates``, ``qualification_candidates``,
        ``constraint_candidates`` (lists of sentences).
    """
    distinction_patterns = [
        r"\bcompare\b",
        r"\bcontrast\b",
        r"\bdistinguish\b",
        r"\bdiffer(?:ent|s)?\b",
        r"\brelat(?:e|es|ed)\b.+\band\b",
        r"\bnot\b.+\bbut\b",
        r"\bversus\b|\bvs\.?\b",
    ]
    definition_patterns = [
        r"\bis (?:a|an|the)\b",
        r"\bmeasure of\b",
        r"\brefers to\b",
        r"\bdefined as\b",
        r"\btreated as\b",
    ]
    qualification_patterns = [
        r"\bbut\b",
        r"\bhowever\b",
        r"\bwhile\b",
        r"\balthough\b",
        r"\bcareful\b",
        r"\bnot identical\b",
        r"\bdangerous\b",
    ]
    constraint_patterns = [
        r"\bimpossible\b",
        r"\blimit(?:s)?\b",
        r"\bfailure mode(?:s)?\b",
        r"\bcannot\b",
        r"\bonly up to\b",
        r"\bin the long run\b",
        r"\babove capacity\b",
    ]

    sentences = _lesson_sentences(lesson)
    distinctions = _extract_sentences_by_patterns(sentences, distinction_patterns)
    definitions = _extract_sentences_by_patterns(sentences, definition_patterns)
    qualifications = _extract_sentences_by_patterns(sentences, qualification_patterns)
    constraints = _extract_sentences_by_patterns(sentences, constraint_patterns)

    # Definitions are reported but do not influence the inferred role.
    role = _infer_source_role(module, lesson, distinctions, qualifications, constraints)
    return {
        "source_role": role,
        "distinctions": distinctions,
        "definition_candidates": definitions,
        "qualification_candidates": qualifications,
        "constraint_candidates": constraints,
    }
|
||||||
def document_to_course(doc: NormalizedDocument, course_title: str) -> NormalizedCourse:
|
def document_to_course(doc: NormalizedDocument, course_title: str) -> NormalizedCourse:
|
||||||
# Conservative mapping: each section becomes a lesson; all lessons go into one module.
|
# Conservative mapping: each section becomes a lesson; all lessons go into one module.
|
||||||
lessons = []
|
lessons = []
|
||||||
|
|
@ -149,6 +264,7 @@ def extract_concept_candidates(course: NormalizedCourse) -> list[ConceptCandidat
|
||||||
seen_ids = set()
|
seen_ids = set()
|
||||||
for module in course.modules:
|
for module in course.modules:
|
||||||
for lesson in module.lessons:
|
for lesson in module.lessons:
|
||||||
|
enrichment = _concept_enrichment(module, lesson)
|
||||||
cid = slugify(lesson.title)
|
cid = slugify(lesson.title)
|
||||||
if cid not in seen_ids:
|
if cid not in seen_ids:
|
||||||
seen_ids.add(cid)
|
seen_ids.add(cid)
|
||||||
|
|
@ -156,11 +272,16 @@ def extract_concept_candidates(course: NormalizedCourse) -> list[ConceptCandidat
|
||||||
ConceptCandidate(
|
ConceptCandidate(
|
||||||
id=cid,
|
id=cid,
|
||||||
title=lesson.title,
|
title=lesson.title,
|
||||||
description=lesson.body[:240].strip(),
|
description=_compact_description(lesson),
|
||||||
source_modules=[module.title],
|
source_modules=[module.title],
|
||||||
source_lessons=[lesson.title],
|
source_lessons=[lesson.title],
|
||||||
source_courses=list(lesson.source_refs),
|
source_courses=list(lesson.source_refs),
|
||||||
mastery_signals=list(lesson.objectives[:3] or lesson.exercises[:2]),
|
mastery_signals=list(lesson.objectives[:3] or lesson.exercises[:2]),
|
||||||
|
source_role=str(enrichment["source_role"]),
|
||||||
|
distinctions=list(enrichment["distinctions"]),
|
||||||
|
definition_candidates=list(enrichment["definition_candidates"]),
|
||||||
|
qualification_candidates=list(enrichment["qualification_candidates"]),
|
||||||
|
constraint_candidates=list(enrichment["constraint_candidates"]),
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
for term in lesson.key_terms:
|
for term in lesson.key_terms:
|
||||||
|
|
@ -179,6 +300,11 @@ def extract_concept_candidates(course: NormalizedCourse) -> list[ConceptCandidat
|
||||||
source_lessons=[lesson.title],
|
source_lessons=[lesson.title],
|
||||||
source_courses=list(lesson.source_refs),
|
source_courses=list(lesson.source_refs),
|
||||||
mastery_signals=list(lesson.objectives[:2]),
|
mastery_signals=list(lesson.objectives[:2]),
|
||||||
|
source_role=str(enrichment["source_role"]),
|
||||||
|
distinctions=list(enrichment["distinctions"][:1]),
|
||||||
|
definition_candidates=list(enrichment["definition_candidates"][:1]),
|
||||||
|
qualification_candidates=list(enrichment["qualification_candidates"][:1]),
|
||||||
|
constraint_candidates=list(enrichment["constraint_candidates"][:1]),
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
return concepts
|
return concepts
|
||||||
|
|
|
||||||
|
|
@ -24,3 +24,30 @@ def test_extract_concepts() -> None:
|
||||||
course = parse_markdown_course(SAMPLE, "Sample Course")
|
course = parse_markdown_course(SAMPLE, "Sample Course")
|
||||||
concepts = extract_concept_candidates(course)
|
concepts = extract_concept_candidates(course)
|
||||||
assert len(concepts) >= 2
|
assert len(concepts) >= 2
|
||||||
|
|
||||||
|
|
||||||
|
def test_extract_concepts_captures_distinctions_and_constraints() -> None:
    """Check that extracted concepts carry the enrichment fields.

    Parses a two-lesson markdown sample and asserts that the entropy lesson
    gets a "nuance" role with distinctions, definitions, and qualifications,
    and that the channel-capacity lesson gets constraint candidates.
    """
    # The fixture is markdown: headings and bullets start at column 0.
    sample = """
# OCW Slice

## Broader Applications
### Thermodynamics and Entropy
- Objective: Explain how thermodynamic entropy relates to, and differs from, Shannon entropy.
- Exercise: Compare the two entropy notions and identify what is preserved across the analogy.
Entropy is a measure of uncertainty in the source model. The analogy is useful but dangerous when used loosely.

### Channel Capacity
- Objective: Explain channel capacity as a limit on reliable communication over a noisy channel.
- Exercise: State why reliable transmission above capacity is impossible in the long run.
"""
    course = parse_markdown_course(sample, "OCW Slice")
    # Index by concept id so each lesson-level concept can be looked up directly.
    concepts = {concept.id: concept for concept in extract_concept_candidates(course)}

    entropy = concepts["thermodynamics-and-entropy"]
    assert entropy.source_role == "nuance"
    assert entropy.distinctions
    assert entropy.definition_candidates
    assert entropy.qualification_candidates

    capacity = concepts["channel-capacity"]
    assert capacity.constraint_candidates
|
||||||
|
|
|
||||||
|
|
@ -1,4 +1,5 @@
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
import yaml
|
||||||
from didactopus.course_ingest import parse_markdown_course, extract_concept_candidates
|
from didactopus.course_ingest import parse_markdown_course, extract_concept_candidates
|
||||||
from didactopus.knowledge_graph import write_knowledge_graph
|
from didactopus.knowledge_graph import write_knowledge_graph
|
||||||
from didactopus.rule_policy import RuleContext, build_default_rules, run_rules
|
from didactopus.rule_policy import RuleContext, build_default_rules, run_rules
|
||||||
|
|
@ -56,3 +57,28 @@ def test_emit_pack_can_write_groundrecall_query_bundle(tmp_path: Path) -> None:
|
||||||
assert "notebook_page.json" in pack_yaml
|
assert "notebook_page.json" in pack_yaml
|
||||||
assert '"bundle_kind": "groundrecall_query_bundle"' in bundle_payload
|
assert '"bundle_kind": "groundrecall_query_bundle"' in bundle_payload
|
||||||
assert '"page_kind": "didactopus_notebook_page"' in notebook_payload
|
assert '"page_kind": "didactopus_notebook_page"' in notebook_payload
|
||||||
|
|
||||||
|
|
||||||
|
def test_emit_pack_preserves_richer_concept_fields(tmp_path: Path) -> None:
    """Check that enrichment fields survive into the written ``concepts.yaml``.

    Runs extraction, the default rules, and draft-pack writing on a one-lesson
    markdown sample, then reads the YAML back and inspects the first concept.
    """
    # The fixture is markdown: headings and bullets start at column 0.
    sample = """
# OCW Slice

## Broader Applications
### Thermodynamics and Entropy
- Objective: Explain how thermodynamic entropy relates to, and differs from, Shannon entropy.
- Exercise: Compare the two entropy notions and identify what is preserved across the analogy.
Entropy is a measure of uncertainty in the source model. The analogy is useful but dangerous when used loosely.
"""
    course = parse_markdown_course(sample, "OCW Slice")
    concepts = extract_concept_candidates(course)
    ctx = RuleContext(course=course, concepts=concepts)
    run_rules(ctx, build_default_rules())
    draft = build_draft_pack(course, ctx.concepts, "Tester", "REVIEW", ctx.review_flags)
    write_draft_pack(draft, tmp_path)

    concepts_yaml = yaml.safe_load((tmp_path / "concepts.yaml").read_text(encoding="utf-8"))
    # NOTE(review): assumes the first entry is the lesson-level concept — confirm ordering is stable.
    concept = concepts_yaml["concepts"][0]
    assert concept["source_role"] == "nuance"
    assert concept["distinctions"]
    assert concept["definition_candidates"]
    assert concept["qualification_candidates"]
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue