127 lines
5.4 KiB
Python
127 lines
5.4 KiB
Python
from __future__ import annotations
|
|
|
|
import re
|
|
from collections import defaultdict
|
|
from .course_schema import NormalizedDocument, NormalizedCourse, Module, Lesson, TopicBundle, ConceptCandidate
|
|
|
|
|
|
def slugify(text: str) -> str:
    """Turn *text* into a lowercase, hyphen-separated ASCII identifier.

    Surrounding whitespace is removed, the text is lowercased, and every run
    of characters outside ``a-z``, ``A-Z`` and ``0-9`` is replaced by a
    single hyphen. Leading and trailing hyphens are dropped.

    Returns:
        The slug, or ``"untitled"`` when nothing is left after cleaning.
    """
    lowered = text.strip().lower()
    hyphenated = re.sub(r"[^a-zA-Z0-9]+", "-", lowered)
    slug = hyphenated.strip("-")
    if not slug:
        return "untitled"
    return slug
|
|
|
|
|
|
def extract_key_terms(text: str, min_term_length: int = 4, max_terms: int = 8) -> list[str]:
    """Return up to *max_terms* distinct capitalized terms found in *text*.

    A term starts with an ASCII uppercase letter and continues with ASCII
    letters, digits or hyphens. Matching is case-sensitive, so ``"Graph"``
    and ``"GRAPH"`` count as different terms.

    Args:
        text: Text to scan.
        min_term_length: Minimum total length of a term, including its
            leading capital. Values below 1 are treated as 1.
        max_terms: Maximum number of terms to return. Zero or a negative
            value yields an empty list.

    Returns:
        The distinct terms in order of first appearance.
    """
    if max_terms <= 0:
        return []
    # The leading capital accounts for one character, so the repeated class
    # must cover the remainder. Clamping keeps the "{n,}" quantifier valid:
    # a negative count would not be parsed as a quantifier at all.
    tail_length = max(min_term_length, 1) - 1
    pattern = r"\b[A-Z][A-Za-z0-9\-]{%d,}\b" % tail_length
    # dict.fromkeys removes duplicates while preserving first-seen order.
    distinct = dict.fromkeys(re.findall(pattern, text))
    return list(distinct)[:max_terms]
|
|
|
|
|
|
def document_to_course(doc: NormalizedDocument, course_title: str) -> NormalizedCourse:
    """Wrap a normalized document in a course containing a single module.

    The mapping is deliberately conservative: every section of *doc* becomes
    one lesson, and all lessons are placed in one module titled after the
    document's source type.

    Within a section body, lines beginning (case-insensitively, ignoring
    surrounding whitespace) with ``objective:`` or ``exercise:`` contribute
    the text after their first colon to the lesson's objectives or exercises.

    Args:
        doc: The document to convert; also recorded as the course's only
            source record.
        course_title: Title given to the resulting course.

    Returns:
        A course with one module holding one lesson per section.
    """

    def tagged(lines: list[str], tag: str) -> list[str]:
        # Text after the first colon of each line whose lowercased, stripped
        # form starts with ``tag``.
        return [
            raw.split(":", 1)[1].strip()
            for raw in lines
            if raw.lower().strip().startswith(tag)
        ]

    converted = []
    for section in doc.sections:
        text = section.body.strip()
        text_lines = text.splitlines()
        lesson_title = section.heading.strip() or "Untitled Lesson"
        converted.append(
            Lesson(
                title=lesson_title,
                body=text,
                objectives=tagged(text_lines, "objective:"),
                exercises=tagged(text_lines, "exercise:"),
                key_terms=extract_key_terms(section.heading + "\n" + text),
                source_refs=[doc.source_path],
            )
        )

    module_title = f"Imported from {doc.source_type.upper()}"
    only_module = Module(title=module_title, lessons=converted)
    return NormalizedCourse(title=course_title, modules=[only_module], source_records=[doc])
|
|
|
|
|
|
def build_topic_bundle(topic_title: str, courses: list[NormalizedCourse]) -> TopicBundle:
    """Group *courses* under *topic_title* in a new ``TopicBundle``.

    The list is passed through as given; it is not copied here.
    """
    bundle = TopicBundle(
        topic_title=topic_title,
        courses=courses,
    )
    return bundle
|
|
|
|
|
|
def _extend_unique(target: list, items: list) -> None:
    """Append each element of *items* to *target* unless already present."""
    for item in items:
        if item not in target:
            target.append(item)


def _merge_lesson_into(existing: Lesson, incoming: Lesson) -> None:
    """Fold *incoming*'s body and list fields into *existing*, in place."""
    # Skip bodies that are empty or already contained verbatim in the target.
    if incoming.body and incoming.body not in existing.body:
        existing.body = (existing.body + "\n\n" + incoming.body).strip()
    _extend_unique(existing.objectives, incoming.objectives)
    _extend_unique(existing.exercises, incoming.exercises)
    _extend_unique(existing.key_terms, incoming.key_terms)
    _extend_unique(existing.source_refs, incoming.source_refs)


def merge_courses_into_topic_course(topic_bundle: TopicBundle, merge_same_named_lessons: bool = True) -> NormalizedCourse:
    """Combine every course of *topic_bundle* into one course.

    Modules with the same title are unified into a single module, in order
    of first appearance. The source records of all courses are concatenated.

    Args:
        topic_bundle: Bundle whose courses are merged; its ``topic_title``
            becomes the title of the result.
        merge_same_named_lessons: When true, lessons sharing a title within
            a unified module are merged into one lesson: bodies are joined
            with a blank line (unless already contained) and objectives,
            exercises, key terms and source refs are unioned in order.
            When false, lessons are simply concatenated.

    Returns:
        A new course. In merge mode its lessons are deep copies, so the
        lessons of the input courses are never modified. With merging
        disabled the lesson objects are shared with the inputs.
    """
    from copy import deepcopy

    modules_by_title: dict[str, Module] = {}
    # Module title -> {lesson title -> merged lesson}. Kept across courses
    # and updated on every insert, so duplicate titles inside one incoming
    # module are merged as well.
    lessons_by_module: dict[str, dict[str, Lesson]] = {}
    source_records = []

    for course in topic_bundle.courses:
        source_records.extend(course.source_records)
        for module in course.modules:
            target_module = modules_by_title.get(module.title)
            if target_module is None:
                target_module = Module(title=module.title, lessons=[])
                modules_by_title[module.title] = target_module

            if not merge_same_named_lessons:
                target_module.lessons.extend(module.lessons)
                continue

            lesson_index = lessons_by_module.setdefault(module.title, {})
            for lesson in module.lessons:
                existing = lesson_index.get(lesson.title)
                if existing is None:
                    # Copy before inserting: this lesson may be mutated by
                    # later merges and must not alias the caller's object.
                    merged = deepcopy(lesson)
                    lesson_index[lesson.title] = merged
                    target_module.lessons.append(merged)
                else:
                    _merge_lesson_into(existing, lesson)

    return NormalizedCourse(
        title=topic_bundle.topic_title,
        modules=list(modules_by_title.values()),
        source_records=source_records,
    )
|
|
|
|
|
|
def extract_concept_candidates(course: NormalizedCourse) -> list[ConceptCandidate]:
    """Derive concept candidates from the lessons and key terms of *course*.

    Each lesson yields one candidate identified by the slug of its title,
    followed by one candidate per key term identified by the slug of the
    term. An identifier is used only once across the whole course: later
    lessons or terms that slugify to an already-used id are skipped.

    Returns:
        Candidates in discovery order (module, then lesson, then the
        lesson's key terms).
    """
    candidates = []
    claimed = set()

    for module in course.modules:
        for lesson in module.lessons:
            lesson_id = slugify(lesson.title)
            if lesson_id not in claimed:
                claimed.add(lesson_id)
                # Prefer the first three objectives; fall back to the first
                # two exercises when the lesson has no objectives.
                signals = lesson.objectives[:3] or lesson.exercises[:2]
                candidates.append(
                    ConceptCandidate(
                        id=lesson_id,
                        title=lesson.title,
                        description=lesson.body[:240].strip(),
                        source_modules=[module.title],
                        source_lessons=[lesson.title],
                        source_courses=list(lesson.source_refs),
                        mastery_signals=list(signals),
                    )
                )

            # Key terms are considered even when the lesson id was a repeat.
            for term in lesson.key_terms:
                term_id = slugify(term)
                if term_id not in claimed:
                    claimed.add(term_id)
                    candidates.append(
                        ConceptCandidate(
                            id=term_id,
                            title=term,
                            description=f"Candidate concept extracted from lesson '{lesson.title}'.",
                            source_modules=[module.title],
                            source_lessons=[lesson.title],
                            source_courses=list(lesson.source_refs),
                            mastery_signals=list(lesson.objectives[:2]),
                        )
                    )

    return candidates
|