diff --git a/src/didactopus/artifact_schemas.py b/src/didactopus/artifact_schemas.py
index aedd100..a209701 100644
--- a/src/didactopus/artifact_schemas.py
+++ b/src/didactopus/artifact_schemas.py
@@ -47,6 +47,11 @@ class ConceptEntry(BaseModel):
     description: str = ""
     prerequisites: list[str] = Field(default_factory=list)
     mastery_signals: list[str] = Field(default_factory=list)
+    source_role: str = ""
+    distinctions: list[str] = Field(default_factory=list)
+    definition_candidates: list[str] = Field(default_factory=list)
+    qualification_candidates: list[str] = Field(default_factory=list)
+    constraint_candidates: list[str] = Field(default_factory=list)
     mastery_profile: MasteryProfileSpec = Field(default_factory=MasteryProfileSpec)
 
 
diff --git a/src/didactopus/course_ingest.py b/src/didactopus/course_ingest.py
index 1e095a5..7fe9891 100644
--- a/src/didactopus/course_ingest.py
+++ b/src/didactopus/course_ingest.py
@@ -6,6 +6,7 @@ from .course_schema import NormalizedCourse, NormalizedSourceRecord, Module, Les
 
 HEADING_RE = re.compile(r"^(#{1,3})\s+(.*)$")
 BULLET_RE = re.compile(r"^\s*[-*+]\s+(.*)$")
+SENTENCE_SPLIT_RE = re.compile(r"(?<=[.!?])\s+")
 
 
 def slugify(text: str) -> str:
@@ -43,6 +44,143 @@ def extract_key_terms(text: str, min_term_length: int = 4, max_terms: int = 8) -
     return ordered
 
 
+def _lesson_sentences(lesson: Lesson) -> list[str]:
+    """Return the lesson's body, objectives and exercises as normalized sentences."""
+    sentences: list[str] = []
+    # Split every part on its own: objectives and exercises often lack closing
+    # punctuation, and splitting the joined text would fuse such a part with
+    # whatever follows it.
+    for part in (lesson.body, *lesson.objectives, *lesson.exercises):
+        for chunk in SENTENCE_SPLIT_RE.split(part or ""):
+            text = " ".join(chunk.split()).strip(" -")
+            if text:
+                sentences.append(text)
+    return sentences
+
+
+def _compact_description(lesson: Lesson, max_chars: int = 320) -> str:
+    """Build a short description from the first one or two lesson sentences.
+
+    Falls back to the lesson title when the lesson has no text. The result is
+    cut to at most ``max_chars`` characters.
+    """
+    sentences = _lesson_sentences(lesson)
+    if not sentences:
+        return lesson.title
+    out = []
+    total = 0
+    for sentence in sentences:
+        candidate = sentence if sentence.endswith((".", "!", "?")) else f"{sentence}."
+        if out and total + 1 + len(candidate) > max_chars:
+            break
+        out.append(candidate)
+        total += len(candidate) + (1 if out[:-1] else 0)
+        if len(out) >= 2:
+            break
+    return " ".join(out).strip()[:max_chars]
+
+
+def _extract_sentences_by_patterns(sentences: list[str], patterns: list[str], max_items: int = 3) -> list[str]:
+    """Return up to ``max_items`` sentences that match any regex in ``patterns``.
+
+    Matching is case-insensitive, source order is kept, and a sentence that
+    only differs in case from an earlier hit is skipped.
+    """
+    compiled = [re.compile(pattern, re.IGNORECASE) for pattern in patterns]
+    out: list[str] = []
+    seen: set[str] = set()
+    for sentence in sentences:
+        lowered = sentence.lower()
+        if lowered in seen:
+            continue
+        if any(pattern.search(sentence) for pattern in compiled):
+            seen.add(lowered)
+            out.append(sentence)
+        if len(out) >= max_items:
+            break
+    return out
+
+
+def _infer_source_role(module: Module, lesson: Lesson, distinctions: list[str], qualifications: list[str], constraints: list[str]) -> str:
+    """Classify a lesson as ``"nuance"``, ``"overview"`` or ``"mechanism"``.
+
+    Any extracted distinction, qualification or constraint yields ``"nuance"``.
+    Otherwise keyword hits in the module title, lesson title and lesson body
+    decide, with ``"overview"`` as the fallback.
+    """
+    title_blob = " ".join([module.title, lesson.title, lesson.body]).lower()
+    if distinctions or qualifications or constraints:
+        return "nuance"
+    if any(token in title_blob for token in ("foundation", "background", "course identity", "course description", "reading base", "learning norms")):
+        return "overview"
+    if any(token in title_blob for token in ("coding", "capacity", "compression", "error-correcting", "error correcting", "mutual information", "reversible", "quantum", "cryptography", "noise")):
+        return "mechanism"
+    return "overview"
+
+
+def _concept_enrichment(module: Module, lesson: Lesson) -> dict[str, list[str] | str]:
+    """Collect heuristic enrichment fields for the concepts of one lesson.
+
+    Returns a dict holding ``source_role`` (a string) and the sentence lists
+    ``distinctions``, ``definition_candidates``, ``qualification_candidates``
+    and ``constraint_candidates``.
+    """
+    sentences = _lesson_sentences(lesson)
+    distinctions = _extract_sentences_by_patterns(
+        sentences,
+        [
+            r"\bcompare\b",
+            r"\bcontrast\b",
+            r"\bdistinguish\b",
+            r"\bdiffer(?:ent|s)?\b",
+            r"\brelat(?:e|es|ed)\b.+\band\b",
+            r"\bnot\b.+\bbut\b",
+            r"\bversus\b|\bvs\.?\b",
+        ],
+    )
+    definitions = _extract_sentences_by_patterns(
+        sentences,
+        [
+            r"\bis (?:a|an|the)\b",
+            r"\bmeasure of\b",
+            r"\brefers to\b",
+            r"\bdefined as\b",
+            r"\btreated as\b",
+        ],
+    )
+    qualifications = _extract_sentences_by_patterns(
+        sentences,
+        [
+            r"\bbut\b",
+            r"\bhowever\b",
+            r"\bwhile\b",
+            r"\balthough\b",
+            r"\bcareful\b",
+            r"\bnot identical\b",
+            r"\bdangerous\b",
+        ],
+    )
+    constraints = _extract_sentences_by_patterns(
+        sentences,
+        [
+            r"\bimpossible\b",
+            r"\blimit(?:s)?\b",
+            r"\bfailure mode(?:s)?\b",
+            r"\bcannot\b",
+            r"\bonly up to\b",
+            r"\bin the long run\b",
+            r"\babove capacity\b",
+        ],
+    )
+    return {
+        "source_role": _infer_source_role(module, lesson, distinctions, qualifications, constraints),
+        "distinctions": distinctions,
+        "definition_candidates": definitions,
+        "qualification_candidates": qualifications,
+        "constraint_candidates": constraints,
+    }
+
+
 def parse_markdown_like(text: str, title: str, source_name: str, source_path: str) -> NormalizedSourceRecord:
     lines = text.splitlines()
     modules: list[Module] = []
@@ -167,6 +305,7 @@ def extract_concept_candidates(course: NormalizedCourse) -> list[ConceptCandidat
     seen_ids: set[str] = set()
     for module in course.modules:
         for lesson in module.lessons:
+            enrichment = _concept_enrichment(module, lesson)
             title_id = slugify(lesson.title)
             if title_id not in seen_ids:
                 seen_ids.add(title_id)
@@ -174,10 +313,16 @@ def extract_concept_candidates(course: NormalizedCourse) -> list[ConceptCandidat
                     ConceptCandidate(
                         id=title_id,
                         title=lesson.title,
-                        description=lesson.body[:240].strip(),
+                        description=_compact_description(lesson),
                         source_modules=[module.title],
                         source_lessons=[lesson.title],
+                        source_courses=list(lesson.source_refs),
                         mastery_signals=list(lesson.objectives[:3] or lesson.exercises[:2]),
+                        source_role=str(enrichment["source_role"]),
+                        distinctions=list(enrichment["distinctions"]),
+                        definition_candidates=list(enrichment["definition_candidates"]),
+                        qualification_candidates=list(enrichment["qualification_candidates"]),
+                        constraint_candidates=list(enrichment["constraint_candidates"]),
                     )
                 )
             for term in lesson.key_terms:
@@ -192,7 +337,13 @@ def extract_concept_candidates(course: NormalizedCourse) -> list[ConceptCandidat
                         description=f"Candidate concept extracted from lesson '{lesson.title}'.",
                         source_modules=[module.title],
                         source_lessons=[lesson.title],
+                        source_courses=list(lesson.source_refs),
                         mastery_signals=list(lesson.objectives[:2]),
+                        source_role=str(enrichment["source_role"]),
+                        distinctions=list(enrichment["distinctions"][:1]),
+                        definition_candidates=list(enrichment["definition_candidates"][:1]),
+                        qualification_candidates=list(enrichment["qualification_candidates"][:1]),
+                        constraint_candidates=list(enrichment["constraint_candidates"][:1]),
                     )
                 )
     return concepts
diff --git a/src/didactopus/course_schema.py b/src/didactopus/course_schema.py
index ff0fb0c..722285d 100644
--- a/src/didactopus/course_schema.py
+++ b/src/didactopus/course_schema.py
@@ -58,6 +58,11 @@ class ConceptCandidate(BaseModel):
     source_courses: list[str] = Field(default_factory=list)
     prerequisites: list[str] = Field(default_factory=list)
     mastery_signals: list[str] = Field(default_factory=list)
+    source_role: str = ""
+    distinctions: list[str] = Field(default_factory=list)
+    definition_candidates: list[str] = Field(default_factory=list)
+    qualification_candidates: list[str] = Field(default_factory=list)
+    constraint_candidates: list[str] = Field(default_factory=list)
 
 
 class DraftPack(BaseModel):
diff --git a/src/didactopus/pack_emitter.py b/src/didactopus/pack_emitter.py
index 529636f..7a09bcc 100644
--- a/src/didactopus/pack_emitter.py
+++ b/src/didactopus/pack_emitter.py
@@ -102,6 +102,11 @@ def build_draft_pack(
                 "description": c.description,
                 "prerequisites": c.prerequisites,
                 "mastery_signals": c.mastery_signals,
+                "source_role": c.source_role,
+                "distinctions": c.distinctions,
+                "definition_candidates": c.definition_candidates,
+                "qualification_candidates": c.qualification_candidates,
+                "constraint_candidates": c.constraint_candidates,
                 "mastery_profile": {},
             }
             for c in concepts
diff --git a/src/didactopus/review_export.py b/src/didactopus/review_export.py
index 9d28035..14f86db 100644
--- a/src/didactopus/review_export.py
+++ b/src/didactopus/review_export.py
@@ -24,6 +24,11 @@ def export_promoted_pack(session: ReviewSession, outdir: str | Path) -> None:
                 "description": concept.description,
                 "prerequisites": concept.prerequisites,
                 "mastery_signals": concept.mastery_signals,
+                "source_role": concept.source_role,
+                "distinctions": concept.distinctions,
+                "definition_candidates": concept.definition_candidates,
+                "qualification_candidates": concept.qualification_candidates,
+                "constraint_candidates": concept.constraint_candidates,
                 "status": concept.status,
                 "notes": concept.notes,
                 "mastery_profile": {},
diff --git a/src/didactopus/review_loader.py b/src/didactopus/review_loader.py
index e21269f..407ecf1 100644
--- a/src/didactopus/review_loader.py
+++ b/src/didactopus/review_loader.py
@@ -15,6 +15,11 @@ def load_draft_pack(pack_dir: str | Path) -> DraftPackData:
                 description=item.get("description", ""),
                 prerequisites=list(item.get("prerequisites", [])),
                 mastery_signals=list(item.get("mastery_signals", [])),
+                source_role=item.get("source_role", ""),
+                distinctions=list(item.get("distinctions", [])),
+                definition_candidates=list(item.get("definition_candidates", [])),
+                qualification_candidates=list(item.get("qualification_candidates", [])),
+                constraint_candidates=list(item.get("constraint_candidates", [])),
                 status=item.get("status", "needs_review"),
                 notes=list(item.get("notes", [])),
             )
diff --git a/src/didactopus/review_schema.py b/src/didactopus/review_schema.py
index 352940e..3482c88 100644
--- a/src/didactopus/review_schema.py
+++ b/src/didactopus/review_schema.py
@@ -10,6 +10,11 @@ class ConceptReviewEntry(BaseModel):
     description: str = ""
     prerequisites: list[str] = Field(default_factory=list)
     mastery_signals: list[str] = Field(default_factory=list)
+    source_role: str = ""
+    distinctions: list[str] = Field(default_factory=list)
+    definition_candidates: list[str] = Field(default_factory=list)
+    qualification_candidates: list[str] = Field(default_factory=list)
+    constraint_candidates: list[str] = Field(default_factory=list)
     status: TrustStatus = "needs_review"
     notes: list[str] = Field(default_factory=list)
 
diff --git a/src/didactopus/topic_ingest.py b/src/didactopus/topic_ingest.py
index 7823d39..bf873ba 100644
--- a/src/didactopus/topic_ingest.py
+++ b/src/didactopus/topic_ingest.py
@@ -3,6 +3,8 @@ from __future__ import annotations
 import re
 from collections import defaultdict
 from .course_schema import NormalizedDocument, NormalizedCourse, Module, Lesson, TopicBundle, ConceptCandidate
+
+SENTENCE_SPLIT_RE = re.compile(r"(?<=[.!?])\s+")
 
 GENERIC_TERM_STOPWORDS = {
     "attribution",
@@ -77,6 +79,145 @@ def _parse_signal_line(line: str) -> tuple[str | None, str]:
     if lowered.startswith("exercise:"):
         return "exercise", stripped.split(":", 1)[1].strip()
     return None, stripped
+
+
+def _lesson_sentences(lesson: Lesson) -> list[str]:
+    """Return the lesson's body, objectives and exercises as normalized sentences."""
+    sentences: list[str] = []
+    # Split every part on its own: objectives and exercises often lack closing
+    # punctuation, and splitting the joined text would fuse such a part with
+    # whatever follows it.
+    for part in (lesson.body, *lesson.objectives, *lesson.exercises):
+        for chunk in SENTENCE_SPLIT_RE.split(part or ""):
+            text = " ".join(chunk.split()).strip(" -")
+            if text:
+                sentences.append(text)
+    return sentences
+
+
+def _compact_description(lesson: Lesson, max_chars: int = 320) -> str:
+    """Build a short description from the first one or two lesson sentences.
+
+    Falls back to the lesson title when the lesson has no text. The result is
+    cut to at most ``max_chars`` characters.
+    """
+    sentences = _lesson_sentences(lesson)
+    if not sentences:
+        return lesson.title
+    out = []
+    total = 0
+    for sentence in sentences:
+        candidate = sentence if sentence.endswith((".", "!", "?")) else f"{sentence}."
+        if out and total + 1 + len(candidate) > max_chars:
+            break
+        out.append(candidate)
+        total += len(candidate) + (1 if out[:-1] else 0)
+        if len(out) >= 2:
+            break
+    return " ".join(out).strip()[:max_chars]
+
+
+def _extract_sentences_by_patterns(sentences: list[str], patterns: list[str], max_items: int = 3) -> list[str]:
+    """Return up to ``max_items`` sentences that match any regex in ``patterns``.
+
+    Matching is case-insensitive, source order is kept, and a sentence that
+    only differs in case from an earlier hit is skipped.
+    """
+    compiled = [re.compile(pattern, re.IGNORECASE) for pattern in patterns]
+    out: list[str] = []
+    seen: set[str] = set()
+    for sentence in sentences:
+        lowered = sentence.lower()
+        if lowered in seen:
+            continue
+        if any(pattern.search(sentence) for pattern in compiled):
+            seen.add(lowered)
+            out.append(sentence)
+        if len(out) >= max_items:
+            break
+    return out
+
+
+def _infer_source_role(module: Module, lesson: Lesson, distinctions: list[str], qualifications: list[str], constraints: list[str]) -> str:
+    """Classify a lesson as ``"nuance"``, ``"overview"`` or ``"mechanism"``.
+
+    Any extracted distinction, qualification or constraint yields ``"nuance"``.
+    Otherwise keyword hits in the module title, lesson title and lesson body
+    decide, with ``"overview"`` as the fallback.
+    """
+    title_blob = " ".join([module.title, lesson.title, lesson.body]).lower()
+    if distinctions or qualifications or constraints:
+        return "nuance"
+    if any(token in title_blob for token in ("foundation", "overview", "introduction", "background")):
+        return "overview"
+    if any(token in title_blob for token in ("method", "model", "test", "mechanism", "process", "coding", "capacity")):
+        return "mechanism"
+    return "overview"
+
+
+def _concept_enrichment(module: Module, lesson: Lesson) -> dict[str, list[str] | str]:
+    """Collect heuristic enrichment fields for the concepts of one lesson.
+
+    Returns a dict holding ``source_role`` (a string) and the sentence lists
+    ``distinctions``, ``definition_candidates``, ``qualification_candidates``
+    and ``constraint_candidates``.
+    """
+    sentences = _lesson_sentences(lesson)
+    distinctions = _extract_sentences_by_patterns(
+        sentences,
+        [
+            r"\bcompare\b",
+            r"\bcontrast\b",
+            r"\bdistinguish\b",
+            r"\bdiffer(?:ent|s)?\b",
+            r"\brelat(?:e|es|ed)\b.+\band\b",
+            r"\bnot\b.+\bbut\b",
+            r"\bversus\b|\bvs\.?\b",
+        ],
+    )
+    definitions = _extract_sentences_by_patterns(
+        sentences,
+        [
+            r"\bis (?:a|an|the)\b",
+            r"\bmeasure of\b",
+            r"\brefers to\b",
+            r"\bdefined as\b",
+            r"\btreated as\b",
+        ],
+    )
+    qualifications = _extract_sentences_by_patterns(
+        sentences,
+        [
+            r"\bbut\b",
+            r"\bhowever\b",
+            r"\bwhile\b",
+            r"\balthough\b",
+            r"\bcareful\b",
+            r"\bnot identical\b",
+            r"\bdangerous\b",
+        ],
+    )
+    constraints = _extract_sentences_by_patterns(
+        sentences,
+        [
+            r"\bimpossible\b",
+            r"\blimit(?:s)?\b",
+            r"\bfailure mode(?:s)?\b",
+            r"\bcannot\b",
+            r"\bonly up to\b",
+            r"\bin the long run\b",
+            r"\babove capacity\b",
+        ],
+    )
+    return {
+        "source_role": _infer_source_role(module, lesson, distinctions, qualifications, constraints),
+        "distinctions": distinctions,
+        "definition_candidates": definitions,
+        "qualification_candidates": qualifications,
+        "constraint_candidates": constraints,
+    }
+
+
 def document_to_course(doc: NormalizedDocument, course_title: str) -> NormalizedCourse:
     # Conservative mapping: each section becomes a lesson; all lessons go into one module.
     lessons = []
@@ -149,6 +290,7 @@ def extract_concept_candidates(course: NormalizedCourse) -> list[ConceptCandidat
     seen_ids = set()
     for module in course.modules:
         for lesson in module.lessons:
+            enrichment = _concept_enrichment(module, lesson)
             cid = slugify(lesson.title)
             if cid not in seen_ids:
                 seen_ids.add(cid)
@@ -156,11 +298,16 @@ def extract_concept_candidates(course: NormalizedCourse) -> list[ConceptCandidat
                     ConceptCandidate(
                         id=cid,
                         title=lesson.title,
-                        description=lesson.body[:240].strip(),
+                        description=_compact_description(lesson),
                         source_modules=[module.title],
                         source_lessons=[lesson.title],
                         source_courses=list(lesson.source_refs),
                         mastery_signals=list(lesson.objectives[:3] or lesson.exercises[:2]),
+                        source_role=str(enrichment["source_role"]),
+                        distinctions=list(enrichment["distinctions"]),
+                        definition_candidates=list(enrichment["definition_candidates"]),
+                        qualification_candidates=list(enrichment["qualification_candidates"]),
+                        constraint_candidates=list(enrichment["constraint_candidates"]),
                     )
                 )
             for term in lesson.key_terms:
@@ -179,6 +326,11 @@ def extract_concept_candidates(course: NormalizedCourse) -> list[ConceptCandidat
                         source_lessons=[lesson.title],
                         source_courses=list(lesson.source_refs),
                         mastery_signals=list(lesson.objectives[:2]),
+                        source_role=str(enrichment["source_role"]),
+                        distinctions=list(enrichment["distinctions"][:1]),
+                        definition_candidates=list(enrichment["definition_candidates"][:1]),
+                        qualification_candidates=list(enrichment["qualification_candidates"][:1]),
+                        constraint_candidates=list(enrichment["constraint_candidates"][:1]),
                     )
                 )
     return concepts
diff --git a/tests/test_course_ingest.py b/tests/test_course_ingest.py
index 5d39d82..a64d242 100644
--- a/tests/test_course_ingest.py
+++ b/tests/test_course_ingest.py
@@ -24,3 +24,30 @@ def test_extract_concepts() -> None:
     course = parse_markdown_course(SAMPLE, "Sample Course")
     concepts = extract_concept_candidates(course)
     assert len(concepts) >= 2
+
+
+def test_extract_concepts_captures_distinctions_and_constraints() -> None:
+    sample = """
+# OCW Slice
+
+## Broader Applications
+### Thermodynamics and Entropy
+- Objective: Explain how thermodynamic entropy relates to, and differs from, Shannon entropy.
+- Exercise: Compare the two entropy notions and identify what is preserved across the analogy.
+Entropy is a measure of uncertainty in the source model. The analogy is useful but dangerous when used loosely.
+
+### Channel Capacity
+- Objective: Explain channel capacity as a limit on reliable communication over a noisy channel.
+- Exercise: State why reliable transmission above capacity is impossible in the long run.
+"""
+    course = parse_markdown_course(sample, "OCW Slice")
+    concepts = {concept.id: concept for concept in extract_concept_candidates(course)}
+
+    entropy = concepts["thermodynamics-and-entropy"]
+    assert entropy.source_role == "nuance"
+    assert entropy.distinctions
+    assert entropy.definition_candidates
+    assert entropy.qualification_candidates
+
+    capacity = concepts["channel-capacity"]
+    assert capacity.constraint_candidates
diff --git a/tests/test_pack_emitter.py b/tests/test_pack_emitter.py
index 1e80fae..673bb69 100644
--- a/tests/test_pack_emitter.py
+++ b/tests/test_pack_emitter.py
@@ -1,4 +1,5 @@
 from pathlib import Path
+import yaml
 from didactopus.course_ingest import parse_markdown_course, extract_concept_candidates
 from didactopus.knowledge_graph import write_knowledge_graph
 from didactopus.rule_policy import RuleContext, build_default_rules, run_rules
@@ -56,3 +57,28 @@ def test_emit_pack_can_write_groundrecall_query_bundle(tmp_path: Path) -> None:
     assert "notebook_page.json" in pack_yaml
     assert '"bundle_kind": "groundrecall_query_bundle"' in bundle_payload
     assert '"page_kind": "didactopus_notebook_page"' in notebook_payload
+
+
+def test_emit_pack_preserves_richer_concept_fields(tmp_path: Path) -> None:
+    sample = """
+# OCW Slice
+
+## Broader Applications
+### Thermodynamics and Entropy
+- Objective: Explain how thermodynamic entropy relates to, and differs from, Shannon entropy.
+- Exercise: Compare the two entropy notions and identify what is preserved across the analogy.
+Entropy is a measure of uncertainty in the source model. The analogy is useful but dangerous when used loosely.
+"""
+    course = parse_markdown_course(sample, "OCW Slice")
+    concepts = extract_concept_candidates(course)
+    ctx = RuleContext(course=course, concepts=concepts)
+    run_rules(ctx, build_default_rules())
+    draft = build_draft_pack(course, ctx.concepts, "Tester", "REVIEW", ctx.review_flags)
+    write_draft_pack(draft, tmp_path)
+
+    concepts_yaml = yaml.safe_load((tmp_path / "concepts.yaml").read_text(encoding="utf-8"))
+    concept = concepts_yaml["concepts"][0]
+    assert concept["source_role"] == "nuance"
+    assert concept["distinctions"]
+    assert concept["definition_candidates"]
+    assert concept["qualification_candidates"]