diff --git a/src/didactopus/ocw_information_entropy_demo.py b/src/didactopus/ocw_information_entropy_demo.py index a560e01..46d1078 100644 --- a/src/didactopus/ocw_information_entropy_demo.py +++ b/src/didactopus/ocw_information_entropy_demo.py @@ -3,6 +3,7 @@ from __future__ import annotations import json from pathlib import Path import sys +import re from .agentic_loop import AgenticStudentState, integrate_attempt from .artifact_registry import validate_pack @@ -179,6 +180,83 @@ def _merge_source_inventories(primary_path: Path, extra_path: Path, out_path: Pa return out_path +def _slugify(text: str) -> str: + cleaned = re.sub(r"[^a-zA-Z0-9]+", "-", text.strip().lower()).strip("-") + return cleaned or "untitled" + + +def _merge_concept_descriptions(primary: str, secondary: str, max_chars: int = 640) -> str: + primary = (primary or "").strip() + secondary = (secondary or "").strip() + if not primary: + return secondary[:max_chars] + if not secondary or secondary in primary: + return primary[:max_chars] + merged = f"{primary} {secondary}".strip() + return merged[:max_chars] + + +def _merge_unique(existing: list[str], additions: list[str]) -> list[str]: + seen = set(existing) + out = list(existing) + for item in additions: + if item not in seen: + seen.add(item) + out.append(item) + return out + + +def _load_wolfe_concept_alignment(wolfe_snippets_dir: Path | None) -> dict[str, str]: + if wolfe_snippets_dir is None: + return {} + import yaml + + path = wolfe_snippets_dir / "concept-alignment.yaml" + if not path.exists(): + return {} + payload = yaml.safe_load(path.read_text(encoding="utf-8")) or {} + alignments = payload.get("alignments", []) or [] + mapping: dict[str, str] = {} + for item in alignments: + if not isinstance(item, dict): + continue + source_title = str(item.get("source_title", "")).strip() + target_title = str(item.get("target_title", "")).strip() + if source_title and target_title: + mapping[source_title] = target_title + return mapping + + +def _apply_concept_alignment(concepts: list, alignment: dict[str, str]) -> list: + if not alignment: + return concepts + merged_by_id: dict[str, object] = {} + ordered_ids: list[str] = [] + for concept in concepts: + target_title = alignment.get(concept.title, concept.title) + target_id = _slugify(target_title) + if target_id not in merged_by_id: + concept.id = target_id + concept.title = target_title + merged_by_id[target_id] = concept + ordered_ids.append(target_id) + continue + existing = merged_by_id[target_id] + existing.description = _merge_concept_descriptions(existing.description, concept.description) + existing.source_modules = _merge_unique(existing.source_modules, concept.source_modules) + existing.source_lessons = _merge_unique(existing.source_lessons, concept.source_lessons) + existing.source_courses = _merge_unique(existing.source_courses, concept.source_courses) + existing.prerequisites = _merge_unique(existing.prerequisites, concept.prerequisites) + existing.mastery_signals = _merge_unique(existing.mastery_signals, concept.mastery_signals) + existing.distinctions = _merge_unique(existing.distinctions, concept.distinctions) + existing.definition_candidates = _merge_unique(existing.definition_candidates, concept.definition_candidates) + existing.qualification_candidates = _merge_unique(existing.qualification_candidates, concept.qualification_candidates) + existing.constraint_candidates = _merge_unique(existing.constraint_candidates, concept.constraint_candidates) + if existing.source_role != "nuance" and concept.source_role == "nuance": + existing.source_role = concept.source_role + return [merged_by_id[item] for item in ordered_ids] + + def resolve_ocw_demo_paths( root: Path, course_repo: str | Path | None = None, @@ -251,7 +329,11 @@ def run_ocw_information_entropy_demo( docs = adapt_documents(course_source) wolfe_doc_count = 0 if wolfe_snippets_dir is not None and wolfe_snippets_dir.exists(): - wolfe_docs = adapt_documents(wolfe_snippets_dir) + wolfe_docs = [ + doc + for doc in adapt_documents(wolfe_snippets_dir) + if Path(getattr(doc, "source_path", "")).name not in {"concept-alignment.yaml", "concept-alignment.yml", "concept-alignment.json"} + ] docs.extend(wolfe_docs) wolfe_doc_count = len(wolfe_docs) if wolfe_doc_count and not (wolfe_source_inventory is not None and wolfe_source_inventory.exists()): @@ -267,6 +349,7 @@ def run_ocw_information_entropy_demo( merged.rights_note = DEFAULT_RIGHTS_NOTE concepts = extract_concept_candidates(merged) + concepts = _apply_concept_alignment(concepts, _load_wolfe_concept_alignment(wolfe_snippets_dir)) ctx = RuleContext(course=merged, concepts=concepts) run_rules(ctx, build_default_rules()) if review_flag: diff --git a/tests/test_ocw_information_entropy_demo.py b/tests/test_ocw_information_entropy_demo.py index 5e0e5f9..470ccb2 100644 --- a/tests/test_ocw_information_entropy_demo.py +++ b/tests/test_ocw_information_entropy_demo.py @@ -96,6 +96,16 @@ def test_ocw_demo_can_apply_wolfe_snippet_augmentation(tmp_path: Path) -> None: ), encoding="utf-8", ) + (wolfe_dir / "concept-alignment.yaml").write_text( + "\n".join( + [ + "alignments:", + " - source_title: Entropy Comparison", + " target_title: Thermodynamics and Entropy", + ] + ), + encoding="utf-8", + ) summary = run_ocw_information_entropy_demo( course_source=source_dir, @@ -109,7 +119,9 @@ def test_ocw_demo_can_apply_wolfe_snippet_augmentation(tmp_path: Path) -> None: bundle = json.loads((tmp_path / "pack" / "groundrecall_query_bundle.json").read_text(encoding="utf-8")) manifest = json.loads((tmp_path / "pack" / "pack_compliance_manifest.json").read_text(encoding="utf-8")) + concept_titles = [item["title"] for item in (json.loads((tmp_path / "pack" / "knowledge_graph.json").read_text(encoding="utf-8"))["nodes"]) if item.get("type") == "concept"] assert summary["wolfe_source_document_count"] == 1 assert summary["source_document_count"] == 2 assert "wolfe-local-snippet" in manifest["derived_from_sources"] assert bundle["bundle_kind"] == "groundrecall_query_bundle" + assert "Entropy Comparison" not in concept_titles