Align Wolfe augmentation to OCW concepts
This commit is contained in:
parent
389a3dfdf1
commit
6e660187f6
|
|
@ -3,6 +3,7 @@ from __future__ import annotations
|
||||||
import json
|
import json
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
import sys
|
import sys
|
||||||
|
import re
|
||||||
|
|
||||||
from .agentic_loop import AgenticStudentState, integrate_attempt
|
from .agentic_loop import AgenticStudentState, integrate_attempt
|
||||||
from .artifact_registry import validate_pack
|
from .artifact_registry import validate_pack
|
||||||
|
|
@ -179,6 +180,83 @@ def _merge_source_inventories(primary_path: Path, extra_path: Path, out_path: Pa
|
||||||
return out_path
|
return out_path
|
||||||
|
|
||||||
|
|
||||||
|
def _slugify(text: str) -> str:
|
||||||
|
cleaned = re.sub(r"[^a-zA-Z0-9]+", "-", text.strip().lower()).strip("-")
|
||||||
|
return cleaned or "untitled"
|
||||||
|
|
||||||
|
|
||||||
|
def _merge_concept_descriptions(primary: str, secondary: str, max_chars: int = 640) -> str:
|
||||||
|
primary = (primary or "").strip()
|
||||||
|
secondary = (secondary or "").strip()
|
||||||
|
if not primary:
|
||||||
|
return secondary[:max_chars]
|
||||||
|
if not secondary or secondary in primary:
|
||||||
|
return primary[:max_chars]
|
||||||
|
merged = f"{primary} {secondary}".strip()
|
||||||
|
return merged[:max_chars]
|
||||||
|
|
||||||
|
|
||||||
|
def _merge_unique(existing: list[str], additions: list[str]) -> list[str]:
|
||||||
|
seen = set(existing)
|
||||||
|
out = list(existing)
|
||||||
|
for item in additions:
|
||||||
|
if item not in seen:
|
||||||
|
seen.add(item)
|
||||||
|
out.append(item)
|
||||||
|
return out
|
||||||
|
|
||||||
|
|
||||||
|
def _load_wolfe_concept_alignment(wolfe_snippets_dir: Path | None) -> dict[str, str]:
|
||||||
|
if wolfe_snippets_dir is None:
|
||||||
|
return {}
|
||||||
|
import yaml
|
||||||
|
|
||||||
|
path = wolfe_snippets_dir / "concept-alignment.yaml"
|
||||||
|
if not path.exists():
|
||||||
|
return {}
|
||||||
|
payload = yaml.safe_load(path.read_text(encoding="utf-8")) or {}
|
||||||
|
alignments = payload.get("alignments", []) or []
|
||||||
|
mapping: dict[str, str] = {}
|
||||||
|
for item in alignments:
|
||||||
|
if not isinstance(item, dict):
|
||||||
|
continue
|
||||||
|
source_title = str(item.get("source_title", "")).strip()
|
||||||
|
target_title = str(item.get("target_title", "")).strip()
|
||||||
|
if source_title and target_title:
|
||||||
|
mapping[source_title] = target_title
|
||||||
|
return mapping
|
||||||
|
|
||||||
|
|
||||||
|
def _apply_concept_alignment(concepts: list, alignment: dict[str, str]) -> list:
|
||||||
|
if not alignment:
|
||||||
|
return concepts
|
||||||
|
merged_by_id: dict[str, object] = {}
|
||||||
|
ordered_ids: list[str] = []
|
||||||
|
for concept in concepts:
|
||||||
|
target_title = alignment.get(concept.title, concept.title)
|
||||||
|
target_id = _slugify(target_title)
|
||||||
|
if target_id not in merged_by_id:
|
||||||
|
concept.id = target_id
|
||||||
|
concept.title = target_title
|
||||||
|
merged_by_id[target_id] = concept
|
||||||
|
ordered_ids.append(target_id)
|
||||||
|
continue
|
||||||
|
existing = merged_by_id[target_id]
|
||||||
|
existing.description = _merge_concept_descriptions(existing.description, concept.description)
|
||||||
|
existing.source_modules = _merge_unique(existing.source_modules, concept.source_modules)
|
||||||
|
existing.source_lessons = _merge_unique(existing.source_lessons, concept.source_lessons)
|
||||||
|
existing.source_courses = _merge_unique(existing.source_courses, concept.source_courses)
|
||||||
|
existing.prerequisites = _merge_unique(existing.prerequisites, concept.prerequisites)
|
||||||
|
existing.mastery_signals = _merge_unique(existing.mastery_signals, concept.mastery_signals)
|
||||||
|
existing.distinctions = _merge_unique(existing.distinctions, concept.distinctions)
|
||||||
|
existing.definition_candidates = _merge_unique(existing.definition_candidates, concept.definition_candidates)
|
||||||
|
existing.qualification_candidates = _merge_unique(existing.qualification_candidates, concept.qualification_candidates)
|
||||||
|
existing.constraint_candidates = _merge_unique(existing.constraint_candidates, concept.constraint_candidates)
|
||||||
|
if existing.source_role != "nuance" and concept.source_role == "nuance":
|
||||||
|
existing.source_role = concept.source_role
|
||||||
|
return [merged_by_id[item] for item in ordered_ids]
|
||||||
|
|
||||||
|
|
||||||
def resolve_ocw_demo_paths(
|
def resolve_ocw_demo_paths(
|
||||||
root: Path,
|
root: Path,
|
||||||
course_repo: str | Path | None = None,
|
course_repo: str | Path | None = None,
|
||||||
|
|
@ -251,7 +329,11 @@ def run_ocw_information_entropy_demo(
|
||||||
docs = adapt_documents(course_source)
|
docs = adapt_documents(course_source)
|
||||||
wolfe_doc_count = 0
|
wolfe_doc_count = 0
|
||||||
if wolfe_snippets_dir is not None and wolfe_snippets_dir.exists():
|
if wolfe_snippets_dir is not None and wolfe_snippets_dir.exists():
|
||||||
wolfe_docs = adapt_documents(wolfe_snippets_dir)
|
wolfe_docs = [
|
||||||
|
doc
|
||||||
|
for doc in adapt_documents(wolfe_snippets_dir)
|
||||||
|
if Path(getattr(doc, "source_path", "")).name not in {"concept-alignment.yaml", "concept-alignment.yml", "concept-alignment.json"}
|
||||||
|
]
|
||||||
docs.extend(wolfe_docs)
|
docs.extend(wolfe_docs)
|
||||||
wolfe_doc_count = len(wolfe_docs)
|
wolfe_doc_count = len(wolfe_docs)
|
||||||
if wolfe_doc_count and not (wolfe_source_inventory is not None and wolfe_source_inventory.exists()):
|
if wolfe_doc_count and not (wolfe_source_inventory is not None and wolfe_source_inventory.exists()):
|
||||||
|
|
@ -267,6 +349,7 @@ def run_ocw_information_entropy_demo(
|
||||||
merged.rights_note = DEFAULT_RIGHTS_NOTE
|
merged.rights_note = DEFAULT_RIGHTS_NOTE
|
||||||
|
|
||||||
concepts = extract_concept_candidates(merged)
|
concepts = extract_concept_candidates(merged)
|
||||||
|
concepts = _apply_concept_alignment(concepts, _load_wolfe_concept_alignment(wolfe_snippets_dir))
|
||||||
ctx = RuleContext(course=merged, concepts=concepts)
|
ctx = RuleContext(course=merged, concepts=concepts)
|
||||||
run_rules(ctx, build_default_rules())
|
run_rules(ctx, build_default_rules())
|
||||||
if review_flag:
|
if review_flag:
|
||||||
|
|
|
||||||
|
|
@ -96,6 +96,16 @@ def test_ocw_demo_can_apply_wolfe_snippet_augmentation(tmp_path: Path) -> None:
|
||||||
),
|
),
|
||||||
encoding="utf-8",
|
encoding="utf-8",
|
||||||
)
|
)
|
||||||
|
(wolfe_dir / "concept-alignment.yaml").write_text(
|
||||||
|
"\n".join(
|
||||||
|
[
|
||||||
|
"alignments:",
|
||||||
|
" - source_title: Entropy Comparison",
|
||||||
|
" target_title: Thermodynamics and Entropy",
|
||||||
|
]
|
||||||
|
),
|
||||||
|
encoding="utf-8",
|
||||||
|
)
|
||||||
|
|
||||||
summary = run_ocw_information_entropy_demo(
|
summary = run_ocw_information_entropy_demo(
|
||||||
course_source=source_dir,
|
course_source=source_dir,
|
||||||
|
|
@ -109,7 +119,9 @@ def test_ocw_demo_can_apply_wolfe_snippet_augmentation(tmp_path: Path) -> None:
|
||||||
|
|
||||||
bundle = json.loads((tmp_path / "pack" / "groundrecall_query_bundle.json").read_text(encoding="utf-8"))
|
bundle = json.loads((tmp_path / "pack" / "groundrecall_query_bundle.json").read_text(encoding="utf-8"))
|
||||||
manifest = json.loads((tmp_path / "pack" / "pack_compliance_manifest.json").read_text(encoding="utf-8"))
|
manifest = json.loads((tmp_path / "pack" / "pack_compliance_manifest.json").read_text(encoding="utf-8"))
|
||||||
|
concept_titles = [item["title"] for item in (json.loads((tmp_path / "pack" / "knowledge_graph.json").read_text(encoding="utf-8"))["nodes"]) if item.get("type") == "concept"]
|
||||||
assert summary["wolfe_source_document_count"] == 1
|
assert summary["wolfe_source_document_count"] == 1
|
||||||
assert summary["source_document_count"] == 2
|
assert summary["source_document_count"] == 2
|
||||||
assert "wolfe-local-snippet" in manifest["derived_from_sources"]
|
assert "wolfe-local-snippet" in manifest["derived_from_sources"]
|
||||||
assert bundle["bundle_kind"] == "groundrecall_query_bundle"
|
assert bundle["bundle_kind"] == "groundrecall_query_bundle"
|
||||||
|
assert "Entropy Comparison" not in concept_titles
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue