diff --git a/src/didactopus/course_ingestion_compliance.py b/src/didactopus/course_ingestion_compliance.py index 3229695..cb3d2b2 100644 --- a/src/didactopus/course_ingestion_compliance.py +++ b/src/didactopus/course_ingestion_compliance.py @@ -4,7 +4,12 @@ import argparse, json, yaml from .compliance_models import SourceInventory, PackComplianceManifest def load_sources(path: str | Path) -> SourceInventory: - data = yaml.safe_load(Path(path).read_text(encoding="utf-8")) or {} + p = Path(path) + raw = p.read_text(encoding="utf-8") + if p.suffix.lower() == ".json": + data = json.loads(raw) if raw.strip() else {} + else: + data = yaml.safe_load(raw) or {} return SourceInventory.model_validate(data) def build_pack_compliance_manifest( diff --git a/src/didactopus/ocw_information_entropy_demo.py b/src/didactopus/ocw_information_entropy_demo.py index daa733e..a560e01 100644 --- a/src/didactopus/ocw_information_entropy_demo.py +++ b/src/didactopus/ocw_information_entropy_demo.py @@ -163,6 +163,22 @@ def _load_groundrecall_runtime(): return run_groundrecall_import, promote_import_to_store, export_groundrecall_query_bundle +def _merge_source_inventories(primary_path: Path, extra_path: Path, out_path: Path) -> Path: + primary = load_sources(primary_path) + extra = load_sources(extra_path) + merged = [] + seen_ids: set[str] = set() + for source in [*primary.sources, *extra.sources]: + if source.source_id in seen_ids: + continue + seen_ids.add(source.source_id) + merged.append(source.model_dump()) + out_path.parent.mkdir(parents=True, exist_ok=True) + import yaml + out_path.write_text(yaml.safe_dump({"sources": merged}, sort_keys=False), encoding="utf-8") + return out_path + + def resolve_ocw_demo_paths( root: Path, course_repo: str | Path | None = None, @@ -221,14 +237,29 @@ def run_ocw_information_entropy_demo( pack_dir: str | Path, run_dir: str | Path, skill_dir: str | Path, + wolfe_snippets_dir: str | Path | None = None, + wolfe_source_inventory: str | Path | None = None, ) -> dict: course_source = Path(course_source) source_inventory = Path(source_inventory) pack_dir = Path(pack_dir) run_dir = Path(run_dir) skill_dir = Path(skill_dir) + wolfe_snippets_dir = Path(wolfe_snippets_dir) if wolfe_snippets_dir is not None else None + wolfe_source_inventory = Path(wolfe_source_inventory) if wolfe_source_inventory is not None else None docs = adapt_documents(course_source) + wolfe_doc_count = 0 + if wolfe_snippets_dir is not None and wolfe_snippets_dir.exists(): + wolfe_docs = adapt_documents(wolfe_snippets_dir) + docs.extend(wolfe_docs) + wolfe_doc_count = len(wolfe_docs) + if wolfe_doc_count and not (wolfe_source_inventory is not None and wolfe_source_inventory.exists()): + review_flag = ( + "Wolfe snippet augmentation was used without a Wolfe source inventory; compliance manifest excludes those augmentation sources." + ) + else: + review_flag = "" if not docs: raise ValueError(f"No supported source documents found under {course_source}") courses = [document_to_course(doc, "MIT OCW Information and Entropy") for doc in docs] @@ -238,6 +269,8 @@ def run_ocw_information_entropy_demo( concepts = extract_concept_candidates(merged) ctx = RuleContext(course=merged, concepts=concepts) run_rules(ctx, build_default_rules()) + if review_flag: + ctx.review_flags.append(review_flag) draft = build_draft_pack( merged, @@ -250,11 +283,21 @@ def run_ocw_information_entropy_demo( write_draft_pack(draft, pack_dir) write_source_corpus(merged, pack_dir) write_knowledge_graph(merged, ctx.concepts, pack_dir) - if source_inventory.exists(): - inventory = load_sources(source_inventory) + effective_inventory_path = source_inventory + if wolfe_source_inventory is not None and wolfe_source_inventory.exists(): + effective_inventory_path = _merge_source_inventories( + source_inventory, + wolfe_source_inventory, + run_dir / "merged_source_inventory.yaml", + ) + if effective_inventory_path.exists(): + inventory = load_sources(effective_inventory_path) compliance_manifest = build_pack_compliance_manifest(draft.pack["name"], draft.pack["display_name"], inventory) write_manifest(compliance_manifest, pack_dir / "pack_compliance_manifest.json") - (pack_dir / "source_inventory.yaml").write_text(source_inventory.read_text(encoding="utf-8"), encoding="utf-8") + if effective_inventory_path.suffix.lower() in {".yaml", ".yml"}: + (pack_dir / "source_inventory.yaml").write_text(effective_inventory_path.read_text(encoding="utf-8"), encoding="utf-8") + else: + (pack_dir / "source_inventory.json").write_text(effective_inventory_path.read_text(encoding="utf-8"), encoding="utf-8") validation = validate_pack(pack_dir) if not validation.is_valid: @@ -324,6 +367,10 @@ def run_ocw_information_entropy_demo( "pack_dir": str(pack_dir), "skill_dir": str(skill_dir), "source_inventory": str(source_inventory), + "effective_source_inventory": str(effective_inventory_path), + "wolfe_snippets_dir": str(wolfe_snippets_dir) if wolfe_snippets_dir is not None else "", + "wolfe_source_inventory": str(wolfe_source_inventory) if wolfe_source_inventory is not None else "", + "wolfe_source_document_count": wolfe_doc_count, "review_flags": list(ctx.review_flags), "concept_count": len(ctx.concepts), "source_fragment_count": len(json.loads((pack_dir / "source_corpus.json").read_text(encoding="utf-8")).get("fragments", [])), @@ -357,6 +404,8 @@ def main() -> None: parser.add_argument("--pack-dir") parser.add_argument("--run-dir") parser.add_argument("--skill-dir") + parser.add_argument("--wolfe-snippets-dir") + parser.add_argument("--wolfe-source-inventory") args = parser.parse_args() if args.course_repo_target: @@ -383,6 +432,8 @@ def main() -> None: pack_dir=resolved["pack_dir"], run_dir=resolved["run_dir"], skill_dir=resolved["skill_dir"], + wolfe_snippets_dir=args.wolfe_snippets_dir, + wolfe_source_inventory=args.wolfe_source_inventory, ) print(json.dumps(summary, indent=2)) diff --git a/tests/test_ocw_information_entropy_demo.py b/tests/test_ocw_information_entropy_demo.py index ee1db34..5e0e5f9 100644 --- a/tests/test_ocw_information_entropy_demo.py +++ b/tests/test_ocw_information_entropy_demo.py @@ -57,3 +57,59 @@ def test_ocw_demo_accepts_directory_tree_sources(tmp_path: Path) -> None: assert len(corpus["sources"]) == 2 assert groundrecall_bundle["bundle_kind"] == "groundrecall_query_bundle" assert any(fragment["lesson_title"] == "Shannon Entropy" for fragment in corpus["fragments"]) + + +def test_ocw_demo_can_apply_wolfe_snippet_augmentation(tmp_path: Path) -> None: + source_dir = tmp_path / "course" + source_dir.mkdir() + (source_dir / "unit1.md").write_text( + "# Course\n\n## Unit 1\n### Thermodynamics and Entropy\n- Objective: Explain entropy.\nEntropy links uncertainty to physics.", + encoding="utf-8", + ) + sources = tmp_path / "sources.yaml" + sources.write_text("sources: []\n", encoding="utf-8") + + wolfe_dir = tmp_path / "wolfe" + wolfe_dir.mkdir() + (wolfe_dir / "snippet.md").write_text( + "# Wolfe Snippet\n\n## Augmentation\n### Entropy Comparison\n- Objective: Compare Shannon entropy with thermodynamic entropy.\nThe two notions differ in interpretation even when the mathematics overlaps.", + encoding="utf-8", + ) + wolfe_sources = tmp_path / "wolfe-sources.yaml" + wolfe_sources.write_text( + "\n".join( + [ + "sources:", + " - source_id: wolfe-local-snippet", + " title: Wolfe local snippet", + " url: file:///local/wolfe/snippet", + " publisher: Local Library", + " creator: Local Search", + " license_id: local-only", + " license_url: https://example.invalid/local-only", + " retrieved_at: '2026-05-08'", + " adapted: false", + " attribution_text: Local Wolfe-derived snippet for private evaluation.", + " excluded_from_upstream_license: true", + " exclusion_notes: Local-only experimental augmentation.", + ] + ), + encoding="utf-8", + ) + + summary = run_ocw_information_entropy_demo( + course_source=source_dir, + source_inventory=sources, + pack_dir=tmp_path / "pack", + run_dir=tmp_path / "run", + skill_dir=tmp_path / "skill", + wolfe_snippets_dir=wolfe_dir, + wolfe_source_inventory=wolfe_sources, + ) + + bundle = json.loads((tmp_path / "pack" / "groundrecall_query_bundle.json").read_text(encoding="utf-8")) + manifest = json.loads((tmp_path / "pack" / "pack_compliance_manifest.json").read_text(encoding="utf-8")) + assert summary["wolfe_source_document_count"] == 1 + assert summary["source_document_count"] == 2 + assert "wolfe-local-snippet" in manifest["derived_from_sources"] + assert bundle["bundle_kind"] == "groundrecall_query_bundle"