Add Wolfe augmentation experiment to OCW demo
This commit is contained in:
parent
3371d3f30b
commit
389a3dfdf1
|
|
@ -4,7 +4,12 @@ import argparse, json, yaml
|
||||||
from .compliance_models import SourceInventory, PackComplianceManifest
|
from .compliance_models import SourceInventory, PackComplianceManifest
|
||||||
|
|
||||||
def load_sources(path: str | Path) -> SourceInventory:
    """Load a source inventory file and validate it as a ``SourceInventory``.

    Files whose suffix is ``.json`` (case-insensitive) are parsed with
    ``json.loads``; every other suffix is parsed with ``yaml.safe_load``.
    In both cases an empty document, or one whose top-level value is
    null/falsy, is validated as an empty mapping.

    Args:
        path: Location of the inventory file; it is read as UTF-8 text.

    Returns:
        The result of ``SourceInventory.model_validate`` on the parsed data.
    """
    inventory_path = Path(path)
    raw = inventory_path.read_text(encoding="utf-8")
    if inventory_path.suffix.lower() == ".json":
        # Whitespace-only input is not valid JSON, so skip parsing it. The
        # trailing ``or {}`` mirrors the YAML branch so that a bare ``null``
        # document does not reach model_validate as ``None``.
        data = (json.loads(raw) if raw.strip() else None) or {}
    else:
        data = yaml.safe_load(raw) or {}
    return SourceInventory.model_validate(data)
|
||||||
|
|
||||||
def build_pack_compliance_manifest(
|
def build_pack_compliance_manifest(
|
||||||
|
|
|
||||||
|
|
@ -163,6 +163,22 @@ def _load_groundrecall_runtime():
|
||||||
return run_groundrecall_import, promote_import_to_store, export_groundrecall_query_bundle
|
return run_groundrecall_import, promote_import_to_store, export_groundrecall_query_bundle
|
||||||
|
|
||||||
|
|
||||||
|
def _merge_source_inventories(primary_path: Path, extra_path: Path, out_path: Path) -> Path:
    """Write a YAML inventory that combines the sources of two inventory files.

    Sources from ``primary_path`` come first, followed by those from
    ``extra_path``. When a ``source_id`` appears more than once, only its first
    occurrence is kept, so a primary entry wins over an extra entry with the
    same id.

    Args:
        primary_path: Inventory file whose sources take precedence.
        extra_path: Inventory file whose sources are appended after the primary ones.
        out_path: Destination file; missing parent directories are created.

    Returns:
        ``out_path``, after the merged inventory has been written to it.
    """
    import yaml  # function-scope import, as in the original block

    primary = load_sources(primary_path)
    extra = load_sources(extra_path)

    # A dict preserves insertion order, giving first-wins de-duplication
    # without a separate "seen" set.
    merged_by_id: dict[str, dict] = {}
    for source in (*primary.sources, *extra.sources):
        if source.source_id in merged_by_id:
            continue
        # mode="json" reduces every field to plain JSON-compatible values so
        # yaml.safe_dump cannot fail on richer field types.
        # NOTE(review): the source model's field types are declared outside
        # this view; for plain str/bool fields the output is unchanged.
        merged_by_id[source.source_id] = source.model_dump(mode="json")

    out_path.parent.mkdir(parents=True, exist_ok=True)
    out_path.write_text(
        yaml.safe_dump({"sources": list(merged_by_id.values())}, sort_keys=False),
        encoding="utf-8",
    )
    return out_path
|
||||||
|
|
||||||
|
|
||||||
def resolve_ocw_demo_paths(
|
def resolve_ocw_demo_paths(
|
||||||
root: Path,
|
root: Path,
|
||||||
course_repo: str | Path | None = None,
|
course_repo: str | Path | None = None,
|
||||||
|
|
@ -221,14 +237,29 @@ def run_ocw_information_entropy_demo(
|
||||||
pack_dir: str | Path,
|
pack_dir: str | Path,
|
||||||
run_dir: str | Path,
|
run_dir: str | Path,
|
||||||
skill_dir: str | Path,
|
skill_dir: str | Path,
|
||||||
|
wolfe_snippets_dir: str | Path | None = None,
|
||||||
|
wolfe_source_inventory: str | Path | None = None,
|
||||||
) -> dict:
|
) -> dict:
|
||||||
course_source = Path(course_source)
|
course_source = Path(course_source)
|
||||||
source_inventory = Path(source_inventory)
|
source_inventory = Path(source_inventory)
|
||||||
pack_dir = Path(pack_dir)
|
pack_dir = Path(pack_dir)
|
||||||
run_dir = Path(run_dir)
|
run_dir = Path(run_dir)
|
||||||
skill_dir = Path(skill_dir)
|
skill_dir = Path(skill_dir)
|
||||||
|
wolfe_snippets_dir = Path(wolfe_snippets_dir) if wolfe_snippets_dir is not None else None
|
||||||
|
wolfe_source_inventory = Path(wolfe_source_inventory) if wolfe_source_inventory is not None else None
|
||||||
|
|
||||||
docs = adapt_documents(course_source)
|
docs = adapt_documents(course_source)
|
||||||
|
wolfe_doc_count = 0
|
||||||
|
if wolfe_snippets_dir is not None and wolfe_snippets_dir.exists():
|
||||||
|
wolfe_docs = adapt_documents(wolfe_snippets_dir)
|
||||||
|
docs.extend(wolfe_docs)
|
||||||
|
wolfe_doc_count = len(wolfe_docs)
|
||||||
|
if wolfe_doc_count and not (wolfe_source_inventory is not None and wolfe_source_inventory.exists()):
|
||||||
|
review_flag = (
|
||||||
|
"Wolfe snippet augmentation was used without a Wolfe source inventory; compliance manifest excludes those augmentation sources."
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
review_flag = ""
|
||||||
if not docs:
|
if not docs:
|
||||||
raise ValueError(f"No supported source documents found under {course_source}")
|
raise ValueError(f"No supported source documents found under {course_source}")
|
||||||
courses = [document_to_course(doc, "MIT OCW Information and Entropy") for doc in docs]
|
courses = [document_to_course(doc, "MIT OCW Information and Entropy") for doc in docs]
|
||||||
|
|
@ -238,6 +269,8 @@ def run_ocw_information_entropy_demo(
|
||||||
concepts = extract_concept_candidates(merged)
|
concepts = extract_concept_candidates(merged)
|
||||||
ctx = RuleContext(course=merged, concepts=concepts)
|
ctx = RuleContext(course=merged, concepts=concepts)
|
||||||
run_rules(ctx, build_default_rules())
|
run_rules(ctx, build_default_rules())
|
||||||
|
if review_flag:
|
||||||
|
ctx.review_flags.append(review_flag)
|
||||||
|
|
||||||
draft = build_draft_pack(
|
draft = build_draft_pack(
|
||||||
merged,
|
merged,
|
||||||
|
|
@ -250,11 +283,21 @@ def run_ocw_information_entropy_demo(
|
||||||
write_draft_pack(draft, pack_dir)
|
write_draft_pack(draft, pack_dir)
|
||||||
write_source_corpus(merged, pack_dir)
|
write_source_corpus(merged, pack_dir)
|
||||||
write_knowledge_graph(merged, ctx.concepts, pack_dir)
|
write_knowledge_graph(merged, ctx.concepts, pack_dir)
|
||||||
if source_inventory.exists():
|
effective_inventory_path = source_inventory
|
||||||
inventory = load_sources(source_inventory)
|
if wolfe_source_inventory is not None and wolfe_source_inventory.exists():
|
||||||
|
effective_inventory_path = _merge_source_inventories(
|
||||||
|
source_inventory,
|
||||||
|
wolfe_source_inventory,
|
||||||
|
run_dir / "merged_source_inventory.yaml",
|
||||||
|
)
|
||||||
|
if effective_inventory_path.exists():
|
||||||
|
inventory = load_sources(effective_inventory_path)
|
||||||
compliance_manifest = build_pack_compliance_manifest(draft.pack["name"], draft.pack["display_name"], inventory)
|
compliance_manifest = build_pack_compliance_manifest(draft.pack["name"], draft.pack["display_name"], inventory)
|
||||||
write_manifest(compliance_manifest, pack_dir / "pack_compliance_manifest.json")
|
write_manifest(compliance_manifest, pack_dir / "pack_compliance_manifest.json")
|
||||||
(pack_dir / "source_inventory.yaml").write_text(source_inventory.read_text(encoding="utf-8"), encoding="utf-8")
|
if effective_inventory_path.suffix.lower() in {".yaml", ".yml"}:
|
||||||
|
(pack_dir / "source_inventory.yaml").write_text(effective_inventory_path.read_text(encoding="utf-8"), encoding="utf-8")
|
||||||
|
else:
|
||||||
|
(pack_dir / "source_inventory.json").write_text(effective_inventory_path.read_text(encoding="utf-8"), encoding="utf-8")
|
||||||
|
|
||||||
validation = validate_pack(pack_dir)
|
validation = validate_pack(pack_dir)
|
||||||
if not validation.is_valid:
|
if not validation.is_valid:
|
||||||
|
|
@ -324,6 +367,10 @@ def run_ocw_information_entropy_demo(
|
||||||
"pack_dir": str(pack_dir),
|
"pack_dir": str(pack_dir),
|
||||||
"skill_dir": str(skill_dir),
|
"skill_dir": str(skill_dir),
|
||||||
"source_inventory": str(source_inventory),
|
"source_inventory": str(source_inventory),
|
||||||
|
"effective_source_inventory": str(effective_inventory_path),
|
||||||
|
"wolfe_snippets_dir": str(wolfe_snippets_dir) if wolfe_snippets_dir is not None else "",
|
||||||
|
"wolfe_source_inventory": str(wolfe_source_inventory) if wolfe_source_inventory is not None else "",
|
||||||
|
"wolfe_source_document_count": wolfe_doc_count,
|
||||||
"review_flags": list(ctx.review_flags),
|
"review_flags": list(ctx.review_flags),
|
||||||
"concept_count": len(ctx.concepts),
|
"concept_count": len(ctx.concepts),
|
||||||
"source_fragment_count": len(json.loads((pack_dir / "source_corpus.json").read_text(encoding="utf-8")).get("fragments", [])),
|
"source_fragment_count": len(json.loads((pack_dir / "source_corpus.json").read_text(encoding="utf-8")).get("fragments", [])),
|
||||||
|
|
@ -357,6 +404,8 @@ def main() -> None:
|
||||||
parser.add_argument("--pack-dir")
|
parser.add_argument("--pack-dir")
|
||||||
parser.add_argument("--run-dir")
|
parser.add_argument("--run-dir")
|
||||||
parser.add_argument("--skill-dir")
|
parser.add_argument("--skill-dir")
|
||||||
|
parser.add_argument("--wolfe-snippets-dir")
|
||||||
|
parser.add_argument("--wolfe-source-inventory")
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
|
|
||||||
if args.course_repo_target:
|
if args.course_repo_target:
|
||||||
|
|
@ -383,6 +432,8 @@ def main() -> None:
|
||||||
pack_dir=resolved["pack_dir"],
|
pack_dir=resolved["pack_dir"],
|
||||||
run_dir=resolved["run_dir"],
|
run_dir=resolved["run_dir"],
|
||||||
skill_dir=resolved["skill_dir"],
|
skill_dir=resolved["skill_dir"],
|
||||||
|
wolfe_snippets_dir=args.wolfe_snippets_dir,
|
||||||
|
wolfe_source_inventory=args.wolfe_source_inventory,
|
||||||
)
|
)
|
||||||
print(json.dumps(summary, indent=2))
|
print(json.dumps(summary, indent=2))
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -57,3 +57,59 @@ def test_ocw_demo_accepts_directory_tree_sources(tmp_path: Path) -> None:
|
||||||
assert len(corpus["sources"]) == 2
|
assert len(corpus["sources"]) == 2
|
||||||
assert groundrecall_bundle["bundle_kind"] == "groundrecall_query_bundle"
|
assert groundrecall_bundle["bundle_kind"] == "groundrecall_query_bundle"
|
||||||
assert any(fragment["lesson_title"] == "Shannon Entropy" for fragment in corpus["fragments"])
|
assert any(fragment["lesson_title"] == "Shannon Entropy" for fragment in corpus["fragments"])
|
||||||
|
|
||||||
|
|
||||||
|
def test_ocw_demo_can_apply_wolfe_snippet_augmentation(tmp_path: Path) -> None:
    """Run the OCW demo with a Wolfe snippet directory and a Wolfe source inventory.

    Checks that the snippet document is counted alongside the course document,
    that the merged inventory is the effective one, and that the Wolfe source
    id appears in the compliance manifest's ``derived_from_sources``.
    """
    # Course material: a single markdown unit.
    source_dir = tmp_path / "course"
    source_dir.mkdir()
    (source_dir / "unit1.md").write_text(
        "# Course\n\n## Unit 1\n### Thermodynamics and Entropy\n- Objective: Explain entropy.\nEntropy links uncertainty to physics.",
        encoding="utf-8",
    )
    sources = tmp_path / "sources.yaml"
    sources.write_text("sources: []\n", encoding="utf-8")

    # Wolfe augmentation: a single markdown snippet.
    wolfe_dir = tmp_path / "wolfe"
    wolfe_dir.mkdir()
    (wolfe_dir / "snippet.md").write_text(
        "# Wolfe Snippet\n\n## Augmentation\n### Entropy Comparison\n- Objective: Compare Shannon entropy with thermodynamic entropy.\nThe two notions differ in interpretation even when the mathematics overlaps.",
        encoding="utf-8",
    )

    # The keys of the list item must be indented deeper than the "-" marker,
    # otherwise the document is not a valid YAML mapping inside a sequence.
    wolfe_sources = tmp_path / "wolfe-sources.yaml"
    wolfe_sources.write_text(
        "\n".join(
            [
                "sources:",
                "  - source_id: wolfe-local-snippet",
                "    title: Wolfe local snippet",
                "    url: file:///local/wolfe/snippet",
                "    publisher: Local Library",
                "    creator: Local Search",
                "    license_id: local-only",
                "    license_url: https://example.invalid/local-only",
                "    retrieved_at: '2026-05-08'",
                "    adapted: false",
                "    attribution_text: Local Wolfe-derived snippet for private evaluation.",
                "    excluded_from_upstream_license: true",
                "    exclusion_notes: Local-only experimental augmentation.",
            ]
        ),
        encoding="utf-8",
    )

    summary = run_ocw_information_entropy_demo(
        course_source=source_dir,
        source_inventory=sources,
        pack_dir=tmp_path / "pack",
        run_dir=tmp_path / "run",
        skill_dir=tmp_path / "skill",
        wolfe_snippets_dir=wolfe_dir,
        wolfe_source_inventory=wolfe_sources,
    )

    bundle = json.loads((tmp_path / "pack" / "groundrecall_query_bundle.json").read_text(encoding="utf-8"))
    manifest = json.loads((tmp_path / "pack" / "pack_compliance_manifest.json").read_text(encoding="utf-8"))
    assert summary["wolfe_source_document_count"] == 1
    assert summary["source_document_count"] == 2
    # With a Wolfe inventory present, the demo merges both inventories into the run directory.
    assert summary["effective_source_inventory"] == str(tmp_path / "run" / "merged_source_inventory.yaml")
    assert "wolfe-local-snippet" in manifest["derived_from_sources"]
    assert bundle["bundle_kind"] == "groundrecall_query_bundle"
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue