diff --git a/.gitignore b/.gitignore index 6c41cf8..6f6e8df 100644 --- a/.gitignore +++ b/.gitignore @@ -11,3 +11,5 @@ build/ *.egg-info/ .env configs/config.yaml +tmp-* +codex* diff --git a/README.md b/README.md index acf658c..6e78d8d 100644 --- a/README.md +++ b/README.md @@ -182,6 +182,14 @@ That demo builds a graph-grounded session from the MIT OCW skill bundle and emit The point of this module is architectural as much as demonstrational: it is the session core that future accessibility, model-benchmark, and voice-interaction work should build on. +The first benchmark harness for that session core is now: + +```bash +python -m didactopus.model_bench +``` + +It evaluates local-model adequacy for the `mentor`, `practice`, and `evaluator` roles using the MIT OCW skill bundle as grounded context. + ## What Is In This Repository - `src/didactopus/` @@ -433,6 +441,7 @@ What remains heuristic or lightweight: ## Recommended Reading - [docs/roadmap.md](docs/roadmap.md) +- [docs/local-model-benchmark.md](docs/local-model-benchmark.md) - [docs/course-to-pack.md](docs/course-to-pack.md) - [docs/learning-graph.md](docs/learning-graph.md) - [docs/agentic-learner-loop.md](docs/agentic-learner-loop.md) diff --git a/docs/local-model-benchmark.md b/docs/local-model-benchmark.md new file mode 100644 index 0000000..98a52bc --- /dev/null +++ b/docs/local-model-benchmark.md @@ -0,0 +1,108 @@ +# Local Model Benchmark + +Didactopus should not evaluate local models as generic chatbots. It should evaluate them as role-specific components in a graph-grounded learner workflow. + +This benchmark uses the MIT OCW Information and Entropy skill bundle and measures whether a local model is adequate for the current Didactopus mentor loop. 
+ +## What It Benchmarks + +The current harness evaluates three Didactopus roles: + +- `mentor` +- `practice` +- `evaluator` + +Each role is prompted with graph-grounded context derived from: + +- `knowledge_graph.json` +- `source_corpus.json` +- the generated OCW skill bundle + +## Why This Matters + +Didactopus needs local models that are good enough to support guided learning on constrained hardware. That is a different question from asking which model is globally strongest. + +The benchmark is intended to support comparisons such as: + +- Raspberry Pi-class devices +- low-end local desktops +- stronger local workstations +- RoleMesh-routed local model mixes + +## How To Run It + +Stub or local-demo run: + +```bash +python -m didactopus.model_bench \ + --config configs/config.example.yaml \ + --skill-dir skills/ocw-information-entropy-agent \ + --out-dir examples/model-benchmark \ + --hardware-profile pi-minimal \ + --hardware-cpu cortex-a76 \ + --hardware-ram-gb 8 +``` + +RoleMesh-backed run: + +```bash +python -m didactopus.model_bench \ + --config configs/config.rolemesh.example.yaml \ + --skill-dir skills/ocw-information-entropy-agent \ + --out-dir examples/model-benchmark-rolemesh \ + --hardware-profile laptop-local \ + --hardware-cpu ryzen-7 \ + --hardware-ram-gb 32 +``` + +## Outputs + +The benchmark writes: + +- `model_benchmark.json` +- `model_benchmark.md` + +These include: + +- provider and model information +- hardware profile metadata +- per-role latency +- per-role adequacy score and adequacy rating +- an overall recommendation + +## Current Scoring Shape + +The current heuristic scoring asks whether each role does the right kind of work: + +- `mentor` + - stays tied to the grounded concept + - surfaces structure or prerequisites + - asks a focused learner question +- `practice` + - produces a real exercise + - avoids giving away the full solution + - stays tied to the grounded topic +- `evaluator` + - acknowledges learner strengths + - preserves 
an existing caveat rather than inventing an omission + - gives a concrete next step + +This is deliberately narrower than a general-purpose benchmark. Didactopus cares about trustworthy learner guidance, not maximal generic fluency. + +## Interpreting Ratings + +- `adequate` + - suitable for local guided-learning experiments +- `borderline` + - usable only with review and caution +- `inadequate` + - not recommended for learner-facing use in the current configuration + +## Recommended Next Step + +As the learner session backend grows, the benchmark should expand to include: + +- multi-turn sessions +- first-token delay and tokens-per-second capture +- memory and thermal observations on constrained hardware +- accessibility-specific checks for structure and spoken-output quality diff --git a/examples/model-benchmark/model_benchmark.json b/examples/model-benchmark/model_benchmark.json new file mode 100644 index 0000000..c4c4a66 --- /dev/null +++ b/examples/model-benchmark/model_benchmark.json @@ -0,0 +1,61 @@ +{ + "benchmark": { + "name": "didactopus-local-model-adequacy", + "task_family": "graph-grounded-mentor-loop", + "provider": "stub", + "hardware_profile": { + "profile_name": "pi-minimal", + "cpu": "cortex-a76", + "ram_gb": 8.0, + "notes": "stub benchmark structure run" + } + }, + "context": { + "skill_name": "ocw-information-entropy-agent", + "study_plan_task": "Help a learner connect Shannon entropy, channel capacity, and thermodynamic entropy.", + "primary_concept": "Independent Reasoning and Careful Comparison", + "secondary_concept": "Thermodynamics and Entropy" + }, + "role_results": [ + { + "role": "mentor", + "provider": "stub", + "model_name": "local-demo", + "latency_ms": 0.027, + "response_preview": "[stubbed-response] [mentor] Concept: Independent Reasoning and Careful Comparison Prerequisites: Course Notes and Reference Texts Supporting lessons", + "adequacy_score": 0.65, + "adequacy_rating": "borderline", + "notes": [ + "Did not ask a focused 
learner question." + ] + }, + { + "role": "practice", + "provider": "stub", + "model_name": "local-demo", + "latency_ms": 0.004, + "response_preview": "[stubbed-response] [practice] Concept: Independent Reasoning and Careful Comparison Prerequisites: Course Notes and Reference Texts Supporting lessons", + "adequacy_score": 1.0, + "adequacy_rating": "adequate", + "notes": [] + }, + { + "role": "evaluator", + "provider": "stub", + "model_name": "local-demo", + "latency_ms": 0.003, + "response_preview": "[stubbed-response] [evaluator] Concept: Independent Reasoning and Careful Comparison Prerequisites: Course Notes and Reference Texts Supporting lessons", + "adequacy_score": 0.35, + "adequacy_rating": "inadequate", + "notes": [ + "Did not acknowledge learner strengths.", + "Did not provide a concrete next step." + ] + } + ], + "summary": { + "overall_adequacy_score": 0.667, + "overall_adequacy_rating": "borderline", + "recommended_use": "Use with caution; responses should stay in review." + } +} \ No newline at end of file diff --git a/examples/model-benchmark/model_benchmark.md b/examples/model-benchmark/model_benchmark.md new file mode 100644 index 0000000..f15067a --- /dev/null +++ b/examples/model-benchmark/model_benchmark.md @@ -0,0 +1,15 @@ +# Didactopus Local Model Benchmark + +- Provider: `stub` +- Hardware profile: `pi-minimal` +- Primary concept: Independent Reasoning and Careful Comparison +- Secondary concept: Thermodynamics and Entropy +- Overall adequacy: borderline (0.667) +- Recommended use: Use with caution; responses should stay in review. + +## Role Results +- `mentor` via `local-demo`: borderline (0.65), latency 0.027 ms + Notes: Did not ask a focused learner question. +- `practice` via `local-demo`: adequate (1.0), latency 0.004 ms +- `evaluator` via `local-demo`: inadequate (0.35), latency 0.003 ms + Notes: Did not acknowledge learner strengths.; Did not provide a concrete next step. 
# Note attached to a response that contains no visible text at all.
_EMPTY_RESPONSE_NOTE = "Response was empty."


def _has_term(lowered: str, *terms: str) -> bool:
    """Return True if any of *terms* occurs in *lowered* at the start of a word.

    A plain ``term in text`` test credits "incorrect" as "correct",
    "misconception" as "concept" and "resolution" as "solution". Requiring the
    start of the text or a non-alphanumeric character before the match removes
    those false positives while still accepting inflected forms such as
    "correctly" or "prerequisites".

    *lowered* is expected to be lower-cased already; *terms* are matched as
    given.
    """
    for term in terms:
        start = lowered.find(term)
        while start != -1:
            if start == 0 or not lowered[start - 1].isalnum():
                return True
            start = lowered.find(term, start + 1)
    return False


def _tally(checks: list[tuple[bool, float, str]]) -> tuple[float, list[str]]:
    """Fold ``(passed, weight, note_if_failed)`` checks into ``(score, notes)``.

    Weights of passed checks are summed in order and capped at 1.0; the notes
    of failed checks are returned in the same order.
    """
    score = sum((weight for passed, weight, _ in checks if passed), 0.0)
    notes = [note for passed, _, note in checks if not passed]
    return min(score, 1.0), notes


def _score_mentor_response(text: str) -> tuple[float, list[str]]:
    """Heuristically score a mentor-role response.

    Credit is given for referencing the grounded concept (0.35), asking a
    question (0.35) and surfacing structure or prerequisites (0.3).

    Returns:
        ``(score, notes)`` where ``score`` is in ``[0.0, 1.0]`` and ``notes``
        names every criterion that was missed. An empty or whitespace-only
        response scores 0.0.
    """
    if not text or not text.strip():
        return 0.0, [_EMPTY_RESPONSE_NOTE]
    lowered = text.lower()
    return _tally(
        [
            (
                _has_term(lowered, "concept", "entropy"),
                0.35,
                "Did not clearly reference the grounded concept.",
            ),
            ("?" in text, 0.35, "Did not ask a focused learner question."),
            (
                _has_term(lowered, "supporting lessons", "prerequisite", "course notes"),
                0.3,
                "Did not clearly surface grounded structure or prerequisites.",
            ),
        ]
    )


def _score_practice_response(text: str) -> tuple[float, list[str]]:
    """Heuristically score a practice-role response.

    Credit is given for framing an exercise (0.35), for not handing over a
    solution (0.35) and for staying on the grounded topic (0.3).

    Returns:
        ``(score, notes)`` where ``score`` is in ``[0.0, 1.0]`` and ``notes``
        names every criterion that was missed. An empty or whitespace-only
        response scores 0.0, so silence is never rewarded for "not giving
        away the solution".
    """
    if not text or not text.strip():
        return 0.0, [_EMPTY_RESPONSE_NOTE]
    lowered = text.lower()
    return _tally(
        [
            (
                _has_term(lowered, "practice", "task", "compare", "explain"),
                0.35,
                "Did not clearly frame an exercise.",
            ),
            (
                # Inverted check: the credit is for the *absence* of a solution.
                not _has_term(lowered, "solution", "answer:"),
                0.35,
                "Looks too close to giving away the full solution.",
            ),
            (
                _has_term(lowered, "entropy", "concept", "channel"),
                0.3,
                "Did not stay visibly tied to the grounded topic.",
            ),
        ]
    )


def _score_evaluator_response(text: str) -> tuple[float, list[str]]:
    """Heuristically score an evaluator-role response.

    Credit is given for acknowledging learner strengths (0.35), preserving the
    learner's existing caveat (0.35) and giving a next step (0.3).

    Returns:
        ``(score, notes)`` where ``score`` is in ``[0.0, 1.0]`` and ``notes``
        names every criterion that was missed. An empty or whitespace-only
        response scores 0.0.
    """
    if not text or not text.strip():
        return 0.0, [_EMPTY_RESPONSE_NOTE]
    lowered = text.lower()
    return _tally(
        [
            (
                _has_term(lowered, "strength", "correct", "good"),
                0.35,
                "Did not acknowledge learner strengths.",
            ),
            (
                _has_term(lowered, "not identical", "limitation", "careful"),
                0.35,
                "Did not preserve the learner's existing caveat.",
            ),
            (
                _has_term(lowered, "next", "revise", "follow-up", "improve"),
                0.3,
                "Did not provide a concrete next step.",
            ),
        ]
    )
# Guidance attached to each overall rating. Keyed by the labels returned by
# ``_adequacy_rating`` so the numeric thresholds live in exactly one place.
_RECOMMENDED_USE = {
    "adequate": "Suitable for local guided-learning experiments.",
    "borderline": "Use with caution; responses should stay in review.",
    "inadequate": "Not recommended for learner-facing local deployment.",
}


def _adequacy_rating(score: float) -> str:
    """Map an adequacy score to ``"adequate"``, ``"borderline"`` or ``"inadequate"``.

    The score is rounded to three decimals first, which is the precision the
    reports print. Without that, a mean such as ``(1.0 + 0.7 + 0.7) / 3``
    evaluates to 0.7999999999999999 and would be reported as 0.8 yet rated
    "borderline".
    """
    score = round(score, 3)
    if score >= 0.8:
        return "adequate"
    if score >= 0.6:
        return "borderline"
    return "inadequate"


def _hardware_profile(
    *,
    profile_name: str,
    cpu: str,
    ram_gb: float | None,
    notes: str | None,
) -> dict:
    """Bundle the caller-supplied hardware description into a report dict.

    ``notes`` is normalised to an empty string when it is ``None`` or empty;
    the other values are passed through unchanged.
    """
    return {
        "profile_name": profile_name,
        "cpu": cpu,
        "ram_gb": ram_gb,
        "notes": notes or "",
    }


def _render_markdown_report(payload: dict) -> str:
    """Render the benchmark payload as the Markdown summary report."""
    benchmark = payload["benchmark"]
    context = payload["context"]
    summary = payload["summary"]
    lines = [
        "# Didactopus Local Model Benchmark",
        "",
        f"- Provider: `{benchmark['provider']}`",
        f"- Hardware profile: `{benchmark['hardware_profile']['profile_name']}`",
        f"- Primary concept: {context['primary_concept']}",
        f"- Secondary concept: {context['secondary_concept']}",
        f"- Overall adequacy: {summary['overall_adequacy_rating']} ({summary['overall_adequacy_score']})",
        f"- Recommended use: {summary['recommended_use']}",
        "",
        "## Role Results",
    ]
    for result in payload["role_results"]:
        lines.append(
            f"- `{result['role']}` via `{result['model_name']}`: "
            f"{result['adequacy_rating']} ({result['adequacy_score']}), latency {result['latency_ms']} ms"
        )
        if result["notes"]:
            # Every note is already a complete sentence ending in a period, so
            # a plain space separates them; "; " used to produce ".;".
            lines.append(f"  Notes: {' '.join(result['notes'])}")
    # End with a newline so the report is a well-formed text file.
    return "\n".join(lines) + "\n"


def run_model_benchmark(
    *,
    config_path: str | Path,
    skill_dir: str | Path,
    out_dir: str | Path,
    hardware_profile_name: str = "unspecified-local",
    hardware_cpu: str = "unknown",
    hardware_ram_gb: float | None = None,
    hardware_notes: str | None = None,
) -> dict:
    """Run the mentor/practice/evaluator adequacy benchmark and write reports.

    One prompt per role is built from the first two steps of the grounded
    study plan, sent to the configured provider, timed, and scored with the
    role's heuristic scorer. The result is written to
    ``model_benchmark.json`` and ``model_benchmark.md`` inside *out_dir*
    (created if missing) and also returned.

    Args:
        config_path: Configuration file passed to ``load_config``.
        skill_dir: Skill bundle directory passed to ``load_ocw_skill_context``.
        out_dir: Directory that receives the two report files.
        hardware_profile_name: Free-form label recorded in the report.
        hardware_cpu: Free-form CPU description recorded in the report.
        hardware_ram_gb: RAM size recorded in the report, if known.
        hardware_notes: Free-form notes recorded in the report.

    Returns:
        The report payload with ``benchmark``, ``context``, ``role_results``
        and ``summary`` sections.

    Raises:
        ValueError: If the study plan yields fewer than two steps.
    """
    config = load_config(config_path)
    provider = ModelProvider(config.model_provider)
    context = load_ocw_skill_context(skill_dir)
    study_plan = build_skill_grounded_study_plan(
        context,
        "Help a learner connect Shannon entropy, channel capacity, and thermodynamic entropy.",
    )
    steps = study_plan.get("steps", [])
    if len(steps) < 2:
        raise ValueError("Benchmark requires at least two grounded study-plan steps.")

    primary = steps[0]
    secondary = steps[1]
    learner_submission = (
        "Entropy measures uncertainty because more possible outcomes require more information to describe, "
        "but thermodynamic entropy is not identical to Shannon entropy without careful interpretation."
    )
    deterministic = evaluate_submission_with_skill(
        context,
        # Keep only the part after the first "::" separator, if there is one.
        primary["concept_key"].split("::", 1)[-1],
        learner_submission,
    )

    prompts = {
        "mentor": (
            f"{_grounding_block(primary)}\n\n"
            f"{_grounding_block(secondary)}\n\n"
            "Give a grounded mentor response that orients the learner, explains the sequence, and asks one focused question."
        ),
        "practice": (
            f"{_grounding_block(primary)}\n\n"
            "Create one reasoning-heavy practice task. Keep it grounded and do not provide the full solution."
        ),
        "evaluator": (
            f"{_grounding_block(primary)}\n\n"
            f"Learner submission: {learner_submission}\n"
            f"Deterministic evaluator result: verdict={deterministic['verdict']}, aggregated={deterministic['aggregated']}\n"
            "Respond as evaluator. Acknowledge what the learner already did correctly, preserve existing caveats, and give one next revision target."
        ),
    }

    scorers = {
        "mentor": _score_mentor_response,
        "practice": _score_practice_response,
        "evaluator": _score_evaluator_response,
    }

    role_results = []
    adequacy_scores = []
    for role, prompt in prompts.items():
        # Time only the provider call; scoring is excluded from the latency.
        started = perf_counter()
        response = provider.generate(
            prompt,
            role=role,
            system_prompt=system_prompt_for_role(role),
            temperature=0.2,
            max_tokens=220,
        )
        elapsed_ms = round((perf_counter() - started) * 1000.0, 3)
        score, notes = scorers[role](response.text)
        adequacy_scores.append(score)
        # Rate the same rounded value that is reported.
        reported_score = round(score, 3)
        role_results.append(
            {
                "role": role,
                "provider": response.provider,
                "model_name": response.model_name,
                "latency_ms": elapsed_ms,
                "response_preview": response.text[:280],
                "adequacy_score": reported_score,
                "adequacy_rating": _adequacy_rating(reported_score),
                "notes": notes,
            }
        )

    # Round once and derive the rating and the guidance from that value, so
    # the three summary fields can never disagree with each other.
    overall_score = round(sum(adequacy_scores) / len(adequacy_scores), 3)
    overall_rating = _adequacy_rating(overall_score)
    payload = {
        "benchmark": {
            "name": "didactopus-local-model-adequacy",
            "task_family": "graph-grounded-mentor-loop",
            "provider": config.model_provider.provider,
            "hardware_profile": _hardware_profile(
                profile_name=hardware_profile_name,
                cpu=hardware_cpu,
                ram_gb=hardware_ram_gb,
                notes=hardware_notes,
            ),
        },
        "context": {
            "skill_name": context.skill_name,
            "study_plan_task": study_plan["task"],
            "primary_concept": primary["title"],
            "secondary_concept": secondary["title"],
        },
        "role_results": role_results,
        "summary": {
            "overall_adequacy_score": overall_score,
            "overall_adequacy_rating": overall_rating,
            "recommended_use": _RECOMMENDED_USE[overall_rating],
        },
    }

    out_dir = Path(out_dir)
    out_dir.mkdir(parents=True, exist_ok=True)
    (out_dir / "model_benchmark.json").write_text(
        json.dumps(payload, indent=2) + "\n",
        encoding="utf-8",
    )
    (out_dir / "model_benchmark.md").write_text(_render_markdown_report(payload), encoding="utf-8")

    return payload


def main() -> None:
    """Command-line entry point: run the benchmark and print the payload as JSON.

    Path defaults are resolved relative to the repository root (two levels
    above this module's directory), so the command works from any directory.
    """
    import argparse

    root = Path(__file__).resolve().parents[2]
    parser = argparse.ArgumentParser(description="Benchmark local-model adequacy for the Didactopus mentor loop.")
    parser.add_argument("--config", default=str(root / "configs" / "config.example.yaml"))
    parser.add_argument("--skill-dir", default=str(root / "skills" / "ocw-information-entropy-agent"))
    parser.add_argument("--out-dir", default=str(root / "examples" / "model-benchmark"))
    parser.add_argument("--hardware-profile", default="unspecified-local")
    parser.add_argument("--hardware-cpu", default="unknown")
    parser.add_argument("--hardware-ram-gb", type=float, default=None)
    parser.add_argument("--hardware-notes", default="")
    args = parser.parse_args()
    payload = run_model_benchmark(
        config_path=args.config,
        skill_dir=args.skill_dir,
        out_dir=args.out_dir,
        hardware_profile_name=args.hardware_profile,
        hardware_cpu=args.hardware_cpu,
        hardware_ram_gb=args.hardware_ram_gb,
        hardware_notes=args.hardware_notes,
    )
    print(json.dumps(payload, indent=2))
def _repo_path(*parts: str) -> str:
    """Return an absolute path inside the repository, independent of the cwd.

    This file lives in ``tests/``, so the repository root is one level above
    its directory. Resolving against it keeps the tests passing when pytest is
    started from somewhere other than the repository root.
    """
    from pathlib import Path  # local import: this helper is the only user

    return str(Path(__file__).resolve().parents[1].joinpath(*parts))


def test_run_model_benchmark_writes_reports(tmp_path) -> None:
    """The benchmark returns a well-formed payload and writes both report files."""
    payload = run_model_benchmark(
        config_path=_repo_path("configs", "config.example.yaml"),
        skill_dir=_repo_path("skills", "ocw-information-entropy-agent"),
        out_dir=tmp_path,
        hardware_profile_name="pi-minimal",
        hardware_cpu="cortex-a76",
        hardware_ram_gb=8.0,
        hardware_notes="local stub run for structure verification",
    )

    assert payload["benchmark"]["name"] == "didactopus-local-model-adequacy"
    assert payload["benchmark"]["hardware_profile"]["profile_name"] == "pi-minimal"
    assert len(payload["role_results"]) == 3
    assert {result["role"] for result in payload["role_results"]} == {"mentor", "practice", "evaluator"}
    assert payload["summary"]["overall_adequacy_rating"] in {"adequate", "borderline", "inadequate"}

    json_path = tmp_path / "model_benchmark.json"
    md_path = tmp_path / "model_benchmark.md"
    assert json_path.exists()
    assert md_path.exists()

    # The file on disk must carry the same summary as the returned payload.
    written = json.loads(json_path.read_text(encoding="utf-8"))
    assert written["summary"]["overall_adequacy_score"] == payload["summary"]["overall_adequacy_score"]
    assert "Role Results" in md_path.read_text(encoding="utf-8")


def test_model_benchmark_captures_response_preview_and_latency(tmp_path) -> None:
    """Every role result records provider, latency, a preview and a score."""
    payload = run_model_benchmark(
        config_path=_repo_path("configs", "config.example.yaml"),
        skill_dir=_repo_path("skills", "ocw-information-entropy-agent"),
        out_dir=tmp_path,
    )

    for result in payload["role_results"]:
        # The example config is expected to select the stub provider.
        assert result["provider"] == "stub"
        assert result["latency_ms"] >= 0.0
        assert result["response_preview"]
        assert "adequacy_score" in result