From 8862e00194ae6152a9fdc7dc0e95e988df9a53c2 Mon Sep 17 00:00:00 2001 From: welsberr Date: Tue, 17 Mar 2026 19:29:12 -0400 Subject: [PATCH] Initial pass at making Didactopus multilingual. --- README.md | 12 + configs/arena.example.yaml | 18 ++ docs/arena.md | 96 +++++++ docs/faq.md | 3 + docs/local-model-benchmark.md | 4 + docs/model-provider-setup.md | 2 + examples/arena/arena_report.md | 16 ++ examples/arena/arena_results.json | 202 ++++++++++++++ examples/arena/arena_review_queue.json | 32 +++ .../ocw-information-entropy-session-es.html | 118 ++++++++ .../ocw-information-entropy-session-es.json | 244 ++++++++++++++++ .../ocw-information-entropy-session-es.txt | 55 ++++ src/didactopus/arena.py | 262 ++++++++++++++++++ src/didactopus/language_support.py | 28 ++ src/didactopus/learner_accessibility.py | 4 + src/didactopus/learner_session.py | 17 +- src/didactopus/learner_session_demo.py | 5 + src/didactopus/model_bench.py | 8 +- src/didactopus/ocw_skill_agent_demo.py | 10 +- src/didactopus/role_prompts.py | 37 +++ tests/test_arena.py | 38 +++ tests/test_learner_accessibility.py | 2 + tests/test_learner_session.py | 3 + tests/test_model_bench.py | 1 + tests/test_ocw_skill_agent_demo.py | 2 + 25 files changed, 1215 insertions(+), 4 deletions(-) create mode 100644 configs/arena.example.yaml create mode 100644 docs/arena.md create mode 100644 examples/arena/arena_report.md create mode 100644 examples/arena/arena_results.json create mode 100644 examples/arena/arena_review_queue.json create mode 100644 examples/ocw-information-entropy-session-es.html create mode 100644 examples/ocw-information-entropy-session-es.json create mode 100644 examples/ocw-information-entropy-session-es.txt create mode 100644 src/didactopus/arena.py create mode 100644 src/didactopus/language_support.py create mode 100644 tests/test_arena.py diff --git a/README.md b/README.md index 3d9dd5b..633e4f2 100644 --- a/README.md +++ b/README.md @@ -173,6 +173,7 @@ The main mentor-style backend now has a 
dedicated demo entry point: ```bash python -m didactopus.learner_session_demo +python -m didactopus.learner_session_demo --language es ``` That demo builds a graph-grounded session from the MIT OCW skill bundle and emits: @@ -183,6 +184,8 @@ That demo builds a graph-grounded session from the MIT OCW skill bundle and emit - evaluator feedback - a recommended next step +The learner-facing CLI now treats language as a first-class parameter, so the same session flow can target another output language while preserving the English source-grounding context. + The point of this module is architectural as much as demonstrational: it is the session core that future accessibility, model-benchmark, and voice-interaction work should build on. The learner-session demo also writes accessible companion outputs: @@ -198,6 +201,14 @@ python -m didactopus.model_bench It evaluates local-model adequacy for the `mentor`, `practice`, and `evaluator` roles using the MIT OCW skill bundle as grounded context. +There is also now a Didactopus-specific arena for comparing provider/model/prompt combinations: + +```bash +python -m didactopus.arena --arena-spec configs/arena.example.yaml +``` + +That produces rankings, a human review queue, and an optional LLM-written comparative summary for reviewer triage. 
+ ### Easiest LLM setup paths If you want live LLM-backed Didactopus behavior without the complexity of RoleMesh, start with one of these: @@ -466,6 +477,7 @@ What remains heuristic or lightweight: ## Recommended Reading - [docs/roadmap.md](docs/roadmap.md) +- [docs/arena.md](docs/arena.md) - [docs/learner-accessibility.md](docs/learner-accessibility.md) - [docs/local-model-benchmark.md](docs/local-model-benchmark.md) - [docs/model-provider-setup.md](docs/model-provider-setup.md) diff --git a/configs/arena.example.yaml b/configs/arena.example.yaml new file mode 100644 index 0000000..757ecf5 --- /dev/null +++ b/configs/arena.example.yaml @@ -0,0 +1,18 @@ +candidates: + - name: "stub-baseline" + config: "configs/config.example.yaml" + prompt_variant: "baseline" + language: "en" + - name: "stub-strict-grounding" + config: "configs/config.example.yaml" + prompt_variant: "strict_grounding" + language: "es" + - name: "stub-trust-preserving" + config: "configs/config.example.yaml" + prompt_variant: "trust_preserving" + language: "fr" + +review: + enabled: true + config: "configs/config.example.yaml" + role: "mentor" diff --git a/docs/arena.md b/docs/arena.md new file mode 100644 index 0000000..87570b6 --- /dev/null +++ b/docs/arena.md @@ -0,0 +1,96 @@ +# Didactopus Arena + +The Didactopus arena compares candidate combinations of: + +- provider configuration +- model choice +- role prompt variant +- output language + +It is not a generic chatbot arena. It is a Didactopus-specific behavior arena for grounded learner tasks. 
+ +## What It Does + +For each candidate, the arena runs the current graph-grounded learner-task shape for: + +- `mentor` +- `practice` +- `evaluator` + +It then produces: + +- deterministic role scores +- candidate rankings +- a human review queue +- an optional LLM-written review summary to help the human reviewer triage results + +## Why This Exists + +Didactopus needs a practical way to improve: + +- local model choice +- prompt variants +- trust-preserving behavior +- source-grounded behavior + +This is an aid to benchmarking and review, not an automatic certification system. + +## How To Run It + +Use the example spec: + +```bash +python -m didactopus.arena --arena-spec configs/arena.example.yaml +``` + +That writes outputs under: + +- `examples/arena/` + +## Spec Shape + +The arena spec is a YAML file with: + +- `candidates` +- `review` + +Example candidate fields: + +- `name` +- `config` +- `prompt_variant` +- `language` + +Example review fields: + +- `enabled` +- `config` +- `role` + +## Current Prompt Variants + +- `baseline` +- `strict_grounding` +- `trust_preserving` +- `concise` + +These are applied to Didactopus role prompts, not to arbitrary raw prompt strings. + +## Outputs + +The arena currently writes: + +- `arena_results.json` +- `arena_review_queue.json` +- `arena_report.md` + +## Human Review Position + +The LLM review summary should be treated as initial triage support only. + +The intended order of trust is: + +1. deterministic checks +2. arena comparison results +3. LLM comparative summary +4. 
human reviewer decision diff --git a/docs/faq.md b/docs/faq.md index 783b834..6b45556 100644 --- a/docs/faq.md +++ b/docs/faq.md @@ -153,6 +153,7 @@ Run: ```bash python -m didactopus.learner_session_demo +python -m didactopus.learner_session_demo --language es ``` That demo loads the MIT OCW skill bundle, retrieves grounded concept neighborhoods and source fragments, and emits a single learner session containing: @@ -166,6 +167,8 @@ That demo loads the MIT OCW skill bundle, retrieves grounded concept neighborhoo This is the backend shape the repository should now treat as the base for future accessibility, benchmarking, and voice-interaction work. +The learner-facing commands are also starting to expose `--language`, so output language becomes an explicit session parameter rather than an implicit prompt tweak. + ## How should I use it if I am taking a course and do not want to hire a tutor? Use it as a structured study companion: diff --git a/docs/local-model-benchmark.md b/docs/local-model-benchmark.md index 98a52bc..788107c 100644 --- a/docs/local-model-benchmark.md +++ b/docs/local-model-benchmark.md @@ -106,3 +106,7 @@ As the learner session backend grows, the benchmark should expand to include: - first-token delay and tokens-per-second capture - memory and thermal observations on constrained hardware - accessibility-specific checks for structure and spoken-output quality + +For model-and-prompt comparison across multiple candidates, use: + +- `docs/arena.md` diff --git a/docs/model-provider-setup.md b/docs/model-provider-setup.md index 2d66f26..8878c09 100644 --- a/docs/model-provider-setup.md +++ b/docs/model-provider-setup.md @@ -37,6 +37,7 @@ Example commands: ```bash ollama pull llama3.2:3b python -m didactopus.learner_session_demo --config configs/config.ollama.example.yaml +python -m didactopus.learner_session_demo --config configs/config.ollama.example.yaml --language es ``` If you want a different local model, change: @@ -69,6 +70,7 @@ Example: ```bash 
python -m didactopus.learner_session_demo --config configs/config.openai-compatible.example.yaml +python -m didactopus.learner_session_demo --config configs/config.openai-compatible.example.yaml --language fr ``` ## Option 3: RoleMesh Gateway diff --git a/examples/arena/arena_report.md b/examples/arena/arena_report.md new file mode 100644 index 0000000..8ebe119 --- /dev/null +++ b/examples/arena/arena_report.md @@ -0,0 +1,16 @@ +# Didactopus Arena Report + +- Candidates: 3 + +## Rankings +- `stub-baseline` via `stub` / prompt variant `baseline`: borderline (0.667), language `en` +- `stub-strict-grounding` via `stub` / prompt variant `strict_grounding`: borderline (0.667), language `es` +- `stub-trust-preserving` via `stub` / prompt variant `trust_preserving`: borderline (0.667), language `fr` + +## Human Review Queue +- `stub-baseline`: needs_human_review=True, weak_roles=['mentor', 'evaluator'] +- `stub-strict-grounding`: needs_human_review=True, weak_roles=['mentor', 'evaluator'] +- `stub-trust-preserving`: needs_human_review=True, weak_roles=['mentor', 'evaluator'] + +## LLM Review Summary +[stubbed-response] [mentor] Review these Didactopus arena results for a human reviewer. 
Rank the strongest candidates, identify likely prompt improv \ No newline at end of file diff --git a/examples/arena/arena_results.json b/examples/arena/arena_results.json new file mode 100644 index 0000000..920b358 --- /dev/null +++ b/examples/arena/arena_results.json @@ -0,0 +1,202 @@ +{ + "arena": { + "name": "didactopus-behavior-arena", + "candidate_count": 3 + }, + "ranked_candidates": [ + { + "candidate_name": "stub-baseline", + "config": "configs/config.example.yaml", + "prompt_variant": "baseline", + "language": "en", + "provider": "stub", + "overall_score": 0.667, + "overall_rating": "borderline", + "role_results": [ + { + "role": "mentor", + "provider": "stub", + "model_name": "local-demo", + "prompt_variant": "baseline", + "language": "en", + "latency_ms": 0.027, + "adequacy_score": 0.65, + "adequacy_rating": "borderline", + "response_preview": "[stubbed-response] [mentor] Concept: Independent Reasoning and Careful Comparison Prerequisites: Course Notes and Reference Texts Supporting lessons", + "notes": [ + "Did not ask a focused learner question." + ] + }, + { + "role": "practice", + "provider": "stub", + "model_name": "local-demo", + "prompt_variant": "baseline", + "language": "en", + "latency_ms": 0.006, + "adequacy_score": 1.0, + "adequacy_rating": "adequate", + "response_preview": "[stubbed-response] [practice] Concept: Independent Reasoning and Careful Comparison Prerequisites: Course Notes and Reference Texts Supporting lessons", + "notes": [] + }, + { + "role": "evaluator", + "provider": "stub", + "model_name": "local-demo", + "prompt_variant": "baseline", + "language": "en", + "latency_ms": 0.005, + "adequacy_score": 0.35, + "adequacy_rating": "inadequate", + "response_preview": "[stubbed-response] [evaluator] Concept: Independent Reasoning and Careful Comparison Prerequisites: Course Notes and Reference Texts Supporting lessons", + "notes": [ + "Did not acknowledge learner strengths.", + "Did not provide a concrete next step." 
+ ] + } + ] + }, + { + "candidate_name": "stub-strict-grounding", + "config": "configs/config.example.yaml", + "prompt_variant": "strict_grounding", + "language": "es", + "provider": "stub", + "overall_score": 0.667, + "overall_rating": "borderline", + "role_results": [ + { + "role": "mentor", + "provider": "stub", + "model_name": "local-demo", + "prompt_variant": "strict_grounding", + "language": "es", + "latency_ms": 0.019, + "adequacy_score": 0.65, + "adequacy_rating": "borderline", + "response_preview": "[stubbed-response] [mentor] Concept: Independent Reasoning and Careful Comparison Prerequisites: Course Notes and Reference Texts Supporting lessons", + "notes": [ + "Did not ask a focused learner question." + ] + }, + { + "role": "practice", + "provider": "stub", + "model_name": "local-demo", + "prompt_variant": "strict_grounding", + "language": "es", + "latency_ms": 0.005, + "adequacy_score": 1.0, + "adequacy_rating": "adequate", + "response_preview": "[stubbed-response] [practice] Concept: Independent Reasoning and Careful Comparison Prerequisites: Course Notes and Reference Texts Supporting lessons", + "notes": [] + }, + { + "role": "evaluator", + "provider": "stub", + "model_name": "local-demo", + "prompt_variant": "strict_grounding", + "language": "es", + "latency_ms": 0.004, + "adequacy_score": 0.35, + "adequacy_rating": "inadequate", + "response_preview": "[stubbed-response] [evaluator] Concept: Independent Reasoning and Careful Comparison Prerequisites: Course Notes and Reference Texts Supporting lessons", + "notes": [ + "Did not acknowledge learner strengths.", + "Did not provide a concrete next step." 
+ ] + } + ] + }, + { + "candidate_name": "stub-trust-preserving", + "config": "configs/config.example.yaml", + "prompt_variant": "trust_preserving", + "language": "fr", + "provider": "stub", + "overall_score": 0.667, + "overall_rating": "borderline", + "role_results": [ + { + "role": "mentor", + "provider": "stub", + "model_name": "local-demo", + "prompt_variant": "trust_preserving", + "language": "fr", + "latency_ms": 0.025, + "adequacy_score": 0.65, + "adequacy_rating": "borderline", + "response_preview": "[stubbed-response] [mentor] Concept: Independent Reasoning and Careful Comparison Prerequisites: Course Notes and Reference Texts Supporting lessons", + "notes": [ + "Did not ask a focused learner question." + ] + }, + { + "role": "practice", + "provider": "stub", + "model_name": "local-demo", + "prompt_variant": "trust_preserving", + "language": "fr", + "latency_ms": 0.005, + "adequacy_score": 1.0, + "adequacy_rating": "adequate", + "response_preview": "[stubbed-response] [practice] Concept: Independent Reasoning and Careful Comparison Prerequisites: Course Notes and Reference Texts Supporting lessons", + "notes": [] + }, + { + "role": "evaluator", + "provider": "stub", + "model_name": "local-demo", + "prompt_variant": "trust_preserving", + "language": "fr", + "latency_ms": 0.005, + "adequacy_score": 0.35, + "adequacy_rating": "inadequate", + "response_preview": "[stubbed-response] [evaluator] Concept: Independent Reasoning and Careful Comparison Prerequisites: Course Notes and Reference Texts Supporting lessons", + "notes": [ + "Did not acknowledge learner strengths.", + "Did not provide a concrete next step." 
+ ] + } + ] + } + ], + "review_queue": [ + { + "candidate_name": "stub-baseline", + "overall_rating": "borderline", + "overall_score": 0.667, + "needs_human_review": true, + "weak_roles": [ + "mentor", + "evaluator" + ] + }, + { + "candidate_name": "stub-strict-grounding", + "overall_rating": "borderline", + "overall_score": 0.667, + "needs_human_review": true, + "weak_roles": [ + "mentor", + "evaluator" + ] + }, + { + "candidate_name": "stub-trust-preserving", + "overall_rating": "borderline", + "overall_score": 0.667, + "needs_human_review": true, + "weak_roles": [ + "mentor", + "evaluator" + ] + } + ], + "llm_review": { + "provider": "stub", + "model_name": "local-demo", + "role": "mentor", + "summary": "[stubbed-response] [mentor] Review these Didactopus arena results for a human reviewer. Rank the strongest candidates, identify likely prompt improv" + } +} \ No newline at end of file diff --git a/examples/arena/arena_review_queue.json b/examples/arena/arena_review_queue.json new file mode 100644 index 0000000..9ba3fd5 --- /dev/null +++ b/examples/arena/arena_review_queue.json @@ -0,0 +1,32 @@ +[ + { + "candidate_name": "stub-baseline", + "overall_rating": "borderline", + "overall_score": 0.667, + "needs_human_review": true, + "weak_roles": [ + "mentor", + "evaluator" + ] + }, + { + "candidate_name": "stub-strict-grounding", + "overall_rating": "borderline", + "overall_score": 0.667, + "needs_human_review": true, + "weak_roles": [ + "mentor", + "evaluator" + ] + }, + { + "candidate_name": "stub-trust-preserving", + "overall_rating": "borderline", + "overall_score": 0.667, + "needs_human_review": true, + "weak_roles": [ + "mentor", + "evaluator" + ] + } +] \ No newline at end of file diff --git a/examples/ocw-information-entropy-session-es.html b/examples/ocw-information-entropy-session-es.html new file mode 100644 index 0000000..fdf3ab9 --- /dev/null +++ b/examples/ocw-information-entropy-session-es.html @@ -0,0 +1,118 @@ + + + + + +Didactopus Learner Session + 
+ + + +
+
+

Didactopus Learner Session

+

This page is structured for keyboard and screen-reader use. It presents the learner goal, study plan, grounded source fragments, and conversation turns in reading order.

+

Learner goal: Help me understand how Shannon entropy leads into channel capacity and thermodynamic entropy.

+

Source language: en

+

Output language: es

+
+
+

Study Plan

+
    +
  1. +

    Independent Reasoning and Careful Comparison

    +

    Status: mastered

    +

    Prerequisites: Course Notes and Reference Texts

    +

    Supporting lessons: Independent Reasoning and Careful Comparison

    +

    Grounding fragments:

    +
      +
    • Independent Reasoning and Careful Comparison (lesson_body)
      - Objective: Explain why the course requires precise comparison of related but non-identical concepts. +- Exercise: Write a short note distinguishing Shannon entropy, channel capacity, and thermodynamic entropy. +The syllabus framing implies a style of work where analogy is useful but dangerous when used loosely. Learners must compare models carefully, state assumptions, and notice where similar mathematics does not imply identical interpretation.
    • +
    • Independent Reasoning and Careful Comparison (objective)
      Explain why the course requires precise comparison of related but non-identical concepts.
    • +
    +
  2. +
  3. +

    Thermodynamics and Entropy

    +

    Status: mastered

    +

    Prerequisites: Cryptography and Information Hiding

    +

    Supporting lessons: Thermodynamics and Entropy

    +

    Grounding fragments:

    +
      +
    • Thermodynamics and Entropy (lesson_body)
      - Objective: Explain how thermodynamic entropy relates to, and differs from, Shannon entropy. +- Exercise: Compare the two entropy notions and identify what is preserved across the analogy. +The course uses entropy as a bridge concept between communication theory and physics while insisting on careful interpretation.
    • +
    • Thermodynamics and Entropy (objective)
      Explain how thermodynamic entropy relates to, and differs from, Shannon entropy.
    • +
    +
  4. +
  5. +

    Shannon Entropy

    +

    Status: mastered

    +

    Prerequisites: Counting and Probability

    +

    Supporting lessons: Shannon Entropy

    +

    Grounding fragments:

    +
      +
    • Shannon Entropy (lesson_body)
      - Objective: Explain Shannon entropy as a measure of uncertainty and compare high-entropy and low-entropy sources. +- Exercise: Compute the entropy of a Bernoulli source and interpret the result. +The course then introduces entropy as a quantitative measure of uncertainty for a source model and uses it to reason about representation cost and surprise.
    • +
    • Shannon Entropy (objective)
      Explain Shannon entropy as a measure of uncertainty and compare high-entropy and low-entropy sources.
    • +
    +
  6. +
+
+
+

Conversation

+
+

Learner Goal

+

Role: user

+

Help me understand how Shannon entropy leads into channel capacity and thermodynamic entropy.

+
+
+

Didactopus Mentor

+

Role: assistant

+

[stubbed-response] [mentor] Concept: Independent Reasoning and Careful Comparison Prerequisites: Course Notes and Reference Texts Supporting lessons

+
+
+

Didactopus Practice Designer

+

Role: assistant

+

[stubbed-response] [practice] Concept: Independent Reasoning and Careful Comparison Prerequisites: Course Notes and Reference Texts Supporting lessons

+
+
+

Learner Submission

+

Role: user

+

Entropy measures uncertainty because more possible outcomes require more information to describe, but one limitation is that thermodynamic entropy is not identical to Shannon entropy.

+
+
+

Didactopus Evaluator

+

Role: assistant

+

[stubbed-response] [evaluator] Concept: Independent Reasoning and Careful Comparison Prerequisites: Course Notes and Reference Texts Supporting lessons

+
+
+

Didactopus Mentor

+

Role: assistant

+

[stubbed-response] [mentor] Concept: Independent Reasoning and Careful Comparison Prerequisites: Course Notes and Reference Texts Supporting lessons

+
+
+
+

Evaluation Summary

+

Verdict: needs_revision

+

Aggregated dimensions: {"correctness": 0.6000000000000001, "critique": 0.6499999999999999, "explanation": 0.85}

+

Follow-up: Rework the answer so it states the equality/relationship explicitly and explains why it matters.

+
+
+ + \ No newline at end of file diff --git a/examples/ocw-information-entropy-session-es.json b/examples/ocw-information-entropy-session-es.json new file mode 100644 index 0000000..c3509c1 --- /dev/null +++ b/examples/ocw-information-entropy-session-es.json @@ -0,0 +1,244 @@ +{ + "goal": "Help me understand how Shannon entropy leads into channel capacity and thermodynamic entropy.", + "output_language": "es", + "source_language": "en", + "study_plan": { + "skill": "ocw-information-entropy-agent", + "task": "Help me understand how Shannon entropy leads into channel capacity and thermodynamic entropy.", + "source_language": "en", + "steps": [ + { + "concept_key": "mit-ocw-information-and-entropy::independent-reasoning-and-careful-comparison", + "title": "Independent Reasoning and Careful Comparison", + "status": "mastered", + "prerequisites": [ + "mit-ocw-information-and-entropy::course-notes-and-reference-texts" + ], + "prerequisite_titles": [ + "Course Notes and Reference Texts" + ], + "supporting_lessons": [ + "Independent Reasoning and Careful Comparison" + ], + "source_fragments": [ + { + "lesson_title": "Independent Reasoning and Careful Comparison", + "kind": "lesson_body", + "text": "- Objective: Explain why the course requires precise comparison of related but non-identical concepts.\n- Exercise: Write a short note distinguishing Shannon entropy, channel capacity, and thermodynamic entropy.\nThe syllabus framing implies a style of work where analogy is useful but dangerous when used loosely. Learners must compare models carefully, state assumptions, and notice where similar mathematics does not imply identical interpretation." + }, + { + "lesson_title": "Independent Reasoning and Careful Comparison", + "kind": "objective", + "text": "Explain why the course requires precise comparison of related but non-identical concepts." + } + ], + "recommended_action": "Use Independent Reasoning and Careful Comparison as the primary teaching anchor." 
+ }, + { + "concept_key": "mit-ocw-information-and-entropy::thermodynamics-and-entropy", + "title": "Thermodynamics and Entropy", + "status": "mastered", + "prerequisites": [ + "mit-ocw-information-and-entropy::cryptography-and-information-hiding" + ], + "prerequisite_titles": [ + "Cryptography and Information Hiding" + ], + "supporting_lessons": [ + "Thermodynamics and Entropy" + ], + "source_fragments": [ + { + "lesson_title": "Thermodynamics and Entropy", + "kind": "lesson_body", + "text": "- Objective: Explain how thermodynamic entropy relates to, and differs from, Shannon entropy.\n- Exercise: Compare the two entropy notions and identify what is preserved across the analogy.\nThe course uses entropy as a bridge concept between communication theory and physics while insisting on careful interpretation." + }, + { + "lesson_title": "Thermodynamics and Entropy", + "kind": "objective", + "text": "Explain how thermodynamic entropy relates to, and differs from, Shannon entropy." + } + ], + "recommended_action": "Use Thermodynamics and Entropy as the primary teaching anchor." + }, + { + "concept_key": "mit-ocw-information-and-entropy::shannon-entropy", + "title": "Shannon Entropy", + "status": "mastered", + "prerequisites": [ + "mit-ocw-information-and-entropy::counting-and-probability" + ], + "prerequisite_titles": [ + "Counting and Probability" + ], + "supporting_lessons": [ + "Shannon Entropy" + ], + "source_fragments": [ + { + "lesson_title": "Shannon Entropy", + "kind": "lesson_body", + "text": "- Objective: Explain Shannon entropy as a measure of uncertainty and compare high-entropy and low-entropy sources.\n- Exercise: Compute the entropy of a Bernoulli source and interpret the result.\nThe course then introduces entropy as a quantitative measure of uncertainty for a source model and uses it to reason about representation cost and surprise." 
+ }, + { + "lesson_title": "Shannon Entropy", + "kind": "objective", + "text": "Explain Shannon entropy as a measure of uncertainty and compare high-entropy and low-entropy sources." + } + ], + "recommended_action": "Use Shannon Entropy as the primary teaching anchor." + } + ], + "guided_path_reference": [ + "mit-ocw-information-and-entropy::mit-ocw-6-050j-information-and-entropy-course-home", + "mit-ocw-information-and-entropy::information-and-entropy", + "mit-ocw-information-and-entropy::ultimate-limits-to-communication-and-computation", + "mit-ocw-information-and-entropy::open-textbooks-problem-sets-and-programming-work", + "mit-ocw-information-and-entropy::mit-ocw-6-050j-information-and-entropy-syllabus", + "mit-ocw-information-and-entropy::prerequisites-and-mathematical-background", + "mit-ocw-information-and-entropy::assessment-structure", + "mit-ocw-information-and-entropy::course-notes-and-reference-texts", + "mit-ocw-information-and-entropy::independent-reasoning-and-careful-comparison", + "mit-ocw-information-and-entropy::mit-ocw-6-050j-information-and-entropy-unit-sequence", + "mit-ocw-information-and-entropy::counting-and-probability", + "mit-ocw-information-and-entropy::shannon-entropy", + "mit-ocw-information-and-entropy::mutual-information", + "mit-ocw-information-and-entropy::source-coding-and-compression", + "mit-ocw-information-and-entropy::huffman-coding", + "mit-ocw-information-and-entropy::channel-capacity", + "mit-ocw-information-and-entropy::channel-coding", + "mit-ocw-information-and-entropy::error-correcting-codes", + "mit-ocw-information-and-entropy::cryptography-and-information-hiding", + "mit-ocw-information-and-entropy::thermodynamics-and-entropy" + ] + }, + "primary_concept": { + "concept_key": "mit-ocw-information-and-entropy::independent-reasoning-and-careful-comparison", + "title": "Independent Reasoning and Careful Comparison", + "status": "mastered", + "prerequisites": [ + 
"mit-ocw-information-and-entropy::course-notes-and-reference-texts" + ], + "prerequisite_titles": [ + "Course Notes and Reference Texts" + ], + "supporting_lessons": [ + "Independent Reasoning and Careful Comparison" + ], + "source_fragments": [ + { + "lesson_title": "Independent Reasoning and Careful Comparison", + "kind": "lesson_body", + "text": "- Objective: Explain why the course requires precise comparison of related but non-identical concepts.\n- Exercise: Write a short note distinguishing Shannon entropy, channel capacity, and thermodynamic entropy.\nThe syllabus framing implies a style of work where analogy is useful but dangerous when used loosely. Learners must compare models carefully, state assumptions, and notice where similar mathematics does not imply identical interpretation." + }, + { + "lesson_title": "Independent Reasoning and Careful Comparison", + "kind": "objective", + "text": "Explain why the course requires precise comparison of related but non-identical concepts." + } + ], + "recommended_action": "Use Independent Reasoning and Careful Comparison as the primary teaching anchor." + }, + "secondary_concept": { + "concept_key": "mit-ocw-information-and-entropy::thermodynamics-and-entropy", + "title": "Thermodynamics and Entropy", + "status": "mastered", + "prerequisites": [ + "mit-ocw-information-and-entropy::cryptography-and-information-hiding" + ], + "prerequisite_titles": [ + "Cryptography and Information Hiding" + ], + "supporting_lessons": [ + "Thermodynamics and Entropy" + ], + "source_fragments": [ + { + "lesson_title": "Thermodynamics and Entropy", + "kind": "lesson_body", + "text": "- Objective: Explain how thermodynamic entropy relates to, and differs from, Shannon entropy.\n- Exercise: Compare the two entropy notions and identify what is preserved across the analogy.\nThe course uses entropy as a bridge concept between communication theory and physics while insisting on careful interpretation." 
+ }, + { + "lesson_title": "Thermodynamics and Entropy", + "kind": "objective", + "text": "Explain how thermodynamic entropy relates to, and differs from, Shannon entropy." + } + ], + "recommended_action": "Use Thermodynamics and Entropy as the primary teaching anchor." + }, + "practice_task": "[stubbed-response] [practice] Concept: Independent Reasoning and Careful Comparison Prerequisites: Course Notes and Reference Texts Supporting lessons", + "evaluation": { + "concept_key": "mit-ocw-information-and-entropy::independent-reasoning-and-careful-comparison", + "submission": "Entropy measures uncertainty because more possible outcomes require more information to describe, but one limitation is that thermodynamic entropy is not identical to Shannon entropy.", + "verdict": "needs_revision", + "aggregated": { + "correctness": 0.6000000000000001, + "explanation": 0.85, + "critique": 0.6499999999999999 + }, + "evaluators": [ + { + "name": "rubric", + "dimensions": { + "correctness": 0.8, + "explanation": 0.85 + }, + "notes": "Heuristic scaffold rubric score." + }, + { + "name": "symbolic_rule", + "dimensions": { + "correctness": 0.4 + }, + "notes": "Stub symbolic evaluator." + }, + { + "name": "critique", + "dimensions": { + "critique": 0.6499999999999999 + }, + "notes": "Stub critique evaluator." + } + ], + "skill_reference": { + "skill_name": "ocw-information-entropy-agent", + "mastered_by_demo_agent": true, + "supporting_lessons": [ + "Independent Reasoning and Careful Comparison" + ] + }, + "follow_up": "Rework the answer so it states the equality/relationship explicitly and explains why it matters." + }, + "turns": [ + { + "role": "user", + "label": "Learner Goal", + "content": "Help me understand how Shannon entropy leads into channel capacity and thermodynamic entropy." 
+ }, + { + "role": "assistant", + "label": "Didactopus Mentor", + "content": "[stubbed-response] [mentor] Concept: Independent Reasoning and Careful Comparison Prerequisites: Course Notes and Reference Texts Supporting lessons" + }, + { + "role": "assistant", + "label": "Didactopus Practice Designer", + "content": "[stubbed-response] [practice] Concept: Independent Reasoning and Careful Comparison Prerequisites: Course Notes and Reference Texts Supporting lessons" + }, + { + "role": "user", + "label": "Learner Submission", + "content": "Entropy measures uncertainty because more possible outcomes require more information to describe, but one limitation is that thermodynamic entropy is not identical to Shannon entropy." + }, + { + "role": "assistant", + "label": "Didactopus Evaluator", + "content": "[stubbed-response] [evaluator] Concept: Independent Reasoning and Careful Comparison Prerequisites: Course Notes and Reference Texts Supporting lessons" + }, + { + "role": "assistant", + "label": "Didactopus Mentor", + "content": "[stubbed-response] [mentor] Concept: Independent Reasoning and Careful Comparison Prerequisites: Course Notes and Reference Texts Supporting lessons" + } + ] +} \ No newline at end of file diff --git a/examples/ocw-information-entropy-session-es.txt b/examples/ocw-information-entropy-session-es.txt new file mode 100644 index 0000000..6b45a57 --- /dev/null +++ b/examples/ocw-information-entropy-session-es.txt @@ -0,0 +1,55 @@ +Didactopus Learner Session + +Learner goal: Help me understand how Shannon entropy leads into channel capacity and thermodynamic entropy. +Source language: en +Output language: es + +Study plan: +1. Independent Reasoning and Careful Comparison + Status: mastered + Prerequisites: Course Notes and Reference Texts + Supporting lessons: Independent Reasoning and Careful Comparison + Source fragment (lesson_body): - Objective: Explain why the course requires precise comparison of related but non-identical concepts. 
+- Exercise: Write a short note distinguishing Shannon entropy, channel capacity, and thermodynamic entropy. +The syllabus framing implies a style of work where analogy is useful but dangerous when used loosely. Learners must compare models carefully, state assumptions, and notice where similar mathematics does not imply identical interpretation. + Source fragment (objective): Explain why the course requires precise comparison of related but non-identical concepts. +2. Thermodynamics and Entropy + Status: mastered + Prerequisites: Cryptography and Information Hiding + Supporting lessons: Thermodynamics and Entropy + Source fragment (lesson_body): - Objective: Explain how thermodynamic entropy relates to, and differs from, Shannon entropy. +- Exercise: Compare the two entropy notions and identify what is preserved across the analogy. +The course uses entropy as a bridge concept between communication theory and physics while insisting on careful interpretation. + Source fragment (objective): Explain how thermodynamic entropy relates to, and differs from, Shannon entropy. +3. Shannon Entropy + Status: mastered + Prerequisites: Counting and Probability + Supporting lessons: Shannon Entropy + Source fragment (lesson_body): - Objective: Explain Shannon entropy as a measure of uncertainty and compare high-entropy and low-entropy sources. +- Exercise: Compute the entropy of a Bernoulli source and interpret the result. +The course then introduces entropy as a quantitative measure of uncertainty for a source model and uses it to reason about representation cost and surprise. + Source fragment (objective): Explain Shannon entropy as a measure of uncertainty and compare high-entropy and low-entropy sources. + +Conversation: +Learner Goal: +Help me understand how Shannon entropy leads into channel capacity and thermodynamic entropy. 
+ +Didactopus Mentor: +[stubbed-response] [mentor] Concept: Independent Reasoning and Careful Comparison Prerequisites: Course Notes and Reference Texts Supporting lessons + +Didactopus Practice Designer: +[stubbed-response] [practice] Concept: Independent Reasoning and Careful Comparison Prerequisites: Course Notes and Reference Texts Supporting lessons + +Learner Submission: +Entropy measures uncertainty because more possible outcomes require more information to describe, but one limitation is that thermodynamic entropy is not identical to Shannon entropy. + +Didactopus Evaluator: +[stubbed-response] [evaluator] Concept: Independent Reasoning and Careful Comparison Prerequisites: Course Notes and Reference Texts Supporting lessons + +Didactopus Mentor: +[stubbed-response] [mentor] Concept: Independent Reasoning and Careful Comparison Prerequisites: Course Notes and Reference Texts Supporting lessons + +Evaluation summary: +Verdict: needs_revision +Aggregated dimensions: {"correctness": 0.6000000000000001, "critique": 0.6499999999999999, "explanation": 0.85} +Follow-up: Rework the answer so it states the equality/relationship explicitly and explains why it matters. 
diff --git a/src/didactopus/arena.py b/src/didactopus/arena.py new file mode 100644 index 0000000..e69195c --- /dev/null +++ b/src/didactopus/arena.py @@ -0,0 +1,262 @@ +from __future__ import annotations + +import json +from pathlib import Path +from time import perf_counter + +import yaml + +from .config import load_config +from .language_support import response_language_instruction +from .learner_session import _grounding_block +from .model_bench import _adequacy_rating, _score_evaluator_response, _score_mentor_response, _score_practice_response +from .model_provider import ModelProvider +from .ocw_skill_agent_demo import build_skill_grounded_study_plan, evaluate_submission_with_skill, load_ocw_skill_context +from .role_prompts import system_prompt_for_role_variant + + +def _default_arena_spec() -> dict: + return { + "candidates": [ + {"name": "stub-baseline", "config": "configs/config.example.yaml", "prompt_variant": "baseline", "language": "en"}, + {"name": "stub-strict-grounding", "config": "configs/config.example.yaml", "prompt_variant": "strict_grounding", "language": "en"}, + {"name": "stub-trust-preserving", "config": "configs/config.example.yaml", "prompt_variant": "trust_preserving", "language": "en"}, + ], + "review": { + "enabled": True, + "config": "configs/config.example.yaml", + "role": "mentor", + }, + } + + +def load_arena_spec(path: str | Path | None) -> dict: + if path is None: + return _default_arena_spec() + data = yaml.safe_load(Path(path).read_text(encoding="utf-8")) or {} + if "candidates" not in data: + data["candidates"] = _default_arena_spec()["candidates"] + return data + + +def _arena_tasks(context) -> tuple[list[dict], str, dict]: + study_plan = build_skill_grounded_study_plan( + context, + "Help a learner connect Shannon entropy, channel capacity, and thermodynamic entropy.", + ) + steps = study_plan.get("steps", []) + if len(steps) < 2: + raise ValueError("Arena requires at least two grounded study-plan steps.") + 
learner_submission = ( + "Entropy measures uncertainty because more possible outcomes require more information to describe, " + "but thermodynamic entropy is not identical to Shannon entropy without careful interpretation." + ) + deterministic = evaluate_submission_with_skill( + context, + steps[0]["concept_key"].split("::", 1)[-1], + learner_submission, + ) + return steps[:2], learner_submission, deterministic + + +def _arena_role_prompts(primary: dict, secondary: dict, learner_submission: str, deterministic: dict) -> dict[str, str]: + return { + "mentor": ( + f"{_grounding_block(primary)}\n\n" + f"{_grounding_block(secondary)}\n\n" + "Give a grounded mentor response that orients the learner, explains the sequence, and asks one focused question." + ), + "practice": ( + f"{_grounding_block(primary)}\n\n" + "Create one reasoning-heavy practice task. Keep it grounded and do not provide the full solution." + ), + "evaluator": ( + f"{_grounding_block(primary)}\n\n" + f"Learner submission: {learner_submission}\n" + f"Deterministic evaluator result: verdict={deterministic['verdict']}, aggregated={deterministic['aggregated']}\n" + "Respond as evaluator. Acknowledge what the learner already did correctly, preserve existing caveats, and give one next revision target." 
+ ), + } + + +def _scorer_for_role(role: str): + return { + "mentor": _score_mentor_response, + "practice": _score_practice_response, + "evaluator": _score_evaluator_response, + }[role] + + +def _run_candidate(candidate: dict, skill_dir: str | Path) -> dict: + config = load_config(candidate["config"]) + provider = ModelProvider(config.model_provider) + context = load_ocw_skill_context(skill_dir) + steps, learner_submission, deterministic = _arena_tasks(context) + primary, secondary = steps + prompts = _arena_role_prompts(primary, secondary, learner_submission, deterministic) + variant = candidate.get("prompt_variant", "baseline") + language = candidate.get("language", "en") + + role_results = [] + overall = 0.0 + for role, prompt in prompts.items(): + started = perf_counter() + response = provider.generate( + f"{prompt}{response_language_instruction(language, 'en')}", + role=role, + system_prompt=system_prompt_for_role_variant(role, variant), + temperature=0.2, + max_tokens=220, + ) + elapsed_ms = round((perf_counter() - started) * 1000.0, 3) + score, notes = _scorer_for_role(role)(response.text) + overall += score + role_results.append( + { + "role": role, + "provider": response.provider, + "model_name": response.model_name, + "prompt_variant": variant, + "language": language, + "latency_ms": elapsed_ms, + "adequacy_score": round(score, 3), + "adequacy_rating": _adequacy_rating(score), + "response_preview": response.text[:280], + "notes": notes, + } + ) + + overall /= len(role_results) + return { + "candidate_name": candidate["name"], + "config": candidate["config"], + "prompt_variant": variant, + "language": language, + "provider": config.model_provider.provider, + "overall_score": round(overall, 3), + "overall_rating": _adequacy_rating(overall), + "role_results": role_results, + } + + +def _build_review_queue(candidate_results: list[dict]) -> list[dict]: + queue = [] + for result in candidate_results: + weak_roles = [item["role"] for item in 
result["role_results"] if item["adequacy_rating"] != "adequate"] + queue.append( + { + "candidate_name": result["candidate_name"], + "overall_rating": result["overall_rating"], + "overall_score": result["overall_score"], + "needs_human_review": result["overall_rating"] != "adequate" or bool(weak_roles), + "weak_roles": weak_roles, + } + ) + return queue + + +def _llm_review_summary(candidate_results: list[dict], spec: dict) -> dict | None: + review_spec = spec.get("review", {}) + if not review_spec.get("enabled", False): + return None + config = load_config(review_spec.get("config", "configs/config.example.yaml")) + provider = ModelProvider(config.model_provider) + ranked = sorted(candidate_results, key=lambda item: item["overall_score"], reverse=True) + summary_lines = [] + for result in ranked[:3]: + summary_lines.append( + f"- {result['candidate_name']}: overall {result['overall_rating']} ({result['overall_score']}), " + f"language={result['language']}, roles={[(item['role'], item['adequacy_rating'], item['adequacy_score']) for item in result['role_results']]}" + ) + prompt = "\n".join( + [ + "Review these Didactopus arena results for a human reviewer.", + "Rank the strongest candidates, identify likely prompt improvements, and state uncertainty clearly.", + "Do not claim that any candidate is fully validated. 
Treat this as initial review support only.", + "", + "Arena results:", + *summary_lines, + ] + ) + role = review_spec.get("role", "mentor") + response = provider.generate( + prompt, + role=role, + system_prompt=system_prompt_for_role_variant(role, "trust_preserving"), + temperature=0.2, + max_tokens=260, + ) + return { + "provider": response.provider, + "model_name": response.model_name, + "role": role, + "summary": response.text, + } + + +def run_didactopus_arena( + *, + arena_spec_path: str | Path | None, + skill_dir: str | Path, + out_dir: str | Path, +) -> dict: + spec = load_arena_spec(arena_spec_path) + candidate_results = [_run_candidate(candidate, skill_dir) for candidate in spec.get("candidates", [])] + ranked = sorted(candidate_results, key=lambda item: item["overall_score"], reverse=True) + payload = { + "arena": { + "name": "didactopus-behavior-arena", + "candidate_count": len(candidate_results), + }, + "ranked_candidates": ranked, + "review_queue": _build_review_queue(ranked), + "llm_review": _llm_review_summary(ranked, spec), + } + + out_dir = Path(out_dir) + out_dir.mkdir(parents=True, exist_ok=True) + (out_dir / "arena_results.json").write_text(json.dumps(payload, indent=2), encoding="utf-8") + (out_dir / "arena_review_queue.json").write_text(json.dumps(payload["review_queue"], indent=2), encoding="utf-8") + + lines = [ + "# Didactopus Arena Report", + "", + f"- Candidates: {payload['arena']['candidate_count']}", + "", + "## Rankings", + ] + for result in ranked: + lines.append( + f"- `{result['candidate_name']}` via `{result['provider']}` / prompt variant `{result['prompt_variant']}`: " + f"{result['overall_rating']} ({result['overall_score']}), language `{result['language']}`" + ) + lines.extend(["", "## Human Review Queue"]) + for item in payload["review_queue"]: + lines.append( + f"- `{item['candidate_name']}`: needs_human_review={item['needs_human_review']}, weak_roles={item['weak_roles']}" + ) + if payload["llm_review"] is not None: + 
lines.extend(["", "## LLM Review Summary", payload["llm_review"]["summary"]]) + (out_dir / "arena_report.md").write_text("\n".join(lines), encoding="utf-8") + return payload + + +def main() -> None: + import argparse + + root = Path(__file__).resolve().parents[2] + parser = argparse.ArgumentParser(description="Run the Didactopus model-and-prompt arena.") + parser.add_argument("--arena-spec", default=None) + parser.add_argument("--skill-dir", default=str(root / "skills" / "ocw-information-entropy-agent")) + parser.add_argument("--out-dir", default=str(root / "examples" / "arena")) + args = parser.parse_args() + payload = run_didactopus_arena( + arena_spec_path=args.arena_spec, + skill_dir=args.skill_dir, + out_dir=args.out_dir, + ) + print(json.dumps(payload, indent=2)) + + +if __name__ == "__main__": + main() diff --git a/src/didactopus/language_support.py b/src/didactopus/language_support.py new file mode 100644 index 0000000..8ae297a --- /dev/null +++ b/src/didactopus/language_support.py @@ -0,0 +1,28 @@ +from __future__ import annotations + + +LANGUAGE_LABELS = { + "en": "English", + "es": "Spanish", + "fr": "French", + "de": "German", + "it": "Italian", + "pt": "Portuguese", + "ar": "Arabic", + "sw": "Swahili", + "zh": "Chinese", + "ja": "Japanese", +} + + +def language_label(language: str) -> str: + return LANGUAGE_LABELS.get(language, language) + + +def response_language_instruction(language: str, source_language: str = "en") -> str: + if language == source_language: + return "" + return ( + f" Respond in {language_label(language)}. Preserve key source-grounded concepts and caveats faithfully, " + f"and make clear when you are explaining material whose source language is {language_label(source_language)}." 
+    )
diff --git a/src/didactopus/learner_accessibility.py b/src/didactopus/learner_accessibility.py
index 4bdce13..9d175b5 100644
--- a/src/didactopus/learner_accessibility.py
+++ b/src/didactopus/learner_accessibility.py
@@ -14,6 +14,8 @@ def build_accessible_session_text(session: dict) -> str:
         "Didactopus Learner Session",
         "",
         f"Learner goal: {session.get('goal', '')}",
+        f"Source language: {session.get('source_language', 'en')}",
+        f"Output language: {session.get('output_language', 'en')}",
         "",
         "Study plan:",
     ]
@@ -88,6 +90,8 @@ def build_accessible_session_html(session: dict) -> str:
         '<h1>Didactopus Learner Session</h1>',
         '<p>This page is structured for keyboard and screen-reader use. It presents the learner goal, study plan, grounded source fragments, and conversation turns in reading order.</p>',
         f"<p>Learner goal: {_escape(session.get('goal', ''))}</p>",
+        f"<p>Source language: {_escape(session.get('source_language', 'en'))}</p>",
+        f"<p>Output language: {_escape(session.get('output_language', 'en'))}</p>",
         "",
         '
', '

Study Plan

', diff --git a/src/didactopus/learner_session.py b/src/didactopus/learner_session.py index ae96a30..af0a7ac 100644 --- a/src/didactopus/learner_session.py +++ b/src/didactopus/learner_session.py @@ -2,6 +2,7 @@ from __future__ import annotations from dataclasses import dataclass +from .language_support import response_language_instruction from .model_provider import ModelProvider from .ocw_skill_agent_demo import ( SkillContext, @@ -31,11 +32,13 @@ def _generate_role_text( *, role: str, prompt: str, + language: str = "en", + source_language: str = "en", temperature: float = 0.2, max_tokens: int = 220, ) -> str: return provider.generate( - prompt, + f"{prompt}{response_language_instruction(language, source_language)}", role=role, system_prompt=system_prompt_for_role(role), temperature=temperature, @@ -55,6 +58,8 @@ def build_graph_grounded_session( provider: ModelProvider, learner_goal: str, learner_submission: str, + language: str = "en", + source_language: str = "en", ) -> dict: study_plan = build_skill_grounded_study_plan(context, learner_goal) steps = study_plan.get("steps", []) @@ -74,6 +79,8 @@ def build_graph_grounded_session( provider, role="mentor", prompt=mentor_prompt, + language=language, + source_language=source_language, temperature=0.2, max_tokens=260, ) @@ -87,6 +94,8 @@ def build_graph_grounded_session( provider, role="practice", prompt=practice_prompt, + language=language, + source_language=source_language, temperature=0.3, max_tokens=220, ) @@ -103,6 +112,8 @@ def build_graph_grounded_session( provider, role="evaluator", prompt=evaluator_prompt, + language=language, + source_language=source_language, temperature=0.2, max_tokens=240, ) @@ -117,6 +128,8 @@ def build_graph_grounded_session( provider, role="mentor", prompt=next_step_prompt, + language=language, + source_language=source_language, temperature=0.2, max_tokens=220, ) @@ -132,6 +145,8 @@ def build_graph_grounded_session( return { "goal": learner_goal, + "output_language": language, + 
"source_language": source_language, "study_plan": study_plan, "primary_concept": primary, "secondary_concept": secondary, diff --git a/src/didactopus/learner_session_demo.py b/src/didactopus/learner_session_demo.py index 832bef1..0327a22 100644 --- a/src/didactopus/learner_session_demo.py +++ b/src/didactopus/learner_session_demo.py @@ -16,6 +16,7 @@ def run_learner_session_demo( out_path: str | Path | None = None, accessible_html_path: str | Path | None = None, accessible_text_path: str | Path | None = None, + language: str = "en", ) -> dict: config = load_config(config_path) provider = ModelProvider(config.model_provider) @@ -25,6 +26,8 @@ def run_learner_session_demo( provider=provider, learner_goal="Help me understand how Shannon entropy leads into channel capacity and thermodynamic entropy.", learner_submission="Entropy measures uncertainty because more possible outcomes require more information to describe, but one limitation is that thermodynamic entropy is not identical to Shannon entropy.", + language=language, + source_language="en", ) if out_path is not None: out_path = Path(out_path) @@ -45,6 +48,7 @@ def main() -> None: parser.add_argument("--out", default=str(root / "examples" / "ocw-information-entropy-session.json")) parser.add_argument("--accessible-html", default=None) parser.add_argument("--accessible-text", default=None) + parser.add_argument("--language", default="en") args = parser.parse_args() payload = run_learner_session_demo( args.config, @@ -52,6 +56,7 @@ def main() -> None: args.out, args.accessible_html, args.accessible_text, + args.language, ) print(json.dumps(payload, indent=2)) diff --git a/src/didactopus/model_bench.py b/src/didactopus/model_bench.py index 24228ec..654ba80 100644 --- a/src/didactopus/model_bench.py +++ b/src/didactopus/model_bench.py @@ -5,6 +5,7 @@ from pathlib import Path from time import perf_counter from .config import load_config +from .language_support import response_language_instruction from .learner_session 
import _grounding_block from .model_provider import ModelProvider from .ocw_skill_agent_demo import build_skill_grounded_study_plan, evaluate_submission_with_skill, load_ocw_skill_context @@ -100,6 +101,7 @@ def run_model_benchmark( hardware_cpu: str = "unknown", hardware_ram_gb: float | None = None, hardware_notes: str | None = None, + language: str = "en", ) -> dict: config = load_config(config_path) provider = ModelProvider(config.model_provider) @@ -153,7 +155,7 @@ def run_model_benchmark( for role, prompt in prompts.items(): started = perf_counter() response = provider.generate( - prompt, + f"{prompt}{response_language_instruction(language, 'en')}", role=role, system_prompt=system_prompt_for_role(role), temperature=0.2, @@ -193,6 +195,8 @@ def run_model_benchmark( "study_plan_task": study_plan["task"], "primary_concept": primary["title"], "secondary_concept": secondary["title"], + "source_language": "en", + "output_language": language, }, "role_results": role_results, "summary": { @@ -248,6 +252,7 @@ def main() -> None: parser.add_argument("--hardware-cpu", default="unknown") parser.add_argument("--hardware-ram-gb", type=float, default=None) parser.add_argument("--hardware-notes", default="") + parser.add_argument("--language", default="en") args = parser.parse_args() payload = run_model_benchmark( config_path=args.config, @@ -257,6 +262,7 @@ def main() -> None: hardware_cpu=args.hardware_cpu, hardware_ram_gb=args.hardware_ram_gb, hardware_notes=args.hardware_notes, + language=args.language, ) print(json.dumps(payload, indent=2)) diff --git a/src/didactopus/ocw_skill_agent_demo.py b/src/didactopus/ocw_skill_agent_demo.py index c797fe9..7def545 100644 --- a/src/didactopus/ocw_skill_agent_demo.py +++ b/src/didactopus/ocw_skill_agent_demo.py @@ -116,6 +116,7 @@ def build_skill_grounded_study_plan(context: SkillContext, target_task: str) -> return { "skill": context.skill_name, "task": target_task, + "source_language": "en", "steps": steps, 
"guided_path_reference": list(context.run_summary.get("curriculum_path", [])), } @@ -193,7 +194,7 @@ def evaluate_submission_with_skill(context: SkillContext, concept_id: str, submi } -def run_ocw_skill_agent_demo(skill_dir: str | Path, out_dir: str | Path) -> dict: +def run_ocw_skill_agent_demo(skill_dir: str | Path, out_dir: str | Path, language: str = "en") -> dict: context = load_ocw_skill_context(skill_dir) out_dir = Path(out_dir) out_dir.mkdir(parents=True, exist_ok=True) @@ -214,6 +215,8 @@ def run_ocw_skill_agent_demo(skill_dir: str | Path, out_dir: str | Path) -> dict "name": context.skill_name, "description": context.skill_description, }, + "source_language": "en", + "output_language": language, "study_plan": study_plan, "explanation": explanation, "evaluation": evaluation, @@ -225,6 +228,8 @@ def run_ocw_skill_agent_demo(skill_dir: str | Path, out_dir: str | Path) -> dict "", f"- Skill: `{context.skill_name}`", f"- Description: {context.skill_description}", + f"- Source language: en", + f"- Output language: {language}", "", "## Study Plan", ] @@ -260,8 +265,9 @@ def main() -> None: "--out-dir", default=str(root / "examples" / "ocw-information-entropy-skill-demo"), ) + parser.add_argument("--language", default="en") args = parser.parse_args() - payload = run_ocw_skill_agent_demo(args.skill_dir, args.out_dir) + payload = run_ocw_skill_agent_demo(args.skill_dir, args.out_dir, args.language) print(json.dumps(payload, indent=2)) diff --git a/src/didactopus/role_prompts.py b/src/didactopus/role_prompts.py index 0269f58..775780b 100644 --- a/src/didactopus/role_prompts.py +++ b/src/didactopus/role_prompts.py @@ -1,6 +1,37 @@ from __future__ import annotations +def _variant_suffix(role: str, variant: str) -> str: + variant_map = { + "baseline": "", + "strict_grounding": { + "mentor": " Ground every major claim in the supplied concept structure or source fragments, and say when you are inferring beyond them.", + "practice": " Keep every exercise tightly tied to 
the supplied grounded material and avoid introducing outside topic drift.", + "learner": " Keep the reflection tied to the supplied grounded material and avoid importing outside claims unless you mark them as inference.", + "project_advisor": " Keep project suggestions anchored to the supplied grounded material and state assumptions explicitly.", + "evaluator": " Quote or paraphrase the learner text before judging gaps, and distinguish grounded criticism from inference.", + }, + "trust_preserving": { + "mentor": " Be especially careful to preserve learner trust: acknowledge what is already correct before redirecting, and avoid overstating errors.", + "practice": " Prefer clear, calm task framing that emphasizes exploration over performance pressure.", + "learner": " Preserve an honest, effortful learner voice and explicitly note uncertainty without collapsing into self-dismissal.", + "project_advisor": " Emphasize realistic next steps and avoid grandiose scope.", + "evaluator": " Preserve learner trust by naming strengths first, avoiding invented omissions, and framing revisions as specific improvements rather than blanket criticism.", + }, + "concise": { + "mentor": " Keep the response compact: no more than four short paragraphs or bullets worth of content.", + "practice": " Keep the task compact and direct.", + "learner": " Keep the reflection short and direct.", + "project_advisor": " Keep the advice short and concrete.", + "evaluator": " Keep the evaluation compact and specific.", + }, + } + entry = variant_map.get(variant, "") + if isinstance(entry, str): + return entry + return entry.get(role, "") + + def mentor_system_prompt() -> str: return ( "You are Didactopus in mentor mode. Help the learner think through the topic without doing the work for them. 
" @@ -53,3 +84,9 @@ def system_prompt_for_role(role: str) -> str: if factory is None: raise KeyError(f"Unknown Didactopus role: {role}") return factory() + + +def system_prompt_for_role_variant(role: str, variant: str = "baseline") -> str: + base = system_prompt_for_role(role) + suffix = _variant_suffix(role, variant) + return f"{base}{suffix}" if suffix else base diff --git a/tests/test_arena.py b/tests/test_arena.py new file mode 100644 index 0000000..cb5feaa --- /dev/null +++ b/tests/test_arena.py @@ -0,0 +1,38 @@ +import json +from pathlib import Path + +from didactopus.arena import load_arena_spec, run_didactopus_arena +from didactopus.role_prompts import system_prompt_for_role_variant + + +def test_system_prompt_for_role_variant_changes_prompt() -> None: + baseline = system_prompt_for_role_variant("mentor", "baseline") + strict = system_prompt_for_role_variant("mentor", "strict_grounding") + trust = system_prompt_for_role_variant("evaluator", "trust_preserving") + assert baseline != strict + assert "supplied concept structure" in strict + assert "preserve learner trust" in trust.lower() + + +def test_load_arena_spec_reads_candidates() -> None: + spec = load_arena_spec("configs/arena.example.yaml") + assert len(spec["candidates"]) == 3 + assert spec["review"]["enabled"] is True + assert {candidate["language"] for candidate in spec["candidates"]} == {"en", "es", "fr"} + + +def test_run_didactopus_arena_writes_outputs(tmp_path: Path) -> None: + payload = run_didactopus_arena( + arena_spec_path="configs/arena.example.yaml", + skill_dir="skills/ocw-information-entropy-agent", + out_dir=tmp_path, + ) + assert payload["arena"]["name"] == "didactopus-behavior-arena" + assert len(payload["ranked_candidates"]) == 3 + assert (tmp_path / "arena_results.json").exists() + assert (tmp_path / "arena_review_queue.json").exists() + assert (tmp_path / "arena_report.md").exists() + queue = json.loads((tmp_path / "arena_review_queue.json").read_text(encoding="utf-8")) + assert 
queue + assert payload["ranked_candidates"][0]["language"] in {"en", "es", "fr"} + assert "LLM Review Summary" in (tmp_path / "arena_report.md").read_text(encoding="utf-8") diff --git a/tests/test_learner_accessibility.py b/tests/test_learner_accessibility.py index 7259343..329f5c3 100644 --- a/tests/test_learner_accessibility.py +++ b/tests/test_learner_accessibility.py @@ -28,6 +28,8 @@ def test_accessible_session_html_has_landmarks() -> None: def test_accessible_session_text_is_linearized() -> None: text = build_accessible_session_text(_session_payload()) assert "Learner goal:" in text + assert "Source language:" in text + assert "Output language:" in text assert "Study plan:" in text assert "Conversation:" in text assert "Evaluation summary:" in text diff --git a/tests/test_learner_session.py b/tests/test_learner_session.py index 49c4ffc..3036a27 100644 --- a/tests/test_learner_session.py +++ b/tests/test_learner_session.py @@ -17,12 +17,14 @@ def test_build_graph_grounded_session_uses_grounded_steps() -> None: provider=provider, learner_goal="Help me connect Shannon entropy and channel capacity.", learner_submission="Entropy measures uncertainty because unlikely outcomes carry more information, but one limitation is that idealized source models may not match physical systems.", + language="es", ) assert payload["study_plan"]["steps"] assert payload["primary_concept"]["supporting_lessons"] assert payload["evaluation"]["verdict"] in {"acceptable", "needs_revision"} assert len(payload["turns"]) == 6 + assert payload["output_language"] == "es" assert any("Grounding fragments" in turn["content"] or "Concept:" in turn["content"] for turn in payload["turns"]) @@ -39,3 +41,4 @@ def test_run_learner_session_demo_writes_output(tmp_path: Path) -> None: assert (tmp_path / "session.txt").exists() assert payload["practice_task"] assert payload["evaluation"]["aggregated"] + assert payload["output_language"] == "en" diff --git a/tests/test_model_bench.py 
b/tests/test_model_bench.py index 3563eb6..9064506 100644 --- a/tests/test_model_bench.py +++ b/tests/test_model_bench.py @@ -19,6 +19,7 @@ def test_run_model_benchmark_writes_reports(tmp_path) -> None: assert len(payload["role_results"]) == 3 assert {result["role"] for result in payload["role_results"]} == {"mentor", "practice", "evaluator"} assert payload["summary"]["overall_adequacy_rating"] in {"adequate", "borderline", "inadequate"} + assert payload["context"]["output_language"] == "en" json_path = tmp_path / "model_benchmark.json" md_path = tmp_path / "model_benchmark.md" diff --git a/tests/test_ocw_skill_agent_demo.py b/tests/test_ocw_skill_agent_demo.py index a724960..2f4248c 100644 --- a/tests/test_ocw_skill_agent_demo.py +++ b/tests/test_ocw_skill_agent_demo.py @@ -21,6 +21,8 @@ def test_run_ocw_skill_agent_demo(tmp_path: Path) -> None: assert "grounding" in payload["explanation"] assert payload["explanation"]["grounding"]["supporting_lessons"] assert payload["evaluation"]["verdict"] in {"acceptable", "needs_revision"} + assert payload["output_language"] == "en" + assert payload["source_language"] == "en" def test_skill_demo_flags_weak_submission() -> None: