Didactopus/tests/test_arena.py

39 lines
1.7 KiB
Python

import json
from pathlib import Path
from didactopus.arena import load_arena_spec, run_didactopus_arena
from didactopus.role_prompts import system_prompt_for_role_variant
def test_system_prompt_for_role_variant_changes_prompt() -> None:
    """Verify that selecting a role variant alters the generated system prompt.

    Builds the mentor prompt for the ``baseline`` and ``strict_grounding``
    variants and the evaluator prompt for ``trust_preserving``, then checks
    that the strict prompt differs from the baseline and that each variant
    prompt contains its expected phrase.
    """
    mentor_baseline = system_prompt_for_role_variant("mentor", "baseline")
    mentor_strict = system_prompt_for_role_variant("mentor", "strict_grounding")
    evaluator_trust = system_prompt_for_role_variant("evaluator", "trust_preserving")

    # The variant must actually change the text, not just return the baseline.
    assert mentor_baseline != mentor_strict
    assert "supplied concept structure" in mentor_strict
    # The trust phrase is matched case-insensitively.
    assert "preserve learner trust" in evaluator_trust.lower()
def test_load_arena_spec_reads_candidates() -> None:
    """Verify the example arena spec loads with the expected candidates.

    Expects three candidates, an enabled review section, and candidate
    languages covering exactly ``en``, ``es`` and ``fr``.
    """
    arena_spec = load_arena_spec("configs/arena.example.yaml")

    assert len(arena_spec["candidates"]) == 3
    assert arena_spec["review"]["enabled"] is True

    # Collect languages into a set so candidate ordering does not matter.
    candidate_languages = {entry["language"] for entry in arena_spec["candidates"]}
    assert candidate_languages == {"en", "es", "fr"}
def test_run_didactopus_arena_writes_outputs(tmp_path: Path) -> None:
    """Verify an arena run returns a ranked payload and writes its artifacts.

    Runs the arena against the example spec with ``tmp_path`` as the output
    directory, then checks the returned payload and the three files expected
    in that directory: the results JSON, the review queue JSON, and the
    Markdown report.

    Args:
        tmp_path: Directory passed to the arena run as ``out_dir``.
    """
    result = run_didactopus_arena(
        arena_spec_path="configs/arena.example.yaml",
        skill_dir="skills/ocw-information-entropy-agent",
        out_dir=tmp_path,
    )

    # Name each expected artifact once so the checks below stay readable.
    results_file = tmp_path / "arena_results.json"
    review_queue_file = tmp_path / "arena_review_queue.json"
    report_file = tmp_path / "arena_report.md"

    assert result["arena"]["name"] == "didactopus-behavior-arena"
    assert len(result["ranked_candidates"]) == 3

    assert results_file.exists()
    assert review_queue_file.exists()
    assert report_file.exists()

    # The review queue must parse as JSON and be non-empty.
    review_queue = json.loads(review_queue_file.read_text(encoding="utf-8"))
    assert review_queue

    top_candidate = result["ranked_candidates"][0]
    assert top_candidate["language"] in {"en", "es", "fr"}

    report_text = report_file.read_text(encoding="utf-8")
    assert "LLM Review Summary" in report_text