Didactopus/examples/model-benchmark-es/model_benchmark.json

152 lines
6.6 KiB
JSON

{
"benchmark": {
"name": "didactopus-local-model-adequacy",
"task_family": "graph-grounded-mentor-loop",
"provider": "stub",
"hardware_profile": {
"profile_name": "unspecified-local",
"cpu": "unknown",
"ram_gb": null,
"notes": ""
}
},
"context": {
"skill_name": "ocw-information-entropy-agent",
"study_plan_task": "Help a learner connect Shannon entropy, channel capacity, and thermodynamic entropy.",
"primary_concept": "Independent Reasoning and Careful Comparison",
"secondary_concept": "Thermodynamics and Entropy",
"source_language": "en",
"output_language": "es"
},
"role_results": [
{
"role": "mentor",
"provider": "stub",
"model_name": "local-demo",
"latency_ms": 0.025,
"response_preview": "[stubbed-response] [mentor] Concept: Independent Reasoning and Careful Comparison Prerequisites: Course Notes and Reference Texts Supporting lessons",
"adequacy_score": 0.52,
"adequacy_rating": "inadequate",
"grounded_score": 0.65,
"multilingual_score": 0.0,
"round_trip": {
"warnings": [
"Round-trip translation did not preserve source phrase 'entropia'.",
"Round-trip translation did not preserve source phrase 'capacidad del canal'.",
"Round-trip translation did not preserve source phrase 'entropia termodinamica'.",
"Round-trip translation did not preserve source phrase 'no es identica'."
],
"summary": {
"source_phrase_count": 4,
"round_trip_warning_count": 4,
"drifted_phrases": [
"entropia",
"capacidad del canal",
"entropia termodinamica",
"no es identica"
]
}
},
"notes": [
"Did not ask a focused learner question.",
"Response does not appear to be in Spanish.",
"Missing required multilingual term 'shannon-entropy' for language 'es'.",
"Missing required multilingual term 'channel-capacity' for language 'es'.",
"Missing required multilingual term 'thermodynamic-entropy' for language 'es'.",
"Missing required multilingual caveat 'shannon-vs-thermo-not-identical' for language 'es'.",
"Did not visibly preserve a key grounded concept term in multilingual output.",
"Round-trip translation did not preserve source phrase 'entropia'.",
"Round-trip translation did not preserve source phrase 'capacidad del canal'.",
"Round-trip translation did not preserve source phrase 'entropia termodinamica'.",
"Round-trip translation did not preserve source phrase 'no es identica'."
]
},
{
"role": "practice",
"provider": "stub",
"model_name": "local-demo",
"latency_ms": 0.004,
"response_preview": "[stubbed-response] [practice] Concept: Independent Reasoning and Careful Comparison Prerequisites: Course Notes and Reference Texts Supporting lessons",
"adequacy_score": 0.82,
"adequacy_rating": "adequate",
"grounded_score": 1.0,
"multilingual_score": 0.1,
"round_trip": {
"warnings": [
"Round-trip translation did not preserve source phrase 'entropia'.",
"Round-trip translation did not preserve source phrase 'capacidad del canal'.",
"Round-trip translation did not preserve source phrase 'entropia termodinamica'.",
"Round-trip translation did not preserve source phrase 'no es identica'."
],
"summary": {
"source_phrase_count": 4,
"round_trip_warning_count": 4,
"drifted_phrases": [
"entropia",
"capacidad del canal",
"entropia termodinamica",
"no es identica"
]
}
},
"notes": [
"Response does not appear to be in Spanish.",
"Missing required multilingual term 'shannon-entropy' for language 'es'.",
"Missing required multilingual term 'channel-capacity' for language 'es'.",
"Missing required multilingual term 'thermodynamic-entropy' for language 'es'.",
"Missing required multilingual caveat 'shannon-vs-thermo-not-identical' for language 'es'.",
"Round-trip translation did not preserve source phrase 'entropia'.",
"Round-trip translation did not preserve source phrase 'capacidad del canal'.",
"Round-trip translation did not preserve source phrase 'entropia termodinamica'.",
"Round-trip translation did not preserve source phrase 'no es identica'."
]
},
{
"role": "evaluator",
"provider": "stub",
"model_name": "local-demo",
"latency_ms": 0.004,
"response_preview": "[stubbed-response] [evaluator] Concept: Independent Reasoning and Careful Comparison Prerequisites: Course Notes and Reference Texts Supporting lessons",
"adequacy_score": 0.3,
"adequacy_rating": "inadequate",
"grounded_score": 0.35,
"multilingual_score": 0.1,
"round_trip": {
"warnings": [
"Round-trip translation did not preserve source phrase 'entropia'.",
"Round-trip translation did not preserve source phrase 'capacidad del canal'.",
"Round-trip translation did not preserve source phrase 'entropia termodinamica'.",
"Round-trip translation did not preserve source phrase 'no es identica'."
],
"summary": {
"source_phrase_count": 4,
"round_trip_warning_count": 4,
"drifted_phrases": [
"entropia",
"capacidad del canal",
"entropia termodinamica",
"no es identica"
]
}
},
"notes": [
"Did not acknowledge learner strengths.",
"Did not provide a concrete next step.",
"Response does not appear to be in Spanish.",
"Missing required multilingual term 'shannon-entropy' for language 'es'.",
"Missing required multilingual term 'channel-capacity' for language 'es'.",
"Missing required multilingual term 'thermodynamic-entropy' for language 'es'.",
"Missing required multilingual caveat 'shannon-vs-thermo-not-identical' for language 'es'.",
"Round-trip translation did not preserve source phrase 'entropia'.",
"Round-trip translation did not preserve source phrase 'capacidad del canal'.",
"Round-trip translation did not preserve source phrase 'entropia termodinamica'.",
"Round-trip translation did not preserve source phrase 'no es identica'."
]
}
],
"summary": {
"overall_adequacy_score": 0.547,
"overall_adequacy_rating": "inadequate",
"recommended_use": "Not recommended for learner-facing local deployment."
}
}