Added multilingual QA review tooling.

This commit is contained in:
welsberr 2026-03-17 21:48:17 -04:00
parent 58466bbf9f
commit 9549961d10
12 changed files with 359 additions and 57 deletions

View File

@ -84,6 +84,12 @@ Didactopus can also generate a starter multilingual QA draft from a pack:
python -m didactopus.multilingual_qa_seed domain-packs/mit-ocw-information-entropy
```
Didactopus can also promote selected seed entries into a curated multilingual QA spec:
```bash
python -m didactopus.multilingual_qa_review --seed domain-packs/mit-ocw-information-entropy/multilingual_qa.seed.yaml --out domain-packs/mit-ocw-information-entropy/multilingual_qa.yaml --language es --required-term-id shannon-entropy
```
## Start Here If You Just Want To Learn
If your main question is "how quickly can this help me learn something?", start here:

View File

@ -20,10 +20,14 @@ targets:
es:
required_terms:
- id: shannon-entropy
round_trip_required: true
round_trip_source: "Shannon entropy"
accepted:
- "entropía de shannon"
required_caveats:
- id: shannon-vs-thermo-not-identical
round_trip_required: true
round_trip_source: "Shannon entropy is not identical to thermodynamic entropy"
accepted:
- "no es idéntica"
forbidden_confusions:
@ -32,6 +36,8 @@ targets:
- "es idéntica a la entropía termodinámica"
```
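Use `round_trip_source` for the reviewer-approved source-language phrase that should remain recoverable after back-translation. When an entry sets `round_trip_required` without a `round_trip_source`, the check falls back to the entry's first `accepted` phrase, which is written in the target language and is therefore a poor thing to look for in back-translated text.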
## Starter Generation
Didactopus can now generate a draft starter spec for reviewer refinement:
@ -48,6 +54,29 @@ The generated `multilingual_qa.seed.yaml` is not meant for immediate trust. It i
- likely caveat candidates from grounded source fragments
- likely forbidden confusions derived from negated caveat language
## Promotion Tooling
Didactopus can now promote selected seed entries into a curated spec:
```bash
python -m didactopus.multilingual_qa_review \
--seed domain-packs/mit-ocw-information-entropy/multilingual_qa.seed.yaml \
--out domain-packs/mit-ocw-information-entropy/multilingual_qa.yaml \
--language es \
--required-term-id shannon-entropy \
--required-term-id channel-capacity \
--required-caveat-id shannon-vs-thermo-not-identical \
--forbidden-confusion-id shannon-equals-thermodynamic-entropy \
--canonical-round-trip-id shannon-entropy \
--canonical-round-trip-id shannon-vs-thermo-not-identical
```
This is meant to reduce manual editing by letting a reviewer:
- choose which seed entries to keep
- mark which entries should drive canonical round-trip checks
- merge selected entries into the curated `multilingual_qa.yaml`
## What It Checks
For a target language, the QA layer can check:

View File

@ -3,21 +3,29 @@ targets:
es:
required_terms:
- id: shannon-entropy
round_trip_required: true
round_trip_source: "Shannon entropy"
accepted:
- "entropia"
- "entropía"
- "entropia de shannon"
- "entropía de shannon"
- id: channel-capacity
round_trip_required: true
round_trip_source: "channel capacity"
accepted:
- "capacidad del canal"
- "capacidad de canal"
- id: thermodynamic-entropy
round_trip_required: true
round_trip_source: "thermodynamic entropy"
accepted:
- "entropia termodinamica"
- "entropía termodinámica"
required_caveats:
- id: shannon-vs-thermo-not-identical
round_trip_required: true
round_trip_source: "Shannon entropy is not identical to thermodynamic entropy"
accepted:
- "no es identica"
- "no es idéntica"
@ -34,18 +42,26 @@ targets:
fr:
required_terms:
- id: shannon-entropy
round_trip_required: true
round_trip_source: "Shannon entropy"
accepted:
- "entropie"
- "entropie de shannon"
- id: channel-capacity
round_trip_required: true
round_trip_source: "channel capacity"
accepted:
- "capacite du canal"
- "capacité du canal"
- id: thermodynamic-entropy
round_trip_required: true
round_trip_source: "thermodynamic entropy"
accepted:
- "entropie thermodynamique"
required_caveats:
- id: shannon-vs-thermo-not-identical
round_trip_required: true
round_trip_source: "Shannon entropy is not identical to thermodynamic entropy"
accepted:
- "n'est pas identique"
- "ne sont pas identiques"

View File

@ -23,7 +23,7 @@
"role": "mentor",
"provider": "stub",
"model_name": "local-demo",
"latency_ms": 0.025,
"latency_ms": 0.022,
"response_preview": "[stubbed-response] [mentor] Concept: Independent Reasoning and Careful Comparison Prerequisites: Course Notes and Reference Texts Supporting lessons",
"adequacy_score": 0.52,
"adequacy_rating": "inadequate",
@ -31,19 +31,19 @@
"multilingual_score": 0.0,
"round_trip": {
"warnings": [
"Round-trip translation did not preserve source phrase 'entropia'.",
"Round-trip translation did not preserve source phrase 'capacidad del canal'.",
"Round-trip translation did not preserve source phrase 'entropia termodinamica'.",
"Round-trip translation did not preserve source phrase 'no es identica'."
"Round-trip translation did not preserve source phrase 'Shannon entropy'.",
"Round-trip translation did not preserve source phrase 'channel capacity'.",
"Round-trip translation did not preserve source phrase 'thermodynamic entropy'.",
"Round-trip translation did not preserve source phrase 'Shannon entropy is not identical to thermodynamic entropy'."
],
"summary": {
"source_phrase_count": 4,
"round_trip_warning_count": 4,
"drifted_phrases": [
"entropia",
"capacidad del canal",
"entropia termodinamica",
"no es identica"
"Shannon entropy",
"channel capacity",
"thermodynamic entropy",
"Shannon entropy is not identical to thermodynamic entropy"
]
}
},
@ -55,17 +55,17 @@
"Missing required multilingual term 'thermodynamic-entropy' for language 'es'.",
"Missing required multilingual caveat 'shannon-vs-thermo-not-identical' for language 'es'.",
"Did not visibly preserve a key grounded concept term in multilingual output.",
"Round-trip translation did not preserve source phrase 'entropia'.",
"Round-trip translation did not preserve source phrase 'capacidad del canal'.",
"Round-trip translation did not preserve source phrase 'entropia termodinamica'.",
"Round-trip translation did not preserve source phrase 'no es identica'."
"Round-trip translation did not preserve source phrase 'Shannon entropy'.",
"Round-trip translation did not preserve source phrase 'channel capacity'.",
"Round-trip translation did not preserve source phrase 'thermodynamic entropy'.",
"Round-trip translation did not preserve source phrase 'Shannon entropy is not identical to thermodynamic entropy'."
]
},
{
"role": "practice",
"provider": "stub",
"model_name": "local-demo",
"latency_ms": 0.004,
"latency_ms": 0.007,
"response_preview": "[stubbed-response] [practice] Concept: Independent Reasoning and Careful Comparison Prerequisites: Course Notes and Reference Texts Supporting lessons",
"adequacy_score": 0.82,
"adequacy_rating": "adequate",
@ -73,19 +73,19 @@
"multilingual_score": 0.1,
"round_trip": {
"warnings": [
"Round-trip translation did not preserve source phrase 'entropia'.",
"Round-trip translation did not preserve source phrase 'capacidad del canal'.",
"Round-trip translation did not preserve source phrase 'entropia termodinamica'.",
"Round-trip translation did not preserve source phrase 'no es identica'."
"Round-trip translation did not preserve source phrase 'Shannon entropy'.",
"Round-trip translation did not preserve source phrase 'channel capacity'.",
"Round-trip translation did not preserve source phrase 'thermodynamic entropy'.",
"Round-trip translation did not preserve source phrase 'Shannon entropy is not identical to thermodynamic entropy'."
],
"summary": {
"source_phrase_count": 4,
"round_trip_warning_count": 4,
"drifted_phrases": [
"entropia",
"capacidad del canal",
"entropia termodinamica",
"no es identica"
"Shannon entropy",
"channel capacity",
"thermodynamic entropy",
"Shannon entropy is not identical to thermodynamic entropy"
]
}
},
@ -95,17 +95,17 @@
"Missing required multilingual term 'channel-capacity' for language 'es'.",
"Missing required multilingual term 'thermodynamic-entropy' for language 'es'.",
"Missing required multilingual caveat 'shannon-vs-thermo-not-identical' for language 'es'.",
"Round-trip translation did not preserve source phrase 'entropia'.",
"Round-trip translation did not preserve source phrase 'capacidad del canal'.",
"Round-trip translation did not preserve source phrase 'entropia termodinamica'.",
"Round-trip translation did not preserve source phrase 'no es identica'."
"Round-trip translation did not preserve source phrase 'Shannon entropy'.",
"Round-trip translation did not preserve source phrase 'channel capacity'.",
"Round-trip translation did not preserve source phrase 'thermodynamic entropy'.",
"Round-trip translation did not preserve source phrase 'Shannon entropy is not identical to thermodynamic entropy'."
]
},
{
"role": "evaluator",
"provider": "stub",
"model_name": "local-demo",
"latency_ms": 0.004,
"latency_ms": 0.005,
"response_preview": "[stubbed-response] [evaluator] Concept: Independent Reasoning and Careful Comparison Prerequisites: Course Notes and Reference Texts Supporting lessons",
"adequacy_score": 0.3,
"adequacy_rating": "inadequate",
@ -113,19 +113,19 @@
"multilingual_score": 0.1,
"round_trip": {
"warnings": [
"Round-trip translation did not preserve source phrase 'entropia'.",
"Round-trip translation did not preserve source phrase 'capacidad del canal'.",
"Round-trip translation did not preserve source phrase 'entropia termodinamica'.",
"Round-trip translation did not preserve source phrase 'no es identica'."
"Round-trip translation did not preserve source phrase 'Shannon entropy'.",
"Round-trip translation did not preserve source phrase 'channel capacity'.",
"Round-trip translation did not preserve source phrase 'thermodynamic entropy'.",
"Round-trip translation did not preserve source phrase 'Shannon entropy is not identical to thermodynamic entropy'."
],
"summary": {
"source_phrase_count": 4,
"round_trip_warning_count": 4,
"drifted_phrases": [
"entropia",
"capacidad del canal",
"entropia termodinamica",
"no es identica"
"Shannon entropy",
"channel capacity",
"thermodynamic entropy",
"Shannon entropy is not identical to thermodynamic entropy"
]
}
},
@ -137,10 +137,10 @@
"Missing required multilingual term 'channel-capacity' for language 'es'.",
"Missing required multilingual term 'thermodynamic-entropy' for language 'es'.",
"Missing required multilingual caveat 'shannon-vs-thermo-not-identical' for language 'es'.",
"Round-trip translation did not preserve source phrase 'entropia'.",
"Round-trip translation did not preserve source phrase 'capacidad del canal'.",
"Round-trip translation did not preserve source phrase 'entropia termodinamica'.",
"Round-trip translation did not preserve source phrase 'no es identica'."
"Round-trip translation did not preserve source phrase 'Shannon entropy'.",
"Round-trip translation did not preserve source phrase 'channel capacity'.",
"Round-trip translation did not preserve source phrase 'thermodynamic entropy'.",
"Round-trip translation did not preserve source phrase 'Shannon entropy is not identical to thermodynamic entropy'."
]
}
],

View File

@ -8,9 +8,9 @@
- Recommended use: Not recommended for learner-facing local deployment.
## Role Results
- `mentor` via `local-demo`: inadequate (0.52), latency 0.025 ms
Notes: Did not ask a focused learner question.; Response does not appear to be in Spanish.; Missing required multilingual term 'shannon-entropy' for language 'es'.; Missing required multilingual term 'channel-capacity' for language 'es'.; Missing required multilingual term 'thermodynamic-entropy' for language 'es'.; Missing required multilingual caveat 'shannon-vs-thermo-not-identical' for language 'es'.; Did not visibly preserve a key grounded concept term in multilingual output.; Round-trip translation did not preserve source phrase 'entropia'.; Round-trip translation did not preserve source phrase 'capacidad del canal'.; Round-trip translation did not preserve source phrase 'entropia termodinamica'.; Round-trip translation did not preserve source phrase 'no es identica'.
- `practice` via `local-demo`: adequate (0.82), latency 0.004 ms
Notes: Response does not appear to be in Spanish.; Missing required multilingual term 'shannon-entropy' for language 'es'.; Missing required multilingual term 'channel-capacity' for language 'es'.; Missing required multilingual term 'thermodynamic-entropy' for language 'es'.; Missing required multilingual caveat 'shannon-vs-thermo-not-identical' for language 'es'.; Round-trip translation did not preserve source phrase 'entropia'.; Round-trip translation did not preserve source phrase 'capacidad del canal'.; Round-trip translation did not preserve source phrase 'entropia termodinamica'.; Round-trip translation did not preserve source phrase 'no es identica'.
- `evaluator` via `local-demo`: inadequate (0.3), latency 0.004 ms
Notes: Did not acknowledge learner strengths.; Did not provide a concrete next step.; Response does not appear to be in Spanish.; Missing required multilingual term 'shannon-entropy' for language 'es'.; Missing required multilingual term 'channel-capacity' for language 'es'.; Missing required multilingual term 'thermodynamic-entropy' for language 'es'.; Missing required multilingual caveat 'shannon-vs-thermo-not-identical' for language 'es'.; Round-trip translation did not preserve source phrase 'entropia'.; Round-trip translation did not preserve source phrase 'capacidad del canal'.; Round-trip translation did not preserve source phrase 'entropia termodinamica'.; Round-trip translation did not preserve source phrase 'no es identica'.
- `mentor` via `local-demo`: inadequate (0.52), latency 0.022 ms
Notes: Did not ask a focused learner question.; Response does not appear to be in Spanish.; Missing required multilingual term 'shannon-entropy' for language 'es'.; Missing required multilingual term 'channel-capacity' for language 'es'.; Missing required multilingual term 'thermodynamic-entropy' for language 'es'.; Missing required multilingual caveat 'shannon-vs-thermo-not-identical' for language 'es'.; Did not visibly preserve a key grounded concept term in multilingual output.; Round-trip translation did not preserve source phrase 'Shannon entropy'.; Round-trip translation did not preserve source phrase 'channel capacity'.; Round-trip translation did not preserve source phrase 'thermodynamic entropy'.; Round-trip translation did not preserve source phrase 'Shannon entropy is not identical to thermodynamic entropy'.
- `practice` via `local-demo`: adequate (0.82), latency 0.007 ms
Notes: Response does not appear to be in Spanish.; Missing required multilingual term 'shannon-entropy' for language 'es'.; Missing required multilingual term 'channel-capacity' for language 'es'.; Missing required multilingual term 'thermodynamic-entropy' for language 'es'.; Missing required multilingual caveat 'shannon-vs-thermo-not-identical' for language 'es'.; Round-trip translation did not preserve source phrase 'Shannon entropy'.; Round-trip translation did not preserve source phrase 'channel capacity'.; Round-trip translation did not preserve source phrase 'thermodynamic entropy'.; Round-trip translation did not preserve source phrase 'Shannon entropy is not identical to thermodynamic entropy'.
- `evaluator` via `local-demo`: inadequate (0.3), latency 0.005 ms
Notes: Did not acknowledge learner strengths.; Did not provide a concrete next step.; Response does not appear to be in Spanish.; Missing required multilingual term 'shannon-entropy' for language 'es'.; Missing required multilingual term 'channel-capacity' for language 'es'.; Missing required multilingual term 'thermodynamic-entropy' for language 'es'.; Missing required multilingual caveat 'shannon-vs-thermo-not-identical' for language 'es'.; Round-trip translation did not preserve source phrase 'Shannon entropy'.; Round-trip translation did not preserve source phrase 'channel capacity'.; Round-trip translation did not preserve source phrase 'thermodynamic entropy'.; Round-trip translation did not preserve source phrase 'Shannon entropy is not identical to thermodynamic entropy'.

View File

@ -0,0 +1,25 @@
source_language: en
review_status: curated
promoted_from_seed: domain-packs/mit-ocw-information-entropy/multilingual_qa.seed.yaml
targets:
es:
required_terms:
- id: shannon-entropy
accepted:
- Shannon Entropy
round_trip_required: true
round_trip_source: Shannon Entropy
required_caveats:
- id: thermodynamics-and-entropy
accepted:
- Objective Explain how thermodynamic entropy relates to, and differs from,
Shannon entropy. Exercise Compare the two entropy notions and identify what
is preserved across the analogy. The course uses entropy as a bridge concept
between communication theory and physics while insisting on careful interpretation.
forbidden_confusions:
- id: thermodynamics-and-entropy-confusion
patterns:
- Objective Explain how thermodynamic entropy relates to, and is identical to,
Shannon entropy. Exercise Compare the two entropy notions and identify what
is preserved across the analogy. The course uses entropy as a bridge concept
between communication theory and physics while insisting on careful interpretation.

View File

@ -3,21 +3,29 @@ targets:
es:
required_terms:
- id: shannon-entropy
round_trip_required: true
round_trip_source: "Shannon entropy"
accepted:
- "entropia"
- "entropía"
- "entropia de shannon"
- "entropía de shannon"
- id: channel-capacity
round_trip_required: true
round_trip_source: "channel capacity"
accepted:
- "capacidad del canal"
- "capacidad de canal"
- id: thermodynamic-entropy
round_trip_required: true
round_trip_source: "thermodynamic entropy"
accepted:
- "entropia termodinamica"
- "entropía termodinámica"
required_caveats:
- id: shannon-vs-thermo-not-identical
round_trip_required: true
round_trip_source: "Shannon entropy is not identical to thermodynamic entropy"
accepted:
- "no es identica"
- "no es idéntica"
@ -34,18 +42,26 @@ targets:
fr:
required_terms:
- id: shannon-entropy
round_trip_required: true
round_trip_source: "Shannon entropy"
accepted:
- "entropie"
- "entropie de shannon"
- id: channel-capacity
round_trip_required: true
round_trip_source: "channel capacity"
accepted:
- "capacite du canal"
- "capacité du canal"
- id: thermodynamic-entropy
round_trip_required: true
round_trip_source: "thermodynamic entropy"
accepted:
- "entropie thermodynamique"
required_caveats:
- id: shannon-vs-thermo-not-identical
round_trip_required: true
round_trip_source: "Shannon entropy is not identical to thermodynamic entropy"
accepted:
- "n'est pas identique"
- "ne sont pas identiques"

View File

@ -8,7 +8,7 @@ from .config import load_config
from .language_support import language_alignment_score, response_language_instruction
from .learner_session import _grounding_block
from .model_provider import ModelProvider
from .multilingual_qa import multilingual_qa_for_text, round_trip_warning_for_phrases
from .multilingual_qa import multilingual_qa_for_text, round_trip_source_phrases, round_trip_warning_for_phrases
from .ocw_skill_agent_demo import build_skill_grounded_study_plan, evaluate_submission_with_skill, load_ocw_skill_context
from .role_prompts import system_prompt_for_role
@ -106,17 +106,7 @@ def _multilingual_score(role: str, text: str, language: str, qa_spec: dict | Non
# Collect the phrases used for the round-trip (back-translation) drift check.
# Returns an empty list when no QA spec is supplied or the language is "en";
# every return path caps the result at six phrases.
# NOTE(review): this hunk appears to show both the removed inline collection
# (first `accepted` phrase of each required term/caveat) and the replacement
# call to round_trip_source_phrases, with the diff +/- markers lost. Confirm
# which lines are live before relying on this listing.
def _round_trip_phrases(qa_spec: dict | None, language: str) -> list[str]:
if not qa_spec or language == "en":
return []
target = (qa_spec.get("targets", {}) or {}).get(language, {}) or {}
phrases: list[str] = []
for entry in target.get("required_terms", []) or []:
accepted = entry.get("accepted", []) or []
if accepted:
phrases.append(str(accepted[0]))
for entry in target.get("required_caveats", []) or []:
accepted = entry.get("accepted", []) or []
if accepted:
phrases.append(str(accepted[0]))
return phrases[:6]
return round_trip_source_phrases(qa_spec, language=language)[:6]
def _hardware_profile(

View File

@ -76,6 +76,31 @@ def multilingual_qa_for_pack(source_dir: str | Path, *, language: str, text: str
return multilingual_qa_for_text(spec, language=language, text=text)
def round_trip_source_phrases(spec: dict, *, language: str) -> list[str]:
    """Return the phrases a round-trip check should look for in *language*.

    Entries are read from the ``required_terms`` and ``required_caveats``
    sections of ``spec["targets"][language]``, in that order.

    For each entry, a truthy ``round_trip_source`` is used as-is; otherwise,
    if ``round_trip_required`` is truthy, the first ``accepted`` phrase is
    used.  When that pass yields nothing, every entry contributes its first
    ``accepted`` phrase so specs without round-trip fields keep working.

    Args:
        spec: Parsed multilingual QA spec.
        language: Key of the target-language section to read.

    Returns:
        The selected phrases as strings; empty when nothing applies.
    """
    language_block = (spec.get("targets", {}) or {}).get(language, {}) or {}

    entries: list[dict] = []
    for section_name in ("required_terms", "required_caveats"):
        entries.extend(language_block.get(section_name, []) or [])

    def first_accepted(item: dict) -> list[str]:
        # Zero-or-one element list so callers can simply extend().
        options = item.get("accepted", []) or []
        return [str(options[0])] if options else []

    explicit: list[str] = []
    for item in entries:
        source = item.get("round_trip_source")
        if source:
            explicit.append(str(source))
        elif item.get("round_trip_required"):
            explicit.extend(first_accepted(item))
    if explicit:
        return explicit

    # Backward-compatible fallback for older specs with no explicit round-trip fields.
    legacy: list[str] = []
    for item in entries:
        legacy.extend(first_accepted(item))
    return legacy
def round_trip_warning_for_phrases(
source_phrases: list[str],
back_translated_text: str,

View File

@ -0,0 +1,98 @@
from __future__ import annotations
from pathlib import Path
import yaml
def _load_yaml(path: str | Path) -> dict:
    """Load a YAML mapping from *path*.

    A missing file and an empty (or otherwise falsy) document both yield
    ``{}``, so a not-yet-created curated spec can be treated as empty.

    Args:
        path: Location of the YAML file.

    Returns:
        The parsed top-level mapping, or ``{}``.

    Raises:
        ValueError: If the document's top level is non-empty but not a
            mapping.  Callers use ``.get`` on the result, so failing here
            names the offending file instead of surfacing a later
            ``AttributeError``.
    """
    p = Path(path)
    if not p.exists():
        return {}
    data = yaml.safe_load(p.read_text(encoding="utf-8"))
    if not data:
        return {}
    if not isinstance(data, dict):
        raise ValueError(
            f"Expected a YAML mapping at the top level of {p}, got {type(data).__name__}."
        )
    return data
def _index_entries(entries: list[dict]) -> dict[str, dict]:
return {str(entry.get("id", "")): dict(entry) for entry in entries if entry.get("id")}
def _select_entries(seed_entries: list[dict], selected_ids: list[str], canonical_ids: set[str]) -> list[dict]:
    """Pick the seed entries named in *selected_ids*, in the order requested.

    Each returned entry is a copy, so the seed entries are not modified.
    Entries whose id is in *canonical_ids* get ``round_trip_required: True``
    and, when they have no usable ``round_trip_source``, their first
    ``accepted`` phrase is recorded as the source.

    Args:
        seed_entries: Entries of one section of the seed spec.
        selected_ids: Ids to promote; repeated ids are promoted once.
        canonical_ids: Ids that should drive round-trip checks.

    Returns:
        Copies of the selected entries.  Ids not present in *seed_entries*
        are skipped with a ``UserWarning`` rather than silently ignored.
    """
    import warnings

    index = _index_entries(seed_entries)
    selected: list[dict] = []
    seen: set[str] = set()
    for entry_id in selected_ids:
        # Passing the same id twice must not duplicate the entry.
        if entry_id in seen:
            continue
        seen.add(entry_id)
        if entry_id not in index:
            # Most likely a typo; surface it so the reviewer notices the
            # entry was not promoted.
            warnings.warn(
                f"No seed entry with id {entry_id!r}; nothing promoted for it.",
                stacklevel=2,
            )
            continue
        entry = dict(index[entry_id])
        if entry_id in canonical_ids:
            entry["round_trip_required"] = True
            accepted = entry.get("accepted", []) or []
            # An empty or null round_trip_source is treated like a missing one.
            if accepted and not entry.get("round_trip_source"):
                entry["round_trip_source"] = str(accepted[0])
        selected.append(entry)
    return selected
def _merge_entries_by_id(existing: list[dict], promoted: list[dict]) -> list[dict]:
    """Overlay *promoted* onto *existing*: same id replaces in place, new ids append."""
    merged = [dict(entry) for entry in existing]
    position = {
        str(entry.get("id")): offset
        for offset, entry in enumerate(merged)
        if entry.get("id")
    }
    for entry in promoted:
        key = str(entry.get("id"))
        if key in position:
            merged[position[key]] = entry
        else:
            position[key] = len(merged)
            merged.append(entry)
    return merged


def promote_multilingual_qa_entries(
    *,
    seed_path: str | Path,
    out_path: str | Path,
    language: str,
    required_term_ids: list[str] | None = None,
    required_caveat_ids: list[str] | None = None,
    forbidden_confusion_ids: list[str] | None = None,
    canonical_round_trip_ids: list[str] | None = None,
) -> dict:
    """Promote selected seed entries into the curated spec at *out_path*.

    The seed and any existing curated file are loaded, the selected entries
    for *language* are merged into the curated entries by id, and the result
    is written back to *out_path* as YAML.

    Existing curated content is kept: other languages, entries that were not
    re-promoted, and keys this function does not manage all survive.  A
    promoted entry replaces a curated entry with the same id.

    Args:
        seed_path: Seed spec to promote from.
        out_path: Curated spec to create or update.
        language: Target-language key under ``targets``.
        required_term_ids: Ids to promote from ``required_terms``.
        required_caveat_ids: Ids to promote from ``required_caveats``.
        forbidden_confusion_ids: Ids to promote from ``forbidden_confusions``.
        canonical_round_trip_ids: Ids to mark as round-trip required.

    Returns:
        The payload that was written.
    """
    seed = _load_yaml(seed_path)
    curated = _load_yaml(out_path)
    target_seed = ((seed.get("targets", {}) or {}).get(language, {})) or {}
    target_curated = ((curated.get("targets", {}) or {}).get(language, {})) or {}
    canonical_ids = set(canonical_round_trip_ids or [])

    selections = {
        "required_terms": required_term_ids,
        "required_caveats": required_caveat_ids,
        "forbidden_confusions": forbidden_confusion_ids,
    }
    # Start from the curated target so keys outside the three sections survive.
    merged_target = dict(target_curated)
    for section, selected_ids in selections.items():
        promoted = _select_entries(
            target_seed.get(section, []) or [],
            selected_ids or [],
            canonical_ids,
        )
        merged_target[section] = _merge_entries_by_id(
            target_curated.get(section, []) or [],
            promoted,
        )

    curated_targets = dict(curated.get("targets", {}) or {})
    curated_targets[language] = merged_target

    payload = {
        # Keep any extra top-level keys a reviewer added to the curated file.
        **curated,
        "source_language": seed.get("source_language", curated.get("source_language", "en")),
        "review_status": "curated",
        "promoted_from_seed": str(seed_path),
        "targets": curated_targets,
    }

    out = Path(out_path)
    out.parent.mkdir(parents=True, exist_ok=True)
    # allow_unicode=True keeps accented target-language phrases readable
    # instead of writing them as escape sequences.
    out.write_text(yaml.safe_dump(payload, sort_keys=False, allow_unicode=True), encoding="utf-8")
    return payload
def main(argv: list[str] | None = None) -> None:
    """Command-line entry point for promoting seed entries into a curated spec.

    Parses the options, calls ``promote_multilingual_qa_entries`` and prints
    a short YAML summary naming the written file and the language.

    Args:
        argv: Arguments to parse.  ``None`` (the default) lets argparse read
            the process command line, so existing ``main()`` callers behave
            as before.
    """
    import argparse

    parser = argparse.ArgumentParser(description="Promote selected multilingual QA seed entries into a curated spec.")
    parser.add_argument("--seed", required=True)
    parser.add_argument("--out", required=True)
    parser.add_argument("--language", required=True)
    # Each id option may be given several times; values accumulate in a list.
    parser.add_argument("--required-term-id", action="append", default=[])
    parser.add_argument("--required-caveat-id", action="append", default=[])
    parser.add_argument("--forbidden-confusion-id", action="append", default=[])
    parser.add_argument("--canonical-round-trip-id", action="append", default=[])
    args = parser.parse_args(argv)

    promote_multilingual_qa_entries(
        seed_path=args.seed,
        out_path=args.out,
        language=args.language,
        required_term_ids=args.required_term_id,
        required_caveat_ids=args.required_caveat_id,
        forbidden_confusion_ids=args.forbidden_confusion_id,
        canonical_round_trip_ids=args.canonical_round_trip_id,
    )
    print(yaml.safe_dump({"written": args.out, "language": args.language}, sort_keys=False))


if __name__ == "__main__":
    main()

View File

@ -4,6 +4,7 @@ from didactopus.multilingual_qa import (
load_multilingual_qa_spec,
multilingual_qa_for_pack,
multilingual_qa_for_text,
round_trip_source_phrases,
round_trip_warning_for_phrases,
)
@ -50,3 +51,11 @@ def test_round_trip_warning_for_phrases_flags_drift() -> None:
)
assert result["summary"]["round_trip_warning_count"] == 1
assert result["summary"]["drifted_phrases"] == ["channel capacity"]
def test_round_trip_source_phrases_use_canonical_source_text() -> None:
    """The Spanish round-trip phrases are the source-language phrases from the pack spec."""
    pack_spec = load_multilingual_qa_spec("domain-packs/mit-ocw-information-entropy")
    spanish_phrases = round_trip_source_phrases(pack_spec, language="es")
    for expected in (
        "Shannon entropy",
        "channel capacity",
        "Shannon entropy is not identical to thermodynamic entropy",
    ):
        assert expected in spanish_phrases

View File

@ -0,0 +1,88 @@
from pathlib import Path
import yaml
from didactopus.multilingual_qa_review import promote_multilingual_qa_entries
def test_promote_multilingual_qa_entries_writes_curated_spec(tmp_path: Path) -> None:
    """Promoting selected ids writes a curated spec with canonical round-trip fields."""
    seed = tmp_path / "multilingual_qa.seed.yaml"
    seed.write_text(
        yaml.safe_dump(
            {
                "source_language": "en",
                "targets": {
                    "es": {
                        "required_terms": [
                            {"id": "shannon-entropy", "accepted": ["Shannon entropy"]},
                            {"id": "channel-capacity", "accepted": ["channel capacity"]},
                        ],
                        "required_caveats": [
                            {"id": "not-identical", "accepted": ["Shannon entropy is not identical to thermodynamic entropy"]},
                        ],
                        "forbidden_confusions": [
                            {"id": "identical-confusion", "patterns": ["Shannon entropy is identical to thermodynamic entropy"]},
                        ],
                    }
                },
            },
            sort_keys=False,
        ),
        encoding="utf-8",
    )
    out = tmp_path / "multilingual_qa.yaml"

    payload = promote_multilingual_qa_entries(
        seed_path=seed,
        out_path=out,
        language="es",
        required_term_ids=["shannon-entropy"],
        required_caveat_ids=["not-identical"],
        forbidden_confusion_ids=["identical-confusion"],
        canonical_round_trip_ids=["shannon-entropy", "not-identical"],
    )

    assert out.exists()
    # The file on disk must hold the same data the function returned.
    assert yaml.safe_load(out.read_text(encoding="utf-8")) == payload
    assert payload["review_status"] == "curated"
    target = payload["targets"]["es"]
    # channel-capacity was in the seed but not selected, so it must be absent.
    assert len(target["required_terms"]) == 1
    assert target["required_terms"][0]["id"] == "shannon-entropy"
    assert target["required_terms"][0]["round_trip_required"] is True
    assert target["required_terms"][0]["round_trip_source"] == "Shannon entropy"
    assert target["required_caveats"][0]["round_trip_required"] is True
    assert (
        target["required_caveats"][0]["round_trip_source"]
        == "Shannon entropy is not identical to thermodynamic entropy"
    )
    assert [entry["id"] for entry in target["forbidden_confusions"]] == ["identical-confusion"]
    # The confusion id was not marked canonical, so it gains no round-trip flag.
    assert "round_trip_required" not in target["forbidden_confusions"][0]
def test_promote_multilingual_qa_entries_preserves_other_languages(tmp_path: Path) -> None:
    """Promoting into one language leaves another language's curated entries untouched."""
    seed = tmp_path / "multilingual_qa.seed.yaml"
    seed.write_text(
        yaml.safe_dump(
            {
                "source_language": "en",
                "targets": {
                    "es": {"required_terms": [{"id": "shannon-entropy", "accepted": ["Shannon entropy"]}]},
                    "fr": {"required_terms": [{"id": "entropie", "accepted": ["Shannon entropy"]}]},
                },
            },
            sort_keys=False,
        ),
        encoding="utf-8",
    )
    out = tmp_path / "multilingual_qa.yaml"
    out.write_text(
        yaml.safe_dump(
            {
                "source_language": "en",
                "targets": {
                    "fr": {"required_terms": [{"id": "entropie", "accepted": ["entropie de Shannon"]}]}
                },
            },
            sort_keys=False,
        ),
        encoding="utf-8",
    )

    payload = promote_multilingual_qa_entries(
        seed_path=seed,
        out_path=out,
        language="es",
        required_term_ids=["shannon-entropy"],
    )

    assert "fr" in payload["targets"]
    french_term = payload["targets"]["fr"]["required_terms"][0]
    assert french_term["id"] == "entropie"
    # The seed and curated fr entries share an id, so only the accepted
    # phrase shows that the curated version survived.
    assert french_term["accepted"] == ["entropie de Shannon"]
    assert payload["targets"]["es"]["required_terms"][0]["id"] == "shannon-entropy"