SciSiteForge/scripts/translate_site.py

112 lines
4.6 KiB
Python

#!/usr/bin/env python3
"""Optional offline multilingual translation for SciSiteForge sites."""
from __future__ import annotations
import argparse
import json
import sys
from pathlib import Path
from time import sleep
sys.path.insert(0, str(Path(__file__).resolve().parents[1]))
from scisiteforge.config import load_config
from scisiteforge.translations import GenieHiveTranslator, TranslationConfig
LANGUAGES = {
"es": "Spanish",
"fr": "French",
"pt": "Portuguese",
"de": "German",
"it": "Italian",
"ru": "Russian",
"zh": "Chinese",
"ja": "Japanese",
"ar": "Arabic",
"hi": "Hindi",
}
def _load_bs4():
try:
from bs4 import BeautifulSoup, NavigableString # type: ignore
except Exception as exc: # pragma: no cover - import-time fallback
raise RuntimeError("BeautifulSoup4 is required for HTML translation.") from exc
return BeautifulSoup, NavigableString
def extract_translatable_text(soup):
_, NavigableString = _load_bs4()
for elem in soup.descendants:
if isinstance(elem, NavigableString) and elem.parent.name not in ["script", "style"]:
if elem.strip():
yield elem
def translate_html_file(src_path: Path, dest_path: Path, target_lang_code: str, translator: GenieHiveTranslator, glossary: dict[str, str] | None = None) -> None:
BeautifulSoup, _ = _load_bs4()
print(f"Translating {src_path} -> {dest_path}")
html = src_path.read_text(encoding="utf-8")
soup = BeautifulSoup(html, "html.parser")
for node in extract_translatable_text(soup):
translated = translator.translate(str(node), LANGUAGES[target_lang_code], glossary=glossary)
node.replace_with(translated)
sleep(0.05)
dest_path.parent.mkdir(parents=True, exist_ok=True)
dest_path.write_text(str(soup), encoding="utf-8")
def build_translator(config_path: str | Path | None, args: argparse.Namespace) -> GenieHiveTranslator:
site_config = {}
if config_path:
site_config = load_config(config_path)
translation_cfg = site_config.get("translation", {})
provider = getattr(args, "provider", None) or translation_cfg.get("provider") or "geniehive"
cfg = TranslationConfig(
provider=provider,
base_url=args.base_url or translation_cfg.get("base_url") or "http://127.0.0.1:8800",
model=args.model or translation_cfg.get("model") or "general_assistant",
api_key=args.api_key or translation_cfg.get("api_key") or "",
timeout=args.timeout or int(translation_cfg.get("timeout") or 120),
system_prompt=translation_cfg.get("system_prompt")
or "You are a careful scientific translator. Preserve meaning, structure, and technical terms. Return only the translation.",
)
return GenieHiveTranslator(cfg)
def main() -> None:
parser = argparse.ArgumentParser(description="Translate a SciSiteForge site with an optional provider backend.")
parser.add_argument("--langs", required=True, help="Comma-separated language codes (e.g. es,fr)")
parser.add_argument("--src", default="content/en", help="Source directory (English)")
parser.add_argument("--dest", default="content", help="Base destination directory")
parser.add_argument("--config", help="Optional site config to pull GenieHive settings from")
parser.add_argument("--provider", help="Translation provider (currently: geniehive)")
parser.add_argument("--base-url", help="Provider base URL (default for GenieHive: http://127.0.0.1:8800)")
parser.add_argument("--model", help="Provider model or role alias")
parser.add_argument("--api-key", help="Provider API key")
parser.add_argument("--timeout", type=int, help="HTTP timeout in seconds")
args = parser.parse_args()
translator = build_translator(args.config, args)
src_base = Path(args.src)
dest_base = Path(args.dest)
glossary_cache: dict[str, dict[str, str]] = {}
for lang_code in args.langs.split(","):
if lang_code not in LANGUAGES:
print(f"Unsupported language: {lang_code}")
continue
print(f"\n=== Translating to {LANGUAGES[lang_code]} ({lang_code}) ===")
glossary_path = Path(__file__).parent / f"glossary_{lang_code}.json"
glossary = glossary_cache.setdefault(lang_code, json.loads(glossary_path.read_text(encoding="utf-8")) if glossary_path.exists() else {})
for html_file in src_base.rglob("*.html"):
rel_path = html_file.relative_to(src_base)
translate_html_file(html_file, dest_base / lang_code / rel_path, lang_code, translator, glossary=glossary)
print("\nTranslation complete.")
if __name__ == "__main__":
main()