# superset2/scripts/translations/backfill_po.py

# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.
"""Backfill missing translations in a .po file using Claude AI.

For each untranslated (empty msgstr) entry in the target language, the script
sends the English source string along with all available translations in other
languages to Claude as context, then writes the AI-generated translation back
into the .po file marked as #, fuzzy for human review.

Usage:
  # Build the translation index first (one-time or when .po files change)
  python scripts/translations/build_translation_index.py

  # Backfill French translations
  python scripts/translations/backfill_po.py --lang fr

  # Dry run (print what would be translated without writing)
  python scripts/translations/backfill_po.py --lang de --dry-run

  # Limit to 100 entries and use a specific model
  python scripts/translations/backfill_po.py --lang es --limit 100 \
    --model claude-opus-4-6

Options:
  --lang LANG        ISO language code to backfill (required)
  --batch-size N     Number of strings per Claude request (default: 50)
  --limit N          Stop after translating N entries (default: unlimited)
  --min-context N    Skip entries with fewer than N existing translations across
                     reference languages (default: 0 — translate everything)
  --model MODEL      Claude model ID (default: claude-sonnet-4-6)
  --index PATH       Path to translation_index.json (default: auto-detect)
  --dry-run          Print translations without writing to .po file
  --no-fuzzy         Do not mark generated translations as fuzzy (default: mark fuzzy)
"""

from __future__ import annotations

import argparse
import json
import re
import shutil
import subprocess
import sys
from pathlib import Path
from typing import Any

try:
    import polib  # type: ignore[import-untyped]
except ImportError:
    print("polib is required. Run: pip install polib", file=sys.stderr)
    sys.exit(1)

# Directory holding the per-language catalogs: "superset/translations" under
# the directory three levels above this script (presumably the repository
# root — confirm if the script is ever relocated).
TRANSLATIONS_DIR = Path(__file__).parent.parent.parent / "superset" / "translations"
# Default location of the cross-language translation index; the module
# docstring names build_translation_index.py as the script that builds it.
DEFAULT_INDEX = TRANSLATIONS_DIR / "translation_index.json"
# Claude model ID used when --model is not given.
DEFAULT_MODEL = "claude-sonnet-4-6"
# Number of strings sent per Claude request when --batch-size is not given.
DEFAULT_BATCH_SIZE = 50

# Language names for the prompt, keyed by ISO code.
# Codes that are not listed here are passed through unchanged by _lang_name,
# so a missing entry degrades the prompt wording rather than raising.
LANGUAGE_NAMES: dict[str, str] = {
    "ar": "Arabic",
    "ca": "Catalan",
    "de": "German",
    "es": "Spanish",
    "fa": "Persian (Farsi)",
    "fr": "French",
    "it": "Italian",
    "ja": "Japanese",
    "ko": "Korean",
    "mi": "Māori",
    "nl": "Dutch",
    "pl": "Polish",
    "pt": "Portuguese",
    "pt_BR": "Brazilian Portuguese",
    "ru": "Russian",
    "sk": "Slovak",
    "sl": "Slovenian",
    "tr": "Turkish",
    "uk": "Ukrainian",
    "zh": "Chinese (Simplified)",
    "zh_TW": "Chinese (Traditional)",
}


def _lang_name(code: str) -> str:
    """Map an ISO language code to the display name used in prompts.

    Codes that have no entry in ``LANGUAGE_NAMES`` are returned unchanged.
    """
    try:
        return LANGUAGE_NAMES[code]
    except KeyError:
        return code


def _plural_key(msgid: str, msgid_plural: str) -> str:
    """Compose the translation-index key of a pluralized entry.

    The key is the singular msgid and the plural msgid separated by a NUL
    character.
    """
    return "{}\x00{}".format(msgid, msgid_plural)


def _is_missing(entry: polib.POEntry) -> bool:
    """Tell whether an entry still needs a translation.

    Obsolete entries are never considered missing. A plural entry is missing
    only when every one of its plural forms is empty; a singular entry is
    missing when its msgstr is empty.
    """
    if entry.obsolete:
        return False
    if not entry.msgid_plural:
        return not entry.msgstr
    # One filled-in plural form is enough to treat the entry as translated.
    return all(not form for form in entry.msgstr_plural.values())


def _context_langs(
    item: dict[str, Any], index: dict[str, Any], target_lang: str
) -> list[str]:
    """List the other languages that already translate this entry.

    Looks up ``item["index_key"]`` in the index and returns, in sorted order,
    every language code other than ``target_lang`` whose stored value is
    non-empty. An entry absent from the index yields an empty list.
    """
    key = item["index_key"]
    try:
        per_language = index[key]
    except KeyError:
        return []
    langs = [
        code
        for code, value in per_language.items()
        if value and code != target_lang
    ]
    langs.sort()
    return langs


def _context_count(
    item: dict[str, Any], index: dict[str, Any], target_lang: str
) -> int:
    """Count the other-language translations available for this entry.

    This is the length of the list produced by ``_context_langs`` for the
    same arguments.
    """
    langs = _context_langs(item, index, target_lang)
    return len(langs)


def _render_item(
    i: int,
    item: dict[str, Any],
    index: dict[str, Any],
    target_lang: str,
    reference_langs_sorted: list[str],
) -> list[str]:
    """Render one batch entry as prompt lines.

    The rendered block consists of a header carrying the entry's batch index
    and its reference-translation count, the English source (plus the plural
    source when present), one line per reference language that holds a
    non-empty translation for this entry, and a trailing blank line.

    Args:
        i: Position of the entry within the batch, shown in the header.
        item: Batch item dict; reads ``msgid``, ``index_key`` and the
            optional ``msgid_plural``.
        index: Translation index mapping an index key to a per-language
            dict. Dict values are rendered as bracketed plural forms, any
            other value as a single JSON string.
        target_lang: Language being backfilled; excluded from the count.
        reference_langs_sorted: Languages to look up, in output order.

    Returns:
        The prompt lines for this entry, ending with an empty string.
    """
    ctx = _context_count(item, index, target_lang)
    if ctx == 0:
        header = (
            f"--- [{i}] (no reference translations — translate conservatively) ---"
        )
    else:
        plural = "s" if ctx != 1 else ""
        header = f"--- [{i}] ({ctx} reference translation{plural}) ---"
    lines: list[str] = [
        header,
        f"English: {json.dumps(item['msgid'], ensure_ascii=False)}",
    ]
    if item.get("msgid_plural"):
        plural_json = json.dumps(item["msgid_plural"], ensure_ascii=False)
        lines.append(f"English plural: {plural_json}")

    key = item["index_key"]
    if key in index and reference_langs_sorted:
        for lang in reference_langs_sorted:
            val = index[key].get(lang)
            # Skip empty values as well as absent ones. The header count only
            # includes truthy values, so rendering an empty string here would
            # contradict it and feed the model a blank "reference".
            if not val:
                continue
            if isinstance(val, dict):
                forms = "; ".join(
                    f"[{k}] {json.dumps(v, ensure_ascii=False)}" for k, v in val.items()
                )
                lines.append(f"{_lang_name(lang)}: {forms}")
            else:
                lines.append(
                    f"{_lang_name(lang)}: {json.dumps(val, ensure_ascii=False)}"
                )
    lines.append("")
    return lines


def build_prompt(
    target_lang: str,
    batch: list[dict[str, Any]],
    index: dict[str, Any],
) -> str:
    """Assemble the prompt sent to Claude for one batch of entries.

    The prompt opens with fixed translator instructions, optionally names the
    reference languages available anywhere in the batch, renders every entry
    through ``_render_item``, appends a plural-forms reminder when at least
    one entry is plural, and closes with the expected JSON output format.
    """
    target_name = _lang_name(target_lang)

    # Union of the other languages holding a non-empty translation for at
    # least one entry of this batch; it drives both the summary sentence and
    # the per-entry reference lines.
    reference_langs_sorted = sorted(
        {
            lang
            for entry in batch
            if entry["index_key"] in index
            for lang, val in index[entry["index_key"]].items()
            if lang != target_lang and val
        }
    )

    rules = [
        "- Preserve all format placeholders exactly: %(name)s, {name}, %s, %d, etc.",
        "- Preserve HTML tags if present.",
        "- Keep the same tone and register as the reference translations.",
        "- For plural forms, provide translations for all plural forms"
        " required by the language.",
        "- Return ONLY a JSON object mapping each numeric index (as a string)"
        " to its translation.",
        "- Do not add any explanation, preamble, or markdown fences.",
    ]
    disambiguation = (
        "Important: Many strings are short fragments or single words that are"
        " ambiguous in English (e.g. 'Scale' could mean a measurement scale,"
        " to scale an image, or fish scales). Use the translations in other"
        " languages as your primary signal for which meaning is intended —"
        " they collectively disambiguate the intended sense. When no"
        " other-language translations are available for an entry, translate"
        " conservatively based on the most common meaning in a data"
        " visualization UI context."
    )

    prompt_lines: list[str] = [
        "You are a professional translator specializing in software UI strings.",
        f"Translate the following English strings into {target_name} ({target_lang}).",
        "",
        "Rules:",
        *rules,
        "",
        disambiguation,
        "",
    ]

    if reference_langs_sorted:
        names = ", ".join(_lang_name(code) for code in reference_langs_sorted)
        prompt_lines += [
            f"Reference translations are provided per string where available "
            f"({names}).",
            "",
        ]

    prompt_lines += ["Strings to translate:", ""]

    for position, entry in enumerate(batch):
        prompt_lines.extend(
            _render_item(position, entry, index, target_lang, reference_langs_sorted)
        )

    # The reminder is gated on ANY plural entry, not just the first one:
    # batches follow .po order, so plural entries may sit behind singular ones.
    has_plural = any(entry.get("msgid_plural") for entry in batch)
    if has_plural:
        prompt_lines += [
            "Note: provide ALL plural forms required by the target language "
            "(e.g. French needs 2, Russian needs 3, Arabic needs 6).",
            "",
        ]

    prompt_lines += [
        'Expected output format: {"0": "<translation>", "1": "<translation>", ...}',
        "(keys are the numeric indices of the strings above)",
    ]

    return "\n".join(prompt_lines)


def parse_response(text: str, batch_size: int) -> dict[int, str]:
    """Parse the JSON object from Claude's response.

    Args:
        text: Raw model output. A surrounding markdown code fence, if any,
            is stripped before parsing.
        batch_size: Number of entries in the batch. Kept for interface
            compatibility; it is not used for validation here.

    Returns:
        A dict mapping batch index to translation text. Object or array
        values (plural responses) are re-serialized as JSON strings so they
        can be decoded again when the translation is applied. Entries whose
        key is not a decimal integer, or whose value is JSON ``null``, are
        omitted.

    Raises:
        ValueError: If the text is not valid JSON or is not a JSON object.
    """
    # Strip any accidental markdown fences. The closing fence is removed
    # whether or not a newline precedes it.
    body = re.sub(r"^```[^\n]*\n", "", text.strip())
    body = re.sub(r"\s*```$", "", body)
    try:
        raw = json.loads(body)
    except json.JSONDecodeError as exc:
        raise ValueError(
            f"Could not parse response as JSON: {exc}\n\nResponse:\n{body}"
        ) from exc
    # _process_batches only catches ValueError/RuntimeError, so a non-object
    # response (list, scalar, null) must surface as ValueError rather than
    # bubbling up an AttributeError from .items() and aborting the whole run.
    if not isinstance(raw, dict):
        raise ValueError(
            f"Expected a JSON object mapping indices to translations, "
            f"got {type(raw).__name__}.\n\nResponse:\n{body}"
        )

    parsed: dict[int, str] = {}
    for key, value in raw.items():
        # isdecimal(), not isdigit(): isdigit() also accepts characters such
        # as superscript digits that int() rejects, which would turn one odd
        # key into a ValueError for the whole batch.
        if not str(key).isdecimal():
            continue
        # JSON null means "no translation". Omitting the index lets the caller
        # report it as missing instead of writing the literal text "None".
        if value is None:
            continue
        if isinstance(value, (dict, list)):
            # str() on a dict yields Python repr ({'0': 'x'}), which is not
            # valid JSON; keep containers as JSON text instead.
            parsed[int(key)] = json.dumps(value, ensure_ascii=False)
        else:
            parsed[int(key)] = str(value)
    return parsed


def translate_batch(
    model: str,
    target_lang: str,
    batch: list[dict[str, Any]],
    index: dict[str, Any],
) -> dict[int, str]:
    """Send a batch of strings to Claude via `claude -p`.

    Args:
        model: Claude model ID passed to the CLI's ``--model`` option.
        target_lang: Language code the batch is translated into.
        batch: Batch item dicts as consumed by ``build_prompt``.
        index: Translation index used to add reference translations.

    Returns:
        A dict mapping batch index to translated string.

    Raises:
        RuntimeError: If the ``claude`` executable is not on PATH or exits
            with a non-zero status.
        ValueError: If the CLI output cannot be parsed (see
            ``parse_response``).
    """
    claude_bin = shutil.which("claude")
    if not claude_bin:
        raise RuntimeError(
            "claude CLI not found. Install Claude Code or add it to PATH."
        )
    prompt = build_prompt(target_lang, batch, index)
    # Pipe the prompt over stdin rather than passing it as argv: a single batch
    # with many reference languages can grow into the tens of KB and approach
    # ARG_MAX on some platforms.
    # The pipes are pinned to UTF-8 because the prompt carries translations in
    # arbitrary scripts. With text=True alone the locale's preferred encoding
    # is used, which on a non-UTF-8 locale either fails to encode the prompt
    # or mis-decodes the reply.
    # NOTE(review): assumes the claude CLI reads and writes UTF-8 — confirm.
    # claude_bin is resolved via shutil.which — not user-controlled input
    result = subprocess.run(  # noqa: S603
        [claude_bin, "--model", model, "-p"],
        input=prompt,
        capture_output=True,
        text=True,
        encoding="utf-8",
        check=False,
    )
    if result.returncode != 0:
        raise RuntimeError(
            f"claude exited with code {result.returncode}:\n{result.stderr}"
        )
    return parse_response(result.stdout.strip(), len(batch))


def _apply_plural_translation(entry: polib.POEntry, translation: str) -> None:
    """Distribute a model response across the entry's plural forms.

    ``translation`` may be:

    * a JSON object (``{"0": "form0", "1": "form1"}``): its forms are written
      by index. Plural slots the entry already has but the object omits are
      filled with the highest-indexed form, so no existing slot is dropped.
      Keys that are not integers (e.g. ``"one"``/``"other"``) make the object
      be read in order, like a list.
    * a non-empty JSON list (``["form0", "form1"]``): forms are written to the
      entry's existing slots in order, and the last form is repeated when the
      list is shorter than the number of slots.
    * anything else (plain text, a JSON scalar, an empty list or object): one
      value is written to every existing slot. A JSON string is decoded; any
      other value is written exactly as the model produced it.

    Args:
        entry: Plural entry whose ``msgstr_plural`` is updated in place or
            replaced.
        translation: Translation text as returned for this entry.
    """
    try:
        parsed = json.loads(translation)
    except ValueError:  # json.JSONDecodeError is a ValueError subclass
        parsed = translation

    if isinstance(parsed, dict) and parsed:
        try:
            forms = {int(k): str(v) for k, v in parsed.items()}
        except ValueError:
            # Non-numeric keys would otherwise raise out of the batch loop and
            # abort the run; JSON objects keep their order, so use position.
            forms = {pos: str(v) for pos, v in enumerate(parsed.values())}
        # Start from the entry's existing slots so a short answer cannot
        # remove msgstr[n] lines the catalog already had.
        last_form = forms[max(forms)]
        merged = {slot: last_form for slot in entry.msgstr_plural}
        merged.update(forms)
        entry.msgstr_plural = merged
        return

    if isinstance(parsed, list) and parsed:
        # Distribute list items across plural form indices in order; if the
        # model returned fewer forms than the entry has slots, repeat the
        # last form rather than leaving slots blank.
        ordered = [str(v) for v in parsed]
        for slot in sorted(entry.msgstr_plural):
            entry.msgstr_plural[slot] = (
                ordered[slot] if slot < len(ordered) else ordered[-1]
            )
        return

    # Single value for every form. Only a decoded JSON string is used as-is;
    # re-rendering other scalars through str() would change the text
    # (true -> "True", 1e3 -> "1000.0"), so the raw text is kept for those.
    fill = parsed if isinstance(parsed, str) else translation
    for slot in entry.msgstr_plural:
        entry.msgstr_plural[slot] = fill


def _apply_translation(
    entry: polib.POEntry,
    translation: str,
    item: dict[str, Any],
    model: str,
    mark_fuzzy: bool,
) -> None:
    """Store a translation on a POEntry and record where it came from.

    Plural entries are delegated to ``_apply_plural_translation``; singular
    entries get ``translation`` as their msgstr. The entry is optionally
    flagged fuzzy, and a one-line attribution naming the model and the
    reference languages from ``item["context_langs"]`` is added to the
    translator comment unless that exact line is already there.
    """
    if not entry.msgid_plural:
        entry.msgstr = translation
    else:
        _apply_plural_translation(entry, translation)

    if mark_fuzzy and "fuzzy" not in entry.flags:
        entry.flags.append("fuzzy")

    used_langs = item["context_langs"]
    if used_langs:
        refs_tag = " [refs: " + ", ".join(used_langs) + "]"
    else:
        refs_tag = " [no refs]"
    attribution = f"Machine-translated via backfill_po.py ({model}){refs_tag}"

    existing = entry.tcomment
    if not existing:
        entry.tcomment = attribution
    elif attribution not in existing:
        # Keep whatever the translator comment already said and append ours.
        entry.tcomment = f"{existing}\n{attribution}"


def _build_batch_items(
    entries: list[polib.POEntry],
    index: dict[str, Any],
    lang: str,
) -> list[dict[str, Any]]:
    """Turn POEntries into the item dicts consumed by translate_batch.

    Every item carries ``msgid``, ``index_key``, ``is_plural``,
    ``context_langs`` and ``context_count``; plural entries additionally
    carry ``msgid_plural`` and use the combined plural key as ``index_key``.
    """
    items: list[dict[str, Any]] = []
    for entry in entries:
        item: dict[str, Any] = {"msgid": entry.msgid}
        if entry.msgid_plural:
            item["msgid_plural"] = entry.msgid_plural
            item["index_key"] = _plural_key(entry.msgid, entry.msgid_plural)
            item["is_plural"] = True
        else:
            item["index_key"] = entry.msgid
            item["is_plural"] = False
        langs = _context_langs(item, index, lang)
        item["context_langs"] = langs
        item["context_count"] = len(langs)
        items.append(item)
    return items


def _process_batches(
    missing: list[polib.POEntry],
    index: dict[str, Any],
    lang: str,
    batch_size: int,
    model: str,
    dry_run: bool,
    mark_fuzzy: bool,
    cat: polib.POFile | None = None,
    po_path: Path | None = None,
) -> tuple[int, int]:
    """Translate the missing entries batch by batch.

    Returns a ``(translated, failed)`` pair of counts. A batch whose request
    raises ``ValueError`` or ``RuntimeError`` is reported and counted as
    failed in full; an entry the model returned nothing for is counted as
    failed individually.

    Outside dry-run mode, and only when both ``cat`` and ``po_path`` are
    given, the catalog is saved after every batch that applied at least one
    translation, so an interruption loses at most the batch in flight.
    """
    total = len(missing)
    n_translated = 0
    n_failed = 0
    may_save = not dry_run and cat is not None and po_path is not None

    for offset in range(0, total, batch_size):
        chunk = missing[offset : offset + batch_size]
        chunk_items = _build_batch_items(chunk, index, lang)
        last = min(offset + batch_size, total)
        print(
            f"  Translating entries {offset + 1}–{last} of {total} …",
            file=sys.stderr,
        )
        try:
            results = translate_batch(model, lang, chunk_items, index)
        except (ValueError, RuntimeError) as exc:
            print(f"  ERROR in batch starting at {offset}: {exc}", file=sys.stderr)
            n_failed += len(chunk)
            continue

        applied = 0
        for i, (entry, item) in enumerate(zip(chunk, chunk_items)):
            translation = results.get(i)
            if translation is None:
                print(
                    f"  WARNING: no translation returned for index {i} "
                    f"(msgid: {entry.msgid[:60]!r})",
                    file=sys.stderr,
                )
                n_failed += 1
                continue
            if not dry_run:
                _apply_translation(entry, translation, item, model, mark_fuzzy)
                applied += 1
            else:
                # Preview only; entries with little context get a [ctx:N] tag.
                ctx = item["context_count"]
                ctx_tag = f" [ctx:{ctx}]" if ctx < 3 else ""
                print(
                    f"  [{lang}]{ctx_tag} {entry.msgid[:60]!r} → {translation[:60]!r}"
                )
            n_translated += 1

        if may_save and applied > 0:
            cat.save()
            print(
                f"  Saved {po_path} ({applied} entry(ies) in this batch).",
                file=sys.stderr,
            )

    return n_translated, n_failed


def backfill(
    lang: str,
    *,
    batch_size: int = DEFAULT_BATCH_SIZE,
    limit: int | None = None,
    min_context: int = 0,
    model: str = DEFAULT_MODEL,
    index_path: Path = DEFAULT_INDEX,
    dry_run: bool = False,
    mark_fuzzy: bool = True,
) -> None:
    """Backfill missing translations in the target language's .po file.

    Args:
        lang: Language code of the catalog to backfill (e.g. ``fr``,
            ``pt_BR``).
        batch_size: Number of entries per Claude request; must be >= 1.
        limit: Maximum number of entries to translate, or ``None`` for no
            limit; must not be negative.
        min_context: Skip entries that have fewer than this many reference
            translations in other languages (0 keeps everything).
        model: Claude model ID.
        index_path: Path to the translation index JSON file.
        dry_run: Print the proposed translations instead of writing them.
        mark_fuzzy: Flag every written translation as fuzzy.

    Exits the process with status 1 when an argument is invalid or when the
    .po file or the index file does not exist.
    """
    # Defense against path traversal: ``lang`` lands in a filesystem path
    # without further sanitization, so reject anything that isn't an
    # ISO 639-1/639-2 code with an optional ISO 3166 region (e.g. ``pt_BR``).
    if not re.fullmatch(r"[a-z]{2,3}(_[A-Z]{2})?", lang):
        print(
            f"Invalid language code: {lang!r} "
            "(expected ISO 639 code, optionally with _<REGION>, e.g. 'fr' or 'pt_BR')",
            file=sys.stderr,
        )
        sys.exit(1)
    # The batch loop steps through the entries with range(): a zero step
    # raises there and a negative step silently yields no batches at all.
    if batch_size < 1:
        print(
            f"Invalid batch size: {batch_size} (must be at least 1)",
            file=sys.stderr,
        )
        sys.exit(1)
    # A negative limit would slice entries off the END of the list instead of
    # capping how many are translated.
    if limit is not None and limit < 0:
        print(f"Invalid limit: {limit} (must be 0 or greater)", file=sys.stderr)
        sys.exit(1)
    po_path = TRANSLATIONS_DIR / lang / "LC_MESSAGES" / "messages.po"
    if not po_path.exists():
        print(f"No .po file found for language '{lang}': {po_path}", file=sys.stderr)
        sys.exit(1)
    if not index_path.exists():
        print(
            f"Translation index not found at {index_path}.\n"
            "Run: python scripts/translations/build_translation_index.py",
            file=sys.stderr,
        )
        sys.exit(1)

    print("Loading translation index …", file=sys.stderr)
    with open(index_path, encoding="utf-8") as f:
        index: dict[str, Any] = json.load(f)

    print(f"Loading {po_path} …", file=sys.stderr)
    cat = polib.pofile(str(po_path))

    # ``e.msgid`` is empty for the catalog header entry, which is skipped.
    missing: list[polib.POEntry] = [e for e in cat if e.msgid and _is_missing(e)]
    print(f"Found {len(missing)} untranslated entries for '{lang}'.", file=sys.stderr)

    if min_context > 0:
        before = len(missing)
        missing = [
            e
            for e in missing
            if _context_count(
                {
                    "index_key": (
                        _plural_key(e.msgid, e.msgid_plural)
                        if e.msgid_plural
                        else e.msgid
                    )
                },
                index,
                lang,
            )
            >= min_context
        ]
        skipped = before - len(missing)
        print(
            f"Skipping {skipped} entries with fewer than {min_context} reference "
            f"translation(s) (use --min-context 0 to include them).",
            file=sys.stderr,
        )

    if limit is not None:
        missing = missing[:limit]
        print(f"Limiting to {limit} entries.", file=sys.stderr)

    if not missing:
        print("Nothing to do.", file=sys.stderr)
        return

    translated_count, failed_count = _process_batches(
        missing,
        index,
        lang,
        batch_size,
        model,
        dry_run,
        mark_fuzzy,
        cat=cat,
        po_path=po_path,
    )

    print(
        f"\nDone. Translated: {translated_count}, Failed/skipped: {failed_count}.",
        file=sys.stderr,
    )
    if not dry_run and translated_count > 0:
        # Only claim the entries are fuzzy when they were actually flagged.
        status = (
            "marked #, fuzzy for review"
            if mark_fuzzy
            else "NOT marked fuzzy — review before compiling"
        )
        print(
            f"Translations written to {po_path} ({status}).",
            file=sys.stderr,
        )


def main() -> None:
    """Command-line entry point: read the options and run ``backfill``."""
    min_context_help = (
        "Skip entries with fewer than N reference translations in other languages "
        "(default: 0 = translate everything). Strings with low context are more "
        "likely to be ambiguous single words or fragments — set to e.g. 2 to only "
        "translate strings that have been confirmed in at least 2 other languages."
    )
    no_fuzzy_help = (
        "Do not mark generated translations as #, fuzzy. "
        "WARNING: fuzzy entries are excluded from compiled .mo files. "
        "Removing this flag causes AI-generated translations to be served "
        "to end users without human review — only use after you have "
        "manually verified the .po file."
    )

    parser = argparse.ArgumentParser(
        description="Backfill missing .po translations using Claude AI",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=__doc__,
    )
    # Options are registered in the order they appear in the --help output.
    add_option = parser.add_argument
    add_option("--lang", required=True, help="ISO language code (e.g. fr, de, ja)")
    add_option(
        "--batch-size",
        type=int,
        default=DEFAULT_BATCH_SIZE,
        help=f"Strings per Claude request (default: {DEFAULT_BATCH_SIZE})",
    )
    add_option(
        "--limit",
        type=int,
        default=None,
        help="Maximum number of entries to translate (default: unlimited)",
    )
    add_option(
        "--model",
        default=DEFAULT_MODEL,
        help=f"Claude model ID (default: {DEFAULT_MODEL})",
    )
    add_option(
        "--index",
        type=Path,
        default=DEFAULT_INDEX,
        help=f"Path to translation_index.json (default: {DEFAULT_INDEX})",
    )
    add_option(
        "--dry-run",
        action="store_true",
        help="Print translations without modifying the .po file",
    )
    add_option(
        "--min-context",
        type=int,
        default=0,
        metavar="N",
        help=min_context_help,
    )
    # --no-fuzzy switches the default-on ``mark_fuzzy`` setting off.
    add_option(
        "--no-fuzzy",
        dest="mark_fuzzy",
        action="store_false",
        default=True,
        help=no_fuzzy_help,
    )
    opts = parser.parse_args()

    backfill(
        opts.lang,
        batch_size=opts.batch_size,
        limit=opts.limit,
        min_context=opts.min_context,
        model=opts.model,
        index_path=opts.index,
        dry_run=opts.dry_run,
        mark_fuzzy=opts.mark_fuzzy,
    )


if __name__ == "__main__":
    main()