mirror of
https://github.com/apache/superset.git
synced 2026-05-29 20:29:34 +00:00
feat(i18n): AI-assisted translation backfill tooling + Spanish translations (#39448)
Co-authored-by: Claude Code <noreply@anthropic.com> Co-authored-by: codeant-ai-for-open-source[bot] <244253245+codeant-ai-for-open-source[bot]@users.noreply.github.com> Co-authored-by: Superset Dev <dev@superset.apache.org> Co-authored-by: Đỗ Trọng Hải <41283691+hainenber@users.noreply.github.com> Co-authored-by: Claude <claude@anthropic.com>
This commit is contained in:
653
scripts/translations/backfill_po.py
Normal file
653
scripts/translations/backfill_po.py
Normal file
@@ -0,0 +1,653 @@
|
||||
# Licensed to the Apache Software Foundation (ASF) under one
|
||||
# or more contributor license agreements. See the NOTICE file
|
||||
# distributed with this work for additional information
|
||||
# regarding copyright ownership. The ASF licenses this file
|
||||
# to you under the Apache License, Version 2.0 (the
|
||||
# "License"); you may not use this file except in compliance
|
||||
# with the License. You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing,
|
||||
# software distributed under the License is distributed on an
|
||||
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
# KIND, either express or implied. See the License for the
|
||||
# specific language governing permissions and limitations
|
||||
# under the License.
|
||||
"""Backfill missing translations in a .po file using Claude AI.
|
||||
|
||||
For each untranslated (empty msgstr) entry in the target language, the script
|
||||
sends the English source string along with all available translations in other
|
||||
languages to Claude as context, then writes the AI-generated translation back
|
||||
into the .po file marked as #, fuzzy for human review.
|
||||
|
||||
Usage:
|
||||
# Build the translation index first (one-time or when .po files change)
|
||||
python scripts/translations/build_translation_index.py
|
||||
|
||||
# Backfill French translations
|
||||
python scripts/translations/backfill_po.py --lang fr
|
||||
|
||||
# Dry run (print what would be translated without writing)
|
||||
python scripts/translations/backfill_po.py --lang de --dry-run
|
||||
|
||||
# Limit to 100 entries and use a specific model
|
||||
python scripts/translations/backfill_po.py --lang es --limit 100 \
|
||||
--model claude-opus-4-6
|
||||
|
||||
Options:
|
||||
--lang LANG ISO language code to backfill (required)
|
||||
--batch-size N Number of strings per Claude request (default: 50)
|
||||
--limit N Stop after translating N entries (default: unlimited)
|
||||
--min-context N Skip entries with fewer than N existing translations across
|
||||
reference languages (default: 0 — translate everything)
|
||||
--model MODEL Claude model ID (default: claude-sonnet-4-6)
|
||||
--index PATH Path to translation_index.json (default: auto-detect)
|
||||
--dry-run Print translations without writing to .po file
|
||||
--no-fuzzy Do not mark generated translations as fuzzy (default: mark fuzzy)
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import re
|
||||
import shutil
|
||||
import subprocess
|
||||
import sys
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
try:
|
||||
import polib # type: ignore[import-untyped]
|
||||
except ImportError:
|
||||
print("polib is required. Run: pip install polib", file=sys.stderr)
|
||||
sys.exit(1)
|
||||
|
||||
# Three ``.parent`` hops from this file lead to the directory that contains
# ``superset/translations``.
TRANSLATIONS_DIR = Path(__file__).parent.parent.parent.joinpath(
    "superset", "translations"
)
DEFAULT_INDEX = TRANSLATIONS_DIR.joinpath("translation_index.json")

# Fallbacks for the --model and --batch-size command-line options.
DEFAULT_MODEL = "claude-sonnet-4-6"
DEFAULT_BATCH_SIZE = 50

# Human-readable language names inserted into the prompt, keyed by language
# code. Codes that are absent here are shown to the model verbatim.
LANGUAGE_NAMES: dict[str, str] = dict(
    ar="Arabic",
    ca="Catalan",
    de="German",
    es="Spanish",
    fa="Persian (Farsi)",
    fr="French",
    it="Italian",
    ja="Japanese",
    ko="Korean",
    mi="Māori",
    nl="Dutch",
    pl="Polish",
    pt="Portuguese",
    pt_BR="Brazilian Portuguese",
    ru="Russian",
    sk="Slovak",
    sl="Slovenian",
    tr="Turkish",
    uk="Ukrainian",
    zh="Chinese (Simplified)",
    zh_TW="Chinese (Traditional)",
)
|
||||
|
||||
|
||||
def _lang_name(code: str) -> str:
    """Look up the display name for a language code.

    Codes that have no entry in ``LANGUAGE_NAMES`` are returned unchanged.
    """
    try:
        return LANGUAGE_NAMES[code]
    except KeyError:
        return code
|
||||
|
||||
|
||||
def _plural_key(msgid: str, msgid_plural: str) -> str:
|
||||
"""Build the translation index key used for pluralized entries."""
|
||||
return f"{msgid}\x00{msgid_plural}"
|
||||
|
||||
|
||||
def _is_missing(entry: polib.POEntry) -> bool:
|
||||
"""Return True for entries that need a translation."""
|
||||
if entry.obsolete:
|
||||
return False
|
||||
if entry.msgid_plural:
|
||||
return not any(v for v in entry.msgstr_plural.values())
|
||||
return not entry.msgstr
|
||||
|
||||
|
||||
def _context_langs(
|
||||
item: dict[str, Any], index: dict[str, Any], target_lang: str
|
||||
) -> list[str]:
|
||||
"""Return sorted list of language codes that have translations for this entry."""
|
||||
key = item["index_key"]
|
||||
if key not in index:
|
||||
return []
|
||||
return sorted(
|
||||
lang for lang, val in index[key].items() if lang != target_lang and val
|
||||
)
|
||||
|
||||
|
||||
def _context_count(
    item: dict[str, Any], index: dict[str, Any], target_lang: str
) -> int:
    """Count the languages, other than the target, that translate this entry."""
    available = _context_langs(item, index, target_lang)
    return len(available)
|
||||
|
||||
|
||||
def _render_item(
    i: int,
    item: dict[str, Any],
    index: dict[str, Any],
    target_lang: str,
    reference_langs_sorted: list[str],
) -> list[str]:
    """Produce the prompt lines describing a single batch entry.

    The section starts with a header carrying the entry's batch position
    ``i`` and its reference-translation count, followed by the English
    source (and plural source, if any), one line per reference language
    that has a non-null value in the index, and a trailing blank line.
    """

    def _quoted(text: Any) -> str:
        # JSON-quote so embedded quotes and newlines stay unambiguous.
        return json.dumps(text, ensure_ascii=False)

    ref_total = _context_count(item, index, target_lang)
    if ref_total:
        suffix = "" if ref_total == 1 else "s"
        header = f"--- [{i}] ({ref_total} reference translation{suffix}) ---"
    else:
        header = (
            f"--- [{i}] (no reference translations — translate conservatively) ---"
        )

    rendered: list[str] = [header, f"English: {_quoted(item['msgid'])}"]
    if item.get("msgid_plural"):
        rendered.append(f"English plural: {_quoted(item['msgid_plural'])}")

    key = item["index_key"]
    if key in index and reference_langs_sorted:
        per_lang = index[key]
        for code in reference_langs_sorted:
            value = per_lang.get(code)
            if value is None:
                continue
            if isinstance(value, dict):
                # Plural reference: show every form tagged with its form key.
                shown = "; ".join(
                    f"[{form_key}] {_quoted(form)}" for form_key, form in value.items()
                )
            else:
                shown = _quoted(value)
            rendered.append(f"{_lang_name(code)}: {shown}")

    rendered.append("")
    return rendered
|
||||
|
||||
|
||||
def build_prompt(
    target_lang: str,
    batch: list[dict[str, Any]],
    index: dict[str, Any],
) -> str:
    """Assemble the complete Claude prompt for one batch of entries.

    The prompt consists of the translator instructions, an optional sentence
    naming the reference languages available for this batch, one rendered
    section per entry, an optional reminder about plural forms, and a
    description of the expected JSON output. Lines are joined with newlines.
    """
    lang_name = _lang_name(target_lang)

    # Union of the languages that have a usable translation for at least one
    # entry of this batch (the target language is excluded by the helper).
    available: set[str] = set()
    for item in batch:
        available.update(_context_langs(item, index, target_lang))
    reference_langs_sorted = sorted(available)

    ambiguity_note = (
        "Important: Many strings are short fragments or single words that are"
        " ambiguous in English (e.g. 'Scale' could mean a measurement scale,"
        " to scale an image, or fish scales). Use the translations in other"
        " languages as your primary signal for which meaning is intended —"
        " they collectively disambiguate the intended sense. When no"
        " other-language translations are available for an entry, translate"
        " conservatively based on the most common meaning in a data"
        " visualization UI context."
    )

    prompt_lines: list[str] = [
        "You are a professional translator specializing in software UI strings.",
        f"Translate the following English strings into {lang_name} ({target_lang}).",
        "",
        "Rules:",
        "- Preserve all format placeholders exactly: %(name)s, {name}, %s, %d, etc.",
        "- Preserve HTML tags if present.",
        "- Keep the same tone and register as the reference translations.",
        "- For plural forms, provide translations for all plural forms"
        " required by the language.",
        "- Return ONLY a JSON object mapping each numeric index (as a string)"
        " to its translation.",
        "- Do not add any explanation, preamble, or markdown fences.",
        "",
        ambiguity_note,
        "",
    ]

    if reference_langs_sorted:
        names = ", ".join(_lang_name(lc) for lc in reference_langs_sorted)
        prompt_lines += [
            f"Reference translations are provided per string where available ({names}).",
            "",
        ]

    prompt_lines += ["Strings to translate:", ""]
    for position, item in enumerate(batch):
        prompt_lines += _render_item(
            position, item, index, target_lang, reference_langs_sorted
        )

    # The plural reminder is triggered by ANY plural entry in the batch:
    # singular and plural entries are mixed in .po order, so looking only at
    # the first entry would drop the reminder whenever a plural comes later.
    has_plural = any(item.get("msgid_plural") for item in batch)
    if has_plural:
        prompt_lines += [
            "Note: provide ALL plural forms required by the target language "
            "(e.g. French needs 2, Russian needs 3, Arabic needs 6).",
            "",
        ]

    prompt_lines += [
        'Expected output format: {"0": "<translation>", "1": "<translation>", ...}',
        "(keys are the numeric indices of the strings above)",
    ]
    return "\n".join(prompt_lines)
|
||||
|
||||
|
||||
def parse_response(text: str, batch_size: int) -> dict[int, str]:
    """Parse the JSON object from Claude's response.

    Args:
        text: Raw model output. A single surrounding markdown code fence is
            tolerated and stripped before parsing.
        batch_size: Number of entries that were sent in the request. Indices
            outside ``range(batch_size)`` cannot refer to any entry and are
            dropped.

    Returns:
        A dict mapping batch index to translation text. Dict and list values
        (plural responses) are kept as JSON text so they can be parsed again
        when applied; other values are converted with ``str``. Entries whose
        value is JSON ``null`` are omitted, so the caller sees them as "no
        translation returned" instead of the literal text ``None``.

    Raises:
        ValueError: If the text is not valid JSON or is not a JSON object.
    """
    # Strip any accidental markdown fences
    body = re.sub(r"^```[^\n]*\n", "", text.strip())
    body = re.sub(r"\n```$", "", body)
    try:
        raw = json.loads(body)
    except json.JSONDecodeError as exc:
        raise ValueError(
            f"Could not parse response as JSON: {exc}\n\nResponse:\n{body}"
        ) from exc
    # _process_batches only catches ValueError/RuntimeError, so a non-object
    # response (list, scalar, null) must surface as ValueError rather than
    # bubbling up an AttributeError from .items() and aborting the whole run.
    if not isinstance(raw, dict):
        raise ValueError(
            f"Expected a JSON object mapping indices to translations, "
            f"got {type(raw).__name__}.\n\nResponse:\n{body}"
        )

    parsed: dict[int, str] = {}
    for key, value in raw.items():
        # isdecimal() (unlike isdigit()) only accepts characters that int()
        # can convert, so keys such as "²" are skipped instead of raising.
        if not str(key).isdecimal():
            continue
        position = int(key)
        if position >= batch_size:
            continue
        if value is None:
            continue
        if isinstance(value, (dict, list)):
            # str() on a dict yields Python repr ({'0': 'x'}), which is not
            # valid JSON; json.dumps keeps the value re-parseable.
            parsed[position] = json.dumps(value, ensure_ascii=False)
        else:
            parsed[position] = str(value)
    return parsed
|
||||
|
||||
|
||||
def translate_batch(
    model: str,
    target_lang: str,
    batch: list[dict[str, Any]],
    index: dict[str, Any],
) -> dict[int, str]:
    """Send a batch of strings to Claude via `claude -p`.

    Args:
        model: Model ID passed to the CLI through ``--model``.
        target_lang: Language code to translate into.
        batch: Entries to translate, in the dict format used by
            ``build_prompt``.
        index: Cross-language translation index used for reference context.

    Returns:
        A dict mapping batch index to translated string.

    Raises:
        RuntimeError: If the ``claude`` executable is not on PATH or exits
            with a non-zero status.
        ValueError: If the CLI output cannot be parsed by ``parse_response``.
    """
    claude_bin = shutil.which("claude")
    if not claude_bin:
        raise RuntimeError(
            "claude CLI not found. Install Claude Code or add it to PATH."
        )
    prompt = build_prompt(target_lang, batch, index)
    # Pipe the prompt over stdin rather than passing it as argv: a single batch
    # with many reference languages can grow into the tens of KB and approach
    # ARG_MAX on some platforms.
    # The prompt and the reply contain non-ASCII text (the instructions use an
    # em dash and the references span many scripts), so the pipes are pinned
    # to UTF-8 instead of the locale's default encoding.
    # claude_bin is resolved via shutil.which — not user-controlled input
    result = subprocess.run(  # noqa: S603
        [claude_bin, "--model", model, "-p"],
        input=prompt,
        capture_output=True,
        text=True,
        encoding="utf-8",
        check=False,
    )
    if result.returncode != 0:
        raise RuntimeError(
            f"claude exited with code {result.returncode}:\n{result.stderr}"
        )
    return parse_response(result.stdout.strip(), len(batch))
|
||||
|
||||
|
||||
def _apply_plural_translation(entry: polib.POEntry, translation: str) -> None:
|
||||
"""Distribute a model response across the entry's plural forms.
|
||||
|
||||
Model may return a JSON dict ({"0": "form0", "1": "form1"}), a JSON list
|
||||
(["form0", "form1"], also valid since plural forms are ordered), a JSON
|
||||
scalar (a single translation that fills every form), or a plain non-JSON
|
||||
string (older models that ignore the JSON instruction).
|
||||
"""
|
||||
try:
|
||||
plural_value = json.loads(translation)
|
||||
except (json.JSONDecodeError, ValueError):
|
||||
for k in entry.msgstr_plural:
|
||||
entry.msgstr_plural[k] = translation
|
||||
return
|
||||
|
||||
if isinstance(plural_value, dict):
|
||||
entry.msgstr_plural = {int(k): str(v) for k, v in plural_value.items()}
|
||||
return
|
||||
|
||||
if isinstance(plural_value, list) and plural_value:
|
||||
# Distribute list items across plural form indices in order; if the
|
||||
# model returned fewer forms than the language requires, repeat the
|
||||
# last form rather than leaving slots blank.
|
||||
forms = [str(v) for v in plural_value]
|
||||
for k in sorted(entry.msgstr_plural):
|
||||
entry.msgstr_plural[k] = forms[k] if k < len(forms) else forms[-1]
|
||||
return
|
||||
|
||||
# Scalar (or empty list) — broadcast to every form.
|
||||
fill = str(plural_value) if plural_value not in (None, []) else translation
|
||||
for k in entry.msgstr_plural:
|
||||
entry.msgstr_plural[k] = fill
|
||||
|
||||
|
||||
def _apply_translation(
|
||||
entry: polib.POEntry,
|
||||
translation: str,
|
||||
item: dict[str, Any],
|
||||
model: str,
|
||||
mark_fuzzy: bool,
|
||||
) -> None:
|
||||
"""Write a translation string into a POEntry and add attribution."""
|
||||
if entry.msgid_plural:
|
||||
_apply_plural_translation(entry, translation)
|
||||
else:
|
||||
entry.msgstr = translation
|
||||
|
||||
if mark_fuzzy and "fuzzy" not in entry.flags:
|
||||
entry.flags.append("fuzzy")
|
||||
|
||||
refs = item["context_langs"]
|
||||
refs_tag = f" [refs: {', '.join(refs)}]" if refs else " [no refs]"
|
||||
attribution = f"Machine-translated via backfill_po.py ({model}){refs_tag}"
|
||||
if entry.tcomment:
|
||||
if attribution not in entry.tcomment:
|
||||
entry.tcomment = f"{entry.tcomment}\n{attribution}"
|
||||
else:
|
||||
entry.tcomment = attribution
|
||||
|
||||
|
||||
def _build_batch_items(
    entries: list[polib.POEntry],
    index: dict[str, Any],
    lang: str,
) -> list[dict[str, Any]]:
    """Turn POEntries into the plain dicts consumed by ``translate_batch``.

    Every dict carries ``msgid``, ``index_key`` and ``is_plural`` (plus
    ``msgid_plural`` for plural entries), along with ``context_langs`` and
    ``context_count`` describing the reference translations available for
    the entry in languages other than ``lang``.
    """
    built: list[dict[str, Any]] = []
    for entry in entries:
        record: dict[str, Any] = {"msgid": entry.msgid}
        if entry.msgid_plural:
            record["msgid_plural"] = entry.msgid_plural
            record["index_key"] = _plural_key(entry.msgid, entry.msgid_plural)
            record["is_plural"] = True
        else:
            record["index_key"] = entry.msgid
            record["is_plural"] = False
        ref_langs = _context_langs(record, index, lang)
        record["context_langs"] = ref_langs
        record["context_count"] = len(ref_langs)
        built.append(record)
    return built
|
||||
|
||||
|
||||
def _process_batches(
|
||||
missing: list[polib.POEntry],
|
||||
index: dict[str, Any],
|
||||
lang: str,
|
||||
batch_size: int,
|
||||
model: str,
|
||||
dry_run: bool,
|
||||
mark_fuzzy: bool,
|
||||
cat: polib.POFile | None = None,
|
||||
po_path: Path | None = None,
|
||||
) -> tuple[int, int]:
|
||||
"""Translate missing entries in batches. Returns (translated, failed) counts.
|
||||
|
||||
When ``cat`` and ``po_path`` are provided and ``dry_run`` is False, the
|
||||
catalog is saved to disk after each batch that produced at least one
|
||||
successful translation. This means a crash mid-run only loses the in-flight
|
||||
batch rather than every batch translated so far.
|
||||
"""
|
||||
translated_count = 0
|
||||
failed_count = 0
|
||||
for batch_start in range(0, len(missing), batch_size):
|
||||
batch_entries = missing[batch_start : batch_start + batch_size]
|
||||
batch_items = _build_batch_items(batch_entries, index, lang)
|
||||
end = min(batch_start + batch_size, len(missing))
|
||||
print(
|
||||
f" Translating entries {batch_start + 1}–{end} of {len(missing)} …",
|
||||
file=sys.stderr,
|
||||
)
|
||||
try:
|
||||
translations = translate_batch(model, lang, batch_items, index)
|
||||
except (ValueError, RuntimeError) as exc:
|
||||
print(f" ERROR in batch starting at {batch_start}: {exc}", file=sys.stderr)
|
||||
failed_count += len(batch_entries)
|
||||
continue
|
||||
batch_applied = 0
|
||||
for i, entry in enumerate(batch_entries):
|
||||
translation = translations.get(i)
|
||||
if translation is None:
|
||||
print(
|
||||
f" WARNING: no translation returned for index {i} "
|
||||
f"(msgid: {entry.msgid[:60]!r})",
|
||||
file=sys.stderr,
|
||||
)
|
||||
failed_count += 1
|
||||
continue
|
||||
if dry_run:
|
||||
ctx = batch_items[i]["context_count"]
|
||||
ctx_tag = f" [ctx:{ctx}]" if ctx < 3 else ""
|
||||
print(
|
||||
f" [{lang}]{ctx_tag} {entry.msgid[:60]!r} → {translation[:60]!r}"
|
||||
)
|
||||
else:
|
||||
_apply_translation(
|
||||
entry, translation, batch_items[i], model, mark_fuzzy
|
||||
)
|
||||
batch_applied += 1
|
||||
translated_count += 1
|
||||
if (
|
||||
not dry_run
|
||||
and batch_applied > 0
|
||||
and cat is not None
|
||||
and po_path is not None
|
||||
):
|
||||
cat.save()
|
||||
print(
|
||||
f" Saved {po_path} ({batch_applied} entry(ies) in this batch).",
|
||||
file=sys.stderr,
|
||||
)
|
||||
return translated_count, failed_count
|
||||
|
||||
|
||||
def backfill(
    lang: str,
    *,
    batch_size: int = DEFAULT_BATCH_SIZE,
    limit: int | None = None,
    min_context: int = 0,
    model: str = DEFAULT_MODEL,
    index_path: Path = DEFAULT_INDEX,
    dry_run: bool = False,
    mark_fuzzy: bool = True,
) -> None:
    """Backfill missing translations in the target language's .po file.

    Args:
        lang: Language code selecting ``<lang>/LC_MESSAGES/messages.po``
            under the translations directory.
        batch_size: Number of entries per model request; must be at least 1.
        limit: Maximum number of entries to translate, or None for no limit;
            must not be negative.
        min_context: Skip entries that have fewer than this many reference
            translations in other languages.
        model: Model ID forwarded to the translation request.
        index_path: Location of the JSON translation index.
        dry_run: Print the translations instead of writing them.
        mark_fuzzy: Flag written translations as fuzzy.

    Invalid arguments and missing input files are reported on stderr and
    terminate the process with exit status 1.
    """
    # Defense against path traversal: ``lang`` lands in a filesystem path
    # without further sanitization, so reject anything that isn't an
    # ISO 639-1/639-2 code with an optional ISO 3166 region (e.g. ``pt_BR``).
    if not re.fullmatch(r"[a-z]{2,3}(_[A-Z]{2})?", lang):
        print(
            f"Invalid language code: {lang!r} "
            "(expected ISO 639 code, optionally with _<REGION>, e.g. 'fr' or 'pt_BR')",
            file=sys.stderr,
        )
        sys.exit(1)
    # A batch size of 0 would make range() raise while batching, and a
    # negative one would silently translate nothing.
    if batch_size < 1:
        print(
            f"Invalid batch size: {batch_size} (must be a positive integer)",
            file=sys.stderr,
        )
        sys.exit(1)
    # A negative limit would slice entries off the END of the list instead of
    # capping how many are translated.
    if limit is not None and limit < 0:
        print(
            f"Invalid limit: {limit} (must be zero or a positive integer)",
            file=sys.stderr,
        )
        sys.exit(1)

    po_path = TRANSLATIONS_DIR / lang / "LC_MESSAGES" / "messages.po"
    if not po_path.exists():
        print(f"No .po file found for language '{lang}': {po_path}", file=sys.stderr)
        sys.exit(1)
    if not index_path.exists():
        print(
            f"Translation index not found at {index_path}.\n"
            "Run: python scripts/translations/build_translation_index.py",
            file=sys.stderr,
        )
        sys.exit(1)

    print("Loading translation index …", file=sys.stderr)
    with open(index_path, encoding="utf-8") as f:
        index: dict[str, Any] = json.load(f)

    print(f"Loading {po_path} …", file=sys.stderr)
    cat = polib.pofile(str(po_path))

    # Entries with an empty msgid (the header) are never translated.
    missing: list[polib.POEntry] = [e for e in cat if e.msgid and _is_missing(e)]
    print(f"Found {len(missing)} untranslated entries for '{lang}'.", file=sys.stderr)

    if min_context > 0:

        def _reference_count(entry: polib.POEntry) -> int:
            key = (
                _plural_key(entry.msgid, entry.msgid_plural)
                if entry.msgid_plural
                else entry.msgid
            )
            return _context_count({"index_key": key}, index, lang)

        before = len(missing)
        missing = [e for e in missing if _reference_count(e) >= min_context]
        skipped = before - len(missing)
        print(
            f"Skipping {skipped} entries with fewer than {min_context} reference "
            f"translation(s) (use --min-context 0 to include them).",
            file=sys.stderr,
        )

    if limit is not None:
        missing = missing[:limit]
        print(f"Limiting to {limit} entries.", file=sys.stderr)

    if not missing:
        print("Nothing to do.", file=sys.stderr)
        return

    translated_count, failed_count = _process_batches(
        missing,
        index,
        lang,
        batch_size,
        model,
        dry_run,
        mark_fuzzy,
        cat=cat,
        po_path=po_path,
    )

    print(
        f"\nDone. Translated: {translated_count}, Failed/skipped: {failed_count}.",
        file=sys.stderr,
    )
    if not dry_run and translated_count > 0:
        # Mention the fuzzy marking only when it was actually applied.
        review_note = " (marked #, fuzzy for review)" if mark_fuzzy else ""
        print(
            f"Translations written to {po_path}{review_note}.",
            file=sys.stderr,
        )
|
||||
|
||||
|
||||
def main() -> None:
    """Command-line entry point: read the options and call ``backfill``."""
    cli = argparse.ArgumentParser(
        description="Backfill missing .po translations using Claude AI",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=__doc__,
    )
    add_option = cli.add_argument

    add_option("--lang", required=True, help="ISO language code (e.g. fr, de, ja)")
    add_option(
        "--batch-size",
        type=int,
        default=DEFAULT_BATCH_SIZE,
        help=f"Strings per Claude request (default: {DEFAULT_BATCH_SIZE})",
    )
    add_option(
        "--limit",
        type=int,
        default=None,
        help="Maximum number of entries to translate (default: unlimited)",
    )
    add_option(
        "--model",
        default=DEFAULT_MODEL,
        help=f"Claude model ID (default: {DEFAULT_MODEL})",
    )
    add_option(
        "--index",
        type=Path,
        default=DEFAULT_INDEX,
        help=f"Path to translation_index.json (default: {DEFAULT_INDEX})",
    )
    add_option(
        "--dry-run",
        action="store_true",
        help="Print translations without modifying the .po file",
    )

    min_context_help = (
        "Skip entries with fewer than N reference translations in other languages "
        "(default: 0 = translate everything). Strings with low context are more "
        "likely to be ambiguous single words or fragments — set to e.g. 2 to only "
        "translate strings that have been confirmed in at least 2 other languages."
    )
    add_option(
        "--min-context",
        type=int,
        default=0,
        metavar="N",
        help=min_context_help,
    )

    no_fuzzy_help = (
        "Do not mark generated translations as #, fuzzy. "
        "WARNING: fuzzy entries are excluded from compiled .mo files. "
        "Removing this flag causes AI-generated translations to be served "
        "to end users without human review — only use after you have "
        "manually verified the .po file."
    )
    # --no-fuzzy stores False into ``mark_fuzzy``, which otherwise stays True.
    add_option(
        "--no-fuzzy",
        dest="mark_fuzzy",
        action="store_false",
        default=True,
        help=no_fuzzy_help,
    )

    opts = cli.parse_args()
    backfill(
        lang=opts.lang,
        batch_size=opts.batch_size,
        limit=opts.limit,
        min_context=opts.min_context,
        model=opts.model,
        index_path=opts.index,
        dry_run=opts.dry_run,
        mark_fuzzy=opts.mark_fuzzy,
    )
|
||||
|
||||
|
||||
# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
|
||||
153
scripts/translations/build_translation_index.py
Normal file
153
scripts/translations/build_translation_index.py
Normal file
@@ -0,0 +1,153 @@
|
||||
# Licensed to the Apache Software Foundation (ASF) under one
|
||||
# or more contributor license agreements. See the NOTICE file
|
||||
# distributed with this work for additional information
|
||||
# regarding copyright ownership. The ASF licenses this file
|
||||
# to you under the Apache License, Version 2.0 (the
|
||||
# "License"); you may not use this file except in compliance
|
||||
# with the License. You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing,
|
||||
# software distributed under the License is distributed on an
|
||||
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
# KIND, either express or implied. See the License for the
|
||||
# specific language governing permissions and limitations
|
||||
# under the License.
|
||||
"""Build a cross-language translation index from all .po files.
|
||||
|
||||
Outputs a JSON file structured as:
|
||||
{
|
||||
"<msgid>": {
|
||||
"<lang>": "<translated string or null>",
|
||||
...
|
||||
},
|
||||
...
|
||||
}
|
||||
|
||||
For plural entries the key is "<msgid>\x00<msgid_plural>" and the value
|
||||
is a dict mapping lang -> {"0": "...", "1": "..."} (or null if untranslated or fuzzy).
|
||||
|
||||
Usage:
|
||||
python scripts/translations/build_translation_index.py
|
||||
python scripts/translations/build_translation_index.py \
|
||||
--translations-dir superset/translations \
|
||||
--output /tmp/translation_index.json
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import os
|
||||
import sys
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
try:
|
||||
import polib # type: ignore[import-untyped]
|
||||
except ImportError:
|
||||
print("polib is required. Install with: pip install polib", file=sys.stderr)
|
||||
sys.exit(1)
|
||||
|
||||
# Three ``.parent`` hops from this file lead to the directory that contains
# ``superset/translations``; the index is written there by default.
TRANSLATIONS_DIR = Path(__file__).parent.parent.parent.joinpath(
    "superset", "translations"
)
DEFAULT_OUTPUT = TRANSLATIONS_DIR / "translation_index.json"
|
||||
|
||||
|
||||
def _is_translated(entry: polib.POEntry) -> bool:
|
||||
"""Return True if the entry has a non-empty, non-fuzzy translation."""
|
||||
if "fuzzy" in entry.flags:
|
||||
return False
|
||||
if entry.msgid_plural:
|
||||
return any(v for v in entry.msgstr_plural.values())
|
||||
return bool(entry.msgstr)
|
||||
|
||||
|
||||
def _plural_key(entry: polib.POEntry) -> str:
|
||||
"""Build the combined key used for plural translation entries."""
|
||||
return f"{entry.msgid}\x00{entry.msgid_plural}"
|
||||
|
||||
|
||||
def build_index(translations_dir: Path) -> dict[str, Any]:
    """Merge every language's messages.po into one cross-language index.

    The result maps each message key (the msgid, or the combined plural key)
    to a dict holding one slot per discovered language. A slot contains the
    translation (a dict of plural forms for plural entries) or None when
    that language has no usable translation for the message.
    """

    def _po_file(lang_dir: str) -> Path:
        return translations_dir / lang_dir / "LC_MESSAGES" / "messages.po"

    # en has empty msgstr by convention (source = target), so it is left out.
    langs = [
        name
        for name in os.listdir(translations_dir)
        if name != "en" and _po_file(name).exists()
    ]
    langs.sort()

    index: dict[str, dict[str, Any]] = {}
    for lang in langs:
        for entry in polib.pofile(str(_po_file(lang))):
            if not entry.msgid:
                continue  # skip header entry

            # Fuzzy entries are unreviewed (often machine-generated drafts),
            # so excluding them prevents feeding unverified translations
            # back into the AI backfill prompt as trusted context.
            translated = _is_translated(entry)
            if entry.msgid_plural:
                key = _plural_key(entry)
                value = dict(entry.msgstr_plural) if translated else None
            else:
                key = entry.msgid
                value = entry.msgstr if translated else None
            index.setdefault(key, {})[lang] = value

    # Ensure every entry has a slot for every language (null if missing)
    for per_lang in index.values():
        for lang in langs:
            per_lang.setdefault(lang, None)

    return index
|
||||
|
||||
|
||||
def main() -> None:
    """Command-line entry point: build the index and save it as JSON."""
    cli = argparse.ArgumentParser(
        description="Build cross-language translation index"
    )
    cli.add_argument(
        "--translations-dir",
        type=Path,
        default=TRANSLATIONS_DIR,
        help="Path to the translations directory (default: superset/translations)",
    )
    output_help = (
        "Output JSON file path"
        " (default: superset/translations/translation_index.json)"
    )
    cli.add_argument(
        "--output",
        "-o",
        type=Path,
        default=DEFAULT_OUTPUT,
        help=output_help,
    )
    opts = cli.parse_args()
    source_dir = opts.translations_dir
    destination = opts.output

    print(f"Reading .po files from {source_dir} …", file=sys.stderr)
    index = build_index(source_dir)
    print(f"Indexed {len(index)} message IDs.", file=sys.stderr)

    # Create the destination directory first so a fresh --output path works.
    destination.parent.mkdir(parents=True, exist_ok=True)
    with open(destination, "w", encoding="utf-8") as out:
        json.dump(index, out, ensure_ascii=False, indent=2)

    print(f"Written to {destination}", file=sys.stderr)
|
||||
|
||||
|
||||
# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user