Files
superset2/scripts/translations/check_translation_regression.py
2026-05-23 16:30:14 -07:00

251 lines
9.0 KiB
Python
Executable File

#!/usr/bin/env python3
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
"""
Check that source-code changes don't cause translation regressions.
Usage
-----
Count non-fuzzy translated entries in all .po files and write JSON to stdout:
python check_translation_regression.py --count
Compare the current .po state against a previously-recorded baseline and fail
if any language lost translations:
python check_translation_regression.py --compare /path/to/before.json
Optionally write a markdown report to a file (used by CI to post a PR comment):
python check_translation_regression.py --compare before.json --report report.md
Use a translations directory other than the repo default (used by CI to count
against a separate base-branch worktree):
python check_translation_regression.py --count \\
--translations-dir /tmp/base-worktree/superset/translations
Typical CI workflow
-------------------
1. Create a base-branch worktree alongside the PR worktree
2. Run babel_update.sh in the base worktree (extract from BASE source)
3. Record baseline: python ... --count --translations-dir BASE_TREE > before.json
4. Run babel_update.sh in the PR worktree (extract from PR source) starting
from the same pristine BASE translations
5. Compare: python ... --compare before.json [--report report.md]
Comparing two babel_update outputs that started from the same BASE .po files
isolates regressions caused by the PR's source diff from any pre-existing
drift on the base branch.
"""
import argparse
import json
import re
import subprocess
import sys
from pathlib import Path
from typing import Optional
DEFAULT_TRANSLATIONS_DIR = (
Path(__file__).resolve().parent.parent.parent / "superset" / "translations"
)
# English .po files use empty msgstr by convention (source language == target),
# so they always show 0 translated entries and should not be checked.
SKIP_LANGS = {"en"}
def count_translated(po_file: Path) -> int:
"""Return the number of non-fuzzy translated messages in a .po file.
Raises:
subprocess.CalledProcessError: if ``msgfmt`` fails (e.g. malformed
.po file). The regression check exists to surface translation
problems, so a silent zero would defeat its purpose — let the
caller see a malformed file as a hard failure.
"""
import shutil # noqa: PLC0415
msgfmt = shutil.which("msgfmt") or "msgfmt"
result = subprocess.run( # noqa: S603
[msgfmt, "--statistics", "-o", "/dev/null", str(po_file)],
capture_output=True,
text=True,
check=True,
)
# stderr: "123 translated messages, 4 fuzzy translations, 56 untranslated messages."
match = re.search(r"(\d+) translated message", result.stderr)
if not match:
raise RuntimeError(
f"Could not parse msgfmt --statistics output for {po_file}: "
f"{result.stderr!r}"
)
return int(match.group(1))
def get_counts(translations_dir: Path) -> dict[str, int]:
counts: dict[str, int] = {}
for po_file in sorted(translations_dir.glob("*/LC_MESSAGES/messages.po")):
lang = po_file.parent.parent.name
if lang in SKIP_LANGS:
continue
try:
counts[lang] = count_translated(po_file)
except (subprocess.CalledProcessError, RuntimeError) as exc:
# A malformed .po file (msgfmt non-zero exit, or stderr we
# can't parse) is a real problem worth seeing, but it shouldn't
# take the whole regression check down with it — that would
# hide every other language's status. Skip and warn instead;
# the missing lang will not appear in the comparison output.
print(
f"WARNING: skipping {lang}{po_file} could not be counted: {exc}",
file=sys.stderr,
)
return counts
def build_regression_report(regressions: list[tuple[str, int, int]]) -> str:
"""Build a markdown report for posting as a PR comment."""
rows = "\n".join(
f"| `{lang}` | {b} | {a} | -{b - a} |" for lang, b, a in regressions
)
affected = ", ".join(f"`{lang}`" for lang, _, _ in regressions)
return (
"## ⚠️ Translation Regression Detected\n\n"
f"This PR causes existing translations to become fuzzy or be removed "
f"in {affected}. Please fix the affected `.po` files before merging.\n\n"
"| Language | Before | After | Lost |\n"
"|----------|-------:|------:|-----:|\n"
f"{rows}\n\n"
"### How to fix\n\n"
"**1. Install dependencies** (if not already set up):\n\n"
"```bash\n"
"pip install -r superset/translations/requirements.txt\n"
"sudo apt-get install gettext # or: brew install gettext\n"
"```\n\n"
"**2. Re-extract strings and sync `.po` files:**\n\n"
"```bash\n"
"./scripts/translations/babel_update.sh\n"
"```\n\n"
"This rewrites `superset/translations/messages.pot` from the current "
"source files and merges the changes into every `.po` file. Strings "
"whose `msgid` changed will be marked `#, fuzzy`.\n\n"
f"**3. Resolve the fuzzy entries** in the affected language files "
f"({affected}):\n\n"
"```bash\n"
"grep -n '#, fuzzy' superset/translations/<lang>/LC_MESSAGES/messages.po\n"
"```\n\n"
"For each fuzzy entry, either rewrite the `msgstr` to match the new "
"string and remove the `#, fuzzy` line, or clear the `msgstr` to "
'`""` if you cannot provide a translation.\n\n'
"**4. Commit your changes to the `.po` files.**\n"
)
def cmd_count(translations_dir: Path) -> None:
counts = get_counts(translations_dir)
print(json.dumps(counts, indent=2))
def cmd_compare(
before_path: str,
translations_dir: Path,
report_path: Optional[str] = None,
) -> None:
with open(before_path) as f:
before: dict[str, int] = json.load(f)
after = get_counts(translations_dir)
regressions: list[tuple[str, int, int]] = []
for lang, before_count in sorted(before.items()):
after_count = after.get(lang, 0)
if after_count < before_count:
regressions.append((lang, before_count, after_count))
if regressions:
print("Translation regression detected!\n")
for lang, b, a in regressions:
lost = b - a
print(f" {lang}: {b} -> {a} (-{lost} string(s) became fuzzy or removed)")
print(
"\nStrings renamed or deleted by this PR invalidated existing translations."
)
print(
"Update the affected .po files to restore the lost entries before merging."
)
if report_path:
Path(report_path).write_text(
build_regression_report(regressions), encoding="utf-8"
)
sys.exit(1)
# All good — print a summary so it's easy to read in CI logs.
print("No translation regressions.\n")
for lang in sorted(after):
b = before.get(lang, 0)
a = after[lang]
if a > b:
delta = f"+{a - b}"
elif a == b:
delta = "no change"
else:
delta = f"-{b - a}"
print(f" {lang}: {b} -> {a} ({delta})")
def main() -> None:
parser = argparse.ArgumentParser(
description="Check for translation regressions in .po files."
)
action = parser.add_mutually_exclusive_group(required=True)
action.add_argument(
"--count",
action="store_true",
help="Output translation counts per language as JSON.",
)
action.add_argument(
"--compare",
metavar="BEFORE_JSON",
help="Compare current counts against a baseline JSON file.",
)
parser.add_argument(
"--report",
metavar="REPORT_MD",
help="When --compare detects regressions, write a markdown report here.",
)
parser.add_argument(
"--translations-dir",
type=Path,
default=DEFAULT_TRANSLATIONS_DIR,
help=(
"Path to the translations directory containing per-language "
"LC_MESSAGES/messages.po files (default: <repo>/superset/translations)."
),
)
args = parser.parse_args()
if args.count:
cmd_count(args.translations_dir)
else:
cmd_compare(args.compare, args.translations_dir, args.report)
if __name__ == "__main__":
main()