superset2/scripts/translations/check_translation_regression.py

#!/usr/bin/env python3
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.
"""
Check that source-code changes don't cause translation regressions.

Usage
-----
Count non-fuzzy translated entries in every non-English .po file and write JSON to stdout:

    python check_translation_regression.py --count

Compare the current .po state against a previously-recorded baseline and fail
(exit status 1) if any language lost translations:

    python check_translation_regression.py --compare /path/to/before.json

Optionally write a markdown report to a file (used by CI to post a PR comment):

    python check_translation_regression.py --compare before.json --report report.md

Use a translations directory other than the repo default (used by CI to count
against a separate base-branch worktree):

    python check_translation_regression.py --count \\
        --translations-dir /tmp/base-worktree/superset/translations

Typical CI workflow
-------------------
1. Create a base-branch worktree alongside the PR worktree
2. Run babel_update.sh in the base worktree (extract from BASE source)
3. Record baseline:  python ... --count --translations-dir BASE_TREE > before.json
4. Run babel_update.sh in the PR worktree (extract from PR source) starting
   from the same pristine BASE translations
5. Compare:  python ... --compare before.json [--report report.md]

Comparing two babel_update outputs that started from the same BASE .po files
isolates regressions caused by the PR's source diff from any pre-existing
drift on the base branch.
"""

import argparse
import json
import re
import subprocess
import sys
from pathlib import Path
from typing import Optional

# Translations directory used when --translations-dir is not given: three
# directory levels above this script's resolved location, then
# superset/translations.
DEFAULT_TRANSLATIONS_DIR = (
    Path(__file__).resolve().parent.parent.parent.joinpath("superset", "translations")
)

# Language codes left out of the counts. English catalogs keep an empty msgstr
# by convention (the source language is also the target), so they would always
# report 0 translated entries and carry no regression signal.
SKIP_LANGS = {"en"}


def count_translated(po_file: Path) -> int:
    """Return the number of non-fuzzy translated messages in a .po file.

    Runs ``msgfmt --statistics`` on *po_file*, discards the compiled catalog,
    and parses the count out of the summary line msgfmt prints on stderr.

    Args:
        po_file: Path to the ``.po`` catalog to inspect.

    Returns:
        The ``N`` from msgfmt's "N translated messages" summary.

    Raises:
        subprocess.CalledProcessError: if ``msgfmt`` exits non-zero (e.g.
            malformed .po file). The regression check exists to surface
            translation problems, so a silent zero would defeat its purpose;
            the failure is propagated to the caller instead.
        RuntimeError: if msgfmt succeeded but its stderr does not contain a
            recognizable "N translated message(s)" summary.
    """
    import os  # noqa: PLC0415
    import shutil  # noqa: PLC0415

    msgfmt = shutil.which("msgfmt") or "msgfmt"
    # msgfmt translates its own statistics line according to the caller's
    # locale. The regex below only understands the English wording, so pin
    # the child process to the C locale regardless of the user's settings.
    child_env = {**os.environ, "LC_ALL": "C"}
    result = subprocess.run(  # noqa: S603
        # os.devnull instead of a literal "/dev/null" keeps the throwaway
        # output target valid on platforms without that path.
        [msgfmt, "--statistics", "-o", os.devnull, str(po_file)],
        capture_output=True,
        text=True,
        check=True,
        env=child_env,
    )
    # stderr: "123 translated messages, 4 fuzzy translations, 56 untranslated messages."
    match = re.search(r"(\d+) translated message", result.stderr)
    if not match:
        raise RuntimeError(
            f"Could not parse msgfmt --statistics output for {po_file}: "
            f"{result.stderr!r}"
        )
    return int(match.group(1))


def get_counts(translations_dir: Path) -> dict[str, int]:
    """Map each language under *translations_dir* to its translated count.

    Every ``<lang>/LC_MESSAGES/messages.po`` below *translations_dir* is
    visited in sorted path order. Languages listed in ``SKIP_LANGS`` are
    ignored. A catalog that cannot be counted is reported on stderr and left
    out of the returned mapping.

    Args:
        translations_dir: Directory holding the per-language sub-directories.

    Returns:
        ``{language_code: translated_message_count}`` for every catalog that
        could be counted.
    """
    per_lang: dict[str, int] = {}
    catalogs = sorted(translations_dir.glob("*/LC_MESSAGES/messages.po"))
    for catalog in catalogs:
        # <lang>/LC_MESSAGES/messages.po -> the language is two levels up.
        code = catalog.parents[1].name
        if code not in SKIP_LANGS:
            try:
                translated = count_translated(catalog)
            except (subprocess.CalledProcessError, RuntimeError) as err:
                # One broken catalog (msgfmt failure or unparseable summary)
                # should be visible, but must not stop the remaining
                # languages from being counted: warn and move on.
                print(
                    f"WARNING: skipping {code} — {catalog} could not be counted: {err}",
                    file=sys.stderr,
                )
            else:
                per_lang[code] = translated
    return per_lang


def build_regression_report(regressions: list[tuple[str, int, int]]) -> str:
    """Render the regressions as a markdown document for a PR comment.

    Args:
        regressions: ``(language, before_count, after_count)`` triples, one
            per language that lost translations.

    Returns:
        Markdown text with a summary sentence, a before/after/lost table and
        step-by-step instructions for repairing the ``.po`` files.
    """
    table_rows = []
    quoted_langs = []
    for code, before_count, after_count in regressions:
        lost = before_count - after_count
        table_rows.append(f"| `{code}` | {before_count} | {after_count} | -{lost} |")
        quoted_langs.append(f"`{code}`")
    affected = ", ".join(quoted_langs)

    table = "\n".join(
        [
            "| Language | Before | After | Lost |",
            "|----------|-------:|------:|-----:|",
            "\n".join(table_rows),
        ]
    )

    # Each entry is one markdown paragraph/block; they are separated by a
    # blank line in the final document.
    sections = [
        "## ⚠️ Translation Regression Detected",
        "This PR causes existing translations to become fuzzy or be removed "
        + f"in {affected}. Please fix the affected `.po` files before merging.",
        table,
        "### How to fix",
        "**1. Install dependencies** (if not already set up):",
        "```bash\n"
        "pip install -r superset/translations/requirements.txt\n"
        "sudo apt-get install gettext   # or: brew install gettext\n"
        "```",
        "**2. Re-extract strings and sync `.po` files:**",
        "```bash\n./scripts/translations/babel_update.sh\n```",
        "This rewrites `superset/translations/messages.pot` from the current "
        "source files and merges the changes into every `.po` file. Strings "
        "whose `msgid` changed will be marked `#, fuzzy`.",
        "**3. Resolve the fuzzy entries** in the affected language files "
        + f"({affected}):",
        "```bash\n"
        "grep -n '#, fuzzy' superset/translations/<lang>/LC_MESSAGES/messages.po\n"
        "```",
        "For each fuzzy entry, either rewrite the `msgstr` to match the new "
        "string and remove the `#, fuzzy` line, or clear the `msgstr` to "
        '`""` if you cannot provide a translation.',
        "**4. Commit your changes to the `.po` files.**",
    ]
    return "\n\n".join(sections) + "\n"


def cmd_count(translations_dir: Path) -> None:
    """Handle ``--count``: print per-language counts as JSON on stdout.

    Args:
        translations_dir: Directory whose catalogs are counted.
    """
    payload = json.dumps(get_counts(translations_dir), indent=2)
    print(payload)


def cmd_compare(
    before_path: str,
    translations_dir: Path,
    report_path: Optional[str] = None,
) -> None:
    """Handle ``--compare``: fail if any language lost translations.

    Loads the baseline counts from *before_path*, recounts the catalogs under
    *translations_dir*, and compares the two per language.

    Args:
        before_path: JSON file mapping language code to translated count, as
            produced by ``--count``.
        translations_dir: Directory whose catalogs provide the current counts.
        report_path: Optional file that receives a markdown report; it is
            written only when at least one regression is found.

    Raises:
        SystemExit: with status 1 when any language in the baseline now has
            fewer translated messages. A baseline language with no current
            count is compared as 0.
    """
    # Read the baseline as UTF-8 rather than the platform's locale-dependent
    # default, so parsing does not vary by machine (the report below is
    # written as UTF-8 as well).
    with open(before_path, encoding="utf-8") as f:
        before: dict[str, int] = json.load(f)

    after = get_counts(translations_dir)

    regressions: list[tuple[str, int, int]] = [
        (lang, before_count, after.get(lang, 0))
        for lang, before_count in sorted(before.items())
        if after.get(lang, 0) < before_count
    ]

    if regressions:
        print("Translation regression detected!\n")
        for lang, b, a in regressions:
            lost = b - a
            print(f"  {lang}: {b} -> {a}  (-{lost} string(s) became fuzzy or removed)")
        print(
            "\nStrings renamed or deleted by this PR invalidated existing translations."
        )
        print(
            "Update the affected .po files to restore the lost entries before merging."
        )
        if report_path:
            Path(report_path).write_text(
                build_regression_report(regressions), encoding="utf-8"
            )
        sys.exit(1)

    # All good — print a summary so it's easy to read in CI logs.
    print("No translation regressions.\n")
    for lang in sorted(after):
        b = before.get(lang, 0)
        a = after[lang]
        # A drop (a < b) cannot occur here: any baseline language whose count
        # fell would have triggered the exit above, and languages missing
        # from the baseline are compared against 0.
        delta = f"+{a - b}" if a > b else "no change"
        print(f"  {lang}: {b} -> {a}  ({delta})")


def main() -> None:
    """Parse the command line and run the requested mode.

    Exactly one of ``--count`` and ``--compare`` is required. ``--report``
    and ``--translations-dir`` are optional; the report path is only passed
    on to the compare mode.
    """
    cli = argparse.ArgumentParser(
        description="Check for translation regressions in .po files."
    )

    # The two modes exclude each other, and one of them must be chosen.
    mode = cli.add_mutually_exclusive_group(required=True)
    mode.add_argument(
        "--count",
        action="store_true",
        help="Output translation counts per language as JSON.",
    )
    mode.add_argument(
        "--compare",
        metavar="BEFORE_JSON",
        help="Compare current counts against a baseline JSON file.",
    )

    cli.add_argument(
        "--report",
        metavar="REPORT_MD",
        help="When --compare detects regressions, write a markdown report here.",
    )
    translations_help = (
        "Path to the translations directory containing per-language "
        "LC_MESSAGES/messages.po files (default: <repo>/superset/translations)."
    )
    cli.add_argument(
        "--translations-dir",
        type=Path,
        default=DEFAULT_TRANSLATIONS_DIR,
        help=translations_help,
    )

    opts = cli.parse_args()
    if not opts.count:
        # --count was not given, so the required group guarantees --compare.
        cmd_compare(opts.compare, opts.translations_dir, opts.report)
        return
    cmd_count(opts.translations_dir)


# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()