From 9c90a6854c52265073a41bc70f67ecce06e9150e Mon Sep 17 00:00:00 2001 From: Evan Rusackas Date: Sat, 23 May 2026 16:30:14 -0700 Subject: [PATCH] ci(translations): hard-block translation regressions in CI (#39443) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Claude Code Co-authored-by: Đỗ Trọng Hải <41283691+hainenber@users.noreply.github.com> --- .../superset-translations-comment.yml | 87 ++++++ .github/workflows/superset-translations.yml | 90 ++++++- .../check_translation_regression.py | 250 ++++++++++++++++++ 3 files changed, 422 insertions(+), 5 deletions(-) create mode 100644 .github/workflows/superset-translations-comment.yml create mode 100755 scripts/translations/check_translation_regression.py diff --git a/.github/workflows/superset-translations-comment.yml b/.github/workflows/superset-translations-comment.yml new file mode 100644 index 00000000000..04a43acf39d --- /dev/null +++ b/.github/workflows/superset-translations-comment.yml @@ -0,0 +1,87 @@ +name: Translation Regression Comment + +on: + workflow_run: + workflows: ["Translations"] + types: [completed] + +# This workflow posts a PR comment when the Translations workflow detects a +# regression. It uses the workflow_run trigger so that it always runs in the +# base-branch context and can safely be granted write permissions, even for +# PRs from forks. +# +# IMPORTANT: This workflow must NEVER check out code from the PR branch. +# All data comes from the artifact uploaded by the Translations workflow. + +permissions: + pull-requests: write + actions: read + +jobs: + post-comment: + runs-on: ubuntu-24.04 + # Only act when the Translations workflow failed (which means a regression + # was detected — the workflow exits 1 on regression). + if: github.event.workflow_run.conclusion == 'failure' + + steps: + - name: Download regression artifact + id: download + continue-on-error: true + uses: actions/download-artifact@3e5f45b2cfb9172054b4087a40e8e0b5a5461e7c # v8 + with: + name: translation-regression + run-id: ${{ github.event.workflow_run.id }} + github-token: ${{ secrets.GITHUB_TOKEN }} + path: /tmp/translation-regression + + - name: Post or update PR comment + if: steps.download.outcome == 'success' + uses: actions/github-script@3a2844b7e9c422d3c10d287c895573f7108da1b3 # v9.0.0 + with: + script: | + const fs = require('fs'); + + const prNumberFile = '/tmp/translation-regression/pr-number.txt'; + const reportFile = '/tmp/translation-regression/regression-report.md'; + + if (!fs.existsSync(prNumberFile) || !fs.existsSync(reportFile)) { + console.log('Artifact files not found, skipping comment.'); + return; + } + + const prNumber = parseInt(fs.readFileSync(prNumberFile, 'utf8').trim(), 10); + if (!prNumber) { + console.log('Could not parse PR number, skipping comment.'); + return; + } + + const report = fs.readFileSync(reportFile, 'utf8'); + const marker = ''; + const body = `${marker}\n${report}`; + + const { data: comments } = await github.rest.issues.listComments({ + owner: context.repo.owner, + repo: context.repo.repo, + issue_number: prNumber, + }); + + const existing = comments.find(c => c.body && c.body.includes(marker)); + + if (existing) { + await github.rest.issues.updateComment({ + owner: context.repo.owner, + repo: context.repo.repo, + comment_id: existing.id, + body, + }); + console.log(`Updated existing comment ${existing.id} on PR #${prNumber}`); + } else { + await github.rest.issues.createComment({ + owner: context.repo.owner, + repo: context.repo.repo, + issue_number: prNumber, + body, + }); + console.log(`Created new comment on PR #${prNumber}`); + } diff --git a/.github/workflows/superset-translations.yml b/.github/workflows/superset-translations.yml index 8de7b404519..cb800a8b45c 100644 --- a/.github/workflows/superset-translations.yml +++ b/.github/workflows/superset-translations.yml @@ -20,6 +20,9 @@ concurrency: jobs: frontend-check-translations: runs-on: ubuntu-24.04 + permissions: + contents: read + pull-requests: read steps: - name: "Checkout ${{ github.ref }} ( ${{ github.sha }} )" uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6 @@ -51,12 +54,16 @@ jobs: babel-extract: runs-on: ubuntu-24.04 + permissions: + contents: read + pull-requests: read steps: - name: "Checkout ${{ github.ref }} ( ${{ github.sha }} )" uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6 with: persist-credentials: false submodules: recursive + - name: Check for file changes id: check uses: ./.github/actions/change-detector/ @@ -64,12 +71,85 @@ jobs: token: ${{ secrets.GITHUB_TOKEN }} - name: Setup Python - if: steps.check.outputs.python + if: steps.check.outputs.python == 'true' || steps.check.outputs.frontend == 'true' uses: ./.github/actions/setup-backend/ - - name: Install msgcat - run: sudo apt update && sudo apt install gettext + - name: Install gettext tools + if: steps.check.outputs.python == 'true' || steps.check.outputs.frontend == 'true' + run: sudo apt-get update && sudo apt-get install -y gettext - - name: Test babel extraction - if: steps.check.outputs.python + # Fetch the base ref so we can compare PR-introduced regressions + # against a fair baseline (also runs babel_update against the base + # source) — this isolates the PR's contribution from any pre-existing + # drift on the base branch. + - name: Fetch base ref and create comparison worktree + if: steps.check.outputs.python == 'true' || steps.check.outputs.frontend == 'true' + run: | + # For PRs use the base branch; for direct pushes compare against the previous commit. + BASE_REF="${{ github.event.pull_request.base.ref }}" + if [ -n "$BASE_REF" ]; then + git fetch --depth=1 origin "$BASE_REF" + else + git fetch --depth=2 origin "${{ github.ref }}" + fi + git worktree add /tmp/base-worktree FETCH_HEAD + + # Run babel_update against BASE source + BASE translations. Any drift + # already present on the base branch (source strings that have changed + # without .po updates) shows up here as fuzzies — and will also show + # up in the PR run, so it cancels out in the comparison. + - name: Baseline — run babel_update against BASE source + if: steps.check.outputs.python == 'true' || steps.check.outputs.frontend == 'true' + working-directory: /tmp/base-worktree run: ./scripts/translations/babel_update.sh + + - name: Record baseline translation counts + if: steps.check.outputs.python == 'true' || steps.check.outputs.frontend == 'true' + run: | + python scripts/translations/check_translation_regression.py \ + --count \ + --translations-dir /tmp/base-worktree/superset/translations \ + > /tmp/before.json + + # Reset the PR worktree's translations to the pristine BASE state so + # both babel_update runs start from the same .po files. The only + # difference between the runs is the source code. + - name: Reset PR worktree translations to pristine BASE + if: steps.check.outputs.python == 'true' || steps.check.outputs.frontend == 'true' + run: git checkout FETCH_HEAD -- superset/translations/ + + - name: Run babel_update against PR source + if: steps.check.outputs.python == 'true' || steps.check.outputs.frontend == 'true' + run: ./scripts/translations/babel_update.sh + + - name: Check for translation regression + id: regression + if: steps.check.outputs.python == 'true' || steps.check.outputs.frontend == 'true' + continue-on-error: true + run: | + python scripts/translations/check_translation_regression.py \ + --compare /tmp/before.json \ + --report /tmp/regression-report.md + + # Save the PR number so the comment workflow can post the report without + # needing write permissions on this pull_request-triggered job. + - name: Save PR number for comment workflow + if: >- + github.event_name == 'pull_request' && + steps.regression.outcome == 'failure' + run: echo "${{ github.event.pull_request.number }}" > /tmp/pr-number.txt + + - name: Upload regression artifact + if: >- + github.event_name == 'pull_request' && + steps.regression.outcome == 'failure' + uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # v7 + with: + name: translation-regression + path: | + /tmp/regression-report.md + /tmp/pr-number.txt + + - name: Fail if regression detected + if: steps.regression.outcome == 'failure' + run: exit 1 diff --git a/scripts/translations/check_translation_regression.py b/scripts/translations/check_translation_regression.py new file mode 100755 index 00000000000..23f60bf6535 --- /dev/null +++ b/scripts/translations/check_translation_regression.py @@ -0,0 +1,250 @@ +#!/usr/bin/env python3 +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +""" +Check that source-code changes don't cause translation regressions. + +Usage +----- +Count non-fuzzy translated entries in all .po files and write JSON to stdout: + + python check_translation_regression.py --count + +Compare the current .po state against a previously-recorded baseline and fail +if any language lost translations: + + python check_translation_regression.py --compare /path/to/before.json + +Optionally write a markdown report to a file (used by CI to post a PR comment): + + python check_translation_regression.py --compare before.json --report report.md + +Use a translations directory other than the repo default (used by CI to count +against a separate base-branch worktree): + + python check_translation_regression.py --count \\ + --translations-dir /tmp/base-worktree/superset/translations + +Typical CI workflow +------------------- +1. Create a base-branch worktree alongside the PR worktree +2. Run babel_update.sh in the base worktree (extract from BASE source) +3. Record baseline: python ... --count --translations-dir BASE_TREE > before.json +4. Run babel_update.sh in the PR worktree (extract from PR source) starting + from the same pristine BASE translations +5. Compare: python ... --compare before.json [--report report.md] + +Comparing two babel_update outputs that started from the same BASE .po files +isolates regressions caused by the PR's source diff from any pre-existing +drift on the base branch. +""" + +import argparse +import json +import re +import subprocess +import sys +from pathlib import Path +from typing import Optional + +DEFAULT_TRANSLATIONS_DIR = ( + Path(__file__).resolve().parent.parent.parent / "superset" / "translations" +) + +# English .po files use empty msgstr by convention (source language == target), +# so they always show 0 translated entries and should not be checked. +SKIP_LANGS = {"en"} + + +def count_translated(po_file: Path) -> int: + """Return the number of non-fuzzy translated messages in a .po file. + + Raises: + subprocess.CalledProcessError: if ``msgfmt`` fails (e.g. malformed + .po file). The regression check exists to surface translation + problems, so a silent zero would defeat its purpose — let the + caller see a malformed file as a hard failure. + """ + import shutil # noqa: PLC0415 + + msgfmt = shutil.which("msgfmt") or "msgfmt" + result = subprocess.run( # noqa: S603 + [msgfmt, "--statistics", "-o", "/dev/null", str(po_file)], + capture_output=True, + text=True, + check=True, + ) + # stderr: "123 translated messages, 4 fuzzy translations, 56 untranslated messages." + match = re.search(r"(\d+) translated message", result.stderr) + if not match: + raise RuntimeError( + f"Could not parse msgfmt --statistics output for {po_file}: " + f"{result.stderr!r}" + ) + return int(match.group(1)) + + +def get_counts(translations_dir: Path) -> dict[str, int]: + counts: dict[str, int] = {} + for po_file in sorted(translations_dir.glob("*/LC_MESSAGES/messages.po")): + lang = po_file.parent.parent.name + if lang in SKIP_LANGS: + continue + try: + counts[lang] = count_translated(po_file) + except (subprocess.CalledProcessError, RuntimeError) as exc: + # A malformed .po file (msgfmt non-zero exit, or stderr we + # can't parse) is a real problem worth seeing, but it shouldn't + # take the whole regression check down with it — that would + # hide every other language's status. Skip and warn instead; + # the missing lang will not appear in the comparison output. + print( + f"WARNING: skipping {lang} — {po_file} could not be counted: {exc}", + file=sys.stderr, + ) + return counts + + +def build_regression_report(regressions: list[tuple[str, int, int]]) -> str: + """Build a markdown report for posting as a PR comment.""" + rows = "\n".join( + f"| `{lang}` | {b} | {a} | -{b - a} |" for lang, b, a in regressions + ) + affected = ", ".join(f"`{lang}`" for lang, _, _ in regressions) + return ( + "## ⚠️ Translation Regression Detected\n\n" + f"This PR causes existing translations to become fuzzy or be removed " + f"in {affected}. Please fix the affected `.po` files before merging.\n\n" + "| Language | Before | After | Lost |\n" + "|----------|-------:|------:|-----:|\n" + f"{rows}\n\n" + "### How to fix\n\n" + "**1. Install dependencies** (if not already set up):\n\n" + "```bash\n" + "pip install -r superset/translations/requirements.txt\n" + "sudo apt-get install gettext # or: brew install gettext\n" + "```\n\n" + "**2. Re-extract strings and sync `.po` files:**\n\n" + "```bash\n" + "./scripts/translations/babel_update.sh\n" + "```\n\n" + "This rewrites `superset/translations/messages.pot` from the current " + "source files and merges the changes into every `.po` file. Strings " + "whose `msgid` changed will be marked `#, fuzzy`.\n\n" + f"**3. Resolve the fuzzy entries** in the affected language files " + f"({affected}):\n\n" + "```bash\n" + "grep -n '#, fuzzy' superset/translations//LC_MESSAGES/messages.po\n" + "```\n\n" + "For each fuzzy entry, either rewrite the `msgstr` to match the new " + "string and remove the `#, fuzzy` line, or clear the `msgstr` to " + '`""` if you cannot provide a translation.\n\n' + "**4. Commit your changes to the `.po` files.**\n" + ) + + +def cmd_count(translations_dir: Path) -> None: + counts = get_counts(translations_dir) + print(json.dumps(counts, indent=2)) + + +def cmd_compare( + before_path: str, + translations_dir: Path, + report_path: Optional[str] = None, +) -> None: + with open(before_path) as f: + before: dict[str, int] = json.load(f) + + after = get_counts(translations_dir) + + regressions: list[tuple[str, int, int]] = [] + for lang, before_count in sorted(before.items()): + after_count = after.get(lang, 0) + if after_count < before_count: + regressions.append((lang, before_count, after_count)) + + if regressions: + print("Translation regression detected!\n") + for lang, b, a in regressions: + lost = b - a + print(f" {lang}: {b} -> {a} (-{lost} string(s) became fuzzy or removed)") + print( + "\nStrings renamed or deleted by this PR invalidated existing translations." + ) + print( + "Update the affected .po files to restore the lost entries before merging." + ) + if report_path: + Path(report_path).write_text( + build_regression_report(regressions), encoding="utf-8" + ) + sys.exit(1) + + # All good — print a summary so it's easy to read in CI logs. + print("No translation regressions.\n") + for lang in sorted(after): + b = before.get(lang, 0) + a = after[lang] + if a > b: + delta = f"+{a - b}" + elif a == b: + delta = "no change" + else: + delta = f"-{b - a}" + print(f" {lang}: {b} -> {a} ({delta})") + + +def main() -> None: + parser = argparse.ArgumentParser( + description="Check for translation regressions in .po files." + ) + action = parser.add_mutually_exclusive_group(required=True) + action.add_argument( + "--count", + action="store_true", + help="Output translation counts per language as JSON.", + ) + action.add_argument( + "--compare", + metavar="BEFORE_JSON", + help="Compare current counts against a baseline JSON file.", + ) + parser.add_argument( + "--report", + metavar="REPORT_MD", + help="When --compare detects regressions, write a markdown report here.", + ) + parser.add_argument( + "--translations-dir", + type=Path, + default=DEFAULT_TRANSLATIONS_DIR, + help=( + "Path to the translations directory containing per-language " + "LC_MESSAGES/messages.po files (default: /superset/translations)." + ), + ) + args = parser.parse_args() + + if args.count: + cmd_count(args.translations_dir) + else: + cmd_compare(args.compare, args.translations_dir, args.report) + + +if __name__ == "__main__": + main()