From 9c90a6854c52265073a41bc70f67ecce06e9150e Mon Sep 17 00:00:00 2001
From: Evan Rusackas <evan@preset.io>
Date: Sat, 23 May 2026 16:30:14 -0700
Subject: [PATCH] ci(translations): hard-block translation regressions in CI
 (#39443)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Co-authored-by: Claude Code <noreply@anthropic.com>
Co-authored-by: Đỗ Trọng Hải <41283691+hainenber@users.noreply.github.com>
---
 .../superset-translations-comment.yml         |  87 ++++++
 .github/workflows/superset-translations.yml   |  90 ++++++-
 .../check_translation_regression.py           | 250 ++++++++++++++++++
 3 files changed, 422 insertions(+), 5 deletions(-)
 create mode 100644 .github/workflows/superset-translations-comment.yml
 create mode 100755 scripts/translations/check_translation_regression.py

diff --git a/.github/workflows/superset-translations-comment.yml b/.github/workflows/superset-translations-comment.yml
new file mode 100644
index 00000000000..04a43acf39d
--- /dev/null
+++ b/.github/workflows/superset-translations-comment.yml
@@ -0,0 +1,87 @@
+name: Translation Regression Comment
+
+on:
+  workflow_run:
+    workflows: ["Translations"]
+    types: [completed]
+
+# This workflow posts a PR comment when the Translations workflow detects a
+# regression. It uses the workflow_run trigger so that it always runs in the
+# base-branch context and can safely be granted write permissions, even for
+# PRs from forks.
+#
+# IMPORTANT: This workflow must NEVER check out code from the PR branch.
+# All data comes from the artifact uploaded by the Translations workflow.
+
+permissions:
+  pull-requests: write
+  actions: read
+
+jobs:
+  post-comment:
+    runs-on: ubuntu-24.04
+    # Only act when the Translations workflow failed (which means a regression
+    # was detected — the workflow exits 1 on regression).
+    if: github.event.workflow_run.conclusion == 'failure'
+
+    steps:
+      - name: Download regression artifact
+        id: download
+        continue-on-error: true
+        uses: actions/download-artifact@3e5f45b2cfb9172054b4087a40e8e0b5a5461e7c # v8
+        with:
+          name: translation-regression
+          run-id: ${{ github.event.workflow_run.id }}
+          github-token: ${{ secrets.GITHUB_TOKEN }}
+          path: /tmp/translation-regression
+
+      - name: Post or update PR comment
+        if: steps.download.outcome == 'success'
+        uses: actions/github-script@3a2844b7e9c422d3c10d287c895573f7108da1b3 # v9.0.0
+        with:
+          script: |
+            const fs = require('fs');
+
+            const prNumberFile = '/tmp/translation-regression/pr-number.txt';
+            const reportFile = '/tmp/translation-regression/regression-report.md';
+
+            if (!fs.existsSync(prNumberFile) || !fs.existsSync(reportFile)) {
+              console.log('Artifact files not found, skipping comment.');
+              return;
+            }
+
+            const prNumber = parseInt(fs.readFileSync(prNumberFile, 'utf8').trim(), 10);
+            if (!prNumber) {
+              console.log('Could not parse PR number, skipping comment.');
+              return;
+            }
+
+            const report = fs.readFileSync(reportFile, 'utf8');
+            const marker = '<!-- translation-regression-bot -->';
+            const body = `${marker}\n${report}`;
+
+            const { data: comments } = await github.rest.issues.listComments({
+              owner: context.repo.owner,
+              repo: context.repo.repo,
+              issue_number: prNumber,
+            });
+
+            const existing = comments.find(c => c.body && c.body.includes(marker));
+
+            if (existing) {
+              await github.rest.issues.updateComment({
+                owner: context.repo.owner,
+                repo: context.repo.repo,
+                comment_id: existing.id,
+                body,
+              });
+              console.log(`Updated existing comment ${existing.id} on PR #${prNumber}`);
+            } else {
+              await github.rest.issues.createComment({
+                owner: context.repo.owner,
+                repo: context.repo.repo,
+                issue_number: prNumber,
+                body,
+              });
+              console.log(`Created new comment on PR #${prNumber}`);
+            }
diff --git a/.github/workflows/superset-translations.yml b/.github/workflows/superset-translations.yml
index 8de7b404519..cb800a8b45c 100644
--- a/.github/workflows/superset-translations.yml
+++ b/.github/workflows/superset-translations.yml
@@ -20,6 +20,9 @@ concurrency:
 jobs:
   frontend-check-translations:
     runs-on: ubuntu-24.04
+    permissions:
+      contents: read
+      pull-requests: read
     steps:
       - name: "Checkout ${{ github.ref }} ( ${{ github.sha }} )"
         uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6
@@ -51,12 +54,16 @@ jobs:
 
   babel-extract:
     runs-on: ubuntu-24.04
+    permissions:
+      contents: read
+      pull-requests: read
     steps:
       - name: "Checkout ${{ github.ref }} ( ${{ github.sha }} )"
         uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6
         with:
           persist-credentials: false
           submodules: recursive
+
       - name: Check for file changes
         id: check
         uses: ./.github/actions/change-detector/
@@ -64,12 +71,85 @@ jobs:
           token: ${{ secrets.GITHUB_TOKEN }}
 
       - name: Setup Python
-        if: steps.check.outputs.python
+        if: steps.check.outputs.python == 'true' || steps.check.outputs.frontend == 'true'
         uses: ./.github/actions/setup-backend/
 
-      - name: Install msgcat
-        run: sudo apt update && sudo apt install gettext
+      - name: Install gettext tools
+        if: steps.check.outputs.python == 'true' || steps.check.outputs.frontend == 'true'
+        run: sudo apt-get update && sudo apt-get install -y gettext
 
-      - name: Test babel extraction
-        if: steps.check.outputs.python
+      # Fetch the base ref so we can compare PR-introduced regressions
+      # against a fair baseline (also runs babel_update against the base
+      # source) — this isolates the PR's contribution from any pre-existing
+      # drift on the base branch.
+      - name: Fetch base ref and create comparison worktree
+        if: steps.check.outputs.python == 'true' || steps.check.outputs.frontend == 'true'
+        run: |
+          # For PRs use the base branch; for direct pushes compare against the previous commit.
+          BASE_REF="${{ github.event.pull_request.base.ref }}"
+          if [ -n "$BASE_REF" ]; then
+            git fetch --depth=1 origin "$BASE_REF"
+          else
+            git fetch --depth=2 origin "${{ github.ref }}"
+          fi
+          git worktree add /tmp/base-worktree FETCH_HEAD
+
+      # Run babel_update against BASE source + BASE translations. Any drift
+      # already present on the base branch (source strings that have changed
+      # without .po updates) shows up here as fuzzies — and will also show
+      # up in the PR run, so it cancels out in the comparison.
+      - name: Baseline — run babel_update against BASE source
+        if: steps.check.outputs.python == 'true' || steps.check.outputs.frontend == 'true'
+        working-directory: /tmp/base-worktree
         run: ./scripts/translations/babel_update.sh
+
+      - name: Record baseline translation counts
+        if: steps.check.outputs.python == 'true' || steps.check.outputs.frontend == 'true'
+        run: |
+          python scripts/translations/check_translation_regression.py \
+            --count \
+            --translations-dir /tmp/base-worktree/superset/translations \
+            > /tmp/before.json
+
+      # Reset the PR worktree's translations to the pristine BASE state so
+      # both babel_update runs start from the same .po files. The only
+      # difference between the runs is the source code.
+      - name: Reset PR worktree translations to pristine BASE
+        if: steps.check.outputs.python == 'true' || steps.check.outputs.frontend == 'true'
+        run: git checkout FETCH_HEAD -- superset/translations/
+
+      - name: Run babel_update against PR source
+        if: steps.check.outputs.python == 'true' || steps.check.outputs.frontend == 'true'
+        run: ./scripts/translations/babel_update.sh
+
+      - name: Check for translation regression
+        id: regression
+        if: steps.check.outputs.python == 'true' || steps.check.outputs.frontend == 'true'
+        continue-on-error: true
+        run: |
+          python scripts/translations/check_translation_regression.py \
+            --compare /tmp/before.json \
+            --report /tmp/regression-report.md
+
+      # Save the PR number so the comment workflow can post the report without
+      # needing write permissions on this pull_request-triggered job.
+      - name: Save PR number for comment workflow
+        if: >-
+          github.event_name == 'pull_request' &&
+          steps.regression.outcome == 'failure'
+        run: echo "${{ github.event.pull_request.number }}" > /tmp/pr-number.txt
+
+      - name: Upload regression artifact
+        if: >-
+          github.event_name == 'pull_request' &&
+          steps.regression.outcome == 'failure'
+        uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # v7
+        with:
+          name: translation-regression
+          path: |
+            /tmp/regression-report.md
+            /tmp/pr-number.txt
+
+      - name: Fail if regression detected
+        if: steps.regression.outcome == 'failure'
+        run: exit 1
diff --git a/scripts/translations/check_translation_regression.py b/scripts/translations/check_translation_regression.py
new file mode 100755
index 00000000000..23f60bf6535
--- /dev/null
+++ b/scripts/translations/check_translation_regression.py
@@ -0,0 +1,250 @@
+#!/usr/bin/env python3
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""
+Check that source-code changes don't cause translation regressions.
+
+Usage
+-----
+Count non-fuzzy translated entries in all .po files and write JSON to stdout:
+
+    python check_translation_regression.py --count
+
+Compare the current .po state against a previously-recorded baseline and fail
+if any language lost translations:
+
+    python check_translation_regression.py --compare /path/to/before.json
+
+Optionally write a markdown report to a file (used by CI to post a PR comment):
+
+    python check_translation_regression.py --compare before.json --report report.md
+
+Use a translations directory other than the repo default (used by CI to count
+against a separate base-branch worktree):
+
+    python check_translation_regression.py --count \\
+        --translations-dir /tmp/base-worktree/superset/translations
+
+Typical CI workflow
+-------------------
+1. Create a base-branch worktree alongside the PR worktree
+2. Run babel_update.sh in the base worktree (extract from BASE source)
+3. Record baseline:  python ... --count --translations-dir BASE_TREE > before.json
+4. Run babel_update.sh in the PR worktree (extract from PR source) starting
+   from the same pristine BASE translations
+5. Compare:  python ... --compare before.json [--report report.md]
+
+Comparing two babel_update outputs that started from the same BASE .po files
+isolates regressions caused by the PR's source diff from any pre-existing
+drift on the base branch.
+"""
+
+import argparse
+import json
+import re
+import subprocess
+import sys
+from pathlib import Path
+from typing import Optional
+
+DEFAULT_TRANSLATIONS_DIR = (
+    Path(__file__).resolve().parent.parent.parent / "superset" / "translations"
+)
+
+# English .po files use empty msgstr by convention (source language == target),
+# so they always show 0 translated entries and should not be checked.
+SKIP_LANGS = {"en"}
+
+
+def count_translated(po_file: Path) -> int:
+    """Return the number of non-fuzzy translated messages in a .po file.
+
+    Raises:
+        subprocess.CalledProcessError: if ``msgfmt`` fails (e.g. malformed
+            .po file). The regression check exists to surface translation
+            problems, so a silent zero would defeat its purpose — let the
+            caller see a malformed file as a hard failure.
+    """
+    import shutil  # noqa: PLC0415
+
+    msgfmt = shutil.which("msgfmt") or "msgfmt"
+    result = subprocess.run(  # noqa: S603
+        [msgfmt, "--statistics", "-o", "/dev/null", str(po_file)],
+        capture_output=True,
+        text=True,
+        check=True,
+    )
+    # stderr: "123 translated messages, 4 fuzzy translations, 56 untranslated messages."
+    match = re.search(r"(\d+) translated message", result.stderr)
+    if not match:
+        raise RuntimeError(
+            f"Could not parse msgfmt --statistics output for {po_file}: "
+            f"{result.stderr!r}"
+        )
+    return int(match.group(1))
+
+
+def get_counts(translations_dir: Path) -> dict[str, int]:
+    counts: dict[str, int] = {}
+    for po_file in sorted(translations_dir.glob("*/LC_MESSAGES/messages.po")):
+        lang = po_file.parent.parent.name
+        if lang in SKIP_LANGS:
+            continue
+        try:
+            counts[lang] = count_translated(po_file)
+        except (subprocess.CalledProcessError, RuntimeError) as exc:
+            # A malformed .po file (msgfmt non-zero exit, or stderr we
+            # can't parse) is a real problem worth seeing, but it shouldn't
+            # take the whole regression check down with it — that would
+            # hide every other language's status. Skip and warn instead;
+            # the missing lang will not appear in the comparison output.
+            print(
+                f"WARNING: skipping {lang} — {po_file} could not be counted: {exc}",
+                file=sys.stderr,
+            )
+    return counts
+
+
+def build_regression_report(regressions: list[tuple[str, int, int]]) -> str:
+    """Build a markdown report for posting as a PR comment."""
+    rows = "\n".join(
+        f"| `{lang}` | {b} | {a} | -{b - a} |" for lang, b, a in regressions
+    )
+    affected = ", ".join(f"`{lang}`" for lang, _, _ in regressions)
+    return (
+        "## ⚠️ Translation Regression Detected\n\n"
+        f"This PR causes existing translations to become fuzzy or be removed "
+        f"in {affected}. Please fix the affected `.po` files before merging.\n\n"
+        "| Language | Before | After | Lost |\n"
+        "|----------|-------:|------:|-----:|\n"
+        f"{rows}\n\n"
+        "### How to fix\n\n"
+        "**1. Install dependencies** (if not already set up):\n\n"
+        "```bash\n"
+        "pip install -r superset/translations/requirements.txt\n"
+        "sudo apt-get install gettext   # or: brew install gettext\n"
+        "```\n\n"
+        "**2. Re-extract strings and sync `.po` files:**\n\n"
+        "```bash\n"
+        "./scripts/translations/babel_update.sh\n"
+        "```\n\n"
+        "This rewrites `superset/translations/messages.pot` from the current "
+        "source files and merges the changes into every `.po` file. Strings "
+        "whose `msgid` changed will be marked `#, fuzzy`.\n\n"
+        f"**3. Resolve the fuzzy entries** in the affected language files "
+        f"({affected}):\n\n"
+        "```bash\n"
+        "grep -n '#, fuzzy' superset/translations/<lang>/LC_MESSAGES/messages.po\n"
+        "```\n\n"
+        "For each fuzzy entry, either rewrite the `msgstr` to match the new "
+        "string and remove the `#, fuzzy` line, or clear the `msgstr` to "
+        '`""` if you cannot provide a translation.\n\n'
+        "**4. Commit your changes to the `.po` files.**\n"
+    )
+
+
+def cmd_count(translations_dir: Path) -> None:
+    counts = get_counts(translations_dir)
+    print(json.dumps(counts, indent=2))
+
+
+def cmd_compare(
+    before_path: str,
+    translations_dir: Path,
+    report_path: Optional[str] = None,
+) -> None:
+    with open(before_path) as f:
+        before: dict[str, int] = json.load(f)
+
+    after = get_counts(translations_dir)
+
+    regressions: list[tuple[str, int, int]] = []
+    for lang, before_count in sorted(before.items()):
+        after_count = after.get(lang, 0)
+        if after_count < before_count:
+            regressions.append((lang, before_count, after_count))
+
+    if regressions:
+        print("Translation regression detected!\n")
+        for lang, b, a in regressions:
+            lost = b - a
+            print(f"  {lang}: {b} -> {a}  (-{lost} string(s) became fuzzy or removed)")
+        print(
+            "\nStrings renamed or deleted by this PR invalidated existing translations."
+        )
+        print(
+            "Update the affected .po files to restore the lost entries before merging."
+        )
+        if report_path:
+            Path(report_path).write_text(
+                build_regression_report(regressions), encoding="utf-8"
+            )
+        sys.exit(1)
+
+    # All good — print a summary so it's easy to read in CI logs.
+    print("No translation regressions.\n")
+    for lang in sorted(after):
+        b = before.get(lang, 0)
+        a = after[lang]
+        if a > b:
+            delta = f"+{a - b}"
+        elif a == b:
+            delta = "no change"
+        else:
+            delta = f"-{b - a}"
+        print(f"  {lang}: {b} -> {a}  ({delta})")
+
+
+def main() -> None:
+    parser = argparse.ArgumentParser(
+        description="Check for translation regressions in .po files."
+    )
+    action = parser.add_mutually_exclusive_group(required=True)
+    action.add_argument(
+        "--count",
+        action="store_true",
+        help="Output translation counts per language as JSON.",
+    )
+    action.add_argument(
+        "--compare",
+        metavar="BEFORE_JSON",
+        help="Compare current counts against a baseline JSON file.",
+    )
+    parser.add_argument(
+        "--report",
+        metavar="REPORT_MD",
+        help="When --compare detects regressions, write a markdown report here.",
+    )
+    parser.add_argument(
+        "--translations-dir",
+        type=Path,
+        default=DEFAULT_TRANSLATIONS_DIR,
+        help=(
+            "Path to the translations directory containing per-language "
+            "LC_MESSAGES/messages.po files (default: <repo>/superset/translations)."
+        ),
+    )
+    args = parser.parse_args()
+
+    if args.count:
+        cmd_count(args.translations_dir)
+    else:
+        cmd_compare(args.compare, args.translations_dir, args.report)
+
+
+if __name__ == "__main__":
+    main()