mirror of
https://github.com/apache/superset.git
synced 2026-05-30 12:49:17 +00:00
feat(i18n): AI-assisted translation backfill tooling + Spanish translations (#39448)
Co-authored-by: Claude Code <noreply@anthropic.com> Co-authored-by: codeant-ai-for-open-source[bot] <244253245+codeant-ai-for-open-source[bot]@users.noreply.github.com> Co-authored-by: Superset Dev <dev@superset.apache.org> Co-authored-by: Đỗ Trọng Hải <41283691+hainenber@users.noreply.github.com> Co-authored-by: Claude <claude@anthropic.com>
This commit is contained in:
16
tests/unit_tests/scripts/translations/__init__.py
Normal file
16
tests/unit_tests/scripts/translations/__init__.py
Normal file
@@ -0,0 +1,16 @@
|
||||
# Licensed to the Apache Software Foundation (ASF) under one
|
||||
# or more contributor license agreements. See the NOTICE file
|
||||
# distributed with this work for additional information
|
||||
# regarding copyright ownership. The ASF licenses this file
|
||||
# to you under the Apache License, Version 2.0 (the
|
||||
# "License"); you may not use this file except in compliance
|
||||
# with the License. You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing,
|
||||
# software distributed under the License is distributed on an
|
||||
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
# KIND, either express or implied. See the License for the
|
||||
# specific language governing permissions and limitations
|
||||
# under the License.
|
||||
312
tests/unit_tests/scripts/translations/backfill_po_test.py
Normal file
312
tests/unit_tests/scripts/translations/backfill_po_test.py
Normal file
@@ -0,0 +1,312 @@
|
||||
# Licensed to the Apache Software Foundation (ASF) under one
|
||||
# or more contributor license agreements. See the NOTICE file
|
||||
# distributed with this work for additional information
|
||||
# regarding copyright ownership. The ASF licenses this file
|
||||
# to you under the Apache License, Version 2.0 (the
|
||||
# "License"); you may not use this file except in compliance
|
||||
# with the License. You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing,
|
||||
# software distributed under the License is distributed on an
|
||||
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
# KIND, either express or implied. See the License for the
|
||||
# specific language governing permissions and limitations
|
||||
# under the License.
|
||||
"""Tests for ``scripts/translations/backfill_po.py``.
|
||||
|
||||
The script is not installed as a package, so it is loaded via importlib from
|
||||
its filesystem path. The two units exercised here — ``parse_response`` and
|
||||
``_apply_translation`` — have enough edge cases (dict/list/scalar responses,
|
||||
plural vs singular entries, fuzzy flag, attribution comments) to be worth
|
||||
pinning against regressions.
|
||||
"""
|
||||
|
||||
import importlib.util
|
||||
import json # noqa: TID251 - testing a standalone script that uses stdlib json
|
||||
from pathlib import Path
|
||||
|
||||
import polib # type: ignore[import-untyped]
|
||||
import pytest
|
||||
|
||||
# The script is not importable as a package module, so locate it from the
# directory five levels up from this file and execute it by path.
_SCRIPT_PATH = Path(__file__).resolve().parents[4].joinpath(
    "scripts", "translations", "backfill_po.py"
)
_spec = importlib.util.spec_from_file_location("backfill_po", _SCRIPT_PATH)
assert _spec is not None, f"Could not load {_SCRIPT_PATH}"
assert _spec.loader is not None, f"No loader on spec for {_SCRIPT_PATH}"
backfill_po = importlib.util.module_from_spec(_spec)
_spec.loader.exec_module(backfill_po)
|
||||
|
||||
|
||||
def test_parse_response_singular_strings() -> None:
    """String values under numeric keys come back keyed by the integer index."""
    payload = '{"0": "hola", "1": "mundo"}'
    expected = {0: "hola", 1: "mundo"}
    assert backfill_po.parse_response(payload, batch_size=2) == expected
|
||||
|
||||
|
||||
def test_parse_response_strips_markdown_fences() -> None:
    """A response wrapped in a ```json code fence parses like the bare JSON."""
    fenced = '```json\n{"0": "hola"}\n```'
    result = backfill_po.parse_response(fenced, batch_size=1)
    assert result == {0: "hola"}
|
||||
|
||||
|
||||
def test_parse_response_preserves_plural_dict_as_json() -> None:
    """
    A nested-dict (plural) value must be handed back as a JSON string.

    The value is decoded again with json.loads below, so a Python repr of the
    dict (single-quoted) would not be acceptable.
    """
    payload = '{"0": {"0": "manzana", "1": "manzanas"}}'
    result = backfill_po.parse_response(payload, batch_size=1)
    assert set(result.keys()) == {0}
    # json.loads only succeeds on real JSON, which rules out str(dict) output.
    decoded = json.loads(result[0])
    assert decoded == {"0": "manzana", "1": "manzanas"}
|
||||
|
||||
|
||||
def test_parse_response_preserves_non_ascii() -> None:
    """Non-ASCII text in a plural value appears literally in the result."""
    payload = '{"0": {"0": "日本語", "1": "日本語s"}}'
    result = backfill_po.parse_response(payload, batch_size=1)
    serialized = result[0]
    assert "日本語" in serialized
|
||||
|
||||
|
||||
def test_parse_response_skips_non_numeric_keys() -> None:
    """Entries whose key is not a numeric string are left out of the result."""
    payload = '{"0": "ok", "comment": "ignored", "2": "kept"}'
    expected = {0: "ok", 2: "kept"}
    assert backfill_po.parse_response(payload, batch_size=3) == expected
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "raw",
    [
        '["hola", "mundo"]',
        '"just a string"',
        "null",
        "42",
    ],
)
def test_parse_response_rejects_non_object(raw: str) -> None:
    """
    Valid JSON that is not an object (list, string, null, number) is rejected.

    The expected failure is a ValueError carrying the "Expected a JSON object"
    message.
    """
    with pytest.raises(ValueError, match="Expected a JSON object"):
        backfill_po.parse_response(raw, batch_size=1)
|
||||
|
||||
|
||||
def test_parse_response_rejects_invalid_json() -> None:
    """Unparseable text raises ValueError with the JSON-parse message."""
    garbage = "not even close to json"
    with pytest.raises(ValueError, match="Could not parse response as JSON"):
        backfill_po.parse_response(garbage, batch_size=1)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# _apply_translation
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def _make_singular_entry(msgid: str = "Hello") -> polib.POEntry:
    """Build an untranslated singular entry (empty msgstr) for ``msgid``."""
    blank = polib.POEntry(msgid=msgid, msgstr="")
    return blank
|
||||
|
||||
|
||||
def _make_plural_entry(
    msgid: str = "%(n)s apple",
    msgid_plural: str = "%(n)s apples",
) -> polib.POEntry:
    """Build a plural entry with two empty plural forms (indices 0 and 1)."""
    plural = polib.POEntry(msgid=msgid, msgid_plural=msgid_plural)
    plural.msgstr_plural = dict.fromkeys((0, 1), "")
    return plural
|
||||
|
||||
|
||||
def _item(refs: list[str] | None = None) -> dict[str, list[str]]:
|
||||
return {"context_langs": refs if refs is not None else ["fr", "de"]}
|
||||
|
||||
|
||||
def test_apply_translation_singular_writes_msgstr_and_marks_fuzzy() -> None:
    """With mark_fuzzy=True a singular entry gets its msgstr and a fuzzy flag."""
    target = _make_singular_entry()
    item = _item(["fr", "de"])
    backfill_po._apply_translation(
        target, "Hola", item, model="claude-test", mark_fuzzy=True
    )
    assert target.msgstr == "Hola"
    assert "fuzzy" in target.flags
|
||||
|
||||
|
||||
def test_apply_translation_singular_no_fuzzy_when_disabled() -> None:
    """
    With mark_fuzzy=False the translation is written but not flagged fuzzy.

    The msgstr check keeps the test from passing vacuously: a freshly built
    entry carries no flags, so asserting only on the absent fuzzy flag would
    also succeed if the call did nothing at all.
    """
    entry = _make_singular_entry()
    backfill_po._apply_translation(
        entry, "Hola", _item(), model="claude-test", mark_fuzzy=False
    )
    assert entry.msgstr == "Hola"
    assert "fuzzy" not in entry.flags
|
||||
|
||||
|
||||
def test_apply_translation_attribution_includes_refs() -> None:
    """The translator comment names the script, the model and the ref langs."""
    target = _make_singular_entry()
    backfill_po._apply_translation(
        target, "Hola", _item(["fr", "de"]), model="claude-test", mark_fuzzy=True
    )
    comment = target.tcomment
    assert "Machine-translated via backfill_po.py (claude-test)" in comment
    assert "[refs: fr, de]" in comment
|
||||
|
||||
|
||||
def test_apply_translation_attribution_marks_no_refs() -> None:
    """An empty context_langs list is reported as "[no refs]" in the comment."""
    target = _make_singular_entry()
    without_refs = _item([])
    backfill_po._apply_translation(
        target, "Hola", without_refs, model="claude-test", mark_fuzzy=True
    )
    assert "[no refs]" in target.tcomment
|
||||
|
||||
|
||||
def test_apply_translation_attribution_appended_not_duplicated() -> None:
    """Attribution goes after an existing comment and is added only once."""
    marker = "Machine-translated via backfill_po.py"
    target = _make_singular_entry()
    target.tcomment = "Existing maintainer note"

    def apply_once() -> None:
        backfill_po._apply_translation(
            target, "Hola", _item(["fr"]), model="claude-test", mark_fuzzy=True
        )

    # First run: the maintainer note stays first, attribution follows it.
    apply_once()
    assert target.tcomment.startswith("Existing maintainer note\n")
    assert marker in target.tcomment

    # Second run on the same entry must not add a second attribution.
    apply_once()
    assert target.tcomment.count(marker) == 1
|
||||
|
||||
|
||||
def test_apply_translation_plural_dict_response() -> None:
    """A JSON object of forms is written index-by-index into msgstr_plural."""
    target = _make_plural_entry()
    forms = {"0": "manzana", "1": "manzanas"}
    backfill_po._apply_translation(
        target, json.dumps(forms), _item(), model="claude-test", mark_fuzzy=True
    )
    assert target.msgstr_plural == {0: "manzana", 1: "manzanas"}
    assert "fuzzy" in target.flags
|
||||
|
||||
|
||||
def test_apply_translation_plural_scalar_json_fills_all_forms() -> None:
    """A JSON string literal given for a plural entry fills every plural form."""
    target = _make_plural_entry()
    scalar = '"manzana"'
    backfill_po._apply_translation(
        target, scalar, _item(), model="claude-test", mark_fuzzy=True
    )
    assert target.msgstr_plural == {0: "manzana", 1: "manzana"}
|
||||
|
||||
|
||||
def test_apply_translation_plural_invalid_json_fills_all_forms() -> None:
    """Text that is not JSON fills every plural form instead of raising."""
    target = _make_plural_entry()
    bare = "manzana"
    backfill_po._apply_translation(
        target, bare, _item(), model="claude-test", mark_fuzzy=True
    )
    assert target.msgstr_plural == {0: "manzana", 1: "manzana"}
|
||||
|
||||
|
||||
def test_apply_translation_plural_round_trip_from_parse_response() -> None:
    """
    parse_response output for a plural object feeds straight into
    _apply_translation, and both plural forms survive the hand-off.
    """
    target = _make_plural_entry()
    parsed = backfill_po.parse_response(
        '{"0": {"0": "manzana", "1": "manzanas"}}', batch_size=1
    )
    backfill_po._apply_translation(
        target, parsed[0], _item(), model="claude-test", mark_fuzzy=True
    )
    assert target.msgstr_plural == {0: "manzana", 1: "manzanas"}
|
||||
|
||||
|
||||
def test_apply_translation_plural_list_response() -> None:
    """
    A JSON array of forms is spread over the plural indices in order:
    element 0 goes to form 0, element 1 to form 1.
    """
    target = _make_plural_entry()
    as_array = json.dumps(["manzana", "manzanas"])
    backfill_po._apply_translation(
        target, as_array, _item(), model="claude-test", mark_fuzzy=True
    )
    assert target.msgstr_plural == {0: "manzana", 1: "manzanas"}
|
||||
|
||||
|
||||
def test_apply_translation_plural_list_round_trip_from_parse_response() -> None:
    """
    A list-valued plural response passes through parse_response and then
    _apply_translation with each element landing on its own plural form.
    """
    target = _make_plural_entry()
    parsed = backfill_po.parse_response('{"0": ["manzana", "manzanas"]}', batch_size=1)
    backfill_po._apply_translation(
        target, parsed[0], _item(), model="claude-test", mark_fuzzy=True
    )
    assert target.msgstr_plural == {0: "manzana", 1: "manzanas"}
|
||||
|
||||
|
||||
def test_apply_translation_plural_list_shorter_repeats_last_form() -> None:
    """A two-item list applied to three plural slots repeats the last item."""
    target = polib.POEntry(msgid="apple", msgid_plural="apples")
    target.msgstr_plural = {slot: "" for slot in range(3)}
    two_forms = json.dumps(["uno", "dos"])
    backfill_po._apply_translation(
        target, two_forms, _item(), model="claude-test", mark_fuzzy=True
    )
    assert target.msgstr_plural == {0: "uno", 1: "dos", 2: "dos"}
|
||||
|
||||
|
||||
def test_apply_translation_plural_empty_list_falls_back_to_string_broadcast() -> None:
    """An empty JSON list ends up as the raw "[]" text in every plural form."""
    target = _make_plural_entry()
    empty_array = "[]"
    backfill_po._apply_translation(
        target, empty_array, _item(), model="claude-test", mark_fuzzy=True
    )
    # Expected outcome pinned here: the literal two-character string, not an
    # empty value, is written to both slots.
    assert target.msgstr_plural == {0: "[]", 1: "[]"}
|
||||
|
||||
|
||||
def test_build_prompt_includes_plural_note_when_plural_is_not_first() -> None:
    """
    The plural guidance must appear in the prompt even when the plural entry
    is not the first item of the batch.
    """
    singular = {"msgid": "Save", "msgstr": "", "index_key": "Save"}
    plural = {
        "msgid": "%(num)d row",
        "msgid_plural": "%(num)d rows",
        "msgstr_plural": {0: "", 1: ""},
        "index_key": "%(num)d row\x00%(num)d rows",
    }
    prompt = backfill_po.build_prompt("fr", [singular, plural], index={})
    assert "provide ALL plural forms" in prompt
|
||||
@@ -0,0 +1,256 @@
|
||||
# Licensed to the Apache Software Foundation (ASF) under one
|
||||
# or more contributor license agreements. See the NOTICE file
|
||||
# distributed with this work for additional information
|
||||
# regarding copyright ownership. The ASF licenses this file
|
||||
# to you under the Apache License, Version 2.0 (the
|
||||
# "License"); you may not use this file except in compliance
|
||||
# with the License. You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing,
|
||||
# software distributed under the License is distributed on an
|
||||
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
# KIND, either express or implied. See the License for the
|
||||
# specific language governing permissions and limitations
|
||||
# under the License.
|
||||
"""Tests for ``scripts/translations/build_translation_index.py``.
|
||||
|
||||
The script is not installed as a package, so it is loaded via importlib
|
||||
from its filesystem path. The units exercised here pin the cross-language
|
||||
index shape that the AI backfill prompt depends on: fuzzy entries must be
|
||||
excluded (so unreviewed drafts don't feed back as trusted context), every
|
||||
entry must have a slot for every language (null when missing), and plural
|
||||
entries must be keyed by the ``msgid\\x00msgid_plural`` composite.
|
||||
"""
|
||||
|
||||
import importlib.util
|
||||
from pathlib import Path
|
||||
|
||||
import polib # type: ignore[import-untyped]
|
||||
import pytest
|
||||
|
||||
# The script is not importable as a package module, so locate it from the
# directory five levels up from this file and execute it by path.
_SCRIPT_PATH = Path(__file__).resolve().parents[4].joinpath(
    "scripts", "translations", "build_translation_index.py"
)
_spec = importlib.util.spec_from_file_location("build_translation_index", _SCRIPT_PATH)
assert _spec is not None, f"Could not load {_SCRIPT_PATH}"
assert _spec.loader is not None, f"No loader on spec for {_SCRIPT_PATH}"
build_translation_index = importlib.util.module_from_spec(_spec)
_spec.loader.exec_module(build_translation_index)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# _is_translated
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def test_is_translated_empty_singular() -> None:
    """A singular entry with an empty msgstr is reported as not translated."""
    blank = polib.POEntry(msgid="Hello", msgstr="")
    verdict = build_translation_index._is_translated(blank)
    assert verdict is False
|
||||
|
||||
|
||||
def test_is_translated_populated_singular() -> None:
    """A singular entry with a non-empty msgstr is reported as translated."""
    filled = polib.POEntry(msgid="Hello", msgstr="Hola")
    verdict = build_translation_index._is_translated(filled)
    assert verdict is True
|
||||
|
||||
|
||||
def test_is_translated_fuzzy_entry_is_not_trusted() -> None:
    """
    A fuzzy-flagged singular entry must be reported as not translated, even
    though its msgstr is filled in.
    """
    draft = polib.POEntry(msgid="Hello", msgstr="Hola", flags=["fuzzy"])
    verdict = build_translation_index._is_translated(draft)
    assert verdict is False
|
||||
|
||||
|
||||
def test_is_translated_plural_any_form_counts() -> None:
    """One filled plural form is enough for the entry to count as translated."""
    partial = polib.POEntry(msgid="apple", msgid_plural="apples")
    partial.msgstr_plural = {0: "manzana", 1: ""}
    verdict = build_translation_index._is_translated(partial)
    assert verdict is True
|
||||
|
||||
|
||||
def test_is_translated_plural_all_empty() -> None:
    """A plural entry whose forms are all empty is reported as not translated."""
    blank = polib.POEntry(msgid="apple", msgid_plural="apples")
    blank.msgstr_plural = dict.fromkeys((0, 1), "")
    verdict = build_translation_index._is_translated(blank)
    assert verdict is False
|
||||
|
||||
|
||||
def test_is_translated_plural_fuzzy_is_not_trusted() -> None:
    """A fuzzy plural entry is not translated even with every form filled."""
    draft = polib.POEntry(msgid="apple", msgid_plural="apples", flags=["fuzzy"])
    draft.msgstr_plural = {0: "manzana", 1: "manzanas"}
    verdict = build_translation_index._is_translated(draft)
    assert verdict is False
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# _plural_key
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def test_plural_key_uses_null_byte_separator() -> None:
    """The plural key is msgid and msgid_plural joined by a single NUL."""
    plural = polib.POEntry(msgid="apple", msgid_plural="apples")
    key = build_translation_index._plural_key(plural)
    assert key == "apple\x00apples"
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# build_index
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def _write_po(path: Path, entries: list[polib.POEntry]) -> None:
    """Save ``entries`` as a new .po file at ``path``, creating parent dirs."""
    path.parent.mkdir(parents=True, exist_ok=True)
    catalog = polib.POFile()
    # Minimal header metadata: UTF-8 plain text, 8bit transfer encoding.
    catalog.metadata = {
        "Content-Type": "text/plain; charset=UTF-8",
        "Content-Transfer-Encoding": "8bit",
    }
    for item in entries:
        catalog.append(item)
    catalog.save(str(path))
|
||||
|
||||
|
||||
@pytest.fixture
def translations_dir(tmp_path: Path) -> Path:
    """
    Create a small translations tree under ``tmp_path`` and return its root.

    Catalogs written, in this order:
    - es: "Hello" translated, "World" empty, plural "apple"/"apples" translated
    - fr: "Hello" translated but fuzzy, "World" translated, plural left empty
    - en: a single untranslated "Hello" entry
    """
    root = tmp_path / "translations"
    catalogs = {
        "es": [
            polib.POEntry(msgid="Hello", msgstr="Hola"),
            polib.POEntry(msgid="World", msgstr=""),
            _plural_entry("apple", "apples", {0: "manzana", 1: "manzanas"}),
        ],
        "fr": [
            polib.POEntry(msgid="Hello", msgstr="Bonjour", flags=["fuzzy"]),
            polib.POEntry(msgid="World", msgstr="Monde"),
            _plural_entry("apple", "apples", {0: "", 1: ""}),
        ],
        "en": [polib.POEntry(msgid="Hello", msgstr="")],
    }
    for lang, entries in catalogs.items():
        _write_po(root / lang / "LC_MESSAGES" / "messages.po", entries)
    return root
|
||||
|
||||
|
||||
def _plural_entry(
    msgid: str, msgid_plural: str, plurals: dict[int, str]
) -> polib.POEntry:
    """Build a plural entry whose msgstr_plural is the given ``plurals`` dict."""
    plural = polib.POEntry(msgid=msgid, msgid_plural=msgid_plural)
    # The dict is assigned as-is (not copied).
    plural.msgstr_plural = plurals
    return plural
|
||||
|
||||
|
||||
def test_build_index_excludes_en(translations_dir: Path) -> None:
    """
    ``en`` is the source locale and must never appear in the index.

    The index is first checked to be non-empty; otherwise the loop below
    would run zero times and the test would pass without checking anything.
    """
    index = build_translation_index.build_index(translations_dir)
    assert index, "build_index returned an empty index for a populated fixture"
    for value in index.values():
        assert "en" not in value
|
||||
|
||||
|
||||
def test_build_index_records_singular_translations(translations_dir: Path) -> None:
    """Translated singular entries are stored as index[msgid][lang] strings."""
    index = build_translation_index.build_index(translations_dir)
    hello, world = index["Hello"], index["World"]
    assert hello["es"] == "Hola"
    assert world["fr"] == "Monde"
|
||||
|
||||
|
||||
def test_build_index_fuzzy_entries_become_null(translations_dir: Path) -> None:
    """
    A fuzzy translation must be recorded as None rather than as its text.

    In the fixture, French "Hello" is filled in but flagged fuzzy, so its
    slot is expected to be None.
    """
    index = build_translation_index.build_index(translations_dir)
    french_hello = index["Hello"]["fr"]
    assert french_hello is None
|
||||
|
||||
|
||||
def test_build_index_missing_translations_become_null(
    translations_dir: Path,
) -> None:
    """An entry with an empty msgstr is recorded as None, not as ""."""
    index = build_translation_index.build_index(translations_dir)
    spanish_world = index["World"]["es"]
    assert spanish_world is None
|
||||
|
||||
|
||||
def test_build_index_fills_every_language_slot(translations_dir: Path) -> None:
    """
    Every key in the index must carry exactly one slot per non-en language.

    The fixture provides ``es`` and ``fr`` (plus ``en``), so each value is
    expected to have exactly those two keys. The index is first checked to be
    non-empty; otherwise the loop would run zero times and the test would
    pass without checking anything.
    """
    index = build_translation_index.build_index(translations_dir)
    assert index, "build_index returned an empty index for a populated fixture"
    expected_langs = {"es", "fr"}
    for key, value in index.items():
        assert set(value.keys()) == expected_langs, (
            f"{key!r} missing language slots: {set(value.keys())}"
        )
|
||||
|
||||
|
||||
def test_build_index_plural_uses_composite_key(translations_dir: Path) -> None:
    """A plural entry is indexed under the NUL-joined key, not the bare msgid."""
    index = build_translation_index.build_index(translations_dir)
    composite, bare = "apple\x00apples", "apple"
    assert composite in index
    assert bare not in index
|
||||
|
||||
|
||||
def test_build_index_plural_translated_stored_as_dict(
    translations_dir: Path,
) -> None:
    """A translated plural is stored as a dict of form index to text."""
    index = build_translation_index.build_index(translations_dir)
    spanish_forms = index["apple\x00apples"]["es"]
    assert spanish_forms == {0: "manzana", 1: "manzanas"}
|
||||
|
||||
|
||||
def test_build_index_plural_untranslated_stored_as_null(
    translations_dir: Path,
) -> None:
    """A plural with every form empty is recorded as None, not as a dict."""
    index = build_translation_index.build_index(translations_dir)
    french_forms = index["apple\x00apples"]["fr"]
    assert french_forms is None
|
||||
|
||||
|
||||
def test_build_index_skips_languages_without_messages_po(tmp_path: Path) -> None:
    """
    Only directories that hold ``LC_MESSAGES/messages.po`` count as languages.

    A directory without a catalog and a stray file next to the language
    directories must leave the index untouched.
    """
    root = tmp_path / "translations"
    es_catalog = root / "es" / "LC_MESSAGES" / "messages.po"
    _write_po(es_catalog, [polib.POEntry(msgid="Hello", msgstr="Hola")])
    # Noise to be ignored: a catalog-less directory and a plain file.
    (root / "scratch").mkdir()
    (root / ".DS_Store").touch()

    index = build_translation_index.build_index(root)
    assert index == {"Hello": {"es": "Hola"}}
|
||||
|
||||
|
||||
def test_build_index_skips_header_entry(tmp_path: Path) -> None:
    """
    The empty-msgid header entry of a .po file must not become an index key.
    """
    root = tmp_path / "translations"
    es_catalog = root / "es" / "LC_MESSAGES" / "messages.po"
    _write_po(es_catalog, [polib.POEntry(msgid="Hello", msgstr="Hola")])

    index = build_translation_index.build_index(root)
    assert "" not in index
|
||||
Reference in New Issue
Block a user