Improve linting of PO files

During the mogration to git e9d1cb6cc7 removed the
`lint-translations.sh` script which was used together with
`check_translations.py` to lint POT- and PO-files. By doing so it
removed a valuable option to find problems in translations.

This commit undos that by merging the functionality of the removed
`lint-translations.sh` and `check_translations.py` into the latter
one. The new logic is easier to maintain, produces less false-positives
and has some unit test coverage.
This commit is contained in:
Dunedan
2024-10-17 19:16:54 +02:00
parent b2cdb1e6b4
commit 0bbd2ad7dd
3 changed files with 333 additions and 110 deletions
+219 -110
View File
@@ -1,6 +1,6 @@
#!/usr/bin/env python3
#
# Copyright (C) 2022 Wildfire Games.
# Copyright (C) 2024 Wildfire Games.
# This file is part of 0 A.D.
#
# 0 A.D. is free software: you can redistribute it and/or modify
@@ -16,139 +16,248 @@
# You should have received a copy of the GNU General Public License
# along with 0 A.D. If not, see <http://www.gnu.org/licenses/>.
import multiprocessing
import os
"""Lint PO- and POT-files."""
import logging
import re
import sys
from collections.abc import Generator
from itertools import chain
from pathlib import Path
from i18n_helper import L10N_FOLDER_NAME, PROJECT_ROOT_DIRECTORY
from i18n_helper.catalog import Catalog
from i18n_helper.globber import get_catalogs
import click
from click import ClickException
from dennis.linter import ERROR, LintedEntry, Linter, LintMessage, LintRule, get_lint_rules
from dennis.templatelinter import TemplateLinter
from dennis.templatelinter import get_lint_rules as get_template_lint_rules
from dennis.tools import VariableTokenizer, get_available_formats, withlines
VERBOSE = 0
PROJECT_ROOT_DIRECTORY = Path(__file__).parent.parent.parent.parent
L10N_FOLGER_NAME = "l10n"
logger = logging.getLogger(__name__)
class MessageChecker:
"""Checks all messages in a catalog against a regex."""
class URLSpamRule(LintRule):
num = "E901"
name = "urlspam"
desc = "msgstr contains URL not present in msgid"
def __init__(self, human_name, regex):
self.regex = re.compile(regex, re.IGNORECASE)
self.human_name = human_name
def check(self, input_file_path, template_message, translated_catalogs):
patterns = set(
self.regex.findall(
template_message.id[0] if template_message.pluralizable else template_message.id
)
def __init__(self):
super().__init__()
self.url_regex = re.compile(
r"https?://(?:[a-z0-9-_$@./&+]|(?:%[0-9a-fA-F][" r"0-9a-fA-F]))+", re.IGNORECASE
)
# As a sanity check, verify that the template message is coherent.
# Note that these tend to be false positives.
# TODO: the pssible tags are usually comments, we ought be able to find them.
if template_message.pluralizable:
plural_urls = set(self.regex.findall(template_message.id[1]))
if plural_urls.difference(patterns):
print(
f"{input_file_path} - Different {self.human_name} in "
f"singular and plural source strings "
f"for '{template_message}' in '{input_file_path}'"
)
def lint(self, _: VariableTokenizer, linted_entry: LintedEntry) -> list[LintMessage]:
msgs = []
for translation_catalog in translated_catalogs:
translation_message = translation_catalog.get(
template_message.id, template_message.context
)
if not translation_message:
for trstr in linted_entry.strs:
urls = self.url_regex.findall(trstr.msgstr_string)
if not urls:
continue
translated_patterns = set(
self.regex.findall(
translation_message.string[0]
if translation_message.pluralizable
else translation_message.string
)
)
unknown_patterns = translated_patterns.difference(patterns)
if unknown_patterns:
print(
f'{input_file_path} - {translation_catalog.locale}: '
f'Found unknown {self.human_name} '
f'{", ".join(["`" + x + "`" for x in unknown_patterns])} '
f'in the translation which do not match any of the URLs '
f'in the template: {", ".join(["`" + x + "`" for x in patterns])}'
for url in urls:
if url in linted_entry.msgid:
continue
msgs.append(
LintMessage(
ERROR,
linted_entry.poentry.linenum,
0,
self.num,
f"translation contains spam URL: {url}",
linted_entry.poentry,
)
)
if template_message.pluralizable and translation_message.pluralizable:
for indx, val in enumerate(translation_message.string):
if indx == 0:
continue
translated_patterns_multi = set(self.regex.findall(val))
unknown_patterns_multi = translated_patterns_multi.difference(plural_urls)
if unknown_patterns_multi:
print(
f'{input_file_path} - {translation_catalog.locale}: '
f'Found unknown {self.human_name} '
f'{", ".join(["`" + x + "`" for x in unknown_patterns_multi])} '
f'in the pluralised translation which do not '
f'match any of the URLs in the template: '
f'{", ".join(["`" + x + "`" for x in plural_urls])}'
)
return msgs
def check_translations(input_file_path):
if VERBOSE:
print(f"Checking {input_file_path}")
template_catalog = Catalog.read_from(input_file_path)
class MissingTagRule(LintRule):
num = "E902"
name = "missing-tag"
desc = "tags present in msgid aren't present in msgstr"
# If language codes were specified on the command line, filter by those.
filters = sys.argv[1:]
def __init__(self):
super().__init__()
self.tag_regex = re.compile(r"(?<![\\\\])\[[^]]+\]")
# Load existing translation catalogs.
existing_translation_catalogs = get_catalogs(input_file_path, filters)
def lint(self, _: VariableTokenizer, linted_entry: LintedEntry) -> list[LintMessage]:
msgs = []
spam = MessageChecker("url", r"https?://(?:[a-z0-9-_$@./&+]|(?:%[0-9a-fA-F][0-9a-fA-F]))+")
sprintf = MessageChecker("sprintf", r"%\([^)]+\)s")
tags = MessageChecker("tag", r"[^\\][^\\](\[[^]]+/?\])")
for trstr in linted_entry.strs:
msgid_tags = []
for s in trstr.msgid_strings:
msgid_tags += self.tag_regex.findall(s)
# Check that there are no spam URLs.
# Loop through all messages in the .POT catalog for URLs.
# For each, check for the corresponding key in the .PO catalogs.
# If found, check that URLS in the .PO keys are the same as those in the .POT key.
for template_message in template_catalog:
spam.check(input_file_path, template_message, existing_translation_catalogs)
sprintf.check(input_file_path, template_message, existing_translation_catalogs)
tags.check(input_file_path, template_message, existing_translation_catalogs)
msgstr_tags = self.tag_regex.findall(trstr.msgstr_string)
if not msgstr_tags:
continue
if VERBOSE:
print(f"Done checking {input_file_path}")
missing_in_msgstr = [tag for tag in msgid_tags if tag not in msgstr_tags]
if missing_in_msgstr:
msgs.append(
LintMessage(
ERROR,
linted_entry.poentry.linenum,
0,
self.num,
f"missing tags: {', '.join(sorted(missing_in_msgstr))}",
linted_entry.poentry,
)
)
return msgs
def main():
print(
"\n\tWARNING: Remember to regenerate the POT files with “update_templates.py” "
"before you run this script.\n\tPOT files are not in the repository.\n"
)
found_pots = 0
for root, _folders, filenames in os.walk(PROJECT_ROOT_DIRECTORY):
for filename in filenames:
if (
len(filename) > 4
and filename[-4:] == ".pot"
and os.path.basename(root) == L10N_FOLDER_NAME
):
found_pots += 1
multiprocessing.Process(
target=check_translations, args=(os.path.join(root, filename),)
).start()
if found_pots == 0:
print(
"This script did not work because no '.pot' files were found. "
"Please run 'update_templates.py' to generate the '.pot' files, "
"and run 'pull_translations.py' to pull the latest translations from Transifex. "
"Then you can run this script to check for spam in translations."
)
class InvalidTagRule(LintRule):
num = "E903"
name = "invalid-tag"
desc = "tags not present in msgid are present in msgstr"
def __init__(self):
super().__init__()
self.tag_regex = re.compile(r"(?<![\\\\])\[[^]]+\]")
def lint(self, _: VariableTokenizer, linted_entry: LintedEntry) -> list[LintMessage]:
msgs = []
for trstr in linted_entry.strs:
msgid_tags = []
for s in trstr.msgid_strings:
msgid_tags += self.tag_regex.findall(s)
msgstr_tags = self.tag_regex.findall(trstr.msgstr_string)
if not msgstr_tags:
continue
not_in_msgid = [tag for tag in msgstr_tags if tag not in msgid_tags]
if not_in_msgid:
msgs.append(
LintMessage(
ERROR,
linted_entry.poentry.linenum,
0,
self.num,
f"invalid tags: {', '.join(sorted(not_in_msgid))}",
linted_entry.poentry,
)
)
return msgs
def get_relative_path(path: Path) -> Path:
"""Get relative path to project directory.
If the path isn't below the project directory, return the unchanged path.
"""
try:
return path.relative_to(PROJECT_ROOT_DIRECTORY)
except ValueError:
return path
def get_po_files() -> Generator[Path, None, None]:
"""Return a list of PO- and POT-files in the project."""
for l10n_dir in Path(PROJECT_ROOT_DIRECTORY).glob(f"**/{L10N_FOLGER_NAME}"):
yield from chain(l10n_dir.glob("*.po"), l10n_dir.glob("*.pot"))
def lint_files(paths: list[Path]) -> Generator[tuple[Path, LintMessage], None, None]:
"""Lint files and return results."""
varformat = get_available_formats().keys()
lint_rules = get_lint_rules()
del lint_rules["W302"]
template_lint_rules = get_template_lint_rules()
linter = Linter(varformat, lint_rules)
template_linter = TemplateLinter(varformat, template_lint_rules)
for path in paths:
if path.suffix == ".po":
results = linter.verify_file(path)
elif path.suffix == ".pot":
results = template_linter.verify_file(path)
else:
raise ValueError(f"Unexpected file type for {path}")
for result in results:
yield path, result
def print_results(
results: Generator[tuple[Path, LintMessage], None, None], num_files: int
) -> None:
"""Format results and print them to stdout."""
num_warnings = 0
num_errors = 0
for path, result in results:
if result.kind == "err":
code_color = "red"
num_errors += 1
elif result.kind == "warn":
code_color = "yellow"
num_warnings += 1
else:
raise ValueError(f"Unexpected lint result code {result.kind}")
click.secho(f"{click.format_filename(get_relative_path(path))}", bold=True, nl=False)
click.echo(f":{result.line}: ", nl=False)
click.echo(click.style(result.code, fg=code_color) + f" {result.msg}")
click.echo(withlines(result.poentry.linenum, result.poentry.original))
click.echo()
click.echo(f"Found {num_errors} errors and {num_warnings} warnings in {num_files} files.")
if num_errors > 0:
sys.exit(1)
def run(
locale: str, files: list[Path]
) -> tuple[Generator[tuple[Path, LintMessage], None, None], int]:
"""Collect files to lint and run linting."""
if files:
po_files = []
for f in files:
if f.suffix in [".po", ".pot"]:
po_files.append(f)
else:
logger.warning(
"%s is not a valid PO- or POT-file. Skipping.", get_relative_path(f)
)
else:
po_files = list(get_po_files())
if locale:
files_to_check = []
for f in po_files:
lang_code = f.stem.split(".")[0]
if lang_code == locale:
files_to_check.append(f)
else:
files_to_check = po_files
if len(files_to_check) == 0:
raise ClickException("Found no files to check.")
return lint_files(files_to_check), len(files_to_check)
@click.command()
@click.option("--locale", help="Only check translations for the given locale")
@click.argument("files", nargs=-1, type=click.Path(exists=True, resolve_path=True, path_type=Path))
def cli(locale, files):
"""Lint PO- and POT-files.
Provide one or multiple FILES to check. If omitted all files in
the project will be checked.
"""
lint_results, num_checked_files = run(locale, files)
print_results(lint_results, num_checked_files)
if __name__ == "__main__":
main()
cli()
+2
View File
@@ -1,2 +1,4 @@
babel~=2.12
click~=8.1
dennis~=1.1
lxml~=4.5
@@ -0,0 +1,112 @@
from pathlib import Path
from tempfile import NamedTemporaryFile
from unittest import TestCase
from check_translations import lint_files
class TestCheckTranslations(TestCase):
def test_var_missing_in_msgstr(self):
with NamedTemporaryFile(mode="w+", suffix=".po") as f:
f.write(r"""
#: simulation/components/EntityLimits.js:195
#, javascript-format
msgid "%(category)s build limit of %(limit)s reached"
msgstr "Baulimit von %(limit)s erreicht"
""")
f.flush()
path_to_lint = Path(f.name)
lint_results = list(lint_files([path_to_lint]))
self.assertEqual(1, len(lint_results))
linted_file, lint_result = lint_results[0]
self.assertEqual(linted_file, path_to_lint)
self.assertEqual("W202", lint_result.code)
self.assertEqual("warn", lint_result.kind)
self.assertEqual("missing variables: %(category)s", lint_result.msg)
def test_var_missing_in_msgid(self):
with NamedTemporaryFile(mode="w+", suffix=".po") as f:
f.write(r"""
#: simulation/components/EntityLimits.js:195
#, javascript-format
msgid "build limit of %(limit)s reached"
msgstr "%(category)s Baulimit von %(limit)s erreicht"
""")
f.flush()
path_to_lint = Path(f.name)
lint_results = list(lint_files([path_to_lint]))
self.assertEqual(1, len(lint_results))
linted_file, lint_result = lint_results[0]
self.assertEqual(linted_file, path_to_lint)
self.assertEqual("E201", lint_result.code)
self.assertEqual("err", lint_result.kind)
self.assertEqual("invalid variables: %(category)s", lint_result.msg)
def test_spam_url_detection(self):
with NamedTemporaryFile(mode="w+", suffix=".po") as f:
f.write(r"""
#: gui/incompatible_mods/incompatible_mods.xml:(caption):12
msgid "Incompatible mods"
msgstr "Inkompatible Mods http://example.tld/"
""")
f.flush()
path_to_lint = Path(f.name)
lint_results = list(lint_files([path_to_lint]))
self.assertEqual(1, len(lint_results))
linted_file, lint_result = lint_results[0]
self.assertEqual(linted_file, path_to_lint)
self.assertEqual("E901", lint_result.code)
self.assertEqual("err", lint_result.kind)
self.assertEqual("translation contains spam URL: http://example.tld/", lint_result.msg)
def test_missing_tag_in_msgstr(self):
with NamedTemporaryFile(mode="w+", suffix=".po") as f:
f.write(r"""
#: gui/prelobby/common/terms/Privacy_Policy.txt:6
msgid "[font=\"sans-bold-14\"]1. Player name[/font]"
msgstr "[font=\"sans-bold-14\"]1. Spielername"
""")
f.flush()
path_to_lint = Path(f.name)
lint_results = list(lint_files([path_to_lint]))
self.assertEqual(1, len(lint_results))
linted_file, lint_result = lint_results[0]
self.assertEqual(linted_file, path_to_lint)
self.assertEqual("E902", lint_result.code)
self.assertEqual("err", lint_result.kind)
self.assertEqual("missing tags: [/font]", lint_result.msg)
def test_missing_tag_in_msgstr_escaped(self):
with NamedTemporaryFile(mode="w+", suffix=".po") as f:
f.write(r"""
#: gui/pregame/MainMenuItems.js:194
msgid "Launch the multiplayer lobby. \\[DISABLED BY BUILD]"
msgstr "Multiplayer-Lobby starten. \\[IN BUILD DEAKTIVIERT]"
""")
f.flush()
path_to_lint = Path(f.name)
lint_results = list(lint_files([path_to_lint]))
self.assertEqual(0, len(lint_results))
def test_invalid_tag_in_msgstr(self):
with NamedTemporaryFile(mode="w+", suffix=".po") as f:
f.write(r"""
#: gui/prelobby/common/terms/Privacy_Policy.txt:6
msgid "[font=\"sans-bold-14\"]1. Player name"
msgstr "[font=\"sans-bold-14\"]1. Spielername[/font]"
""")
f.flush()
path_to_lint = Path(f.name)
lint_results = list(lint_files([path_to_lint]))
self.assertEqual(1, len(lint_results))
linted_file, lint_result = lint_results[0]
self.assertEqual(linted_file, path_to_lint)
self.assertEqual("E903", lint_result.code)
self.assertEqual("err", lint_result.kind)
self.assertEqual("invalid tags: [/font]", lint_result.msg)