From 0bbd2ad7ddeacbc549090f6ea1a91354ef097548 Mon Sep 17 00:00:00 2001 From: Dunedan Date: Thu, 17 Oct 2024 19:16:54 +0200 Subject: [PATCH] Improve linting of PO files During the mogration to git e9d1cb6cc7 removed the `lint-translations.sh` script which was used together with `check_translations.py` to lint POT- and PO-files. By doing so it removed a valuable option to find problems in translations. This commit undos that by merging the functionality of the removed `lint-translations.sh` and `check_translations.py` into the latter one. The new logic is easier to maintain, produces less false-positives and has some unit test coverage. --- source/tools/i18n/check_translations.py | 329 ++++++++++++------ source/tools/i18n/requirements.txt | 2 + .../i18n/tests/test_check_translations.py | 112 ++++++ 3 files changed, 333 insertions(+), 110 deletions(-) create mode 100644 source/tools/i18n/tests/test_check_translations.py diff --git a/source/tools/i18n/check_translations.py b/source/tools/i18n/check_translations.py index 9d4044e3f6..8f614101b2 100755 --- a/source/tools/i18n/check_translations.py +++ b/source/tools/i18n/check_translations.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # -# Copyright (C) 2022 Wildfire Games. +# Copyright (C) 2024 Wildfire Games. # This file is part of 0 A.D. # # 0 A.D. is free software: you can redistribute it and/or modify @@ -16,139 +16,248 @@ # You should have received a copy of the GNU General Public License # along with 0 A.D. If not, see . -import multiprocessing -import os +"""Lint PO- and POT-files.""" + +import logging import re import sys +from collections.abc import Generator +from itertools import chain +from pathlib import Path -from i18n_helper import L10N_FOLDER_NAME, PROJECT_ROOT_DIRECTORY -from i18n_helper.catalog import Catalog -from i18n_helper.globber import get_catalogs +import click +from click import ClickException +from dennis.linter import ERROR, LintedEntry, Linter, LintMessage, LintRule, get_lint_rules +from dennis.templatelinter import TemplateLinter +from dennis.templatelinter import get_lint_rules as get_template_lint_rules +from dennis.tools import VariableTokenizer, get_available_formats, withlines -VERBOSE = 0 +PROJECT_ROOT_DIRECTORY = Path(__file__).parent.parent.parent.parent +L10N_FOLGER_NAME = "l10n" + +logger = logging.getLogger(__name__) -class MessageChecker: - """Checks all messages in a catalog against a regex.""" +class URLSpamRule(LintRule): + num = "E901" + name = "urlspam" + desc = "msgstr contains URL not present in msgid" - def __init__(self, human_name, regex): - self.regex = re.compile(regex, re.IGNORECASE) - self.human_name = human_name - - def check(self, input_file_path, template_message, translated_catalogs): - patterns = set( - self.regex.findall( - template_message.id[0] if template_message.pluralizable else template_message.id - ) + def __init__(self): + super().__init__() + self.url_regex = re.compile( + r"https?://(?:[a-z0-9-_$@./&+]|(?:%[0-9a-fA-F][" r"0-9a-fA-F]))+", re.IGNORECASE ) - # As a sanity check, verify that the template message is coherent. - # Note that these tend to be false positives. - # TODO: the pssible tags are usually comments, we ought be able to find them. - if template_message.pluralizable: - plural_urls = set(self.regex.findall(template_message.id[1])) - if plural_urls.difference(patterns): - print( - f"{input_file_path} - Different {self.human_name} in " - f"singular and plural source strings " - f"for '{template_message}' in '{input_file_path}'" - ) + def lint(self, _: VariableTokenizer, linted_entry: LintedEntry) -> list[LintMessage]: + msgs = [] - for translation_catalog in translated_catalogs: - translation_message = translation_catalog.get( - template_message.id, template_message.context - ) - if not translation_message: + for trstr in linted_entry.strs: + urls = self.url_regex.findall(trstr.msgstr_string) + if not urls: continue - translated_patterns = set( - self.regex.findall( - translation_message.string[0] - if translation_message.pluralizable - else translation_message.string - ) - ) - unknown_patterns = translated_patterns.difference(patterns) - if unknown_patterns: - print( - f'{input_file_path} - {translation_catalog.locale}: ' - f'Found unknown {self.human_name} ' - f'{", ".join(["`" + x + "`" for x in unknown_patterns])} ' - f'in the translation which do not match any of the URLs ' - f'in the template: {", ".join(["`" + x + "`" for x in patterns])}' + for url in urls: + if url in linted_entry.msgid: + continue + + msgs.append( + LintMessage( + ERROR, + linted_entry.poentry.linenum, + 0, + self.num, + f"translation contains spam URL: {url}", + linted_entry.poentry, + ) ) - if template_message.pluralizable and translation_message.pluralizable: - for indx, val in enumerate(translation_message.string): - if indx == 0: - continue - translated_patterns_multi = set(self.regex.findall(val)) - unknown_patterns_multi = translated_patterns_multi.difference(plural_urls) - if unknown_patterns_multi: - print( - f'{input_file_path} - {translation_catalog.locale}: ' - f'Found unknown {self.human_name} ' - f'{", ".join(["`" + x + "`" for x in unknown_patterns_multi])} ' - f'in the pluralised translation which do not ' - f'match any of the URLs in the template: ' - f'{", ".join(["`" + x + "`" for x in plural_urls])}' - ) + return msgs -def check_translations(input_file_path): - if VERBOSE: - print(f"Checking {input_file_path}") - template_catalog = Catalog.read_from(input_file_path) +class MissingTagRule(LintRule): + num = "E902" + name = "missing-tag" + desc = "tags present in msgid aren't present in msgstr" - # If language codes were specified on the command line, filter by those. - filters = sys.argv[1:] + def __init__(self): + super().__init__() + self.tag_regex = re.compile(r"(? list[LintMessage]: + msgs = [] - spam = MessageChecker("url", r"https?://(?:[a-z0-9-_$@./&+]|(?:%[0-9a-fA-F][0-9a-fA-F]))+") - sprintf = MessageChecker("sprintf", r"%\([^)]+\)s") - tags = MessageChecker("tag", r"[^\\][^\\](\[[^]]+/?\])") + for trstr in linted_entry.strs: + msgid_tags = [] + for s in trstr.msgid_strings: + msgid_tags += self.tag_regex.findall(s) - # Check that there are no spam URLs. - # Loop through all messages in the .POT catalog for URLs. - # For each, check for the corresponding key in the .PO catalogs. - # If found, check that URLS in the .PO keys are the same as those in the .POT key. - for template_message in template_catalog: - spam.check(input_file_path, template_message, existing_translation_catalogs) - sprintf.check(input_file_path, template_message, existing_translation_catalogs) - tags.check(input_file_path, template_message, existing_translation_catalogs) + msgstr_tags = self.tag_regex.findall(trstr.msgstr_string) + if not msgstr_tags: + continue - if VERBOSE: - print(f"Done checking {input_file_path}") + missing_in_msgstr = [tag for tag in msgid_tags if tag not in msgstr_tags] + if missing_in_msgstr: + msgs.append( + LintMessage( + ERROR, + linted_entry.poentry.linenum, + 0, + self.num, + f"missing tags: {', '.join(sorted(missing_in_msgstr))}", + linted_entry.poentry, + ) + ) + + return msgs -def main(): - print( - "\n\tWARNING: Remember to regenerate the POT files with “update_templates.py” " - "before you run this script.\n\tPOT files are not in the repository.\n" - ) - found_pots = 0 - for root, _folders, filenames in os.walk(PROJECT_ROOT_DIRECTORY): - for filename in filenames: - if ( - len(filename) > 4 - and filename[-4:] == ".pot" - and os.path.basename(root) == L10N_FOLDER_NAME - ): - found_pots += 1 - multiprocessing.Process( - target=check_translations, args=(os.path.join(root, filename),) - ).start() - if found_pots == 0: - print( - "This script did not work because no '.pot' files were found. " - "Please run 'update_templates.py' to generate the '.pot' files, " - "and run 'pull_translations.py' to pull the latest translations from Transifex. " - "Then you can run this script to check for spam in translations." - ) +class InvalidTagRule(LintRule): + num = "E903" + name = "invalid-tag" + desc = "tags not present in msgid are present in msgstr" + + def __init__(self): + super().__init__() + self.tag_regex = re.compile(r"(? list[LintMessage]: + msgs = [] + + for trstr in linted_entry.strs: + msgid_tags = [] + for s in trstr.msgid_strings: + msgid_tags += self.tag_regex.findall(s) + + msgstr_tags = self.tag_regex.findall(trstr.msgstr_string) + if not msgstr_tags: + continue + + not_in_msgid = [tag for tag in msgstr_tags if tag not in msgid_tags] + if not_in_msgid: + msgs.append( + LintMessage( + ERROR, + linted_entry.poentry.linenum, + 0, + self.num, + f"invalid tags: {', '.join(sorted(not_in_msgid))}", + linted_entry.poentry, + ) + ) + + return msgs + + +def get_relative_path(path: Path) -> Path: + """Get relative path to project directory. + + If the path isn't below the project directory, return the unchanged path. + """ + try: + return path.relative_to(PROJECT_ROOT_DIRECTORY) + except ValueError: + return path + + +def get_po_files() -> Generator[Path, None, None]: + """Return a list of PO- and POT-files in the project.""" + for l10n_dir in Path(PROJECT_ROOT_DIRECTORY).glob(f"**/{L10N_FOLGER_NAME}"): + yield from chain(l10n_dir.glob("*.po"), l10n_dir.glob("*.pot")) + + +def lint_files(paths: list[Path]) -> Generator[tuple[Path, LintMessage], None, None]: + """Lint files and return results.""" + varformat = get_available_formats().keys() + lint_rules = get_lint_rules() + del lint_rules["W302"] + template_lint_rules = get_template_lint_rules() + + linter = Linter(varformat, lint_rules) + template_linter = TemplateLinter(varformat, template_lint_rules) + + for path in paths: + if path.suffix == ".po": + results = linter.verify_file(path) + elif path.suffix == ".pot": + results = template_linter.verify_file(path) + else: + raise ValueError(f"Unexpected file type for {path}") + for result in results: + yield path, result + + +def print_results( + results: Generator[tuple[Path, LintMessage], None, None], num_files: int +) -> None: + """Format results and print them to stdout.""" + num_warnings = 0 + num_errors = 0 + for path, result in results: + if result.kind == "err": + code_color = "red" + num_errors += 1 + elif result.kind == "warn": + code_color = "yellow" + num_warnings += 1 + else: + raise ValueError(f"Unexpected lint result code {result.kind}") + + click.secho(f"{click.format_filename(get_relative_path(path))}", bold=True, nl=False) + click.echo(f":{result.line}: ", nl=False) + click.echo(click.style(result.code, fg=code_color) + f" {result.msg}") + click.echo(withlines(result.poentry.linenum, result.poentry.original)) + click.echo() + + click.echo(f"Found {num_errors} errors and {num_warnings} warnings in {num_files} files.") + if num_errors > 0: + sys.exit(1) + + +def run( + locale: str, files: list[Path] +) -> tuple[Generator[tuple[Path, LintMessage], None, None], int]: + """Collect files to lint and run linting.""" + if files: + po_files = [] + for f in files: + if f.suffix in [".po", ".pot"]: + po_files.append(f) + else: + logger.warning( + "%s is not a valid PO- or POT-file. Skipping.", get_relative_path(f) + ) + else: + po_files = list(get_po_files()) + + if locale: + files_to_check = [] + for f in po_files: + lang_code = f.stem.split(".")[0] + if lang_code == locale: + files_to_check.append(f) + else: + files_to_check = po_files + + if len(files_to_check) == 0: + raise ClickException("Found no files to check.") + + return lint_files(files_to_check), len(files_to_check) + + +@click.command() +@click.option("--locale", help="Only check translations for the given locale") +@click.argument("files", nargs=-1, type=click.Path(exists=True, resolve_path=True, path_type=Path)) +def cli(locale, files): + """Lint PO- and POT-files. + + Provide one or multiple FILES to check. If omitted all files in + the project will be checked. + """ + lint_results, num_checked_files = run(locale, files) + print_results(lint_results, num_checked_files) if __name__ == "__main__": - main() + cli() diff --git a/source/tools/i18n/requirements.txt b/source/tools/i18n/requirements.txt index 047384c766..9762b3805e 100644 --- a/source/tools/i18n/requirements.txt +++ b/source/tools/i18n/requirements.txt @@ -1,2 +1,4 @@ babel~=2.12 +click~=8.1 +dennis~=1.1 lxml~=4.5 diff --git a/source/tools/i18n/tests/test_check_translations.py b/source/tools/i18n/tests/test_check_translations.py new file mode 100644 index 0000000000..018fe25c5a --- /dev/null +++ b/source/tools/i18n/tests/test_check_translations.py @@ -0,0 +1,112 @@ +from pathlib import Path +from tempfile import NamedTemporaryFile +from unittest import TestCase + +from check_translations import lint_files + + +class TestCheckTranslations(TestCase): + def test_var_missing_in_msgstr(self): + with NamedTemporaryFile(mode="w+", suffix=".po") as f: + f.write(r""" +#: simulation/components/EntityLimits.js:195 +#, javascript-format +msgid "%(category)s build limit of %(limit)s reached" +msgstr "Baulimit von %(limit)s erreicht" + """) + f.flush() + path_to_lint = Path(f.name) + lint_results = list(lint_files([path_to_lint])) + + self.assertEqual(1, len(lint_results)) + linted_file, lint_result = lint_results[0] + self.assertEqual(linted_file, path_to_lint) + self.assertEqual("W202", lint_result.code) + self.assertEqual("warn", lint_result.kind) + self.assertEqual("missing variables: %(category)s", lint_result.msg) + + def test_var_missing_in_msgid(self): + with NamedTemporaryFile(mode="w+", suffix=".po") as f: + f.write(r""" +#: simulation/components/EntityLimits.js:195 +#, javascript-format +msgid "build limit of %(limit)s reached" +msgstr "%(category)s Baulimit von %(limit)s erreicht" + """) + f.flush() + path_to_lint = Path(f.name) + lint_results = list(lint_files([path_to_lint])) + + self.assertEqual(1, len(lint_results)) + linted_file, lint_result = lint_results[0] + self.assertEqual(linted_file, path_to_lint) + self.assertEqual("E201", lint_result.code) + self.assertEqual("err", lint_result.kind) + self.assertEqual("invalid variables: %(category)s", lint_result.msg) + + def test_spam_url_detection(self): + with NamedTemporaryFile(mode="w+", suffix=".po") as f: + f.write(r""" +#: gui/incompatible_mods/incompatible_mods.xml:(caption):12 +msgid "Incompatible mods" +msgstr "Inkompatible Mods http://example.tld/" + """) + f.flush() + path_to_lint = Path(f.name) + lint_results = list(lint_files([path_to_lint])) + + self.assertEqual(1, len(lint_results)) + linted_file, lint_result = lint_results[0] + self.assertEqual(linted_file, path_to_lint) + self.assertEqual("E901", lint_result.code) + self.assertEqual("err", lint_result.kind) + self.assertEqual("translation contains spam URL: http://example.tld/", lint_result.msg) + + def test_missing_tag_in_msgstr(self): + with NamedTemporaryFile(mode="w+", suffix=".po") as f: + f.write(r""" +#: gui/prelobby/common/terms/Privacy_Policy.txt:6 +msgid "[font=\"sans-bold-14\"]1. Player name[/font]" +msgstr "[font=\"sans-bold-14\"]1. Spielername" + """) + f.flush() + path_to_lint = Path(f.name) + lint_results = list(lint_files([path_to_lint])) + + self.assertEqual(1, len(lint_results)) + linted_file, lint_result = lint_results[0] + self.assertEqual(linted_file, path_to_lint) + self.assertEqual("E902", lint_result.code) + self.assertEqual("err", lint_result.kind) + self.assertEqual("missing tags: [/font]", lint_result.msg) + + def test_missing_tag_in_msgstr_escaped(self): + with NamedTemporaryFile(mode="w+", suffix=".po") as f: + f.write(r""" +#: gui/pregame/MainMenuItems.js:194 +msgid "Launch the multiplayer lobby. \\[DISABLED BY BUILD]" +msgstr "Multiplayer-Lobby starten. \\[IN BUILD DEAKTIVIERT]" + """) + f.flush() + path_to_lint = Path(f.name) + lint_results = list(lint_files([path_to_lint])) + + self.assertEqual(0, len(lint_results)) + + def test_invalid_tag_in_msgstr(self): + with NamedTemporaryFile(mode="w+", suffix=".po") as f: + f.write(r""" +#: gui/prelobby/common/terms/Privacy_Policy.txt:6 +msgid "[font=\"sans-bold-14\"]1. Player name" +msgstr "[font=\"sans-bold-14\"]1. Spielername[/font]" + """) + f.flush() + path_to_lint = Path(f.name) + lint_results = list(lint_files([path_to_lint])) + + self.assertEqual(1, len(lint_results)) + linted_file, lint_result = lint_results[0] + self.assertEqual(linted_file, path_to_lint) + self.assertEqual("E903", lint_result.code) + self.assertEqual("err", lint_result.kind) + self.assertEqual("invalid tags: [/font]", lint_result.msg)