feat(checks): add reStructuredText syntax checking

Use docutils to parse the string and show discovered errors. Issue #8446
WeblateOrg · Jan 16, 2025 · be16ca8 · be16ca8
1 parent 5054173
commit be16ca8
Show file tree

Hide file tree

Showing 10 changed files with 261 additions and 18 deletions.
diff --git a/docs/changes.rst b/docs/changes.rst
@@ -6,6 +6,7 @@ Not yet released.
 **New features**
 
 * :ref:`check-rst-references` check to validate reStructuredText references.
+* :ref:`check-rst-syntax` check to validate reStructuredText syntax.
 
 **Improvements**
 

diff --git a/docs/user/checks.rst b/docs/user/checks.rst
@@ -1385,6 +1385,21 @@ translation file or defined manually using ``regex`` flag:
 
    regex:^foo|bar$
 
+.. _check-rst-syntax:
+
+reStructuredText syntax error
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. versionadded:: 5.10
+
+:Summary: reStructuredText syntax error in the translation.
+:Scope: translated strings
+:Check class: ``weblate.checks.markup.RSTSyntaxCheck``
+:Check identifier: ``rst-syntax``
+:Flag to enable: ``rst-text``
+:Flag to ignore: ``ignore-rst-syntax``
+
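+The translation is parsed as reStructuredText using docutils and any syntax
+errors found are reported. An unknown interpreted text role is not reported
+when the same role is also used in the source string.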
 
 .. _check-reused:
 

diff --git a/pyproject.toml b/pyproject.toml
@@ -65,7 +65,8 @@ types = [
   "types-openpyxl==3.1.5.20241225",
   "types-Pillow==10.2.0.20240822",
   "types-python-dateutil==2.9.0.20241206",
-  "types-requests==2.32.0.20241016"
+  "types-requests==2.32.0.20241016",
+  "types-docutils==0.21.0.20241128"
 ]
 
 [project]
@@ -154,7 +155,8 @@ dependencies = [
   "unidecode>=1.3.8,<1.4",
   "user-agents>=2.0,<2.3",
   "weblate-language-data>=2024.14",
-  "weblate-schemas==2024.2"
+  "weblate-schemas==2024.2",
+  "docutils>=0.21.2,<0.22"
 ]
 description = "A web-based continuous localization system with tight version control integration"
 keywords = [

diff --git a/uv.lock b/uv.lock
diff --git a/weblate/checks/base.py b/weblate/checks/base.py
@@ -5,11 +5,12 @@
 from __future__ import annotations
 
 import re
-from typing import TYPE_CHECKING, Any, Literal
+from typing import TYPE_CHECKING, Any, TypedDict
 
 import sentry_sdk
 from django.http import Http404
 from django.utils.html import format_html, format_html_join
+from django.utils.safestring import mark_safe
 from django.utils.translation import gettext
 from lxml import etree
 from siphashc import siphash
@@ -28,7 +29,11 @@
     from .flags import Flags
     from .models import Check
 
-MissingExtraDict = dict[Literal["missing", "extra"], list[str]]
+
+class MissingExtraDict(TypedDict, total=False):
+    """Check result payload; every key is optional (``total=False``)."""
+
+    missing: list[str]
+    extra: list[str]
+    # Free-form error messages, rendered by ``get_errors_text``.
+    errors: list[str]
 
 
 class BaseCheck:
@@ -321,19 +326,27 @@ def get_extra_text(self, values: Iterable[str]) -> StrOrPromise:
             gettext("The following format strings are extra: {}"), values
         )
 
+    def get_errors_text(self, values: Iterable[str]) -> StrOrPromise:
+        """Render a heading followed by each error, joined with ``<br />``."""
+        heading = gettext("The following errors were found:")
+        # format_html_join expects one argument tuple per rendered item.
+        rows = [(line,) for line in (heading, *values)]
+        return format_html_join(mark_safe("<br />"), "{}", rows)  # noqa: S308
+
     def format_string(self, string: str) -> str:
         """Format parsed format string into human readable value."""
         return string
 
     def format_result(self, result: MissingExtraDict) -> Iterable[StrOrPromise]:
-        if result["missing"]:
-            yield self.get_missing_text(
-                self.format_string(x) for x in set(result["missing"])
-            )
-        if result["extra"]:
-            yield self.get_extra_text(
-                self.format_string(x) for x in set(result["extra"])
-            )
+        """Yield one formatted message per non-empty key of ``result``.
+
+        Keys are optional; absent or empty ones produce no output.
+        """
+        if missing := result.get("missing"):
+            yield self.get_missing_text(self.format_string(x) for x in set(missing))
+        if extra := result.get("extra"):
+            yield self.get_extra_text(self.format_string(x) for x in set(extra))
+        if errors := result.get("errors"):
+            # dict.fromkeys() drops duplicates like set() would, but keeps the
+            # messages in the order they were reported, so output is stable.
+            yield self.get_errors_text(dict.fromkeys(errors))
 
 
 class SourceCheck(BaseCheck):

diff --git a/weblate/checks/markup.py b/weblate/checks/markup.py
@@ -6,6 +6,7 @@
 
 import re
 from collections import defaultdict
+from functools import cache, lru_cache
 from typing import TYPE_CHECKING
 
 from django.core.exceptions import ValidationError
@@ -14,6 +15,9 @@
 from django.utils.html import format_html_join
 from django.utils.safestring import mark_safe
 from django.utils.translation import gettext_lazy
+from docutils import utils
+from docutils.core import Publisher
+from docutils.nodes import Element, system_message
 
 from weblate.checks.base import MissingExtraDict, TargetCheck
 from weblate.utils.html import (
@@ -82,6 +86,8 @@
     ":kbd:",
 }
 
+# Matches the "Unknown interpreted text role" message; group 1 is the role name.
+RST_ROLE_RE = re.compile(r"""Unknown interpreted text role "([^"]*)"\.""")
+
 
 def strip_entities(text):
     """Strip all HTML entities (we don't care about them)."""
@@ -371,18 +377,21 @@ def check_single(self, source: str, target: str, unit: Unit):
         return cleaned_target != target
 
 
-class RSTReferencesCheck(TargetCheck):
-    check_id = "rst-references"
-    name = gettext_lazy("Inconsistent reStructuredText references")
-    description = gettext_lazy(
-        "Inconsistent reStructuredText term references in the translated message."
-    )
+class RSTBaseCheck(TargetCheck):
+    """Shared base for reStructuredText checks, enabled by ``rst-text``."""
+
     default_disabled = True
 
     def __init__(self) -> None:
         super().__init__()
         self.enable_string = "rst-text"
 
+
+class RSTReferencesCheck(RSTBaseCheck):
+    check_id = "rst-references"
+    name = gettext_lazy("Inconsistent reStructuredText references")
+    description = gettext_lazy(
+        "Inconsistent reStructuredText term references in the translated message."
+    )
+
     def extract_references(self, text: str) -> dict[str, str]:
         return {
             role
@@ -437,3 +446,109 @@ def check_highlight(self, source: str, unit: Unit):
             return
         for match in RST_REF_MATCH.finditer(source):
             yield match.start(0), match.end(0), match.group(0)
+
+
+@cache
+def get_rst_publisher() -> Publisher:
+    """Return the docutils publisher used for syntax validation.
+
+    The ``@cache`` decorator means it is built once and then reused.
+    """
+    rst_publisher = Publisher(settings=None)
+    rst_publisher.set_components("standalone", "restructuredtext", "null")
+    # NOTE(review): halt_level=5 presumably sits above every docutils severity
+    # so processing never aborts on an error - confirm against docutils docs.
+    rst_publisher.get_settings(halt_level=5)
+    return rst_publisher
+
+
+@lru_cache(maxsize=512)
+def validate_rst_snippet(
+    snippet: str, source_tags: tuple[str, ...] | None = None
+) -> tuple[list[str], list[str]]:
+    """Parse ``snippet`` as reStructuredText and collect reported problems.
+
+    ``source_tags`` lists role names whose "Unknown interpreted text role"
+    message is not counted as an error. It is a tuple because the arguments
+    have to be hashable for ``lru_cache``.
+
+    Returns ``(errors, roles)``: the messages that were kept, and every
+    unknown role name that was seen (including the skipped ones).
+
+    Results are memoized, so the returned lists are shared between callers
+    and must not be mutated.
+    """
+    publisher = get_rst_publisher()
+    document = utils.new_document(None, publisher.settings)
+    # No output stream; messages are gathered by the observer below.
+    document.reporter.stream = None
+
+    errors: list[str] = []
+    roles: list[str] = []
+
+    def error_collector(data: system_message) -> None:
+        """Save the error."""
+        message = Element.astext(data)
+        if match := RST_ROLE_RE.match(message):
+            role = match.group(1)
+            # Recorded even when skipped, so the caller learns the roles used.
+            roles.append(role)
+            if source_tags is not None and role in source_tags:
+                # Skip if the role was found in the source
+                return
+        elif message.startswith("Unknown target name:") and "`" not in message:
+            # Translating targets is okay, just catch obvious errors
+            return
+        elif message.startswith("No role entry"):
+            # Ignore as this duplicates Unknown interpreted in our case
+            return
+        errors.append(message)
+
+    document.reporter.attach_observer(error_collector)
+    publisher.reader.parser.parse(snippet, document)
+    transformer = document.transformer
+    transformer.populate_from_components(
+        (
+            publisher.source,
+            publisher.reader,
+            publisher.reader.parser,
+            publisher.writer,
+            publisher.destination,
+        )
+    )
+    # NOTE(review): this loop appears to replicate docutils'
+    # Transformer.apply_transforms() by hand - confirm it stays in sync when
+    # the docutils version pin is changed.
+    while transformer.transforms:
+        if not transformer.sorted:
+            # Unsorted initially, and whenever a transform is added.
+            transformer.transforms.sort()
+            transformer.transforms.reverse()
+            transformer.sorted = 1
+        priority, transform_class, pending, kwargs = transformer.transforms.pop()
+        transform = transform_class(transformer.document, startnode=pending)
+        transform.apply(**kwargs)
+        transformer.applied.append((priority, transform_class, pending, kwargs))
+    return errors, roles
+
+
+class RSTSyntaxCheck(RSTBaseCheck):
+    """Report reStructuredText syntax errors found in the translation."""
+
+    check_id = "rst-syntax"
+    name = gettext_lazy("reStructuredText syntax error")
+    description = gettext_lazy("reStructuredText syntax error in the translation.")
+
+    def check_single(
+        self, source: str, target: str, unit: Unit
+    ) -> bool | MissingExtraDict:
+        """Validate ``target``, tolerating unknown roles already in ``source``."""
+        # Only the role names matter for the source; its own errors are ignored.
+        source_roles = validate_rst_snippet(source)[1]
+        target_errors = validate_rst_snippet(target, tuple(source_roles))[0]
+        return {"errors": target_errors} if target_errors else False
+
+    def get_description(self, check_obj: Check) -> StrOrPromise:
+        """Describe the errors of all plural forms, one message per line."""
+        unit = check_obj.unit
+        merged: MissingExtraDict = defaultdict(list)
+
+        # Collect the per-plural results into a single mapping.
+        for outcome in self.check_target_generator(
+            unit.get_source_plurals(), unit.get_target_plurals(), unit
+        ):
+            if not isinstance(outcome, dict):
+                continue
+            for key, messages in outcome.items():
+                merged[key].extend(messages)
+
+        lines: list[StrOrPromise] = list(self.format_result(merged)) if merged else []
+        if not lines:
+            # Nothing specific to show, use the generic description.
+            return super().get_description(check_obj)
+        return format_html_join(
+            mark_safe("<br />"),  # noqa: S308
+            "{}",
+            ((line,) for line in lines),
+        )
+
+    def check_highlight(self, source: str, unit: Unit):
+        """Yield (start, end, text) for each RST_REF_MATCH hit in source."""
+        if not self.should_skip(unit):
+            for found in RST_REF_MATCH.finditer(source):
+                yield found.start(0), found.end(0), found.group(0)
diff --git a/weblate/checks/models.py b/weblate/checks/models.py
@@ -102,6 +102,7 @@ class WeblateChecksConf(AppConf):
         "weblate.checks.markup.URLCheck",
         "weblate.checks.markup.SafeHTMLCheck",
         "weblate.checks.markup.RSTReferencesCheck",
+        "weblate.checks.markup.RSTSyntaxCheck",
         "weblate.checks.placeholders.PlaceholderCheck",
         "weblate.checks.placeholders.RegexCheck",
         "weblate.checks.duplicate.DuplicateCheck",