Skip to content

Commit

Permalink
feat(checks): add reStructuredText syntax checking
Browse files Browse the repository at this point in the history
Use docutils to parse the string and show discovered errors.

Issue #8446
  • Loading branch information
nijel committed Jan 16, 2025
1 parent 5054173 commit be16ca8
Show file tree
Hide file tree
Showing 10 changed files with 261 additions and 18 deletions.
1 change: 1 addition & 0 deletions docs/changes.rst
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ Not yet released.
**New features**

* :ref:`check-rst-references` check to validate reStructuredText references.
* :ref:`check-rst-syntax` check to validate reStructuredText syntax.

**Improvements**

Expand Down
15 changes: 15 additions & 0 deletions docs/user/checks.rst
Original file line number Diff line number Diff line change
Expand Up @@ -1385,6 +1385,21 @@ translation file or defined manually using ``regex`` flag:
regex:^foo|bar$
.. _check-rst-syntax:

reStructuredText syntax error
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

.. versionadded:: 5.10

:Summary: reStructuredText syntax error in the translation.
:Scope: translated strings
:Check class: ``weblate.checks.markup.RSTSyntaxCheck``
:Check identifier: ``rst-syntax``
:Flag to enable: ``rst-text``
:Flag to ignore: ``ignore-rst-syntax``

reStructuredText syntax error in the translation.

.. _check-reused:

Expand Down
6 changes: 4 additions & 2 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,8 @@ types = [
"types-openpyxl==3.1.5.20241225",
"types-Pillow==10.2.0.20240822",
"types-python-dateutil==2.9.0.20241206",
"types-requests==2.32.0.20241016"
"types-requests==2.32.0.20241016",
"types-docutils==0.21.0.20241128"
]

[project]
Expand Down Expand Up @@ -154,7 +155,8 @@ dependencies = [
"unidecode>=1.3.8,<1.4",
"user-agents>=2.0,<2.3",
"weblate-language-data>=2024.14",
"weblate-schemas==2024.2"
"weblate-schemas==2024.2",
"docutils>=0.21.2,<0.22"
]
description = "A web-based continuous localization system with tight version control integration"
keywords = [
Expand Down
15 changes: 15 additions & 0 deletions uv.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

33 changes: 23 additions & 10 deletions weblate/checks/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,12 @@
from __future__ import annotations

import re
from typing import TYPE_CHECKING, Any, Literal
from typing import TYPE_CHECKING, Any, TypedDict

import sentry_sdk
from django.http import Http404
from django.utils.html import format_html, format_html_join
from django.utils.safestring import mark_safe
from django.utils.translation import gettext
from lxml import etree
from siphashc import siphash
Expand All @@ -28,7 +29,11 @@
from .flags import Flags
from .models import Check

MissingExtraDict = dict[Literal["missing", "extra"], list[str]]

class MissingExtraDict(TypedDict, total=False):
    """Structured check result; ``total=False`` makes every key optional."""

    # Values handed to ``get_missing_text`` by ``format_result``.
    missing: list[str]
    # Values handed to ``get_extra_text`` by ``format_result``.
    extra: list[str]
    # Messages handed to ``get_errors_text`` by ``format_result``.
    errors: list[str]


class BaseCheck:
Expand Down Expand Up @@ -321,19 +326,27 @@ def get_extra_text(self, values: Iterable[str]) -> StrOrPromise:
gettext("The following format strings are extra: {}"), values
)

def get_errors_text(self, values: Iterable[str]) -> StrOrPromise:
    """Render a translated heading followed by each error, joined by ``<br />``."""
    heading = gettext("The following errors were found:")
    # One single-item tuple per output line, heading first.
    rows = [(line,) for line in (heading, *values)]
    return format_html_join(
        mark_safe("<br />"),  # noqa: S308
        "{}",
        rows,
    )

def format_string(self, string: str) -> str:
    """Return the display form of a parsed value; this implementation is the identity."""
    return string

def format_result(self, result: MissingExtraDict) -> Iterable[StrOrPromise]:
    """Yield one human readable message per non-empty category in *result*.

    All keys of *result* are optional, so they are read with ``.get()``;
    indexing would raise ``KeyError`` for a result carrying only ``errors``.
    Duplicates are dropped while keeping first-seen order, so the rendered
    message is stable between runs (a ``set`` would reorder the items).
    """
    if missing := result.get("missing"):
        yield self.get_missing_text(
            self.format_string(item) for item in dict.fromkeys(missing)
        )
    if extra := result.get("extra"):
        yield self.get_extra_text(
            self.format_string(item) for item in dict.fromkeys(extra)
        )
    if errors := result.get("errors"):
        # Errors are passed through as-is, without format_string().
        yield self.get_errors_text(list(dict.fromkeys(errors)))


class SourceCheck(BaseCheck):
Expand Down
127 changes: 121 additions & 6 deletions weblate/checks/markup.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@

import re
from collections import defaultdict
from functools import cache, lru_cache
from typing import TYPE_CHECKING

from django.core.exceptions import ValidationError
Expand All @@ -14,6 +15,9 @@
from django.utils.html import format_html_join
from django.utils.safestring import mark_safe
from django.utils.translation import gettext_lazy
from docutils import utils
from docutils.core import Publisher
from docutils.nodes import Element, system_message

from weblate.checks.base import MissingExtraDict, TargetCheck
from weblate.utils.html import (
Expand Down Expand Up @@ -82,6 +86,8 @@
":kbd:",
}

RST_ROLE_RE = re.compile(r"""Unknown interpreted text role "([^"]*)"\.""")


def strip_entities(text):
"""Strip all HTML entities (we don't care about them)."""
Expand Down Expand Up @@ -371,18 +377,21 @@ def check_single(self, source: str, target: str, unit: Unit):
return cleaned_target != target


class RSTReferencesCheck(TargetCheck):
check_id = "rst-references"
name = gettext_lazy("Inconsistent reStructuredText references")
description = gettext_lazy(
"Inconsistent reStructuredText term references in the translated message."
)
class RSTBaseCheck(TargetCheck):
    """Shared base for reStructuredText checks: disabled by default, ``rst-text`` flag."""

    default_disabled = True

    def __init__(self) -> None:
        """Initialise the parent check, then set the enabling flag name."""
        super().__init__()
        # Flag string associated with enabling checks derived from this class.
        self.enable_string = "rst-text"


class RSTReferencesCheck(RSTBaseCheck):
check_id = "rst-references"
name = gettext_lazy("Inconsistent reStructuredText references")
description = gettext_lazy(
"Inconsistent reStructuredText term references in the translated message."
)

def extract_references(self, text: str) -> dict[str, str]:
return {
role
Expand Down Expand Up @@ -437,3 +446,109 @@ def check_highlight(self, source: str, unit: Unit):
return
for match in RST_REF_MATCH.finditer(source):
yield match.start(0), match.end(0), match.group(0)


@cache
def get_rst_publisher() -> Publisher:
    """Create the docutils publisher once; later calls return the cached instance."""
    rst_publisher = Publisher(settings=None)
    # Standalone reader, reStructuredText parser and the "null" writer.
    rst_publisher.set_components("standalone", "restructuredtext", "null")
    # NOTE(review): halt_level=5 presumably sits above every docutils severity
    # so that processing never aborts - confirm against docutils configuration.
    rst_publisher.get_settings(halt_level=5)
    return rst_publisher


@lru_cache(maxsize=512)
def validate_rst_snippet(
    snippet: str, source_tags: tuple[str, ...] | None = None
) -> tuple[list[str], list[str]]:
    """Parse *snippet* as reStructuredText and collect the reported problems.

    :param snippet: Text to validate.
    :param source_tags: Role names to tolerate; an "Unknown interpreted text
        role" message for one of these is not reported as an error. Must be
        hashable (a tuple of any length) because the call is memoized.
    :return: ``(errors, roles)`` where ``errors`` are the kept system
        messages and ``roles`` are all unknown role names seen, including
        the tolerated ones.

    The returned lists are the objects stored in the ``lru_cache``; callers
    must treat them as read-only or later cache hits will see the mutation.
    """
    publisher = get_rst_publisher()
    document = utils.new_document(None, publisher.settings)
    # Messages are gathered through the observer below, not a reporter stream.
    document.reporter.stream = None

    errors: list[str] = []
    roles: list[str] = []

    def error_collector(data: system_message) -> None:
        """Record the message unless it matches one of the ignore rules."""
        message = Element.astext(data)
        if match := RST_ROLE_RE.match(message):
            role = match.group(1)
            roles.append(role)
            if source_tags is not None and role in source_tags:
                # Skip if the role was found in the source
                return
        elif message.startswith("Unknown target name:") and "`" not in message:
            # Translating targets is okay, just catch obvious errors
            return
        elif message.startswith("No role entry"):
            # Ignore as this duplicates Unknown interpreted in our case
            return
        errors.append(message)

    document.reporter.attach_observer(error_collector)
    publisher.reader.parser.parse(snippet, document)
    transformer = document.transformer
    transformer.populate_from_components(
        (
            publisher.source,
            publisher.reader,
            publisher.reader.parser,
            publisher.writer,
            publisher.destination,
        )
    )
    # NOTE(review): this loop appears to replicate docutils'
    # Transformer.apply_transforms() by hand - confirm when upgrading docutils.
    while transformer.transforms:
        if not transformer.sorted:
            # Unsorted initially, and whenever a transform is added.
            transformer.transforms.sort()
            transformer.transforms.reverse()
            transformer.sorted = True
        priority, transform_class, pending, kwargs = transformer.transforms.pop()
        transform = transform_class(transformer.document, startnode=pending)
        transform.apply(**kwargs)
        transformer.applied.append((priority, transform_class, pending, kwargs))
    return errors, roles


class RSTSyntaxCheck(RSTBaseCheck):
    """Report reStructuredText problems found by ``validate_rst_snippet`` in the target."""

    check_id = "rst-syntax"
    name = gettext_lazy("reStructuredText syntax error")
    description = gettext_lazy("reStructuredText syntax error in the translation.")

    def check_single(
        self, source: str, target: str, unit: Unit
    ) -> bool | MissingExtraDict:
        """Validate *target*, tolerating unknown roles that *source* also uses.

        Returns ``{"errors": [...]}`` when problems were found, else ``False``.
        """
        # Only the role names from the source matter; its own errors are dropped.
        _, source_roles = validate_rst_snippet(source)
        problems, _ = validate_rst_snippet(target, tuple(source_roles))
        return {"errors": problems} if problems else False

    def get_description(self, check_obj: Check) -> StrOrPromise:
        """Describe the failure, listing errors merged over all plural forms.

        Falls back to the parent description when nothing can be listed.
        """
        unit = check_obj.unit
        merged: MissingExtraDict = defaultdict(list)

        # Merge plurals
        for outcome in self.check_target_generator(
            unit.get_source_plurals(), unit.get_target_plurals(), unit
        ):
            if not isinstance(outcome, dict):
                continue
            for key, values in outcome.items():
                merged[key].extend(values)

        messages: list[StrOrPromise] = (
            list(self.format_result(merged)) if merged else []
        )
        if not messages:
            return super().get_description(check_obj)
        return format_html_join(
            mark_safe("<br />"),  # noqa: S308
            "{}",
            ((message,) for message in messages),
        )

    def check_highlight(self, source: str, unit: Unit):
        """Yield ``(start, end, text)`` for each ``RST_REF_MATCH`` hit in *source*."""
        if not self.should_skip(unit):
            yield from (
                (found.start(0), found.end(0), found.group(0))
                for found in RST_REF_MATCH.finditer(source)
            )
1 change: 1 addition & 0 deletions weblate/checks/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -102,6 +102,7 @@ class WeblateChecksConf(AppConf):
"weblate.checks.markup.URLCheck",
"weblate.checks.markup.SafeHTMLCheck",
"weblate.checks.markup.RSTReferencesCheck",
"weblate.checks.markup.RSTSyntaxCheck",
"weblate.checks.placeholders.PlaceholderCheck",
"weblate.checks.placeholders.RegexCheck",
"weblate.checks.duplicate.DuplicateCheck",
Expand Down
Loading

0 comments on commit be16ca8

Please sign in to comment.