From 5683f9200ee10c8eb9d59aa0d3e3c8d6b3f2fc33 Mon Sep 17 00:00:00 2001 From: Renato Durrer Date: Mon, 3 Jun 2024 08:29:43 +0200 Subject: [PATCH] Parametrize language in language detection and coordinate extraction. --- config/matching_params.yml | 12 ++++++++++++ src/stratigraphy/main.py | 4 +++- src/stratigraphy/util/coordinate_extraction.py | 13 +++---------- src/stratigraphy/util/language_detection.py | 12 +++++++----- 4 files changed, 25 insertions(+), 16 deletions(-) diff --git a/config/matching_params.yml b/config/matching_params.yml index e3885dec..72b78bee 100644 --- a/config/matching_params.yml +++ b/config/matching_params.yml @@ -7,6 +7,8 @@ depth_column_params: # these params should be optimized as soon as there is rel noise_count_threshold: 1.25 noise_count_offset: 2.5 +default_language: de + material_description: de: including_expressions: @@ -102,3 +104,13 @@ material_description: - piezometre - profondeur - désignation + + +coordinate_keys: + - Koordinaten + - Koordinate + - Koord. + - coordinates + - coordinate + - coordonnés + - coordonn diff --git a/src/stratigraphy/main.py b/src/stratigraphy/main.py index 91b7b860..7f863fff 100644 --- a/src/stratigraphy/main.py +++ b/src/stratigraphy/main.py @@ -162,7 +162,9 @@ def start_pipeline( predictions[filename] = {} with fitz.Document(in_path) as doc: - language = detect_language_of_document(doc) + language = detect_language_of_document( + doc, matching_params["default_language"], matching_params["material_description"].keys() + ) predictions[filename]["language"] = language coordinate_extractor = CoordinateExtractor(doc) coordinates = coordinate_extractor.extract_coordinates() diff --git a/src/stratigraphy/util/coordinate_extraction.py b/src/stratigraphy/util/coordinate_extraction.py index e642741d..8328466b 100644 --- a/src/stratigraphy/util/coordinate_extraction.py +++ b/src/stratigraphy/util/coordinate_extraction.py @@ -7,6 +7,8 @@ import fitz import regex +from stratigraphy.util.util import read_params + logger = logging.getLogger(__name__) COORDINATE_ENTRY_REGEX = r"(?:([12])[\.\s'‘’]{0,2})?(\d{3})[\.\s'‘’]{0,2}(\d{3})\.?\d?" @@ -130,16 +132,7 @@ def __init__(self, document: fitz.Document): document (fitz.Document): A PDF document. """ self.doc = document - self.coordinate_keys = [ - "Koordinaten", - "Koordinate", - "Koord.", - "coordinates", - "coordinate", - "coordonnés", - "coordonnes", - ] - # TODO: extend coordinate keys with other languages + self.coordinate_keys = read_params("matching_params.yml")["coordinate_keys"] def find_coordinate_key(self, text: str, allowed_errors: int = 3) -> str | None: # noqa: E501 """Finds the location of a coordinate key in a string of text. diff --git a/src/stratigraphy/util/language_detection.py b/src/stratigraphy/util/language_detection.py index 8a313e2d..9e3562ca 100644 --- a/src/stratigraphy/util/language_detection.py +++ b/src/stratigraphy/util/language_detection.py @@ -23,21 +23,23 @@ def extract_text_from_document(doc: fitz.Document) -> str: return "".join(e for e in text if (e.isalnum() or e.isspace()) and not e.isdigit()) -def detect_language_of_document(doc: fitz.Document) -> str: +def detect_language_of_document(doc: fitz.Document, default_language: str, supported_languages: list) -> str: """Detects the language of a document. Args: doc (fitz.Document): The document to detect the language of. + default_language (str): The default language to use if the language detection fails. + supported_languages (list): A list of supported languages. Returns: - str: The detected language of the document. Either "de" or "fr". + str: The detected language of the document. One of supported_languages. """ text = extract_text_from_document(doc) try: language = detect(text) except LangDetectException: - language = "de" # TODO: default language should be read from config + language = default_language - if language not in ["de", "fr"]: # TODO: This should be read from the config - language = "de" + if language not in supported_languages: + language = default_language return language