Skip to content

Commit

Permalink
Parametrize language in language detection and coordinate extraction.
Browse files Browse the repository at this point in the history
  • Loading branch information
redur committed Jun 3, 2024
1 parent 71bfee2 commit 5683f92
Show file tree
Hide file tree
Showing 4 changed files with 25 additions and 16 deletions.
12 changes: 12 additions & 0 deletions config/matching_params.yml
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,8 @@ depth_column_params: # these params should be optimized as soon as there is rel
noise_count_threshold: 1.25
noise_count_offset: 2.5

default_language: de

material_description:
de:
including_expressions:
Expand Down Expand Up @@ -102,3 +104,13 @@ material_description:
- piezometre
- profondeur
- désignation


coordinate_keys:
- Koordinaten
- Koordinate
- Koord.
- coordinates
- coordinate
- coordonnés
- coordonn
4 changes: 3 additions & 1 deletion src/stratigraphy/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -162,7 +162,9 @@ def start_pipeline(
predictions[filename] = {}

with fitz.Document(in_path) as doc:
language = detect_language_of_document(doc)
language = detect_language_of_document(
doc, matching_params["default_language"], matching_params["material_description"].keys()
)
predictions[filename]["language"] = language
coordinate_extractor = CoordinateExtractor(doc)
coordinates = coordinate_extractor.extract_coordinates()
Expand Down
13 changes: 3 additions & 10 deletions src/stratigraphy/util/coordinate_extraction.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,8 @@
import fitz
import regex

from stratigraphy.util.util import read_params

logger = logging.getLogger(__name__)

COORDINATE_ENTRY_REGEX = r"(?:([12])[\.\s'‘’]{0,2})?(\d{3})[\.\s'‘’]{0,2}(\d{3})\.?\d?"
Expand Down Expand Up @@ -130,16 +132,7 @@ def __init__(self, document: fitz.Document):
document (fitz.Document): A PDF document.
"""
self.doc = document
self.coordinate_keys = [
"Koordinaten",
"Koordinate",
"Koord.",
"coordinates",
"coordinate",
"coordonnés",
"coordonnes",
]
# TODO: extend coordinate keys with other languages
self.coordinate_keys = read_params("matching_params.yml")["coordinate_keys"]

def find_coordinate_key(self, text: str, allowed_errors: int = 3) -> str | None: # noqa: E501
"""Finds the location of a coordinate key in a string of text.
Expand Down
12 changes: 7 additions & 5 deletions src/stratigraphy/util/language_detection.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,21 +23,23 @@ def extract_text_from_document(doc: fitz.Document) -> str:
return "".join(e for e in text if (e.isalnum() or e.isspace()) and not e.isdigit())


def detect_language_of_document(doc: fitz.Document) -> str:
def detect_language_of_document(doc: fitz.Document, default_language: str, supported_languages: list) -> str:
"""Detects the language of a document.
Args:
doc (fitz.Document): The document to detect the language of.
default_language (str): The language to fall back to if language detection fails or the detected language is not supported.
supported_languages (list): A list of supported languages.
Returns:
str: The detected language of the document. Either "de" or "fr".
str: The detected language if it is one of supported_languages, otherwise default_language.
"""
text = extract_text_from_document(doc)
try:
language = detect(text)
except LangDetectException:
language = "de" # TODO: default language should be read from config
language = default_language

if language not in ["de", "fr"]: # TODO: This should be read from the config
language = "de"
if language not in supported_languages:
language = default_language
return language

0 comments on commit 5683f92

Please sign in to comment.