Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Close LGVISIUM-63: Extraction of the groundwater logo using computer vision #83

Merged
Show file tree
Hide file tree
Changes from 17 commits
Commits
Show all changes
18 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .vscode/settings.json
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
"swissgeol",
"swisstopo",
"textblock",
"USCS",
"venv"
]
}
10 changes: 10 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,16 @@ With regard to the extraction of coordinates, the [Swiss coordinate systems](htt
#### Groundwater
With the current version of the code, groundwater can only be found at depths smaller than 200 meters. This threshold is defined in `src/stratigraphy/groundwater/groundwater_extraction.py` by the constant `MAX_DEPTH`.

Groundwater is extracted from the borehole documents in two main ways. The first technique matches groundwater-related keywords in the text extracted from the document (e.g., groundwater, groundwater-level). The second technique extracts the groundwater-related illustration from the document by using template matching. The matching of the groundwater illustration is disabled by default, as it significantly increases the runtime of the data extraction pipeline. You can enable this feature with the `IS_SEARCHING_GROUNDWATER_ILLUSTRATION` environment variable.

Add the following line to the `.env` file to turn on the matching of the groundwater illustration:

```
IS_SEARCHING_GROUNDWATER_ILLUSTRATION="True"
```

The matching of the groundwater illustration relies on the `scikit-image` library. This library is an optional dependency of this project, declared in the `groundwater_illustration_matching` group of optional dependencies in the `pyproject.toml` file. If you wish to use the template matching algorithm to determine the groundwater elevation, depth, and date, please install this dependency before running the code.

## Main contributors

* Stijn Vermeeren [@stijnvermeeren-swisstopo](https://www.github.com/stijnvermeeren-swisstopo) (swisstopo) - Project Lead
Expand Down
11 changes: 10 additions & 1 deletion config/matching_params.yml
Original file line number Diff line number Diff line change
Expand Up @@ -115,6 +115,14 @@ coordinate_keys:
- coordonnées
- coordonn

coordinate_fp_keys:


groundwater_fp_keys:
- Wasserstau
- Grundwasser-
- Grundwasserfassung
- GW/ # makes it possible to avoid false positives like "GW/" from the USCS Nomenclature columns

groundwater_keys:
# German
Expand All @@ -132,7 +140,6 @@ groundwater_keys:
- W SP
- Gr.W.spiegel
- GrW Sp
- Wsp.
- Wsp
- GW-Spiegel
- Grundwasser
Expand Down Expand Up @@ -170,3 +177,5 @@ elevation_keys:
- Ansatzhöhe
- Terrainkote

elevation_fp_keys:

7 changes: 5 additions & 2 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ dependencies = [
"PyMuPDF>=1.23.26",
"opencv-python-headless",
"quads>=1.1.0",
"numpy<2",
"numpy<2"
]

[project.optional-dependencies]
Expand All @@ -54,8 +54,11 @@ visualize = [
devtools = [
"tqdm"
]
groundwater_illustration_matching = [
dcleres marked this conversation as resolved.
Show resolved Hide resolved
"scikit-image==0.24.0"
]

all = ["swissgeol-boreholes-dataextraction[test, lint, experiment-tracking, visualize, devtools]"]
all = ["swissgeol-boreholes-dataextraction[test, lint, experiment-tracking, visualize, devtools, groundwater_illustration_matching]"]

[project.scripts]
boreholes-extract-all = "stratigraphy.main:click_pipeline"
Expand Down
35 changes: 27 additions & 8 deletions src/stratigraphy/data_extractor/data_extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@

import fitz
import regex
from stratigraphy.data_extractor.utility import get_lines_near_rect
from stratigraphy.lines.line import TextLine
from stratigraphy.util.util import read_params

Expand Down Expand Up @@ -40,6 +41,7 @@ class DataExtractor(ABC):

doc: fitz.Document = None
feature_keys: list[str] = None
feature_fp_keys: list[str] = None
feature_name: str = None

# How much to the left of a key do we look for the feature information, as a multiple of the key line width
Expand All @@ -48,6 +50,8 @@ class DataExtractor(ABC):
search_right_factor: float = 0
# How much below a key do we look for the feature information, as a multiple of the key line height
search_below_factor: float = 0
# How much above a key do we look for the feature information, as a multiple of the key line height
search_above_factor: float = 0

preprocess_replacements: dict[str, str] = {}

Expand All @@ -63,6 +67,7 @@ def __init__(self, document: fitz.Document):

self.doc = document
self.feature_keys = read_params("matching_params.yml")[f"{self.feature_name}_keys"]
self.feature_fp_keys = read_params("matching_params.yml")[f"{self.feature_name}_fp_keys"] or []

def preprocess(self, value: str) -> str:
for old, new in self.preprocess_replacements.items():
Expand Down Expand Up @@ -104,7 +109,8 @@ def find_feature_key(self, lines: list[TextLine], allowed_error_rate: float = 0.

for line in lines:
match = pattern.search(line.text)
if match:
if match and (not any(fp_key in line.text for fp_key in self.feature_fp_keys)):
# Check if there is a match and the matched string is not in the false positive list
matches.add(line)

return list(matches)
Expand All @@ -122,13 +128,7 @@ def get_lines_near_key(self, lines, key_line: TextLine) -> list[TextLine]:
list[TextLine]: The lines close to the key.
"""
key_rect = key_line.rect
elevation_search_rect = fitz.Rect(
key_rect.x0 - self.search_left_factor * key_rect.width,
key_rect.y0,
key_rect.x1 + self.search_right_factor * key_rect.width,
key_rect.y1 + self.search_below_factor * key_rect.height,
)
feature_lines = [line for line in lines if line.rect.intersects(elevation_search_rect)]
feature_lines = self.get_lines_near_rect(lines, key_rect)

# Insert key_line first and remove duplicates
feature_lines.insert(0, key_line)
Expand All @@ -138,3 +138,22 @@ def get_lines_near_key(self, lines, key_line: TextLine) -> list[TextLine]:
feature_lines_sorted = sorted(feature_lines, key=lambda line: abs(line.rect.y0 - key_line.rect.y0))

return feature_lines_sorted

def get_lines_near_rect(self, lines, rect: fitz.Rect) -> list[TextLine]:
    """Return the text lines that lie within this extractor's search area around a rectangle.

    The search area is built by the module-level ``get_lines_near_rect`` helper, using the
    left/right/above/below search factors configured on this extractor.

    Args:
        lines (list[TextLine]): Arbitrary text lines to search in.
        rect (fitz.Rect): The rectangle to search around.

    Returns:
        list[TextLine]: The lines close to the rectangle.
    """
    # Inside a method body this name resolves to the imported module-level helper,
    # not to this method, so there is no recursion here.
    nearby_lines = get_lines_near_rect(
        search_left_factor=self.search_left_factor,
        search_right_factor=self.search_right_factor,
        search_above_factor=self.search_above_factor,
        search_below_factor=self.search_below_factor,
        lines=lines,
        rect=rect,
    )
    return nearby_lines
36 changes: 36 additions & 0 deletions src/stratigraphy/data_extractor/utility.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
"""Utility functions for the data extractor module."""

from fitz import Rect
from stratigraphy.lines.line import TextLine


def get_lines_near_rect(
    search_left_factor: float,
    search_right_factor: float,
    search_above_factor: float,
    search_below_factor: float,
    lines: list[TextLine],
    rect: Rect,
) -> list[TextLine]:
    """Select the text lines whose rectangle intersects an enlarged search area around ``rect``.

    The search area is ``rect`` grown on each side: horizontally by a multiple of the
    rectangle's width, vertically by a multiple of its height.

    Args:
        search_left_factor (float): Multiple of the rectangle width to extend to the left.
        search_right_factor (float): Multiple of the rectangle width to extend to the right.
        search_above_factor (float): Multiple of the rectangle height to extend upwards.
        search_below_factor (float): Multiple of the rectangle height to extend downwards.
        lines (list[TextLine]): Arbitrary text lines to search in.
        rect (fitz.Rect): The rectangle to search around.

    Returns:
        list[TextLine]: The lines close to the rectangle, in the same order as in ``lines``.
    """
    width = rect.width
    height = rect.height

    # Corners of the enlarged area: (x0, y0) is pushed left/up, (x1, y1) right/down.
    left = rect.x0 - search_left_factor * width
    top = rect.y0 - search_above_factor * height
    right = rect.x1 + search_right_factor * width
    bottom = rect.y1 + search_below_factor * height
    search_area = Rect(left, top, right, bottom)

    return [candidate for candidate in lines if candidate.rect.intersects(search_area)]
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
43 changes: 36 additions & 7 deletions src/stratigraphy/groundwater/groundwater_extraction.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

import abc
import logging
import os
from dataclasses import dataclass
from datetime import date as dt
from datetime import datetime
Expand Down Expand Up @@ -136,13 +137,23 @@ class GroundwaterLevelExtractor(DataExtractor):

feature_name = "groundwater"

is_searching_groundwater_illustration: bool = False

# look for elevation values to the left, right and/or immediately below the key
search_left_factor: float = 2
search_right_factor: float = 10
search_below_factor: float = 4
search_above_factor: float = 0

preprocess_replacements = {",": ".", "'": ".", "o": "0", "\n": " ", "ü": "u"}

def __init__(self, document):
    """Initialize the extractor and read the illustration-matching switch from the environment.

    The ``IS_SEARCHING_GROUNDWATER_ILLUSTRATION`` environment variable enables the search for
    groundwater information in illustrations. The value is compared case-insensitively and
    ignoring surrounding whitespace, so ``True``, ``true``, ``1``, ``yes`` and ``on`` all enable
    the feature. Any other value, or an unset variable, leaves it disabled.

    Args:
        document (fitz.Document): The document to extract the groundwater information from.
    """
    super().__init__(document)

    # Parse the flag leniently: a strict comparison against "True" would silently keep the
    # feature off for equally reasonable spellings such as "true" or "1".
    raw_flag = os.getenv("IS_SEARCHING_GROUNDWATER_ILLUSTRATION", "")
    self.is_searching_groundwater_illustration = raw_flag.strip().lower() in {"true", "1", "yes", "on"}
    if self.is_searching_groundwater_illustration:
        logger.info("Searching for groundwater information in illustrations.")

def get_groundwater_near_key(self, lines: list[TextLine], page: int) -> list[GroundwaterInformationOnPage]:
"""Find groundwater information from text lines that are close to an explicit "groundwater" label.

Expand Down Expand Up @@ -210,7 +221,6 @@ def get_groundwater_info_from_lines(self, lines: list[TextLine], page: int) -> G

elevation = extract_elevation(text)

# Pattern for matching depth (e.g., "1,48 m u.T.")
matched_lines_rect.append(line.rect)
else:
# Pattern for matching date
Expand All @@ -219,6 +229,16 @@ def get_groundwater_info_from_lines(self, lines: list[TextLine], page: int) -> G
if extracted_date_str:
text = text.replace(extracted_date_str, "").strip()
date = extracted_date
matched_lines_rect.append(
line.rect
) # Add the rectangle of the line to the matched lines list to make sure it is drawn
# in the output image.
else:
# If a second date is present in the lines around the groundwater key, then we skip this line,
# instead of potentially falsely extracting a depth value from the date.
extracted_date, extracted_date_str = extract_date(text)
if extracted_date_str:
continue

# Pattern for matching depth (e.g., "1,48 m u.T.")
if not depth:
Expand Down Expand Up @@ -285,12 +305,21 @@ def extract_groundwater(self, terrain_elevation: Elevation | None) -> list[Groun
"""
for page in self.doc:
lines = extract_text_lines(page)
page_number = page.number + 1 # page.number is 0-based

found_groundwater = (
self.get_groundwater_near_key(lines, page_number)
# or XXXX # Add other techniques here
)
page_number = page.number + 1 # NOTE: page.number is 0-based

found_groundwater = self.get_groundwater_near_key(lines, page_number)
if not found_groundwater and self.is_searching_groundwater_illustration:
from stratigraphy.groundwater.gw_illustration_template_matching import (
get_groundwater_from_illustration,
)

# Extract groundwater from illustration
found_groundwater, confidence_list = get_groundwater_from_illustration(
self, lines, page_number, terrain_elevation
)
if found_groundwater:
logger.info("Confidence list: %s", confidence_list)
logger.info("Found groundwater from illustration on page %s: %s", page_number, found_groundwater)

if terrain_elevation:
# If the elevation is provided, calculate the depth of the groundwater
Expand Down
Loading
Loading