Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Close LGVISIUM-63: Extraction of the groundwater logo using computer vision #83

Merged
Show file tree
Hide file tree
Changes from 13 commits
Commits
Show all changes
18 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .vscode/settings.json
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
"swissgeol",
"swisstopo",
"textblock",
"USCS",
"venv"
]
}
8 changes: 8 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,14 @@ With regard to the extraction of coordinates, the [Swiss coordinate systems](htt
#### Groundwater
With the current version of the code, groundwater can only be found at depths of less than 200 meters. This threshold is defined in `src/stratigraphy/groundwater/groundwater_extraction.py` by the constant `MAX_DEPTH`.

Groundwater is extracted from the borehole documents in two main ways. The first matches groundwater-related keywords in the text extracted from the document (e.g., groundwater, groundwater-level). The second extracts the groundwater-related illustration from the document by using template matching. Matching of the groundwater illustration is disabled by default, as it significantly increases the runtime of the data extraction pipeline. You can control the activation of this feature with the `IS_SEARCHING_GROUNDWATER_ILLUSTRATION` environment variable.

Add the following line to the `.env` file to turn on the detection of the groundwater illustration:

```
IS_SEARCHING_GROUNDWATER_ILLUSTRATION="True"
```

## Main contributors

* Stijn Vermeeren [@stijnvermeeren-swisstopo](https://www.github.com/stijnvermeeren-swisstopo) (swisstopo) - Project Lead
Expand Down
Binary file added Screenshot 2024-09-16 at 19.04.42_template.npy
Binary file not shown.
11 changes: 10 additions & 1 deletion config/matching_params.yml
Original file line number Diff line number Diff line change
Expand Up @@ -115,6 +115,14 @@ coordinate_keys:
- coordonnées
- coordonn

coordinate_fp_keys:


groundwater_fp_keys:
- Wasserstau
- Grundwasser-
- Grundwasserfassung
- GW/ # makes it possible to avoid false positives like "GW/" from the USCS Nomenclature columns

groundwater_keys:
# German
Expand All @@ -132,7 +140,6 @@ groundwater_keys:
- W SP
- Gr.W.spiegel
- GrW Sp
- Wsp.
- Wsp
- GW-Spiegel
- Grundwasser
Expand Down Expand Up @@ -170,3 +177,5 @@ elevation_keys:
- Ansatzhöhe
- Terrainkote

elevation_fp_keys:

1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ dependencies = [
"opencv-python-headless",
"quads>=1.1.0",
"numpy<2",
"scikit-image==0.24.0"
dcleres marked this conversation as resolved.
Show resolved Hide resolved
]

[project.optional-dependencies]
Expand Down
46 changes: 38 additions & 8 deletions src/stratigraphy/data_extractor/data_extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ class DataExtractor(ABC):

doc: fitz.Document = None
feature_keys: list[str] = None
feature_fp_keys: list[str] = None
feature_name: str = None

# How much to the left of a key do we look for the feature information, as a multiple of the key line width
Expand All @@ -48,6 +49,8 @@ class DataExtractor(ABC):
search_right_factor: float = 0
# How much below a key do we look for the feature information, as a multiple of the key line height
search_below_factor: float = 0
# How much above a key do we look for the feature information, as a multiple of the key line height
search_above_factor: float = 0

preprocess_replacements: dict[str, str] = {}

Expand All @@ -63,6 +66,11 @@ def __init__(self, document: fitz.Document):

self.doc = document
self.feature_keys = read_params("matching_params.yml")[f"{self.feature_name}_keys"]
self.feature_fp_keys = (
read_params("matching_params.yml")[f"{self.feature_name}_fp_keys"]
if read_params("matching_params.yml")[f"{self.feature_name}_fp_keys"]
else []
)
dcleres marked this conversation as resolved.
Show resolved Hide resolved

def preprocess(self, value: str) -> str:
for old, new in self.preprocess_replacements.items():
Expand Down Expand Up @@ -105,7 +113,15 @@ def find_feature_key(self, lines: list[TextLine], allowed_error_rate: float = 0.
for line in lines:
match = pattern.search(line.text)
if match:
matches.add(line)
# Make sure the key is not in the false positive list
is_fp_key = False
for fp_key in self.feature_fp_keys:
if fp_key in line.text:
is_fp_key = True
break

if not is_fp_key:
dcleres marked this conversation as resolved.
Show resolved Hide resolved
matches.add(line)

return list(matches)

Expand All @@ -122,13 +138,7 @@ def get_lines_near_key(self, lines, key_line: TextLine) -> list[TextLine]:
list[TextLine]: The lines close to the key.
"""
key_rect = key_line.rect
elevation_search_rect = fitz.Rect(
key_rect.x0 - self.search_left_factor * key_rect.width,
key_rect.y0,
key_rect.x1 + self.search_right_factor * key_rect.width,
key_rect.y1 + self.search_below_factor * key_rect.height,
)
feature_lines = [line for line in lines if line.rect.intersects(elevation_search_rect)]
feature_lines = self.get_lines_near_rect(lines, key_rect)

# Insert key_line first and remove duplicates
feature_lines.insert(0, key_line)
Expand All @@ -138,3 +148,23 @@ def get_lines_near_key(self, lines, key_line: TextLine) -> list[TextLine]:
feature_lines_sorted = sorted(feature_lines, key=lambda line: abs(line.rect.y0 - key_line.rect.y0))

return feature_lines_sorted

def get_lines_near_rect(self, lines, rect: fitz.Rect) -> list[TextLine]:
    """Collect the text lines that fall inside an enlarged search area around a rectangle.

    The search area is the given rectangle grown on each side: the left and right
    margins are ``search_left_factor`` / ``search_right_factor`` times the rectangle
    width, and the top and bottom margins are ``search_above_factor`` /
    ``search_below_factor`` times the rectangle height.

    Args:
        lines (list[TextLine]): Arbitrary text lines to search in.
        rect (fitz.Rect): The rectangle to search around.

    Returns:
        list[TextLine]: The lines whose rectangle intersects the search area, in input order.
    """
    width, height = rect.width, rect.height

    # Grow the rectangle outwards on all four sides by the configured factors.
    left = rect.x0 - self.search_left_factor * width
    top = rect.y0 - self.search_above_factor * height
    right = rect.x1 + self.search_right_factor * width
    bottom = rect.y1 + self.search_below_factor * height
    search_area = fitz.Rect(left, top, right, bottom)

    nearby_lines = []
    for candidate in lines:
        if candidate.rect.intersects(search_area):
            nearby_lines.append(candidate)

    return nearby_lines
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Loading
Loading