
Commit: LGVISIUM-52: Moved the page information within the JSON
dcleres committed Jul 31, 2024
1 parent efc655d commit 29fa7b9
Showing 25 changed files with 455 additions and 268 deletions.
21 changes: 21 additions & 0 deletions .vscode/launch.json
@@ -0,0 +1,21 @@
+ {
+     // Use IntelliSense to learn about possible attributes.
+     // Hover to view descriptions of existing attributes.
+     // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
+     "version": "0.2.0",
+     "configurations": [
+         {
+             "name": "Python: Run boreholes-extract-all",
+             "type": "debugpy",
+             "request": "launch",
+             "module": "src.stratigraphy.main",
+             "args": [
+                 "-i", "data/zurich",
+                 "-g", "data/zurich_ground_truth.json"
+             ],
+             "cwd": "${workspaceFolder}",
+             "justMyCode": true,
+             "python": "./swisstopo/bin/python3",
+         }
+     ]
+ }
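
For reference, this launch configuration corresponds roughly to the following invocation from the repository root (a sketch; it assumes the `swisstopo` virtualenv supplies the interpreter, as the `python` field above suggests):

    python -m src.stratigraphy.main -i data/zurich -g data/zurich_ground_truth.json
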
5 changes: 5 additions & 0 deletions .vscode/settings.json
@@ -0,0 +1,5 @@
+ {
+     "cSpell.words": [
+         "fitz"
+     ]
+ }
3 changes: 2 additions & 1 deletion src/stratigraphy/benchmark/score.py
@@ -252,7 +252,8 @@ def evaluate_layer_extraction(predictions: dict, number_of_truth_values: dict) -


def create_predictions_objects(
- predictions: dict, ground_truth_path: Path | None
+ predictions: dict,
+ ground_truth_path: Path | None,
) -> tuple[dict[FilePredictions], dict]:
"""Create predictions objects from the predictions and evaluate them against the ground truth.
35 changes: 21 additions & 14 deletions src/stratigraphy/extract.py
@@ -30,7 +30,9 @@
logger = logging.getLogger(__name__)


- def process_page(lines: list[TextLine], geometric_lines, language: str, **params: dict) -> list[dict]:
+ def process_page(
+     lines: list[TextLine], geometric_lines, language: str, page_number: int, **params: dict
+ ) -> list[dict]:
"""Process a single page of a pdf.
Finds all descriptions and depth intervals on the page and matches them.
@@ -39,6 +41,7 @@ def process_page(lines: list[TextLine], geometric_lines, language: str, **params
lines (list[TextLine]): all the text lines on the page.
geometric_lines (list[Line]): The geometric lines of the page.
language (str): The language of the page.
+ page_number (int): The page number.
**params (dict): Additional parameters for the matching pipeline.
Returns:
@@ -67,7 +70,7 @@ def process_page(lines: list[TextLine], geometric_lines, language: str, **params
# If there is a layer identifier column, then we use this directly.
# Else, we search for depth columns. We could also think of some scoring mechanism to decide which one to use.
if not pairs:
- depth_column_entries = find_depth_columns.depth_column_entries(words, include_splits=True)
+ depth_column_entries = find_depth_columns.depth_column_entries(words, page_number, include_splits=True)
layer_depth_columns = find_depth_columns.find_layer_depth_columns(depth_column_entries, words)

used_entry_rects = []
@@ -77,13 +80,13 @@ def process_page(lines: list[TextLine], geometric_lines, language: str, **params

depth_column_entries = [
entry
- for entry in find_depth_columns.depth_column_entries(words, include_splits=False)
+ for entry in find_depth_columns.depth_column_entries(words, page_number, include_splits=False)
if entry.rect not in used_entry_rects
]
depth_columns: list[DepthColumn] = layer_depth_columns
depth_columns.extend(
find_depth_columns.find_depth_columns(
- depth_column_entries, words, depth_column_params=params["depth_column_params"]
+ depth_column_entries, words, page_number, depth_column_params=params["depth_column_params"]
)
)

@@ -106,12 +109,12 @@ def process_page(lines: list[TextLine], geometric_lines, language: str, **params

groups = [] # list of matched depth intervals and text blocks
# groups is of the form: [{"depth_interval": BoundaryInterval, "block": TextBlock}]
- if len(filtered_pairs):  # match depth column items with material description
+ if filtered_pairs:  # match depth column items with material description
for depth_column, material_description_rect in filtered_pairs:
description_lines = get_description_lines(lines, material_description_rect)
if len(description_lines) > 1:
new_groups = match_columns(
- depth_column, description_lines, geometric_lines, material_description_rect, **params
+ depth_column, description_lines, geometric_lines, material_description_rect, page_number, **params
)
groups.extend(new_groups)
json_filtered_pairs = [
@@ -157,9 +160,11 @@ def process_page(lines: list[TextLine], geometric_lines, language: str, **params
]
)
predictions = [
{"material_description": group["block"].to_json(), "depth_interval": group["depth_interval"].to_json()}
if "depth_interval" in group
else {"material_description": group["block"].to_json()}
(
{"material_description": group["block"].to_json(), "depth_interval": group["depth_interval"].to_json()}
if "depth_interval" in group
else {"material_description": group["block"].to_json()}
)
for group in groups
]
predictions = parse_and_remove_empty_predictions(predictions)
@@ -205,6 +210,7 @@ def match_columns(
description_lines: list[TextLine],
geometric_lines: list[Line],
material_description_rect: fitz.Rect,
+ page_number: int,
**params: dict,
) -> list:
"""Match the depth column entries with the description lines.
@@ -218,6 +224,7 @@ def match_columns(
description_lines (list[TextLine]): The description lines.
geometric_lines (list[Line]): The geometric lines.
material_description_rect (fitz.Rect): The material description rectangle.
+ page_number (int): The page number.
**params (dict): Additional parameters for the matching pipeline.
Returns:
@@ -235,7 +242,7 @@ def match_columns(
blocks = get_description_blocks_from_layer_identifier(depth_column.entries, description_lines)
groups = []
for block in blocks:
- depth_interval = depth_column.get_depth_interval(block)
+ depth_interval = depth_column.get_depth_interval(block, page_number)
if depth_interval:
groups.append({"depth_interval": depth_interval, "block": block})
else:
@@ -320,7 +327,7 @@ def merge_blocks_by_vertical_spacing(blocks: list[TextBlock], target_merge_count
merged_blocks.append(current_merged_block)
current_merged_block = new_block

- if len(current_merged_block.lines):
+ if current_merged_block.lines:
merged_blocks.append(current_merged_block)
return merged_blocks

@@ -355,7 +362,7 @@ def split_blocks_by_textline_length(blocks: list[TextBlock], target_split_count:
split_blocks.append(TextBlock(current_block_lines))
cutoff_values.remove(line.rect.x1)
current_block_lines = []
- if len(current_block_lines):
+ if current_block_lines:
split_blocks.append(TextBlock(current_block_lines))
current_block_lines = []
if (
@@ -386,7 +393,7 @@ def find_material_description_column(
if x_overlap(line.rect, depth_column.rect()) and line.rect.y0 < depth_column.rect().y0
]

- min_y0 = max(line.rect.y0 for line in above_depth_column) if len(above_depth_column) else -1
+ min_y0 = max(line.rect.y0 for line in above_depth_column) if above_depth_column else -1

def check_y0_condition(y0):
return y0 > min_y0 and y0 < depth_column.rect().y1
@@ -409,7 +416,7 @@ def check_y0_condition(y0):
]

def filter_coverage(coverage):
- if len(coverage):
+ if coverage:
min_x0 = min(line.rect.x0 for line in coverage)
max_x1 = max(line.rect.x1 for line in coverage)
x0_threshold = max_x1 - 0.4 * (
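
Several hunks above replace `if len(xs):` with `if xs:`. A minimal, self-contained check of the idiom (plain Python, independent of the repository's classes):

    # Empty sequences are falsy, so the truthiness test agrees with the
    # explicit length test for lists; the bare form is the PEP 8 idiom.
    for xs in ([], [1], [1, 2]):
        assert bool(xs) == (len(xs) > 0)
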
35 changes: 24 additions & 11 deletions src/stratigraphy/main.py
@@ -173,30 +173,36 @@ def start_pipeline(
predictions[filename]["metadata"] = {"coordinates": coordinates.to_json()}
else:
predictions[filename]["metadata"] = {"coordinates": None}

+ layer_predictions_list = []
+ depths_materials_column_pairs_list = []
+ page_heights_list = []
+ page_widths_list = []
for page_index, page in enumerate(doc):
page_number = page_index + 1
logger.info("Processing page %s", page_number)

- text_lines = extract_text_lines(page)
+ text_lines = extract_text_lines(page, page_number)
geometric_lines = extract_lines(page, line_detection_params)
layer_predictions, depths_materials_column_pairs = process_page(
- text_lines, geometric_lines, language, **matching_params
+ text_lines, geometric_lines, language, page_number, **matching_params
)
- # Add remove duplicates here!
+ # TODO: Add remove duplicates here!
if page_index > 0:
layer_predictions = remove_duplicate_layers(
doc[page_index - 1],
page,
predictions[filename][f"page_{page_number - 1}"]["layers"],
layer_predictions_list,
layer_predictions,
matching_params["img_template_probability_threshold"],
)
predictions[filename][f"page_{page_number}"] = {
"layers": layer_predictions,
"depths_materials_column_pairs": depths_materials_column_pairs,
"page_height": page.rect.height,
"page_width": page.rect.width,
}

layer_predictions_list.extend(layer_predictions)
depths_materials_column_pairs_list.extend(depths_materials_column_pairs)
page_heights_list.append(page.rect.height)
page_widths_list.append(page.rect.width)

if draw_lines: # could be changed to if draw_lines and mflow_tracking:
if not mlflow_tracking:
logger.warning(
@@ -208,11 +214,18 @@
)
mlflow.log_image(img, f"pages/{filename}_page_{page.number + 1}_lines.png")

predictions[filename]["layers"] = layer_predictions_list
predictions[filename]["depths_materials_column_pairs"] = depths_materials_column_pairs_list
predictions[filename]["page_height"] = page_heights_list
predictions[filename]["page_width"] = page_widths_list

assert len(page_heights_list) == len(page_widths_list) == doc.page_count, "Page count mismatch."

logger.info("Writing predictions to JSON file %s", predictions_path)
with open(predictions_path, "w") as file:
file.write(json.dumps(predictions))

- # evaluate the predictions; if file doesnt exist, the predictions are not changed.
+ # evaluate the predictions; if file does not exist, the predictions are not changed.
predictions, number_of_truth_values = create_predictions_objects(predictions, ground_truth_path)

if not skip_draw_predictions:
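
The net effect of this hunk is a flatter predictions JSON. A hedged before/after sketch with placeholder values (the field names come from the diff above; the page dimensions are illustrative only):

    # Old layout: layers and page dimensions nested per page.
    old_shape = {
        "example.pdf": {
            "metadata": {"coordinates": None},
            "page_1": {
                "layers": [],
                "depths_materials_column_pairs": [],
                "page_height": 842.0,
                "page_width": 595.0,
            },
        }
    }

    # New layout: file-level lists; page dimensions keep one entry per page,
    # and each extracted item now carries its own page number instead.
    new_shape = {
        "example.pdf": {
            "metadata": {"coordinates": None},
            "layers": [],
            "depths_materials_column_pairs": [],
            "page_height": [842.0],
            "page_width": [595.0],
        }
    }
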
15 changes: 8 additions & 7 deletions src/stratigraphy/util/boundarydepthcolumnvalidator.py
@@ -63,27 +63,27 @@ def is_valid(self, column: BoundaryDepthColumn, corr_coef_threshold: float = 0.9

return corr_coef and corr_coef > corr_coef_threshold

- def reduce_until_valid(self, column: BoundaryDepthColumn) -> BoundaryDepthColumn:
+ def reduce_until_valid(self, column: BoundaryDepthColumn, page_number: int) -> BoundaryDepthColumn:
"""Removes entries from the depth column until it fulfills the is_valid condition.
is_valid checks whether there is too much noise (i.e. other text) in the column and whether the entries are
linearly correlated with their vertical position.
Args:
column (BoundaryDepthColumn): The depth column to validate
+ page_number (int): The page number of the depth column
Returns:
BoundaryDepthColumn: The current depth column with entries removed until it is valid.
"""
while column:
if self.is_valid(column):
return column
- elif self.correct_OCR_mistakes(column) is not None:
-     return self.correct_OCR_mistakes(column)
+ elif self.correct_OCR_mistakes(column, page_number) is not None:
+     return self.correct_OCR_mistakes(column, page_number)
else:
column = column.remove_entry_by_correlation_gradient()

- def correct_OCR_mistakes(self, column: BoundaryDepthColumn) -> BoundaryDepthColumn | None:
+ def correct_OCR_mistakes(self, column: BoundaryDepthColumn, page_number: int) -> BoundaryDepthColumn | None:
"""Corrects OCR mistakes in the depth column entries.
Loops through all values and corrects common OCR mistakes for the given entry. Then, the column with the
@@ -102,22 +102,23 @@ def correct_OCR_mistakes(self, column: BoundaryDepthColumn) -> BoundaryDepthColu
Args:
column (BoundaryDepthColumn): The depth column to validate
+ page_number (int): The page number of the depth column
Returns:
BoundaryDepthColumn | None: The corrected depth column, or None if no correction was possible.
"""
new_columns = [BoundaryDepthColumn()]
for entry in column.entries:
new_columns = [
- BoundaryDepthColumn([*column.entries, DepthColumnEntry(entry.rect, new_value)])
+ BoundaryDepthColumn([*column.entries, DepthColumnEntry(entry.rect, new_value, page_number)])
for column in new_columns
for new_value in _value_alternatives(entry.value)
]
# Immediately require strictly increasing values, to avoid exponential complexity when many implausible
# alternative values are suggested
new_columns = [column for column in new_columns if column.is_strictly_increasing()]

- if len(new_columns):
+ if new_columns:
best_column = max(new_columns, key=lambda column: column.pearson_correlation_coef())

# We require a higher correlation coefficient when we've already corrected a mistake.
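
A hedged, self-contained sketch of the pruning idea in correct_OCR_mistakes: candidate columns grow one entry at a time, and any candidate that is not strictly increasing is dropped immediately, so the candidate set cannot blow up exponentially. Here value_alternatives is a hypothetical stand-in for the repo's _value_alternatives helper:

    def value_alternatives(value: float) -> set[float]:
        return {value, value * 10}  # e.g. an OCR reading that dropped a digit

    candidates: list[list[float]] = [[]]
    for value in [0.5, 1.0, 2.0]:
        candidates = [
            [*prefix, alternative]
            for prefix in candidates
            for alternative in value_alternatives(value)
        ]
        # Immediately require strictly increasing depth values.
        candidates = [c for c in candidates if all(a < b for a, b in zip(c, c[1:]))]

    print(candidates)  # e.g. [[0.5, 1.0, 2.0], [0.5, 1.0, 20.0], ...]
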
5 changes: 3 additions & 2 deletions src/stratigraphy/util/coordinate_extraction.py
@@ -312,8 +312,9 @@ def extract_coordinates(self) -> Coordinate | None:
Returns:
Coordinate | None: the extracted coordinates (if any)
"""
- for page in self.doc:
-     lines = extract_text_lines(page)
+ for page_idx, page in enumerate(self.doc):
+     page_number = page_idx + 1
+     lines = extract_text_lines(page, page_number)
page_number = page.number + 1 # page.number is 0-based

found_coordinates = (
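
Incidentally, the same 1-based numbering can be obtained from enumerate's start argument; a minimal stand-in for the loop above:

    for page_number, page in enumerate(["page-a", "page-b"], start=1):
        print(page_number, page)  # 1 page-a, then 2 page-b
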
4 changes: 2 additions & 2 deletions src/stratigraphy/util/depthcolumn.py
@@ -111,7 +111,7 @@ def break_on_mismatch(self) -> list[LayerDepthColumn]:
segment_start = index

final_segment = self.entries[segment_start:]
- if len(final_segment):
+ if final_segment:
segments.append(final_segment)

return [LayerDepthColumn(segment) for segment in segments]
@@ -338,7 +338,7 @@ def break_on_double_descending(self) -> list[BoundaryDepthColumn]:
segment_start = index

final_segment = self.entries[segment_start:]
- if len(final_segment):
+ if final_segment:
segments.append(final_segment)

return [BoundaryDepthColumn(segment) for segment in segments]
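
A hedged, self-contained sketch of the segment-splitting pattern used by break_on_mismatch and break_on_double_descending: close the current segment at every break index, then flush the trailing segment only when it is non-empty, which is exactly the `if final_segment:` cleanup above. The comparison is a stand-in for the real mismatch test:

    values = [1.0, 2.0, 1.5, 3.0, 4.0]
    segments, segment_start = [], 0
    for index in range(1, len(values)):
        if values[index] < values[index - 1]:  # stand-in for the mismatch test
            segments.append(values[segment_start:index])
            segment_start = index
    final_segment = values[segment_start:]
    if final_segment:
        segments.append(final_segment)
    print(segments)  # [[1.0, 2.0], [1.5, 3.0, 4.0]]
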
(Diffs for the remaining 17 changed files are not shown here.)
