diff --git a/.vscode/launch.json b/.vscode/launch.json new file mode 100644 index 00000000..81373420 --- /dev/null +++ b/.vscode/launch.json @@ -0,0 +1,21 @@ +{ + // Use IntelliSense to learn about possible attributes. + // Hover to view descriptions of existing attributes. + // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387 + "version": "0.2.0", + "configurations": [ + { + "name": "Python: Run boreholes-extract-all", + "type": "debugpy", + "request": "launch", + "module": "src.stratigraphy.main", + "args": [ + "-i", "data/zurich", + "-g", "data/zurich_ground_truth.json" + ], + "cwd": "${workspaceFolder}", + "justMyCode": true, + "python": "./swisstopo/bin/python3", + } + ] +} diff --git a/.vscode/settings.json b/.vscode/settings.json new file mode 100644 index 00000000..163a9c49 --- /dev/null +++ b/.vscode/settings.json @@ -0,0 +1,5 @@ +{ + "cSpell.words": [ + "fitz" + ] +} \ No newline at end of file diff --git a/src/stratigraphy/benchmark/score.py b/src/stratigraphy/benchmark/score.py index 33addf9b..00d17a47 100644 --- a/src/stratigraphy/benchmark/score.py +++ b/src/stratigraphy/benchmark/score.py @@ -252,7 +252,8 @@ def evaluate_layer_extraction(predictions: dict, number_of_truth_values: dict) - def create_predictions_objects( - predictions: dict, ground_truth_path: Path | None + predictions: dict, + ground_truth_path: Path | None, ) -> tuple[dict[FilePredictions], dict]: """Create predictions objects from the predictions and evaluate them against the ground truth. diff --git a/src/stratigraphy/extract.py b/src/stratigraphy/extract.py index 5753f3b4..00240f81 100644 --- a/src/stratigraphy/extract.py +++ b/src/stratigraphy/extract.py @@ -30,7 +30,9 @@ logger = logging.getLogger(__name__) -def process_page(lines: list[TextLine], geometric_lines, language: str, **params: dict) -> list[dict]: +def process_page( + lines: list[TextLine], geometric_lines, language: str, page_number: int, **params: dict +) -> list[dict]: """Process a single page of a pdf. Finds all descriptions and depth intervals on the page and matches them. @@ -39,6 +41,7 @@ def process_page(lines: list[TextLine], geometric_lines, language: str, **params lines (list[TextLine]): all the text lines on the page. geometric_lines (list[Line]): The geometric lines of the page. language (str): The language of the page. + page_number (int): The page number. **params (dict): Additional parameters for the matching pipeline. Returns: @@ -67,7 +70,7 @@ def process_page(lines: list[TextLine], geometric_lines, language: str, **params # If there is a layer identifier column, then we use this directly. # Else, we search for depth columns. We could also think of some scoring mechanism to decide which one to use. if not pairs: - depth_column_entries = find_depth_columns.depth_column_entries(words, include_splits=True) + depth_column_entries = find_depth_columns.depth_column_entries(words, page_number, include_splits=True) layer_depth_columns = find_depth_columns.find_layer_depth_columns(depth_column_entries, words) used_entry_rects = [] @@ -77,13 +80,13 @@ def process_page(lines: list[TextLine], geometric_lines, language: str, **params depth_column_entries = [ entry - for entry in find_depth_columns.depth_column_entries(words, include_splits=False) + for entry in find_depth_columns.depth_column_entries(words, page_number, include_splits=False) if entry.rect not in used_entry_rects ] depth_columns: list[DepthColumn] = layer_depth_columns depth_columns.extend( find_depth_columns.find_depth_columns( - depth_column_entries, words, depth_column_params=params["depth_column_params"] + depth_column_entries, words, page_number, depth_column_params=params["depth_column_params"] ) ) @@ -106,12 +109,12 @@ def process_page(lines: list[TextLine], geometric_lines, language: str, **params groups = [] # list of matched depth intervals and text blocks # groups is of the form: [{"depth_interval": BoundaryInterval, "block": TextBlock}] - if len(filtered_pairs): # match depth column items with material description + if filtered_pairs: # match depth column items with material description for depth_column, material_description_rect in filtered_pairs: description_lines = get_description_lines(lines, material_description_rect) if len(description_lines) > 1: new_groups = match_columns( - depth_column, description_lines, geometric_lines, material_description_rect, **params + depth_column, description_lines, geometric_lines, material_description_rect, page_number, **params ) groups.extend(new_groups) json_filtered_pairs = [ @@ -157,9 +160,11 @@ def process_page(lines: list[TextLine], geometric_lines, language: str, **params ] ) predictions = [ - {"material_description": group["block"].to_json(), "depth_interval": group["depth_interval"].to_json()} - if "depth_interval" in group - else {"material_description": group["block"].to_json()} + ( + {"material_description": group["block"].to_json(), "depth_interval": group["depth_interval"].to_json()} + if "depth_interval" in group + else {"material_description": group["block"].to_json()} + ) for group in groups ] predictions = parse_and_remove_empty_predictions(predictions) @@ -205,6 +210,7 @@ def match_columns( description_lines: list[TextLine], geometric_lines: list[Line], material_description_rect: fitz.Rect, + page_number: int, **params: dict, ) -> list: """Match the depth column entries with the description lines. @@ -218,6 +224,7 @@ def match_columns( description_lines (list[TextLine]): The description lines. geometric_lines (list[Line]): The geometric lines. material_description_rect (fitz.Rect): The material description rectangle. + page_number (int): The page number. **params (dict): Additional parameters for the matching pipeline. Returns: @@ -235,7 +242,7 @@ def match_columns( blocks = get_description_blocks_from_layer_identifier(depth_column.entries, description_lines) groups = [] for block in blocks: - depth_interval = depth_column.get_depth_interval(block) + depth_interval = depth_column.get_depth_interval(block, page_number) if depth_interval: groups.append({"depth_interval": depth_interval, "block": block}) else: @@ -320,7 +327,7 @@ def merge_blocks_by_vertical_spacing(blocks: list[TextBlock], target_merge_count merged_blocks.append(current_merged_block) current_merged_block = new_block - if len(current_merged_block.lines): + if current_merged_block.lines: merged_blocks.append(current_merged_block) return merged_blocks @@ -355,7 +362,7 @@ def split_blocks_by_textline_length(blocks: list[TextBlock], target_split_count: split_blocks.append(TextBlock(current_block_lines)) cutoff_values.remove(line.rect.x1) current_block_lines = [] - if len(current_block_lines): + if current_block_lines: split_blocks.append(TextBlock(current_block_lines)) current_block_lines = [] if ( @@ -386,7 +393,7 @@ def find_material_description_column( if x_overlap(line.rect, depth_column.rect()) and line.rect.y0 < depth_column.rect().y0 ] - min_y0 = max(line.rect.y0 for line in above_depth_column) if len(above_depth_column) else -1 + min_y0 = max(line.rect.y0 for line in above_depth_column) if above_depth_column else -1 def check_y0_condition(y0): return y0 > min_y0 and y0 < depth_column.rect().y1 @@ -409,7 +416,7 @@ def check_y0_condition(y0): ] def filter_coverage(coverage): - if len(coverage): + if coverage: min_x0 = min(line.rect.x0 for line in coverage) max_x1 = max(line.rect.x1 for line in coverage) x0_threshold = max_x1 - 0.4 * ( diff --git a/src/stratigraphy/main.py b/src/stratigraphy/main.py index 2bcaf64f..6dbbb3b1 100644 --- a/src/stratigraphy/main.py +++ b/src/stratigraphy/main.py @@ -173,30 +173,36 @@ def start_pipeline( predictions[filename]["metadata"] = {"coordinates": coordinates.to_json()} else: predictions[filename]["metadata"] = {"coordinates": None} + + layer_predictions_list = [] + depths_materials_column_pairs_list = [] + page_heights_list = [] + page_widths_list = [] for page_index, page in enumerate(doc): page_number = page_index + 1 logger.info("Processing page %s", page_number) - text_lines = extract_text_lines(page) + text_lines = extract_text_lines(page, page_number) geometric_lines = extract_lines(page, line_detection_params) layer_predictions, depths_materials_column_pairs = process_page( - text_lines, geometric_lines, language, **matching_params + text_lines, geometric_lines, language, page_number, **matching_params ) - # Add remove duplicates here! + + # TODO: Add remove duplicates here! if page_index > 0: layer_predictions = remove_duplicate_layers( doc[page_index - 1], page, - predictions[filename][f"page_{page_number - 1}"]["layers"], + layer_predictions_list, layer_predictions, matching_params["img_template_probability_threshold"], ) - predictions[filename][f"page_{page_number}"] = { - "layers": layer_predictions, - "depths_materials_column_pairs": depths_materials_column_pairs, - "page_height": page.rect.height, - "page_width": page.rect.width, - } + + layer_predictions_list.extend(layer_predictions) + depths_materials_column_pairs_list.extend(depths_materials_column_pairs) + page_heights_list.append(page.rect.height) + page_widths_list.append(page.rect.width) + if draw_lines: # could be changed to if draw_lines and mflow_tracking: if not mlflow_tracking: logger.warning( @@ -208,11 +214,18 @@ def start_pipeline( ) mlflow.log_image(img, f"pages/{filename}_page_{page.number + 1}_lines.png") + predictions[filename]["layers"] = layer_predictions_list + predictions[filename]["depths_materials_column_pairs"] = depths_materials_column_pairs_list + predictions[filename]["page_height"] = page_heights_list + predictions[filename]["page_width"] = page_widths_list + + assert len(page_heights_list) == len(page_widths_list) == doc.page_count, "Page count mismatch." + logger.info("Writing predictions to JSON file %s", predictions_path) with open(predictions_path, "w") as file: file.write(json.dumps(predictions)) - # evaluate the predictions; if file doesnt exist, the predictions are not changed. + # evaluate the predictions; if file does not exist, the predictions are not changed. predictions, number_of_truth_values = create_predictions_objects(predictions, ground_truth_path) if not skip_draw_predictions: diff --git a/src/stratigraphy/util/boundarydepthcolumnvalidator.py b/src/stratigraphy/util/boundarydepthcolumnvalidator.py index 49c019f8..c1178d03 100644 --- a/src/stratigraphy/util/boundarydepthcolumnvalidator.py +++ b/src/stratigraphy/util/boundarydepthcolumnvalidator.py @@ -63,7 +63,7 @@ def is_valid(self, column: BoundaryDepthColumn, corr_coef_threshold: float = 0.9 return corr_coef and corr_coef > corr_coef_threshold - def reduce_until_valid(self, column: BoundaryDepthColumn) -> BoundaryDepthColumn: + def reduce_until_valid(self, column: BoundaryDepthColumn, page_number: int) -> BoundaryDepthColumn: """Removes entries from the depth column until it fulfills the is_valid condition. is_valid checks whether there is too much noise (i.e. other text) in the column and whether the entries are @@ -71,19 +71,19 @@ def reduce_until_valid(self, column: BoundaryDepthColumn) -> BoundaryDepthColumn Args: column (BoundaryDepthColumn): The depth column to validate - + page_number (int): The page number of the depth column Returns: BoundaryDepthColumn: The current depth column with entries removed until it is valid. """ while column: if self.is_valid(column): return column - elif self.correct_OCR_mistakes(column) is not None: - return self.correct_OCR_mistakes(column) + elif self.correct_OCR_mistakes(column, page_number) is not None: + return self.correct_OCR_mistakes(column, page_number) else: column = column.remove_entry_by_correlation_gradient() - def correct_OCR_mistakes(self, column: BoundaryDepthColumn) -> BoundaryDepthColumn | None: + def correct_OCR_mistakes(self, column: BoundaryDepthColumn, page_number: int) -> BoundaryDepthColumn | None: """Corrects OCR mistakes in the depth column entries. Loops through all values and corrects common OCR mistakes for the given entry. Then, the column with the @@ -102,6 +102,7 @@ def correct_OCR_mistakes(self, column: BoundaryDepthColumn) -> BoundaryDepthColu Args: column (BoundaryDepthColumn): The depth column to validate + page_number (int): The page number of the depth column Returns: BoundaryDepthColumn | None: The corrected depth column, or None if no correction was possible. @@ -109,7 +110,7 @@ def correct_OCR_mistakes(self, column: BoundaryDepthColumn) -> BoundaryDepthColu new_columns = [BoundaryDepthColumn()] for entry in column.entries: new_columns = [ - BoundaryDepthColumn([*column.entries, DepthColumnEntry(entry.rect, new_value)]) + BoundaryDepthColumn([*column.entries, DepthColumnEntry(entry.rect, new_value, page_number)]) for column in new_columns for new_value in _value_alternatives(entry.value) ] @@ -117,7 +118,7 @@ def correct_OCR_mistakes(self, column: BoundaryDepthColumn) -> BoundaryDepthColu # alternative values are suggested new_columns = [column for column in new_columns if column.is_strictly_increasing()] - if len(new_columns): + if new_columns: best_column = max(new_columns, key=lambda column: column.pearson_correlation_coef()) # We require a higher correlation coefficient when we've already corrected a mistake. diff --git a/src/stratigraphy/util/coordinate_extraction.py b/src/stratigraphy/util/coordinate_extraction.py index 8f8fd484..66fb4f06 100644 --- a/src/stratigraphy/util/coordinate_extraction.py +++ b/src/stratigraphy/util/coordinate_extraction.py @@ -312,8 +312,9 @@ def extract_coordinates(self) -> Coordinate | None: Returns: Coordinate | None: the extracted coordinates (if any) """ - for page in self.doc: - lines = extract_text_lines(page) + for page_idx, page in enumerate(self.doc): + page_number = page_idx + 1 + lines = extract_text_lines(page, page_number) page_number = page.number + 1 # page.number is 0-based found_coordinates = ( diff --git a/src/stratigraphy/util/depthcolumn.py b/src/stratigraphy/util/depthcolumn.py index d3d7a0a5..32920c06 100644 --- a/src/stratigraphy/util/depthcolumn.py +++ b/src/stratigraphy/util/depthcolumn.py @@ -111,7 +111,7 @@ def break_on_mismatch(self) -> list[LayerDepthColumn]: segment_start = index final_segment = self.entries[segment_start:] - if len(final_segment): + if final_segment: segments.append(final_segment) return [LayerDepthColumn(segment) for segment in segments] @@ -338,7 +338,7 @@ def break_on_double_descending(self) -> list[BoundaryDepthColumn]: segment_start = index final_segment = self.entries[segment_start:] - if len(final_segment): + if final_segment: segments.append(final_segment) return [BoundaryDepthColumn(segment) for segment in segments] diff --git a/src/stratigraphy/util/depthcolumnentry.py b/src/stratigraphy/util/depthcolumnentry.py index 950d0776..a0dbb64c 100644 --- a/src/stratigraphy/util/depthcolumnentry.py +++ b/src/stratigraphy/util/depthcolumnentry.py @@ -1,20 +1,27 @@ """Contains dataclasses for entries in a depth column.""" +from typing import Any + import fitz class DepthColumnEntry: # noqa: D101 - def __init__(self, rect: fitz.Rect, value: float): + """Class to represent a depth column entry.""" + + def __init__(self, rect: fitz.Rect, value: float, page_number: int): self.rect = rect self.value = value + self.page_number = page_number - def __repr__(self): + def __repr__(self) -> str: return str(self.value) - def to_json(self): + def to_json(self) -> dict[str, Any]: + """Convert the depth column entry to a JSON serializable format.""" return { "value": self.value, "rect": [self.rect.x0, self.rect.y0, self.rect.x1, self.rect.y1], + "page": self.page_number, } @@ -26,31 +33,38 @@ class AnnotatedDepthColumnEntry(DepthColumnEntry): # noqa: D101 """ def __init__(self, value): - self.value = value - self.rect = None + super().__init__(None, value, None) - def to_json(self): + def to_json(self) -> dict[str, Any]: return { "value": self.value, - "rect": None, + "rect": self.rect, + "page": self.page_number, } class LayerDepthColumnEntry: # noqa: D101 + """Class to represent a layer depth column entry.""" + def __init__(self, start: DepthColumnEntry, end: DepthColumnEntry): self.start = start self.end = end - def __repr__(self): + assert start.page_number == end.page_number, "Start and end entries are on different pages." + + def __repr__(self) -> str: return f"{self.start.value}-{self.end.value}" @property - def rect(self): + def rect(self) -> fitz.Rect: + """Get the rectangle of the layer depth column entry.""" return fitz.Rect(self.start.rect).include_rect(self.end.rect) - def to_json(self): + def to_json(self) -> dict[str, Any]: + """Convert the layer depth column entry to a JSON serializable format.""" return { "start": self.start.to_json(), "end": self.end.to_json(), "rect": [self.rect.x0, self.rect.y0, self.rect.x1, self.rect.y1], + "page": self.start.page_number, } diff --git a/src/stratigraphy/util/description_block_splitter.py b/src/stratigraphy/util/description_block_splitter.py index 7e45c446..16cf4586 100644 --- a/src/stratigraphy/util/description_block_splitter.py +++ b/src/stratigraphy/util/description_block_splitter.py @@ -21,6 +21,7 @@ def __init__(self): # noqa: D107 @abc.abstractmethod def separator_condition(self, last_line: TextLine, current_line: TextLine) -> bool: # noqa: D107 + """Check if a block is separated by a certain condition.""" pass def create_blocks(self, description_lines: list[TextLine]) -> list[TextBlock]: @@ -30,7 +31,7 @@ def create_blocks(self, description_lines: list[TextLine]) -> list[TextBlock]: description_lines (list[TextLine]): all the text lines from the material descriptions. Returns: - list[TextBlock]: the list of textblocks + list[TextBlock]: the list of TextBlocks """ blocks = [] current_block_lines = [] @@ -43,7 +44,7 @@ def create_blocks(self, description_lines: list[TextLine]) -> list[TextBlock]: ) current_block_lines = [] current_block_lines.append(line) - if len(current_block_lines): + if current_block_lines: blocks.append(TextBlock(current_block_lines)) return blocks diff --git a/src/stratigraphy/util/draw.py b/src/stratigraphy/util/draw.py index 82183815..e16b6529 100644 --- a/src/stratigraphy/util/draw.py +++ b/src/stratigraphy/util/draw.py @@ -47,8 +47,7 @@ def draw_predictions(predictions: list[FilePredictions], directory: Path, out_di with fitz.Document(directory / file_name) as doc: for page_index, page in enumerate(doc): page_number = page_index + 1 - layers = file_prediction.pages[page_index].layers - depths_materials_column_pairs = file_prediction.pages[page_index].depths_materials_columns_pairs + depths_materials_column_pairs = file_prediction.depths_materials_columns_pairs if page_index == 0: draw_metadata( page, @@ -58,10 +57,12 @@ def draw_predictions(predictions: list[FilePredictions], directory: Path, out_di if file_prediction.metadata.coordinates is not None: draw_coordinates(page, file_prediction.metadata.coordinates) draw_depth_columns_and_material_rect(page, depths_materials_column_pairs) - draw_material_descriptions(page, layers) + draw_material_descriptions(page, file_prediction.layers) tmp_file_path = out_directory / f"{file_name}_page{page_number}.png" fitz.utils.get_pixmap(page, matrix=fitz.Matrix(2, 2), clip=page.rect).save(tmp_file_path) + print(f"Saved image to {tmp_file_path}") + if mlflow_tracking: # This is only executed if MLFlow tracking is enabled try: import mlflow @@ -119,21 +120,24 @@ def draw_material_descriptions(page: fitz.Page, layers: LayerPrediction) -> None page (fitz.Page): The page to draw on. layers (LayerPrediction): The predictions for the page. """ + page_number = page.number + 1 + for index, layer in enumerate(layers): - if layer.material_description.rect is not None: - fitz.utils.draw_rect( - page, - fitz.Rect(layer.material_description.rect) * page.derotation_matrix, - color=fitz.utils.getColor("orange"), + if layer.material_description.page_number == page_number: + if layer.material_description.rect is not None: + fitz.utils.draw_rect( + page, + fitz.Rect(layer.material_description.rect) * page.derotation_matrix, + color=fitz.utils.getColor("orange"), + ) + draw_layer( + page=page, + interval=layer.depth_interval, # None if no depth interval + layer=layer.material_description, + index=index, + is_correct=layer.material_is_correct, # None if no ground truth + depth_is_correct=layer.depth_interval_is_correct, # None if no ground truth ) - draw_layer( - page=page, - interval=layer.depth_interval, # None if no depth interval - layer=layer.material_description, - index=index, - is_correct=layer.material_is_correct, # None if no ground truth - depth_is_correct=layer.depth_interval_is_correct, # None if no ground truth - ) def draw_depth_columns_and_material_rect(page: fitz.Page, depths_materials_column_pairs: list) -> fitz.Page: @@ -197,7 +201,7 @@ def draw_layer( is_correct (bool): Whether the text block was correctly identified. depth_is_correct (bool): Whether the depth interval was correctly identified. """ - if len(layer.lines): + if layer.lines: layer_rect = fitz.Rect(layer.rect) color = colors[index % len(colors)] diff --git a/src/stratigraphy/util/extract_text.py b/src/stratigraphy/util/extract_text.py index fe78fa72..4620eefa 100644 --- a/src/stratigraphy/util/extract_text.py +++ b/src/stratigraphy/util/extract_text.py @@ -5,13 +5,14 @@ from stratigraphy.util.line import TextLine, TextWord -def extract_text_lines(page: fitz.Page) -> list[TextLine]: +def extract_text_lines(page: fitz.Page, page_number: int) -> list[TextLine]: """Extract all text lines from the page. Sometimes, a single lines as identified by PyMuPDF, is still split into separate lines. Args: page (fitz.page): the page to extract text from + page_number (int): the page number (first page is 1) Returns: list[TextLine]: A list of text lines. @@ -20,7 +21,7 @@ def extract_text_lines(page: fitz.Page) -> list[TextLine]: words_by_line = {} for x0, y0, x1, y1, word, block_no, line_no, _word_no in fitz.utils.get_text(page, "words"): rect = fitz.Rect(x0, y0, x1, y1) * page.rotation_matrix - text_word = TextWord(rect, word) + text_word = TextWord(rect, word, page_number) words.append(text_word) key = f"{block_no}_{line_no}" if key not in words_by_line: @@ -38,7 +39,7 @@ def extract_text_lines(page: fitz.Page) -> list[TextLine]: lines.append(TextLine(current_line_words)) current_line_words = [] current_line_words.append(word) - if len(current_line_words): + if current_line_words: lines.append(TextLine(current_line_words)) current_line_words = [] diff --git a/src/stratigraphy/util/find_depth_columns.py b/src/stratigraphy/util/find_depth_columns.py index 5a8bb432..4982dac3 100644 --- a/src/stratigraphy/util/find_depth_columns.py +++ b/src/stratigraphy/util/find_depth_columns.py @@ -10,13 +10,14 @@ from stratigraphy.util.line import TextWord -def depth_column_entries(all_words: list[TextWord], include_splits: bool) -> list[DepthColumnEntry]: +def depth_column_entries(all_words: list[TextWord], page_number: int, include_splits: bool) -> list[DepthColumnEntry]: """Find all depth column entries given a list of TextLine objects. Note: Only depths up to two digits before the decimal point are supported. Args: all_words (list[TextWord]): List of text words to extract depth column entries from. + page_number (int): The page number of the entries. include_splits (bool): Whether to include split entries. Returns: @@ -32,10 +33,10 @@ def depth_column_entries(all_words: list[TextWord], include_splits: bool) -> lis match = regex.match(input_string) if match: value = value_as_float(match.group(1)) - entries.append(DepthColumnEntry(word.rect, value)) + entries.append(DepthColumnEntry(word.rect, value, page_number)) elif include_splits: # support for e.g. "1.10-1.60m" extracted as a single word - layer_depth_column_entry = extract_layer_depth_interval(input_string, word.rect) + layer_depth_column_entry = extract_layer_depth_interval(input_string, word.rect, page_number) entries.extend( [layer_depth_column_entry.start, layer_depth_column_entry.end] if layer_depth_column_entry else [] ) @@ -45,19 +46,21 @@ def depth_column_entries(all_words: list[TextWord], include_splits: bool) -> lis def value_as_float(string_value: str) -> float: # noqa: D103 + """Converts a string to a float.""" # OCR sometimes tends to miss the decimal comma parsed_text = re.sub(r"^-?([0-9]+)([0-9]{2})", r"\1.\2", string_value) return abs(float(parsed_text)) def extract_layer_depth_interval( - text: str, rect: fitz.Rect, require_start_of_string: bool = True + text: str, rect: fitz.Rect, page_number: int, require_start_of_string: bool = True ) -> LayerDepthColumnEntry | None: """Extracts a LayerDepthColumnEntry from a string. Args: text (str): The string to extract the depth interval from. rect (fitz.Rect): The rectangle of the text. + page_number (int): The page number of the text. require_start_of_string (bool, optional): Whether the number to extract needs to be at the start of a string. Defaults to True. @@ -78,7 +81,8 @@ def extract_layer_depth_interval( value2 = value_as_float(match.group(3)) second_half_rect = fitz.Rect(rect.x0 + rect.width / 2, rect.y0, rect.x1, rect.y1) return LayerDepthColumnEntry( - DepthColumnEntry(first_half_rect, value1), DepthColumnEntry(second_half_rect, value2) + DepthColumnEntry(first_half_rect, value1, page_number), + DepthColumnEntry(second_half_rect, value2, page_number), ) return None @@ -150,13 +154,14 @@ def find_pair(entry: DepthColumnEntry) -> DepthColumnEntry | None: # noqa: D103 def find_depth_columns( - entries: list[DepthColumnEntry], all_words: list[TextWord], depth_column_params: dict + entries: list[DepthColumnEntry], all_words: list[TextWord], page_number: int, depth_column_params: dict ) -> list[BoundaryDepthColumn]: """Construct all possible BoundaryDepthColumn objects from the given DepthColumnEntry objects. Args: entries (list[DepthColumnEntry]): All found depth column entries in the page. all_words (list[TextLine]): All words in the page. + page_number (int): The page number of the entries. depth_column_params (dict): Parameters for the BoundaryDepthColumn objects. Returns: @@ -190,7 +195,7 @@ def find_depth_columns( boundary_depth_column_validator = BoundaryDepthColumnValidator(all_words, **depth_column_params) numeric_columns = [ - boundary_depth_column_validator.reduce_until_valid(column) + boundary_depth_column_validator.reduce_until_valid(column, page_number) for numeric_column in numeric_columns for column in numeric_column.break_on_double_descending() # when we have a perfect arithmetic progression, this is usually just a scale diff --git a/src/stratigraphy/util/find_description.py b/src/stratigraphy/util/find_description.py index 4c79d237..f9219e06 100644 --- a/src/stratigraphy/util/find_description.py +++ b/src/stratigraphy/util/find_description.py @@ -89,7 +89,7 @@ def matching_blocks( else: break - if len(matched_lines): + if matched_lines: return [TextBlock(matched_lines)] else: return [] @@ -128,7 +128,7 @@ def get_description_blocks( distances.append(line2rect.y0 - line1rect.y0) threshold = None - if len(distances): + if distances: threshold = min(distances) * 1.15 # Create blocks separated by lines diff --git a/src/stratigraphy/util/interval.py b/src/stratigraphy/util/interval.py index ffb42c17..16e32389 100644 --- a/src/stratigraphy/util/interval.py +++ b/src/stratigraphy/util/interval.py @@ -21,6 +21,7 @@ def __init__(self, start: DepthColumnEntry | None, end: DepthColumnEntry | None) @property def start_value(self) -> float | None: + """Get the start value of the interval.""" if self.start: return self.start.value else: @@ -28,6 +29,7 @@ def start_value(self) -> float | None: @property def end_value(self) -> float | None: + """Get the end value of the interval.""" if self.end: return self.end.value else: @@ -36,14 +38,17 @@ def end_value(self) -> float | None: @property @abc.abstractmethod def line_anchor(self) -> fitz.Point: + """Get the line anchor of the interval.""" pass @property @abc.abstractmethod def background_rect(self) -> fitz.Rect | None: + """Get the background rectangle of the interval.""" pass def to_json(self): + """Convert the interval to a JSON serializable format.""" return { "start": self.start.to_json() if self.start else None, "end": self.end.to_json() if self.end else None, @@ -65,9 +70,6 @@ class BoundaryInterval(Interval): Boundary intervals are intervals that are defined by a start and an end point. """ - def __init__(self, start: DepthColumnEntry | None, end: DepthColumnEntry | None): - super().__init__(start, end) - @property def line_anchor(self) -> fitz.Point | None: if self.start and self.end: @@ -132,10 +134,10 @@ def matching_blocks(self, all_blocks: list[TextBlock], block_index: int) -> tupl if not can_end_exact_match: exact_match_blocks = [] - if len(exact_match_blocks): + if exact_match_blocks: exact.extend(exact_match_blocks) block_index = exact_match_index - 1 - elif len(exact): + elif exact: post.append(current_block) else: pre.append(current_block) @@ -168,6 +170,7 @@ def background_rect(self) -> fitz.Rect | None: def matching_blocks( self, all_lines: list[TextLine], line_index: int, next_interval: Interval | None ) -> list[TextBlock]: + """Adds lines to a block until the next layer identifier is reached.""" y1_threshold = None if next_interval: next_interval_start_rect = next_interval.start.rect @@ -181,7 +184,7 @@ def matching_blocks( else: break - if len(matched_lines): + if matched_lines: return [TextBlock(matched_lines)] else: return [] diff --git a/src/stratigraphy/util/layer_identifier_column.py b/src/stratigraphy/util/layer_identifier_column.py index 17a56a86..1aed65c8 100644 --- a/src/stratigraphy/util/layer_identifier_column.py +++ b/src/stratigraphy/util/layer_identifier_column.py @@ -115,7 +115,7 @@ def is_contained(self, rect: fitz.Rect) -> bool: and self.rect().y1 <= rect.y1 ) - def get_depth_interval(self, block: TextBlock) -> LayerDepthColumnEntry: + def get_depth_interval(self, block: TextBlock, page_number: int) -> LayerDepthColumnEntry: """Extract depth interval from a material description block. For borehole profiles in the Deriaz layout, the depth interval is usually found in the text description @@ -125,6 +125,7 @@ def get_depth_interval(self, block: TextBlock) -> LayerDepthColumnEntry: Args: block (TextBlock): The block to calculate the depth interval for. + page_number (int): The page number of the block. Returns: LayerDepthColumnEntry: The depth interval. @@ -132,7 +133,9 @@ def get_depth_interval(self, block: TextBlock) -> LayerDepthColumnEntry: depth_entries = [] for line in block.lines: try: - layer_depth_entry = extract_layer_depth_interval(line.text, line.rect, require_start_of_string=False) + layer_depth_entry = extract_layer_depth_interval( + line.text, line.rect, page_number, require_start_of_string=False + ) # require_start_of_string = False because the depth interval may not always start at the beginning # of the line e.g. "Remblais Heterogene: 0.00 - 0.5m" if layer_depth_entry: diff --git a/src/stratigraphy/util/line.py b/src/stratigraphy/util/line.py index b88d0215..05526413 100644 --- a/src/stratigraphy/util/line.py +++ b/src/stratigraphy/util/line.py @@ -17,9 +17,10 @@ class TextWord: to represent the location of the word in a PDF document. """ - def __init__(self, rect: fitz.Rect, text: str): + def __init__(self, rect: fitz.Rect, text: str, page: int): self.rect = rect self.text = text + self.page_number = page def __repr__(self) -> str: return f"TextWord({self.rect}, {self.text})" @@ -33,18 +34,27 @@ class TextLine: """ def __init__(self, words: list[TextWord]): + """Initialize the TextLine object. + + Args: + words (list[TextWord]): The words that make up the line. + page_number (int): The page number of the line. The first page has idx 1. + """ self.rect = fitz.Rect() for word in words: self.rect.include_rect(word.rect) self.words = words + self.page_number = words[0].page_number def is_description(self, material_description): + """Check if the line is a material description.""" return any( self.text.lower().find(word) > -1 for word in material_description["including_expressions"] ) and not any(self.text.lower().find(word) > -1 for word in material_description["excluding_expressions"]) @property def text(self) -> str: + """Get the text of the line.""" return " ".join([word.text for word in self.words]) def __repr__(self) -> str: @@ -62,13 +72,15 @@ def __repr__(self) -> str: """ def is_line_start(self, raw_lines_before: list[TextLine], raw_lines_after: list[TextLine]) -> bool: + """Check if the current line is the start of a new line.""" + def significant_overlap(line: TextLine) -> bool: return x_overlap_significant_largest(line.rect, self.rect, 0.5) matching_lines_before = [line for line in raw_lines_before if significant_overlap(line)] matching_lines_after = [line for line in raw_lines_after if significant_overlap(line)] - def count_points(lines: list[TextLine]) -> (int, int): + def count_points(lines: list[TextLine]) -> tuple[int, int]: exact_points = 0 indentation_points = 0 for other in lines: @@ -95,7 +107,9 @@ def count_points(lines: list[TextLine]) -> (int, int): return exact_points >= 3 or (exact_points >= 2 and indentation_points >= 1) def to_json(self): + """Convert the TextLine object to a JSON serializable dictionary.""" return { "text": self.text, "rect": [self.rect.x0, self.rect.y0, self.rect.x1, self.rect.y1], + "page": self.page_number, } diff --git a/src/stratigraphy/util/predictions.py b/src/stratigraphy/util/predictions.py index 04075950..a0ce6c22 100644 --- a/src/stratigraphy/util/predictions.py +++ b/src/stratigraphy/util/predictions.py @@ -36,31 +36,25 @@ class LayerPrediction: id: uuid.UUID = field(default_factory=uuid.uuid4) -@dataclass -class PagePredictions: - """A class to represent predictions for a single page.""" - - layers: list[LayerPrediction] - page_number: int - page_width: int - page_height: int - depths_materials_columns_pairs: list[dict] = None - - def __post__init__(self): - """Sort layers by their occurence on the page.""" - self.layers = sorted(self.layers, key=lambda layer: layer.material_description.rect.y0) - - class FilePredictions: """A class to represent predictions for a single file.""" - def __init__(self, pages: list[PagePredictions], file_name: str, language: str, metadata: BoreholeMetaData = None): - self.pages = pages + def __init__( + self, + layers: list[LayerPrediction], + file_name: str, + language: str, + metadata: BoreholeMetaData = None, + depths_materials_columns_pairs: list[dict] = None, + page_sizes: list[tuple[int, int]] = None, + ): + self.layers: list[LayerPrediction] = sorted(layers, key=lambda layer: layer.material_description.rect.y0) + self.depths_materials_columns_pairs: list[dict] = depths_materials_columns_pairs self.file_name = file_name self.language = language - self.layers = sum([page.layers for page in self.pages], []) self.metadata = metadata self.metadata_is_correct: dict = {} + self.page_sizes: list[tuple[int, int]] = page_sizes @staticmethod def create_from_json(predictions_for_file: dict, file_name: str): @@ -70,8 +64,14 @@ def create_from_json(predictions_for_file: dict, file_name: str): predictions_for_file (dict): The predictions for the file in json format. file_name (str): The name of the file. """ - page_predictions_class = [] + page_layer_predictions_list: list[LayerPrediction] = [] + pages_width_list: list[int] = [] + pages_height_list: list[int] = [] + depths_materials_columns_pairs_list: list[dict] = [] + for page_number, page_predictions in predictions_for_file.items(): + # TODO: Look into this as it seems to be a quite dirty fix here. + # As languages and metadata are not pages, they should be handled differently. if page_number == "language": file_language = page_predictions continue @@ -85,53 +85,53 @@ def create_from_json(predictions_for_file: dict, file_name: str): file_metadata = BoreholeMetaData(coordinates=coordinates) # TODO: Add additional metadata here. continue - page_layers = page_predictions["layers"] - layer_predictions = [] - for layer in page_layers: - material_prediction = _create_textblock_object(layer["material_description"]["lines"]) - if "depth_interval" in layer: - start = ( - DepthColumnEntry( - value=layer["depth_interval"]["start"]["value"], - rect=fitz.Rect(layer["depth_interval"]["start"]["rect"]), + elif page_number == "layers": + for layer in page_predictions: + material_prediction = _create_textblock_object(layer["material_description"]["lines"]) + if "depth_interval" in layer: + start = ( + DepthColumnEntry( + value=layer["depth_interval"]["start"]["value"], + rect=fitz.Rect(layer["depth_interval"]["start"]["rect"]), + page_number=layer["depth_interval"]["start"]["page"], + ) + if layer["depth_interval"]["start"] is not None + else None ) - if layer["depth_interval"]["start"] is not None - else None - ) - end = ( - DepthColumnEntry( - value=layer["depth_interval"]["end"]["value"], - rect=fitz.Rect(layer["depth_interval"]["end"]["rect"]), + end = ( + DepthColumnEntry( + value=layer["depth_interval"]["end"]["value"], + rect=fitz.Rect(layer["depth_interval"]["end"]["rect"]), + page_number=layer["depth_interval"]["end"]["page"], + ) + if layer["depth_interval"]["end"] is not None + else None ) - if layer["depth_interval"]["end"] is not None - else None - ) - depth_interval_prediction = BoundaryInterval(start=start, end=end) - layer_predictions.append( - LayerPrediction( + depth_interval_prediction = BoundaryInterval(start=start, end=end) + layer_predictions = LayerPrediction( material_description=material_prediction, depth_interval=depth_interval_prediction ) - ) - else: - layer_predictions.append( - LayerPrediction(material_description=material_prediction, depth_interval=None) - ) + else: + layer_predictions = LayerPrediction( + material_description=material_prediction, depth_interval=None + ) + + page_layer_predictions_list.append(layer_predictions) + if "depths_materials_column_pairs" in page_predictions: - page_predictions_class.append( - PagePredictions( - page_number=page_number, - page_width=page_predictions["page_width"], - page_height=page_predictions["page_height"], - layers=layer_predictions, - depths_materials_columns_pairs=page_predictions["depths_materials_column_pairs"], - ) - ) - else: - page_predictions_class.append(PagePredictions(page_number=page_number, layers=layer_predictions)) + depths_materials_columns_pairs_list.extend(page_predictions["depths_materials_column_pairs"]) + + pages_width_list.extend(predictions_for_file["page_width"]) + pages_height_list.extend(predictions_for_file["page_height"]) return FilePredictions( - pages=page_predictions_class, file_name=file_name, language=file_language, metadata=file_metadata + layers=page_layer_predictions_list, + file_name=file_name, + language=file_language, + metadata=file_metadata, + depths_materials_columns_pairs=depths_materials_columns_pairs_list, + page_sizes=list(zip(pages_width_list, pages_height_list, strict=False)), ) def convert_to_ground_truth(self): @@ -147,14 +147,13 @@ def convert_to_ground_truth(self): """ ground_truth = {self.file_name: {"metadata": self.metadata}} layers = [] - for page in self.pages: - for layer in page.layers: - material_description = layer.material_description.text - depth_interval = { - "start": layer.depth_interval.start.value if layer.depth_interval.start else None, - "end": layer.depth_interval.end.value if layer.depth_interval.end else None, - } - layers.append({"material_description": material_description, "depth_interval": depth_interval}) + for layer in self.layers: + material_description = layer.material_description.text + depth_interval = { + "start": layer.depth_interval.start.value if layer.depth_interval.start else None, + "end": layer.depth_interval.end.value if layer.depth_interval.end else None, + } + layers.append({"material_description": material_description, "depth_interval": depth_interval}) ground_truth[self.file_name]["layers"] = layers if self.metadata.coordinates is not None: ground_truth[self.file_name]["metadata"] = { @@ -166,6 +165,11 @@ def convert_to_ground_truth(self): return ground_truth def evaluate(self, ground_truth: dict): + """Evaluate the predictions against the ground truth. + + Args: + ground_truth (dict): The ground truth for the file. + """ self.evaluate_layers(ground_truth["layers"]) self.evaluate_metadata(ground_truth.get("metadata")) @@ -175,6 +179,7 @@ def evaluate_layers(self, ground_truth_layers: list): Args: ground_truth_layers (list): The ground truth layers for the file. """ + # TODO: Attribute 'unmatched_layers' defined outside __init__ method. This is not a good practice. self.unmatched_layers = ground_truth_layers.copy() for layer in self.layers: match, depth_interval_is_correct = self._find_matching_layer(layer) diff --git a/src/stratigraphy/util/textblock.py b/src/stratigraphy/util/textblock.py index 15da4aa9..98b82b62 100644 --- a/src/stratigraphy/util/textblock.py +++ b/src/stratigraphy/util/textblock.py @@ -3,6 +3,7 @@ from __future__ import annotations from dataclasses import dataclass +from typing import Any import fitz import numpy as np @@ -24,6 +25,7 @@ class MaterialDescription: rect: fitz.Rect def to_json(self): + """Convert the MaterialDescription object to a JSON serializable dictionary.""" return { "text": self.text, "rect": [self.rect.x0, self.rect.y0, self.rect.x1, self.rect.y1], @@ -32,7 +34,11 @@ def to_json(self): @dataclass class TextBlock: - """Class to represent a block of text in a PDF document.""" + """Class to represent a block of text in a PDF document. + + A TextBlock is a collection of Lines surrounded by Lines. + It is used to represent a block of text in a PDF document. + """ lines: list[TextLine] is_terminated_by_line: bool = False @@ -50,7 +56,23 @@ def __post_init__(self): else: self.rect = fitz.Rect() - def concatenate(self, other: TextBlock): + # go through all the lines and check if they are on the same page + page_number_set = set(line.page_number for line in self.lines) + assert len(page_number_set) < 2, "TextBlock spans multiple pages" + if page_number_set: + self.page_number = page_number_set.pop() + else: + self.page_number = None + + def concatenate(self, other: TextBlock) -> TextBlock: + """Concatenate two text blocks. + + Args: + other (TextBlock): The other text block. + + Returns: + TextBlock: The concatenated text block. + """ new_lines = [] new_lines.extend(self.lines) new_lines.extend(other.lines) @@ -59,6 +81,11 @@ def concatenate(self, other: TextBlock): # LGD-288: sometimes indentation is the only significant signal for deciding where we need to split the material # descriptions of adjacent layers. def split_based_on_indentation(self) -> list[TextBlock]: + """Split the text block based on indentation. + + Returns: + list[TextBlock]: The split text blocks. + """ if len(self.lines) == 0: return [] @@ -84,14 +111,14 @@ def split_based_on_indentation(self) -> list[TextBlock]: for line in self.lines: if line.rect.x0 < indentation_low: # start new block - if len(current_block_lines): + if current_block_lines: blocks.append(TextBlock(current_block_lines)) current_block_lines = [line] else: # continue block current_block_lines.append(line) - if len(current_block_lines): + if current_block_lines: blocks.append(TextBlock(current_block_lines)) if self.is_terminated_by_line: # if the block was terminated by a line, then the last block should be as well @@ -123,11 +150,13 @@ def _is_legend(self) -> bool: y0_coordinates.append(line.rect.y0) return number_horizontally_close > 1 or number_vertically_close > 2 - def to_json(self): + def to_json(self) -> dict[str, Any]: + """Convert the TextBlock object to a JSON serializable dictionary.""" return { "text": self.text, "rect": [self.rect.x0, self.rect.y0, self.rect.x1, self.rect.y1], "lines": [line.to_json() for line in self.lines], + "page": self.page_number, } diff --git a/tests/test_coordinate_extraction.py b/tests/test_coordinate_extraction.py index 4c305bd7..6fdc1253 100644 --- a/tests/test_coordinate_extraction.py +++ b/tests/test_coordinate_extraction.py @@ -14,16 +14,19 @@ def test_strLV95(): # noqa: D103 + """Test the string representation of an LV95Coordinate object.""" coord = LV95Coordinate(CoordinateEntry(2789456), CoordinateEntry(1123012), fitz.Rect(), page=1) assert str(coord) == "E: 2'789'456, N: 1'123'012" def test_to_jsonLV95(): # noqa: D103 + """Test the to_json method of an LV95Coordinate object.""" coord = LV95Coordinate(CoordinateEntry(2789456), CoordinateEntry(1123012), fitz.Rect(0, 1, 2, 3), page=1) assert coord.to_json() == {"E": 2789456, "N": 1123012, "rect": [0, 1, 2, 3], "page": 1} def test_swap_coordinates(): # noqa: D103 + """Test the swapping of coordinates in an LV95Coordinate object.""" north = CoordinateEntry(789456) east = CoordinateEntry(123012) coord = LV95Coordinate(north=north, east=east, rect=fitz.Rect(), page=1) @@ -32,11 +35,13 @@ def test_swap_coordinates(): # noqa: D103 def test_strLV03(): # noqa: D103 + """Test the string representation of an LV03Coordinate object.""" coord = LV03Coordinate(CoordinateEntry(789456), CoordinateEntry(123012), rect=fitz.Rect(), page=1) assert str(coord) == "E: 789'456, N: 123'012" def test_to_jsonLV03(): # noqa: D103 + """Test the to_json method of an LV03Coordinate object.""" coord = LV03Coordinate(CoordinateEntry(789456), CoordinateEntry(123012), fitz.Rect(0, 1, 2, 3), page=1) assert coord.to_json() == {"E": 789456, "N": 123012, "rect": [0, 1, 2, 3], "page": 1} @@ -46,6 +51,7 @@ def test_to_jsonLV03(): # noqa: D103 def test_CoordinateExtractor_extract_coordinates(): # noqa: D103 + """Test the extraction of coordinates from a PDF document.""" # Assuming there is a method called 'extract' in CoordinateExtractor class coordinates = extractor.extract_coordinates() # Check if the returned value is a list @@ -55,10 +61,12 @@ def test_CoordinateExtractor_extract_coordinates(): # noqa: D103 def _create_simple_lines(text_lines: list[str]) -> list[TextLine]: + """Create a list of TextLine objects from a list of text lines.""" + page_number = 1 return [ TextLine( [ - TextWord(fitz.Rect(word_index, line_index, word_index + 1, line_index + 1), word_text) + TextWord(fitz.Rect(word_index, line_index, word_index + 1, line_index + 1), word_text, page_number) for word_index, word_text in enumerate(text_line.split(" ")) ] ) @@ -67,6 +75,7 @@ def _create_simple_lines(text_lines: list[str]) -> list[TextLine]: def test_CoordinateExtractor_find_coordinate_key(): # noqa: D103 + """Test the extraction of the coordinate key from a list of text lines.""" lines = _create_simple_lines( ["This is a sample text", "followed by a key with a spelling mistake", "Ko0rdinate 615.790 / 157.500"] ) @@ -83,6 +92,7 @@ def test_CoordinateExtractor_find_coordinate_key(): # noqa: D103 def test_CoordinateExtractor_get_coordinates_with_x_y_labels(): # noqa: D103 + """Test the extraction of coordinates with explicit "X" and "Y" labels.""" lines = _create_simple_lines( [ "X = 2 600 000", @@ -109,6 +119,7 @@ def test_CoordinateExtractor_get_coordinates_with_x_y_labels(): # noqa: D103 def test_CoordinateExtractor_get_coordinates_near_key(): # noqa: D103 + """Test the extraction of coordinates near a key.""" lines = _create_simple_lines( [ "This is a sample text followed by a key with a spelling", @@ -158,6 +169,7 @@ def test_CoordinateExtractor_get_coordinates_near_key(): # noqa: D103 ], ) def test_CoordinateExtractor_get_coordinates_from_lines(text, expected): # noqa: D103 + """Test the extraction of coordinates from a list of text lines.""" lines = _create_simple_lines([text]) coordinates = extractor.get_coordinates_from_lines(lines, page=1) expected_east, expected_north = expected @@ -167,6 +179,7 @@ def test_CoordinateExtractor_get_coordinates_from_lines(text, expected): # noqa def test_CoordinateExtractor_get_coordinates_from_lines_rect(): # noqa: D103 + """Test the extraction of coordinates from a list of text lines with different rect formats.""" lines = _create_simple_lines(["start", "2600000 1200000", "end"]) coordinates = extractor.get_coordinates_from_lines(lines, page=1) assert coordinates[0].rect == lines[1].rect diff --git a/tests/test_depthcolumn.py b/tests/test_depthcolumn.py index adeb414e..66d53fb8 100644 --- a/tests/test_depthcolumn.py +++ b/tests/test_depthcolumn.py @@ -6,25 +6,27 @@ def test_boundarydepthcolumn_isarithmeticprogression(): # noqa: D103 + """Test the is_arithmetic_progression method of the BoundaryDepthColumn class.""" + page_number = 1 column = BoundaryDepthColumn( [ - DepthColumnEntry(fitz.Rect(), value=1), - DepthColumnEntry(fitz.Rect(), value=2), - DepthColumnEntry(fitz.Rect(), value=3), - DepthColumnEntry(fitz.Rect(), value=4), - DepthColumnEntry(fitz.Rect(), value=5), + DepthColumnEntry(fitz.Rect(), value=1, page_number=page_number), + DepthColumnEntry(fitz.Rect(), value=2, page_number=page_number), + DepthColumnEntry(fitz.Rect(), value=3, page_number=page_number), + DepthColumnEntry(fitz.Rect(), value=4, page_number=page_number), + DepthColumnEntry(fitz.Rect(), value=5, page_number=page_number), ] ) assert column.is_arithmetic_progression(), "The column should be recognized as arithmetic progression" column = BoundaryDepthColumn( [ - DepthColumnEntry(fitz.Rect(), value=17.6), - DepthColumnEntry(fitz.Rect(), value=18.15), - DepthColumnEntry(fitz.Rect(), value=18.65), - DepthColumnEntry(fitz.Rect(), value=19.3), - DepthColumnEntry(fitz.Rect(), value=19.9), - DepthColumnEntry(fitz.Rect(), value=20.5), + DepthColumnEntry(fitz.Rect(), value=17.6, page_number=page_number), + DepthColumnEntry(fitz.Rect(), value=18.15, page_number=page_number), + DepthColumnEntry(fitz.Rect(), value=18.65, page_number=page_number), + DepthColumnEntry(fitz.Rect(), value=19.3, page_number=page_number), + DepthColumnEntry(fitz.Rect(), value=19.9, page_number=page_number), + DepthColumnEntry(fitz.Rect(), value=20.5, page_number=page_number), ] ) assert not column.is_arithmetic_progression(), "The column should not be recognized as arithmetic progression" diff --git a/tests/test_find_depth_columns.py b/tests/test_find_depth_columns.py index 4ff107bc..9074d6a7 100644 --- a/tests/test_find_depth_columns.py +++ b/tests/test_find_depth_columns.py @@ -8,13 +8,15 @@ def test_depth_column_entries(): # noqa: D103 + """Test the depth_column_entries function.""" + page_number = 1 all_words = [ - TextWord(fitz.Rect(0, 0, 5, 1), "10.00m"), - TextWord(fitz.Rect(0, 2, 5, 3), "20.0m"), - TextWord(fitz.Rect(0, 4, 5, 5), "30.0m"), - TextWord(fitz.Rect(0, 6, 5, 7), "40.0m"), + TextWord(fitz.Rect(0, 0, 5, 1), "10.00m", page_number), + TextWord(fitz.Rect(0, 2, 5, 3), "20.0m", page_number), + TextWord(fitz.Rect(0, 4, 5, 5), "30.0m", page_number), + TextWord(fitz.Rect(0, 6, 5, 7), "40.0m", page_number), ] - entries = depth_column_entries(all_words, include_splits=False) + entries = depth_column_entries(all_words, page_number, include_splits=False) assert len(entries) == 4, "There should be 4 entries" assert pytest.approx(entries[0].value) == 10.0, "The first entry should have a value of 10.0" assert pytest.approx(entries[1].value) == 20.0, "The second entry should have a value of 20.0" @@ -23,11 +25,13 @@ def test_depth_column_entries(): # noqa: D103 def test_depth_column_entries_with_splits(): # noqa: D103 + """Test the depth_column_entries function with include_splits=True.""" + page_number = 1 all_words = [ - TextLine([TextWord(fitz.Rect(0, 0, 10, 1), "10.00-20.0m")]), - TextLine([TextWord(fitz.Rect(0, 2, 10, 3), "30.0-40.0m")]), + TextLine([TextWord(fitz.Rect(0, 0, 10, 1), "10.00-20.0m", page_number)]), + TextLine([TextWord(fitz.Rect(0, 2, 10, 3), "30.0-40.0m", page_number)]), ] - entries = depth_column_entries(all_words, include_splits=True) + entries = depth_column_entries(all_words, page_number, include_splits=True) assert len(entries) == 4, "There should be 4 entries" assert entries[0].value == 10.0, "The first entry should have a value of 10.0" assert entries[1].value == 20.0, "The second entry should have a value of 20.0" @@ -36,13 +40,15 @@ def test_depth_column_entries_with_splits(): # noqa: D103 def test_depth_column_entries_with_leading_character(): # noqa: D103 + """Test the depth_column_entries function with a leading character.""" + page_number = 1 all_words = [ - TextWord(fitz.Rect(0, 0, 5, 1), "0.00m"), - TextWord(fitz.Rect(0, 2, 5, 3), ".2m"), # this is a test for an ocr error from '-2m' to '.2m' - TextWord(fitz.Rect(0, 4, 5, 5), "-3.0m"), - TextWord(fitz.Rect(0, 6, 5, 7), ".4.2m"), + TextWord(fitz.Rect(0, 0, 5, 1), "0.00m", page_number), + TextWord(fitz.Rect(0, 2, 5, 3), ".2m", page_number), # this is a test for an ocr error from '-2m' to '.2m' + TextWord(fitz.Rect(0, 4, 5, 5), "-3.0m", page_number), + TextWord(fitz.Rect(0, 6, 5, 7), ".4.2m", page_number), ] - entries = depth_column_entries(all_words, include_splits=True) + entries = depth_column_entries(all_words, page_number, include_splits=True) assert len(entries) == 4, "There should be 4 entries" assert entries[0].value == 0.0, "The first entry should have a value of 0" assert entries[1].value == 2.0, "The second entry should have a value of 2.0" @@ -51,45 +57,51 @@ def test_depth_column_entries_with_leading_character(): # noqa: D103 all_words_find_depth_column = [ - TextWord(fitz.Rect(0, 0, 5, 1), "10.00m"), - TextWord(fitz.Rect(20, 0, 30, 1), "Kies, Torf und Sand"), - TextWord(fitz.Rect(20, 2, 30, 3), "Kies, verwittert."), - TextWord(fitz.Rect(0, 2, 5, 3), "20.0m"), - TextWord(fitz.Rect(0, 4, 5, 5), "30.0m"), - TextWord(fitz.Rect(0, 6, 5, 7), "40.0m"), - TextWord(fitz.Rect(0, 8, 5, 9), "50.0m"), + TextWord(fitz.Rect(0, 0, 5, 1), "10.00m", 1), + TextWord(fitz.Rect(20, 0, 30, 1), "Kies, Torf und Sand", 1), + TextWord(fitz.Rect(20, 2, 30, 3), "Kies, verwittert.", 1), + TextWord(fitz.Rect(0, 2, 5, 3), "20.0m", 1), + TextWord(fitz.Rect(0, 4, 5, 5), "30.0m", 1), + TextWord(fitz.Rect(0, 6, 5, 7), "40.0m", 1), + TextWord(fitz.Rect(0, 8, 5, 9), "50.0m", 1), ] def test_find_depth_columns_arithmetic_progression(): # noqa: D103 + """Test the find_depth_columns function with an arithmetic progression.""" entries = [ - DepthColumnEntry(fitz.Rect(0, 0, 5, 1), 10.0), - DepthColumnEntry(fitz.Rect(0, 2, 5, 3), 20.0), - DepthColumnEntry(fitz.Rect(0, 4, 5, 5), 30.0), - DepthColumnEntry(fitz.Rect(0, 6, 5, 7), 40.0), - DepthColumnEntry(fitz.Rect(0, 8, 5, 9), 50.0), + DepthColumnEntry(fitz.Rect(0, 0, 5, 1), 10.0, 1), + DepthColumnEntry(fitz.Rect(0, 2, 5, 3), 20.0, 1), + DepthColumnEntry(fitz.Rect(0, 4, 5, 5), 30.0, 1), + DepthColumnEntry(fitz.Rect(0, 6, 5, 7), 40.0, 1), + DepthColumnEntry(fitz.Rect(0, 8, 5, 9), 50.0, 1), ] + page_number = 1 columns = find_depth_columns( entries, all_words_find_depth_column, + page_number, depth_column_params={"noise_count_threshold": 1.25, "noise_count_offset": 0}, ) assert len(columns) == 0, "There should be 0 columns as the above is a perfect arithmetic progression" def test_find_depth_columns(): # noqa: D103 + """Test the find_depth_columns function.""" + page_number = 1 entries = [ - DepthColumnEntry(fitz.Rect(0, 0, 5, 1), 12.0), - DepthColumnEntry(fitz.Rect(0, 2, 5, 3), 20.0), - DepthColumnEntry(fitz.Rect(0, 4, 5, 5), 34.0), - DepthColumnEntry(fitz.Rect(0, 6, 5, 7), 40.0), - DepthColumnEntry(fitz.Rect(0, 8, 5, 9), 50.0), + DepthColumnEntry(fitz.Rect(0, 0, 5, 1), 12.0, page_number), + DepthColumnEntry(fitz.Rect(0, 2, 5, 3), 20.0, page_number), + DepthColumnEntry(fitz.Rect(0, 4, 5, 5), 34.0, page_number), + DepthColumnEntry(fitz.Rect(0, 6, 5, 7), 40.0, page_number), + DepthColumnEntry(fitz.Rect(0, 8, 5, 9), 50.0, page_number), ] columns = find_depth_columns( entries, all_words_find_depth_column, + page_number, depth_column_params={"noise_count_threshold": 1.25, "noise_count_offset": 0}, ) assert len(columns) == 1, "There should be 1 column" @@ -102,22 +114,25 @@ def test_find_depth_columns(): # noqa: D103 def test_two_columns_find_depth_columns(): # noqa: D103 + """Test the find_depth_columns function with two columns.""" + page_number = 1 entries = [ # first depth column - DepthColumnEntry(fitz.Rect(0, 0, 5, 1), 12.0), - DepthColumnEntry(fitz.Rect(0, 2, 5, 3), 20.0), - DepthColumnEntry(fitz.Rect(0, 4, 5, 5), 34.0), - DepthColumnEntry(fitz.Rect(0, 6, 5, 7), 40.0), - DepthColumnEntry(fitz.Rect(0, 8, 5, 9), 50.0), - DepthColumnEntry(fitz.Rect(20, 0, 25, 1), 12.0), # second depth column - DepthColumnEntry(fitz.Rect(20, 2, 25, 3), 20.0), - DepthColumnEntry(fitz.Rect(20, 4, 25, 5), 34.0), - DepthColumnEntry(fitz.Rect(20, 6, 25, 7), 40.0), - DepthColumnEntry(fitz.Rect(20, 8, 25, 9), 50.0), - DepthColumnEntry(fitz.Rect(20, 10, 25, 11), 61.0), + DepthColumnEntry(fitz.Rect(0, 0, 5, 1), 12.0, page_number), + DepthColumnEntry(fitz.Rect(0, 2, 5, 3), 20.0, page_number), + DepthColumnEntry(fitz.Rect(0, 4, 5, 5), 34.0, page_number), + DepthColumnEntry(fitz.Rect(0, 6, 5, 7), 40.0, page_number), + DepthColumnEntry(fitz.Rect(0, 8, 5, 9), 50.0, page_number), + DepthColumnEntry(fitz.Rect(20, 0, 25, 1), 12.0, page_number), # second depth column + DepthColumnEntry(fitz.Rect(20, 2, 25, 3), 20.0, page_number), + DepthColumnEntry(fitz.Rect(20, 4, 25, 5), 34.0, page_number), + DepthColumnEntry(fitz.Rect(20, 6, 25, 7), 40.0, page_number), + DepthColumnEntry(fitz.Rect(20, 8, 25, 9), 50.0, page_number), + DepthColumnEntry(fitz.Rect(20, 10, 25, 11), 61.0, page_number), ] columns = find_depth_columns( entries, all_words_find_depth_column, + page_number, depth_column_params={"noise_count_threshold": 1.25, "noise_count_offset": 0}, ) assert len(columns) == 2, "There should be 2 columns" @@ -126,28 +141,30 @@ def test_two_columns_find_depth_columns(): # noqa: D103 all_words_find_layer_depth_column = [ - TextWord(fitz.Rect(0, 0, 5, 1), "12.00-20.0m"), - TextWord(fitz.Rect(20, 0, 30, 1), "Kies, Torf und Sand"), - TextWord(fitz.Rect(20, 2, 30, 3), "Kies, verwittert."), - TextWord(fitz.Rect(0, 2, 5, 3), "20.0-34.0m"), - TextWord(fitz.Rect(0, 4, 5, 5), "34.0 - 40.0m"), - TextWord(fitz.Rect(0, 6, 5, 7), "40.0-50m"), - TextWord(fitz.Rect(0, 8, 5, 9), "50.0-60m"), + TextWord(fitz.Rect(0, 0, 5, 1), "12.00-20.0m", 1), + TextWord(fitz.Rect(20, 0, 30, 1), "Kies, Torf und Sand", 1), + TextWord(fitz.Rect(20, 2, 30, 3), "Kies, verwittert.", 1), + TextWord(fitz.Rect(0, 2, 5, 3), "20.0-34.0m", 1), + TextWord(fitz.Rect(0, 4, 5, 5), "34.0 - 40.0m", 1), + TextWord(fitz.Rect(0, 6, 5, 7), "40.0-50m", 1), + TextWord(fitz.Rect(0, 8, 5, 9), "50.0-60m", 1), ] def test_find_layer_depth_columns(): # noqa: D103 + """Test the find_layer_depth_columns function.""" + page_number = 1 entries = [ - DepthColumnEntry(fitz.Rect(0, 0, 5, 1), 12.0), # layer 12.0-20.0m - DepthColumnEntry(fitz.Rect(0, 0, 5, 1), 20.0), - DepthColumnEntry(fitz.Rect(0, 2, 5, 3), 20.0), # layer 20.0-34.0m - DepthColumnEntry(fitz.Rect(0, 2, 5, 3), 34.0), - DepthColumnEntry(fitz.Rect(0, 4, 5, 5), 34.0), # layer 34.0-40.0m - DepthColumnEntry(fitz.Rect(0, 4, 5, 5), 40.0), - DepthColumnEntry(fitz.Rect(0, 6, 5, 7), 40.0), # layer 40.0-50.0m - DepthColumnEntry(fitz.Rect(0, 6, 5, 7), 50.0), - DepthColumnEntry(fitz.Rect(0, 8, 5, 9), 50.0), # layer 50.0-60.0m - DepthColumnEntry(fitz.Rect(0, 8, 5, 9), 60.0), + DepthColumnEntry(fitz.Rect(0, 0, 5, 1), 12.0, page_number), # layer 12.0-20.0m + DepthColumnEntry(fitz.Rect(0, 0, 5, 1), 20.0, page_number), + DepthColumnEntry(fitz.Rect(0, 2, 5, 3), 20.0, page_number), # layer 20.0-34.0m + DepthColumnEntry(fitz.Rect(0, 2, 5, 3), 34.0, page_number), + DepthColumnEntry(fitz.Rect(0, 4, 5, 5), 34.0, page_number), # layer 34.0-40.0m + DepthColumnEntry(fitz.Rect(0, 4, 5, 5), 40.0, page_number), + DepthColumnEntry(fitz.Rect(0, 6, 5, 7), 40.0, page_number), # layer 40.0-50.0m + DepthColumnEntry(fitz.Rect(0, 6, 5, 7), 50.0, page_number), + DepthColumnEntry(fitz.Rect(0, 8, 5, 9), 50.0, page_number), # layer 50.0-60.0m + DepthColumnEntry(fitz.Rect(0, 8, 5, 9), 60.0, page_number), ] columns = find_layer_depth_columns(entries, all_words_find_depth_column) @@ -166,28 +183,30 @@ def test_find_layer_depth_columns(): # noqa: D103 def test_two_columns_find_layer_depth_columns(): # noqa: D103 + """Test the find_layer_depth_columns function with two columns.""" + page_number = 1 entries = [ # first depth column - DepthColumnEntry(fitz.Rect(0, 0, 5, 1), 12.0), # layer 12.0-20.0m - DepthColumnEntry(fitz.Rect(0, 0, 5, 1), 20.0), - DepthColumnEntry(fitz.Rect(0, 2, 5, 3), 20.0), # layer 20.0-34.0m - DepthColumnEntry(fitz.Rect(0, 2, 5, 3), 34.0), - DepthColumnEntry(fitz.Rect(0, 4, 5, 5), 34.0), # layer 34.0-40.0m - DepthColumnEntry(fitz.Rect(0, 4, 5, 5), 40.0), - DepthColumnEntry(fitz.Rect(0, 6, 5, 7), 40.0), # layer 40.0-50.0m - DepthColumnEntry(fitz.Rect(0, 6, 5, 7), 50.0), - DepthColumnEntry(fitz.Rect(0, 8, 5, 9), 50.0), # layer 50.0-60.0m - DepthColumnEntry(fitz.Rect(0, 8, 5, 9), 60.0), + DepthColumnEntry(fitz.Rect(0, 0, 5, 1), 12.0, page_number), # layer 12.0-20.0m + DepthColumnEntry(fitz.Rect(0, 0, 5, 1), 20.0, page_number), + DepthColumnEntry(fitz.Rect(0, 2, 5, 3), 20.0, page_number), # layer 20.0-34.0m + DepthColumnEntry(fitz.Rect(0, 2, 5, 3), 34.0, page_number), + DepthColumnEntry(fitz.Rect(0, 4, 5, 5), 34.0, page_number), # layer 34.0-40.0m + DepthColumnEntry(fitz.Rect(0, 4, 5, 5), 40.0, page_number), + DepthColumnEntry(fitz.Rect(0, 6, 5, 7), 40.0, page_number), # layer 40.0-50.0m + DepthColumnEntry(fitz.Rect(0, 6, 5, 7), 50.0, page_number), + DepthColumnEntry(fitz.Rect(0, 8, 5, 9), 50.0, page_number), # layer 50.0-60.0m + DepthColumnEntry(fitz.Rect(0, 8, 5, 9), 60.0, page_number), # second depth column - DepthColumnEntry(fitz.Rect(20, 0, 25, 1), 12.0), # layer 12.0-20.0m - DepthColumnEntry(fitz.Rect(20, 0, 25, 1), 20.0), - DepthColumnEntry(fitz.Rect(20, 2, 25, 3), 20.0), # layer 20.0-34.0m - DepthColumnEntry(fitz.Rect(20, 2, 25, 3), 34.0), - DepthColumnEntry(fitz.Rect(20, 4, 25, 5), 34.0), # layer 34.0-40.0m - DepthColumnEntry(fitz.Rect(20, 4, 25, 5), 40.0), - DepthColumnEntry(fitz.Rect(20, 6, 25, 7), 40.0), # layer 40.0-50.0m - DepthColumnEntry(fitz.Rect(20, 6, 25, 7), 50.0), - DepthColumnEntry(fitz.Rect(20, 8, 25, 9), 50.0), # layer 50.0-60.0m - DepthColumnEntry(fitz.Rect(20, 8, 25, 9), 60.0), + DepthColumnEntry(fitz.Rect(20, 0, 25, 1), 12.0, page_number), # layer 12.0-20.0m + DepthColumnEntry(fitz.Rect(20, 0, 25, 1), 20.0, page_number), + DepthColumnEntry(fitz.Rect(20, 2, 25, 3), 20.0, page_number), # layer 20.0-34.0m + DepthColumnEntry(fitz.Rect(20, 2, 25, 3), 34.0, page_number), + DepthColumnEntry(fitz.Rect(20, 4, 25, 5), 34.0, page_number), # layer 34.0-40.0m + DepthColumnEntry(fitz.Rect(20, 4, 25, 5), 40.0, page_number), + DepthColumnEntry(fitz.Rect(20, 6, 25, 7), 40.0, page_number), # layer 40.0-50.0m + DepthColumnEntry(fitz.Rect(20, 6, 25, 7), 50.0, page_number), + DepthColumnEntry(fitz.Rect(20, 8, 25, 9), 50.0, page_number), # layer 50.0-60.0m + DepthColumnEntry(fitz.Rect(20, 8, 25, 9), 60.0, page_number), ] columns = find_layer_depth_columns(entries, all_words_find_layer_depth_column) assert len(columns) == 2, "There should be 2 columns" diff --git a/tests/test_find_descripton.py b/tests/test_find_descripton.py index 80dd1ab5..721c10a9 100644 --- a/tests/test_find_descripton.py +++ b/tests/test_find_descripton.py @@ -5,9 +5,12 @@ from stratigraphy.util.find_description import get_description_blocks from stratigraphy.util.line import TextLine, TextWord -textline1 = TextLine([TextWord(fitz.Rect([0, 0, 10, 10]), "Hello")]) -textline2 = TextLine([TextWord(fitz.Rect([0, 15, 10, 25]), "World")]) -textline3 = TextLine([TextWord(fitz.Rect([0, 37, 10, 47]), "Hey")]) # larger vertical distance to previous blocks +page_number = 1 +textline1 = TextLine([TextWord(fitz.Rect([0, 0, 10, 10]), "Hello", page_number)]) +textline2 = TextLine([TextWord(fitz.Rect([0, 15, 10, 25]), "World", page_number)]) +textline3 = TextLine( + [TextWord(fitz.Rect([0, 37, 10, 47]), "Hey", page_number)] +) # larger vertical distance to previous blocks geometric_lines = [Line(Point(500, 1), Point(505, 1))] # line does not cut the blocks geometric_lines_cut = [Line(Point(-5, 12), Point(10, 12))] # line cuts the first and second line @@ -24,6 +27,7 @@ def test_get_description_blocks(): # noqa: D103 + """Test the grouping of description lines into blocks.""" target_layer_count = 2 # expect two blocks. But the line do not cut the blocks blocks = get_description_blocks( description_lines, @@ -40,6 +44,7 @@ def test_get_description_blocks(): # noqa: D103 def test_get_description_blocks_separated_by_line(): # noqa: D103 + """Test the splitting of blocks based on the presence of a line.""" target_layer_count = 1 # should not trigger splitting the blocks with vertical distances blocks = get_description_blocks( description_lines, @@ -56,6 +61,7 @@ def test_get_description_blocks_separated_by_line(): # noqa: D103 def test_get_description_blocks_separated_by_lefthandside_line(): # noqa: D103 + """Test the splitting of blocks based on the presence of a lefthandside line.""" target_layer_count = 1 # only one block, but the lefthand line still cuts them into two blocks geometric_lines_all = geometric_lines_cut + geometric_lines_lefthandside blocks = get_description_blocks( diff --git a/tests/test_interval.py b/tests/test_interval.py index f3e9755e..d88bbee9 100644 --- a/tests/test_interval.py +++ b/tests/test_interval.py @@ -6,8 +6,10 @@ def test_line_anchor(): # noqa: D103 - start = DepthColumnEntry(fitz.Rect(0, 0, 1, 1), 5) - end = DepthColumnEntry(fitz.Rect(0, 2, 1, 3), 10) + """Test the line anchor property of the BoundaryInterval and LayerInterval classes.""" + page_number = 1 + start = DepthColumnEntry(fitz.Rect(0, 0, 1, 1), 5, page_number) + end = DepthColumnEntry(fitz.Rect(0, 2, 1, 3), 10, page_number) boundary_interval = BoundaryInterval(start, end) assert boundary_interval.line_anchor == fitz.Point(1, 1.5), ( "The 'line anchor' for a BoundaryInterval should be halfway between the bottom-right of the start depth and " @@ -24,8 +26,8 @@ def test_line_anchor(): # noqa: D103 1, 2 ), "The 'line anchor' for a BoundaryInterval without start should be the top-right of the end depth." - start = DepthColumnEntry(fitz.Rect(0, 0, 1, 1), 5) - end = DepthColumnEntry(fitz.Rect(2, 0, 3, 1), 10) + start = DepthColumnEntry(fitz.Rect(0, 0, 1, 1), 5, page_number) + end = DepthColumnEntry(fitz.Rect(2, 0, 3, 1), 10, page_number) entry = LayerDepthColumnEntry(start, end) layer_interval = LayerInterval(entry) assert layer_interval.line_anchor == fitz.Point( @@ -34,8 +36,10 @@ def test_line_anchor(): # noqa: D103 def test_background_rect(): # noqa: D103 - start = DepthColumnEntry(fitz.Rect(0, 0, 1, 1), 5) - end = DepthColumnEntry(fitz.Rect(0, 2, 1, 3), 10) + """Test the background_rect property of the BoundaryInterval class.""" + page_number = 1 + start = DepthColumnEntry(fitz.Rect(0, 0, 1, 1), 5, page_number) + end = DepthColumnEntry(fitz.Rect(0, 2, 1, 3), 10, page_number) boundary_interval = BoundaryInterval(start, end) assert boundary_interval.background_rect == fitz.Rect( start.rect.x0, start.rect.y1, start.rect.x1, end.rect.y0 diff --git a/tests/test_textblock.py b/tests/test_textblock.py index 4bf8ba5e..69536ee8 100644 --- a/tests/test_textblock.py +++ b/tests/test_textblock.py @@ -6,19 +6,23 @@ def test_concatenate(): # noqa: D103 - tb1 = TextBlock([TextLine([TextWord(fitz.Rect([0, 0, 5, 1]), "Hello")])]) - tb2 = TextBlock([TextLine([TextWord(fitz.Rect([0, 1, 5, 2]), "World")])]) + """Test the concatenation of two TextBlocks.""" + page_number = 1 + tb1 = TextBlock([TextLine([TextWord(fitz.Rect([0, 0, 5, 1]), "Hello", page_number)])]) + tb2 = TextBlock([TextLine([TextWord(fitz.Rect([0, 1, 5, 2]), "World", page_number)])]) tb3 = tb1.concatenate(tb2) assert len(tb3.lines) == 2, "There should be 2 lines in the concatenated TextBlock" assert tb3.text == "Hello World", "The text should be 'Hello World'" def test_split_based_on_indentation(): # noqa: D103 + """Test the splitting of a TextBlock based on indentation.""" + page_number = 1 tb = TextBlock( [ - TextLine([TextWord(fitz.Rect(0, 0, 20, 5), "Hello")]), - TextLine([TextWord(fitz.Rect(0, 8, 20, 13), "Hello")]), - TextLine([TextWord(fitz.Rect(3, 16, 22, 21), "World")]), # Indented line + TextLine([TextWord(fitz.Rect(0, 0, 20, 5), "Hello", page_number)]), + TextLine([TextWord(fitz.Rect(0, 8, 20, 13), "Hello", page_number)]), + TextLine([TextWord(fitz.Rect(3, 16, 22, 21), "World", page_number)]), # Indented line ] ) blocks = tb.split_based_on_indentation() @@ -26,17 +30,26 @@ def test_split_based_on_indentation(): # noqa: D103 def test_post_init(): # noqa: D103 - tb = TextBlock([TextLine([TextWord(fitz.Rect(0, 0, 5, 1), "Hello")])]) + """Test the post-init method of the TextBlock class.""" + page_number = 1 + tb = TextBlock([TextLine([TextWord(fitz.Rect(0, 0, 5, 1), "Hello", page_number)])]) assert tb.line_count == 1, "The line count should be 1" assert tb.text == "Hello", "The text should be 'Hello'" assert tb.rect == fitz.Rect(0, 0, 5, 1), "The rect should be the same as the line's rect" def test_post_init_longer_text(): # noqa: D103 + """Test the post-init method of the TextBlock class with multiple lines.""" + page_number = 1 tb = TextBlock( [ - TextLine([TextWord(fitz.Rect(0, 0, 5, 1), "Hello")]), - TextLine([TextWord(fitz.Rect(0, 1, 5, 2), "It's"), TextWord(fitz.Rect(5, 1, 10, 2), "me")]), + TextLine([TextWord(fitz.Rect(0, 0, 5, 1), "Hello", page_number)]), + TextLine( + [ + TextWord(fitz.Rect(0, 1, 5, 2), "It's", page_number), + TextWord(fitz.Rect(5, 1, 10, 2), "me", page_number), + ] + ), ] ) assert tb.line_count == 2, "The line count should be 2" @@ -45,8 +58,10 @@ def test_post_init_longer_text(): # noqa: D103 def test_block_distance(): # noqa: D103 - block_1 = TextBlock([TextLine([TextWord(fitz.Rect(0, 0, 5, 1), "Hello")])]) - block_2 = TextBlock([TextLine([TextWord(fitz.Rect(0, 2, 5, 3), "Hello")])]) + """Test the calculation of the distance between two TextBlocks.""" + page_number = 1 + block_1 = TextBlock([TextLine([TextWord(fitz.Rect(0, 0, 5, 1), "Hello", page_number)])]) + block_2 = TextBlock([TextLine([TextWord(fitz.Rect(0, 2, 5, 3), "Hello", page_number)])]) assert ( block_distance(block_1, block_2) == 1 ), "The distance should be measured from the bottom of the first block to the top of the second block."