From 29fa7b9e6451ff0ee9da5b3afdca140d0bfdb5f9 Mon Sep 17 00:00:00 2001
From: dcleres <davidcleres@gmail.com>
Date: Tue, 30 Jul 2024 14:11:32 +0200
Subject: [PATCH] Close LGVISIUM-52: Moved the page information within the JSON

---
 .vscode/launch.json                           |  21 +++
 .vscode/settings.json                         |   5 +
 src/stratigraphy/benchmark/score.py           |   3 +-
 src/stratigraphy/extract.py                   |  35 ++--
 src/stratigraphy/main.py                      |  35 ++--
 .../util/boundarydepthcolumnvalidator.py      |  15 +-
 .../util/coordinate_extraction.py             |   5 +-
 src/stratigraphy/util/depthcolumn.py          |   4 +-
 src/stratigraphy/util/depthcolumnentry.py     |  34 +++-
 .../util/description_block_splitter.py        |   5 +-
 src/stratigraphy/util/draw.py                 |  38 ++--
 src/stratigraphy/util/extract_text.py         |   7 +-
 src/stratigraphy/util/find_depth_columns.py   |  19 +-
 src/stratigraphy/util/find_description.py     |   4 +-
 src/stratigraphy/util/interval.py             |  15 +-
 .../util/layer_identifier_column.py           |   7 +-
 src/stratigraphy/util/line.py                 |  18 +-
 src/stratigraphy/util/predictions.py          | 137 +++++++-------
 src/stratigraphy/util/textblock.py            |  39 +++-
 tests/test_coordinate_extraction.py           |  15 +-
 tests/test_depthcolumn.py                     |  24 +--
 tests/test_find_depth_columns.py              | 175 ++++++++++--------
 tests/test_find_descripton.py                 |  12 +-
 tests/test_interval.py                        |  16 +-
 tests/test_textblock.py                       |  35 +++-
 25 files changed, 455 insertions(+), 268 deletions(-)
 create mode 100644 .vscode/launch.json
 create mode 100644 .vscode/settings.json

diff --git a/.vscode/launch.json b/.vscode/launch.json
new file mode 100644
index 00000000..81373420
--- /dev/null
+++ b/.vscode/launch.json
@@ -0,0 +1,21 @@
+{
+    // Use IntelliSense to learn about possible attributes.
+    // Hover to view descriptions of existing attributes.
+    // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
+    "version": "0.2.0",
+    "configurations": [
+        {
+            "name": "Python: Run boreholes-extract-all",
+            "type": "debugpy",
+            "request": "launch",
+            "module": "src.stratigraphy.main",
+            "args": [
+                "-i", "data/zurich",
+                "-g", "data/zurich_ground_truth.json"
+            ],
+            "cwd": "${workspaceFolder}",
+            "justMyCode": true,
+            "python": "./swisstopo/bin/python3"
+        }
+    ]
+}
diff --git a/.vscode/settings.json b/.vscode/settings.json
new file mode 100644
index 00000000..163a9c49
--- /dev/null
+++ b/.vscode/settings.json
@@ -0,0 +1,5 @@
+{
+    "cSpell.words": [
+        "fitz"
+    ]
+}
\ No newline at end of file
diff --git a/src/stratigraphy/benchmark/score.py b/src/stratigraphy/benchmark/score.py
index 33addf9b..00d17a47 100644
--- a/src/stratigraphy/benchmark/score.py
+++ b/src/stratigraphy/benchmark/score.py
@@ -252,7 +252,8 @@ def evaluate_layer_extraction(predictions: dict, number_of_truth_values: dict) -
 
 
 def create_predictions_objects(
-    predictions: dict, ground_truth_path: Path | None
+    predictions: dict,
+    ground_truth_path: Path | None,
 ) -> tuple[dict[FilePredictions], dict]:
     """Create predictions objects from the predictions and evaluate them against the ground truth.
 
diff --git a/src/stratigraphy/extract.py b/src/stratigraphy/extract.py
index 5753f3b4..00240f81 100644
--- a/src/stratigraphy/extract.py
+++ b/src/stratigraphy/extract.py
@@ -30,7 +30,9 @@
 logger = logging.getLogger(__name__)
 
 
-def process_page(lines: list[TextLine], geometric_lines, language: str, **params: dict) -> list[dict]:
+def process_page(
+    lines: list[TextLine], geometric_lines, language: str, page_number: int, **params: dict
+) -> list[dict]:
     """Process a single page of a pdf.
 
     Finds all descriptions and depth intervals on the page and matches them.
@@ -39,6 +41,7 @@ def process_page(lines: list[TextLine], geometric_lines, language: str, **params
         lines (list[TextLine]): all the text lines on the page.
         geometric_lines (list[Line]): The geometric lines of the page.
         language (str): The language of the page.
+        page_number (int): The 1-based number of the page being processed.
         **params (dict): Additional parameters for the matching pipeline.
 
     Returns:
@@ -67,7 +70,7 @@ def process_page(lines: list[TextLine], geometric_lines, language: str, **params
     # If there is a layer identifier column, then we use this directly.
     # Else, we search for depth columns. We could also think of some scoring mechanism to decide which one to use.
     if not pairs:
-        depth_column_entries = find_depth_columns.depth_column_entries(words, include_splits=True)
+        depth_column_entries = find_depth_columns.depth_column_entries(words, page_number, include_splits=True)
         layer_depth_columns = find_depth_columns.find_layer_depth_columns(depth_column_entries, words)
 
         used_entry_rects = []
@@ -77,13 +80,13 @@ def process_page(lines: list[TextLine], geometric_lines, language: str, **params
 
         depth_column_entries = [
             entry
-            for entry in find_depth_columns.depth_column_entries(words, include_splits=False)
+            for entry in find_depth_columns.depth_column_entries(words, page_number, include_splits=False)
             if entry.rect not in used_entry_rects
         ]
         depth_columns: list[DepthColumn] = layer_depth_columns
         depth_columns.extend(
             find_depth_columns.find_depth_columns(
-                depth_column_entries, words, depth_column_params=params["depth_column_params"]
+                depth_column_entries, words, page_number, depth_column_params=params["depth_column_params"]
             )
         )
 
@@ -106,12 +109,12 @@ def process_page(lines: list[TextLine], geometric_lines, language: str, **params
 
     groups = []  # list of matched depth intervals and text blocks
     # groups is of the form: [{"depth_interval": BoundaryInterval, "block": TextBlock}]
-    if len(filtered_pairs):  # match depth column items with material description
+    if filtered_pairs:  # match depth column items with material description
         for depth_column, material_description_rect in filtered_pairs:
             description_lines = get_description_lines(lines, material_description_rect)
             if len(description_lines) > 1:
                 new_groups = match_columns(
-                    depth_column, description_lines, geometric_lines, material_description_rect, **params
+                    depth_column, description_lines, geometric_lines, material_description_rect, page_number, **params
                 )
                 groups.extend(new_groups)
         json_filtered_pairs = [
@@ -157,9 +160,11 @@ def process_page(lines: list[TextLine], geometric_lines, language: str, **params
                 ]
             )
     predictions = [
-        {"material_description": group["block"].to_json(), "depth_interval": group["depth_interval"].to_json()}
-        if "depth_interval" in group
-        else {"material_description": group["block"].to_json()}
+        (
+            {"material_description": group["block"].to_json(), "depth_interval": group["depth_interval"].to_json()}
+            if "depth_interval" in group
+            else {"material_description": group["block"].to_json()}
+        )
         for group in groups
     ]
     predictions = parse_and_remove_empty_predictions(predictions)
@@ -205,6 +210,7 @@ def match_columns(
     description_lines: list[TextLine],
     geometric_lines: list[Line],
     material_description_rect: fitz.Rect,
+    page_number: int,
     **params: dict,
 ) -> list:
     """Match the depth column entries with the description lines.
@@ -218,6 +224,7 @@ def match_columns(
         description_lines (list[TextLine]): The description lines.
         geometric_lines (list[Line]): The geometric lines.
         material_description_rect (fitz.Rect): The material description rectangle.
+        page_number (int): The page number.
         **params (dict): Additional parameters for the matching pipeline.
 
     Returns:
@@ -235,7 +242,7 @@ def match_columns(
         blocks = get_description_blocks_from_layer_identifier(depth_column.entries, description_lines)
         groups = []
         for block in blocks:
-            depth_interval = depth_column.get_depth_interval(block)
+            depth_interval = depth_column.get_depth_interval(block, page_number)
             if depth_interval:
                 groups.append({"depth_interval": depth_interval, "block": block})
             else:
@@ -320,7 +327,7 @@ def merge_blocks_by_vertical_spacing(blocks: list[TextBlock], target_merge_count
             merged_blocks.append(current_merged_block)
             current_merged_block = new_block
 
-    if len(current_merged_block.lines):
+    if current_merged_block.lines:
         merged_blocks.append(current_merged_block)
     return merged_blocks
 
@@ -355,7 +362,7 @@ def split_blocks_by_textline_length(blocks: list[TextBlock], target_split_count:
                     split_blocks.append(TextBlock(current_block_lines))
                     cutoff_values.remove(line.rect.x1)
                     current_block_lines = []
-            if len(current_block_lines):
+            if current_block_lines:
                 split_blocks.append(TextBlock(current_block_lines))
                 current_block_lines = []
             if (
@@ -386,7 +393,7 @@ def find_material_description_column(
             if x_overlap(line.rect, depth_column.rect()) and line.rect.y0 < depth_column.rect().y0
         ]
 
-        min_y0 = max(line.rect.y0 for line in above_depth_column) if len(above_depth_column) else -1
+        min_y0 = max(line.rect.y0 for line in above_depth_column) if above_depth_column else -1
 
         def check_y0_condition(y0):
             return y0 > min_y0 and y0 < depth_column.rect().y1
@@ -409,7 +416,7 @@ def check_y0_condition(y0):
         ]
 
         def filter_coverage(coverage):
-            if len(coverage):
+            if coverage:
                 min_x0 = min(line.rect.x0 for line in coverage)
                 max_x1 = max(line.rect.x1 for line in coverage)
                 x0_threshold = max_x1 - 0.4 * (
diff --git a/src/stratigraphy/main.py b/src/stratigraphy/main.py
index 2bcaf64f..6dbbb3b1 100644
--- a/src/stratigraphy/main.py
+++ b/src/stratigraphy/main.py
@@ -173,30 +173,36 @@ def start_pipeline(
                         predictions[filename]["metadata"] = {"coordinates": coordinates.to_json()}
                     else:
                         predictions[filename]["metadata"] = {"coordinates": None}
+
+                    layer_predictions_list = []
+                    depths_materials_column_pairs_list = []
+                    page_heights_list = []
+                    page_widths_list = []
                     for page_index, page in enumerate(doc):
                         page_number = page_index + 1
                         logger.info("Processing page %s", page_number)
 
-                        text_lines = extract_text_lines(page)
+                        text_lines = extract_text_lines(page, page_number)
                         geometric_lines = extract_lines(page, line_detection_params)
                         layer_predictions, depths_materials_column_pairs = process_page(
-                            text_lines, geometric_lines, language, **matching_params
+                            text_lines, geometric_lines, language, page_number, **matching_params
                         )
-                        # Add remove duplicates here!
+
+                        # Remove duplicated layers; the comparison set is every layer collected from earlier pages.
                         if page_index > 0:
                             layer_predictions = remove_duplicate_layers(
                                 doc[page_index - 1],
                                 page,
-                                predictions[filename][f"page_{page_number - 1}"]["layers"],
+                                layer_predictions_list,
                                 layer_predictions,
                                 matching_params["img_template_probability_threshold"],
                             )
-                        predictions[filename][f"page_{page_number}"] = {
-                            "layers": layer_predictions,
-                            "depths_materials_column_pairs": depths_materials_column_pairs,
-                            "page_height": page.rect.height,
-                            "page_width": page.rect.width,
-                        }
+
+                        layer_predictions_list.extend(layer_predictions)
+                        depths_materials_column_pairs_list.extend(depths_materials_column_pairs)
+                        page_heights_list.append(page.rect.height)
+                        page_widths_list.append(page.rect.width)
+
                         if draw_lines:  # could be changed to if draw_lines and mflow_tracking:
                             if not mlflow_tracking:
                                 logger.warning(
@@ -208,11 +214,18 @@ def start_pipeline(
                                 )
                                 mlflow.log_image(img, f"pages/{filename}_page_{page.number + 1}_lines.png")
 
+                    predictions[filename]["layers"] = layer_predictions_list
+                    predictions[filename]["depths_materials_column_pairs"] = depths_materials_column_pairs_list
+                    predictions[filename]["page_height"] = page_heights_list
+                    predictions[filename]["page_width"] = page_widths_list
+
+                    assert len(page_heights_list) == len(page_widths_list) == doc.page_count, "Page count mismatch."
+
     logger.info("Writing predictions to JSON file %s", predictions_path)
     with open(predictions_path, "w") as file:
         file.write(json.dumps(predictions))
 
-    # evaluate the predictions; if file doesnt exist, the predictions are not changed.
+    # evaluate the predictions; if file does not exist, the predictions are not changed.
     predictions, number_of_truth_values = create_predictions_objects(predictions, ground_truth_path)
 
     if not skip_draw_predictions:
diff --git a/src/stratigraphy/util/boundarydepthcolumnvalidator.py b/src/stratigraphy/util/boundarydepthcolumnvalidator.py
index 49c019f8..c1178d03 100644
--- a/src/stratigraphy/util/boundarydepthcolumnvalidator.py
+++ b/src/stratigraphy/util/boundarydepthcolumnvalidator.py
@@ -63,7 +63,7 @@ def is_valid(self, column: BoundaryDepthColumn, corr_coef_threshold: float = 0.9
 
         return corr_coef and corr_coef > corr_coef_threshold
 
-    def reduce_until_valid(self, column: BoundaryDepthColumn) -> BoundaryDepthColumn:
+    def reduce_until_valid(self, column: BoundaryDepthColumn, page_number: int) -> BoundaryDepthColumn:
         """Removes entries from the depth column until it fulfills the is_valid condition.
 
         is_valid checks whether there is too much noise (i.e. other text) in the column and whether the entries are
@@ -71,19 +71,19 @@ def reduce_until_valid(self, column: BoundaryDepthColumn) -> BoundaryDepthColumn
 
         Args:
             column (BoundaryDepthColumn): The depth column to validate
-
+            page_number (int): The page number of the depth column
         Returns:
             BoundaryDepthColumn: The current depth column with entries removed until it is valid.
         """
         while column:
             if self.is_valid(column):
                 return column
-            elif self.correct_OCR_mistakes(column) is not None:
-                return self.correct_OCR_mistakes(column)
+            elif self.correct_OCR_mistakes(column, page_number) is not None:
+                return self.correct_OCR_mistakes(column, page_number)
             else:
                 column = column.remove_entry_by_correlation_gradient()
 
-    def correct_OCR_mistakes(self, column: BoundaryDepthColumn) -> BoundaryDepthColumn | None:
+    def correct_OCR_mistakes(self, column: BoundaryDepthColumn, page_number: int) -> BoundaryDepthColumn | None:
         """Corrects OCR mistakes in the depth column entries.
 
         Loops through all values and corrects common OCR mistakes for the given entry. Then, the column with the
@@ -102,6 +102,7 @@ def correct_OCR_mistakes(self, column: BoundaryDepthColumn) -> BoundaryDepthColu
 
         Args:
             column (BoundaryDepthColumn): The depth column to validate
+            page_number (int): The page number of the depth column
 
         Returns:
             BoundaryDepthColumn | None: The corrected depth column, or None if no correction was possible.
@@ -109,7 +110,7 @@ def correct_OCR_mistakes(self, column: BoundaryDepthColumn) -> BoundaryDepthColu
         new_columns = [BoundaryDepthColumn()]
         for entry in column.entries:
             new_columns = [
-                BoundaryDepthColumn([*column.entries, DepthColumnEntry(entry.rect, new_value)])
+                BoundaryDepthColumn([*column.entries, DepthColumnEntry(entry.rect, new_value, page_number)])
                 for column in new_columns
                 for new_value in _value_alternatives(entry.value)
             ]
@@ -117,7 +118,7 @@ def correct_OCR_mistakes(self, column: BoundaryDepthColumn) -> BoundaryDepthColu
             # alternative values are suggested
             new_columns = [column for column in new_columns if column.is_strictly_increasing()]
 
-        if len(new_columns):
+        if new_columns:
             best_column = max(new_columns, key=lambda column: column.pearson_correlation_coef())
 
             # We require a higher correlation coefficient when we've already corrected a mistake.
diff --git a/src/stratigraphy/util/coordinate_extraction.py b/src/stratigraphy/util/coordinate_extraction.py
index 8f8fd484..66fb4f06 100644
--- a/src/stratigraphy/util/coordinate_extraction.py
+++ b/src/stratigraphy/util/coordinate_extraction.py
@@ -312,8 +312,9 @@ def extract_coordinates(self) -> Coordinate | None:
         Returns:
             Coordinate | None: the extracted coordinates (if any)
         """
-        for page in self.doc:
-            lines = extract_text_lines(page)
+        for page_idx, page in enumerate(self.doc):
+            page_number = page_idx + 1
+            lines = extract_text_lines(page, page_number)
             page_number = page.number + 1  # page.number is 0-based
 
             found_coordinates = (
diff --git a/src/stratigraphy/util/depthcolumn.py b/src/stratigraphy/util/depthcolumn.py
index d3d7a0a5..32920c06 100644
--- a/src/stratigraphy/util/depthcolumn.py
+++ b/src/stratigraphy/util/depthcolumn.py
@@ -111,7 +111,7 @@ def break_on_mismatch(self) -> list[LayerDepthColumn]:
                 segment_start = index
 
         final_segment = self.entries[segment_start:]
-        if len(final_segment):
+        if final_segment:
             segments.append(final_segment)
 
         return [LayerDepthColumn(segment) for segment in segments]
@@ -338,7 +338,7 @@ def break_on_double_descending(self) -> list[BoundaryDepthColumn]:
                 segment_start = index
 
         final_segment = self.entries[segment_start:]
-        if len(final_segment):
+        if final_segment:
             segments.append(final_segment)
 
         return [BoundaryDepthColumn(segment) for segment in segments]
diff --git a/src/stratigraphy/util/depthcolumnentry.py b/src/stratigraphy/util/depthcolumnentry.py
index 950d0776..a0dbb64c 100644
--- a/src/stratigraphy/util/depthcolumnentry.py
+++ b/src/stratigraphy/util/depthcolumnentry.py
@@ -1,20 +1,27 @@
 """Contains dataclasses for entries in a depth column."""
 
+from typing import Any
+
 import fitz
 
 
 class DepthColumnEntry:  # noqa: D101
-    def __init__(self, rect: fitz.Rect, value: float):
+    """Class to represent a depth column entry."""
+
+    def __init__(self, rect: fitz.Rect, value: float, page_number: int):
         self.rect = rect
         self.value = value
+        self.page_number = page_number
 
-    def __repr__(self):
+    def __repr__(self) -> str:
         return str(self.value)
 
-    def to_json(self):
+    def to_json(self) -> dict[str, Any]:
+        """Convert the depth column entry to a JSON serializable format."""
         return {
             "value": self.value,
             "rect": [self.rect.x0, self.rect.y0, self.rect.x1, self.rect.y1],
+            "page": self.page_number,
         }
 
 
@@ -26,31 +33,38 @@ class AnnotatedDepthColumnEntry(DepthColumnEntry):  # noqa: D101
     """
 
     def __init__(self, value):
-        self.value = value
-        self.rect = None
+        super().__init__(None, value, None)
 
-    def to_json(self):
+    def to_json(self) -> dict[str, Any]:
         return {
             "value": self.value,
-            "rect": None,
+            "rect": self.rect,
+            "page": self.page_number,
         }
 
 
 class LayerDepthColumnEntry:  # noqa: D101
+    """Class to represent a layer depth column entry."""
+
     def __init__(self, start: DepthColumnEntry, end: DepthColumnEntry):
         self.start = start
         self.end = end
 
-    def __repr__(self):
+        assert start.page_number == end.page_number, "Start and end entries are on different pages."
+
+    def __repr__(self) -> str:
         return f"{self.start.value}-{self.end.value}"
 
     @property
-    def rect(self):
+    def rect(self) -> fitz.Rect:
+        """Get the bounding rectangle that encloses both the start and the end entry."""
         return fitz.Rect(self.start.rect).include_rect(self.end.rect)
 
-    def to_json(self):
+    def to_json(self) -> dict[str, Any]:
+        """Convert the layer depth column entry to a JSON serializable format."""
         return {
             "start": self.start.to_json(),
             "end": self.end.to_json(),
             "rect": [self.rect.x0, self.rect.y0, self.rect.x1, self.rect.y1],
+            "page": self.start.page_number,
         }
diff --git a/src/stratigraphy/util/description_block_splitter.py b/src/stratigraphy/util/description_block_splitter.py
index 7e45c446..16cf4586 100644
--- a/src/stratigraphy/util/description_block_splitter.py
+++ b/src/stratigraphy/util/description_block_splitter.py
@@ -21,6 +21,7 @@ def __init__(self):  # noqa: D107
 
     @abc.abstractmethod
     def separator_condition(self, last_line: TextLine, current_line: TextLine) -> bool:  # noqa: D107
+        """Decide whether a block boundary lies between last_line and current_line."""
         pass
 
     def create_blocks(self, description_lines: list[TextLine]) -> list[TextBlock]:
@@ -30,7 +31,7 @@ def create_blocks(self, description_lines: list[TextLine]) -> list[TextBlock]:
             description_lines (list[TextLine]): all the text lines from the material descriptions.
 
         Returns:
-            list[TextBlock]: the list of textblocks
+            list[TextBlock]: the list of TextBlocks
         """
         blocks = []
         current_block_lines = []
@@ -43,7 +44,7 @@ def create_blocks(self, description_lines: list[TextLine]) -> list[TextBlock]:
                     )
                     current_block_lines = []
             current_block_lines.append(line)
-        if len(current_block_lines):
+        if current_block_lines:
             blocks.append(TextBlock(current_block_lines))
         return blocks
 
diff --git a/src/stratigraphy/util/draw.py b/src/stratigraphy/util/draw.py
index 82183815..e16b6529 100644
--- a/src/stratigraphy/util/draw.py
+++ b/src/stratigraphy/util/draw.py
@@ -47,8 +47,7 @@ def draw_predictions(predictions: list[FilePredictions], directory: Path, out_di
         with fitz.Document(directory / file_name) as doc:
             for page_index, page in enumerate(doc):
                 page_number = page_index + 1
-                layers = file_prediction.pages[page_index].layers
-                depths_materials_column_pairs = file_prediction.pages[page_index].depths_materials_columns_pairs
+                depths_materials_column_pairs = file_prediction.depths_materials_columns_pairs
                 if page_index == 0:
                     draw_metadata(
                         page,
@@ -58,10 +57,12 @@ def draw_predictions(predictions: list[FilePredictions], directory: Path, out_di
                 if file_prediction.metadata.coordinates is not None:
                     draw_coordinates(page, file_prediction.metadata.coordinates)
                 draw_depth_columns_and_material_rect(page, depths_materials_column_pairs)
-                draw_material_descriptions(page, layers)
+                draw_material_descriptions(page, file_prediction.layers)
 
                 tmp_file_path = out_directory / f"{file_name}_page{page_number}.png"
                 fitz.utils.get_pixmap(page, matrix=fitz.Matrix(2, 2), clip=page.rect).save(tmp_file_path)
+                print(f"Saved image to {tmp_file_path}")
+
                 if mlflow_tracking:  # This is only executed if MLFlow tracking is enabled
                     try:
                         import mlflow
@@ -119,21 +120,24 @@ def draw_material_descriptions(page: fitz.Page, layers: LayerPrediction) -> None
         page (fitz.Page): The page to draw on.
         layers (LayerPrediction): The predictions for the page.
     """
+    page_number = page.number + 1
+
     for index, layer in enumerate(layers):
-        if layer.material_description.rect is not None:
-            fitz.utils.draw_rect(
-                page,
-                fitz.Rect(layer.material_description.rect) * page.derotation_matrix,
-                color=fitz.utils.getColor("orange"),
+        if layer.material_description.page_number == page_number:
+            if layer.material_description.rect is not None:
+                fitz.utils.draw_rect(
+                    page,
+                    fitz.Rect(layer.material_description.rect) * page.derotation_matrix,
+                    color=fitz.utils.getColor("orange"),
+                )
+            draw_layer(
+                page=page,
+                interval=layer.depth_interval,  # None if no depth interval
+                layer=layer.material_description,
+                index=index,
+                is_correct=layer.material_is_correct,  # None if no ground truth
+                depth_is_correct=layer.depth_interval_is_correct,  # None if no ground truth
             )
-        draw_layer(
-            page=page,
-            interval=layer.depth_interval,  # None if no depth interval
-            layer=layer.material_description,
-            index=index,
-            is_correct=layer.material_is_correct,  # None if no ground truth
-            depth_is_correct=layer.depth_interval_is_correct,  # None if no ground truth
-        )
 
 
 def draw_depth_columns_and_material_rect(page: fitz.Page, depths_materials_column_pairs: list) -> fitz.Page:
@@ -197,7 +201,7 @@ def draw_layer(
         is_correct (bool): Whether the text block was correctly identified.
         depth_is_correct (bool): Whether the depth interval was correctly identified.
     """
-    if len(layer.lines):
+    if layer.lines:
         layer_rect = fitz.Rect(layer.rect)
         color = colors[index % len(colors)]
 
diff --git a/src/stratigraphy/util/extract_text.py b/src/stratigraphy/util/extract_text.py
index fe78fa72..4620eefa 100644
--- a/src/stratigraphy/util/extract_text.py
+++ b/src/stratigraphy/util/extract_text.py
@@ -5,13 +5,14 @@
 from stratigraphy.util.line import TextLine, TextWord
 
 
-def extract_text_lines(page: fitz.Page) -> list[TextLine]:
+def extract_text_lines(page: fitz.Page, page_number: int) -> list[TextLine]:
     """Extract all text lines from the page.
 
     Sometimes, a single lines as identified by PyMuPDF, is still split into separate lines.
 
     Args:
         page (fitz.page): the page to extract text from
+        page_number (int): the page number (first page is 1)
 
     Returns:
         list[TextLine]: A list of text lines.
@@ -20,7 +21,7 @@ def extract_text_lines(page: fitz.Page) -> list[TextLine]:
     words_by_line = {}
     for x0, y0, x1, y1, word, block_no, line_no, _word_no in fitz.utils.get_text(page, "words"):
         rect = fitz.Rect(x0, y0, x1, y1) * page.rotation_matrix
-        text_word = TextWord(rect, word)
+        text_word = TextWord(rect, word, page_number)
         words.append(text_word)
         key = f"{block_no}_{line_no}"
         if key not in words_by_line:
@@ -38,7 +39,7 @@ def extract_text_lines(page: fitz.Page) -> list[TextLine]:
                 lines.append(TextLine(current_line_words))
                 current_line_words = []
             current_line_words.append(word)
-        if len(current_line_words):
+        if current_line_words:
             lines.append(TextLine(current_line_words))
             current_line_words = []
 
diff --git a/src/stratigraphy/util/find_depth_columns.py b/src/stratigraphy/util/find_depth_columns.py
index 5a8bb432..4982dac3 100644
--- a/src/stratigraphy/util/find_depth_columns.py
+++ b/src/stratigraphy/util/find_depth_columns.py
@@ -10,13 +10,14 @@
 from stratigraphy.util.line import TextWord
 
 
-def depth_column_entries(all_words: list[TextWord], include_splits: bool) -> list[DepthColumnEntry]:
+def depth_column_entries(all_words: list[TextWord], page_number: int, include_splits: bool) -> list[DepthColumnEntry]:
     """Find all depth column entries given a list of TextLine objects.
 
     Note: Only depths up to two digits before the decimal point are supported.
 
     Args:
         all_words (list[TextWord]): List of text words to extract depth column entries from.
+        page_number (int): The page number of the entries.
         include_splits (bool): Whether to include split entries.
 
     Returns:
@@ -32,10 +33,10 @@ def depth_column_entries(all_words: list[TextWord], include_splits: bool) -> lis
             match = regex.match(input_string)
             if match:
                 value = value_as_float(match.group(1))
-                entries.append(DepthColumnEntry(word.rect, value))
+                entries.append(DepthColumnEntry(word.rect, value, page_number))
             elif include_splits:
                 # support for e.g. "1.10-1.60m" extracted as a single word
-                layer_depth_column_entry = extract_layer_depth_interval(input_string, word.rect)
+                layer_depth_column_entry = extract_layer_depth_interval(input_string, word.rect, page_number)
                 entries.extend(
                     [layer_depth_column_entry.start, layer_depth_column_entry.end] if layer_depth_column_entry else []
                 )
@@ -45,19 +46,21 @@ def depth_column_entries(all_words: list[TextWord], include_splits: bool) -> lis
 
 
 def value_as_float(string_value: str) -> float:  # noqa: D103
+    """Convert a string to its absolute float value, restoring a missing decimal point ("110" -> 1.1)."""
     # OCR sometimes tends to miss the decimal comma
     parsed_text = re.sub(r"^-?([0-9]+)([0-9]{2})", r"\1.\2", string_value)
     return abs(float(parsed_text))
 
 
 def extract_layer_depth_interval(
-    text: str, rect: fitz.Rect, require_start_of_string: bool = True
+    text: str, rect: fitz.Rect, page_number: int, require_start_of_string: bool = True
 ) -> LayerDepthColumnEntry | None:
     """Extracts a LayerDepthColumnEntry from a string.
 
     Args:
         text (str): The string to extract the depth interval from.
         rect (fitz.Rect): The rectangle of the text.
+        page_number (int): The page number of the text.
         require_start_of_string (bool, optional): Whether the number to extract needs to be
                                                   at the start of a string. Defaults to True.
 
@@ -78,7 +81,8 @@ def extract_layer_depth_interval(
         value2 = value_as_float(match.group(3))
         second_half_rect = fitz.Rect(rect.x0 + rect.width / 2, rect.y0, rect.x1, rect.y1)
         return LayerDepthColumnEntry(
-            DepthColumnEntry(first_half_rect, value1), DepthColumnEntry(second_half_rect, value2)
+            DepthColumnEntry(first_half_rect, value1, page_number),
+            DepthColumnEntry(second_half_rect, value2, page_number),
         )
     return None
 
@@ -150,13 +154,14 @@ def find_pair(entry: DepthColumnEntry) -> DepthColumnEntry | None:  # noqa: D103
 
 
 def find_depth_columns(
-    entries: list[DepthColumnEntry], all_words: list[TextWord], depth_column_params: dict
+    entries: list[DepthColumnEntry], all_words: list[TextWord], page_number: int, depth_column_params: dict
 ) -> list[BoundaryDepthColumn]:
     """Construct all possible BoundaryDepthColumn objects from the given DepthColumnEntry objects.
 
     Args:
         entries (list[DepthColumnEntry]): All found depth column entries in the page.
         all_words (list[TextLine]): All words in the page.
+        page_number (int): The page number of the entries.
         depth_column_params (dict): Parameters for the BoundaryDepthColumn objects.
 
     Returns:
@@ -190,7 +195,7 @@ def find_depth_columns(
     boundary_depth_column_validator = BoundaryDepthColumnValidator(all_words, **depth_column_params)
 
     numeric_columns = [
-        boundary_depth_column_validator.reduce_until_valid(column)
+        boundary_depth_column_validator.reduce_until_valid(column, page_number)
         for numeric_column in numeric_columns
         for column in numeric_column.break_on_double_descending()
         # when we have a perfect arithmetic progression, this is usually just a scale
diff --git a/src/stratigraphy/util/find_description.py b/src/stratigraphy/util/find_description.py
index 4c79d237..f9219e06 100644
--- a/src/stratigraphy/util/find_description.py
+++ b/src/stratigraphy/util/find_description.py
@@ -89,7 +89,7 @@ def matching_blocks(
         else:
             break
 
-    if len(matched_lines):
+    if matched_lines:
         return [TextBlock(matched_lines)]
     else:
         return []
@@ -128,7 +128,7 @@ def get_description_blocks(
             distances.append(line2rect.y0 - line1rect.y0)
 
     threshold = None
-    if len(distances):
+    if distances:
         threshold = min(distances) * 1.15
 
     # Create blocks separated by lines
diff --git a/src/stratigraphy/util/interval.py b/src/stratigraphy/util/interval.py
index ffb42c17..16e32389 100644
--- a/src/stratigraphy/util/interval.py
+++ b/src/stratigraphy/util/interval.py
@@ -21,6 +21,7 @@ def __init__(self, start: DepthColumnEntry | None, end: DepthColumnEntry | None)
 
     @property
     def start_value(self) -> float | None:
+        """Get the start value of the interval."""
         if self.start:
             return self.start.value
         else:
@@ -28,6 +29,7 @@ def start_value(self) -> float | None:
 
     @property
     def end_value(self) -> float | None:
+        """Get the end value of the interval."""
         if self.end:
             return self.end.value
         else:
@@ -36,14 +38,17 @@ def end_value(self) -> float | None:
     @property
     @abc.abstractmethod
     def line_anchor(self) -> fitz.Point:
+        """Get the line anchor of the interval."""
         pass
 
     @property
     @abc.abstractmethod
     def background_rect(self) -> fitz.Rect | None:
+        """Get the background rectangle of the interval."""
         pass
 
     def to_json(self):
+        """Convert the interval to a JSON serializable format."""
         return {
             "start": self.start.to_json() if self.start else None,
             "end": self.end.to_json() if self.end else None,
@@ -65,9 +70,6 @@ class BoundaryInterval(Interval):
     Boundary intervals are intervals that are defined by a start and an end point.
     """
 
-    def __init__(self, start: DepthColumnEntry | None, end: DepthColumnEntry | None):
-        super().__init__(start, end)
-
     @property
     def line_anchor(self) -> fitz.Point | None:
         if self.start and self.end:
@@ -132,10 +134,10 @@ def matching_blocks(self, all_blocks: list[TextBlock], block_index: int) -> tupl
                 if not can_end_exact_match:
                     exact_match_blocks = []
 
-            if len(exact_match_blocks):
+            if exact_match_blocks:
                 exact.extend(exact_match_blocks)
                 block_index = exact_match_index - 1
-            elif len(exact):
+            elif exact:
                 post.append(current_block)
             else:
                 pre.append(current_block)
@@ -168,6 +170,7 @@ def background_rect(self) -> fitz.Rect | None:
     def matching_blocks(
         self, all_lines: list[TextLine], line_index: int, next_interval: Interval | None
     ) -> list[TextBlock]:
+        """Adds lines to a block until the next layer identifier is reached."""
         y1_threshold = None
         if next_interval:
             next_interval_start_rect = next_interval.start.rect
@@ -181,7 +184,7 @@ def matching_blocks(
             else:
                 break
 
-        if len(matched_lines):
+        if matched_lines:
             return [TextBlock(matched_lines)]
         else:
             return []
diff --git a/src/stratigraphy/util/layer_identifier_column.py b/src/stratigraphy/util/layer_identifier_column.py
index 17a56a86..1aed65c8 100644
--- a/src/stratigraphy/util/layer_identifier_column.py
+++ b/src/stratigraphy/util/layer_identifier_column.py
@@ -115,7 +115,7 @@ def is_contained(self, rect: fitz.Rect) -> bool:
             and self.rect().y1 <= rect.y1
         )
 
-    def get_depth_interval(self, block: TextBlock) -> LayerDepthColumnEntry:
+    def get_depth_interval(self, block: TextBlock, page_number: int) -> LayerDepthColumnEntry:
         """Extract depth interval from a material description block.
 
         For borehole profiles in the Deriaz layout, the depth interval is usually found in the text description
@@ -125,6 +125,7 @@ def get_depth_interval(self, block: TextBlock) -> LayerDepthColumnEntry:
 
         Args:
             block (TextBlock): The block to calculate the depth interval for.
+            page_number (int): The page number of the block.
 
         Returns:
             LayerDepthColumnEntry: The depth interval.
@@ -132,7 +133,9 @@ def get_depth_interval(self, block: TextBlock) -> LayerDepthColumnEntry:
         depth_entries = []
         for line in block.lines:
             try:
-                layer_depth_entry = extract_layer_depth_interval(line.text, line.rect, require_start_of_string=False)
+                layer_depth_entry = extract_layer_depth_interval(
+                    line.text, line.rect, page_number, require_start_of_string=False
+                )
                 # require_start_of_string = False because the depth interval may not always start at the beginning
                 # of the line e.g. "Remblais Heterogene: 0.00 - 0.5m"
                 if layer_depth_entry:
diff --git a/src/stratigraphy/util/line.py b/src/stratigraphy/util/line.py
index b88d0215..05526413 100644
--- a/src/stratigraphy/util/line.py
+++ b/src/stratigraphy/util/line.py
@@ -17,9 +17,10 @@ class TextWord:
     to represent the location of the word in a PDF document.
     """
 
-    def __init__(self, rect: fitz.Rect, text: str):
+    def __init__(self, rect: fitz.Rect, text: str, page: int):
         self.rect = rect
         self.text = text
+        self.page_number = page
 
     def __repr__(self) -> str:
         return f"TextWord({self.rect}, {self.text})"
@@ -33,18 +34,27 @@ class TextLine:
     """
 
     def __init__(self, words: list[TextWord]):
+        """Initialize the TextLine object.
+
+        Args:
+            words (list[TextWord]): The words that make up the line. Must not be empty, because the
+                page number of the line is taken from the first word. The first page has idx 1.
+        """
         self.rect = fitz.Rect()
         for word in words:
             self.rect.include_rect(word.rect)
         self.words = words
+        self.page_number = words[0].page_number
 
     def is_description(self, material_description):
+        """Check if the line is a material description."""
         return any(
             self.text.lower().find(word) > -1 for word in material_description["including_expressions"]
         ) and not any(self.text.lower().find(word) > -1 for word in material_description["excluding_expressions"])
 
     @property
     def text(self) -> str:
+        """Get the text of the line."""
         return " ".join([word.text for word in self.words])
 
     def __repr__(self) -> str:
@@ -62,13 +72,15 @@ def __repr__(self) -> str:
     """
 
     def is_line_start(self, raw_lines_before: list[TextLine], raw_lines_after: list[TextLine]) -> bool:
+        """Check if the current line is the start of a new line."""
+
         def significant_overlap(line: TextLine) -> bool:
             return x_overlap_significant_largest(line.rect, self.rect, 0.5)
 
         matching_lines_before = [line for line in raw_lines_before if significant_overlap(line)]
         matching_lines_after = [line for line in raw_lines_after if significant_overlap(line)]
 
-        def count_points(lines: list[TextLine]) -> (int, int):
+        def count_points(lines: list[TextLine]) -> tuple[int, int]:
             exact_points = 0
             indentation_points = 0
             for other in lines:
@@ -95,7 +107,9 @@ def count_points(lines: list[TextLine]) -> (int, int):
         return exact_points >= 3 or (exact_points >= 2 and indentation_points >= 1)
 
     def to_json(self):
+        """Convert the TextLine object to a JSON serializable dictionary."""
         return {
             "text": self.text,
             "rect": [self.rect.x0, self.rect.y0, self.rect.x1, self.rect.y1],
+            "page": self.page_number,
         }
diff --git a/src/stratigraphy/util/predictions.py b/src/stratigraphy/util/predictions.py
index 04075950..a0ce6c22 100644
--- a/src/stratigraphy/util/predictions.py
+++ b/src/stratigraphy/util/predictions.py
@@ -36,31 +36,25 @@ class LayerPrediction:
     id: uuid.UUID = field(default_factory=uuid.uuid4)
 
 
-@dataclass
-class PagePredictions:
-    """A class to represent predictions for a single page."""
-
-    layers: list[LayerPrediction]
-    page_number: int
-    page_width: int
-    page_height: int
-    depths_materials_columns_pairs: list[dict] = None
-
-    def __post__init__(self):
-        """Sort layers by their occurence on the page."""
-        self.layers = sorted(self.layers, key=lambda layer: layer.material_description.rect.y0)
-
-
 class FilePredictions:
     """A class to represent predictions for a single file."""
 
-    def __init__(self, pages: list[PagePredictions], file_name: str, language: str, metadata: BoreholeMetaData = None):
-        self.pages = pages
+    def __init__(
+        self,
+        layers: list[LayerPrediction],
+        file_name: str,
+        language: str,
+        metadata: BoreholeMetaData = None,
+        depths_materials_columns_pairs: list[dict] = None,
+        page_sizes: list[tuple[int, int]] = None,
+    ):
+        self.layers: list[LayerPrediction] = sorted(layers, key=lambda layer: layer.material_description.rect.y0)
+        self.depths_materials_columns_pairs: list[dict] = depths_materials_columns_pairs
         self.file_name = file_name
         self.language = language
-        self.layers = sum([page.layers for page in self.pages], [])
         self.metadata = metadata
         self.metadata_is_correct: dict = {}
+        self.page_sizes: list[tuple[int, int]] = page_sizes
 
     @staticmethod
     def create_from_json(predictions_for_file: dict, file_name: str):
@@ -70,8 +64,14 @@ def create_from_json(predictions_for_file: dict, file_name: str):
             predictions_for_file (dict): The predictions for the file in json format.
             file_name (str): The name of the file.
         """
-        page_predictions_class = []
+        page_layer_predictions_list: list[LayerPrediction] = []
+        pages_width_list: list[int] = []
+        pages_height_list: list[int] = []
+        depths_materials_columns_pairs_list: list[dict] = []
+
         for page_number, page_predictions in predictions_for_file.items():
+            # TODO: Look into this, as dispatching on the key name here is quite a dirty fix.
+            # As "language" and "metadata" are not pages, they should be handled differently.
             if page_number == "language":
                 file_language = page_predictions
                 continue
@@ -85,53 +85,53 @@ def create_from_json(predictions_for_file: dict, file_name: str):
                 file_metadata = BoreholeMetaData(coordinates=coordinates)
                 # TODO: Add additional metadata here.
                 continue
-            page_layers = page_predictions["layers"]
-            layer_predictions = []
-            for layer in page_layers:
-                material_prediction = _create_textblock_object(layer["material_description"]["lines"])
-                if "depth_interval" in layer:
-                    start = (
-                        DepthColumnEntry(
-                            value=layer["depth_interval"]["start"]["value"],
-                            rect=fitz.Rect(layer["depth_interval"]["start"]["rect"]),
+            elif page_number == "layers":
+                for layer in page_predictions:
+                    material_prediction = _create_textblock_object(layer["material_description"]["lines"])
+                    if "depth_interval" in layer:
+                        start = (
+                            DepthColumnEntry(
+                                value=layer["depth_interval"]["start"]["value"],
+                                rect=fitz.Rect(layer["depth_interval"]["start"]["rect"]),
+                                page_number=layer["depth_interval"]["start"]["page"],
+                            )
+                            if layer["depth_interval"]["start"] is not None
+                            else None
                         )
-                        if layer["depth_interval"]["start"] is not None
-                        else None
-                    )
-                    end = (
-                        DepthColumnEntry(
-                            value=layer["depth_interval"]["end"]["value"],
-                            rect=fitz.Rect(layer["depth_interval"]["end"]["rect"]),
+                        end = (
+                            DepthColumnEntry(
+                                value=layer["depth_interval"]["end"]["value"],
+                                rect=fitz.Rect(layer["depth_interval"]["end"]["rect"]),
+                                page_number=layer["depth_interval"]["end"]["page"],
+                            )
+                            if layer["depth_interval"]["end"] is not None
+                            else None
                         )
-                        if layer["depth_interval"]["end"] is not None
-                        else None
-                    )
 
-                    depth_interval_prediction = BoundaryInterval(start=start, end=end)
-                    layer_predictions.append(
-                        LayerPrediction(
+                        depth_interval_prediction = BoundaryInterval(start=start, end=end)
+                        layer_predictions = LayerPrediction(
                             material_description=material_prediction, depth_interval=depth_interval_prediction
                         )
-                    )
-                else:
-                    layer_predictions.append(
-                        LayerPrediction(material_description=material_prediction, depth_interval=None)
-                    )
+                    else:
+                        layer_predictions = LayerPrediction(
+                            material_description=material_prediction, depth_interval=None
+                        )
+
+                    page_layer_predictions_list.append(layer_predictions)
+
             if "depths_materials_column_pairs" in page_predictions:
-                page_predictions_class.append(
-                    PagePredictions(
-                        page_number=page_number,
-                        page_width=page_predictions["page_width"],
-                        page_height=page_predictions["page_height"],
-                        layers=layer_predictions,
-                        depths_materials_columns_pairs=page_predictions["depths_materials_column_pairs"],
-                    )
-                )
-            else:
-                page_predictions_class.append(PagePredictions(page_number=page_number, layers=layer_predictions))
+                depths_materials_columns_pairs_list.extend(page_predictions["depths_materials_column_pairs"])
+
+            pages_width_list.extend(predictions_for_file["page_width"])
+            pages_height_list.extend(predictions_for_file["page_height"])
 
         return FilePredictions(
-            pages=page_predictions_class, file_name=file_name, language=file_language, metadata=file_metadata
+            layers=page_layer_predictions_list,
+            file_name=file_name,
+            language=file_language,
+            metadata=file_metadata,
+            depths_materials_columns_pairs=depths_materials_columns_pairs_list,
+            page_sizes=list(zip(pages_width_list, pages_height_list, strict=False)),
         )
 
     def convert_to_ground_truth(self):
@@ -147,14 +147,13 @@ def convert_to_ground_truth(self):
         """
         ground_truth = {self.file_name: {"metadata": self.metadata}}
         layers = []
-        for page in self.pages:
-            for layer in page.layers:
-                material_description = layer.material_description.text
-                depth_interval = {
-                    "start": layer.depth_interval.start.value if layer.depth_interval.start else None,
-                    "end": layer.depth_interval.end.value if layer.depth_interval.end else None,
-                }
-                layers.append({"material_description": material_description, "depth_interval": depth_interval})
+        for layer in self.layers:
+            material_description = layer.material_description.text
+            depth_interval = {
+                "start": layer.depth_interval.start.value if layer.depth_interval.start else None,
+                "end": layer.depth_interval.end.value if layer.depth_interval.end else None,
+            }
+            layers.append({"material_description": material_description, "depth_interval": depth_interval})
         ground_truth[self.file_name]["layers"] = layers
         if self.metadata.coordinates is not None:
             ground_truth[self.file_name]["metadata"] = {
@@ -166,6 +165,11 @@ def convert_to_ground_truth(self):
         return ground_truth
 
     def evaluate(self, ground_truth: dict):
+        """Evaluate the predictions against the ground truth.
+
+        Args:
+            ground_truth (dict): The ground truth for the file.
+        """
         self.evaluate_layers(ground_truth["layers"])
         self.evaluate_metadata(ground_truth.get("metadata"))
 
@@ -175,6 +179,7 @@ def evaluate_layers(self, ground_truth_layers: list):
         Args:
             ground_truth_layers (list): The ground truth layers for the file.
         """
+        # TODO: Attribute 'unmatched_layers' defined outside __init__ method. This is not a good practice.
         self.unmatched_layers = ground_truth_layers.copy()
         for layer in self.layers:
             match, depth_interval_is_correct = self._find_matching_layer(layer)
diff --git a/src/stratigraphy/util/textblock.py b/src/stratigraphy/util/textblock.py
index 15da4aa9..98b82b62 100644
--- a/src/stratigraphy/util/textblock.py
+++ b/src/stratigraphy/util/textblock.py
@@ -3,6 +3,7 @@
 from __future__ import annotations
 
 from dataclasses import dataclass
+from typing import Any
 
 import fitz
 import numpy as np
@@ -24,6 +25,7 @@ class MaterialDescription:
     rect: fitz.Rect
 
     def to_json(self):
+        """Convert the MaterialDescription object to a JSON serializable dictionary."""
         return {
             "text": self.text,
             "rect": [self.rect.x0, self.rect.y0, self.rect.x1, self.rect.y1],
@@ -32,7 +34,11 @@ def to_json(self):
 
 @dataclass
 class TextBlock:
-    """Class to represent a block of text in a PDF document."""
+    """Class to represent a block of text in a PDF document.
+
+    A TextBlock is a collection of TextLine objects (lines of text) surrounded by drawn Lines.
+    It is used to represent a block of text in a PDF document.
+    """
 
     lines: list[TextLine]
     is_terminated_by_line: bool = False
@@ -50,7 +56,23 @@ def __post_init__(self):
         else:
             self.rect = fitz.Rect()
 
-    def concatenate(self, other: TextBlock):
+        # go through all the lines and check if they are on the same page
+        page_number_set = set(line.page_number for line in self.lines)
+        assert len(page_number_set) < 2, "TextBlock spans multiple pages"
+        if page_number_set:
+            self.page_number = page_number_set.pop()
+        else:
+            self.page_number = None
+
+    def concatenate(self, other: TextBlock) -> TextBlock:
+        """Concatenate two text blocks.
+
+        Args:
+            other (TextBlock): The other text block.
+
+        Returns:
+            TextBlock: The concatenated text block.
+        """
         new_lines = []
         new_lines.extend(self.lines)
         new_lines.extend(other.lines)
@@ -59,6 +81,11 @@ def concatenate(self, other: TextBlock):
     # LGD-288: sometimes indentation is the only significant signal for deciding where we need to split the material
     # descriptions of adjacent layers.
     def split_based_on_indentation(self) -> list[TextBlock]:
+        """Split the text block based on indentation.
+
+        Returns:
+            list[TextBlock]: The split text blocks.
+        """
         if len(self.lines) == 0:
             return []
 
@@ -84,14 +111,14 @@ def split_based_on_indentation(self) -> list[TextBlock]:
         for line in self.lines:
             if line.rect.x0 < indentation_low:
                 # start new block
-                if len(current_block_lines):
+                if current_block_lines:
                     blocks.append(TextBlock(current_block_lines))
                 current_block_lines = [line]
             else:
                 # continue block
                 current_block_lines.append(line)
 
-        if len(current_block_lines):
+        if current_block_lines:
             blocks.append(TextBlock(current_block_lines))
 
         if self.is_terminated_by_line:  # if the block was terminated by a line, then the last block should be as well
@@ -123,11 +150,13 @@ def _is_legend(self) -> bool:
                 y0_coordinates.append(line.rect.y0)
         return number_horizontally_close > 1 or number_vertically_close > 2
 
-    def to_json(self):
+    def to_json(self) -> dict[str, Any]:
+        """Convert the TextBlock object to a JSON serializable dictionary."""
         return {
             "text": self.text,
             "rect": [self.rect.x0, self.rect.y0, self.rect.x1, self.rect.y1],
             "lines": [line.to_json() for line in self.lines],
+            "page": self.page_number,
         }
 
 
diff --git a/tests/test_coordinate_extraction.py b/tests/test_coordinate_extraction.py
index 4c305bd7..6fdc1253 100644
--- a/tests/test_coordinate_extraction.py
+++ b/tests/test_coordinate_extraction.py
@@ -14,16 +14,19 @@
 
 
 def test_strLV95():  # noqa: D103
+    """Test the string representation of an LV95Coordinate object."""
     coord = LV95Coordinate(CoordinateEntry(2789456), CoordinateEntry(1123012), fitz.Rect(), page=1)
     assert str(coord) == "E: 2'789'456, N: 1'123'012"
 
 
 def test_to_jsonLV95():  # noqa: D103
+    """Test the to_json method of an LV95Coordinate object."""
     coord = LV95Coordinate(CoordinateEntry(2789456), CoordinateEntry(1123012), fitz.Rect(0, 1, 2, 3), page=1)
     assert coord.to_json() == {"E": 2789456, "N": 1123012, "rect": [0, 1, 2, 3], "page": 1}
 
 
 def test_swap_coordinates():  # noqa: D103
+    """Test the swapping of coordinates in an LV95Coordinate object."""
     north = CoordinateEntry(789456)
     east = CoordinateEntry(123012)
     coord = LV95Coordinate(north=north, east=east, rect=fitz.Rect(), page=1)
@@ -32,11 +35,13 @@ def test_swap_coordinates():  # noqa: D103
 
 
 def test_strLV03():  # noqa: D103
+    """Test the string representation of an LV03Coordinate object."""
     coord = LV03Coordinate(CoordinateEntry(789456), CoordinateEntry(123012), rect=fitz.Rect(), page=1)
     assert str(coord) == "E: 789'456, N: 123'012"
 
 
 def test_to_jsonLV03():  # noqa: D103
+    """Test the to_json method of an LV03Coordinate object."""
     coord = LV03Coordinate(CoordinateEntry(789456), CoordinateEntry(123012), fitz.Rect(0, 1, 2, 3), page=1)
     assert coord.to_json() == {"E": 789456, "N": 123012, "rect": [0, 1, 2, 3], "page": 1}
 
@@ -46,6 +51,7 @@ def test_to_jsonLV03():  # noqa: D103
 
 
 def test_CoordinateExtractor_extract_coordinates():  # noqa: D103
+    """Test the extraction of coordinates from a PDF document."""
     # Assuming there is a method called 'extract' in CoordinateExtractor class
     coordinates = extractor.extract_coordinates()
     # Check if the returned value is a list
@@ -55,10 +61,12 @@ def test_CoordinateExtractor_extract_coordinates():  # noqa: D103
 
 
 def _create_simple_lines(text_lines: list[str]) -> list[TextLine]:
+    """Create a list of TextLine objects from a list of text lines."""
+    page_number = 1
     return [
         TextLine(
             [
-                TextWord(fitz.Rect(word_index, line_index, word_index + 1, line_index + 1), word_text)
+                TextWord(fitz.Rect(word_index, line_index, word_index + 1, line_index + 1), word_text, page_number)
                 for word_index, word_text in enumerate(text_line.split(" "))
             ]
         )
@@ -67,6 +75,7 @@ def _create_simple_lines(text_lines: list[str]) -> list[TextLine]:
 
 
 def test_CoordinateExtractor_find_coordinate_key():  # noqa: D103
+    """Test the extraction of the coordinate key from a list of text lines."""
     lines = _create_simple_lines(
         ["This is a sample text", "followed by a key with a spelling mistake", "Ko0rdinate 615.790 / 157.500"]
     )
@@ -83,6 +92,7 @@ def test_CoordinateExtractor_find_coordinate_key():  # noqa: D103
 
 
 def test_CoordinateExtractor_get_coordinates_with_x_y_labels():  # noqa: D103
+    """Test the extraction of coordinates with explicit "X" and "Y" labels."""
     lines = _create_simple_lines(
         [
             "X = 2 600 000",
@@ -109,6 +119,7 @@ def test_CoordinateExtractor_get_coordinates_with_x_y_labels():  # noqa: D103
 
 
 def test_CoordinateExtractor_get_coordinates_near_key():  # noqa: D103
+    """Test the extraction of coordinates near a key."""
     lines = _create_simple_lines(
         [
             "This is a sample text followed by a key with a spelling",
@@ -158,6 +169,7 @@ def test_CoordinateExtractor_get_coordinates_near_key():  # noqa: D103
     ],
 )
 def test_CoordinateExtractor_get_coordinates_from_lines(text, expected):  # noqa: D103
+    """Test the extraction of coordinates from a list of text lines."""
     lines = _create_simple_lines([text])
     coordinates = extractor.get_coordinates_from_lines(lines, page=1)
     expected_east, expected_north = expected
@@ -167,6 +179,7 @@ def test_CoordinateExtractor_get_coordinates_from_lines(text, expected):  # noqa
 
 
 def test_CoordinateExtractor_get_coordinates_from_lines_rect():  # noqa: D103
+    """Test that the extracted coordinates carry the rect of the text line they were found in."""
     lines = _create_simple_lines(["start", "2600000 1200000", "end"])
     coordinates = extractor.get_coordinates_from_lines(lines, page=1)
     assert coordinates[0].rect == lines[1].rect
diff --git a/tests/test_depthcolumn.py b/tests/test_depthcolumn.py
index adeb414e..66d53fb8 100644
--- a/tests/test_depthcolumn.py
+++ b/tests/test_depthcolumn.py
@@ -6,25 +6,27 @@
 
 
 def test_boundarydepthcolumn_isarithmeticprogression():  # noqa: D103
+    """Test the is_arithmetic_progression method of the BoundaryDepthColumn class."""
+    page_number = 1
     column = BoundaryDepthColumn(
         [
-            DepthColumnEntry(fitz.Rect(), value=1),
-            DepthColumnEntry(fitz.Rect(), value=2),
-            DepthColumnEntry(fitz.Rect(), value=3),
-            DepthColumnEntry(fitz.Rect(), value=4),
-            DepthColumnEntry(fitz.Rect(), value=5),
+            DepthColumnEntry(fitz.Rect(), value=1, page_number=page_number),
+            DepthColumnEntry(fitz.Rect(), value=2, page_number=page_number),
+            DepthColumnEntry(fitz.Rect(), value=3, page_number=page_number),
+            DepthColumnEntry(fitz.Rect(), value=4, page_number=page_number),
+            DepthColumnEntry(fitz.Rect(), value=5, page_number=page_number),
         ]
     )
     assert column.is_arithmetic_progression(), "The column should be recognized as arithmetic progression"
 
     column = BoundaryDepthColumn(
         [
-            DepthColumnEntry(fitz.Rect(), value=17.6),
-            DepthColumnEntry(fitz.Rect(), value=18.15),
-            DepthColumnEntry(fitz.Rect(), value=18.65),
-            DepthColumnEntry(fitz.Rect(), value=19.3),
-            DepthColumnEntry(fitz.Rect(), value=19.9),
-            DepthColumnEntry(fitz.Rect(), value=20.5),
+            DepthColumnEntry(fitz.Rect(), value=17.6, page_number=page_number),
+            DepthColumnEntry(fitz.Rect(), value=18.15, page_number=page_number),
+            DepthColumnEntry(fitz.Rect(), value=18.65, page_number=page_number),
+            DepthColumnEntry(fitz.Rect(), value=19.3, page_number=page_number),
+            DepthColumnEntry(fitz.Rect(), value=19.9, page_number=page_number),
+            DepthColumnEntry(fitz.Rect(), value=20.5, page_number=page_number),
         ]
     )
     assert not column.is_arithmetic_progression(), "The column should not be recognized as arithmetic progression"
diff --git a/tests/test_find_depth_columns.py b/tests/test_find_depth_columns.py
index 4ff107bc..9074d6a7 100644
--- a/tests/test_find_depth_columns.py
+++ b/tests/test_find_depth_columns.py
@@ -8,13 +8,15 @@
 
 
 def test_depth_column_entries():  # noqa: D103
+    """Test the depth_column_entries function."""
+    page_number = 1
     all_words = [
-        TextWord(fitz.Rect(0, 0, 5, 1), "10.00m"),
-        TextWord(fitz.Rect(0, 2, 5, 3), "20.0m"),
-        TextWord(fitz.Rect(0, 4, 5, 5), "30.0m"),
-        TextWord(fitz.Rect(0, 6, 5, 7), "40.0m"),
+        TextWord(fitz.Rect(0, 0, 5, 1), "10.00m", page_number),
+        TextWord(fitz.Rect(0, 2, 5, 3), "20.0m", page_number),
+        TextWord(fitz.Rect(0, 4, 5, 5), "30.0m", page_number),
+        TextWord(fitz.Rect(0, 6, 5, 7), "40.0m", page_number),
     ]
-    entries = depth_column_entries(all_words, include_splits=False)
+    entries = depth_column_entries(all_words, page_number, include_splits=False)
     assert len(entries) == 4, "There should be 4 entries"
     assert pytest.approx(entries[0].value) == 10.0, "The first entry should have a value of 10.0"
     assert pytest.approx(entries[1].value) == 20.0, "The second entry should have a value of 20.0"
@@ -23,11 +25,13 @@ def test_depth_column_entries():  # noqa: D103
 
 
 def test_depth_column_entries_with_splits():  # noqa: D103
+    """Test the depth_column_entries function with include_splits=True."""
+    page_number = 1
     all_words = [
-        TextLine([TextWord(fitz.Rect(0, 0, 10, 1), "10.00-20.0m")]),
-        TextLine([TextWord(fitz.Rect(0, 2, 10, 3), "30.0-40.0m")]),
+        TextLine([TextWord(fitz.Rect(0, 0, 10, 1), "10.00-20.0m", page_number)]),
+        TextLine([TextWord(fitz.Rect(0, 2, 10, 3), "30.0-40.0m", page_number)]),
     ]
-    entries = depth_column_entries(all_words, include_splits=True)
+    entries = depth_column_entries(all_words, page_number, include_splits=True)
     assert len(entries) == 4, "There should be 4 entries"
     assert entries[0].value == 10.0, "The first entry should have a value of 10.0"
     assert entries[1].value == 20.0, "The second entry should have a value of 20.0"
@@ -36,13 +40,15 @@ def test_depth_column_entries_with_splits():  # noqa: D103
 
 
 def test_depth_column_entries_with_leading_character():  # noqa: D103
+    """Test the depth_column_entries function with a leading character."""
+    page_number = 1
     all_words = [
-        TextWord(fitz.Rect(0, 0, 5, 1), "0.00m"),
-        TextWord(fitz.Rect(0, 2, 5, 3), ".2m"),  # this is a test for an ocr error from '-2m' to '.2m'
-        TextWord(fitz.Rect(0, 4, 5, 5), "-3.0m"),
-        TextWord(fitz.Rect(0, 6, 5, 7), ".4.2m"),
+        TextWord(fitz.Rect(0, 0, 5, 1), "0.00m", page_number),
+        TextWord(fitz.Rect(0, 2, 5, 3), ".2m", page_number),  # this is a test for an ocr error from '-2m' to '.2m'
+        TextWord(fitz.Rect(0, 4, 5, 5), "-3.0m", page_number),
+        TextWord(fitz.Rect(0, 6, 5, 7), ".4.2m", page_number),
     ]
-    entries = depth_column_entries(all_words, include_splits=True)
+    entries = depth_column_entries(all_words, page_number, include_splits=True)
     assert len(entries) == 4, "There should be 4 entries"
     assert entries[0].value == 0.0, "The first entry should have a value of 0"
     assert entries[1].value == 2.0, "The second entry should have a value of 2.0"
@@ -51,45 +57,51 @@ def test_depth_column_entries_with_leading_character():  # noqa: D103
 
 
 all_words_find_depth_column = [
-    TextWord(fitz.Rect(0, 0, 5, 1), "10.00m"),
-    TextWord(fitz.Rect(20, 0, 30, 1), "Kies, Torf und Sand"),
-    TextWord(fitz.Rect(20, 2, 30, 3), "Kies, verwittert."),
-    TextWord(fitz.Rect(0, 2, 5, 3), "20.0m"),
-    TextWord(fitz.Rect(0, 4, 5, 5), "30.0m"),
-    TextWord(fitz.Rect(0, 6, 5, 7), "40.0m"),
-    TextWord(fitz.Rect(0, 8, 5, 9), "50.0m"),
+    TextWord(fitz.Rect(0, 0, 5, 1), "10.00m", 1),  # third argument is the page number
+    TextWord(fitz.Rect(20, 0, 30, 1), "Kies, Torf und Sand", 1),
+    TextWord(fitz.Rect(20, 2, 30, 3), "Kies, verwittert.", 1),
+    TextWord(fitz.Rect(0, 2, 5, 3), "20.0m", 1),
+    TextWord(fitz.Rect(0, 4, 5, 5), "30.0m", 1),
+    TextWord(fitz.Rect(0, 6, 5, 7), "40.0m", 1),
+    TextWord(fitz.Rect(0, 8, 5, 9), "50.0m", 1),
 ]
 
 
 def test_find_depth_columns_arithmetic_progression():  # noqa: D103
+    """Test that find_depth_columns rejects entries forming a perfect arithmetic progression."""
+    page_number = 1
     entries = [
-        DepthColumnEntry(fitz.Rect(0, 0, 5, 1), 10.0),
-        DepthColumnEntry(fitz.Rect(0, 2, 5, 3), 20.0),
-        DepthColumnEntry(fitz.Rect(0, 4, 5, 5), 30.0),
-        DepthColumnEntry(fitz.Rect(0, 6, 5, 7), 40.0),
-        DepthColumnEntry(fitz.Rect(0, 8, 5, 9), 50.0),
+        DepthColumnEntry(fitz.Rect(0, 0, 5, 1), 10.0, page_number),
+        DepthColumnEntry(fitz.Rect(0, 2, 5, 3), 20.0, page_number),
+        DepthColumnEntry(fitz.Rect(0, 4, 5, 5), 30.0, page_number),
+        DepthColumnEntry(fitz.Rect(0, 6, 5, 7), 40.0, page_number),
+        DepthColumnEntry(fitz.Rect(0, 8, 5, 9), 50.0, page_number),
     ]
 
     columns = find_depth_columns(
         entries,
         all_words_find_depth_column,
+        page_number,
         depth_column_params={"noise_count_threshold": 1.25, "noise_count_offset": 0},
     )
     assert len(columns) == 0, "There should be 0 columns as the above is a perfect arithmetic progression"
 
 
 def test_find_depth_columns():  # noqa: D103
+    """Test the find_depth_columns function."""
+    page_number = 1
     entries = [
-        DepthColumnEntry(fitz.Rect(0, 0, 5, 1), 12.0),
-        DepthColumnEntry(fitz.Rect(0, 2, 5, 3), 20.0),
-        DepthColumnEntry(fitz.Rect(0, 4, 5, 5), 34.0),
-        DepthColumnEntry(fitz.Rect(0, 6, 5, 7), 40.0),
-        DepthColumnEntry(fitz.Rect(0, 8, 5, 9), 50.0),
+        DepthColumnEntry(fitz.Rect(0, 0, 5, 1), 12.0, page_number),
+        DepthColumnEntry(fitz.Rect(0, 2, 5, 3), 20.0, page_number),
+        DepthColumnEntry(fitz.Rect(0, 4, 5, 5), 34.0, page_number),
+        DepthColumnEntry(fitz.Rect(0, 6, 5, 7), 40.0, page_number),
+        DepthColumnEntry(fitz.Rect(0, 8, 5, 9), 50.0, page_number),
     ]
 
     columns = find_depth_columns(
         entries,
         all_words_find_depth_column,
+        page_number,
         depth_column_params={"noise_count_threshold": 1.25, "noise_count_offset": 0},
     )
     assert len(columns) == 1, "There should be 1 column"
@@ -102,22 +114,25 @@ def test_find_depth_columns():  # noqa: D103
 
 
 def test_two_columns_find_depth_columns():  # noqa: D103
+    """Test the find_depth_columns function with two columns."""
+    page_number = 1
     entries = [  # first depth column
-        DepthColumnEntry(fitz.Rect(0, 0, 5, 1), 12.0),
-        DepthColumnEntry(fitz.Rect(0, 2, 5, 3), 20.0),
-        DepthColumnEntry(fitz.Rect(0, 4, 5, 5), 34.0),
-        DepthColumnEntry(fitz.Rect(0, 6, 5, 7), 40.0),
-        DepthColumnEntry(fitz.Rect(0, 8, 5, 9), 50.0),
-        DepthColumnEntry(fitz.Rect(20, 0, 25, 1), 12.0),  # second depth column
-        DepthColumnEntry(fitz.Rect(20, 2, 25, 3), 20.0),
-        DepthColumnEntry(fitz.Rect(20, 4, 25, 5), 34.0),
-        DepthColumnEntry(fitz.Rect(20, 6, 25, 7), 40.0),
-        DepthColumnEntry(fitz.Rect(20, 8, 25, 9), 50.0),
-        DepthColumnEntry(fitz.Rect(20, 10, 25, 11), 61.0),
+        DepthColumnEntry(fitz.Rect(0, 0, 5, 1), 12.0, page_number),
+        DepthColumnEntry(fitz.Rect(0, 2, 5, 3), 20.0, page_number),
+        DepthColumnEntry(fitz.Rect(0, 4, 5, 5), 34.0, page_number),
+        DepthColumnEntry(fitz.Rect(0, 6, 5, 7), 40.0, page_number),
+        DepthColumnEntry(fitz.Rect(0, 8, 5, 9), 50.0, page_number),
+        DepthColumnEntry(fitz.Rect(20, 0, 25, 1), 12.0, page_number),  # second depth column
+        DepthColumnEntry(fitz.Rect(20, 2, 25, 3), 20.0, page_number),
+        DepthColumnEntry(fitz.Rect(20, 4, 25, 5), 34.0, page_number),
+        DepthColumnEntry(fitz.Rect(20, 6, 25, 7), 40.0, page_number),
+        DepthColumnEntry(fitz.Rect(20, 8, 25, 9), 50.0, page_number),
+        DepthColumnEntry(fitz.Rect(20, 10, 25, 11), 61.0, page_number),
     ]
     columns = find_depth_columns(
         entries,
         all_words_find_depth_column,
+        page_number,
         depth_column_params={"noise_count_threshold": 1.25, "noise_count_offset": 0},
     )
     assert len(columns) == 2, "There should be 2 columns"
@@ -126,28 +141,30 @@ def test_two_columns_find_depth_columns():  # noqa: D103
 
 
 all_words_find_layer_depth_column = [
-    TextWord(fitz.Rect(0, 0, 5, 1), "12.00-20.0m"),
-    TextWord(fitz.Rect(20, 0, 30, 1), "Kies, Torf und Sand"),
-    TextWord(fitz.Rect(20, 2, 30, 3), "Kies, verwittert."),
-    TextWord(fitz.Rect(0, 2, 5, 3), "20.0-34.0m"),
-    TextWord(fitz.Rect(0, 4, 5, 5), "34.0 - 40.0m"),
-    TextWord(fitz.Rect(0, 6, 5, 7), "40.0-50m"),
-    TextWord(fitz.Rect(0, 8, 5, 9), "50.0-60m"),
+    TextWord(fitz.Rect(0, 0, 5, 1), "12.00-20.0m", 1),  # third argument is the page number
+    TextWord(fitz.Rect(20, 0, 30, 1), "Kies, Torf und Sand", 1),
+    TextWord(fitz.Rect(20, 2, 30, 3), "Kies, verwittert.", 1),
+    TextWord(fitz.Rect(0, 2, 5, 3), "20.0-34.0m", 1),
+    TextWord(fitz.Rect(0, 4, 5, 5), "34.0 - 40.0m", 1),
+    TextWord(fitz.Rect(0, 6, 5, 7), "40.0-50m", 1),
+    TextWord(fitz.Rect(0, 8, 5, 9), "50.0-60m", 1),
 ]
 
 
 def test_find_layer_depth_columns():  # noqa: D103
+    """Test the find_layer_depth_columns function."""
+    page_number = 1
     entries = [
-        DepthColumnEntry(fitz.Rect(0, 0, 5, 1), 12.0),  # layer 12.0-20.0m
-        DepthColumnEntry(fitz.Rect(0, 0, 5, 1), 20.0),
-        DepthColumnEntry(fitz.Rect(0, 2, 5, 3), 20.0),  # layer 20.0-34.0m
-        DepthColumnEntry(fitz.Rect(0, 2, 5, 3), 34.0),
-        DepthColumnEntry(fitz.Rect(0, 4, 5, 5), 34.0),  # layer 34.0-40.0m
-        DepthColumnEntry(fitz.Rect(0, 4, 5, 5), 40.0),
-        DepthColumnEntry(fitz.Rect(0, 6, 5, 7), 40.0),  # layer 40.0-50.0m
-        DepthColumnEntry(fitz.Rect(0, 6, 5, 7), 50.0),
-        DepthColumnEntry(fitz.Rect(0, 8, 5, 9), 50.0),  # layer 50.0-60.0m
-        DepthColumnEntry(fitz.Rect(0, 8, 5, 9), 60.0),
+        DepthColumnEntry(fitz.Rect(0, 0, 5, 1), 12.0, page_number),  # layer 12.0-20.0m
+        DepthColumnEntry(fitz.Rect(0, 0, 5, 1), 20.0, page_number),
+        DepthColumnEntry(fitz.Rect(0, 2, 5, 3), 20.0, page_number),  # layer 20.0-34.0m
+        DepthColumnEntry(fitz.Rect(0, 2, 5, 3), 34.0, page_number),
+        DepthColumnEntry(fitz.Rect(0, 4, 5, 5), 34.0, page_number),  # layer 34.0-40.0m
+        DepthColumnEntry(fitz.Rect(0, 4, 5, 5), 40.0, page_number),
+        DepthColumnEntry(fitz.Rect(0, 6, 5, 7), 40.0, page_number),  # layer 40.0-50.0m
+        DepthColumnEntry(fitz.Rect(0, 6, 5, 7), 50.0, page_number),
+        DepthColumnEntry(fitz.Rect(0, 8, 5, 9), 50.0, page_number),  # layer 50.0-60.0m
+        DepthColumnEntry(fitz.Rect(0, 8, 5, 9), 60.0, page_number),
     ]
 
     columns = find_layer_depth_columns(entries, all_words_find_depth_column)
@@ -166,28 +183,30 @@ def test_find_layer_depth_columns():  # noqa: D103
 
 
 def test_two_columns_find_layer_depth_columns():  # noqa: D103
+    """Test the find_layer_depth_columns function with two columns."""
+    page_number = 1
     entries = [  # first depth column
-        DepthColumnEntry(fitz.Rect(0, 0, 5, 1), 12.0),  # layer 12.0-20.0m
-        DepthColumnEntry(fitz.Rect(0, 0, 5, 1), 20.0),
-        DepthColumnEntry(fitz.Rect(0, 2, 5, 3), 20.0),  # layer 20.0-34.0m
-        DepthColumnEntry(fitz.Rect(0, 2, 5, 3), 34.0),
-        DepthColumnEntry(fitz.Rect(0, 4, 5, 5), 34.0),  # layer 34.0-40.0m
-        DepthColumnEntry(fitz.Rect(0, 4, 5, 5), 40.0),
-        DepthColumnEntry(fitz.Rect(0, 6, 5, 7), 40.0),  # layer 40.0-50.0m
-        DepthColumnEntry(fitz.Rect(0, 6, 5, 7), 50.0),
-        DepthColumnEntry(fitz.Rect(0, 8, 5, 9), 50.0),  # layer 50.0-60.0m
-        DepthColumnEntry(fitz.Rect(0, 8, 5, 9), 60.0),
+        DepthColumnEntry(fitz.Rect(0, 0, 5, 1), 12.0, page_number),  # layer 12.0-20.0m
+        DepthColumnEntry(fitz.Rect(0, 0, 5, 1), 20.0, page_number),
+        DepthColumnEntry(fitz.Rect(0, 2, 5, 3), 20.0, page_number),  # layer 20.0-34.0m
+        DepthColumnEntry(fitz.Rect(0, 2, 5, 3), 34.0, page_number),
+        DepthColumnEntry(fitz.Rect(0, 4, 5, 5), 34.0, page_number),  # layer 34.0-40.0m
+        DepthColumnEntry(fitz.Rect(0, 4, 5, 5), 40.0, page_number),
+        DepthColumnEntry(fitz.Rect(0, 6, 5, 7), 40.0, page_number),  # layer 40.0-50.0m
+        DepthColumnEntry(fitz.Rect(0, 6, 5, 7), 50.0, page_number),
+        DepthColumnEntry(fitz.Rect(0, 8, 5, 9), 50.0, page_number),  # layer 50.0-60.0m
+        DepthColumnEntry(fitz.Rect(0, 8, 5, 9), 60.0, page_number),
         # second depth column
-        DepthColumnEntry(fitz.Rect(20, 0, 25, 1), 12.0),  # layer 12.0-20.0m
-        DepthColumnEntry(fitz.Rect(20, 0, 25, 1), 20.0),
-        DepthColumnEntry(fitz.Rect(20, 2, 25, 3), 20.0),  # layer 20.0-34.0m
-        DepthColumnEntry(fitz.Rect(20, 2, 25, 3), 34.0),
-        DepthColumnEntry(fitz.Rect(20, 4, 25, 5), 34.0),  # layer 34.0-40.0m
-        DepthColumnEntry(fitz.Rect(20, 4, 25, 5), 40.0),
-        DepthColumnEntry(fitz.Rect(20, 6, 25, 7), 40.0),  # layer 40.0-50.0m
-        DepthColumnEntry(fitz.Rect(20, 6, 25, 7), 50.0),
-        DepthColumnEntry(fitz.Rect(20, 8, 25, 9), 50.0),  # layer 50.0-60.0m
-        DepthColumnEntry(fitz.Rect(20, 8, 25, 9), 60.0),
+        DepthColumnEntry(fitz.Rect(20, 0, 25, 1), 12.0, page_number),  # layer 12.0-20.0m
+        DepthColumnEntry(fitz.Rect(20, 0, 25, 1), 20.0, page_number),
+        DepthColumnEntry(fitz.Rect(20, 2, 25, 3), 20.0, page_number),  # layer 20.0-34.0m
+        DepthColumnEntry(fitz.Rect(20, 2, 25, 3), 34.0, page_number),
+        DepthColumnEntry(fitz.Rect(20, 4, 25, 5), 34.0, page_number),  # layer 34.0-40.0m
+        DepthColumnEntry(fitz.Rect(20, 4, 25, 5), 40.0, page_number),
+        DepthColumnEntry(fitz.Rect(20, 6, 25, 7), 40.0, page_number),  # layer 40.0-50.0m
+        DepthColumnEntry(fitz.Rect(20, 6, 25, 7), 50.0, page_number),
+        DepthColumnEntry(fitz.Rect(20, 8, 25, 9), 50.0, page_number),  # layer 50.0-60.0m
+        DepthColumnEntry(fitz.Rect(20, 8, 25, 9), 60.0, page_number),
     ]
     columns = find_layer_depth_columns(entries, all_words_find_layer_depth_column)
     assert len(columns) == 2, "There should be 2 columns"
diff --git a/tests/test_find_descripton.py b/tests/test_find_descripton.py
index 80dd1ab5..721c10a9 100644
--- a/tests/test_find_descripton.py
+++ b/tests/test_find_descripton.py
@@ -5,9 +5,12 @@
 from stratigraphy.util.find_description import get_description_blocks
 from stratigraphy.util.line import TextLine, TextWord
 
-textline1 = TextLine([TextWord(fitz.Rect([0, 0, 10, 10]), "Hello")])
-textline2 = TextLine([TextWord(fitz.Rect([0, 15, 10, 25]), "World")])
-textline3 = TextLine([TextWord(fitz.Rect([0, 37, 10, 47]), "Hey")])  # larger vertical distance to previous blocks
+page_number = 1
+textline1 = TextLine([TextWord(fitz.Rect([0, 0, 10, 10]), "Hello", page_number)])
+textline2 = TextLine([TextWord(fitz.Rect([0, 15, 10, 25]), "World", page_number)])
+textline3 = TextLine(
+    [TextWord(fitz.Rect([0, 37, 10, 47]), "Hey", page_number)]
+)  # larger vertical distance to previous blocks
 
 geometric_lines = [Line(Point(500, 1), Point(505, 1))]  # line does not cut the blocks
 geometric_lines_cut = [Line(Point(-5, 12), Point(10, 12))]  # line cuts the first and second line
@@ -24,6 +27,7 @@
 
 
 def test_get_description_blocks():  # noqa: D103
+    """Test the grouping of description lines into blocks."""
     target_layer_count = 2  # expect two blocks. But the line do not cut the blocks
     blocks = get_description_blocks(
         description_lines,
@@ -40,6 +44,7 @@ def test_get_description_blocks():  # noqa: D103
 
 
 def test_get_description_blocks_separated_by_line():  # noqa: D103
+    """Test the splitting of blocks based on the presence of a line."""
     target_layer_count = 1  # should not trigger splitting the blocks with vertical distances
     blocks = get_description_blocks(
         description_lines,
@@ -56,6 +61,7 @@ def test_get_description_blocks_separated_by_line():  # noqa: D103
 
 
 def test_get_description_blocks_separated_by_lefthandside_line():  # noqa: D103
+    """Test the splitting of blocks based on the presence of a lefthandside line."""
     target_layer_count = 1  # only one block, but the lefthand line still cuts them into two blocks
     geometric_lines_all = geometric_lines_cut + geometric_lines_lefthandside
     blocks = get_description_blocks(
diff --git a/tests/test_interval.py b/tests/test_interval.py
index f3e9755e..d88bbee9 100644
--- a/tests/test_interval.py
+++ b/tests/test_interval.py
@@ -6,8 +6,10 @@
 
 
 def test_line_anchor():  # noqa: D103
-    start = DepthColumnEntry(fitz.Rect(0, 0, 1, 1), 5)
-    end = DepthColumnEntry(fitz.Rect(0, 2, 1, 3), 10)
+    """Test the line anchor property of the BoundaryInterval and LayerInterval classes."""
+    page_number = 1
+    start = DepthColumnEntry(fitz.Rect(0, 0, 1, 1), 5, page_number)
+    end = DepthColumnEntry(fitz.Rect(0, 2, 1, 3), 10, page_number)
     boundary_interval = BoundaryInterval(start, end)
     assert boundary_interval.line_anchor == fitz.Point(1, 1.5), (
         "The 'line anchor' for a BoundaryInterval should be halfway between the bottom-right of the start depth and "
@@ -24,8 +26,8 @@ def test_line_anchor():  # noqa: D103
         1, 2
     ), "The 'line anchor' for a BoundaryInterval without start should be the top-right of the end depth."
 
-    start = DepthColumnEntry(fitz.Rect(0, 0, 1, 1), 5)
-    end = DepthColumnEntry(fitz.Rect(2, 0, 3, 1), 10)
+    start = DepthColumnEntry(fitz.Rect(0, 0, 1, 1), 5, page_number)
+    end = DepthColumnEntry(fitz.Rect(2, 0, 3, 1), 10, page_number)
     entry = LayerDepthColumnEntry(start, end)
     layer_interval = LayerInterval(entry)
     assert layer_interval.line_anchor == fitz.Point(
@@ -34,8 +36,10 @@ def test_line_anchor():  # noqa: D103
 
 
 def test_background_rect():  # noqa: D103
-    start = DepthColumnEntry(fitz.Rect(0, 0, 1, 1), 5)
-    end = DepthColumnEntry(fitz.Rect(0, 2, 1, 3), 10)
+    """Test the background_rect property of the BoundaryInterval class."""
+    page_number = 1
+    start = DepthColumnEntry(fitz.Rect(0, 0, 1, 1), 5, page_number)
+    end = DepthColumnEntry(fitz.Rect(0, 2, 1, 3), 10, page_number)
     boundary_interval = BoundaryInterval(start, end)
     assert boundary_interval.background_rect == fitz.Rect(
         start.rect.x0, start.rect.y1, start.rect.x1, end.rect.y0
diff --git a/tests/test_textblock.py b/tests/test_textblock.py
index 4bf8ba5e..69536ee8 100644
--- a/tests/test_textblock.py
+++ b/tests/test_textblock.py
@@ -6,19 +6,23 @@
 
 
 def test_concatenate():  # noqa: D103
-    tb1 = TextBlock([TextLine([TextWord(fitz.Rect([0, 0, 5, 1]), "Hello")])])
-    tb2 = TextBlock([TextLine([TextWord(fitz.Rect([0, 1, 5, 2]), "World")])])
+    """Test that concatenating two single-line TextBlocks keeps both lines and joins their text."""
+    page_number = 1
+    tb1 = TextBlock([TextLine([TextWord(fitz.Rect([0, 0, 5, 1]), "Hello", page_number)])])
+    tb2 = TextBlock([TextLine([TextWord(fitz.Rect([0, 1, 5, 2]), "World", page_number)])])
     tb3 = tb1.concatenate(tb2)
     assert len(tb3.lines) == 2, "There should be 2 lines in the concatenated TextBlock"
     assert tb3.text == "Hello World", "The text should be 'Hello World'"
 
 
 def test_split_based_on_indentation():  # noqa: D103
+    """Test the splitting of a TextBlock based on indentation."""
+    page_number = 1
     tb = TextBlock(
         [
-            TextLine([TextWord(fitz.Rect(0, 0, 20, 5), "Hello")]),
-            TextLine([TextWord(fitz.Rect(0, 8, 20, 13), "Hello")]),
-            TextLine([TextWord(fitz.Rect(3, 16, 22, 21), "World")]),  # Indented line
+            TextLine([TextWord(fitz.Rect(0, 0, 20, 5), "Hello", page_number)]),
+            TextLine([TextWord(fitz.Rect(0, 8, 20, 13), "Hello", page_number)]),
+            TextLine([TextWord(fitz.Rect(3, 16, 22, 21), "World", page_number)]),  # Indented line
         ]
     )
     blocks = tb.split_based_on_indentation()
@@ -26,17 +30,26 @@ def test_split_based_on_indentation():  # noqa: D103
 
 
 def test_post_init():  # noqa: D103
-    tb = TextBlock([TextLine([TextWord(fitz.Rect(0, 0, 5, 1), "Hello")])])
+    """Test that a single-line TextBlock reports the expected line_count, text and rect."""
+    page_number = 1
+    tb = TextBlock([TextLine([TextWord(fitz.Rect(0, 0, 5, 1), "Hello", page_number)])])
     assert tb.line_count == 1, "The line count should be 1"
     assert tb.text == "Hello", "The text should be 'Hello'"
     assert tb.rect == fitz.Rect(0, 0, 5, 1), "The rect should be the same as the line's rect"
 
 
 def test_post_init_longer_text():  # noqa: D103
+    """Test the post-init method of the TextBlock class with multiple lines."""
+    page_number = 1
     tb = TextBlock(
         [
-            TextLine([TextWord(fitz.Rect(0, 0, 5, 1), "Hello")]),
-            TextLine([TextWord(fitz.Rect(0, 1, 5, 2), "It's"), TextWord(fitz.Rect(5, 1, 10, 2), "me")]),
+            TextLine([TextWord(fitz.Rect(0, 0, 5, 1), "Hello", page_number)]),
+            TextLine(
+                [
+                    TextWord(fitz.Rect(0, 1, 5, 2), "It's", page_number),
+                    TextWord(fitz.Rect(5, 1, 10, 2), "me", page_number),
+                ]
+            ),
         ]
     )
     assert tb.line_count == 2, "The line count should be 2"
@@ -45,8 +58,10 @@ def test_post_init_longer_text():  # noqa: D103
 
 
 def test_block_distance():  # noqa: D103
-    block_1 = TextBlock([TextLine([TextWord(fitz.Rect(0, 0, 5, 1), "Hello")])])
-    block_2 = TextBlock([TextLine([TextWord(fitz.Rect(0, 2, 5, 3), "Hello")])])
+    """Test that block_distance returns the gap between two vertically stacked TextBlocks."""
+    page_number = 1
+    block_1 = TextBlock([TextLine([TextWord(fitz.Rect(0, 0, 5, 1), "Hello", page_number)])])
+    block_2 = TextBlock([TextLine([TextWord(fitz.Rect(0, 2, 5, 3), "Hello", page_number)])])
     assert (
         block_distance(block_1, block_2) == 1
     ), "The distance should be measured from the bottom of the first block to the top of the second block."