Skip to content

Commit

Permalink
cleanup / directory structure improvements
Browse files Browse the repository at this point in the history
  • Loading branch information
stijnvermeeren-swisstopo committed Jul 23, 2024
1 parent 235e185 commit 9bf677e
Show file tree
Hide file tree
Showing 4 changed files with 24 additions and 23 deletions.
10 changes: 5 additions & 5 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -43,9 +43,9 @@ To execute the data extraction pipeline, follow these steps:

The main script for the extraction pipeline is located at `src/stratigraphy/main.py`. A cli command is created to run this script.

Run `boreholes-extract-layers` to run the main extraction script. With the default options, the command will source all PDFs from the `data/Benchmark` directory and create PNG files in the `data/Benchmark/extract` directory.
Run `boreholes-extract-all` to run the main extraction script. With the default options, the command will source all PDFs from the `data/Benchmark` directory and create PNG files in the `data/Benchmark/extract` directory.

Use `boreholes-extract-layers --help` to see all options for the extraction script.
Use `boreholes-extract-all --help` to see all options for the extraction script.

4. **Check the results**

Expand Down Expand Up @@ -154,9 +154,9 @@ The project structure and the most important files are as follows:
- `util/` : Utility scripts and modules.
- `benchmark/` : Scripts to evaluate the data extraction.
- `data/` : The data used by the project.
- `Benchmark/` : The directory containing the PDF files to be analyzed.
- `extract/` : The directory where the PNG files are saved.
- `predictions.json` : The output file of the project, containing the results of the data extraction process.
- `output/` :
- `draw/` : The directory where the PNG files are saved.
- `predictions.json` : The output file of the project, containing the results of the data extraction process.
- `tests/` : The tests for the project.
- `README.md` : The README file for the project.

Expand Down
5 changes: 4 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -40,8 +40,11 @@ experiment-tracking = [
visualize = [
"matplotlib==3.8.0"
]
devtools = [
"tqdm"
]

all = ["swissgeol-boreholes-dataextraction[test, lint, experiment-tracking, visualize]"]
all = ["swissgeol-boreholes-dataextraction[test, lint, experiment-tracking, visualize, devtools]"]

[project.scripts]
boreholes-extract-all = "stratigraphy.main:click_pipeline"
Expand Down
19 changes: 9 additions & 10 deletions src/stratigraphy/get_files.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
"""Script to download the borehole profiles from the S3 bucket."""

import os
from pathlib import Path

import boto3
Expand All @@ -14,15 +13,15 @@
@click.option("--bucket-name", default="stijnvermeeren-boreholes-data", help="The name of the bucket.")
@click.option(
"--remote-directory-name",
default="data_v2/validation",
default="",
help="The name of the directory in the bucket to be downloaded.",
)
@click.option(
"--output-path", default=DATAPATH, type=click.Path(path_type=Path), help="The path to save the downloaded files."
)
def download_directory_froms3(
bucket_name: str = "stijnvermeeren-boreholes-data",
remote_directory_name: str = "data_v2/validation",
bucket_name: str,
remote_directory_name: str,
output_path: Path = DATAPATH,
):
"""Download a directory from S3 bucket.
Expand All @@ -31,17 +30,17 @@ def download_directory_froms3(
\f
Args:
bucketName (str): The name of the bucket.
remoteDirectoryName (str): The name of the directory in the bucket to be downloaded.
bucket_name (str): The name of the bucket.
remote_directory_name (str): The name of the directory in the bucket to be downloaded.
output_path (Path): Where to store the files locally
""" # noqa: D301
s3_resource = boto3.resource("s3")
bucket = s3_resource.Bucket(bucket_name)
total_files = sum(1 for _ in bucket.objects.filter(Prefix=remote_directory_name)) # this is fast
for obj in tqdm(bucket.objects.filter(Prefix=remote_directory_name), total=total_files):
Path(output_path / obj.key).parent.mkdir(parents=True, exist_ok=True)
if not os.path.exists(os.path.dirname(obj.key)):
os.makedirs(os.path.dirname(obj.key))
bucket.download_file(obj.key, output_path / obj.key) # save to same path
if obj.key:
Path(output_path / obj.key).parent.mkdir(parents=True, exist_ok=True)
bucket.download_file(obj.key, output_path / obj.key) # save to same path


if __name__ == "__main__":
Expand Down
13 changes: 6 additions & 7 deletions src/stratigraphy/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,28 +35,26 @@
"-i",
"--input-directory",
type=click.Path(exists=True, path_type=Path),
default=DATAPATH / "Benchmark",
help="Path to the input directory, or path to a single pdf file.",
)
@click.option(
"-g",
"--ground-truth-path",
type=click.Path(exists=False, path_type=Path),
default=DATAPATH / "Benchmark" / "ground_truth.json",
help="Path to the ground truth file.",
)
@click.option(
"-o",
"--out-directory",
type=click.Path(path_type=Path),
default=DATAPATH / "Benchmark" / "evaluation",
default=DATAPATH / "output",
help="Path to the output directory.",
)
@click.option(
"-p",
"--predictions-path",
type=click.Path(path_type=Path),
default=DATAPATH / "Benchmark" / "extract" / "predictions.json",
default=DATAPATH / "output" / "predictions.json",
help="Path to the predictions file.",
)
@click.option(
Expand Down Expand Up @@ -144,8 +142,9 @@ def start_pipeline(

temp_directory = DATAPATH / "_temp" # temporary directory to dump files for mlflow artifact logging

# check if directories exist and create them when neccessary
out_directory.mkdir(parents=True, exist_ok=True)
# check if directories exist and create them when necessary
draw_directory = out_directory / "draw"
draw_directory.mkdir(parents=True, exist_ok=True)
temp_directory.mkdir(parents=True, exist_ok=True)

# if a file is specified instead of an input directory, copy the file to a temporary directory and work with that.
Expand Down Expand Up @@ -216,7 +215,7 @@ def start_pipeline(
predictions, number_of_truth_values = create_predictions_objects(predictions, ground_truth_path)

if not skip_draw_predictions:
draw_predictions(predictions, input_directory, out_directory)
draw_predictions(predictions, input_directory, draw_directory)

if number_of_truth_values: # only evaluate if ground truth is available
metrics, document_level_metrics = evaluate_borehole_extraction(predictions, number_of_truth_values)
Expand Down

1 comment on commit 9bf677e

@github-actions
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Coverage

Coverage Report
File | Stmts | Miss | Cover | Missing
src/stratigraphy
   __init__.py | 8 | 1 | 88% | 11
   extract.py | 188 | 188 | 0% | 3–482
   get_files.py | 19 | 19 | 0% | 3–47
   line_detection.py | 26 | 26 | 0% | 3–76
   main.py | 95 | 95 | 0% | 3–236
src/stratigraphy/util
   boundarydepthcolumnvalidator.py | 41 | 20 | 51% | 47, 57, 60, 81–84, 109–127, 139–148
   coordinate_extraction.py | 127 | 7 | 94% | 31, 62, 75–76, 80, 205, 328
   dataclasses.py | 32 | 3 | 91% | 37–39
   depthcolumn.py | 194 | 64 | 67% | 26, 30, 51, 57, 60–61, 85, 88, 95, 102, 110–111, 121, 138–154, 192, 229, 248–256, 267, 272, 279, 310, 315–322, 337–338, 381–423
   depthcolumnentry.py | 26 | 7 | 73% | 12, 15, 29–30, 33, 45, 52
   description_block_splitter.py | 70 | 2 | 97% | 24, 139
   draw.py | 80 | 80 | 0% | 3–244
   duplicate_detection.py | 51 | 51 | 0% | 3–146
   extract_text.py | 27 | 2 | 93% | 38–39
   find_depth_columns.py | 91 | 6 | 93% | 42–43, 71, 83, 176–177
   find_description.py | 63 | 28 | 56% | 27–35, 50–63, 79–95, 172–175
   geometric_line_utilities.py | 86 | 2 | 98% | 82, 132
   interval.py | 106 | 55 | 48% | 24–27, 31–34, 39, 44, 47, 57–59, 99–145, 166, 171–187
   language_detection.py | 18 | 18 | 0% | 3–45
   layer_identifier_column.py | 91 | 91 | 0% | 3–227
   line.py | 49 | 4 | 92% | 25, 42, 51, 98
   linesquadtree.py | 46 | 1 | 98% | 76
   plot_utils.py | 43 | 43 | 0% | 3–120
   predictions.py | 130 | 130 | 0% | 3–272
   textblock.py | 74 | 8 | 89% | 27, 51, 63, 75, 98, 119, 127, 155
   util.py | 40 | 18 | 55% | 22, 40–47, 61–63, 87–88, 100–105
TOTAL | 1821 | 969 | 47% |

Tests: 61 | Skipped: 0 💤 | Failures: 0 ❌ | Errors: 0 🔥 | Time: 0.947s ⏱️

Please sign in to comment.