diff --git a/.env.template b/.env.template new file mode 100644 index 00000000..09de5f9d --- /dev/null +++ b/.env.template @@ -0,0 +1,6 @@ +MLFLOW_TRACKING="True" +MLFLOW_TRACKING_URI="http://127.0.0.1:5000" + +AWS_ACCESS_KEY_ID=your_access_key_id +AWS_SECRET_ACCESS_KEY=your_secret_access_key +AWS_ENDPOINT=your_endpoint_url diff --git a/.vscode/settings.json b/.vscode/settings.json index 8fa809ca..6767e200 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -5,6 +5,7 @@ "depthcolumn", "depthcolumnentry", "dotenv", + "fastapi", "fitz", "mlflow", "pixmap", diff --git a/README.md b/README.md index 8240136d..f6cd0448 100644 --- a/README.md +++ b/README.md @@ -89,28 +89,28 @@ To execute the data extraction pipeline, follow these steps: 1. **Activate the virtual environment** - Activate your virtual environment. On unix systems this is +Activate your virtual environment. On Unix systems, this is: - ``` bash - source env/bin/activate - ``` +``` bash +source env/bin/activate +``` 2. **Download the borehole profiles, optional** - Use `boreholes-download-profiles` to download the files to be processed from an AWS S3 storage. In order to do so, you need to authenticate with aws first. We recommend to use the aws CLI for that purpose. This step is optional, you can continue with step 3 on your own set of borehole profiles. +Use `boreholes-download-profiles` to download the files to be processed from an AWS S3 storage. In order to do so, you need to authenticate with AWS first. We recommend using the AWS CLI for that purpose. This step is optional; you can continue with step 3 using your own set of borehole profiles. 3. **Run the extraction script** - The main script for the extraction pipeline is located at `src/stratigraphy/main.py`. A cli command is created to run this script. +The main script for the extraction pipeline is located at `src/stratigraphy/main.py`. A CLI command is provided to run this script. - Run `boreholes-extract-all` to run the main extraction script. You need to specify the input directory or a single PDF file using the `-i` or `--input-directory` flag. - The script will source all PDFs from the specified directory and create PNG files in the `data/output/draw` directory. +Run `boreholes-extract-all` to start the main extraction script. You need to specify the input directory or a single PDF file using the `-i` or `--input-directory` flag. +The script will source all PDFs from the specified directory and create PNG files in the `data/output/draw` directory. - Use `boreholes-extract-all --help` to see all options for the extraction script. +Use `boreholes-extract-all --help` to see all options for the extraction script. 4. **Check the results** - Once the script has finished running, you can check the results in the `data/output/draw` directory. The result is a `predictions.json` file as well as a png file for each page of each PDF in the specified input directory. +Once the script has finished running, you can check the results in the `data/output/draw` directory. The result is a `predictions.json` file as well as a PNG file for each page of each PDF in the specified input directory. ### Output Structure The `predictions.json` file contains the results of a data extraction process from PDF files. Each key in the JSON object is the name of a PDF file, and the value is a list of extracted items in a dictionary-like object. For now, the extracted items are the material descriptions in their correct order (given by their depths). 
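For orientation, here is a minimal, hypothetical sketch of how this structure can be inspected with Python. The output path assumes the default `data/output/draw` location mentioned above, and the fields inside each item are not spelled out because their exact names may differ:

```python
import json
from pathlib import Path

# Assumed default output location; adjust if your results are written elsewhere.
predictions_path = Path("data/output/draw/predictions.json")

with predictions_path.open() as f:
    predictions = json.load(f)

# One entry per processed PDF; each value is a list of dictionary-like items
# holding the material descriptions in depth order.
for pdf_name, items in predictions.items():
    print(f"{pdf_name}: {len(items)} extracted items")
```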
@@ -247,46 +247,45 @@ To launch the API and access its endpoints, follow these steps: 1. **Activate the virtual environment** - Activate your virtual environment. On Unix systems, this can be done with the following command: +Activate your virtual environment. On Unix systems, this can be done with the following command: - ```bash - source env/bin/activate - ``` +```bash +source env/bin/activate +``` 2. **Environment variables** - Please make sure to define the environment variables needed for the API to access the S3 Bucket of interest. +Please make sure to define the environment variables needed for the API to access the S3 bucket of interest. - ```python - aws_access_key_id = os.environ.get("AWS_ACCESS_KEY_ID") - aws_secret_key_access = os.environ.get("AWS_SECRET_ACCESS_KEY") - aws_session_token = os.environ.get("AWS_SESSION_TOKEN") - aws_endpoint = os.environ.get("AWS_ENDPOINT") - ``` +```python +aws_access_key_id = os.environ.get("AWS_ACCESS_KEY_ID") +aws_secret_access_key = os.environ.get("AWS_SECRET_ACCESS_KEY") +aws_endpoint = os.environ.get("AWS_ENDPOINT") +``` 3. **Start the FastAPI server** - Run the following command to start the FastAPI server: +Run the following command to start the FastAPI server: - ```bash - uvicorn src.app.main:app --reload --host 0.0.0.0 --port 8002 - ``` +```bash +uvicorn src.app.main:app --reload --host 0.0.0.0 --port 8002 +``` - This will start the server on port 8002 of the localhost and enable automatic reloading whenever changes are made to the code. You can see the OpenAPI Specification (formerly Swagger Specification) by opening: `http://127.0.0.1:8002/docs#/` in your favorite browser. +This will start the server on port 8002 of localhost and enable automatic reloading whenever changes are made to the code. You can see the OpenAPI Specification (formerly Swagger Specification) by opening `http://127.0.0.1:8002/docs#/` in your favorite browser. 4. **Access the API endpoints** - Once the server is running, you can access the API endpoints using a web browser or an API testing tool like Postman. +Once the server is running, you can access the API endpoints using a web browser or an API testing tool like Postman. - The main endpoint for the data extraction pipeline is `http://localhost:8000/extract-data`. You can send a POST request to this endpoint with the PDF file you want to extract data from. +The main endpoint for the data extraction pipeline is `http://localhost:8002/extract-data`. You can send a POST request to this endpoint with the PDF file you want to extract data from. - Additional endpoints and their functionalities can be found in the project's source code. +Additional endpoints and their functionalities can be found in the project's source code. - **Note:** Make sure to replace `localhost` with the appropriate hostname or IP address if you are running the server on a remote machine. +**Note:** Make sure to replace `localhost` with the appropriate hostname or IP address if you are running the server on a remote machine. 5. **Stop the server** - To stop the FastAPI server, press `Ctrl + C` in the terminal where the server is running. Please refer to the [FastAPI documentation](https://fastapi.tiangolo.com) for more information on how to work with FastAPI and build APIs using this framework. +To stop the FastAPI server, press `Ctrl + C` in the terminal where the server is running. 
Please refer to the [FastAPI documentation](https://fastapi.tiangolo.com) for more information on how to work with FastAPI and build APIs using this framework. ## API as Docker Image @@ -295,99 +294,166 @@ The borehole application offers a given amount of functionalities (extract text, 1. **Navigate to the project directory** - Change your current directory to the project directory: +Change your current directory to the project directory: - ```bash - cd swissgeol-boreholes-dataextraction - ``` +```bash +cd swissgeol-boreholes-dataextraction +``` 2. **Build the Docker image** - Build the Docker image using the following command: +Build the Docker image using the following command: - ```bash - docker build -t borehole-api . -f Dockerfile - ``` +```bash +docker build -t borehole-api . -f Dockerfile +``` - ```bash - docker build --platform linux/amd64 -t borehole-api:test . - ``` +```bash +docker build --platform linux/amd64 -t borehole-api:test . +``` - This command will build the Docker image with the tag `borehole-api`. +This command will build the Docker image with the tag `borehole-api`. 3. **Verify the Docker image** - Verify that the Docker image has been successfully built by running the following command: +Verify that the Docker image has been successfully built by running the following command: - ```bash - docker images - ``` +```bash +docker images +``` - You should see the `borehole-api` image listed in the output. +You should see the `borehole-api` image listed in the output. 4. **Run the Docker container** - To run the Docker container, use the following command: +4.1. **Run the Docker container without AWS credentials** + +To run the Docker container, use the following command: + +```bash +docker run -p 8000:8000 borehole-api +``` - ```bash - docker run -p 8000:8000 borehole-api - ``` +This command will start the container and map port 8000 of the container to port 8000 of the host machine. - This command will start the container and map port 8000 of the container to port 8000 of the host machine. +4.2. **Run the Docker image with AWS credentials** -5. **Run the docker image with the AWS credentials** +4.2.1. **Using the `~/.aws` directory** If you have your AWS credentials configured locally under `~/.aws`, you can run the following commands to forward them to the Docker container. - ```bash +To run the Docker image built from `Dockerfile` locally: - docker run -v ~/.aws:/root/.aws -d -p 8000:8000 borehole-api - ``` +```bash - ```bash - docker run --platform linux/amd64 -v ~/.aws:/root/.aws -d -p 8000:8000 borehole-api:test - ``` +docker run -v ~/.aws:/root/.aws -d -p 8000:8000 borehole-api +``` +To run the Docker image built from `Dockerfile` with the environment variables from the `.env` file: -6. **Access the API** +```bash +docker run --env-file .env -d -p 8000:8000 borehole-api +``` + +To run the Docker image for AWS Lambda, built from `Dockerfile.aws.lambda`: - Once the container is running, you can access the API by opening a web browser and navigating to `http://localhost:8000`. +```bash +docker run --platform linux/amd64 -v ~/.aws:/root/.aws -d -p 8000:8000 borehole-api:test +``` - You can also use an API testing tool like Postman to send requests to the API endpoints. +4.2.2. **Passing the AWS credentials as Environment Variables** - **Note:** If you are running Docker on a remote machine, replace `localhost` with the appropriate hostname or IP address. 
+It is also possible to set the AWS credentials as environment variables on your machine and pass them as environment variables to the Docker container you are running. +Unix-based Systems (Linux/macOS) -7. **Query the API** +Add the following lines to your `~/.bashrc`, `~/.bash_profile`, or `~/.zshrc` (depending on your shell): ```bash - curl -X 'POST' \ - 'http://localhost:8000/api/V1/create_pngs' \ - -H 'accept: application/json' \ - -H 'Content-Type: application/json' \ - -d '{ - "filename": "10021.pdf" - }' +export AWS_ACCESS_KEY_ID=your_access_key_id +export AWS_SECRET_ACCESS_KEY=your_secret_access_key +export AWS_ENDPOINT=your_endpoint_url ``` -8. **Stop the Docker container** +Please note that the endpoint URL has the following format: `https://{bucket}.s3.{region}.amazonaws.com`. This +URL can be found in AWS when you go to your target S3 bucket, select any item in the bucket and look at its +Properties under `Object URL`. Remove the object-specific part of the path and you will end up with your endpoint URL. - To stop the Docker container, press `Ctrl + C` in the terminal where the container is running. +After editing, run the following command to apply the changes: - Alternatively, you can use the following command to stop the container: +```bash +source ~/.bashrc # Or ~/.bash_profile, ~/.zshrc based on your configuration +``` - ```bash - docker stop - ``` +Windows (Command Prompt or PowerShell) - Replace `` with the ID of the running container, which can be obtained by running `docker ps`. +For Command Prompt: + +```cmd +setx AWS_ACCESS_KEY_ID your_access_key_id +setx AWS_SECRET_ACCESS_KEY your_secret_access_key +setx AWS_ENDPOINT your_endpoint_url +``` + +For PowerShell: + +```powershell +$env:AWS_ACCESS_KEY_ID="your_access_key_id" +$env:AWS_SECRET_ACCESS_KEY="your_secret_access_key" +$env:AWS_ENDPOINT="your_endpoint_url" +``` + +4.2.3. **Passing the AWS credentials in an Environment File** + +Another option is to store the credentials in a `.env` file and load them into your Python environment using the `python-dotenv` package: + +```bash +AWS_ACCESS_KEY_ID=your_access_key_id +AWS_SECRET_ACCESS_KEY=your_secret_access_key +AWS_ENDPOINT=your_endpoint_url +``` + +You can find an example of such a `.env` file in `.env.template`. If you rename this file to `.env` and add your AWS credentials, you should be good to go. + +5. **Access the API** + +Once the container is running, you can access the API by opening a web browser and navigating to `http://localhost:8000`. + +You can also use an API testing tool like Postman to send requests to the API endpoints. + +**Note:** If you are running Docker on a remote machine, replace `localhost` with the appropriate hostname or IP address. + + +6. **Query the API** + +```bash +curl -X 'POST' \ +'http://localhost:8000/api/V1/create_pngs' \ +-H 'accept: application/json' \ +-H 'Content-Type: application/json' \ +-d '{ +"filename": "10021.pdf" +}' +``` + +7. **Stop the Docker container** + +To stop the Docker container, press `Ctrl + C` in the terminal where the container is running. + +Alternatively, you can use the following command to stop the container: + +```bash +docker stop <container_id> +``` + +Replace `<container_id>` with the ID of the running container, which can be obtained by running `docker ps`. ## AWS Lambda Deployment AWS Lambda is a serverless computing service provided by Amazon Web Services that allows you to run code without managing servers. It automatically scales your applications by executing code in response to triggers. You only pay for the compute time used. 
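The next paragraph explains how the FastAPI application is wrapped for AWS Lambda with Mangum. As a minimal, hypothetical sketch of that wiring (the module layout, route, and app object shown here are illustrative assumptions, not the project's actual code):

```python
from fastapi import FastAPI
from mangum import Mangum

# Illustrative app only; the project's real FastAPI app lives in its own module (e.g. src/app/main.py).
app = FastAPI()

@app.get("/health")
def health() -> dict:
    return {"status": "ok"}

# Mangum translates API Gateway / Lambda events into ASGI requests,
# so "handler" is what the Lambda function is configured to invoke.
handler = Mangum(app)
```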
-In this project we are using Mangum to wrap the FastAPI with a handler that we will package and deploy as a Lambda function in AWS. Then using AWS API Gateway we will route all incoming requests to invoke the lambda and handle the routing internally within our application. +In this project we use `Mangum` to wrap the FastAPI app with a handler that we package and deploy as a Lambda function in AWS. Then, using AWS API Gateway, we route all incoming requests to invoke the Lambda and handle the routing internally within our application. We created a script that should make it possible for you to deploy the FastAPI app to AWS Lambda using a single command. The script creates all the required AWS resources to run the API. The resources that will be created for you are: - AWS Lambda Function @@ -397,11 +463,10 @@ We created a script that should make it possible for you to deploy the FastAPI i To deploy the staging version of the FastAPI, run the following command: -```shell +```bash IMAGE=borehole-fastapi ENV=stage AWS_PROFILE=dcleres-visium AWS_S3_BUCKET=dcleres-boreholes-integration-tmp ./deploy_api_aws_lambda.sh ``` - ## Experiment Tracking We perform experiment tracking using MLFlow. Each developer has their own local MLFlow instance. diff --git a/src/app/api/v1/endpoints/create_pngs.py b/src/app/api/v1/endpoints/create_pngs.py index 912c3893..6808b59a 100644 --- a/src/app/api/v1/endpoints/create_pngs.py +++ b/src/app/api/v1/endpoints/create_pngs.py @@ -5,7 +5,6 @@ import fitz from app.common.aws import load_pdf_from_aws, upload_file_to_s3 -from app.common.config import config from app.common.schemas import PNGResponse from fastapi import HTTPException @@ -49,8 +48,7 @@ def create_pngs(aws_filename: Path): ) # Generate the S3 URL - png_url = f"https://{config.bucket_name}.s3.amazonaws.com/{s3_bucket_png_path}" - png_urls.append(png_url) + png_urls.append(s3_bucket_png_path) # Clean up the local file os.remove(png_path) diff --git a/src/app/common/aws.py b/src/app/common/aws.py index cd2275a3..b0cfb8d5 100644 --- a/src/app/common/aws.py +++ b/src/app/common/aws.py @@ -7,12 +7,55 @@ import fitz import numpy as np from app.common.config import config +from botocore.exceptions import ClientError, NoCredentialsError +from dotenv import load_dotenv from fastapi import HTTPException from PIL import Image -# Initialize the S3 client -# AWS S3 Configuration -s3_client = boto3.client("s3") +load_dotenv() + +_s3_client = None # Global reference to the S3 client + + +def get_s3_client(): + """Lazy initialization of the S3 client. + + Returns: + boto3.client: The S3 client. + """ + global _s3_client + if _s3_client is None: + _s3_client = create_s3_client() + return _s3_client + + +def create_s3_client(): + """Create an S3 client using default or custom credentials. + + Returns: + boto3.client: The S3 client. 
+ """ + try: + # Attempt to use default AWS credentials + s3_client = boto3.client("s3") + # Perform a quick test to ensure credentials are valid + s3_client.list_buckets() + return s3_client + except (NoCredentialsError, ClientError): + # Fallback to custom credentials if no credentials are found + try: + s3_client = boto3.client( + "s3", + aws_access_key_id=config.aws_access_key_id, + aws_secret_access_key=config.aws_secret_access_key, + endpoint_url=config.aws_endpoint, + ) + # Test the fallback client + s3_client.list_buckets() + return s3_client + except (NoCredentialsError, ClientError) as e: + print(f"Error accessing S3 with custom credentials: {e}") + raise HTTPException(status_code=500, detail="Failed to access S3.") from None def load_pdf_from_aws(filename: Path) -> fitz.Document: @@ -25,15 +68,8 @@ def load_pdf_from_aws(filename: Path) -> fitz.Document: fitz.Document: The loaded PDF document. """ # Load the PDF from the S3 object - try: - data = load_data_from_aws(filename) - pdf_document = fitz.open(stream=data, filetype="pdf") - except Exception: - raise HTTPException( - status_code=404, detail="Failed to load PDF document. The filename is not found in the bucket." - ) from None - - return pdf_document + data = load_data_from_aws(filename) + return fitz.open(stream=data, filetype="pdf") def load_png_from_aws(filename: Path) -> np.ndarray: @@ -58,21 +94,23 @@ def load_data_from_aws(filename: Path, prefix: str = "") -> bytes: """Load a document from AWS S3. Args: - filename (str): The filename of the PNG image. + filename (str): The filename of the document. prefix (str): The prefix of the file in the bucket. Returns: - bytes: The loaded PNG image. + bytes: The loaded document. """ - # Check if the PNG exists in S3 + s3_client = get_s3_client() + + # Check if the document exists in S3 try: - png_object = s3_client.get_object(Bucket=config.bucket_name, Key=str(prefix / filename)) + s3_object = s3_client.get_object(Bucket=config.bucket_name, Key=str(prefix / filename)) except s3_client.exceptions.NoSuchKey: raise HTTPException(status_code=404, detail=f"Document {prefix / filename} not found in S3 bucket.") from None - # Load the PNG from the S3 object + # Load the document from the S3 object try: - data = png_object["Body"].read() + data = s3_object["Body"].read() except Exception: raise HTTPException(status_code=500, detail="Failed to load data.") from None @@ -86,4 +124,4 @@ def upload_file_to_s3(file_path: str, key: str): file_path (str): The local path to the file to upload. key (str): The key (name) of the file in the bucket. 
""" - s3_client.upload_file(file_path, config.bucket_name, key) + get_s3_client().upload_file(file_path, config.bucket_name, key) diff --git a/src/app/common/config.py b/src/app/common/config.py index 80c3bfe6..f91b8725 100644 --- a/src/app/common/config.py +++ b/src/app/common/config.py @@ -22,16 +22,17 @@ class Config(BaseSettings): logging_level: int = logging.DEBUG ########################################################### - # AWS + # AWS Settings ########################################################### bucket_name: str = get_aws_bucket_name() test_bucket_name: str = "test-bucket" - # TODO: check how this is used on the VM - # aws_access_key_id = os.environ.get("AWS_ACCESS_KEY_ID") - # aws_secret_key_access = os.environ.get("AWS_SECRET_ACCESS_KEY") - # aws_session_token = os.environ.get("AWS_SESSION_TOKEN") - # aws_endpoint = os.environ.get("AWS_ENDPOINT") + ########################################################### + # AWS Credentials + ########################################################### + aws_access_key_id: str | None = os.environ.get("AWS_ACCESS_KEY_ID") + aws_secret_access_key: str | None = os.environ.get("AWS_SECRET_ACCESS_KEY") + aws_endpoint: str | None = os.environ.get("AWS_ENDPOINT") config = Config() diff --git a/tests/conftest.py b/tests/conftest.py index 9e8fceba..a6c8ef10 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -2,6 +2,7 @@ import boto3 import pytest +from app.common.aws import get_s3_client from app.common.config import config from app.main import app from fastapi.testclient import TestClient @@ -18,6 +19,7 @@ def test_client(): def s3_client(monkeypatch): """Mocked S3 client.""" with mock_aws(): + # Create the mocked S3 client conn = boto3.client("s3", region_name="eu-central-1") # We need to create the bucket since this is all in Moto's 'virtual' AWS account conn.create_bucket( @@ -45,10 +47,27 @@ def mock_upload_file(Filename, Bucket, Key, *args, **kwargs): # Call the original upload_file with the modified or original arguments return original_upload_file(*args, Filename=Filename, Bucket=Bucket, Key=Key, **kwargs) + # Mock the list_buckets method with the test bucket name + original_list_buckets = conn.list_buckets + + def mock_list_buckets(*args, **kwargs): + response = original_list_buckets(*args, **kwargs) + response["Buckets"] = [{"Name": config.test_bucket_name}] + return response + monkeypatch.setattr(conn, "get_object", mock_get_object) monkeypatch.setattr(conn, "upload_file", mock_upload_file) + monkeypatch.setattr(conn, "list_buckets", mock_list_buckets) # Patch the s3_client in the aws module to use the mock - monkeypatch.setattr("app.common.aws.s3_client", conn) + monkeypatch.setattr("app.common.aws._s3_client", conn) yield conn + + +def test_s3_functionality(s3_client): + """Test the S3 functionality.""" + # This now runs in the mock_s3 context + s3_client_instance = get_s3_client() + response = s3_client_instance.list_buckets() + assert "Buckets" in response diff --git a/tests/test_create_pngs.py b/tests/test_create_pngs.py index 231c71c9..37efbf60 100644 --- a/tests/test_create_pngs.py +++ b/tests/test_create_pngs.py @@ -4,6 +4,10 @@ test_client fixture is created by the TestClient(app) call, which creates a test client for the FastAPI app. The s3_client fixture is created by the mock_aws decorator, which mocks the AWS S3 client using Moto. +NOTE: Please note that the code in tests/conftest.py is called before the tests are run. This is where the AWS S3 +client is mocked using Moto. 
The s3_client fixture is then used in the test functions to interact with the mocked +S3 client. Furthermore, the upload_test_pdf fixture is used to upload a test PDF file to the S3 bucket before +running the tests. """ from pathlib import Path @@ -49,11 +53,10 @@ def test_create_pngs_success(test_client: TestClient, s3_client, upload_test_pdf # Verify that PNG files are uploaded to S3 for png_url in json_response["png_urls"]: - png_key = png_url.split("/", 3)[-1] try: - s3_client.head_object(Bucket=config.test_bucket_name, Key=png_key) + s3_client.head_object(Bucket=config.test_bucket_name, Key=png_url) except ClientError: - pytest.fail(f"PNG file {png_key} not found in S3.") + pytest.fail(f"PNG file {png_url} not found in S3.") def test_create_pngs_invalid_filename(test_client: TestClient): @@ -65,11 +68,11 @@ def test_create_pngs_invalid_filename(test_client: TestClient): } -def test_create_pngs_nonexistent_pdf(test_client: TestClient): +def test_create_pngs_nonexistent_pdf(test_client: TestClient, s3_client): """Test the create_pngs endpoint with a nonexistent PDF file.""" response = test_client.post("/api/V1/create_pngs", json={"filename": "nonexistent.pdf"}) assert response.status_code == 404 - assert response.json() == {"detail": "Failed to load PDF document. The filename is not found in the bucket."} + assert response.json() == {"detail": "Document nonexistent.pdf not found in S3 bucket."} def test_create_pngs_missing_pdf_extension(test_client: TestClient): diff --git a/tests/test_data_extraction_from_bbox.py b/tests/test_data_extraction_from_bbox.py index f3172bac..f387552f 100644 --- a/tests/test_data_extraction_from_bbox.py +++ b/tests/test_data_extraction_from_bbox.py @@ -210,7 +210,7 @@ def test_invalid_pdf(test_client: TestClient, upload_test_pdf, upload_test_png): request_json["filename"] = "invalid.pdf" response = test_client.post("/api/V1/extract_data", json=request_json) assert response.status_code == 404 - assert response.json() == {"detail": "Failed to load PDF document. The filename is not found in the bucket."} + assert response.json() == {"detail": "Document invalid.pdf not found in S3 bucket."} def test_number_extraction(test_client: TestClient, upload_test_pdf, upload_test_png):