
Commit

fix: add missing bucket write permissions to downloader Lambda and cleanup IntHub references (#43)

* Add write permission to upload bucket for downloader

* Remove mentions of IntHub/SciHub now we're on CDSE
ceholden authored Nov 7, 2024
1 parent 4553661 commit e202d60
Showing 9 changed files with 15 additions and 28 deletions.
1 change: 0 additions & 1 deletion .github/workflows/deploy-on-release.yml
@@ -193,7 +193,6 @@ jobs:
 IDENTIFIER="esa-prod"
 ENABLE_DOWNLOADING="TRUE"
 SCHEDULE_LINK_FETCHING="TRUE"
-USE_INTHUB2="TRUE"
 REMOVAL_POLICY_DESTROY="FALSE"
 UPLOAD_BUCKET="${{ secrets.UPLOAD_BUCKET }}"
 PERMISSIONS_BOUNDARY_ARN="${{ vars.PERMISSIONS_BOUNDARY_ARN }}"
1 change: 0 additions & 1 deletion README.md
@@ -52,7 +52,6 @@ AWS_DEFAULT_PROFILE="<your named AWS CLI profile to use for deployment>"
 PIPENV_NO_INHERIT=TRUE # This is used to ensure our Lambdas/Layers get separate Pipenv environments
 ENABLE_DOWNLOADING="TRUE" # Or "FALSE" - If TRUE then the TO_UPLOAD queue is set as an enabled source to the Downloader
 SCHEDULE_LINK_FETCHING="TRUE" # Or "FALSE" - If TRUE then link fetching will happen every day at midday.
-USE_INTHUB2="TRUE" # Or "FALSE" - If TRUE then the Downloader will use IntHub2 credentials when downloading
 REMOVAL_POLICY_DESTROY="TRUE" # Or "FALSE" - See below for what is deleted if TRUE
 UPLOAD_BUCKET="<name-of-aws-s3-bucket-to-upload-images-to>"
 ```
2 changes: 0 additions & 2 deletions cdk/app.py
@@ -13,7 +13,6 @@
 upload_bucket = os.environ["UPLOAD_BUCKET"]
 enable_downloading = os.environ["ENABLE_DOWNLOADING"] == "TRUE"
 schedule_link_fetching = os.environ["SCHEDULE_LINK_FETCHING"] == "TRUE"
-use_inthub2 = os.environ["USE_INTHUB2"] == "TRUE"
 removal_policy_destroy = os.environ["REMOVAL_POLICY_DESTROY"] == "TRUE"
 print(identifier)

@@ -25,7 +24,6 @@
     upload_bucket=upload_bucket,
     permissions_boundary_arn=permissions_boundary_arn,
     enable_downloading=enable_downloading,
-    use_inthub2=use_inthub2,
     schedule_link_fetching=schedule_link_fetching,
     removal_policy_destroy=removal_policy_destroy,
 )
20 changes: 8 additions & 12 deletions cdk/downloader_stack.py
@@ -12,6 +12,7 @@
     aws_events_targets,
     aws_iam,
     aws_lambda,
+    aws_s3,
 )
 from aws_cdk import aws_lambda_python_alpha as aws_lambda_python
 from aws_cdk import aws_logs, aws_rds, aws_secretsmanager, aws_sqs, aws_ssm
@@ -33,7 +34,6 @@ def __init__(
         zipper_url: Optional[str] = None,
         checksum_url: Optional[str] = None,
         enable_downloading: bool = False,
-        use_inthub2: bool = False,
         schedule_link_fetching: bool = False,
         removal_policy_destroy: bool = True,
         **kwargs,
@@ -290,7 +290,6 @@ def __init__(
 "STAGE": identifier,
 "DB_CONNECTION_SECRET_ARN": downloader_rds_secret.secret_arn,
 "UPLOAD_BUCKET": upload_bucket,
-"USE_INTHUB2": "YES" if use_inthub2 else "NO",
 **({"COPERNICUS_ZIPPER_URL": zipper_url} if zipper_url else {}),
 **({"COPERNICUS_CHECKSUM_URL": checksum_url} if checksum_url else {}),
 }
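For context, the downloader handler presumably reads this configuration from its environment at runtime. The sketch below is illustrative only; the helper name and exact variable handling are assumptions, not the project's actual code.

```python
# Illustrative sketch of reading the downloader's environment configuration;
# load_downloader_config is a hypothetical helper, not part of the repository.
import os


def load_downloader_config() -> dict:
    return {
        "stage": os.environ["STAGE"],
        "db_connection_secret_arn": os.environ["DB_CONNECTION_SECRET_ARN"],
        "upload_bucket": os.environ["UPLOAD_BUCKET"],
        # Optional overrides: only present when the stack passes them through.
        "zipper_url": os.environ.get("COPERNICUS_ZIPPER_URL"),
        "checksum_url": os.environ.get("COPERNICUS_CHECKSUM_URL"),
    }
```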
@@ -344,6 +343,13 @@ def __init__(

 self.downloader.role.add_managed_policy(lambda_insights_policy)

+downloader_bucket = aws_s3.Bucket.from_bucket_name(
+    self,
+    "UploadBucket",
+    bucket_name=upload_bucket,
+)
+downloader_bucket.grant_write(self.downloader)
+
 downloader_rds_secret.grant_read(link_fetcher)
 downloader_rds_secret.grant_read(self.downloader)

@@ -364,16 +370,6 @@

 token_parameter.grant_read(self.downloader)

-if use_inthub2:
-    inthub2_credentials = aws_secretsmanager.Secret.from_secret_name_v2(
-        self,
-        id=f"{identifier}-inthub2-credentials",
-        secret_name=(
-            f"hls-s2-downloader-serverless/{identifier}/inthub2-credentials"
-        ),
-    )
-    inthub2_credentials.grant_read(self.downloader)
-
 to_download_queue.grant_send_messages(link_fetcher)
 to_download_queue.grant_consume_messages(self.downloader)

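The new `grant_write` call above is what supplies the previously missing bucket write permissions: CDK attaches the bucket's write actions (`s3:PutObject` and related actions) to the downloader Lambda's execution role. Below is a minimal, self-contained sketch of that behaviour using CDK assertions; the stack, function, and bucket names are illustrative and not part of this repository.

```python
# Standalone sketch (not the project's test suite): show that grant_write on an
# imported bucket adds S3 write actions to a function's role policy.
import aws_cdk as cdk
from aws_cdk import assertions, aws_lambda, aws_s3

app = cdk.App()
stack = cdk.Stack(app, "GrantWriteSketch")

fn = aws_lambda.Function(
    stack,
    "Downloader",
    runtime=aws_lambda.Runtime.PYTHON_3_11,
    handler="index.handler",
    code=aws_lambda.Code.from_inline("def handler(event, context): ..."),
)
bucket = aws_s3.Bucket.from_bucket_name(stack, "UploadBucket", "my-upload-bucket")
bucket.grant_write(fn)

template = assertions.Template.from_stack(stack)
# The synthesized IAM policy should now include s3:PutObject among the
# granted write actions.
template.has_resource_properties(
    "AWS::IAM::Policy",
    {
        "PolicyDocument": {
            "Statement": assertions.Match.array_with(
                [
                    assertions.Match.object_like(
                        {"Action": assertions.Match.array_with(["s3:PutObject"])}
                    )
                ]
            )
        }
    },
)
```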
1 change: 0 additions & 1 deletion example.env
@@ -5,6 +5,5 @@ AWS_DEFAULT_PROFILE="my-aws-account"
 PIPENV_NO_INHERIT=TRUE
 ENABLE_DOWNLOADING="FALSE"
 SCHEDULE_LINK_FETCHING="FALSE"
-USE_INTHUB2="FALSE"
 REMOVAL_POLICY_DESTROY="TRUE"
 UPLOAD_BUCKET="<name-of-aws-s3-bucket-to-upload-images-to>"
10 changes: 3 additions & 7 deletions lambdas/downloader/README.md
@@ -4,9 +4,9 @@

 ![Downloader in S2 Downloader diagram](../../images/hls-s2-downloader-downloader.png)

-The Downloaders purpose is download Sentinel 2 Images from Sci/IntHub. It is invoked via SQS messages being available within the `TO_DOWNLOAD` SQS queue; this handler will be limited to a concurrency limit of 15, due to the nature of the dedicated connection we have to IntHub. Images are downloaded and uploaded to S3 in one step, they are stored under a key in the format `<YYYY-MM-DD>/<image_filename>` where the date is the `beginposition` of the image.
+The Downloader's purpose is to download Sentinel-2 images from the Copernicus Data Space Ecosystem. It is invoked via SQS messages arriving on the `TO_DOWNLOAD` SQS queue; the handler is limited to a concurrency of 15 due to the nature of the connection we have to the Copernicus Data Space Ecosystem. Images are downloaded and uploaded to S3 in one step and are stored under a key in the format `<YYYY-MM-DD>/<image_filename>`, where the date is the `beginposition` of the image.

-S3 performs a MD5 checksum comparison on upload, this way we ensure that we only store images that match the MD5 that Sci/IntHub provided us for the image.
+S3 performs an MD5 checksum comparison on upload; this way we ensure that we only store images that match the MD5 that the Copernicus Data Space Ecosystem provided us for the image.

 Interactions with the `granule` table include marking the download as having started, updating the checksum of the image, and marking that the download is complete.

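The two README paragraphs above describe the object key layout and the checksum guarantee. A minimal boto3 sketch of that flow follows; `upload_image` and its arguments are illustrative, not the Lambda's actual interface.

```python
# Illustrative sketch (not the downloader's actual code) of the key layout and
# the Content-MD5 check described above.
import base64
import hashlib

import boto3


def upload_image(bucket: str, filename: str, beginposition: str, body: bytes) -> str:
    # Key format: <YYYY-MM-DD>/<image_filename>, date taken from `beginposition`
    # (assumed here to be an ISO-8601 timestamp).
    key = f"{beginposition[:10]}/{filename}"

    # S3 rejects the upload if the object's MD5 does not match Content-MD5, so
    # only images matching the checksum ESA provided end up in the bucket.
    content_md5 = base64.b64encode(hashlib.md5(body).digest()).decode()
    boto3.client("s3").put_object(
        Bucket=bucket, Key=key, Body=body, ContentMD5=content_md5
    )
    return key
```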
@@ -27,7 +27,7 @@ except AlreadyDownloaded:
     return # End gracefully - We received a duplicate from SQS, this is OK

 try:
-    checksum = get_checksum_from_scihub()
+    checksum = get_checksum_from_esa_copernicus()
     download_file()
 except Exception as ex:
     increase_retry_count()
@@ -40,10 +40,6 @@ update_status()

 Due to the nature of how Lambda is invoked by SQS, a non-failed invocation of a Lambda will result in the SQS message being deleted. Because of this, if we need to gracefully handle an error, we tidy up (namely database rollbacks), then raise the error to the handler; this then results in the Lambda failing and the SQS message being re-added to the Queue.

-We use the flag `USE_INTHUB2` with possible values of `YES` and `NO` to determine whether we:
-* A - Replace `scihub` in the fetched links download urls with `inthub2`
-* B - Retrieve `inthub2` credentials when downloading files
-
 ---

 ## Development
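The paragraph above (unchanged by this commit) describes the tidy-up-then-re-raise pattern that hands failed messages back to SQS. A minimal sketch of that pattern, with hypothetical session and helper names:

```python
# Minimal sketch of the rollback-then-raise pattern; `session` and
# `download_and_upload` are hypothetical stand-ins for the handler's real code.
def download_and_upload(session, message) -> None:
    """Hypothetical stand-in for the real download/upload step."""
    ...


def handle_message(session, message) -> None:
    try:
        download_and_upload(session, message)
        session.commit()
    except Exception:
        # Tidy up partial database changes, then re-raise so the invocation
        # fails and SQS re-queues the message for another attempt.
        session.rollback()
        raise
```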
4 changes: 2 additions & 2 deletions lambdas/link_fetcher/README.md
@@ -4,7 +4,7 @@

 ![Link fetcher in S2 Downloader diagram](../../images/hls-s2-downloader-link-fetcher.png)

-The Link Fetchers purpose is to query IntHub/SciHub for new imagery links to download. It is invoked within the `Link Fetching` Step Function; every invocation is performed on one day in the form `YYYY-MM-DD`. Images to download are stored as records in the `granule` table, the `granule_count` table is also updated with available and fetched link counts. The `To Download` queue is also populated with the images IDs and download URLs.
+The Link Fetcher's purpose is to query the Copernicus Data Space Ecosystem for new imagery links to download. It is invoked within the `Link Fetching` Step Function; every invocation is performed on one day in the form `YYYY-MM-DD`. Images to download are stored as records in the `granule` table, and the `granule_count` table is also updated with available and fetched link counts. The `To Download` queue is also populated with the images' IDs and download URLs.

 ---

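As a rough illustration of the last step described above (populating the `To Download` queue), a hedged boto3 sketch follows; the message fields and function name are assumptions, not the Link Fetcher's actual schema.

```python
# Illustrative sketch of queueing one granule for the downloader; the message
# fields ("id", "download_url") are assumed, not the project's actual schema.
import json

import boto3


def queue_granule_for_download(queue_url: str, image_id: str, download_url: str) -> None:
    boto3.client("sqs").send_message(
        QueueUrl=queue_url,
        MessageBody=json.dumps({"id": image_id, "download_url": download_url}),
    )
```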
@@ -14,7 +14,7 @@ Provided below is some pseudo-code to explain the process happening each time th

 ```python
 available, processed = get_how_many_links_are_available_and_have_been_processed()
-query = generate_the_query_for_inthub_for_a_day(day)
+query = generate_the_query_for_esa_for_a_day(day)

 while there_is_still_imagery_to_process:

3 changes: 2 additions & 1 deletion lambdas/link_fetcher/handler.py
@@ -284,7 +284,8 @@ def filter_search_results(
 def get_query_parameters(start: int, day: date) -> Mapping[str, Any]:
     """
     Returns the query parameters that are needed for getting new imagery from
-    search/IntHub
+    search (Copernicus Data Space Ecosystem)
+
     :param start: An int representing the offset to get results from a query
     :param day: A date object representing the date to query for imagery
     :returns: mapping of query parameters
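For orientation, the sketch below shows the general shape such a query-parameter mapping might take for a single day's paged search; the field names (`startDate`, `completionDate`, `index`, `maxRecords`) and page size are assumptions for illustration, not the values defined in handler.py.

```python
# Illustrative only: field names and page size are assumptions, not the real
# parameters defined in handler.py.
from datetime import date, timedelta
from typing import Any, Mapping


def get_query_parameters_sketch(start: int, day: date) -> Mapping[str, Any]:
    next_day = day + timedelta(days=1)
    return {
        "startDate": day.isoformat(),  # beginning of the queried day
        "completionDate": next_day.isoformat(),  # exclusive end of the day
        "index": start,  # offset into the result set
        "maxRecords": 100,  # assumed page size
    }
```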
1 change: 0 additions & 1 deletion tox.ini
@@ -34,7 +34,6 @@ passenv =
     PIPENV_NO_INHERIT
     ENABLE_DOWNLOADING
     SCHEDULE_LINK_FETCHING
-    USE_INTHUB2
     REMOVAL_POLICY_DESTROY
     UPLOAD_BUCKET
 allowlist_externals = make
