From e202d601de90cbc6973d4573d04e1f18698167ce Mon Sep 17 00:00:00 2001
From: Chris Holden
Date: Thu, 7 Nov 2024 16:12:33 -0500
Subject: [PATCH] fix: add missing bucket write permissions to downloader Lambda and clean up IntHub references (#43)

* Add write permission to upload bucket for downloader

* Remove mentions of IntHub/SciHub now that we're on CDSE
---
 .github/workflows/deploy-on-release.yml |  1 -
 README.md                               |  1 -
 cdk/app.py                              |  2 --
 cdk/downloader_stack.py                 | 20 ++++++++------------
 example.env                             |  1 -
 lambdas/downloader/README.md            | 10 +++-------
 lambdas/link_fetcher/README.md          |  4 ++--
 lambdas/link_fetcher/handler.py         |  3 ++-
 tox.ini                                 |  1 -
 9 files changed, 15 insertions(+), 28 deletions(-)

diff --git a/.github/workflows/deploy-on-release.yml b/.github/workflows/deploy-on-release.yml
index b25d04b6..45dc8cc9 100644
--- a/.github/workflows/deploy-on-release.yml
+++ b/.github/workflows/deploy-on-release.yml
@@ -193,7 +193,6 @@ jobs:
           IDENTIFIER="esa-prod"
           ENABLE_DOWNLOADING="TRUE"
           SCHEDULE_LINK_FETCHING="TRUE"
-          USE_INTHUB2="TRUE"
           REMOVAL_POLICY_DESTROY="FALSE"
           UPLOAD_BUCKET="${{ secrets.UPLOAD_BUCKET }}"
           PERMISSIONS_BOUNDARY_ARN="${{ vars.PERMISSIONS_BOUNDARY_ARN }}"
diff --git a/README.md b/README.md
index b9a020df..f51be5b7 100644
--- a/README.md
+++ b/README.md
@@ -52,7 +52,6 @@ AWS_DEFAULT_PROFILE=""
 PIPENV_NO_INHERIT=TRUE # This is used to ensure our Lambdas/Layers get separate Pipenv environments
 ENABLE_DOWNLOADING="TRUE" # Or "FALSE" - If TRUE then the TO_UPLOAD queue is set as an enabled source to the Downloader
 SCHEDULE_LINK_FETCHING="TRUE" # Or "FALSE" - If TRUE then link fetching will happen every day at midday.
-USE_INTHUB2="TRUE" # Or "FALSE" - If TRUE then the Downloader will use IntHub2 credentials when downloading
 REMOVAL_POLICY_DESTROY="TRUE" # Or "FALSE" - See below for what is deleted if TRUE
 UPLOAD_BUCKET=""
 ```
diff --git a/cdk/app.py b/cdk/app.py
index fb22c57d..6f1ddeea 100755
--- a/cdk/app.py
+++ b/cdk/app.py
@@ -13,7 +13,6 @@
 upload_bucket = os.environ["UPLOAD_BUCKET"]
 enable_downloading = os.environ["ENABLE_DOWNLOADING"] == "TRUE"
 schedule_link_fetching = os.environ["SCHEDULE_LINK_FETCHING"] == "TRUE"
-use_inthub2 = os.environ["USE_INTHUB2"] == "TRUE"
 removal_policy_destroy = os.environ["REMOVAL_POLICY_DESTROY"] == "TRUE"

 print(identifier)
@@ -25,7 +24,6 @@
     upload_bucket=upload_bucket,
     permissions_boundary_arn=permissions_boundary_arn,
     enable_downloading=enable_downloading,
-    use_inthub2=use_inthub2,
     schedule_link_fetching=schedule_link_fetching,
     removal_policy_destroy=removal_policy_destroy,
 )
diff --git a/cdk/downloader_stack.py b/cdk/downloader_stack.py
index 1fe450d9..76425c11 100644
--- a/cdk/downloader_stack.py
+++ b/cdk/downloader_stack.py
@@ -12,6 +12,7 @@
     aws_events_targets,
     aws_iam,
     aws_lambda,
+    aws_s3,
 )
 from aws_cdk import aws_lambda_python_alpha as aws_lambda_python
 from aws_cdk import aws_logs, aws_rds, aws_secretsmanager, aws_sqs, aws_ssm
@@ -33,7 +34,6 @@ def __init__(
         zipper_url: Optional[str] = None,
         checksum_url: Optional[str] = None,
         enable_downloading: bool = False,
-        use_inthub2: bool = False,
         schedule_link_fetching: bool = False,
         removal_policy_destroy: bool = True,
         **kwargs,
@@ -290,7 +290,6 @@
                 "STAGE": identifier,
                 "DB_CONNECTION_SECRET_ARN": downloader_rds_secret.secret_arn,
                 "UPLOAD_BUCKET": upload_bucket,
-                "USE_INTHUB2": "YES" if use_inthub2 else "NO",
                 **({"COPERNICUS_ZIPPER_URL": zipper_url} if zipper_url else {}),
                 **({"COPERNICUS_CHECKSUM_URL": checksum_url} if checksum_url else {}),
             }
@@ -344,6 +343,13 @@ def __init__(
         self.downloader.role.add_managed_policy(lambda_insights_policy)

+        downloader_bucket = aws_s3.Bucket.from_bucket_name(
+            self,
+            "UploadBucket",
+            bucket_name=upload_bucket,
+        )
+        downloader_bucket.grant_write(self.downloader)
+
         downloader_rds_secret.grant_read(link_fetcher)
         downloader_rds_secret.grant_read(self.downloader)

@@ -364,16 +370,6 @@ def __init__(

         token_parameter.grant_read(self.downloader)

-        if use_inthub2:
-            inthub2_credentials = aws_secretsmanager.Secret.from_secret_name_v2(
-                self,
-                id=f"{identifier}-inthub2-credentials",
-                secret_name=(
-                    f"hls-s2-downloader-serverless/{identifier}/inthub2-credentials"
-                ),
-            )
-            inthub2_credentials.grant_read(self.downloader)
-
         to_download_queue.grant_send_messages(link_fetcher)
         to_download_queue.grant_consume_messages(self.downloader)
diff --git a/example.env b/example.env
index eec4d440..b938cf67 100644
--- a/example.env
+++ b/example.env
@@ -5,6 +5,5 @@ AWS_DEFAULT_PROFILE="my-aws-account"
 PIPENV_NO_INHERIT=TRUE
 ENABLE_DOWNLOADING="FALSE"
 SCHEDULE_LINK_FETCHING="FALSE"
-USE_INTHUB2="FALSE"
 REMOVAL_POLICY_DESTROY="TRUE"
 UPLOAD_BUCKET=""
diff --git a/lambdas/downloader/README.md b/lambdas/downloader/README.md
index e90e6c63..5e4ab001 100644
--- a/lambdas/downloader/README.md
+++ b/lambdas/downloader/README.md
@@ -4,9 +4,9 @@

 ![Downloader in S2 Downloader diagram](../../images/hls-s2-downloader-downloader.png)

-The Downloaders purpose is download Sentinel 2 Images from Sci/IntHub. It is invoked via SQS messages being available within the `TO_DOWNLOAD` SQS queue; this handler will be limited to a concurrency limit of 15, due to the nature of the dedicated connection we have to IntHub. Images are downloaded and uploaded to S3 in one step, they are stored under a key in the format `/` where the date is the `beginposition` of the image.
+The Downloader's purpose is to download Sentinel-2 images from the Copernicus Data Space Ecosystem. It is invoked via SQS messages becoming available within the `TO_DOWNLOAD` SQS queue; this handler is limited to a concurrency of 15 due to the nature of the dedicated connection we have to the Copernicus Data Space Ecosystem. Images are downloaded and uploaded to S3 in one step; they are stored under a key in the format `/` where the date is the `beginposition` of the image.

-S3 performs a MD5 checksum comparison on upload, this way we ensure that we only store images that match the MD5 that Sci/IntHub provided us for the image.
+S3 performs an MD5 checksum comparison on upload; this way we ensure that we only store images that match the MD5 that the Copernicus Data Space Ecosystem provided us for the image.

 Interactions with the `granule` table include marking the download as having started, updating the checksum of the image, and marking that the download is complete.
@@ -27,7 +27,7 @@ except AlreadyDownloaded:
     return # End gracefully - We received a duplicate from SQS, this is OK

 try:
-    checksum = get_checksum_from_scihub()
+    checksum = get_checksum_from_esa_copernicus()
     download_file()
 except Exception as ex:
     increase_retry_count()
@@ -40,10 +40,6 @@ update_status()

 Due to the nature of how Lambda is invoked by SQS, a non-failed invocation of a Lambda will result in the SQS message being deleted. Because of this, if we need to gracefully handle an error, we tidy up (namely database rollbacks), then raise the error to the handler, this then results in the Lambda failing and the SQS message being re-added to the Queue.

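+In keeping with the pseudo-code above, a minimal, hypothetical sketch of this tidy-up-then-re-raise pattern (assuming a SQLAlchemy-style session; the helper names are illustrative, not the handler's actual API):
+
+```python
+def handle_message(session, granule_id):
+    try:
+        download_and_upload_image(granule_id)  # placeholder for the real work
+    except Exception:
+        session.rollback()  # tidy up any in-flight database changes
+        raise  # fail the invocation so SQS re-queues the message
+```
+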
-We use the flag `USE_INTHUB2` with possible values of `YES` and `NO` to determine whether we:
-* A - Replace `scihub` in the fetched links download urls with `inthub2`
-* B - Retrieve `inthub2` credentials when downloading files
-
 ---

 ## Development
diff --git a/lambdas/link_fetcher/README.md b/lambdas/link_fetcher/README.md
index 6e6c6969..5ec4a546 100644
--- a/lambdas/link_fetcher/README.md
+++ b/lambdas/link_fetcher/README.md
@@ -4,7 +4,7 @@

 ![Link fetcher in S2 Downloader diagram](../../images/hls-s2-downloader-link-fetcher.png)

-The Link Fetchers purpose is to query IntHub/SciHub for new imagery links to download. It is invoked within the `Link Fetching` Step Function; every invocation is performed on one day in the form `YYYY-MM-DD`. Images to download are stored as records in the `granule` table, the `granule_count` table is also updated with available and fetched link counts. The `To Download` queue is also populated with the images IDs and download URLs.
+The Link Fetcher's purpose is to query the Copernicus Data Space Ecosystem for new imagery links to download. It is invoked within the `Link Fetching` Step Function; every invocation is performed on one day in the form `YYYY-MM-DD`. Images to download are stored as records in the `granule` table, and the `granule_count` table is also updated with available and fetched link counts. The `To Download` queue is also populated with the images' IDs and download URLs.

 ---

@@ -14,7 +14,7 @@ Provided below is some pseudo-code to explain the process happening each time th

 ```python
 available, processed = get_how_many_links_are_available_and_have_been_processed()

-query = generate_the_query_for_inthub_for_a_day(day)
+query = generate_the_query_for_esa_for_a_day(day)

 while there_is_still_imagery_to_process:
diff --git a/lambdas/link_fetcher/handler.py b/lambdas/link_fetcher/handler.py
index 09b2f87d..5ea8b74b 100644
--- a/lambdas/link_fetcher/handler.py
+++ b/lambdas/link_fetcher/handler.py
@@ -284,7 +284,8 @@ def filter_search_results(
 def get_query_parameters(start: int, day: date) -> Mapping[str, Any]:
     """
     Returns the query parameters that are needed for getting new imagery from
-    search/IntHub
+    search (Copernicus Data Space Ecosystem)
+
     :param start: An int representing the offset to get results from a query
     :param day: A date object representing the date to query for imagery
     :returns: mapping of query parameters
diff --git a/tox.ini b/tox.ini
index 37617efe..22ce7449 100644
--- a/tox.ini
+++ b/tox.ini
@@ -34,7 +34,6 @@ passenv =
     PIPENV_NO_INHERIT
     ENABLE_DOWNLOADING
     SCHEDULE_LINK_FETCHING
-    USE_INTHUB2
     REMOVAL_POLICY_DESTROY
     UPLOAD_BUCKET
 allowlist_externals = make
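
---

For context on the headline fix: `grant_write` on the imported bucket is what lets the downloader perform its single-step download-and-upload. Below is a minimal, hypothetical boto3 sketch of such an upload using the MD5 comparison described in `lambdas/downloader/README.md`; the bucket name, key, and payload are placeholders, not values from this repository.

```python
import base64
import hashlib

import boto3


def upload_image(bucket: str, key: str, body: bytes, md5_hex: str) -> None:
    """Upload an image, letting S3 verify the payload against the given MD5."""
    # ContentMD5 is the base64-encoded 128-bit digest; S3 fails the request
    # with a BadDigest error if the uploaded bytes do not match it.
    content_md5 = base64.b64encode(bytes.fromhex(md5_hex)).decode()
    boto3.client("s3").put_object(
        Bucket=bucket,
        Key=key,
        Body=body,
        ContentMD5=content_md5,
    )


body = b"example image bytes"
# In the real handler the expected MD5 comes from ESA; it is recomputed here
# only so that this standalone example succeeds.
upload_image("upload-bucket", "2024-11-07/example-granule.zip", body, hashlib.md5(body).hexdigest())
```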
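
The link fetcher's paged querying (`lambdas/link_fetcher/README.md` above) can be pictured with the sketch below. The endpoint URL, parameter names, and response shape are placeholders rather than the actual Copernicus Data Space Ecosystem API contract, and `get_query_parameters` here only mimics the signature of the real implementation in `lambdas/link_fetcher/handler.py`.

```python
from datetime import date
from typing import Any, Mapping

import requests

SEARCH_URL = "https://example.invalid/search"  # placeholder, not the real CDSE endpoint


def get_query_parameters(start: int, day: date) -> Mapping[str, Any]:
    # Placeholder parameters; the real mapping is built in handler.py.
    return {"offset": start, "date": day.isoformat(), "limit": 100}


def fetch_links_for_day(day: date) -> list[str]:
    """Page through search results until a day's imagery is exhausted."""
    links: list[str] = []
    start = 0
    while True:  # "while there_is_still_imagery_to_process"
        page = requests.get(SEARCH_URL, params=get_query_parameters(start, day)).json()
        results = page.get("results", [])
        if not results:
            break
        links.extend(item["download_url"] for item in results)
        start += len(results)
    return links
```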