
Commit

fix: add missing bucket write permissions to downloader Lambda and cleanup IntHub references (#43)

* Add write permission to upload bucket for downloader

* Remove mentions of IntHub/SciHub now we're on CDSE
ceholden authored Nov 7, 2024
1 parent 4553661 commit e202d60
Showing 9 changed files with 15 additions and 28 deletions.
1 change: 0 additions & 1 deletion .github/workflows/deploy-on-release.yml
@@ -193,7 +193,6 @@ jobs:
 IDENTIFIER="esa-prod"
 ENABLE_DOWNLOADING="TRUE"
 SCHEDULE_LINK_FETCHING="TRUE"
-USE_INTHUB2="TRUE"
 REMOVAL_POLICY_DESTROY="FALSE"
 UPLOAD_BUCKET="${{ secrets.UPLOAD_BUCKET }}"
 PERMISSIONS_BOUNDARY_ARN="${{ vars.PERMISSIONS_BOUNDARY_ARN }}"
1 change: 0 additions & 1 deletion README.md
@@ -52,7 +52,6 @@ AWS_DEFAULT_PROFILE="<your named AWS CLI profile to use for deployment>"
 PIPENV_NO_INHERIT=TRUE # This is used to ensure our Lambdas/Layers get separate Pipenv environments
 ENABLE_DOWNLOADING="TRUE" # Or "FALSE" - If TRUE then the TO_UPLOAD queue is set as an enabled source to the Downloader
 SCHEDULE_LINK_FETCHING="TRUE" # Or "FALSE" - If TRUE then link fetching will happen every day at midday.
-USE_INTHUB2="TRUE" # Or "FALSE" - If TRUE then the Downloader will use IntHub2 credentials when downloading
 REMOVAL_POLICY_DESTROY="TRUE" # Or "FALSE" - See below for what is deleted if TRUE
 UPLOAD_BUCKET="<name-of-aws-s3-bucket-to-upload-images-to>"
 ```
2 changes: 0 additions & 2 deletions cdk/app.py
@@ -13,7 +13,6 @@
 upload_bucket = os.environ["UPLOAD_BUCKET"]
 enable_downloading = os.environ["ENABLE_DOWNLOADING"] == "TRUE"
 schedule_link_fetching = os.environ["SCHEDULE_LINK_FETCHING"] == "TRUE"
-use_inthub2 = os.environ["USE_INTHUB2"] == "TRUE"
 removal_policy_destroy = os.environ["REMOVAL_POLICY_DESTROY"] == "TRUE"
 print(identifier)

@@ -25,7 +24,6 @@
     upload_bucket=upload_bucket,
     permissions_boundary_arn=permissions_boundary_arn,
     enable_downloading=enable_downloading,
-    use_inthub2=use_inthub2,
     schedule_link_fetching=schedule_link_fetching,
     removal_policy_destroy=removal_policy_destroy,
 )
20 changes: 8 additions & 12 deletions cdk/downloader_stack.py
@@ -12,6 +12,7 @@
     aws_events_targets,
     aws_iam,
     aws_lambda,
+    aws_s3,
 )
 from aws_cdk import aws_lambda_python_alpha as aws_lambda_python
 from aws_cdk import aws_logs, aws_rds, aws_secretsmanager, aws_sqs, aws_ssm
@@ -33,7 +34,6 @@ def __init__(
         zipper_url: Optional[str] = None,
         checksum_url: Optional[str] = None,
         enable_downloading: bool = False,
-        use_inthub2: bool = False,
         schedule_link_fetching: bool = False,
         removal_policy_destroy: bool = True,
         **kwargs,
@@ -290,7 +290,6 @@ def __init__(
 "STAGE": identifier,
 "DB_CONNECTION_SECRET_ARN": downloader_rds_secret.secret_arn,
 "UPLOAD_BUCKET": upload_bucket,
-"USE_INTHUB2": "YES" if use_inthub2 else "NO",
 **({"COPERNICUS_ZIPPER_URL": zipper_url} if zipper_url else {}),
 **({"COPERNICUS_CHECKSUM_URL": checksum_url} if checksum_url else {}),
 }
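For context, the downloader handler presumably reads this configuration from its environment at runtime. The sketch below is illustrative only; the helper name and exact variable handling are assumptions, not the project's actual code.

```python
# Illustrative sketch of reading the downloader's environment configuration;
# load_downloader_config is a hypothetical helper, not part of the repository.
import os


def load_downloader_config() -> dict:
    return {
        "stage": os.environ["STAGE"],
        "db_connection_secret_arn": os.environ["DB_CONNECTION_SECRET_ARN"],
        "upload_bucket": os.environ["UPLOAD_BUCKET"],
        # Optional overrides: only present when the stack passes them through.
        "zipper_url": os.environ.get("COPERNICUS_ZIPPER_URL"),
        "checksum_url": os.environ.get("COPERNICUS_CHECKSUM_URL"),
    }
```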
@@ -344,6 +343,13 @@ def __init__(

 self.downloader.role.add_managed_policy(lambda_insights_policy)

+downloader_bucket = aws_s3.Bucket.from_bucket_name(
+    self,
+    "UploadBucket",
+    bucket_name=upload_bucket,
+)
+downloader_bucket.grant_write(self.downloader)
+
 downloader_rds_secret.grant_read(link_fetcher)
 downloader_rds_secret.grant_read(self.downloader)

@@ -364,16 +370,6 @@

 token_parameter.grant_read(self.downloader)

-if use_inthub2:
-    inthub2_credentials = aws_secretsmanager.Secret.from_secret_name_v2(
-        self,
-        id=f"{identifier}-inthub2-credentials",
-        secret_name=(
-            f"hls-s2-downloader-serverless/{identifier}/inthub2-credentials"
-        ),
-    )
-    inthub2_credentials.grant_read(self.downloader)
-
 to_download_queue.grant_send_messages(link_fetcher)
 to_download_queue.grant_consume_messages(self.downloader)

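The new `grant_write` call above is what supplies the previously missing bucket write permissions: CDK attaches the bucket's write actions (`s3:PutObject` and related actions) to the downloader Lambda's execution role. Below is a minimal, self-contained sketch of that behaviour using CDK assertions; the stack, function, and bucket names are illustrative and not part of this repository.

```python
# Standalone sketch (not the project's test suite): show that grant_write on an
# imported bucket adds S3 write actions to a function's role policy.
import aws_cdk as cdk
from aws_cdk import assertions, aws_lambda, aws_s3

app = cdk.App()
stack = cdk.Stack(app, "GrantWriteSketch")

fn = aws_lambda.Function(
    stack,
    "Downloader",
    runtime=aws_lambda.Runtime.PYTHON_3_11,
    handler="index.handler",
    code=aws_lambda.Code.from_inline("def handler(event, context): ..."),
)
bucket = aws_s3.Bucket.from_bucket_name(stack, "UploadBucket", "my-upload-bucket")
bucket.grant_write(fn)

template = assertions.Template.from_stack(stack)
# The synthesized IAM policy should now include s3:PutObject among the
# granted write actions.
template.has_resource_properties(
    "AWS::IAM::Policy",
    {
        "PolicyDocument": {
            "Statement": assertions.Match.array_with(
                [
                    assertions.Match.object_like(
                        {"Action": assertions.Match.array_with(["s3:PutObject"])}
                    )
                ]
            )
        }
    },
)
```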
1 change: 0 additions & 1 deletion example.env
@@ -5,6 +5,5 @@ AWS_DEFAULT_PROFILE="my-aws-account"
 PIPENV_NO_INHERIT=TRUE
 ENABLE_DOWNLOADING="FALSE"
 SCHEDULE_LINK_FETCHING="FALSE"
-USE_INTHUB2="FALSE"
 REMOVAL_POLICY_DESTROY="TRUE"
 UPLOAD_BUCKET="<name-of-aws-s3-bucket-to-upload-images-to>"
10 changes: 3 additions & 7 deletions lambdas/downloader/README.md
@@ -4,9 +4,9 @@

 ![Downloader in S2 Downloader diagram](../../images/hls-s2-downloader-downloader.png)

-The Downloaders purpose is download Sentinel 2 Images from Sci/IntHub. It is invoked via SQS messages being available within the `TO_DOWNLOAD` SQS queue; this handler will be limited to a concurrency limit of 15, due to the nature of the dedicated connection we have to IntHub. Images are downloaded and uploaded to S3 in one step, they are stored under a key in the format `<YYYY-MM-DD>/<image_filename>` where the date is the `beginposition` of the image.
+The Downloader's purpose is to download Sentinel-2 images from the Copernicus Data Space Ecosystem. It is invoked via SQS messages arriving on the `TO_DOWNLOAD` SQS queue; the handler is limited to a concurrency of 15 due to the nature of the connection we have to the Copernicus Data Space Ecosystem. Images are downloaded and uploaded to S3 in one step and are stored under a key in the format `<YYYY-MM-DD>/<image_filename>`, where the date is the `beginposition` of the image.

-S3 performs a MD5 checksum comparison on upload, this way we ensure that we only store images that match the MD5 that Sci/IntHub provided us for the image.
+S3 performs an MD5 checksum comparison on upload; this way we ensure that we only store images that match the MD5 that the Copernicus Data Space Ecosystem provided us for the image.

 Interactions with the `granule` table include marking the download as having started, updating the checksum of the image, and marking that the download is complete.

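The two README paragraphs above describe the object key layout and the checksum guarantee. A minimal boto3 sketch of that flow follows; `upload_image` and its arguments are illustrative, not the Lambda's actual interface.

```python
# Illustrative sketch (not the downloader's actual code) of the key layout and
# the Content-MD5 check described above.
import base64
import hashlib

import boto3


def upload_image(bucket: str, filename: str, beginposition: str, body: bytes) -> str:
    # Key format: <YYYY-MM-DD>/<image_filename>, date taken from `beginposition`
    # (assumed here to be an ISO-8601 timestamp).
    key = f"{beginposition[:10]}/{filename}"

    # S3 rejects the upload if the object's MD5 does not match Content-MD5, so
    # only images matching the checksum ESA provided end up in the bucket.
    content_md5 = base64.b64encode(hashlib.md5(body).digest()).decode()
    boto3.client("s3").put_object(
        Bucket=bucket, Key=key, Body=body, ContentMD5=content_md5
    )
    return key
```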
@@ -27,7 +27,7 @@ except AlreadyDownloaded:
     return # End gracefully - We received a duplicate from SQS, this is OK

 try:
-    checksum = get_checksum_from_scihub()
+    checksum = get_checksum_from_esa_copernicus()
     download_file()
 except Exception as ex:
     increase_retry_count()
@@ -40,10 +40,6 @@ update_status()

 Due to the nature of how Lambda is invoked by SQS, a non-failed invocation of a Lambda will result in the SQS message being deleted. Because of this, if we need to gracefully handle an error, we tidy up (namely database rollbacks), then raise the error to the handler; this then results in the Lambda failing and the SQS message being re-added to the Queue.

-We use the flag `USE_INTHUB2` with possible values of `YES` and `NO` to determine whether we:
-* A - Replace `scihub` in the fetched links download urls with `inthub2`
-* B - Retrieve `inthub2` credentials when downloading files
-
 ---

 ## Development
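The paragraph above (unchanged by this commit) describes the tidy-up-then-re-raise pattern that hands failed messages back to SQS. A minimal sketch of that pattern, with hypothetical session and helper names:

```python
# Minimal sketch of the rollback-then-raise pattern; `session` and
# `download_and_upload` are hypothetical stand-ins for the handler's real code.
def download_and_upload(session, message) -> None:
    """Hypothetical stand-in for the real download/upload step."""
    ...


def handle_message(session, message) -> None:
    try:
        download_and_upload(session, message)
        session.commit()
    except Exception:
        # Tidy up partial database changes, then re-raise so the invocation
        # fails and SQS re-queues the message for another attempt.
        session.rollback()
        raise
```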
4 changes: 2 additions & 2 deletions lambdas/link_fetcher/README.md
@@ -4,7 +4,7 @@

 ![Link fetcher in S2 Downloader diagram](../../images/hls-s2-downloader-link-fetcher.png)

-The Link Fetchers purpose is to query IntHub/SciHub for new imagery links to download. It is invoked within the `Link Fetching` Step Function; every invocation is performed on one day in the form `YYYY-MM-DD`. Images to download are stored as records in the `granule` table, the `granule_count` table is also updated with available and fetched link counts. The `To Download` queue is also populated with the images IDs and download URLs.
+The Link Fetcher's purpose is to query the Copernicus Data Space Ecosystem for new imagery links to download. It is invoked within the `Link Fetching` Step Function; every invocation is performed on one day in the form `YYYY-MM-DD`. Images to download are stored as records in the `granule` table, and the `granule_count` table is also updated with available and fetched link counts. The `To Download` queue is also populated with the images' IDs and download URLs.

 ---

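As a rough illustration of the last step described above (populating the `To Download` queue), a hedged boto3 sketch follows; the message fields and function name are assumptions, not the Link Fetcher's actual schema.

```python
# Illustrative sketch of queueing one granule for the downloader; the message
# fields ("id", "download_url") are assumed, not the project's actual schema.
import json

import boto3


def queue_granule_for_download(queue_url: str, image_id: str, download_url: str) -> None:
    boto3.client("sqs").send_message(
        QueueUrl=queue_url,
        MessageBody=json.dumps({"id": image_id, "download_url": download_url}),
    )
```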
@@ -14,7 +14,7 @@ Provided below is some pseudo-code to explain the process happening each time th

 ```python
 available, processed = get_how_many_links_are_available_and_have_been_processed()
-query = generate_the_query_for_inthub_for_a_day(day)
+query = generate_the_query_for_esa_for_a_day(day)

 while there_is_still_imagery_to_process:

3 changes: 2 additions & 1 deletion lambdas/link_fetcher/handler.py
@@ -284,7 +284,8 @@ def filter_search_results(
 def get_query_parameters(start: int, day: date) -> Mapping[str, Any]:
     """
     Returns the query parameters that are needed for getting new imagery from
-    search/IntHub
+    search (Copernicus Data Space Ecosystem)
+
     :param start: An int representing the offset to get results from a query
     :param day: A date object representing the date to query for imagery
     :returns: mapping of query parameters
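For orientation, the sketch below shows the general shape such a query-parameter mapping might take for a single day's paged search; the field names (`startDate`, `completionDate`, `index`, `maxRecords`) and page size are assumptions for illustration, not the values defined in handler.py.

```python
# Illustrative only: field names and page size are assumptions, not the real
# parameters defined in handler.py.
from datetime import date, timedelta
from typing import Any, Mapping


def get_query_parameters_sketch(start: int, day: date) -> Mapping[str, Any]:
    next_day = day + timedelta(days=1)
    return {
        "startDate": day.isoformat(),  # beginning of the queried day
        "completionDate": next_day.isoformat(),  # exclusive end of the day
        "index": start,  # offset into the result set
        "maxRecords": 100,  # assumed page size
    }
```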
1 change: 0 additions & 1 deletion tox.ini
@@ -34,7 +34,6 @@ passenv =
     PIPENV_NO_INHERIT
     ENABLE_DOWNLOADING
     SCHEDULE_LINK_FETCHING
-    USE_INTHUB2
     REMOVAL_POLICY_DESTROY
     UPLOAD_BUCKET
 allowlist_externals = make
