Merge branch 'main' into logging

harvard-lts · Oct 17, 2024 · 682eac8 · 682eac8
2 parents f932cce + 960dfe7
commit 682eac8
Show file tree

Hide file tree

Showing 14 changed files with 224 additions and 3,962 deletions.
diff --git a/.dockerignore b/.dockerignore
@@ -0,0 +1,2 @@
+input/*
+output/*
diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
@@ -0,0 +1,69 @@
+name: Tests and Style
+
+on: [push, pull_request]
+
+jobs:
+  build:
+    strategy:
+      matrix:
+        python-version: [3.11.4]
+    runs-on: ubuntu-latest
+
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
+
+      - name: Switch to Current Branch
+        run: git checkout ${{ env.BRANCH }}
+
+      - name: Set up Python ${{ matrix.python-version }}
+        uses: actions/setup-python@v5
+        with:
+          python-version: ${{ matrix.python-version }}
+
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip
+          pip install -r requirements.txt
+          pip install -e .
+      
+      - name: Run flake8
+        run: | 
+          pip install flake8
+          # stop the build if there are flake8 errors
+          flake8 . --count --show-source --statistics
+
+      - name: Run unit tests
+        run: |
+          pip install pytest
+          python -m pytest src/jp2_remediator/tests/unit
+
+      - name: Run coverage
+        run: |
+          pip install coverage
+          python -m coverage run -p -m pytest src/jp2_remediator/tests/unit
+          python -m coverage combine
+          python -m coverage report -m --skip-covered
+          python -m coverage xml
+      
+      # Fetch base branch for comparison (e.g., main)
+      - name: Fetch base branch
+        run: git fetch origin main
+
+      # Compare coverage with the base branch. No -p on the run: "coverage xml" reads .coverage, not parallel files.
+      - name: Compare coverage
+        run: |
+         pip install diff-cover
+         git checkout main
+         python -m coverage run -m pytest src/jp2_remediator/tests/unit
+         python -m coverage xml -o coverage-base.xml
+         git checkout -
+         diff-cover coverage.xml --compare-branch=main
+         
+      # Fail unless the lines changed relative to main are 100% covered (diff-cover is a console script, not a .py file)
+      - name: Fail if coverage decreases
+        run: |
+          diff-cover coverage.xml --compare-branch=main --fail-under=100
+
diff --git a/.gitignore b/.gitignore
@@ -1,7 +1,14 @@
 *~
 *.swp
+
 logs/
 
+input/*
+output/*
+.coverage
+coverage.*
+
+
 dist/*
 */*.egg-info/*
 __pycache__

diff --git a/README.md b/README.md
@@ -29,7 +29,7 @@ pip install jp2_remediator==0.0.2
 ## Process all .jp2 files in the bucket:
 `python3 box_reader.py --bucket remediation-folder`
 
-##Process only files with a specific prefix (folder):
+## Process only files with a specific prefix (folder):
 `python3 box_reader.py --bucket remediation-folder --prefix testbatch_20240923`
 
 `python3 box_reader.py --help`

diff --git a/pyproject.toml b/pyproject.toml
@@ -21,3 +21,17 @@ dependencies = []
 [project.urls]
 Homepage = "https://github.com/kimpham54/jp2_remediator"
 Issues = "https://github.com/kimpham54/jp2_remediator/issues"
+
+[tool.pytest.ini_options]
+pythonpath = [
+  ".", "src"
+]
+
+[tool.coverage.run]
+omit = [
+  "**/tests/*"
+]
+
+[tool.project-paths]
+dir_unit_out = "src/jp2_remediator/tests/out/"
+dir_unit_resources = "src/jp2_remediator/tests/resources/"
diff --git a/requirements.txt b/requirements.txt
@@ -0,0 +1,10 @@
+boto3==1.35.39
+botocore==1.35.39
+jmespath==1.0.1
+jpylyzer==2.2.1
+project-paths==1.1.1
+python-dateutil==2.9.0.post0
+s3transfer==0.10.3
+six==1.16.0
+toml==0.10.2
+urllib3==2.2.3
diff --git a/src/jp2_remediator/box_reader.py b/src/jp2_remediator/box_reader.py
@@ -1,12 +1,15 @@
-import sys
+# import sys
 import os
 import argparse
 import boto3
 import datetime
 from jp2_remediator import configure_logger
 from jpylyzer import jpylyzer
+# from jpylyzer import jpylyzer
 from jpylyzer import boxvalidator
-from jpylyzer import byteconv
+
+# from jpylyzer import byteconv
+
 
 class BoxReader:
     def __init__(self, file_path):
@@ -19,16 +22,22 @@ def __init__(self, file_path):
     def read_file(self, file_path):
         """Reads the file content from the given path."""
         try:
-            with open(file_path, 'rb') as file:
+            with open(file_path, "rb") as file:
                 return file.read()
         except IOError as e:
             self.logger.error(f"Error reading file {file_path}: {e}")
             return None
 
     def initialize_validator(self):
         """Initializes the jpylyzer BoxValidator for JP2 file validation."""
-        options = {'validationFormat': 'jp2', 'verboseFlag': True, 'nullxmlFlag': False, 'packetmarkersFlag': False}
-        self.validator = boxvalidator.BoxValidator(options, 'JP2', self.file_contents)
+        options = {
+            "validationFormat": "jp2",
+            "verboseFlag": True,
+            "nullxmlFlag": False,
+            "packetmarkersFlag": False,
+        }
+        self.validator = boxvalidator.BoxValidator(
+            options, "JP2", self.file_contents)
         self.validator.validate()
         return self.validator
 
@@ -37,14 +46,18 @@ def find_box_position(self, box_hex):
         return self.file_contents.find(box_hex)
 
     def check_boxes(self):
-        """Checks for presence of 'jp2h' and 'colr' boxes in the file contents."""
-        jp2h_position = self.find_box_position(b'\x6a\x70\x32\x68')  # search hex for 'jp2h'
+        """Checks for presence of 'jp2h' and 'colr' boxes in file contents."""
+        jp2h_position = self.find_box_position(
+            b"\x6a\x70\x32\x68"
+        )  # search hex for 'jp2h'
         if jp2h_position != -1:
             self.logger.debug(f"'jp2h' found at byte position: {jp2h_position}")
         else:
             self.logger.debug("'jp2h' not found in the file.")
 
-        colr_position = self.find_box_position(b'\x63\x6f\x6c\x72')  # search hex for 'colr'
+        colr_position = self.find_box_position(
+            b"\x63\x6f\x6c\x72"
+        )  # search hex for 'colr'
         if colr_position != -1:
             self.logger.debug(f"'colr' found at byte position: {colr_position}")
         else:
@@ -77,7 +90,11 @@ def process_colr_box(self, colr_position):
 
         return header_offset_position
 
-    def process_trc_tag(self, trc_hex, trc_name, new_contents, header_offset_position):
+    def process_trc_tag(self,
+                        trc_hex,
+                        trc_name,
+                        new_contents,
+                        header_offset_position):
         """Processes the TRC tag and modifies contents if necessary."""
         trc_position = new_contents.find(trc_hex)
         if trc_position == -1:
@@ -102,25 +119,35 @@ def process_trc_tag(self, trc_hex, trc_name, new_contents, header_offset_positio
             self.logger.debug(f"Cannot calculate 'curv_{trc_name}_position' due to an unrecognized 'meth' value.")
             return new_contents
 
-        curv_trc_position = trc_tag_offset + header_offset_position  # start of curv profile data
-        curv_profile = new_contents[curv_trc_position:curv_trc_position + 12]  # 12-byte curv profile data length
+        curv_trc_position = (
+            trc_tag_offset + header_offset_position
+        )  # start of curv profile data
+        curv_profile = new_contents[
+            curv_trc_position: curv_trc_position + 12
+        ]  # 12-byte curv profile data length
 
         if len(curv_profile) < 12:
             self.logger.debug(f"Could not read the full 'curv' profile data for {trc_name}.")
             return new_contents
 
-        curv_signature = curv_profile[0:4].decode('utf-8')  # ICC.1:2022 Table 35 tag signature
-        curv_reserved = int.from_bytes(curv_profile[4:8], byteorder='big') # ICC.1:2022 Table 35 reserved 0's
-        curv_trc_gamma_n = int.from_bytes(curv_profile[8:12], byteorder='big') # # ICC.1:2022 Table 35 n value
+        curv_signature = curv_profile[0:4].decode(
+            "utf-8"
+        )  # ICC.1:2022 Table 35 tag signature
+        curv_reserved = int.from_bytes(
+            curv_profile[4:8], byteorder="big"
+        )  # ICC.1:2022 Table 35 reserved 0's
+        curv_trc_gamma_n = int.from_bytes(
+            curv_profile[8:12], byteorder="big"
+        )  # ICC.1:2022 Table 35 n value
 
         self.logger.debug(f"'curv' Profile Signature for {trc_name}: {curv_signature}")
         self.logger.debug(f"'curv' Reserved Value: {curv_reserved}")
         self.logger.debug(f"'curv_{trc_name}_gamma_n' Value: {curv_trc_gamma_n}")
-
         curv_trc_field_length = curv_trc_gamma_n * 2 + 12  # ICC.1:2022 Table 35 2n field length
         self.logger.debug(f"'curv_{trc_name}_field_length': {curv_trc_field_length}")
 
-        # Check if curv_trc_gamma_n is not 1 and ask for confirmation to proceed, loops through all TRC tags
+        """Check if curv_trc_gamma_n is not 1 and ask
+        for confirmation to proceed, loops through all TRC tags"""
         if curv_trc_gamma_n != 1:
             self.logger.warning(f"Warning: 'curv_{trc_name}_gamma_n' value is {curv_trc_gamma_n}, expected 1.")
             proceed = input(f"Do you want to proceed with fixing the file {self.file_path}? (y/n): ").lower()
@@ -132,29 +159,33 @@ def process_trc_tag(self, trc_hex, trc_name, new_contents, header_offset_positio
             self.logger.warning(f"'{trc_name}' Tag Size ({trc_tag_size}) does not match 'curv_{trc_name}_field_length' ({curv_trc_field_length}). Modifying the size...")
             new_trc_size_bytes = curv_trc_field_length.to_bytes(4, byteorder='big')
             new_contents[trc_position + 8: trc_position + 12] = new_trc_size_bytes
-
         return new_contents
 
     def process_all_trc_tags(self, header_offset_position):
         """Function to process 'TRC' tags (rTRC, gTRC, bTRC)."""
         new_file_contents = bytearray(self.file_contents)
         trc_tags = {
-            b'\x72\x54\x52\x43': 'rTRC', # search hex for 'rTRC'
-            b'\x67\x54\x52\x43': 'gTRC', # search hex for 'gTRC'
-            b'\x62\x54\x52\x43': 'bTRC' # search hex for 'bTRC'
+            b"\x72\x54\x52\x43": "rTRC",  # search hex for 'rTRC'
+            b"\x67\x54\x52\x43": "gTRC",  # search hex for 'gTRC'
+            b"\x62\x54\x52\x43": "bTRC",  # search hex for 'bTRC'
         }
 
         for trc_hex, trc_name in trc_tags.items():
-            new_file_contents = self.process_trc_tag(trc_hex, trc_name, new_file_contents, header_offset_position)
+            new_file_contents = self.process_trc_tag(
+                trc_hex, trc_name, new_file_contents, header_offset_position
+            )
 
         return new_file_contents
 
     def write_modified_file(self, new_file_contents):
-        """Writes the modified file contents to a new file if changes were made."""
+        """Writes modified file contents to new file if changes were made."""
         if new_file_contents != self.file_contents:
-            timestamp = datetime.datetime.now().strftime("%Y%m%d") # use "%Y%m%d_%H%M%S" for more precision
-            new_file_path = self.file_path.replace(".jp2", f"_modified_{timestamp}.jp2")
-            with open(new_file_path, 'wb') as new_file:
+            timestamp = datetime.datetime.now().strftime(
+                "%Y%m%d"
+            )  # use "%Y%m%d_%H%M%S" for more precision
+            new_file_path = self.file_path.replace(
+                ".jp2", f"_modified_{timestamp}.jp2")
+            with open(new_file_path, "wb") as new_file:
                 new_file.write(new_file_contents)
             self.logger.info(f"New JP2 file created with modifications: {new_file_path}")
         else:
@@ -174,40 +205,59 @@ def read_jp2_file(self):
 
         self.write_modified_file(new_file_contents)
 
+
 def process_directory(directory_path):
     """Process all JP2 files in a given directory."""
     for root, _, files in os.walk(directory_path):
         for file in files:
-            if file.lower().endswith('.jp2'):
+            if file.lower().endswith(".jp2"):
                 file_path = os.path.join(root, file)
                 print(f"Processing file: {file_path}")
                 reader = BoxReader(file_path)
                 reader.read_jp2_file()
 
-def process_s3_bucket(bucket_name, prefix=''):
+
+def process_s3_bucket(bucket_name, prefix=""):
     """Process all JP2 files in a given S3 bucket."""
-    s3 = boto3.client('s3')
+    s3 = boto3.client("s3")
     response = s3.list_objects_v2(Bucket=bucket_name, Prefix=prefix)
 
-    if 'Contents' in response:
-        for obj in response['Contents']:
-            if obj['Key'].lower().endswith('.jp2'):
-                file_path = obj['Key']
-                print(f"Processing file: {file_path} from bucket {bucket_name}")
+    if "Contents" in response:
+        for obj in response["Contents"]:
+            if obj["Key"].lower().endswith(".jp2"):
+                file_path = obj["Key"]
+                print(f"""Processing file: {file_path} from bucket {
+                    bucket_name
+                    }""")
                 download_path = f"/tmp/{os.path.basename(file_path)}"
                 s3.download_file(bucket_name, file_path, download_path)
                 reader = BoxReader(download_path)
                 reader.read_jp2_file()
                 # Optionally, upload modified file back to S3
-                timestamp = datetime.datetime.now().strftime("%Y%m%d") # use "%Y%m%d_%H%M%S" for more precision
-                s3.upload_file(download_path.replace(".jp2", f"_modified_{timestamp}.jp2"), bucket_name, file_path.replace(".jp2", f"_modified_{timestamp}.jp2"))
+                timestamp = datetime.datetime.now().strftime(
+                    "%Y%m%d"
+                )  # use "%Y%m%d_%H%M%S" for more precision
+                s3.upload_file(
+                    download_path.replace(
+                        ".jp2", f"_modified_{timestamp}.jp2"
+                        ),
+                    bucket_name,
+                    file_path.replace(".jp2", f"_modified_{timestamp}.jp2"),
+                )
+
 
 if __name__ == "__main__":
     parser = argparse.ArgumentParser(description="JP2 file processor")
     parser.add_argument("--file", help="Path to a single JP2 file to process.")
-    parser.add_argument("--directory", help="Path to a directory of JP2 files to process.")
-    parser.add_argument("--bucket", help="Name of the AWS S3 bucket to process JP2 files from.")
-    parser.add_argument("--prefix", help="Prefix of files in the AWS S3 bucket (optional).")
+    parser.add_argument(
+        "--directory", help="Path to a directory of JP2 files to process."
+    )
+    parser.add_argument(
+        "--bucket", help="Name of the AWS S3 bucket to process JP2 files from."
+    )
+    parser.add_argument(
+        "--prefix", help="Prefix of files in the AWS S3 bucket (optional)."
+    )
 
     args = parser.parse_args()
 
@@ -219,4 +269,4 @@ def process_s3_bucket(bucket_name, prefix=''):
     elif args.bucket:
         process_s3_bucket(args.bucket, args.prefix)
     else:
-        print("Please specify either --file, --directory, or --bucket.")
+        print("Please specify either --file, --directory, or --bucket.")