Refactor 'main.py'

harvard-lts · Oct 27, 2024 · e5ce029 · e5ce029
1 parent 960dfe7
commit e5ce029
Show file tree

Hide file tree

Showing 5 changed files with 167 additions and 117 deletions.
diff --git a/.gitignore b/.gitignore
@@ -5,6 +5,8 @@ output/*
 .coverage
 coverage.*
 
+myenv/
+
 dist/*
 */*.egg-info/*
 __pycache__

diff --git a/README.md b/README.md
@@ -15,30 +15,56 @@ pip install jp2_remediator==0.0.2
 
 ## Usage
 
-## Process one file
-`python3 box_reader.py --file tests/test-images/7514499.jp2`
+```bash
+python3 src/jp2_remediator/main.py  -h
 
-`python3 box_reader.py --file tests/test-images/481014278.jp2`
+usage: main.py [-h] {file,directory,bucket} ...
 
-## Process directory
-`python3 box_reader.py --directory tests/test-images/`
+JP2 file processor
 
-## Process Amazon S3 bucket
-`python3 box_reader.py --bucket your-bucket-name --prefix optional-prefix`
+options:
+  -h, --help            show this help message and exit
 
-## Process all .jp2 files in the bucket:
-`python3 box_reader.py --bucket remediation-folder`
+Input source:
+  {file,directory,bucket}
+    file                Process a single JP2 file
+    directory           Process all JP2 files in a directory
+    bucket              Process all JP2 files in an S3 bucket
+```
 
-## Process only files with a specific prefix (folder):
-`python3 box_reader.py --bucket remediation-folder --prefix testbatch_20240923`
+### Process one file
+```bash
+python3 src/jp2_remediator/main.py file tests/test-images/7514499.jp2
 
-`python3 box_reader.py --help`
+python3 src/jp2_remediator/main.py file tests/test-images/481014278.jp2
+```
 
-## Run Tests
-`python3 test_aws_connection.py`
+### Process directory
+```bash
+python3 src/jp2_remediator/main.py directory tests/test-images/
+```
 
-### Run from src folder
-`python3 -m unittest jp2_remediator.tests.test_box_reader`
+### Process all .jp2 files in an S3 bucket:
+```bash
+python3 src/jp2_remediator/main.py bucket remediation-folder
+```
+
+### Process only files with a specific prefix (folder):
+```bash
+python3 src/jp2_remediator/main.py bucket remediation-folder --prefix testbatch_20240923
+```
+
+## Run tests
+
+### Run integration tests
+```bash
+pytest src/jp2_remediator/tests/integration/
+```
+
+### Run unit tests
+```bash
+pytest src/jp2_remediator/tests/unit/
+```
 
 ## Docker environment
 
@@ -51,3 +77,13 @@ Start Docker container
 ```bash
 ./bin/docker-run.sh
 ```
+
+## Development environment
+```bash
+python3 -m venv myenv
+source myenv/bin/activate
+export PYTHONPATH="${PYTHONPATH}:src"
+pip install -r requirements.txt
+
+python src/jp2_remediator/main.py -h
+```
diff --git a/src/jp2_remediator/box_reader.py b/src/jp2_remediator/box_reader.py
@@ -1,14 +1,6 @@
-# import sys
-import os
-import argparse
-import boto3
 import datetime
-
-# from jpylyzer import jpylyzer
 from jpylyzer import boxvalidator
 
-# from jpylyzer import byteconv
-
 
 class BoxReader:
     def __init__(self, file_path):
@@ -272,68 +264,3 @@ def read_jp2_file(self):
 
         self.write_modified_file(new_file_contents)
 
-
-def process_directory(directory_path):
-    """Process all JP2 files in a given directory."""
-    for root, _, files in os.walk(directory_path):
-        for file in files:
-            if file.lower().endswith(".jp2"):
-                file_path = os.path.join(root, file)
-                print(f"Processing file: {file_path}")
-                reader = BoxReader(file_path)
-                reader.read_jp2_file()
-
-
-def process_s3_bucket(bucket_name, prefix=""):
-    """Process all JP2 files in a given S3 bucket."""
-    s3 = boto3.client("s3")
-    response = s3.list_objects_v2(Bucket=bucket_name, Prefix=prefix)
-
-    if "Contents" in response:
-        for obj in response["Contents"]:
-            if obj["Key"].lower().endswith(".jp2"):
-                file_path = obj["Key"]
-                print(f"""Processing file: {file_path} from bucket {
-                    bucket_name
-                    }""")
-                download_path = f"/tmp/{os.path.basename(file_path)}"
-                s3.download_file(bucket_name, file_path, download_path)
-                reader = BoxReader(download_path)
-                reader.read_jp2_file()
-                # Optionally, upload modified file back to S3
-                timestamp = datetime.datetime.now().strftime(
-                    "%Y%m%d"
-                )  # use "%Y%m%d_%H%M%S" for more precision
-                s3.upload_file(
-                    download_path.replace(
-                        ".jp2", f"_modified_{timestamp}.jp2"
-                        ),
-                    bucket_name,
-                    file_path.replace(".jp2", f"_modified_{timestamp}.jp2"),
-                )
-
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser(description="JP2 file processor")
-    parser.add_argument("--file", help="Path to a single JP2 file to process.")
-    parser.add_argument(
-        "--directory", help="Path to a directory of JP2 files to process."
-    )
-    parser.add_argument(
-        "--bucket", help="Name of the AWS S3 bucket to process JP2 files from."
-    )
-    parser.add_argument(
-        "--prefix", help="Prefix of files in the AWS S3 bucket (optional)."
-    )
-
-    args = parser.parse_args()
-
-    if args.file:
-        reader = BoxReader(args.file)
-        reader.read_jp2_file()
-    elif args.directory:
-        process_directory(args.directory)
-    elif args.bucket:
-        process_s3_bucket(args.bucket, args.prefix)
-    else:
-        print("Please specify either --file, --directory, or --bucket.")
diff --git a/src/jp2_remediator/main.py b/src/jp2_remediator/main.py
@@ -1,34 +1,62 @@
-import sys
-import os
+import argparse
+from jp2_remediator.processor import Processor
 
 
-def main():
-    if len(sys.argv) != 3:
-        print("Usage: python script.py <folder_path1> <folder_path2>")
-        sys.exit(1)
-
-    folder_path1 = sys.argv[1]
-    folder_path2 = sys.argv[2]
-
-    if not os.path.isdir(folder_path1):
-        print(f"Error: {folder_path1} is not a valid directory.")
-        sys.exit(1)
 
-    if not os.path.isdir(folder_path2):
-        print(f"Error: {folder_path2} is not a valid directory.")
-        sys.exit(1)
-
-    print(f"Folder 1: {folder_path1}")
-    print(f"Folder 2: {folder_path2}")
+def main():
+    processor = Processor()
+
+    parser = argparse.ArgumentParser(description="JP2 file processor")
+
+    # Create mutually exclusive subparsers for specifying input source
+    subparsers = parser.add_subparsers(
+        title="Input source", dest="input_source"
+    )
+
+    # Subparser for processing a single JP2 file
+    file_parser = subparsers.add_parser(
+        "file", help="Process a single JP2 file"
+    )
+    file_parser.add_argument(
+        "file", help="Path to a single JP2 file to process"
+    )
+    file_parser.set_defaults(
+        func=lambda args: processor.process_file(args.file)
+    )
+
+    # Subparser for processing all JP2 files in a directory
+    directory_parser = subparsers.add_parser(
+        "directory", help="Process all JP2 files in a directory"
+    )
+    directory_parser.add_argument(
+        "directory", help="Path to a directory of JP2 files to process"
+    )
+    directory_parser.set_defaults(
+        func=lambda args: processor.process_directory(args.directory)
+    )
+
+    # Subparser for processing all JP2 files in an S3 bucket
+    bucket_parser = subparsers.add_parser(
+        "bucket", help="Process all JP2 files in an S3 bucket"
+    )
+    bucket_parser.add_argument(
+        "bucket", help="Name of the AWS S3 bucket to process JP2 files from"
+    )
+    bucket_parser.add_argument(
+        "--prefix", help="Prefix of files in the AWS S3 bucket (optional)",
+        default=""
+    )
+    bucket_parser.set_defaults(
+        func=lambda args: processor.process_s3_bucket(args.bucket, args.prefix)
+    )
+
+    args = parser.parse_args()
+
+    if hasattr(args, "func"):
+        args.func(args)
+    else:
+        parser.print_help()
 
 
 if __name__ == "__main__":
-    main()
-
-
-def hello_world():
-    print("Hello, world!")
-
-
-def add_one(number):
-    return number + 1
+    main()
diff --git a/src/jp2_remediator/processor.py b/src/jp2_remediator/processor.py
@@ -0,0 +1,57 @@
+import datetime
+import os
+import boto3
+
+from jp2_remediator.box_reader import BoxReader
+
+
+class Processor:
+    """Class to process JP2 files."""
+
+    def process_file(self, file_path):
+        """Process a single JP2 file."""
+        print(f"Processing file: {file_path}")
+        reader = BoxReader(file_path)
+        reader.read_jp2_file()
+
+
+    def process_directory(self, directory_path):
+        """Process all JP2 files in a given directory."""
+        for root, _, files in os.walk(directory_path):
+            for file in files:
+                if file.lower().endswith(".jp2"):
+                    file_path = os.path.join(root, file)
+                    print(f"Processing file: {file_path}")
+                    reader = BoxReader(file_path)
+                    reader.read_jp2_file()
+
+
+    def process_s3_bucket(self, bucket_name, prefix=""):
+        """Process all JP2 files in a given S3 bucket."""
+        s3 = boto3.client("s3")
+        response = s3.list_objects_v2(Bucket=bucket_name, Prefix=prefix)
+
+        if "Contents" in response:
+            for obj in response["Contents"]:
+                if obj["Key"].lower().endswith(".jp2"):
+                    file_path = obj["Key"]
+                    print(f"""Processing file: {file_path} from bucket {
+                        bucket_name
+                        }""")
+                    download_path = f"/tmp/{os.path.basename(file_path)}"
+                    s3.download_file(bucket_name, file_path, download_path)
+                    reader = BoxReader(download_path)
+                    reader.read_jp2_file()
+                    # Upload the timestamped "_modified_" file back to S3
+                    timestamp = datetime.datetime.now().strftime(
+                        "%Y%m%d"
+                    )  # use "%Y%m%d_%H%M%S" for more precision
+                    s3.upload_file(
+                        download_path.replace(
+                            ".jp2", f"_modified_{timestamp}.jp2"
+                            ),
+                        bucket_name,
+                        file_path.replace(".jp2", f"_modified_{timestamp}.jp2"),
+                    )
+
+