Skip to content

Commit

Permalink
Merge pull request #176 from weecology/schema_field
Browse files Browse the repository at this point in the history
  • Loading branch information
ethanwhite authored Apr 1, 2024
2 parents 951bc51 + a43369a commit f012979
Show file tree
Hide file tree
Showing 5 changed files with 63 additions and 45 deletions.
40 changes: 40 additions & 0 deletions deepforest_config.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
# Config file for DeepForest pytorch module

# Cpu workers for data loaders
# Dataloaders
workers: 1
devices: 1
accelerator: 'gpu'
batch_size: 1

# Model Architecture
architecture: 'retinanet'
num_classes: 1
nms_thresh: 0.05

# Architecture specific params
retinanet:
# Non-max suppression of overlapping predictions
score_thresh: 0.1

train:
csv_file:
root_dir:

# Optimizer initial learning rate
lr: 0.001

# Number of training epochs
epochs: 1
# Useful debugging flag in pytorch lightning, set to True to get a single batch of training to test settings.
fast_dev_run: False
# pin images to GPU memory for fast training. This depends on GPU size and number of images.
preload_images: False

validation:
# callback args
csv_file:
root_dir:
# Intersection over union evaluation
iou_threshold: 0.4
val_accuracy_interval: 20
11 changes: 7 additions & 4 deletions everglades_dryrun_workflow.sh
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,10 @@
#SBATCH --job-name=everglades_workflow
#SBATCH [email protected]
#SBATCH --mail-type=FAIL
#SBATCH --gpus=a100:4
#SBATCH --cpus-per-task=5
#SBATCH --gpus=a100:1
#SBATCH --cpus-per-task=3
#SBATCH --mem=200gb
#SBATCH --time=00:40:00
#SBATCH --time=01:30:00
#SBATCH --partition=gpu
#SBATCH --output=/blue/ewhite/everglades/EvergladesTools/logs/everglades_dryrun_workflow.out
#SBATCH --error=/blue/ewhite/everglades/EvergladesTools/logs/everglades_dryrun_workflow.err
Expand All @@ -20,5 +20,8 @@ conda activate EvergladesTools
export TEST_ENV=True

cd /blue/ewhite/everglades/EvergladesTools/Zooniverse

snakemake --unlock
snakemake --printshellcmds --keep-going --cores 5 --resources gpu=4 --rerun-incomplete --latency-wait 10 --use-conda
echo "INFO [$(date "+%Y-%m-%d %H:%M:%S")] Starting Snakemake pipeline"
snakemake --printshellcmds --keep-going --cores 3 --resources gpu=1 --rerun-incomplete --latency-wait 1 --use-conda
echo "INFO [$(date "+%Y-%m-%d %H:%M:%S")] End"
11 changes: 7 additions & 4 deletions everglades_workflow.sh
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,9 @@
#SBATCH --job-name=everglades_workflow
#SBATCH [email protected]
#SBATCH --mail-type=FAIL
#SBATCH --gpus=a100:4
#SBATCH --cpus-per-task=10
#SBATCH --mem=200gb
#SBATCH --gpus=a100:1
#SBATCH --cpus-per-task=30
#SBATCH --mem=1200gb
#SBATCH --time=80:00:00
#SBATCH --partition=gpu
#SBATCH --output=/blue/ewhite/everglades/EvergladesTools/logs/everglades_workflow.out
Expand All @@ -19,5 +19,8 @@ ml conda
conda activate EvergladesTools

cd /blue/ewhite/everglades/EvergladesTools/Zooniverse

snakemake --unlock
snakemake --printshellcmds --keep-going --cores 10 --resources gpu=4 --rerun-incomplete --latency-wait 10 --use-conda
echo "INFO [$(date "+%Y-%m-%d %H:%M:%S")] Starting Snakemake pipeline"
snakemake --printshellcmds --keep-going --cores 30 --resources gpu=1 --rerun-incomplete --latency-wait 10 --use-conda
echo "INFO [$(date "+%Y-%m-%d %H:%M:%S")] End"
5 changes: 3 additions & 2 deletions nest_detection.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,8 +82,8 @@ def compare_site(gdf):
results = pd.concat(results)
else:
results = pd.DataFrame(columns=[
'matched_xmin', 'matched_ymin', 'xmax', 'matched_ymax', 'label', 'score', 'Date', 'bird_id', 'target_index',
'geometry'
'matched_xmin', 'matched_ymin', 'xmax', 'matched_ymax', 'label', 'score', 'image_path', 'Date', 'bird_id',
'target_index', 'geometry'
])

return results
Expand All @@ -108,6 +108,7 @@ def detect_nests(bird_detection_file, year, site, savedir):
'matched_ymax': 'float',
'label': 'str',
'score': 'float',
'image_path': 'str',
'Site': 'str',
'Date': 'str',
'Year': 'str',
Expand Down
41 changes: 6 additions & 35 deletions predict.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,36 +8,7 @@
import shapely
import torch
from deepforest import main


def project(raster_path, boxes):
    """
    Convert image coordinates into a geospatial object to overlap with input image.

    Pixel-space bounding boxes from deepforest use a top-left origin (row 0 is the
    top of the image), so y values are subtracted from the raster's top edge while
    x values are added to its left edge.

    Args:
        raster_path: path to the raster .tif on disk. Assumed to have a valid spatial projection
        boxes: a prediction pandas dataframe from deepforest.predict_tile()
    Returns:
        a geopandas dataframe with predictions in input projection.
    """
    with rasterio.open(raster_path) as dataset:
        # Geospatial extent and per-pixel ground resolution of the raster.
        bounds = dataset.bounds
        pixelSizeX, pixelSizeY = dataset.res

        # subtract origin. Recall that numpy origin is top left! Not bottom left.
        # x: scale pixel column by ground resolution, then offset from the left edge.
        boxes["xmin"] = (boxes["xmin"] * pixelSizeX) + bounds.left
        boxes["xmax"] = (boxes["xmax"] * pixelSizeX) + bounds.left
        # y: image rows grow downward, so subtract from the top edge.
        # NOTE(review): this maps the image-space ymin (top of box) to the larger
        # geographic y, so the resulting "ymin" column holds the box top — verify
        # downstream consumers expect this orientation.
        boxes["ymin"] = bounds.top - (boxes["ymin"] * pixelSizeY)
        boxes["ymax"] = bounds.top - (boxes["ymax"] * pixelSizeY)

        # combine column to a shapely Box() object, save shapefile
        boxes['geometry'] = boxes.apply(lambda x: shapely.geometry.box(x.xmin, x.ymin, x.xmax, x.ymax), axis=1)
        boxes = geopandas.GeoDataFrame(boxes, geometry='geometry')

        # Tag the frame with the raster's CRS so exports carry the projection.
        boxes.crs = dataset.crs.to_wkt()

    # Shapefiles could be written with geopandas boxes.to_file(<filename>, driver='ESRI Shapefile')

    return boxes
from deepforest.utilities import boxes_to_shapefile


def run(proj_tile_path, checkpoint_path, savedir="."):
Expand All @@ -57,19 +28,19 @@ def run(proj_tile_path, checkpoint_path, savedir="."):
model.load_state_dict(checkpoint["state_dict"])

boxes = model.predict_tile(raster_path=proj_tile_path, patch_overlap=0, patch_size=1500)
projected_boxes = project(proj_tile_path, boxes)

proj_tile_dir = os.path.dirname(proj_tile_path)
projected_boxes = boxes_to_shapefile(boxes, proj_tile_dir)
if not os.path.exists(savedir):
os.makedirs(savedir)
basename = os.path.splitext(os.path.basename(proj_tile_path))[0]
fn = "{}/{}.shp".format(savedir, basename)
projected_boxes.to_file(fn)

# Write GeoDataFrame to a new shapefile (avoid appending)
projected_boxes.to_file(fn, driver="ESRI Shapefile")
return fn


if __name__ == "__main__":
checkpoint_path = "/blue/ewhite/everglades/Zooniverse//20220910_182547/species_model.pl"
checkpoint_path = "/blue/ewhite/everglades/Zooniverse/20220910_182547/species_model.pl"

path = sys.argv[1]
split_path = os.path.normpath(path).split(os.path.sep)
Expand Down

0 comments on commit f012979

Please sign in to comment.