diff --git a/USGS_backbone.py b/USGS_backbone.py
new file mode 100644
index 0000000..795328b
--- /dev/null
+++ b/USGS_backbone.py
@@ -0,0 +1,64 @@
+from deepforest import main
+import pandas as pd
+import os
+import tempfile
+import comet_ml
+from pytorch_lightning.loggers import CometLogger
+
+df = pd.read_csv("/blue/ewhite/b.weinstein/BOEM/UBFAI Annotations/20231118/20231116_cropped_annotations.csv")
+df.wat_label.value_counts()  # inspect the class distribution (no-op in a script; useful interactively)
+df = df[df.wat_label.isin(["Bird","Cartilaginous Fish","Bony Fish","Mammal","Reptile"])]
+
+# Combine Fish classes
+df.loc[df.wat_label.isin(["Cartilaginous Fish","Bony Fish"]),"wat_label"] = "Fish"
+
+# Construct padded crop name
+df["image_path"] = df["bname_parent"] + "_" + df["tile_xtl"].astype(str) + "_" + df["tile_ytl"].astype(str) + "_" + df["tile_xbr"].astype(str) + "_" + df["tile_ybr"].astype(str) + ".JPG"
+
+# Check if all images exist
+df["image_exists"] = df["image_path"].apply(lambda x: os.path.exists(os.path.join("/blue/ewhite/b.weinstein/BOEM/UBFAI Annotations/20231118/padded",x)))
+
+df["xmin"] = df["xtl"]
+df["ymin"] = df["ytl"]
+df["xmax"] = df["xbr"]
+df["ymax"] = df["ybr"]
+df["label"] = df["wat_label"]
+
+# Randomly split 85 - 15 for each class
+train = df.groupby("wat_label").sample(frac=0.85)
+test = df.drop(train.index)
+
+# Write to tmp data directory
+tmpdir = tempfile.mkdtemp()
+train.to_csv(os.path.join(tmpdir,"train.csv"),index=False)
+test.to_csv(os.path.join(tmpdir,"test.csv"),index=False)
+
+# Initialize a new DeepForest model (the model you will train) with your classes
+m = main.deepforest(config_args={"num_classes":4}, label_dict={"Bird":0,"Fish":1,"Mammal":2,"Reptile":3})
+
+# Initialize the released DeepForest model (the model whose backbone and regression head you will reuse)
+deepforest_release_model = main.deepforest()
+deepforest_release_model.load_model("weecology/deepforest-bird")
+
+# Extract the single-class backbone, which has useful features for multi-class classification
+m.model.backbone.load_state_dict(deepforest_release_model.model.backbone.state_dict())
+
+# Load the regression head into the new model
+m.model.head.regression_head.load_state_dict(deepforest_release_model.model.head.regression_head.state_dict())
+
+m.config["train"]["csv_file"] = os.path.join(tmpdir,"train.csv")
+m.config["train"]["root_dir"] = "/blue/ewhite/b.weinstein/BOEM/UBFAI Annotations/20231118/padded"
+m.config["train"]["fast_dev_run"] = False
+m.config["validation"]["csv_file"] = os.path.join(tmpdir,"test.csv")
+m.config["validation"]["root_dir"] = "/blue/ewhite/b.weinstein/BOEM/UBFAI Annotations/20231118/padded"
+m.config["batch_size"] = 6
+m.config["train"]["epochs"] = 25
+m.config["validation"]["val_accuracy_interval"] = 5
+m.config["train"]["scheduler"]["params"]["eps"] = 0
+comet_logger = CometLogger(project_name="BOEM", workspace="bw4sz")
+
+m.create_trainer(logger=comet_logger)
+m.trainer.fit(m)
+
+# Save the model
+m.trainer.save_checkpoint("/blue/ewhite/b.weinstein/BOEM/UBFAI Annotations/checkpoints/{}.pl".format(comet_logger.experiment.id))
diff --git a/conf/config.yaml b/conf/config.yaml
index c5f632a..3d0712a 100644
--- a/conf/config.yaml
+++ b/conf/config.yaml
@@ -26,7 +26,7 @@ predict:
   min_score: 0.4
 
 pipeline:
-  confidence_threshold: 0.5
+  confidence_threshold: 0.9
   limit_empty_frac: 0.01
 
 propagate:
@@ -34,12 +34,12 @@ propagate:
   distance_threshold_pixels: 50
 
 detection_model:
-  checkpoint: bird
+  checkpoint: "/blue/ewhite/b.weinstein/BOEM/UBFAI Annotations/checkpoints/5420a9c3f27d4299992094a7b9b49cb7.pl"
   checkpoint_dir: /blue/ewhite/b.weinstein/BOEM/detection/checkpoints
   train_csv_folder: /blue/ewhite/b.weinstein/BOEM/annotations/train/
   train_image_dir: /blue/ewhite/b.weinstein/BOEM/sample_flight/JPG_2024_Jan27/annotated
   crop_image_dir: /blue/ewhite/b.weinstein/BOEM/detection/crops/
-  limit_empty_frac: 0.25
+  limit_empty_frac: 0.2
   labels:
     - "Bird"
   trainer:
@@ -49,7 +49,7 @@ detection_model:
     lr: 0.0001
     workers: 0
   validation:
-    val_accuracy_interval: 20
+    val_accuracy_interval: 3
 
 classification_model:
   checkpoint:
@@ -84,7 +84,7 @@ active_learning:
   n_images: 50
   patch_size: 2000
   patch_overlap: 0
-  min_score: 0.25
+  min_score: 0.1
  model_checkpoint:
  target_labels:
    - "Bird"
@@ -98,7 +98,7 @@ active_learning:
 active_testing:
   image_dir: /blue/ewhite/b.weinstein/BOEM/sample_flight/JPG_2024_Jan27
   strategy: 'random'
-  n_images: 1
+  n_images: 1000
   m:
   patch_size: 2000
   patch_overlap: 0
diff --git a/src/active_learning.py b/src/active_learning.py
index e0b579d..73749d1 100644
--- a/src/active_learning.py
+++ b/src/active_learning.py
@@ -70,10 +70,15 @@ def update_sys_path():
             dask_results.append(pd.concat(block_result))
         preannotations = pd.concat(dask_results)
     else:
-        preannotations = detection.predict(m=model, image_paths=pool, patch_size=patch_size, patch_overlap=patch_overlap)
+        preannotations = detection.predict(m=model, image_paths=pool, patch_size=patch_size, patch_overlap=patch_overlap, batch_size=32)
         preannotations = pd.concat(preannotations)
 
+    # Print the number of preannotations before removing min score
+    print("There are {} preannotations before removing min score".format(preannotations.shape[0]))
+    print("There are {} images before removing min score".format(preannotations["image_path"].nunique()))
     preannotations = preannotations[preannotations["score"] >= min_score]
+    print("There are {} preannotations after removing min score".format(preannotations.shape[0]))
+    print("There are {} images after removing min score".format(preannotations["image_path"].nunique()))
 
     if strategy == "most-detections":
         # Sort images by total number of predictions
@@ -158,6 +163,7 @@ def update_sys_path():
         preannotations = detection.predict(model=model, image_paths=pool, patch_size=patch_size, patch_overlap=patch_overlap)
         preannotations = pd.concat(preannotations)
 
+        print("There are {} preannotations before removing min score".format(preannotations.shape[0]))
        preannotations = preannotations[preannotations["score"] >= min_score]
 
        if strategy == "most-detections":
diff --git a/src/detection.py b/src/detection.py
index 7fa1578..6a7f4eb 100644
--- a/src/detection.py
+++ b/src/detection.py
@@ -59,7 +59,7 @@ def load(checkpoint, annotations = None):
 
     if not annotations is None:
         num_labels = len(annotations.label.unique())
-        if num_labels != len(snapshot.label_dict):
+        if num_labels > len(snapshot.label_dict):
             snapshot = extract_backbone(snapshot, annotations)
 
     return snapshot
@@ -135,6 +135,10 @@ def train(model, train_annotations, test_annotations, train_image_dir, comet_pro
     """
     tmpdir = tempfile.gettempdir()
 
+    # Fix taxonomy
+    train_annotations = fix_taxonomy(train_annotations)
+    test_annotations = fix_taxonomy(test_annotations)
+
     train_annotations.to_csv(os.path.join(tmpdir,"train.csv"), index=False)
     test_annotations.to_csv(os.path.join(tmpdir,"test.csv"), index=False)
 
@@ -142,8 +146,8 @@ def train(model, train_annotations, test_annotations, train_image_dir, comet_pro
     model.config["train"]["csv_file"] = os.path.join(tmpdir,"train.csv")
model.config["train"]["root_dir"] = train_image_dir - #model.config["validation"]["csv_file"] = os.path.join(tmpdir,"test.csv") - #model.config["validation"]["root_dir"] = train_image_dir + model.config["validation"]["csv_file"] = os.path.join(tmpdir,"test.csv") + model.config["validation"]["root_dir"] = train_image_dir # Loop through all keys in model.config and set them to the value of the key in model.config config_args = OmegaConf.to_container(config_args) @@ -173,13 +177,14 @@ def train(model, train_annotations, test_annotations, train_image_dir, comet_pro sample_train_annotations_for_image.root_dir = train_image_dir visualize.plot_annotations(sample_train_annotations_for_image, savedir=tmpdir) comet_logger.experiment.log_image(os.path.join(tmpdir, filename)) - # with comet_logger.experiment.context_manager("test_images"): - # non_empty_train_annotations = read_file(model.config["validation"]["csv_file"], root_dir=train_image_dir) - # for filename in non_empty_train_annotations.image_path.sample(5): - # sample_train_annotations_for_image = non_empty_train_annotations[non_empty_train_annotations.image_path == filename] - # sample_train_annotations_for_image.root_dir = train_image_dir - # visualize.plot_annotations(sample_train_annotations_for_image, savedir=tmpdir) - # comet_logger.experiment.log_image(os.path.join(tmpdir, filename)) + + with comet_logger.experiment.context_manager("test_images"): + non_empty_validation_annotations = read_file(model.config["validation"]["csv_file"], root_dir=train_image_dir) + for filename in non_empty_validation_annotations.image_path.head(5): + sample_validation_annotations_for_image = non_empty_validation_annotations[non_empty_validation_annotations.image_path == filename] + sample_validation_annotations_for_image.root_dir = train_image_dir + visualize.plot_annotations(sample_validation_annotations_for_image, savedir=tmpdir) + comet_logger.experiment.log_image(os.path.join(tmpdir, filename)) model.trainer.fit(model) @@ -196,6 +201,12 @@ def train(model, train_annotations, test_annotations, train_image_dir, comet_pro return model +def fix_taxonomy(df): + df["label"] = df.label.replace('Turtle', 'Reptile') + df["label"] = df.label.replace('Cetacean', 'Mammal') + + return df + def preprocess_and_train(config, model_type="detection"): """Preprocess data and train model. 
@@ -208,6 +219,8 @@ def preprocess_and_train(config, model_type="detection"):
     # Get and split annotations
     train_df = gather_data(config.detection_model.train_csv_folder)
     validation = gather_data(config.label_studio.csv_dir_validation)
+    # Map numeric label 0 back to the 'Bird' class name
+    validation.loc[validation.label==0,"label"] = "Bird"
 
     # Remove the empty frames, using hard mining instead
@@ -239,7 +251,9 @@ def preprocess_and_train(config, model_type="detection"):
     if config.detection_model.limit_empty_frac > 0:
         train_df = limit_empty_frames(train_df, config.detection_model.limit_empty_frac)
     if not validation_df.empty:
-        validation_df = limit_empty_frames(validation_df, config.detection_model.limit_empty_frac)
+        #validation_df = limit_empty_frames(validation_df, config.detection_model.limit_empty_frac)
+        # DeepForest evaluate doesn't work with empty frames yet, see https://github.com/weecology/DeepForest/pull/858
+        validation_df = validation_df[validation_df.xmin!=0]
 
     # Train model
@@ -283,7 +297,7 @@ def get_latest_checkpoint(checkpoint_dir, annotations):
 
     return m
 
-def _predict_list_(image_paths, patch_size, patch_overlap, model_path, m=None, crop_model=None):
+def _predict_list_(image_paths, patch_size, patch_overlap, model_path, m=None, crop_model=None, batch_size=64):
     if model_path:
         m = load(model_path)
     else:
@@ -291,17 +305,17 @@ def _predict_list_(image_paths, patch_size, patch_overlap, model_path, m=None, c
         raise ValueError("A model or model_path is required for prediction.")
 
     m.create_trainer(fast_dev_run=False)
-
+    m.config["batch_size"] = batch_size
     predictions = []
     for image_path in image_paths:
-        prediction = m.predict_tile(raster_path=image_path, return_plot=False, patch_size=patch_size, patch_overlap=patch_overlap, crop_model=crop_model, verbose=True)
+        prediction = m.predict_tile(raster_path=image_path, return_plot=False, patch_size=patch_size, patch_overlap=patch_overlap, crop_model=crop_model)
         if prediction is None:
             prediction = pd.DataFrame({"image_path": image_path, "xmin": [None], "ymin": [None], "xmax": [None], "ymax": [None], "label": [None], "score": [None]})
         predictions.append(prediction)
 
     return predictions
 
-def predict(image_paths, patch_size, patch_overlap, m=None, model_path=None, dask_client=None, crop_model=None):
+def predict(image_paths, patch_size, patch_overlap, m=None, model_path=None, dask_client=None, crop_model=None, batch_size=8):
     """Predict bounding boxes for images
     Args:
         m (main.deepforest): A trained deepforest model.
@@ -309,6 +323,7 @@ def predict(image_paths, patch_size, patch_overlap, m=None, model_path=None, das
         crop_model (main.deepforest): A trained deepforest model for classification.
         model_path (str): The path to a model checkpoint.
         dask_client (dask.distributed.Client): A dask client for parallel prediction.
+        batch_size (int): The batch size for prediction.
     Returns:
         list: A list of image predictions.
""" @@ -337,6 +352,6 @@ def update_sys_path(): block_result = block_result.result() predictions.append(pd.concat(block_result)) else: - predictions = _predict_list_(image_paths=image_paths, patch_size=patch_size, patch_overlap=patch_overlap, model_path=model_path, m=m, crop_model=crop_model) + predictions = _predict_list_(image_paths=image_paths, patch_size=patch_size, patch_overlap=patch_overlap, model_path=model_path, m=m, crop_model=crop_model, batch_size=batch_size) return predictions diff --git a/src/pipeline.py b/src/pipeline.py index 8ab23cb..ed4ca95 100644 --- a/src/pipeline.py +++ b/src/pipeline.py @@ -165,8 +165,8 @@ def run(self): min_score=self.config.active_learning.min_score ) - print(f"Images requiring human review: {len(confident_predictions)}") - print(f"Images auto-annotated: {len(uncertain_predictions)}") + print(f"Images requiring human review: {len(uncertain_predictions)}") + print(f"Images auto-annotated: {len(confident_predictions)}") # Intelligent cropping image_paths = uncertain_predictions["image_path"].unique() @@ -199,6 +199,7 @@ def run(self): uncertain_predictions=uncertain_predictions, pipeline_monitor=pipeline_monitor) - reporter.generate_report() + reporter.generate_report(create_video=True) else: print("No images to annotate") + diff --git a/src/pipeline_evaluation.py b/src/pipeline_evaluation.py index 1794f6d..bccdfc0 100644 --- a/src/pipeline_evaluation.py +++ b/src/pipeline_evaluation.py @@ -128,6 +128,7 @@ def predict_classification(self): image_paths=full_image_paths, patch_size=self.patch_size, patch_overlap=self.patch_overlap, + batch_size=32 ) combined_predictions = pd.concat(predictions) self.predictions.append(combined_predictions) diff --git a/src/reporting.py b/src/reporting.py index 18b8bb5..43dca99 100644 --- a/src/reporting.py +++ b/src/reporting.py @@ -50,14 +50,15 @@ def concat_predictions(self): """ self.all_predictions = pd.concat(self.pipeline_monitor.predictions, ignore_index=True) - def generate_report(self): + def generate_report(self, create_video=False): """Generate a report""" if self.pipeline_monitor: self.concat_predictions() self.write_predictions() self.write_metrics() - self.generate_video() + if create_video: + self.generate_video() def write_predictions(self): """Write predictions to a csv file""" diff --git a/submit.sh b/submit.sh index 01a4aa0..7701ca7 100644 --- a/submit.sh +++ b/submit.sh @@ -15,4 +15,4 @@ source activate BOEM cd ~/BOEM/ -python main.py check_annotations=True active_learning.pool_limit=20000 active_testing.n_images=1 active_learning.n_images=30 detection_model.trainer.epochs = 20 +python main.py check_annotations=True active_learning.pool_limit=10000 active_testing.n_images=1 active_learning.n_images=100 pipeline_evaluation.debug=False diff --git a/submit_USGS.sh b/submit_USGS.sh new file mode 100644 index 0000000..59f2a82 --- /dev/null +++ b/submit_USGS.sh @@ -0,0 +1,18 @@ +#!/bin/bash +#SBATCH --job-name=BOEM # Job name +#SBATCH --mail-type=END # Mail events +#SBATCH --mail-user=benweinstein2010@gmail.com # Where to send mail +#SBATCH --account=ewhite +#SBATCH --nodes=1 # Number of MPI ran +#SBATCH --cpus-per-task=1 +#SBATCH --mem=150GB +#SBATCH --time=48:00:00 #Time limit hrs:min:sec +#SBATCH --output=/home/b.weinstein/logs/BOEM%j.out # Standard output and error log +#SBATCH --error=/home/b.weinstein/logs/BOEM%j.err +#SBATCH --partition=gpu +#SBATCH --gpus=1 + +source activate BOEM + +cd ~/BOEM/ +python USGS_backbone.py \ No newline at end of file diff --git a/tests/conftest.py b/tests/conftest.py 
index 8df4acb..69344d4 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -40,23 +40,22 @@ def config(tmpdir_factory):
     # Create sample bounding box annotations
     train_data = {
         'image_path': ['empty.jpg', 'birds.jpg', "birds.jpg"],
-        'xmin': [0, 200, 150],
-        'ymin': [0, 300, 250],
-        'xmax': [0, 300, 250],
-        'ymax': [0, 400, 350],
-        'label': ['Bird', 'Bird', 'Bird2'],
+        'xmin': [20, 200, 150],
+        'ymin': [10, 300, 250],
+        'xmax': [40, 300, 250],
+        'ymax': [20, 400, 350],
+        'label': ['FalsePositive', 'Bird', 'Bird2'],
         'annotator': ['test_user', 'test_user', 'test_user']
     }
 
     val_data = {
-        'image_path': ['birds_val.jpg', 'birds_val.jpg'],
-        'xmin': [150, 150],
-        'ymin': [250, 250],
-        'xmax': [250, 250],
-        'ymax': [350, 350],
-        'label': ['Bird', 'Bird2'],
-        'annotator': ['test_user', 'test_user'],
-        "score": [0.9, 0.8]
+        'image_path': ['empty.jpg', 'birds_val.jpg', 'birds_val.jpg'],
+        'xmin': [None, 150, 150],
+        'ymin': [None, 250, 250],
+        'xmax': [None, 250, 250],
+        'ymax': [None, 350, 350],
+        'label': ['Bird', 'Bird', 'Bird2'],
+        'annotator': ['test_user', 'test_user', 'test_user'],
     }
 
     metadata = {
diff --git a/tests/test_pipeline_evaluation.py b/tests/test_pipeline_evaluation.py
index a13d288..f75b9f1 100644
--- a/tests/test_pipeline_evaluation.py
+++ b/tests/test_pipeline_evaluation.py
@@ -17,12 +17,13 @@ def predict_tile(self, raster_path, patch_size=450, patch_overlap=0, return_plot
         # Return realistic predictions based on image name
         if "empty" in raster_path.lower():
             return pd.DataFrame({
-                'xmin': [],
-                'ymin': [],
-                'xmax': [],
-                'ymax': [],
-                'label': [],
-                'score': []
+                'xmin': [None],
+                'ymin': [None],
+                'xmax': [None],
+                'ymax': [None],
+                'label': [None],
+                'score': [None],
+                "image_path": [os.path.basename(raster_path)]
             })
 
         # If random, Generate 1-3 random predictions for non-empty images
@@ -34,7 +35,7 @@ def predict_tile(self, raster_path, patch_size=450, patch_overlap=0, return_plot
             'xmax': np.random.randint(800, 1000, num_predictions),
             'ymax': np.random.randint(600, 800, num_predictions),
             'label': ['Bird1'] * num_predictions,
-            'score': np.random.uniform(0.5, 0.99, num_predictions),
+            'score': np.random.uniform(0.1, 0.99, num_predictions),
             'image_path': [os.path.basename(raster_path)] * num_predictions
         })
         else:
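
Usage note: below is a minimal sketch of how the multi-class checkpoint written by USGS_backbone.py, and now referenced by detection_model.checkpoint in conf/config.yaml, could be loaded for prediction. It assumes DeepForest's standard Lightning checkpoint loading; the image path and the column selection at the end are illustrative, not part of this patch.

from deepforest import main

# Load the trained multi-class checkpoint saved by USGS_backbone.py
m = main.deepforest.load_from_checkpoint(
    "/blue/ewhite/b.weinstein/BOEM/UBFAI Annotations/checkpoints/5420a9c3f27d4299992094a7b9b49cb7.pl"
)

# Predict one image with the same patch settings used in conf/config.yaml
boxes = m.predict_tile(
    raster_path="example.jpg",  # hypothetical image path
    patch_size=2000,
    patch_overlap=0,
)
print(boxes[["xmin", "ymin", "xmax", "ymax", "label", "score"]].head())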