diff --git a/conf/config.yaml b/conf/config.yaml index f17f1bf..c5f632a 100644 --- a/conf/config.yaml +++ b/conf/config.yaml @@ -7,7 +7,9 @@ comet: check_annotations: true # Force upload bypasses the pipeline, useful for debugging and starting a new project -force_upload: true +force_upload: False +force_training: True + label_studio: project_name_train: "Bureau of Ocean Energy Management - Training" project_name_validation: "Bureau of Ocean Energy Management - Validation" @@ -37,15 +39,17 @@ detection_model: train_csv_folder: /blue/ewhite/b.weinstein/BOEM/annotations/train/ train_image_dir: /blue/ewhite/b.weinstein/BOEM/sample_flight/JPG_2024_Jan27/annotated crop_image_dir: /blue/ewhite/b.weinstein/BOEM/detection/crops/ - limit_empty_frac: 0.05 + limit_empty_frac: 0.25 labels: - "Bird" - trainer: + trainer: train: fast_dev_run: False - epochs: 3 - lr: 0.00001 + epochs: 10 + lr: 0.0001 workers: 0 + validation: + val_accuracy_interval: 20 classification_model: checkpoint: @@ -56,8 +60,8 @@ classification_model: under_sample_ratio: 0 trainer: fast_dev_run: False - max_epochs: 4 - lr: 0.001 + max_epochs: 1 + lr: 0.00001 pipeline_evaluation: detect_ground_truth_dir: /blue/ewhite/b.weinstein/BOEM/annotations/validation @@ -80,7 +84,7 @@ active_learning: n_images: 50 patch_size: 2000 patch_overlap: 0 - min_score: 0.3 + min_score: 0.25 model_checkpoint: target_labels: - "Bird" diff --git a/src/active_learning.py b/src/active_learning.py index b3ad34f..e0b579d 100644 --- a/src/active_learning.py +++ b/src/active_learning.py @@ -27,7 +27,8 @@ def choose_train_images(evaluation, image_dir, strategy, n=10, patch_size=512, p Returns: list: A list of image paths. 
""" - pool = glob.glob(os.path.join(image_dir,"*")) # Get all images in the data directory + pool = glob.glob(os.path.join(image_dir,"*.jpg")) # Get all images in the data directory + # Remove .csv files from the pool pool = [image for image in pool if not image.endswith('.csv')] @@ -81,7 +82,7 @@ def update_sys_path(): if target_labels is None: raise ValueError("Target labels are required for the 'target-labels' strategy.") # Filter images by target labels - chosen_images = preannotations[preannotations.label.isin(target_labels)].groupby("image_path").size().sort_values(ascending=False).head(n).index.tolist() + chosen_images = preannotations[preannotations.label.isin(target_labels)].groupby("image_path")["score"].mean().sort_values(ascending=False).head(n).index.tolist() else: raise ValueError("Invalid strategy. Must be one of 'random', 'most-detections', or 'target-labels'.") # Get full path diff --git a/src/data_processing.py b/src/data_processing.py index d870d5d..b49747d 100644 --- a/src/data_processing.py +++ b/src/data_processing.py @@ -2,6 +2,7 @@ import os from logging import warn from deepforest import preprocess +from deepforest.utilities import read_file from typing import Optional, Union, List, Dict import numpy as np from scipy.spatial import ConvexHull @@ -52,7 +53,8 @@ def preprocess_images( save_dir: str, limit_empty_frac: float = 0.1, patch_size: int = 450, - patch_overlap: int = 0 + patch_overlap: int = 0, + allow_empty: bool = False ) -> pd.DataFrame: """ Cut images into GPU-friendly chunks and process annotations accordingly. 
@@ -68,6 +70,7 @@ def preprocess_images( limit_empty_frac: Maximum fraction of empty patches to keep patch_size: Size of the output patches in pixels patch_overlap: Overlap between patches in pixels + allow_empty: Whether to allow patches without annotations Returns: DataFrame containing annotations for the processed image patches @@ -88,11 +91,6 @@ def preprocess_images( for image_path in annotations.image_path.unique(): annotation_df = annotations[annotations.image_path == image_path] - - if annotation_df.empty: - allow_empty = True - else: - allow_empty = False crop_annotation = process_image( image_path=image_path, @@ -144,12 +142,6 @@ def process_image( full_path = os.path.join(root_dir, image_path) - # Check if all xmin values are 0, indicating empty annotations - if annotation_df is not None and all(annotation_df['xmin'] == 0): - allow_empty = True - else: - allow_empty = False - crop_annotation = preprocess.split_raster( path_to_raster=full_path, annotations_file=annotation_df, @@ -159,7 +151,31 @@ def process_image( root_dir=root_dir, allow_empty=allow_empty ) + + # Split FalsePositives out for hard negative mining, for those images with no true positives + true_positive_images = crop_annotation.loc[crop_annotation.label != "FalsePositive", "image_path"].unique() + # Remove any FalsePositives that are in true positive images + crop_annotation = crop_annotation.loc[~((crop_annotation.label == "FalsePositive") & (crop_annotation.image_path.isin(true_positive_images))), :] + crop_annotation.loc[crop_annotation.label == "FalsePositive", ["xmin", "ymin", "xmax", "ymax"]] = 0 + crop_annotation["label"] = crop_annotation["label"].astype(str) + crop_annotation.loc[crop_annotation.label == "FalsePositive", "label"] = "Bird" + crop_annotation.loc[crop_annotation.label == "0", "label"] = "Bird" + + # Update geometry for FalsePositives + crop_annotation.drop(columns=["geometry"], inplace=True) + crop_annotation = pd.DataFrame(crop_annotation) + crop_annotation = 
read_file(crop_annotation) + + # Remove duplicates + crop_annotation = crop_annotation.drop_duplicates(subset=["image_path", "xmin", "ymin", "xmax", "ymax", "label"]) + + # Remove FalsePositives that are in true positive images + crop_annotation = crop_annotation.loc[~((crop_annotation.label == "FalsePositive") & (crop_annotation.image_path.isin(true_positive_images))), :] + + # Save over the original csv + crop_annotation.to_csv(crop_csv, index=False) + if annotation_df is None: empty_annotations = [] for i in range(len(crop_annotation)): diff --git a/src/detection.py b/src/detection.py index 91229e1..7fa1578 100644 --- a/src/detection.py +++ b/src/detection.py @@ -136,11 +136,15 @@ def train(model, train_annotations, test_annotations, train_image_dir, comet_pro tmpdir = tempfile.gettempdir() train_annotations.to_csv(os.path.join(tmpdir,"train.csv"), index=False) + test_annotations.to_csv(os.path.join(tmpdir,"test.csv"), index=False) # Set config model.config["train"]["csv_file"] = os.path.join(tmpdir,"train.csv") model.config["train"]["root_dir"] = train_image_dir + #model.config["validation"]["csv_file"] = os.path.join(tmpdir,"test.csv") + #model.config["validation"]["root_dir"] = train_image_dir + # Loop through all keys in model.config and set them to the value of the key in model.config config_args = OmegaConf.to_container(config_args) if config_args: @@ -162,24 +166,20 @@ def train(model, train_annotations, test_annotations, train_image_dir, comet_pro else: model.create_trainer() - # with comet_logger.experiment.context_manager("train_images"): - # non_empty_train_annotations = train_annotations[~(train_annotations.xmax==0)] - # try: - # non_empty_train_annotations= gpd.GeoDataFrame(non_empty_train_annotations, geometry=non_empty_train_annotations["geometry"]) - # non_empty_train_annotations.root_dir = train_image_dir - # non_empty_train_annotations = read_file(non_empty_train_annotations) - # except: - # non_empty_train_annotations = 
read_file(non_empty_train_annotations, root_dir=train_image_dir) - - # if non_empty_train_annotations.empty: - # pass - # else: - # sample_train_annotations = non_empty_train_annotations[non_empty_train_annotations.image_path.isin(non_empty_train_annotations.image_path.head(5))] - # for filename in sample_train_annotations.image_path: - # sample_train_annotations_for_image = sample_train_annotations[sample_train_annotations.image_path == filename] - # sample_train_annotations_for_image.root_dir = train_image_dir - # visualize.plot_results(sample_train_annotations_for_image, savedir=tmpdir) - # comet_logger.experiment.log_image(os.path.join(tmpdir, filename)) + with comet_logger.experiment.context_manager("train_images"): + non_empty_train_annotations = read_file(model.config["train"]["csv_file"], root_dir=train_image_dir) + for filename in non_empty_train_annotations.image_path.sample(5): + sample_train_annotations_for_image = non_empty_train_annotations[non_empty_train_annotations.image_path == filename] + sample_train_annotations_for_image.root_dir = train_image_dir + visualize.plot_annotations(sample_train_annotations_for_image, savedir=tmpdir) + comet_logger.experiment.log_image(os.path.join(tmpdir, filename)) + # with comet_logger.experiment.context_manager("test_images"): + # non_empty_train_annotations = read_file(model.config["validation"]["csv_file"], root_dir=train_image_dir) + # for filename in non_empty_train_annotations.image_path.sample(5): + # sample_train_annotations_for_image = non_empty_train_annotations[non_empty_train_annotations.image_path == filename] + # sample_train_annotations_for_image.root_dir = train_image_dir + # visualize.plot_annotations(sample_train_annotations_for_image, savedir=tmpdir) + # comet_logger.experiment.log_image(os.path.join(tmpdir, filename)) model.trainer.fit(model) @@ -207,22 +207,32 @@ def preprocess_and_train(config, model_type="detection"): """ # Get and split annotations train_df = 
gather_data(config.detection_model.train_csv_folder) - validation_df = gather_data(config.label_studio.csv_dir_validation) - validation_df.loc[validation_df.label==0,"label"] = "Bird" + validation = gather_data(config.label_studio.csv_dir_validation) + validation.loc[validation.label==0,"label"] = "Bird" + + # Remove the empty frames, using hard mining instead + train_df = train_df[~(train_df.label.astype(str)== "0")] # Preprocess train and validation data train_df = data_processing.preprocess_images(train_df, root_dir=config.detection_model.train_image_dir, - save_dir=config.detection_model.crop_image_dir) + save_dir=config.detection_model.crop_image_dir, + patch_size=config.predict.patch_size, + patch_overlap=config.predict.patch_overlap) non_empty = train_df[train_df.xmin!=0] + train_df.loc[train_df.label==0,"label"] = "Bird" + validation.loc[validation.label==0,"label"] = "Bird" - if not validation_df.empty: - validation_df = data_processing.preprocess_images(validation_df, + if not validation.empty: + validation_df = data_processing.preprocess_images(validation, root_dir=config.detection_model.train_image_dir, - save_dir=config.detection_model.crop_image_dir) - non_empty = validation_df[validation_df.xmin!=0] + save_dir=config.detection_model.crop_image_dir, + patch_size=config.predict.patch_size, + patch_overlap=config.predict.patch_overlap, + allow_empty=True + ) validation_df.loc[validation_df.label==0,"label"] = "Bird" # Limit empty frames @@ -231,6 +241,7 @@ def preprocess_and_train(config, model_type="detection"): if not validation_df.empty: validation_df = limit_empty_frames(validation_df, config.detection_model.limit_empty_frac) + # Train model # Load existing model if config.detection_model.checkpoint: @@ -240,6 +251,9 @@ def preprocess_and_train(config, model_type="detection"): else: raise ValueError("No checkpoint or checkpoint directory found.") + # Assert no FalsePositive label in train + assert "FalsePositive" not in train_df.label.unique(), 
"FalsePositive label found in training data." + trained_model = train(train_annotations=train_df, test_annotations=validation_df, train_image_dir=config.detection_model.crop_image_dir, @@ -280,7 +294,7 @@ def _predict_list_(image_paths, patch_size, patch_overlap, model_path, m=None, c predictions = [] for image_path in image_paths: - prediction = m.predict_tile(raster_path=image_path, return_plot=False, patch_size=patch_size, patch_overlap=patch_overlap, crop_model=crop_model) + prediction = m.predict_tile(raster_path=image_path, return_plot=False, patch_size=patch_size, patch_overlap=patch_overlap, crop_model=crop_model, verbose=True) if prediction is None: prediction = pd.DataFrame({"image_path": image_path, "xmin": [None], "ymin": [None], "xmax": [None], "ymax": [None], "label": [None], "score": [None]}) predictions.append(prediction) diff --git a/src/pipeline.py b/src/pipeline.py index f297287..8ab23cb 100644 --- a/src/pipeline.py +++ b/src/pipeline.py @@ -64,10 +64,11 @@ def run(self): if new_val_annotations is None: if self.config.force_upload: print("No new annotations, but force_upload is set to True, continuing") - self.skip_training = True - else: + elif not self.config.force_training: print("No new annotations, exiting") return None + else: + print(f"No new annotations, but force training is {self.config.force_training} and force upload is {self.config.force_upload}, continuing") else: try: print(f"New train annotations found: {len(new_train_annotations)}") @@ -75,16 +76,12 @@ def run(self): pass print(f"New val annotations found: {len(new_val_annotations)}") - self.skip_training = False - # Given new annotations, propogate labels to nearby images # label_propagator = propagate.LabelPropagator( # **self.config.propagate) # label_propagator.through_time(new_annotations) - else: - self.skip_training = False - if not self.skip_training: + if self.config.force_training: trained_detection_model = detection.preprocess_and_train( self.config) @@ -203,4 +200,5 
@@ def run(self): pipeline_monitor=pipeline_monitor) reporter.generate_report() - + else: + print("No images to annotate") diff --git a/src/pipeline_evaluation.py b/src/pipeline_evaluation.py index 297dd6c..1794f6d 100644 --- a/src/pipeline_evaluation.py +++ b/src/pipeline_evaluation.py @@ -186,11 +186,10 @@ def evaluate_uncertain_classification(self): image_targets = self.classification_annotations.loc[self.classification_annotations.image_path == os.path.basename(image_path)] image_predictions = self.uncertain_predictions.loc[self.uncertain_predictions.image_path == os.path.basename(image_path)] image_predictions = image_predictions[image_predictions.score > self.min_score] + if image_predictions.empty: + continue target = self._format_targets(image_targets) pred = self._format_targets(image_predictions) - - if len(pred["labels"]) == 0: - continue targets.append(target) preds.append(pred) diff --git a/submit.sh b/submit.sh index e5b5817..01a4aa0 100644 --- a/submit.sh +++ b/submit.sh @@ -15,4 +15,4 @@ source activate BOEM cd ~/BOEM/ -python main.py check_annotations=False active_learning.pool_limit=100 active_testing.n_images=10 active_learning.n_images=10 +python main.py check_annotations=True active_learning.pool_limit=20000 active_testing.n_images=1 active_learning.n_images=30 detection_model.trainer.epochs=20