From d5b4091e62d40f0ad51f821d89f3efb123032ca0 Mon Sep 17 00:00:00 2001 From: Raj Sinha Date: Tue, 16 Jul 2024 14:48:47 +0000 Subject: [PATCH] Expose the `n_components` (Number of components) and `covariance_type` (type of covariance between components) parameters of the one-class classifier. PiperOrigin-RevId: 652839931 --- CHANGELOG.md | 7 +++++- README.md | 4 ++++ spade_anomaly_detection/__init__.py | 2 +- spade_anomaly_detection/occ_ensemble_test.py | 22 ++++++++++++------ spade_anomaly_detection/parameters.py | 7 ++++++ spade_anomaly_detection/runner.py | 2 ++ spade_anomaly_detection/runner_test.py | 2 ++ .../scripts/run_cloud_spade_experiment.sh | 16 ++++++++----- spade_anomaly_detection/task.py | 23 +++++++++++++++++++ 9 files changed, 70 insertions(+), 15 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 47a8bac..1b9141a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -24,6 +24,10 @@ To release a new version (e.g. from `1.0.0` -> `2.0.0`): ## [Unreleased] +## [0.3.2] - 2024-07-16 + +* Exposes the `n_components` and `covariance_type` parameters of the one-class classifier. + ## [0.3.1] - 2024-07-13 * Now writes out the pseudolabel weights and a flag that indicates whether a sample has a ground truth label (0) or a pseudolabel (1). @@ -49,7 +53,8 @@ To release a new version (e.g. 
from `1.0.0` -> `2.0.0`): * Initial release -[Unreleased]: https://github.com/google-research/spade_anomaly_detection/compare/v0.3.1...HEAD +[Unreleased]: https://github.com/google-research/spade_anomaly_detection/compare/v0.3.2...HEAD +[0.3.2]: https://github.com/google-research/spade_anomaly_detection/compare/v0.3.1...v0.3.2 [0.3.1]: https://github.com/google-research/spade_anomaly_detection/compare/v0.3.0...v0.3.1 [0.3.0]: https://github.com/google-research/spade_anomaly_detection/compare/v0.2.2...v0.3.0 [0.2.2]: https://github.com/google-research/spade_anomaly_detection/compare/v0.2.1...v0.2.2 diff --git a/README.md b/README.md index c165564..3d8dbf8 100644 --- a/README.md +++ b/README.md @@ -103,6 +103,10 @@ one class classifier ensemble to label a point as negative. The higher this valu ensemble_count: Integer representing the number of one class classifiers in the ensemble used for pseudo labeling unlabeled data points. The more models in the ensemble, the less likely it is for all the models to gain consensus, and thus will reduce the amount of labeled data points. By default, we use 5 one class classifiers. +n_components: Integer representing the number of components to use in the one class classifier ensemble. By default, we use 1 component. + +covariance_type: String representing the covariance type to use in the one class classifier ensemble. By default, we use 'full' covariance. Note that when there are many components, a 'full' covariance matrix may not be suitable. + verbose (boolean): The amount of console logs to display during training. Use the default value of False to show fewer messages, and True for displaying many aspects of model training and scoring. This is useful for debugging model performance. 
## Training Job Arguments diff --git a/spade_anomaly_detection/__init__.py b/spade_anomaly_detection/__init__.py index f938372..ea08ded 100644 --- a/spade_anomaly_detection/__init__.py +++ b/spade_anomaly_detection/__init__.py @@ -31,4 +31,4 @@ # A new PyPI release will be pushed every time `__version__` is increased. # When changing this, also update the CHANGELOG.md. -__version__ = '0.3.1' +__version__ = '0.3.2' diff --git a/spade_anomaly_detection/occ_ensemble_test.py b/spade_anomaly_detection/occ_ensemble_test.py index 0baf11e..4a7b673 100644 --- a/spade_anomaly_detection/occ_ensemble_test.py +++ b/spade_anomaly_detection/occ_ensemble_test.py @@ -29,15 +29,15 @@ """Tests for the one class classifier ensemble.""" -import numpy as np +from absl.testing import parameterized +import numpy as np from spade_anomaly_detection import data_loader from spade_anomaly_detection import occ_ensemble - import tensorflow as tf -class OccEnsembleTest(tf.test.TestCase): +class OccEnsembleTest(tf.test.TestCase, parameterized.TestCase): def test_ensemble_initialization_no_error(self): gmm_ensemble = occ_ensemble.GmmEnsemble(n_components=1, ensemble_count=10) @@ -47,13 +47,21 @@ def test_ensemble_initialization_no_error(self): with self.subTest(name='ObjectAttributes'): self.assertEqual(gmm_ensemble.ensemble_count, 10) - def test_ensemble_training_no_error(self): + # Params to test: n_components, ensemble_count, covariance_type. 
+ @parameterized.named_parameters( + ('components_1_ensemble_10_full', 1, 10, 'full'), + ('components_3_ensemble_5_full', 1, 5, 'full'), + ('components_3_ensemble_5_tied', 1, 5, 'tied'), + ) + def test_ensemble_training_no_error( + self, n_components, ensemble_count, covariance_type + ): batches_per_occ = 10 - ensemble_count = 5 - n_components = 1 ensemble_obj = occ_ensemble.GmmEnsemble( - n_components=n_components, ensemble_count=ensemble_count + n_components=n_components, + ensemble_count=ensemble_count, + covariance_type=covariance_type, ) tf_dataset = data_loader.load_tf_dataset_from_csv( diff --git a/spade_anomaly_detection/parameters.py b/spade_anomaly_detection/parameters.py index 395bff1..ccdaf78 100644 --- a/spade_anomaly_detection/parameters.py +++ b/spade_anomaly_detection/parameters.py @@ -146,6 +146,11 @@ class RunnerParameters: the less likely it is for all the models to gain consensus, and thus will reduce the amount of labeled data points. By default, we use 5 one class classifiers. + n_components: The number of components to use in the one class classifier + ensemble. By default, we use 1 component. + covariance_type: The covariance type to use in the one class classifier + ensemble. By default, we use 'full' covariance. Note that when there are + many components, a 'full' covariance matrix may not be suitable. random_seed: The random seed to use for all random number generators in the algorithm. verbose: The amount of console logs to display during training. 
Use False to @@ -177,6 +182,8 @@ class RunnerParameters: max_occ_batch_size: int = 50000 labeling_and_model_training_batch_size: Optional[int] = None ensemble_count: int = 5 + n_components: int = 1 + covariance_type: str = 'full' random_seed: int = _RANDOM_SEED verbose: bool = False diff --git a/spade_anomaly_detection/runner.py b/spade_anomaly_detection/runner.py index ff774da..80656f7 100644 --- a/spade_anomaly_detection/runner.py +++ b/spade_anomaly_detection/runner.py @@ -276,6 +276,8 @@ def instantiate_and_fit_ensemble( """ ensemble_object = occ_ensemble.GmmEnsemble( + n_components=self.runner_parameters.n_components, + covariance_type=self.runner_parameters.covariance_type, ensemble_count=self.runner_parameters.ensemble_count, positive_threshold=self.runner_parameters.positive_threshold, negative_threshold=self.runner_parameters.negative_threshold, diff --git a/spade_anomaly_detection/runner_test.py b/spade_anomaly_detection/runner_test.py index cb9a506..cab146c 100644 --- a/spade_anomaly_detection/runner_test.py +++ b/spade_anomaly_detection/runner_test.py @@ -75,6 +75,8 @@ def setUp(self): max_occ_batch_size=50000, labeling_and_model_training_batch_size=None, ensemble_count=5, + n_components=1, + covariance_type='full', verbose=False, ) diff --git a/spade_anomaly_detection/scripts/run_cloud_spade_experiment.sh b/spade_anomaly_detection/scripts/run_cloud_spade_experiment.sh index da83aea..afbe539 100644 --- a/spade_anomaly_detection/scripts/run_cloud_spade_experiment.sh +++ b/spade_anomaly_detection/scripts/run_cloud_spade_experiment.sh @@ -48,22 +48,24 @@ TEST_LABEL_COL_NAME=${16:-"y"} ALPHA=${17:-"1.0"} BATCHES_PER_MODEL=${18:-"1"} ENSEMBLE_COUNT=${19:-"5"} -MAX_OCC_BATCH_SIZE=${20:-"50000"} -LABELING_AND_MODEL_TRAINING_BATCH_SIZE=${21:-"100000"} -VERBOSE=${22:-"True"} -UPLOAD_ONLY=${23:-"False"} +N_COMPONENTS=${20:-"1"} +COVARIANCE_TYPE=${21:-"full"} +MAX_OCC_BATCH_SIZE=${22:-"50000"} +LABELING_AND_MODEL_TRAINING_BATCH_SIZE=${23:-"100000"} 
+VERBOSE=${24:-"True"} +UPLOAD_ONLY=${25:-"False"} # Give a unique name to your training job. TRIAL_NAME="spade_${USER}_${DATETIME}" # Image name and location IMAGE_NAME="spade" -IMAGE_TAG=${24:-"latest-oss"} +IMAGE_TAG=${26:-"latest-oss"} # Project image (use this for testing) IMAGE_URI="us-docker.pkg.dev/${PROJECT_ID}/spade/${IMAGE_NAME}:${IMAGE_TAG}" echo "IMAGE_URI = ${IMAGE_URI}" -BUILD=${25:-"TRUE"} +BUILD=${27:-"TRUE"} if [[ "${BUILD}" == "TRUE" ]]; then /bin/bash ./scripts/build_and_push_image.sh "${IMAGE_TAG}" "${IMAGE_NAME}" "${PROJECT_ID}" || exit @@ -97,6 +99,8 @@ gcloud ai custom-jobs create \ --args=--alpha="${ALPHA}" \ --args=--batches_per_model="${BATCHES_PER_MODEL}" \ --args=--ensemble_count="${ENSEMBLE_COUNT}" \ + --args=--n_components="${N_COMPONENTS}" \ + --args=--covariance_type="${COVARIANCE_TYPE}" \ --args=--max_occ_batch_size="${MAX_OCC_BATCH_SIZE}" \ --args=--labeling_and_model_training_batch_size="${LABELING_AND_MODEL_TRAINING_BATCH_SIZE}" \ --args=--upload_only="${UPLOAD_ONLY}" \ diff --git a/spade_anomaly_detection/task.py b/spade_anomaly_detection/task.py index 18e3e05..ca6452a 100644 --- a/spade_anomaly_detection/task.py +++ b/spade_anomaly_detection/task.py @@ -301,6 +301,27 @@ ), ) +_N_COMPONENTS = flags.DEFINE_integer( + "n_components", + default=1, + required=False, + help=( + "The number of components to use in the one class classifier ensemble. " + "By default, we use 1 component." + ), +) + +_COVARIANCE_TYPE = flags.DEFINE_string( + "covariance_type", + default="full", + required=False, + help=( + "The covariance type to use in the one class classifier ensemble. By " + "default, we use 'full' covariance. Note that when there are many " + "components, a 'full' covariance matrix may not be suitable." 
+ ), +) + _VERBOSE = flags.DEFINE_bool( "verbose", default=False, @@ -367,6 +388,8 @@ def main(argv: Sequence[str]) -> None: max_occ_batch_size=_MAX_OCC_BATCH_SIZE.value, labeling_and_model_training_batch_size=_BATCH_SIZE.value, ensemble_count=_ENSEMBLE_COUNT.value, + n_components=_N_COMPONENTS.value, + covariance_type=_COVARIANCE_TYPE.value, random_seed=_RANDOM_SEED, verbose=_VERBOSE.value, )