From 13bf3c819efe1abaf2bc643c573f1672afac4bfd Mon Sep 17 00:00:00 2001
From: Raj Sinha <sinharaj@google.com>
Date: Tue, 16 Jul 2024 14:48:47 +0000
Subject: [PATCH] Expose the `n_components` (Number of components) and
 `covariance_type` (type of covariance between components) parameters of the
 one-class classifier.

PiperOrigin-RevId: 652839931
---
 CHANGELOG.md                                  |  7 +++++-
 README.md                                     |  4 ++++
 spade_anomaly_detection/__init__.py           |  2 +-
 spade_anomaly_detection/occ_ensemble_test.py  | 22 ++++++++++++------
 spade_anomaly_detection/parameters.py         |  7 ++++++
 spade_anomaly_detection/runner.py             |  2 ++
 spade_anomaly_detection/runner_test.py        |  2 ++
 .../scripts/run_cloud_spade_experiment.sh     | 16 ++++++++-----
 spade_anomaly_detection/task.py               | 23 +++++++++++++++++++
 9 files changed, 70 insertions(+), 15 deletions(-)
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 47a8bac..1b9141a 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -24,6 +24,10 @@ To release a new version (e.g. from `1.0.0` -> `2.0.0`):
 
 ## [Unreleased]
 
+## [0.3.2] - 2024-07-16
+
+* Exposes the `n_components` and `covariance_type` parameters of the one-class classifier.
+
 ## [0.3.1] - 2024-07-13
 
 * Now writes out the pseudolabel weights and a flag that indicates whether a sample has a ground truth label (0) or a pseudolabel (1).
@@ -49,7 +53,8 @@ To release a new version (e.g. from `1.0.0` -> `2.0.0`):
 
 * Initial release
 
-[Unreleased]: https://github.com/google-research/spade_anomaly_detection/compare/v0.3.1...HEAD
+[Unreleased]: https://github.com/google-research/spade_anomaly_detection/compare/v0.3.2...HEAD
+[0.3.2]: https://github.com/google-research/spade_anomaly_detection/compare/v0.3.1...v0.3.2
 [0.3.1]: https://github.com/google-research/spade_anomaly_detection/compare/v0.3.0...v0.3.1
 [0.3.0]: https://github.com/google-research/spade_anomaly_detection/compare/v0.2.2...v0.3.0
 [0.2.2]: https://github.com/google-research/spade_anomaly_detection/compare/v0.2.1...v0.2.2
diff --git a/README.md b/README.md
index c165564..3d8dbf8 100644
--- a/README.md
+++ b/README.md
@@ -103,6 +103,10 @@ one class classifier ensemble to label a point as negative. The higher this valu
 
 <span style="color:yellow;background-color:lightgrey">ensemble_count</span>: Integer representing the number of one class classifiers in the ensemble used for pseudo labeling unlabeled data points. The more models in the ensemble, the less likely it is for all the models to gain consensus, and thus will reduce the amount of labeled data points. By default, we use 5 one class classifiers.
 
+<span style="color:yellow;background-color:lightgrey">n_components</span>: Integer representing the number of components to use in the one class classifier ensemble. By default, we use 1 component.
+
+<span style="color:yellow;background-color:lightgrey">covariance_type</span>: String representing the covariance type to use in the one class classifier ensemble. By default, we use 'full' covariance. Note that when there are many components, a 'full' covariance matrix may not be suitable.
+
 <span style="color:yellow;background-color:lightgrey">verbose (boolean)</span>: The amount of console logs to display during training. Use the default value of  False to show fewer messages, and True for displaying many aspects of model training and scoring. This is useful for debugging model performance.
 
 ## Training Job Arguments
diff --git a/spade_anomaly_detection/__init__.py b/spade_anomaly_detection/__init__.py
index f938372..ea08ded 100644
--- a/spade_anomaly_detection/__init__.py
+++ b/spade_anomaly_detection/__init__.py
@@ -31,4 +31,4 @@
 
 # A new PyPI release will be pushed every time `__version__` is increased.
 # When changing this, also update the CHANGELOG.md.
-__version__ = '0.3.1'
+__version__ = '0.3.2'
diff --git a/spade_anomaly_detection/occ_ensemble_test.py b/spade_anomaly_detection/occ_ensemble_test.py
index 0baf11e..4a7b673 100644
--- a/spade_anomaly_detection/occ_ensemble_test.py
+++ b/spade_anomaly_detection/occ_ensemble_test.py
@@ -29,15 +29,15 @@
 
 """Tests for the one class classifier ensemble."""
 
-import numpy as np
 
+from absl.testing import parameterized
+import numpy as np
 from spade_anomaly_detection import data_loader
 from spade_anomaly_detection import occ_ensemble
-
 import tensorflow as tf
 
 
-class OccEnsembleTest(tf.test.TestCase):
+class OccEnsembleTest(tf.test.TestCase, parameterized.TestCase):
 
   def test_ensemble_initialization_no_error(self):
     gmm_ensemble = occ_ensemble.GmmEnsemble(n_components=1, ensemble_count=10)
@@ -47,13 +47,21 @@ def test_ensemble_initialization_no_error(self):
     with self.subTest(name='ObjectAttributes'):
       self.assertEqual(gmm_ensemble.ensemble_count, 10)
 
-  def test_ensemble_training_no_error(self):
+  # Params to test: n_components, ensemble_count, covariance_type.
+  @parameterized.named_parameters(
+      ('components_1_ensemble_10_full', 1, 10, 'full'),
+      ('components_3_ensemble_5_full', 3, 5, 'full'),
+      ('components_3_ensemble_5_tied', 3, 5, 'tied'),
+  )
+  def test_ensemble_training_no_error(
+      self, n_components, ensemble_count, covariance_type
+  ):
     batches_per_occ = 10
-    ensemble_count = 5
-    n_components = 1
 
     ensemble_obj = occ_ensemble.GmmEnsemble(
-        n_components=n_components, ensemble_count=ensemble_count
+        n_components=n_components,
+        ensemble_count=ensemble_count,
+        covariance_type=covariance_type,
     )
 
     tf_dataset = data_loader.load_tf_dataset_from_csv(
diff --git a/spade_anomaly_detection/parameters.py b/spade_anomaly_detection/parameters.py
index 395bff1..ccdaf78 100644
--- a/spade_anomaly_detection/parameters.py
+++ b/spade_anomaly_detection/parameters.py
@@ -146,6 +146,11 @@ class RunnerParameters:
       the less likely it is for all the models to gain consensus, and thus will
       reduce the amount of labeled data points. By default, we use 5 one class
       classifiers.
+    n_components: The number of components to use in the one class classifier
+      ensemble. By default, we use 1 component.
+    covariance_type: The covariance type to use in the one class classifier
+      ensemble. By default, we use 'full' covariance. Note that when there are
+      many components, a 'full' covariance matrix may not be suitable.
     random_seed: The random seed to use for all random number generators in the
       algorithm.
     verbose: The amount of console logs to display during training. Use False to
@@ -177,6 +182,8 @@ class RunnerParameters:
   max_occ_batch_size: int = 50000
   labeling_and_model_training_batch_size: Optional[int] = None
   ensemble_count: int = 5
+  n_components: int = 1
+  covariance_type: str = 'full'
   random_seed: int = _RANDOM_SEED
   verbose: bool = False
 
diff --git a/spade_anomaly_detection/runner.py b/spade_anomaly_detection/runner.py
index ff774da..80656f7 100644
--- a/spade_anomaly_detection/runner.py
+++ b/spade_anomaly_detection/runner.py
@@ -276,6 +276,8 @@ def instantiate_and_fit_ensemble(
     """
 
     ensemble_object = occ_ensemble.GmmEnsemble(
+        n_components=self.runner_parameters.n_components,
+        covariance_type=self.runner_parameters.covariance_type,
         ensemble_count=self.runner_parameters.ensemble_count,
         positive_threshold=self.runner_parameters.positive_threshold,
         negative_threshold=self.runner_parameters.negative_threshold,
diff --git a/spade_anomaly_detection/runner_test.py b/spade_anomaly_detection/runner_test.py
index cb9a506..cab146c 100644
--- a/spade_anomaly_detection/runner_test.py
+++ b/spade_anomaly_detection/runner_test.py
@@ -75,6 +75,8 @@ def setUp(self):
         max_occ_batch_size=50000,
         labeling_and_model_training_batch_size=None,
         ensemble_count=5,
+        n_components=1,
+        covariance_type='full',
         verbose=False,
     )
 
diff --git a/spade_anomaly_detection/scripts/run_cloud_spade_experiment.sh b/spade_anomaly_detection/scripts/run_cloud_spade_experiment.sh
index da83aea..afbe539 100644
--- a/spade_anomaly_detection/scripts/run_cloud_spade_experiment.sh
+++ b/spade_anomaly_detection/scripts/run_cloud_spade_experiment.sh
@@ -48,22 +48,24 @@ TEST_LABEL_COL_NAME=${16:-"y"}
 ALPHA=${17:-"1.0"}
 BATCHES_PER_MODEL=${18:-"1"}
 ENSEMBLE_COUNT=${19:-"5"}
-MAX_OCC_BATCH_SIZE=${20:-"50000"}
-LABELING_AND_MODEL_TRAINING_BATCH_SIZE=${21:-"100000"}
-VERBOSE=${22:-"True"}
-UPLOAD_ONLY=${23:-"False"}
+N_COMPONENTS=${20:-"1"}
+COVARIANCE_TYPE=${21:-"full"}
+MAX_OCC_BATCH_SIZE=${22:-"50000"}
+LABELING_AND_MODEL_TRAINING_BATCH_SIZE=${23:-"100000"}
+VERBOSE=${24:-"True"}
+UPLOAD_ONLY=${25:-"False"}
 
 # Give a unique name to your training job.
 TRIAL_NAME="spade_${USER}_${DATETIME}"
 
 # Image name and location
 IMAGE_NAME="spade"
-IMAGE_TAG=${24:-"latest-oss"}
+IMAGE_TAG=${26:-"latest-oss"}
 # Project image (use this for testing)
 IMAGE_URI="us-docker.pkg.dev/${PROJECT_ID}/spade/${IMAGE_NAME}:${IMAGE_TAG}"
 echo "IMAGE_URI = ${IMAGE_URI}"
 
-BUILD=${25:-"TRUE"}
+BUILD=${27:-"TRUE"}
 
 if [[ "${BUILD}" == "TRUE" ]]; then
   /bin/bash ./scripts/build_and_push_image.sh "${IMAGE_TAG}" "${IMAGE_NAME}" "${PROJECT_ID}" || exit
@@ -97,6 +99,8 @@ gcloud ai custom-jobs create \
   --args=--alpha="${ALPHA}" \
   --args=--batches_per_model="${BATCHES_PER_MODEL}" \
   --args=--ensemble_count="${ENSEMBLE_COUNT}" \
+  --args=--n_components="${N_COMPONENTS}" \
+  --args=--covariance_type="${COVARIANCE_TYPE}" \
   --args=--max_occ_batch_size="${MAX_OCC_BATCH_SIZE}" \
   --args=--labeling_and_model_training_batch_size="${LABELING_AND_MODEL_TRAINING_BATCH_SIZE}" \
   --args=--upload_only="${UPLOAD_ONLY}" \
diff --git a/spade_anomaly_detection/task.py b/spade_anomaly_detection/task.py
index 18e3e05..ca6452a 100644
--- a/spade_anomaly_detection/task.py
+++ b/spade_anomaly_detection/task.py
@@ -301,6 +301,27 @@
     ),
 )
 
+_N_COMPONENTS = flags.DEFINE_integer(
+    "n_components",
+    default=1,
+    lower_bound=1,  # Reject 0/negative counts at flag-parse time.
+    help=(
+        "The number of components to use in the one class classifier ensemble. "
+        "By default, we use 1 component."
+    ),
+)
+
+_COVARIANCE_TYPE = flags.DEFINE_enum(
+    "covariance_type",
+    default="full",
+    enum_values=["full", "tied", "diag", "spherical"],  # Assumes sklearn GMM.
+    help=(
+        "The covariance type to use in the one class classifier ensemble. By "
+        "default, we use 'full' covariance. Note that when there are many "
+        "components, a 'full' covariance matrix may not be suitable."
+    ),
+)
+
 _VERBOSE = flags.DEFINE_bool(
     "verbose",
     default=False,
@@ -367,6 +388,8 @@ def main(argv: Sequence[str]) -> None:
       max_occ_batch_size=_MAX_OCC_BATCH_SIZE.value,
       labeling_and_model_training_batch_size=_BATCH_SIZE.value,
       ensemble_count=_ENSEMBLE_COUNT.value,
+      n_components=_N_COMPONENTS.value,
+      covariance_type=_COVARIANCE_TYPE.value,
       random_seed=_RANDOM_SEED,
       verbose=_VERBOSE.value,
   )