Skip to content

Commit

Permalink
Expose the n_components (Number of components) and `covariance_type…
Browse files Browse the repository at this point in the history
…` (type of covariance between components) parameters of the one-class classifier.

PiperOrigin-RevId: 652839931
  • Loading branch information
raj-sinha committed Jul 17, 2024
1 parent bba6181 commit 13bf3c8
Show file tree
Hide file tree
Showing 9 changed files with 70 additions and 15 deletions.
7 changes: 6 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,10 @@ To release a new version (e.g. from `1.0.0` -> `2.0.0`):

## [Unreleased]

## [0.3.2] - 2024-07-16

* Exposes the `n_components` and `covariance_type` parameters of the one-class classifier.

## [0.3.1] - 2024-07-13

* Now writes out the pseudolabel weights and a flag that indicates whether a sample has a ground truth label (0) or a pseudolabel (1).
Expand All @@ -49,7 +53,8 @@ To release a new version (e.g. from `1.0.0` -> `2.0.0`):

* Initial release

[Unreleased]: https://github.com/google-research/spade_anomaly_detection/compare/v0.3.1...HEAD
[Unreleased]: https://github.com/google-research/spade_anomaly_detection/compare/v0.3.2...HEAD
[0.3.2]: https://github.com/google-research/spade_anomaly_detection/compare/v0.3.1...v0.3.2
[0.3.1]: https://github.com/google-research/spade_anomaly_detection/compare/v0.3.0...v0.3.1
[0.3.0]: https://github.com/google-research/spade_anomaly_detection/compare/v0.2.2...v0.3.0
[0.2.2]: https://github.com/google-research/spade_anomaly_detection/compare/v0.2.1...v0.2.2
Expand Down
4 changes: 4 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -103,6 +103,10 @@ one class classifier ensemble to label a point as negative. The higher this valu

<span style="color:yellow;background-color:lightgrey">ensemble_count</span>: Integer representing the number of one class classifiers in the ensemble used for pseudo labeling unlabeled data points. The more models in the ensemble, the less likely it is that all of them reach consensus, which reduces the number of pseudo-labeled data points. By default, we use 5 one class classifiers.

<span style="color:yellow;background-color:lightgrey">n_components</span>: Integer representing the number of components to use in the one class classifier ensemble. By default, we use 1 component.

<span style="color:yellow;background-color:lightgrey">covariance_type</span>: String representing the covariance type to use in the one class classifier ensemble. By default, we use 'full' covariance. Note that when there are many components, a 'full' covariance matrix may not be suitable.

<span style="color:yellow;background-color:lightgrey">verbose (boolean)</span>: The amount of console logs to display during training. Use the default value of False to show fewer messages, and True for displaying many aspects of model training and scoring. This is useful for debugging model performance.

## Training Job Arguments
Expand Down
2 changes: 1 addition & 1 deletion spade_anomaly_detection/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,4 +31,4 @@

# A new PyPI release will be pushed every time `__version__` is increased.
# When changing this, also update the CHANGELOG.md.
__version__ = '0.3.1'
__version__ = '0.3.2'
22 changes: 15 additions & 7 deletions spade_anomaly_detection/occ_ensemble_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,15 +29,15 @@

"""Tests for the one class classifier ensemble."""

import numpy as np

from absl.testing import parameterized
import numpy as np
from spade_anomaly_detection import data_loader
from spade_anomaly_detection import occ_ensemble

import tensorflow as tf


class OccEnsembleTest(tf.test.TestCase):
class OccEnsembleTest(tf.test.TestCase, parameterized.TestCase):

def test_ensemble_initialization_no_error(self):
gmm_ensemble = occ_ensemble.GmmEnsemble(n_components=1, ensemble_count=10)
Expand All @@ -47,13 +47,21 @@ def test_ensemble_initialization_no_error(self):
with self.subTest(name='ObjectAttributes'):
self.assertEqual(gmm_ensemble.ensemble_count, 10)

def test_ensemble_training_no_error(self):
# Params to test: n_components, ensemble_count, covariance_type.
@parameterized.named_parameters(
('components_1_ensemble_10_full', 1, 10, 'full'),
('components_3_ensemble_5_full', 1, 5, 'full'),
('components_3_ensemble_5_tied', 1, 5, 'tied'),
)
def test_ensemble_training_no_error(
self, n_components, ensemble_count, covariance_type
):
batches_per_occ = 10
ensemble_count = 5
n_components = 1

ensemble_obj = occ_ensemble.GmmEnsemble(
n_components=n_components, ensemble_count=ensemble_count
n_components=n_components,
ensemble_count=ensemble_count,
covariance_type=covariance_type,
)

tf_dataset = data_loader.load_tf_dataset_from_csv(
Expand Down
7 changes: 7 additions & 0 deletions spade_anomaly_detection/parameters.py
Original file line number Diff line number Diff line change
Expand Up @@ -146,6 +146,11 @@ class RunnerParameters:
the less likely it is for all the models to gain consensus, and thus will
reduce the amount of labeled data points. By default, we use 5 one class
classifiers.
n_components: The number of components to use in the one class classifier
ensemble. By default, we use 1 component.
covariance_type: The covariance type to use in the one class classifier
ensemble. By default, we use 'full' covariance. Note that when there are
many components, a 'full' covariance matrix may not be suitable.
random_seed: The random seed to use for all random number generators in the
algorithm.
verbose: The amount of console logs to display during training. Use False to
Expand Down Expand Up @@ -177,6 +182,8 @@ class RunnerParameters:
max_occ_batch_size: int = 50000
labeling_and_model_training_batch_size: Optional[int] = None
ensemble_count: int = 5
n_components: int = 1
covariance_type: str = 'full'
random_seed: int = _RANDOM_SEED
verbose: bool = False

Expand Down
2 changes: 2 additions & 0 deletions spade_anomaly_detection/runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -276,6 +276,8 @@ def instantiate_and_fit_ensemble(
"""

ensemble_object = occ_ensemble.GmmEnsemble(
n_components=self.runner_parameters.n_components,
covariance_type=self.runner_parameters.covariance_type,
ensemble_count=self.runner_parameters.ensemble_count,
positive_threshold=self.runner_parameters.positive_threshold,
negative_threshold=self.runner_parameters.negative_threshold,
Expand Down
2 changes: 2 additions & 0 deletions spade_anomaly_detection/runner_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,8 @@ def setUp(self):
max_occ_batch_size=50000,
labeling_and_model_training_batch_size=None,
ensemble_count=5,
n_components=1,
covariance_type='full',
verbose=False,
)

Expand Down
16 changes: 10 additions & 6 deletions spade_anomaly_detection/scripts/run_cloud_spade_experiment.sh
Original file line number Diff line number Diff line change
Expand Up @@ -48,22 +48,24 @@ TEST_LABEL_COL_NAME=${16:-"y"}
ALPHA=${17:-"1.0"}
BATCHES_PER_MODEL=${18:-"1"}
ENSEMBLE_COUNT=${19:-"5"}
MAX_OCC_BATCH_SIZE=${20:-"50000"}
LABELING_AND_MODEL_TRAINING_BATCH_SIZE=${21:-"100000"}
VERBOSE=${22:-"True"}
UPLOAD_ONLY=${23:-"False"}
N_COMPONENTS=${20:-"1"}
COVARIANCE_TYPE=${21:-"full"}
MAX_OCC_BATCH_SIZE=${22:-"50000"}
LABELING_AND_MODEL_TRAINING_BATCH_SIZE=${23:-"100000"}
VERBOSE=${24:-"True"}
UPLOAD_ONLY=${25:-"False"}

# Give a unique name to your training job.
TRIAL_NAME="spade_${USER}_${DATETIME}"

# Image name and location
IMAGE_NAME="spade"
IMAGE_TAG=${24:-"latest-oss"}
IMAGE_TAG=${26:-"latest-oss"}
# Project image (use this for testing)
IMAGE_URI="us-docker.pkg.dev/${PROJECT_ID}/spade/${IMAGE_NAME}:${IMAGE_TAG}"
echo "IMAGE_URI = ${IMAGE_URI}"

BUILD=${25:-"TRUE"}
BUILD=${27:-"TRUE"}

if [[ "${BUILD}" == "TRUE" ]]; then
/bin/bash ./scripts/build_and_push_image.sh "${IMAGE_TAG}" "${IMAGE_NAME}" "${PROJECT_ID}" || exit
Expand Down Expand Up @@ -97,6 +99,8 @@ gcloud ai custom-jobs create \
--args=--alpha="${ALPHA}" \
--args=--batches_per_model="${BATCHES_PER_MODEL}" \
--args=--ensemble_count="${ENSEMBLE_COUNT}" \
--args=--n_components="${N_COMPONENTS}" \
--args=--covariance_type="${COVARIANCE_TYPE}" \
--args=--max_occ_batch_size="${MAX_OCC_BATCH_SIZE}" \
--args=--labeling_and_model_training_batch_size="${LABELING_AND_MODEL_TRAINING_BATCH_SIZE}" \
--args=--upload_only="${UPLOAD_ONLY}" \
Expand Down
23 changes: 23 additions & 0 deletions spade_anomaly_detection/task.py
Original file line number Diff line number Diff line change
Expand Up @@ -301,6 +301,27 @@
),
)

# Command-line flag for the number of components used by the one class
# classifier ensemble. `lower_bound=1` makes absl reject zero or negative
# values while parsing the command line, so an invalid count fails fast with
# a clear flag error instead of surfacing later during model fitting.
_N_COMPONENTS = flags.DEFINE_integer(
    "n_components",
    default=1,
    required=False,
    lower_bound=1,
    help=(
        "The number of components to use in the one class classifier ensemble. "
        "By default, we use 1 component."
    ),
)

# Command-line flag for the covariance type used by the one class classifier
# ensemble. Declared as an enum so a misspelled value is rejected at
# flag-parsing time rather than after data loading has started.
# NOTE(review): the allowed values mirror scikit-learn's GaussianMixture
# `covariance_type` options; confirm that GmmEnsemble forwards this value
# unchanged and supports all four.
_COVARIANCE_TYPE = flags.DEFINE_enum(
    "covariance_type",
    default="full",
    enum_values=["full", "tied", "diag", "spherical"],
    required=False,
    help=(
        "The covariance type to use in the one class classifier ensemble. By "
        "default, we use 'full' covariance. Note that when there are many "
        "components, a 'full' covariance matrix may not be suitable."
    ),
)

_VERBOSE = flags.DEFINE_bool(
"verbose",
default=False,
Expand Down Expand Up @@ -367,6 +388,8 @@ def main(argv: Sequence[str]) -> None:
max_occ_batch_size=_MAX_OCC_BATCH_SIZE.value,
labeling_and_model_training_batch_size=_BATCH_SIZE.value,
ensemble_count=_ENSEMBLE_COUNT.value,
n_components=_N_COMPONENTS.value,
covariance_type=_COVARIANCE_TYPE.value,
random_seed=_RANDOM_SEED,
verbose=_VERBOSE.value,
)
Expand Down

0 comments on commit 13bf3c8

Please sign in to comment.