From 41f0758ad1a8ce8b8f9b43e2527bd8516cbf1b12 Mon Sep 17 00:00:00 2001
From: Raj Sinha
Date: Fri, 10 Jan 2025 12:59:59 -0800
Subject: [PATCH] Update to allow label class values to be arbitrary, unique
 strings.

Also allows the unlabeled value to be the empty string. For example:

  positive label value = "positive"
  negative label value = "negative"
  unlabeled label value = ""

PiperOrigin-RevId: 714146904
---
 spade_anomaly_detection/csv_data_loader.py    | 115 ++++++++++++---
 .../csv_data_loader_test.py                   | 132 ++++++++++++++++--
 spade_anomaly_detection/occ_ensemble.py       |   8 +-
 spade_anomaly_detection/performance_test.py   |   8 +-
 spade_anomaly_detection/runner.py             |  29 +++-
 spade_anomaly_detection/runner_test.py        |  85 +++++++----
 6 files changed, 299 insertions(+), 78 deletions(-)

diff --git a/spade_anomaly_detection/csv_data_loader.py b/spade_anomaly_detection/csv_data_loader.py
index 13d2fe7..799955a 100644
--- a/spade_anomaly_detection/csv_data_loader.py
+++ b/spade_anomaly_detection/csv_data_loader.py
@@ -44,11 +44,22 @@
 import tensorflow as tf
 
-# Types are from //cloud/ml/research/data_utils/feature_metadata.py
 _FEATURES_TYPE: Final[str] = 'FLOAT64'
 _SOURCE_LABEL_TYPE: Final[str] = 'STRING'
 _SOURCE_LABEL_DEFAULT_VALUE: Final[str] = '-1'
 _LABEL_TYPE: Final[str] = 'INT64'
+_STRING_TO_INTEGER_LABEL_MAP: dict[str | int, int] = {
+    1: 1,
+    0: 0,
+    -1: -1,
+    '': -1,
+    '-1': -1,
+    '0': 0,
+    '1': 1,
+    'positive': 1,
+    'negative': 0,
+    'unlabeled': -1,
+}
 
 # Setting the shuffle buffer size to 1M seems to be necessary to get the CSV
 # reader to provide a diversity of data to the model.
@@ -167,12 +178,12 @@ def from_inputs_file(
       raise ValueError(
           f'Label column {label_column_name} not found in the header: {header}'
       )
-    num_features = len(all_columns) - 1
     features_types = [_FEATURES_TYPE] * len(all_columns)
     column_names_dict = collections.OrderedDict(
         zip(all_columns, features_types)
     )
     column_names_dict[label_column_name] = _SOURCE_LABEL_DEFAULT_VALUE
+    num_features = len(all_columns) - 1
     return ColumnNamesInfo(
         column_names_dict=column_names_dict,
         header=header,
@@ -216,6 +227,13 @@ def __init__(self, runner_parameters: parameters.RunnerParameters):
         self.runner_parameters.negative_data_value,
         self.runner_parameters.unlabeled_data_value,
     ]
+    # Ensure the runner's label values are present in the map, overwriting
+    # any existing entries for those keys.
+    _STRING_TO_INTEGER_LABEL_MAP[self.runner_parameters.positive_data_value] = 1
+    _STRING_TO_INTEGER_LABEL_MAP[self.runner_parameters.negative_data_value] = 0
+    _STRING_TO_INTEGER_LABEL_MAP[
+        self.runner_parameters.unlabeled_data_value
+    ] = -1
+
     # Construct a label remap from string labels to integers. The table is not
     # necessary for the case when the labels are all integers. But instead of
    # checking if the labels are all integers, we construct the table and use
@@ -286,7 +304,8 @@ def get_inputs_metadata(
     )
     # Get information about the columns.
column_names_info = ColumnNamesInfo.from_inputs_file( - csv_filenames[0], label_column_name + csv_filenames[0], + label_column_name, ) logging.info( 'Obtained metadata for data with CSV prefix %s (number of features=%d)', @@ -360,13 +379,12 @@ def filter_func(features: tf.Tensor, label: tf.Tensor) -> bool: # pylint: disab @classmethod def convert_str_to_int(cls, value: str) -> int: """Converts a string integer label to an integer label.""" - if isinstance(value, str) and value.lstrip('-').isdigit(): - return int(value) - elif isinstance(value, int): - return value + if value in _STRING_TO_INTEGER_LABEL_MAP: + return _STRING_TO_INTEGER_LABEL_MAP[value] else: raise ValueError( - f'Label {value} of type {type(value)} is not a string integer.' + f'Label {value} of type {type(value)} is not a string integer or ' + 'mappable to an integer.' ) @classmethod @@ -374,8 +392,6 @@ def _get_label_remap_table( cls, labels_mapping: dict[str, int] ) -> tf.lookup.StaticHashTable: """Returns a label remap table that converts string labels to integers.""" - # The possible keys are '', '-1, '0', '1'. None is not included because the - # Data Loader will default to '' if the label is None. keys_tensor = tf.constant( list(labels_mapping.keys()), dtype=tf.dtypes.as_dtype(_SOURCE_LABEL_TYPE.lower()), @@ -390,6 +406,14 @@ def _get_label_remap_table( ) return label_remap_table + def remap_label(self, label: str | tf.Tensor) -> int | tf.Tensor: + """Remaps the label to an integer.""" + if isinstance(label, str) or ( + isinstance(label, tf.Tensor) and label.dtype == tf.dtypes.string + ): + return self._label_remap_table.lookup(label) + return label + def load_tf_dataset_from_csv( self, input_path: str, @@ -441,6 +465,7 @@ def load_tf_dataset_from_csv( self._last_read_metadata.column_names_info.column_names_dict.values() ) ] + logging.info('column_defaults: %s', column_defaults) # Construct a single dataset out of multiple CSV files. # TODO(sinharaj): Remove the determinism after testing. @@ -456,7 +481,7 @@ def load_tf_dataset_from_csv( na_value='', header=True, num_epochs=1, - shuffle=True, + shuffle=False, shuffle_buffer_size=_SHUFFLE_BUFFER_SIZE, shuffle_seed=self.runner_parameters.random_seed, prefetch_buffer_size=tf.data.AUTOTUNE, @@ -473,17 +498,9 @@ def load_tf_dataset_from_csv( 'created.' ) - def remap_label(label: str | tf.Tensor) -> int | tf.Tensor: - """Remaps the label to an integer.""" - if isinstance(label, str) or ( - isinstance(label, tf.Tensor) and label.dtype == tf.dtypes.string - ): - return self._label_remap_table.lookup(label) - return label - # The Dataset can have labels of type int or str. Cast them to int. dataset = dataset.map( - lambda features, label: (features, remap_label(label)), + lambda features, label: (features, self.remap_label(label)), num_parallel_calls=tf.data.AUTOTUNE, deterministic=True, ) @@ -535,7 +552,6 @@ def combine_features_dict_into_tensor( self._label_counts = { k: v.numpy() for k, v in self.counts_by_label(dataset).items() } - logging.info('Label counts: %s', self._label_counts) return dataset @@ -554,11 +570,11 @@ def counts_by_label(self, dataset: tf.data.Dataset) -> Dict[int, tf.Tensor]: @tf.function def count_class( - counts: Dict[int, int], # Keys are always strings. + counts: Dict[int, int], batch: Tuple[tf.Tensor, tf.Tensor], ) -> Dict[int, int]: _, labels = batch - # Keys are always strings. 
+      labels = self.remap_label(labels)
       new_counts: Dict[int, int] = counts.copy()
       for i in self.all_labels:
         # This function is called after the Dataset is constructed and the
@@ -582,6 +598,59 @@ def count_class(
     )
     return counts
 
+  def counts_by_original_label(
+      self, dataset: tf.data.Dataset
+  ) -> tuple[dict[int, tf.Tensor], dict[str, tf.Tensor]]:
+    """Counts the number of samples in each label class in the dataset."""
+
+    all_int_labels = [l for l in self.all_labels if isinstance(l, int)]
+    logging.info('all_int_labels: %s', all_int_labels)
+    all_str_labels = [l for l in self.all_labels if isinstance(l, str)]
+    logging.info('all_str_labels: %s', all_str_labels)
+
+    @tf.function
+    def count_original_class(
+        counts: Dict[int | str, int],
+        batch: Tuple[tf.Tensor, tf.Tensor],
+    ) -> Dict[int | str, int]:
+      keys_are_int = all(isinstance(k, int) for k in counts.keys())
+      if keys_are_int:
+        all_labels = all_int_labels
+      else:
+        all_labels = all_str_labels
+      _, labels = batch
+      new_counts: Dict[int | str, int] = counts.copy()
+      for label in all_labels:
+        cc: tf.Tensor = tf.cast(labels == label, tf.int32)
+        if label in list(new_counts.keys()):
+          new_counts[label] += tf.reduce_sum(cc)
+        else:
+          new_counts[label] = tf.reduce_sum(cc)
+      return new_counts
+
+    int_keys_map = {
+        k: v
+        for k, v in _STRING_TO_INTEGER_LABEL_MAP.items()
+        if isinstance(k, int)
+    }
+    initial_int_state = dict((int(label), 0) for label in int_keys_map.keys())
+    if initial_int_state:
+      int_counts = dataset.reduce(
+          initial_state=initial_int_state, reduce_func=count_original_class
+      )
+    else:
+      int_counts = {}
+    str_keys_map = {
+        k: v
+        for k, v in _STRING_TO_INTEGER_LABEL_MAP.items()
+        if isinstance(k, str)
+    }
+    initial_str_state = dict((str(label), 0) for label in str_keys_map.keys())
+    str_counts = dataset.reduce(
+        initial_state=initial_str_state, reduce_func=count_original_class
+    )
+    return int_counts, str_counts
+
   def get_label_thresholds(self) -> Mapping[str, float]:
     """Computes positive and negative thresholds based on label ratios.
 
diff --git a/spade_anomaly_detection/csv_data_loader_test.py b/spade_anomaly_detection/csv_data_loader_test.py
index 575dcdf..aad02b4 100644
--- a/spade_anomaly_detection/csv_data_loader_test.py
+++ b/spade_anomaly_detection/csv_data_loader_test.py
@@ -118,9 +118,39 @@ def setUp(self):
         [0.6, 0.7, ""],
         [0.6, 0.3, None],
     ]
+    self.dir = "dir1/"
     self.data1_df = pd.DataFrame(data=self.data1, columns=self.header)
-    self.csv_file1 = "/dir1/data1.csv"
+    self.csv_file1 = f"{self.dir}data1.csv"
     self.csv_file1_content = self.data1_df.to_csv(header=True, index=False)
+    # Construct equivalent data with int labels.
+    self.data1_int_df = self.data1_df.copy(deep=True)
+    self.data1_int_df["y"] = self.data1_int_df["y"].replace(
+        {"1": 1, "0": 0, "-1": -1, "": -1}
+    )
+    self.csv_file1_int = f"{self.dir}data1_int.csv"
+    self.csv_file1_int_content = self.data1_int_df.to_csv(
+        header=True, index=False
+    )
+    # Construct equivalent data with 'positive' and 'negative' labels.
+    self.data1_posneg_df = self.data1_df.copy(deep=True)
+    self.data1_posneg_df["y"] = self.data1_posneg_df["y"].replace(
+        {"1": "positive", "0": "negative", "-1": "unlabeled"}
+    )
+    self.csv_file1_posneg = f"{self.dir}data1_posneg.csv"
+    self.csv_file1_posneg_content = self.data1_posneg_df.to_csv(
+        header=True, index=False
+    )
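
[The fixtures above lean on pandas' DataFrame.replace to relabel the same
underlying data. A minimal self-contained sketch of the pattern, with toy
values rather than the fixtures themselves:]

    import pandas as pd

    df = pd.DataFrame({"x1": [0.1, 0.2, 0.3], "y": ["1", "0", "-1"]})
    # Remap the canonical string labels onto an arbitrary vocabulary; after
    # this patch the empty string is a legal "unlabeled" value.
    df["y"] = df["y"].replace({"1": "positive", "0": "negative", "-1": ""})
    print(df["y"].tolist())  # ['positive', 'negative', '']

[Any vocabulary works, as long as the three class values stay distinct.]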
+    # Construct equivalent data with 'positive', 'negative' and blank labels.
+    self.data1_posneg_blank_label_df = self.data1_df.copy(deep=True)
+    self.data1_posneg_blank_label_df["y"] = self.data1_posneg_blank_label_df[
+        "y"
+    ].replace({"1": "positive", "0": "negative", "-1": ""})
+    self.csv_file1_posneg_blank_label = (
+        f"{self.dir}data1_posneg_blank_label.csv"
+    )
+    self.csv_file1_posneg_blank_label_content = (
+        self.data1_posneg_blank_label_df.to_csv(header=True, index=False)
+    )
 
   # Params to test: gcs_uri.
   @parameterized.named_parameters(
@@ -189,18 +219,45 @@ def test_get_header_from_input_file_returns_header(self):
     expected_header = "x1,x2,y\n"
     self.assertEqual(header, expected_header)
 
-  def test_column_names_info_from_inputs_file_returns_column_names_info(self):
+  @parameterized.named_parameters(
+      ("labels_are_ints", 1),
+      ("labels_are_strings", 2),
+      ("labels_are_posneg_strings", 3),
+      ("labels_are_posneg_strings_with_empty_label", 4),
+  )
+  def test_column_names_info_from_inputs_file_returns_column_names_info(
+      self,
+      test_case_number,
+  ):
+    if test_case_number == 1:
+      csv_file = self.csv_file1_int
+      csv_file_content = self.csv_file1_int_content
+    elif test_case_number == 2:
+      csv_file = self.csv_file1
+      csv_file_content = self.csv_file1_content
+    elif test_case_number == 3:
+      csv_file = self.csv_file1_posneg
+      csv_file_content = self.csv_file1_posneg_content
+    elif test_case_number == 4:
+      csv_file = self.csv_file1_posneg_blank_label
+      csv_file_content = self.csv_file1_posneg_blank_label_content
+    else:
+      raise ValueError(f"Invalid test case number: {test_case_number}")
+
     with tfds.testing.MockFs() as fs:
-      fs.add_file(f"{self.csv_file1}", self.csv_file1_content)
+      fs.add_file(csv_file, csv_file_content)
       column_names_info = csv_data_loader.ColumnNamesInfo.from_inputs_file(
-          inputs_file=self.csv_file1, label_column_name="y"
+          inputs_file=csv_file,
+          label_column_name="y",
       )
       expected_column_names_info = csv_data_loader.ColumnNamesInfo(
           header="x1,x2,y",
           label_column_name="y",
-          column_names_dict=collections.OrderedDict(
-              [("x1", "FLOAT64"), ("x2", "FLOAT64"), ("y", "-1")]
-          ),
+          column_names_dict=collections.OrderedDict([
+              ("x1", "FLOAT64"),
+              ("x2", "FLOAT64"),
+              ("y", csv_data_loader._SOURCE_LABEL_DEFAULT_VALUE),
+          ]),
           num_features=2,
       )
       self.assertEqual(column_names_info, expected_column_names_info)
@@ -221,6 +278,16 @@ def setUp(self):
     self.data1_df = pd.DataFrame(data=self.data1, columns=self.header)
     self.csv_file1 = f"{self.dir}data1.csv"
     self.csv_file1_content = self.data1_df.to_csv(header=True, index=False)
+    # Construct equivalent data with 'positive' and 'negative' labels.
+    self.data1_posneg_df = self.data1_df.copy(deep=True)
+    self.data1_posneg_df["y"] = self.data1_posneg_df["y"].replace(
+        {"1": "positive", "0": "negative", "-1": "unlabeled"}
+    )
+    self.csv_file1_posneg = f"{self.dir}data1_posneg.csv"
+    self.csv_file1_posneg_content = self.data1_posneg_df.to_csv(
+        header=True, index=False
+    )
+
     self.data2 = [
         [0.6, 0.7, "1"],
         [0.6, 0.3, "0"],
     self.data2_df = pd.DataFrame(data=self.data2, columns=self.header)
     self.csv_file2 = f"{self.dir}data2.csv"
     self.csv_file2_content = self.data2_df.to_csv(header=True, index=False)
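
[Downstream, the loader converts these strings with the
tf.lookup.StaticHashTable built by _get_label_remap_table. A standalone
sketch of that lookup pattern; the key/value sets and the -99 default are
illustrative, not the loader's exact table:]

    import tensorflow as tf

    keys = tf.constant(["", "-1", "0", "1", "positive", "negative", "unlabeled"])
    values = tf.constant([-1, -1, 0, 1, 1, 0, -1], dtype=tf.int64)
    # Unknown labels fall through to the default value rather than raising.
    table = tf.lookup.StaticHashTable(
        tf.lookup.KeyValueTensorInitializer(keys, values), default_value=-99
    )
    print(table.lookup(tf.constant(["positive", "", "0"])).numpy())  # [ 1 -1  0]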
+    # Construct equivalent data with 'positive' and 'negative' labels.
+    self.data2_posneg_df = self.data2_df.copy(deep=True)
+    self.data2_posneg_df["y"] = self.data2_posneg_df["y"].replace(
+        {"1": "positive", "0": "negative", "-1": "unlabeled"}
+    )
+    self.csv_file2_posneg = f"{self.dir}data2_posneg.csv"
+    self.csv_file2_posneg_content = self.data2_posneg_df.to_csv(
+        header=True, index=False
+    )
+
     self.data_df = pd.concat([self.data1_df, self.data2_df])
     # self.data_df = self.data_df.astype({"y": "str"})
 
+  def test_illegal_label_value_raises(self):
+    with self.assertRaises(ValueError):
+      csv_data_loader.CsvDataLoader.convert_str_to_int("1.0")
+    with self.assertRaises(ValueError):
+      csv_data_loader.CsvDataLoader.convert_str_to_int("this_is_positive")
+    with self.assertRaises(ValueError):
+      csv_data_loader.CsvDataLoader.convert_str_to_int(" ")  # with space.
+
   def test_get_label_remap_table(self):
     label_mapping = {"-1": -1, "0": 0, "1": 1, "": -1}
     remap_table = csv_data_loader.CsvDataLoader._get_label_remap_table(
@@ -341,6 +426,13 @@ def test_counts_by_label_returns_expected_counts(self):
 
   # Test the creation of a Dataset from CSV files. Only tests batch_size=1.
   @parameterized.named_parameters(
+      (
+          "labels_are_posneg_strings",
+          "positive",
+          "negative",
+          "unlabeled",
+          True,
+      ),
       (
           "labels_are_strings",
           "1",
@@ -374,20 +466,30 @@ def test_load_tf_dataset_from_csv_returns_expected_dataset(
       input_path = os.path.join(tmp_dir.full_path, self.dir)
       tf.io.gfile.makedirs(input_path)
       mock_parse_gcs_uri.return_value = ("doesnt_matter", input_path, "")
-      mock_file_reader.return_value = [
-          os.path.join(tmp_dir.full_path, self.csv_file1),
-          os.path.join(tmp_dir.full_path, self.csv_file2),
-      ]
       # Write the test CSV files to temporary files. These CSV files will be
       # re-read when the Dataset is created. Their metadata will also be recorded
       # in the InputFilesMetadata object.
-      self.data1_df.to_csv(
-          os.path.join(tmp_dir.full_path, self.csv_file1),
+      if positive_data_value == "positive":
+        self.test_data1_df = self.data1_posneg_df
+        self.test_csv_file1 = self.csv_file1_posneg
+        self.test_data2_df = self.data2_posneg_df
+        self.test_csv_file2 = self.csv_file2_posneg
+      else:
+        self.test_data1_df = self.data1_df
+        self.test_csv_file1 = self.csv_file1
+        self.test_data2_df = self.data2_df
+        self.test_csv_file2 = self.csv_file2
+      mock_file_reader.return_value = [
+          os.path.join(tmp_dir.full_path, self.test_csv_file1),
+          os.path.join(tmp_dir.full_path, self.test_csv_file2),
+      ]
+      self.test_data1_df.to_csv(
+          os.path.join(tmp_dir.full_path, self.test_csv_file1),
           header=True,
           index=False,
       )
-      self.data2_df.to_csv(
-          os.path.join(tmp_dir.full_path, self.csv_file2),
+      self.test_data2_df.to_csv(
+          os.path.join(tmp_dir.full_path, self.test_csv_file2),
           header=True,
           index=False,
       )
diff --git a/spade_anomaly_detection/occ_ensemble.py b/spade_anomaly_detection/occ_ensemble.py
index d01c46b..76a5309 100644
--- a/spade_anomaly_detection/occ_ensemble.py
+++ b/spade_anomaly_detection/occ_ensemble.py
@@ -41,7 +41,7 @@
 
 _RANDOM_SEED: Final[int] = 42
-
+_SHUFFLE_BUFFER_SIZE: Final[int] = 10_000
 _LABEL_TYPE: Final[str] = 'INT64'
 
@@ -169,8 +169,11 @@ def is_batched(self, dataset: tf.data.Dataset) -> bool:
         or isinstance(dataset.element_spec[0].shape[0], int)
     )
 
+  # Fit the GMMs, with negative samples included in every batch.
   def fit(
-      self, train_x: tf.data.Dataset, batches_per_occ: int
+      self,
+      train_x: tf.data.Dataset,
+      batches_per_occ: int,
   ) -> Sequence[mixture.GaussianMixture]:
     """Creates and fits an ensemble of one-class classifiers.
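
[For orientation, fit() returns a Sequence[mixture.GaussianMixture], one model
per ensemble member. A minimal sketch of that ensemble shape on synthetic
NumPy data; shard counts and GMM settings are illustrative, and the real
method instead draws batches from the tf.data.Dataset and, per the comment
above, keeps negative samples in every batch:]

    import numpy as np
    from sklearn import mixture

    rng = np.random.default_rng(42)
    train_x = rng.normal(size=(600, 4))  # stand-in for batches of features

    ensemble = []
    for shard in np.array_split(train_x, 5):  # one shard per ensemble member
      gmm = mixture.GaussianMixture(n_components=2, random_state=42)
      ensemble.append(gmm.fit(shard))

    # Average log-likelihood across members; low scores suggest anomalies.
    scores = np.mean([g.score_samples(train_x) for g in ensemble], axis=0)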
@@ -409,5 +412,4 @@ def pseudo_label( len(new_positive_indices)) logging.info('Number of new negative labels: %s', len(new_negative_indices)) - return new_features, new_labels, weights, pseudolabel_flags diff --git a/spade_anomaly_detection/performance_test.py b/spade_anomaly_detection/performance_test.py index c0c92c5..8934e95 100644 --- a/spade_anomaly_detection/performance_test.py +++ b/spade_anomaly_detection/performance_test.py @@ -402,8 +402,8 @@ def setUp(self): ) @parameterized.named_parameters([ - ('labels_are_ints', False, 1, 0, -1), - ('labels_are_strings', True, '1', '0', '-1'), + ('labels_are_strings_false', False, 1, 0, -1), + ('labels_are_strings_true', True, '1', '0', '-1'), ]) def test_spade_auc_performance_pnu_single_batch( self, @@ -450,8 +450,8 @@ def test_spade_auc_performance_pnu_single_batch( self.assertAlmostEqual(auc, 0.9755, delta=0.02) @parameterized.named_parameters([ - ('labels_are_ints', False, 1, 0, -1), - ('labels_are_strings', True, '1', '0', '-1'), + ('labels_are_strings_false', False, 1, 0, -1), + ('labels_are_strings_true', True, '1', '0', '-1'), ]) def test_spade_auc_performance_pu_single_batch( self, diff --git a/spade_anomaly_detection/runner.py b/spade_anomaly_detection/runner.py index 798cccb..e7e84bf 100644 --- a/spade_anomaly_detection/runner.py +++ b/spade_anomaly_detection/runner.py @@ -129,7 +129,9 @@ def __init__(self, runner_parameters: parameters.RunnerParameters): if not self.runner_parameters.upload_only: self.supervised_model_metrics: Optional[dict[str, float]] = None - supervised_model_parameters = supervised_model.RandomForestParameters() + supervised_model_parameters = supervised_model.RandomForestParameters( + random_seed=self.runner_parameters.random_seed + ) self.supervised_model_object = supervised_model.RandomForestModel( supervised_model_parameters ) @@ -189,6 +191,7 @@ def _get_table_statistics(self) -> Mapping[str, float]: batch_size=1, ) input_table_statistics = stats_data_loader.get_label_thresholds() + logging.info('Input table statistics: %s', input_table_statistics) return input_table_statistics def _get_record_count_based_on_labels(self, label_value: int | str) -> int: @@ -581,6 +584,7 @@ def _get_test_data(self) -> tf.data.Dataset: test_tf_dataset = test_tf_dataset.batch( tf.cast(test_dataset_size, tf.int64) ) + logging.info('Test dataset size: %s', test_dataset_size) test_tf_dataset = test_tf_dataset.prefetch(tf.data.AUTOTUNE) return test_tf_dataset @@ -709,6 +713,10 @@ def preprocess_train_test_split( self.test_y[self.test_y == self.int_positive_data_value] = 1 self.test_y[self.test_y == self.int_negative_data_value] = 0 + if self.test_x is not None: + logging.info('Test x shape: %s', self.test_x.shape) + if self.test_y is not None: + logging.info('Test y shape: %s', self.test_y.shape) return (train_x, train_y) def train_supervised_model( @@ -770,7 +778,21 @@ def run(self) -> None: # again. Find a way to get the label counts without reading the files. # Assumes that data loader has already been used to read the input table. 
total_record_count = sum(train_label_counts.values()) - logging.info('Label counts before training: %s', train_label_counts) + if ( + self.runner_parameters.labeling_and_model_training_batch_size + and self.runner_parameters.labeling_and_model_training_batch_size + > total_record_count + ): + self.runner_parameters.labeling_and_model_training_batch_size = ( + total_record_count + ) + logging.info( + 'Labeling and model training batch size is reduced to %s', + self.runner_parameters.labeling_and_model_training_batch_size, + ) + logging.info( + 'Label counts before supervised training: %s', train_label_counts + ) logging.info('Total record count: %s', total_record_count) unlabeled_record_count = self._get_record_count_based_on_labels( @@ -816,6 +838,7 @@ def run(self) -> None: ) tf_dataset = tf_dataset.as_numpy_iterator() + unique_labels = set() for batch_number, (features, labels) in enumerate(tf_dataset): logging.info( 'Labeling and supervised model training batch number: %s', @@ -843,6 +866,7 @@ def run(self) -> None: verbose=self.runner_parameters.verbose, ) ) + unique_labels.update(set(updated_labels.ravel().tolist())) logging.info('Labeling completed.') # Upload batch of pseudo labels, will append when called more than once. @@ -892,6 +916,7 @@ def run(self) -> None: weights=weights, ) # End of pseudolabeling and supervised model training loop. + logging.info('Unique labels after pseudolabeling: %s', unique_labels) if not self.runner_parameters.upload_only: self.evaluate_model() diff --git a/spade_anomaly_detection/runner_test.py b/spade_anomaly_detection/runner_test.py index 052b9d9..3662919 100644 --- a/spade_anomaly_detection/runner_test.py +++ b/spade_anomaly_detection/runner_test.py @@ -56,9 +56,9 @@ def setUp(self): data_input_gcs_uri=None, output_gcs_uri='gs://test_bucket/test_folder', label_col_name='label', - positive_data_value=5, - negative_data_value=3, - unlabeled_data_value=-100, + positive_data_value=1, + negative_data_value=0, + unlabeled_data_value=-1, labels_are_strings=False, positive_threshold=5, negative_threshold=95, @@ -416,7 +416,7 @@ def test_supervised_model_evaluation_no_error(self): with self.subTest(name='FeaturesNotNull'): self.assertIsNotNone(evaluate_arguments['x']) - def test_proprocessing_inputs_supervised_model_train(self): + def test_preprocessing_inputs_supervised_model_train(self): runner_object = runner.Runner(self.runner_parameters) runner_object.run() @@ -864,9 +864,9 @@ def setUp(self): data_input_gcs_uri='gs://some_bucket/input_folder', output_gcs_uri='gs://test_bucket/test_folder', label_col_name='label', - positive_data_value=5, - negative_data_value=3, - unlabeled_data_value=-100, + positive_data_value=1, + negative_data_value=0, + unlabeled_data_value=-1, labels_are_strings=False, positive_threshold=5, negative_threshold=95, @@ -1033,8 +1033,8 @@ def _create_mock_datasets(self) -> None: self.mock_label_counts.return_value = self.label_counts @parameterized.named_parameters([ - ('labels_are_strings_false', False, 5, 3, -100), - ('labels_are_strings_true', True, '5', '3', '-100'), + ('labels_are_strings_false', False, 1, 0, -1), + ('labels_are_strings_true', True, '1', '0', '-1'), ]) def test_csv_runner_csv_data_loader_no_error( self, @@ -1087,8 +1087,8 @@ def test_csv_runner_csv_data_loader_no_error( ) @parameterized.named_parameters([ - ('labels_are_strings_false', False, 5, 3, -100), - ('labels_are_strings_true', True, '5', '3', '-100'), + ('labels_are_strings_false', False, 1, 0, -1), + ('labels_are_strings_true', True, '1', '0', '-1'), 
]) def test_csv_runner_supervised_model_fit( self, @@ -1124,8 +1124,8 @@ def test_csv_runner_supervised_model_fit( ) @parameterized.named_parameters([ - ('labels_are_strings_false', False, 5, 3, -100), - ('labels_are_strings_true', True, '5', '3', '-100'), + ('labels_are_strings_false', False, 1, 0, -1), + ('labels_are_strings_true', True, '1', '0', '-1'), ]) def test_csv_supervised_model_evaluation_no_error( self, @@ -1153,10 +1153,10 @@ def test_csv_supervised_model_evaluation_no_error( self.assertIsNotNone(evaluate_arguments['x']) @parameterized.named_parameters([ - ('labels_are_strings_false', False, 5, 3, -100), - ('labels_are_strings_true', True, '5', '3', '-100'), + ('labels_are_strings_false', False, 1, 0, -1), + ('labels_are_strings_true', True, '1', '0', '-1'), ]) - def test_csv_proprocessing_inputs_supervised_model_train( + def test_csv_preprocessing_inputs_supervised_model_train( self, labels_are_strings: bool, positive_data_value: str | int, @@ -1179,8 +1179,8 @@ def test_csv_proprocessing_inputs_supervised_model_train( ) @parameterized.named_parameters([ - ('labels_are_strings_false', False, 5, 3, -100), - ('labels_are_strings_true', True, '5', '3', '-100'), + ('labels_are_strings_false', False, 1, 0, -1), + ('labels_are_strings_true', True, '1', '0', '-1'), ]) def test_csv_upload_only_setting_true_no_error( self, @@ -1210,8 +1210,8 @@ def test_csv_upload_only_setting_true_no_error( self.mock_csv_upload.assert_called_once() @parameterized.named_parameters([ - ('labels_are_strings_false', False, 5, 3, -100), - ('labels_are_strings_true', True, '5', '3', '-100'), + ('labels_are_strings_false', False, 1, 0, -1), + ('labels_are_strings_true', True, '1', '0', '-1'), ]) def test_csv_upload_only_setting_true_throw_error_no_gcs_uri( self, @@ -1237,8 +1237,8 @@ def test_csv_upload_only_setting_true_throw_error_no_gcs_uri( runner_object.run() @parameterized.named_parameters([ - ('labels_are_strings_false', False, 5, 3, -100), - ('labels_are_strings_true', True, '5', '3', '-100'), + ('labels_are_strings_false', False, 1, 0, -1), + ('labels_are_strings_true', True, '1', '0', '-1'), ]) def test_csv_upload_only_false_no_error( self, @@ -1267,11 +1267,23 @@ def test_csv_upload_only_false_no_error( with self.subTest('CSVUploadCalled'): self.mock_csv_upload.assert_called_once() - def test_csv_runner_supervised_model_fit_with_csv_data_int_labels(self): - self.runner_parameters.labels_are_strings = False - self.runner_parameters.positive_data_value = 5 - self.runner_parameters.negative_data_value = 3 - self.runner_parameters.unlabeled_data_value = -100 + @parameterized.named_parameters([ + ('labels_are_strings_false_int_labels_1_0_minus1', False, 1, 0, -1), + ('labels_are_strings_true_str_labels_1_0_minus1', True, '1', '0', '-1'), + ('labels_are_strings_false_int_labels_6_7_minus1', False, 6, 7, -1), + ('labels_are_strings_true_str_labels_6_7_minus1', True, '6', '7', '-1'), + ]) + def test_csv_runner_supervised_model_fit_with_csv_data_int_labels( + self, + labels_are_strings: bool, + positive_data_value: str | int, + negative_data_value: str | int, + unlabeled_data_value: str | int, + ): + self.runner_parameters.labels_are_strings = labels_are_strings + self.runner_parameters.positive_data_value = positive_data_value + self.runner_parameters.negative_data_value = negative_data_value + self.runner_parameters.unlabeled_data_value = unlabeled_data_value self.runner_parameters.alpha = 0.8 self.runner_parameters.negative_threshold = 0 @@ -1297,11 +1309,22 @@ def 
test_csv_runner_supervised_model_fit_with_csv_data_int_labels(self): with self.subTest('CSVUploadNotCalled'): self.mock_csv_upload.assert_not_called() - def test_csv_runner_supervised_model_fit_with_csv_data_string_labels(self): + @parameterized.named_parameters([ + ('string_labels_1', 'positive', 'negative', ''), + ('string_labels_2', 'positive', 'negative', 'unlabeled'), + ('string_labels_3', '+ve', '-ve', ''), + ('string_labels_4', '+ve', '-ve', 'none'), + ]) + def test_csv_runner_supervised_model_fit_with_csv_data_string_labels( + self, + positive_data_value: str, + negative_data_value: str, + unlabeled_data_value: str, + ): self.runner_parameters.labels_are_strings = True - self.runner_parameters.positive_data_value = '5' - self.runner_parameters.negative_data_value = '3' - self.runner_parameters.unlabeled_data_value = '-100' + self.runner_parameters.positive_data_value = positive_data_value + self.runner_parameters.negative_data_value = negative_data_value + self.runner_parameters.unlabeled_data_value = unlabeled_data_value self.runner_parameters.alpha = 0.8 self.runner_parameters.negative_threshold = 0
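
[End to end, the new test matrix above boils down to one contract: the three
label values may be any distinct strings, with '' allowed as the unlabeled
marker. A hedged sketch of the corresponding settings, using only field names
exercised in these tests; the remaining required RunnerParameters fields
(paths, thresholds, etc.) are elided:]

    # These keyword values would be passed to parameters.RunnerParameters
    # along with the other required fields shown in the tests above.
    string_label_config = dict(
        label_col_name='label',
        positive_data_value='+ve',
        negative_data_value='-ve',
        unlabeled_data_value='',  # the empty string is now a valid value
        labels_are_strings=True,
    )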