From 824469ed70ab598acb2b015e9744136754d8f25b Mon Sep 17 00:00:00 2001 From: Maciej Nasinski Date: Thu, 26 Oct 2023 21:53:58 +0200 Subject: [PATCH] ml run (#9) * ml solution transparency --- CHANGELOG.md | 3 +- docs/example.md | 5 +- src/cat2cat/__init__.py | 2 + src/cat2cat/cat2cat.py | 47 +------ src/cat2cat/cat2cat_ml.py | 269 ++++++++++++++++++++++++++++++++++++++ tests/test_cat2cat.py | 16 --- tests/test_cat2cat_ml.py | 68 ++++++++++ 7 files changed, 346 insertions(+), 64 deletions(-) create mode 100644 src/cat2cat/cat2cat_ml.py create mode 100644 tests/test_cat2cat_ml.py diff --git a/CHANGELOG.md b/CHANGELOG.md index afd918a..f8c3034 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,7 +1,8 @@ # Changelog -## v0.1.4.9004 +## v0.1.4.9005 +- New `cat2cat_ml_run` function to check the ml models performance before `cat2cat` with ml option is run. Now, the ml models are more transparent. - Improved the lack of support for NaN and None in the `get_mappings`. - Fixed a bug that `cat2cat_ml.features` can be only a `list` not a `Sequence`. - Fixed assertion message and docs for the `freqs` argument in the `cat2cat_mappings`. diff --git a/docs/example.md b/docs/example.md index ad2bcf6..b0c1ea4 100644 --- a/docs/example.md +++ b/docs/example.md @@ -93,15 +93,18 @@ data_final.groupby(["year"]).sample(5).loc[:, sub_cols] ```python from sklearn.neighbors import KNeighborsClassifier +from cat2cat import cat2cat_ml_run # ml dataclass, one of the arguments for the cat2cat function ml = cat2cat_ml( data = o_new, cat_var = "code", features = ["salary", "age", "edu"], - models = [KNeighborsClassifier()] + models = [KNeighborsClassifier(random_state = 1234)] ) +cat2cat_ml_run(mappings, ml) + # apply the cat2cat procedure c2c = cat2cat(data = data, mappings = mappings, ml = ml) # pandas.concat used to bind per period datasets diff --git a/src/cat2cat/__init__.py b/src/cat2cat/__init__.py index 35d9ea6..09eff27 100644 --- a/src/cat2cat/__init__.py +++ b/src/cat2cat/__init__.py @@ -5,3 +5,5 @@ # simplified from cat2cat.cat2cat import cat2cat + +from cat2cat.cat2cat_ml import cat2cat_ml_run diff --git a/src/cat2cat/cat2cat.py b/src/cat2cat/cat2cat.py index 24cdaf0..3bf3c5c 100644 --- a/src/cat2cat/cat2cat.py +++ b/src/cat2cat/cat2cat.py @@ -4,6 +4,7 @@ from cat2cat.mappings import get_mappings, get_freqs, cat_apply_freq from cat2cat.dataclass import cat2cat_data, cat2cat_mappings, cat2cat_ml from cat2cat.cat2cat_utils import dummy_c2c +from cat2cat.cat2cat_ml import _cat2cat_ml from typing import Optional, Any, Dict @@ -146,52 +147,6 @@ def cat2cat( return res -def _cat2cat_ml( - ml: cat2cat_ml, mapp: Dict[Any, Any], target_df: DataFrame, cat_var_target: str -) -> None: - """cat2cat ml optional part""" - for target_cat in list(mapp.keys()): - base_cats = mapp[target_cat] - ml_cat_var = ml.data[ml.cat_var] - if (not any(in1d(base_cats, ml_cat_var.unique()))) or (len(base_cats) == 1): - continue - - target_cat_index = in1d(target_df[cat_var_target].values, target_cat) - ml_cat_index = in1d(ml.data[ml.cat_var].values, base_cats) - - data_ml_train = ml.data.loc[ml_cat_index, :] - data_ml_target = target_df.loc[target_cat_index, :] - - target_cats = data_ml_target["g_new_c2c"] - data_ml_target_uniq = data_ml_target.drop_duplicates( - subset=["index_c2c"] + list(ml.features) - ) - index_c2c = data_ml_target_uniq["index_c2c"].values - - for m in ml.models: - ml_name = type(m).__name__ - ml_colname = "wei_" + ml_name + "_c2c" - - try: - m.fit(X=data_ml_train.loc[:, ml.features], y=data_ml_train[ml.cat_var]) - - X_test = data_ml_target_uniq.loc[:, ml.features] - preds = m.predict_proba(X=X_test) - - preds_df = DataFrame(preds) - preds_df.columns = m.classes_ - preds_df[setdiff1d(target_cats.unique(), m.classes_)] = 0 - preds_df["index_c2c"] = index_c2c - preds_df_melt = preds_df.melt(id_vars="index_c2c", var_name="g_new_c2c") - merge_on = ["index_c2c", "g_new_c2c"] - p_order = target_df.loc[target_cat_index, merge_on].merge( - preds_df_melt, on=merge_on, how="left" - ) - target_df.loc[target_cat_index, ml_colname] = p_order["value"].values - except: - None - - def _resolve_frequencies( base_df: DataFrame, cat_var_base: str, diff --git a/src/cat2cat/cat2cat_ml.py b/src/cat2cat/cat2cat_ml.py new file mode 100644 index 0000000..5d93a5c --- /dev/null +++ b/src/cat2cat/cat2cat_ml.py @@ -0,0 +1,269 @@ +from pandas import DataFrame, concat +from numpy import repeat, setdiff1d, in1d, sum, NaN, nanmean, isnan, round + +from sklearn.model_selection import train_test_split + +from cat2cat.mappings import get_mappings +from cat2cat.dataclass import cat2cat_mappings, cat2cat_ml + +from typing import Any, Dict + +__all__ = ["cat2cat_ml_run"] + + +class cat2cat_ml_run_results: + """The class to represent the results of the cat2cat_ml_run function call + Args: + res (Dict): raw results from the cat2cat_ml_run function call + mappings (cat2cat_mappings): dataclass with mappings related arguments. + Please check out the `cat2cat.dataclass.cat2cat_mappings` for more information. + ml (cat2cat_ml): dataclass with ml related arguments. + Please check out the `cat2cat.dataclass.cat2cat_ml` for more information. + kwargs (Dict): additional arguments passed to the `cat2cat_ml_run` function. + Returns: + cat2cat_ml_run_results class instance with the following attributes: + res (Dict): raw results from the cat2cat_ml_run function call + mean_acc (Dict): mean accuracy for each model + percent_failed (Dict): percent of failed models for each model + percent_better (Dict): percent of better models over most frequent category solution for each model + mappings (cat2cat_mappings): initial mappings dataclass with mappings related arguments. + ml (cat2cat_ml): initial ml dataclass with ml related arguments. + Methods: + get_raw: get raw results + """ + + def __init__( + self, res: Dict, mappings: cat2cat_mappings, ml: cat2cat_ml, kwargs: Dict + ) -> None: + self.res = res + self.mappings = mappings + self.ml = ml + self.kwargs = kwargs + self.models_names = [type(m).__name__ for m in self.ml.models] + + mean_acc = dict() + percent_failed = dict() + percent_better = dict() + + mean_acc["naive"] = round( + nanmean( + [self.res.get(g, {"naive": NaN}).get("naive") for g in self.res.keys()] + ), + 3, + ) + mean_acc["most_freq"] = round( + nanmean( + [self.res.get(g, {"freq": NaN}).get("freq") for g in self.res.keys()] + ), + 2, + ) + for m in self.models_names: + vals = [self.res.get(g, {}).get(m, NaN) for g in self.res.keys()] + mean_acc[m] = round(nanmean(vals), 3) + percent_failed[m] = round(sum(isnan(vals)) / len(vals) * 100, 3) + percent_better[m] = round( + sum(vals > mean_acc["most_freq"]) / len(vals) * 100, 3 + ) + + self.mean_acc = mean_acc + self.percent_failed = percent_failed + self.percent_better = percent_better + + def get_raw(self) -> Dict: + """Get raw results""" + return self.res + + def __repr__(self) -> str: + res = "" + for k, v in self.mean_acc.items(): + res += "Average Accuracy {}: {}".format(k, v) + "\n" + res += "\n" + for k, v in self.percent_failed.items(): + res += "Percent of failed {}: {}".format(k, v) + "\n" + res += "\n" + for k, v in self.percent_better.items(): + res += ( + "Percent of better {} over most frequent category solution: {}".format( + k, v + ) + + "\n" + ) + res += "\n" + res += "Features: {}".format(self.ml.features) + "\n" + res += "Test sample size: {}".format(self.kwargs.get("test_size", 0.2)) + "\n" + return res + + +def cat2cat_ml_run( + mappings: cat2cat_mappings, ml: cat2cat_ml, **kwargs: Any +) -> cat2cat_ml_run_results: + """Automatic mapping in a panel dataset - cat2cat procedure + + Args: + mappings (cat2cat_mappings): dataclass with mappings related arguments. + Please check out the `cat2cat.dataclass.cat2cat_mappings` for more information. + ml (Optional[cat2cat_ml]): dataclass with ml related arguments. + Please check out the `cat2cat.dataclass.cat2cat_ml` for more information. + **kwargs: additional arguments passed to the `cat2cat_ml_run` function. + min_match (float): minimum share of categories from the base period that have to be matched in the mapping table. Between 0 and 1. Default 0.8. + test_size (float): share of the data used for testing. Between 0 and 1. Default 0.2. + split_seed (int): random seed for the train_test_split function. Default 42. + + Returns: + cat2cat_ml_run_class + + Note: + Please check out the `cat2cat.cat2cat.cat2cat` for more information. + + + >>> from cat2cat import cat2cat + >>> from cat2cat.cat2cat_ml import cat2cat_ml_run + >>> from cat2cat.dataclass import cat2cat_data, cat2cat_mappings, cat2cat_ml + >>> from sklearn.discriminant_analysis import LinearDiscriminantAnalysis + >>> from sklearn.tree import DecisionTreeClassifier + >>> from cat2cat.datasets import load_trans, load_occup + >>> trans = load_trans() + >>> occup = load_occup() + >>> o_old = occup.loc[occup.year == 2008, :].copy() + >>> o_new = occup.loc[occup.year == 2010, :].copy() + >>> mappings = cat2cat_mappings(trans = trans, direction = "forward") + >>> ml = cat2cat_ml( + ... occup.loc[occup.year <= 2008, :].copy(), + ... "code", + ... ["salary", "age", "edu", "sex"], + ... [DecisionTreeClassifier(random_state=1234), LinearDiscriminantAnalysis()] + ... ) + >>> cat2cat_ml_run(mappings = mappings, ml = ml) + + """ + assert isinstance( + mappings, cat2cat_mappings + ), "mappings arg has to be cat2cat_mappings instance" + assert isinstance(ml, cat2cat_ml), "ml arg has to be cat2cat_ml instance" + assert isinstance(kwargs, dict), "kwargs arg has to be a dict" + assert set(kwargs.keys()).issubset( + ["min_match", "test_size", "split_seed"] + ), "possible kwargs are min_match, split_seed and test_size" + + mapps = get_mappings(mappings.trans) + + if mappings.direction == "forward": + target_name = "new" + base_name = "old" + elif mappings.direction == "backward": + target_name = "old" + base_name = "new" + + mapp = mapps["to_" + base_name] + + cat_var = ml.data[ml.cat_var].values + cat_var_vals = mappings.trans[base_name].unique() + + assert (sum(in1d(cat_var, cat_var_vals)) / len(cat_var)) > kwargs.get( + "min_match", 0.8 + ), "The mapping table does not cover all categories in the data. Please check the direction in the mapping table." + + features = ml.features + models = ml.models + models_names = [type(m).__name__ for m in models] + + train_g = { + n: g for n, g in ml.data[list(features) + [ml.cat_var]].groupby(ml.cat_var) + } + + res = dict() + for cat in mapp.keys(): + try: + matched_cat = mapp.get(cat, []) + res[cat] = { + "naive": 1 / len(matched_cat), + "freq": NaN, + } + for m in models_names: + res[cat][m] = NaN + + data_small_g_list = list() + for g in matched_cat: + if g not in train_g.keys(): + continue + data_small_g_list.append(train_g.get(g)) + if len(data_small_g_list) == 0: + continue + + data_small_g = concat([train_g.get(g) for g in matched_cat], axis=0) + + if ( + (data_small_g.shape[0] < 10) + or (len(matched_cat) < 2) + or (sum(in1d(matched_cat, data_small_g[ml.cat_var])) == 1) + ): + continue + + X_train, X_test, y_train, y_test = train_test_split( + data_small_g[features], + data_small_g[ml.cat_var], + test_size=kwargs.get("test_size", 0.2), + random_state=kwargs.get("split_seed", 42), + ) + + gcounts = y_train.value_counts() + gfreq_max = gcounts.index[0] + res[cat]["freq"] = nanmean(gfreq_max == y_test) + + if (X_test.shape[0] == 0) or (X_train.shape[0] < 5): + continue + + for m in models: + ml_name = str(type(m).__name__) + m.fit(X_train, y_train) # type: ignore + res[cat][ml_name] = m.score(X_test, y_test) # type: ignore + except: + continue + + return cat2cat_ml_run_results(res, mappings, ml, kwargs) + + +def _cat2cat_ml( + ml: cat2cat_ml, mapp: Dict[Any, Any], target_df: DataFrame, cat_var_target: str +) -> None: + """cat2cat ml optional part""" + for target_cat in list(mapp.keys()): + base_cats = mapp[target_cat] + ml_cat_var = ml.data[ml.cat_var] + if (not any(in1d(base_cats, ml_cat_var.unique()))) or (len(base_cats) == 1): + continue + + target_cat_index = in1d(target_df[cat_var_target].values, target_cat) + ml_cat_index = in1d(ml.data[ml.cat_var].values, base_cats) + + data_ml_train = ml.data.loc[ml_cat_index, :] + data_ml_target = target_df.loc[target_cat_index, :] + + target_cats = data_ml_target["g_new_c2c"] + data_ml_target_uniq = data_ml_target.drop_duplicates( + subset=["index_c2c"] + list(ml.features) + ) + index_c2c = data_ml_target_uniq["index_c2c"].values + + for m in ml.models: + ml_name = type(m).__name__ + ml_colname = "wei_" + ml_name + "_c2c" + + try: + m.fit(X=data_ml_train.loc[:, ml.features], y=data_ml_train[ml.cat_var]) + + X_test = data_ml_target_uniq.loc[:, ml.features] + preds = m.predict_proba(X=X_test) + + preds_df = DataFrame(preds) + preds_df.columns = m.classes_ + preds_df[setdiff1d(target_cats.unique(), m.classes_)] = 0 + preds_df["index_c2c"] = index_c2c + preds_df_melt = preds_df.melt(id_vars="index_c2c", var_name="g_new_c2c") + merge_on = ["index_c2c", "g_new_c2c"] + p_order = target_df.loc[target_cat_index, merge_on].merge( + preds_df_melt, on=merge_on, how="left" + ) + target_df.loc[target_cat_index, ml_colname] = p_order["value"].values + except: + pass diff --git a/tests/test_cat2cat.py b/tests/test_cat2cat.py index 57e5bb0..69b4c89 100644 --- a/tests/test_cat2cat.py +++ b/tests/test_cat2cat.py @@ -61,22 +61,6 @@ def int_round(x: float) -> int: which_target_origin = {"backward": ("old", "new"), "forward": ("new", "old")} -def utils_test_structure(data, direction, nr_rows_old, nr_rows_new): - pass - - -def utils_test_expected(): - pass - - -def utils_test_sum1(): - pass - - -def utils_test_copy(): - pass - - @pytest.mark.parametrize("direction", ["backward", "forward"]) @pytest.mark.parametrize("cat_type", ["str", "int"]) def test_cat2cat_base(direction, cat_type): diff --git a/tests/test_cat2cat_ml.py b/tests/test_cat2cat_ml.py new file mode 100644 index 0000000..2856cb0 --- /dev/null +++ b/tests/test_cat2cat_ml.py @@ -0,0 +1,68 @@ +from cat2cat import cat2cat +from cat2cat import cat2cat_ml_run +from cat2cat.dataclass import cat2cat_data, cat2cat_mappings, cat2cat_ml +from sklearn.discriminant_analysis import LinearDiscriminantAnalysis +from sklearn.tree import DecisionTreeClassifier +from cat2cat.datasets import load_trans, load_occup +from numpy.random import seed +from numpy import nan + +trans = load_trans() +occup = load_occup() +o_old = occup.loc[occup.year == 2008, :].copy() +o_new = occup.loc[occup.year == 2010, :].copy() + + +def test_cat2cat_ml_run_repr(): + mappings = cat2cat_mappings(trans=trans, direction="backward") + ml = cat2cat_ml( + occup.loc[occup.year >= 2010, :].copy(), + "code", + ["salary", "age", "edu", "sex"], + [ + DecisionTreeClassifier(random_state=1234), + LinearDiscriminantAnalysis(), + ], + ) + expected = repr(cat2cat_ml_run(mappings=mappings, ml=ml)) + actual = "Average Accuracy naive: 0.302\nAverage Accuracy most_freq: 0.54\nAverage Accuracy DecisionTreeClassifier: 0.487\nAverage Accuracy LinearDiscriminantAnalysis: 0.553\n\nPercent of failed DecisionTreeClassifier: 35.369\nPercent of failed LinearDiscriminantAnalysis: 35.369\n\nPercent of better DecisionTreeClassifier over most frequent category solution: 27.99\nPercent of better LinearDiscriminantAnalysis over most frequent category solution: 32.57\n\nFeatures: ['salary', 'age', 'edu', 'sex']\nTest sample size: 0.2\n" + assert actual == expected + + expected = repr(cat2cat_ml_run(mappings=mappings, ml=ml, test_size=0.3)) + actual = "Average Accuracy naive: 0.302\nAverage Accuracy most_freq: 0.55\nAverage Accuracy DecisionTreeClassifier: 0.477\nAverage Accuracy LinearDiscriminantAnalysis: 0.54\n\nPercent of failed DecisionTreeClassifier: 35.369\nPercent of failed LinearDiscriminantAnalysis: 35.369\n\nPercent of better DecisionTreeClassifier over most frequent category solution: 25.954\nPercent of better LinearDiscriminantAnalysis over most frequent category solution: 30.025\n\nFeatures: ['salary', 'age', 'edu', 'sex']\nTest sample size: 0.3\n" + assert actual == expected + + expected = repr( + cat2cat_ml_run(mappings=mappings, ml=ml, test_size=0.9, split_seed=1234) + ) + actual = "Average Accuracy naive: 0.302\nAverage Accuracy most_freq: 0.49\nAverage Accuracy DecisionTreeClassifier: 0.47\nAverage Accuracy LinearDiscriminantAnalysis: 0.491\n\nPercent of failed DecisionTreeClassifier: 60.814\nPercent of failed LinearDiscriminantAnalysis: 60.814\n\nPercent of better DecisionTreeClassifier over most frequent category solution: 16.794\nPercent of better LinearDiscriminantAnalysis over most frequent category solution: 18.83\n\nFeatures: ['salary', 'age', 'edu', 'sex']\nTest sample size: 0.9\n" + assert actual == expected + + mappings = cat2cat_mappings(trans=trans, direction="forward") + ml = cat2cat_ml( + occup.loc[occup.year <= 2008, :].copy(), + "code", + ["salary", "age", "edu", "sex"], + [DecisionTreeClassifier(random_state=1234), LinearDiscriminantAnalysis()], + ) + expected = repr(cat2cat_ml_run(mappings=mappings, ml=ml, test_size=0.3)) + actual = "Average Accuracy naive: 0.987\nAverage Accuracy most_freq: 0.69\nAverage Accuracy DecisionTreeClassifier: 0.647\nAverage Accuracy LinearDiscriminantAnalysis: 0.708\n\nPercent of failed DecisionTreeClassifier: 98.291\nPercent of failed LinearDiscriminantAnalysis: 98.291\n\nPercent of better DecisionTreeClassifier over most frequent category solution: 0.699\nPercent of better LinearDiscriminantAnalysis over most frequent category solution: 0.932\n\nFeatures: ['salary', 'age', 'edu', 'sex']\nTest sample size: 0.3\n" + assert actual == expected + + +def test_cat2cat_ml_run_get_raw(): + mappings = cat2cat_mappings(trans=trans, direction="backward") + ml = cat2cat_ml( + occup.loc[occup.year >= 2010, :].copy(), + "code", + ["salary", "age", "edu", "sex"], + [DecisionTreeClassifier(random_state=1234), LinearDiscriminantAnalysis()], + ) + expected = cat2cat_ml_run(mappings=mappings, ml=ml).get_raw()["7431"] + actual = { + "naive": 0.3333333333333333, + "freq": nan, + "DecisionTreeClassifier": nan, + "LinearDiscriminantAnalysis": nan, + } + assert str(actual) == str(expected)