Skip to content

Commit

Permalink
support missing in mappings
Browse files Browse the repository at this point in the history
  • Loading branch information
Polkas committed Oct 22, 2022
1 parent 98cc332 commit 4b5f0c1
Show file tree
Hide file tree
Showing 7 changed files with 1,524 additions and 251 deletions.
3 changes: 2 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
# Changelog

## v0.1.4.9000
## v0.1.4.9002

- Improved handling of NaN and None values in `get_mappings`.
- Fixed a bug where `cat2cat_ml.features` could only be a `list`, not any `Sequence`.
- Fixed assertion message and docs for the `freqs` argument in the `cat2cat_mappings`.
- Fixed some typing and made `mypy` pass cleanly.
Expand Down
1,434 changes: 1,233 additions & 201 deletions poetry.lock

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "cat2cat"
version = "0.1.4.9000"
version = "0.1.4.9002"
description = "Unifying an inconsistently coded categorical variable in a panel/longitudinal dataset."
authors = ["Maciej Nasinski"]
license = "MIT"
Expand Down
4 changes: 4 additions & 0 deletions src/cat2cat/cat2cat.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,10 @@ def cat2cat(
`mappings.trans` arg columns and the `data.cat_var` column have to be of the same type.
When ml part applied `ml.cat_var` has to have the same type too.
3. Please convert all numpy.NaN values to some numeric value like 999999.
None values in a pandas column have to be converted to the "None" string.
Changes have to be made at the same time for the mapping table and datasets.
>>> from cat2cat import cat2cat
>>> from cat2cat.dataclass import cat2cat_data, cat2cat_mappings, cat2cat_ml
>>> from sklearn.ensemble import RandomForestClassifier
Expand Down
44 changes: 32 additions & 12 deletions src/cat2cat/mappings.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
from pandas import DataFrame
from numpy import ndarray, unique, repeat, array, round
from numpy import ndarray, unique, repeat, array, round, unique, sort, isnan

from collections.abc import Iterable
from typing import Union, Optional, Any, List, Dict, Sequence
Expand All @@ -21,6 +21,9 @@ def get_mappings(x: Union[DataFrame, ndarray]) -> Dict[str, Dict[Any, List[Any]]
Returns:
Dict[str, Dict[Any, List[Any]]]: dict with 2 internal dicts, `to_old` and `to_new`.
Note:
Please convert all numpy.NaN values to some numeric value like 999999.
None values in a pandas column have to be converted to the "None" string.
>>> from cat2cat.mappings import get_mappings
>>> from numpy import array
Expand All @@ -40,30 +43,47 @@ def get_mappings(x: Union[DataFrame, ndarray]) -> Dict[str, Dict[Any, List[Any]]
), "x should have 2 dimensions and the second one is equal to 2 (columns)"

if isinstance(x, DataFrame):
ff = x.iloc[:, 0].values
ss = x.iloc[:, 1].values
ff = x.iloc[:, 0].copy()
which_ff_null = ff.isnull()
ff = ff.values
ss = x.iloc[:, 1].copy()
which_ss_null = ss.isnull()
ss = ss.values
elif isinstance(x, ndarray):
ff = x[:, 0]
ss = x[:, 1]
ff = x[:, 0].copy()
ss = x[:, 1].copy()
else:
raise (TypeError)

assert ff.dtype == ss.dtype
col_type = ff.dtype

if col_type == "O":
ff[which_ff_null | (ff == None)] = "None"
ss[which_ss_null | (ss == None)] = "None"

from_old = set(ff)
from_new = set(ss)
from_old = unique(ff)
from_new = unique(ss)

to_old = dict()
for e in from_new:
idx = ss == e
if (col_type in [float, int]) and isnan(e):
idx = isnan(ss)
else:
idx = ss == e

# sorted so results are stable
to_old[e] = sorted(set(ff[idx]))
to_old[e] = sorted(unique(ff[idx]))

to_new = dict()
for e in from_old:
idx = ff == e
if (col_type in [float, int]) and isnan(e):
idx = isnan(ff)
else:
idx = ff == e

# sorted so results are stable
to_new[e] = sorted(set(ss[idx]))
to_new[e] = sorted(unique(ss[idx]))

return dict(to_old=to_old, to_new=to_new)

Expand Down Expand Up @@ -134,4 +154,4 @@ def cat_apply_freq(
cs = [freqs.get(e, 1e-12) for e in to_x[x]]
fs = round(array(cs) / sum(cs), 10)
res[x] = list(fs)
return res
return res
168 changes: 134 additions & 34 deletions tests/test_cat2cat.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,11 @@
from cat2cat import cat2cat
from cat2cat.dataclass import cat2cat_data, cat2cat_mappings, cat2cat_ml
from cat2cat.cat2cat_utils import dummy_c2c
from pandas import concat
from numpy import round
from pandas import concat, DataFrame
from numpy import round, setdiff1d
import pytest
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.tree import DecisionTreeClassifier


def int_round(x: float) -> int:
Expand All @@ -14,8 +15,6 @@ def int_round(x: float) -> int:

verticals = load_verticals()

trans = load_trans()

occup = load_occup()
occup_small = load_occup(small=True)
o_2006 = occup.loc[occup.year == 2006, :].copy()
Expand All @@ -28,19 +27,57 @@ def int_round(x: float) -> int:
o_new_int = o_new.copy()
o_new_int["code"] = o_new_int["code"].astype(int)

trans = load_trans()
# impute missing values
trans = concat(
[trans, DataFrame({"old": "99999", "new": setdiff1d(o_new.code, trans.new)})]
)

trans_int = trans.copy()
trans_int.loc[trans_int["old"].isnull(), "old"] = "9999"
trans_int.loc[trans_int["old"].isnull(), "old"] = "99999"
trans_int = trans_int.astype({"old": int, "new": int})

nr_rows_old = {"backward": 227662, "forward": 17223}
nr_rows_new = {"backward": 17323, "forward": 18577}
nr_rows_new = {"backward": 17323, "forward": 18680}
data_dict = {
"str": {"old": o_old, "new": o_new, "trans": trans},
"int": {"old": o_old_int, "new": o_new_int, "trans": trans_int},
"str": {
"old": o_old,
"new": o_new,
"trans": trans,
"freqs": {
"forward": o_new["code"].value_counts().to_dict(),
"backward": o_old["code"].value_counts().to_dict(),
},
},
"int": {
"old": o_old_int,
"new": o_new_int,
"trans": trans_int,
"freqs": {
"forward": o_new_int["code"].value_counts().to_dict(),
"backward": o_old_int["code"].value_counts().to_dict(),
},
},
}
which_target_origin = {"backward": ("old", "new"), "forward": ("new", "old")}


def utils_test_structure(data, direction, nr_rows_old, nr_rows_new):
    """Placeholder for shared result-structure assertions.

    Intended to check the dict structure and row counts of a cat2cat
    result; not yet implemented, so it currently does nothing.
    """


def utils_test_expected():
    """Placeholder for shared expected-values assertions; not yet implemented."""


def utils_test_sum1():
    """Placeholder for the shared weights-sum-to-one assertions; not yet implemented."""


def utils_test_copy():
    """Placeholder for the shared input-data-not-mutated assertions; not yet implemented."""


@pytest.mark.parametrize("direction", ["backward", "forward"])
@pytest.mark.parametrize("cat_type", ["str", "int"])
def test_cat2cat_base(direction, cat_type):
Expand All @@ -51,21 +88,25 @@ def test_cat2cat_base(direction, cat_type):
)
mappings = cat2cat_mappings(data_dict[cat_type]["trans"], direction)
c2c = cat2cat(data, mappings)

# result structure
assert isinstance(c2c, dict)
assert sorted(list(c2c.keys())) == ["new", "old"]

# expected number of rows
assert c2c["old"].shape[0] == nr_rows_old[direction]
assert c2c["new"].shape[0] == nr_rows_new[direction]

w_target_p, w_origin_p = which_target_origin[direction]

# test that the sum of the weights is 1
assert (
int_round(c2c[w_origin_p]["wei_freq_c2c"].sum())
== data_dict[cat_type][w_origin_p].shape[0]
)
assert (
int_round(c2c[w_target_p]["wei_freq_c2c"].sum())
<= data_dict[cat_type][w_target_p].shape[0]
== data_dict[cat_type][w_target_p].shape[0]
)
assert all(c2c[w_target_p].groupby("index_c2c")["wei_freq_c2c"].sum().round() == 1)
assert all(c2c[w_origin_p]["wei_freq_c2c"].values == 1)
Expand All @@ -74,56 +115,114 @@ def test_cat2cat_base(direction, cat_type):
== c2c[w_target_p].shape[0]
)

# test that cat2cat not influence the original data
assert data_dict[cat_type]["old"].equals(o)
assert data_dict[cat_type]["new"].equals(n)


def test_cat2cat_custom_freqs():
freqs = o_new["code"].value_counts().to_dict()
data = cat2cat_data(o_old, o_new, "code", "code", "year")
mappings_f = cat2cat_mappings(trans, "backward", freqs)
c2c = cat2cat(data, mappings_f)
@pytest.mark.parametrize("direction", ["backward", "forward"])
@pytest.mark.parametrize("cat_type", ["str", "int"])
def test_cat2cat_custom_freqs(direction, cat_type):
o = data_dict[cat_type]["old"].copy()
n = data_dict[cat_type]["new"].copy()
data = cat2cat_data(
data_dict[cat_type]["old"], data_dict[cat_type]["new"], "code", "code", "year"
)
mappings = cat2cat_mappings(
data_dict[cat_type]["trans"], direction, data_dict[cat_type]["freqs"][direction]
)
c2c = cat2cat(data, mappings)

# result structure
assert isinstance(c2c, dict)
assert sorted(list(c2c.keys())) == ["new", "old"]

assert int_round(c2c["old"]["wei_freq_c2c"].sum()) == o_old.shape[0]
assert int_round(c2c["new"]["wei_freq_c2c"].sum()) == o_new.shape[0]
assert c2c["new"].shape[0] == o_new.shape[0]
# expected number of rows
assert c2c["old"].shape[0] == nr_rows_old[direction]
assert c2c["new"].shape[0] == nr_rows_new[direction]

assert all(c2c["old"].groupby("index_c2c")["wei_freq_c2c"].sum().round() == 1)
assert all(c2c["new"]["wei_freq_c2c"].values == 1)
w_target_p, w_origin_p = which_target_origin[direction]

mappings = cat2cat_mappings(trans, "backward")
c2c_default = cat2cat(data, mappings)
assert c2c_default["old"].equals(c2c["old"])
# test that the sum of the weights is 1
assert (
int_round(c2c[w_origin_p]["wei_freq_c2c"].sum())
== data_dict[cat_type][w_origin_p].shape[0]
)
assert (
int_round(c2c[w_target_p]["wei_freq_c2c"].sum())
== data_dict[cat_type][w_target_p].shape[0]
)
assert all(c2c[w_target_p].groupby("index_c2c")["wei_freq_c2c"].sum().round() == 1)
assert all(c2c[w_origin_p]["wei_freq_c2c"].values == 1)
assert (
int_round((c2c[w_target_p]["rep_c2c"] * c2c[w_target_p]["wei_naive_c2c"]).sum())
== c2c[w_target_p].shape[0]
)

assert o_old.equals(occup.loc[occup.year == 2008, :])
assert o_new.equals(occup.loc[occup.year == 2010, :])
# test that cat2cat not influence the original data
assert data_dict[cat_type]["old"].equals(o)
assert data_dict[cat_type]["new"].equals(n)


def test_cat2cat_ml():
data = cat2cat_data(o_old, o_new, "code", "code", "year")
mappings = cat2cat_mappings(trans, "backward")
@pytest.mark.parametrize("cat_type", ["str", "int"])
@pytest.mark.parametrize("direction", ["backward", "forward"])
def test_cat2cat_ml(direction, cat_type):
o = data_dict[cat_type]["old"].copy()
n = data_dict[cat_type]["new"].copy()
data = cat2cat_data(
data_dict[cat_type]["old"], data_dict[cat_type]["new"], "code", "code", "year"
)
mappings = cat2cat_mappings(
data_dict[cat_type]["trans"], direction, data_dict[cat_type]["freqs"][direction]
)
ml = cat2cat_ml(
occup.loc[occup.year >= 2010, :].copy(),
"code",
["salary", "age", "edu", "sex"],
[LinearDiscriminantAnalysis()],
[DecisionTreeClassifier(), LinearDiscriminantAnalysis()],
)
c2c = cat2cat(data, mappings, ml)

# result structure
assert isinstance(c2c, dict)
assert sorted(list(c2c.keys())) == ["new", "old"]

# expected number of rows
assert c2c["old"].shape[0] == nr_rows_old[direction]
assert c2c["new"].shape[0] == nr_rows_new[direction]

w_target_p, w_origin_p = which_target_origin[direction]

# test that the sum of the weights is 1
assert c2c[w_origin_p].shape[0] == data_dict[cat_type][w_origin_p].shape[0]
assert (
int_round(c2c[w_origin_p]["wei_freq_c2c"].sum())
== data_dict[cat_type][w_origin_p].shape[0]
)
assert (
int_round(c2c[w_target_p]["wei_freq_c2c"].sum())
== data_dict[cat_type][w_target_p].shape[0]
)
assert all(c2c[w_target_p].groupby("index_c2c")["wei_freq_c2c"].sum().round() == 1)
assert all(c2c[w_origin_p]["wei_freq_c2c"].values == 1)
assert (
int_round((c2c[w_target_p]["rep_c2c"] * c2c[w_target_p]["wei_naive_c2c"]).sum())
== c2c[w_target_p].shape[0]
)
assert (
int_round(c2c["old"]["wei_LinearDiscriminantAnalysis_c2c"].sum())
== o_old.shape[0]
int_round(c2c[w_target_p]["wei_DecisionTreeClassifier_c2c"].sum())
== data_dict[cat_type][w_target_p].shape[0]
)
assert c2c["new"].shape[0] == o_new.shape[0]
assert all(c2c["new"]["wei_LinearDiscriminantAnalysis_c2c"].values == 1)
assert all(c2c[w_origin_p]["wei_DecisionTreeClassifier_c2c"].values == 1)
assert (
int_round(c2c[w_target_p]["wei_DecisionTreeClassifier_c2c"].sum())
== data_dict[cat_type][w_target_p].shape[0]
)
assert all(c2c[w_origin_p]["wei_DecisionTreeClassifier_c2c"].values == 1)

assert o_old.equals(occup.loc[occup.year == 2008, :])
assert o_new.equals(occup.loc[occup.year == 2010, :])
# test that cat2cat not influence the original data
assert data_dict[cat_type]["old"].equals(o)
assert data_dict[cat_type]["new"].equals(n)


def test_cat2cat_multi():
Expand Down Expand Up @@ -206,5 +305,6 @@ def test_cat2cat_direct():
== vert_old.shape[0]
)

# test that cat2cat not influence the original data
assert vert_old.equals(verticals.loc[verticals["v_date"] == "2020-04-01", :])
assert vert_new.equals(verticals.loc[verticals["v_date"] == "2020-05-01", :])
Loading

0 comments on commit 4b5f0c1

Please sign in to comment.