Skip to content

Commit

Permalink
support missing in mappings
Browse files Browse the repository at this point in the history
  • Loading branch information
Polkas committed Oct 22, 2022
1 parent 98cc332 commit 4b5f0c1
Show file tree
Hide file tree
Showing 7 changed files with 1,524 additions and 251 deletions.
3 changes: 2 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
# Changelog

## v0.1.4.9000
## v0.1.4.9002

- Improved handling of NaN and None values in `get_mappings`.
- Fixed a bug where `cat2cat_ml.features` could only be a `list`, not any `Sequence`.
- Fixed assertion message and docs for the `freqs` argument in the `cat2cat_mappings`.
- Fixed some typing and made `mypy` pass cleanly.
Expand Down
1,434 changes: 1,233 additions & 201 deletions poetry.lock

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "cat2cat"
version = "0.1.4.9000"
version = "0.1.4.9002"
description = "Unifying an inconsistently coded categorical variable in a panel/longitudinal dataset."
authors = ["Maciej Nasinski"]
license = "MIT"
Expand Down
4 changes: 4 additions & 0 deletions src/cat2cat/cat2cat.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,10 @@ def cat2cat(
`mappings.trans` arg columns and the `data.cat_var` column have to be of the same type.
When ml part applied `ml.cat_var` has to have the same type too.
3. Please convert all numpy.NaN values to some numeric value like 999999.
None values in a pandas column have to be converted to the "None" string.
Changes have to be made at the same time for the mapping table and datasets.
>>> from cat2cat import cat2cat
>>> from cat2cat.dataclass import cat2cat_data, cat2cat_mappings, cat2cat_ml
>>> from sklearn.ensemble import RandomForestClassifier
Expand Down
44 changes: 32 additions & 12 deletions src/cat2cat/mappings.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
from pandas import DataFrame
from numpy import ndarray, unique, repeat, array, round
from numpy import ndarray, unique, repeat, array, round, unique, sort, isnan

from collections.abc import Iterable
from typing import Union, Optional, Any, List, Dict, Sequence
Expand All @@ -21,6 +21,9 @@ def get_mappings(x: Union[DataFrame, ndarray]) -> Dict[str, Dict[Any, List[Any]]
Returns:
Dict[str, Dict[Any, List[Any]]]: dict with 2 internal dicts, `to_old` and `to_new`.
Note:
Please convert all numpy.NaN values to some numeric value like 999999.
None values in a pandas column have to be converted to the "None" string.
>>> from cat2cat.mappings import get_mappings
>>> from numpy import array
Expand All @@ -40,30 +43,47 @@ def get_mappings(x: Union[DataFrame, ndarray]) -> Dict[str, Dict[Any, List[Any]]
), "x should have 2 dimensions and the second one is equal to 2 (columns)"

if isinstance(x, DataFrame):
ff = x.iloc[:, 0].values
ss = x.iloc[:, 1].values
ff = x.iloc[:, 0].copy()
which_ff_null = ff.isnull()
ff = ff.values
ss = x.iloc[:, 1].copy()
which_ss_null = ss.isnull()
ss = ss.values
elif isinstance(x, ndarray):
ff = x[:, 0]
ss = x[:, 1]
ff = x[:, 0].copy()
ss = x[:, 1].copy()
else:
raise (TypeError)

assert ff.dtype == ss.dtype
col_type = ff.dtype

if col_type == "O":
ff[which_ff_null | (ff == None)] = "None"
ss[which_ss_null | (ss == None)] = "None"

from_old = set(ff)
from_new = set(ss)
from_old = unique(ff)
from_new = unique(ss)

to_old = dict()
for e in from_new:
idx = ss == e
if (col_type in [float, int]) and isnan(e):
idx = isnan(ss)
else:
idx = ss == e

# sorted so results are stable
to_old[e] = sorted(set(ff[idx]))
to_old[e] = sorted(unique(ff[idx]))

to_new = dict()
for e in from_old:
idx = ff == e
if (col_type in [float, int]) and isnan(e):
idx = isnan(ff)
else:
idx = ff == e

# sorted so results are stable
to_new[e] = sorted(set(ss[idx]))
to_new[e] = sorted(unique(ss[idx]))

return dict(to_old=to_old, to_new=to_new)

Expand Down Expand Up @@ -134,4 +154,4 @@ def cat_apply_freq(
cs = [freqs.get(e, 1e-12) for e in to_x[x]]
fs = round(array(cs) / sum(cs), 10)
res[x] = list(fs)
return res
return res
168 changes: 134 additions & 34 deletions tests/test_cat2cat.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,11 @@
from cat2cat import cat2cat
from cat2cat.dataclass import cat2cat_data, cat2cat_mappings, cat2cat_ml
from cat2cat.cat2cat_utils import dummy_c2c
from pandas import concat
from numpy import round
from pandas import concat, DataFrame
from numpy import round, setdiff1d
import pytest
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.tree import DecisionTreeClassifier


def int_round(x: float) -> int:
Expand All @@ -14,8 +15,6 @@ def int_round(x: float) -> int:

verticals = load_verticals()

trans = load_trans()

occup = load_occup()
occup_small = load_occup(small=True)
o_2006 = occup.loc[occup.year == 2006, :].copy()
Expand All @@ -28,19 +27,57 @@ def int_round(x: float) -> int:
o_new_int = o_new.copy()
o_new_int["code"] = o_new_int["code"].astype(int)

trans = load_trans()
# impute missing values
trans = concat(
[trans, DataFrame({"old": "99999", "new": setdiff1d(o_new.code, trans.new)})]
)

trans_int = trans.copy()
trans_int.loc[trans_int["old"].isnull(), "old"] = "9999"
trans_int.loc[trans_int["old"].isnull(), "old"] = "99999"
trans_int = trans_int.astype({"old": int, "new": int})

nr_rows_old = {"backward": 227662, "forward": 17223}
nr_rows_new = {"backward": 17323, "forward": 18577}
nr_rows_new = {"backward": 17323, "forward": 18680}
data_dict = {
"str": {"old": o_old, "new": o_new, "trans": trans},
"int": {"old": o_old_int, "new": o_new_int, "trans": trans_int},
"str": {
"old": o_old,
"new": o_new,
"trans": trans,
"freqs": {
"forward": o_new["code"].value_counts().to_dict(),
"backward": o_old["code"].value_counts().to_dict(),
},
},
"int": {
"old": o_old_int,
"new": o_new_int,
"trans": trans_int,
"freqs": {
"forward": o_new_int["code"].value_counts().to_dict(),
"backward": o_old_int["code"].value_counts().to_dict(),
},
},
}
which_target_origin = {"backward": ("old", "new"), "forward": ("new", "old")}


def utils_test_structure(data, direction, nr_rows_old, nr_rows_new):
    """Placeholder for shared result-structure assertions.

    Intended to check the dict structure and row counts of a cat2cat
    result; not yet implemented, so it currently does nothing.
    """


def utils_test_expected():
    """Placeholder for shared expected-values assertions; not yet implemented."""


def utils_test_sum1():
    """Placeholder for the shared weights-sum-to-one assertions; not yet implemented."""


def utils_test_copy():
    """Placeholder for the shared input-data-not-mutated assertions; not yet implemented."""


@pytest.mark.parametrize("direction", ["backward", "forward"])
@pytest.mark.parametrize("cat_type", ["str", "int"])
def test_cat2cat_base(direction, cat_type):
Expand All @@ -51,21 +88,25 @@ def test_cat2cat_base(direction, cat_type):
)
mappings = cat2cat_mappings(data_dict[cat_type]["trans"], direction)
c2c = cat2cat(data, mappings)

# result structure
assert isinstance(c2c, dict)
assert sorted(list(c2c.keys())) == ["new", "old"]

# expected number of rows
assert c2c["old"].shape[0] == nr_rows_old[direction]
assert c2c["new"].shape[0] == nr_rows_new[direction]

w_target_p, w_origin_p = which_target_origin[direction]

# test that the sum of the weights is 1
assert (
int_round(c2c[w_origin_p]["wei_freq_c2c"].sum())
== data_dict[cat_type][w_origin_p].shape[0]
)
assert (
int_round(c2c[w_target_p]["wei_freq_c2c"].sum())
<= data_dict[cat_type][w_target_p].shape[0]
== data_dict[cat_type][w_target_p].shape[0]
)
assert all(c2c[w_target_p].groupby("index_c2c")["wei_freq_c2c"].sum().round() == 1)
assert all(c2c[w_origin_p]["wei_freq_c2c"].values == 1)
Expand All @@ -74,56 +115,114 @@ def test_cat2cat_base(direction, cat_type):
== c2c[w_target_p].shape[0]
)

# test that cat2cat not influence the original data
assert data_dict[cat_type]["old"].equals(o)
assert data_dict[cat_type]["new"].equals(n)


def test_cat2cat_custom_freqs():
freqs = o_new["code"].value_counts().to_dict()
data = cat2cat_data(o_old, o_new, "code", "code", "year")
mappings_f = cat2cat_mappings(trans, "backward", freqs)
c2c = cat2cat(data, mappings_f)
@pytest.mark.parametrize("direction", ["backward", "forward"])
@pytest.mark.parametrize("cat_type", ["str", "int"])
def test_cat2cat_custom_freqs(direction, cat_type):
o = data_dict[cat_type]["old"].copy()
n = data_dict[cat_type]["new"].copy()
data = cat2cat_data(
data_dict[cat_type]["old"], data_dict[cat_type]["new"], "code", "code", "year"
)
mappings = cat2cat_mappings(
data_dict[cat_type]["trans"], direction, data_dict[cat_type]["freqs"][direction]
)
c2c = cat2cat(data, mappings)

# result structure
assert isinstance(c2c, dict)
assert sorted(list(c2c.keys())) == ["new", "old"]

assert int_round(c2c["old"]["wei_freq_c2c"].sum()) == o_old.shape[0]
assert int_round(c2c["new"]["wei_freq_c2c"].sum()) == o_new.shape[0]
assert c2c["new"].shape[0] == o_new.shape[0]
# expected number of rows
assert c2c["old"].shape[0] == nr_rows_old[direction]
assert c2c["new"].shape[0] == nr_rows_new[direction]

assert all(c2c["old"].groupby("index_c2c")["wei_freq_c2c"].sum().round() == 1)
assert all(c2c["new"]["wei_freq_c2c"].values == 1)
w_target_p, w_origin_p = which_target_origin[direction]

mappings = cat2cat_mappings(trans, "backward")
c2c_default = cat2cat(data, mappings)
assert c2c_default["old"].equals(c2c["old"])
# test that the sum of the weights is 1
assert (
int_round(c2c[w_origin_p]["wei_freq_c2c"].sum())
== data_dict[cat_type][w_origin_p].shape[0]
)
assert (
int_round(c2c[w_target_p]["wei_freq_c2c"].sum())
== data_dict[cat_type][w_target_p].shape[0]
)
assert all(c2c[w_target_p].groupby("index_c2c")["wei_freq_c2c"].sum().round() == 1)
assert all(c2c[w_origin_p]["wei_freq_c2c"].values == 1)
assert (
int_round((c2c[w_target_p]["rep_c2c"] * c2c[w_target_p]["wei_naive_c2c"]).sum())
== c2c[w_target_p].shape[0]
)

assert o_old.equals(occup.loc[occup.year == 2008, :])
assert o_new.equals(occup.loc[occup.year == 2010, :])
# test that cat2cat not influence the original data
assert data_dict[cat_type]["old"].equals(o)
assert data_dict[cat_type]["new"].equals(n)


def test_cat2cat_ml():
data = cat2cat_data(o_old, o_new, "code", "code", "year")
mappings = cat2cat_mappings(trans, "backward")
@pytest.mark.parametrize("cat_type", ["str", "int"])
@pytest.mark.parametrize("direction", ["backward", "forward"])
def test_cat2cat_ml(direction, cat_type):
o = data_dict[cat_type]["old"].copy()
n = data_dict[cat_type]["new"].copy()
data = cat2cat_data(
data_dict[cat_type]["old"], data_dict[cat_type]["new"], "code", "code", "year"
)
mappings = cat2cat_mappings(
data_dict[cat_type]["trans"], direction, data_dict[cat_type]["freqs"][direction]
)
ml = cat2cat_ml(
occup.loc[occup.year >= 2010, :].copy(),
"code",
["salary", "age", "edu", "sex"],
[LinearDiscriminantAnalysis()],
[DecisionTreeClassifier(), LinearDiscriminantAnalysis()],
)
c2c = cat2cat(data, mappings, ml)

# result structure
assert isinstance(c2c, dict)
assert sorted(list(c2c.keys())) == ["new", "old"]

# expected number of rows
assert c2c["old"].shape[0] == nr_rows_old[direction]
assert c2c["new"].shape[0] == nr_rows_new[direction]

w_target_p, w_origin_p = which_target_origin[direction]

# test that the sum of the weights is 1
assert c2c[w_origin_p].shape[0] == data_dict[cat_type][w_origin_p].shape[0]
assert (
int_round(c2c[w_origin_p]["wei_freq_c2c"].sum())
== data_dict[cat_type][w_origin_p].shape[0]
)
assert (
int_round(c2c[w_target_p]["wei_freq_c2c"].sum())
== data_dict[cat_type][w_target_p].shape[0]
)
assert all(c2c[w_target_p].groupby("index_c2c")["wei_freq_c2c"].sum().round() == 1)
assert all(c2c[w_origin_p]["wei_freq_c2c"].values == 1)
assert (
int_round((c2c[w_target_p]["rep_c2c"] * c2c[w_target_p]["wei_naive_c2c"]).sum())
== c2c[w_target_p].shape[0]
)
assert (
int_round(c2c["old"]["wei_LinearDiscriminantAnalysis_c2c"].sum())
== o_old.shape[0]
int_round(c2c[w_target_p]["wei_DecisionTreeClassifier_c2c"].sum())
== data_dict[cat_type][w_target_p].shape[0]
)
assert c2c["new"].shape[0] == o_new.shape[0]
assert all(c2c["new"]["wei_LinearDiscriminantAnalysis_c2c"].values == 1)
assert all(c2c[w_origin_p]["wei_DecisionTreeClassifier_c2c"].values == 1)
assert (
int_round(c2c[w_target_p]["wei_DecisionTreeClassifier_c2c"].sum())
== data_dict[cat_type][w_target_p].shape[0]
)
assert all(c2c[w_origin_p]["wei_DecisionTreeClassifier_c2c"].values == 1)

assert o_old.equals(occup.loc[occup.year == 2008, :])
assert o_new.equals(occup.loc[occup.year == 2010, :])
# test that cat2cat not influence the original data
assert data_dict[cat_type]["old"].equals(o)
assert data_dict[cat_type]["new"].equals(n)


def test_cat2cat_multi():
Expand Down Expand Up @@ -206,5 +305,6 @@ def test_cat2cat_direct():
== vert_old.shape[0]
)

# test that cat2cat not influence the original data
assert vert_old.equals(verticals.loc[verticals["v_date"] == "2020-04-01", :])
assert vert_new.equals(verticals.loc[verticals["v_date"] == "2020-05-01", :])
Loading

0 comments on commit 4b5f0c1

Please sign in to comment.