rm poetry and mappings

Polkas · Dec 23, 2023 · 5b558fd · 5b558fd
1 parent 2638b4d
commit 5b558fd
Show file tree

Hide file tree

Showing 8 changed files with 167 additions and 1,421 deletions.
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -25,17 +25,14 @@ jobs:
       - name: Check-out repository
         uses: actions/checkout@v2
 
-      - name: Install poetry
-        uses: snok/install-poetry@v1
-
       - name: Install package
-        run: poetry install
+        run: pip install -r requirements.txt
 
       - name: Test with pytest
-        run: poetry run python -m pytest tests --doctest-modules --cov=cat2cat --cov-report=xml
+        run: python -m pytest tests --doctest-modules --cov=cat2cat --cov-report=xml
 
       - name: Test with mypy
-        run: poetry run mypy src
+        run: mypy src
 
       - name: Upload coverage to Codecov
         uses: codecov/codecov-action@v3

diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,12 +1,13 @@
 # Changelog
 
-## v0.1.4.9006
+## v0.1.4.9007
 
 - New `cat2cat_ml_run` function to check the ml models performance before `cat2cat` with ml option is run. Now, the ml models are more transparent.
 - Improved the lack of support for NaN and None in the `get_mappings`.
 - Fixed a bug that `cat2cat_ml.features` can be only a `list` not a `Sequence`.
 - Fixed assertion message and docs for the `freqs` argument in the `cat2cat_mappings`.
 - Fixed some type annotations so that `mypy` now passes cleanly.
+- Replaced poetry with setuptools.
 
 ## v0.1.4 (12/09/2022)
 

diff --git a/LICENSE b/LICENSE
@@ -1,22 +1,13 @@
-MIT License
+Copyright 2022 Maciej Nasinski
 
-Copyright (c) 2022, Maciej Nasinski
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
 
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in all
-copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-SOFTWARE.
+    http://www.apache.org/licenses/LICENSE-2.0
 
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
diff --git a/poetry.lock b/poetry.lock
diff --git a/pyproject.toml b/pyproject.toml
@@ -1,15 +1,18 @@
-[tool.poetry]
+[build-system]
+requires = ["setuptools"]
+build-backend = "setuptools.build_meta"
+
+[project]
 name = "cat2cat"
-version = "0.1.4.9006"
+authors = [
+  {name = "Maciej Nasinski", email = "[email protected]"},
+]
 description = "Unifying an inconsistently coded categorical variable in a panel/longitudinal dataset."
-authors = ["Maciej Nasinski"]
-license = "MIT"
 readme = "README.md"
-homepage = "https://github.com/polkas/py-cat2cat"
-repository = "https://github.com/polkas/py-cat2cat"
-documentation = "https://py-cat2cat.readthedocs.io/en/latest/"
+version = "0.1.4.9007"
+requires-python = ">=3.8"
 keywords = ["panel", "categorical", "longtitudal", "inconsistent", "cat2cat"]
-include = ["CHANGELOG.md"]
+license = {text = "Apache-2.0"}
 classifiers = [
 "Development Status :: 3 - Alpha",
 "Programming Language :: Python",
@@ -20,29 +23,37 @@ classifiers = [
 "Programming Language :: Python :: 3.11",
 "Programming Language :: Python :: Implementation :: PyPy",
 ]
+dependencies = [
+  "numpy",
+  "pandas",
+  "scikit-learn",
+  "importlib-resources"
+]
 
-[build-system]
-requires = ["poetry_core>=1.0.0"]
-build-backend = "poetry.core.masonry.api"
+[project.optional-dependencies]
+test = ["pytest", "pytest-cov", "mypy"]
+docs = [
+  "Sphinx", 
+  "myst-parser",
+  "sphinx-autoapi", 
+  "sphinx-rtd-theme"
+]
+build = ["build"]
+benchmark = ["snakeviz"]
+styler = ["flake8", "black"]
+all = ["cat2cat[test,docs,build,benchmark,styler]"]
+
+[project.urls]
+homepage = "https://github.com/polkas/py-cat2cat"
+documentation = "https://py-cat2cat.readthedocs.io/en/latest/"
+repository = "https://github.com/polkas/py-cat2cat"
+changelog = "https://raw.githubusercontent.com/polkas/py-cat2cat/main/CHANGELOG.md"
 
-[tool.poetry.dependencies]
-python = ">=3.8,<3.12"
-numpy = "^1.23.1"
-pandas = "^1.4.3"
-scikit-learn = "^1.1.2"
-importlib-resources = "^5.9.0"
+[tool.setuptools.packages.find]
+where = ["src"]
 
-[tool.poetry.group.dev.dependencies]
-pytest = "^7.1.2"
-flake8 = "^5.0.4"
-black = "^22.6.0"
-pytest-cov = "^3.0.0"
-Sphinx = "^6.2.1"
-sphinx-autoapi = "^3.0.0"
-sphinx-rtd-theme = "^1.3.0"
-myst-parser = "^2.0.0"
-snakeviz = "^2.1.1"
-mypy = "^0.982"
+[tool.setuptools.package-data]
+"cat2cat.data" = ["*"]
 
 [tool.mypy]
 python_version = "3.8"

diff --git a/requirements.txt b/requirements.txt
@@ -0,0 +1,64 @@
+alabaster==0.7.13
+anyascii==0.3.2
+astroid==3.0.2
+Babel==2.14.0
+black==23.12.1
+build==1.0.3
+certifi==2023.11.17
+charset-normalizer==3.3.2
+click==8.1.7
+coverage==7.3.4
+docutils==0.20.1
+exceptiongroup==1.2.0
+flake8==6.1.0
+idna==3.6
+imagesize==1.4.1
+importlib-resources==6.1.1
+iniconfig==2.0.0
+Jinja2==3.1.2
+joblib==1.3.2
+markdown-it-py==3.0.0
+MarkupSafe==2.1.3
+mccabe==0.7.0
+mdit-py-plugins==0.4.0
+mdurl==0.1.2
+mypy==1.8.0
+mypy-extensions==1.0.0
+myst-parser==2.0.0
+numpy==1.26.2
+packaging==23.2
+pandas==2.1.4
+pathspec==0.12.1
+platformdirs==4.1.0
+pluggy==1.3.0
+pycodestyle==2.11.1
+pyflakes==3.1.0
+Pygments==2.17.2
+pyproject_hooks==1.0.0
+pytest==7.4.3
+pytest-cov==4.1.0
+python-dateutil==2.8.2
+pytz==2023.3.post1
+PyYAML==6.0.1
+requests==2.31.0
+scikit-learn==1.3.2
+scipy==1.11.4
+six==1.16.0
+snakeviz==2.2.0
+snowballstemmer==2.2.0
+Sphinx==7.2.6
+sphinx-autoapi==3.0.0
+sphinx-rtd-theme==2.0.0
+sphinxcontrib-applehelp==1.0.7
+sphinxcontrib-devhelp==1.0.5
+sphinxcontrib-htmlhelp==2.0.4
+sphinxcontrib-jquery==4.1
+sphinxcontrib-jsmath==1.0.1
+sphinxcontrib-qthelp==1.0.6
+sphinxcontrib-serializinghtml==1.1.9
+threadpoolctl==3.2.0
+tomli==2.0.1
+tornado==6.4
+typing_extensions==4.9.0
+tzdata==2023.3
+urllib3==2.1.0
diff --git a/src/cat2cat/mappings.py b/src/cat2cat/mappings.py
@@ -3,12 +3,13 @@
 
 from collections.abc import Iterable
 from collections import OrderedDict
-from typing import Union, Optional, Any, List, Dict, Sequence
+from typing import Union, Optional, Any, List, Dict, Sequence, TypeVar
 
 __all__ = ["get_mappings", "cat_apply_freq", "get_freqs"]
 
+Table = TypeVar("Table", DataFrame, ndarray)
 
-def get_mappings(x: Union[DataFrame, ndarray]) -> Dict[str, Dict[Any, List[Any]]]:
+def get_mappings(x: Table) -> Dict[str, Dict[Any, List[Any]]]: 
     """Transforming a mapping table with mappings to two associative lists
 
     Transforming a transition table with mappings to two associative lists
@@ -37,23 +38,56 @@ def get_mappings(x: Union[DataFrame, ndarray]) -> Dict[str, Dict[Any, List[Any]]
     >>> mappings["to_new"]
     {1111.0: [111101.0, 111102.0], 1123.0: [111405.0], 1212.0: [112006.0, 112008.0, 112090.0, nan], nan: [111405.0]}
     """
-
     assert (len(x.shape) == 2) and (
         x.shape[1] == 2
     ), "x should have 2 dimensions and the second one is equal to 2 (columns)"
 
     if isinstance(x, DataFrame):
-        ff = x.iloc[:, 0].copy()
-        which_ff_null = ff.isnull()
-        ff = ff.values
-        ss = x.iloc[:, 1].copy()
-        which_ss_null = ss.isnull()
-        ss = ss.values
+        return get_mappings_df(x)
     elif isinstance(x, ndarray):
-        ff = x[:, 0].copy()
-        ss = x[:, 1].copy()
+        return get_mappings_array(x)
     else:
-        raise (TypeError)
+        raise TypeError("get_mappings input has to be ndarray or DataFrame")
+
+def get_mappings_array(x: ndarray) -> Dict[str, Dict[Any, List[Any]]]:
+    ff = x[:, 0].copy()
+    ss = x[:, 1].copy()
+
+    assert ff.dtype == ss.dtype
+    col_type = ff.dtype
+
+    from_old = list(OrderedDict.fromkeys(ff))
+    from_new = list(OrderedDict.fromkeys(ss))
+
+    to_old = dict()
+    for e in from_new:
+        if (col_type in [float, int]) and isnan(e):
+            idx = isnan(ss)
+        else:
+            idx = ss == e
+
+        # sorted so results are stable
+        to_old[e] = sorted(unique(ff[idx]))
+
+    to_new = dict()
+    for e in from_old:
+        if (col_type in [float, int]) and isnan(e):
+            idx = isnan(ff)
+        else:
+            idx = ff == e
+
+        # sorted so results are stable
+        to_new[e] = sorted(unique(ss[idx]))
+
+    return dict(to_old=to_old, to_new=to_new)
+
+def get_mappings_df(x: DataFrame) -> Dict[str, Dict[Any, List[Any]]]:
+    ff = x.iloc[:, 0].copy()
+    which_ff_null = ff.isnull()
+    ff = ff.values
+    ss = x.iloc[:, 1].copy()
+    which_ss_null = ss.isnull()
+    ss = ss.values
 
     assert ff.dtype == ss.dtype
     col_type = ff.dtype
@@ -87,7 +121,6 @@ def get_mappings(x: Union[DataFrame, ndarray]) -> Dict[str, Dict[Any, List[Any]]
 
     return dict(to_old=to_old, to_new=to_new)
 
-
 def get_freqs(
     x: Sequence[Any], multiplier: Optional[Sequence[int]] = None
 ) -> Dict[Any, int]:

diff --git a/tests/test_cat2cat.py b/tests/test_cat2cat.py
@@ -59,6 +59,7 @@ def int_round(x: float) -> int:
     },
 }
 which_target_origin = {"backward": ("old", "new"), "forward": ("new", "old")}
+code_var_name = {"backward": "code", "forward": "code4"}
 
 
 @pytest.mark.parametrize("direction", ["backward", "forward"])
@@ -146,9 +147,8 @@ def test_cat2cat_custom_freqs(direction, cat_type):
     assert data_dict[cat_type]["old"].equals(o)
     assert data_dict[cat_type]["new"].equals(n)
 
-
 @pytest.mark.parametrize("cat_type", ["str", "int"])
-@pytest.mark.parametrize("direction", ["backward", "forward"])
+@pytest.mark.parametrize("direction", ["backward"])
 def test_cat2cat_ml(direction, cat_type):
     o = data_dict[cat_type]["old"].copy()
     n = data_dict[cat_type]["new"].copy()
@@ -160,7 +160,7 @@ def test_cat2cat_ml(direction, cat_type):
     )
     ml = cat2cat_ml(
         occup.loc[occup.year >= 2010, :].copy(),
-        "code",
+        code_var_name[direction],
         ["salary", "age", "edu", "sex"],
         [DecisionTreeClassifier(), LinearDiscriminantAnalysis()],
     )