From bfb210f1e18ea007ad4fd246147b1fd82c79e9aa Mon Sep 17 00:00:00 2001 From: Calvin Che Date: Thu, 29 Aug 2024 11:56:01 +0800 Subject: [PATCH] Own implementation min max scaler --- mypy.ini | 4 -- pyproject.toml | 8 ++-- .../__init__.py | 0 .../data_processing/scaler/__init__.py | 0 .../scaler/dataframe_min_max_scaler.py} | 30 ++++++++++---- .../data_processing/scaler/min_max_scaler.py | 40 +++++++++++++++++++ 6 files changed, 67 insertions(+), 15 deletions(-) rename src/synnax_shared/{data_processing_modules => data_processing}/__init__.py (100%) create mode 100644 src/synnax_shared/data_processing/scaler/__init__.py rename src/synnax_shared/{data_processing_modules/MultipleColumnsMinMaxScaler.py => data_processing/scaler/dataframe_min_max_scaler.py} (58%) create mode 100644 src/synnax_shared/data_processing/scaler/min_max_scaler.py diff --git a/mypy.ini b/mypy.ini index 8d10f0e..c27c4fc 100644 --- a/mypy.ini +++ b/mypy.ini @@ -1,6 +1,2 @@ [mypy] disable_error_code = annotation-unchecked - -[mypy-sklearn.*] -; TechDebt: Should implement typings -ignore_missing_imports = True diff --git a/pyproject.toml b/pyproject.toml index 2de3eb3..a9aea34 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "hatchling.build" [project] name = "synnax-shared" -version = "1.5.2" +version = "1.6.0" description = "Synnax shared Python pacakges" readme = "README.md" requires-python = ">=3.11" @@ -17,9 +17,11 @@ helpers = [] http-client = ["requests", "types-requests", "retry", "types-retry"] logging-json-formatter = ["python-json-logger"] system-token-issuer = ["boto3", "boto3-stubs[lambda]", "PyJWT"] -data-processing-modules = ["scikit-learn==1.5.1"] +data-processing = ["pandas", "pandas-stubs"] dev = ["mypy", "black"] -all = ["synnax-shared[events,helpers,http-client,logging-json-formatter,system-token-issuer,data-processing-modules,dev]"] +all = [ + 
"synnax-shared[events,helpers,http-client,logging-json-formatter,system-token-issuer,data-processing,dev]", +] [tool.hatch.build.targets.sdist] include = ["/src"] diff --git a/src/synnax_shared/data_processing_modules/__init__.py b/src/synnax_shared/data_processing/__init__.py similarity index 100% rename from src/synnax_shared/data_processing_modules/__init__.py rename to src/synnax_shared/data_processing/__init__.py diff --git a/src/synnax_shared/data_processing/scaler/__init__.py b/src/synnax_shared/data_processing/scaler/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/synnax_shared/data_processing_modules/MultipleColumnsMinMaxScaler.py b/src/synnax_shared/data_processing/scaler/dataframe_min_max_scaler.py similarity index 58% rename from src/synnax_shared/data_processing_modules/MultipleColumnsMinMaxScaler.py rename to src/synnax_shared/data_processing/scaler/dataframe_min_max_scaler.py index 88ed009..97ad0b8 100644 --- a/src/synnax_shared/data_processing_modules/MultipleColumnsMinMaxScaler.py +++ b/src/synnax_shared/data_processing/scaler/dataframe_min_max_scaler.py @@ -1,15 +1,18 @@ -from sklearn.preprocessing import MinMaxScaler +from typing import Dict +from pandas import DataFrame -# TechDebt: We cannot modify or move this file because we pickle it. -# This class needs to be properly serialized. 
+from synnax_shared.data_processing.scaler.min_max_scaler import (
+    MinMaxScaler,
+    MinMaxScalerDto,
+)
 
 
-class MultipleColumnsMinMaxScaler:
+class DataFrameMinMaxScaler:
 
     def __init__(self):
-        self.scalers = {}
+        self.scalers: Dict[str, MinMaxScaler] = {}
 
-    def fit_transform(self, df, col=None):
+    def fit_transform(self, df: DataFrame, col=None):
         df = df.copy()
         if col is None:
             for col in df.select_dtypes(exclude="O"):
@@ -22,7 +25,7 @@ def fit_transform(self, df, col=None):
             self.scalers[col] = scaler
         return df
 
-    def transform(self, df, col=None):
+    def transform(self, df: DataFrame, col=None):
         df = df.copy()
         if col is None:
             for col in df.select_dtypes(exclude="O"):
@@ -31,7 +34,7 @@ def transform(self, df, col=None):
             df[col] = self.scalers[col].transform(df[col].values.reshape(-1, 1))
         return df
 
-    def inverse_transform(self, df, col=None):
+    def inverse_transform(self, df: DataFrame, col=None):
         df = df.copy()
         if col is None:
             for col in df.select_dtypes(exclude="O"):
@@ -41,3 +44,14 @@ def inverse_transform(self, df, col=None):
         else:
             df[col] = self.scalers[col].inverse_transform(df[col].values.reshape(-1, 1))
         return df
+
+    def toDto(self) -> Dict[str, MinMaxScalerDto]:
+        return {col: scaler.toDto() for col, scaler in self.scalers.items()}
+
+    @staticmethod
+    def fromDto(dto: Dict[str, MinMaxScalerDto]):
+        scaler = DataFrameMinMaxScaler()
+        scaler.scalers = {
+            col: MinMaxScaler.fromDto(scalerDto) for col, scalerDto in dto.items()
+        }
+        return scaler
diff --git a/src/synnax_shared/data_processing/scaler/min_max_scaler.py b/src/synnax_shared/data_processing/scaler/min_max_scaler.py
new file mode 100644
index 0000000..750ffe4
--- /dev/null
+++ b/src/synnax_shared/data_processing/scaler/min_max_scaler.py
@@ -0,0 +1,60 @@
+from typing import TypedDict
+from pandas import Series
+
+
+class MinMaxScalerDto(TypedDict):
+    """Serializable state of a fitted MinMaxScaler."""
+
+    min: float
+    max: float
+
+
+class MinMaxScaler:
+    """Rescale values so the fitted min maps to 0 and the fitted max to 1.
+
+    Inputs only need ``min()``/``max()`` and element-wise arithmetic, so a pandas
+    Series or a NumPy array both work.
+    """
+
+    def __init__(self, min: float | None = None, max: float | None = None):
+        # Both bounds stay None until the scaler is fitted or restored from a DTO.
+        self.min = min
+        self.max = max
+
+    def _bounds(self):
+        """Return ``(min, span)``; raise ValueError when the scaler is not fitted."""
+        if self.min is None or self.max is None:
+            raise ValueError("MinMaxScaler not fitted")
+        span = self.max - self.min
+        # A constant input has zero span and dividing by it yields NaN/inf.
+        # Use a unit span instead so every value scales to 0 and inverts to min.
+        if span == 0:
+            span = 1
+        return self.min, span
+
+    def fit_transform(self, series: Series) -> Series:
+        """Learn the bounds from ``series`` and return it scaled."""
+        self.min = series.min()
+        self.max = series.max()
+        return self.transform(series)
+
+    def transform(self, series: Series) -> Series:
+        """Scale ``series`` with the fitted bounds; raise ValueError if unfitted."""
+        low, span = self._bounds()
+        return (series - low) / span
+
+    def inverse_transform(self, series: Series) -> Series:
+        """Undo ``transform``; raise ValueError if unfitted."""
+        low, span = self._bounds()
+        return series * span + low
+
+    def toDto(self) -> MinMaxScalerDto:
+        """Export the fitted bounds; raise ValueError if unfitted."""
+        if self.min is None or self.max is None:
+            raise ValueError("MinMaxScaler not fitted")
+        return {"min": self.min, "max": self.max}
+
+    @staticmethod
+    def fromDto(dto: MinMaxScalerDto):
+        """Rebuild a scaler from bounds produced by ``toDto``."""
+        return MinMaxScaler(dto["min"], dto["max"])