From 8f9039d3ae9daa86de0b59ee13e8b97ccf92478b Mon Sep 17 00:00:00 2001 From: Nitish Satyavolu Date: Fri, 10 Jan 2025 09:17:20 -0800 Subject: [PATCH 1/3] BUG: Fix ValueError in DataFrame/Series regex replace for all-NA values (#60691) * BUG: DataFrame/Series regex replace fix for all NA values * Add entry to whatsnew --- doc/source/whatsnew/v3.0.0.rst | 1 + pandas/core/array_algos/replace.py | 3 ++- pandas/tests/frame/methods/test_replace.py | 7 +++++++ pandas/tests/series/methods/test_replace.py | 7 +++++++ 4 files changed, 17 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 94c289ef3ace7..5cc258a54fa48 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -798,6 +798,7 @@ Other - Bug in :meth:`Series.dt` methods in :class:`ArrowDtype` that were returning incorrect values. (:issue:`57355`) - Bug in :meth:`Series.rank` that doesn't preserve missing values for nullable integers when ``na_option='keep'``. (:issue:`56976`) - Bug in :meth:`Series.replace` and :meth:`DataFrame.replace` inconsistently replacing matching instances when ``regex=True`` and missing values are present. (:issue:`56599`) +- Bug in :meth:`Series.replace` and :meth:`DataFrame.replace` throwing ``ValueError`` when ``regex=True`` and all NA values. (:issue:`60688`) - Bug in :meth:`Series.to_string` when series contains complex floats with exponents (:issue:`60405`) - Bug in :meth:`read_csv` where chained fsspec TAR file and ``compression="infer"`` fails with ``tarfile.ReadError`` (:issue:`60028`) - Bug in Dataframe Interchange Protocol implementation was returning incorrect results for data buffers' associated dtype, for string and datetime columns (:issue:`54781`) diff --git a/pandas/core/array_algos/replace.py b/pandas/core/array_algos/replace.py index a9ad66b7cb2e5..debd6368e98a4 100644 --- a/pandas/core/array_algos/replace.py +++ b/pandas/core/array_algos/replace.py @@ -89,7 +89,8 @@ def _check_comparison_types( op = np.vectorize( lambda x: bool(re.search(b, x)) if isinstance(x, str) and isinstance(b, (str, Pattern)) - else False + else False, + otypes=[bool], ) # GH#32621 use mask to avoid comparing to NAs diff --git a/pandas/tests/frame/methods/test_replace.py b/pandas/tests/frame/methods/test_replace.py index b2320798ea9a2..c95806d9316bf 100644 --- a/pandas/tests/frame/methods/test_replace.py +++ b/pandas/tests/frame/methods/test_replace.py @@ -713,6 +713,13 @@ def test_replace_with_None_keeps_categorical(self): ) tm.assert_frame_equal(result, expected) + def test_replace_all_NA(self): + # GH#60688 + df = DataFrame({"ticker": ["#1234#"], "name": [None]}) + result = df.replace({col: {r"^#": "$"} for col in df.columns}, regex=True) + expected = DataFrame({"ticker": ["$1234#"], "name": [None]}) + tm.assert_frame_equal(result, expected) + def test_replace_value_is_none(self, datetime_frame): orig_value = datetime_frame.iloc[0, 0] orig2 = datetime_frame.iloc[1, 0] diff --git a/pandas/tests/series/methods/test_replace.py b/pandas/tests/series/methods/test_replace.py index ecfe3d1b39d31..abd5d075ea3d5 100644 --- a/pandas/tests/series/methods/test_replace.py +++ b/pandas/tests/series/methods/test_replace.py @@ -708,3 +708,10 @@ def test_replace_ea_float_with_bool(self): expected = ser.copy() result = ser.replace(0.0, True) tm.assert_series_equal(result, expected) + + def test_replace_all_NA(self): + # GH#60688 + df = pd.Series([pd.NA, pd.NA]) + result = df.replace({r"^#": "$"}, regex=True) + expected = pd.Series([pd.NA, pd.NA]) + tm.assert_series_equal(result, expected) From 11cc7e06f1cf177a0293222e3bfcb389120aa284 Mon Sep 17 00:00:00 2001 From: Richard Shadrach <45562402+rhshadrach@users.noreply.github.com> Date: Fri, 10 Jan 2025 12:24:06 -0500 Subject: [PATCH 2/3] TST(string dtype): Resolve replace xfails (#60659) * TST(string dtype): Resolve replace xfails * Add test * fixup --- pandas/tests/frame/methods/test_replace.py | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/pandas/tests/frame/methods/test_replace.py b/pandas/tests/frame/methods/test_replace.py index c95806d9316bf..9e302dc5f94ee 100644 --- a/pandas/tests/frame/methods/test_replace.py +++ b/pandas/tests/frame/methods/test_replace.py @@ -334,7 +334,6 @@ def test_regex_replace_str_to_numeric(self, mix_abc): return_value = res3.replace(regex=r"\s*\.\s*", value=0, inplace=True) assert return_value is None expec = DataFrame({"a": mix_abc["a"], "b": ["a", "b", 0, 0], "c": mix_abc["c"]}) - # TODO(infer_string) expec["c"] = expec["c"].astype(object) tm.assert_frame_equal(res, expec) tm.assert_frame_equal(res2, expec) @@ -1476,21 +1475,24 @@ def test_regex_replace_scalar( tm.assert_frame_equal(result, expected) @pytest.mark.parametrize("regex", [False, True]) - def test_replace_regex_dtype_frame(self, regex): + @pytest.mark.parametrize("value", [1, "1"]) + def test_replace_regex_dtype_frame(self, regex, value): # GH-48644 df1 = DataFrame({"A": ["0"], "B": ["0"]}) - expected_df1 = DataFrame({"A": [1], "B": [1]}, dtype=object) - result_df1 = df1.replace(to_replace="0", value=1, regex=regex) + # When value is an integer, coerce result to object. + # When value is a string, infer the correct string dtype. + dtype = object if value == 1 else None + + expected_df1 = DataFrame({"A": [value], "B": [value]}, dtype=dtype) + result_df1 = df1.replace(to_replace="0", value=value, regex=regex) tm.assert_frame_equal(result_df1, expected_df1) df2 = DataFrame({"A": ["0"], "B": ["1"]}) if regex: - # TODO(infer_string): both string columns get cast to object, - # while only needed for column A - expected_df2 = DataFrame({"A": [1], "B": ["1"]}, dtype=object) + expected_df2 = DataFrame({"A": [value], "B": ["1"]}, dtype=dtype) else: - expected_df2 = DataFrame({"A": Series([1], dtype=object), "B": ["1"]}) - result_df2 = df2.replace(to_replace="0", value=1, regex=regex) + expected_df2 = DataFrame({"A": Series([value], dtype=dtype), "B": ["1"]}) + result_df2 = df2.replace(to_replace="0", value=value, regex=regex) tm.assert_frame_equal(result_df2, expected_df2) def test_replace_with_value_also_being_replaced(self): From a81d52fd19062ec3128f43285eff55cc9e76a511 Mon Sep 17 00:00:00 2001 From: Nitish Satyavolu Date: Fri, 10 Jan 2025 09:42:36 -0800 Subject: [PATCH 3/3] ENH: Support kurtosis (kurt) in DataFrameGroupBy and SeriesGroupBy (#60433) * ENH: Support kurtosis (kurt) in DataFrameGroupBy and SeriesGroupBy * ENH: Address review comments * ENH: Fix comments in new test cases * ENH: Skip pyarrow test case if no pyarrow available * ENH: Update to intp instead of np.intp * ENH: Change intp to int64 * Address review comments --- doc/source/whatsnew/v3.0.0.rst | 1 + pandas/_libs/groupby.pyi | 9 + pandas/_libs/groupby.pyx | 98 +++++++++- pandas/core/arrays/base.py | 1 + pandas/core/arrays/categorical.py | 2 +- pandas/core/arrays/datetimelike.py | 6 +- pandas/core/groupby/base.py | 1 + pandas/core/groupby/generic.py | 182 +++++++++++++++++- pandas/core/groupby/ops.py | 8 +- pandas/tests/groupby/methods/test_kurt.py | 90 +++++++++ pandas/tests/groupby/test_api.py | 1 + pandas/tests/groupby/test_apply.py | 1 + pandas/tests/groupby/test_categorical.py | 1 + pandas/tests/groupby/test_groupby.py | 10 +- pandas/tests/groupby/test_numeric_only.py | 3 + pandas/tests/groupby/test_raises.py | 33 +++- pandas/tests/groupby/test_reductions.py | 5 +- .../tests/groupby/transform/test_transform.py | 8 +- 18 files changed, 436 insertions(+), 24 deletions(-) create mode 100644 pandas/tests/groupby/methods/test_kurt.py diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 5cc258a54fa48..47838d1e49d61 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -55,6 +55,7 @@ Other enhancements - :meth:`Series.plot` now correctly handle the ``ylabel`` parameter for pie charts, allowing for explicit control over the y-axis label (:issue:`58239`) - :meth:`DataFrame.plot.scatter` argument ``c`` now accepts a column of strings, where rows with the same string are colored identically (:issue:`16827` and :issue:`16485`) - :func:`read_parquet` accepts ``to_pandas_kwargs`` which are forwarded to :meth:`pyarrow.Table.to_pandas` which enables passing additional keywords to customize the conversion to pandas, such as ``maps_as_pydicts`` to read the Parquet map data type as python dictionaries (:issue:`56842`) +- :meth:`.DataFrameGroupBy.transform`, :meth:`.SeriesGroupBy.transform`, :meth:`.DataFrameGroupBy.agg`, :meth:`.SeriesGroupBy.agg`, :meth:`.SeriesGroupBy.apply`, :meth:`.DataFrameGroupBy.apply` now support ``kurt`` (:issue:`40139`) - :meth:`DataFrameGroupBy.transform`, :meth:`SeriesGroupBy.transform`, :meth:`DataFrameGroupBy.agg`, :meth:`SeriesGroupBy.agg`, :meth:`RollingGroupby.apply`, :meth:`ExpandingGroupby.apply`, :meth:`Rolling.apply`, :meth:`Expanding.apply`, :meth:`DataFrame.apply` with ``engine="numba"`` now supports positional arguments passed as kwargs (:issue:`58995`) - :meth:`Rolling.agg`, :meth:`Expanding.agg` and :meth:`ExponentialMovingWindow.agg` now accept :class:`NamedAgg` aggregations through ``**kwargs`` (:issue:`28333`) - :meth:`Series.map` can now accept kwargs to pass on to func (:issue:`59814`) diff --git a/pandas/_libs/groupby.pyi b/pandas/_libs/groupby.pyi index 53f5f73624232..34367f55d2bbb 100644 --- a/pandas/_libs/groupby.pyi +++ b/pandas/_libs/groupby.pyi @@ -97,6 +97,15 @@ def group_skew( result_mask: np.ndarray | None = ..., skipna: bool = ..., ) -> None: ... +def group_kurt( + out: np.ndarray, # float64_t[:, ::1] + counts: np.ndarray, # int64_t[::1] + values: np.ndarray, # ndarray[float64_T, ndim=2] + labels: np.ndarray, # const intp_t[::1] + mask: np.ndarray | None = ..., + result_mask: np.ndarray | None = ..., + skipna: bool = ..., +) -> None: ... def group_mean( out: np.ndarray, # floating[:, ::1] counts: np.ndarray, # int64_t[::1] diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index d7e485f74e58b..59bc59135a8ff 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -910,7 +910,7 @@ def group_var( @cython.wraparound(False) @cython.boundscheck(False) @cython.cdivision(True) -@cython.cpow +@cython.cpow(True) def group_skew( float64_t[:, ::1] out, int64_t[::1] counts, @@ -961,7 +961,7 @@ def group_skew( isna_entry = _treat_as_na(val, False) if not isna_entry: - # Based on RunningStats::Push from + # Running stats update based on RunningStats::Push from # https://www.johndcook.com/blog/skewness_kurtosis/ n1 = nobs[lab, j] n = n1 + 1 @@ -995,6 +995,100 @@ def group_skew( ) +@cython.wraparound(False) +@cython.boundscheck(False) +@cython.cdivision(True) +@cython.cpow(True) +def group_kurt( + float64_t[:, ::1] out, + int64_t[::1] counts, + ndarray[float64_t, ndim=2] values, + const intp_t[::1] labels, + const uint8_t[:, ::1] mask=None, + uint8_t[:, ::1] result_mask=None, + bint skipna=True, +) -> None: + cdef: + Py_ssize_t i, j, N, K, lab, ngroups = len(counts) + int64_t[:, ::1] nobs + Py_ssize_t len_values = len(values), len_labels = len(labels) + bint isna_entry, uses_mask = mask is not None + float64_t[:, ::1] M1, M2, M3, M4 + float64_t delta, delta_n, delta_n2, term1, val + int64_t n1, n + float64_t ct, num, den, adj + + if len_values != len_labels: + raise ValueError("len(index) != len(labels)") + + nobs = np.zeros((out).shape, dtype=np.int64) + + # M1, M2, M3 and M4 correspond to 1st, 2nd, 3rd and 4th Moments + M1 = np.zeros((out).shape, dtype=np.float64) + M2 = np.zeros((out).shape, dtype=np.float64) + M3 = np.zeros((out).shape, dtype=np.float64) + M4 = np.zeros((out).shape, dtype=np.float64) + + N, K = (values).shape + + out[:, :] = 0.0 + + with nogil: + for i in range(N): + lab = labels[i] + if lab < 0: + continue + + counts[lab] += 1 + + for j in range(K): + val = values[i, j] + + if uses_mask: + isna_entry = mask[i, j] + else: + isna_entry = _treat_as_na(val, False) + + if not isna_entry: + # Running stats update based on RunningStats::Push from + # https://www.johndcook.com/blog/skewness_kurtosis/ + n1 = nobs[lab, j] + n = n1 + 1 + + nobs[lab, j] = n + delta = val - M1[lab, j] + delta_n = delta / n + delta_n2 = delta_n * delta_n + term1 = delta * delta_n * n1 + + M1[lab, j] += delta_n + M4[lab, j] += (term1 * delta_n2 * (n*n - 3*n + 3) + + 6 * delta_n2 * M2[lab, j] + - 4 * delta_n * M3[lab, j]) + M3[lab, j] += term1 * delta_n * (n - 2) - 3 * delta_n * M2[lab, j] + M2[lab, j] += term1 + elif not skipna: + M1[lab, j] = NaN + M2[lab, j] = NaN + M3[lab, j] = NaN + M4[lab, j] = NaN + + for i in range(ngroups): + for j in range(K): + ct = nobs[i, j] + if ct < 4: + if result_mask is not None: + result_mask[i, j] = 1 + out[i, j] = NaN + elif M2[i, j] == 0: + out[i, j] = 0 + else: + num = ct * (ct + 1) * (ct - 1) * M4[i, j] + den = (ct - 2) * (ct - 3) * M2[i, j] ** 2 + adj = 3.0 * (ct - 1) ** 2 / ((ct - 2) * (ct - 3)) + out[i, j] = num / den - adj + + @cython.wraparound(False) @cython.boundscheck(False) def group_mean( diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 4835d808f2433..e831883998098 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -2618,6 +2618,7 @@ def _groupby_op( "sem", "var", "skew", + "kurt", ]: raise TypeError( f"dtype '{self.dtype}' does not support operation '{how}'" diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 99e4cb0545e2d..ae20bfb6b284b 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -2736,7 +2736,7 @@ def _groupby_op( op = WrappedCythonOp(how=how, kind=kind, has_dropped_na=has_dropped_na) dtype = self.dtype - if how in ["sum", "prod", "cumsum", "cumprod", "skew"]: + if how in ["sum", "prod", "cumsum", "cumprod", "skew", "kurt"]: raise TypeError(f"{dtype} type does not support {how} operations") if how in ["min", "max", "rank", "idxmin", "idxmax"] and not dtype.ordered: # raise TypeError instead of NotImplementedError to ensure we diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index c6b6367e347ba..8a79ab53442c3 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -1656,7 +1656,7 @@ def _groupby_op( dtype = self.dtype if dtype.kind == "M": # Adding/multiplying datetimes is not valid - if how in ["sum", "prod", "cumsum", "cumprod", "var", "skew"]: + if how in ["sum", "prod", "cumsum", "cumprod", "var", "skew", "kurt"]: raise TypeError(f"datetime64 type does not support operation '{how}'") if how in ["any", "all"]: # GH#34479 @@ -1667,7 +1667,7 @@ def _groupby_op( elif isinstance(dtype, PeriodDtype): # Adding/multiplying Periods is not valid - if how in ["sum", "prod", "cumsum", "cumprod", "var", "skew"]: + if how in ["sum", "prod", "cumsum", "cumprod", "var", "skew", "kurt"]: raise TypeError(f"Period type does not support {how} operations") if how in ["any", "all"]: # GH#34479 @@ -1677,7 +1677,7 @@ def _groupby_op( ) else: # timedeltas we can add but not multiply - if how in ["prod", "cumprod", "skew", "var"]: + if how in ["prod", "cumprod", "skew", "kurt", "var"]: raise TypeError(f"timedelta64 type does not support {how} operations") # All of the functions implemented here are ordinal, so we can diff --git a/pandas/core/groupby/base.py b/pandas/core/groupby/base.py index bad9749b5ecee..7699fb3d0f864 100644 --- a/pandas/core/groupby/base.py +++ b/pandas/core/groupby/base.py @@ -50,6 +50,7 @@ class OutputKey: "sem", "size", "skew", + "kurt", "std", "sum", "var", diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index f4e3f3e8b1001..1251403db6ff3 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -1272,13 +1272,86 @@ def skew( Name: Max Speed, dtype: float64 """ + return self._cython_agg_general( + "skew", alt=None, skipna=skipna, numeric_only=numeric_only, **kwargs + ) + + def kurt( + self, + skipna: bool = True, + numeric_only: bool = False, + **kwargs, + ) -> Series: + """ + Return unbiased kurtosis within groups. + + Parameters + ---------- + skipna : bool, default True + Exclude NA/null values when computing the result. + + numeric_only : bool, default False + Include only float, int, boolean columns. Not implemented for Series. + + **kwargs + Additional keyword arguments to be passed to the function. + + Returns + ------- + Series + Unbiased kurtosis within groups. + + See Also + -------- + Series.kurt : Return unbiased kurtosis over requested axis. + + Examples + -------- + >>> ser = pd.Series( + ... [390.0, 350.0, 357.0, 333.0, np.nan, 22.0, 20.0, 30.0, 40.0, 41.0], + ... index=[ + ... "Falcon", + ... "Falcon", + ... "Falcon", + ... "Falcon", + ... "Falcon", + ... "Parrot", + ... "Parrot", + ... "Parrot", + ... "Parrot", + ... "Parrot", + ... ], + ... name="Max Speed", + ... ) + >>> ser + Falcon 390.0 + Falcon 350.0 + Falcon 357.0 + Falcon 333.0 + Falcon NaN + Parrot 22.0 + Parrot 20.0 + Parrot 30.0 + Parrot 40.0 + Parrot 41.0 + Name: Max Speed, dtype: float64 + >>> ser.groupby(level=0).kurt() + Falcon 1.622109 + Parrot -2.878714 + Name: Max Speed, dtype: float64 + >>> ser.groupby(level=0).kurt(skipna=False) + Falcon NaN + Parrot -2.878714 + Name: Max Speed, dtype: float64 + """ + def alt(obj): # This should not be reached since the cython path should raise # TypeError and not NotImplementedError. - raise TypeError(f"'skew' is not supported for dtype={obj.dtype}") + raise TypeError(f"'kurt' is not supported for dtype={obj.dtype}") return self._cython_agg_general( - "skew", alt=alt, skipna=skipna, numeric_only=numeric_only, **kwargs + "kurt", alt=alt, skipna=skipna, numeric_only=numeric_only, **kwargs ) @property @@ -2921,6 +2994,111 @@ def alt(obj): "skew", alt=alt, skipna=skipna, numeric_only=numeric_only, **kwargs ) + def kurt( + self, + skipna: bool = True, + numeric_only: bool = False, + **kwargs, + ) -> DataFrame: + """ + Return unbiased kurtosis within groups. + + Parameters + ---------- + skipna : bool, default True + Exclude NA/null values when computing the result. + + numeric_only : bool, default False + Include only float, int, boolean columns. + + **kwargs + Additional keyword arguments to be passed to the function. + + Returns + ------- + DataFrame + Unbiased kurtosis within groups. + + See Also + -------- + DataFrame.kurt : Return unbiased kurtosis over requested axis. + + Examples + -------- + >>> arrays = [ + ... [ + ... "falcon", + ... "parrot", + ... "cockatoo", + ... "kiwi", + ... "eagle", + ... "lion", + ... "monkey", + ... "rabbit", + ... "dog", + ... "wolf", + ... ], + ... [ + ... "bird", + ... "bird", + ... "bird", + ... "bird", + ... "bird", + ... "mammal", + ... "mammal", + ... "mammal", + ... "mammal", + ... "mammal", + ... ], + ... ] + >>> index = pd.MultiIndex.from_arrays(arrays, names=("name", "class")) + >>> df = pd.DataFrame( + ... { + ... "max_speed": [ + ... 389.0, + ... 24.0, + ... 70.0, + ... np.nan, + ... 350.0, + ... 80.5, + ... 21.5, + ... 15.0, + ... 40.0, + ... 50.0, + ... ] + ... }, + ... index=index, + ... ) + >>> df + max_speed + name class + falcon bird 389.0 + parrot bird 24.0 + cockatoo bird 70.0 + kiwi bird NaN + eagle bird 350.0 + lion mammal 80.5 + monkey mammal 21.5 + rabbit mammal 15.0 + dog mammal 40.0 + wolf mammal 50.0 + >>> gb = df.groupby(["class"]) + >>> gb.kurt() + max_speed + class + bird -5.493277 + mammal 0.204125 + >>> gb.kurt(skipna=False) + max_speed + class + bird NaN + mammal 0.204125 + """ + + return self._cython_agg_general( + "kurt", alt=None, skipna=skipna, numeric_only=numeric_only, **kwargs + ) + @property @doc(DataFrame.plot.__doc__) def plot(self) -> GroupByPlot: diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 4c7fe604e452d..c4c7f73ee166c 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -144,6 +144,7 @@ def __init__(self, kind: str, how: str, has_dropped_na: bool) -> None: "std": functools.partial(libgroupby.group_var, name="std"), "sem": functools.partial(libgroupby.group_var, name="sem"), "skew": "group_skew", + "kurt": "group_kurt", "first": "group_nth", "last": "group_last", "ohlc": "group_ohlc", @@ -193,7 +194,7 @@ def _get_cython_function( elif how in ["std", "sem", "idxmin", "idxmax"]: # We have a partial object that does not have __signatures__ return f - elif how == "skew": + elif how in ["skew", "kurt"]: # _get_cython_vals will convert to float64 pass elif "object" not in f.__signatures__: @@ -224,7 +225,7 @@ def _get_cython_vals(self, values: np.ndarray) -> np.ndarray: """ how = self.how - if how in ["median", "std", "sem", "skew"]: + if how in ["median", "std", "sem", "skew", "kurt"]: # median only has a float64 implementation # We should only get here with is_numeric, as non-numeric cases # should raise in _get_cython_function @@ -453,7 +454,7 @@ def _call_cython_op( **kwargs, ) result = result.astype(bool, copy=False) - elif self.how in ["skew"]: + elif self.how in ["skew", "kurt"]: func( out=result, counts=counts, @@ -1021,6 +1022,7 @@ def apply_groupwise( # getattr pattern for __name__ is needed for functools.partial objects if len(group_keys) == 0 and getattr(f, "__name__", None) in [ "skew", + "kurt", "sum", "prod", ]: diff --git a/pandas/tests/groupby/methods/test_kurt.py b/pandas/tests/groupby/methods/test_kurt.py new file mode 100644 index 0000000000000..21b7c50c3c5aa --- /dev/null +++ b/pandas/tests/groupby/methods/test_kurt.py @@ -0,0 +1,90 @@ +import numpy as np +import pytest + +import pandas.util._test_decorators as td + +import pandas as pd +import pandas._testing as tm + + +def test_groupby_kurt_equivalence(): + # GH#40139 + # Test that that groupby kurt method (which uses libgroupby.group_kurt) + # matches the results of operating group-by-group (which uses nanops.nankurt) + nrows = 1000 + ngroups = 3 + ncols = 2 + nan_frac = 0.05 + + arr = np.random.default_rng(2).standard_normal((nrows, ncols)) + arr[np.random.default_rng(2).random(nrows) < nan_frac] = np.nan + + df = pd.DataFrame(arr) + grps = np.random.default_rng(2).integers(0, ngroups, size=nrows) + gb = df.groupby(grps) + + result = gb.kurt() + + grpwise = [grp.kurt().to_frame(i).T for i, grp in gb] + expected = pd.concat(grpwise, axis=0) + expected.index = expected.index.astype("int64") # 32bit builds + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + "dtype", + [ + pytest.param("float64[pyarrow]", marks=td.skip_if_no("pyarrow")), + "Float64", + ], +) +def test_groupby_kurt_arrow_float64(dtype): + # GH#40139 + # Test groupby.kurt() with float64[pyarrow] and Float64 dtypes + df = pd.DataFrame( + { + "x": [1.0, np.nan, 3.2, 4.8, 2.3, 1.9, 8.9], + "y": [1.6, 3.3, 3.2, 6.8, 1.3, 2.9, 9.0], + }, + dtype=dtype, + ) + gb = df.groupby(by=lambda x: 0) + + result = gb.kurt() + expected = pd.DataFrame({"x": [2.1644713], "y": [0.1513969]}, dtype=dtype) + tm.assert_almost_equal(result, expected) + + +def test_groupby_kurt_noskipna(): + # GH#40139 + # Test groupby.kurt() with skipna = False + df = pd.DataFrame( + { + "x": [1.0, np.nan, 3.2, 4.8, 2.3, 1.9, 8.9], + "y": [1.6, 3.3, 3.2, 6.8, 1.3, 2.9, 9.0], + } + ) + gb = df.groupby(by=lambda x: 0) + + result = gb.kurt(skipna=False) + expected = pd.DataFrame({"x": [np.nan], "y": [0.1513969]}) + tm.assert_almost_equal(result, expected) + + +def test_groupby_kurt_all_ones(): + # GH#40139 + # Test groupby.kurt() with constant values + df = pd.DataFrame( + { + "x": [1.0] * 10, + } + ) + gb = df.groupby(by=lambda x: 0) + + result = gb.kurt(skipna=False) + expected = pd.DataFrame( + { + "x": [0.0], # Same behavior as pd.DataFrame.kurt() + } + ) + tm.assert_almost_equal(result, expected) diff --git a/pandas/tests/groupby/test_api.py b/pandas/tests/groupby/test_api.py index 013b308cd14cd..baec3ed1a5024 100644 --- a/pandas/tests/groupby/test_api.py +++ b/pandas/tests/groupby/test_api.py @@ -74,6 +74,7 @@ def test_tab_completion(multiindex_dataframe_random_data): "all", "shift", "skew", + "kurt", "take", "pct_change", "any", diff --git a/pandas/tests/groupby/test_apply.py b/pandas/tests/groupby/test_apply.py index 62d4a0ddcc0f5..294ab14c96de8 100644 --- a/pandas/tests/groupby/test_apply.py +++ b/pandas/tests/groupby/test_apply.py @@ -1381,6 +1381,7 @@ def test_result_name_when_one_group(name): ("apply", lambda gb: gb.values[-1]), ("apply", lambda gb: gb["b"].iloc[0]), ("agg", "skew"), + ("agg", "kurt"), ("agg", "prod"), ("agg", "sum"), ], diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py index 656a61de5d105..20309e852a556 100644 --- a/pandas/tests/groupby/test_categorical.py +++ b/pandas/tests/groupby/test_categorical.py @@ -61,6 +61,7 @@ def f(a): "sem": np.nan, "size": 0, "skew": np.nan, + "kurt": np.nan, "std": np.nan, "sum": 0, "var": np.nan, diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index c4c1e7bd9ac4f..36eb2f119ae25 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -1710,7 +1710,7 @@ def test_pivot_table_values_key_error(): ) @pytest.mark.parametrize("method", ["attr", "agg", "apply"]) @pytest.mark.parametrize( - "op", ["idxmax", "idxmin", "min", "max", "sum", "prod", "skew"] + "op", ["idxmax", "idxmin", "min", "max", "sum", "prod", "skew", "kurt"] ) def test_empty_groupby(columns, keys, values, method, op, dropna, using_infer_string): # GH8093 & GH26411 @@ -1786,7 +1786,7 @@ def get_categorical_invalid_expected(): tm.assert_equal(result, expected) return - if op in ["prod", "sum", "skew"]: + if op in ["prod", "sum", "skew", "kurt"]: # ops that require more than just ordered-ness if is_dt64 or is_cat or is_per or (is_str and op != "sum"): # GH#41291 @@ -1799,15 +1799,15 @@ def get_categorical_invalid_expected(): msg = f"dtype 'str' does not support operation '{op}'" else: msg = "category type does not support" - if op == "skew": - msg = "|".join([msg, "does not support operation 'skew'"]) + if op in ["skew", "kurt"]: + msg = "|".join([msg, f"does not support operation '{op}'"]) with pytest.raises(TypeError, match=msg): get_result() if not isinstance(columns, list): # i.e. SeriesGroupBy return - elif op == "skew": + elif op in ["skew", "kurt"]: # TODO: test the numeric_only=True case return else: diff --git a/pandas/tests/groupby/test_numeric_only.py b/pandas/tests/groupby/test_numeric_only.py index 0779faa8d8975..99a88a5d8fe7c 100644 --- a/pandas/tests/groupby/test_numeric_only.py +++ b/pandas/tests/groupby/test_numeric_only.py @@ -244,6 +244,7 @@ def _check(self, df, method, expected_columns, expected_columns_numeric): ("quantile", True), ("sem", True), ("skew", True), + ("kurt", True), ("std", True), ("sum", True), ("var", True), @@ -378,6 +379,7 @@ def test_deprecate_numeric_only_series(dtype, groupby_func, request): "max", "prod", "skew", + "kurt", ) # Test default behavior; kernels that fail may be enabled in the future but kernels @@ -407,6 +409,7 @@ def test_deprecate_numeric_only_series(dtype, groupby_func, request): "quantile", "sem", "skew", + "kurt", "std", "sum", "var", diff --git a/pandas/tests/groupby/test_raises.py b/pandas/tests/groupby/test_raises.py index 789105c275625..ba13d3bd7278f 100644 --- a/pandas/tests/groupby/test_raises.py +++ b/pandas/tests/groupby/test_raises.py @@ -171,6 +171,7 @@ def test_groupby_raises_string( "shift": (None, ""), "size": (None, ""), "skew": (ValueError, "could not convert string to float"), + "kurt": (ValueError, "could not convert string to float"), "std": (ValueError, "could not convert string to float"), "sum": (None, ""), "var": ( @@ -190,10 +191,11 @@ def test_groupby_raises_string( "sem", "var", "skew", + "kurt", "quantile", ]: msg = f"dtype 'str' does not support operation '{groupby_func}'" - if groupby_func in ["sem", "std", "skew"]: + if groupby_func in ["sem", "std", "skew", "kurt"]: # The object-dtype raises ValueError when trying to convert to numeric. klass = TypeError elif groupby_func == "pct_change" and df["d"].dtype.storage == "pyarrow": @@ -323,6 +325,15 @@ def test_groupby_raises_datetime( ] ), ), + "kurt": ( + TypeError, + "|".join( + [ + r"dtype datetime64\[ns\] does not support operation", + "datetime64 type does not support operation 'kurt'", + ] + ), + ), "std": (None, ""), "sum": (TypeError, "datetime64 type does not support operation 'sum"), "var": (TypeError, "datetime64 type does not support operation 'var'"), @@ -372,7 +383,7 @@ def test_groupby_raises_datetime_np( _call_and_check(klass, msg, how, gb, groupby_func_np, ()) -@pytest.mark.parametrize("func", ["prod", "cumprod", "skew", "var"]) +@pytest.mark.parametrize("func", ["prod", "cumprod", "skew", "kurt", "var"]) def test_groupby_raises_timedelta(func): df = DataFrame( { @@ -502,6 +513,15 @@ def test_groupby_raises_category( ] ), ), + "kurt": ( + TypeError, + "|".join( + [ + "dtype category does not support operation 'kurt'", + "category type does not support kurt operations", + ] + ), + ), "std": ( TypeError, "|".join( @@ -676,6 +696,15 @@ def test_groupby_raises_category_on_category( ] ), ), + "kurt": ( + TypeError, + "|".join( + [ + "category type does not support kurt operations", + "dtype category does not support operation 'kurt'", + ] + ), + ), "std": ( TypeError, "|".join( diff --git a/pandas/tests/groupby/test_reductions.py b/pandas/tests/groupby/test_reductions.py index 51c7eab2bfa82..a17200c123d22 100644 --- a/pandas/tests/groupby/test_reductions.py +++ b/pandas/tests/groupby/test_reductions.py @@ -1114,6 +1114,7 @@ def test_apply_to_nullable_integer_returns_float(values, function): "median", "mean", "skew", + "kurt", "std", "var", "sem", @@ -1127,8 +1128,8 @@ def test_regression_allowlist_methods(op, skipna, sort): grouped = frame.groupby(level=0, sort=sort) - if op == "skew": - # skew has skipna + if op in ["skew", "kurt"]: + # skew and kurt have skipna result = getattr(grouped, op)(skipna=skipna) expected = frame.groupby(level=0).apply(lambda h: getattr(h, op)(skipna=skipna)) if sort: diff --git a/pandas/tests/groupby/transform/test_transform.py b/pandas/tests/groupby/transform/test_transform.py index 888b97f2e0206..fecd20fd6cece 100644 --- a/pandas/tests/groupby/transform/test_transform.py +++ b/pandas/tests/groupby/transform/test_transform.py @@ -1088,13 +1088,13 @@ def test_transform_agg_by_name(request, reduction_func, frame_or_series): func = reduction_func obj = DataFrame( - {"a": [0, 0, 0, 1, 1, 1], "b": range(6)}, - index=["A", "B", "C", "D", "E", "F"], + {"a": [0, 0, 0, 0, 1, 1, 1, 1], "b": range(8)}, + index=["A", "B", "C", "D", "E", "F", "G", "H"], ) if frame_or_series is Series: obj = obj["a"] - g = obj.groupby(np.repeat([0, 1], 3)) + g = obj.groupby(np.repeat([0, 1], 4)) if func == "corrwith" and isinstance(obj, Series): # GH#32293 # TODO: implement SeriesGroupBy.corrwith @@ -1119,7 +1119,7 @@ def test_transform_agg_by_name(request, reduction_func, frame_or_series): tm.assert_index_equal(result.columns, obj.columns) # verify that values were broadcasted across each group - assert len(set(DataFrame(result).iloc[-3:, -1])) == 1 + assert len(set(DataFrame(result).iloc[-4:, -1])) == 1 def test_transform_lambda_with_datetimetz():