Skip to content

Commit

Permalink
ENH: Support kurtosis (kurt) in DataFrameGroupBy and SeriesGroupBy (#…
Browse files Browse the repository at this point in the history
…60433)

* ENH: Support kurtosis (kurt) in DataFrameGroupBy and SeriesGroupBy

* ENH: Address review comments

* ENH: Fix comments in new test cases

* ENH: Skip pyarrow test case if no pyarrow available

* ENH: Update to intp instead of np.intp

* ENH: Change intp to int64

* Address review comments
  • Loading branch information
snitish authored Jan 10, 2025
1 parent 11cc7e0 commit a81d52f
Show file tree
Hide file tree
Showing 18 changed files with 436 additions and 24 deletions.
1 change: 1 addition & 0 deletions doc/source/whatsnew/v3.0.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,7 @@ Other enhancements
- :meth:`Series.plot` now correctly handles the ``ylabel`` parameter for pie charts, allowing for explicit control over the y-axis label (:issue:`58239`)
- :meth:`DataFrame.plot.scatter` argument ``c`` now accepts a column of strings, where rows with the same string are colored identically (:issue:`16827` and :issue:`16485`)
- :func:`read_parquet` accepts ``to_pandas_kwargs`` which are forwarded to :meth:`pyarrow.Table.to_pandas` which enables passing additional keywords to customize the conversion to pandas, such as ``maps_as_pydicts`` to read the Parquet map data type as python dictionaries (:issue:`56842`)
- :meth:`.DataFrameGroupBy.transform`, :meth:`.SeriesGroupBy.transform`, :meth:`.DataFrameGroupBy.agg`, :meth:`.SeriesGroupBy.agg`, :meth:`.SeriesGroupBy.apply`, :meth:`.DataFrameGroupBy.apply` now support ``kurt`` (:issue:`40139`)
- :meth:`DataFrameGroupBy.transform`, :meth:`SeriesGroupBy.transform`, :meth:`DataFrameGroupBy.agg`, :meth:`SeriesGroupBy.agg`, :meth:`RollingGroupby.apply`, :meth:`ExpandingGroupby.apply`, :meth:`Rolling.apply`, :meth:`Expanding.apply`, :meth:`DataFrame.apply` with ``engine="numba"`` now support positional arguments passed as kwargs (:issue:`58995`)
- :meth:`Rolling.agg`, :meth:`Expanding.agg` and :meth:`ExponentialMovingWindow.agg` now accept :class:`NamedAgg` aggregations through ``**kwargs`` (:issue:`28333`)
- :meth:`Series.map` can now accept kwargs to pass on to func (:issue:`59814`)
Expand Down
9 changes: 9 additions & 0 deletions pandas/_libs/groupby.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,15 @@ def group_skew(
result_mask: np.ndarray | None = ...,
skipna: bool = ...,
) -> None: ...
def group_kurt(
    out: np.ndarray,  # float64_t[:, ::1]
    counts: np.ndarray,  # int64_t[::1]
    values: np.ndarray,  # ndarray[float64_t, ndim=2]
    labels: np.ndarray,  # const intp_t[::1]
    mask: np.ndarray | None = ...,  # const uint8_t[:, ::1]
    result_mask: np.ndarray | None = ...,  # uint8_t[:, ::1]
    skipna: bool = ...,
) -> None: ...
def group_mean(
out: np.ndarray, # floating[:, ::1]
counts: np.ndarray, # int64_t[::1]
Expand Down
98 changes: 96 additions & 2 deletions pandas/_libs/groupby.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -910,7 +910,7 @@ def group_var(
@cython.wraparound(False)
@cython.boundscheck(False)
@cython.cdivision(True)
@cython.cpow
@cython.cpow(True)
def group_skew(
float64_t[:, ::1] out,
int64_t[::1] counts,
Expand Down Expand Up @@ -961,7 +961,7 @@ def group_skew(
isna_entry = _treat_as_na(val, False)

if not isna_entry:
# Based on RunningStats::Push from
# Running stats update based on RunningStats::Push from
# https://www.johndcook.com/blog/skewness_kurtosis/
n1 = nobs[lab, j]
n = n1 + 1
Expand Down Expand Up @@ -995,6 +995,100 @@ def group_skew(
)


@cython.wraparound(False)
@cython.boundscheck(False)
@cython.cdivision(True)
@cython.cpow(True)
def group_kurt(
    float64_t[:, ::1] out,
    int64_t[::1] counts,
    ndarray[float64_t, ndim=2] values,
    const intp_t[::1] labels,
    const uint8_t[:, ::1] mask=None,
    uint8_t[:, ::1] result_mask=None,
    bint skipna=True,
) -> None:
    """
    Compute the bias-corrected kurtosis of each column within each group.

    Results are written in place; nothing is returned.

    Parameters
    ----------
    out : float64 2D memoryview
        Output buffer, indexed as ``out[group, column]``. It is zeroed on
        entry and then filled with the kurtosis of every (group, column).
    counts : int64 1D memoryview
        Per-group row counter; its length defines the number of groups.
        Incremented once for every row whose label is non-negative,
        regardless of whether the row's values are missing.
    values : ndarray[float64, ndim=2]
        Input data, one row per observation and one column per series.
    labels : intp 1D memoryview
        Group index for each row of ``values``. Rows with a negative label
        are skipped entirely.
    mask : uint8 2D memoryview, optional
        When given, a non-zero entry marks the matching element of
        ``values`` as missing. When omitted, missingness is decided by
        ``_treat_as_na(val, False)``.
    result_mask : uint8 2D memoryview, optional
        When given, set to 1 for every (group, column) that has fewer than
        four non-missing observations.
    skipna : bool, default True
        If False, a single missing observation turns the running moments
        of its (group, column) into NaN, so the final result is NaN.

    Raises
    ------
    ValueError
        If ``values`` and ``labels`` have different lengths.
    """
    cdef:
        Py_ssize_t i, j, N, K, lab, ngroups = len(counts)
        int64_t[:, ::1] nobs
        Py_ssize_t len_values = len(values), len_labels = len(labels)
        bint isna_entry, uses_mask = mask is not None
        float64_t[:, ::1] M1, M2, M3, M4
        float64_t delta, delta_n, delta_n2, term1, val
        int64_t n1, n
        float64_t ct, num, den, adj

    if len_values != len_labels:
        raise ValueError("len(index) != len(labels)")

    # Number of non-missing observations seen so far per (group, column).
    nobs = np.zeros((<object>out).shape, dtype=np.int64)

    # M1, M2, M3 and M4 correspond to 1st, 2nd, 3rd and 4th Moments
    M1 = np.zeros((<object>out).shape, dtype=np.float64)
    M2 = np.zeros((<object>out).shape, dtype=np.float64)
    M3 = np.zeros((<object>out).shape, dtype=np.float64)
    M4 = np.zeros((<object>out).shape, dtype=np.float64)

    N, K = (<object>values).shape

    out[:, :] = 0.0

    with nogil:
        for i in range(N):
            lab = labels[i]
            if lab < 0:
                # Negative labels mark rows that belong to no group.
                continue

            counts[lab] += 1

            for j in range(K):
                val = values[i, j]

                if uses_mask:
                    isna_entry = mask[i, j]
                else:
                    isna_entry = _treat_as_na(val, False)

                if not isna_entry:
                    # Running stats update based on RunningStats::Push from
                    # https://www.johndcook.com/blog/skewness_kurtosis/
                    n1 = nobs[lab, j]
                    n = n1 + 1

                    nobs[lab, j] = n
                    delta = val - M1[lab, j]
                    delta_n = delta / n
                    delta_n2 = delta_n * delta_n
                    term1 = delta * delta_n * n1

                    # Order matters: the M4 update reads the previous M3 and
                    # M2, and the M3 update reads the previous M2, so the
                    # higher moments must be updated before the lower ones.
                    M1[lab, j] += delta_n
                    M4[lab, j] += (term1 * delta_n2 * (n*n - 3*n + 3)
                                   + 6 * delta_n2 * M2[lab, j]
                                   - 4 * delta_n * M3[lab, j])
                    M3[lab, j] += term1 * delta_n * (n - 2) - 3 * delta_n * M2[lab, j]
                    M2[lab, j] += term1
                elif not skipna:
                    # Poison the accumulators: every later update and the
                    # final formula then propagate NaN for this cell.
                    M1[lab, j] = NaN
                    M2[lab, j] = NaN
                    M3[lab, j] = NaN
                    M4[lab, j] = NaN

        for i in range(ngroups):
            for j in range(K):
                ct = <float64_t>nobs[i, j]
                if ct < 4:
                    # The (ct - 2) * (ct - 3) denominator below needs at
                    # least four observations.
                    if result_mask is not None:
                        result_mask[i, j] = 1
                    out[i, j] = NaN
                elif M2[i, j] == 0:
                    # Zero accumulated squared deviation: report 0 instead
                    # of dividing by zero.
                    out[i, j] = 0
                else:
                    # Bias-corrected excess kurtosis expressed with the
                    # raw accumulated sums M2 and M4.
                    num = ct * (ct + 1) * (ct - 1) * M4[i, j]
                    den = (ct - 2) * (ct - 3) * M2[i, j] ** 2
                    adj = 3.0 * (ct - 1) ** 2 / ((ct - 2) * (ct - 3))
                    out[i, j] = num / den - adj


@cython.wraparound(False)
@cython.boundscheck(False)
def group_mean(
Expand Down
1 change: 1 addition & 0 deletions pandas/core/arrays/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -2618,6 +2618,7 @@ def _groupby_op(
"sem",
"var",
"skew",
"kurt",
]:
raise TypeError(
f"dtype '{self.dtype}' does not support operation '{how}'"
Expand Down
2 changes: 1 addition & 1 deletion pandas/core/arrays/categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -2736,7 +2736,7 @@ def _groupby_op(
op = WrappedCythonOp(how=how, kind=kind, has_dropped_na=has_dropped_na)

dtype = self.dtype
if how in ["sum", "prod", "cumsum", "cumprod", "skew"]:
if how in ["sum", "prod", "cumsum", "cumprod", "skew", "kurt"]:
raise TypeError(f"{dtype} type does not support {how} operations")
if how in ["min", "max", "rank", "idxmin", "idxmax"] and not dtype.ordered:
# raise TypeError instead of NotImplementedError to ensure we
Expand Down
6 changes: 3 additions & 3 deletions pandas/core/arrays/datetimelike.py
Original file line number Diff line number Diff line change
Expand Up @@ -1656,7 +1656,7 @@ def _groupby_op(
dtype = self.dtype
if dtype.kind == "M":
# Adding/multiplying datetimes is not valid
if how in ["sum", "prod", "cumsum", "cumprod", "var", "skew"]:
if how in ["sum", "prod", "cumsum", "cumprod", "var", "skew", "kurt"]:
raise TypeError(f"datetime64 type does not support operation '{how}'")
if how in ["any", "all"]:
# GH#34479
Expand All @@ -1667,7 +1667,7 @@ def _groupby_op(

elif isinstance(dtype, PeriodDtype):
# Adding/multiplying Periods is not valid
if how in ["sum", "prod", "cumsum", "cumprod", "var", "skew"]:
if how in ["sum", "prod", "cumsum", "cumprod", "var", "skew", "kurt"]:
raise TypeError(f"Period type does not support {how} operations")
if how in ["any", "all"]:
# GH#34479
Expand All @@ -1677,7 +1677,7 @@ def _groupby_op(
)
else:
# timedeltas we can add but not multiply
if how in ["prod", "cumprod", "skew", "var"]:
if how in ["prod", "cumprod", "skew", "kurt", "var"]:
raise TypeError(f"timedelta64 type does not support {how} operations")

# All of the functions implemented here are ordinal, so we can
Expand Down
1 change: 1 addition & 0 deletions pandas/core/groupby/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@ class OutputKey:
"sem",
"size",
"skew",
"kurt",
"std",
"sum",
"var",
Expand Down
182 changes: 180 additions & 2 deletions pandas/core/groupby/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -1272,13 +1272,86 @@ def skew(
Name: Max Speed, dtype: float64
"""

return self._cython_agg_general(
"skew", alt=None, skipna=skipna, numeric_only=numeric_only, **kwargs
)

def kurt(
    self,
    skipna: bool = True,
    numeric_only: bool = False,
    **kwargs,
) -> Series:
    """
    Return unbiased kurtosis within groups.

    Parameters
    ----------
    skipna : bool, default True
        Exclude NA/null values when computing the result.

    numeric_only : bool, default False
        Include only float, int, boolean columns. Not implemented for Series.

    **kwargs
        Additional keyword arguments to be passed to the function.

    Returns
    -------
    Series
        Unbiased kurtosis within groups.

    See Also
    --------
    Series.kurt : Return unbiased kurtosis over requested axis.

    Examples
    --------
    >>> ser = pd.Series(
    ...     [390.0, 350.0, 357.0, 333.0, np.nan, 22.0, 20.0, 30.0, 40.0, 41.0],
    ...     index=[
    ...         "Falcon",
    ...         "Falcon",
    ...         "Falcon",
    ...         "Falcon",
    ...         "Falcon",
    ...         "Parrot",
    ...         "Parrot",
    ...         "Parrot",
    ...         "Parrot",
    ...         "Parrot",
    ...     ],
    ...     name="Max Speed",
    ... )
    >>> ser
    Falcon    390.0
    Falcon    350.0
    Falcon    357.0
    Falcon    333.0
    Falcon      NaN
    Parrot     22.0
    Parrot     20.0
    Parrot     30.0
    Parrot     40.0
    Parrot     41.0
    Name: Max Speed, dtype: float64
    >>> ser.groupby(level=0).kurt()
    Falcon    1.622109
    Parrot   -2.878714
    Name: Max Speed, dtype: float64
    >>> ser.groupby(level=0).kurt(skipna=False)
    Falcon         NaN
    Parrot   -2.878714
    Name: Max Speed, dtype: float64
    """

    def alt(obj):
        # This should not be reached since the cython path should raise
        # TypeError and not NotImplementedError.
        raise TypeError(f"'kurt' is not supported for dtype={obj.dtype}")

    # Dispatch to the "kurt" aggregation; `alt` is only a defensive
    # fallback that reports the unsupported dtype under the right name.
    return self._cython_agg_general(
        "kurt", alt=alt, skipna=skipna, numeric_only=numeric_only, **kwargs
    )

@property
Expand Down Expand Up @@ -2921,6 +2994,111 @@ def alt(obj):
"skew", alt=alt, skipna=skipna, numeric_only=numeric_only, **kwargs
)

def kurt(
    self,
    skipna: bool = True,
    numeric_only: bool = False,
    **kwargs,
) -> DataFrame:
    """
    Return unbiased kurtosis within groups.

    Parameters
    ----------
    skipna : bool, default True
        Whether NA/null values are left out of the computation.

    numeric_only : bool, default False
        Restrict the computation to float, int and boolean columns.

    **kwargs
        Extra keyword arguments forwarded to the underlying aggregation.

    Returns
    -------
    DataFrame
        Unbiased kurtosis of each column within each group.

    See Also
    --------
    DataFrame.kurt : Return unbiased kurtosis over requested axis.

    Examples
    --------
    >>> arrays = [
    ...     [
    ...         "falcon",
    ...         "parrot",
    ...         "cockatoo",
    ...         "kiwi",
    ...         "eagle",
    ...         "lion",
    ...         "monkey",
    ...         "rabbit",
    ...         "dog",
    ...         "wolf",
    ...     ],
    ...     [
    ...         "bird",
    ...         "bird",
    ...         "bird",
    ...         "bird",
    ...         "bird",
    ...         "mammal",
    ...         "mammal",
    ...         "mammal",
    ...         "mammal",
    ...         "mammal",
    ...     ],
    ... ]
    >>> index = pd.MultiIndex.from_arrays(arrays, names=("name", "class"))
    >>> df = pd.DataFrame(
    ...     {
    ...         "max_speed": [
    ...             389.0,
    ...             24.0,
    ...             70.0,
    ...             np.nan,
    ...             350.0,
    ...             80.5,
    ...             21.5,
    ...             15.0,
    ...             40.0,
    ...             50.0,
    ...         ]
    ...     },
    ...     index=index,
    ... )
    >>> df
                     max_speed
    name     class
    falcon   bird        389.0
    parrot   bird         24.0
    cockatoo bird         70.0
    kiwi     bird          NaN
    eagle    bird        350.0
    lion     mammal       80.5
    monkey   mammal       21.5
    rabbit   mammal       15.0
    dog      mammal       40.0
    wolf     mammal       50.0
    >>> gb = df.groupby(["class"])
    >>> gb.kurt()
            max_speed
    class
    bird    -5.493277
    mammal   0.204125
    >>> gb.kurt(skipna=False)
            max_speed
    class
    bird          NaN
    mammal   0.204125
    """
    # Collect the options first, in the same order they are forwarded,
    # then delegate to the "kurt" aggregation with no fallback.
    agg_options = {"skipna": skipna, "numeric_only": numeric_only, **kwargs}
    return self._cython_agg_general("kurt", alt=None, **agg_options)

@property
@doc(DataFrame.plot.__doc__)
def plot(self) -> GroupByPlot:
Expand Down
Loading

0 comments on commit a81d52f

Please sign in to comment.