Skip to content

Commit

Permalink
Merge branch 'main' into issue-37210-to-sql-truncate
Browse files Browse the repository at this point in the history
  • Loading branch information
gmcrocetti authored Jan 10, 2025
2 parents e8930d7 + a81d52f commit ca10c55
Show file tree
Hide file tree
Showing 21 changed files with 464 additions and 34 deletions.
2 changes: 2 additions & 0 deletions doc/source/whatsnew/v3.0.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,7 @@ Other enhancements
- :meth:`Series.plot` now correctly handles the ``ylabel`` parameter for pie charts, allowing for explicit control over the y-axis label (:issue:`58239`)
- :meth:`DataFrame.plot.scatter` argument ``c`` now accepts a column of strings, where rows with the same string are colored identically (:issue:`16827` and :issue:`16485`)
- :func:`read_parquet` accepts ``to_pandas_kwargs`` which are forwarded to :meth:`pyarrow.Table.to_pandas` which enables passing additional keywords to customize the conversion to pandas, such as ``maps_as_pydicts`` to read the Parquet map data type as python dictionaries (:issue:`56842`)
- :meth:`.DataFrameGroupBy.transform`, :meth:`.SeriesGroupBy.transform`, :meth:`.DataFrameGroupBy.agg`, :meth:`.SeriesGroupBy.agg`, :meth:`.SeriesGroupBy.apply`, :meth:`.DataFrameGroupBy.apply` now support ``kurt`` (:issue:`40139`)
- :meth:`DataFrameGroupBy.transform`, :meth:`SeriesGroupBy.transform`, :meth:`DataFrameGroupBy.agg`, :meth:`SeriesGroupBy.agg`, :meth:`RollingGroupby.apply`, :meth:`ExpandingGroupby.apply`, :meth:`Rolling.apply`, :meth:`Expanding.apply`, :meth:`DataFrame.apply` with ``engine="numba"`` now supports positional arguments passed as kwargs (:issue:`58995`)
- :meth:`Rolling.agg`, :meth:`Expanding.agg` and :meth:`ExponentialMovingWindow.agg` now accept :class:`NamedAgg` aggregations through ``**kwargs`` (:issue:`28333`)
- :meth:`Series.map` can now accept kwargs to pass on to func (:issue:`59814`)
Expand Down Expand Up @@ -799,6 +800,7 @@ Other
- Bug in :meth:`Series.dt` methods in :class:`ArrowDtype` that were returning incorrect values. (:issue:`57355`)
- Bug in :meth:`Series.rank` that did not preserve missing values for nullable integers when ``na_option='keep'``. (:issue:`56976`)
- Bug in :meth:`Series.replace` and :meth:`DataFrame.replace` inconsistently replacing matching instances when ``regex=True`` and missing values are present. (:issue:`56599`)
- Bug in :meth:`Series.replace` and :meth:`DataFrame.replace` throwing ``ValueError`` when ``regex=True`` and all values are NA. (:issue:`60688`)
- Bug in :meth:`Series.to_string` when series contains complex floats with exponents (:issue:`60405`)
- Bug in :meth:`read_csv` where chained fsspec TAR file and ``compression="infer"`` fails with ``tarfile.ReadError`` (:issue:`60028`)
- Bug in Dataframe Interchange Protocol implementation was returning incorrect results for data buffers' associated dtype, for string and datetime columns (:issue:`54781`)
Expand Down
9 changes: 9 additions & 0 deletions pandas/_libs/groupby.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,15 @@ def group_skew(
result_mask: np.ndarray | None = ...,
skipna: bool = ...,
) -> None: ...
# Stub for the Cython group_kurt kernel; mirrors the group_skew entry above.
def group_kurt(
    out: np.ndarray,  # float64_t[:, ::1]
    counts: np.ndarray,  # int64_t[::1]
    values: np.ndarray,  # ndarray[float64_t, ndim=2]
    labels: np.ndarray,  # const intp_t[::1]
    mask: np.ndarray | None = ...,
    result_mask: np.ndarray | None = ...,
    skipna: bool = ...,
) -> None: ...
def group_mean(
out: np.ndarray, # floating[:, ::1]
counts: np.ndarray, # int64_t[::1]
Expand Down
98 changes: 96 additions & 2 deletions pandas/_libs/groupby.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -910,7 +910,7 @@ def group_var(
@cython.wraparound(False)
@cython.boundscheck(False)
@cython.cdivision(True)
@cython.cpow
@cython.cpow(True)
def group_skew(
float64_t[:, ::1] out,
int64_t[::1] counts,
Expand Down Expand Up @@ -961,7 +961,7 @@ def group_skew(
isna_entry = _treat_as_na(val, False)

if not isna_entry:
# Based on RunningStats::Push from
# Running stats update based on RunningStats::Push from
# https://www.johndcook.com/blog/skewness_kurtosis/
n1 = nobs[lab, j]
n = n1 + 1
Expand Down Expand Up @@ -995,6 +995,100 @@ def group_skew(
)


@cython.wraparound(False)
@cython.boundscheck(False)
@cython.cdivision(True)
@cython.cpow(True)
def group_kurt(
    float64_t[:, ::1] out,
    int64_t[::1] counts,
    ndarray[float64_t, ndim=2] values,
    const intp_t[::1] labels,
    const uint8_t[:, ::1] mask=None,
    uint8_t[:, ::1] result_mask=None,
    bint skipna=True,
) -> None:
    """
    Compute sample excess kurtosis per group into ``out`` (one row per group,
    one column per value column), using a single-pass running-moments update.

    Parameters
    ----------
    out : float64 array, shape (ngroups, K)
        Result buffer; written in place.
    counts : int64 array, shape (ngroups,)
        Incremented by one for every row assigned to each group (including
        rows whose values are NA).
    values : 2D float64 ndarray
        Input values, one row per observation.
    labels : intp array, shape (len(values),)
        Group label per row; negative labels are skipped.
    mask : uint8 2D array, optional
        If given, nonzero entries mark NA values instead of ``_treat_as_na``.
    result_mask : uint8 2D array, optional
        If given, set to 1 where the result is NA (fewer than 4 observations).
    skipna : bool, default True
        If False, any NA in a group poisons that group's moments with NaN.
    """
    cdef:
        Py_ssize_t i, j, N, K, lab, ngroups = len(counts)
        int64_t[:, ::1] nobs
        Py_ssize_t len_values = len(values), len_labels = len(labels)
        bint isna_entry, uses_mask = mask is not None
        float64_t[:, ::1] M1, M2, M3, M4
        float64_t delta, delta_n, delta_n2, term1, val
        int64_t n1, n
        float64_t ct, num, den, adj

    if len_values != len_labels:
        raise ValueError("len(index) != len(labels)")

    # Per-(group, column) count of non-NA observations seen so far.
    nobs = np.zeros((<object>out).shape, dtype=np.int64)

    # M1, M2, M3 and M4 correspond to 1st, 2nd, 3rd and 4th Moments
    M1 = np.zeros((<object>out).shape, dtype=np.float64)
    M2 = np.zeros((<object>out).shape, dtype=np.float64)
    M3 = np.zeros((<object>out).shape, dtype=np.float64)
    M4 = np.zeros((<object>out).shape, dtype=np.float64)

    N, K = (<object>values).shape

    out[:, :] = 0.0

    with nogil:
        for i in range(N):
            lab = labels[i]
            if lab < 0:
                # Row not assigned to any group (e.g. dropped NA key).
                continue

            counts[lab] += 1

            for j in range(K):
                val = values[i, j]

                if uses_mask:
                    isna_entry = mask[i, j]
                else:
                    isna_entry = _treat_as_na(val, False)

                if not isna_entry:
                    # Running stats update based on RunningStats::Push from
                    # https://www.johndcook.com/blog/skewness_kurtosis/
                    n1 = nobs[lab, j]
                    n = n1 + 1

                    nobs[lab, j] = n
                    delta = val - M1[lab, j]
                    delta_n = delta / n
                    delta_n2 = delta_n * delta_n
                    term1 = delta * delta_n * n1

                    # Update order matters: M4 and M3 must read the *old*
                    # M2/M3 values, so M2 is updated last.
                    M1[lab, j] += delta_n
                    M4[lab, j] += (term1 * delta_n2 * (n*n - 3*n + 3)
                                   + 6 * delta_n2 * M2[lab, j]
                                   - 4 * delta_n * M3[lab, j])
                    M3[lab, j] += term1 * delta_n * (n - 2) - 3 * delta_n * M2[lab, j]
                    M2[lab, j] += term1
                elif not skipna:
                    # Poison the group's moments so the final result is NaN.
                    M1[lab, j] = NaN
                    M2[lab, j] = NaN
                    M3[lab, j] = NaN
                    M4[lab, j] = NaN

        # Finalize: convert accumulated moments into adjusted (excess)
        # kurtosis per group/column.
        for i in range(ngroups):
            for j in range(K):
                ct = <float64_t>nobs[i, j]
                if ct < 4:
                    # Kurtosis needs at least 4 observations.
                    if result_mask is not None:
                        result_mask[i, j] = 1
                    out[i, j] = NaN
                elif M2[i, j] == 0:
                    # Zero variance: define kurtosis as 0 rather than 0/0.
                    out[i, j] = 0
                else:
                    num = ct * (ct + 1) * (ct - 1) * M4[i, j]
                    den = (ct - 2) * (ct - 3) * M2[i, j] ** 2
                    adj = 3.0 * (ct - 1) ** 2 / ((ct - 2) * (ct - 3))
                    out[i, j] = num / den - adj


@cython.wraparound(False)
@cython.boundscheck(False)
def group_mean(
Expand Down
3 changes: 2 additions & 1 deletion pandas/core/array_algos/replace.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,7 +89,8 @@ def _check_comparison_types(
op = np.vectorize(
lambda x: bool(re.search(b, x))
if isinstance(x, str) and isinstance(b, (str, Pattern))
else False
else False,
otypes=[bool],
)

# GH#32621 use mask to avoid comparing to NAs
Expand Down
1 change: 1 addition & 0 deletions pandas/core/arrays/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -2618,6 +2618,7 @@ def _groupby_op(
"sem",
"var",
"skew",
"kurt",
]:
raise TypeError(
f"dtype '{self.dtype}' does not support operation '{how}'"
Expand Down
2 changes: 1 addition & 1 deletion pandas/core/arrays/categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -2736,7 +2736,7 @@ def _groupby_op(
op = WrappedCythonOp(how=how, kind=kind, has_dropped_na=has_dropped_na)

dtype = self.dtype
if how in ["sum", "prod", "cumsum", "cumprod", "skew"]:
if how in ["sum", "prod", "cumsum", "cumprod", "skew", "kurt"]:
raise TypeError(f"{dtype} type does not support {how} operations")
if how in ["min", "max", "rank", "idxmin", "idxmax"] and not dtype.ordered:
# raise TypeError instead of NotImplementedError to ensure we
Expand Down
6 changes: 3 additions & 3 deletions pandas/core/arrays/datetimelike.py
Original file line number Diff line number Diff line change
Expand Up @@ -1656,7 +1656,7 @@ def _groupby_op(
dtype = self.dtype
if dtype.kind == "M":
# Adding/multiplying datetimes is not valid
if how in ["sum", "prod", "cumsum", "cumprod", "var", "skew"]:
if how in ["sum", "prod", "cumsum", "cumprod", "var", "skew", "kurt"]:
raise TypeError(f"datetime64 type does not support operation '{how}'")
if how in ["any", "all"]:
# GH#34479
Expand All @@ -1667,7 +1667,7 @@ def _groupby_op(

elif isinstance(dtype, PeriodDtype):
# Adding/multiplying Periods is not valid
if how in ["sum", "prod", "cumsum", "cumprod", "var", "skew"]:
if how in ["sum", "prod", "cumsum", "cumprod", "var", "skew", "kurt"]:
raise TypeError(f"Period type does not support {how} operations")
if how in ["any", "all"]:
# GH#34479
Expand All @@ -1677,7 +1677,7 @@ def _groupby_op(
)
else:
# timedeltas we can add but not multiply
if how in ["prod", "cumprod", "skew", "var"]:
if how in ["prod", "cumprod", "skew", "kurt", "var"]:
raise TypeError(f"timedelta64 type does not support {how} operations")

# All of the functions implemented here are ordinal, so we can
Expand Down
1 change: 1 addition & 0 deletions pandas/core/groupby/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@ class OutputKey:
"sem",
"size",
"skew",
"kurt",
"std",
"sum",
"var",
Expand Down
Loading

0 comments on commit ca10c55

Please sign in to comment.