Skip to content

Commit

Permalink
Merge branch 'main' into issue-37210-to-sql-truncate
Browse files Browse the repository at this point in the history
  • Loading branch information
gmcrocetti authored Jan 10, 2025
2 parents e8930d7 + a81d52f commit ca10c55
Show file tree
Hide file tree
Showing 21 changed files with 464 additions and 34 deletions.
2 changes: 2 additions & 0 deletions doc/source/whatsnew/v3.0.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,7 @@ Other enhancements
- :meth:`Series.plot` now correctly handles the ``ylabel`` parameter for pie charts, allowing for explicit control over the y-axis label (:issue:`58239`)
- :meth:`DataFrame.plot.scatter` argument ``c`` now accepts a column of strings, where rows with the same string are colored identically (:issue:`16827` and :issue:`16485`)
- :func:`read_parquet` accepts ``to_pandas_kwargs`` which are forwarded to :meth:`pyarrow.Table.to_pandas` which enables passing additional keywords to customize the conversion to pandas, such as ``maps_as_pydicts`` to read the Parquet map data type as python dictionaries (:issue:`56842`)
- :meth:`.DataFrameGroupBy.transform`, :meth:`.SeriesGroupBy.transform`, :meth:`.DataFrameGroupBy.agg`, :meth:`.SeriesGroupBy.agg`, :meth:`.SeriesGroupBy.apply`, :meth:`.DataFrameGroupBy.apply` now support ``kurt`` (:issue:`40139`)
- :meth:`DataFrameGroupBy.transform`, :meth:`SeriesGroupBy.transform`, :meth:`DataFrameGroupBy.agg`, :meth:`SeriesGroupBy.agg`, :meth:`RollingGroupby.apply`, :meth:`ExpandingGroupby.apply`, :meth:`Rolling.apply`, :meth:`Expanding.apply`, :meth:`DataFrame.apply` with ``engine="numba"`` now supports positional arguments passed as kwargs (:issue:`58995`)
- :meth:`Rolling.agg`, :meth:`Expanding.agg` and :meth:`ExponentialMovingWindow.agg` now accept :class:`NamedAgg` aggregations through ``**kwargs`` (:issue:`28333`)
- :meth:`Series.map` can now accept kwargs to pass on to func (:issue:`59814`)
Expand Down Expand Up @@ -799,6 +800,7 @@ Other
- Bug in :meth:`Series.dt` methods in :class:`ArrowDtype` that were returning incorrect values. (:issue:`57355`)
- Bug in :meth:`Series.rank` that did not preserve missing values for nullable integers when ``na_option='keep'``. (:issue:`56976`)
- Bug in :meth:`Series.replace` and :meth:`DataFrame.replace` inconsistently replacing matching instances when ``regex=True`` and missing values are present. (:issue:`56599`)
- Bug in :meth:`Series.replace` and :meth:`DataFrame.replace` throwing ``ValueError`` when ``regex=True`` and all values are NA. (:issue:`60688`)
- Bug in :meth:`Series.to_string` when series contains complex floats with exponents (:issue:`60405`)
- Bug in :meth:`read_csv` where chained fsspec TAR file and ``compression="infer"`` fails with ``tarfile.ReadError`` (:issue:`60028`)
- Bug in Dataframe Interchange Protocol implementation was returning incorrect results for data buffers' associated dtype, for string and datetime columns (:issue:`54781`)
Expand Down
9 changes: 9 additions & 0 deletions pandas/_libs/groupby.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,15 @@ def group_skew(
result_mask: np.ndarray | None = ...,
skipna: bool = ...,
) -> None: ...
# Stub for the Cython group_kurt kernel; mirrors the group_skew entry above.
def group_kurt(
    out: np.ndarray,  # float64_t[:, ::1]
    counts: np.ndarray,  # int64_t[::1]
    values: np.ndarray,  # ndarray[float64_t, ndim=2]
    labels: np.ndarray,  # const intp_t[::1]
    mask: np.ndarray | None = ...,
    result_mask: np.ndarray | None = ...,
    skipna: bool = ...,
) -> None: ...
def group_mean(
out: np.ndarray, # floating[:, ::1]
counts: np.ndarray, # int64_t[::1]
Expand Down
98 changes: 96 additions & 2 deletions pandas/_libs/groupby.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -910,7 +910,7 @@ def group_var(
@cython.wraparound(False)
@cython.boundscheck(False)
@cython.cdivision(True)
@cython.cpow
@cython.cpow(True)
def group_skew(
float64_t[:, ::1] out,
int64_t[::1] counts,
Expand Down Expand Up @@ -961,7 +961,7 @@ def group_skew(
isna_entry = _treat_as_na(val, False)

if not isna_entry:
# Based on RunningStats::Push from
# Running stats update based on RunningStats::Push from
# https://www.johndcook.com/blog/skewness_kurtosis/
n1 = nobs[lab, j]
n = n1 + 1
Expand Down Expand Up @@ -995,6 +995,100 @@ def group_skew(
)


@cython.wraparound(False)
@cython.boundscheck(False)
@cython.cdivision(True)
@cython.cpow(True)
def group_kurt(
    float64_t[:, ::1] out,
    int64_t[::1] counts,
    ndarray[float64_t, ndim=2] values,
    const intp_t[::1] labels,
    const uint8_t[:, ::1] mask=None,
    uint8_t[:, ::1] result_mask=None,
    bint skipna=True,
) -> None:
    """
    Compute sample excess kurtosis per group into ``out`` (one row per group,
    one column per value column), using a single-pass running-moments update.

    Parameters
    ----------
    out : float64 array, shape (ngroups, K)
        Result buffer; written in place.
    counts : int64 array, shape (ngroups,)
        Incremented by one for every row assigned to each group (including
        rows whose values are NA).
    values : 2D float64 ndarray
        Input values, one row per observation.
    labels : intp array, shape (len(values),)
        Group label per row; negative labels are skipped.
    mask : uint8 2D array, optional
        If given, nonzero entries mark NA values instead of ``_treat_as_na``.
    result_mask : uint8 2D array, optional
        If given, set to 1 where the result is NA (fewer than 4 observations).
    skipna : bool, default True
        If False, any NA in a group poisons that group's moments with NaN.
    """
    cdef:
        Py_ssize_t i, j, N, K, lab, ngroups = len(counts)
        int64_t[:, ::1] nobs
        Py_ssize_t len_values = len(values), len_labels = len(labels)
        bint isna_entry, uses_mask = mask is not None
        float64_t[:, ::1] M1, M2, M3, M4
        float64_t delta, delta_n, delta_n2, term1, val
        int64_t n1, n
        float64_t ct, num, den, adj

    if len_values != len_labels:
        raise ValueError("len(index) != len(labels)")

    # Per-(group, column) count of non-NA observations seen so far.
    nobs = np.zeros((<object>out).shape, dtype=np.int64)

    # M1, M2, M3 and M4 correspond to 1st, 2nd, 3rd and 4th Moments
    M1 = np.zeros((<object>out).shape, dtype=np.float64)
    M2 = np.zeros((<object>out).shape, dtype=np.float64)
    M3 = np.zeros((<object>out).shape, dtype=np.float64)
    M4 = np.zeros((<object>out).shape, dtype=np.float64)

    N, K = (<object>values).shape

    out[:, :] = 0.0

    with nogil:
        for i in range(N):
            lab = labels[i]
            if lab < 0:
                # Row not assigned to any group (e.g. dropped NA key).
                continue

            counts[lab] += 1

            for j in range(K):
                val = values[i, j]

                if uses_mask:
                    isna_entry = mask[i, j]
                else:
                    isna_entry = _treat_as_na(val, False)

                if not isna_entry:
                    # Running stats update based on RunningStats::Push from
                    # https://www.johndcook.com/blog/skewness_kurtosis/
                    n1 = nobs[lab, j]
                    n = n1 + 1

                    nobs[lab, j] = n
                    delta = val - M1[lab, j]
                    delta_n = delta / n
                    delta_n2 = delta_n * delta_n
                    term1 = delta * delta_n * n1

                    # Update order matters: M4 and M3 must read the *old*
                    # M2/M3 values, so M2 is updated last.
                    M1[lab, j] += delta_n
                    M4[lab, j] += (term1 * delta_n2 * (n*n - 3*n + 3)
                                   + 6 * delta_n2 * M2[lab, j]
                                   - 4 * delta_n * M3[lab, j])
                    M3[lab, j] += term1 * delta_n * (n - 2) - 3 * delta_n * M2[lab, j]
                    M2[lab, j] += term1
                elif not skipna:
                    # Poison the group's moments so the final result is NaN.
                    M1[lab, j] = NaN
                    M2[lab, j] = NaN
                    M3[lab, j] = NaN
                    M4[lab, j] = NaN

        # Finalize: convert accumulated moments into adjusted (excess)
        # kurtosis per group/column.
        for i in range(ngroups):
            for j in range(K):
                ct = <float64_t>nobs[i, j]
                if ct < 4:
                    # Kurtosis needs at least 4 observations.
                    if result_mask is not None:
                        result_mask[i, j] = 1
                    out[i, j] = NaN
                elif M2[i, j] == 0:
                    # Zero variance: define kurtosis as 0 rather than 0/0.
                    out[i, j] = 0
                else:
                    num = ct * (ct + 1) * (ct - 1) * M4[i, j]
                    den = (ct - 2) * (ct - 3) * M2[i, j] ** 2
                    adj = 3.0 * (ct - 1) ** 2 / ((ct - 2) * (ct - 3))
                    out[i, j] = num / den - adj


@cython.wraparound(False)
@cython.boundscheck(False)
def group_mean(
Expand Down
3 changes: 2 additions & 1 deletion pandas/core/array_algos/replace.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,7 +89,8 @@ def _check_comparison_types(
op = np.vectorize(
lambda x: bool(re.search(b, x))
if isinstance(x, str) and isinstance(b, (str, Pattern))
else False
else False,
otypes=[bool],
)

# GH#32621 use mask to avoid comparing to NAs
Expand Down
1 change: 1 addition & 0 deletions pandas/core/arrays/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -2618,6 +2618,7 @@ def _groupby_op(
"sem",
"var",
"skew",
"kurt",
]:
raise TypeError(
f"dtype '{self.dtype}' does not support operation '{how}'"
Expand Down
2 changes: 1 addition & 1 deletion pandas/core/arrays/categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -2736,7 +2736,7 @@ def _groupby_op(
op = WrappedCythonOp(how=how, kind=kind, has_dropped_na=has_dropped_na)

dtype = self.dtype
if how in ["sum", "prod", "cumsum", "cumprod", "skew"]:
if how in ["sum", "prod", "cumsum", "cumprod", "skew", "kurt"]:
raise TypeError(f"{dtype} type does not support {how} operations")
if how in ["min", "max", "rank", "idxmin", "idxmax"] and not dtype.ordered:
# raise TypeError instead of NotImplementedError to ensure we
Expand Down
6 changes: 3 additions & 3 deletions pandas/core/arrays/datetimelike.py
Original file line number Diff line number Diff line change
Expand Up @@ -1656,7 +1656,7 @@ def _groupby_op(
dtype = self.dtype
if dtype.kind == "M":
# Adding/multiplying datetimes is not valid
if how in ["sum", "prod", "cumsum", "cumprod", "var", "skew"]:
if how in ["sum", "prod", "cumsum", "cumprod", "var", "skew", "kurt"]:
raise TypeError(f"datetime64 type does not support operation '{how}'")
if how in ["any", "all"]:
# GH#34479
Expand All @@ -1667,7 +1667,7 @@ def _groupby_op(

elif isinstance(dtype, PeriodDtype):
# Adding/multiplying Periods is not valid
if how in ["sum", "prod", "cumsum", "cumprod", "var", "skew"]:
if how in ["sum", "prod", "cumsum", "cumprod", "var", "skew", "kurt"]:
raise TypeError(f"Period type does not support {how} operations")
if how in ["any", "all"]:
# GH#34479
Expand All @@ -1677,7 +1677,7 @@ def _groupby_op(
)
else:
# timedeltas we can add but not multiply
if how in ["prod", "cumprod", "skew", "var"]:
if how in ["prod", "cumprod", "skew", "kurt", "var"]:
raise TypeError(f"timedelta64 type does not support {how} operations")

# All of the functions implemented here are ordinal, so we can
Expand Down
1 change: 1 addition & 0 deletions pandas/core/groupby/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@ class OutputKey:
"sem",
"size",
"skew",
"kurt",
"std",
"sum",
"var",
Expand Down
Loading

0 comments on commit ca10c55

Please sign in to comment.