From 21f2e70497410f7cff5e33d4cc5948da9d776c92 Mon Sep 17 00:00:00 2001
From: komo-fr <komo.mdrms@gmail.com>
Date: Thu, 2 Jan 2025 17:01:14 +0900
Subject: [PATCH 1/2] change default dtype of str.get_dummies() to bool

---
 pandas/core/arrays/string_arrow.py       |  8 ++---
 pandas/core/strings/accessor.py          | 43 ++++++++++++++++++------
 pandas/core/strings/object_array.py      |  2 +-
 pandas/tests/extension/test_arrow.py     | 10 ++++++
 pandas/tests/strings/test_get_dummies.py | 29 ++++++++++++++--
 5 files changed, 73 insertions(+), 19 deletions(-)

diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py
index 27c1425d11ac6..327cb042e4342 100644
--- a/pandas/core/arrays/string_arrow.py
+++ b/pandas/core/arrays/string_arrow.py
@@ -397,19 +397,19 @@ def _str_find(self, sub: str, start: int = 0, end: int | None = None):
 
     def _str_get_dummies(self, sep: str = "|", dtype: NpDtype | None = None):
         if dtype is None:
-            dtype = np.int64
+            dtype = np.bool_
         dummies_pa, labels = ArrowExtensionArray(self._pa_array)._str_get_dummies(
             sep, dtype
         )
-        if len(labels) == 0:
-            return np.empty(shape=(0, 0), dtype=dtype), labels
-        dummies = np.vstack(dummies_pa.to_numpy())
         _dtype = pandas_dtype(dtype)
         dummies_dtype: NpDtype
         if isinstance(_dtype, np.dtype):
             dummies_dtype = _dtype
         else:
             dummies_dtype = np.bool_
+        if len(labels) == 0:
+            return np.empty(shape=(0, 0), dtype=dummies_dtype), labels
+        dummies = np.vstack(dummies_pa.to_numpy())
         return dummies.astype(dummies_dtype, copy=False), labels
 
     def _convert_int_result(self, result):
diff --git a/pandas/core/strings/accessor.py b/pandas/core/strings/accessor.py
index e5b434edacc59..8c56ffe488f9d 100644
--- a/pandas/core/strings/accessor.py
+++ b/pandas/core/strings/accessor.py
@@ -2489,7 +2489,7 @@ def get_dummies(
         ----------
         sep : str, default "|"
             String to split on.
-        dtype : dtype, default np.int64
+        dtype : dtype, default bool
             Data type for new columns. Only a single dtype is allowed.
 
         Returns
@@ -2505,27 +2505,48 @@ def get_dummies(
         Examples
         --------
         >>> pd.Series(["a|b", "a", "a|c"]).str.get_dummies()
-           a  b  c
-        0  1  1  0
-        1  1  0  0
-        2  1  0  1
+              a      b      c
+        0  True   True  False
+        1  True  False  False
+        2  True  False   True
 
         >>> pd.Series(["a|b", np.nan, "a|c"]).str.get_dummies()
+               a      b      c
+        0   True   True  False
+        1  False  False  False
+        2   True  False   True
+
+        >>> pd.Series(["a|b", np.nan, "a|c"]).str.get_dummies(dtype=np.int64)
            a  b  c
         0  1  1  0
         1  0  0  0
         2  1  0  1
-
-        >>> pd.Series(["a|b", np.nan, "a|c"]).str.get_dummies(dtype=bool)
-                a      b      c
-        0   True   True    False
-        1   False  False   False
-        2   True   False   True
         """
         from pandas.core.frame import DataFrame
 
         # we need to cast to Series of strings as only that has all
         # methods available for making the dummies...
+        input_dtype = self._data.dtype
+        if dtype is None and not isinstance(input_dtype, ArrowDtype):
+            from pandas.core.arrays.string_ import StringDtype
+
+            if isinstance(input_dtype, CategoricalDtype):
+                input_dtype = input_dtype.categories.dtype
+
+            if isinstance(input_dtype, ArrowDtype):
+                import pyarrow as pa
+
+                dtype = ArrowDtype(pa.bool_())
+            elif (
+                isinstance(input_dtype, StringDtype)
+                and input_dtype.na_value is not np.nan
+            ):
+                from pandas.core.dtypes.common import pandas_dtype
+
+                dtype = pandas_dtype("boolean")
+            else:
+                dtype = np.bool_
+
         result, name = self._data.array._str_get_dummies(sep, dtype)
         if is_extension_array_dtype(dtype) or isinstance(dtype, ArrowDtype):
             return self._wrap_result(
diff --git a/pandas/core/strings/object_array.py b/pandas/core/strings/object_array.py
index 0268194e64d50..595bfac5f229c 100644
--- a/pandas/core/strings/object_array.py
+++ b/pandas/core/strings/object_array.py
@@ -416,7 +416,7 @@ def _str_get_dummies(self, sep: str = "|", dtype: NpDtype | None = None):
         from pandas import Series
 
         if dtype is None:
-            dtype = np.int64
+            dtype = np.bool_
         arr = Series(self).fillna("")
         try:
             arr = sep + arr + sep
diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py
index 6dd1f3f15bc15..ab68fa74ae218 100644
--- a/pandas/tests/extension/test_arrow.py
+++ b/pandas/tests/extension/test_arrow.py
@@ -2206,6 +2206,16 @@ def test_get_dummies():
     )
     tm.assert_frame_equal(result, expected)
 
+    ser = pd.Series(
+        ["a", "b"],
+        dtype=pd.CategoricalDtype(pd.Index(["a", "b"], dtype=ArrowDtype(pa.string()))),
+    )
+    result = ser.str.get_dummies()
+    expected = pd.DataFrame(
+        [[True, False], [False, True]], dtype=ArrowDtype(pa.bool_()), columns=["a", "b"]
+    )
+    tm.assert_frame_equal(result, expected)
+
 
 def test_str_partition():
     ser = pd.Series(["abcba", None], dtype=ArrowDtype(pa.string()))
diff --git a/pandas/tests/strings/test_get_dummies.py b/pandas/tests/strings/test_get_dummies.py
index 3b989e284ca25..04805d14616eb 100644
--- a/pandas/tests/strings/test_get_dummies.py
+++ b/pandas/tests/strings/test_get_dummies.py
@@ -6,6 +6,8 @@
 import pandas.util._test_decorators as td
 
 from pandas import (
+    NA,
+    CategoricalDtype,
     DataFrame,
     Index,
     MultiIndex,
@@ -22,19 +24,28 @@
 def test_get_dummies(any_string_dtype):
     s = Series(["a|b", "a|c", np.nan], dtype=any_string_dtype)
     result = s.str.get_dummies("|")
-    expected = DataFrame([[1, 1, 0], [1, 0, 1], [0, 0, 0]], columns=list("abc"))
+    exp_dtype = (
+        "boolean"
+        if any_string_dtype == "string" and any_string_dtype.na_value is NA
+        else "bool"
+    )
+    expected = DataFrame(
+        [[1, 1, 0], [1, 0, 1], [0, 0, 0]], columns=list("abc"), dtype=exp_dtype
+    )
     tm.assert_frame_equal(result, expected)
 
     s = Series(["a;b", "a", 7], dtype=any_string_dtype)
     result = s.str.get_dummies(";")
-    expected = DataFrame([[0, 1, 1], [0, 1, 0], [1, 0, 0]], columns=list("7ab"))
+    expected = DataFrame(
+        [[0, 1, 1], [0, 1, 0], [1, 0, 0]], columns=list("7ab"), dtype=exp_dtype
+    )
     tm.assert_frame_equal(result, expected)
 
 
 def test_get_dummies_index():
     # GH9980, GH8028
     idx = Index(["a|b", "a|c", "b|c"])
-    result = idx.str.get_dummies("|")
+    result = idx.str.get_dummies("|", dtype=np.int64)
 
     expected = MultiIndex.from_tuples(
         [(1, 1, 0), (1, 0, 1), (0, 1, 1)], names=("a", "b", "c")
@@ -125,3 +136,15 @@ def test_get_dummies_with_pa_str_dtype(any_string_dtype):
         dtype="str[pyarrow]",
     )
     tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize("dtype_type", ["string", "category"])
+def test_get_dummies_ea_dtype(dtype_type, string_dtype_no_object):
+    dtype = string_dtype_no_object
+    exp_dtype = "boolean" if dtype.na_value is NA else "bool"
+    if dtype_type == "category":
+        dtype = CategoricalDtype(Index(["a", "b"], dtype))
+    s = Series(["a", "b"], dtype=dtype)
+    result = s.str.get_dummies()
+    expected = DataFrame([[1, 0], [0, 1]], columns=list("ab"), dtype=exp_dtype)
+    tm.assert_frame_equal(result, expected)

From 1de77e52a188ba84c136860ae7b45d961f343c69 Mon Sep 17 00:00:00 2001
From: komo-fr <komo.mdrms@gmail.com>
Date: Thu, 2 Jan 2025 18:23:30 +0900
Subject: [PATCH 2/2] ignore mypy assignment type check in str.get_dummies

---
 pandas/core/strings/accessor.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/pandas/core/strings/accessor.py b/pandas/core/strings/accessor.py
index 8c56ffe488f9d..97055c583b8f2 100644
--- a/pandas/core/strings/accessor.py
+++ b/pandas/core/strings/accessor.py
@@ -2536,14 +2536,14 @@ def get_dummies(
             if isinstance(input_dtype, ArrowDtype):
                 import pyarrow as pa
 
-                dtype = ArrowDtype(pa.bool_())
+                dtype = ArrowDtype(pa.bool_())  # type: ignore[assignment]
             elif (
                 isinstance(input_dtype, StringDtype)
                 and input_dtype.na_value is not np.nan
             ):
                 from pandas.core.dtypes.common import pandas_dtype
 
-                dtype = pandas_dtype("boolean")
+                dtype = pandas_dtype("boolean")  # type: ignore[assignment]
             else:
                 dtype = np.bool_