Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ENH: Change default dtype of str.get_dummies() to bool #60641

Open
wants to merge 2 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 4 additions & 4 deletions pandas/core/arrays/string_arrow.py
Original file line number Diff line number Diff line change
Expand Up @@ -397,19 +397,19 @@ def _str_find(self, sub: str, start: int = 0, end: int | None = None):

def _str_get_dummies(self, sep: str = "|", dtype: NpDtype | None = None):
if dtype is None:
dtype = np.int64
dtype = np.bool_
dummies_pa, labels = ArrowExtensionArray(self._pa_array)._str_get_dummies(
sep, dtype
)
if len(labels) == 0:
return np.empty(shape=(0, 0), dtype=dtype), labels
dummies = np.vstack(dummies_pa.to_numpy())
_dtype = pandas_dtype(dtype)
dummies_dtype: NpDtype
if isinstance(_dtype, np.dtype):
dummies_dtype = _dtype
else:
dummies_dtype = np.bool_
if len(labels) == 0:
return np.empty(shape=(0, 0), dtype=dummies_dtype), labels
dummies = np.vstack(dummies_pa.to_numpy())
Comment on lines -404 to +412
Copy link
Author

@komo-fr komo-fr Jan 2, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In the existing implementation, the following code raises `TypeError: Cannot interpret 'BooleanDtype' as a data type`, because the extension dtype is passed unchanged to NumPy in the line `return np.empty(shape=(0, 0), dtype=dtype)`:

# Empty Series
sr = pd.Series(dtype="string[pyarrow]")
sr.str.get_dummies(dtype=pd.BooleanDtype())

With this PR, the default dtype is changed to a boolean type, which makes similar issues more likely to occur. To address this, I modified the code to pass dummies_dtype to np.empty() instead of using dtype directly.

Related test: https://github.com/pandas-dev/pandas/blob/main/pandas/tests/strings/test_strings.py#L136

return dummies.astype(dummies_dtype, copy=False), labels

def _convert_int_result(self, result):
Expand Down
43 changes: 32 additions & 11 deletions pandas/core/strings/accessor.py
Original file line number Diff line number Diff line change
Expand Up @@ -2489,7 +2489,7 @@ def get_dummies(
----------
sep : str, default "|"
String to split on.
dtype : dtype, default np.int64
dtype : dtype, default bool
Data type for new columns. Only a single dtype is allowed.

Returns
Expand All @@ -2505,27 +2505,48 @@ def get_dummies(
Examples
--------
>>> pd.Series(["a|b", "a", "a|c"]).str.get_dummies()
a b c
0 1 1 0
1 1 0 0
2 1 0 1
a b c
0 True True False
1 True False False
2 True False True

>>> pd.Series(["a|b", np.nan, "a|c"]).str.get_dummies()
a b c
0 True True False
1 False False False
2 True False True

>>> pd.Series(["a|b", np.nan, "a|c"]).str.get_dummies(dtype=np.int64)
a b c
0 1 1 0
1 0 0 0
2 1 0 1

>>> pd.Series(["a|b", np.nan, "a|c"]).str.get_dummies(dtype=bool)
a b c
0 True True False
1 False False False
2 True False True
"""
from pandas.core.frame import DataFrame

# we need to cast to Series of strings as only that has all
# methods available for making the dummies...
input_dtype = self._data.dtype
if dtype is None and not isinstance(input_dtype, ArrowDtype):
from pandas.core.arrays.string_ import StringDtype

if isinstance(input_dtype, CategoricalDtype):
input_dtype = input_dtype.categories.dtype

if isinstance(input_dtype, ArrowDtype):
import pyarrow as pa

dtype = ArrowDtype(pa.bool_())
elif (
isinstance(input_dtype, StringDtype)
and input_dtype.na_value is not np.nan
):
from pandas.core.dtypes.common import pandas_dtype

dtype = pandas_dtype("boolean")
else:
dtype = np.bool_
Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I based this logic on the existing implementation of pd.get_dummies():
https://github.com/pandas-dev/pandas/blob/v2.2.3/pandas/core/reshape/encoding.py#L252-L269

I added the condition `if dtype is None and not isinstance(input_dtype, ArrowDtype):` to avoid errors when `input_dtype` is an `ArrowDtype`.
Without this exclusion, the following code raises an error:

sr = pd.Series(["A", "B", "A"], dtype=pd.ArrowDtype(pa.string()))
sr.str.get_dummies(dtype=pd.ArrowDtype(pa.bool_()))

Output (this issue also exists in the implementation before this PR):

...
  File "/Users/komo_fr/P_Project/pandas_workspace/pandas-komo_fr/pandas/core/strings/accessor.py", line 2532, in get_dummies
    DataFrame(result, columns=name, dtype=dtype),
    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
...
pyarrow.lib.ArrowNotImplementedError: Unsupported cast from list<item: bool> to bool using function cast_boolean

With this PR, the default dtype is changed to a boolean type, which makes similar issues more likely to occur.
Since I wasn’t sure how to fully resolve this problem and it could lead to a much larger PR, I chose to exclude ArrowDtype cases for now.


result, name = self._data.array._str_get_dummies(sep, dtype)
if is_extension_array_dtype(dtype) or isinstance(dtype, ArrowDtype):
return self._wrap_result(
Expand Down
2 changes: 1 addition & 1 deletion pandas/core/strings/object_array.py
Original file line number Diff line number Diff line change
Expand Up @@ -416,7 +416,7 @@ def _str_get_dummies(self, sep: str = "|", dtype: NpDtype | None = None):
from pandas import Series

if dtype is None:
dtype = np.int64
dtype = np.bool_
arr = Series(self).fillna("")
try:
arr = sep + arr + sep
Expand Down
10 changes: 10 additions & 0 deletions pandas/tests/extension/test_arrow.py
Original file line number Diff line number Diff line change
Expand Up @@ -2206,6 +2206,16 @@ def test_get_dummies():
)
tm.assert_frame_equal(result, expected)

ser = pd.Series(
["a", "b"],
dtype=pd.CategoricalDtype(pd.Index(["a", "b"], dtype=ArrowDtype(pa.string()))),
)
result = ser.str.get_dummies()
expected = pd.DataFrame(
[[True, False], [False, True]], dtype=ArrowDtype(pa.bool_()), columns=["a", "b"]
)
tm.assert_frame_equal(result, expected)


def test_str_partition():
ser = pd.Series(["abcba", None], dtype=ArrowDtype(pa.string()))
Expand Down
29 changes: 26 additions & 3 deletions pandas/tests/strings/test_get_dummies.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@
import pandas.util._test_decorators as td

from pandas import (
NA,
CategoricalDtype,
DataFrame,
Index,
MultiIndex,
Expand All @@ -22,19 +24,28 @@
def test_get_dummies(any_string_dtype):
s = Series(["a|b", "a|c", np.nan], dtype=any_string_dtype)
result = s.str.get_dummies("|")
expected = DataFrame([[1, 1, 0], [1, 0, 1], [0, 0, 0]], columns=list("abc"))
exp_dtype = (
"boolean"
if any_string_dtype == "string" and any_string_dtype.na_value is NA
else "bool"
)
expected = DataFrame(
[[1, 1, 0], [1, 0, 1], [0, 0, 0]], columns=list("abc"), dtype=exp_dtype
)
tm.assert_frame_equal(result, expected)

s = Series(["a;b", "a", 7], dtype=any_string_dtype)
result = s.str.get_dummies(";")
expected = DataFrame([[0, 1, 1], [0, 1, 0], [1, 0, 0]], columns=list("7ab"))
expected = DataFrame(
[[0, 1, 1], [0, 1, 0], [1, 0, 0]], columns=list("7ab"), dtype=exp_dtype
)
tm.assert_frame_equal(result, expected)


def test_get_dummies_index():
# GH9980, GH8028
idx = Index(["a|b", "a|c", "b|c"])
result = idx.str.get_dummies("|")
result = idx.str.get_dummies("|", dtype=np.int64)
Comment on lines 45 to +48
Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

When the input is a `pd.Index`, the result is returned as a `MultiIndex` only if the resulting dtype is not a boolean type:
https://github.com/pandas-dev/pandas/blob/main/pandas/core/strings/accessor.py#L381-L389

With this PR, the default behavior of `str.get_dummies()` changes to use a boolean dtype. To keep these test cases exercising the intended `MultiIndex` behavior, I modified them to pass `dtype=np.int64` explicitly.


expected = MultiIndex.from_tuples(
[(1, 1, 0), (1, 0, 1), (0, 1, 1)], names=("a", "b", "c")
Expand Down Expand Up @@ -125,3 +136,15 @@ def test_get_dummies_with_pa_str_dtype(any_string_dtype):
dtype="str[pyarrow]",
)
tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("dtype_type", ["string", "category"])
def test_get_dummies_ea_dtype(dtype_type, string_dtype_no_object):
    """Check the default result dtype of ``str.get_dummies()`` for string dtypes.

    The input is either a Series of the given string dtype, or a categorical
    Series whose categories use that string dtype. With no ``dtype`` argument
    the dummy columns are expected to be ``"boolean"`` when the string dtype's
    ``na_value`` is ``NA`` and ``"bool"`` otherwise.
    """
    string_dtype = string_dtype_no_object

    # The expected dummy dtype follows the missing-value sentinel of the
    # underlying string dtype, not the (possibly categorical) wrapper.
    if string_dtype.na_value is NA:
        exp_dtype = "boolean"
    else:
        exp_dtype = "bool"

    if dtype_type == "category":
        input_dtype = CategoricalDtype(Index(["a", "b"], string_dtype))
    else:
        input_dtype = string_dtype

    result = Series(["a", "b"], dtype=input_dtype).str.get_dummies()

    expected = DataFrame([[1, 0], [0, 1]], columns=list("ab"), dtype=exp_dtype)
    tm.assert_frame_equal(result, expected)
Loading