Skip to content

Commit

Permalink
GH-45175: [Python] Honor the strings_to_categorical keyword in to_pandas for string view type (#45176)
Browse files Browse the repository at this point in the history

### Rationale for this change

Currently this keyword works for string or large string:

```python
>>> table = pa.table({"col": pa.array(["a", "b", "a"], pa.string())})
>>> table.to_pandas(strings_to_categorical=True).dtypes
col    category
dtype: object
>>> table = pa.table({"col": pa.array(["a", "b", "a"], pa.large_string())})
>>> table.to_pandas(strings_to_categorical=True).dtypes
col    category
dtype: object
```

but not for string view:

```python
>>> table = pa.table({"col": pa.array(["a", "b", "a"], pa.string_view())})
>>> table.to_pandas(strings_to_categorical=True).dtypes
col    object
dtype: object
```

For consistency, the `strings_to_categorical` keyword should apply to string view columns as well.

From https://github.com/apache/arrow/pull/44195/files#r1901831460

### Are these changes tested?

Yes

### Are there any user-facing changes?

Yes. When `strings_to_categorical=True` is passed, a column of string_view type is now converted to a pandas Categorical.

* GitHub Issue: #45175

Authored-by: Joris Van den Bossche <[email protected]>
Signed-off-by: Raúl Cumplido <[email protected]>
  • Loading branch information
jorisvandenbossche authored Jan 7, 2025
1 parent e12bc56 commit 2c5ae51
Show file tree
Hide file tree
Showing 2 changed files with 32 additions and 6 deletions.
6 changes: 4 additions & 2 deletions python/pyarrow/src/arrow/python/arrow_to_pandas.cc
Original file line number Diff line number Diff line change
Expand Up @@ -2523,7 +2523,8 @@ Status ConvertCategoricals(const PandasOptions& options, ChunkedArrayVector* arr
}
if (options.strings_to_categorical) {
for (int i = 0; i < static_cast<int>(arrays->size()); i++) {
if (is_base_binary_like((*arrays)[i]->type()->id())) {
if (is_base_binary_like((*arrays)[i]->type()->id()) ||
is_binary_view_like((*arrays)[i]->type()->id())) {
columns_to_encode.push_back(i);
}
}
Expand Down Expand Up @@ -2557,7 +2558,8 @@ Status ConvertChunkedArrayToPandas(const PandasOptions& options,
py_ref = nullptr;
}

if (options.strings_to_categorical && is_base_binary_like(arr->type()->id())) {
if (options.strings_to_categorical && (is_base_binary_like(arr->type()->id()) ||
is_binary_view_like(arr->type()->id()))) {
if (options.zero_copy_only) {
return Status::Invalid("Need to dictionary encode a column, but ",
"only zero-copy conversions allowed");
Expand Down
32 changes: 28 additions & 4 deletions python/pyarrow/tests/test_pandas.py
Original file line number Diff line number Diff line change
Expand Up @@ -1836,10 +1836,13 @@ def test_to_pandas_categories_already_dictionary(self):
result = table.to_pandas(categories=['col'])
assert table.to_pandas().equals(result)

def test_table_str_to_categorical_without_na(self):
@pytest.mark.parametrize(
"string_type", [pa.string(), pa.large_string(), pa.string_view()]
)
def test_table_str_to_categorical_without_na(self, string_type):
values = ['a', 'a', 'b', 'b', 'c']
df = pd.DataFrame({'strings': values})
field = pa.field('strings', pa.string())
field = pa.field('strings', string_type)
schema = pa.schema([field])
table = pa.Table.from_pandas(df, schema=schema)

Expand All @@ -1851,10 +1854,22 @@ def test_table_str_to_categorical_without_na(self):
table.to_pandas(strings_to_categorical=True,
zero_copy_only=True)

def test_table_str_to_categorical_with_na(self):
# chunked array
result = table["strings"].to_pandas(strings_to_categorical=True)
expected = pd.Series(pd.Categorical(values), name="strings")
tm.assert_series_equal(result, expected)

with pytest.raises(pa.ArrowInvalid):
table["strings"].to_pandas(strings_to_categorical=True,
zero_copy_only=True)

@pytest.mark.parametrize(
"string_type", [pa.string(), pa.large_string(), pa.string_view()]
)
def test_table_str_to_categorical_with_na(self, string_type):
values = [None, 'a', 'b', np.nan]
df = pd.DataFrame({'strings': values})
field = pa.field('strings', pa.string())
field = pa.field('strings', string_type)
schema = pa.schema([field])
table = pa.Table.from_pandas(df, schema=schema)

Expand All @@ -1866,6 +1881,15 @@ def test_table_str_to_categorical_with_na(self):
table.to_pandas(strings_to_categorical=True,
zero_copy_only=True)

# chunked array
result = table["strings"].to_pandas(strings_to_categorical=True)
expected = pd.Series(pd.Categorical(values), name="strings")
tm.assert_series_equal(result, expected)

with pytest.raises(pa.ArrowInvalid):
table["strings"].to_pandas(strings_to_categorical=True,
zero_copy_only=True)

# Regression test for ARROW-2101
def test_array_of_bytes_to_strings(self):
converted = pa.array(np.array([b'x'], dtype=object), pa.string())
Expand Down

0 comments on commit 2c5ae51

Please sign in to comment.