Skip to content

Commit

Permalink
Define cudf repr methods on the Column (rapidsai#17675)
Browse files Browse the repository at this point in the history
Refactors the repr handling of cudf Python objects so that the core conversion ("cleaning" of nulls) is defined at the column level, and the per-column conversions are then rolled up at the level of `Frame` and its subclasses.

Notable positive changes:

* `repr(cudf.Series)` no longer deep copies
* Fixes a bug in `repr(cudf.Series)` for timedelta dtypes so that the output better matches pandas (unit tests adjusted accordingly)

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)

Approvers:
  - Matthew Murray (https://github.com/Matt711)

URL: rapidsai#17675
  • Loading branch information
mroeschke authored Jan 8, 2025
1 parent f017f86 commit f1cb88d
Show file tree
Hide file tree
Showing 13 changed files with 117 additions and 143 deletions.
4 changes: 2 additions & 2 deletions python/cudf/cudf/core/_base_index.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2021-2024, NVIDIA CORPORATION.
# Copyright (c) 2021-2025, NVIDIA CORPORATION.

from __future__ import annotations

Expand Down Expand Up @@ -350,7 +350,7 @@ def names(self, values):

self.name = values[0]

def _clean_nulls_from_index(self):
def _pandas_repr_compatible(self):
"""
Convert all na values(if any) in Index object
to `<NA>` as a preprocessing step to `__repr__` methods.
Expand Down
16 changes: 15 additions & 1 deletion python/cudf/cudf/core/column/column.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2018-2024, NVIDIA CORPORATION.
# Copyright (c) 2018-2025, NVIDIA CORPORATION.

from __future__ import annotations

Expand Down Expand Up @@ -77,6 +77,7 @@

from cudf._typing import ColumnLike, Dtype, ScalarLike
from cudf.core.column.numerical import NumericalColumn
from cudf.core.column.strings import StringColumn

if PANDAS_GE_210:
NumpyExtensionArray = pd.arrays.NumpyExtensionArray
Expand All @@ -92,6 +93,8 @@ class ColumnBase(Column, Serializable, BinaryOperand, Reducible):
"min",
}

_PANDAS_NA_REPR = str(pd.NA)

def data_array_view(
self, *, mode: Literal["write", "read"] = "write"
) -> "cuda.devicearray.DeviceNDArray":
Expand Down Expand Up @@ -176,6 +179,17 @@ def __repr__(self):
f"dtype: {self.dtype}"
)

def _prep_pandas_compat_repr(self) -> StringColumn | Self:
"""
Preprocess Column to be compatible with pandas repr, namely handling nulls.

A column without nulls is returned unchanged. A column with nulls is
cast to ``str`` and its nulls are filled with ``self._PANDAS_NA_REPR``:

* null (datetime/timedelta) = str(pd.NaT)
* null (other types)= str(pd.NA)
"""
if self.has_nulls():
# Cast to str first so the textual null marker can be filled in.
return self.astype("str").fillna(self._PANDAS_NA_REPR)
return self

def to_pandas(
self,
*,
Expand Down
4 changes: 3 additions & 1 deletion python/cudf/cudf/core/column/datetime.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2019-2024, NVIDIA CORPORATION.
# Copyright (c) 2019-2025, NVIDIA CORPORATION.

from __future__ import annotations

Expand Down Expand Up @@ -212,6 +212,8 @@ class DatetimeColumn(column.ColumnBase):
"__rsub__",
}

_PANDAS_NA_REPR = str(pd.NaT)

def __init__(
self,
data: Buffer,
Expand Down
13 changes: 12 additions & 1 deletion python/cudf/cudf/core/column/lists.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2020-2024, NVIDIA CORPORATION.
# Copyright (c) 2020-2025, NVIDIA CORPORATION.

from __future__ import annotations

Expand Down Expand Up @@ -28,6 +28,7 @@

from cudf._typing import ColumnBinaryOperand, ColumnLike, Dtype, ScalarLike
from cudf.core.buffer import Buffer
from cudf.core.column.string import StringColumn


class ListColumn(ColumnBase):
Expand Down Expand Up @@ -67,6 +68,16 @@ def __init__(
children=children,
)

def _prep_pandas_compat_repr(self) -> StringColumn | Self:
"""
Preprocess Column to be compatible with pandas repr.

Null handling is not implemented for list columns yet (see the TODO
below), so the column is currently returned unchanged even when it
contains nulls.
"""
# TODO: handle if self.has_nulls(): case
return self

@cached_property
def memory_usage(self):
n = super().memory_usage
Expand Down
13 changes: 12 additions & 1 deletion python/cudf/cudf/core/column/struct.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2020-2024, NVIDIA CORPORATION.
# Copyright (c) 2020-2025, NVIDIA CORPORATION.
from __future__ import annotations

from functools import cached_property
Expand All @@ -18,6 +18,7 @@

from cudf._typing import Dtype
from cudf.core.buffer import Buffer
from cudf.core.column.string import StringColumn


class StructColumn(ColumnBase):
Expand Down Expand Up @@ -51,6 +52,16 @@ def __init__(
children=children,
)

def _prep_pandas_compat_repr(self) -> StringColumn | Self:
"""
Preprocess Column to be compatible with pandas repr.

Null handling is not implemented for struct columns yet (see the TODO
below), so the column is currently returned unchanged even when it
contains nulls.
"""
# TODO: handle if self.has_nulls(): case
return self

@staticmethod
def _validate_dtype_instance(dtype: StructDtype) -> StructDtype:
# IntervalDtype is a subclass of StructDtype, so compare types exactly
Expand Down
2 changes: 2 additions & 0 deletions python/cudf/cudf/core/column/timedelta.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,8 @@ class TimeDeltaColumn(ColumnBase):
"__rfloordiv__",
}

_PANDAS_NA_REPR = str(pd.NaT)

def __init__(
self,
data: Buffer,
Expand Down
53 changes: 8 additions & 45 deletions python/cudf/cudf/core/dataframe.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2018-2024, NVIDIA CORPORATION.
# Copyright (c) 2018-2025, NVIDIA CORPORATION.

from __future__ import annotations

Expand Down Expand Up @@ -1894,7 +1894,7 @@ def astype(
dtype = {cc: dtype for cc in self._column_names}
return super().astype(dtype, copy, errors)

def _clean_renderable_dataframe(self, output):
def _clean_renderable_dataframe(self, output: Self) -> str:
"""
This method takes in partial/preprocessed dataframe
and returns correct representation of it with correct
Expand Down Expand Up @@ -1929,59 +1929,25 @@ def _clean_renderable_dataframe(self, output):
)
return "\n".join(lines)

def _clean_nulls_from_dataframe(self, df):
"""
This function converts all ``null`` values to ``<NA>`` for
representation as a string in `__repr__`.
Since we utilize Pandas `__repr__` at all places in our code
for formatting purposes, we convert columns to `str` dtype for
filling with `<NA>` values.
"""
for col in df._data:
if isinstance(
df._data[col].dtype, (cudf.StructDtype, cudf.ListDtype)
):
# TODO we need to handle this
pass
elif df._data[col].has_nulls():
fill_value = (
str(cudf.NaT)
if isinstance(
df._data[col],
(
cudf.core.column.DatetimeColumn,
cudf.core.column.TimeDeltaColumn,
),
)
else str(cudf.NA)
)

df[col] = df._data[col].astype("str").fillna(fill_value)
else:
df[col] = df._data[col]

return df

def _get_renderable_dataframe(self):
def _get_renderable_dataframe(self) -> Self:
"""
Takes rows and columns from pandas settings or estimation from size.
pulls quadrants based off of some known parameters then style for
multiindex as well producing an efficient representative string
for printing with the dataframe.
"""
max_rows = pd.options.display.max_rows
nrows = np.max([len(self) if max_rows is None else max_rows, 1])
if pd.options.display.max_rows == 0:
nrows = len(self)
if max_rows in {0, None}:
max_rows = len(self)
nrows = max(max_rows, 1)
ncols = (
pd.options.display.max_columns
if pd.options.display.max_columns
else pd.options.display.width / 2
)

if len(self) <= nrows and self._num_columns <= ncols:
output = self.copy(deep=False)
output = self
elif self.empty and len(self.index) > 0:
max_seq_items = pd.options.display.max_seq_items
# In case of Empty DataFrame with index, Pandas prints
Expand Down Expand Up @@ -2041,10 +2007,7 @@ def _get_renderable_dataframe(self):
lower = cudf.concat([lower_left, lower_right], axis=1)
output = cudf.concat([upper, lower])

output = self._clean_nulls_from_dataframe(output)
output.index = output.index._clean_nulls_from_index()

return output
return output._pandas_repr_compatible()

@_performance_tracking
def __repr__(self):
Expand Down
9 changes: 8 additions & 1 deletion python/cudf/cudf/core/frame.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2020-2024, NVIDIA CORPORATION.
# Copyright (c) 2020-2025, NVIDIA CORPORATION.

from __future__ import annotations

Expand Down Expand Up @@ -820,6 +820,13 @@ def fillna(
inplace=inplace,
)

def _pandas_repr_compatible(self) -> Self:
"""
Return Self but with columns prepared for a pandas-like repr.

Each column in ``self._columns`` is passed through its own
``_prep_pandas_compat_repr`` and the results are reassembled into an
object like ``self``.
"""
columns = (col._prep_pandas_compat_repr() for col in self._columns)
return self._from_data_like_self(
self._data._from_columns_like_self(columns, verify=False)
)

@_performance_tracking
def _drop_column(
self, name: abc.Hashable, errors: Literal["ignore", "raise"] = "raise"
Expand Down
30 changes: 5 additions & 25 deletions python/cudf/cudf/core/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -339,7 +339,7 @@ def _values(self) -> ColumnBase:
else:
return column.column_empty(0, dtype=self.dtype)

def _clean_nulls_from_index(self) -> Self:
def _pandas_repr_compatible(self) -> Self:
return self

def _is_numeric(self) -> bool:
Expand Down Expand Up @@ -1127,15 +1127,9 @@ def _from_data(cls, data: MutableMapping, name: Any = no_default) -> Self:
out.name = name
return out

@classmethod
@_performance_tracking
def _from_data_like_self(
cls, data: MutableMapping, name: Any = no_default
) -> Self:
out = _index_from_data(data, name)
if name is not no_default:
out.name = name
return out
def _from_data_like_self(self, data: MutableMapping) -> Self:
return _index_from_data(data, self.name)

@classmethod
@_performance_tracking
Expand Down Expand Up @@ -1494,7 +1488,7 @@ def __repr__(self) -> str:
if isinstance(self._values, StringColumn):
output = repr(self.to_pandas(nullable=True))
else:
output = repr(self._clean_nulls_from_index().to_pandas())
output = repr(self._pandas_repr_compatible().to_pandas())
# We should remove all the single quotes
# from the output due to the type-cast to
# object dtype happening above.
Expand Down Expand Up @@ -1650,20 +1644,6 @@ def __contains__(self, item) -> bool:
hash(item)
return item in self._column

def _clean_nulls_from_index(self) -> Index:
if self._values.has_nulls():
fill_value = (
str(cudf.NaT)
if isinstance(self, (DatetimeIndex, TimedeltaIndex))
else str(cudf.NA)
)
return Index._from_column(
self._column.astype("str").fillna(fill_value),
name=self.name,
)

return self

def any(self) -> bool:
return self._column.any()

Expand Down Expand Up @@ -3615,7 +3595,7 @@ def _is_interval(self) -> bool:
def _is_boolean(self) -> bool:
return False

def _clean_nulls_from_index(self) -> Self:
def _pandas_repr_compatible(self) -> Self:
return self

@property
Expand Down
6 changes: 6 additions & 0 deletions python/cudf/cudf/core/indexed_frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -4410,6 +4410,12 @@ def _apply_boolean_mask(self, boolean_mask: BooleanMask, keep_index=True):
index_names=self.index.names if keep_index else None,
)

def _pandas_repr_compatible(self) -> Self:
"""
Return Self but with columns and index prepared for a pandas-like repr.
"""
result = super()._pandas_repr_compatible()
# Also replace the index with its repr-compatible form.
result.index = self.index._pandas_repr_compatible()
return result

def take(self, indices, axis=0):
"""Return a new frame containing the rows specified by *indices*.
Expand Down
19 changes: 8 additions & 11 deletions python/cudf/cudf/core/multiindex.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2019-2024, NVIDIA CORPORATION.
# Copyright (c) 2019-2025, NVIDIA CORPORATION.

from __future__ import annotations

Expand Down Expand Up @@ -361,6 +361,13 @@ def _from_data(
name=name,
)

@_performance_tracking
def _from_data_like_self(self, data: MutableMapping) -> Self:
"""
Build a new object from ``data`` via ``type(self)._from_data``.

The result is created with ``self.name``. ``self.names`` is copied onto
it only when it has the same number of levels as ``self``.
"""
mi = type(self)._from_data(data, name=self.name)
# Only reuse the level names when the level count is unchanged.
if mi.nlevels == self.nlevels:
mi.names = self.names
return mi

@classmethod
def _simple_new(
cls,
Expand Down Expand Up @@ -1753,16 +1760,6 @@ def nunique(self, dropna: bool = True) -> int:
mi = self.dropna(how="all") if dropna else self
return len(mi.unique())

def _clean_nulls_from_index(self) -> Self:
"""
Convert all na values(if any) in MultiIndex object
to `<NA>` as a preprocessing step to `__repr__` methods.
"""
index_df = self.to_frame(index=False, name=list(range(self.nlevels)))
return MultiIndex.from_frame(
index_df._clean_nulls_from_dataframe(index_df), names=self.names
)

@_performance_tracking
def memory_usage(self, deep: bool = False) -> int:
usage = sum(col.memory_usage for col in self._columns)
Expand Down
Loading

0 comments on commit f1cb88d

Please sign in to comment.