diff --git a/python/cudf/cudf/core/_base_index.py b/python/cudf/cudf/core/_base_index.py index c2f3c782d10..2806a1f6c23 100644 --- a/python/cudf/cudf/core/_base_index.py +++ b/python/cudf/cudf/core/_base_index.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021-2024, NVIDIA CORPORATION. +# Copyright (c) 2021-2025, NVIDIA CORPORATION. from __future__ import annotations @@ -350,7 +350,7 @@ def names(self, values): self.name = values[0] - def _clean_nulls_from_index(self): + def _pandas_repr_compatible(self): """ Convert all na values(if any) in Index object to `<NA>` as a preprocessing step to `__repr__` methods. diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 24b657f1c32..ef815e44d9d 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018-2024, NVIDIA CORPORATION. +# Copyright (c) 2018-2025, NVIDIA CORPORATION. from __future__ import annotations @@ -77,6 +77,7 @@ from cudf._typing import ColumnLike, Dtype, ScalarLike from cudf.core.column.numerical import NumericalColumn + from cudf.core.column.string import StringColumn if PANDAS_GE_210: NumpyExtensionArray = pd.arrays.NumpyExtensionArray @@ -92,6 +93,8 @@ class ColumnBase(Column, Serializable, BinaryOperand, Reducible): "min", } + _PANDAS_NA_REPR = str(pd.NA) + def data_array_view( self, *, mode: Literal["write", "read"] = "write" ) -> "cuda.devicearray.DeviceNDArray": @@ -176,6 +179,17 @@ def __repr__(self): f"dtype: {self.dtype}" ) + def _prep_pandas_compat_repr(self) -> StringColumn | Self: + """ + Preprocess Column to be compatible with pandas repr, namely handling nulls.
+ + * null (datetime/timedelta) = str(pd.NaT) + * null (other types) = str(pd.NA) + """ + if self.has_nulls(): + return self.astype("str").fillna(self._PANDAS_NA_REPR) + return self + def to_pandas( self, *, diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py index b6a4122ebb9..80551e33115 100644 --- a/python/cudf/cudf/core/column/datetime.py +++ b/python/cudf/cudf/core/column/datetime.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2024, NVIDIA CORPORATION. +# Copyright (c) 2019-2025, NVIDIA CORPORATION. from __future__ import annotations @@ -212,6 +212,8 @@ class DatetimeColumn(column.ColumnBase): "__rsub__", } + _PANDAS_NA_REPR = str(pd.NaT) + def __init__( self, data: Buffer, diff --git a/python/cudf/cudf/core/column/lists.py b/python/cudf/cudf/core/column/lists.py index 6283e498842..9c5041df521 100644 --- a/python/cudf/cudf/core/column/lists.py +++ b/python/cudf/cudf/core/column/lists.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. +# Copyright (c) 2020-2025, NVIDIA CORPORATION. from __future__ import annotations @@ -28,6 +28,7 @@ from cudf._typing import ColumnBinaryOperand, ColumnLike, Dtype, ScalarLike from cudf.core.buffer import Buffer + from cudf.core.column.string import StringColumn class ListColumn(ColumnBase): @@ -67,6 +68,16 @@ def __init__( children=children, ) + def _prep_pandas_compat_repr(self) -> StringColumn | Self: + """ + Return self; null preprocessing is not yet implemented for lists. + + Unlike the base implementation, nulls are not replaced with + str(pd.NA) here (see the TODO below). + """ + # TODO: handle if self.has_nulls(): case + return self + @cached_property def memory_usage(self): n = super().memory_usage diff --git a/python/cudf/cudf/core/column/struct.py b/python/cudf/cudf/core/column/struct.py index ba765b50729..052a68cec98 100644 --- a/python/cudf/cudf/core/column/struct.py +++ b/python/cudf/cudf/core/column/struct.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION.
+# Copyright (c) 2020-2025, NVIDIA CORPORATION. from __future__ import annotations from functools import cached_property @@ -18,6 +18,7 @@ from cudf._typing import Dtype from cudf.core.buffer import Buffer + from cudf.core.column.string import StringColumn class StructColumn(ColumnBase): @@ -51,6 +52,16 @@ def __init__( children=children, ) + def _prep_pandas_compat_repr(self) -> StringColumn | Self: + """ + Return self; null preprocessing is not yet implemented for structs. + + Unlike the base implementation, nulls are not replaced with + str(pd.NA) here (see the TODO below). + """ + # TODO: handle if self.has_nulls(): case + return self + @staticmethod def _validate_dtype_instance(dtype: StructDtype) -> StructDtype: # IntervalDtype is a subclass of StructDtype, so compare types exactly diff --git a/python/cudf/cudf/core/column/timedelta.py b/python/cudf/cudf/core/column/timedelta.py index 749ab8e837a..302178ea277 100644 --- a/python/cudf/cudf/core/column/timedelta.py +++ b/python/cudf/cudf/core/column/timedelta.py @@ -81,6 +81,8 @@ class TimeDeltaColumn(ColumnBase): "__rfloordiv__", } + _PANDAS_NA_REPR = str(pd.NaT) + def __init__( self, data: Buffer, diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index b2121511a14..40d36a6ff56 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018-2024, NVIDIA CORPORATION. +# Copyright (c) 2018-2025, NVIDIA CORPORATION.
from __future__ import annotations @@ -1894,7 +1894,7 @@ def astype( dtype = {cc: dtype for cc in self._column_names} return super().astype(dtype, copy, errors) - def _clean_renderable_dataframe(self, output): + def _clean_renderable_dataframe(self, output: Self) -> str: """ This method takes in partial/preprocessed dataframe and returns correct representation of it with correct @@ -1929,41 +1929,7 @@ def _clean_renderable_dataframe(self, output): ) return "\n".join(lines) - def _clean_nulls_from_dataframe(self, df): - """ - This function converts all ``null`` values to ``<NA>`` for - representation as a string in `__repr__`. - - Since we utilize Pandas `__repr__` at all places in our code - for formatting purposes, we convert columns to `str` dtype for - filling with `<NA>` values. - """ - for col in df._data: - if isinstance( - df._data[col].dtype, (cudf.StructDtype, cudf.ListDtype) - ): - # TODO we need to handle this - pass - elif df._data[col].has_nulls(): - fill_value = ( - str(cudf.NaT) - if isinstance( - df._data[col], - ( - cudf.core.column.DatetimeColumn, - cudf.core.column.TimeDeltaColumn, - ), - ) - else str(cudf.NA) - ) - - df[col] = df._data[col].astype("str").fillna(fill_value) - else: - df[col] = df._data[col] - - return df - - def _get_renderable_dataframe(self): + def _get_renderable_dataframe(self) -> Self: """ Takes rows and columns from pandas settings or estimation from size. pulls quadrants based off of some known parameters then style for @@ -1971,9 +1937,9 @@ def _get_renderable_dataframe(self): for printing with the dataframe.
""" max_rows = pd.options.display.max_rows - nrows = np.max([len(self) if max_rows is None else max_rows, 1]) - if pd.options.display.max_rows == 0: - nrows = len(self) + if max_rows in {0, None}: + max_rows = len(self) + nrows = max(max_rows, 1) ncols = ( pd.options.display.max_columns if pd.options.display.max_columns @@ -1981,7 +1947,7 @@ def _get_renderable_dataframe(self): ) if len(self) <= nrows and self._num_columns <= ncols: - output = self.copy(deep=False) + output = self elif self.empty and len(self.index) > 0: max_seq_items = pd.options.display.max_seq_items # In case of Empty DataFrame with index, Pandas prints @@ -2041,10 +2007,7 @@ def _get_renderable_dataframe(self): lower = cudf.concat([lower_left, lower_right], axis=1) output = cudf.concat([upper, lower]) - output = self._clean_nulls_from_dataframe(output) - output.index = output.index._clean_nulls_from_index() - - return output + return output._pandas_repr_compatible() @_performance_tracking def __repr__(self): diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 8f45c6f0115..abf9f7b3686 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. +# Copyright (c) 2020-2025, NVIDIA CORPORATION. 
from __future__ import annotations @@ -820,6 +820,13 @@ def fillna( inplace=inplace, ) + def _pandas_repr_compatible(self) -> Self: + """Return Self but with columns prepared for a pandas-like repr.""" + columns = (col._prep_pandas_compat_repr() for col in self._columns) + return self._from_data_like_self( + self._data._from_columns_like_self(columns, verify=False) + ) + @_performance_tracking def _drop_column( self, name: abc.Hashable, errors: Literal["ignore", "raise"] = "raise" diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index 85be8d21d27..54635b162bc 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -339,7 +339,7 @@ def _values(self) -> ColumnBase: else: return column.column_empty(0, dtype=self.dtype) - def _clean_nulls_from_index(self) -> Self: + def _pandas_repr_compatible(self) -> Self: return self def _is_numeric(self) -> bool: @@ -1127,15 +1127,9 @@ def _from_data(cls, data: MutableMapping, name: Any = no_default) -> Self: out.name = name return out - @classmethod @_performance_tracking - def _from_data_like_self( - cls, data: MutableMapping, name: Any = no_default - ) -> Self: - out = _index_from_data(data, name) - if name is not no_default: - out.name = name - return out + def _from_data_like_self(self, data: MutableMapping) -> Self: + return _index_from_data(data, self.name) @classmethod @_performance_tracking @@ -1494,7 +1488,7 @@ def __repr__(self) -> str: if isinstance(self._values, StringColumn): output = repr(self.to_pandas(nullable=True)) else: - output = repr(self._clean_nulls_from_index().to_pandas()) + output = repr(self._pandas_repr_compatible().to_pandas()) # We should remove all the single quotes # from the output due to the type-cast to # object dtype happening above. 
@@ -1650,20 +1644,6 @@ def __contains__(self, item) -> bool: hash(item) return item in self._column - def _clean_nulls_from_index(self) -> Index: - if self._values.has_nulls(): - fill_value = ( - str(cudf.NaT) - if isinstance(self, (DatetimeIndex, TimedeltaIndex)) - else str(cudf.NA) - ) - return Index._from_column( - self._column.astype("str").fillna(fill_value), - name=self.name, - ) - - return self - def any(self) -> bool: return self._column.any() @@ -3615,7 +3595,7 @@ def _is_interval(self) -> bool: def _is_boolean(self) -> bool: return False - def _clean_nulls_from_index(self) -> Self: + def _pandas_repr_compatible(self) -> Self: return self @property diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index e9ed74f804b..c779e1ebe97 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -4410,6 +4410,12 @@ def _apply_boolean_mask(self, boolean_mask: BooleanMask, keep_index=True): index_names=self.index.names if keep_index else None, ) + def _pandas_repr_compatible(self) -> Self: + """Return Self but with columns prepared for a pandas-like repr.""" + result = super()._pandas_repr_compatible() + result.index = self.index._pandas_repr_compatible() + return result + def take(self, indices, axis=0): """Return a new frame containing the rows specified by *indices*. diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py index 1e613e49ffc..e7efd01ca85 100644 --- a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2024, NVIDIA CORPORATION. +# Copyright (c) 2019-2025, NVIDIA CORPORATION. 
from __future__ import annotations @@ -361,6 +361,13 @@ def _from_data( name=name, ) + @_performance_tracking + def _from_data_like_self(self, data: MutableMapping) -> Self: + mi = type(self)._from_data(data, name=self.name) + if mi.nlevels == self.nlevels: + mi.names = self.names + return mi + @classmethod def _simple_new( cls, @@ -1753,16 +1760,6 @@ def nunique(self, dropna: bool = True) -> int: mi = self.dropna(how="all") if dropna else self return len(mi.unique()) - def _clean_nulls_from_index(self) -> Self: - """ - Convert all na values(if any) in MultiIndex object - to `<NA>` as a preprocessing step to `__repr__` methods. - """ - index_df = self.to_frame(index=False, name=list(range(self.nlevels))) - return MultiIndex.from_frame( - index_df._clean_nulls_from_dataframe(index_df), names=self.names - ) - @_performance_tracking def memory_usage(self, deep: bool = False) -> int: usage = sum(col.memory_usage for col in self._columns) diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 49c2c8cf387..3b047ee5ed4 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -1449,35 +1449,16 @@ def __repr__(self): warnings.simplefilter("ignore", FutureWarning) preprocess = cudf.concat([top, bottom]) else: - preprocess = self.copy() - preprocess.index = preprocess.index._clean_nulls_from_index() - if ( - preprocess.nullable - and not isinstance( - preprocess.dtype, - ( - cudf.CategoricalDtype, - cudf.ListDtype, - cudf.StructDtype, - cudf.core.dtypes.DecimalDtype, - ), - ) - ) or preprocess.dtype.kind == "m": - fill_value = ( - str(cudf.NaT) - if preprocess.dtype.kind in "mM" - else str(cudf.NA) - ) - output = repr( - preprocess.astype("str").fillna(fill_value).to_pandas() - ) - elif isinstance(preprocess.dtype, cudf.CategoricalDtype): + preprocess = self + if isinstance(preprocess.dtype, cudf.CategoricalDtype): min_rows = ( height if pd.get_option("display.min_rows") == 0 else pd.get_option("display.min_rows") )
show_dimensions = pd.get_option("display.show_dimensions") + preprocess = preprocess.copy(deep=False) + preprocess.index = preprocess.index._pandas_repr_compatible() if preprocess.dtype.categories.dtype.kind == "f": pd_series = ( preprocess.astype("str") @@ -1502,7 +1483,7 @@ def __repr__(self): na_rep=str(cudf.NA), ) else: - output = repr(preprocess.to_pandas()) + output = repr(preprocess._pandas_repr_compatible().to_pandas()) lines = output.split("\n") if isinstance(preprocess.dtype, cudf.CategoricalDtype): diff --git a/python/cudf/cudf/tests/test_repr.py b/python/cudf/cudf/tests/test_repr.py index bf0c97adb00..2cb742727cc 100644 --- a/python/cudf/cudf/tests/test_repr.py +++ b/python/cudf/cudf/tests/test_repr.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2024, NVIDIA CORPORATION. +# Copyright (c) 2019-2025, NVIDIA CORPORATION. import textwrap @@ -618,9 +618,9 @@ def test_timedelta_series_s_us_repr(data, dtype): cudf.Series([1000000, 200000, 3000000], dtype="timedelta64[ns]"), textwrap.dedent( """ - 0 0 days 00:00:00.001000000 - 1 0 days 00:00:00.000200000 - 2 0 days 00:00:00.003000000 + 0 0 days 00:00:00.001000 + 1 0 days 00:00:00.000200 + 2 0 days 00:00:00.003000 dtype: timedelta64[ns] """ ), @@ -710,12 +710,12 @@ def test_timedelta_series_s_us_repr(data, dtype): ), textwrap.dedent( """ - 0 0 days 00:00:00.012 - 1 0 days 00:00:00.012 - 2 0 days 00:00:00.022 - 3 0 days 00:00:00.343 - 4 0 days 01:12:33.534 - 5 0 days 00:07:15.342 + 0 0 days 00:00:00.012000 + 1 0 days 00:00:00.012000 + 2 0 days 00:00:00.022000 + 3 0 days 00:00:00.343000 + 4 0 days 01:12:33.534000 + 5 0 days 00:07:15.342000 dtype: timedelta64[ms] """ ), @@ -745,13 +745,13 @@ def test_timedelta_series_s_us_repr(data, dtype): ), textwrap.dedent( """ - 0 0 days 00:00:00.001 - 1 0 days 00:00:01.132 - 2 0 days 06:27:03.231 - 3 0 days 00:00:00.233 - 4 0 days 00:00:00 - 5 0 days 00:00:00.332 - 6 0 days 00:00:00.323 + 0 0 days 00:00:00.001000 + 1 0 days 00:00:01.132000 + 2 0 days 06:27:03.231000 + 3 0 days 
00:00:00.233000 + 4 0 days 00:00:00 + 5 0 days 00:00:00.332000 + 6 0 days 00:00:00.323000 dtype: timedelta64[ms] """ ), @@ -771,13 +771,13 @@ def test_timedelta_series_s_us_repr(data, dtype): ), textwrap.dedent( """ - 0 157937 days 02:23:52.432 - 1 1 days 13:25:36.784 - 2 2 days 20:09:05.345 - 3 2 days 14:03:52.411 - 4 11573 days 23:39:03.241 - 5 42 days 01:35:48.734 - 6 0 days 00:00:23.234 + 0 157937 days 02:23:52.432000 + 1 1 days 13:25:36.784000 + 2 2 days 20:09:05.345000 + 3 2 days 14:03:52.411000 + 4 11573 days 23:39:03.241000 + 5 42 days 01:35:48.734000 + 6 0 days 00:00:23.234000 dtype: timedelta64[ms] """ ), @@ -824,13 +824,13 @@ def test_timedelta_series_s_us_repr(data, dtype): ), textwrap.dedent( """ - 0 157937 days 02:23:52.432 - 1 1 days 13:25:36.784 - 2 2 days 20:09:05.345 - 3 2 days 14:03:52.411 - 4 11573 days 23:39:03.241 - 5 42 days 01:35:48.734 - 6 0 days 00:00:23.234 + 0 157937 days 02:23:52.432000 + 1 1 days 13:25:36.784000 + 2 2 days 20:09:05.345000 + 3 2 days 14:03:52.411000 + 4 11573 days 23:39:03.241000 + 5 42 days 01:35:48.734000 + 6 0 days 00:00:23.234000 Name: abc, dtype: timedelta64[ms] """ ),