Skip to content

Commit

Permalink
Remove MultiIndex._poplevel inplace implementation. (rapidsai#16767)
Browse files Browse the repository at this point in the history
`MultiIndex._poplevel`, which backs `MultiIndex.droplevel`, operates by dropping a given level in place. There are 2 places where `._poplevel` is called, and both usages make a shallow copy of the data first, presumably to work around the side effects of this in-place behavior.

This PR removes the `MultiIndex._poplevel` implementation and instead implements the level-dropping behavior by returning a new object.

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)

Approvers:
  - GALI PREM SAGAR (https://github.com/galipremsagar)

URL: rapidsai#16767
  • Loading branch information
mroeschke authored Sep 19, 2024
1 parent 51c2dd6 commit 7233da9
Show file tree
Hide file tree
Showing 2 changed files with 65 additions and 72 deletions.
111 changes: 46 additions & 65 deletions python/cudf/cudf/core/multiindex.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@
from cudf.utils.utils import NotIterable, _external_only_api, _is_same_name

if TYPE_CHECKING:
from collections.abc import Generator
from collections.abc import Generator, Hashable

from typing_extensions import Self

Expand Down Expand Up @@ -1041,20 +1041,25 @@ def to_frame(
)

@_performance_tracking
def get_level_values(self, level) -> cudf.Index:
def _level_to_ca_label(self, level) -> tuple[Hashable, int]:
"""
Return the values at the requested level
Convert a level to a ColumnAccessor label and an integer position.
Useful if self._column_names != self.names.
Parameters
----------
level : int or label
Returns
-------
An Index containing the values at the requested level.
tuple[Hashable, int]
(ColumnAccessor label corresponding to level, integer position of the level)
"""
colnames = self._data.names
if level not in colnames:
colnames = self._column_names
try:
level_idx = colnames.index(level)
except ValueError:
if isinstance(level, int):
if level < 0:
level = level + len(colnames)
Expand All @@ -1067,8 +1072,22 @@ def get_level_values(self, level) -> cudf.Index:
level = colnames[level_idx]
else:
raise KeyError(f"Level not found: '{level}'")
else:
level_idx = colnames.index(level)
return level, level_idx

@_performance_tracking
def get_level_values(self, level) -> cudf.Index:
"""
Return the values at the requested level
Parameters
----------
level : int or label
Returns
-------
An Index containing the values at the requested level.
"""
level, level_idx = self._level_to_ca_label(level)
level_values = cudf.Index._from_column(
self._data[level], name=self.names[level_idx]
)
Expand Down Expand Up @@ -1420,57 +1439,6 @@ def from_arrays(
codes=codes, levels=levels, sortorder=sortorder, names=names
)

@_performance_tracking
def _poplevels(self, level) -> None | MultiIndex | cudf.Index:
    """
    Detach the requested levels from ``self`` in place and return them.

    Parameters
    ----------
    level : level name or index, list
        A single level, or a list-like of levels, to remove.

    Returns
    -------
    Index built from the removed levels. If only a single level
    is removed, a flat index is returned. If ``level`` is an empty
    list-like, nothing is removed and None is returned.
    """
    requested = level if pd.api.types.is_list_like(level) else (level,)
    positions = sorted(
        self._level_index_from_level(lev) for lev in requested
    )
    if not positions:
        return None

    remaining_names = list(self.names)

    # Snapshot the removed columns and their level names in ascending
    # position order, before anything on self is mutated.
    removed_columns = {
        self._data.names[pos]: self._data[self._data.names[pos]]
        for pos in positions
    }
    removed_names = [self.names[pos] for pos in positions]

    # Remove from the highest position downwards so that the positions
    # still to be processed are not shifted by earlier removals.
    for pos in reversed(positions):
        label = self._data.names[pos]
        del remaining_names[pos]
        removed_columns[label] = self._data.pop(label)

    # Assemble the index that is handed back to the caller.
    result = cudf.core.index._index_from_data(removed_columns)
    result.names = removed_names

    # Bring self in line with the levels that are left. self._data is
    # re-read after the names assignment rather than cached beforehand.
    self.names = remaining_names
    self._levels, self._codes = _compute_levels_and_codes(self._data)

    return result

@_performance_tracking
def swaplevel(self, i=-2, j=-1) -> Self:
"""
Expand Down Expand Up @@ -1523,7 +1491,7 @@ def swaplevel(self, i=-2, j=-1) -> Self:
return midx

@_performance_tracking
def droplevel(self, level=-1) -> MultiIndex | cudf.Index:
def droplevel(self, level=-1) -> Self | cudf.Index:
"""
Removes the specified levels from the MultiIndex.
Expand Down Expand Up @@ -1578,11 +1546,24 @@ def droplevel(self, level=-1) -> MultiIndex | cudf.Index:
>>> idx.droplevel(["first", "second"])
Index([0, 1, 2, 0, 1, 2], dtype='int64', name='third')
"""
mi = self.copy(deep=False)
mi._poplevels(level)
if mi.nlevels == 1:
return mi.get_level_values(mi.names[0])
if is_scalar(level):
level = (level,)
elif len(level) == 0:
return self

new_names = list(self.names)
new_data = self._data.copy(deep=False)
for i in sorted(
(self._level_index_from_level(lev) for lev in level), reverse=True
):
new_names.pop(i)
new_data.pop(self._data.names[i])

if len(new_data) == 1:
return cudf.core.index._index_from_data(new_data)
else:
mi = MultiIndex._from_data(new_data)
mi.names = new_names
return mi

@_performance_tracking
Expand Down Expand Up @@ -1886,7 +1867,7 @@ def __array_function__(self, func, types, args, kwargs):
else:
return NotImplemented

def _level_index_from_level(self, level):
def _level_index_from_level(self, level) -> int:
"""
Return level index from given level name or index
"""
Expand Down
26 changes: 19 additions & 7 deletions python/cudf/cudf/core/reshape.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
from cudf._lib.transform import one_hot_encode
from cudf._lib.types import size_type_dtype
from cudf.api.extensions import no_default
from cudf.api.types import is_scalar
from cudf.core._compat import PANDAS_LT_300
from cudf.core.column import ColumnBase, as_column, column_empty_like
from cudf.core.column_accessor import ColumnAccessor
Expand Down Expand Up @@ -1227,13 +1228,24 @@ def unstack(df, level, fill_value=None, sort: bool = True):
)
return res
else:
df = df.copy(deep=False)
columns = df.index._poplevels(level)
index = df.index
result = _pivot(df, index, columns)
if result.index.nlevels == 1:
result.index = result.index.get_level_values(result.index.names[0])
return result
index = df.index.droplevel(level)
if is_scalar(level):
columns = df.index.get_level_values(level)
else:
new_names = []
ca_data = {}
for lev in level:
ca_level, level_idx = df.index._level_to_ca_label(lev)
new_names.append(df.index.names[level_idx])
ca_data[ca_level] = df.index._data[ca_level]
columns = type(df.index)._from_data(
ColumnAccessor(ca_data, verify=False)
)
columns.names = new_names
result = _pivot(df, index, columns)
if result.index.nlevels == 1:
result.index = result.index.get_level_values(result.index.names[0])
return result


def _get_unique(column: ColumnBase, dummy_na: bool) -> ColumnBase:
Expand Down

0 comments on commit 7233da9

Please sign in to comment.