Skip to content

Commit

Permalink
feat: ListNamespace.len() for Expr and Series (#1536)
Browse files Browse the repository at this point in the history
* feat: list namespace WIP

* WIP

* fixed?

* add Self

* polars pre 1.16

* docstring examples

* pragma: no cover

* casting to UInt32

* manual renaming

* avoid alias in test
  • Loading branch information
FBruzzesi authored Dec 8, 2024
1 parent 9742ee5 commit abfc8de
Show file tree
Hide file tree
Showing 16 changed files with 382 additions and 11 deletions.
9 changes: 9 additions & 0 deletions docs/api-reference/expr_list.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
# `narwhals.Expr.list`

::: narwhals.expr.ExprListNamespace
    handler: python
    options:
      members:
        - len
      show_source: false
      show_bases: false
9 changes: 9 additions & 0 deletions docs/api-reference/series_list.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
# `narwhals.Series.list`

::: narwhals.series.SeriesListNamespace
    handler: python
    options:
      members:
        - len
      show_source: false
      show_bases: false
16 changes: 16 additions & 0 deletions narwhals/_arrow/expr.py
Original file line number Diff line number Diff line change
Expand Up @@ -513,6 +513,10 @@ def cat(self: Self) -> ArrowExprCatNamespace:
def name(self: Self) -> ArrowExprNameNamespace:
return ArrowExprNameNamespace(self)

@property
def list(self: Self) -> ArrowExprListNamespace:
    """Give access to the list-dtype operations of this expression.

    Returns:
        An `ArrowExprListNamespace` wrapping this expression.
    """
    list_namespace = ArrowExprListNamespace(self)
    return list_namespace


class ArrowExprCatNamespace:
def __init__(self: Self, expr: ArrowExpr) -> None:
Expand Down Expand Up @@ -869,3 +873,15 @@ def to_uppercase(self: Self) -> ArrowExpr:
backend_version=self._compliant_expr._backend_version,
version=self._compliant_expr._version,
)


class ArrowExprListNamespace:
    """List-dtype operations exposed on an `ArrowExpr`."""

    def __init__(self: Self, expr: ArrowExpr) -> None:
        # Expression whose output the list operations are applied to.
        self._expr = expr

    def len(self: Self) -> ArrowExpr:
        """Build an expression that computes the length of each list.

        The work is delegated to the `len` method of the series-level `list`
        namespace through `reuse_series_namespace_implementation`.

        Returns:
            The expression produced by `reuse_series_namespace_implementation`.
        """
        namespace, method = "list", "len"
        return reuse_series_namespace_implementation(self._expr, namespace, method)
17 changes: 17 additions & 0 deletions narwhals/_arrow/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -1062,6 +1062,10 @@ def cat(self: Self) -> ArrowSeriesCatNamespace:
def str(self: Self) -> ArrowSeriesStringNamespace:
return ArrowSeriesStringNamespace(self)

@property
def list(self: Self) -> ArrowSeriesListNamespace:
    """Give access to the list-dtype operations of this series.

    Returns:
        An `ArrowSeriesListNamespace` wrapping this series.
    """
    list_namespace = ArrowSeriesListNamespace(self)
    return list_namespace


class ArrowSeriesDateTimeNamespace:
def __init__(self: Self, series: ArrowSeries) -> None:
Expand Down Expand Up @@ -1458,3 +1462,16 @@ def to_lowercase(self: Self) -> ArrowSeries:
return self._compliant_series._from_native_series(
pc.utf8_lower(self._compliant_series._native_series),
)


class ArrowSeriesListNamespace:
    """List-dtype operations exposed on an `ArrowSeries`."""

    def __init__(self: Self, series: ArrowSeries) -> None:
        # Series whose native data the list operations read.
        self._arrow_series = series

    def len(self: Self) -> ArrowSeries:
        """Compute the length of each list element as unsigned 32-bit integers.

        Returns:
            A new series wrapping `list_value_length` of the native data,
            cast to `uint32`.
        """
        import pyarrow as pa  # ignore-banned-import()
        import pyarrow.compute as pc  # ignore-banned-import()

        series = self._arrow_series
        lengths = pc.list_value_length(series._native_series)
        # Cast explicitly so the result dtype is always uint32.
        lengths_u32 = pc.cast(lengths, pa.uint32())
        return series._from_native_series(lengths_u32)
10 changes: 7 additions & 3 deletions narwhals/_arrow/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -118,9 +118,13 @@ def narwhals_to_native_dtype(dtype: DType | type[DType], version: Version) -> pa
return pa.duration(time_unit)
if isinstance_or_issubclass(dtype, dtypes.Date):
return pa.date32()
if isinstance_or_issubclass(dtype, dtypes.List): # pragma: no cover
msg = "Converting to List dtype is not supported yet"
return NotImplementedError(msg)
if isinstance_or_issubclass(dtype, dtypes.List):
return pa.list_(
value_type=narwhals_to_native_dtype(
dtype.inner, # type: ignore[union-attr]
version=version,
)
)
if isinstance_or_issubclass(dtype, dtypes.Struct):  # pragma: no cover
    # Raise the error instead of returning it: returning would hand the caller
    # an exception instance where a pyarrow data type is expected.
    msg = "Converting to Struct dtype is not supported yet"
    raise NotImplementedError(msg)
Expand Down
18 changes: 17 additions & 1 deletion narwhals/_pandas_like/expr.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@

class PandasLikeExpr:
def __init__(
self,
self: Self,
call: Callable[[PandasLikeDataFrame], list[PandasLikeSeries]],
*,
depth: int,
Expand Down Expand Up @@ -545,6 +545,10 @@ def cat(self: Self) -> PandasLikeExprCatNamespace:
def name(self: Self) -> PandasLikeExprNameNamespace:
return PandasLikeExprNameNamespace(self)

@property
def list(self: Self) -> PandasLikeExprListNamespace:
    """Give access to the list-dtype operations of this expression.

    Returns:
        A `PandasLikeExprListNamespace` wrapping this expression.
    """
    list_namespace = PandasLikeExprListNamespace(self)
    return list_namespace


class PandasLikeExprCatNamespace:
def __init__(self, expr: PandasLikeExpr) -> None:
Expand Down Expand Up @@ -902,3 +906,15 @@ def to_uppercase(self: Self) -> PandasLikeExpr:
backend_version=self._compliant_expr._backend_version,
version=self._compliant_expr._version,
)


class PandasLikeExprListNamespace:
    """List-dtype operations exposed on a `PandasLikeExpr`."""

    def __init__(self: Self, expr: PandasLikeExpr) -> None:
        # Expression whose output the list operations are applied to.
        self._expr = expr

    def len(self: Self) -> PandasLikeExpr:
        """Build an expression that computes the length of each list.

        The work is delegated to the `len` method of the series-level `list`
        namespace through `reuse_series_namespace_implementation`.

        Returns:
            The expression produced by `reuse_series_namespace_implementation`.
        """
        namespace, method = "list", "len"
        return reuse_series_namespace_implementation(self._expr, namespace, method)
23 changes: 23 additions & 0 deletions narwhals/_pandas_like/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -864,6 +864,10 @@ def dt(self) -> PandasLikeSeriesDateTimeNamespace:
def cat(self) -> PandasLikeSeriesCatNamespace:
return PandasLikeSeriesCatNamespace(self)

@property
def list(self) -> PandasLikeSeriesListNamespace:
    """Give access to the list-dtype operations of this series.

    Returns:
        A `PandasLikeSeriesListNamespace` wrapping this series.
    """
    list_namespace = PandasLikeSeriesListNamespace(self)
    return list_namespace


class PandasLikeSeriesCatNamespace:
def __init__(self, series: PandasLikeSeries) -> None:
Expand Down Expand Up @@ -1156,3 +1160,22 @@ def timestamp(self, time_unit: Literal["ns", "us", "ms"] = "us") -> PandasLikeSe
raise TypeError(msg)
result[mask_na] = None
return self._compliant_series._from_native_series(result)


class PandasLikeSeriesListNamespace:
    """List-dtype operations exposed on a `PandasLikeSeries`."""

    def __init__(self, series: PandasLikeSeries) -> None:
        # Series whose native data the list operations read.
        self._compliant_series = series

    def len(self: Self) -> PandasLikeSeries:
        """Compute the length of each list element, cast to the UInt32 dtype.

        Returns:
            A new series wrapping the native `.list.len()` result, renamed to
            the original series name and cast to the native equivalent of
            narwhals' `UInt32`.
        """
        from narwhals.utils import import_dtypes_module

        series = self._compliant_series
        native_series = series._native_series

        lengths = native_series.list.len()
        # Re-apply the input name explicitly to the native result.
        lengths = lengths.rename(native_series.name, copy=False)

        dtypes_module = import_dtypes_module(series._version)
        target_dtype = narwhals_to_native_dtype(
            dtype=dtypes_module.UInt32(),
            starting_dtype=lengths.dtype,
            implementation=series._implementation,
            backend_version=series._backend_version,
            version=series._version,
        )
        return series._from_native_series(lengths.astype(target_dtype))
29 changes: 26 additions & 3 deletions narwhals/_pandas_like/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,9 @@
from typing import Sequence
from typing import TypeVar

from narwhals._arrow.utils import (
narwhals_to_native_dtype as arrow_narwhals_to_native_dtype,
)
from narwhals._arrow.utils import (
native_to_narwhals_dtype as arrow_native_to_narwhals_dtype,
)
Expand Down Expand Up @@ -567,9 +570,29 @@ def narwhals_to_native_dtype( # noqa: PLR0915
if isinstance_or_issubclass(dtype, dtypes.Enum):
msg = "Converting to Enum is not (yet) supported"
raise NotImplementedError(msg)
if isinstance_or_issubclass(dtype, dtypes.List): # pragma: no cover
msg = "Converting to List dtype is not supported yet"
return NotImplementedError(msg)
if isinstance_or_issubclass(dtype, dtypes.List):
if implementation is Implementation.PANDAS and backend_version >= (2, 2):
try:
import pandas as pd # ignore-banned-import
import pyarrow as pa # ignore-banned-import
except ImportError as exc: # pragma: no cover
msg = f"Unable to convert to {dtype} to to the following exception: {exc.msg}"
raise ImportError(msg) from exc

return pd.ArrowDtype(
pa.list_(
value_type=arrow_narwhals_to_native_dtype(
dtype.inner, # type: ignore[union-attr]
version=version,
)
)
)
else:
msg = (
"Converting to List dtype is not supported for implementation "
f"{implementation} and version {version}."
)
return NotImplementedError(msg)
if isinstance_or_issubclass(dtype, dtypes.Struct): # pragma: no cover
msg = "Converting to Struct dtype is not supported yet"
return NotImplementedError(msg)
Expand Down
38 changes: 38 additions & 0 deletions narwhals/_polars/expr.py
Original file line number Diff line number Diff line change
Expand Up @@ -181,6 +181,10 @@ def cat(self: Self) -> PolarsExprCatNamespace:
def name(self: Self) -> PolarsExprNameNamespace:
return PolarsExprNameNamespace(self)

@property
def list(self: Self) -> PolarsExprListNamespace:
    """Give access to the list-dtype operations of this expression.

    Returns:
        A `PolarsExprListNamespace` wrapping this expression.
    """
    list_namespace = PolarsExprListNamespace(self)
    return list_namespace


class PolarsExprDateTimeNamespace:
def __init__(self: Self, expr: PolarsExpr) -> None:
Expand Down Expand Up @@ -236,3 +240,37 @@ def func(*args: Any, **kwargs: Any) -> PolarsExpr:
)

return func


class PolarsExprListNamespace:
    """List-dtype operations exposed on a `PolarsExpr`."""

    def __init__(self: Self, expr: PolarsExpr) -> None:
        # Expression whose native `.list` namespace is used.
        self._expr = expr

    def len(self: Self) -> PolarsExpr:
        """Compute the length of each list element.

        For Polars backends older than 1.17 the result is cast to `UInt32`;
        for backends older than 1.16 the length is additionally computed only
        where the input is not null.
        NOTE(review): presumably these compensate for behaviour differences in
        older Polars releases -- confirm against the Polars changelog.

        Returns:
            A new expression wrapping the native result.
        """
        expr = self._expr
        native_expr = expr._native_expr
        lengths = native_expr.list.len()
        backend_version = expr._backend_version

        if backend_version < (1, 17):  # pragma: no cover
            import polars as pl

            if backend_version < (1, 16):
                # `when/then` without `otherwise` leaves null where the input is null.
                lengths = pl.when(~native_expr.is_null()).then(lengths)
            lengths = lengths.cast(pl.UInt32())

        return expr._from_native_expr(lengths)

    # TODO(FBruzzesi): Remove `pragma: no cover` once other namespace methods are added
    def __getattr__(
        self: Self, attr: str
    ) -> Callable[[Any], PolarsExpr]:  # pragma: no cover
        """Forward any other attribute to the native Polars `.list` namespace.

        Returns:
            A callable that passes its arguments through `extract_args_kwargs`,
            calls the native method named `attr` and wraps the result.
        """

        def func(*args: Any, **kwargs: Any) -> PolarsExpr:
            args, kwargs = extract_args_kwargs(args, kwargs)  # type: ignore[assignment]
            native_method = getattr(self._expr._native_expr.list, attr)
            return self._expr._from_native_expr(native_method(*args, **kwargs))

        return func
37 changes: 37 additions & 0 deletions narwhals/_polars/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -366,6 +366,10 @@ def str(self: Self) -> PolarsSeriesStringNamespace:
def cat(self: Self) -> PolarsSeriesCatNamespace:
return PolarsSeriesCatNamespace(self)

@property
def list(self: Self) -> PolarsSeriesListNamespace:
    """Give access to the list-dtype operations of this series.

    Returns:
        A `PolarsSeriesListNamespace` wrapping this series.
    """
    list_namespace = PolarsSeriesListNamespace(self)
    return list_namespace


class PolarsSeriesDateTimeNamespace:
def __init__(self: Self, series: PolarsSeries) -> None:
Expand Down Expand Up @@ -407,3 +411,36 @@ def func(*args: Any, **kwargs: Any) -> Any:
)

return func


class PolarsSeriesListNamespace:
    """List-dtype operations exposed on a `PolarsSeries`."""

    def __init__(self: Self, series: PolarsSeries) -> None:
        # Series whose native `.list` namespace is used.
        self._series = series

    def len(self: Self) -> PolarsSeries:
        """Compute the length of each list element.

        For Polars backends older than 1.17 the lengths are cast to `UInt32`;
        for backends older than 1.16 they are additionally replaced by null
        wherever the input list itself is null.
        NOTE(review): presumably these compensate for behaviour differences in
        older Polars releases -- confirm against the Polars changelog.

        Returns:
            A new series wrapping the native lengths.
        """
        native_series = self._series._native_series
        native_result = native_series.list.len()

        if self._series._backend_version < (1, 16):  # pragma: no cover
            import polars as pl

            native_result = pl.select(
                pl.when(~native_series.is_null()).then(native_result).otherwise(None)
            )[native_series.name].cast(pl.UInt32())

        elif self._series._backend_version < (1, 17):  # pragma: no cover
            import polars as pl

            # Cast the computed lengths. The previous code cast the input list
            # series itself, discarding the `list.len()` result.
            native_result = native_result.cast(pl.UInt32())

        return self._series._from_native_series(native_result)

    # TODO(FBruzzesi): Remove `pragma: no cover` once other namespace methods are added
    def __getattr__(self: Self, attr: str) -> Any:  # pragma: no cover
        """Forward any other attribute to the native Polars `.list` namespace.

        Returns:
            A callable that passes its arguments through `extract_args_kwargs`,
            calls the native method named `attr` and wraps the result.
        """

        def func(*args: Any, **kwargs: Any) -> Any:
            args, kwargs = extract_args_kwargs(args, kwargs)  # type: ignore[assignment]
            return self._series._from_native_series(
                getattr(self._series._native_series.list, attr)(*args, **kwargs)
            )

        return func
5 changes: 2 additions & 3 deletions narwhals/_polars/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -185,9 +185,8 @@ def narwhals_to_native_dtype(dtype: DType | type[DType], version: Version) -> pl
du_time_unit: Literal["us", "ns", "ms"] = getattr(dtype, "time_unit", "us")
return pl.Duration(time_unit=du_time_unit)

if dtype == dtypes.List: # pragma: no cover
msg = "Converting to List dtype is not supported yet"
raise NotImplementedError(msg)
if dtype == dtypes.List:
return pl.List(narwhals_to_native_dtype(dtype.inner, version)) # type: ignore[union-attr]
if dtype == dtypes.Struct: # pragma: no cover
msg = "Converting to Struct dtype is not supported yet"
raise NotImplementedError(msg)
Expand Down
68 changes: 68 additions & 0 deletions narwhals/expr.py
Original file line number Diff line number Diff line change
Expand Up @@ -3447,6 +3447,10 @@ def cat(self: Self) -> ExprCatNamespace[Self]:
def name(self: Self) -> ExprNameNamespace[Self]:
return ExprNameNamespace(self)

@property
def list(self: Self) -> ExprListNamespace[Self]:
    """Give access to the list-dtype operations of this expression.

    Returns:
        An `ExprListNamespace` wrapping this expression.
    """
    list_namespace = ExprListNamespace(self)
    return list_namespace


ExprT = TypeVar("ExprT", bound=Expr)

Expand Down Expand Up @@ -5603,6 +5607,70 @@ def to_uppercase(self: Self) -> ExprT:
)


class ExprListNamespace(Generic[ExprT]):
    """List-dtype operations exposed on a narwhals expression."""

    def __init__(self: Self, expr: ExprT) -> None:
        # Expression that the list operations are chained onto.
        self._expr = expr

    def len(self: Self) -> ExprT:
        """Return the number of elements in each list.

        Null values count towards the total.

        Returns:
            A new expression.

        Examples:
            >>> import narwhals as nw
            >>> from narwhals.typing import IntoFrameT
            >>> import pandas as pd
            >>> import polars as pl
            >>> import pyarrow as pa
            >>> data = {"a": [[1, 2], [3, 4, None], None, []]}

            Let's define a dataframe-agnostic function:

            >>> def agnostic_list_len(df_native: IntoFrameT) -> IntoFrameT:
            ...     df = nw.from_native(df_native)
            ...     return df.with_columns(a_len=nw.col("a").list.len()).to_native()

            We can then pass pandas / PyArrow / Polars / any other supported library:

            >>> agnostic_list_len(
            ...     pd.DataFrame(data).astype({"a": pd.ArrowDtype(pa.list_(pa.int64()))})
            ... )  # doctest: +SKIP
                           a  a_len
            0        [1. 2.]      2
            1  [ 3.  4. nan]      3
            2           <NA>   <NA>
            3             []      0

            >>> agnostic_list_len(pl.DataFrame(data))
            shape: (4, 2)
            ┌──────────────┬───────┐
            │ a            ┆ a_len │
            │ ---          ┆ ---   │
            │ list[i64]    ┆ u32   │
            ╞══════════════╪═══════╡
            │ [1, 2]       ┆ 2     │
            │ [3, 4, null] ┆ 3     │
            │ null         ┆ null  │
            │ []           ┆ 0     │
            └──────────────┴───────┘

            >>> agnostic_list_len(pa.table(data))
            pyarrow.Table
            a: list<item: int64>
              child 0, item: int64
            a_len: uint32
            ----
            a: [[[1,2],[3,4,null],null,[]]]
            a_len: [[2,3,null,0]]
        """
        # Build the result with the same class as the wrapped expression; the
        # callable is only evaluated when a backend namespace (`plx`) is passed in.
        expr_class = self._expr.__class__
        return expr_class(
            lambda plx: self._expr._to_compliant_expr(plx).list.len()
        )


def col(*names: str | Iterable[str]) -> Expr:
"""Creates an expression that references one or more columns by their name(s).
Expand Down
Loading

0 comments on commit abfc8de

Please sign in to comment.