Skip to content

Commit

Permalink
feat: implement selectors set arithmetic (#247)
Browse files Browse the repository at this point in the history
  • Loading branch information
MarcoGorelli authored Jun 2, 2024
1 parent 2a3693b commit 7c83032
Show file tree
Hide file tree
Showing 7 changed files with 216 additions and 29 deletions.
8 changes: 8 additions & 0 deletions docs/api-reference/selectors.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,13 @@
# `narwhals.selectors`

All of the selectors listed below are supported. In addition, just like in Polars, selectors
can be combined with the following set operations:

- set intersection: `&`
- set union: `|`
- set difference: `-`
- complement: `~`

::: narwhals.selectors
handler: python
options:
Expand Down
2 changes: 2 additions & 0 deletions narwhals/__init__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
from narwhals import selectors
from narwhals.dataframe import DataFrame
from narwhals.dataframe import LazyFrame
from narwhals.dtypes import Boolean
Expand Down Expand Up @@ -34,6 +35,7 @@
__version__ = "0.8.21"

__all__ = [
"selectors",
"concat",
"to_native",
"from_native",
Expand Down
3 changes: 3 additions & 0 deletions narwhals/_pandas_like/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -127,6 +127,9 @@ def select(
**named_exprs: IntoPandasExpr,
) -> Self:
new_series = evaluate_into_exprs(self, *exprs, **named_exprs)
if not new_series:
# return empty dataframe, like Polars does
return self._from_dataframe(self._dataframe.__class__())
new_series = validate_indices(new_series)
df = horizontal_concat(
new_series,
Expand Down
6 changes: 3 additions & 3 deletions narwhals/_pandas_like/namespace.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
from narwhals import dtypes
from narwhals._pandas_like.dataframe import PandasDataFrame
from narwhals._pandas_like.expr import PandasExpr
from narwhals._pandas_like.selectors import PandasSelector
from narwhals._pandas_like.selectors import PandasSelectorNamespace
from narwhals._pandas_like.series import PandasSeries
from narwhals._pandas_like.utils import horizontal_concat
from narwhals._pandas_like.utils import parse_into_exprs
Expand Down Expand Up @@ -38,8 +38,8 @@ class PandasNamespace:
Datetime = dtypes.Datetime

@property
def selectors(self) -> PandasSelector:
return PandasSelector(self._implementation)
def selectors(self) -> PandasSelectorNamespace:
return PandasSelectorNamespace(self._implementation)

# --- not in spec ---
def __init__(self, implementation: str) -> None:
Expand Down
127 changes: 124 additions & 3 deletions narwhals/_pandas_like/selectors.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
from __future__ import annotations

from typing import TYPE_CHECKING
from typing import Any

from narwhals import dtypes
from narwhals._pandas_like.expr import PandasExpr

if TYPE_CHECKING:
Expand All @@ -10,19 +12,138 @@
from narwhals.dtypes import DType


class PandasSelector:
class PandasSelectorNamespace:
    """Factory for column selectors on pandas-like dataframes.

    Every public method returns a ``PandasSelector`` whose callable, when
    evaluated against a dataframe, yields the subset of that dataframe's
    columns matched by the selector.
    """

    def __init__(self, implementation: str) -> None:
        # Forwarded unchanged to every selector created by this namespace.
        self._implementation = implementation

    def _selector(self, func: Any) -> PandasSelector:
        """Wrap ``func`` (dataframe -> list of series) in a ``PandasSelector``.

        Centralises the constructor arguments shared by all selectors so the
        individual factory methods only have to supply the column-picking
        callable.
        """
        return PandasSelector(
            func,
            depth=0,
            function_name="type_selector",
            root_names=None,
            output_names=None,
            implementation=self._implementation,
        )

    def by_dtype(self, dtypes: list[DType | type[DType]]) -> PandasSelector:
        """Select the columns whose schema dtype is contained in ``dtypes``.

        Note: inside this method the ``dtypes`` parameter shadows the
        module-level ``narwhals.dtypes`` import; the name is kept because it
        is part of the method's signature.
        """

        def func(df: PandasDataFrame) -> list[PandasSeries]:
            # Read the schema once rather than once per column.
            schema = df.schema
            return [df[col] for col in df.columns if schema[col] in dtypes]

        return self._selector(func)

    def numeric(self) -> PandasSelector:
        """Select integer (signed and unsigned) and floating-point columns."""
        return self.by_dtype(
            [
                dtypes.Int64,
                dtypes.Int32,
                dtypes.Int16,
                dtypes.Int8,
                dtypes.UInt64,
                dtypes.UInt32,
                dtypes.UInt16,
                dtypes.UInt8,
                dtypes.Float64,
                dtypes.Float32,
            ],
        )

    def categorical(self) -> PandasSelector:
        """Select columns whose dtype is ``Categorical``."""
        return self.by_dtype([dtypes.Categorical])

    def string(self) -> PandasSelector:
        """Select columns whose dtype is ``String``."""
        return self.by_dtype([dtypes.String])

    def boolean(self) -> PandasSelector:
        """Select columns whose dtype is ``Boolean``."""
        return self.by_dtype([dtypes.Boolean])

    def all(self) -> PandasSelector:
        """Select every column of the dataframe, in column order."""

        def func(df: PandasDataFrame) -> list[PandasSeries]:
            return [df[col] for col in df.columns]

        return self._selector(func)


class PandasSelector(PandasExpr):
    """A ``PandasExpr`` that selects columns and supports set arithmetic.

    When both operands are selectors, ``-``, ``|`` and ``&`` act as set
    difference, union and intersection on the selected columns (matched by
    series name), and ``~`` is the complement with respect to all columns.
    When the right-hand operand is not a selector, the selector is first
    converted to a plain ``PandasExpr`` and the ordinary expression operator
    is applied instead.
    """

    def __repr__(self) -> str:  # pragma: no cover
        return (
            f"PandasSelector("
            f"depth={self._depth}, "
            f"function_name={self._function_name}, "
            f"root_names={self._root_names}, "
            f"output_names={self._output_names}"
            # Close the parenthesis opened by "PandasSelector(".
            f")"
        )

    def _to_expr(self) -> PandasExpr:
        """Return a plain ``PandasExpr`` carrying this selector's state.

        Used when the other operand is not a selector, so that the regular
        expression operators (not the set operations) are applied.
        """
        return PandasExpr(
            self._call,
            depth=self._depth,
            function_name=self._function_name,
            root_names=self._root_names,
            output_names=self._output_names,
            implementation=self._implementation,
        )

    def _selector_from_call(self, call: Any) -> PandasSelector:
        """Build a new selector around ``call`` with this implementation."""
        return PandasSelector(
            call,
            depth=0,
            function_name="type_selector",
            root_names=None,
            output_names=None,
            implementation=self._implementation,
        )

    def __sub__(self, other: PandasSelector | Any) -> PandasSelector | Any:
        """Set difference: columns selected by ``self`` but not by ``other``."""
        if not isinstance(other, PandasSelector):
            return self._to_expr() - other

        def call(df: PandasDataFrame) -> list[PandasSeries]:
            lhs = self._call(df)
            # Collect the names once instead of once per left-hand series.
            excluded = {series.name for series in other._call(df)}
            return [series for series in lhs if series.name not in excluded]

        return self._selector_from_call(call)

    def __or__(self, other: PandasSelector | Any) -> PandasSelector | Any:
        """Set union: columns selected by ``self`` or by ``other``.

        The result lists the columns found only in ``self`` first, followed
        by every column selected by ``other``.
        """
        if not isinstance(other, PandasSelector):
            return self._to_expr() | other

        def call(df: PandasDataFrame) -> list[PandasSeries]:
            lhs = self._call(df)
            rhs = other._call(df)
            rhs_names = {series.name for series in rhs}
            return [series for series in lhs if series.name not in rhs_names] + rhs

        return self._selector_from_call(call)

    def __and__(self, other: PandasSelector | Any) -> PandasSelector | Any:
        """Set intersection: columns selected by both ``self`` and ``other``."""
        if not isinstance(other, PandasSelector):
            return self._to_expr() & other

        def call(df: PandasDataFrame) -> list[PandasSeries]:
            lhs = self._call(df)
            shared = {series.name for series in other._call(df)}
            return [series for series in lhs if series.name in shared]

        return self._selector_from_call(call)

    def __invert__(self) -> PandasSelector:
        """Complement: every column that ``self`` does not select."""
        return PandasSelectorNamespace(self._implementation).all() - self
73 changes: 50 additions & 23 deletions narwhals/selectors.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,14 @@

from typing import Any

from narwhals import dtypes
from narwhals.dtypes import translate_dtype
from narwhals.expression import Expr
from narwhals.utils import flatten


# Marker subclass of `Expr`: the selector functions in this module return
# `Selector` instances rather than plain `Expr` objects.
class Selector(Expr): ...


def by_dtype(*dtypes: Any) -> Expr:
"""
Select columns based on their dtype.
Expand Down Expand Up @@ -50,7 +52,7 @@ def by_dtype(*dtypes: Any) -> Expr:
│ 4 ┆ 4.6 │
└─────┴─────┘
"""
return Expr(
return Selector(
lambda plx: plx.selectors.by_dtype(
[translate_dtype(plx, dtype) for dtype in flatten(dtypes)]
)
Expand Down Expand Up @@ -96,18 +98,7 @@ def numeric() -> Expr:
│ 4 ┆ 4.6 │
└─────┴─────┘
"""
return by_dtype(
dtypes.Int64,
dtypes.Int32,
dtypes.Int16,
dtypes.Int8,
dtypes.UInt64,
dtypes.UInt32,
dtypes.UInt16,
dtypes.UInt8,
dtypes.Float64,
dtypes.Float32,
)
return Selector(lambda plx: plx.selectors.numeric())


def boolean() -> Expr:
Expand Down Expand Up @@ -149,9 +140,7 @@ def boolean() -> Expr:
│ true │
└───────┘
"""
return by_dtype(
dtypes.Boolean,
)
return Selector(lambda plx: plx.selectors.boolean())


def string() -> Expr:
Expand Down Expand Up @@ -193,9 +182,7 @@ def string() -> Expr:
│ y │
└─────┘
"""
return by_dtype(
dtypes.String,
)
return Selector(lambda plx: plx.selectors.string())


def categorical() -> Expr:
Expand Down Expand Up @@ -237,6 +224,46 @@ def categorical() -> Expr:
│ y │
└─────┘
"""
return by_dtype(
dtypes.Categorical,
)
return Selector(lambda plx: plx.selectors.categorical())


def all() -> Expr:
    """
    Select all columns.

    Returns a `Selector` (a subclass of `Expr`) which defers to the backend
    namespace's `selectors.all()`.

    Examples:
        >>> import narwhals as nw
        >>> import narwhals.selectors as ncs
        >>> import pandas as pd
        >>> import polars as pl
        >>>
        >>> data = {'a': [1, 2], 'b': ['x', 'y'], 'c': [False, True]}
        >>> df_pd = pd.DataFrame(data).astype({'b': 'category'})
        >>> df_pl = pl.DataFrame(data, schema_overrides={'b': pl.Categorical})

        Let's define a dataframe-agnostic function to select all
        columns:

        >>> def func(df_any):
        ...     df = nw.from_native(df_any)
        ...     df = df.select(ncs.all())
        ...     return nw.to_native(df)

        We can then pass either pandas or Polars dataframes:

        >>> func(df_pd)
           a  b      c
        0  1  x  False
        1  2  y   True
        >>> func(df_pl)
        shape: (2, 3)
        ┌─────┬─────┬───────┐
        │ a   ┆ b   ┆ c     │
        │ --- ┆ --- ┆ ---   │
        │ i64 ┆ cat ┆ bool  │
        ╞═════╪═════╪═══════╡
        │ 1   ┆ x   ┆ false │
        │ 2   ┆ y   ┆ true  │
        └─────┴─────┴───────┘
    """
    return Selector(lambda plx: plx.selectors.all())
26 changes: 26 additions & 0 deletions tests/selectors_test.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,13 @@
from __future__ import annotations

from typing import Any

import pandas as pd
import polars as pl
import pytest

import narwhals as nw
from narwhals.selectors import all
from narwhals.selectors import boolean
from narwhals.selectors import by_dtype
from narwhals.selectors import categorical
Expand Down Expand Up @@ -61,3 +64,26 @@ def test_categorical() -> None:
result = nw.to_native(df.select(categorical()))
expected = {"b": ["a", "b", "c"]}
compare_dicts(result, expected)


# Run every case against both a pandas and a Polars dataframe.
@pytest.mark.parametrize("constructor", [pd.DataFrame, pl.DataFrame])
@pytest.mark.parametrize(
    ("selector", "expected"),
    [
        # Selector-with-selector operands: set union / intersection.
        (numeric() | boolean(), ["a", "c", "d"]),
        (numeric() & boolean(), []),
        (numeric() & by_dtype(nw.Int64), ["a"]),
        (numeric() | by_dtype(nw.Int64), ["a", "c"]),
        # Complement of a selector.
        (~numeric(), ["b", "d"]),
        # Non-selector right-hand operands: the same columns are expected
        # back, i.e. the operator is not treated as a set operation.
        (boolean() & True, ["d"]),
        (boolean() | True, ["d"]),
        (numeric() - 1, ["a", "c"]),
        (all(), ["a", "b", "c", "d"]),
    ],
)
def test_set_ops(
    constructor: Any, selector: nw.selectors.Selector, expected: list[str]
) -> None:
    """Check which columns each selector expression selects.

    Builds a dataframe from the module-level `data` with `constructor`,
    selects with `selector`, and compares the sorted resulting column names
    with `expected`.

    NOTE(review): `data` is defined outside the visible part of this file;
    judging by the expectations it presumably has columns "a".."d" with
    "a"/"c" numeric and "d" boolean -- confirm against its definition.
    """
    df = nw.from_native(constructor(data))
    result = df.select(selector).columns
    # Sorted so the assertion does not depend on the order in which a set
    # operation returns its columns.
    assert sorted(result) == expected

0 comments on commit 7c83032

Please sign in to comment.