diff --git a/docs/api-reference/selectors.md b/docs/api-reference/selectors.md
index 4b56afdcb..6480a869e 100644
--- a/docs/api-reference/selectors.md
+++ b/docs/api-reference/selectors.md
@@ -1,5 +1,13 @@
 # `narwhals.selectors`
 
+The following selectors are all supported. In addition, just like in Polars, the following
+set operations are supported:
+
+- set intersection: `&`
+- set union: `|`
+- set difference: `-`
+- complement: `~`
+
 ::: narwhals.selectors
     handler: python
     options:
diff --git a/narwhals/__init__.py b/narwhals/__init__.py
index ce03bcab0..5d26e39cf 100644
--- a/narwhals/__init__.py
+++ b/narwhals/__init__.py
@@ -1,3 +1,4 @@
+from narwhals import selectors
 from narwhals.dataframe import DataFrame
 from narwhals.dataframe import LazyFrame
 from narwhals.dtypes import Boolean
@@ -34,6 +35,7 @@
 __version__ = "0.8.21"
 
 __all__ = [
+    "selectors",
     "concat",
     "to_native",
     "from_native",
diff --git a/narwhals/_pandas_like/dataframe.py b/narwhals/_pandas_like/dataframe.py
index 687b11340..73ba692bf 100644
--- a/narwhals/_pandas_like/dataframe.py
+++ b/narwhals/_pandas_like/dataframe.py
@@ -127,6 +127,9 @@ def select(
         **named_exprs: IntoPandasExpr,
     ) -> Self:
         new_series = evaluate_into_exprs(self, *exprs, **named_exprs)
+        if not new_series:
+            # return empty dataframe, like Polars does
+            return self._from_dataframe(self._dataframe.__class__())
         new_series = validate_indices(new_series)
         df = horizontal_concat(
             new_series,
diff --git a/narwhals/_pandas_like/namespace.py b/narwhals/_pandas_like/namespace.py
index 1bf12c611..0606ecdc1 100644
--- a/narwhals/_pandas_like/namespace.py
+++ b/narwhals/_pandas_like/namespace.py
@@ -9,7 +9,7 @@
 from narwhals import dtypes
 from narwhals._pandas_like.dataframe import PandasDataFrame
 from narwhals._pandas_like.expr import PandasExpr
-from narwhals._pandas_like.selectors import PandasSelector
+from narwhals._pandas_like.selectors import PandasSelectorNamespace
 from narwhals._pandas_like.series import PandasSeries
 from narwhals._pandas_like.utils import horizontal_concat
 from narwhals._pandas_like.utils import parse_into_exprs
@@ -38,8 +38,8 @@ class PandasNamespace:
     Datetime = dtypes.Datetime
 
     @property
-    def selectors(self) -> PandasSelector:
-        return PandasSelector(self._implementation)
+    def selectors(self) -> PandasSelectorNamespace:
+        return PandasSelectorNamespace(self._implementation)
 
     # --- not in spec ---
     def __init__(self, implementation: str) -> None:
diff --git a/narwhals/_pandas_like/selectors.py b/narwhals/_pandas_like/selectors.py
index df0655175..b28f320f5 100644
--- a/narwhals/_pandas_like/selectors.py
+++ b/narwhals/_pandas_like/selectors.py
@@ -1,7 +1,9 @@
 from __future__ import annotations
 
 from typing import TYPE_CHECKING
+from typing import Any
 
+from narwhals import dtypes
 from narwhals._pandas_like.expr import PandasExpr
 
 if TYPE_CHECKING:
@@ -10,15 +12,15 @@
     from narwhals.dtypes import DType
 
 
-class PandasSelector:
+class PandasSelectorNamespace:
     def __init__(self, implementation: str) -> None:
         self._implementation = implementation
 
-    def by_dtype(self, dtypes: list[DType]) -> PandasExpr:
+    def by_dtype(self, dtypes: list[DType | type[DType]]) -> PandasSelector:
         def func(df: PandasDataFrame) -> list[PandasSeries]:
             return [df[col] for col in df.columns if df.schema[col] in dtypes]
 
-        return PandasExpr(
+        return PandasSelector(
             func,
             depth=0,
             function_name="type_selector",
@@ -26,3 +28,130 @@ def func(df: PandasDataFrame) -> list[PandasSeries]:
             output_names=None,
             implementation=self._implementation,
         )
+
+    def numeric(self) -> PandasSelector:
+        return self.by_dtype(
+            [
+                dtypes.Int64,
+                dtypes.Int32,
+                dtypes.Int16,
+                dtypes.Int8,
+                dtypes.UInt64,
+                dtypes.UInt32,
+                dtypes.UInt16,
+                dtypes.UInt8,
+                dtypes.Float64,
+                dtypes.Float32,
+            ],
+        )
+
+    def categorical(self) -> PandasSelector:
+        return self.by_dtype([dtypes.Categorical])
+
+    def string(self) -> PandasSelector:
+        return self.by_dtype([dtypes.String])
+
+    def boolean(self) -> PandasSelector:
+        return self.by_dtype([dtypes.Boolean])
+
+    def all(self) -> PandasSelector:
+        def func(df: PandasDataFrame) -> list[PandasSeries]:
+            return [df[col] for col in df.columns]
+
+        return PandasSelector(
+            func,
+            depth=0,
+            function_name="type_selector",
+            root_names=None,
+            output_names=None,
+            implementation=self._implementation,
+        )
+
+
+class PandasSelector(PandasExpr):
+    """Expression that also supports set operations with other selectors."""
+
+    def __repr__(self) -> str:  # pragma: no cover
+        return (
+            f"PandasSelector("
+            f"depth={self._depth}, "
+            f"function_name={self._function_name}, "
+            f"root_names={self._root_names}, "
+            f"output_names={self._output_names})"
+        )
+
+    def _to_expr(self) -> PandasExpr:
+        # Plain PandasExpr with the same call, for non-selector operands.
+        return PandasExpr(
+            self._call,
+            depth=self._depth,
+            function_name=self._function_name,
+            root_names=self._root_names,
+            output_names=self._output_names,
+            implementation=self._implementation,
+        )
+
+    def __sub__(self, other: PandasSelector | Any) -> PandasSelector | Any:
+        if isinstance(other, PandasSelector):
+
+            # Set difference: keep lhs columns whose name rhs did not select.
+            def call(df: PandasDataFrame) -> list[PandasSeries]:
+                lhs = self._call(df)
+                rhs_names = {x.name for x in other._call(df)}
+                return [x for x in lhs if x.name not in rhs_names]
+
+            return PandasSelector(
+                call,
+                depth=0,
+                function_name="type_selector",
+                root_names=None,
+                output_names=None,
+                implementation=self._implementation,
+            )
+        else:
+            return self._to_expr() - other
+
+    def __or__(self, other: PandasSelector | Any) -> PandasSelector | Any:
+        if isinstance(other, PandasSelector):
+
+            # Set union: lhs columns not selected by rhs, then all rhs columns.
+            def call(df: PandasDataFrame) -> list[PandasSeries]:
+                lhs = self._call(df)
+                rhs = other._call(df)
+                rhs_names = {x.name for x in rhs}
+                return [x for x in lhs if x.name not in rhs_names] + rhs
+
+            return PandasSelector(
+                call,
+                depth=0,
+                function_name="type_selector",
+                root_names=None,
+                output_names=None,
+                implementation=self._implementation,
+            )
+        else:
+            return self._to_expr() | other
+
+    def __and__(self, other: PandasSelector | Any) -> PandasSelector | Any:
+        if isinstance(other, PandasSelector):
+
+            # Set intersection: keep lhs columns whose name rhs also selected.
+            def call(df: PandasDataFrame) -> list[PandasSeries]:
+                lhs = self._call(df)
+                rhs_names = {x.name for x in other._call(df)}
+                return [x for x in lhs if x.name in rhs_names]
+
+            return PandasSelector(
+                call,
+                depth=0,
+                function_name="type_selector",
+                root_names=None,
+                output_names=None,
+                implementation=self._implementation,
+            )
+        else:
+            return self._to_expr() & other
+
+    def __invert__(self) -> PandasSelector:
+        # Complement: every column minus the ones this selector picks.
+        return PandasSelectorNamespace(self._implementation).all() - self
diff --git a/narwhals/selectors.py b/narwhals/selectors.py
index b6580b69e..19bad5d3a 100644
--- a/narwhals/selectors.py
+++ b/narwhals/selectors.py
@@ -2,12 +2,14 @@
 
 from typing import Any
 
-from narwhals import dtypes
 from narwhals.dtypes import translate_dtype
 from narwhals.expression import Expr
 from narwhals.utils import flatten
 
 
+class Selector(Expr): ...
+
+
 def by_dtype(*dtypes: Any) -> Expr:
     """
     Select columns based on their dtype.
@@ -50,7 +52,7 @@ def by_dtype(*dtypes: Any) -> Expr:
         │ 4   ┆ 4.6 │
         └─────┴─────┘
     """
-    return Expr(
+    return Selector(
         lambda plx: plx.selectors.by_dtype(
             [translate_dtype(plx, dtype) for dtype in flatten(dtypes)]
         )
@@ -96,18 +98,7 @@ def numeric() -> Expr:
         │ 4   ┆ 4.6 │
         └─────┴─────┘
     """
-    return by_dtype(
-        dtypes.Int64,
-        dtypes.Int32,
-        dtypes.Int16,
-        dtypes.Int8,
-        dtypes.UInt64,
-        dtypes.UInt32,
-        dtypes.UInt16,
-        dtypes.UInt8,
-        dtypes.Float64,
-        dtypes.Float32,
-    )
+    return Selector(lambda plx: plx.selectors.numeric())
 
 
 def boolean() -> Expr:
@@ -149,9 +140,7 @@ def boolean() -> Expr:
         │ true  │
         └───────┘
     """
-    return by_dtype(
-        dtypes.Boolean,
-    )
+    return Selector(lambda plx: plx.selectors.boolean())
 
 
 def string() -> Expr:
@@ -193,9 +182,7 @@ def string() -> Expr:
         │ y   │
         └─────┘
     """
-    return by_dtype(
-        dtypes.String,
-    )
+    return Selector(lambda plx: plx.selectors.string())
 
 
 def categorical() -> Expr:
@@ -237,6 +224,46 @@ def categorical() -> Expr:
         │ y   │
         └─────┘
     """
-    return by_dtype(
-        dtypes.Categorical,
-    )
+    return Selector(lambda plx: plx.selectors.categorical())
+
+
+def all() -> Expr:
+    """
+    Select all columns.
+
+    Examples:
+        >>> import narwhals as nw
+        >>> import narwhals.selectors as ncs
+        >>> import pandas as pd
+        >>> import polars as pl
+        >>>
+        >>> data = {'a': [1, 2], 'b': ['x', 'y'], 'c': [False, True]}
+        >>> df_pd = pd.DataFrame(data).astype({'b': 'category'})
+        >>> df_pl = pl.DataFrame(data, schema_overrides={'b': pl.Categorical})
+
+        Let's define a dataframe-agnostic function to select all
+        columns:
+
+        >>> def func(df_any):
+        ...     df = nw.from_native(df_any)
+        ...     df = df.select(ncs.all())
+        ...     return nw.to_native(df)
+
+        We can then pass either pandas or Polars dataframes:
+
+        >>> func(df_pd)
+           a  b      c
+        0  1  x  False
+        1  2  y   True
+        >>> func(df_pl)
+        shape: (2, 3)
+        ┌─────┬─────┬───────┐
+        │ a   ┆ b   ┆ c     │
+        │ --- ┆ --- ┆ ---   │
+        │ i64 ┆ cat ┆ bool  │
+        ╞═════╪═════╪═══════╡
+        │ 1   ┆ x   ┆ false │
+        │ 2   ┆ y   ┆ true  │
+        └─────┴─────┴───────┘
+    """
+    return Selector(lambda plx: plx.selectors.all())
diff --git a/tests/selectors_test.py b/tests/selectors_test.py
index 931ad49dd..ccf1b43e5 100644
--- a/tests/selectors_test.py
+++ b/tests/selectors_test.py
@@ -1,3 +1,5 @@
+from __future__ import annotations
+
 from typing import Any
 
 import pandas as pd
@@ -5,6 +7,7 @@
 import pytest
 
 import narwhals as nw
+from narwhals.selectors import all
 from narwhals.selectors import boolean
 from narwhals.selectors import by_dtype
 from narwhals.selectors import categorical
@@ -61,3 +64,26 @@ def test_categorical() -> None:
     result = nw.to_native(df.select(categorical()))
     expected = {"b": ["a", "b", "c"]}
     compare_dicts(result, expected)
+
+
+@pytest.mark.parametrize("constructor", [pd.DataFrame, pl.DataFrame])
+@pytest.mark.parametrize(
+    ("selector", "expected"),
+    [
+        (numeric() | boolean(), ["a", "c", "d"]),
+        (numeric() & boolean(), []),
+        (numeric() & by_dtype(nw.Int64), ["a"]),
+        (numeric() | by_dtype(nw.Int64), ["a", "c"]),
+        (~numeric(), ["b", "d"]),
+        (boolean() & True, ["d"]),
+        (boolean() | True, ["d"]),
+        (numeric() - 1, ["a", "c"]),
+        (all(), ["a", "b", "c", "d"]),
+    ],
+)
+def test_set_ops(
+    constructor: Any, selector: nw.selectors.Selector, expected: list[str]
+) -> None:
+    df = nw.from_native(constructor(data))
+    result = df.select(selector).columns
+    assert sorted(result) == expected