feat: implement selectors set arithmetic (#247)

narwhals-dev · Jun 2, 2024 · 7c83032 · 7c83032
1 parent 2a3693b
commit 7c83032
Show file tree

Hide file tree

Showing 7 changed files with 216 additions and 29 deletions.
diff --git a/docs/api-reference/selectors.md b/docs/api-reference/selectors.md
@@ -1,5 +1,13 @@
 # `narwhals.selectors`
 
+All of the selectors documented below are supported. In addition, just like in Polars,
+these set operations between selectors are supported:
+
+- set intersection: `&`
+- set union: `|`
+- set difference: `-`
+- complement: `~`
+
 ::: narwhals.selectors
     handler: python
     options:

diff --git a/narwhals/__init__.py b/narwhals/__init__.py
@@ -1,3 +1,4 @@
+from narwhals import selectors
 from narwhals.dataframe import DataFrame
 from narwhals.dataframe import LazyFrame
 from narwhals.dtypes import Boolean
@@ -34,6 +35,7 @@
 __version__ = "0.8.21"
 
 __all__ = [
+    "selectors",
     "concat",
     "to_native",
     "from_native",

diff --git a/narwhals/_pandas_like/dataframe.py b/narwhals/_pandas_like/dataframe.py
@@ -127,6 +127,9 @@ def select(
         **named_exprs: IntoPandasExpr,
     ) -> Self:
         new_series = evaluate_into_exprs(self, *exprs, **named_exprs)
+        if not new_series:
+            # return empty dataframe, like Polars does
+            return self._from_dataframe(self._dataframe.__class__())
         new_series = validate_indices(new_series)
         df = horizontal_concat(
             new_series,

diff --git a/narwhals/_pandas_like/namespace.py b/narwhals/_pandas_like/namespace.py
@@ -9,7 +9,7 @@
 from narwhals import dtypes
 from narwhals._pandas_like.dataframe import PandasDataFrame
 from narwhals._pandas_like.expr import PandasExpr
-from narwhals._pandas_like.selectors import PandasSelector
+from narwhals._pandas_like.selectors import PandasSelectorNamespace
 from narwhals._pandas_like.series import PandasSeries
 from narwhals._pandas_like.utils import horizontal_concat
 from narwhals._pandas_like.utils import parse_into_exprs
@@ -38,8 +38,8 @@ class PandasNamespace:
     Datetime = dtypes.Datetime
 
     @property
-    def selectors(self) -> PandasSelector:
-        return PandasSelector(self._implementation)
+    def selectors(self) -> PandasSelectorNamespace:
+        return PandasSelectorNamespace(self._implementation)
 
     # --- not in spec ---
     def __init__(self, implementation: str) -> None:

diff --git a/narwhals/_pandas_like/selectors.py b/narwhals/_pandas_like/selectors.py
@@ -1,7 +1,9 @@
 from __future__ import annotations
 
 from typing import TYPE_CHECKING
+from typing import Any
 
+from narwhals import dtypes
 from narwhals._pandas_like.expr import PandasExpr
 
 if TYPE_CHECKING:
@@ -10,19 +12,138 @@
     from narwhals.dtypes import DType
 
 
-class PandasSelector:
+class PandasSelectorNamespace:
     def __init__(self, implementation: str) -> None:
         self._implementation = implementation
 
-    def by_dtype(self, dtypes: list[DType]) -> PandasExpr:
+    def by_dtype(self, dtypes: list[DType | type[DType]]) -> PandasSelector:
         def func(df: PandasDataFrame) -> list[PandasSeries]:
             return [df[col] for col in df.columns if df.schema[col] in dtypes]
 
-        return PandasExpr(
+        return PandasSelector(
             func,
             depth=0,
             function_name="type_selector",
             root_names=None,
             output_names=None,
             implementation=self._implementation,
         )
+
+    def numeric(self) -> PandasSelector:
+        return self.by_dtype(
+            [
+                dtypes.Int64,
+                dtypes.Int32,
+                dtypes.Int16,
+                dtypes.Int8,
+                dtypes.UInt64,
+                dtypes.UInt32,
+                dtypes.UInt16,
+                dtypes.UInt8,
+                dtypes.Float64,
+                dtypes.Float32,
+            ],
+        )
+
+    def categorical(self) -> PandasSelector:
+        return self.by_dtype([dtypes.Categorical])
+
+    def string(self) -> PandasSelector:
+        return self.by_dtype([dtypes.String])
+
+    def boolean(self) -> PandasSelector:
+        return self.by_dtype([dtypes.Boolean])
+
+    def all(self) -> PandasSelector:
+        def func(df: PandasDataFrame) -> list[PandasSeries]:
+            return [df[col] for col in df.columns]
+
+        return PandasSelector(
+            func,
+            depth=0,
+            function_name="type_selector",
+            root_names=None,
+            output_names=None,
+            implementation=self._implementation,
+        )
+
+
+class PandasSelector(PandasExpr):
+    def __repr__(self) -> str:  # pragma: no cover
+        return (
+            f"PandasSelector("
+            f"depth={self._depth}, "
+            f"function_name={self._function_name}, "
+            f"root_names={self._root_names}, "
+            f"output_names={self._output_names}"
+        )
+
+    def _to_expr(self) -> PandasExpr:
+        return PandasExpr(
+            self._call,
+            depth=self._depth,
+            function_name=self._function_name,
+            root_names=self._root_names,
+            output_names=self._output_names,
+            implementation=self._implementation,
+        )
+
+    def __sub__(self, other: PandasSelector | Any) -> PandasSelector | Any:
+        if isinstance(other, PandasSelector):
+
+            def call(df: PandasDataFrame) -> list[PandasSeries]:
+                lhs = self._call(df)
+                rhs = other._call(df)
+                return [x for x in lhs if x.name not in [x.name for x in rhs]]
+
+            return PandasSelector(
+                call,
+                depth=0,
+                function_name="type_selector",
+                root_names=None,
+                output_names=None,
+                implementation=self._implementation,
+            )
+        else:
+            return self._to_expr() - other
+
+    def __or__(self, other: PandasSelector | Any) -> PandasSelector | Any:
+        if isinstance(other, PandasSelector):
+
+            def call(df: PandasDataFrame) -> list[PandasSeries]:
+                lhs = self._call(df)
+                rhs = other._call(df)
+                return [x for x in lhs if x.name not in [x.name for x in rhs]] + rhs
+
+            return PandasSelector(
+                call,
+                depth=0,
+                function_name="type_selector",
+                root_names=None,
+                output_names=None,
+                implementation=self._implementation,
+            )
+        else:
+            return self._to_expr() | other
+
+    def __and__(self, other: PandasSelector | Any) -> PandasSelector | Any:
+        if isinstance(other, PandasSelector):
+
+            def call(df: PandasDataFrame) -> list[PandasSeries]:
+                lhs = self._call(df)
+                rhs = other._call(df)
+                return [x for x in lhs if x.name in [x.name for x in rhs]]
+
+            return PandasSelector(
+                call,
+                depth=0,
+                function_name="type_selector",
+                root_names=None,
+                output_names=None,
+                implementation=self._implementation,
+            )
+        else:
+            return self._to_expr() & other
+
+    def __invert__(self) -> PandasSelector:
+        return PandasSelectorNamespace(self._implementation).all() - self
diff --git a/narwhals/selectors.py b/narwhals/selectors.py
@@ -2,12 +2,14 @@
 
 from typing import Any
 
-from narwhals import dtypes
 from narwhals.dtypes import translate_dtype
 from narwhals.expression import Expr
 from narwhals.utils import flatten
 
 
+class Selector(Expr): ...
+
+
 def by_dtype(*dtypes: Any) -> Expr:
     """
     Select columns based on their dtype.
@@ -50,7 +52,7 @@ def by_dtype(*dtypes: Any) -> Expr:
         │ 4   ┆ 4.6 │
         └─────┴─────┘
     """
-    return Expr(
+    return Selector(
         lambda plx: plx.selectors.by_dtype(
             [translate_dtype(plx, dtype) for dtype in flatten(dtypes)]
         )
@@ -96,18 +98,7 @@ def numeric() -> Expr:
         │ 4   ┆ 4.6 │
         └─────┴─────┘
     """
-    return by_dtype(
-        dtypes.Int64,
-        dtypes.Int32,
-        dtypes.Int16,
-        dtypes.Int8,
-        dtypes.UInt64,
-        dtypes.UInt32,
-        dtypes.UInt16,
-        dtypes.UInt8,
-        dtypes.Float64,
-        dtypes.Float32,
-    )
+    return Selector(lambda plx: plx.selectors.numeric())
 
 
 def boolean() -> Expr:
@@ -149,9 +140,7 @@ def boolean() -> Expr:
         │ true  │
         └───────┘
     """
-    return by_dtype(
-        dtypes.Boolean,
-    )
+    return Selector(lambda plx: plx.selectors.boolean())
 
 
 def string() -> Expr:
@@ -193,9 +182,7 @@ def string() -> Expr:
         │ y   │
         └─────┘
     """
-    return by_dtype(
-        dtypes.String,
-    )
+    return Selector(lambda plx: plx.selectors.string())
 
 
 def categorical() -> Expr:
@@ -237,6 +224,46 @@ def categorical() -> Expr:
         │ y   │
         └─────┘
     """
-    return by_dtype(
-        dtypes.Categorical,
-    )
+    return Selector(lambda plx: plx.selectors.categorical())
+
+
+def all() -> Expr:
+    """
+    Select all columns.
+
+    Examples:
+        >>> import narwhals as nw
+        >>> import narwhals.selectors as ncs
+        >>> import pandas as pd
+        >>> import polars as pl
+        >>>
+        >>> data = {'a': [1, 2], 'b': ['x', 'y'], 'c': [False, True]}
+        >>> df_pd = pd.DataFrame(data).astype({'b': 'category'})
+        >>> df_pl = pl.DataFrame(data, schema_overrides={'b': pl.Categorical})
+
+        Let's define a dataframe-agnostic function to select all
+        columns:
+
+        >>> def func(df_any):
+        ...     df = nw.from_native(df_any)
+        ...     df = df.select(ncs.all())
+        ...     return nw.to_native(df)
+
+        We can then pass either pandas or Polars dataframes:
+
+        >>> func(df_pd)
+           a  b      c
+        0  1  x  False
+        1  2  y   True
+        >>> func(df_pl)
+        shape: (2, 3)
+        ┌─────┬─────┬───────┐
+        │ a   ┆ b   ┆ c     │
+        │ --- ┆ --- ┆ ---   │
+        │ i64 ┆ cat ┆ bool  │
+        ╞═════╪═════╪═══════╡
+        │ 1   ┆ x   ┆ false │
+        │ 2   ┆ y   ┆ true  │
+        └─────┴─────┴───────┘
+    """
+    return Selector(lambda plx: plx.selectors.all())
diff --git a/tests/selectors_test.py b/tests/selectors_test.py
@@ -1,10 +1,13 @@
+from __future__ import annotations
+
 from typing import Any
 
 import pandas as pd
 import polars as pl
 import pytest
 
 import narwhals as nw
+from narwhals.selectors import all
 from narwhals.selectors import boolean
 from narwhals.selectors import by_dtype
 from narwhals.selectors import categorical
@@ -61,3 +64,26 @@ def test_categorical() -> None:
     result = nw.to_native(df.select(categorical()))
     expected = {"b": ["a", "b", "c"]}
     compare_dicts(result, expected)
+
+
+@pytest.mark.parametrize("constructor", [pd.DataFrame, pl.DataFrame])
+@pytest.mark.parametrize(
+    ("selector", "expected"),
+    [
+        (numeric() | boolean(), ["a", "c", "d"]),
+        (numeric() & boolean(), []),
+        (numeric() & by_dtype(nw.Int64), ["a"]),
+        (numeric() | by_dtype(nw.Int64), ["a", "c"]),
+        (~numeric(), ["b", "d"]),
+        (boolean() & True, ["d"]),
+        (boolean() | True, ["d"]),
+        (numeric() - 1, ["a", "c"]),
+        (all(), ["a", "b", "c", "d"]),
+    ],
+)
+def test_set_ops(
+    constructor: Any, selector: nw.selectors.Selector, expected: list[str]
+) -> None:
+    df = nw.from_native(constructor(data))
+    result = df.select(selector).columns
+    assert sorted(result) == expected