From 9ecd1767d2cea6956d9ca8b2a12d584705f8023a Mon Sep 17 00:00:00 2001 From: Marco Edward Gorelli Date: Sun, 27 Oct 2024 14:23:59 +0000 Subject: [PATCH 01/11] feat: add drop_null_keys argument to group_by (#1257) * feat: add drop_null_keys argument to group_by * py38 * another strategy * pandas version * simplify * avoid catching warnings * coverage * coverage --- narwhals/_arrow/dataframe.py | 4 +-- narwhals/_arrow/expr.py | 2 +- narwhals/_arrow/group_by.py | 17 ++++++------ narwhals/_dask/dataframe.py | 4 +-- narwhals/_dask/expr.py | 2 +- narwhals/_dask/group_by.py | 6 +++-- narwhals/_pandas_like/dataframe.py | 3 ++- narwhals/_pandas_like/expr.py | 2 +- narwhals/_pandas_like/group_by.py | 32 ++++++++++++++--------- narwhals/_polars/dataframe.py | 8 +++--- narwhals/_polars/group_by.py | 14 +++++++--- narwhals/dataframe.py | 16 +++++++++--- narwhals/group_by.py | 12 ++++++--- tests/group_by_test.py | 42 ++++++++++++++++++++++++++++++ 14 files changed, 117 insertions(+), 47 deletions(-) diff --git a/narwhals/_arrow/dataframe.py b/narwhals/_arrow/dataframe.py index 6b87f1d8d..b5de57015 100644 --- a/narwhals/_arrow/dataframe.py +++ b/narwhals/_arrow/dataframe.py @@ -335,10 +335,10 @@ def with_columns( df = self._native_frame.__class__.from_arrays(to_concat, names=output_names) return self._from_native_frame(df) - def group_by(self, *keys: str) -> ArrowGroupBy: + def group_by(self, *keys: str, drop_null_keys: bool) -> ArrowGroupBy: from narwhals._arrow.group_by import ArrowGroupBy - return ArrowGroupBy(self, list(keys)) + return ArrowGroupBy(self, list(keys), drop_null_keys=drop_null_keys) def join( self, diff --git a/narwhals/_arrow/expr.py b/narwhals/_arrow/expr.py index c9ee160e4..35e936d72 100644 --- a/narwhals/_arrow/expr.py +++ b/narwhals/_arrow/expr.py @@ -353,7 +353,7 @@ def func(df: ArrowDataFrame) -> list[ArrowSeries]: "`nw.col('a', 'b')`\n" ) raise ValueError(msg) - tmp = df.group_by(*keys).agg(self) + tmp = df.group_by(*keys, drop_null_keys=False).agg(self) tmp = df.select(*keys).join( tmp, how="left", left_on=keys, right_on=keys, suffix="_right" ) diff --git a/narwhals/_arrow/group_by.py b/narwhals/_arrow/group_by.py index 6c7b20485..991a96a51 100644 --- a/narwhals/_arrow/group_by.py +++ b/narwhals/_arrow/group_by.py @@ -37,10 +37,15 @@ def get_function_name_option(function_name: str) -> Any | None: class ArrowGroupBy: - def __init__(self, df: ArrowDataFrame, keys: list[str]) -> None: + def __init__( + self, df: ArrowDataFrame, keys: list[str], *, drop_null_keys: bool + ) -> None: import pyarrow as pa # ignore-banned-import() - self._df = df + if drop_null_keys: + self._df = df.drop_nulls(keys) + else: + self._df = df self._keys = list(keys) self._grouped = pa.TableGroupBy(self._df._native_frame, list(self._keys)) @@ -74,11 +79,7 @@ def agg( ) def __iter__(self) -> Iterator[tuple[Any, ArrowDataFrame]]: - key_values = ( - self._df.select(*self._keys) - .unique(subset=self._keys, keep="first") - .iter_rows() - ) + key_values = self._df.select(*self._keys).unique(subset=self._keys, keep="first") nw_namespace = self._df.__narwhals_namespace__() yield from ( ( @@ -87,7 +88,7 @@ def __iter__(self) -> Iterator[tuple[Any, ArrowDataFrame]]: *[nw_namespace.col(k) == v for k, v in zip(self._keys, key_value)] ), ) - for key_value in key_values + for key_value in key_values.iter_rows() ) diff --git a/narwhals/_dask/dataframe.py b/narwhals/_dask/dataframe.py index f78f48bf0..bc8dbaebd 100644 --- a/narwhals/_dask/dataframe.py +++ b/narwhals/_dask/dataframe.py @@ -345,10 +345,10 @@ def join_asof( 
), ) - def group_by(self, *by: str) -> DaskLazyGroupBy: + def group_by(self, *by: str, drop_null_keys: bool) -> DaskLazyGroupBy: from narwhals._dask.group_by import DaskLazyGroupBy - return DaskLazyGroupBy(self, list(by)) + return DaskLazyGroupBy(self, list(by), drop_null_keys=drop_null_keys) def tail(self: Self, n: int) -> Self: native_frame = self._native_frame diff --git a/narwhals/_dask/expr.py b/narwhals/_dask/expr.py index d3ae29b45..0b9e979a1 100644 --- a/narwhals/_dask/expr.py +++ b/narwhals/_dask/expr.py @@ -670,7 +670,7 @@ def func(df: DaskLazyFrame) -> list[Any]: raise ValueError(msg) if df._native_frame.npartitions == 1: # pragma: no cover - tmp = df.group_by(*keys).agg(self) + tmp = df.group_by(*keys, drop_null_keys=False).agg(self) tmp_native = ( df.select(*keys) .join(tmp, how="left", left_on=keys, right_on=keys, suffix="_right") diff --git a/narwhals/_dask/group_by.py b/narwhals/_dask/group_by.py index 55ef69f46..e4c1e14c1 100644 --- a/narwhals/_dask/group_by.py +++ b/narwhals/_dask/group_by.py @@ -41,12 +41,14 @@ def agg(s0: pd.core.groupby.generic.SeriesGroupBy) -> int: class DaskLazyGroupBy: - def __init__(self, df: DaskLazyFrame, keys: list[str]) -> None: + def __init__( + self, df: DaskLazyFrame, keys: list[str], *, drop_null_keys: bool + ) -> None: self._df = df self._keys = keys self._grouped = self._df._native_frame.groupby( list(self._keys), - dropna=False, + dropna=drop_null_keys, observed=True, ) diff --git a/narwhals/_pandas_like/dataframe.py b/narwhals/_pandas_like/dataframe.py index b1860a5d5..afab0f2f1 100644 --- a/narwhals/_pandas_like/dataframe.py +++ b/narwhals/_pandas_like/dataframe.py @@ -470,12 +470,13 @@ def collect(self) -> PandasLikeDataFrame: ) # --- actions --- - def group_by(self, *keys: str) -> PandasLikeGroupBy: + def group_by(self, *keys: str, drop_null_keys: bool) -> PandasLikeGroupBy: from narwhals._pandas_like.group_by import PandasLikeGroupBy return PandasLikeGroupBy( self, list(keys), + drop_null_keys=drop_null_keys, ) def join( diff --git a/narwhals/_pandas_like/expr.py b/narwhals/_pandas_like/expr.py index a4082235f..a58597eea 100644 --- a/narwhals/_pandas_like/expr.py +++ b/narwhals/_pandas_like/expr.py @@ -331,7 +331,7 @@ def func(df: PandasLikeDataFrame) -> list[PandasLikeSeries]: "`nw.col('a', 'b')`\n" ) raise ValueError(msg) - tmp = df.group_by(*keys).agg(self) + tmp = df.group_by(*keys, drop_null_keys=False).agg(self) tmp = df.select(*keys).join( tmp, how="left", left_on=keys, right_on=keys, suffix="_right" ) diff --git a/narwhals/_pandas_like/group_by.py b/narwhals/_pandas_like/group_by.py index ee18dc7f1..0bd52a0cf 100644 --- a/narwhals/_pandas_like/group_by.py +++ b/narwhals/_pandas_like/group_by.py @@ -13,6 +13,7 @@ from narwhals._pandas_like.utils import native_series_from_iterable from narwhals.utils import Implementation from narwhals.utils import remove_prefix +from narwhals.utils import tupleify if TYPE_CHECKING: from narwhals._pandas_like.dataframe import PandasLikeDataFrame @@ -26,14 +27,19 @@ class PandasLikeGroupBy: - def __init__(self, df: PandasLikeDataFrame, keys: list[str]) -> None: + def __init__( + self, df: PandasLikeDataFrame, keys: list[str], *, drop_null_keys: bool + ) -> None: self._df = df self._keys = keys if ( self._df._implementation is Implementation.PANDAS and self._df._backend_version < (1, 1) ): # pragma: no cover - if self._df._native_frame.loc[:, self._keys].isna().any().any(): + if ( + not drop_null_keys + and self._df._native_frame.loc[:, self._keys].isna().any().any() + ): msg = "Grouping by 
null values is not supported in pandas < 1.0.0" raise NotImplementedError(msg) self._grouped = self._df._native_frame.groupby( @@ -47,7 +53,7 @@ def __init__(self, df: PandasLikeDataFrame, keys: list[str]) -> None: list(self._keys), sort=False, as_index=True, - dropna=False, + dropna=drop_null_keys, observed=True, ) @@ -96,16 +102,16 @@ def _from_native_frame(self, df: PandasLikeDataFrame) -> PandasLikeDataFrame: ) def __iter__(self) -> Iterator[tuple[Any, PandasLikeDataFrame]]: - with warnings.catch_warnings(): - # we already use `tupleify` above, so we're already opting in to - # the new behaviour - warnings.filterwarnings( - "ignore", - message="In a future version of pandas, a length 1 tuple will be returned", - category=FutureWarning, - ) - iterator = self._grouped.__iter__() - yield from ((key, self._from_native_frame(sub_df)) for (key, sub_df) in iterator) + indices = self._grouped.indices + for key in indices: + if ( + self._df._implementation is Implementation.PANDAS + and self._df._backend_version < (2, 2) + ): # pragma: no cover + pass + else: # pragma: no cover + key = tupleify(key) # noqa: PLW2901 + yield (key, self._from_native_frame(self._grouped.get_group(key))) def agg_pandas( # noqa: PLR0915 diff --git a/narwhals/_polars/dataframe.py b/narwhals/_polars/dataframe.py index ff80148c1..832331ebf 100644 --- a/narwhals/_polars/dataframe.py +++ b/narwhals/_polars/dataframe.py @@ -203,10 +203,10 @@ def to_dict(self, *, as_series: bool) -> Any: else: return df.to_dict(as_series=False) - def group_by(self, *by: str) -> Any: + def group_by(self, *by: str, drop_null_keys: bool) -> Any: from narwhals._polars.group_by import PolarsGroupBy - return PolarsGroupBy(self, list(by)) + return PolarsGroupBy(self, list(by), drop_null_keys=drop_null_keys) def with_row_index(self, name: str) -> Any: if self._backend_version < (0, 20, 4): # pragma: no cover @@ -312,10 +312,10 @@ def collect(self) -> PolarsDataFrame: dtypes=self._dtypes, ) - def group_by(self, *by: str) -> Any: + def group_by(self, *by: str, drop_null_keys: bool) -> Any: from narwhals._polars.group_by import PolarsLazyGroupBy - return PolarsLazyGroupBy(self, list(by)) + return PolarsLazyGroupBy(self, list(by), drop_null_keys=drop_null_keys) def with_row_index(self, name: str) -> Any: if self._backend_version < (0, 20, 4): # pragma: no cover diff --git a/narwhals/_polars/group_by.py b/narwhals/_polars/group_by.py index f03da610e..aa69db37f 100644 --- a/narwhals/_polars/group_by.py +++ b/narwhals/_polars/group_by.py @@ -11,10 +11,13 @@ class PolarsGroupBy: - def __init__(self, df: Any, keys: list[str]) -> None: + def __init__(self, df: Any, keys: list[str], *, drop_null_keys: bool) -> None: self._compliant_frame = df self.keys = keys - self._grouped = df._native_frame.group_by(keys) + if drop_null_keys: + self._grouped = df.drop_nulls(keys)._native_frame.group_by(keys) + else: + self._grouped = df._native_frame.group_by(keys) def agg(self, *aggs: Any, **named_aggs: Any) -> PolarsDataFrame: aggs, named_aggs = extract_args_kwargs(aggs, named_aggs) # type: ignore[assignment] @@ -28,10 +31,13 @@ def __iter__(self) -> Any: class PolarsLazyGroupBy: - def __init__(self, df: Any, keys: list[str]) -> None: + def __init__(self, df: Any, keys: list[str], *, drop_null_keys: bool) -> None: self._compliant_frame = df self.keys = keys - self._grouped = df._native_frame.group_by(keys) + if drop_null_keys: + self._grouped = df.drop_nulls(keys)._native_frame.group_by(keys) + else: + self._grouped = df._native_frame.group_by(keys) def agg(self, *aggs: 
Any, **named_aggs: Any) -> PolarsLazyFrame: aggs, named_aggs = extract_args_kwargs(aggs, named_aggs) # type: ignore[assignment] diff --git a/narwhals/dataframe.py b/narwhals/dataframe.py index 4645cfbb4..a113fef17 100644 --- a/narwhals/dataframe.py +++ b/narwhals/dataframe.py @@ -1867,12 +1867,16 @@ def filter(self, *predicates: IntoExpr | Iterable[IntoExpr] | list[bool]) -> Sel """ return super().filter(*predicates) - def group_by(self, *keys: str | Iterable[str]) -> GroupBy[Self]: + def group_by( + self, *keys: str | Iterable[str], drop_null_keys: bool = False + ) -> GroupBy[Self]: r""" Start a group by operation. Arguments: *keys: Column(s) to group by. Accepts multiple columns names as a list. + drop_null_keys: if True, then groups where any key is null won't be included + in the result. Returns: GroupBy: Object which can be used to perform aggregations. @@ -1941,7 +1945,7 @@ def group_by(self, *keys: str | Iterable[str]) -> GroupBy[Self]: """ from narwhals.group_by import GroupBy - return GroupBy(self, *flatten(keys)) + return GroupBy(self, *flatten(keys), drop_null_keys=drop_null_keys) def sort( self, @@ -3758,7 +3762,9 @@ def filter(self, *predicates: IntoExpr | Iterable[IntoExpr] | list[bool]) -> Sel """ return super().filter(*predicates) - def group_by(self, *keys: str | Iterable[str]) -> LazyGroupBy[Self]: + def group_by( + self, *keys: str | Iterable[str], drop_null_keys: bool = False + ) -> LazyGroupBy[Self]: r""" Start a group by operation. @@ -3766,6 +3772,8 @@ def group_by(self, *keys: str | Iterable[str]) -> LazyGroupBy[Self]: *keys: Column(s) to group by. Accepts expression input. Strings are parsed as column names. + drop_null_keys: if True, then groups where any key is null won't be + included in the result. Examples: Group by one column and call `agg` to compute the grouped sum of @@ -3858,7 +3866,7 @@ def group_by(self, *keys: str | Iterable[str]) -> LazyGroupBy[Self]: """ from narwhals.group_by import LazyGroupBy - return LazyGroupBy(self, *flatten(keys)) + return LazyGroupBy(self, *flatten(keys), drop_null_keys=drop_null_keys) def sort( self, diff --git a/narwhals/group_by.py b/narwhals/group_by.py index 797442e3c..9ec14c4d7 100644 --- a/narwhals/group_by.py +++ b/narwhals/group_by.py @@ -20,10 +20,12 @@ class GroupBy(Generic[DataFrameT]): - def __init__(self, df: DataFrameT, *keys: str) -> None: + def __init__(self, df: DataFrameT, *keys: str, drop_null_keys: bool) -> None: self._df = cast(DataFrame[Any], df) self._keys = keys - self._grouped = self._df._compliant_frame.group_by(*self._keys) + self._grouped = self._df._compliant_frame.group_by( + *self._keys, drop_null_keys=drop_null_keys + ) def agg( self, *aggs: IntoExpr | Iterable[IntoExpr], **named_aggs: IntoExpr @@ -119,10 +121,12 @@ def __iter__(self) -> Iterator[tuple[Any, DataFrameT]]: class LazyGroupBy(Generic[LazyFrameT]): - def __init__(self, df: LazyFrameT, *keys: str) -> None: + def __init__(self, df: LazyFrameT, *keys: str, drop_null_keys: bool) -> None: self._df = cast(LazyFrame[Any], df) self._keys = keys - self._grouped = self._df._compliant_frame.group_by(*self._keys) + self._grouped = self._df._compliant_frame.group_by( + *self._keys, drop_null_keys=drop_null_keys + ) def agg( self, *aggs: IntoExpr | Iterable[IntoExpr], **named_aggs: IntoExpr diff --git a/tests/group_by_test.py b/tests/group_by_test.py index ed3444fdc..09ee213e8 100644 --- a/tests/group_by_test.py +++ b/tests/group_by_test.py @@ -256,6 +256,48 @@ def test_key_with_nulls( assert_equal_data(result, expected) +def 
test_key_with_nulls_ignored( + constructor: Constructor, +) -> None: + data = {"b": [4, 5, None], "a": [1, 2, 3]} + result = ( + nw.from_native(constructor(data)) + .group_by("b", drop_null_keys=True) + .agg(nw.len(), nw.col("a").min()) + .sort("a") + .with_columns(nw.col("b").cast(nw.Float64)) + ) + expected = {"b": [4.0, 5], "len": [1, 1], "a": [1, 2]} + assert_equal_data(result, expected) + + +def test_key_with_nulls_iter( + constructor_eager: ConstructorEager, + request: pytest.FixtureRequest, +) -> None: + if PANDAS_VERSION < (1, 3) and "pandas_constructor" in str(constructor_eager): + # bug in old pandas + request.applymarker(pytest.mark.xfail) + data = {"b": ["4", "5", None, "7"], "a": [1, 2, 3, 4], "c": ["4", "3", None, None]} + result = dict( + nw.from_native(constructor_eager(data), eager_only=True) + .group_by("b", "c", drop_null_keys=True) + .__iter__() + ) + assert len(result) == 2 + assert_equal_data(result[("4", "4")], {"b": ["4"], "a": [1], "c": ["4"]}) + assert_equal_data(result[("5", "3")], {"b": ["5"], "a": [2], "c": ["3"]}) + + result = dict( + nw.from_native(constructor_eager(data), eager_only=True) + .group_by("b", "c", drop_null_keys=False) + .__iter__() + ) + assert_equal_data(result[("4", "4")], {"b": ["4"], "a": [1], "c": ["4"]}) + assert_equal_data(result[("5", "3")], {"b": ["5"], "a": [2], "c": ["3"]}) + assert len(result) == 4 + + def test_no_agg(constructor: Constructor) -> None: result = nw.from_native(constructor(data)).group_by(["a", "b"]).agg().sort("a", "b") From 6e8a7dbc646f79aa1d6009c97aae98aa3e43972c Mon Sep 17 00:00:00 2001 From: Liam Connors Date: Sun, 27 Oct 2024 10:34:22 -0400 Subject: [PATCH 02/11] feat: Add total seconds implementation for cuDF (#1188) * add total_seconds for cuDF * unxfail total_seconds tests for cuDF * add pragma: no cover for cuDF --- narwhals/_pandas_like/series.py | 21 ++++++++++++++----- .../dt/datetime_duration_test.py | 4 ---- 2 files changed, 16 insertions(+), 9 deletions(-) diff --git a/narwhals/_pandas_like/series.py b/narwhals/_pandas_like/series.py index f2589f0e5..b4631bbf7 100644 --- a/narwhals/_pandas_like/series.py +++ b/narwhals/_pandas_like/series.py @@ -867,8 +867,19 @@ def ordinal_day(self) -> PandasLikeSeries: ) ) + def _get_total_seconds(self) -> Any: + if hasattr(self._pandas_series._native_series.dt, "total_seconds"): + return self._pandas_series._native_series.dt.total_seconds() + else: # pragma: no cover + return ( + self._pandas_series._native_series.dt.days * 86400 + + self._pandas_series._native_series.dt.seconds + + (self._pandas_series._native_series.dt.microseconds / 1e6) + + (self._pandas_series._native_series.dt.nanoseconds / 1e9) + ) + def total_minutes(self) -> PandasLikeSeries: - s = self._pandas_series._native_series.dt.total_seconds() + s = self._get_total_seconds() s_sign = ( 2 * (s > 0).astype(int_dtype_mapper(s.dtype)) - 1 ) # this calculates the sign of each series element @@ -878,7 +889,7 @@ def total_minutes(self) -> PandasLikeSeries: return self._pandas_series._from_native_series(s_abs * s_sign) def total_seconds(self) -> PandasLikeSeries: - s = self._pandas_series._native_series.dt.total_seconds() + s = self._get_total_seconds() s_sign = ( 2 * (s > 0).astype(int_dtype_mapper(s.dtype)) - 1 ) # this calculates the sign of each series element @@ -888,7 +899,7 @@ def total_seconds(self) -> PandasLikeSeries: return self._pandas_series._from_native_series(s_abs * s_sign) def total_milliseconds(self) -> PandasLikeSeries: - s = self._pandas_series._native_series.dt.total_seconds() * 1e3 + s 
= self._get_total_seconds() * 1e3 s_sign = ( 2 * (s > 0).astype(int_dtype_mapper(s.dtype)) - 1 ) # this calculates the sign of each series element @@ -898,7 +909,7 @@ def total_milliseconds(self) -> PandasLikeSeries: return self._pandas_series._from_native_series(s_abs * s_sign) def total_microseconds(self) -> PandasLikeSeries: - s = self._pandas_series._native_series.dt.total_seconds() * 1e6 + s = self._get_total_seconds() * 1e6 s_sign = ( 2 * (s > 0).astype(int_dtype_mapper(s.dtype)) - 1 ) # this calculates the sign of each series element @@ -908,7 +919,7 @@ def total_microseconds(self) -> PandasLikeSeries: return self._pandas_series._from_native_series(s_abs * s_sign) def total_nanoseconds(self) -> PandasLikeSeries: - s = self._pandas_series._native_series.dt.total_seconds() * 1e9 + s = self._get_total_seconds() * 1e9 s_sign = ( 2 * (s > 0).astype(int_dtype_mapper(s.dtype)) - 1 ) # this calculates the sign of each series element diff --git a/tests/expr_and_series/dt/datetime_duration_test.py b/tests/expr_and_series/dt/datetime_duration_test.py index 9a93591c9..09f227c79 100644 --- a/tests/expr_and_series/dt/datetime_duration_test.py +++ b/tests/expr_and_series/dt/datetime_duration_test.py @@ -46,8 +46,6 @@ def test_duration_attributes( ) -> None: if PANDAS_VERSION < (2, 2) and "pandas_pyarrow" in str(constructor): request.applymarker(pytest.mark.xfail) - if "cudf" in str(constructor): - request.applymarker(pytest.mark.xfail) df = nw.from_native(constructor(data)) @@ -81,8 +79,6 @@ def test_duration_attributes_series( ) -> None: if PANDAS_VERSION < (2, 2) and "pandas_pyarrow" in str(constructor_eager): request.applymarker(pytest.mark.xfail) - if "cudf" in str(constructor_eager): - request.applymarker(pytest.mark.xfail) df = nw.from_native(constructor_eager(data), eager_only=True) From 591992cb6992939bc1275002b1876903b256992b Mon Sep 17 00:00:00 2001 From: Marco Edward Gorelli Date: Sun, 27 Oct 2024 14:37:23 +0000 Subject: [PATCH 03/11] perf: improve performance of Series.dtype and DataFrame.schema for pandas-like (#1255) --- narwhals/_dask/dataframe.py | 4 +- narwhals/_dask/expr.py | 5 ++- narwhals/_pandas_like/dataframe.py | 14 +++++-- narwhals/_pandas_like/series.py | 14 +++++-- narwhals/_pandas_like/utils.py | 31 ++++++++------- narwhals/dataframe.py | 5 --- narwhals/series.py | 38 +++++++++++++++++++ .../interchange_native_namespace_test.py | 4 +- tests/frame/unpivot_test.py | 6 +-- 9 files changed, 85 insertions(+), 36 deletions(-) diff --git a/narwhals/_dask/dataframe.py b/narwhals/_dask/dataframe.py index bc8dbaebd..039b8f33f 100644 --- a/narwhals/_dask/dataframe.py +++ b/narwhals/_dask/dataframe.py @@ -149,7 +149,9 @@ def drop_nulls(self: Self, subset: str | list[str] | None) -> Self: @property def schema(self) -> dict[str, DType]: return { - col: native_to_narwhals_dtype(self._native_frame.loc[:, col], self._dtypes) + col: native_to_narwhals_dtype( + self._native_frame.loc[:, col], self._dtypes, self._implementation + ) for col in self._native_frame.columns } diff --git a/narwhals/_dask/expr.py b/narwhals/_dask/expr.py index 0b9e979a1..487fba77f 100644 --- a/narwhals/_dask/expr.py +++ b/narwhals/_dask/expr.py @@ -13,6 +13,7 @@ from narwhals._pandas_like.utils import calculate_timestamp_date from narwhals._pandas_like.utils import calculate_timestamp_datetime from narwhals._pandas_like.utils import native_to_narwhals_dtype +from narwhals.utils import Implementation from narwhals.utils import generate_unique_token if TYPE_CHECKING: @@ -943,7 +944,7 @@ def replace_time_zone(self, 
time_zone: str | None) -> DaskExpr: def convert_time_zone(self, time_zone: str) -> DaskExpr: def func(s: dask_expr.Series, time_zone: str) -> dask_expr.Series: - dtype = native_to_narwhals_dtype(s, self._expr._dtypes) + dtype = native_to_narwhals_dtype(s, self._expr._dtypes, Implementation.DASK) if dtype.time_zone is None: # type: ignore[attr-defined] return s.dt.tz_localize("UTC").dt.tz_convert(time_zone) else: @@ -960,7 +961,7 @@ def timestamp(self, time_unit: Literal["ns", "us", "ms"] = "us") -> DaskExpr: def func( s: dask_expr.Series, time_unit: Literal["ns", "us", "ms"] = "us" ) -> dask_expr.Series: - dtype = native_to_narwhals_dtype(s, self._expr._dtypes) + dtype = native_to_narwhals_dtype(s, self._expr._dtypes, Implementation.DASK) is_pyarrow_dtype = "pyarrow" in str(dtype) mask_na = s.isna() if dtype == self._expr._dtypes.Date: diff --git a/narwhals/_pandas_like/dataframe.py b/narwhals/_pandas_like/dataframe.py index afab0f2f1..67eac0c19 100644 --- a/narwhals/_pandas_like/dataframe.py +++ b/narwhals/_pandas_like/dataframe.py @@ -54,6 +54,8 @@ def __init__( self._backend_version = backend_version self._dtypes = dtypes + self._schema_cache: dict[str, DType] | None = None + def __narwhals_dataframe__(self) -> Self: return self @@ -303,10 +305,14 @@ def iter_rows( @property def schema(self) -> dict[str, DType]: - return { - col: native_to_narwhals_dtype(self._native_frame[col], self._dtypes) - for col in self._native_frame.columns - } + if self._schema_cache is None: + self._schema_cache = { + col: native_to_narwhals_dtype( + self._native_frame[col], self._dtypes, self._implementation + ) + for col in self._native_frame.columns + } + return self._schema_cache def collect_schema(self) -> dict[str, DType]: return self.schema diff --git a/narwhals/_pandas_like/series.py b/narwhals/_pandas_like/series.py index b4631bbf7..3ea4d2255 100644 --- a/narwhals/_pandas_like/series.py +++ b/narwhals/_pandas_like/series.py @@ -89,6 +89,8 @@ def __init__( self._backend_version = backend_version self._dtypes = dtypes + self._dtype_cache: DType | None = None + # In pandas, copy-on-write becomes the default in version 3. # So, before that, we need to explicitly avoid unnecessary # copies by using `copy=False` sometimes. @@ -170,7 +172,11 @@ def shape(self) -> tuple[int]: @property def dtype(self: Self) -> DType: - return native_to_narwhals_dtype(self._native_series, self._dtypes) + if self._dtype_cache is None: + self._dtype_cache = native_to_narwhals_dtype( + self._native_series, self._dtypes, self._implementation + ) + return self._dtype_cache def scatter(self, indices: int | Sequence[int], values: Any) -> Self: if isinstance(values, self.__class__): @@ -494,8 +500,10 @@ def sort( ) def alias(self, name: str) -> Self: - ser = self._native_series - return self._from_native_series(ser.rename(name, copy=False)) + if name != self.name: + ser = self._native_series + return self._from_native_series(ser.rename(name, copy=False)) + return self def __array__(self, dtype: Any = None, copy: bool | None = None) -> Any: # pandas used to always return object dtype for nullable dtypes. 
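
For context on the `object`-dtype branch in the `utils.py` hunk below: for pandas, the patch switches from the interchange protocol to `pd.api.types.infer_dtype`, which inspects the column's values and reports `"string"` when they are all strings. A minimal standalone sketch of that behaviour (not part of the patch), assuming only pandas is installed:

```python
# Standalone sketch: how pd.api.types.infer_dtype classifies object columns.
# skipna=True ignores missing values, matching the patched code below.
import pandas as pd

strings = pd.Series(["a", "b", None], dtype="object")
mixed = pd.Series(["a", 1], dtype="object")

print(pd.api.types.infer_dtype(strings, skipna=True))  # "string" -> dtypes.String()
print(pd.api.types.infer_dtype(mixed, skipna=True))  # "mixed-integer" -> dtypes.Object()
```
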
diff --git a/narwhals/_pandas_like/utils.py b/narwhals/_pandas_like/utils.py
index 2c4612eb1..99181bc1e 100644
--- a/narwhals/_pandas_like/utils.py
+++ b/narwhals/_pandas_like/utils.py
@@ -218,7 +218,9 @@ def set_axis(
     return obj.set_axis(index, axis=0, **kwargs)  # type: ignore[attr-defined, no-any-return]
 
 
-def native_to_narwhals_dtype(native_column: Any, dtypes: DTypes) -> DType:
+def native_to_narwhals_dtype(
+    native_column: Any, dtypes: DTypes, implementation: Implementation
+) -> DType:
     dtype = str(native_column.dtype)
 
     pd_datetime_rgx = (
@@ -283,15 +285,20 @@ def native_to_narwhals_dtype(native_column: Any, dtypes: DTypes) -> DType:
     if dtype.startswith(("large_list", "list", "struct", "fixed_size_list")):
         return arrow_native_to_narwhals_dtype(native_column.dtype.pyarrow_dtype, dtypes)
     if dtype == "object":
-        if (  # pragma: no cover TODO(unassigned): why does this show as uncovered?
-            idx := getattr(native_column, "first_valid_index", lambda: None)()
-        ) is not None and isinstance(native_column.loc[idx], str):
-            # Infer based on first non-missing value.
-            # For pandas pre 3.0, this isn't perfect.
-            # After pandas 3.0, pandas has a dedicated string dtype
-            # which is inferred by default.
+        if implementation is Implementation.DASK:
+            # Dask columns are lazy, so we can't inspect values.
+            # The most useful assumption is probably String
             return dtypes.String()
-        else:
+        if implementation is Implementation.PANDAS:  # pragma: no cover
+            # This is the most efficient implementation for pandas,
+            # and doesn't require the interchange protocol
+            import pandas as pd  # ignore-banned-import
+
+            dtype = pd.api.types.infer_dtype(native_column, skipna=True)
+            if dtype == "string":
+                return dtypes.String()
+            return dtypes.Object()
+        else:  # pragma: no cover
             df = native_column.to_frame()
             if hasattr(df, "__dataframe__"):
                 from narwhals._interchange.dataframe import (
@@ -302,10 +309,8 @@ def native_to_narwhals_dtype(native_column: Any, dtypes: DTypes) -> DType:
                     return map_interchange_dtype_to_narwhals_dtype(
                         df.__dataframe__().get_column(0).dtype, dtypes
                     )
-                except Exception:  # noqa: BLE001
-                    return dtypes.Object()
-            else:  # pragma: no cover
-                return dtypes.Object()
+                except Exception:  # noqa: BLE001, S110
+                    pass
     return dtypes.Unknown()
diff --git a/narwhals/dataframe.py b/narwhals/dataframe.py
index a113fef17..3ddaa2814 100644
--- a/narwhals/dataframe.py
+++ b/narwhals/dataframe.py
@@ -3326,11 +3326,6 @@ def rename(self, mapping: dict[str, str]) -> Self:
                 function that takes the old name as input and returns the new
                 name.
 
-        Notes:
-            If existing names are swapped (e.g. 'A' points to 'B' and 'B'
-            points to 'A'), polars will block projection and predicate
-            pushdowns at this node.
-
         Examples:
             >>> import pandas as pd
             >>> import polars as pl
diff --git a/narwhals/series.py b/narwhals/series.py
index d43677fcd..6f5223202 100644
--- a/narwhals/series.py
+++ b/narwhals/series.py
@@ -1203,6 +1203,25 @@ def alias(self, name: str) -> Self:
         """
         Rename the Series.
 
+        Notes:
+            This method is very cheap, but does not guarantee that data
+            will be copied. For example:
+
+            ```python
+            s1: nw.Series
+            s2 = s1.alias("foo")
+            arr = s2.to_numpy()
+            arr[0] = 999
+            ```
+
+            may (depending on the backend, and on the version) result in
+            `s1`'s data being modified. We recommend:
+
+            - if you need to alias an object and don't need the original
+              one around any more, just use `alias` without worrying about it.
+            - if you were expecting `alias` to copy data, then explicitly call
+              `.clone` before calling `alias`.
+
         Arguments:
             name: The new name.
 
@@ -1255,6 +1274,25 @@ def rename(self, name: str) -> Self:
 
         Alias for `Series.alias()`.
 
+        Notes:
+            This method is very cheap, but does not guarantee that data
+            will be copied. For example:
+
+            ```python
+            s1: nw.Series
+            s2 = s1.rename("foo")
+            arr = s2.to_numpy()
+            arr[0] = 999
+            ```
+
+            may (depending on the backend, and on the version) result in
+            `s1`'s data being modified. We recommend:
+
+            - if you need to rename an object and don't need the original
+              one around any more, just use `rename` without worrying about it.
+            - if you were expecting `rename` to copy data, then explicitly call
+              `.clone` before calling `rename`.
+
         Arguments:
             name: The new name.
 
diff --git a/tests/frame/interchange_native_namespace_test.py b/tests/frame/interchange_native_namespace_test.py
index 6098abf98..22d036460 100644
--- a/tests/frame/interchange_native_namespace_test.py
+++ b/tests/frame/interchange_native_namespace_test.py
@@ -27,9 +27,7 @@ def test_interchange() -> None:
         series.__native_namespace__()
 
 
-@pytest.mark.filterwarnings(
-    "ignore:.*The `ArrowDtype` class is not available in pandas 1.0.5"
-)
+@pytest.mark.filterwarnings("ignore:.*The `ArrowDtype` class is not available in pandas")
 def test_ibis(
     tmpdir: pytest.TempdirFactory,
 ) -> None:  # pragma: no cover
diff --git a/tests/frame/unpivot_test.py b/tests/frame/unpivot_test.py
index fe225c7b5..fba51f2bb 100644
--- a/tests/frame/unpivot_test.py
+++ b/tests/frame/unpivot_test.py
@@ -94,11 +94,7 @@ def test_unpivot_mixed_types(
     data: dict[str, Any],
     expected_dtypes: list[DType],
 ) -> None:
-    if (
-        "dask" in str(constructor)
-        or "cudf" in str(constructor)
-        or ("pyarrow_table" in str(constructor) and PYARROW_VERSION < (14, 0, 0))
-    ):
+    if "pyarrow_table" in str(constructor) and PYARROW_VERSION < (14, 0, 0):
         request.applymarker(pytest.mark.xfail)
     df = nw.from_native(constructor(data))
     result = df.unpivot(on=["a", "b"], index="idx")

From e6abf27889b0c8b5a9655c07a8975dc980df4dda Mon Sep 17 00:00:00 2001
From: Vincent Arel-Bundock
Date: Sun, 27 Oct 2024 11:14:23 -0400
Subject: [PATCH 04/11] DataFrame conversion tutorial (#1240)

* data frame conversion vignette

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* data frame conversion tutorial: executable code

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* PyCapsule discussion

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* fixup, minor edits

* fixup random versions ci job fail, add pymarginaleffects to readme

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: Marco Gorelli <33491632+MarcoGorelli@users.noreply.github.com>
---
 README.md                                  |  3 +-
 docs/basics/dataframe_conversion.md        | 76 ++++++++++++++++++++++
 mkdocs.yml                                 |  1 +
 tests/expr_and_series/dt/timestamp_test.py |  2 +
 4 files changed, 81 insertions(+), 1 deletion(-)
 create mode 100644 docs/basics/dataframe_conversion.md

diff --git a/README.md b/README.md
index 409459f3c..b3acb17ba 100644
--- a/README.md
+++ b/README.md
@@ -43,10 +43,11 @@ Join the party!
- [Altair](https://github.com/vega/altair/) - [Hamilton](https://github.com/DAGWorks-Inc/hamilton/tree/main/examples/narwhals) +- [marimo](https://github.com/marimo-team/marimo) +- [pymarginaleffects](https://github.com/vincentarelbundock/pymarginaleffects) - [scikit-lego](https://github.com/koaning/scikit-lego) - [scikit-playtime](https://github.com/koaning/scikit-playtime) - [timebasedcv](https://github.com/FBruzzesi/timebasedcv) -- [marimo](https://github.com/marimo-team/marimo) - [tubular](https://github.com/lvgig/tubular) - [wimsey](https://github.com/benrutter/wimsey) diff --git a/docs/basics/dataframe_conversion.md b/docs/basics/dataframe_conversion.md new file mode 100644 index 000000000..690f5d093 --- /dev/null +++ b/docs/basics/dataframe_conversion.md @@ -0,0 +1,76 @@ +# Conversion between libraries + +Some library maintainers must apply complex dataframe operations, using methods and functions that may not (yet) be implemented in Narwhals. In such cases, Narwhals can still be highly beneficial, by allowing easy dataframe conversion. + +## Dataframe X in, pandas out + +Imagine that you maintain a library with a function that operates on pandas dataframes to produce automated reports. You want to allow users to supply a dataframe in any format to that function (pandas, Polars, DuckDB, cuDF, Modin, etc.) without adding all those dependencies to your own project and without special-casing each input library's variation of `to_pandas` / `toPandas` / `to_pandas_df` / `df` ... + +One solution is to use Narwhals as a thin Dataframe ingestion layer, to convert user-supplied dataframe to the format that your library uses internally. Since Narwhals is zero-dependency, this is a much more lightweight solution than including all the dataframe libraries as dependencies, +and easier to write than special casing each input library's `to_pandas` method (if it even exists!). + +To illustrate, we create dataframes in various formats: + +```python exec="1" source="above" session="conversion" +import narwhals as nw +from narwhals.typing import IntoDataFrame + +import duckdb +import polars as pl +import pandas as pd + +df_polars = pl.DataFrame( + { + "A": [1, 2, 3, 4, 5], + "fruits": ["banana", "banana", "apple", "apple", "banana"], + "B": [5, 4, 3, 2, 1], + "cars": ["beetle", "audi", "beetle", "beetle", "beetle"], + } +) +df_pandas = df_polars.to_pandas() +df_duckdb = duckdb.sql("SELECT * FROM df_polars") +``` + +Now, we define a function that can ingest any dataframe type supported by Narwhals, and convert it to a pandas DataFrame for internal use: + +```python exec="1" source="above" session="conversion" result="python" +def df_to_pandas(df: IntoDataFrame) -> pd.DataFrame: + return nw.from_native(df).to_pandas() + + +print(df_to_pandas(df_polars)) +``` + +## Dataframe X in, Polars out + +### Via PyCapsule Interface + +Similarly, if your library uses Polars internally, you can convert any user-supplied dataframe to Polars format using Narwhals. + +```python exec="1" source="above" session="conversion" result="python" +def df_to_polars(df: IntoDataFrame) -> pl.DataFrame: + return nw.from_arrow(nw.from_native(df), native_namespace=pl).to_native() + + +print(df_to_polars(df_duckdb)) # You can only execute this line of code once. +``` + +It works to pass Polars to `native_namespace` here because Polars supports the [PyCapsule Interface](https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html) for import. 
+ +Note that the PyCapsule Interface makes no guarantee that you can call it repeatedly, so the approach above only works if you +only expect to perform the conversion a single time on each input object. + +### Via PyArrow + +If you need to ingest the same dataframe multiple times, then you may want to go via PyArrow instead. +This may be less efficient than the PyCapsule approach above (and always requires PyArrow!), but is more forgiving: + +```python exec="1" source="above" session="conversion" result="python" +def df_to_polars(df: IntoDataFrame) -> pl.DataFrame: + return pl.DataFrame(nw.from_native(df).to_arrow()) + + +df_duckdb = duckdb.sql("SELECT * FROM df_polars") +print(df_to_polars(df_duckdb)) # We can execute this... +print(df_to_polars(df_duckdb)) # ...as many times as we like! +``` diff --git a/mkdocs.yml b/mkdocs.yml index 328ec08a9..3944ec979 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -10,6 +10,7 @@ nav: - basics/dataframe.md - basics/series.md - basics/complete_example.md + - basics/dataframe_conversion.md - Pandas-like concepts: - other/pandas_index.md - other/user_warning.md diff --git a/tests/expr_and_series/dt/timestamp_test.py b/tests/expr_and_series/dt/timestamp_test.py index b653fe02a..212926628 100644 --- a/tests/expr_and_series/dt/timestamp_test.py +++ b/tests/expr_and_series/dt/timestamp_test.py @@ -11,6 +11,7 @@ import narwhals.stable.v1 as nw from tests.utils import PANDAS_VERSION +from tests.utils import POLARS_VERSION from tests.utils import PYARROW_VERSION from tests.utils import Constructor from tests.utils import ConstructorEager @@ -197,6 +198,7 @@ def test_timestamp_invalid_unit_series(constructor_eager: ConstructorEager) -> N starting_time_unit=st.sampled_from(["us", "ns"]), ) @pytest.mark.skipif(PANDAS_VERSION < (2, 2), reason="bug in old pandas") +@pytest.mark.skipif(POLARS_VERSION < (0, 20, 7), reason="bug in old Polars") def test_timestamp_hypothesis( inputs: datetime, time_unit: Literal["ms", "us", "ns"], From ca11fa3283a1741c3b75af4b8c9f69bb0c348dd8 Mon Sep 17 00:00:00 2001 From: Marco Edward Gorelli Date: Sun, 27 Oct 2024 15:30:55 +0000 Subject: [PATCH 05/11] test: xfail cudf failures (#1259) --- tests/frame/unpivot_test.py | 4 +++- tests/group_by_test.py | 7 ++++++- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/tests/frame/unpivot_test.py b/tests/frame/unpivot_test.py index fba51f2bb..ed8d98c96 100644 --- a/tests/frame/unpivot_test.py +++ b/tests/frame/unpivot_test.py @@ -94,7 +94,9 @@ def test_unpivot_mixed_types( data: dict[str, Any], expected_dtypes: list[DType], ) -> None: - if "pyarrow_table" in str(constructor) and PYARROW_VERSION < (14, 0, 0): + if "cudf" in str(constructor) or ( + "pyarrow_table" in str(constructor) and PYARROW_VERSION < (14, 0, 0) + ): request.applymarker(pytest.mark.xfail) df = nw.from_native(constructor(data)) result = df.unpivot(on=["a", "b"], index="idx") diff --git a/tests/group_by_test.py b/tests/group_by_test.py index 09ee213e8..63cc631a8 100644 --- a/tests/group_by_test.py +++ b/tests/group_by_test.py @@ -74,7 +74,12 @@ def test_invalid_group_by() -> None: ) -def test_group_by_iter(constructor_eager: ConstructorEager) -> None: +def test_group_by_iter( + constructor_eager: ConstructorEager, request: pytest.FixtureRequest +) -> None: + if "cudf" in str(constructor_eager): + # https://github.com/rapidsai/cudf/issues/17187 + request.applymarker(pytest.mark.xfail) df = nw.from_native(constructor_eager(data), eager_only=True) expected_keys = [(1,), (3,)] keys = [] From 
3ec01dc44a5200ef8d70167928279fb4b675d93a Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Sun, 27 Oct 2024 15:32:42 +0000 Subject: [PATCH 06/11] [pre-commit.ci] pre-commit autoupdate (#1236) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * [pre-commit.ci] pre-commit autoupdate updates: - [github.com/astral-sh/ruff-pre-commit: v0.6.9 → v0.7.0](https://github.com/astral-sh/ruff-pre-commit/compare/v0.6.9...v0.7.0) - [github.com/pre-commit/mirrors-mypy: v1.11.2 → v1.12.1](https://github.com/pre-commit/mirrors-mypy/compare/v1.11.2...v1.12.1) * add type ignore assignment --------- Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Edoardo Abati <29585319+EdAbati@users.noreply.github.com> --- .pre-commit-config.yaml | 4 ++-- narwhals/_arrow/dataframe.py | 2 +- narwhals/_pandas_like/dataframe.py | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 4d416e237..b03d649dc 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,7 +1,7 @@ repos: - repo: https://github.com/astral-sh/ruff-pre-commit # Ruff version. - rev: 'v0.6.9' + rev: 'v0.7.0' hooks: # Run the formatter. - id: ruff-format @@ -9,7 +9,7 @@ repos: - id: ruff args: [--fix] - repo: https://github.com/pre-commit/mirrors-mypy - rev: 'v1.11.2' + rev: 'v1.12.1' hooks: - id: mypy additional_dependencies: ['polars==1.4.1', 'pytest==8.3.2'] diff --git a/narwhals/_arrow/dataframe.py b/narwhals/_arrow/dataframe.py index b5de57015..35a63d192 100644 --- a/narwhals/_arrow/dataframe.py +++ b/narwhals/_arrow/dataframe.py @@ -172,7 +172,7 @@ def __getitem__( ), ) -> ArrowSeries | ArrowDataFrame: if isinstance(item, tuple): - item = tuple(list(i) if is_sequence_but_not_str(i) else i for i in item) + item = tuple(list(i) if is_sequence_but_not_str(i) else i for i in item) # type: ignore[assignment] if isinstance(item, str): from narwhals._arrow.series import ArrowSeries diff --git a/narwhals/_pandas_like/dataframe.py b/narwhals/_pandas_like/dataframe.py index 67eac0c19..1ca35b197 100644 --- a/narwhals/_pandas_like/dataframe.py +++ b/narwhals/_pandas_like/dataframe.py @@ -155,7 +155,7 @@ def __getitem__( ), ) -> PandasLikeSeries | PandasLikeDataFrame: if isinstance(item, tuple): - item = tuple(list(i) if is_sequence_but_not_str(i) else i for i in item) + item = tuple(list(i) if is_sequence_but_not_str(i) else i for i in item) # type: ignore[assignment] if isinstance(item, str): from narwhals._pandas_like.series import PandasLikeSeries From efbb4eebe0a0d2b932ce68a128df17e11d8da8d0 Mon Sep 17 00:00:00 2001 From: Marco Edward Gorelli Date: Sun, 27 Oct 2024 15:42:41 +0000 Subject: [PATCH 07/11] docs: add duckdb to docs (#1261) --- docs/requirements-docs.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/requirements-docs.txt b/docs/requirements-docs.txt index 929f35790..beec6070b 100644 --- a/docs/requirements-docs.txt +++ b/docs/requirements-docs.txt @@ -1,4 +1,5 @@ jinja2 +duckdb markdown-exec[ansi] mkdocs mkdocs-autorefs From c0a26beb811345b5459b6a239ae9b56e998ceb31 Mon Sep 17 00:00:00 2001 From: Marco Edward Gorelli Date: Sun, 27 Oct 2024 16:03:07 +0000 Subject: [PATCH 08/11] release: Bump version to 1.11.0 (#1262) --- docs/installation.md | 2 +- narwhals/__init__.py | 2 +- pyproject.toml | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/installation.md b/docs/installation.md 
index 8f8dde67a..2b5fb86cd 100644 --- a/docs/installation.md +++ b/docs/installation.md @@ -29,7 +29,7 @@ To verify the installation, start the Python REPL and execute: ```python >>> import narwhals >>> narwhals.__version__ -'1.10.0' +'1.11.0' ``` If you see the version number, then the installation was successful! diff --git a/narwhals/__init__.py b/narwhals/__init__.py index c1ff411cd..5fa4d0bd5 100644 --- a/narwhals/__init__.py +++ b/narwhals/__init__.py @@ -66,7 +66,7 @@ from narwhals.utils import maybe_reset_index from narwhals.utils import maybe_set_index -__version__ = "1.10.0" +__version__ = "1.11.0" __all__ = [ "dependencies", diff --git a/pyproject.toml b/pyproject.toml index 1897216ff..c6cb74280 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "hatchling.build" [project] name = "narwhals" -version = "1.10.0" +version = "1.11.0" authors = [ { name="Marco Gorelli", email="33491632+MarcoGorelli@users.noreply.github.com" }, ] From 33c9527d8912dd2b2f42ef058df8f0263c5b82ef Mon Sep 17 00:00:00 2001 From: Francesco Bruzzesi <42817048+FBruzzesi@users.noreply.github.com> Date: Mon, 28 Oct 2024 08:33:12 +0100 Subject: [PATCH 09/11] feat: expose `generate_temporary_column_name` publicly (#1264) --- docs/api-reference/narwhals.md | 1 + narwhals/__init__.py | 2 ++ narwhals/_arrow/dataframe.py | 8 ++++---- narwhals/_arrow/series.py | 8 ++++---- narwhals/_dask/dataframe.py | 10 +++++----- narwhals/_dask/expr.py | 6 +++--- narwhals/_pandas_like/dataframe.py | 6 +++--- narwhals/stable/v1/__init__.py | 30 +++++++++++++++++++++++++++++ narwhals/utils.py | 31 +++++++++++++++++++++++++----- tests/utils_test.py | 30 +++++++++++++++++++++++++++++ 10 files changed, 108 insertions(+), 24 deletions(-) diff --git a/docs/api-reference/narwhals.md b/docs/api-reference/narwhals.md index c4b04a2f4..2b5be6e8c 100644 --- a/docs/api-reference/narwhals.md +++ b/docs/api-reference/narwhals.md @@ -15,6 +15,7 @@ Here are the top-level functions available in Narwhals. 
- from_dict - from_native - from_arrow + - generate_temporary_column_name - get_level - get_native_namespace - is_ordered_categorical diff --git a/narwhals/__init__.py b/narwhals/__init__.py index 5fa4d0bd5..6b549c93c 100644 --- a/narwhals/__init__.py +++ b/narwhals/__init__.py @@ -59,6 +59,7 @@ from narwhals.translate import narwhalify from narwhals.translate import to_native from narwhals.translate import to_py_scalar +from narwhals.utils import generate_temporary_column_name from narwhals.utils import is_ordered_categorical from narwhals.utils import maybe_align_index from narwhals.utils import maybe_convert_dtypes @@ -74,6 +75,7 @@ "concat", "from_dict", "from_arrow", + "generate_temporary_column_name", "get_level", "new_series", "to_native", diff --git a/narwhals/_arrow/dataframe.py b/narwhals/_arrow/dataframe.py index 35a63d192..ac845853a 100644 --- a/narwhals/_arrow/dataframe.py +++ b/narwhals/_arrow/dataframe.py @@ -17,7 +17,7 @@ from narwhals.dependencies import is_numpy_array from narwhals.utils import Implementation from narwhals.utils import flatten -from narwhals.utils import generate_unique_token +from narwhals.utils import generate_temporary_column_name from narwhals.utils import is_sequence_but_not_str from narwhals.utils import parse_columns_to_drop @@ -358,7 +358,7 @@ def join( if how == "cross": plx = self.__narwhals_namespace__() - key_token = generate_unique_token( + key_token = generate_temporary_column_name( n_bytes=8, columns=[*self.columns, *other.columns] ) @@ -579,7 +579,7 @@ def is_duplicated(self: Self) -> ArrowSeries: df = self._native_frame columns = self.columns - col_token = generate_unique_token(n_bytes=8, columns=columns) + col_token = generate_temporary_column_name(n_bytes=8, columns=columns) row_count = ( df.append_column(col_token, pa.array(np.arange(len(self)))) .group_by(columns) @@ -638,7 +638,7 @@ def unique( agg_func_map = {"any": "min", "first": "min", "last": "max"} agg_func = agg_func_map[keep] - col_token = generate_unique_token(n_bytes=8, columns=self.columns) + col_token = generate_temporary_column_name(n_bytes=8, columns=self.columns) keep_idx = ( df.append_column(col_token, pa.array(np.arange(len(self)))) .group_by(subset) diff --git a/narwhals/_arrow/series.py b/narwhals/_arrow/series.py index f029a4d5c..be1377b4d 100644 --- a/narwhals/_arrow/series.py +++ b/narwhals/_arrow/series.py @@ -14,7 +14,7 @@ from narwhals._arrow.utils import native_to_narwhals_dtype from narwhals._arrow.utils import validate_column_comparand from narwhals.utils import Implementation -from narwhals.utils import generate_unique_token +from narwhals.utils import generate_temporary_column_name if TYPE_CHECKING: from types import ModuleType @@ -604,7 +604,7 @@ def is_first_distinct(self: Self) -> Self: import pyarrow.compute as pc # ignore-banned-import() row_number = pa.array(np.arange(len(self))) - col_token = generate_unique_token(n_bytes=8, columns=[self.name]) + col_token = generate_temporary_column_name(n_bytes=8, columns=[self.name]) first_distinct_index = ( pa.Table.from_arrays([self._native_series], names=[self.name]) .append_column(col_token, row_number) @@ -621,7 +621,7 @@ def is_last_distinct(self: Self) -> Self: import pyarrow.compute as pc # ignore-banned-import() row_number = pa.array(np.arange(len(self))) - col_token = generate_unique_token(n_bytes=8, columns=[self.name]) + col_token = generate_temporary_column_name(n_bytes=8, columns=[self.name]) last_distinct_index = ( pa.Table.from_arrays([self._native_series], names=[self.name]) 
.append_column(col_token, row_number) @@ -715,7 +715,7 @@ def to_arrow(self: Self) -> pa.Array: def mode(self: Self) -> ArrowSeries: plx = self.__narwhals_namespace__() - col_token = generate_unique_token(n_bytes=8, columns=[self.name]) + col_token = generate_temporary_column_name(n_bytes=8, columns=[self.name]) return self.value_counts(name=col_token, normalize=False).filter( plx.col(col_token) == plx.col(col_token).max() )[self.name] diff --git a/narwhals/_dask/dataframe.py b/narwhals/_dask/dataframe.py index 039b8f33f..150b0177c 100644 --- a/narwhals/_dask/dataframe.py +++ b/narwhals/_dask/dataframe.py @@ -11,7 +11,7 @@ from narwhals._pandas_like.utils import native_to_narwhals_dtype from narwhals.utils import Implementation from narwhals.utils import flatten -from narwhals.utils import generate_unique_token +from narwhals.utils import generate_temporary_column_name from narwhals.utils import parse_columns_to_drop from narwhals.utils import parse_version @@ -194,7 +194,7 @@ def unique( native_frame = self._native_frame if keep == "none": subset = subset or self.columns - token = generate_unique_token(n_bytes=8, columns=subset) + token = generate_temporary_column_name(n_bytes=8, columns=subset) ser = native_frame.groupby(subset).size().rename(token) ser = ser.loc[ser == 1] unique = ser.reset_index().drop(columns=token) @@ -236,7 +236,7 @@ def join( if isinstance(right_on, str): right_on = [right_on] if how == "cross": - key_token = generate_unique_token( + key_token = generate_temporary_column_name( n_bytes=8, columns=[*self.columns, *other.columns] ) @@ -253,7 +253,7 @@ def join( ) if how == "anti": - indicator_token = generate_unique_token( + indicator_token = generate_temporary_column_name( n_bytes=8, columns=[*self.columns, *other.columns] ) @@ -363,7 +363,7 @@ def tail(self: Self, n: int) -> Self: raise NotImplementedError(msg) def gather_every(self: Self, n: int, offset: int) -> Self: - row_index_token = generate_unique_token(n_bytes=8, columns=self.columns) + row_index_token = generate_temporary_column_name(n_bytes=8, columns=self.columns) pln = self.__narwhals_namespace__() return ( self.with_row_index(name=row_index_token) diff --git a/narwhals/_dask/expr.py b/narwhals/_dask/expr.py index 487fba77f..db29f6c4d 100644 --- a/narwhals/_dask/expr.py +++ b/narwhals/_dask/expr.py @@ -14,7 +14,7 @@ from narwhals._pandas_like.utils import calculate_timestamp_datetime from narwhals._pandas_like.utils import native_to_narwhals_dtype from narwhals.utils import Implementation -from narwhals.utils import generate_unique_token +from narwhals.utils import generate_temporary_column_name if TYPE_CHECKING: import dask_expr @@ -580,7 +580,7 @@ def func(_input: dask_expr.Series, _quantile: float) -> dask_expr.Series: def is_first_distinct(self: Self) -> Self: def func(_input: dask_expr.Series) -> dask_expr.Series: _name = _input.name - col_token = generate_unique_token(n_bytes=8, columns=[_name]) + col_token = generate_temporary_column_name(n_bytes=8, columns=[_name]) _input = add_row_index(_input.to_frame(), col_token) first_distinct_index = _input.groupby(_name).agg({col_token: "min"})[ col_token @@ -597,7 +597,7 @@ def func(_input: dask_expr.Series) -> dask_expr.Series: def is_last_distinct(self: Self) -> Self: def func(_input: dask_expr.Series) -> dask_expr.Series: _name = _input.name - col_token = generate_unique_token(n_bytes=8, columns=[_name]) + col_token = generate_temporary_column_name(n_bytes=8, columns=[_name]) _input = add_row_index(_input.to_frame(), col_token) last_distinct_index = 
_input.groupby(_name).agg({col_token: "max"})[col_token]
diff --git a/narwhals/_pandas_like/dataframe.py b/narwhals/_pandas_like/dataframe.py
index 1ca35b197..50b309d47 100644
--- a/narwhals/_pandas_like/dataframe.py
+++ b/narwhals/_pandas_like/dataframe.py
@@ -19,7 +19,7 @@
 from narwhals.dependencies import is_numpy_array
 from narwhals.utils import Implementation
 from narwhals.utils import flatten
-from narwhals.utils import generate_unique_token
+from narwhals.utils import generate_temporary_column_name
 from narwhals.utils import is_sequence_but_not_str
 from narwhals.utils import parse_columns_to_drop
 
@@ -506,7 +506,7 @@ def join(
             self._implementation is Implementation.PANDAS
             and self._backend_version < (1, 4)
         ):
-            key_token = generate_unique_token(
+            key_token = generate_temporary_column_name(
                 n_bytes=8, columns=[*self.columns, *other.columns]
             )
 
@@ -541,7 +541,7 @@ def join(
                 )
             )
         else:
-            indicator_token = generate_unique_token(
+            indicator_token = generate_temporary_column_name(
                 n_bytes=8, columns=[*self.columns, *other.columns]
             )
 
diff --git a/narwhals/stable/v1/__init__.py b/narwhals/stable/v1/__init__.py
index f35d9197f..33531480c 100644
--- a/narwhals/stable/v1/__init__.py
+++ b/narwhals/stable/v1/__init__.py
@@ -55,6 +55,9 @@
 from narwhals.typing import IntoDataFrameT
 from narwhals.typing import IntoFrameT
 from narwhals.typing import IntoSeriesT
+from narwhals.utils import (
+    generate_temporary_column_name as nw_generate_temporary_column_name,
+)
 from narwhals.utils import is_ordered_categorical as nw_is_ordered_categorical
 from narwhals.utils import maybe_align_index as nw_maybe_align_index
 from narwhals.utils import maybe_convert_dtypes as nw_maybe_convert_dtypes
@@ -2149,6 +2152,32 @@ def maybe_reset_index(obj: T) -> T:
     return nw_maybe_reset_index(obj)
 
 
+def generate_temporary_column_name(n_bytes: int, columns: list[str]) -> str:
+    """Generates a unique token of specified `n_bytes` that is not present in the given
+    list of columns.
+
+    It relies on the [python secrets token_hex](https://docs.python.org/3/library/secrets.html#secrets.token_hex)
+    function to return a string of `n_bytes` random bytes.
+
+    Arguments:
+        n_bytes: The number of bytes to generate for the token.
+        columns: The list of columns to check for uniqueness.
+
+    Returns:
+        A unique token that is not present in the given list of columns.
+
+    Raises:
+        AssertionError: If a unique token cannot be generated after 100 attempts.
+
+    Examples:
+        >>> import narwhals.stable.v1 as nw
+        >>> columns = ["abc", "xyz"]
+        >>> nw.generate_temporary_column_name(n_bytes=8, columns=columns) not in columns
+        True
+    """
+    return nw_generate_temporary_column_name(n_bytes=n_bytes, columns=columns)
+
+
 def get_native_namespace(obj: Any) -> Any:
     """
     Get native namespace from object.
@@ -2447,6 +2476,7 @@ def from_dict(
     "maybe_get_index",
     "maybe_reset_index",
     "maybe_set_index",
+    "generate_temporary_column_name",
     "get_native_namespace",
     "get_level",
     "all",
diff --git a/narwhals/utils.py b/narwhals/utils.py
index 34b45447d..66c2badee 100644
--- a/narwhals/utils.py
+++ b/narwhals/utils.py
@@ -10,6 +10,7 @@
 from typing import Sequence
 from typing import TypeVar
 from typing import cast
+from warnings import warn
 
 from narwhals._exceptions import ColumnNotFoundError
 from narwhals.dependencies import get_cudf
@@ -481,17 +482,37 @@ def is_ordered_categorical(series: Series) -> bool:
 
 
 def generate_unique_token(n_bytes: int, columns: list[str]) -> str:  # pragma: no cover
-    """Generates a unique token of specified n_bytes that is not present in the given list of columns.
+    warn(
+        "Use `generate_temporary_column_name` instead. `generate_unique_token` is "
+        "deprecated and it will be removed in future versions",
+        DeprecationWarning,
+        stacklevel=2,
+    )
+    return generate_temporary_column_name(n_bytes=n_bytes, columns=columns)
+
+
+def generate_temporary_column_name(n_bytes: int, columns: list[str]) -> str:
+    """Generates a unique token of specified `n_bytes` that is not present in the given
+    list of columns.
+
+    It relies on the [python secrets token_hex](https://docs.python.org/3/library/secrets.html#secrets.token_hex)
+    function to return a string of `n_bytes` random bytes.
 
     Arguments:
-        n_bytes : The number of bytes to generate for the token.
-        columns : The list of columns to check for uniqueness.
+        n_bytes: The number of bytes to generate for the token.
+        columns: The list of columns to check for uniqueness.
 
     Returns:
         A unique token that is not present in the given list of columns.
 
     Raises:
         AssertionError: If a unique token cannot be generated after 100 attempts.
+ + Examples: + >>> import narwhals as nw + >>> columns = ["abc", "xyz"] + >>> nw.generate_temporary_column_name(n_bytes=8, columns=columns) not in columns + True """ counter = 0 while True: @@ -502,8 +523,8 @@ def generate_unique_token(n_bytes: int, columns: list[str]) -> str: # pragma: n counter += 1 if counter > 100: msg = ( - "Internal Error: Narwhals was not able to generate a column name to perform given " - "join operation" + "Internal Error: Narwhals was not able to generate a column name with " + f"{n_bytes=} and not in {columns}" ) raise AssertionError(msg) diff --git a/tests/utils_test.py b/tests/utils_test.py index e06cc9cac..fb668b4d2 100644 --- a/tests/utils_test.py +++ b/tests/utils_test.py @@ -1,8 +1,12 @@ from __future__ import annotations +import string + +import hypothesis.strategies as st import pandas as pd import polars as pl import pytest +from hypothesis import given from pandas.testing import assert_frame_equal from pandas.testing import assert_index_equal from pandas.testing import assert_series_equal @@ -147,3 +151,29 @@ def test_maybe_convert_dtypes_polars() -> None: def test_get_trivial_version_with_uninstalled_module() -> None: result = get_module_version_as_tuple("non_existent_module") assert result == (0, 0, 0) + + +@given(n_bytes=st.integers(1, 100)) # type: ignore[misc] +def test_generate_temporary_column_name(n_bytes: int) -> None: + columns = ["abc", "XYZ"] + + temp_col_name = nw.generate_temporary_column_name(n_bytes=n_bytes, columns=columns) + assert temp_col_name not in columns + + +def test_generate_temporary_column_name_raise() -> None: + from itertools import product + + columns = [ + "".join(t) + for t in product( + string.ascii_lowercase + string.digits, + string.ascii_lowercase + string.digits, + ) + ] + + with pytest.raises( + AssertionError, + match="Internal Error: Narwhals was not able to generate a column name with ", + ): + nw.generate_temporary_column_name(n_bytes=1, columns=columns) From 425dbe4c465bc877d0ccd37fb5c0c73c0fa003e3 Mon Sep 17 00:00:00 2001 From: Marco Edward Gorelli Date: Mon, 28 Oct 2024 07:56:14 +0000 Subject: [PATCH 10/11] fix: support groupby.iter for cudf (#1265) --- narwhals/_pandas_like/group_by.py | 17 +++++++++-------- tests/group_by_test.py | 7 +------ 2 files changed, 10 insertions(+), 14 deletions(-) diff --git a/narwhals/_pandas_like/group_by.py b/narwhals/_pandas_like/group_by.py index 0bd52a0cf..c628ecbdb 100644 --- a/narwhals/_pandas_like/group_by.py +++ b/narwhals/_pandas_like/group_by.py @@ -103,15 +103,16 @@ def _from_native_frame(self, df: PandasLikeDataFrame) -> PandasLikeDataFrame: def __iter__(self) -> Iterator[tuple[Any, PandasLikeDataFrame]]: indices = self._grouped.indices - for key in indices: - if ( - self._df._implementation is Implementation.PANDAS - and self._df._backend_version < (2, 2) - ): # pragma: no cover - pass - else: # pragma: no cover + if ( + self._df._implementation is Implementation.PANDAS + and self._df._backend_version < (2, 2) + ) or (self._df._implementation is Implementation.CUDF): # pragma: no cover + for key in indices: + yield (key, self._from_native_frame(self._grouped.get_group(key))) + else: + for key in indices: key = tupleify(key) # noqa: PLW2901 - yield (key, self._from_native_frame(self._grouped.get_group(key))) + yield (key, self._from_native_frame(self._grouped.get_group(key))) def agg_pandas( # noqa: PLR0915 diff --git a/tests/group_by_test.py b/tests/group_by_test.py index 63cc631a8..09ee213e8 100644 --- a/tests/group_by_test.py +++ b/tests/group_by_test.py @@ 
-74,12 +74,7 @@ def test_invalid_group_by() -> None:
     )
 
 
-def test_group_by_iter(
-    constructor_eager: ConstructorEager, request: pytest.FixtureRequest
-) -> None:
-    if "cudf" in str(constructor_eager):
-        # https://github.com/rapidsai/cudf/issues/17187
-        request.applymarker(pytest.mark.xfail)
+def test_group_by_iter(constructor_eager: ConstructorEager) -> None:
     df = nw.from_native(constructor_eager(data), eager_only=True)
     expected_keys = [(1,), (3,)]
     keys = []

From 0b333f99a1aaf03d8934318601c7738b75212cf0 Mon Sep 17 00:00:00 2001
From: Francesco Bruzzesi <42817048+FBruzzesi@users.noreply.github.com>
Date: Mon, 28 Oct 2024 10:10:31 +0100
Subject: [PATCH 11/11] RFC, feat: add `.select` (by str) for duckdb and ibis
 backend (#1266)

---
 docs/extending.md                      | 13 ++++++++
 docs/index.md                          |  2 +-
 narwhals/_duckdb/dataframe.py          | 19 +++++++++++
 narwhals/_ibis/dataframe.py            | 21 ++++++++++++
 tests/frame/interchange_select_test.py | 46 ++++++++++++++++++++++++++
 5 files changed, 100 insertions(+), 1 deletion(-)
 create mode 100644 tests/frame/interchange_select_test.py

diff --git a/docs/extending.md b/docs/extending.md
index 22d85f701..814db3287 100644
--- a/docs/extending.md
+++ b/docs/extending.md
@@ -37,6 +37,7 @@ def func(df: FrameT) -> FrameT:
         b_std=nw.col("b").std(),
     )
 ```
+
 will work for any of pandas, Polars, cuDF, Modin, and PyArrow.
 
 However, sometimes you don't need to do complex operations on dataframes - all you need
@@ -57,9 +58,21 @@ def func(df: Any) -> Schema:
     df = nw.from_native(df, eager_or_interchange_only=True)
     return df.schema
 ```
+
 is also supported, meaning that, in addition to the libraries mentioned above, you can
 also pass Ibis, DuckDB, Vaex, and any library which implements the protocol.
 
+#### Interchange-only support
+
+While libraries for which we have full support can benefit from the whole Narwhals API,
+libraries which have interchange-only support can access the following methods after
+converting to a Narwhals DataFrame:
+
+- `.schema`, hence column names via `.schema.names()` and column types via `.schema.dtypes()`
+- `.to_pandas()` and `.to_arrow()`, for converting to pandas and PyArrow, respectively.
+- `.select(names)` (Ibis and DuckDB), where `names` is a list of (string) column names. This is useful for
+  selecting columns before converting to another library.
+
 ### Extending Narwhals
 
 If you want your own library to be recognised too, you're welcome to open a PR (with tests)!
diff --git a/docs/index.md b/docs/index.md
index f18d9af85..e9fe02170 100644
--- a/docs/index.md
+++ b/docs/index.md
@@ -6,7 +6,7 @@ Extremely lightweight and extensible compatibility layer between dataframe libra
 
 - **Full API support**: cuDF, Modin, pandas, Polars, PyArrow
 - **Lazy-only support**: Dask
-- **Interchange-level support**: Ibis, Vaex, anything else which implements the DataFrame Interchange Protocol
+- **Interchange-level support**: Ibis, DuckDB, Vaex, anything else which implements the DataFrame Interchange Protocol
 
 Seamlessly support all, without depending on any!
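For illustration, here is a minimal usage sketch of the interchange-level `select` introduced by this patch, run against a DuckDB relation. The relation and its data are made up for the example; the API calls mirror the tests added further down.

```python
import duckdb

import narwhals.stable.v1 as nw

# Illustrative relation with three columns.
rel = duckdb.sql("SELECT * FROM (VALUES (1, 4.0, 'x'), (2, 5.0, 'y')) AS t(a, b, z)")

# Interchange-level frames only support `select`-ing by (string) column name.
df = nw.from_native(rel, eager_or_interchange_only=True)
subset = df.select("a", "z")

print(subset.schema.names())  # ['a', 'z']
print(subset.to_pandas())     # narrow the columns first, then convert
```
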
diff --git a/narwhals/_duckdb/dataframe.py b/narwhals/_duckdb/dataframe.py index 82ac6d41b..5bd7af153 100644 --- a/narwhals/_duckdb/dataframe.py +++ b/narwhals/_duckdb/dataframe.py @@ -90,6 +90,22 @@ def __getitem__(self, item: str) -> DuckDBInterchangeSeries: self._native_frame.select(item), dtypes=self._dtypes ) + def select( + self: Self, + *exprs: Any, + **named_exprs: Any, + ) -> Self: + if named_exprs or not all(isinstance(x, str) for x in exprs): # pragma: no cover + msg = ( + "`select`-ing not by name is not supported for DuckDB backend.\n\n" + "If you would like to see this kind of object better supported in " + "Narwhals, please open a feature request " + "at https://github.com/narwhals-dev/narwhals/issues." + ) + raise NotImplementedError(msg) + + return self._from_native_frame(self._native_frame.select(*exprs)) + def __getattr__(self, attr: str) -> Any: if attr == "schema": return { @@ -120,3 +136,6 @@ def to_pandas(self: Self) -> pd.DataFrame: def to_arrow(self: Self) -> pa.Table: return self._native_frame.arrow() + + def _from_native_frame(self: Self, df: Any) -> Self: + return self.__class__(df, dtypes=self._dtypes) diff --git a/narwhals/_ibis/dataframe.py b/narwhals/_ibis/dataframe.py index a9c3a49fa..c8a665db0 100644 --- a/narwhals/_ibis/dataframe.py +++ b/narwhals/_ibis/dataframe.py @@ -85,6 +85,24 @@ def to_pandas(self: Self) -> pd.DataFrame: def to_arrow(self: Self) -> pa.Table: return self._native_frame.to_pyarrow() + def select( + self: Self, + *exprs: Any, + **named_exprs: Any, + ) -> Self: + if named_exprs or not all(isinstance(x, str) for x in exprs): # pragma: no cover + msg = ( + "`select`-ing not by name is not supported for Ibis backend.\n\n" + "If you would like to see this kind of object better supported in " + "Narwhals, please open a feature request " + "at https://github.com/narwhals-dev/narwhals/issues." + ) + raise NotImplementedError(msg) + + import ibis.selectors as s + + return self._from_native_frame(self._native_frame.select(s.cols(*exprs))) + def __getattr__(self, attr: str) -> Any: if attr == "schema": return { @@ -98,3 +116,6 @@ def __getattr__(self, attr: str) -> Any: "at https://github.com/narwhals-dev/narwhals/issues." 
) raise NotImplementedError(msg) + + def _from_native_frame(self: Self, df: Any) -> Self: + return self.__class__(df, dtypes=self._dtypes) diff --git a/tests/frame/interchange_select_test.py b/tests/frame/interchange_select_test.py new file mode 100644 index 000000000..e124735f7 --- /dev/null +++ b/tests/frame/interchange_select_test.py @@ -0,0 +1,46 @@ +from __future__ import annotations + +import duckdb +import polars as pl +import pytest + +import narwhals.stable.v1 as nw + +data = {"a": [1, 2, 3], "b": [4.0, 5.0, 6.1], "z": ["x", "y", "z"]} + + +def test_interchange() -> None: + df_pl = pl.DataFrame(data) + df = nw.from_native(df_pl.__dataframe__(), eager_or_interchange_only=True) + with pytest.raises( + NotImplementedError, + match="Attribute select is not supported for metadata-only dataframes", + ): + df.select("a", "z") + + +def test_interchange_ibis( + tmpdir: pytest.TempdirFactory, +) -> None: # pragma: no cover + ibis = pytest.importorskip("ibis") + df_pl = pl.DataFrame(data) + + filepath = str(tmpdir / "file.parquet") # type: ignore[operator] + df_pl.write_parquet(filepath) + + tbl = ibis.read_parquet(filepath) + df = nw.from_native(tbl, eager_or_interchange_only=True) + + out_cols = df.select("a", "z").schema.names() + + assert out_cols == ["a", "z"] + + +def test_interchange_duckdb() -> None: + df_pl = pl.DataFrame(data) # noqa: F841 + rel = duckdb.sql("select * from df_pl") + df = nw.from_native(rel, eager_or_interchange_only=True) + + out_cols = df.select("a", "z").schema.names() + + assert out_cols == ["a", "z"]
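
As a usage sketch for `generate_temporary_column_name` from the earlier patch in this series: the frame, expressions, and threshold below are hypothetical, chosen only to show the intended pattern of creating a clash-free helper column and dropping it afterwards.

```python
import pandas as pd

import narwhals as nw

df = nw.from_native(pd.DataFrame({"a": [1, 2, 3]}), eager_only=True)

# Pick a helper-column name guaranteed not to clash with existing columns;
# token_hex(n_bytes) yields 2 * n_bytes hex characters, retried on collision.
tmp = nw.generate_temporary_column_name(n_bytes=8, columns=df.columns)

result = (
    df.with_columns((nw.col("a") * 2).alias(tmp))  # temporary helper column
    .filter(nw.col(tmp) > 2)
    .drop(tmp)  # drop the helper before handing the frame back
)
print(result.to_native())
```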