From df1feb5c4296a6253de39d87e669025b08d365c4 Mon Sep 17 00:00:00 2001 From: Cameron Riddell Date: Thu, 19 Dec 2024 12:43:50 -0800 Subject: [PATCH 01/13] add is_nan expression & series method - add support for pandas, arrow, dask - add to docs - add tests --- docs/api-reference/expr.md | 1 + docs/api-reference/series.md | 1 + narwhals/_arrow/expr.py | 3 ++ narwhals/_arrow/series.py | 6 +++ narwhals/_dask/expr.py | 19 ++++++++ narwhals/_pandas_like/expr.py | 3 ++ narwhals/_pandas_like/series.py | 11 +++++ narwhals/expr.py | 72 ++++++++++++++++++++++++++++ narwhals/series.py | 40 ++++++++++++++++ tests/expr_and_series/is_nan_test.py | 40 ++++++++++++++++ 10 files changed, 196 insertions(+) create mode 100644 tests/expr_and_series/is_nan_test.py diff --git a/docs/api-reference/expr.md b/docs/api-reference/expr.md index fef26d1e2..d1e71caa1 100644 --- a/docs/api-reference/expr.md +++ b/docs/api-reference/expr.md @@ -33,6 +33,7 @@ - is_in - is_last_distinct - is_null + - is_nan - is_unique - len - map_batches diff --git a/docs/api-reference/series.md b/docs/api-reference/series.md index d5a446dc3..f23215225 100644 --- a/docs/api-reference/series.md +++ b/docs/api-reference/series.md @@ -39,6 +39,7 @@ - is_in - is_last_distinct - is_null + - is_nan - is_sorted - is_unique - item diff --git a/narwhals/_arrow/expr.py b/narwhals/_arrow/expr.py index b960ffa4e..ba962ab64 100644 --- a/narwhals/_arrow/expr.py +++ b/narwhals/_arrow/expr.py @@ -312,6 +312,9 @@ def null_count(self: Self) -> Self: def is_null(self: Self) -> Self: return reuse_series_implementation(self, "is_null") + def is_nan(self: Self) -> Self: + return reuse_series_implementation(self, "is_nan") + def is_between(self: Self, lower_bound: Any, upper_bound: Any, closed: str) -> Self: return reuse_series_implementation( self, "is_between", lower_bound, upper_bound, closed diff --git a/narwhals/_arrow/series.py b/narwhals/_arrow/series.py index 2f29ab9db..e15f1d9b4 100644 --- a/narwhals/_arrow/series.py +++ b/narwhals/_arrow/series.py @@ -514,6 +514,12 @@ def is_null(self: Self) -> Self: ser = self._native_series return self._from_native_series(ser.is_null()) + def is_nan(self: Self) -> Self: + import pyarrow.compute as pc + + ser = self._native_series + return self._from_native_series(pc.is_nan(ser)) + def cast(self: Self, dtype: DType) -> Self: import pyarrow.compute as pc diff --git a/narwhals/_dask/expr.py b/narwhals/_dask/expr.py index 0e8985791..958e610c3 100644 --- a/narwhals/_dask/expr.py +++ b/narwhals/_dask/expr.py @@ -689,6 +689,25 @@ def is_null(self: Self) -> Self: returns_scalar=False, ) + def is_nan(self: Self) -> Self: + def func(_input: dask_expr.Series) -> dask_expr.Series: + dtype = native_to_narwhals_dtype(_input, self._version, Implementation.DASK) + dtypes = import_dtypes_module(self._version) + if dtype == dtypes.Float64: + return _input != _input # noqa: PLR0124 + + import dask_expr as dx + + return dx.new_collection( + dx.expr.ScalarToSeries(frame=False, index=_input.index) + ) + + return self._from_call( + func, + "is_null", + returns_scalar=False, + ) + def len(self: Self) -> Self: return self._from_call( lambda _input: _input.size, diff --git a/narwhals/_pandas_like/expr.py b/narwhals/_pandas_like/expr.py index 8d507fb46..f50c1e8cc 100644 --- a/narwhals/_pandas_like/expr.py +++ b/narwhals/_pandas_like/expr.py @@ -306,6 +306,9 @@ def is_between( def is_null(self) -> Self: return reuse_series_implementation(self, "is_null") + def is_nan(self) -> Self: + return reuse_series_implementation(self, "is_nan") + def fill_null( self, value: Any | None = None, diff --git a/narwhals/_pandas_like/series.py b/narwhals/_pandas_like/series.py index a8b57c5ce..39c44226b 100644 --- a/narwhals/_pandas_like/series.py +++ b/narwhals/_pandas_like/series.py @@ -654,6 +654,17 @@ def is_null(self) -> PandasLikeSeries: ser = self._native_series return self._from_native_series(ser.isna()) + def is_nan(self) -> PandasLikeSeries: + ser = self._native_series + dtypes = import_dtypes_module(self._version) + if self.dtype == dtypes.Float64: + return self._from_native_series(ser != ser) # noqa: PLR0124 + return self._from_native_series( + self._implementation.to_native_namespace().Series( + data=False, index=ser.index, name=ser.name + ) + ) + def fill_null( self, value: Any | None = None, diff --git a/narwhals/expr.py b/narwhals/expr.py index 24ddd3f40..fc53c15c2 100644 --- a/narwhals/expr.py +++ b/narwhals/expr.py @@ -1921,6 +1921,78 @@ def is_null(self) -> Self: """ return self.__class__(lambda plx: self._to_compliant_expr(plx).is_null()) + def is_nan(self) -> Self: + """Returns a boolean Series indicating which values are NaN. + + Returns: + A new expression. + + Notes: + pandas, Polars and PyArrow handle null values differently. Polars and PyArrow + distinguish between NaN and Null, whereas pandas doesn't. + + Examples: + >>> import pandas as pd + >>> import polars as pl + >>> import pyarrow as pa + >>> import narwhals as nw + >>> from narwhals.typing import IntoFrameT + >>> df_pd = pd.DataFrame( + ... {"a": [2, 4, None, 3, 5], "b": [2.0, 4.0, float("nan"), 3.0, 5.0]} + ... ).astype({"a": "Int64"}) + >>> df_pl = pl.DataFrame( + ... {"a": [2, 4, None, 3, 5], "b": [2.0, 4.0, float("nan"), 3.0, 5.0]} + ... ) + >>> df_pa = pa.table( + ... {"a": [2, 4, None, 3, 5], "b": [2.0, 4.0, float("nan"), 3.0, 5.0]} + ... ) + + Let's define a dataframe-agnostic function: + + >>> def my_library_agnostic_function(df_native: IntoFrameT) -> IntoFrameT: + ... df = nw.from_native(df_native) + ... return df.with_columns( + ... a_is_nan=nw.col("a").is_nan(), b_is_nan=nw.col("b").is_nan() + ... ).to_native() + + We can then pass any supported library such as Pandas, Polars, or PyArrow to `func`: + + >>> my_library_agnostic_function(df_pd) + a b a_is_nan b_is_nan + 0 2 2.0 False False + 1 4 4.0 False False + 2 NaN False True + 3 3 3.0 False False + 4 5 5.0 False False + + >>> my_library_agnostic_function(df_pl) # nan != null for polars + shape: (5, 4) + ┌──────┬─────┬──────────┬──────────┐ + │ a ┆ b ┆ a_is_nan ┆ b_is_nan │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ bool │ + ╞══════╪═════╪══════════╪══════════╡ + │ 2 ┆ 2.0 ┆ false ┆ false │ + │ 4 ┆ 4.0 ┆ false ┆ false │ + │ null ┆ NaN ┆ false ┆ true │ + │ 3 ┆ 3.0 ┆ false ┆ false │ + │ 5 ┆ 5.0 ┆ false ┆ false │ + └──────┴─────┴──────────┴──────────┘ + + >>> my_library_agnostic_function(df_pa) # nan != null for pyarrow + pyarrow.Table + a: int64 + b: double + a_is_nan: bool + b_is_nan: bool + ---- + a: [[2,4,null,3,5]] + b: [[2,4,nan,3,5]] + a_is_nan: [[false,false,null,false,false]] + b_is_nan: [[false,false,true,false,false]] + """ + return self.__class__(lambda plx: self._to_compliant_expr(plx).is_nan()) + def arg_true(self) -> Self: """Find elements where boolean expression is True. diff --git a/narwhals/series.py b/narwhals/series.py index 98baab296..aa821bbf9 100644 --- a/narwhals/series.py +++ b/narwhals/series.py @@ -1915,6 +1915,46 @@ def is_null(self) -> Self: """ return self._from_compliant_series(self._compliant_series.is_null()) + def is_nan(self) -> Self: + """Returns a boolean Series indicating which values are null. + + Notes: + pandas and Polars handle NaN values differently. Polars distinguishes + between NaN and Null, whereas pandas doesn't. + + Examples: + >>> import pandas as pd + >>> import polars as pl + >>> import narwhals as nw + >>> from narwhals.typing import IntoSeriesT + >>> s = [1.0, 2.0, float("nan")] + >>> s_pd = pd.Series(s, dtype="float64") + >>> s_pl = pl.Series(s) + + We define a dataframe-agnostic function: + + >>> def my_library_agnostic_function(s_native: IntoSeriesT) -> IntoSeriesT: + ... s = nw.from_native(s_native, series_only=True) + ... return s.is_nan().to_native() + + We can then pass either pandas or Polars to `func`: + + >>> my_library_agnostic_function(s_pd) + 0 False + 1 False + 2 True + dtype: bool + >>> my_library_agnostic_function(s_pl) # doctest: +NORMALIZE_WHITESPACE + shape: (3,) + Series: '' [bool] + [ + false + false + true + ] + """ + return self._from_compliant_series(self._compliant_series.is_nan()) + def fill_null( self, value: Any | None = None, diff --git a/tests/expr_and_series/is_nan_test.py b/tests/expr_and_series/is_nan_test.py new file mode 100644 index 000000000..238e7c3b3 --- /dev/null +++ b/tests/expr_and_series/is_nan_test.py @@ -0,0 +1,40 @@ +from __future__ import annotations + +import narwhals.stable.v1 as nw +from tests.conftest import dask_lazy_p2_constructor +from tests.conftest import pandas_constructor +from tests.utils import Constructor +from tests.utils import ConstructorEager +from tests.utils import assert_equal_data + +NON_NULLABLE_CONSTRUCTORS = [pandas_constructor, dask_lazy_p2_constructor] + + +def test_nan(constructor: Constructor) -> None: + data_na = {"a": [0, 1, None]} + df = nw.from_native(constructor(data_na)).select(nw.col("a") / nw.col("a")) + result = df.select(nw.col("a").is_nan()) + if any(constructor is c for c in NON_NULLABLE_CONSTRUCTORS): + # Null values are coerced to NaN for non-nullable datatypes + expected = {"a": [True, False, True]} + else: + # Null are preserved and should be differentiated for nullable datatypes + expected = {"a": [True, False, None]} # type: ignore[list-item] + + assert_equal_data(result, expected) + + +def test_nan_series(constructor_eager: ConstructorEager) -> None: + data_na = {"a": [0, 1, None]} + df = nw.from_native(constructor_eager(data_na), eager_only=True).select( + nw.col("a") / nw.col("a") + ) + result = {"a": df["a"].is_nan()} + if any(constructor_eager is c for c in NON_NULLABLE_CONSTRUCTORS): + # Null values are coerced to NaN for non-nullable datatypes + expected = {"a": [True, False, True]} + else: + # Null are preserved for nullable datatypes + expected = {"a": [True, False, None]} # type: ignore[list-item] + + assert_equal_data(result, expected) From b6c53fc045fee3f30470f08129cec3c4a28d0e70 Mon Sep 17 00:00:00 2001 From: Cameron Riddell Date: Fri, 20 Dec 2024 06:56:12 -0800 Subject: [PATCH 02/13] alphabetize is_nan in ref/compl docs --- docs/api-reference/expr.md | 2 +- docs/api-reference/series.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/api-reference/expr.md b/docs/api-reference/expr.md index d1e71caa1..299ab2d4a 100644 --- a/docs/api-reference/expr.md +++ b/docs/api-reference/expr.md @@ -32,8 +32,8 @@ - is_first_distinct - is_in - is_last_distinct - - is_null - is_nan + - is_null - is_unique - len - map_batches diff --git a/docs/api-reference/series.md b/docs/api-reference/series.md index f23215225..c2e35a3c5 100644 --- a/docs/api-reference/series.md +++ b/docs/api-reference/series.md @@ -38,8 +38,8 @@ - is_first_distinct - is_in - is_last_distinct - - is_null - is_nan + - is_null - is_sorted - is_unique - item From ec3b3d5f7487cc7fb938f72f29ea7bf921a0954b Mon Sep 17 00:00:00 2001 From: Cameron Riddell Date: Fri, 27 Dec 2024 11:25:39 -0800 Subject: [PATCH 03/13] clean up is_nan examples --- narwhals/expr.py | 23 +++++++++-------------- narwhals/series.py | 21 ++++++++++++++++----- 2 files changed, 25 insertions(+), 19 deletions(-) diff --git a/narwhals/expr.py b/narwhals/expr.py index fc53c15c2..f6b492b06 100644 --- a/narwhals/expr.py +++ b/narwhals/expr.py @@ -1937,27 +1937,22 @@ def is_nan(self) -> Self: >>> import pyarrow as pa >>> import narwhals as nw >>> from narwhals.typing import IntoFrameT - >>> df_pd = pd.DataFrame( - ... {"a": [2, 4, None, 3, 5], "b": [2.0, 4.0, float("nan"), 3.0, 5.0]} - ... ).astype({"a": "Int64"}) - >>> df_pl = pl.DataFrame( - ... {"a": [2, 4, None, 3, 5], "b": [2.0, 4.0, float("nan"), 3.0, 5.0]} - ... ) - >>> df_pa = pa.table( - ... {"a": [2, 4, None, 3, 5], "b": [2.0, 4.0, float("nan"), 3.0, 5.0]} - ... ) + >>> data = {"a": [2, 4, None, 3, 5], "b": [2.0, 4.0, float("nan"), 3.0, 5.0]} + >>> df_pd = pd.DataFrame(data).astype({"a": "Int64"}) + >>> df_pl = pl.DataFrame(data) + >>> df_pa = pa.table(data) Let's define a dataframe-agnostic function: - >>> def my_library_agnostic_function(df_native: IntoFrameT) -> IntoFrameT: + >>> def agnostic_is_nan_columns(df_native: IntoFrameT) -> IntoFrameT: ... df = nw.from_native(df_native) ... return df.with_columns( ... a_is_nan=nw.col("a").is_nan(), b_is_nan=nw.col("b").is_nan() ... ).to_native() - We can then pass any supported library such as Pandas, Polars, or PyArrow to `func`: + We can then pass any supported library such as Pandas, Polars, or PyArrow to `agnostic_is_nan_columns`: - >>> my_library_agnostic_function(df_pd) + >>> agnostic_is_nan_columns(df_pd) a b a_is_nan b_is_nan 0 2 2.0 False False 1 4 4.0 False False @@ -1965,7 +1960,7 @@ def is_nan(self) -> Self: 3 3 3.0 False False 4 5 5.0 False False - >>> my_library_agnostic_function(df_pl) # nan != null for polars + >>> agnostic_is_nan_columns(df_pl) # nan != null for polars shape: (5, 4) ┌──────┬─────┬──────────┬──────────┐ │ a ┆ b ┆ a_is_nan ┆ b_is_nan │ @@ -1979,7 +1974,7 @@ def is_nan(self) -> Self: │ 5 ┆ 5.0 ┆ false ┆ false │ └──────┴─────┴──────────┴──────────┘ - >>> my_library_agnostic_function(df_pa) # nan != null for pyarrow + >>> agnostic_is_nan_columns(df_pa) # nan != null for pyarrow pyarrow.Table a: int64 b: double diff --git a/narwhals/series.py b/narwhals/series.py index aa821bbf9..d3cee06d6 100644 --- a/narwhals/series.py +++ b/narwhals/series.py @@ -1925,26 +1925,28 @@ def is_nan(self) -> Self: Examples: >>> import pandas as pd >>> import polars as pl + >>> import pyarrow as pa >>> import narwhals as nw >>> from narwhals.typing import IntoSeriesT >>> s = [1.0, 2.0, float("nan")] >>> s_pd = pd.Series(s, dtype="float64") >>> s_pl = pl.Series(s) + >>> s_pa = pa.chunked_array([s], type=pa.float64()) - We define a dataframe-agnostic function: + We define a series-agnostic function: - >>> def my_library_agnostic_function(s_native: IntoSeriesT) -> IntoSeriesT: + >>> def agnostic_is_nan_series(s_native: IntoSeriesT) -> IntoSeriesT: ... s = nw.from_native(s_native, series_only=True) ... return s.is_nan().to_native() - We can then pass either pandas or Polars to `func`: + We can then pass either pandas or Polars to `agnostic_is_nan_series`: - >>> my_library_agnostic_function(s_pd) + >>> agnostic_is_nan_series(s_pd) 0 False 1 False 2 True dtype: bool - >>> my_library_agnostic_function(s_pl) # doctest: +NORMALIZE_WHITESPACE + >>> agnostic_is_nan_series(s_pl) # doctest: +NORMALIZE_WHITESPACE shape: (3,) Series: '' [bool] [ @@ -1952,6 +1954,15 @@ def is_nan(self) -> Self: false true ] + >>> agnostic_is_nan_series(s_pa) # doctest: +NORMALIZE_WHITESPACE + + [ + [ + false, + false, + true + ] + ] """ return self._from_compliant_series(self._compliant_series.is_nan()) From ad95bf74a20c22ce8fb5b768733897ca6c48ad08 Mon Sep 17 00:00:00 2001 From: Cameron Riddell Date: Fri, 27 Dec 2024 11:27:08 -0800 Subject: [PATCH 04/13] error for is_nan non-numeric dtypes --- narwhals/_dask/expr.py | 12 ++++-------- narwhals/_pandas_like/series.py | 13 ++++--------- 2 files changed, 8 insertions(+), 17 deletions(-) diff --git a/narwhals/_dask/expr.py b/narwhals/_dask/expr.py index 958e610c3..9016397a6 100644 --- a/narwhals/_dask/expr.py +++ b/narwhals/_dask/expr.py @@ -15,6 +15,7 @@ from narwhals._pandas_like.utils import calculate_timestamp_datetime from narwhals._pandas_like.utils import native_to_narwhals_dtype from narwhals.exceptions import ColumnNotFoundError +from narwhals.exceptions import InvalidOperationError from narwhals.typing import CompliantExpr from narwhals.utils import Implementation from narwhals.utils import generate_temporary_column_name @@ -692,15 +693,10 @@ def is_null(self: Self) -> Self: def is_nan(self: Self) -> Self: def func(_input: dask_expr.Series) -> dask_expr.Series: dtype = native_to_narwhals_dtype(_input, self._version, Implementation.DASK) - dtypes = import_dtypes_module(self._version) - if dtype == dtypes.Float64: + if dtype.is_numeric(): return _input != _input # noqa: PLR0124 - - import dask_expr as dx - - return dx.new_collection( - dx.expr.ScalarToSeries(frame=False, index=_input.index) - ) + msg = f"`is_nan` is not supported for dtype {dtype}" + raise InvalidOperationError(msg) return self._from_call( func, diff --git a/narwhals/_pandas_like/series.py b/narwhals/_pandas_like/series.py index 39c44226b..cc7b5a336 100644 --- a/narwhals/_pandas_like/series.py +++ b/narwhals/_pandas_like/series.py @@ -20,6 +20,7 @@ from narwhals._pandas_like.utils import set_axis from narwhals._pandas_like.utils import to_datetime from narwhals.dependencies import is_numpy_scalar +from narwhals.exceptions import InvalidOperationError from narwhals.typing import CompliantSeries from narwhals.utils import Implementation from narwhals.utils import import_dtypes_module @@ -614,8 +615,6 @@ def mean(self) -> Any: return ser.mean() def median(self) -> Any: - from narwhals.exceptions import InvalidOperationError - if not self.dtype.is_numeric(): msg = "`median` operation not supported for non-numeric input type." raise InvalidOperationError(msg) @@ -656,14 +655,10 @@ def is_null(self) -> PandasLikeSeries: def is_nan(self) -> PandasLikeSeries: ser = self._native_series - dtypes = import_dtypes_module(self._version) - if self.dtype == dtypes.Float64: + if self.dtype.is_numeric(): return self._from_native_series(ser != ser) # noqa: PLR0124 - return self._from_native_series( - self._implementation.to_native_namespace().Series( - data=False, index=ser.index, name=ser.name - ) - ) + msg = f"`is_nan` is not supported for dtype {self.dtype}" + raise InvalidOperationError(msg) def fill_null( self, From d864063345838e1416e1fdbf7a2125011b09dd59 Mon Sep 17 00:00:00 2001 From: Cameron Riddell Date: Fri, 27 Dec 2024 11:34:16 -0800 Subject: [PATCH 05/13] fix doctest expr --- narwhals/expr.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/narwhals/expr.py b/narwhals/expr.py index f6b492b06..e427dc72a 100644 --- a/narwhals/expr.py +++ b/narwhals/expr.py @@ -1956,7 +1956,7 @@ def is_nan(self) -> Self: a b a_is_nan b_is_nan 0 2 2.0 False False 1 4 4.0 False False - 2 NaN False True + 2 NaN True 3 3 3.0 False False 4 5 5.0 False False From bacd81faaccce944c959c4ca9c4c1139fa0a9f13 Mon Sep 17 00:00:00 2001 From: Cameron Riddell Date: Fri, 27 Dec 2024 11:29:03 -0800 Subject: [PATCH 06/13] is_nan tests better coverage --- tests/expr_and_series/is_nan_test.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/tests/expr_and_series/is_nan_test.py b/tests/expr_and_series/is_nan_test.py index 238e7c3b3..240d4f52a 100644 --- a/tests/expr_and_series/is_nan_test.py +++ b/tests/expr_and_series/is_nan_test.py @@ -1,6 +1,9 @@ from __future__ import annotations +import pytest + import narwhals.stable.v1 as nw +from narwhals.exceptions import InvalidOperationError from tests.conftest import dask_lazy_p2_constructor from tests.conftest import pandas_constructor from tests.utils import Constructor @@ -38,3 +41,14 @@ def test_nan_series(constructor_eager: ConstructorEager) -> None: expected = {"a": [True, False, None]} # type: ignore[list-item] assert_equal_data(result, expected) + + +def test_nan_non_float() -> None: + data = {"a": ["0", "1"]} + pd_df = nw.from_native(pandas_constructor(data)) + with pytest.raises(InvalidOperationError, match="not supported"): + pd_df.select(nw.col("a").is_nan()) + + dd_df = nw.from_native(dask_lazy_p2_constructor(data)) + with pytest.raises(InvalidOperationError, match="not supported"): + dd_df.select(nw.col("a").is_nan()) From 3c7be12e914c489fd57e9b3722cc1ad20a1ebb5e Mon Sep 17 00:00:00 2001 From: Cameron Riddell Date: Mon, 30 Dec 2024 10:56:55 -0800 Subject: [PATCH 07/13] fix is_nan documentation examples --- narwhals/expr.py | 69 +++++++++++++++++++++++----------------------- narwhals/series.py | 46 +++++++++++++++++-------------- 2 files changed, 59 insertions(+), 56 deletions(-) diff --git a/narwhals/expr.py b/narwhals/expr.py index c77fac102..2c9e5b072 100644 --- a/narwhals/expr.py +++ b/narwhals/expr.py @@ -1931,6 +1931,9 @@ def is_nan(self) -> Self: Returns: A new expression. + Raises: + narwhals.InvalidOperationError for non-Float64 dtypes. + Notes: pandas, Polars and PyArrow handle null values differently. Polars and PyArrow distinguish between NaN and Null, whereas pandas doesn't. @@ -1941,54 +1944,50 @@ def is_nan(self) -> Self: >>> import pyarrow as pa >>> import narwhals as nw >>> from narwhals.typing import IntoFrameT - >>> data = {"a": [2, 4, None, 3, 5], "b": [2.0, 4.0, float("nan"), 3.0, 5.0]} - >>> df_pd = pd.DataFrame(data).astype({"a": "Int64"}) + >>> data = {"orig": [0.0, None, 2.0]} + >>> df_pd = pd.DataFrame(data).astype({"orig": "Float64"}) >>> df_pl = pl.DataFrame(data) >>> df_pa = pa.table(data) Let's define a dataframe-agnostic function: - >>> def agnostic_is_nan_columns(df_native: IntoFrameT) -> IntoFrameT: + >>> def agnostic_self_div_is_nan(df_native: IntoFrameT) -> IntoFrameT: ... df = nw.from_native(df_native) ... return df.with_columns( - ... a_is_nan=nw.col("a").is_nan(), b_is_nan=nw.col("b").is_nan() + ... divided=nw.col("orig") / nw.col("orig"), + ... divided_is_nan=(nw.col("orig") / nw.col("orig")).is_nan(), ... ).to_native() - We can then pass any supported library such as Pandas, Polars, or PyArrow to `agnostic_is_nan_columns`: + We can then pass any supported library such as Pandas, Polars, or PyArrow to `agnostic_self_div_is_nan`: - >>> agnostic_is_nan_columns(df_pd) - a b a_is_nan b_is_nan - 0 2 2.0 False False - 1 4 4.0 False False - 2 NaN True - 3 3 3.0 False False - 4 5 5.0 False False + >>> print(agnostic_self_div_is_nan(df_pd)) + orig divided divided_is_nan + 0 0.0 NaN True + 1 + 2 2.0 1.0 False - >>> agnostic_is_nan_columns(df_pl) # nan != null for polars - shape: (5, 4) - ┌──────┬─────┬──────────┬──────────┐ - │ a ┆ b ┆ a_is_nan ┆ b_is_nan │ - │ --- ┆ --- ┆ --- ┆ --- │ - │ i64 ┆ f64 ┆ bool ┆ bool │ - ╞══════╪═════╪══════════╪══════════╡ - │ 2 ┆ 2.0 ┆ false ┆ false │ - │ 4 ┆ 4.0 ┆ false ┆ false │ - │ null ┆ NaN ┆ false ┆ true │ - │ 3 ┆ 3.0 ┆ false ┆ false │ - │ 5 ┆ 5.0 ┆ false ┆ false │ - └──────┴─────┴──────────┴──────────┘ - - >>> agnostic_is_nan_columns(df_pa) # nan != null for pyarrow + >>> print(agnostic_self_div_is_nan(df_pl)) + shape: (3, 3) + ┌──────┬─────────┬────────────────┐ + │ orig ┆ divided ┆ divided_is_nan │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ bool │ + ╞══════╪═════════╪════════════════╡ + │ 0.0 ┆ NaN ┆ true │ + │ null ┆ null ┆ null │ + │ 2.0 ┆ 1.0 ┆ false │ + └──────┴─────────┴────────────────┘ + + >>> print(agnostic_self_div_is_nan(df_pa)) pyarrow.Table - a: int64 - b: double - a_is_nan: bool - b_is_nan: bool + orig: double + divided: double + divided_is_nan: bool ---- - a: [[2,4,null,3,5]] - b: [[2,4,nan,3,5]] - a_is_nan: [[false,false,null,false,false]] - b_is_nan: [[false,false,true,false,false]] + orig: [[0,null,2]] + divided: [[nan,null,1]] + divided_is_nan: [[true,null,false]] + """ return self.__class__(lambda plx: self._to_compliant_expr(plx).is_nan()) diff --git a/narwhals/series.py b/narwhals/series.py index fe691ed08..3c874c916 100644 --- a/narwhals/series.py +++ b/narwhals/series.py @@ -2033,7 +2033,13 @@ def is_null(self) -> Self: return self._from_compliant_series(self._compliant_series.is_null()) def is_nan(self) -> Self: - """Returns a boolean Series indicating which values are null. + """Returns a boolean Series indicating which values are NaN. + + Returns: + A new Series. + + Raises: + narwhals.InvalidOperationError for non-Float64 dtypes. Notes: pandas and Polars handle NaN values differently. Polars distinguishes @@ -2045,39 +2051,37 @@ def is_nan(self) -> Self: >>> import pyarrow as pa >>> import narwhals as nw >>> from narwhals.typing import IntoSeriesT - >>> s = [1.0, 2.0, float("nan")] - >>> s_pd = pd.Series(s, dtype="float64") - >>> s_pl = pl.Series(s) - >>> s_pa = pa.chunked_array([s], type=pa.float64()) - We define a series-agnostic function: + >>> data = [0.0, None, 2.0] + >>> s_pd = pd.Series(data, dtype="Float64") + >>> s_pl = pl.Series(data) + >>> s_pa = pa.chunked_array([data], type=pa.float64()) - >>> def agnostic_is_nan_series(s_native: IntoSeriesT) -> IntoSeriesT: + >>> def agnostic_self_div_is_nan(s_native: IntoSeriesT) -> IntoSeriesT: ... s = nw.from_native(s_native, series_only=True) ... return s.is_nan().to_native() - We can then pass either pandas or Polars to `agnostic_is_nan_series`: - - >>> agnostic_is_nan_series(s_pd) + >>> print(agnostic_self_div_is_nan(s_pd)) 0 False - 1 False - 2 True - dtype: bool - >>> agnostic_is_nan_series(s_pl) # doctest: +NORMALIZE_WHITESPACE + 1 + 2 False + dtype: boolean + + >>> print(agnostic_self_div_is_nan(s_pl)) # doctest: +NORMALIZE_WHITESPACE shape: (3,) Series: '' [bool] [ - false - false - true + false + null + false ] - >>> agnostic_is_nan_series(s_pa) # doctest: +NORMALIZE_WHITESPACE - + + >>> print(agnostic_self_div_is_nan(s_pa)) # doctest: +NORMALIZE_WHITESPACE [ [ false, - false, - true + null, + false ] ] """ From a2b225b8bea528e494d1768a8e56047755388bfc Mon Sep 17 00:00:00 2001 From: Cameron Riddell Date: Mon, 30 Dec 2024 11:05:28 -0800 Subject: [PATCH 08/13] is_nan only works on numeric dtypes all backends --- narwhals/_arrow/series.py | 3 +-- narwhals/_dask/expr.py | 4 ++-- narwhals/_pandas_like/series.py | 2 +- narwhals/_polars/expr.py | 9 +++++++++ narwhals/_polars/series.py | 13 +++++++++++++ 5 files changed, 26 insertions(+), 5 deletions(-) diff --git a/narwhals/_arrow/series.py b/narwhals/_arrow/series.py index 92f56df58..0549cd1b8 100644 --- a/narwhals/_arrow/series.py +++ b/narwhals/_arrow/series.py @@ -523,8 +523,7 @@ def is_null(self: Self) -> Self: def is_nan(self: Self) -> Self: import pyarrow.compute as pc - ser = self._native_series - return self._from_native_series(pc.is_nan(ser)) + return self._from_native_series(pc.is_nan(self._native_series)) def cast(self: Self, dtype: DType) -> Self: import pyarrow.compute as pc diff --git a/narwhals/_dask/expr.py b/narwhals/_dask/expr.py index c6238fc15..63cc4e503 100644 --- a/narwhals/_dask/expr.py +++ b/narwhals/_dask/expr.py @@ -705,10 +705,10 @@ def is_null(self: Self) -> Self: def is_nan(self: Self) -> Self: def func(_input: dask_expr.Series) -> dask_expr.Series: - dtype = native_to_narwhals_dtype(_input, self._version, Implementation.DASK) + dtype = native_to_narwhals_dtype(_input, self._version, self._implementation) if dtype.is_numeric(): return _input != _input # noqa: PLR0124 - msg = f"`is_nan` is not supported for dtype {dtype}" + msg = f"`.is_nan` only supported for numeric dtypes and not {dtype}, did you mean `.is_null`?" raise InvalidOperationError(msg) return self._from_call( diff --git a/narwhals/_pandas_like/series.py b/narwhals/_pandas_like/series.py index 23a470e7b..33a8ce332 100644 --- a/narwhals/_pandas_like/series.py +++ b/narwhals/_pandas_like/series.py @@ -666,7 +666,7 @@ def is_nan(self) -> PandasLikeSeries: ser = self._native_series if self.dtype.is_numeric(): return self._from_native_series(ser != ser) # noqa: PLR0124 - msg = f"`is_nan` is not supported for dtype {self.dtype}" + msg = f"`.is_nan` only supported for numeric dtype and not {self.dtype}, did you mean `.is_null`?" raise InvalidOperationError(msg) def fill_null( diff --git a/narwhals/_polars/expr.py b/narwhals/_polars/expr.py index 6c50c99af..230ce37d5 100644 --- a/narwhals/_polars/expr.py +++ b/narwhals/_polars/expr.py @@ -79,6 +79,15 @@ def ewm_mean( ) return self._from_native_expr(native_expr) + def is_nan(self: Self) -> Self: + if self._backend_version < (1, 18): # pragma: no cover + import polars as pl + + return self._from_native_expr( + pl.when(self._native_expr.is_not_null()).then(self._native_expr.is_nan()) + ) + return self._from_native_expr(self._native_expr.is_nan()) + def rolling_var( self: Self, window_size: int, diff --git a/narwhals/_polars/series.py b/narwhals/_polars/series.py index d7088bb0b..30cd90fd5 100644 --- a/narwhals/_polars/series.py +++ b/narwhals/_polars/series.py @@ -220,6 +220,19 @@ def __rpow__(self: Self, other: PolarsSeries | Any) -> Self: def __invert__(self: Self) -> Self: return self._from_native_series(self._native_series.__invert__()) + def is_nan(self: Self) -> Self: + import polars as pl + + native = self._native_series + + if self._backend_version < (1, 18): # pragma: no cover + return self._from_native_series( + pl.select(pl.when(native.is_not_null()).then(native.is_nan()))[ + native.name + ] + ) + return self._from_native_series(native.is_nan()) + def median(self: Self) -> Any: from narwhals.exceptions import InvalidOperationError From 0b580808023c4596cb7fb565e4c283400477bd0f Mon Sep 17 00:00:00 2001 From: Cameron Riddell Date: Mon, 30 Dec 2024 11:13:48 -0800 Subject: [PATCH 09/13] enh testing for is_nan --- tests/expr_and_series/is_nan_test.py | 106 +++++++++++++++++++++------ 1 file changed, 85 insertions(+), 21 deletions(-) diff --git a/tests/expr_and_series/is_nan_test.py b/tests/expr_and_series/is_nan_test.py index 240d4f52a..b047bcb7e 100644 --- a/tests/expr_and_series/is_nan_test.py +++ b/tests/expr_and_series/is_nan_test.py @@ -1,9 +1,10 @@ from __future__ import annotations +from typing import Any + import pytest import narwhals.stable.v1 as nw -from narwhals.exceptions import InvalidOperationError from tests.conftest import dask_lazy_p2_constructor from tests.conftest import pandas_constructor from tests.utils import Constructor @@ -14,41 +15,104 @@ def test_nan(constructor: Constructor) -> None: - data_na = {"a": [0, 1, None]} - df = nw.from_native(constructor(data_na)).select(nw.col("a") / nw.col("a")) - result = df.select(nw.col("a").is_nan()) + data_na = {"int": [0, 1, None]} + df = nw.from_native(constructor(data_na)).with_columns( + float=nw.col("int").cast(nw.Float64), float_na=nw.col("int") / nw.col("int") + ) + result = df.select( + int=nw.col("int").is_nan(), + float=nw.col("float").is_nan(), + float_na=nw.col("float_na").is_nan(), + ) + + expected: dict[str, list[Any]] if any(constructor is c for c in NON_NULLABLE_CONSTRUCTORS): # Null values are coerced to NaN for non-nullable datatypes - expected = {"a": [True, False, True]} + expected = { + "int": [False, False, True], + "float": [False, False, True], + "float_na": [True, False, True], + } else: # Null are preserved and should be differentiated for nullable datatypes - expected = {"a": [True, False, None]} # type: ignore[list-item] + expected = { + "int": [False, False, None], + "float": [False, False, None], + "float_na": [True, False, None], + } assert_equal_data(result, expected) def test_nan_series(constructor_eager: ConstructorEager) -> None: - data_na = {"a": [0, 1, None]} - df = nw.from_native(constructor_eager(data_na), eager_only=True).select( - nw.col("a") / nw.col("a") + data_na = {"int": [0, 1, None]} + df = nw.from_native(constructor_eager(data_na), eager_only=True).with_columns( + float=nw.col("int").cast(nw.Float64), float_na=nw.col("int") / nw.col("int") ) - result = {"a": df["a"].is_nan()} + + result = { + "int": df["int"].is_nan(), + "float": df["float"].is_nan(), + "float_na": df["float_na"].is_nan(), + } + expected: dict[str, list[Any]] if any(constructor_eager is c for c in NON_NULLABLE_CONSTRUCTORS): # Null values are coerced to NaN for non-nullable datatypes - expected = {"a": [True, False, True]} + expected = { + "int": [False, False, True], + "float": [False, False, True], + "float_na": [True, False, True], + } else: - # Null are preserved for nullable datatypes - expected = {"a": [True, False, None]} # type: ignore[list-item] + # Null are preserved and should be differentiated for nullable datatypes + expected = { + "int": [False, False, None], + "float": [False, False, None], + "float_na": [True, False, None], + } assert_equal_data(result, expected) -def test_nan_non_float() -> None: - data = {"a": ["0", "1"]} - pd_df = nw.from_native(pandas_constructor(data)) - with pytest.raises(InvalidOperationError, match="not supported"): - pd_df.select(nw.col("a").is_nan()) +def test_nan_non_float(constructor: Constructor) -> None: + from polars.exceptions import InvalidOperationError as PlInvalidOperationError + from pyarrow.lib import ArrowNotImplementedError + + from narwhals.exceptions import InvalidOperationError as NwInvalidOperationError + + data = {"a": ["x", "y"]} + df = nw.from_native(constructor(data)) + + if "polars" in str(constructor): + with pytest.raises(PlInvalidOperationError): + df.select(nw.col("a").is_nan()).lazy().collect() + + elif "dask" in str(constructor) or "pandas" in str(constructor): + with pytest.raises(NwInvalidOperationError): + df.select(nw.col("a").is_nan()) + + elif "pyarrow" in str(constructor): + with pytest.raises(ArrowNotImplementedError): + df.select(nw.col("a").is_nan()) + + +def test_nan_non_float_series(constructor_eager: ConstructorEager) -> None: + from polars.exceptions import InvalidOperationError as PlInvalidOperationError + from pyarrow.lib import ArrowNotImplementedError + + from narwhals.exceptions import InvalidOperationError as NwInvalidOperationError + + data = {"a": ["x", "y"]} + df = nw.from_native(constructor_eager(data), eager_only=True) + + if "polars" in str(constructor_eager): + with pytest.raises(PlInvalidOperationError): + df["a"].is_nan() + + elif "dask" in str(constructor_eager) or "pandas" in str(constructor_eager): + with pytest.raises(NwInvalidOperationError): + df["a"].is_nan() - dd_df = nw.from_native(dask_lazy_p2_constructor(data)) - with pytest.raises(InvalidOperationError, match="not supported"): - dd_df.select(nw.col("a").is_nan()) + elif "pyarrow" in str(constructor_eager): + with pytest.raises(ArrowNotImplementedError): + df["a"].is_nan() From b74e65cae28972a03bfaa190a9b430d42c5da388 Mon Sep 17 00:00:00 2001 From: Cameron Riddell Date: Mon, 30 Dec 2024 12:45:01 -0800 Subject: [PATCH 10/13] enh is_nan documentation - link out to pandas_like_concepts/null_handling - remove stale raises - fix returns for Series to be more specific --- narwhals/expr.py | 10 ++++------ narwhals/series.py | 10 ++++------ 2 files changed, 8 insertions(+), 12 deletions(-) diff --git a/narwhals/expr.py b/narwhals/expr.py index 2c9e5b072..68a459c0e 100644 --- a/narwhals/expr.py +++ b/narwhals/expr.py @@ -1926,17 +1926,15 @@ def is_null(self) -> Self: return self.__class__(lambda plx: self._to_compliant_expr(plx).is_null()) def is_nan(self) -> Self: - """Returns a boolean Series indicating which values are NaN. + """Indicate which values are NaN. Returns: A new expression. - Raises: - narwhals.InvalidOperationError for non-Float64 dtypes. - Notes: - pandas, Polars and PyArrow handle null values differently. Polars and PyArrow - distinguish between NaN and Null, whereas pandas doesn't. + pandas handles null values differently from Polars and PyArrow. + See [null_handling](../pandas_like_concepts/null_handling.md/) + for reference. Examples: >>> import pandas as pd diff --git a/narwhals/series.py b/narwhals/series.py index 3c874c916..111b789f0 100644 --- a/narwhals/series.py +++ b/narwhals/series.py @@ -2036,14 +2036,12 @@ def is_nan(self) -> Self: """Returns a boolean Series indicating which values are NaN. Returns: - A new Series. - - Raises: - narwhals.InvalidOperationError for non-Float64 dtypes. + A boolean Series indicating which values are NaN. Notes: - pandas and Polars handle NaN values differently. Polars distinguishes - between NaN and Null, whereas pandas doesn't. + pandas handles null values differently from Polars and PyArrow. + See [null_handling](../pandas_like_concepts/null_handling.md/) + for reference. Examples: >>> import pandas as pd From 266894b96b96e0cb5810cc7511175c3e4515a817 Mon Sep 17 00:00:00 2001 From: Cameron Riddell Date: Thu, 2 Jan 2025 07:43:50 -0800 Subject: [PATCH 11/13] add modin to is_nan tests --- tests/expr_and_series/is_nan_test.py | 30 +++++++++++----------------- 1 file changed, 12 insertions(+), 18 deletions(-) diff --git a/tests/expr_and_series/is_nan_test.py b/tests/expr_and_series/is_nan_test.py index b047bcb7e..323dcc890 100644 --- a/tests/expr_and_series/is_nan_test.py +++ b/tests/expr_and_series/is_nan_test.py @@ -83,17 +83,14 @@ def test_nan_non_float(constructor: Constructor) -> None: data = {"a": ["x", "y"]} df = nw.from_native(constructor(data)) + exc = NwInvalidOperationError if "polars" in str(constructor): - with pytest.raises(PlInvalidOperationError): - df.select(nw.col("a").is_nan()).lazy().collect() + exc = PlInvalidOperationError + elif "pyarrow_table" in str(constructor): + exc = ArrowNotImplementedError - elif "dask" in str(constructor) or "pandas" in str(constructor): - with pytest.raises(NwInvalidOperationError): - df.select(nw.col("a").is_nan()) - - elif "pyarrow" in str(constructor): - with pytest.raises(ArrowNotImplementedError): - df.select(nw.col("a").is_nan()) + with pytest.raises(exc): + df.select(nw.col("a").is_nan()).lazy().collect() def test_nan_non_float_series(constructor_eager: ConstructorEager) -> None: @@ -105,14 +102,11 @@ def test_nan_non_float_series(constructor_eager: ConstructorEager) -> None: data = {"a": ["x", "y"]} df = nw.from_native(constructor_eager(data), eager_only=True) + exc = NwInvalidOperationError if "polars" in str(constructor_eager): - with pytest.raises(PlInvalidOperationError): - df["a"].is_nan() - - elif "dask" in str(constructor_eager) or "pandas" in str(constructor_eager): - with pytest.raises(NwInvalidOperationError): - df["a"].is_nan() + exc = PlInvalidOperationError + elif "pyarrow_table" in str(constructor_eager): + exc = ArrowNotImplementedError - elif "pyarrow" in str(constructor_eager): - with pytest.raises(ArrowNotImplementedError): - df["a"].is_nan() + with pytest.raises(exc): + df["a"].is_nan() From 2716ed9700ad1023b1df676c20a847a3327b58e8 Mon Sep 17 00:00:00 2001 From: Cameron Riddell Date: Thu, 2 Jan 2025 08:04:47 -0800 Subject: [PATCH 12/13] fix modin in is_nan tests --- tests/expr_and_series/is_nan_test.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/tests/expr_and_series/is_nan_test.py b/tests/expr_and_series/is_nan_test.py index 323dcc890..d98b39e85 100644 --- a/tests/expr_and_series/is_nan_test.py +++ b/tests/expr_and_series/is_nan_test.py @@ -5,13 +5,20 @@ import pytest import narwhals.stable.v1 as nw +from tests.conftest import dask_lazy_p1_constructor from tests.conftest import dask_lazy_p2_constructor +from tests.conftest import modin_constructor from tests.conftest import pandas_constructor from tests.utils import Constructor from tests.utils import ConstructorEager from tests.utils import assert_equal_data -NON_NULLABLE_CONSTRUCTORS = [pandas_constructor, dask_lazy_p2_constructor] +NON_NULLABLE_CONSTRUCTORS = [ + pandas_constructor, + dask_lazy_p1_constructor, + dask_lazy_p2_constructor, + modin_constructor, +] def test_nan(constructor: Constructor) -> None: From a7ea65ab9787a776cf6dec5edf9df2419b3af854 Mon Sep 17 00:00:00 2001 From: Marco Edward Gorelli Date: Fri, 3 Jan 2025 19:49:04 +0000 Subject: [PATCH 13/13] Update narwhals/_dask/expr.py --- narwhals/_dask/expr.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/narwhals/_dask/expr.py b/narwhals/_dask/expr.py index d3c00eeae..6ae73a56a 100644 --- a/narwhals/_dask/expr.py +++ b/narwhals/_dask/expr.py @@ -718,7 +718,7 @@ def func(_input: dask_expr.Series) -> dask_expr.Series: return self._from_call( func, "is_null", - returns_scalar=False, + returns_scalar=self._returns_scalar, ) def len(self: Self) -> Self: