Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: add is_nan expression & series method #1625

Merged
merged 19 commits into from
Jan 3, 2025
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions docs/api-reference/expr.md
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@
- is_in
- is_last_distinct
- is_nan
- is_null
camriddell marked this conversation as resolved.
Show resolved Hide resolved
- is_unique
- len
- map_batches
Expand Down
1 change: 1 addition & 0 deletions docs/api-reference/series.md
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@
- is_in
- is_last_distinct
- is_nan
- is_null
- is_sorted
- is_unique
- item
Expand Down
3 changes: 3 additions & 0 deletions narwhals/_arrow/expr.py
Original file line number Diff line number Diff line change
Expand Up @@ -312,6 +312,9 @@ def null_count(self: Self) -> Self:
def is_null(self: Self) -> Self:
    """Expression-level `is_null`.

    Passes this expression and the method name "is_null" to
    `reuse_series_implementation` and returns whatever that helper returns.
    """
    return reuse_series_implementation(self, "is_null")

def is_nan(self: Self) -> Self:
    """Expression-level `is_nan`.

    Passes this expression and the method name "is_nan" to
    `reuse_series_implementation` and returns whatever that helper returns.
    """
    return reuse_series_implementation(self, "is_nan")

def is_between(self: Self, lower_bound: Any, upper_bound: Any, closed: str) -> Self:
return reuse_series_implementation(
self, "is_between", lower_bound, upper_bound, closed
Expand Down
6 changes: 6 additions & 0 deletions narwhals/_arrow/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -514,6 +514,12 @@ def is_null(self: Self) -> Self:
ser = self._native_series
return self._from_native_series(ser.is_null())

def is_nan(self: Self) -> Self:
    """Return a new series holding the result of `pyarrow.compute.is_nan`.

    The kernel is applied to the wrapped native series and the result is
    re-wrapped with `_from_native_series`.
    """
    # Imported lazily, inside the method, as in the original implementation.
    import pyarrow.compute as pc

    nan_mask = pc.is_nan(self._native_series)
    return self._from_native_series(nan_mask)

def cast(self: Self, dtype: DType) -> Self:
import pyarrow.compute as pc

Expand Down
19 changes: 19 additions & 0 deletions narwhals/_dask/expr.py
Original file line number Diff line number Diff line change
Expand Up @@ -689,6 +689,25 @@ def is_null(self: Self) -> Self:
returns_scalar=False,
)

def is_nan(self: Self) -> Self:
    """Build an expression flagging which values are NaN.

    Float columns are compared against themselves, since NaN is the only
    value that is not equal to itself. Columns of any other dtype go through
    the `ScalarToSeries(frame=False, ...)` fallback, built on the input's
    index.

    Returns:
        A new expression, registered under the name "is_nan".
    """

    def func(_input: dask_expr.Series) -> dask_expr.Series:
        dtype = native_to_narwhals_dtype(_input, self._version, Implementation.DASK)
        dtypes = import_dtypes_module(self._version)
        # Both float widths can hold NaN; checking only Float64 would report
        # every Float32 NaN as "not NaN".
        is_float = any(
            dtype == float_dtype for float_dtype in (dtypes.Float32, dtypes.Float64)
        )
        if is_float:
            return _input != _input  # noqa: PLR0124

        import dask_expr as dx

        return dx.new_collection(
            dx.expr.ScalarToSeries(frame=False, index=_input.index)
        )

    return self._from_call(
        func,
        # The label must match this method; it was previously "is_null".
        "is_nan",
        returns_scalar=False,
    )

def len(self: Self) -> Self:
return self._from_call(
lambda _input: _input.size,
Expand Down
3 changes: 3 additions & 0 deletions narwhals/_pandas_like/expr.py
Original file line number Diff line number Diff line change
Expand Up @@ -306,6 +306,9 @@ def is_between(
def is_null(self) -> Self:
    """Expression-level `is_null`.

    Passes this expression and the method name "is_null" to
    `reuse_series_implementation` and returns whatever that helper returns.
    """
    return reuse_series_implementation(self, "is_null")

def is_nan(self) -> Self:
    """Expression-level `is_nan`.

    Passes this expression and the method name "is_nan" to
    `reuse_series_implementation` and returns whatever that helper returns.
    """
    return reuse_series_implementation(self, "is_nan")

def fill_null(
self,
value: Any | None = None,
Expand Down
11 changes: 11 additions & 0 deletions narwhals/_pandas_like/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -654,6 +654,17 @@ def is_null(self) -> PandasLikeSeries:
ser = self._native_series
return self._from_native_series(ser.isna())

def is_nan(self) -> PandasLikeSeries:
    """Return a boolean series flagging which values are NaN.

    Float columns are compared against themselves, since NaN is the only
    value that is not equal to itself. Any other dtype yields a series filled
    with `False` that reuses the original index and name.

    Returns:
        A new series wrapping the native result.
    """
    ser = self._native_series
    dtypes = import_dtypes_module(self._version)
    dtype = self.dtype
    # Both float widths can hold NaN; checking only Float64 would report
    # every Float32 NaN as "not NaN".
    is_float = any(
        dtype == float_dtype for float_dtype in (dtypes.Float32, dtypes.Float64)
    )
    if is_float:
        return self._from_native_series(ser != ser)  # noqa: PLR0124
    return self._from_native_series(
        self._implementation.to_native_namespace().Series(
            data=False, index=ser.index, name=ser.name
        )
    )

def fill_null(
self,
value: Any | None = None,
Expand Down
72 changes: 72 additions & 0 deletions narwhals/expr.py
Original file line number Diff line number Diff line change
Expand Up @@ -1921,6 +1921,78 @@ def is_null(self) -> Self:
"""
return self.__class__(lambda plx: self._to_compliant_expr(plx).is_null())

def is_nan(self) -> Self:
    """Indicate which values are NaN.

    Returns:
        A new expression.

    Notes:
        pandas, Polars and PyArrow handle null values differently. Polars and PyArrow
        distinguish between NaN and Null, whereas pandas doesn't.

    Examples:
        >>> import pandas as pd
        >>> import polars as pl
        >>> import pyarrow as pa
        >>> import narwhals as nw
        >>> from narwhals.typing import IntoFrameT
        >>> df_pd = pd.DataFrame(
        ...     {"a": [2, 4, None, 3, 5], "b": [2.0, 4.0, float("nan"), 3.0, 5.0]}
        ... ).astype({"a": "Int64"})
        >>> df_pl = pl.DataFrame(
        ...     {"a": [2, 4, None, 3, 5], "b": [2.0, 4.0, float("nan"), 3.0, 5.0]}
        ... )
        >>> df_pa = pa.table(
        ...     {"a": [2, 4, None, 3, 5], "b": [2.0, 4.0, float("nan"), 3.0, 5.0]}
        ... )

        Let's define a dataframe-agnostic function:

        >>> def my_library_agnostic_function(df_native: IntoFrameT) -> IntoFrameT:
        ...     df = nw.from_native(df_native)
        ...     return df.with_columns(
        ...         a_is_nan=nw.col("a").is_nan(), b_is_nan=nw.col("b").is_nan()
        ...     ).to_native()

        We can then pass any supported library such as pandas, Polars, or PyArrow
        to `my_library_agnostic_function`:

        >>> my_library_agnostic_function(df_pd)
              a    b  a_is_nan  b_is_nan
        0     2  2.0     False     False
        1     4  4.0     False     False
        2  <NA>  NaN     False      True
        3     3  3.0     False     False
        4     5  5.0     False     False

        >>> my_library_agnostic_function(df_pl)  # nan != null for polars
        shape: (5, 4)
        ┌──────┬─────┬──────────┬──────────┐
        │ a    ┆ b   ┆ a_is_nan ┆ b_is_nan │
        │ ---  ┆ --- ┆ ---      ┆ ---      │
        │ i64  ┆ f64 ┆ bool     ┆ bool     │
        ╞══════╪═════╪══════════╪══════════╡
        │ 2    ┆ 2.0 ┆ false    ┆ false    │
        │ 4    ┆ 4.0 ┆ false    ┆ false    │
        │ null ┆ NaN ┆ false    ┆ true     │
        │ 3    ┆ 3.0 ┆ false    ┆ false    │
        │ 5    ┆ 5.0 ┆ false    ┆ false    │
        └──────┴─────┴──────────┴──────────┘

        >>> my_library_agnostic_function(df_pa)  # nan != null for pyarrow
        pyarrow.Table
        a: int64
        b: double
        a_is_nan: bool
        b_is_nan: bool
        ----
        a: [[2,4,null,3,5]]
        b: [[2,4,nan,3,5]]
        a_is_nan: [[false,false,null,false,false]]
        b_is_nan: [[false,false,true,false,false]]
    """
    # Deferred: the compliant expression is only built (and `is_nan` only
    # called on it) when the wrapped callable receives a namespace.
    return self.__class__(lambda plx: self._to_compliant_expr(plx).is_nan())

def arg_true(self) -> Self:
"""Find elements where boolean expression is True.

Expand Down
40 changes: 40 additions & 0 deletions narwhals/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -1915,6 +1915,46 @@ def is_null(self) -> Self:
"""
return self._from_compliant_series(self._compliant_series.is_null())

def is_nan(self) -> Self:
    """Returns a boolean Series indicating which values are NaN.

    Returns:
        A new Series.

    Notes:
        pandas and Polars handle NaN values differently. Polars distinguishes
        between NaN and Null, whereas pandas doesn't.

    Examples:
        >>> import pandas as pd
        >>> import polars as pl
        >>> import narwhals as nw
        >>> from narwhals.typing import IntoSeriesT
        >>> s = [1.0, 2.0, float("nan")]
        >>> s_pd = pd.Series(s, dtype="float64")
        >>> s_pl = pl.Series(s)

        We define a dataframe-agnostic function:

        >>> def my_library_agnostic_function(s_native: IntoSeriesT) -> IntoSeriesT:
        ...     s = nw.from_native(s_native, series_only=True)
        ...     return s.is_nan().to_native()

        We can then pass either pandas or Polars to `my_library_agnostic_function`:

        >>> my_library_agnostic_function(s_pd)
        0    False
        1    False
        2     True
        dtype: bool
        >>> my_library_agnostic_function(s_pl)  # doctest: +NORMALIZE_WHITESPACE
        shape: (3,)
        Series: '' [bool]
        [
           false
           false
           true
        ]
    """
    return self._from_compliant_series(self._compliant_series.is_nan())

def fill_null(
self,
value: Any | None = None,
Expand Down
40 changes: 40 additions & 0 deletions tests/expr_and_series/is_nan_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
from __future__ import annotations

import narwhals.stable.v1 as nw
from tests.conftest import dask_lazy_p2_constructor
from tests.conftest import pandas_constructor
from tests.utils import Constructor
from tests.utils import ConstructorEager
from tests.utils import assert_equal_data

# Constructors for which the tests below expect a null input to come back as
# NaN (so `is_nan` is True for it) rather than being preserved as null.
NON_NULLABLE_CONSTRUCTORS = [pandas_constructor, dask_lazy_p2_constructor]


def test_nan(constructor: Constructor) -> None:
    """Check `Expr.is_nan` on a column built as `a / a` from `[0, 1, None]`."""
    frame = nw.from_native(constructor({"a": [0, 1, None]}))
    self_divided = frame.select(nw.col("a") / nw.col("a"))
    result = self_divided.select(nw.col("a").is_nan())

    null_becomes_nan = any(constructor is c for c in NON_NULLABLE_CONSTRUCTORS)
    if null_becomes_nan:
        # Non-nullable datatypes coerce null values to NaN
        expected = {"a": [True, False, True]}
    else:
        # Nullable datatypes preserve nulls, which must stay distinct from NaN
        expected = {"a": [True, False, None]}  # type: ignore[list-item]

    assert_equal_data(result, expected)


def test_nan_series(constructor_eager: ConstructorEager) -> None:
    """Check `Series.is_nan` on a column built as `a / a` from `[0, 1, None]`."""
    native_frame = constructor_eager({"a": [0, 1, None]})
    self_divided = nw.from_native(native_frame, eager_only=True).select(
        nw.col("a") / nw.col("a")
    )
    nan_mask = self_divided["a"].is_nan()
    result = {"a": nan_mask}

    null_becomes_nan = any(constructor_eager is c for c in NON_NULLABLE_CONSTRUCTORS)
    if null_becomes_nan:
        # Non-nullable datatypes coerce null values to NaN
        expected = {"a": [True, False, True]}
    else:
        # Nullable datatypes preserve nulls
        expected = {"a": [True, False, None]}  # type: ignore[list-item]

    assert_equal_data(result, expected)
Loading