Skip to content

Commit

Permalink
Merge remote-tracking branch 'upstream/main' into when
Browse files Browse the repository at this point in the history
  • Loading branch information
DeaMariaLeon committed May 20, 2024
2 parents 8a368dd + f18c533 commit 9d85baa
Show file tree
Hide file tree
Showing 13 changed files with 800 additions and 1 deletion.
5 changes: 5 additions & 0 deletions docs/api-reference/expressions.md
Original file line number Diff line number Diff line change
Expand Up @@ -14,11 +14,16 @@
- fill_null
- filter
- is_between
- is_duplicated
- is_first_distinct
- is_in
- is_last_distinct
- is_null
- is_unique
- max
- mean
- min
- null_count
- n_unique
- over
- unique
Expand Down
8 changes: 8 additions & 0 deletions docs/api-reference/series.md
Original file line number Diff line number Diff line change
Expand Up @@ -15,12 +15,19 @@
- fill_null
- filter
- is_between
- is_duplicated
- is_empty
- is_first_distinct
- is_in
- is_last_distinct
- is_null
- is_sorted
- is_unique
- max
- mean
- min
- name
- null_count
- n_unique
- sample
- shape
Expand All @@ -32,5 +39,6 @@
- to_numpy
- to_pandas
- unique
- value_counts
show_source: false
show_bases: false
15 changes: 15 additions & 0 deletions narwhals/_pandas_like/expr.py
Original file line number Diff line number Diff line change
Expand Up @@ -253,6 +253,21 @@ def func(df: PandasDataFrame) -> list[PandasSeries]:
implementation=self._implementation,
)

def is_duplicated(self) -> Self:
    """Build the `is_duplicated` expression.

    Delegates to `register_expression_call`, passing this expression and the
    method name "is_duplicated".
    NOTE(review): presumably the helper applies the series-level method of
    the same name to each column -- confirm against its definition.
    """
    method_name = "is_duplicated"
    return register_expression_call(self, method_name)

def is_unique(self) -> Self:
    """Build the `is_unique` expression.

    Delegates to `register_expression_call`, passing this expression and the
    method name "is_unique".
    NOTE(review): presumably the helper applies the series-level method of
    the same name to each column -- confirm against its definition.
    """
    method_name = "is_unique"
    return register_expression_call(self, method_name)

def null_count(self) -> Self:
    """Build the `null_count` expression.

    Delegates to `register_expression_call`, passing this expression and the
    method name "null_count".
    NOTE(review): presumably the helper applies the series-level method of
    the same name to each column -- confirm against its definition.
    """
    method_name = "null_count"
    return register_expression_call(self, method_name)

def is_first_distinct(self) -> Self:
    """Build the `is_first_distinct` expression.

    Delegates to `register_expression_call`, passing this expression and the
    method name "is_first_distinct".
    NOTE(review): presumably the helper applies the series-level method of
    the same name to each column -- confirm against its definition.
    """
    method_name = "is_first_distinct"
    return register_expression_call(self, method_name)

def is_last_distinct(self) -> Self:
    """Build the `is_last_distinct` expression.

    Delegates to `register_expression_call`, passing this expression and the
    method name "is_last_distinct".
    NOTE(review): presumably the helper applies the series-level method of
    the same name to each column -- confirm against its definition.
    """
    method_name = "is_last_distinct"
    return register_expression_call(self, method_name)

@property
def str(self) -> PandasExprStringNamespace:
    """Return a `PandasExprStringNamespace` wrapping this expression."""
    namespace = PandasExprStringNamespace(self)
    return namespace
Expand Down
44 changes: 44 additions & 0 deletions narwhals/_pandas_like/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -439,6 +439,50 @@ def to_pandas(self) -> Any:
msg = f"Unknown implementation: {self._implementation}" # pragma: no cover
raise AssertionError(msg)

# --- descriptive ---
def is_duplicated(self: Self) -> Self:
    """Return a boolean mask flagging values that occur more than once.

    `keep=False` is passed to the underlying `duplicated` call, so every
    occurrence of a repeated value is flagged, not only the later ones.
    """
    repeated_mask = self._series.duplicated(keep=False)
    return self._from_series(repeated_mask)

def is_empty(self: Self) -> bool:
    """Report whether the wrapped series is empty (its `.empty` attribute)."""
    has_no_elements = self._series.empty
    return has_no_elements  # type: ignore[no-any-return]

def is_unique(self: Self) -> Self:
    """Return a boolean mask flagging values that occur exactly once.

    This is the element-wise negation of `duplicated(keep=False)`, so no
    occurrence of a repeated value is flagged.
    """
    repeated_mask = self._series.duplicated(keep=False)
    return self._from_series(~repeated_mask)

def null_count(self: Self) -> int:
    """Count the null entries in the wrapped series.

    Returns:
        The number of elements for which `isnull()` is true, as a built-in
        `int`.
    """
    # `.sum()` on the boolean mask yields a backend scalar (e.g. a numpy
    # integer for pandas); coerce it so the declared `int` return type holds.
    return int(self._series.isnull().sum())

def is_first_distinct(self: Self) -> Self:
    """Return a boolean mask flagging the first occurrence of each value.

    `duplicated(keep="first")` marks every repeat after the first occurrence;
    negating it leaves only the first occurrences flagged.
    """
    later_repeats = self._series.duplicated(keep="first")
    return self._from_series(~later_repeats)

def is_last_distinct(self: Self) -> Self:
    """Return a boolean mask flagging the last occurrence of each value.

    `duplicated(keep="last")` marks every repeat before the last occurrence;
    negating it leaves only the last occurrences flagged.
    """
    earlier_repeats = self._series.duplicated(keep="last")
    return self._from_series(~earlier_repeats)

def is_sorted(self: Self, *, descending: bool = False) -> bool:
    """Report whether the wrapped series is sorted.

    Arguments:
        descending: When True, check `is_monotonic_decreasing`; otherwise
            check `is_monotonic_increasing`.

    Raises:
        TypeError: If `descending` is not a `bool`.
    """
    if isinstance(descending, bool):
        series = self._series
        # Only the attribute for the requested direction is evaluated.
        return (  # type: ignore[no-any-return]
            series.is_monotonic_decreasing
            if descending
            else series.is_monotonic_increasing
        )

    msg = f"argument 'descending' should be boolean, found {type(descending)}"
    raise TypeError(msg)

def value_counts(self: Self, *, sort: bool = False, parallel: bool = False) -> Any:
    """Count the occurrences of each distinct value, nulls included.

    Arguments:
        sort: If True, order the result by count from most to least frequent,
            matching Polars' `value_counts(sort=True)`. If False, the row
            order is whatever the underlying `value_counts(sort=False)`
            produces.
        parallel: Unused; exists only for signature compatibility.

    Returns:
        A `PandasDataFrame` with two columns: the distinct values (named
        after the series, or "index" when the series has no name) and
        "count".
    """
    from narwhals._pandas_like.dataframe import PandasDataFrame

    series_name = self._series.name
    value_column = "index" if series_name is None else series_name
    # dropna=False keeps a row for null values instead of discarding them.
    counts = self._series.value_counts(dropna=False, sort=False).reset_index()
    counts.columns = [value_column, "count"]
    if sort:
        # Sort on the counts (descending), not on the values themselves.
        counts = counts.sort_values("count", ascending=False)

    return PandasDataFrame(
        counts,
        implementation=self._implementation,
    )

@property
def str(self) -> PandasSeriesStringNamespace:
    """Return a `PandasSeriesStringNamespace` wrapping this series."""
    namespace = PandasSeriesStringNamespace(self)
    return namespace
Expand Down
208 changes: 208 additions & 0 deletions narwhals/expression.py
Original file line number Diff line number Diff line change
Expand Up @@ -1127,6 +1127,214 @@ def over(self, *keys: str | Iterable[str]) -> Expr:
"""
return self.__class__(lambda plx: self._call(plx).over(flatten(keys)))

def is_duplicated(self) -> Expr:
    r"""
    Flag every value that appears more than once.

    All occurrences of a repeated value are marked, as the example below
    shows.

    Examples:
        >>> import narwhals as nw
        >>> import pandas as pd
        >>> import polars as pl
        >>> data = {"a": [1, 2, 3, 1], "b": ["a", "a", "b", "c"]}
        >>> df_pd = pd.DataFrame(data)
        >>> df_pl = pl.DataFrame(data)

        Let's define a dataframe-agnostic function:

        >>> def func(df_any):
        ...     df = nw.from_native(df_any)
        ...     duplicated = df.select(nw.all().is_duplicated())
        ...     return nw.to_native(duplicated)

        We can then pass either pandas or Polars to `func`:

        >>> func(df_pd)  # doctest: +NORMALIZE_WHITESPACE
               a      b
        0   True   True
        1  False   True
        2  False  False
        3   True  False
        >>> func(df_pl)  # doctest: +NORMALIZE_WHITESPACE
        shape: (4, 2)
        ┌───────┬───────┐
        │ a     ┆ b     │
        │ ---   ┆ ---   │
        │ bool  ┆ bool  │
        ╞═══════╪═══════╡
        │ true  ┆ true  │
        │ false ┆ true  │
        │ false ┆ false │
        │ true  ┆ false │
        └───────┴───────┘
    """
    # Wrap the backend call in a new expression of the same class.
    expr_cls = self.__class__
    return expr_cls(lambda plx: self._call(plx).is_duplicated())

def is_unique(self) -> Expr:
    r"""
    Flag every value that appears exactly once.

    No occurrence of a repeated value is marked, as the example below shows.

    Examples:
        >>> import narwhals as nw
        >>> import pandas as pd
        >>> import polars as pl
        >>> data = {"a": [1, 2, 3, 1], "b": ["a", "a", "b", "c"]}
        >>> df_pd = pd.DataFrame(data)
        >>> df_pl = pl.DataFrame(data)

        Let's define a dataframe-agnostic function:

        >>> def func(df_any):
        ...     df = nw.from_native(df_any)
        ...     unique = df.select(nw.all().is_unique())
        ...     return nw.to_native(unique)

        We can then pass either pandas or Polars to `func`:

        >>> func(df_pd)  # doctest: +NORMALIZE_WHITESPACE
               a      b
        0  False  False
        1   True  False
        2   True   True
        3  False   True
        >>> func(df_pl)  # doctest: +NORMALIZE_WHITESPACE
        shape: (4, 2)
        ┌───────┬───────┐
        │ a     ┆ b     │
        │ ---   ┆ ---   │
        │ bool  ┆ bool  │
        ╞═══════╪═══════╡
        │ false ┆ false │
        │ true  ┆ false │
        │ true  ┆ true  │
        │ false ┆ true  │
        └───────┴───────┘
    """
    # Wrap the backend call in a new expression of the same class.
    expr_cls = self.__class__
    return expr_cls(lambda plx: self._call(plx).is_unique())

def null_count(self) -> Expr:
    r"""
    Count the null values in each column.

    Notes:
        pandas and Polars handle null values differently. Polars
        distinguishes between NaN and Null, whereas pandas doesn't.

    Examples:
        >>> import narwhals as nw
        >>> import pandas as pd
        >>> import polars as pl
        >>> data = {"a": [1, 2, None, 1], "b": ["a", None, "b", None]}
        >>> df_pd = pd.DataFrame(data)
        >>> df_pl = pl.DataFrame(data)

        Let's define a dataframe-agnostic function:

        >>> def func(df_any):
        ...     df = nw.from_native(df_any)
        ...     nulls = df.select(nw.all().null_count())
        ...     return nw.to_native(nulls)

        We can then pass either pandas or Polars to `func`:

        >>> func(df_pd)  # doctest: +NORMALIZE_WHITESPACE
           a  b
        0  1  2
        >>> func(df_pl)
        shape: (1, 2)
        ┌─────┬─────┐
        │ a   ┆ b   │
        │ --- ┆ --- │
        │ u32 ┆ u32 │
        ╞═════╪═════╡
        │ 1   ┆ 2   │
        └─────┴─────┘
    """
    # Wrap the backend call in a new expression of the same class.
    expr_cls = self.__class__
    return expr_cls(lambda plx: self._call(plx).null_count())

def is_first_distinct(self) -> Expr:
    r"""
    Flag the first occurrence of each distinct value.

    Later repeats of a value are not marked, as the example below shows.

    Examples:
        >>> import narwhals as nw
        >>> import pandas as pd
        >>> import polars as pl
        >>> data = {"a": [1, 2, 3, 1], "b": ["a", "a", "b", "c"]}
        >>> df_pd = pd.DataFrame(data)
        >>> df_pl = pl.DataFrame(data)

        Let's define a dataframe-agnostic function:

        >>> def func(df_any):
        ...     df = nw.from_native(df_any)
        ...     first_distinct = df.select(nw.all().is_first_distinct())
        ...     return nw.to_native(first_distinct)

        We can then pass either pandas or Polars to `func`:

        >>> func(df_pd)  # doctest: +NORMALIZE_WHITESPACE
               a      b
        0   True   True
        1   True  False
        2   True   True
        3  False   True
        >>> func(df_pl)  # doctest: +NORMALIZE_WHITESPACE
        shape: (4, 2)
        ┌───────┬───────┐
        │ a     ┆ b     │
        │ ---   ┆ ---   │
        │ bool  ┆ bool  │
        ╞═══════╪═══════╡
        │ true  ┆ true  │
        │ true  ┆ false │
        │ true  ┆ true  │
        │ false ┆ true  │
        └───────┴───────┘
    """
    # Wrap the backend call in a new expression of the same class.
    expr_cls = self.__class__
    return expr_cls(lambda plx: self._call(plx).is_first_distinct())

def is_last_distinct(self) -> Expr:
    r"""
    Flag the last occurrence of each distinct value.

    Earlier repeats of a value are not marked, as the example below shows.

    Examples:
        >>> import narwhals as nw
        >>> import pandas as pd
        >>> import polars as pl
        >>> data = {"a": [1, 2, 3, 1], "b": ["a", "a", "b", "c"]}
        >>> df_pd = pd.DataFrame(data)
        >>> df_pl = pl.DataFrame(data)

        Let's define a dataframe-agnostic function:

        >>> def func(df_any):
        ...     df = nw.from_native(df_any)
        ...     last_distinct = df.select(nw.all().is_last_distinct())
        ...     return nw.to_native(last_distinct)

        We can then pass either pandas or Polars to `func`:

        >>> func(df_pd)  # doctest: +NORMALIZE_WHITESPACE
               a      b
        0  False  False
        1   True   True
        2   True   True
        3   True   True
        >>> func(df_pl)  # doctest: +NORMALIZE_WHITESPACE
        shape: (4, 2)
        ┌───────┬───────┐
        │ a     ┆ b     │
        │ ---   ┆ ---   │
        │ bool  ┆ bool  │
        ╞═══════╪═══════╡
        │ false ┆ false │
        │ true  ┆ true  │
        │ true  ┆ true  │
        │ true  ┆ true  │
        └───────┴───────┘
    """
    # Wrap the backend call in a new expression of the same class.
    expr_cls = self.__class__
    return expr_cls(lambda plx: self._call(plx).is_last_distinct())

@property
def str(self) -> ExprStringNamespace:
    """Return an `ExprStringNamespace` wrapping this expression."""
    namespace = ExprStringNamespace(self)
    return namespace
Expand Down
Loading

0 comments on commit 9d85baa

Please sign in to comment.