Merge remote-tracking branch 'upstream/main' into when

narwhals-dev · May 20, 2024 · 9d85baa · 9d85baa
2 parents 8a368dd + f18c533
commit 9d85baa
Show file tree

Hide file tree

Showing 13 changed files with 800 additions and 1 deletion.
diff --git a/docs/api-reference/expressions.md b/docs/api-reference/expressions.md
@@ -14,11 +14,16 @@
         - fill_null
         - filter
         - is_between
+        - is_duplicated
+        - is_first_distinct
         - is_in
+        - is_last_distinct
         - is_null
+        - is_unique
         - max
         - mean
         - min
+        - null_count
         - n_unique
         - over
         - unique

diff --git a/docs/api-reference/series.md b/docs/api-reference/series.md
@@ -15,12 +15,19 @@
         - fill_null
         - filter
         - is_between
+        - is_duplicated
+        - is_empty
+        - is_first_distinct
         - is_in
+        - is_last_distinct
         - is_null
+        - is_sorted
+        - is_unique
         - max
         - mean
         - min
         - name
+        - null_count
         - n_unique
         - sample
         - shape
@@ -32,5 +39,6 @@
         - to_numpy
         - to_pandas
         - unique
+        - value_counts
       show_source: false
       show_bases: false
diff --git a/narwhals/_pandas_like/expr.py b/narwhals/_pandas_like/expr.py
@@ -253,6 +253,21 @@ def func(df: PandasDataFrame) -> list[PandasSeries]:
             implementation=self._implementation,
         )
 
+    def is_duplicated(self) -> Self:
+        """Register an `is_duplicated` call on this expression."""
+        registered = register_expression_call(self, "is_duplicated")
+        return registered
+
+    def is_unique(self) -> Self:
+        """Register an `is_unique` call on this expression."""
+        registered = register_expression_call(self, "is_unique")
+        return registered
+
+    def null_count(self) -> Self:
+        """Register a `null_count` call on this expression."""
+        registered = register_expression_call(self, "null_count")
+        return registered
+
+    def is_first_distinct(self) -> Self:
+        """Register an `is_first_distinct` call on this expression."""
+        registered = register_expression_call(self, "is_first_distinct")
+        return registered
+
+    def is_last_distinct(self) -> Self:
+        """Register an `is_last_distinct` call on this expression."""
+        registered = register_expression_call(self, "is_last_distinct")
+        return registered
+
     @property
     def str(self) -> PandasExprStringNamespace:
         return PandasExprStringNamespace(self)

diff --git a/narwhals/_pandas_like/series.py b/narwhals/_pandas_like/series.py
@@ -439,6 +439,50 @@ def to_pandas(self) -> Any:
         msg = f"Unknown implementation: {self._implementation}"  # pragma: no cover
         raise AssertionError(msg)
 
+    # --- descriptive ---
+    def is_duplicated(self: Self) -> Self:
+        """Return a mask that is True for every value occurring more than once.
+
+        All occurrences of a repeated value are flagged (`keep=False`), not
+        only the later ones.
+        """
+        duplicate_mask = self._series.duplicated(keep=False)
+        return self._from_series(duplicate_mask)
+
+    def is_empty(self: Self) -> bool:
+        """Return the native series' `empty` attribute."""
+        native = self._series
+        return native.empty  # type: ignore[no-any-return]
+
+    def is_unique(self: Self) -> Self:
+        """Return a mask that is True for every value occurring exactly once.
+
+        Computed as the negation of the `keep=False` duplicate mask.
+        """
+        duplicate_mask = self._series.duplicated(keep=False)
+        return self._from_series(~duplicate_mask)
+
+    def null_count(self: Self) -> int:
+        """Return how many entries the native series reports as null (`isnull`)."""
+        null_mask = self._series.isnull()
+        return null_mask.sum()  # type: ignore[no-any-return]
+
+    def is_first_distinct(self: Self) -> Self:
+        """Return a mask that is True at the first occurrence of each distinct value.
+
+        Computed as the negation of the `keep="first"` duplicate mask.
+        """
+        later_repeats = self._series.duplicated(keep="first")
+        return self._from_series(~later_repeats)
+
+    def is_last_distinct(self: Self) -> Self:
+        """Return a mask that is True at the last occurrence of each distinct value.
+
+        Computed as the negation of the `keep="last"` duplicate mask.
+        """
+        earlier_repeats = self._series.duplicated(keep="last")
+        return self._from_series(~earlier_repeats)
+
+    def is_sorted(self: Self, *, descending: bool = False) -> bool:
+        """Check whether the values are monotonically ordered.
+
+        Arguments:
+            descending: When True, read the native `is_monotonic_decreasing`
+                attribute; otherwise read `is_monotonic_increasing`.
+
+        Raises:
+            TypeError: If `descending` is not a `bool`.
+        """
+        if not isinstance(descending, bool):
+            msg = f"argument 'descending' should be boolean, found {type(descending)}"
+            raise TypeError(msg)
+
+        native = self._series
+        return (  # type: ignore[no-any-return]
+            native.is_monotonic_decreasing
+            if descending
+            else native.is_monotonic_increasing
+        )
+
+    def value_counts(self: Self, *, sort: bool = False, parallel: bool = False) -> Any:
+        """Count the occurrences of each distinct value, nulls included.
+
+        Arguments:
+            sort: When True, order the rows by count, most frequent first
+                (the meaning of `sort=True` in Polars' `value_counts`).
+                When False, rows keep the order the native library returns.
+            parallel: Unused; exists only for signature compatibility.
+
+        Returns:
+            A `PandasDataFrame` with two columns: the distinct values (named
+            after the series, or "index" when the series is unnamed) and
+            "count".
+        """
+        from narwhals._pandas_like.dataframe import PandasDataFrame
+
+        value_name = "index" if self._series.name is None else self._series.name
+        # `dropna=False` keeps a row for nulls; sorting is done explicitly below.
+        counts = self._series.value_counts(dropna=False, sort=False).reset_index()
+        # Overwrite the labels so the result does not depend on how the
+        # native library names the columns after `reset_index`.
+        counts.columns = [value_name, "count"]
+        if sort:
+            # Sort by frequency (descending), not by the value column.
+            counts = counts.sort_values("count", ascending=False)
+
+        return PandasDataFrame(
+            counts,
+            implementation=self._implementation,
+        )
+
     @property
     def str(self) -> PandasSeriesStringNamespace:
         return PandasSeriesStringNamespace(self)

diff --git a/narwhals/expression.py b/narwhals/expression.py
@@ -1127,6 +1127,214 @@ def over(self, *keys: str | Iterable[str]) -> Expr:
         """
         return self.__class__(lambda plx: self._call(plx).over(flatten(keys)))
 
+    def is_duplicated(self) -> Expr:
+        r"""
+        Return a boolean mask that is true for every value occurring more than once.
+
+        Examples:
+            >>> import narwhals as nw
+            >>> import pandas as pd
+            >>> import polars as pl
+            >>> data = {"a": [1, 2, 3, 1], "b": ["a", "a", "b", "c"]}
+            >>> df_pd = pd.DataFrame(data)
+            >>> df_pl = pl.DataFrame(data)
+
+            Let's define a dataframe-agnostic function:
+
+            >>> def func(df_any):
+            ...     df = nw.from_native(df_any)
+            ...     duplicated = df.select(nw.all().is_duplicated())
+            ...     return nw.to_native(duplicated)
+
+            We can then pass either pandas or Polars to `func`:
+
+            >>> func(df_pd)  # doctest: +NORMALIZE_WHITESPACE
+                   a      b
+            0   True   True
+            1  False   True
+            2  False  False
+            3   True  False
+            >>> func(df_pl)  # doctest: +NORMALIZE_WHITESPACE
+            shape: (4, 2)
+            ┌───────┬───────┐
+            │ a     ┆ b     │
+            │ ---   ┆ ---   │
+            │ bool  ┆ bool  │
+            ╞═══════╪═══════╡
+            │ true  ┆ true  │
+            │ false ┆ true  │
+            │ false ┆ false │
+            │ true  ┆ false │
+            └───────┴───────┘
+        """
+        return type(self)(
+            lambda plx: self._call(plx).is_duplicated(),
+        )
+
+    def is_unique(self) -> Expr:
+        r"""
+        Return a boolean mask that is true for every value occurring exactly once.
+
+        Examples:
+            >>> import narwhals as nw
+            >>> import pandas as pd
+            >>> import polars as pl
+            >>> data = {"a": [1, 2, 3, 1], "b": ["a", "a", "b", "c"]}
+            >>> df_pd = pd.DataFrame(data)
+            >>> df_pl = pl.DataFrame(data)
+
+            Let's define a dataframe-agnostic function:
+
+            >>> def func(df_any):
+            ...     df = nw.from_native(df_any)
+            ...     unique = df.select(nw.all().is_unique())
+            ...     return nw.to_native(unique)
+
+            We can then pass either pandas or Polars to `func`:
+
+            >>> func(df_pd)  # doctest: +NORMALIZE_WHITESPACE
+                   a      b
+            0  False  False
+            1   True  False
+            2   True   True
+            3  False   True
+            >>> func(df_pl)  # doctest: +NORMALIZE_WHITESPACE
+            shape: (4, 2)
+            ┌───────┬───────┐
+            │ a     ┆ b     │
+            │ ---   ┆ ---   │
+            │ bool  ┆ bool  │
+            ╞═══════╪═══════╡
+            │ false ┆ false │
+            │ true  ┆ false │
+            │ true  ┆ true  │
+            │ false ┆ true  │
+            └───────┴───────┘
+        """
+        return type(self)(
+            lambda plx: self._call(plx).is_unique(),
+        )
+
+    def null_count(self) -> Expr:
+        r"""
+        Count the null values in each selected column.
+
+        Notes:
+            pandas and Polars handle null values differently. Polars distinguishes
+            between NaN and Null, whereas pandas doesn't.
+
+        Examples:
+            >>> import narwhals as nw
+            >>> import pandas as pd
+            >>> import polars as pl
+            >>> data = {"a": [1, 2, None, 1], "b": ["a", None, "b", None]}
+            >>> df_pd = pd.DataFrame(data)
+            >>> df_pl = pl.DataFrame(data)
+
+            Let's define a dataframe-agnostic function:
+
+            >>> def func(df_any):
+            ...     df = nw.from_native(df_any)
+            ...     nulls = df.select(nw.all().null_count())
+            ...     return nw.to_native(nulls)
+
+            We can then pass either pandas or Polars to `func`:
+
+            >>> func(df_pd)
+               a  b
+            0  1  2
+            >>> func(df_pl)
+            shape: (1, 2)
+            ┌─────┬─────┐
+            │ a   ┆ b   │
+            │ --- ┆ --- │
+            │ u32 ┆ u32 │
+            ╞═════╪═════╡
+            │ 1   ┆ 2   │
+            └─────┴─────┘
+        """
+        return type(self)(
+            lambda plx: self._call(plx).null_count(),
+        )
+
+    def is_first_distinct(self) -> Expr:
+        r"""
+        Return a boolean mask that is true at the first occurrence of each distinct value.
+
+        Examples:
+            >>> import narwhals as nw
+            >>> import pandas as pd
+            >>> import polars as pl
+            >>> data = {"a": [1, 2, 3, 1], "b": ["a", "a", "b", "c"]}
+            >>> df_pd = pd.DataFrame(data)
+            >>> df_pl = pl.DataFrame(data)
+
+            Let's define a dataframe-agnostic function:
+
+            >>> def func(df_any):
+            ...     df = nw.from_native(df_any)
+            ...     first_distinct = df.select(nw.all().is_first_distinct())
+            ...     return nw.to_native(first_distinct)
+
+            We can then pass either pandas or Polars to `func`:
+
+            >>> func(df_pd)  # doctest: +NORMALIZE_WHITESPACE
+                   a      b
+            0   True   True
+            1   True  False
+            2   True   True
+            3  False   True
+            >>> func(df_pl)  # doctest: +NORMALIZE_WHITESPACE
+            shape: (4, 2)
+            ┌───────┬───────┐
+            │ a     ┆ b     │
+            │ ---   ┆ ---   │
+            │ bool  ┆ bool  │
+            ╞═══════╪═══════╡
+            │ true  ┆ true  │
+            │ true  ┆ false │
+            │ true  ┆ true  │
+            │ false ┆ true  │
+            └───────┴───────┘
+        """
+        return type(self)(
+            lambda plx: self._call(plx).is_first_distinct(),
+        )
+
+    def is_last_distinct(self) -> Expr:
+        r"""
+        Return a boolean mask that is true at the last occurrence of each distinct value.
+
+        Examples:
+            >>> import narwhals as nw
+            >>> import pandas as pd
+            >>> import polars as pl
+            >>> data = {"a": [1, 2, 3, 1], "b": ["a", "a", "b", "c"]}
+            >>> df_pd = pd.DataFrame(data)
+            >>> df_pl = pl.DataFrame(data)
+
+            Let's define a dataframe-agnostic function:
+
+            >>> def func(df_any):
+            ...     df = nw.from_native(df_any)
+            ...     last_distinct = df.select(nw.all().is_last_distinct())
+            ...     return nw.to_native(last_distinct)
+
+            We can then pass either pandas or Polars to `func`:
+
+            >>> func(df_pd)  # doctest: +NORMALIZE_WHITESPACE
+                   a      b
+            0  False  False
+            1   True   True
+            2   True   True
+            3   True   True
+            >>> func(df_pl)  # doctest: +NORMALIZE_WHITESPACE
+            shape: (4, 2)
+            ┌───────┬───────┐
+            │ a     ┆ b     │
+            │ ---   ┆ ---   │
+            │ bool  ┆ bool  │
+            ╞═══════╪═══════╡
+            │ false ┆ false │
+            │ true  ┆ true  │
+            │ true  ┆ true  │
+            │ true  ┆ true  │
+            └───────┴───────┘
+        """
+        return type(self)(
+            lambda plx: self._call(plx).is_last_distinct(),
+        )
+
     @property
     def str(self) -> ExprStringNamespace:
         return ExprStringNamespace(self)