Merge remote-tracking branch 'upstream/main' into map_batches

DeaMariaLeon committed Oct 28, 2024
2 parents e9e8c28 + 0b333f9 commit 96e08a4
Showing 39 changed files with 517 additions and 124 deletions.
4 changes: 2 additions & 2 deletions .pre-commit-config.yaml
@@ -1,15 +1,15 @@
repos:
- repo: https://github.com/astral-sh/ruff-pre-commit
# Ruff version.
rev: 'v0.6.9'
rev: 'v0.7.0'
hooks:
# Run the formatter.
- id: ruff-format
# Run the linter.
- id: ruff
args: [--fix]
- repo: https://github.com/pre-commit/mirrors-mypy
rev: 'v1.11.2'
rev: 'v1.12.1'
hooks:
- id: mypy
additional_dependencies: ['polars==1.4.1', 'pytest==8.3.2']
3 changes: 2 additions & 1 deletion README.md
@@ -43,10 +43,11 @@ Join the party!

- [Altair](https://github.com/vega/altair/)
- [Hamilton](https://github.com/DAGWorks-Inc/hamilton/tree/main/examples/narwhals)
- [marimo](https://github.com/marimo-team/marimo)
- [pymarginaleffects](https://github.com/vincentarelbundock/pymarginaleffects)
- [scikit-lego](https://github.com/koaning/scikit-lego)
- [scikit-playtime](https://github.com/koaning/scikit-playtime)
- [timebasedcv](https://github.com/FBruzzesi/timebasedcv)
- [marimo](https://github.com/marimo-team/marimo)
- [tubular](https://github.com/lvgig/tubular)
- [wimsey](https://github.com/benrutter/wimsey)

1 change: 1 addition & 0 deletions docs/api-reference/narwhals.md
@@ -15,6 +15,7 @@ Here are the top-level functions available in Narwhals.
- from_dict
- from_native
- from_arrow
- generate_temporary_column_name
- get_level
- get_native_namespace
- is_ordered_categorical
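
The new top-level export `generate_temporary_column_name`, added to the API reference above, replaces the internal `generate_unique_token` helper later in this diff. Judging from those usages, it returns a random column name that does not collide with the columns you pass in. A minimal, illustrative sketch (not part of the commit itself):

```python
import narwhals as nw

existing = ["a", "fruits", "b"]
# Request a temporary name that does not clash with the existing columns;
# useful for short-lived helper columns (row indices, join keys) that are dropped later.
token = nw.generate_temporary_column_name(n_bytes=8, columns=existing)
assert token not in existing
```
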
76 changes: 76 additions & 0 deletions docs/basics/dataframe_conversion.md
@@ -0,0 +1,76 @@
# Conversion between libraries

Some library maintainers need to apply complex dataframe operations, using methods and functions that may not (yet) be implemented in Narwhals. In such cases, Narwhals can still be highly beneficial: it makes it easy to convert between dataframe libraries.

## Dataframe X in, pandas out

Imagine that you maintain a library with a function that operates on pandas dataframes to produce automated reports. You want to allow users to supply a dataframe in any format to that function (pandas, Polars, DuckDB, cuDF, Modin, etc.) without adding all those dependencies to your own project and without special-casing each input library's variation of `to_pandas` / `toPandas` / `to_pandas_df` / `df` ...

One solution is to use Narwhals as a thin dataframe-ingestion layer that converts user-supplied dataframes to the format your library uses internally. Since Narwhals is zero-dependency, this is a much more lightweight solution than including all the dataframe libraries as dependencies,
and easier to write than special-casing each input library's `to_pandas` method (if it even exists!).

To illustrate, we create dataframes in various formats:

```python exec="1" source="above" session="conversion"
import narwhals as nw
from narwhals.typing import IntoDataFrame

import duckdb
import polars as pl
import pandas as pd

df_polars = pl.DataFrame(
    {
        "A": [1, 2, 3, 4, 5],
        "fruits": ["banana", "banana", "apple", "apple", "banana"],
        "B": [5, 4, 3, 2, 1],
        "cars": ["beetle", "audi", "beetle", "beetle", "beetle"],
    }
)
df_pandas = df_polars.to_pandas()
df_duckdb = duckdb.sql("SELECT * FROM df_polars")
```

Now, we define a function that can ingest any dataframe type supported by Narwhals, and convert it to a pandas DataFrame for internal use:

```python exec="1" source="above" session="conversion" result="python"
def df_to_pandas(df: IntoDataFrame) -> pd.DataFrame:
    return nw.from_native(df).to_pandas()


print(df_to_pandas(df_polars))
```
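
As a quick, illustrative aside (not part of the original snippet): the same function also accepts the pandas dataframe created earlier, in which case the conversion is effectively a pass-through.

```python
# The pandas input goes through nw.from_native and comes back out as pandas.
print(df_to_pandas(df_pandas))
```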

## Dataframe X in, Polars out

### Via PyCapsule Interface

Similarly, if your library uses Polars internally, you can convert any user-supplied dataframe to Polars format using Narwhals.

```python exec="1" source="above" session="conversion" result="python"
def df_to_polars(df: IntoDataFrame) -> pl.DataFrame:
    return nw.from_arrow(nw.from_native(df), native_namespace=pl).to_native()


print(df_to_polars(df_duckdb)) # You can only execute this line of code once.
```

Passing Polars as `native_namespace` works here because Polars supports the [PyCapsule Interface](https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html) for import.

Note that the PyCapsule Interface makes no guarantee that you can call it repeatedly, so the approach above only works if you
expect to perform the conversion at most once per input object.
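
If the same object may reach your conversion function more than once and you want to stay on the PyCapsule path, one possible workaround (a sketch only; the cache and the function name are illustrative, not part of Narwhals) is to cache the converted result so the single-use stream is consumed just once:

```python
_polars_cache: dict[int, pl.DataFrame] = {}


def df_to_polars_cached(df: IntoDataFrame) -> pl.DataFrame:
    # Dataframes are generally not hashable, so key the cache on object identity.
    # This is only safe while the original inputs stay alive for the cache's lifetime.
    key = id(df)
    if key not in _polars_cache:
        _polars_cache[key] = nw.from_arrow(nw.from_native(df), native_namespace=pl).to_native()
    return _polars_cache[key]
```

Alternatively, the PyArrow route described next avoids the problem altogether.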

### Via PyArrow

If you need to ingest the same dataframe multiple times, then you may want to go via PyArrow instead.
This may be less efficient than the PyCapsule approach above (and always requires PyArrow!), but is more forgiving:

```python exec="1" source="above" session="conversion" result="python"
def df_to_polars(df: IntoDataFrame) -> pl.DataFrame:
    return pl.DataFrame(nw.from_native(df).to_arrow())


df_duckdb = duckdb.sql("SELECT * FROM df_polars")
print(df_to_polars(df_duckdb)) # We can execute this...
print(df_to_polars(df_duckdb)) # ...as many times as we like!
```
13 changes: 13 additions & 0 deletions docs/extending.md
@@ -37,6 +37,7 @@ def func(df: FrameT) -> FrameT:
b_std=nw.col("b").std(),
)
```

will work for any of pandas, Polars, cuDF, Modin, and PyArrow.

However, sometimes you don't need to do complex operations on dataframes - all you need
@@ -57,9 +58,21 @@ def func(df: Any) -> Schema:
df = nw.from_native(df, eager_or_interchange_only=True)
return df.schema
```

is also supported, meaning that, in addition to the libraries mentioned above, you can
also pass Ibis, DuckDB, Vaex, and any library which implements the protocol.

#### Interchange-only support

While libraries for which we have full support can benefit from the whole Narwhals API,
libraries with interchange-only support can access the following methods after
conversion to a Narwhals DataFrame (see the sketch after this list):

- `.schema`, hence column names via `.schema.names()` and column types via `.schema.dtypes()`
- `.to_pandas()` and `.to_arrow()`, for converting to pandas and Arrow, respectively.
- `.select(names)` (Ibis and DuckDB), where `names` is a list of (string) column names. This is useful for
selecting columns before converting to another library.
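
As an illustration only (this example is not part of the commit; it assumes DuckDB is installed and relies on the methods listed above), interchange-level usage might look like:

```python
import duckdb
import narwhals as nw

rel = duckdb.sql("SELECT 1 AS a, 'x' AS b")
df = nw.from_native(rel, eager_or_interchange_only=True)

print(df.schema.names())             # column names
print(df.schema.dtypes())            # column types
print(df.select(["a"]).to_pandas())  # select columns (Ibis/DuckDB), then convert
```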

### Extending Narwhals

If you want your own library to be recognised too, you're welcome to open a PR (with tests)!
2 changes: 1 addition & 1 deletion docs/index.md
@@ -6,7 +6,7 @@ Extremely lightweight and extensible compatibility layer between dataframe libra

- **Full API support**: cuDF, Modin, pandas, Polars, PyArrow
- **Lazy-only support**: Dask
- **Interchange-level support**: Ibis, Vaex, anything else which implements the DataFrame Interchange Protocol
- **Interchange-level support**: Ibis, DuckDB, Vaex, anything else which implements the DataFrame Interchange Protocol

Seamlessly support all, without depending on any!

2 changes: 1 addition & 1 deletion docs/installation.md
@@ -29,7 +29,7 @@ To verify the installation, start the Python REPL and execute:
```python
>>> import narwhals
>>> narwhals.__version__
'1.10.0'
'1.11.0'
```
If you see the version number, then the installation was successful!

1 change: 1 addition & 0 deletions docs/requirements-docs.txt
@@ -1,4 +1,5 @@
jinja2
duckdb
markdown-exec[ansi]
mkdocs
mkdocs-autorefs
1 change: 1 addition & 0 deletions mkdocs.yml
@@ -10,6 +10,7 @@ nav:
- basics/dataframe.md
- basics/series.md
- basics/complete_example.md
- basics/dataframe_conversion.md
- Pandas-like concepts:
- other/pandas_index.md
- other/user_warning.md
4 changes: 3 additions & 1 deletion narwhals/__init__.py
@@ -59,21 +59,23 @@
from narwhals.translate import narwhalify
from narwhals.translate import to_native
from narwhals.translate import to_py_scalar
from narwhals.utils import generate_temporary_column_name
from narwhals.utils import is_ordered_categorical
from narwhals.utils import maybe_align_index
from narwhals.utils import maybe_convert_dtypes
from narwhals.utils import maybe_get_index
from narwhals.utils import maybe_reset_index
from narwhals.utils import maybe_set_index

__version__ = "1.10.0"
__version__ = "1.11.0"

__all__ = [
"dependencies",
"selectors",
"concat",
"from_dict",
"from_arrow",
"generate_temporary_column_name",
"get_level",
"new_series",
"to_native",
14 changes: 7 additions & 7 deletions narwhals/_arrow/dataframe.py
@@ -17,7 +17,7 @@
from narwhals.dependencies import is_numpy_array
from narwhals.utils import Implementation
from narwhals.utils import flatten
from narwhals.utils import generate_unique_token
from narwhals.utils import generate_temporary_column_name
from narwhals.utils import is_sequence_but_not_str
from narwhals.utils import parse_columns_to_drop

@@ -172,7 +172,7 @@ def __getitem__(
),
) -> ArrowSeries | ArrowDataFrame:
if isinstance(item, tuple):
item = tuple(list(i) if is_sequence_but_not_str(i) else i for i in item)
item = tuple(list(i) if is_sequence_but_not_str(i) else i for i in item) # type: ignore[assignment]

if isinstance(item, str):
from narwhals._arrow.series import ArrowSeries
@@ -335,10 +335,10 @@ def with_columns(
df = self._native_frame.__class__.from_arrays(to_concat, names=output_names)
return self._from_native_frame(df)

def group_by(self, *keys: str) -> ArrowGroupBy:
def group_by(self, *keys: str, drop_null_keys: bool) -> ArrowGroupBy:
from narwhals._arrow.group_by import ArrowGroupBy

return ArrowGroupBy(self, list(keys))
return ArrowGroupBy(self, list(keys), drop_null_keys=drop_null_keys)

def join(
self,
@@ -358,7 +358,7 @@ def join(

if how == "cross":
plx = self.__narwhals_namespace__()
key_token = generate_unique_token(
key_token = generate_temporary_column_name(
n_bytes=8, columns=[*self.columns, *other.columns]
)

@@ -579,7 +579,7 @@ def is_duplicated(self: Self) -> ArrowSeries:
df = self._native_frame

columns = self.columns
col_token = generate_unique_token(n_bytes=8, columns=columns)
col_token = generate_temporary_column_name(n_bytes=8, columns=columns)
row_count = (
df.append_column(col_token, pa.array(np.arange(len(self))))
.group_by(columns)
@@ -638,7 +638,7 @@ def unique(
agg_func_map = {"any": "min", "first": "min", "last": "max"}

agg_func = agg_func_map[keep]
col_token = generate_unique_token(n_bytes=8, columns=self.columns)
col_token = generate_temporary_column_name(n_bytes=8, columns=self.columns)
keep_idx = (
df.append_column(col_token, pa.array(np.arange(len(self))))
.group_by(subset)
2 changes: 1 addition & 1 deletion narwhals/_arrow/expr.py
@@ -354,7 +354,7 @@ def func(df: ArrowDataFrame) -> list[ArrowSeries]:
"`nw.col('a', 'b')`\n"
)
raise ValueError(msg)
tmp = df.group_by(*keys).agg(self)
tmp = df.group_by(*keys, drop_null_keys=False).agg(self)
tmp = df.select(*keys).join(
tmp, how="left", left_on=keys, right_on=keys, suffix="_right"
)
17 changes: 9 additions & 8 deletions narwhals/_arrow/group_by.py
@@ -37,10 +37,15 @@ def get_function_name_option(function_name: str) -> Any | None:


class ArrowGroupBy:
def __init__(self, df: ArrowDataFrame, keys: list[str]) -> None:
def __init__(
self, df: ArrowDataFrame, keys: list[str], *, drop_null_keys: bool
) -> None:
import pyarrow as pa # ignore-banned-import()

self._df = df
if drop_null_keys:
self._df = df.drop_nulls(keys)
else:
self._df = df
self._keys = list(keys)
self._grouped = pa.TableGroupBy(self._df._native_frame, list(self._keys))

@@ -74,11 +79,7 @@ def agg(
)

def __iter__(self) -> Iterator[tuple[Any, ArrowDataFrame]]:
key_values = (
self._df.select(*self._keys)
.unique(subset=self._keys, keep="first")
.iter_rows()
)
key_values = self._df.select(*self._keys).unique(subset=self._keys, keep="first")
nw_namespace = self._df.__narwhals_namespace__()
yield from (
(
@@ -87,7 +88,7 @@ def __iter__(self) -> Iterator[tuple[Any, ArrowDataFrame]]:
*[nw_namespace.col(k) == v for k, v in zip(self._keys, key_value)]
),
)
for key_value in key_values
for key_value in key_values.iter_rows()
)


8 changes: 4 additions & 4 deletions narwhals/_arrow/series.py
@@ -14,7 +14,7 @@
from narwhals._arrow.utils import native_to_narwhals_dtype
from narwhals._arrow.utils import validate_column_comparand
from narwhals.utils import Implementation
from narwhals.utils import generate_unique_token
from narwhals.utils import generate_temporary_column_name

if TYPE_CHECKING:
from types import ModuleType
@@ -604,7 +604,7 @@ def is_first_distinct(self: Self) -> Self:
import pyarrow.compute as pc # ignore-banned-import()

row_number = pa.array(np.arange(len(self)))
col_token = generate_unique_token(n_bytes=8, columns=[self.name])
col_token = generate_temporary_column_name(n_bytes=8, columns=[self.name])
first_distinct_index = (
pa.Table.from_arrays([self._native_series], names=[self.name])
.append_column(col_token, row_number)
@@ -621,7 +621,7 @@ def is_last_distinct(self: Self) -> Self:
import pyarrow.compute as pc # ignore-banned-import()

row_number = pa.array(np.arange(len(self)))
col_token = generate_unique_token(n_bytes=8, columns=[self.name])
col_token = generate_temporary_column_name(n_bytes=8, columns=[self.name])
last_distinct_index = (
pa.Table.from_arrays([self._native_series], names=[self.name])
.append_column(col_token, row_number)
@@ -715,7 +715,7 @@ def to_arrow(self: Self) -> pa.Array:

def mode(self: Self) -> ArrowSeries:
plx = self.__narwhals_namespace__()
col_token = generate_unique_token(n_bytes=8, columns=[self.name])
col_token = generate_temporary_column_name(n_bytes=8, columns=[self.name])
return self.value_counts(name=col_token, normalize=False).filter(
plx.col(col_token) == plx.col(col_token).max()
)[self.name]