Commit: Merge remote-tracking branch 'upstream/main' into spark-expr-dunder-methods
EdAbati committed Jan 8, 2025
2 parents 6097214 + 5dca2a9 commit 66acada
Showing 159 changed files with 6,664 additions and 2,850 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/downstream_tests.yml
@@ -220,7 +220,7 @@ jobs:
run: |
cd tea-tasting
pdm remove narwhals
- pdm add ./..
+ pdm add ./..[dev]
- name: show-deps
run: |
cd tea-tasting
14 changes: 7 additions & 7 deletions .github/workflows/extremes.yml
@@ -61,7 +61,7 @@ jobs:
cache-suffix: ${{ matrix.python-version }}
cache-dependency-glob: "pyproject.toml"
- name: install-pretty-old-versions
- run: uv pip install pipdeptree tox virtualenv setuptools pandas==1.1.5 polars==0.20.3 numpy==1.17.5 pyarrow==11.0.0 "pyarrow-stubs<17" pyspark==3.3.0 scipy==1.5.0 scikit-learn==1.1.0 tzdata --system
+ run: uv pip install pipdeptree tox virtualenv setuptools pandas==1.1.5 polars==0.20.3 numpy==1.17.5 pyarrow==11.0.0 "pyarrow-stubs<17" pyspark==3.5.0 scipy==1.5.0 scikit-learn==1.1.0 tzdata --system
- name: install-reqs
run: uv pip install -e ".[dev]" --system
- name: show-deps
@@ -75,7 +75,7 @@ jobs:
echo "$DEPS" | grep 'polars==0.20.3'
echo "$DEPS" | grep 'numpy==1.17.5'
echo "$DEPS" | grep 'pyarrow==11.0.0'
echo "$DEPS" | grep 'pyspark==3.3.0'
echo "$DEPS" | grep 'pyspark==3.5.0'
echo "$DEPS" | grep 'scipy==1.5.0'
echo "$DEPS" | grep 'scikit-learn==1.1.0'
- name: Run pytest
@@ -84,7 +84,7 @@
not_so_old_versions:
strategy:
matrix:
- python-version: ["3.9"]
+ python-version: ["3.10"]
os: [ubuntu-latest]
runs-on: ${{ matrix.os }}
steps:
@@ -99,7 +99,7 @@
cache-suffix: ${{ matrix.python-version }}
cache-dependency-glob: "pyproject.toml"
- name: install-not-so-old-versions
- run: uv pip install tox virtualenv setuptools pandas==2.0.3 polars==0.20.8 numpy==1.24.4 pyarrow==14.0.0 "pyarrow-stubs<17" pyspark==3.4.0 scipy==1.8.0 scikit-learn==1.3.0 dask[dataframe]==2024.7 tzdata --system
+ run: uv pip install tox virtualenv setuptools pandas==2.0.3 polars==0.20.8 numpy==1.24.4 pyarrow==15.0.0 "pyarrow-stubs<17" pyspark==3.5.0 scipy==1.8.0 scikit-learn==1.3.0 dask[dataframe]==2024.10 tzdata --system
- name: install-reqs
run: uv pip install -e ".[dev]" --system
- name: show-deps
@@ -110,11 +110,11 @@
echo "$DEPS" | grep 'pandas==2.0.3'
echo "$DEPS" | grep 'polars==0.20.8'
echo "$DEPS" | grep 'numpy==1.24.4'
echo "$DEPS" | grep 'pyarrow==14.0.0'
echo "$DEPS" | grep 'pyspark==3.4.0'
echo "$DEPS" | grep 'pyarrow==15.0.0'
echo "$DEPS" | grep 'pyspark==3.5.0'
echo "$DEPS" | grep 'scipy==1.8.0'
echo "$DEPS" | grep 'scikit-learn==1.3.0'
echo "$DEPS" | grep 'dask==2024.7'
echo "$DEPS" | grep 'dask==2024.10'
- name: Run pytest
run: pytest tests --cov=narwhals --cov=tests --cov-fail-under=50 --runslow --constructors=pandas,pyarrow,polars[eager],polars[lazy],dask

4 changes: 2 additions & 2 deletions .github/workflows/pytest.yml
@@ -59,7 +59,7 @@ jobs:
pytest-full-coverage:
strategy:
matrix:
- python-version: ["3.9", "3.11", "3.13"]
+ python-version: ["3.11", "3.13"]
os: [ubuntu-latest]
runs-on: ${{ matrix.os }}
steps:
@@ -78,7 +78,7 @@
- name: install pyspark
run: uv pip install -e ".[pyspark]" --system
# PySpark is not yet available on Python3.12+
- if: matrix.python-version == '3.9' || matrix.python-version == '3.11'
+ if: matrix.python-version != '3.13'
- name: install ibis
run: uv pip install -e ".[ibis]" --system
# Ibis puts upper bounds on dependencies, and requires Python3.10+,
4 changes: 2 additions & 2 deletions .pre-commit-config.yaml
@@ -3,7 +3,7 @@ ci:
repos:
- repo: https://github.com/astral-sh/ruff-pre-commit
# Ruff version.
- rev: 'v0.8.1'
+ rev: 'v0.8.6'
hooks:
# Run the formatter.
- id: ruff-format
@@ -14,7 +14,7 @@ repos:
alias: check-docstrings
entry: python utils/check_docstrings.py
- repo: https://github.com/pre-commit/mirrors-mypy
- rev: 'v1.13.0'
+ rev: 'v1.14.1'
hooks:
- id: mypy
additional_dependencies: ['polars==1.4.1', 'pytest==8.3.2']
4 changes: 4 additions & 0 deletions CONTRIBUTING.md
@@ -78,6 +78,10 @@ where `YOUR-GITHUB-USERNAME` will be your GitHub user name.

Here's how you can set up your local development environment to contribute.

+ #### Prerequisites for PySpark tests
+
+ If you want to run PySpark-related tests, you'll need to have Java installed. Refer to the [Spark documentation](https://spark.apache.org/docs/latest/#downloading) for more information.

#### Option 1: Use UV (recommended)

1. Make sure you have Python3.12 installed, create a virtual environment,
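As an aside on the new Java prerequisite above: one lightweight way for a test suite to surface this requirement is to skip PySpark-dependent tests when no JVM is on the `PATH`. This is a minimal sketch, not part of the repository; the `requires_java` marker and the placeholder test are made up for illustration:

```python
# Hypothetical helper: skip PySpark-dependent tests when Java is missing.
import shutil

import pytest

requires_java = pytest.mark.skipif(
    shutil.which("java") is None,
    reason="PySpark needs a Java runtime; see the Spark docs for installation",
)


@requires_java
def test_pyspark_import() -> None:
    # Placeholder body; the real PySpark tests live in the repository's suite.
    pyspark = pytest.importorskip("pyspark")
    assert pyspark.__version__
```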
3 changes: 1 addition & 2 deletions README.md
@@ -14,8 +14,7 @@
Extremely lightweight and extensible compatibility layer between dataframe libraries!

  - **Full API support**: cuDF, Modin, pandas, Polars, PyArrow
- - **Lazy-only support**: Dask
- - **Interchange-level support**: DuckDB, Ibis, Vaex, anything which implements the DataFrame Interchange Protocol
+ - **Lazy-only support**: Dask. Work in progress: DuckDB, Ibis, PySpark.

Seamlessly support all, without depending on any!

2 changes: 2 additions & 0 deletions docs/api-reference/expr.md
@@ -32,6 +32,7 @@
  - is_first_distinct
  - is_in
  - is_last_distinct
+ - is_nan
  - is_null
  - is_unique
  - len
@@ -46,6 +47,7 @@
  - over
  - pipe
  - quantile
+ - rank
  - replace_strict
  - rolling_mean
  - rolling_std
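For orientation, a small usage sketch of the two methods newly listed above, `is_nan` and `rank`. It assumes Narwhals 1.21+ with Polars as the backend; the default `rank` strategy (`'average'`, as in Polars) is an assumption to verify against the API reference:

```python
import polars as pl

import narwhals as nw

df = pl.DataFrame({"a": [3.0, float("nan"), 1.0]})
result = nw.from_native(df, eager_only=True).with_columns(
    a_is_nan=nw.col("a").is_nan(),  # flags floating-point NaN only, not nulls
    a_rank=nw.col("a").rank(),  # rank of each value (assumed 'average' method)
)
print(result.to_native())
```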
1 change: 1 addition & 0 deletions docs/api-reference/expr_dt.md
@@ -23,6 +23,7 @@
  - total_nanoseconds
  - total_seconds
  - to_string
+ - weekday
  - year
show_source: false
show_bases: false
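A usage sketch for the newly listed `dt.weekday` (the same method is added to the Series namespace further down). The ISO numbering, 1 for Monday through 7 for Sunday, follows Polars' convention and should be checked against the rendered reference:

```python
from datetime import datetime

import polars as pl

import narwhals as nw

# 2025-01-06 is a Monday, 2025-01-12 a Sunday.
df = pl.DataFrame({"ts": [datetime(2025, 1, 6), datetime(2025, 1, 12)]})
result = nw.from_native(df, eager_only=True).with_columns(
    weekday=nw.col("ts").dt.weekday()
)
print(result.to_native())
```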
2 changes: 2 additions & 0 deletions docs/api-reference/series.md
@@ -38,6 +38,7 @@
  - is_first_distinct
  - is_in
  - is_last_distinct
+ - is_nan
  - is_null
  - is_sorted
  - is_unique
@@ -53,6 +54,7 @@
  - null_count
  - pipe
  - quantile
+ - rank
  - rename
  - replace_strict
  - rolling_mean
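The same two methods are documented at the `Series` level; a minimal sketch, again assuming Narwhals 1.21+ with Polars installed:

```python
import polars as pl

import narwhals as nw

s = nw.from_native(pl.Series("a", [3.0, float("nan"), 1.0]), series_only=True)
print(s.is_nan().to_native())  # expected: [false, true, false]
print(s.rank().to_native())
```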
1 change: 1 addition & 0 deletions docs/api-reference/series_dt.md
@@ -23,6 +23,7 @@
  - total_nanoseconds
  - total_seconds
  - to_string
+ - weekday
  - year
show_source: false
show_bases: false
11 changes: 8 additions & 3 deletions docs/backcompat.md
@@ -96,12 +96,13 @@ Anything currently in `narwhals.stable.v1` will not be changed or removed in future versions of Narwhals.

Here are exceptions to our backwards compatibility policy:

- - unambiguous bugs. If a function contains what is unambiguously a bug, then we'll fix it, without
+ - Unambiguous bugs. If a function contains what is unambiguously a bug, then we'll fix it, without
considering that to be a breaking change.
- - radical changes in backends. Suppose that Polars was to remove
+ - Radical changes in backends. Suppose that Polars was to remove
expressions, or pandas were to remove support for categorical data. At that point, we might
need to rethink Narwhals. However, we expect such radical changes to be exceedingly unlikely.
- - we may consider making some type hints more precise.
+ - We may consider making some type hints more precise.
+ - Anything labelled "unstable".

In general, decisions are driven by use-cases, and we conduct a search of public GitHub repositories
before making any change.
@@ -110,6 +111,10 @@ before making any change.

### After `stable.v1`


+ - Since Narwhals 1.21, passing a `DuckDBPyRelation` to `from_native` returns a `LazyFrame`. In
+   `narwhals.stable.v1`, it returns a `DataFrame` with `level='interchange'`.

  - Since Narwhals 1.15, `Series` is generic in the native Series, meaning that you can
    write:
    (code example truncated in this view)
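To make the Narwhals 1.21 `DuckDBPyRelation` note above concrete, here is a minimal sketch of the difference between the two namespaces (assuming `narwhals>=1.21` and `duckdb` are installed):

```python
import duckdb

import narwhals as nw
import narwhals.stable.v1 as nw_v1

rel = duckdb.sql("SELECT 1 AS a")
print(type(nw.from_native(rel)))  # LazyFrame (since Narwhals 1.21)
print(type(nw_v1.from_native(rel)))  # stable.v1 DataFrame, level='interchange'
```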
18 changes: 13 additions & 5 deletions docs/basics/dataframe_conversion.md
@@ -14,6 +14,7 @@ To illustrate, we create dataframes in various formats:
```python exec="1" source="above" session="conversion"
import narwhals as nw
from narwhals.typing import IntoDataFrame
+ from typing import Any

import duckdb
import polars as pl
@@ -45,11 +46,17 @@

### Via PyCapsule Interface

- Similarly, if your library uses Polars internally, you can convert any user-supplied dataframe to Polars format using Narwhals.
+ Similarly, if your library uses Polars internally, you can convert any user-supplied dataframe
+ which implements `__arrow_c_stream__`:

```python exec="1" source="above" session="conversion" result="python"
- def df_to_polars(df: IntoDataFrame) -> pl.DataFrame:
-     return nw.from_arrow(nw.from_native(df), native_namespace=pl).to_native()
+ def df_to_polars(df_native: Any) -> pl.DataFrame:
+     if hasattr(df_native, "__arrow_c_stream__"):
+         return nw.from_arrow(df_native, native_namespace=pl).to_native()
+     msg = (
+         f"Expected object which implements '__arrow_c_stream__' got: {type(df_native)}"
+     )
+     raise TypeError(msg)


print(df_to_polars(df_duckdb)) # You can only execute this line of code once.
@@ -66,8 +73,9 @@ If you need to ingest the same dataframe multiple times, then you may want to go
This may be less efficient than the PyCapsule approach above (and always requires PyArrow!), but is more forgiving:

```python exec="1" source="above" session="conversion" result="python"
- def df_to_polars(df: IntoDataFrame) -> pl.DataFrame:
-     return pl.DataFrame(nw.from_native(df).to_arrow())
+ def df_to_polars(df_native: IntoDataFrame) -> pl.DataFrame:
+     df = nw.from_native(df_native).lazy().collect()
+     return pl.DataFrame(nw.from_native(df, eager_only=True).to_arrow())


df_duckdb = duckdb.sql("SELECT * FROM df_polars")
# (rest of the example truncated in this view)
```
7 changes: 3 additions & 4 deletions docs/extending.md
@@ -15,17 +15,16 @@ Currently, Narwhals has **full API** support for the following libraries:
It also has **lazy-only** support for [Dask](https://github.com/dask/dask), and **interchange** support
for [DuckDB](https://github.com/duckdb/duckdb) and [Ibis](https://github.com/ibis-project/ibis).

+ We are working towards full "lazy-only" support for DuckDB, Ibis, and PySpark.

### Levels of support

Narwhals comes with three levels of support:

  - **Full API support**: cuDF, Modin, pandas, Polars, PyArrow
- - **Lazy-only support**: Dask
+ - **Lazy-only support**: Dask. Work in progress: DuckDB, Ibis, PySpark.
  - **Interchange-level support**: DuckDB, Ibis, Vaex, anything which implements the DataFrame Interchange Protocol

- The lazy-only layer is a major item on our 2025 roadmap, and hope to be able to bring libraries currently in
- the "interchange" level into that one.

Libraries for which we have full support can benefit from the whole
[Narwhals API](./api-reference/index.md).

2 changes: 1 addition & 1 deletion docs/installation.md
@@ -30,7 +30,7 @@ To verify the installation, start the Python REPL and execute:
```python
>>> import narwhals
>>> narwhals.__version__
- '1.20.1'
+ '1.21.1'
```

If you see the version number, then the installation was successful!
45 changes: 45 additions & 0 deletions docs/pandas_like_concepts/null_handling.md
@@ -43,3 +43,48 @@ def check_null_behavior(df: IntoFrameT) -> IntoFrameT:
df = pa.table(data)
print(check_null_behavior(df))
```

+ Conversely, `is_nan` is consistent across backends. This consistency comes from Narwhals exploiting its native implementations
+ in Polars and PyArrow, while ensuring that pandas only identifies the floating-point NaN values and not those encoding the missing value indicator.
+
+ ```python exec="1" source="above" session="null_handling"
+ import narwhals as nw
+ from narwhals.typing import IntoFrameT
+
+ data = {"a": [0.0, None, 2.0]}
+
+
+ def check_nan_behavior(df: IntoFrameT) -> IntoFrameT:
+     return (
+         nw.from_native(df)
+         .with_columns(
+             a_div_a=(nw.col("a") / nw.col("a")),
+             a_div_a_is_nan=(nw.col("a") / nw.col("a")).is_nan(),
+         )
+         .to_native()
+     )
+ ```
+
+ === "pandas"
+     ```python exec="true" source="material-block" result="python" session="null_handling"
+     import pandas as pd
+
+     df = pd.DataFrame(data).astype({"a": "Float64"})
+     print(check_nan_behavior(df))
+     ```
+
+ === "Polars (eager)"
+     ```python exec="true" source="material-block" result="python" session="null_handling"
+     import polars as pl
+
+     df = pl.DataFrame(data)
+     print(check_nan_behavior(df))
+     ```
+
+ === "PyArrow"
+     ```python exec="true" source="material-block" result="python" session="null_handling"
+     import pyarrow as pa
+
+     df = pa.table(data)
+     print(check_nan_behavior(df))
+     ```
2 changes: 1 addition & 1 deletion narwhals/__init__.py
@@ -79,7 +79,7 @@
from narwhals.utils import maybe_reset_index
from narwhals.utils import maybe_set_index

__version__ = "1.20.1"
__version__ = "1.21.1"

__all__ = [
"Array",
(Remaining changed files not shown.)
