merge upstream
EdAbati committed Sep 10, 2024
2 parents b5957dc + e9afffd commit 9f8f944
Showing 96 changed files with 2,752 additions and 877 deletions.
30 changes: 30 additions & 0 deletions .github/workflows/check_tpch_queries.yml
@@ -0,0 +1,30 @@
name: Tests for TPCH Queries

on:
pull_request:
push:
branches: [main]

jobs:
validate-queries:
strategy:
matrix:
python-version: ["3.12"]
os: [ubuntu-latest]

runs-on: ${{ matrix.os }}
steps:
- uses: actions/checkout@v4
- uses: actions/setup-python@v5
with:
python-version: ${{ matrix.python-version }}
- name: Install uv
run: curl -LsSf https://astral.sh/uv/install.sh | sh
- name: install-reqs
run: uv pip install --upgrade -r requirements-dev.txt --system
- name: local-install
run: uv pip install -e . --system
- name: generate-data
run: cd tpch && python generate_data.py
- name: tpch-tests
run: cd tpch && pytest tests
2 changes: 1 addition & 1 deletion .github/workflows/extremes.yml
@@ -104,7 +104,7 @@ jobs:
- name: uninstall pandas
run: uv pip uninstall pandas --system
- name: install-pandas-nightly
run: uv pip install --pre --extra-index-url https://pypi.anaconda.org/scientific-python-nightly-wheels/simple pandas --system
run: uv pip install --prerelease=allow --pre --extra-index-url https://pypi.anaconda.org/scientific-python-nightly-wheels/simple pandas --system
- name: uninstall numpy
run: uv pip uninstall numpy --system
- name: install numpy nightly
4 changes: 2 additions & 2 deletions .pre-commit-config.yaml
@@ -1,15 +1,15 @@
repos:
- repo: https://github.com/astral-sh/ruff-pre-commit
# Ruff version.
rev: 'v0.5.7'
rev: 'v0.6.3'
hooks:
# Run the formatter.
- id: ruff-format
# Run the linter.
- id: ruff
args: [--fix]
- repo: https://github.com/pre-commit/mirrors-mypy
rev: 'v1.11.1'
rev: 'v1.11.2'
hooks:
- id: mypy
additional_dependencies: ['polars==1.4.1', 'pytest==8.3.2']
37 changes: 28 additions & 9 deletions CONTRIBUTING.md
@@ -47,22 +47,41 @@ git clone git@github.com:YOUR-USERNAME/narwhals.git

### 4. Setting up your environment

Here's how you can set up your local development environment to contribute:

1. Make sure you have Python3.8+ installed (for example, Python 3.11)
2. Create a new virtual environment with `python3.11 -m venv .venv` (or whichever version of Python3.9+ you prefer)
3. Activate it: `. .venv/bin/activate`
4. Install Narwhals: `pip install -e .`
5. Install test requirements: `pip install -r requirements-dev.txt`
6. Install docs requirements: `pip install -r docs/requirements-docs.txt`
Here's how you can set up your local development environment to contribute.

#### Option 1: Use UV (recommended)

1. Make sure you have Python3.8+ installed (for example, Python 3.11), create a virtual environment,
and activate it. If you're new to this, here's one way that we recommend:
1. Install uv: https://github.com/astral-sh/uv?tab=readme-ov-file#getting-started
2. Install some version of Python greater than Python3.8. For example, to install
Python3.11:
```
uv python install 3.11
```
3. Create a virtual environment:
```
uv venv -p 3.11 --seed
```
4. Activate it. On Linux, this is `. .venv/bin/activate`, on Windows `.\.venv\Scripts\activate`.
2. Install Narwhals: `uv pip install -e .`
3. Install test requirements: `uv pip install -r requirements-dev.txt`
4. Install docs requirements: `uv pip install -r docs/requirements-docs.txt`
You should also install pre-commit:
```
pip install pre-commit
uv pip install pre-commit
pre-commit install
```
This will automatically format and lint your code before each commit, and it will block the commit if any issues are found.
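For convenience, the Option 1 steps above can be collected into a single shell session (a sketch for Linux/macOS; it assumes `uv` is already on your PATH):

```shell
# Install a Python toolchain and create a seeded virtual environment with uv
uv python install 3.11
uv venv -p 3.11 --seed

# Activate it (on Windows: .\.venv\Scripts\activate)
. .venv/bin/activate

# Editable install of narwhals, plus test and docs requirements
uv pip install -e .
uv pip install -r requirements-dev.txt
uv pip install -r docs/requirements-docs.txt

# Recommended: run lint/format checks automatically before each commit
uv pip install pre-commit
pre-commit install
```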
#### Option 2: use python3-venv
1. Make sure you have Python 3.8+ installed. If you don't, you can check [install Python](https://realpython.com/installing-python/)
to learn how. Then, [create and activate](https://realpython.com/python-virtual-environments-a-primer/)
a virtual environment.
2. Then, follow steps 2-4 from above but using `pip install` instead of `uv pip install`.
### 5. Working on your issue
Create a new git branch from the `main` branch in your local repository.
25 changes: 25 additions & 0 deletions README.md
@@ -113,6 +113,31 @@ provided some funding / development time:
If you contribute to Narwhals on your organization's time, please let us know. We'd be happy to add your employer
to this list!

## Appears on

Narwhals has been featured in several talks, podcasts, and blog posts:

- [Talk Python to me Podcast](https://youtu.be/FSH7BZ0tuE0)
Ahoy, Narwhals are bridging the data science APIs

- [Super Data Science: ML & AI Podcast](https://www.youtube.com/watch?v=TeG4U8R0U8U)
Narwhals: For Pandas-to-Polars DataFrame Compatibility

- [Sample Space Podcast | probabl](https://youtu.be/8hYdq4sWbbQ?si=WG0QP1CZ6gkFf18b)
How Narwhals has many end users ... that never use it directly. - Marco Gorelli

- [Pycon Lithuania](https://www.youtube.com/watch?v=-mdx7Cn6_6E)
Marco Gorelli - DataFrame interoperability - what's been achieved, and what comes next?

- [Pycon Italy](https://www.youtube.com/watch?v=3IqUli9XsmQ)
How you can write a dataframe-agnostic library - Marco Gorelli

- [Polars Blog Post](https://pola.rs/posts/lightweight_plotting/)
Polars has a new lightweight plotting backend

- [Quansight Labs blog post (w/ Scikit-Lego)](https://labs.quansight.org/blog/scikit-lego-narwhals)
How Narwhals and scikit-lego came together to achieve dataframe-agnosticism

## Why "Narwhals"?

[Coz they are so awesome](https://youtu.be/ykwqXuMPsoc?si=A-i8LdR38teYsos4).
1 change: 1 addition & 0 deletions docs/api-reference/dataframe.md
@@ -22,6 +22,7 @@
- item
- iter_rows
- join
- join_asof
- lazy
- null_count
- pipe
1 change: 1 addition & 0 deletions docs/api-reference/lazyframe.md
@@ -15,6 +15,7 @@
- group_by
- head
- join
- join_asof
- lazy
- pipe
- rename
2 changes: 1 addition & 1 deletion docs/installation.md
@@ -11,6 +11,6 @@ Then, if you start the Python REPL and see the following:
```python
>>> import narwhals
>>> narwhals.__version__
'1.6.2'
'1.6.4'
```
then installation worked correctly!
2 changes: 1 addition & 1 deletion docs/why.md
@@ -27,7 +27,7 @@ pl_df_right = pl.DataFrame({"a": [1, 2, 3], "c": [4, 5, 6]})
pl_left_merge = pl_df_left.join(pl_df_right, left_on="b", right_on="c", how="left")

print(pd_left_merge.columns)
print(pl_df_right.columns)
print(pl_left_merge.columns)
```

There are several such subtle differences between the libraries. Writing dataframe-agnostic code is hard!
2 changes: 1 addition & 1 deletion narwhals/__init__.py
@@ -53,7 +53,7 @@
from narwhals.utils import maybe_get_index
from narwhals.utils import maybe_set_index

__version__ = "1.6.2"
__version__ = "1.6.4"

__all__ = [
"dependencies",
43 changes: 32 additions & 11 deletions narwhals/_arrow/dataframe.py
@@ -121,7 +121,12 @@ def __getitem__(self, item: str) -> ArrowSeries: ...
def __getitem__(self, item: slice) -> ArrowDataFrame: ...

def __getitem__(
self, item: str | slice | Sequence[int] | tuple[Sequence[int], str | int]
self,
item: str
| slice
| Sequence[int]
| Sequence[str]
| tuple[Sequence[int], str | int],
) -> ArrowSeries | ArrowDataFrame:
if isinstance(item, str):
from narwhals._arrow.series import ArrowSeries
@@ -136,9 +141,12 @@ def __getitem__(
and len(item) == 2
and isinstance(item[1], (list, tuple))
):
return self._from_native_frame(
self._native_frame.take(item[0]).select(item[1])
)
if item[0] == slice(None):
selected_rows = self._native_frame
else:
selected_rows = self._native_frame.take(item[0])

return self._from_native_frame(selected_rows.select(item[1]))

elif isinstance(item, tuple) and len(item) == 2:
if isinstance(item[1], slice):
@@ -188,6 +196,8 @@ def __getitem__(
)

elif isinstance(item, Sequence) or (is_numpy_array(item) and item.ndim == 1):
if isinstance(item, Sequence) and all(isinstance(x, str) for x in item):
return self._from_native_frame(self._native_frame.select(item))
return self._from_native_frame(self._native_frame.take(item))

else: # pragma: no cover
@@ -274,12 +284,8 @@ def join(
how: Literal["left", "inner", "outer", "cross", "anti", "semi"] = "inner",
left_on: str | list[str] | None,
right_on: str | list[str] | None,
suffix: str,
) -> Self:
if isinstance(left_on, str):
left_on = [left_on]
if isinstance(right_on, str):
right_on = [right_on]

how_to_join_map = {
"anti": "left anti",
"semi": "left semi",
@@ -300,7 +306,7 @@
keys=key_token,
right_keys=key_token,
join_type="inner",
right_suffix="_right",
right_suffix=suffix,
)
.drop([key_token]),
)
@@ -311,10 +317,25 @@
keys=left_on,
right_keys=right_on,
join_type=how_to_join_map[how],
right_suffix="_right",
right_suffix=suffix,
),
)

def join_asof(
self,
other: Self,
*,
left_on: str | None = None,
right_on: str | None = None,
on: str | None = None,
by_left: str | list[str] | None = None,
by_right: str | list[str] | None = None,
by: str | list[str] | None = None,
strategy: Literal["backward", "forward", "nearest"] = "backward",
) -> Self:
msg = "join_asof is not yet supported on PyArrow tables"
raise NotImplementedError(msg)

def drop(self: Self, columns: list[str], strict: bool) -> Self: # noqa: FBT001
to_drop = parse_columns_to_drop(
compliant_frame=self, columns=columns, strict=strict
4 changes: 3 additions & 1 deletion narwhals/_arrow/expr.py
@@ -309,7 +309,9 @@ def func(df: ArrowDataFrame) -> list[ArrowSeries]:
)
raise ValueError(msg)
tmp = df.group_by(*keys).agg(self)
tmp = df.select(*keys).join(tmp, how="left", left_on=keys, right_on=keys)
tmp = df.select(*keys).join(
tmp, how="left", left_on=keys, right_on=keys, suffix="_right"
)
return [tmp[name] for name in self._output_names]

return self.__class__(
37 changes: 27 additions & 10 deletions narwhals/_arrow/group_by.py
@@ -15,6 +15,26 @@
from narwhals._arrow.expr import ArrowExpr
from narwhals._arrow.typing import IntoArrowExpr

POLARS_TO_ARROW_AGGREGATIONS = {
"len": "count",
"n_unique": "count_distinct",
"std": "stddev",
"var": "variance", # currently unused, we don't have `var` yet
}


def get_function_name_option(function_name: str) -> Any | None:
"""Map specific pyarrow compute function to respective option to match polars behaviour."""
import pyarrow.compute as pc # ignore-banned-import

function_name_to_options = {
"count": pc.CountOptions(mode="all"),
"count_distinct": pc.CountOptions(mode="all"),
"stddev": pc.VarianceOptions(ddof=1),
"variance": pc.VarianceOptions(ddof=1),
}
return function_name_to_options.get(function_name)


class ArrowGroupBy:
def __init__(self, df: ArrowDataFrame, keys: list[str]) -> None:
@@ -112,17 +132,14 @@ def agg_arrow(
raise AssertionError(msg)

function_name = remove_prefix(expr._function_name, "col->")
function_name = POLARS_TO_ARROW_AGGREGATIONS.get(function_name, function_name)

option = get_function_name_option(function_name)
for root_name, output_name in zip(expr._root_names, expr._output_names):
if function_name != "len":
simple_aggregations[output_name] = (
(root_name, function_name),
f"{root_name}_{function_name}",
)
else:
simple_aggregations[output_name] = (
(root_name, "count", pc.CountOptions(mode="all")),
f"{root_name}_count",
)
simple_aggregations[output_name] = (
(root_name, function_name, option),
f"{root_name}_{function_name}",
)

aggs: list[Any] = []
name_mapping = {}