From 9eb5f46aa58672763d3309537416eef1bdfb2273 Mon Sep 17 00:00:00 2001
From: Isaias Gutierrez-Cruz <64386035+IsaiasGutierrezCruz@users.noreply.github.com>
Date: Wed, 4 Sep 2024 09:47:58 -0600
Subject: [PATCH 01/30] ci: add tests for the queries of TPC-H (#899)

---
 .github/workflows/check_tpch_queries.yml | 30 ++++++++++++++++++++++++
 pyproject.toml                           |  1 +
 requirements-dev.txt                     |  1 +
 tpch/tests/__init__.py                   |  0
 tpch/tests/test_queries.py               | 29 +++++++++++++++++++++++
 5 files changed, 61 insertions(+)
 create mode 100644 .github/workflows/check_tpch_queries.yml
 create mode 100644 tpch/tests/__init__.py
 create mode 100644 tpch/tests/test_queries.py

diff --git a/.github/workflows/check_tpch_queries.yml b/.github/workflows/check_tpch_queries.yml
new file mode 100644
index 000000000..397163091
--- /dev/null
+++ b/.github/workflows/check_tpch_queries.yml
@@ -0,0 +1,30 @@
+name: Tests for TPCH Queries
+
+on:
+  pull_request:
+    types: [labeled]
+
+jobs:
+  validate-queries:
+    if: ${{ github.event.label.name == 'full-test' }}
+    strategy:
+      matrix:
+        python-version: ["3.12"]
+        os: [ubuntu-latest]
+
+    runs-on: ${{ matrix.os }}
+    steps:
+      - uses: actions/checkout@v4
+      - uses: actions/setup-python@v5
+        with:
+          python-version: ${{ matrix.python-version }}
+      - name: Install uv
+        run: curl -LsSf https://astral.sh/uv/install.sh | sh
+      - name: install-reqs
+        run: uv pip install --upgrade -r requirements-dev.txt --system
+      - name: local-install
+        run: uv pip install -e . --system
+      - name: generate-data
+        run: cd tpch && python generate_data.py
+      - name: tpch-tests
+        run: python -m unittest discover -s 'tpch/tests'
\ No newline at end of file
diff --git a/pyproject.toml b/pyproject.toml
index a279280bf..b3a2a0c28 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -76,6 +76,7 @@ lint.ignore = [
 [tool.ruff.lint.per-file-ignores]
 "tests/*" = ["S101"]
+"tpch/tests/*" = ["S101"]
 "utils/*" = ["S311", "PTH123"]
 "tpch/execute/*" = ["T201"]

diff --git a/requirements-dev.txt b/requirements-dev.txt
index 213fcdcb8..23ff1757e 100644
--- a/requirements-dev.txt
+++ b/requirements-dev.txt
@@ -1,3 +1,4 @@
+tqdm
 covdefaults
 duckdb
 pandas
diff --git a/tpch/tests/__init__.py b/tpch/tests/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/tpch/tests/test_queries.py b/tpch/tests/test_queries.py
new file mode 100644
index 000000000..4b7cdd866
--- /dev/null
+++ b/tpch/tests/test_queries.py
@@ -0,0 +1,29 @@
+import os
+import subprocess
+import sys
+import unittest
+from pathlib import Path
+
+
+class TestQueries(unittest.TestCase):
+    def test_execute_scripts(self) -> None:
+        root = Path(__file__).resolve().parent.parent
+        # directory containing all the queries
+        execute_dir = root / "execute"
+
+        env = os.environ.copy()
+        env["PYTHONPATH"] = str(root)
+
+        for script_path in execute_dir.glob("q[1-9]*.py"):
+            result = subprocess.run(  # noqa: S603
+                [sys.executable, str(script_path)],
+                capture_output=True,
+                text=True,
+                env=env,
+                cwd=root,
+                check=False,
+                shell=False,
+            )
+            assert (
+                result.returncode == 0
+            ), f"Script {script_path} failed with error: {result.stderr}"
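The harness above treats each file under `tpch/execute/` as a standalone script, so a single query can also be checked locally before pushing. A minimal sketch of what `test_execute_scripts` does for one script (assumptions: the parquet files were already produced by `generate_data.py`, the working directory is the repository root, and `q1.py` is just an example target):

import os
import subprocess
import sys
from pathlib import Path

root = Path("tpch")  # assumption: run from the repository root
env = os.environ.copy()
env["PYTHONPATH"] = str(root)  # so the scripts can resolve `from queries import ...`

result = subprocess.run(
    [sys.executable, str(root / "execute" / "q1.py")],
    capture_output=True,
    text=True,
    env=env,
    cwd=root,
    check=False,
)
assert result.returncode == 0, result.stderr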
From 69da133bbf86906b7787c95db403fcd4a626e78a Mon Sep 17 00:00:00 2001
From: Zhengbo Wang
Date: Wed, 4 Sep 2024 23:50:39 +0800
Subject: [PATCH 02/30] feat: Add more queries of tpch (#898)

---
 tpch/execute/__init__.py | 30 +++++++++++++++++++++++
 tpch/execute/q1.py       | 23 ++----------------
 tpch/execute/q10.py      | 25 ++++----------------
 tpch/execute/q11.py      | 23 ++++--------------
 tpch/execute/q15.py      | 21 +++++++++++++++++
 tpch/execute/q17.py      | 21 +++++++++++++++++
 tpch/execute/q18.py      | 22 +++++++++++++++++
 tpch/execute/q19.py      | 17 ++++++++++++++
 tpch/execute/q2.py       | 32 +++++--------------------
 tpch/execute/q20.py      | 20 ++++++++++++++++
 tpch/execute/q21.py      | 19 +++++++++++++++
 tpch/execute/q3.py       | 25 ++++----------------
 tpch/execute/q4.py       | 21 +++-------------
 tpch/execute/q5.py       | 29 ++++++----------------
 tpch/execute/q6.py       | 15 ++----------
 tpch/execute/q7.py       | 27 +++++++++++++++++++++
 tpch/execute/q9.py       | 35 +++++++++++++++++++++++++++
 tpch/queries/q15.py      | 33 ++++++++++++++++++++++++++
 tpch/queries/q17.py      | 23 ++++++++++++++++++
 tpch/queries/q18.py      | 31 ++++++++++++++++++++++++
 tpch/queries/q19.py      | 39 ++++++++++++++++++++++++++++++
 tpch/queries/q20.py      | 43 +++++++++++++++++++++++++++++++++
 tpch/queries/q21.py      | 43 +++++++++++++++++++++++++++++++++
 tpch/queries/q6.py       |  4 ----
 tpch/queries/q7.py       | 51 ++++++++++++++++++++++++++++++++++++++++
 tpch/queries/q9.py       | 36 ++++++++++++++++++++++++++++
 26 files changed, 544 insertions(+), 164 deletions(-)
 create mode 100644 tpch/execute/q15.py
 create mode 100644 tpch/execute/q17.py
 create mode 100644 tpch/execute/q18.py
 create mode 100644 tpch/execute/q19.py
 create mode 100644 tpch/execute/q20.py
 create mode 100644 tpch/execute/q21.py
 create mode 100644 tpch/execute/q7.py
 create mode 100644 tpch/execute/q9.py
 create mode 100644 tpch/queries/q15.py
 create mode 100644 tpch/queries/q17.py
 create mode 100644 tpch/queries/q18.py
 create mode 100644 tpch/queries/q19.py
 create mode 100644 tpch/queries/q20.py
 create mode 100644 tpch/queries/q21.py
 create mode 100644 tpch/queries/q7.py
 create mode 100644 tpch/queries/q9.py

diff --git a/tpch/execute/__init__.py b/tpch/execute/__init__.py
index e69de29bb..e0c448649 100644
--- a/tpch/execute/__init__.py
+++ b/tpch/execute/__init__.py
@@ -0,0 +1,30 @@
+from pathlib import Path
+
+import dask.dataframe as dd
+import pandas as pd
+import polars as pl
+import pyarrow.parquet as pq
+
+pd.options.mode.copy_on_write = True
+pd.options.future.infer_string = True
+
+lineitem = Path("data") / "lineitem.parquet"
+region = Path("data") / "region.parquet"
+nation = Path("data") / "nation.parquet"
+supplier = Path("data") / "supplier.parquet"
+part = Path("data") / "part.parquet"
+partsupp = Path("data") / "partsupp.parquet"
+orders = Path("data") / "orders.parquet"
+customer = Path("data") / "customer.parquet"
+line_item = Path("data") / "lineitem.parquet"
+
+IO_FUNCS = {
+    "pandas": lambda x: pd.read_parquet(x, engine="pyarrow"),
+    "pandas[pyarrow]": lambda x: pd.read_parquet(
+        x, engine="pyarrow", dtype_backend="pyarrow"
+    ),
+    "polars[eager]": lambda x: pl.read_parquet(x),
+    "polars[lazy]": lambda x: pl.scan_parquet(x),
+    "pyarrow": lambda x: pq.read_table(x),
+    "dask": lambda x: dd.read_parquet(x, engine="pyarrow", dtype_backend="pyarrow"),
+}
diff --git a/tpch/execute/q1.py b/tpch/execute/q1.py
index dd839b292..9889c3af0 100644
--- a/tpch/execute/q1.py
+++ b/tpch/execute/q1.py
@@ -1,26 +1,7 @@
-from pathlib import Path
-
-import dask.dataframe as dd
-import pandas as pd
-import polars as pl
-import pyarrow.parquet as pq
 from queries import q1
 
-pd.options.mode.copy_on_write = True
-pd.options.future.infer_string = True
-
-lineitem = Path("data") / "lineitem.parquet"
-
-IO_FUNCS = {
-    "pandas": lambda x: pd.read_parquet(x, engine="pyarrow"),
-    "pandas[pyarrow]": lambda x: pd.read_parquet(
-        x, engine="pyarrow", dtype_backend="pyarrow"
-    ),
-    "polars[eager]": lambda x: pl.read_parquet(x),
-    "polars[lazy]": lambda x: pl.scan_parquet(x),
-    "pyarrow": lambda x:
pq.read_table(x), - "dask": lambda x: dd.read_parquet(x, engine="pyarrow", dtype_backend="pyarrow"), -} +from . import IO_FUNCS +from . import lineitem print(q1.query(IO_FUNCS["pandas[pyarrow]"](lineitem))) print(q1.query(IO_FUNCS["polars[lazy]"](lineitem)).collect()) diff --git a/tpch/execute/q10.py b/tpch/execute/q10.py index 19e2e7ce0..9876f2aa9 100644 --- a/tpch/execute/q10.py +++ b/tpch/execute/q10.py @@ -1,25 +1,10 @@ -from pathlib import Path - -import pandas as pd -import polars as pl from queries import q10 -pd.options.mode.copy_on_write = True -pd.options.future.infer_string = True - -customer = Path("data") / "customer.parquet" -nation = Path("data") / "nation.parquet" -lineitem = Path("data") / "lineitem.parquet" -orders = Path("data") / "orders.parquet" - -IO_FUNCS = { - "pandas": lambda x: pd.read_parquet(x, engine="pyarrow"), - "pandas[pyarrow]": lambda x: pd.read_parquet( - x, engine="pyarrow", dtype_backend="pyarrow" - ), - "polars[eager]": lambda x: pl.read_parquet(x), - "polars[lazy]": lambda x: pl.scan_parquet(x), -} +from . import IO_FUNCS +from . import customer +from . import lineitem +from . import nation +from . import orders tool = "pandas" fn = IO_FUNCS[tool] diff --git a/tpch/execute/q11.py b/tpch/execute/q11.py index 55161ae6b..82b1936aa 100644 --- a/tpch/execute/q11.py +++ b/tpch/execute/q11.py @@ -1,24 +1,9 @@ -from pathlib import Path - -import pandas as pd -import polars as pl from queries import q11 -pd.options.mode.copy_on_write = True -pd.options.future.infer_string = True - -nation = Path("data") / "nation.parquet" -partsupp = Path("data") / "partsupp.parquet" -supplier = Path("data") / "supplier.parquet" - -IO_FUNCS = { - "pandas": lambda x: pd.read_parquet(x, engine="pyarrow"), - "pandas[pyarrow]": lambda x: pd.read_parquet( - x, engine="pyarrow", dtype_backend="pyarrow" - ), - "polars[eager]": lambda x: pl.read_parquet(x), - "polars[lazy]": lambda x: pl.scan_parquet(x), -} +from . import IO_FUNCS +from . import nation +from . import partsupp +from . import supplier tool = "pandas" fn = IO_FUNCS[tool] diff --git a/tpch/execute/q15.py b/tpch/execute/q15.py new file mode 100644 index 000000000..8fdaf2ab1 --- /dev/null +++ b/tpch/execute/q15.py @@ -0,0 +1,21 @@ +from queries import q15 + +from . import IO_FUNCS +from . import lineitem +from . import supplier + +tool = "pandas" +fn = IO_FUNCS[tool] +print(q15.query(fn(lineitem), fn(supplier))) + +tool = "pandas[pyarrow]" +fn = IO_FUNCS[tool] +print(q15.query(fn(lineitem), fn(supplier))) + +tool = "polars[eager]" +fn = IO_FUNCS[tool] +print(q15.query(fn(lineitem), fn(supplier))) + +tool = "polars[lazy]" +fn = IO_FUNCS[tool] +print(q15.query(fn(lineitem), fn(supplier)).collect()) diff --git a/tpch/execute/q17.py b/tpch/execute/q17.py new file mode 100644 index 000000000..5f2228012 --- /dev/null +++ b/tpch/execute/q17.py @@ -0,0 +1,21 @@ +from queries import q17 + +from . import IO_FUNCS +from . import lineitem +from . import part + +tool = "pandas" +fn = IO_FUNCS[tool] +print(q17.query(fn(lineitem), fn(part))) + +tool = "pandas[pyarrow]" +fn = IO_FUNCS[tool] +print(q17.query(fn(lineitem), fn(part))) + +tool = "polars[eager]" +fn = IO_FUNCS[tool] +print(q17.query(fn(lineitem), fn(part))) + +tool = "polars[lazy]" +fn = IO_FUNCS[tool] +print(q17.query(fn(lineitem), fn(part)).collect()) diff --git a/tpch/execute/q18.py b/tpch/execute/q18.py new file mode 100644 index 000000000..5a59f0e5e --- /dev/null +++ b/tpch/execute/q18.py @@ -0,0 +1,22 @@ +from queries import q18 + +from . import IO_FUNCS +from . 
import customer +from . import lineitem +from . import orders + +tool = "pandas" +fn = IO_FUNCS[tool] +print(q18.query(fn(customer), fn(lineitem), fn(orders))) + +tool = "pandas[pyarrow]" +fn = IO_FUNCS[tool] +print(q18.query(fn(customer), fn(lineitem), fn(orders))) + +tool = "polars[eager]" +fn = IO_FUNCS[tool] +print(q18.query(fn(customer), fn(lineitem), fn(orders))) + +tool = "polars[lazy]" +fn = IO_FUNCS[tool] +print(q18.query(fn(customer), fn(lineitem), fn(orders)).collect()) diff --git a/tpch/execute/q19.py b/tpch/execute/q19.py new file mode 100644 index 000000000..87467064c --- /dev/null +++ b/tpch/execute/q19.py @@ -0,0 +1,17 @@ +from queries import q19 + +from . import IO_FUNCS +from . import lineitem +from . import part + +fn = IO_FUNCS["pandas"] +print(q19.query(fn(lineitem), fn(part))) + +fn = IO_FUNCS["pandas[pyarrow]"] +print(q19.query(fn(lineitem), fn(part))) + +fn = IO_FUNCS["polars[eager]"] +print(q19.query(fn(lineitem), fn(part))) + +fn = IO_FUNCS["polars[lazy]"] +print(q19.query(fn(lineitem), fn(part)).collect()) diff --git a/tpch/execute/q2.py b/tpch/execute/q2.py index 22a7f4317..cd82a9047 100644 --- a/tpch/execute/q2.py +++ b/tpch/execute/q2.py @@ -1,31 +1,11 @@ -from pathlib import Path - -import dask.dataframe as dd -import pandas as pd -import polars as pl -import pyarrow.parquet as pq from queries import q2 -pd.options.mode.copy_on_write = True -pd.options.future.infer_string = True - -region = Path("data") / "region.parquet" -nation = Path("data") / "nation.parquet" -supplier = Path("data") / "supplier.parquet" -part = Path("data") / "part.parquet" -partsupp = Path("data") / "partsupp.parquet" - -IO_FUNCS = { - "pandas": lambda x: pd.read_parquet(x, engine="pyarrow"), - "pandas[pyarrow]": lambda x: pd.read_parquet( - x, engine="pyarrow", dtype_backend="pyarrow" - ), - "polars[eager]": lambda x: pl.read_parquet(x), - "polars[lazy]": lambda x: pl.scan_parquet(x), - "pyarrow": lambda x: pq.read_table(x), - "dask": lambda x: dd.read_parquet(x, engine="pyarrow", dtype_backend="pyarrow"), -} - +from . import IO_FUNCS +from . import nation +from . import part +from . import partsupp +from . import region +from . import supplier tool = "pandas[pyarrow]" fn = IO_FUNCS[tool] diff --git a/tpch/execute/q20.py b/tpch/execute/q20.py new file mode 100644 index 000000000..68d18a6b5 --- /dev/null +++ b/tpch/execute/q20.py @@ -0,0 +1,20 @@ +from queries import q20 + +from . import IO_FUNCS +from . import lineitem +from . import nation +from . import part +from . import partsupp +from . import supplier + +fn = IO_FUNCS["pandas"] +print(q20.query(fn(part), fn(partsupp), fn(nation), fn(lineitem), fn(supplier))) + +fn = IO_FUNCS["pandas[pyarrow]"] +print(q20.query(fn(part), fn(partsupp), fn(nation), fn(lineitem), fn(supplier))) + +fn = IO_FUNCS["polars[eager]"] +print(q20.query(fn(part), fn(partsupp), fn(nation), fn(lineitem), fn(supplier))) + +fn = IO_FUNCS["polars[lazy]"] +print(q20.query(fn(part), fn(partsupp), fn(nation), fn(lineitem), fn(supplier)).collect()) diff --git a/tpch/execute/q21.py b/tpch/execute/q21.py new file mode 100644 index 000000000..693953870 --- /dev/null +++ b/tpch/execute/q21.py @@ -0,0 +1,19 @@ +from queries import q21 + +from . import IO_FUNCS +from . import lineitem +from . import nation +from . import orders +from . 
import supplier + +fn = IO_FUNCS["pandas"] +print(q21.query(fn(lineitem), fn(nation), fn(orders), fn(supplier))) + +fn = IO_FUNCS["pandas[pyarrow]"] +print(q21.query(fn(lineitem), fn(nation), fn(orders), fn(supplier))) + +fn = IO_FUNCS["polars[eager]"] +print(q21.query(fn(lineitem), fn(nation), fn(orders), fn(supplier))) + +fn = IO_FUNCS["polars[lazy]"] +print(q21.query(fn(lineitem), fn(nation), fn(orders), fn(supplier)).collect()) diff --git a/tpch/execute/q3.py b/tpch/execute/q3.py index 30194b5da..8602bb3d0 100644 --- a/tpch/execute/q3.py +++ b/tpch/execute/q3.py @@ -1,26 +1,9 @@ -from pathlib import Path - -import pandas as pd -import polars as pl from queries import q3 -pd.options.mode.copy_on_write = True -pd.options.future.infer_string = True - - -customer = Path("data") / "customer.parquet" -lineitem = Path("data") / "lineitem.parquet" -orders = Path("data") / "orders.parquet" - -IO_FUNCS = { - "pandas": lambda x: pd.read_parquet(x, engine="pyarrow"), - "pandas[pyarrow]": lambda x: pd.read_parquet( - x, engine="pyarrow", dtype_backend="pyarrow" - ), - "polars[eager]": lambda x: pl.read_parquet(x), - "polars[lazy]": lambda x: pl.scan_parquet(x), -} - +from . import IO_FUNCS +from . import customer +from . import lineitem +from . import orders tool = "pandas" fn = IO_FUNCS[tool] diff --git a/tpch/execute/q4.py b/tpch/execute/q4.py index 672a43e17..3e67a9c87 100644 --- a/tpch/execute/q4.py +++ b/tpch/execute/q4.py @@ -1,23 +1,8 @@ -from pathlib import Path - -import pandas as pd -import polars as pl from queries import q4 -pd.options.mode.copy_on_write = True -pd.options.future.infer_string = True - -line_item = Path("data") / "lineitem.parquet" -orders = Path("data") / "orders.parquet" - -IO_FUNCS = { - "pandas": lambda x: pd.read_parquet(x, engine="pyarrow"), - "pandas[pyarrow]": lambda x: pd.read_parquet( - x, engine="pyarrow", dtype_backend="pyarrow" - ), - "polars[eager]": lambda x: pl.read_parquet(x), - "polars[lazy]": lambda x: pl.scan_parquet(x), -} +from . import IO_FUNCS +from . import line_item +from . import orders tool = "pandas" fn = IO_FUNCS[tool] diff --git a/tpch/execute/q5.py b/tpch/execute/q5.py index b77f740d8..317b15fc7 100644 --- a/tpch/execute/q5.py +++ b/tpch/execute/q5.py @@ -1,27 +1,12 @@ -from pathlib import Path - -import pandas as pd -import polars as pl from queries import q5 -pd.options.mode.copy_on_write = True -pd.options.future.infer_string = True - -region = Path("data") / "region.parquet" -nation = Path("data") / "nation.parquet" -customer = Path("data") / "customer.parquet" -line_item = Path("data") / "lineitem.parquet" -orders = Path("data") / "orders.parquet" -supplier = Path("data") / "supplier.parquet" - -IO_FUNCS = { - "pandas": lambda x: pd.read_parquet(x, engine="pyarrow"), - "pandas[pyarrow]": lambda x: pd.read_parquet( - x, engine="pyarrow", dtype_backend="pyarrow" - ), - "polars[eager]": lambda x: pl.read_parquet(x), - "polars[lazy]": lambda x: pl.scan_parquet(x), -} +from . import IO_FUNCS +from . import customer +from . import line_item +from . import nation +from . import orders +from . import region +from . 
import supplier tool = "pandas" fn = IO_FUNCS[tool] diff --git a/tpch/execute/q6.py b/tpch/execute/q6.py index 85b3d9968..adca0e26d 100644 --- a/tpch/execute/q6.py +++ b/tpch/execute/q6.py @@ -1,18 +1,7 @@ -from pathlib import Path - -import pandas as pd -import polars as pl from queries import q6 -lineitem = Path("data") / "lineitem.parquet" -IO_FUNCS = { - "pandas": lambda x: pd.read_parquet(x, engine="pyarrow"), - "pandas[pyarrow]": lambda x: pd.read_parquet( - x, engine="pyarrow", dtype_backend="pyarrow" - ), - "polars[eager]": lambda x: pl.read_parquet(x), - "polars[lazy]": lambda x: pl.scan_parquet(x), -} +from . import IO_FUNCS +from . import lineitem tool = "pandas" fn = IO_FUNCS[tool] diff --git a/tpch/execute/q7.py b/tpch/execute/q7.py new file mode 100644 index 000000000..43e110a72 --- /dev/null +++ b/tpch/execute/q7.py @@ -0,0 +1,27 @@ +from queries import q7 + +from . import IO_FUNCS +from . import customer +from . import lineitem +from . import nation +from . import orders +from . import supplier + +tool = "pandas" +fn = IO_FUNCS[tool] +print(q7.query(fn(nation), fn(customer), fn(lineitem), fn(orders), fn(supplier))) + + +tool = "pandas[pyarrow]" +fn = IO_FUNCS[tool] +print(q7.query(fn(nation), fn(customer), fn(lineitem), fn(orders), fn(supplier))) + +tool = "polars[eager]" +fn = IO_FUNCS[tool] +print(q7.query(fn(nation), fn(customer), fn(lineitem), fn(orders), fn(supplier))) + +tool = "polars[lazy]" +fn = IO_FUNCS[tool] +print( + q7.query(fn(nation), fn(customer), fn(lineitem), fn(orders), fn(supplier)).collect() +) diff --git a/tpch/execute/q9.py b/tpch/execute/q9.py new file mode 100644 index 000000000..9ccbe35b7 --- /dev/null +++ b/tpch/execute/q9.py @@ -0,0 +1,35 @@ +from queries import q9 + +from . import IO_FUNCS +from . import lineitem +from . import nation +from . import orders +from . import part +from . import partsupp +from . 
import supplier + +tool = "pandas" +fn = IO_FUNCS[tool] +print( + q9.query(fn(part), fn(partsupp), fn(nation), fn(lineitem), fn(orders), fn(supplier)) +) + +tool = "pandas[pyarrow]" +fn = IO_FUNCS[tool] +print( + q9.query(fn(part), fn(partsupp), fn(nation), fn(lineitem), fn(orders), fn(supplier)) +) + +tool = "polars[eager]" +fn = IO_FUNCS[tool] +print( + q9.query(fn(part), fn(partsupp), fn(nation), fn(lineitem), fn(orders), fn(supplier)) +) + +tool = "polars[lazy]" +fn = IO_FUNCS[tool] +print( + q9.query( + fn(part), fn(partsupp), fn(nation), fn(lineitem), fn(orders), fn(supplier) + ).collect() +) diff --git a/tpch/queries/q15.py b/tpch/queries/q15.py new file mode 100644 index 000000000..1ebae57d6 --- /dev/null +++ b/tpch/queries/q15.py @@ -0,0 +1,33 @@ +from datetime import datetime + +import narwhals as nw +from narwhals.typing import FrameT + + +@nw.narwhalify +def query( + lineitem_ds: FrameT, + supplier_ds: FrameT, +) -> FrameT: + var1 = datetime(1996, 1, 1) + var2 = datetime(1996, 4, 1) + + revenue = ( + lineitem_ds.filter(nw.col("l_shipdate").is_between(var1, var2, closed="left")) + .with_columns( + (nw.col("l_extendedprice") * (1 - nw.col("l_discount"))).alias( + "total_revenue" + ) + ) + .group_by("l_suppkey") + .agg(nw.sum("total_revenue")) + .select(nw.col("l_suppkey").alias("supplier_no"), nw.col("total_revenue")) + ) + + return ( + supplier_ds.join(revenue, left_on="s_suppkey", right_on="supplier_no") + .filter(nw.col("total_revenue") == nw.col("total_revenue").max()) + .with_columns(nw.col("total_revenue").round(2)) + .select("s_suppkey", "s_name", "s_address", "s_phone", "total_revenue") + .sort("s_suppkey") + ) diff --git a/tpch/queries/q17.py b/tpch/queries/q17.py new file mode 100644 index 000000000..5d35929d1 --- /dev/null +++ b/tpch/queries/q17.py @@ -0,0 +1,23 @@ +import narwhals as nw +from narwhals.typing import FrameT + + +@nw.narwhalify +def query(lineitem_ds: FrameT, part_ds: FrameT) -> FrameT: + var1 = "Brand#23" + var2 = "MED BOX" + + query1 = ( + part_ds.filter(nw.col("p_brand") == var1) + .filter(nw.col("p_container") == var2) + .join(lineitem_ds, how="left", left_on="p_partkey", right_on="l_partkey") + ) + + return ( + query1.group_by("p_partkey") + .agg((0.2 * nw.col("l_quantity").mean()).alias("avg_quantity")) + .select(nw.col("p_partkey").alias("key"), nw.col("avg_quantity")) + .join(query1, left_on="key", right_on="p_partkey") + .filter(nw.col("l_quantity") < nw.col("avg_quantity")) + .select((nw.col("l_extendedprice").sum() / 7.0).round(2).alias("avg_yearly")) + ) diff --git a/tpch/queries/q18.py b/tpch/queries/q18.py new file mode 100644 index 000000000..d3d183176 --- /dev/null +++ b/tpch/queries/q18.py @@ -0,0 +1,31 @@ +import narwhals as nw +from narwhals.typing import FrameT + + +@nw.narwhalify +def query(customer_ds: FrameT, lineitem_ds: FrameT, orders_ds: FrameT) -> FrameT: + var1 = 300 + + query1 = ( + lineitem_ds.group_by("l_orderkey") + .agg(nw.col("l_quantity").sum().alias("sum_quantity")) + .filter(nw.col("sum_quantity") > var1) + ) + + return ( + orders_ds.join(query1, left_on="o_orderkey", right_on="l_orderkey", how="semi") + .join(lineitem_ds, left_on="o_orderkey", right_on="l_orderkey") + .join(customer_ds, left_on="o_custkey", right_on="c_custkey") + .group_by("c_name", "o_custkey", "o_orderkey", "o_orderdate", "o_totalprice") + .agg(nw.col("l_quantity").sum().alias("col6")) + .select( + nw.col("c_name"), + nw.col("o_custkey").alias("c_custkey"), + nw.col("o_orderkey"), + nw.col("o_orderdate").alias("o_orderdat"), + 
nw.col("o_totalprice"), + nw.col("col6"), + ) + .sort(by=["o_totalprice", "o_orderdat"], descending=[True, False]) + .head(100) + ) diff --git a/tpch/queries/q19.py b/tpch/queries/q19.py new file mode 100644 index 000000000..bcab36e9a --- /dev/null +++ b/tpch/queries/q19.py @@ -0,0 +1,39 @@ +import narwhals as nw +from narwhals.typing import FrameT + + +@nw.narwhalify +def query(lineitem_ds: FrameT, part_ds: FrameT) -> FrameT: + return ( + part_ds.join(lineitem_ds, left_on="p_partkey", right_on="l_partkey") + .filter(nw.col("l_shipmode").is_in(["AIR", "AIR REG"])) + .filter(nw.col("l_shipinstruct") == "DELIVER IN PERSON") + .filter( + ( + (nw.col("p_brand") == "Brand#12") + & nw.col("p_container").is_in(["SM CASE", "SM BOX", "SM PACK", "SM PKG"]) + & (nw.col("l_quantity").is_between(1, 11)) + & (nw.col("p_size").is_between(1, 5)) + ) + | ( + (nw.col("p_brand") == "Brand#23") + & nw.col("p_container").is_in( + ["MED BAG", "MED BOX", "MED PKG", "MED PACK"] + ) + & (nw.col("l_quantity").is_between(10, 20)) + & (nw.col("p_size").is_between(1, 10)) + ) + | ( + (nw.col("p_brand") == "Brand#34") + & nw.col("p_container").is_in(["LG CASE", "LG BOX", "LG PACK", "LG PKG"]) + & (nw.col("l_quantity").is_between(20, 30)) + & (nw.col("p_size").is_between(1, 15)) + ) + ) + .select( + (nw.col("l_extendedprice") * (1 - nw.col("l_discount"))) + .sum() + .round(2) + .alias("revenue") + ) + ) diff --git a/tpch/queries/q20.py b/tpch/queries/q20.py new file mode 100644 index 000000000..d9014f7b8 --- /dev/null +++ b/tpch/queries/q20.py @@ -0,0 +1,43 @@ +from datetime import datetime + +import narwhals as nw +from narwhals.typing import FrameT + + +@nw.narwhalify +def query( + part_ds: FrameT, + partsupp_ds: FrameT, + nation_ds: FrameT, + lineitem_ds: FrameT, + supplier_ds: FrameT, +) -> FrameT: + var1 = datetime(1994, 1, 1) + var2 = datetime(1995, 1, 1) + var3 = "CANADA" + var4 = "forest" + + query1 = ( + lineitem_ds.filter(nw.col("l_shipdate").is_between(var1, var2, closed="left")) + .group_by("l_partkey", "l_suppkey") + .agg((nw.col("l_quantity").sum()).alias("sum_quantity")) + .with_columns(sum_quantity=nw.col("sum_quantity") * 0.5) + ) + query2 = nation_ds.filter(nw.col("n_name") == var3) + query3 = supplier_ds.join(query2, left_on="s_nationkey", right_on="n_nationkey") + + return ( + part_ds.filter(nw.col("p_name").str.starts_with(var4)) + .select(nw.col("p_partkey").unique()) + .join(partsupp_ds, left_on="p_partkey", right_on="ps_partkey") + .join( + query1, + left_on=["ps_suppkey", "p_partkey"], + right_on=["l_suppkey", "l_partkey"], + ) + .filter(nw.col("ps_availqty") > nw.col("sum_quantity")) + .select(nw.col("ps_suppkey").unique()) + .join(query3, left_on="ps_suppkey", right_on="s_suppkey") + .select("s_name", "s_address") + .sort("s_name") + ) diff --git a/tpch/queries/q21.py b/tpch/queries/q21.py new file mode 100644 index 000000000..d10ff394f --- /dev/null +++ b/tpch/queries/q21.py @@ -0,0 +1,43 @@ +import narwhals as nw +from narwhals.typing import FrameT + + +@nw.narwhalify +def query( + lineitem: FrameT, + nation: FrameT, + orders: FrameT, + supplier: FrameT, +) -> FrameT: + var1 = "SAUDI ARABIA" + + q1 = ( + lineitem.group_by("l_orderkey") + .agg(nw.len().alias("n_supp_by_order")) + .filter(nw.col("n_supp_by_order") > 1) + .join( + lineitem.filter(nw.col("l_receiptdate") > nw.col("l_commitdate")), + left_on="l_orderkey", + right_on="l_orderkey", + ) + ) + + return ( + q1.group_by("l_orderkey") + .agg(nw.len().alias("n_supp_by_order")) + .join( + q1, + left_on="l_orderkey", + 
right_on="l_orderkey", + ) + .join(supplier, left_on="l_suppkey", right_on="s_suppkey") + .join(nation, left_on="s_nationkey", right_on="n_nationkey") + .join(orders, left_on="l_orderkey", right_on="o_orderkey") + .filter(nw.col("n_supp_by_order") == 1) + .filter(nw.col("n_name") == var1) + .filter(nw.col("o_orderstatus") == "F") + .group_by("s_name") + .agg(nw.len().alias("numwait")) + .sort(by=["numwait", "s_name"], descending=[True, False]) + .head(100) + ) diff --git a/tpch/queries/q6.py b/tpch/queries/q6.py index 6a9b5c1d2..67f0ac785 100644 --- a/tpch/queries/q6.py +++ b/tpch/queries/q6.py @@ -1,12 +1,8 @@ from datetime import datetime -import pandas as pd - import narwhals as nw from narwhals.typing import FrameT -pd.options.mode.copy_on_write = True - @nw.narwhalify def query(line_item_ds: FrameT) -> FrameT: diff --git a/tpch/queries/q7.py b/tpch/queries/q7.py new file mode 100644 index 000000000..ec0946ac3 --- /dev/null +++ b/tpch/queries/q7.py @@ -0,0 +1,51 @@ +from datetime import datetime + +import narwhals as nw +from narwhals.typing import FrameT + + +@nw.narwhalify +def query( + nation_ds: FrameT, + customer_ds: FrameT, + line_item_ds: FrameT, + orders_ds: FrameT, + supplier_ds: FrameT, +) -> FrameT: + n1 = nation_ds.filter(nw.col("n_name") == "FRANCE") + n2 = nation_ds.filter(nw.col("n_name") == "GERMANY") + + var_1 = datetime(1995, 1, 1) + var_2 = datetime(1996, 12, 31) + + df1 = ( + customer_ds.join(n1, left_on="c_nationkey", right_on="n_nationkey") + .join(orders_ds, left_on="c_custkey", right_on="o_custkey") + .rename({"n_name": "cust_nation"}) + .join(line_item_ds, left_on="o_orderkey", right_on="l_orderkey") + .join(supplier_ds, left_on="l_suppkey", right_on="s_suppkey") + .join(n2, left_on="s_nationkey", right_on="n_nationkey") + .rename({"n_name": "supp_nation"}) + ) + + df2 = ( + customer_ds.join(n2, left_on="c_nationkey", right_on="n_nationkey") + .join(orders_ds, left_on="c_custkey", right_on="o_custkey") + .rename({"n_name": "cust_nation"}) + .join(line_item_ds, left_on="o_orderkey", right_on="l_orderkey") + .join(supplier_ds, left_on="l_suppkey", right_on="s_suppkey") + .join(n1, left_on="s_nationkey", right_on="n_nationkey") + .rename({"n_name": "supp_nation"}) + ) + + return ( + nw.concat([df1, df2]) + .filter(nw.col("l_shipdate").is_between(var_1, var_2)) + .with_columns( + (nw.col("l_extendedprice") * (1 - nw.col("l_discount"))).alias("volume") + ) + .with_columns(nw.col("l_shipdate").dt.year().alias("l_year")) + .group_by("supp_nation", "cust_nation", "l_year") + .agg(nw.sum("volume").alias("revenue")) + .sort(by=["supp_nation", "cust_nation", "l_year"]) + ) diff --git a/tpch/queries/q9.py b/tpch/queries/q9.py new file mode 100644 index 000000000..09dff4787 --- /dev/null +++ b/tpch/queries/q9.py @@ -0,0 +1,36 @@ +import narwhals as nw +from narwhals.typing import FrameT + + +@nw.narwhalify +def query( + part_ds: FrameT, + partsupp_ds: FrameT, + nation_ds: FrameT, + lineitem_ds: FrameT, + orders_ds: FrameT, + supplier_ds: FrameT, +) -> FrameT: + return ( + part_ds.join(partsupp_ds, left_on="p_partkey", right_on="ps_partkey") + .join(supplier_ds, left_on="ps_suppkey", right_on="s_suppkey") + .join( + lineitem_ds, + left_on=["p_partkey", "ps_suppkey"], + right_on=["l_partkey", "l_suppkey"], + ) + .join(orders_ds, left_on="l_orderkey", right_on="o_orderkey") + .join(nation_ds, left_on="s_nationkey", right_on="n_nationkey") + .filter(nw.col("p_name").str.contains("green")) + .select( + nw.col("n_name").alias("nation"), + 
nw.col("o_orderdate").dt.year().alias("o_year"), + ( + nw.col("l_extendedprice") * (1 - nw.col("l_discount")) + - nw.col("ps_supplycost") * nw.col("l_quantity") + ).alias("amount"), + ) + .group_by("nation", "o_year") + .agg(nw.sum("amount").alias("sum_profit")) + .sort(by=["nation", "o_year"], descending=[False, True]) + ) From cb82d26b7d9d6a1aef882aa6fcbda79a612f1223 Mon Sep 17 00:00:00 2001 From: Francesco Bruzzesi <42817048+FBruzzesi@users.noreply.github.com> Date: Wed, 4 Sep 2024 17:51:36 +0200 Subject: [PATCH 03/30] feat: dask lit with dtype (#909) --- narwhals/_dask/namespace.py | 15 ++++++++++++--- tests/frame/lit_test.py | 6 +----- 2 files changed, 13 insertions(+), 8 deletions(-) diff --git a/narwhals/_dask/namespace.py b/narwhals/_dask/namespace.py index 89ca372ec..1668ee323 100644 --- a/narwhals/_dask/namespace.py +++ b/narwhals/_dask/namespace.py @@ -12,6 +12,7 @@ from narwhals._dask.dataframe import DaskLazyFrame from narwhals._dask.expr import DaskExpr from narwhals._dask.selectors import DaskSelectorNamespace +from narwhals._dask.utils import reverse_translate_dtype from narwhals._dask.utils import validate_comparand from narwhals._expression_parsing import parse_into_exprs @@ -19,6 +20,7 @@ import dask_expr from narwhals._dask.typing import IntoDaskExpr + from narwhals.dtypes import DType class DaskNamespace: @@ -70,10 +72,17 @@ def col(self, *column_names: str) -> DaskExpr: ) def lit(self, value: Any, dtype: dtypes.DType | None) -> DaskExpr: - # TODO @FBruzzesi: cast to dtype once `narwhals_to_native_dtype` is implemented. - # It should be enough to add `.astype(narwhals_to_native_dtype(dtype))` + def convert_if_dtype( + series: dask_expr.Series, dtype: DType | type[DType] + ) -> dask_expr.Series: + return series.astype(reverse_translate_dtype(dtype)) if dtype else series + return DaskExpr( - lambda df: [df._native_frame.assign(lit=value).loc[:, "lit"]], + lambda df: [ + df._native_frame.assign(lit=value) + .loc[:, "lit"] + .pipe(convert_if_dtype, dtype) + ], depth=0, function_name="lit", root_names=None, diff --git a/tests/frame/lit_test.py b/tests/frame/lit_test.py index 328e4d8e0..e5756e035 100644 --- a/tests/frame/lit_test.py +++ b/tests/frame/lit_test.py @@ -17,11 +17,7 @@ ("dtype", "expected_lit"), [(None, [2, 2, 2]), (nw.String, ["2", "2", "2"]), (nw.Float32, [2.0, 2.0, 2.0])], ) -def test_lit( - constructor: Any, dtype: DType | None, expected_lit: list[Any], request: Any -) -> None: - if "dask" in str(constructor) and dtype == nw.String: - request.applymarker(pytest.mark.xfail) +def test_lit(constructor: Any, dtype: DType | None, expected_lit: list[Any]) -> None: data = {"a": [1, 3, 2], "b": [4, 4, 6], "z": [7.0, 8, 9]} df_raw = constructor(data) df = nw.from_native(df_raw).lazy() From d2d10cecab3dead7f5bd2909f6e0fff5ff89a12f Mon Sep 17 00:00:00 2001 From: Liam Connors Date: Fri, 6 Sep 2024 03:41:46 -0400 Subject: [PATCH 04/30] xfail dt.date tests for cuDF (#912) --- tests/expr_and_series/dt/datetime_attributes_test.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/tests/expr_and_series/dt/datetime_attributes_test.py b/tests/expr_and_series/dt/datetime_attributes_test.py index 4d59567df..22e20590e 100644 --- a/tests/expr_and_series/dt/datetime_attributes_test.py +++ b/tests/expr_and_series/dt/datetime_attributes_test.py @@ -42,6 +42,8 @@ def test_datetime_attributes( and "pyarrow" not in str(constructor) ): request.applymarker(pytest.mark.xfail) + if attribute == "date" and "cudf" in str(constructor): + request.applymarker(pytest.mark.xfail) df = 
From d2d10cecab3dead7f5bd2909f6e0fff5ff89a12f Mon Sep 17 00:00:00 2001
From: Liam Connors
Date: Fri, 6 Sep 2024 03:41:46 -0400
Subject: [PATCH 04/30] xfail dt.date tests for cuDF (#912)

---
 tests/expr_and_series/dt/datetime_attributes_test.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/tests/expr_and_series/dt/datetime_attributes_test.py b/tests/expr_and_series/dt/datetime_attributes_test.py
index 4d59567df..22e20590e 100644
--- a/tests/expr_and_series/dt/datetime_attributes_test.py
+++ b/tests/expr_and_series/dt/datetime_attributes_test.py
@@ -42,6 +42,8 @@ def test_datetime_attributes(
         and "pyarrow" not in str(constructor)
     ):
         request.applymarker(pytest.mark.xfail)
+    if attribute == "date" and "cudf" in str(constructor):
+        request.applymarker(pytest.mark.xfail)
 
     df = nw.from_native(constructor(data))
     result = df.select(getattr(nw.col("a").dt, attribute)())
@@ -73,6 +75,8 @@ def test_datetime_attributes_series(
         and "pyarrow" not in str(constructor_eager)
     ):
         request.applymarker(pytest.mark.xfail)
+    if attribute == "date" and "cudf" in str(constructor_eager):
+        request.applymarker(pytest.mark.xfail)
 
     df = nw.from_native(constructor_eager(data), eager_only=True)
     result = df.select(getattr(df["a"].dt, attribute)())
@@ -82,6 +86,8 @@ def test_datetime_chained_attributes(request: Any, constructor_eager: Any) -> None:
     if "pandas" in str(constructor_eager) and "pyarrow" not in str(constructor_eager):
         request.applymarker(pytest.mark.xfail)
+    if "cudf" in str(constructor_eager):
+        request.applymarker(pytest.mark.xfail)
 
     df = nw.from_native(constructor_eager(data), eager_only=True)
     result = df.select(df["a"].dt.date().dt.year())

From fdc8f88be7c5f10bfc0d87040f8482b3ee56bf31 Mon Sep 17 00:00:00 2001
From: Marco Edward Gorelli
Date: Fri, 6 Sep 2024 08:46:23 +0100
Subject: [PATCH 05/30] Update extremes.yml (#913)

---
 .github/workflows/extremes.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/extremes.yml b/.github/workflows/extremes.yml
index ae9c79009..7e1a5586e 100644
--- a/.github/workflows/extremes.yml
+++ b/.github/workflows/extremes.yml
@@ -104,7 +104,7 @@ jobs:
       - name: uninstall pandas
         run: uv pip uninstall pandas --system
       - name: install-pandas-nightly
-        run: uv pip install --pre --extra-index-url https://pypi.anaconda.org/scientific-python-nightly-wheels/simple pandas --system
+        run: uv pip install --prerelease=allow --pre --extra-index-url https://pypi.anaconda.org/scientific-python-nightly-wheels/simple pandas --system
       - name: uninstall numpy
         run: uv pip uninstall numpy --system
       - name: install numpy nightly

From 029f590baf01869dd18b7b8ddfd518eaeba8af54 Mon Sep 17 00:00:00 2001
From: raisadz <34237447+raisadz@users.noreply.github.com>
Date: Fri, 6 Sep 2024 09:17:49 +0100
Subject: [PATCH 06/30] feat: Add join_asof support for pandas and dask (#911)

---
 docs/api-reference/dataframe.md    |   1 +
 docs/api-reference/lazyframe.md    |   1 +
 narwhals/_arrow/dataframe.py       |  11 ++
 narwhals/_dask/dataframe.py        |  20 +++
 narwhals/_pandas_like/dataframe.py |  20 +++
 narwhals/dataframe.py              | 202 +++++++++++++++++++++++++++++
 tests/frame/join_test.py           | 112 ++++++++++++++++
 7 files changed, 367 insertions(+)

diff --git a/docs/api-reference/dataframe.md b/docs/api-reference/dataframe.md
index c144b4af0..f78b4e3da 100644
--- a/docs/api-reference/dataframe.md
+++ b/docs/api-reference/dataframe.md
@@ -22,6 +22,7 @@
     - item
     - iter_rows
     - join
+    - join_asof
     - lazy
     - null_count
     - pipe
diff --git a/docs/api-reference/lazyframe.md b/docs/api-reference/lazyframe.md
index 9ca6a9745..5d472bab6 100644
--- a/docs/api-reference/lazyframe.md
+++ b/docs/api-reference/lazyframe.md
@@ -15,6 +15,7 @@
     - group_by
     - head
     - join
+    - join_asof
     - lazy
     - pipe
     - rename
diff --git a/narwhals/_arrow/dataframe.py b/narwhals/_arrow/dataframe.py
index f6cb47101..755a92416 100644
--- a/narwhals/_arrow/dataframe.py
+++ b/narwhals/_arrow/dataframe.py
@@ -315,6 +315,17 @@ def join(
             ),
         )
 
+    def join_asof(
+        self,
+        other: Self,
+        *,
+        left_on: str,
+        right_on: str,
+        strategy: Literal["backward", "forward", "nearest"] = "backward",
+    ) -> Self:
+        msg = "join_asof is not yet supported on PyArrow tables"
+        raise NotImplementedError(msg)
+
     def drop(self: Self, columns: list[str], strict: bool) -> Self:  # noqa: FBT001
         to_drop = parse_columns_to_drop(
compliant_frame=self, columns=columns, strict=strict diff --git a/narwhals/_dask/dataframe.py b/narwhals/_dask/dataframe.py index f11a88903..91a7e96a9 100644 --- a/narwhals/_dask/dataframe.py +++ b/narwhals/_dask/dataframe.py @@ -298,6 +298,26 @@ def join( ), ) + def join_asof( + self, + other: Self, + *, + left_on: str, + right_on: str, + strategy: Literal["backward", "forward", "nearest"] = "backward", + ) -> Self: + plx = self.__native_namespace__() + return self._from_native_frame( + plx.merge_asof( + self._native_frame, + other._native_frame, + left_on=left_on, + right_on=right_on, + direction=strategy, + suffixes=("", "_right"), + ), + ) + def group_by(self, *by: str) -> DaskLazyGroupBy: from narwhals._dask.group_by import DaskLazyGroupBy diff --git a/narwhals/_pandas_like/dataframe.py b/narwhals/_pandas_like/dataframe.py index 193955cbd..0425e28e1 100644 --- a/narwhals/_pandas_like/dataframe.py +++ b/narwhals/_pandas_like/dataframe.py @@ -509,6 +509,26 @@ def join( ), ) + def join_asof( + self, + other: Self, + *, + left_on: str, + right_on: str, + strategy: Literal["backward", "forward", "nearest"] = "backward", + ) -> Self: + plx = self.__native_namespace__() + return self._from_native_frame( + plx.merge_asof( + self._native_frame, + other._native_frame, + left_on=left_on, + right_on=right_on, + direction=strategy, + suffixes=("", "_right"), + ), + ) + # --- partial reduction --- def head(self, n: int) -> Self: diff --git a/narwhals/dataframe.py b/narwhals/dataframe.py index 9276fda10..da1ee1dc8 100644 --- a/narwhals/dataframe.py +++ b/narwhals/dataframe.py @@ -214,6 +214,29 @@ def gather_every(self: Self, n: int, offset: int = 0) -> Self: self._compliant_frame.gather_every(n=n, offset=offset) ) + def join_asof( + self, + other: Self, + *, + left_on: str, + right_on: str, + strategy: Literal["backward", "forward", "nearest"] = "backward", + ) -> Self: + _supported_strategies = ("backward", "forward", "nearest") + + if strategy not in _supported_strategies: + msg = f"Only the following strategies are supported: {_supported_strategies}; found '{strategy}'." + raise NotImplementedError(msg) + + return self._from_compliant_dataframe( + self._compliant_frame.join_asof( + self._extract_compliant(other), + left_on=left_on, + right_on=right_on, + strategy=strategy, + ) + ) + class DataFrame(BaseFrame[FrameT]): """ @@ -1839,6 +1862,96 @@ def join( """ return super().join(other, how=how, left_on=left_on, right_on=right_on) + def join_asof( + self, + other: Self, + *, + left_on: str, + right_on: str, + strategy: Literal["backward", "forward", "nearest"] = "backward", + ) -> Self: + """ + Perform an asof join. + + This is similar to a left-join except that we match on nearest key rather than equal keys. + + Both DataFrames must be sorted by the asof_join key. + + Arguments: + other: DataFrame to join with. + + left_on: Name(s) of the left join column(s). + + right_on: Name(s) of the right join column(s). + + strategy: Join strategy. The default is "backward". + + * *backward*: selects the last row in the right DataFrame whose "on" key is less than or equal to the left's key. + * *forward*: selects the first row in the right DataFrame whose "on" key is greater than or equal to the left's key. + * *nearest*: search selects the last row in the right DataFrame whose value is nearest to the left's key. + + Returns: + A new joined DataFrame + + Examples: + >>> from datetime import datetime + >>> import narwhals as nw + >>> import pandas as pd + >>> import polars as pl + >>> data_gdp = { + ... 
"datetime": [ + ... datetime(2016, 1, 1), + ... datetime(2017, 1, 1), + ... datetime(2018, 1, 1), + ... datetime(2019, 1, 1), + ... datetime(2020, 1, 1), + ... ], + ... "gdp": [4164, 4411, 4566, 4696, 4827], + ... } + >>> data_population = { + ... "datetime": [ + ... datetime(2016, 3, 1), + ... datetime(2018, 8, 1), + ... datetime(2019, 1, 1), + ... ], + ... "population": [82.19, 82.66, 83.12], + ... } + >>> gdp_pd = pd.DataFrame(data_gdp) + >>> population_pd = pd.DataFrame(data_population) + + >>> gdp_pl = pl.DataFrame(data_gdp).sort("datetime") + >>> population_pl = pl.DataFrame(data_population).sort("datetime") + + Let's define a dataframe-agnostic function in which we join over "datetime" column: + + >>> @nw.narwhalify + ... def join_asof_date(df, other_any, strategy): + ... return df.join_asof( + ... other_any, left_on="datetime", right_on="datetime", strategy=strategy + ... ) + >>> # We can now pass either pandas or Polars to the function: + >>> join_asof_date(population_pd, gdp_pd, strategy="backward") + datetime population gdp + 0 2016-03-01 82.19 4164 + 1 2018-08-01 82.66 4566 + 2 2019-01-01 83.12 4696 + + >>> join_asof_date(population_pl, gdp_pl, strategy="backward") + shape: (3, 3) + ┌─────────────────────┬────────────┬──────┐ + │ datetime ┆ population ┆ gdp │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ f64 ┆ i64 │ + ╞═════════════════════╪════════════╪══════╡ + │ 2016-03-01 00:00:00 ┆ 82.19 ┆ 4164 │ + │ 2018-08-01 00:00:00 ┆ 82.66 ┆ 4566 │ + │ 2019-01-01 00:00:00 ┆ 83.12 ┆ 4696 │ + └─────────────────────┴────────────┴──────┘ + """ + return super().join_asof( + other, left_on=left_on, right_on=right_on, strategy=strategy + ) + # --- descriptive --- def is_duplicated(self: Self) -> Series: r""" @@ -3378,6 +3491,95 @@ def join( """ return super().join(other, how=how, left_on=left_on, right_on=right_on) + def join_asof( + self, + other: Self, + *, + left_on: str, + right_on: str, + strategy: Literal["backward", "forward", "nearest"] = "backward", + ) -> Self: + """ + Perform an asof join. + + This is similar to a left-join except that we match on nearest key rather than equal keys. + + Both DataFrames must be sorted by the asof_join key. + + Arguments: + other: DataFrame to join with. + + left_on: Name(s) of the left join column(s). + + right_on: Name(s) of the right join column(s). + + strategy: Join strategy. The default is "backward". + + * *backward*: selects the last row in the right DataFrame whose "on" key is less than or equal to the left's key. + * *forward*: selects the first row in the right DataFrame whose "on" key is greater than or equal to the left's key. + * *nearest*: search selects the last row in the right DataFrame whose value is nearest to the left's key. + + Returns: + A new joined DataFrame + + Examples: + >>> from datetime import datetime + >>> import narwhals as nw + >>> import pandas as pd + >>> import polars as pl + >>> data_gdp = { + ... "datetime": [ + ... datetime(2016, 1, 1), + ... datetime(2017, 1, 1), + ... datetime(2018, 1, 1), + ... datetime(2019, 1, 1), + ... datetime(2020, 1, 1), + ... ], + ... "gdp": [4164, 4411, 4566, 4696, 4827], + ... } + >>> data_population = { + ... "datetime": [ + ... datetime(2016, 3, 1), + ... datetime(2018, 8, 1), + ... datetime(2019, 1, 1), + ... ], + ... "population": [82.19, 82.66, 83.12], + ... 
} + >>> gdp_pd = pd.DataFrame(data_gdp) + >>> population_pd = pd.DataFrame(data_population) + >>> gdp_pl = pl.LazyFrame(data_gdp).sort("datetime") + >>> population_pl = pl.LazyFrame(data_population).sort("datetime") + + Let's define a dataframe-agnostic function in which we join over "datetime" column: + + >>> @nw.narwhalify + ... def join_asof_date(df, other_any, strategy): + ... return df.join_asof( + ... other_any, left_on="datetime", right_on="datetime", strategy=strategy + ... ) + >>> # We can now pass either pandas or Polars to the function: + >>> join_asof_date(population_pd, gdp_pd, strategy="backward") + datetime population gdp + 0 2016-03-01 82.19 4164 + 1 2018-08-01 82.66 4566 + 2 2019-01-01 83.12 4696 + + >>> join_asof_date(population_pl, gdp_pl, strategy="backward").collect() + shape: (3, 3) + ┌─────────────────────┬────────────┬──────┐ + │ datetime ┆ population ┆ gdp │ + │ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ f64 ┆ i64 │ + ╞═════════════════════╪════════════╪══════╡ + │ 2016-03-01 00:00:00 ┆ 82.19 ┆ 4164 │ + │ 2018-08-01 00:00:00 ┆ 82.66 ┆ 4566 │ + │ 2019-01-01 00:00:00 ┆ 83.12 ┆ 4696 │ + └─────────────────────┴────────────┴──────┘ + """ + return super().join_asof( + other, left_on=left_on, right_on=right_on, strategy=strategy + ) + def clone(self) -> Self: r""" Create a copy of this DataFrame. diff --git a/tests/frame/join_test.py b/tests/frame/join_test.py index e6dfad634..c9119e204 100644 --- a/tests/frame/join_test.py +++ b/tests/frame/join_test.py @@ -1,6 +1,7 @@ from __future__ import annotations import re +from datetime import datetime from typing import Any import pandas as pd @@ -8,6 +9,7 @@ import narwhals.stable.v1 as nw from narwhals.utils import Implementation +from narwhals.utils import parse_version from tests.utils import compare_dicts @@ -202,3 +204,113 @@ def test_left_join_overlapping_column(constructor: Any) -> None: "index": [0, 1, 2], } compare_dicts(result, expected) + + +def test_joinasof_numeric(constructor: Any, request: Any) -> None: + if "pyarrow_table" in str(constructor): + request.applymarker(pytest.mark.xfail) + if parse_version(pd.__version__) < (2, 1) and ( + ("pandas_pyarrow" in str(constructor)) or ("pandas_nullable" in str(constructor)) + ): + request.applymarker(pytest.mark.xfail) + df = nw.from_native(constructor({"a": [1, 5, 10], "val": ["a", "b", "c"]})).sort("a") + df_right = nw.from_native( + constructor({"a": [1, 2, 3, 6, 7], "val": [1, 2, 3, 6, 7]}) + ).sort("a") + result_backward = df.join_asof(df_right, left_on="a", right_on="a") # type: ignore[arg-type] + result_forward = df.join_asof(df_right, left_on="a", right_on="a", strategy="forward") # type: ignore[arg-type] + result_nearest = df.join_asof(df_right, left_on="a", right_on="a", strategy="nearest") # type: ignore[arg-type] + expected_backward = { + "a": [1, 5, 10], + "val": ["a", "b", "c"], + "val_right": [1, 3, 7], + } + expected_forward = { + "a": [1, 5, 10], + "val": ["a", "b", "c"], + "val_right": [1, 6, float("nan")], + } + expected_nearest = { + "a": [1, 5, 10], + "val": ["a", "b", "c"], + "val_right": [1, 6, 7], + } + compare_dicts(result_backward, expected_backward) + compare_dicts(result_forward, expected_forward) + compare_dicts(result_nearest, expected_nearest) + + +def test_joinasof_time(constructor: Any, request: Any) -> None: + if "pyarrow_table" in str(constructor): + request.applymarker(pytest.mark.xfail) + if parse_version(pd.__version__) < (2, 1) and ("pandas_pyarrow" in str(constructor)): + request.applymarker(pytest.mark.xfail) + df = nw.from_native( + 
constructor( + { + "datetime": [ + datetime(2016, 3, 1), + datetime(2018, 8, 1), + datetime(2019, 1, 1), + ], + "population": [82.19, 82.66, 83.12], + } + ) + ).sort("datetime") + df_right = nw.from_native( + constructor( + { + "datetime": [ + datetime(2016, 1, 1), + datetime(2017, 1, 1), + datetime(2018, 1, 1), + datetime(2019, 1, 1), + datetime(2020, 1, 1), + ], + "gdp": [4164, 4411, 4566, 4696, 4827], + } + ) + ).sort("datetime") + result_backward = df.join_asof(df_right, left_on="datetime", right_on="datetime") # type: ignore[arg-type] + result_forward = df.join_asof( + df_right, # type: ignore[arg-type] + left_on="datetime", + right_on="datetime", + strategy="forward", + ) + result_nearest = df.join_asof( + df_right, # type: ignore[arg-type] + left_on="datetime", + right_on="datetime", + strategy="nearest", + ) + expected_backward = { + "datetime": [datetime(2016, 3, 1), datetime(2018, 8, 1), datetime(2019, 1, 1)], + "population": [82.19, 82.66, 83.12], + "gdp": [4164, 4566, 4696], + } + expected_forward = { + "datetime": [datetime(2016, 3, 1), datetime(2018, 8, 1), datetime(2019, 1, 1)], + "population": [82.19, 82.66, 83.12], + "gdp": [4411, 4696, 4696], + } + expected_nearest = { + "datetime": [datetime(2016, 3, 1), datetime(2018, 8, 1), datetime(2019, 1, 1)], + "population": [82.19, 82.66, 83.12], + "gdp": [4164, 4696, 4696], + } + compare_dicts(result_backward, expected_backward) + compare_dicts(result_forward, expected_forward) + compare_dicts(result_nearest, expected_nearest) + + +@pytest.mark.parametrize("strategy", ["back", "furthest"]) +def test_joinasof_not_implemented(constructor: Any, strategy: str) -> None: + data = {"a": [1, 3, 2], "b": [4, 4, 6], "z": [7.0, 8, 9]} + df = nw.from_native(constructor(data)) + + with pytest.raises( + NotImplementedError, + match=rf"Only the following strategies are supported: \('backward', 'forward', 'nearest'\); found '{strategy}'.", + ): + df.join_asof(df, left_on="a", right_on="a", strategy=strategy) # type: ignore[arg-type] From 4cf94ce268c92b4d26b7e4fed6e245d3eee1bdfc Mon Sep 17 00:00:00 2001 From: raisadz <34237447+raisadz@users.noreply.github.com> Date: Fri, 6 Sep 2024 16:16:42 +0100 Subject: [PATCH 07/30] feat: enable `on` key in `join_asof` (#916) * enable `on` key in `join_asof` * remove repeated keys validation in LazyFrame --- narwhals/_arrow/dataframe.py | 5 ++- narwhals/_dask/dataframe.py | 6 ++- narwhals/_pandas_like/dataframe.py | 6 ++- narwhals/dataframe.py | 72 +++++++++++++++++++----------- tests/frame/join_test.py | 47 +++++++++++++++++++ 5 files changed, 103 insertions(+), 33 deletions(-) diff --git a/narwhals/_arrow/dataframe.py b/narwhals/_arrow/dataframe.py index 755a92416..f01ada158 100644 --- a/narwhals/_arrow/dataframe.py +++ b/narwhals/_arrow/dataframe.py @@ -319,8 +319,9 @@ def join_asof( self, other: Self, *, - left_on: str, - right_on: str, + left_on: str | None = None, + right_on: str | None = None, + on: str | None = None, strategy: Literal["backward", "forward", "nearest"] = "backward", ) -> Self: msg = "join_asof is not yet supported on PyArrow tables" diff --git a/narwhals/_dask/dataframe.py b/narwhals/_dask/dataframe.py index 91a7e96a9..8f11ccaad 100644 --- a/narwhals/_dask/dataframe.py +++ b/narwhals/_dask/dataframe.py @@ -302,8 +302,9 @@ def join_asof( self, other: Self, *, - left_on: str, - right_on: str, + left_on: str | None = None, + right_on: str | None = None, + on: str | None = None, strategy: Literal["backward", "forward", "nearest"] = "backward", ) -> Self: plx = self.__native_namespace__() @@ 
-313,6 +314,7 @@ def join_asof( other._native_frame, left_on=left_on, right_on=right_on, + on=on, direction=strategy, suffixes=("", "_right"), ), diff --git a/narwhals/_pandas_like/dataframe.py b/narwhals/_pandas_like/dataframe.py index 0425e28e1..9750cd9d4 100644 --- a/narwhals/_pandas_like/dataframe.py +++ b/narwhals/_pandas_like/dataframe.py @@ -513,8 +513,9 @@ def join_asof( self, other: Self, *, - left_on: str, - right_on: str, + left_on: str | None = None, + right_on: str | None = None, + on: str | None = None, strategy: Literal["backward", "forward", "nearest"] = "backward", ) -> Self: plx = self.__native_namespace__() @@ -524,6 +525,7 @@ def join_asof( other._native_frame, left_on=left_on, right_on=right_on, + on=on, direction=strategy, suffixes=("", "_right"), ), diff --git a/narwhals/dataframe.py b/narwhals/dataframe.py index da1ee1dc8..440856eb4 100644 --- a/narwhals/dataframe.py +++ b/narwhals/dataframe.py @@ -218,8 +218,9 @@ def join_asof( self, other: Self, *, - left_on: str, - right_on: str, + left_on: str | None = None, + right_on: str | None = None, + on: str | None = None, strategy: Literal["backward", "forward", "nearest"] = "backward", ) -> Self: _supported_strategies = ("backward", "forward", "nearest") @@ -228,14 +229,29 @@ def join_asof( msg = f"Only the following strategies are supported: {_supported_strategies}; found '{strategy}'." raise NotImplementedError(msg) - return self._from_compliant_dataframe( - self._compliant_frame.join_asof( - self._extract_compliant(other), - left_on=left_on, - right_on=right_on, - strategy=strategy, + if left_on is not None and right_on is not None and on is not None: + msg = "Either (`left_on` and `right_on`) or `on` keys should be specified." + raise ValueError(msg) + if left_on is not None and right_on is not None: + return self._from_compliant_dataframe( + self._compliant_frame.join_asof( + self._extract_compliant(other), + left_on=left_on, + right_on=right_on, + strategy=strategy, + ) ) - ) + elif on is not None: + return self._from_compliant_dataframe( + self._compliant_frame.join_asof( + self._extract_compliant(other), + on=on, + strategy=strategy, + ) + ) + else: + msg = "Either (`left_on` and `right_on`) or `on` keys should be specified." + raise ValueError(msg) class DataFrame(BaseFrame[FrameT]): @@ -1866,8 +1882,9 @@ def join_asof( self, other: Self, *, - left_on: str, - right_on: str, + left_on: str | None = None, + right_on: str | None = None, + on: str | None = None, strategy: Literal["backward", "forward", "nearest"] = "backward", ) -> Self: """ @@ -1884,6 +1901,8 @@ def join_asof( right_on: Name(s) of the right join column(s). + on: Join column of both DataFrames. If set, left_on and right_on should be None. + strategy: Join strategy. The default is "backward". * *backward*: selects the last row in the right DataFrame whose "on" key is less than or equal to the left's key. @@ -1925,18 +1944,16 @@ def join_asof( Let's define a dataframe-agnostic function in which we join over "datetime" column: >>> @nw.narwhalify - ... def join_asof_date(df, other_any, strategy): - ... return df.join_asof( - ... other_any, left_on="datetime", right_on="datetime", strategy=strategy - ... ) + ... def join_asof_datetime(df, other_any, strategy): + ... 
return df.join_asof(other_any, on="datetime", strategy=strategy) >>> # We can now pass either pandas or Polars to the function: - >>> join_asof_date(population_pd, gdp_pd, strategy="backward") + >>> join_asof_datetime(population_pd, gdp_pd, strategy="backward") datetime population gdp 0 2016-03-01 82.19 4164 1 2018-08-01 82.66 4566 2 2019-01-01 83.12 4696 - >>> join_asof_date(population_pl, gdp_pl, strategy="backward") + >>> join_asof_datetime(population_pl, gdp_pl, strategy="backward") shape: (3, 3) ┌─────────────────────┬────────────┬──────┐ │ datetime ┆ population ┆ gdp │ @@ -1949,7 +1966,7 @@ def join_asof( └─────────────────────┴────────────┴──────┘ """ return super().join_asof( - other, left_on=left_on, right_on=right_on, strategy=strategy + other, left_on=left_on, right_on=right_on, on=on, strategy=strategy ) # --- descriptive --- @@ -3495,8 +3512,9 @@ def join_asof( self, other: Self, *, - left_on: str, - right_on: str, + left_on: str | None = None, + right_on: str | None = None, + on: str | None = None, strategy: Literal["backward", "forward", "nearest"] = "backward", ) -> Self: """ @@ -3513,6 +3531,8 @@ def join_asof( right_on: Name(s) of the right join column(s). + on: Join column of both DataFrames. If set, left_on and right_on should be None. + strategy: Join strategy. The default is "backward". * *backward*: selects the last row in the right DataFrame whose "on" key is less than or equal to the left's key. @@ -3553,18 +3573,16 @@ def join_asof( Let's define a dataframe-agnostic function in which we join over "datetime" column: >>> @nw.narwhalify - ... def join_asof_date(df, other_any, strategy): - ... return df.join_asof( - ... other_any, left_on="datetime", right_on="datetime", strategy=strategy - ... ) + ... def join_asof_datetime(df, other_any, strategy): + ... 
return df.join_asof(other_any, on="datetime", strategy=strategy) >>> # We can now pass either pandas or Polars to the function: - >>> join_asof_date(population_pd, gdp_pd, strategy="backward") + >>> join_asof_datetime(population_pd, gdp_pd, strategy="backward") datetime population gdp 0 2016-03-01 82.19 4164 1 2018-08-01 82.66 4566 2 2019-01-01 83.12 4696 - >>> join_asof_date(population_pl, gdp_pl, strategy="backward").collect() + >>> join_asof_datetime(population_pl, gdp_pl, strategy="backward").collect() shape: (3, 3) ┌─────────────────────┬────────────┬──────┐ │ datetime ┆ population ┆ gdp │ @@ -3577,7 +3595,7 @@ def join_asof( └─────────────────────┴────────────┴──────┘ """ return super().join_asof( - other, left_on=left_on, right_on=right_on, strategy=strategy + other, left_on=left_on, right_on=right_on, on=on, strategy=strategy ) def clone(self) -> Self: diff --git a/tests/frame/join_test.py b/tests/frame/join_test.py index c9119e204..72f1304df 100644 --- a/tests/frame/join_test.py +++ b/tests/frame/join_test.py @@ -220,6 +220,9 @@ def test_joinasof_numeric(constructor: Any, request: Any) -> None: result_backward = df.join_asof(df_right, left_on="a", right_on="a") # type: ignore[arg-type] result_forward = df.join_asof(df_right, left_on="a", right_on="a", strategy="forward") # type: ignore[arg-type] result_nearest = df.join_asof(df_right, left_on="a", right_on="a", strategy="nearest") # type: ignore[arg-type] + result_backward_on = df.join_asof(df_right, on="a") # type: ignore[arg-type] + result_forward_on = df.join_asof(df_right, on="a", strategy="forward") # type: ignore[arg-type] + result_nearest_on = df.join_asof(df_right, on="a", strategy="nearest") # type: ignore[arg-type] expected_backward = { "a": [1, 5, 10], "val": ["a", "b", "c"], @@ -238,6 +241,9 @@ def test_joinasof_numeric(constructor: Any, request: Any) -> None: compare_dicts(result_backward, expected_backward) compare_dicts(result_forward, expected_forward) compare_dicts(result_nearest, expected_nearest) + compare_dicts(result_backward_on, expected_backward) + compare_dicts(result_forward_on, expected_forward) + compare_dicts(result_nearest_on, expected_nearest) def test_joinasof_time(constructor: Any, request: Any) -> None: @@ -284,6 +290,17 @@ def test_joinasof_time(constructor: Any, request: Any) -> None: right_on="datetime", strategy="nearest", ) + result_backward_on = df.join_asof(df_right, on="datetime") # type: ignore[arg-type] + result_forward_on = df.join_asof( + df_right, # type: ignore[arg-type] + on="datetime", + strategy="forward", + ) + result_nearest_on = df.join_asof( + df_right, # type: ignore[arg-type] + on="datetime", + strategy="nearest", + ) expected_backward = { "datetime": [datetime(2016, 3, 1), datetime(2018, 8, 1), datetime(2019, 1, 1)], "population": [82.19, 82.66, 83.12], @@ -302,6 +319,9 @@ def test_joinasof_time(constructor: Any, request: Any) -> None: compare_dicts(result_backward, expected_backward) compare_dicts(result_forward, expected_forward) compare_dicts(result_nearest, expected_nearest) + compare_dicts(result_backward_on, expected_backward) + compare_dicts(result_forward_on, expected_forward) + compare_dicts(result_nearest_on, expected_nearest) @pytest.mark.parametrize("strategy", ["back", "furthest"]) @@ -314,3 +334,30 @@ def test_joinasof_not_implemented(constructor: Any, strategy: str) -> None: match=rf"Only the following strategies are supported: \('backward', 'forward', 'nearest'\); found '{strategy}'.", ): df.join_asof(df, left_on="a", right_on="a", strategy=strategy) # type: 
ignore[arg-type] + + +def test_joinasof_no_keys(constructor: Any) -> None: + data = {"a": [1, 3, 2], "b": [4, 4, 6], "z": [7.0, 8, 9]} + df = nw.from_native(constructor(data)) + + msg = r"Either \(`left_on` and `right_on`\) or `on` keys should be specified." + with pytest.raises( + ValueError, + match=msg, + ): + df.join_asof(df, left_on="a") # type: ignore[arg-type] + with pytest.raises( + ValueError, + match=msg, + ): + df.join_asof(df, right_on="a") # type: ignore[arg-type] + with pytest.raises( + ValueError, + match=msg, + ): + df.join_asof(df) # type: ignore[arg-type] + with pytest.raises( + ValueError, + match=msg, + ): + df.join_asof(df, left_on="a", right_on="a", on="a") # type: ignore[arg-type] From ad5616a2c6488c5cb1c5a6dcef71ac00a8c6d65a Mon Sep 17 00:00:00 2001 From: Francesco Bruzzesi <42817048+FBruzzesi@users.noreply.github.com> Date: Fri, 6 Sep 2024 19:48:15 +0200 Subject: [PATCH 08/30] patch: group by `n_unique` (#917) --- narwhals/_arrow/group_by.py | 27 +++++++++--- narwhals/_dask/group_by.py | 35 +++++++++++++--- narwhals/_pandas_like/group_by.py | 69 ++++++++++++++++++++++++------- tests/test_group_by.py | 51 +++++++++++++++++++++++ 4 files changed, 156 insertions(+), 26 deletions(-) diff --git a/narwhals/_arrow/group_by.py b/narwhals/_arrow/group_by.py index 27c7ff368..78b241c9b 100644 --- a/narwhals/_arrow/group_by.py +++ b/narwhals/_arrow/group_by.py @@ -15,6 +15,12 @@ from narwhals._arrow.expr import ArrowExpr from narwhals._arrow.typing import IntoArrowExpr +POLARS_TO_ARROW_AGGREGATIONS = { + "n_unique": "count_distinct", + "std": "stddev", + "var": "variance", # currently unused, we don't have `var` yet +} + class ArrowGroupBy: def __init__(self, df: ArrowDataFrame, keys: list[str]) -> None: @@ -112,16 +118,27 @@ def agg_arrow( raise AssertionError(msg) function_name = remove_prefix(expr._function_name, "col->") + function_name = POLARS_TO_ARROW_AGGREGATIONS.get(function_name, function_name) for root_name, output_name in zip(expr._root_names, expr._output_names): - if function_name != "len": + if function_name == "len": simple_aggregations[output_name] = ( - (root_name, function_name), - f"{root_name}_{function_name}", + (root_name, "count", pc.CountOptions(mode="all")), + f"{root_name}_count", + ) + elif function_name == "count_distinct": + simple_aggregations[output_name] = ( + (root_name, "count_distinct", pc.CountOptions(mode="all")), + f"{root_name}_count_distinct", + ) + elif function_name == "stddev": + simple_aggregations[output_name] = ( + (root_name, "stddev", pc.VarianceOptions(ddof=1)), + f"{root_name}_stddev", ) else: simple_aggregations[output_name] = ( - (root_name, "count", pc.CountOptions(mode="all")), - f"{root_name}_count", + (root_name, function_name), + f"{root_name}_{function_name}", ) aggs: list[Any] = [] diff --git a/narwhals/_dask/group_by.py b/narwhals/_dask/group_by.py index 8538c62d2..463d6fc58 100644 --- a/narwhals/_dask/group_by.py +++ b/narwhals/_dask/group_by.py @@ -10,12 +10,33 @@ from narwhals.utils import remove_prefix if TYPE_CHECKING: + import dask.dataframe as dd + import pandas as pd + from narwhals._dask.dataframe import DaskLazyFrame from narwhals._dask.expr import DaskExpr from narwhals._dask.typing import IntoDaskExpr -POLARS_TO_PANDAS_AGGREGATIONS = { + +def n_unique() -> dd.Aggregation: + import dask.dataframe as dd # ignore-banned-import + + def chunk(s: pd.core.groupby.generic.SeriesGroupBy) -> int: + return s.nunique(dropna=False) # type: ignore[no-any-return] + + def agg(s0: pd.core.groupby.generic.SeriesGroupBy) -> 
int: + return s0.sum() # type: ignore[no-any-return] + + return dd.Aggregation( + name="nunique", + chunk=chunk, + agg=agg, + ) + + +POLARS_TO_DASK_AGGREGATIONS = { "len": "size", + "n_unique": n_unique, } @@ -85,7 +106,7 @@ def agg_dask( break if all_simple_aggs: - simple_aggregations: dict[str, tuple[str, str]] = {} + simple_aggregations: dict[str, tuple[str, str | dd.Aggregation]] = {} for expr in exprs: if expr._depth == 0: # e.g. agg(nw.len()) # noqa: ERA001 @@ -93,7 +114,7 @@ def agg_dask( msg = "Safety assertion failed, please report a bug to https://github.com/narwhals-dev/narwhals/issues" raise AssertionError(msg) - function_name = POLARS_TO_PANDAS_AGGREGATIONS.get( + function_name = POLARS_TO_DASK_AGGREGATIONS.get( expr._function_name, expr._function_name ) for output_name in expr._output_names: @@ -108,9 +129,11 @@ def agg_dask( raise AssertionError(msg) function_name = remove_prefix(expr._function_name, "col->") - function_name = POLARS_TO_PANDAS_AGGREGATIONS.get( - function_name, function_name - ) + function_name = POLARS_TO_DASK_AGGREGATIONS.get(function_name, function_name) + + # deal with n_unique case in a "lazy" mode to not depend on dask globally + function_name = function_name() if callable(function_name) else function_name + for root_name, output_name in zip(expr._root_names, expr._output_names): simple_aggregations[output_name] = (root_name, function_name) try: diff --git a/narwhals/_pandas_like/group_by.py b/narwhals/_pandas_like/group_by.py index 11abc85c8..97a477dc4 100644 --- a/narwhals/_pandas_like/group_by.py +++ b/narwhals/_pandas_like/group_by.py @@ -21,6 +21,7 @@ POLARS_TO_PANDAS_AGGREGATIONS = { "len": "size", + "n_unique": "nunique", } @@ -103,7 +104,7 @@ def __iter__(self) -> Iterator[tuple[Any, PandasLikeDataFrame]]: yield from ((key, self._from_native_frame(sub_df)) for (key, sub_df) in iterator) -def agg_pandas( +def agg_pandas( # noqa: PLR0915 grouped: Any, exprs: list[PandasLikeExpr], keys: list[str], @@ -120,13 +121,18 @@ def agg_pandas( - https://github.com/rapidsai/cudf/issues/15118 - https://github.com/rapidsai/cudf/issues/15084 """ - all_simple_aggs = True + all_aggs_are_simple = True for expr in exprs: if not is_simple_aggregation(expr): - all_simple_aggs = False + all_aggs_are_simple = False break - if all_simple_aggs: + # dict of {output_name: root_name} that we count n_unique on + # We need to do this separately from the rest so that we + # can pass the `dropna` kwargs. + nunique_aggs: dict[str, str] = {} + + if all_aggs_are_simple: simple_aggregations: dict[str, tuple[str, str]] = {} for expr in exprs: if expr._depth == 0: @@ -154,21 +160,54 @@ def agg_pandas( function_name, function_name ) for root_name, output_name in zip(expr._root_names, expr._output_names): - simple_aggregations[output_name] = (root_name, function_name) + if function_name == "nunique": + nunique_aggs[output_name] = root_name + else: + simple_aggregations[output_name] = (root_name, function_name) - aggs = collections.defaultdict(list) + simple_aggs = collections.defaultdict(list) name_mapping = {} for output_name, named_agg in simple_aggregations.items(): - aggs[named_agg[0]].append(named_agg[1]) + simple_aggs[named_agg[0]].append(named_agg[1]) name_mapping[f"{named_agg[0]}_{named_agg[1]}"] = output_name - try: - result_simple = grouped.agg(aggs) - except AttributeError as exc: - msg = "Failed to aggregated - does your aggregation function return a scalar?" 
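# An illustrative sketch, not part of the patch: the user-facing behaviour the
# three backend mappings above unlock. It follows the pattern of the tests added
# later in this commit; the sample data is an assumption for demonstration only.
#
#     import pandas as pd
#     import narwhals as nw
#
#     data = {"a": [1, 1, 2], "b": [4, 5, 5]}
#     df = nw.from_native(pd.DataFrame(data), eager_only=True)
#     # `n_unique` now routes to pandas' `nunique`, pyarrow's `count_distinct`,
#     # and the custom `dd.Aggregation` above on dask, so one spelling works
#     # across backends:
#     result = df.group_by("a").agg(nw.col("b").n_unique()).sort("a")
#     print(nw.to_native(result))  # a: [1, 2], b: [2, 1]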
- raise RuntimeError(msg) from exc - result_simple.columns = [f"{a}_{b}" for a, b in result_simple.columns] - result_simple = result_simple.rename(columns=name_mapping).reset_index() - return from_dataframe(result_simple.loc[:, output_names]) + if simple_aggs: + try: + result_simple_aggs = grouped.agg(simple_aggs) + except AttributeError as exc: + msg = "Failed to aggregated - does your aggregation function return a scalar?" + raise RuntimeError(msg) from exc + result_simple_aggs.columns = [ + f"{a}_{b}" for a, b in result_simple_aggs.columns + ] + result_simple_aggs = result_simple_aggs.rename( + columns=name_mapping + ).reset_index() + if nunique_aggs: + result_nunique_aggs = grouped[list(nunique_aggs.values())].nunique( + dropna=False + ) + result_nunique_aggs.columns = list(nunique_aggs.keys()) + result_nunique_aggs = result_nunique_aggs.reset_index() + if simple_aggs and nunique_aggs: + if ( + set(result_simple_aggs.columns) + .difference(keys) + .intersection(result_nunique_aggs.columns) + ): + msg = ( + "Got two aggregations with the same output name. Please make sure " + "that aggregations have unique output names." + ) + raise ValueError(msg) + result_aggs = result_simple_aggs.merge(result_nunique_aggs, on=keys) + elif nunique_aggs and not simple_aggs: + result_aggs = result_nunique_aggs + elif simple_aggs and not nunique_aggs: + result_aggs = result_simple_aggs + else: # pragma: no cover + msg = "Congrats, you entered unreachable code. Please report a bug to https://github.com/narwhals-dev/narwhals/issues." + raise RuntimeError(msg) + return from_dataframe(result_aggs.loc[:, output_names]) if dataframe_is_empty: # Don't even attempt this, it's way too inconsistent across pandas versions. diff --git a/tests/test_group_by.py b/tests/test_group_by.py index 2bb8d435b..4bd3427a5 100644 --- a/tests/test_group_by.py +++ b/tests/test_group_by.py @@ -102,6 +102,57 @@ def test_group_by_len(constructor: Any) -> None: compare_dicts(result, expected) +def test_group_by_n_unique(constructor: Any) -> None: + result = ( + nw.from_native(constructor(data)) + .group_by("a") + .agg(nw.col("b").n_unique()) + .sort("a") + ) + expected = {"a": [1, 3], "b": [1, 1]} + compare_dicts(result, expected) + + +def test_group_by_std(constructor: Any) -> None: + data = {"a": [1, 1, 2, 2], "b": [5, 4, 3, 2]} + result = ( + nw.from_native(constructor(data)).group_by("a").agg(nw.col("b").std()).sort("a") + ) + expected = {"a": [1, 2], "b": [0.707107] * 2} + compare_dicts(result, expected) + + +def test_group_by_n_unique_w_missing(constructor: Any) -> None: + data = {"a": [1, 1, 2], "b": [4, None, 5], "c": [None, None, 7], "d": [1, 1, 3]} + result = ( + nw.from_native(constructor(data)) + .group_by("a") + .agg( + nw.col("b").n_unique(), + c_n_unique=nw.col("c").n_unique(), + c_n_min=nw.col("b").min(), + d_n_unique=nw.col("d").n_unique(), + ) + .sort("a") + ) + expected = { + "a": [1, 2], + "b": [2, 1], + "c_n_unique": [1, 1], + "c_n_min": [4, 5], + "d_n_unique": [1, 1], + } + compare_dicts(result, expected) + + +def test_group_by_same_name_twice() -> None: + import pandas as pd + + df = pd.DataFrame({"a": [1, 1, 2], "b": [4, 5, 6]}) + with pytest.raises(ValueError, match="two aggregations with the same"): + nw.from_native(df).group_by("a").agg(nw.col("b").sum(), nw.col("b").n_unique()) + + def test_group_by_empty_result_pandas() -> None: df_any = pd.DataFrame({"a": [1, 2, 3], "b": [4, 3, 2]}) df = nw.from_native(df_any, eager_only=True) From 0061d5b6befc8e4ef67eaf94243eaee005862d05 Mon Sep 17 00:00:00 2001 From: 
Marco Edward Gorelli Date: Fri, 6 Sep 2024 22:09:20 +0100 Subject: [PATCH 09/30] test: fixup tpch tests (#918) * test: fixup tpch tests * test: fixup tpch tests * fixup --- .github/workflows/check_tpch_queries.yml | 2 +- tpch/__init__.py | 0 tpch/execute/q10.py | 8 ----- tpch/execute/q11.py | 8 ----- tpch/execute/q15.py | 8 ----- tpch/execute/q17.py | 8 ----- tpch/execute/q18.py | 8 ----- tpch/execute/q3.py | 8 ----- tpch/execute/q4.py | 8 ----- tpch/execute/q5.py | 16 ---------- tpch/execute/q6.py | 8 ----- tpch/execute/q7.py | 9 ------ tpch/execute/q9.py | 12 -------- tpch/generate_data.py | 2 +- tpch/tests/test_queries.py | 38 ++++++++++-------------- 15 files changed, 17 insertions(+), 126 deletions(-) delete mode 100644 tpch/__init__.py diff --git a/.github/workflows/check_tpch_queries.yml b/.github/workflows/check_tpch_queries.yml index 397163091..82a2f4aa4 100644 --- a/.github/workflows/check_tpch_queries.yml +++ b/.github/workflows/check_tpch_queries.yml @@ -27,4 +27,4 @@ jobs: - name: generate-data run: cd tpch && python generate_data.py - name: tpch-tests - run: python -m unittest discover -s 'tpch/tests' \ No newline at end of file + run: cd tpch && pytest tests \ No newline at end of file diff --git a/tpch/__init__.py b/tpch/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/tpch/execute/q10.py b/tpch/execute/q10.py index 9876f2aa9..99d850f53 100644 --- a/tpch/execute/q10.py +++ b/tpch/execute/q10.py @@ -6,18 +6,10 @@ from . import nation from . import orders -tool = "pandas" -fn = IO_FUNCS[tool] -print(q10.query(fn(customer), fn(nation), fn(lineitem), fn(orders))) - tool = "pandas[pyarrow]" fn = IO_FUNCS[tool] print(q10.query(fn(customer), fn(nation), fn(lineitem), fn(orders))) -tool = "polars[eager]" -fn = IO_FUNCS[tool] -print(q10.query(fn(customer), fn(nation), fn(lineitem), fn(orders))) - tool = "polars[lazy]" fn = IO_FUNCS[tool] print(q10.query(fn(customer), fn(nation), fn(lineitem), fn(orders)).collect()) diff --git a/tpch/execute/q11.py b/tpch/execute/q11.py index 82b1936aa..101710adb 100644 --- a/tpch/execute/q11.py +++ b/tpch/execute/q11.py @@ -5,18 +5,10 @@ from . import partsupp from . import supplier -tool = "pandas" -fn = IO_FUNCS[tool] -print(q11.query(fn(nation), fn(partsupp), fn(supplier))) - tool = "pandas[pyarrow]" fn = IO_FUNCS[tool] print(q11.query(fn(nation), fn(partsupp), fn(supplier))) -tool = "polars[eager]" -fn = IO_FUNCS[tool] -print(q11.query(fn(nation), fn(partsupp), fn(supplier))) - tool = "polars[lazy]" fn = IO_FUNCS[tool] print(q11.query(fn(nation), fn(partsupp), fn(supplier)).collect()) diff --git a/tpch/execute/q15.py b/tpch/execute/q15.py index 8fdaf2ab1..0d9e9f374 100644 --- a/tpch/execute/q15.py +++ b/tpch/execute/q15.py @@ -4,18 +4,10 @@ from . import lineitem from . import supplier -tool = "pandas" -fn = IO_FUNCS[tool] -print(q15.query(fn(lineitem), fn(supplier))) - tool = "pandas[pyarrow]" fn = IO_FUNCS[tool] print(q15.query(fn(lineitem), fn(supplier))) -tool = "polars[eager]" -fn = IO_FUNCS[tool] -print(q15.query(fn(lineitem), fn(supplier))) - tool = "polars[lazy]" fn = IO_FUNCS[tool] print(q15.query(fn(lineitem), fn(supplier)).collect()) diff --git a/tpch/execute/q17.py b/tpch/execute/q17.py index 5f2228012..2d9920c69 100644 --- a/tpch/execute/q17.py +++ b/tpch/execute/q17.py @@ -4,18 +4,10 @@ from . import lineitem from . 
import part -tool = "pandas" -fn = IO_FUNCS[tool] -print(q17.query(fn(lineitem), fn(part))) - tool = "pandas[pyarrow]" fn = IO_FUNCS[tool] print(q17.query(fn(lineitem), fn(part))) -tool = "polars[eager]" -fn = IO_FUNCS[tool] -print(q17.query(fn(lineitem), fn(part))) - tool = "polars[lazy]" fn = IO_FUNCS[tool] print(q17.query(fn(lineitem), fn(part)).collect()) diff --git a/tpch/execute/q18.py b/tpch/execute/q18.py index 5a59f0e5e..4092fc0d6 100644 --- a/tpch/execute/q18.py +++ b/tpch/execute/q18.py @@ -5,18 +5,10 @@ from . import lineitem from . import orders -tool = "pandas" -fn = IO_FUNCS[tool] -print(q18.query(fn(customer), fn(lineitem), fn(orders))) - tool = "pandas[pyarrow]" fn = IO_FUNCS[tool] print(q18.query(fn(customer), fn(lineitem), fn(orders))) -tool = "polars[eager]" -fn = IO_FUNCS[tool] -print(q18.query(fn(customer), fn(lineitem), fn(orders))) - tool = "polars[lazy]" fn = IO_FUNCS[tool] print(q18.query(fn(customer), fn(lineitem), fn(orders)).collect()) diff --git a/tpch/execute/q3.py b/tpch/execute/q3.py index 8602bb3d0..a1eea74d1 100644 --- a/tpch/execute/q3.py +++ b/tpch/execute/q3.py @@ -5,18 +5,10 @@ from . import lineitem from . import orders -tool = "pandas" -fn = IO_FUNCS[tool] -print(q3.query(fn(customer), fn(lineitem), fn(orders))) - tool = "pandas[pyarrow]" fn = IO_FUNCS[tool] print(q3.query(fn(customer), fn(lineitem), fn(orders))) -tool = "polars[eager]" -fn = IO_FUNCS[tool] -print(q3.query(fn(customer), fn(lineitem), fn(orders))) - tool = "polars[lazy]" fn = IO_FUNCS[tool] print(q3.query(fn(customer), fn(lineitem), fn(orders)).collect()) diff --git a/tpch/execute/q4.py b/tpch/execute/q4.py index 3e67a9c87..79213f1ac 100644 --- a/tpch/execute/q4.py +++ b/tpch/execute/q4.py @@ -4,18 +4,10 @@ from . import line_item from . import orders -tool = "pandas" -fn = IO_FUNCS[tool] -print(q4.query(fn(line_item), fn(orders))) - tool = "pandas[pyarrow]" fn = IO_FUNCS[tool] print(q4.query(fn(line_item), fn(orders))) -tool = "polars[eager]" -fn = IO_FUNCS[tool] -print(q4.query(fn(line_item), fn(orders))) - tool = "polars[lazy]" fn = IO_FUNCS[tool] print(q4.query(fn(line_item), fn(orders)).collect()) diff --git a/tpch/execute/q5.py b/tpch/execute/q5.py index 317b15fc7..7a04dec1b 100644 --- a/tpch/execute/q5.py +++ b/tpch/execute/q5.py @@ -8,14 +8,6 @@ from . import region from . import supplier -tool = "pandas" -fn = IO_FUNCS[tool] -print( - q5.query( - fn(region), fn(nation), fn(customer), fn(line_item), fn(orders), fn(supplier) - ) -) - tool = "pandas[pyarrow]" fn = IO_FUNCS[tool] print( @@ -24,14 +16,6 @@ ) ) -tool = "polars[eager]" -fn = IO_FUNCS[tool] -print( - q5.query( - fn(region), fn(nation), fn(customer), fn(line_item), fn(orders), fn(supplier) - ) -) - tool = "polars[lazy]" fn = IO_FUNCS[tool] print( diff --git a/tpch/execute/q6.py b/tpch/execute/q6.py index adca0e26d..402e6d452 100644 --- a/tpch/execute/q6.py +++ b/tpch/execute/q6.py @@ -3,18 +3,10 @@ from . import IO_FUNCS from . import lineitem -tool = "pandas" -fn = IO_FUNCS[tool] -print(q6.query(fn(lineitem))) - tool = "pandas[pyarrow]" fn = IO_FUNCS[tool] print(q6.query(fn(lineitem))) -tool = "polars[eager]" -fn = IO_FUNCS[tool] -print(q6.query(fn(lineitem))) - tool = "polars[lazy]" fn = IO_FUNCS[tool] print(q6.query(fn(lineitem)).collect()) diff --git a/tpch/execute/q7.py b/tpch/execute/q7.py index 43e110a72..9f6179d23 100644 --- a/tpch/execute/q7.py +++ b/tpch/execute/q7.py @@ -7,19 +7,10 @@ from . import orders from . 
import supplier -tool = "pandas" -fn = IO_FUNCS[tool] -print(q7.query(fn(nation), fn(customer), fn(lineitem), fn(orders), fn(supplier))) - - tool = "pandas[pyarrow]" fn = IO_FUNCS[tool] print(q7.query(fn(nation), fn(customer), fn(lineitem), fn(orders), fn(supplier))) -tool = "polars[eager]" -fn = IO_FUNCS[tool] -print(q7.query(fn(nation), fn(customer), fn(lineitem), fn(orders), fn(supplier))) - tool = "polars[lazy]" fn = IO_FUNCS[tool] print( diff --git a/tpch/execute/q9.py b/tpch/execute/q9.py index 9ccbe35b7..e01dd0f2c 100644 --- a/tpch/execute/q9.py +++ b/tpch/execute/q9.py @@ -8,24 +8,12 @@ from . import partsupp from . import supplier -tool = "pandas" -fn = IO_FUNCS[tool] -print( - q9.query(fn(part), fn(partsupp), fn(nation), fn(lineitem), fn(orders), fn(supplier)) -) - tool = "pandas[pyarrow]" fn = IO_FUNCS[tool] print( q9.query(fn(part), fn(partsupp), fn(nation), fn(lineitem), fn(orders), fn(supplier)) ) -tool = "polars[eager]" -fn = IO_FUNCS[tool] -print( - q9.query(fn(part), fn(partsupp), fn(nation), fn(lineitem), fn(orders), fn(supplier)) -) - tool = "polars[lazy]" fn = IO_FUNCS[tool] print( diff --git a/tpch/generate_data.py b/tpch/generate_data.py index 9ae7c3214..4d5695dcf 100644 --- a/tpch/generate_data.py +++ b/tpch/generate_data.py @@ -1,4 +1,4 @@ -from pathlib import Path +from pathlib import Path # noqa: INP001 import duckdb import pyarrow as pa diff --git a/tpch/tests/test_queries.py b/tpch/tests/test_queries.py index 4b7cdd866..35909b683 100644 --- a/tpch/tests/test_queries.py +++ b/tpch/tests/test_queries.py @@ -1,29 +1,21 @@ -import os import subprocess import sys -import unittest from pathlib import Path -class TestQueries(unittest.TestCase): - def test_execute_scripts(self) -> None: - root = Path(__file__).resolve().parent.parent - # directory containing all the queries - execute_dir = root / "execute" +def test_execute_scripts() -> None: + root = Path(__file__).resolve().parent.parent + # directory containing all the queries + execute_dir = root / "execute" - env = os.environ.copy() - env["PYTHONPATH"] = str(root) - - for script_path in execute_dir.glob("q[1-9]*.py"): - result = subprocess.run( # noqa: S603 - [sys.executable, str(script_path)], - capture_output=True, - text=True, - env=env, - cwd=root, - check=False, - shell=False, - ) - assert ( - result.returncode == 0 - ), f"Script {script_path} failed with error: {result.stderr}" + for script_path in execute_dir.glob("q[1-9]*.py"): + print(f"executing query {script_path.stem}") # noqa: T201 + result = subprocess.run( # noqa: S603 + [sys.executable, "-m", f"execute.{script_path.stem}"], + capture_output=True, + text=True, + check=False, + ) + assert ( + result.returncode == 0 + ), f"Script {script_path} failed with error: {result.stderr}" From 62c8adadd535ad8a10f5f45ef455678989d4c49c Mon Sep 17 00:00:00 2001 From: Zhengbo Wang Date: Sat, 7 Sep 2024 14:50:59 +0800 Subject: [PATCH 10/30] feat: Add q12, q13, q14, q16, q22 (#910) --- tpch/execute/q12.py | 13 +++++++++++++ tpch/execute/q13.py | 13 +++++++++++++ tpch/execute/q14.py | 13 +++++++++++++ tpch/execute/q16.py | 14 ++++++++++++++ tpch/execute/q22.py | 13 +++++++++++++ tpch/queries/q12.py | 33 +++++++++++++++++++++++++++++++++ tpch/queries/q13.py | 19 +++++++++++++++++++ tpch/queries/q14.py | 27 +++++++++++++++++++++++++++ tpch/queries/q16.py | 26 ++++++++++++++++++++++++++ tpch/queries/q22.py | 32 ++++++++++++++++++++++++++++++++ 10 files changed, 203 insertions(+) create mode 100644 tpch/execute/q12.py create mode 100644 tpch/execute/q13.py create mode 
100644 tpch/execute/q14.py create mode 100644 tpch/execute/q16.py create mode 100644 tpch/execute/q22.py create mode 100644 tpch/queries/q12.py create mode 100644 tpch/queries/q13.py create mode 100644 tpch/queries/q14.py create mode 100644 tpch/queries/q16.py create mode 100644 tpch/queries/q22.py diff --git a/tpch/execute/q12.py b/tpch/execute/q12.py new file mode 100644 index 000000000..b74742373 --- /dev/null +++ b/tpch/execute/q12.py @@ -0,0 +1,13 @@ +from queries import q12 + +from . import IO_FUNCS +from . import line_item +from . import orders + +tool = "pandas[pyarrow]" +fn = IO_FUNCS[tool] +print(q12.query(fn(line_item), fn(orders))) + +tool = "polars[lazy]" +fn = IO_FUNCS[tool] +print(q12.query(fn(line_item), fn(orders)).collect()) diff --git a/tpch/execute/q13.py b/tpch/execute/q13.py new file mode 100644 index 000000000..084fcca9b --- /dev/null +++ b/tpch/execute/q13.py @@ -0,0 +1,13 @@ +from queries import q13 + +from . import IO_FUNCS +from . import customer +from . import orders + +tool = "pandas[pyarrow]" +fn = IO_FUNCS[tool] +print(q13.query(fn(customer), fn(orders))) + +tool = "polars[lazy]" +fn = IO_FUNCS[tool] +print(q13.query(fn(customer), fn(orders)).collect()) diff --git a/tpch/execute/q14.py b/tpch/execute/q14.py new file mode 100644 index 000000000..57f83a595 --- /dev/null +++ b/tpch/execute/q14.py @@ -0,0 +1,13 @@ +from queries import q14 + +from . import IO_FUNCS +from . import line_item +from . import part + +tool = "pandas[pyarrow]" +fn = IO_FUNCS[tool] +print(q14.query(fn(line_item), fn(part))) + +tool = "polars[lazy]" +fn = IO_FUNCS[tool] +print(q14.query(fn(line_item), fn(part)).collect()) diff --git a/tpch/execute/q16.py b/tpch/execute/q16.py new file mode 100644 index 000000000..5176a5cc6 --- /dev/null +++ b/tpch/execute/q16.py @@ -0,0 +1,14 @@ +from queries import q16 + +from . import IO_FUNCS +from . import part +from . import partsupp +from . import supplier + +tool = "pandas[pyarrow]" +fn = IO_FUNCS[tool] +print(q16.query(fn(part), fn(partsupp), fn(supplier))) + +tool = "polars[lazy]" +fn = IO_FUNCS[tool] +print(q16.query(fn(part), fn(partsupp), fn(supplier)).collect()) diff --git a/tpch/execute/q22.py b/tpch/execute/q22.py new file mode 100644 index 000000000..91ed46d9d --- /dev/null +++ b/tpch/execute/q22.py @@ -0,0 +1,13 @@ +from queries import q22 + +from . import IO_FUNCS +from . import customer +from . 
import orders + +tool = "pandas[pyarrow]" +fn = IO_FUNCS[tool] +print(q22.query(fn(customer), fn(orders))) + +tool = "polars[lazy]" +fn = IO_FUNCS[tool] +print(q22.query(fn(customer), fn(orders)).collect()) diff --git a/tpch/queries/q12.py b/tpch/queries/q12.py new file mode 100644 index 000000000..ced775830 --- /dev/null +++ b/tpch/queries/q12.py @@ -0,0 +1,33 @@ +from datetime import datetime + +import narwhals as nw +from narwhals.typing import FrameT + + +@nw.narwhalify +def query(line_item_ds: FrameT, orders_ds: FrameT) -> FrameT: + var1 = "MAIL" + var2 = "SHIP" + var3 = datetime(1994, 1, 1) + var4 = datetime(1995, 1, 1) + + return ( + orders_ds.join(line_item_ds, left_on="o_orderkey", right_on="l_orderkey") + .filter(nw.col("l_shipmode").is_in([var1, var2])) + .filter(nw.col("l_commitdate") < nw.col("l_receiptdate")) + .filter(nw.col("l_shipdate") < nw.col("l_commitdate")) + .filter(nw.col("l_receiptdate").is_between(var3, var4, closed="left")) + .with_columns( + nw.when(nw.col("o_orderpriority").is_in(["1-URGENT", "2-HIGH"])) + .then(1) + .otherwise(0) + .alias("high_line_count"), + nw.when(~nw.col("o_orderpriority").is_in(["1-URGENT", "2-HIGH"])) + .then(1) + .otherwise(0) + .alias("low_line_count"), + ) + .group_by("l_shipmode") + .agg(nw.col("high_line_count").sum(), nw.col("low_line_count").sum()) + .sort("l_shipmode") + ) diff --git a/tpch/queries/q13.py b/tpch/queries/q13.py new file mode 100644 index 000000000..adf57e5a2 --- /dev/null +++ b/tpch/queries/q13.py @@ -0,0 +1,19 @@ +import narwhals as nw +from narwhals.typing import FrameT + + +@nw.narwhalify +def query(customer_ds: FrameT, orders_ds: FrameT) -> FrameT: + var1 = "special" + var2 = "requests" + + orders = orders_ds.filter(~nw.col("o_comment").str.contains(f"{var1}.*{var2}")) + return ( + customer_ds.join(orders, left_on="c_custkey", right_on="o_custkey", how="left") + .group_by("c_custkey") + .agg(nw.col("o_orderkey").count().alias("c_count")) + .group_by("c_count") + .agg(nw.len()) + .select(nw.col("c_count"), nw.col("len").alias("custdist")) + .sort(by=["custdist", "c_count"], descending=[True, True]) + ) diff --git a/tpch/queries/q14.py b/tpch/queries/q14.py new file mode 100644 index 000000000..f1ec6cbe3 --- /dev/null +++ b/tpch/queries/q14.py @@ -0,0 +1,27 @@ +from datetime import datetime + +import narwhals as nw +from narwhals.typing import FrameT + + +@nw.narwhalify +def query(line_item_ds: FrameT, part_ds: FrameT) -> FrameT: + var1 = datetime(1995, 9, 1) + var2 = datetime(1995, 10, 1) + + return ( + line_item_ds.join(part_ds, left_on="l_partkey", right_on="p_partkey") + .filter(nw.col("l_shipdate").is_between(var1, var2, closed="left")) + .select( + ( + 100.00 + * nw.when(nw.col("p_type").str.contains("PROMO*")) + .then(nw.col("l_extendedprice") * (1 - nw.col("l_discount"))) + .otherwise(0) + .sum() + / (nw.col("l_extendedprice") * (1 - nw.col("l_discount"))).sum() + ) + .round(2) + .alias("promo_revenue") + ) + ) diff --git a/tpch/queries/q16.py b/tpch/queries/q16.py new file mode 100644 index 000000000..d84b9aab5 --- /dev/null +++ b/tpch/queries/q16.py @@ -0,0 +1,26 @@ +import narwhals as nw +from narwhals.typing import FrameT + + +@nw.narwhalify +def query(part_ds: FrameT, partsupp_ds: FrameT, supplier_ds: FrameT) -> FrameT: + var1 = "Brand#45" + + supplier = supplier_ds.filter( + nw.col("s_comment").str.contains(".*Customer.*Complaints.*") + ).select(nw.col("s_suppkey"), nw.col("s_suppkey").alias("ps_suppkey")) + + return ( + part_ds.join(partsupp_ds, left_on="p_partkey", right_on="ps_partkey") + 
.filter(nw.col("p_brand") != var1) + .filter(~nw.col("p_type").str.contains("MEDIUM POLISHED*")) + .filter(nw.col("p_size").is_in([49, 14, 23, 45, 19, 3, 36, 9])) + .join(supplier, left_on="ps_suppkey", right_on="s_suppkey", how="left") + .filter(nw.col("ps_suppkey_right").is_null()) + .group_by("p_brand", "p_type", "p_size") + .agg(nw.col("ps_suppkey").n_unique().alias("supplier_cnt")) + .sort( + by=["supplier_cnt", "p_brand", "p_type", "p_size"], + descending=[True, False, False, False], + ) + ) diff --git a/tpch/queries/q22.py b/tpch/queries/q22.py new file mode 100644 index 000000000..4738c6fd3 --- /dev/null +++ b/tpch/queries/q22.py @@ -0,0 +1,32 @@ +import narwhals as nw +from narwhals.typing import FrameT + + +@nw.narwhalify +def query(customer_ds: FrameT, orders_ds: FrameT) -> FrameT: + q1 = ( + customer_ds.with_columns(nw.col("c_phone").str.slice(0, 2).alias("cntrycode")) + .filter(nw.col("cntrycode").str.contains("13|31|23|29|30|18|17")) + .select("c_acctbal", "c_custkey", "cntrycode") + ) + + q2 = q1.filter(nw.col("c_acctbal") > 0.0).select( + nw.col("c_acctbal").mean().alias("avg_acctbal") + ) + + q3 = orders_ds.select(nw.col("o_custkey").unique()).with_columns( + nw.col("o_custkey").alias("c_custkey") + ) + + return ( + q1.join(q3, left_on="c_custkey", right_on="c_custkey", how="left") + .filter(nw.col("o_custkey").is_null()) + .join(q2, how="cross") + .filter(nw.col("c_acctbal") > nw.col("avg_acctbal")) + .group_by("cntrycode") + .agg( + nw.col("c_acctbal").count().alias("numcust"), + nw.col("c_acctbal").sum().alias("totacctbal"), + ) + .sort("cntrycode") + ) From 2969d75c5d1074af96744c7ce97234d042993e46 Mon Sep 17 00:00:00 2001 From: Marco Edward Gorelli Date: Sat, 7 Sep 2024 07:58:15 +0100 Subject: [PATCH 11/30] docs: Recommend `uv` in contributing guide (#873) --- CONTRIBUTING.md | 37 ++++++++++++++++++++++++++++--------- 1 file changed, 28 insertions(+), 9 deletions(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index d36d21a55..aeed2538f 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -47,22 +47,41 @@ git clone git@github.com:YOUR-USERNAME/narwhals.git ### 4. Setting up your environment -Here's how you can set up your local development environment to contribute: - -1. Make sure you have Python3.8+ installed (for example, Python 3.11) -2. Create a new virtual environment with `python3.11 -m venv .venv` (or whichever version of Python3.9+ you prefer) -3. Activate it: `. .venv/bin/activate` -4. Install Narwhals: `pip install -e .` -5. Install test requirements: `pip install -r requirements-dev.txt` -6. Install docs requirements: `pip install -r docs/requirements-docs.txt` +Here's how you can set up your local development environment to contribute. + +#### Option 1: Use UV (recommended) + +1. Make sure you have Python3.8+ installed (for example, Python 3.11), create a virtual environment, + and activate it. If you're new to this, here's one way that we recommend: + 1. Install uv: https://github.com/astral-sh/uv?tab=readme-ov-file#getting-started + 2. Install some version of Python greater than Python3.8. For example, to install + Python3.11: + ``` + uv python install 3.11 + ``` + 3. Create a virtual environment: + ``` + uv venv -p 3.11 --seed + ``` + 4. Activate it. On Linux, this is `. .venv/bin/activate`, on Windows `.\.venv\Scripts\activate`. +2. Install Narwhals: `uv pip install -e .` +3. Install test requirements: `uv pip install -r requirements-dev.txt` +4. 
Install docs requirements: `uv pip install -r docs/requirements-docs.txt` You should also install pre-commit: ``` -pip install pre-commit +uv pip install pre-commit pre-commit install ``` This will automatically format and lint your code before each commit, and it will block the commit if any issues are found. +#### Option 2: use python3-venv + +1. Make sure you have Python 3.8+ installed. If you don't, you can check [install Python](https://realpython.com/installing-python/) + to learn how. Then, [create and activate](https://realpython.com/python-virtual-environments-a-primer/) + a virtual environment. +2. Then, follow steps 2-4 from above but using `pip install` instead of `uv pip install`. + ### 5. Working on your issue Create a new git branch from the `main` branch in your local repository. From 5f91aa17c6cd9274851d219f4b50abe12c0fdfeb Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Sat, 7 Sep 2024 08:43:20 +0100 Subject: [PATCH 12/30] [pre-commit.ci] pre-commit autoupdate (#818) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * [pre-commit.ci] pre-commit autoupdate updates: - [github.com/astral-sh/ruff-pre-commit: v0.5.7 → v0.6.3](https://github.com/astral-sh/ruff-pre-commit/compare/v0.5.7...v0.6.3) - [github.com/pre-commit/mirrors-mypy: v1.11.1 → v1.11.2](https://github.com/pre-commit/mirrors-mypy/compare/v1.11.1...v1.11.2) * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * making ruff happy --------- Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Edoardo Abati <29585319+EdAbati@users.noreply.github.com> --- .pre-commit-config.yaml | 4 +- pyproject.toml | 9 ++ tests/expr_and_series/arithmetic_test.py | 4 +- tests/expr_and_series/dt/ordinal_day_test.py | 2 +- .../expr_and_series/dt/total_minutes_test.py | 2 +- tests/hypothesis/test_basic_arithmetic.py | 2 +- tests/hypothesis/test_concat.py | 2 +- tests/hypothesis/test_join.py | 6 +- tpch/notebooks/q1/execute.ipynb | 53 +++++----- tpch/notebooks/q10/execute.ipynb | 41 ++++---- tpch/notebooks/q11/execute.ipynb | 44 ++++----- tpch/notebooks/q15/execute.ipynb | 43 ++++---- tpch/notebooks/q17/execute.ipynb | 42 ++++---- tpch/notebooks/q18/execute.ipynb | 41 ++++---- tpch/notebooks/q19/execute.ipynb | 45 ++++----- tpch/notebooks/q2/execute.ipynb | 51 +++++----- tpch/notebooks/q20/execute.ipynb | 47 ++++----- tpch/notebooks/q21/execute.ipynb | 99 +++++++++---------- tpch/notebooks/q3/execute.ipynb | 84 ++++++++-------- tpch/notebooks/q4/execute.ipynb | 79 +++++++-------- tpch/notebooks/q5/execute.ipynb | 73 +++++++------- tpch/notebooks/q6/execute.ipynb | 77 ++++++++------- tpch/notebooks/q7/execute.ipynb | 85 ++++++++-------- tpch/notebooks/q9/execute.ipynb | 42 ++++---- 24 files changed, 501 insertions(+), 476 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 57e766f59..f3a68e7a0 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,7 +1,7 @@ repos: - repo: https://github.com/astral-sh/ruff-pre-commit # Ruff version. - rev: 'v0.5.7' + rev: 'v0.6.3' hooks: # Run the formatter. 
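  # Aside, not part of the patch: per the commit message, the `rev:` bumps in
  # this hunk are what `pre-commit autoupdate` writes. To reproduce them locally
  # with the standard pre-commit CLI:
  #     pre-commit autoupdate        # rewrites each hook's `rev:` pin in place
  #     pre-commit run --all-files   # re-lints the whole repo with the new pins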
- id: ruff-format @@ -9,7 +9,7 @@ repos: - id: ruff args: [--fix] - repo: https://github.com/pre-commit/mirrors-mypy - rev: 'v1.11.1' + rev: 'v1.11.2' hooks: - id: mypy additional_dependencies: ['polars==1.4.1', 'pytest==8.3.2'] diff --git a/pyproject.toml b/pyproject.toml index b3a2a0c28..c4a10603f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -79,6 +79,15 @@ lint.ignore = [ "tpch/tests/*" = ["S101"] "utils/*" = ["S311", "PTH123"] "tpch/execute/*" = ["T201"] +"tpch/notebooks/*" = [ + "ANN001", + "ANN201", + "EM101", + "EXE002", + "PTH123", + "T203", + "TRY003", +] [tool.ruff.lint.pydocstyle] convention = "google" diff --git a/tests/expr_and_series/arithmetic_test.py b/tests/expr_and_series/arithmetic_test.py index 47d3e8ff0..7ff945c80 100644 --- a/tests/expr_and_series/arithmetic_test.py +++ b/tests/expr_and_series/arithmetic_test.py @@ -149,7 +149,7 @@ def test_truediv_same_dims(constructor_eager: Any, request: Any) -> None: compare_dicts({"a": result}, {"a": [2, 1, 1 / 3]}) -@pytest.mark.slow() +@pytest.mark.slow @given( # type: ignore[misc] left=st.integers(-100, 100), right=st.integers(-100, 100), @@ -189,7 +189,7 @@ def test_floordiv(left: int, right: int) -> None: compare_dicts(result, expected) -@pytest.mark.slow() +@pytest.mark.slow @given( # type: ignore[misc] left=st.integers(-100, 100), right=st.integers(-100, 100), diff --git a/tests/expr_and_series/dt/ordinal_day_test.py b/tests/expr_and_series/dt/ordinal_day_test.py index 1cb464259..2681188df 100644 --- a/tests/expr_and_series/dt/ordinal_day_test.py +++ b/tests/expr_and_series/dt/ordinal_day_test.py @@ -17,7 +17,7 @@ parse_version(pd.__version__) < parse_version("2.0.0"), reason="pyarrow dtype not available", ) -@pytest.mark.slow() +@pytest.mark.slow def test_ordinal_day(dates: datetime) -> None: result_pd = nw.from_native(pd.Series([dates]), series_only=True).dt.ordinal_day()[0] result_pdms = nw.from_native( diff --git a/tests/expr_and_series/dt/total_minutes_test.py b/tests/expr_and_series/dt/total_minutes_test.py index f2469e495..bcd664442 100644 --- a/tests/expr_and_series/dt/total_minutes_test.py +++ b/tests/expr_and_series/dt/total_minutes_test.py @@ -22,7 +22,7 @@ parse_version(pd.__version__) < parse_version("2.2.0"), reason="pyarrow dtype not available", ) -@pytest.mark.slow() +@pytest.mark.slow def test_total_minutes(timedeltas: timedelta) -> None: result_pd = nw.from_native( pd.Series([timedeltas]), series_only=True diff --git a/tests/hypothesis/test_basic_arithmetic.py b/tests/hypothesis/test_basic_arithmetic.py index 2ab7bad7b..00818271d 100644 --- a/tests/hypothesis/test_basic_arithmetic.py +++ b/tests/hypothesis/test_basic_arithmetic.py @@ -22,7 +22,7 @@ max_size=3, ), ) # type: ignore[misc] -@pytest.mark.slow() +@pytest.mark.slow def test_mean( integer: st.SearchStrategy[list[int]], floats: st.SearchStrategy[float], diff --git a/tests/hypothesis/test_concat.py b/tests/hypothesis/test_concat.py index 1b1248628..9ae54dbc4 100644 --- a/tests/hypothesis/test_concat.py +++ b/tests/hypothesis/test_concat.py @@ -31,7 +31,7 @@ ), how=st.sampled_from(["horizontal", "vertical"]), ) # type: ignore[misc] -@pytest.mark.slow() +@pytest.mark.slow @pytest.mark.skipif(is_windows(), reason="pyarrow breaking on windows") def test_concat( # pragma: no cover integers: list[int], diff --git a/tests/hypothesis/test_join.py b/tests/hypothesis/test_join.py index ebdb88757..bc1cd735c 100644 --- a/tests/hypothesis/test_join.py +++ b/tests/hypothesis/test_join.py @@ -42,7 +42,7 @@ ) # type: ignore[misc] 
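# Note on the mark rewrites in this file: a bare pytest mark and a zero-argument
# call to it apply the identical mark, so `@pytest.mark.slow()` and
# `@pytest.mark.slow` are interchangeable; the change is purely stylistic and is
# the spelling the newly pinned ruff release prefers by default.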
@pytest.mark.skipif(pl_version < parse_version("0.20.13"), reason="0.0 == -0.0") @pytest.mark.skipif(pd_version < parse_version("2.0.0"), reason="requires pyarrow") -@pytest.mark.slow() +@pytest.mark.slow def test_join( # pragma: no cover integers: st.SearchStrategy[list[int]], other_integers: st.SearchStrategy[list[int]], @@ -88,7 +88,7 @@ def test_join( # pragma: no cover max_size=3, ), ) # type: ignore[misc] -@pytest.mark.slow() +@pytest.mark.slow @pytest.mark.skipif(pd_version < parse_version("2.0.0"), reason="requires pyarrow") def test_cross_join( # pragma: no cover integers: st.SearchStrategy[list[int]], @@ -135,7 +135,7 @@ def test_cross_join( # pragma: no cover st.sampled_from(["a", "b", "d"]), min_size=1, max_size=3, unique=True ), ) -@pytest.mark.slow() +@pytest.mark.slow @pytest.mark.filterwarnings("ignore:the default coalesce behavior") def test_left_join( # pragma: no cover a_left_data: list[int], diff --git a/tpch/notebooks/q1/execute.ipynb b/tpch/notebooks/q1/execute.ipynb index cc6dd4559..de9c52baa 100755 --- a/tpch/notebooks/q1/execute.ipynb +++ b/tpch/notebooks/q1/execute.ipynb @@ -58,10 +58,12 @@ }, "outputs": [], "source": [ - "from typing import Any\n", "from datetime import datetime\n", + "from typing import Any\n", + "\n", "import narwhals as nw\n", "\n", + "\n", "@nw.narwhalify\n", "def q1(lineitem_ds: Any) -> Any:\n", " var_1 = datetime(1998, 9, 2)\n", @@ -107,14 +109,14 @@ "outputs": [], "source": [ "dir_ = \"/kaggle/input/tpc-h-data-parquet-s-2/\"\n", - "region = dir_ + 'region.parquet'\n", - "nation = dir_ + 'nation.parquet'\n", - "customer = dir_ + 'customer.parquet'\n", - "lineitem = dir_ + 'lineitem.parquet'\n", - "orders = dir_ + 'orders.parquet'\n", - "supplier = dir_ + 'supplier.parquet'\n", - "part = dir_ + 'part.parquet'\n", - "partsupp = dir_ + 'partsupp.parquet'" + "region = dir_ + \"region.parquet\"\n", + "nation = dir_ + \"nation.parquet\"\n", + "customer = dir_ + \"customer.parquet\"\n", + "lineitem = dir_ + \"lineitem.parquet\"\n", + "orders = dir_ + \"orders.parquet\"\n", + "supplier = dir_ + \"supplier.parquet\"\n", + "part = dir_ + \"part.parquet\"\n", + "partsupp = dir_ + \"partsupp.parquet\"" ] }, { @@ -133,16 +135,18 @@ }, "outputs": [], "source": [ - "import pyarrow.parquet as pq\n", "import dask.dataframe as dd\n", + "import pyarrow.parquet as pq\n", "\n", "IO_FUNCS = {\n", - " 'pandas': lambda x: pd.read_parquet(x, engine='pyarrow'),\n", - " 'pandas[pyarrow]': lambda x: pd.read_parquet(x, engine='pyarrow', dtype_backend='pyarrow'),\n", - " 'polars[eager]': lambda x: pl.read_parquet(x),\n", - " 'polars[lazy]': lambda x: pl.scan_parquet(x),\n", - " 'pyarrow': lambda x: pq.read_table(x),\n", - " 'dask': lambda x: dd.read_parquet(x, engine='pyarrow', dtype_backend='pyarrow'),\n", + " \"pandas\": lambda x: pd.read_parquet(x, engine=\"pyarrow\"),\n", + " \"pandas[pyarrow]\": lambda x: pd.read_parquet(\n", + " x, engine=\"pyarrow\", dtype_backend=\"pyarrow\"\n", + " ),\n", + " \"polars[eager]\": lambda x: pl.read_parquet(x),\n", + " \"polars[lazy]\": lambda x: pl.scan_parquet(x),\n", + " \"pyarrow\": lambda x: pq.read_table(x),\n", + " \"dask\": lambda x: dd.read_parquet(x, engine=\"pyarrow\", dtype_backend=\"pyarrow\"),\n", "}" ] }, @@ -171,7 +175,7 @@ "metadata": {}, "outputs": [], "source": [ - "tool = 'pyarrow'\n", + "tool = \"pyarrow\"\n", "fn = IO_FUNCS[tool]\n", "timings = %timeit -o -q q1(fn(lineitem))\n", "results[tool] = timings.all_runs" @@ -210,7 +214,7 @@ }, "outputs": [], "source": [ - "tool = 'pandas'\n", + "tool = \"pandas\"\n", 
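    "# Illustrative aside, not from the patch: `IO_FUNCS` maps a backend label\n",
    "# to a parquet loader, so `fn = IO_FUNCS[tool]` selects the reader and the\n",
    "# `%timeit` line below times the same narwhals query on that backend.\n",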
"fn = IO_FUNCS[tool]\n", "timings = %timeit -o -q q1(lineitem_ds=fn(lineitem))\n", "results[tool] = timings.all_runs" @@ -249,7 +253,7 @@ }, "outputs": [], "source": [ - "tool = 'pandas[pyarrow]'\n", + "tool = \"pandas[pyarrow]\"\n", "fn = IO_FUNCS[tool]\n", "timings = %timeit -o -q q1(fn(lineitem))\n", "results[tool] = timings.all_runs" @@ -288,7 +292,7 @@ }, "outputs": [], "source": [ - "tool = 'polars[eager]'\n", + "tool = \"polars[eager]\"\n", "fn = IO_FUNCS[tool]\n", "timings = %timeit -o -q q1(fn(lineitem))\n", "results[tool] = timings.all_runs" @@ -327,7 +331,7 @@ }, "outputs": [], "source": [ - "tool = 'polars[lazy]'\n", + "tool = \"polars[lazy]\"\n", "fn = IO_FUNCS[tool]\n", "timings = %timeit -o -q q1(fn(lineitem)).collect()\n", "results[tool] = timings.all_runs" @@ -348,7 +352,7 @@ "metadata": {}, "outputs": [], "source": [ - "tool = 'dask'\n", + "tool = \"dask\"\n", "fn = IO_FUNCS[tool]\n", "timings = %timeit -o -q q1(fn(lineitem)).collect()\n", "results[tool] = timings.all_runs" @@ -370,8 +374,9 @@ "outputs": [], "source": [ "import json\n", - "with open('results.json', 'w') as fd:\n", - " json.dump(results, fd)\n" + "\n", + "with open(\"results.json\", \"w\") as fd:\n", + " json.dump(results, fd)" ] } ], diff --git a/tpch/notebooks/q10/execute.ipynb b/tpch/notebooks/q10/execute.ipynb index 85ec0f14b..9ff211773 100644 --- a/tpch/notebooks/q10/execute.ipynb +++ b/tpch/notebooks/q10/execute.ipynb @@ -55,22 +55,23 @@ }, "outputs": [], "source": [ - "from typing import Any\n", "from datetime import datetime\n", + "from typing import Any\n", + "\n", "import narwhals as nw\n", "\n", + "\n", "def q10(\n", " customer_ds_raw: Any,\n", " nation_ds_raw: Any,\n", " lineitem_ds_raw: Any,\n", " orders_ds_raw: Any,\n", ") -> Any:\n", - "\n", " nation_ds = nw.from_native(nation_ds_raw)\n", " line_item_ds = nw.from_native(lineitem_ds_raw)\n", " orders_ds = nw.from_native(orders_ds_raw)\n", " customer_ds = nw.from_native(customer_ds_raw)\n", - " \n", + "\n", " var1 = datetime(1993, 10, 1)\n", " var2 = datetime(1994, 1, 1)\n", "\n", @@ -81,8 +82,7 @@ " .filter(nw.col(\"o_orderdate\").is_between(var1, var2, closed=\"left\"))\n", " .filter(nw.col(\"l_returnflag\") == \"R\")\n", " .with_columns(\n", - " (nw.col(\"l_extendedprice\") * (1 - nw.col(\"l_discount\")))\n", - " .alias(\"revenue\")\n", + " (nw.col(\"l_extendedprice\") * (1 - nw.col(\"l_discount\"))).alias(\"revenue\")\n", " )\n", " .group_by(\n", " \"c_custkey\",\n", @@ -127,10 +127,10 @@ "outputs": [], "source": [ "dir_ = \"/kaggle/input/tpc-h-data-parquet-s-2/\"\n", - "nation = dir_ + 'nation.parquet'\n", - "lineitem = dir_ + 'lineitem.parquet'\n", - "orders = dir_ + 'orders.parquet'\n", - "customer = dir_ + 'customer.parquet'" + "nation = dir_ + \"nation.parquet\"\n", + "lineitem = dir_ + \"lineitem.parquet\"\n", + "orders = dir_ + \"orders.parquet\"\n", + "customer = dir_ + \"customer.parquet\"" ] }, { @@ -149,10 +149,12 @@ "outputs": [], "source": [ "IO_FUNCS = {\n", - " 'pandas': lambda x: pd.read_parquet(x, engine='pyarrow'),\n", - " 'pandas[pyarrow]': lambda x: pd.read_parquet(x, engine='pyarrow', dtype_backend='pyarrow'),\n", - " 'polars[eager]': lambda x: pl.read_parquet(x),\n", - " 'polars[lazy]': lambda x: pl.scan_parquet(x),\n", + " \"pandas\": lambda x: pd.read_parquet(x, engine=\"pyarrow\"),\n", + " \"pandas[pyarrow]\": lambda x: pd.read_parquet(\n", + " x, engine=\"pyarrow\", dtype_backend=\"pyarrow\"\n", + " ),\n", + " \"polars[eager]\": lambda x: pl.read_parquet(x),\n", + " \"polars[lazy]\": lambda x: 
pl.scan_parquet(x),\n", "}" ] }, @@ -196,7 +198,7 @@ }, "outputs": [], "source": [ - "tool = 'pandas'\n", + "tool = \"pandas\"\n", "fn = IO_FUNCS[tool]\n", "timings = %timeit -o -q q10(fn(customer), fn(nation), fn(lineitem), fn(orders))\n", "results[tool] = timings.all_runs" @@ -233,7 +235,7 @@ }, "outputs": [], "source": [ - "tool = 'pandas[pyarrow]'\n", + "tool = \"pandas[pyarrow]\"\n", "fn = IO_FUNCS[tool]\n", "timings = %timeit -o -q q10(fn(customer), fn(nation), fn(lineitem), fn(orders))\n", "results[tool] = timings.all_runs" @@ -270,7 +272,7 @@ }, "outputs": [], "source": [ - "tool = 'polars[eager]'\n", + "tool = \"polars[eager]\"\n", "fn = IO_FUNCS[tool]\n", "timings = %timeit -o -q q10(fn(customer), fn(nation), fn(lineitem), fn(orders))\n", "results[tool] = timings.all_runs" @@ -307,7 +309,7 @@ }, "outputs": [], "source": [ - "tool = 'polars[lazy]'\n", + "tool = \"polars[lazy]\"\n", "fn = IO_FUNCS[tool]\n", "timings = %timeit -o -q q10(fn(customer), fn(nation), fn(lineitem), fn(orders)).collect()\n", "results[tool] = timings.all_runs" @@ -327,8 +329,9 @@ "outputs": [], "source": [ "import json\n", - "with open('results.json', 'w') as fd:\n", - " json.dump(results, fd)\n" + "\n", + "with open(\"results.json\", \"w\") as fd:\n", + " json.dump(results, fd)" ] } ], diff --git a/tpch/notebooks/q11/execute.ipynb b/tpch/notebooks/q11/execute.ipynb index 33951d922..f5bbc0f9c 100644 --- a/tpch/notebooks/q11/execute.ipynb +++ b/tpch/notebooks/q11/execute.ipynb @@ -15,7 +15,7 @@ }, "outputs": [], "source": [ - "!pip uninstall apache-beam -y && pip install -U pandas polars pyarrow narwhals " + "!pip uninstall apache-beam -y && pip install -U pandas polars pyarrow narwhals" ] }, { @@ -56,19 +56,19 @@ "outputs": [], "source": [ "from typing import Any\n", + "\n", "import narwhals as nw\n", "\n", + "\n", "def q11(\n", " partsupp_ds_raw: Any,\n", " nation_ds_raw: Any,\n", " supplier_ds_raw: Any,\n", ") -> Any:\n", - "\n", " nation_ds = nw.from_native(nation_ds_raw)\n", " partsupp_ds = nw.from_native(partsupp_ds_raw)\n", " supplier_ds = nw.from_native(supplier_ds_raw)\n", "\n", - " \n", " var1 = \"GERMANY\"\n", " var2 = 0.0001\n", "\n", @@ -83,14 +83,9 @@ " )\n", "\n", " q_final = (\n", - " q1.with_columns(\n", - " (nw.col(\"ps_supplycost\") * nw.col(\"ps_availqty\"))\n", - " .alias(\"value\")\n", - " )\n", + " q1.with_columns((nw.col(\"ps_supplycost\") * nw.col(\"ps_availqty\")).alias(\"value\"))\n", " .group_by(\"ps_partkey\")\n", - " .agg(\n", - " nw.sum(\"value\")\n", - " )\n", + " .agg(nw.sum(\"value\"))\n", " .join(q2, how=\"cross\")\n", " .filter(nw.col(\"value\") > nw.col(\"tmp\"))\n", " .select(\"ps_partkey\", \"value\")\n", @@ -116,9 +111,9 @@ "outputs": [], "source": [ "dir_ = \"/kaggle/input/tpc-h-data-parquet-s-2/\"\n", - "nation = dir_ + 'nation.parquet'\n", - "supplier = dir_ + 'supplier.parquet'\n", - "partsupp = dir_ + 'partsupp.parquet'" + "nation = dir_ + \"nation.parquet\"\n", + "supplier = dir_ + \"supplier.parquet\"\n", + "partsupp = dir_ + \"partsupp.parquet\"" ] }, { @@ -137,10 +132,12 @@ "outputs": [], "source": [ "IO_FUNCS = {\n", - " 'pandas': lambda x: pd.read_parquet(x, engine='pyarrow'),\n", - " 'pandas[pyarrow]': lambda x: pd.read_parquet(x, engine='pyarrow', dtype_backend='pyarrow'),\n", - " 'polars[eager]': lambda x: pl.read_parquet(x),\n", - " 'polars[lazy]': lambda x: pl.scan_parquet(x),\n", + " \"pandas\": lambda x: pd.read_parquet(x, engine=\"pyarrow\"),\n", + " \"pandas[pyarrow]\": lambda x: pd.read_parquet(\n", + " x, engine=\"pyarrow\", 
dtype_backend=\"pyarrow\"\n", + " ),\n", + " \"polars[eager]\": lambda x: pl.read_parquet(x),\n", + " \"polars[lazy]\": lambda x: pl.scan_parquet(x),\n", "}" ] }, @@ -184,7 +181,7 @@ }, "outputs": [], "source": [ - "tool = 'pandas'\n", + "tool = \"pandas\"\n", "fn = IO_FUNCS[tool]\n", "timings = %timeit -o -q q11(fn(partsupp), fn(nation), fn(supplier))\n", "results[tool] = timings.all_runs" @@ -221,7 +218,7 @@ }, "outputs": [], "source": [ - "tool = 'pandas[pyarrow]'\n", + "tool = \"pandas[pyarrow]\"\n", "fn = IO_FUNCS[tool]\n", "timings = %timeit -o -q q11(fn(partsupp), fn(nation), fn(supplier))\n", "results[tool] = timings.all_runs" @@ -258,7 +255,7 @@ }, "outputs": [], "source": [ - "tool = 'polars[eager]'\n", + "tool = \"polars[eager]\"\n", "fn = IO_FUNCS[tool]\n", "timings = %timeit -o -q q11(fn(partsupp), fn(nation), fn(supplier))\n", "results[tool] = timings.all_runs" @@ -295,7 +292,7 @@ }, "outputs": [], "source": [ - "tool = 'polars[lazy]'\n", + "tool = \"polars[lazy]\"\n", "fn = IO_FUNCS[tool]\n", "timings = %timeit -o -q q11(fn(partsupp), fn(nation), fn(supplier)).collect()\n", "results[tool] = timings.all_runs" @@ -315,8 +312,9 @@ "outputs": [], "source": [ "import json\n", - "with open('results.json', 'w') as fd:\n", - " json.dump(results, fd)\n" + "\n", + "with open(\"results.json\", \"w\") as fd:\n", + " json.dump(results, fd)" ] }, { diff --git a/tpch/notebooks/q15/execute.ipynb b/tpch/notebooks/q15/execute.ipynb index 0baf11956..d108a7196 100644 --- a/tpch/notebooks/q15/execute.ipynb +++ b/tpch/notebooks/q15/execute.ipynb @@ -15,7 +15,7 @@ }, "outputs": [], "source": [ - "!pip uninstall apache-beam -y && pip install -U pandas polars pyarrow narwhals " + "!pip uninstall apache-beam -y && pip install -U pandas polars pyarrow narwhals" ] }, { @@ -55,32 +55,34 @@ }, "outputs": [], "source": [ + "from datetime import datetime\n", "from typing import Any\n", + "\n", "import narwhals as nw\n", - "from datetime import datetime\n", + "\n", "\n", "def q15(\n", " lineitem_ds_raw: Any,\n", " supplier_ds_raw: Any,\n", ") -> Any:\n", - "\n", " lineitem_ds = nw.from_native(lineitem_ds_raw)\n", " supplier_ds = nw.from_native(supplier_ds_raw)\n", - " \n", + "\n", " var1 = datetime(1996, 1, 1)\n", " var2 = datetime(1996, 4, 1)\n", "\n", " revenue = (\n", " lineitem_ds.filter(nw.col(\"l_shipdate\").is_between(var1, var2, closed=\"left\"))\n", " .with_columns(\n", - " (nw.col(\"l_extendedprice\") * (1 - nw.col(\"l_discount\")))\n", - " .alias(\"total_revenue\")\n", + " (nw.col(\"l_extendedprice\") * (1 - nw.col(\"l_discount\"))).alias(\n", + " \"total_revenue\"\n", + " )\n", " )\n", " .group_by(\"l_suppkey\")\n", " .agg(nw.sum(\"total_revenue\"))\n", " .select(nw.col(\"l_suppkey\").alias(\"supplier_no\"), nw.col(\"total_revenue\"))\n", " )\n", - " \n", + "\n", " result = (\n", " supplier_ds.join(revenue, left_on=\"s_suppkey\", right_on=\"supplier_no\")\n", " .filter(nw.col(\"total_revenue\") == nw.col(\"total_revenue\").max())\n", @@ -108,8 +110,8 @@ "outputs": [], "source": [ "dir_ = \"/kaggle/input/tpc-h-data-parquet-s-2/\"\n", - "lineitem = dir_ + 'lineitem.parquet'\n", - "supplier = dir_ + 'supplier.parquet'" + "lineitem = dir_ + \"lineitem.parquet\"\n", + "supplier = dir_ + \"supplier.parquet\"" ] }, { @@ -128,10 +130,12 @@ "outputs": [], "source": [ "IO_FUNCS = {\n", - " 'pandas': lambda x: pd.read_parquet(x, engine='pyarrow'),\n", - " 'pandas[pyarrow]': lambda x: pd.read_parquet(x, engine='pyarrow', dtype_backend='pyarrow'),\n", - " 'polars[eager]': lambda x: pl.read_parquet(x),\n", - 
" 'polars[lazy]': lambda x: pl.scan_parquet(x),\n", + " \"pandas\": lambda x: pd.read_parquet(x, engine=\"pyarrow\"),\n", + " \"pandas[pyarrow]\": lambda x: pd.read_parquet(\n", + " x, engine=\"pyarrow\", dtype_backend=\"pyarrow\"\n", + " ),\n", + " \"polars[eager]\": lambda x: pl.read_parquet(x),\n", + " \"polars[lazy]\": lambda x: pl.scan_parquet(x),\n", "}" ] }, @@ -175,7 +179,7 @@ }, "outputs": [], "source": [ - "tool = 'pandas'\n", + "tool = \"pandas\"\n", "fn = IO_FUNCS[tool]\n", "timings = %timeit -o -q q15(fn(lineitem), fn(supplier))\n", "results[tool] = timings.all_runs" @@ -212,7 +216,7 @@ }, "outputs": [], "source": [ - "tool = 'pandas[pyarrow]'\n", + "tool = \"pandas[pyarrow]\"\n", "fn = IO_FUNCS[tool]\n", "timings = %timeit -o -q q15(fn(lineitem), fn(supplier))\n", "results[tool] = timings.all_runs" @@ -249,7 +253,7 @@ }, "outputs": [], "source": [ - "tool = 'polars[eager]'\n", + "tool = \"polars[eager]\"\n", "fn = IO_FUNCS[tool]\n", "timings = %timeit -o -q q15(fn(lineitem), fn(supplier))\n", "results[tool] = timings.all_runs" @@ -286,7 +290,7 @@ }, "outputs": [], "source": [ - "tool = 'polars[lazy]'\n", + "tool = \"polars[lazy]\"\n", "fn = IO_FUNCS[tool]\n", "timings = %timeit -o -q q15(fn(lineitem), fn(supplier)).collect()\n", "results[tool] = timings.all_runs" @@ -306,8 +310,9 @@ "outputs": [], "source": [ "import json\n", - "with open('results.json', 'w') as fd:\n", - " json.dump(results, fd)\n" + "\n", + "with open(\"results.json\", \"w\") as fd:\n", + " json.dump(results, fd)" ] } ], diff --git a/tpch/notebooks/q17/execute.ipynb b/tpch/notebooks/q17/execute.ipynb index b13445d28..4d012f088 100644 --- a/tpch/notebooks/q17/execute.ipynb +++ b/tpch/notebooks/q17/execute.ipynb @@ -15,7 +15,7 @@ }, "outputs": [], "source": [ - "!pip uninstall apache-beam -y && pip install -U pandas polars pyarrow narwhals " + "!pip uninstall apache-beam -y && pip install -U pandas polars pyarrow narwhals" ] }, { @@ -56,25 +56,23 @@ "outputs": [], "source": [ "from typing import Any\n", + "\n", "import narwhals as nw\n", "\n", - "def q17(\n", - " lineitem_ds_raw: Any,\n", - " part_ds_raw: Any\n", - ") -> Any:\n", "\n", + "def q17(lineitem_ds_raw: Any, part_ds_raw: Any) -> Any:\n", " lineitem_ds = nw.from_native(lineitem_ds_raw)\n", " part_ds = nw.from_native(part_ds_raw)\n", - " \n", + "\n", " var1 = \"Brand#23\"\n", " var2 = \"MED BOX\"\n", - " \n", + "\n", " query1 = (\n", " part_ds.filter(nw.col(\"p_brand\") == var1)\n", " .filter(nw.col(\"p_container\") == var2)\n", " .join(lineitem_ds, how=\"left\", left_on=\"p_partkey\", right_on=\"l_partkey\")\n", " )\n", - " \n", + "\n", " final_query = (\n", " query1.group_by(\"p_partkey\")\n", " .agg((0.2 * nw.col(\"l_quantity\").mean()).alias(\"avg_quantity\"))\n", @@ -84,7 +82,6 @@ " .select((nw.col(\"l_extendedprice\").sum() / 7.0).round(2).alias(\"avg_yearly\"))\n", " )\n", "\n", - "\n", " return nw.to_native(final_query)" ] }, @@ -104,8 +101,8 @@ "outputs": [], "source": [ "dir_ = \"/kaggle/input/tpc-h-data-parquet-s-2/\"\n", - "lineitem = dir_ + 'lineitem.parquet'\n", - "part = dir_ + 'part.parquet'" + "lineitem = dir_ + \"lineitem.parquet\"\n", + "part = dir_ + \"part.parquet\"" ] }, { @@ -124,10 +121,12 @@ "outputs": [], "source": [ "IO_FUNCS = {\n", - " 'pandas': lambda x: pd.read_parquet(x, engine='pyarrow'),\n", - " 'pandas[pyarrow]': lambda x: pd.read_parquet(x, engine='pyarrow', dtype_backend='pyarrow'),\n", - " 'polars[eager]': lambda x: pl.read_parquet(x),\n", - " 'polars[lazy]': lambda x: pl.scan_parquet(x),\n", + " \"pandas\": 
lambda x: pd.read_parquet(x, engine=\"pyarrow\"),\n", + " \"pandas[pyarrow]\": lambda x: pd.read_parquet(\n", + " x, engine=\"pyarrow\", dtype_backend=\"pyarrow\"\n", + " ),\n", + " \"polars[eager]\": lambda x: pl.read_parquet(x),\n", + " \"polars[lazy]\": lambda x: pl.scan_parquet(x),\n", "}" ] }, @@ -171,7 +170,7 @@ }, "outputs": [], "source": [ - "tool = 'pandas'\n", + "tool = \"pandas\"\n", "fn = IO_FUNCS[tool]\n", "timings = %timeit -o -q q17(fn(lineitem), fn(part))\n", "results[tool] = timings.all_runs" @@ -208,7 +207,7 @@ }, "outputs": [], "source": [ - "tool = 'pandas[pyarrow]'\n", + "tool = \"pandas[pyarrow]\"\n", "fn = IO_FUNCS[tool]\n", "timings = %timeit -o -q q17(fn(lineitem), fn(part))\n", "results[tool] = timings.all_runs" @@ -245,7 +244,7 @@ }, "outputs": [], "source": [ - "tool = 'polars[eager]'\n", + "tool = \"polars[eager]\"\n", "fn = IO_FUNCS[tool]\n", "timings = %timeit -o -q q17(fn(lineitem), fn(part))\n", "results[tool] = timings.all_runs" @@ -282,7 +281,7 @@ }, "outputs": [], "source": [ - "tool = 'polars[lazy]'\n", + "tool = \"polars[lazy]\"\n", "fn = IO_FUNCS[tool]\n", "timings = %timeit -o -q q17(fn(lineitem), fn(part)).collect()\n", "results[tool] = timings.all_runs" @@ -302,8 +301,9 @@ "outputs": [], "source": [ "import json\n", - "with open('results.json', 'w') as fd:\n", - " json.dump(results, fd)\n" + "\n", + "with open(\"results.json\", \"w\") as fd:\n", + " json.dump(results, fd)" ] } ], diff --git a/tpch/notebooks/q18/execute.ipynb b/tpch/notebooks/q18/execute.ipynb index c90629e0f..edf635d9e 100644 --- a/tpch/notebooks/q18/execute.ipynb +++ b/tpch/notebooks/q18/execute.ipynb @@ -6,7 +6,7 @@ "metadata": {}, "outputs": [], "source": [ - "!pip uninstall apache-beam -y && pip install -U pandas polars pyarrow narwhals " + "!pip uninstall apache-beam -y && pip install -U pandas polars pyarrow narwhals" ] }, { @@ -29,18 +29,15 @@ "outputs": [], "source": [ "from typing import Any\n", + "\n", "import narwhals as nw\n", "\n", - "def q18(\n", - " customer_ds_raw: Any,\n", - " lineitem_ds_raw: Any,\n", - " orders_ds_raw: Any\n", - ") -> Any:\n", "\n", + "def q18(customer_ds_raw: Any, lineitem_ds_raw: Any, orders_ds_raw: Any) -> Any:\n", " customer_ds = nw.from_native(customer_ds_raw)\n", " lineitem_ds = nw.from_native(lineitem_ds_raw)\n", " orders_ds = nw.from_native(orders_ds_raw)\n", - " \n", + "\n", " var1 = 300\n", "\n", " query1 = (\n", @@ -67,7 +64,6 @@ " .head(100)\n", " )\n", "\n", - "\n", " return nw.to_native(q_final)" ] }, @@ -78,9 +74,9 @@ "outputs": [], "source": [ "dir_ = \"/kaggle/input/tpc-h-data-parquet-s-2/\"\n", - "customer = dir_ + 'customer.parquet'\n", - "lineitem = dir_ + 'lineitem.parquet'\n", - "orders = dir_ + 'orders.parquet'" + "customer = dir_ + \"customer.parquet\"\n", + "lineitem = dir_ + \"lineitem.parquet\"\n", + "orders = dir_ + \"orders.parquet\"" ] }, { @@ -90,10 +86,12 @@ "outputs": [], "source": [ "IO_FUNCS = {\n", - " 'pandas': lambda x: pd.read_parquet(x, engine='pyarrow'),\n", - " 'pandas[pyarrow]': lambda x: pd.read_parquet(x, engine='pyarrow', dtype_backend='pyarrow'),\n", - " 'polars[eager]': lambda x: pl.read_parquet(x),\n", - " 'polars[lazy]': lambda x: pl.scan_parquet(x),\n", + " \"pandas\": lambda x: pd.read_parquet(x, engine=\"pyarrow\"),\n", + " \"pandas[pyarrow]\": lambda x: pd.read_parquet(\n", + " x, engine=\"pyarrow\", dtype_backend=\"pyarrow\"\n", + " ),\n", + " \"polars[eager]\": lambda x: pl.read_parquet(x),\n", + " \"polars[lazy]\": lambda x: pl.scan_parquet(x),\n", "}" ] }, @@ -119,7 +117,7 @@ "metadata": 
{}, "outputs": [], "source": [ - "tool = 'pandas'\n", + "tool = \"pandas\"\n", "fn = IO_FUNCS[tool]\n", "timings = %timeit -o -q q19(fn(lineitem), fn(part))\n", "results[tool] = timings.all_runs" @@ -138,7 +136,7 @@ "metadata": {}, "outputs": [], "source": [ - "tool = 'pandas[pyarrow]'\n", + "tool = \"pandas[pyarrow]\"\n", "fn = IO_FUNCS[tool]\n", "timings = %timeit -o -q q18(fn(customer), fn(lineitem), fn(orders))\n", "results[tool] = timings.all_runs" @@ -157,7 +155,7 @@ "metadata": {}, "outputs": [], "source": [ - "tool = 'polars[eager]'\n", + "tool = \"polars[eager]\"\n", "fn = IO_FUNCS[tool]\n", "timings = %timeit -o -q q18(fn(customer), fn(lineitem), fn(orders))\n", "results[tool] = timings.all_runs" @@ -176,7 +174,7 @@ "metadata": {}, "outputs": [], "source": [ - "tool = 'polars[lazy]'\n", + "tool = \"polars[lazy]\"\n", "fn = IO_FUNCS[tool]\n", "timings = %timeit -o -q q18(fn(customer), fn(lineitem), fn(orders)).collect()\n", "results[tool] = timings.all_runs" @@ -196,8 +194,9 @@ "outputs": [], "source": [ "import json\n", - "with open('results.json', 'w') as fd:\n", - " json.dump(results, fd)\n" + "\n", + "with open(\"results.json\", \"w\") as fd:\n", + " json.dump(results, fd)" ] } ], diff --git a/tpch/notebooks/q19/execute.ipynb b/tpch/notebooks/q19/execute.ipynb index 8483e06d5..8860cc773 100644 --- a/tpch/notebooks/q19/execute.ipynb +++ b/tpch/notebooks/q19/execute.ipynb @@ -15,7 +15,7 @@ }, "outputs": [], "source": [ - "!pip uninstall apache-beam -y && pip install -U pandas polars pyarrow narwhals " + "!pip uninstall apache-beam -y && pip install -U pandas polars pyarrow narwhals" ] }, { @@ -56,14 +56,11 @@ "outputs": [], "source": [ "from typing import Any\n", + "\n", "import narwhals as nw\n", "\n", - "def q19(\n", - " lineitem_ds_raw: Any,\n", - " part_ds_raw: Any\n", - " \n", - ") -> Any:\n", "\n", + "def q19(lineitem_ds_raw: Any, part_ds_raw: Any) -> Any:\n", " lineitem_ds = nw.from_native(lineitem_ds_raw)\n", " part_ds = nw.from_native(part_ds_raw)\n", "\n", @@ -74,9 +71,7 @@ " .filter(\n", " (\n", " (nw.col(\"p_brand\") == \"Brand#12\")\n", - " & nw.col(\"p_container\").is_in(\n", - " [\"SM CASE\", \"SM BOX\", \"SM PACK\", \"SM PKG\"]\n", - " )\n", + " & nw.col(\"p_container\").is_in([\"SM CASE\", \"SM BOX\", \"SM PACK\", \"SM PKG\"])\n", " & (nw.col(\"l_quantity\").is_between(1, 11))\n", " & (nw.col(\"p_size\").is_between(1, 5))\n", " )\n", @@ -90,9 +85,7 @@ " )\n", " | (\n", " (nw.col(\"p_brand\") == \"Brand#34\")\n", - " & nw.col(\"p_container\").is_in(\n", - " [\"LG CASE\", \"LG BOX\", \"LG PACK\", \"LG PKG\"]\n", - " )\n", + " & nw.col(\"p_container\").is_in([\"LG CASE\", \"LG BOX\", \"LG PACK\", \"LG PKG\"])\n", " & (nw.col(\"l_quantity\").is_between(20, 30))\n", " & (nw.col(\"p_size\").is_between(1, 15))\n", " )\n", @@ -105,7 +98,6 @@ " )\n", " )\n", "\n", - "\n", " return nw.to_native(result)" ] }, @@ -125,8 +117,8 @@ "outputs": [], "source": [ "dir_ = \"/kaggle/input/tpc-h-data-parquet-s-2/\"\n", - "lineitem = dir_ + 'lineitem.parquet'\n", - "part = dir_ + 'part.parquet'" + "lineitem = dir_ + \"lineitem.parquet\"\n", + "part = dir_ + \"part.parquet\"" ] }, { @@ -145,10 +137,12 @@ "outputs": [], "source": [ "IO_FUNCS = {\n", - " 'pandas': lambda x: pd.read_parquet(x, engine='pyarrow'),\n", - " 'pandas[pyarrow]': lambda x: pd.read_parquet(x, engine='pyarrow', dtype_backend='pyarrow'),\n", - " 'polars[eager]': lambda x: pl.read_parquet(x),\n", - " 'polars[lazy]': lambda x: pl.scan_parquet(x),\n", + " \"pandas\": lambda x: pd.read_parquet(x, 
engine=\"pyarrow\"),\n", + " \"pandas[pyarrow]\": lambda x: pd.read_parquet(\n", + " x, engine=\"pyarrow\", dtype_backend=\"pyarrow\"\n", + " ),\n", + " \"polars[eager]\": lambda x: pl.read_parquet(x),\n", + " \"polars[lazy]\": lambda x: pl.scan_parquet(x),\n", "}" ] }, @@ -192,7 +186,7 @@ }, "outputs": [], "source": [ - "tool = 'pandas'\n", + "tool = \"pandas\"\n", "fn = IO_FUNCS[tool]\n", "timings = %timeit -o -q q19(fn(lineitem), fn(part))\n", "results[tool] = timings.all_runs" @@ -229,7 +223,7 @@ }, "outputs": [], "source": [ - "tool = 'pandas[pyarrow]'\n", + "tool = \"pandas[pyarrow]\"\n", "fn = IO_FUNCS[tool]\n", "timings = %timeit -o -q q19(fn(lineitem), fn(part))\n", "results[tool] = timings.all_runs" @@ -266,7 +260,7 @@ }, "outputs": [], "source": [ - "tool = 'polars[eager]'\n", + "tool = \"polars[eager]\"\n", "fn = IO_FUNCS[tool]\n", "timings = %timeit -o -q q19(fn(lineitem), fn(part))\n", "results[tool] = timings.all_runs" @@ -303,7 +297,7 @@ }, "outputs": [], "source": [ - "tool = 'polars[lazy]'\n", + "tool = \"polars[lazy]\"\n", "fn = IO_FUNCS[tool]\n", "timings = %timeit -o -q q19(fn(lineitem), fn(part)).collect()\n", "results[tool] = timings.all_runs" @@ -323,8 +317,9 @@ "outputs": [], "source": [ "import json\n", - "with open('results.json', 'w') as fd:\n", - " json.dump(results, fd)\n" + "\n", + "with open(\"results.json\", \"w\") as fd:\n", + " json.dump(results, fd)" ] } ], diff --git a/tpch/notebooks/q2/execute.ipynb b/tpch/notebooks/q2/execute.ipynb index c05345336..74ba50f2a 100755 --- a/tpch/notebooks/q2/execute.ipynb +++ b/tpch/notebooks/q2/execute.ipynb @@ -69,8 +69,10 @@ "outputs": [], "source": [ "from typing import Any\n", + "\n", "import narwhals as nw\n", "\n", + "\n", "@nw.narwhalify\n", "def q2(\n", " region_ds: Any,\n", @@ -140,14 +142,14 @@ "outputs": [], "source": [ "dir_ = \"/kaggle/input/tpc-h-data-parquet-s-2/\"\n", - "region = dir_ + 'region.parquet'\n", - "nation = dir_ + 'nation.parquet'\n", - "customer = dir_ + 'customer.parquet'\n", - "lineitem = dir_ + 'lineitem.parquet'\n", - "orders = dir_ + 'orders.parquet'\n", - "supplier = dir_ + 'supplier.parquet'\n", - "part = dir_ + 'part.parquet'\n", - "partsupp = dir_ + 'partsupp.parquet'" + "region = dir_ + \"region.parquet\"\n", + "nation = dir_ + \"nation.parquet\"\n", + "customer = dir_ + \"customer.parquet\"\n", + "lineitem = dir_ + \"lineitem.parquet\"\n", + "orders = dir_ + \"orders.parquet\"\n", + "supplier = dir_ + \"supplier.parquet\"\n", + "part = dir_ + \"part.parquet\"\n", + "partsupp = dir_ + \"partsupp.parquet\"" ] }, { @@ -166,16 +168,18 @@ }, "outputs": [], "source": [ - "import pyarrow.parquet as pq\n", "import dask.dataframe as dd\n", + "import pyarrow.parquet as pq\n", "\n", "IO_FUNCS = {\n", - " 'pandas': lambda x: pd.read_parquet(x, engine='pyarrow'),\n", - " 'pandas[pyarrow]': lambda x: pd.read_parquet(x, engine='pyarrow', dtype_backend='pyarrow'),\n", - " 'polars[eager]': lambda x: pl.read_parquet(x),\n", - " 'polars[lazy]': lambda x: pl.scan_parquet(x),\n", - " 'pyarrow': lambda x: pq.read_table(x),\n", - " 'dask': lambda x: dd.read_parquet(x, engine='pyarrow', dtype_backend='pyarrow'),\n", + " \"pandas\": lambda x: pd.read_parquet(x, engine=\"pyarrow\"),\n", + " \"pandas[pyarrow]\": lambda x: pd.read_parquet(\n", + " x, engine=\"pyarrow\", dtype_backend=\"pyarrow\"\n", + " ),\n", + " \"polars[eager]\": lambda x: pl.read_parquet(x),\n", + " \"polars[lazy]\": lambda x: pl.scan_parquet(x),\n", + " \"pyarrow\": lambda x: pq.read_table(x),\n", + " \"dask\": lambda x: 
dd.read_parquet(x, engine=\"pyarrow\", dtype_backend=\"pyarrow\"),\n", "}" ] }, @@ -222,7 +226,7 @@ }, "outputs": [], "source": [ - "tool = 'pandas'\n", + "tool = \"pandas\"\n", "fn = IO_FUNCS[tool]\n", "timings = %timeit -o -q q2(fn(region), fn(nation), fn(supplier), fn(part), fn(partsupp))\n", "results[tool] = timings.all_runs" @@ -261,7 +265,7 @@ }, "outputs": [], "source": [ - "tool = 'pandas[pyarrow]'\n", + "tool = \"pandas[pyarrow]\"\n", "fn = IO_FUNCS[tool]\n", "timings = %timeit -o -q q2(fn(region), fn(nation), fn(supplier), fn(part), fn(partsupp))\n", "results[tool] = timings.all_runs" @@ -300,7 +304,7 @@ }, "outputs": [], "source": [ - "tool = 'polars[eager]'\n", + "tool = \"polars[eager]\"\n", "fn = IO_FUNCS[tool]\n", "timings = %timeit -o -q q2(fn(region), fn(nation), fn(supplier), fn(part), fn(partsupp))\n", "results[tool] = timings.all_runs" @@ -339,7 +343,7 @@ }, "outputs": [], "source": [ - "tool = 'polars[lazy]'\n", + "tool = \"polars[lazy]\"\n", "fn = IO_FUNCS[tool]\n", "timings = %timeit -o -q q2(fn(region), fn(nation), fn(supplier), fn(part), fn(partsupp)).collect()\n", "results[tool] = timings.all_runs" @@ -360,7 +364,7 @@ "metadata": {}, "outputs": [], "source": [ - "tool = 'pyarrow'\n", + "tool = \"pyarrow\"\n", "fn = IO_FUNCS[tool]\n", "timings = %timeit -o -q q2(fn(region), fn(nation), fn(supplier), fn(part), fn(partsupp))\n", "results[tool] = timings.all_runs" @@ -381,7 +385,7 @@ "metadata": {}, "outputs": [], "source": [ - "tool = 'dask'\n", + "tool = \"dask\"\n", "fn = IO_FUNCS[tool]\n", "timings = %timeit -o -q q2(fn(region), fn(nation), fn(supplier), fn(part), fn(partsupp)).compute()\n", "results[tool] = timings.all_runs" @@ -403,8 +407,9 @@ "outputs": [], "source": [ "import json\n", - "with open('results.json', 'w') as fd:\n", - " json.dump(results, fd)\n" + "\n", + "with open(\"results.json\", \"w\") as fd:\n", + " json.dump(results, fd)" ] } ], diff --git a/tpch/notebooks/q20/execute.ipynb b/tpch/notebooks/q20/execute.ipynb index aecb3a473..a9698c1ad 100644 --- a/tpch/notebooks/q20/execute.ipynb +++ b/tpch/notebooks/q20/execute.ipynb @@ -15,7 +15,7 @@ }, "outputs": [], "source": [ - "!pip uninstall apache-beam -y && pip install -U pandas polars pyarrow narwhals " + "!pip uninstall apache-beam -y && pip install -U pandas polars pyarrow narwhals" ] }, { @@ -55,24 +55,25 @@ }, "outputs": [], "source": [ + "from datetime import datetime\n", "from typing import Any\n", + "\n", "import narwhals as nw\n", - "from datetime import datetime\n", + "\n", "\n", "def q20(\n", " part_ds_raw: Any,\n", " partsupp_ds_raw: Any,\n", " nation_ds_raw: Any,\n", " lineitem_ds_raw: Any,\n", - " supplier_ds_raw: Any\n", + " supplier_ds_raw: Any,\n", ") -> Any:\n", - "\n", " part_ds = nw.from_native(part_ds_raw)\n", " nation_ds = nw.from_native(nation_ds_raw)\n", " partsupp_ds = nw.from_native(partsupp_ds_raw)\n", " lineitem_ds = nw.from_native(lineitem_ds_raw)\n", " supplier_ds = nw.from_native(supplier_ds_raw)\n", - " \n", + "\n", " var1 = datetime(1994, 1, 1)\n", " var2 = datetime(1995, 1, 1)\n", " var3 = \"CANADA\"\n", @@ -82,7 +83,7 @@ " lineitem_ds.filter(nw.col(\"l_shipdate\").is_between(var1, var2, closed=\"left\"))\n", " .group_by(\"l_partkey\", \"l_suppkey\")\n", " .agg((nw.col(\"l_quantity\").sum()).alias(\"sum_quantity\"))\n", - " .with_columns(sum_quantity = nw.col(\"sum_quantity\") * 0.5)\n", + " .with_columns(sum_quantity=nw.col(\"sum_quantity\") * 0.5)\n", " )\n", " query2 = nation_ds.filter(nw.col(\"n_name\") == var3)\n", " query3 = supplier_ds.join(query2, 
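q2 is the one notebook in this series that also benchmarks the pyarrow and dask backends, which is why the timed expression differs per tool: polars[lazy] results need `.collect()` and dask results need `.compute()` before any work actually happens. A small helper capturing that split (a sketch, not code from the notebooks):

def materialize(result, tool):
    """Force execution for lazy backends so every tool is timed end to end."""
    if tool == "polars[lazy]":
        return result.collect()
    if tool == "dask":
        return result.compute()
    # pandas, polars[eager], and pyarrow results are already materialized
    return result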
left_on=\"s_nationkey\", right_on=\"n_nationkey\")\n", @@ -103,7 +104,6 @@ " .sort(\"s_name\")\n", " )\n", "\n", - "\n", " return nw.to_native(result)" ] }, @@ -123,11 +123,11 @@ "outputs": [], "source": [ "dir_ = \"/kaggle/input/tpc-h-data-parquet-s-2/\"\n", - "nation = dir_ + 'nation.parquet'\n", - "lineitem = dir_ + 'lineitem.parquet'\n", - "supplier = dir_ + 'supplier.parquet'\n", - "part = dir_ + 'part.parquet'\n", - "partsupp = dir_ + 'partsupp.parquet'" + "nation = dir_ + \"nation.parquet\"\n", + "lineitem = dir_ + \"lineitem.parquet\"\n", + "supplier = dir_ + \"supplier.parquet\"\n", + "part = dir_ + \"part.parquet\"\n", + "partsupp = dir_ + \"partsupp.parquet\"" ] }, { @@ -146,10 +146,12 @@ "outputs": [], "source": [ "IO_FUNCS = {\n", - " 'pandas': lambda x: pd.read_parquet(x, engine='pyarrow'),\n", - " 'pandas[pyarrow]': lambda x: pd.read_parquet(x, engine='pyarrow', dtype_backend='pyarrow'),\n", - " 'polars[eager]': lambda x: pl.read_parquet(x),\n", - " 'polars[lazy]': lambda x: pl.scan_parquet(x),\n", + " \"pandas\": lambda x: pd.read_parquet(x, engine=\"pyarrow\"),\n", + " \"pandas[pyarrow]\": lambda x: pd.read_parquet(\n", + " x, engine=\"pyarrow\", dtype_backend=\"pyarrow\"\n", + " ),\n", + " \"polars[eager]\": lambda x: pl.read_parquet(x),\n", + " \"polars[lazy]\": lambda x: pl.scan_parquet(x),\n", "}" ] }, @@ -193,7 +195,7 @@ }, "outputs": [], "source": [ - "tool = 'pandas'\n", + "tool = \"pandas\"\n", "fn = IO_FUNCS[tool]\n", "timings = %timeit -o -q q20(fn(part), fn(partsupp), fn(nation), fn(lineitem), fn(supplier))\n", "results[tool] = timings.all_runs" @@ -230,7 +232,7 @@ }, "outputs": [], "source": [ - "tool = 'pandas[pyarrow]'\n", + "tool = \"pandas[pyarrow]\"\n", "fn = IO_FUNCS[tool]\n", "timings = %timeit -o -q q20(fn(part), fn(partsupp), fn(nation), fn(lineitem), fn(supplier))\n", "results[tool] = timings.all_runs" @@ -267,7 +269,7 @@ }, "outputs": [], "source": [ - "tool = 'polars[eager]'\n", + "tool = \"polars[eager]\"\n", "fn = IO_FUNCS[tool]\n", "timings = %timeit -o -q q20(fn(part), fn(partsupp), fn(nation), fn(lineitem), fn(supplier))\n", "results[tool] = timings.all_runs" @@ -304,7 +306,7 @@ }, "outputs": [], "source": [ - "tool = 'polars[lazy]'\n", + "tool = \"polars[lazy]\"\n", "fn = IO_FUNCS[tool]\n", "timings = %timeit -o -q q20(fn(part), fn(partsupp), fn(nation), fn(lineitem), fn(supplier)).collect()\n", "results[tool] = timings.all_runs" @@ -324,8 +326,9 @@ "outputs": [], "source": [ "import json\n", - "with open('results.json', 'w') as fd:\n", - " json.dump(results, fd)\n" + "\n", + "with open(\"results.json\", \"w\") as fd:\n", + " json.dump(results, fd)" ] } ], diff --git a/tpch/notebooks/q21/execute.ipynb b/tpch/notebooks/q21/execute.ipynb index b51b15dce..af12a424c 100755 --- a/tpch/notebooks/q21/execute.ipynb +++ b/tpch/notebooks/q21/execute.ipynb @@ -36,13 +36,12 @@ "outputs": [], "source": [ "from typing import Any\n", - "from datetime import date\n", - "\n", - "import narwhals as nw\n", "\n", "import pandas as pd\n", "import polars as pl\n", "\n", + "import narwhals as nw\n", + "\n", "pd.options.mode.copy_on_write = True\n", "pd.options.future.infer_string = True" ] @@ -66,10 +65,12 @@ "Q_NUM = 21\n", "\n", "IO_FUNCS = {\n", - " 'pandas': lambda x: pd.read_parquet(x, engine='pyarrow'),\n", - " 'pandas[pyarrow]': lambda x: pd.read_parquet(x, engine='pyarrow', dtype_backend='pyarrow'),\n", - " 'polars[eager]': lambda x: pl.read_parquet(x),\n", - " 'polars[lazy]': lambda x: pl.scan_parquet(x),\n", + " \"pandas\": lambda x: pd.read_parquet(x, 
engine=\"pyarrow\"),\n", + " \"pandas[pyarrow]\": lambda x: pd.read_parquet(\n", + " x, engine=\"pyarrow\", dtype_backend=\"pyarrow\"\n", + " ),\n", + " \"polars[eager]\": lambda x: pl.read_parquet(x),\n", + " \"polars[lazy]\": lambda x: pl.scan_parquet(x),\n", "}" ] }, @@ -95,34 +96,28 @@ " orders_raw: Any,\n", " supplier_raw: Any,\n", ") -> Any:\n", - " \n", " lineitem = nw.from_native(lineitem_raw)\n", " nation = nw.from_native(nation_raw)\n", " orders = nw.from_native(orders_raw)\n", " supplier = nw.from_native(supplier_raw)\n", - " \n", + "\n", " var1 = \"SAUDI ARABIA\"\n", - " \n", - " \n", + "\n", " q1 = (\n", " lineitem.group_by(\"l_orderkey\")\n", - "# .agg(nw.col(\"l_suppkey\").len().alias(\"n_supp_by_order\"))\n", " .agg(nw.len().alias(\"n_supp_by_order\"))\n", " .filter(nw.col(\"n_supp_by_order\") > 1)\n", " .join(\n", " lineitem.filter(nw.col(\"l_receiptdate\") > nw.col(\"l_commitdate\")),\n", - "# on=\"l_orderkey\",\n", - " left_on=\"l_orderkey\", right_on=\"l_orderkey\",\n", + " left_on=\"l_orderkey\",\n", + " right_on=\"l_orderkey\",\n", " )\n", " )\n", "\n", " q_final = (\n", " q1.group_by(\"l_orderkey\")\n", - "# .agg(nw.col(\"l_suppkey\").len().alias(\"n_supp_by_order\"))\n", " .agg(nw.len().alias(\"n_supp_by_order\"))\n", - " .join(q1, left_on=\"l_orderkey\", right_on=\"l_orderkey\"\n", - " #on=\"l_orderkey\"\n", - " )\n", + " .join(q1, left_on=\"l_orderkey\", right_on=\"l_orderkey\")\n", " .join(supplier, left_on=\"l_suppkey\", right_on=\"s_suppkey\")\n", " .join(nation, left_on=\"s_nationkey\", right_on=\"n_nationkey\")\n", " .join(orders, left_on=\"l_orderkey\", right_on=\"o_orderkey\")\n", @@ -155,10 +150,10 @@ "source": [ "dir_ = \"/kaggle/input/tpc-h-data-parquet-s-2/\"\n", "\n", - "lineitem = dir_ + 'lineitem.parquet'\n", - "nation = dir_ + 'nation.parquet'\n", - "orders = dir_ + 'orders.parquet'\n", - "supplier = dir_ + 'supplier.parquet'" + "lineitem = dir_ + \"lineitem.parquet\"\n", + "nation = dir_ + \"nation.parquet\"\n", + "orders = dir_ + \"orders.parquet\"\n", + "supplier = dir_ + \"supplier.parquet\"" ] }, { @@ -213,10 +208,15 @@ }, "outputs": [], "source": [ - "tool = 'pandas'\n", + "tool = \"pandas\"\n", "fn = IO_FUNCS[tool]\n", "\n", - "lineitem_raw, nation_raw, orders_raw, supplier_raw = fn(lineitem), fn(nation), fn(orders), fn(supplier)\n", + "lineitem_raw, nation_raw, orders_raw, supplier_raw = (\n", + " fn(lineitem),\n", + " fn(nation),\n", + " fn(orders),\n", + " fn(supplier),\n", + ")\n", "\n", "timings = %timeit -o -q q21(lineitem_raw, nation_raw, orders_raw, supplier_raw)\n", "results[tool] = timings.all_runs" @@ -255,9 +255,14 @@ }, "outputs": [], "source": [ - "tool = 'pandas[pyarrow]'\n", + "tool = \"pandas[pyarrow]\"\n", "fn = IO_FUNCS[tool]\n", - "lineitem_raw, nation_raw, orders_raw, supplier_raw = fn(lineitem), fn(nation), fn(orders), fn(supplier)\n", + "lineitem_raw, nation_raw, orders_raw, supplier_raw = (\n", + " fn(lineitem),\n", + " fn(nation),\n", + " fn(orders),\n", + " fn(supplier),\n", + ")\n", "\n", "timings = %timeit -o -q q21(lineitem_raw, nation_raw, orders_raw, supplier_raw)\n", "results[tool] = timings.all_runs" @@ -296,10 +301,15 @@ }, "outputs": [], "source": [ - "tool = 'polars[eager]'\n", + "tool = \"polars[eager]\"\n", "fn = IO_FUNCS[tool]\n", "\n", - "lineitem_raw, nation_raw, orders_raw, supplier_raw = fn(lineitem), fn(nation), fn(orders), fn(supplier)\n", + "lineitem_raw, nation_raw, orders_raw, supplier_raw = (\n", + " fn(lineitem),\n", + " fn(nation),\n", + " fn(orders),\n", + " fn(supplier),\n", + ")\n", 
"timings = %timeit -o -q q21(lineitem_raw, nation_raw, orders_raw, supplier_raw)\n", "results[tool] = timings.all_runs" ] @@ -337,10 +347,15 @@ }, "outputs": [], "source": [ - "tool = 'polars[lazy]'\n", + "tool = \"polars[lazy]\"\n", "fn = IO_FUNCS[tool]\n", "\n", - "lineitem_raw, nation_raw, orders_raw, supplier_raw = fn(lineitem), fn(nation), fn(orders), fn(supplier)\n", + "lineitem_raw, nation_raw, orders_raw, supplier_raw = (\n", + " fn(lineitem),\n", + " fn(nation),\n", + " fn(orders),\n", + " fn(supplier),\n", + ")\n", "timings = %timeit -o -q q21(lineitem_raw, nation_raw, orders_raw, supplier_raw).collect()\n", "results[tool] = timings.all_runs" ] @@ -379,29 +394,9 @@ "outputs": [], "source": [ "import json\n", - "with open('results.json', 'w') as fd:\n", - " json.dump(results, fd)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "16", - "metadata": { - "papermill": { - "duration": 0.02616, - "end_time": "2024-06-20T09:46:18.666732", - "exception": false, - "start_time": "2024-06-20T09:46:18.640572", - "status": "completed" - }, - "tags": [] - }, - "outputs": [], - "source": [ - "from pprint import pprint\n", "\n", - "pprint(results)" + "with open(\"results.json\", \"w\") as fd:\n", + " json.dump(results, fd)" ] } ], diff --git a/tpch/notebooks/q3/execute.ipynb b/tpch/notebooks/q3/execute.ipynb index 80178cae1..b81135fc3 100755 --- a/tpch/notebooks/q3/execute.ipynb +++ b/tpch/notebooks/q3/execute.ipynb @@ -49,14 +49,15 @@ "metadata": {}, "outputs": [], "source": [ - "from typing import Any\n", "from datetime import date\n", + "from typing import Any\n", + "\n", "\n", "def q3_pandas_native(\n", " customer_ds: Any,\n", " line_item_ds: Any,\n", " orders_ds: Any,\n", - "):\n", + ") -> Any:\n", " var1 = \"BUILDING\"\n", " var2 = date(1995, 3, 15)\n", "\n", @@ -69,18 +70,15 @@ " jn2 = jn2[jn2[\"l_shipdate\"] > var2]\n", " jn2[\"revenue\"] = jn2.l_extendedprice * (1 - jn2.l_discount)\n", "\n", - " gb = jn2.groupby(\n", - " [\"o_orderkey\", \"o_orderdate\", \"o_shippriority\"], as_index=False\n", - " )\n", + " gb = jn2.groupby([\"o_orderkey\", \"o_orderdate\", \"o_shippriority\"], as_index=False)\n", " agg = gb[\"revenue\"].sum()\n", "\n", " sel = agg.loc[:, [\"o_orderkey\", \"revenue\", \"o_orderdate\", \"o_shippriority\"]]\n", " sel = sel.rename({\"o_orderkey\": \"l_orderkey\"}, axis=\"columns\")\n", "\n", " sorted = sel.sort_values(by=[\"revenue\", \"o_orderdate\"], ascending=[False, True])\n", - " result_df = sorted.head(10)\n", "\n", - " return result_df # type: ignore[no-any-return]" + " return sorted.head(10) # type: ignore[no-any-return]" ] }, { @@ -99,10 +97,12 @@ }, "outputs": [], "source": [ - "from typing import Any\n", "from datetime import datetime\n", + "from typing import Any\n", + "\n", "import narwhals as nw\n", "\n", + "\n", "def q3(\n", " customer_ds_raw: Any,\n", " line_item_ds_raw: Any,\n", @@ -122,7 +122,8 @@ " .filter(\n", " nw.col(\"o_orderdate\") < var_2,\n", " nw.col(\"l_shipdate\") > var_1,\n", - " ).with_columns(\n", + " )\n", + " .with_columns(\n", " (nw.col(\"l_extendedprice\") * (1 - nw.col(\"l_discount\"))).alias(\"revenue\")\n", " )\n", " .group_by([\"o_orderkey\", \"o_orderdate\", \"o_shippriority\"])\n", @@ -150,16 +151,16 @@ "outputs": [], "source": [ "from typing import Any\n", - "from datetime import datetime\n", - "import narwhals as nw\n", + "\n", "import ibis\n", "\n", + "\n", "def q3_ibis(\n", " customer: Any,\n", " lineitem: Any,\n", " orders: Any,\n", " *,\n", - " tool,\n", + " tool: str,\n", ") -> Any:\n", " var1 = 
\"BUILDING\"\n", " var2 = date(1995, 3, 15)\n", @@ -186,9 +187,9 @@ " .order_by(ibis.desc(\"revenue\"), \"o_orderdate\")\n", " .limit(10)\n", " )\n", - " if tool == 'pandas':\n", + " if tool == \"pandas\":\n", " return q_final.to_pandas()\n", - " if tool == 'polars':\n", + " if tool == \"polars\":\n", " return q_final.to_polars()\n", " raise ValueError(\"expected pandas or polars\")" ] @@ -210,14 +211,14 @@ "outputs": [], "source": [ "dir_ = \"/kaggle/input/tpc-h-data-parquet-s-2/\"\n", - "region = dir_ + 'region.parquet'\n", - "nation = dir_ + 'nation.parquet'\n", - "customer = dir_ + 'customer.parquet'\n", - "lineitem = dir_ + 'lineitem.parquet'\n", - "orders = dir_ + 'orders.parquet'\n", - "supplier = dir_ + 'supplier.parquet'\n", - "part = dir_ + 'part.parquet'\n", - "partsupp = dir_ + 'partsupp.parquet'" + "region = dir_ + \"region.parquet\"\n", + "nation = dir_ + \"nation.parquet\"\n", + "customer = dir_ + \"customer.parquet\"\n", + "lineitem = dir_ + \"lineitem.parquet\"\n", + "orders = dir_ + \"orders.parquet\"\n", + "supplier = dir_ + \"supplier.parquet\"\n", + "part = dir_ + \"part.parquet\"\n", + "partsupp = dir_ + \"partsupp.parquet\"" ] }, { @@ -236,18 +237,20 @@ }, "outputs": [], "source": [ - "import ibis\n", - "\n", "con_pd = ibis.pandas.connect()\n", "con_pl = ibis.polars.connect()\n", "\n", "IO_FUNCS = {\n", - " 'pandas': lambda x: pd.read_parquet(x, engine='pyarrow'),\n", - " 'pandas[pyarrow]': lambda x: pd.read_parquet(x, engine='pyarrow', dtype_backend='pyarrow'),\n", - " 'pandas[pyarrow][ibis]': lambda x: con_pd.read_parquet(x, engine='pyarrow', dtype_backend='pyarrow'),\n", - " 'polars[eager]': lambda x: pl.read_parquet(x),\n", - " 'polars[lazy]': lambda x: pl.scan_parquet(x),\n", - " 'polars[lazy][ibis]': lambda x: con_pl.read_parquet(x),\n", + " \"pandas\": lambda x: pd.read_parquet(x, engine=\"pyarrow\"),\n", + " \"pandas[pyarrow]\": lambda x: pd.read_parquet(\n", + " x, engine=\"pyarrow\", dtype_backend=\"pyarrow\"\n", + " ),\n", + " \"pandas[pyarrow][ibis]\": lambda x: con_pd.read_parquet(\n", + " x, engine=\"pyarrow\", dtype_backend=\"pyarrow\"\n", + " ),\n", + " \"polars[eager]\": lambda x: pl.read_parquet(x),\n", + " \"polars[lazy]\": lambda x: pl.scan_parquet(x),\n", + " \"polars[lazy][ibis]\": lambda x: con_pl.read_parquet(x),\n", "}" ] }, @@ -276,7 +279,7 @@ "metadata": {}, "outputs": [], "source": [ - "tool = 'pandas[pyarrow][ibis]'\n", + "tool = \"pandas[pyarrow][ibis]\"\n", "fn = IO_FUNCS[tool]\n", "timings = %timeit -o -q q3_ibis(fn(customer), fn(lineitem), fn(orders), tool='pandas')\n", "results[tool] = timings.all_runs" @@ -297,7 +300,7 @@ "metadata": {}, "outputs": [], "source": [ - "tool = 'polars[lazy][ibis]'\n", + "tool = \"polars[lazy][ibis]\"\n", "fn = IO_FUNCS[tool]\n", "timings = %timeit -o -q q3_ibis(fn(customer), fn(lineitem), fn(orders), tool='polars')\n", "results[tool] = timings.all_runs" @@ -318,10 +321,10 @@ "metadata": {}, "outputs": [], "source": [ - "tool = 'pandas[pyarrow]'\n", + "tool = \"pandas[pyarrow]\"\n", "fn = IO_FUNCS[tool]\n", "timings = %timeit -o -q q3_pandas_native(fn(customer), fn(lineitem), fn(orders))\n", - "results[tool+'[native]'] = timings.all_runs" + "results[tool + \"[native]\"] = timings.all_runs" ] }, { @@ -357,7 +360,7 @@ }, "outputs": [], "source": [ - "tool = 'pandas'\n", + "tool = \"pandas\"\n", "fn = IO_FUNCS[tool]\n", "timings = %timeit -o -q q3(fn(customer), fn(lineitem), fn(orders))\n", "results[tool] = timings.all_runs" @@ -396,7 +399,7 @@ }, "outputs": [], "source": [ - "tool = 'pandas[pyarrow]'\n", 
+ "tool = \"pandas[pyarrow]\"\n", "fn = IO_FUNCS[tool]\n", "timings = %timeit -o -q q3(fn(customer), fn(lineitem), fn(orders))\n", "results[tool] = timings.all_runs" @@ -435,7 +438,7 @@ }, "outputs": [], "source": [ - "tool = 'polars[eager]'\n", + "tool = \"polars[eager]\"\n", "fn = IO_FUNCS[tool]\n", "timings = %timeit -o -q q3(fn(customer), fn(lineitem), fn(orders))\n", "results[tool] = timings.all_runs" @@ -474,7 +477,7 @@ }, "outputs": [], "source": [ - "tool = 'polars[lazy]'\n", + "tool = \"polars[lazy]\"\n", "fn = IO_FUNCS[tool]\n", "timings = %timeit -o -q q3(fn(customer), fn(lineitem), fn(orders)).collect()\n", "results[tool] = timings.all_runs" @@ -496,8 +499,9 @@ "outputs": [], "source": [ "import json\n", - "with open('results.json', 'w') as fd:\n", - " json.dump(results, fd)\n" + "\n", + "with open(\"results.json\", \"w\") as fd:\n", + " json.dump(results, fd)" ] } ], diff --git a/tpch/notebooks/q4/execute.ipynb b/tpch/notebooks/q4/execute.ipynb index df07c9c5f..b0a55e345 100755 --- a/tpch/notebooks/q4/execute.ipynb +++ b/tpch/notebooks/q4/execute.ipynb @@ -52,6 +52,7 @@ "from datetime import date\n", "from typing import Any\n", "\n", + "\n", "def q4_pandas_native(\n", " line_item_ds: Any,\n", " orders_ds: Any,\n", @@ -72,9 +73,7 @@ " gb = jn.groupby(\"o_orderpriority\", as_index=False)\n", " agg = gb.agg(order_count=pd.NamedAgg(column=\"o_orderkey\", aggfunc=\"count\"))\n", "\n", - " result_df = agg.sort_values([\"o_orderpriority\"])\n", - "\n", - " return result_df # type: ignore[no-any-return]" + " return agg.sort_values([\"o_orderpriority\"]) # type: ignore[no-any-return]" ] }, { @@ -93,10 +92,12 @@ }, "outputs": [], "source": [ - "from typing import Any\n", "from datetime import datetime\n", + "from typing import Any\n", + "\n", "import narwhals as nw\n", "\n", + "\n", "def q4(\n", " lineitem_ds_raw: Any,\n", " orders_ds_raw: Any,\n", @@ -112,7 +113,8 @@ " .filter(\n", " nw.col(\"o_orderdate\").is_between(var_1, var_2, closed=\"left\"),\n", " nw.col(\"l_commitdate\") < nw.col(\"l_receiptdate\"),\n", - " ).unique(subset=[\"o_orderpriority\", \"l_orderkey\"])\n", + " )\n", + " .unique(subset=[\"o_orderpriority\", \"l_orderkey\"])\n", " .group_by(\"o_orderpriority\")\n", " .agg(nw.len().alias(\"order_count\"))\n", " .sort(by=\"o_orderpriority\")\n", @@ -130,15 +132,11 @@ "outputs": [], "source": [ "from typing import Any\n", - "from datetime import datetime\n", + "\n", "import ibis\n", "\n", - "def q4_ibis(\n", - " lineitem: Any,\n", - " orders: Any,\n", - " *,\n", - " tool: str\n", - ") -> Any:\n", + "\n", + "def q4_ibis(lineitem: Any, orders: Any, *, tool: str) -> Any:\n", " var1 = datetime(1993, 7, 1)\n", " var2 = datetime(1993, 10, 1)\n", "\n", @@ -151,9 +149,9 @@ " .agg(order_count=ibis._.count())\n", " .order_by(\"o_orderpriority\")\n", " )\n", - " if tool == 'pandas':\n", + " if tool == \"pandas\":\n", " return q_final.to_pandas()\n", - " if tool == 'polars':\n", + " if tool == \"polars\":\n", " return q_final.to_polars()\n", " raise ValueError(\"expected pandas or polars\")" ] @@ -175,14 +173,14 @@ "outputs": [], "source": [ "dir_ = \"/kaggle/input/tpc-h-data-parquet-s-2/\"\n", - "region = dir_ + 'region.parquet'\n", - "nation = dir_ + 'nation.parquet'\n", - "customer = dir_ + 'customer.parquet'\n", - "lineitem = dir_ + 'lineitem.parquet'\n", - "orders = dir_ + 'orders.parquet'\n", - "supplier = dir_ + 'supplier.parquet'\n", - "part = dir_ + 'part.parquet'\n", - "partsupp = dir_ + 'partsupp.parquet'" + "region = dir_ + \"region.parquet\"\n", + "nation = dir_ + 
\"nation.parquet\"\n", + "customer = dir_ + \"customer.parquet\"\n", + "lineitem = dir_ + \"lineitem.parquet\"\n", + "orders = dir_ + \"orders.parquet\"\n", + "supplier = dir_ + \"supplier.parquet\"\n", + "part = dir_ + \"part.parquet\"\n", + "partsupp = dir_ + \"partsupp.parquet\"" ] }, { @@ -201,18 +199,20 @@ }, "outputs": [], "source": [ - "import ibis\n", - "\n", "con_pd = ibis.pandas.connect()\n", "con_pl = ibis.polars.connect()\n", "\n", "IO_FUNCS = {\n", - " 'pandas': lambda x: pd.read_parquet(x, engine='pyarrow'),\n", - " 'pandas[pyarrow]': lambda x: pd.read_parquet(x, engine='pyarrow', dtype_backend='pyarrow'),\n", - " 'pandas[pyarrow][ibis]': lambda x: con_pd.read_parquet(x, engine='pyarrow', dtype_backend='pyarrow'),\n", - " 'polars[eager]': lambda x: pl.read_parquet(x),\n", - " 'polars[lazy]': lambda x: pl.scan_parquet(x),\n", - " 'polars[lazy][ibis]': lambda x: con_pl.read_parquet(x),\n", + " \"pandas\": lambda x: pd.read_parquet(x, engine=\"pyarrow\"),\n", + " \"pandas[pyarrow]\": lambda x: pd.read_parquet(\n", + " x, engine=\"pyarrow\", dtype_backend=\"pyarrow\"\n", + " ),\n", + " \"pandas[pyarrow][ibis]\": lambda x: con_pd.read_parquet(\n", + " x, engine=\"pyarrow\", dtype_backend=\"pyarrow\"\n", + " ),\n", + " \"polars[eager]\": lambda x: pl.read_parquet(x),\n", + " \"polars[lazy]\": lambda x: pl.scan_parquet(x),\n", + " \"polars[lazy][ibis]\": lambda x: con_pl.read_parquet(x),\n", "}" ] }, @@ -241,7 +241,7 @@ "metadata": {}, "outputs": [], "source": [ - "tool = 'polars[lazy][ibis]'\n", + "tool = \"polars[lazy][ibis]\"\n", "fn = IO_FUNCS[tool]\n", "timings = %timeit -o -q q4_ibis(fn(lineitem), fn(orders), tool='polars')\n", "results[tool] = timings.all_runs" @@ -262,10 +262,10 @@ "metadata": {}, "outputs": [], "source": [ - "tool = 'pandas[pyarrow]'\n", + "tool = \"pandas[pyarrow]\"\n", "fn = IO_FUNCS[tool]\n", "timings = %timeit -o -q q4_pandas_native(fn(lineitem), fn(orders))\n", - "results[tool+'[native]'] = timings.all_runs" + "results[tool + \"[native]\"] = timings.all_runs" ] }, { @@ -301,7 +301,7 @@ }, "outputs": [], "source": [ - "tool = 'pandas'\n", + "tool = \"pandas\"\n", "fn = IO_FUNCS[tool]\n", "timings = %timeit -o -q q4(fn(lineitem), fn(orders))\n", "results[tool] = timings.all_runs" @@ -340,7 +340,7 @@ }, "outputs": [], "source": [ - "tool = 'pandas[pyarrow]'\n", + "tool = \"pandas[pyarrow]\"\n", "fn = IO_FUNCS[tool]\n", "timings = %timeit -o -q q4(fn(lineitem), fn(orders))\n", "results[tool] = timings.all_runs" @@ -379,7 +379,7 @@ }, "outputs": [], "source": [ - "tool = 'polars[eager]'\n", + "tool = \"polars[eager]\"\n", "fn = IO_FUNCS[tool]\n", "timings = %timeit -o -q q4(fn(lineitem), fn(orders))\n", "results[tool] = timings.all_runs" @@ -418,7 +418,7 @@ }, "outputs": [], "source": [ - "tool = 'polars[lazy]'\n", + "tool = \"polars[lazy]\"\n", "fn = IO_FUNCS[tool]\n", "timings = %timeit -o -q q4(fn(lineitem), fn(orders)).collect()\n", "results[tool] = timings.all_runs" @@ -440,8 +440,9 @@ "outputs": [], "source": [ "import json\n", - "with open('results.json', 'w') as fd:\n", - " json.dump(results, fd)\n" + "\n", + "with open(\"results.json\", \"w\") as fd:\n", + " json.dump(results, fd)" ] } ], diff --git a/tpch/notebooks/q5/execute.ipynb b/tpch/notebooks/q5/execute.ipynb index 5f6df9bbc..da0cae78b 100755 --- a/tpch/notebooks/q5/execute.ipynb +++ b/tpch/notebooks/q5/execute.ipynb @@ -49,8 +49,9 @@ "metadata": {}, "outputs": [], "source": [ - "from typing import Any\n", "from datetime import date\n", + "from typing import Any\n", + "\n", "\n", "def 
q5_pandas_native(\n", " region_ds: Any,\n", @@ -79,9 +80,8 @@ " jn5[\"revenue\"] = jn5.l_extendedprice * (1.0 - jn5.l_discount)\n", "\n", " gb = jn5.groupby(\"n_name\", as_index=False)[\"revenue\"].sum()\n", - " result_df = gb.sort_values(\"revenue\", ascending=False)\n", "\n", - " return result_df # type: ignore[no-any-return]" + " return gb.sort_values(\"revenue\", ascending=False) # type: ignore[no-any-return]" ] }, { @@ -91,10 +91,12 @@ "metadata": {}, "outputs": [], "source": [ - "from typing import Any\n", "from datetime import datetime\n", + "from typing import Any\n", + "\n", "import narwhals as nw\n", "\n", + "\n", "def q5(\n", " region_ds_raw: Any,\n", " nation_ds_raw: Any,\n", @@ -126,7 +128,7 @@ " )\n", " .filter(\n", " nw.col(\"r_name\") == var_1,\n", - " nw.col(\"o_orderdate\").is_between(var_2, var_3, closed=\"left\")\n", + " nw.col(\"o_orderdate\").is_between(var_2, var_3, closed=\"left\"),\n", " )\n", " .with_columns(\n", " (nw.col(\"l_extendedprice\") * (1 - nw.col(\"l_discount\"))).alias(\"revenue\")\n", @@ -147,10 +149,10 @@ "outputs": [], "source": [ "from typing import Any\n", - "from datetime import datetime\n", - "import narwhals as nw\n", + "\n", "import ibis\n", "\n", + "\n", "def q5_ibis(\n", " region: Any,\n", " nation: Any,\n", @@ -183,9 +185,9 @@ " .order_by(ibis.desc(\"revenue\"))\n", " )\n", "\n", - " if tool == 'pandas':\n", + " if tool == \"pandas\":\n", " return q_final.to_pandas()\n", - " if tool == 'polars':\n", + " if tool == \"polars\":\n", " return q_final.to_polars()\n", " raise ValueError(\"expected pandas or polars\")" ] @@ -207,14 +209,14 @@ "outputs": [], "source": [ "dir_ = \"/kaggle/input/tpc-h-data-parquet-s-2/\"\n", - "region = dir_ + 'region.parquet'\n", - "nation = dir_ + 'nation.parquet'\n", - "customer = dir_ + 'customer.parquet'\n", - "lineitem = dir_ + 'lineitem.parquet'\n", - "orders = dir_ + 'orders.parquet'\n", - "supplier = dir_ + 'supplier.parquet'\n", - "part = dir_ + 'part.parquet'\n", - "partsupp = dir_ + 'partsupp.parquet'" + "region = dir_ + \"region.parquet\"\n", + "nation = dir_ + \"nation.parquet\"\n", + "customer = dir_ + \"customer.parquet\"\n", + "lineitem = dir_ + \"lineitem.parquet\"\n", + "orders = dir_ + \"orders.parquet\"\n", + "supplier = dir_ + \"supplier.parquet\"\n", + "part = dir_ + \"part.parquet\"\n", + "partsupp = dir_ + \"partsupp.parquet\"" ] }, { @@ -233,18 +235,20 @@ }, "outputs": [], "source": [ - "import ibis\n", - "\n", "con_pd = ibis.pandas.connect()\n", "con_pl = ibis.polars.connect()\n", "\n", "IO_FUNCS = {\n", - " 'pandas': lambda x: pd.read_parquet(x, engine='pyarrow'),\n", - " 'pandas[pyarrow]': lambda x: pd.read_parquet(x, engine='pyarrow', dtype_backend='pyarrow'),\n", - " 'pandas[pyarrow][ibis]': lambda x: con_pd.read_parquet(x, engine='pyarrow', dtype_backend='pyarrow'),\n", - " 'polars[eager]': lambda x: pl.read_parquet(x),\n", - " 'polars[lazy]': lambda x: pl.scan_parquet(x),\n", - " 'polars[lazy][ibis]': lambda x: con_pl.read_parquet(x),\n", + " \"pandas\": lambda x: pd.read_parquet(x, engine=\"pyarrow\"),\n", + " \"pandas[pyarrow]\": lambda x: pd.read_parquet(\n", + " x, engine=\"pyarrow\", dtype_backend=\"pyarrow\"\n", + " ),\n", + " \"pandas[pyarrow][ibis]\": lambda x: con_pd.read_parquet(\n", + " x, engine=\"pyarrow\", dtype_backend=\"pyarrow\"\n", + " ),\n", + " \"polars[eager]\": lambda x: pl.read_parquet(x),\n", + " \"polars[lazy]\": lambda x: pl.scan_parquet(x),\n", + " \"polars[lazy][ibis]\": lambda x: con_pl.read_parquet(x),\n", "}" ] }, @@ -273,7 +277,7 @@ "metadata": {}, 
"outputs": [], "source": [ - "tool = 'polars[lazy][ibis]'\n", + "tool = \"polars[lazy][ibis]\"\n", "fn = IO_FUNCS[tool]\n", "timings = %timeit -o -q q5_ibis(fn(region), fn(nation), fn(customer), fn(lineitem), fn(orders), fn(supplier), tool='polars')\n", "results[tool] = timings.all_runs" @@ -294,10 +298,10 @@ "metadata": {}, "outputs": [], "source": [ - "tool = 'pandas[pyarrow]'\n", + "tool = \"pandas[pyarrow]\"\n", "fn = IO_FUNCS[tool]\n", "timings = %timeit -o -q q5_pandas_native(fn(region), fn(nation), fn(customer), fn(lineitem), fn(orders), fn(supplier))\n", - "results[tool+'[native]'] = timings.all_runs" + "results[tool + \"[native]\"] = timings.all_runs" ] }, { @@ -333,7 +337,7 @@ }, "outputs": [], "source": [ - "tool = 'pandas'\n", + "tool = \"pandas\"\n", "fn = IO_FUNCS[tool]\n", "timings = %timeit -o -q q5(fn(region), fn(nation), fn(customer), fn(lineitem), fn(orders), fn(supplier))\n", "results[tool] = timings.all_runs" @@ -372,7 +376,7 @@ }, "outputs": [], "source": [ - "tool = 'pandas[pyarrow]'\n", + "tool = \"pandas[pyarrow]\"\n", "fn = IO_FUNCS[tool]\n", "timings = %timeit -o -q q5(fn(region), fn(nation), fn(customer), fn(lineitem), fn(orders), fn(supplier))\n", "results[tool] = timings.all_runs" @@ -411,7 +415,7 @@ }, "outputs": [], "source": [ - "tool = 'polars[eager]'\n", + "tool = \"polars[eager]\"\n", "fn = IO_FUNCS[tool]\n", "timings = %timeit -o -q q5(fn(region), fn(nation), fn(customer), fn(lineitem), fn(orders), fn(supplier))\n", "results[tool] = timings.all_runs" @@ -450,7 +454,7 @@ }, "outputs": [], "source": [ - "tool = 'polars[lazy]'\n", + "tool = \"polars[lazy]\"\n", "fn = IO_FUNCS[tool]\n", "timings = %timeit -o -q q5(fn(region), fn(nation), fn(customer), fn(lineitem), fn(orders), fn(supplier)).collect()\n", "results[tool] = timings.all_runs" @@ -472,8 +476,9 @@ "outputs": [], "source": [ "import json\n", - "with open('results.json', 'w') as fd:\n", - " json.dump(results, fd)\n" + "\n", + "with open(\"results.json\", \"w\") as fd:\n", + " json.dump(results, fd)" ] } ], diff --git a/tpch/notebooks/q6/execute.ipynb b/tpch/notebooks/q6/execute.ipynb index b101aa98d..5abcb65f0 100755 --- a/tpch/notebooks/q6/execute.ipynb +++ b/tpch/notebooks/q6/execute.ipynb @@ -50,6 +50,7 @@ "source": [ "from datetime import date\n", "\n", + "\n", "def q6_pandas_native(line_item_ds):\n", " var1 = date(1994, 1, 1)\n", " var2 = date(1995, 1, 1)\n", @@ -66,9 +67,8 @@ " ]\n", "\n", " result_value = (flineitem[\"l_extendedprice\"] * flineitem[\"l_discount\"]).sum()\n", - " result_df = pd.DataFrame({\"revenue\": [result_value]})\n", "\n", - " return result_df" + " return pd.DataFrame({\"revenue\": [result_value]})" ] }, { @@ -87,10 +87,11 @@ }, "outputs": [], "source": [ - "from typing import Any\n", "from datetime import datetime\n", + "\n", "import narwhals as nw\n", "\n", + "\n", "def q6(line_item_raw) -> None:\n", " var_1 = datetime(1994, 1, 1)\n", " var_2 = datetime(1995, 1, 1)\n", @@ -103,12 +104,11 @@ " nw.col(\"l_shipdate\").is_between(var_1, var_2, closed=\"left\"),\n", " nw.col(\"l_discount\").is_between(0.05, 0.07),\n", " nw.col(\"l_quantity\") < var_3,\n", - " ).with_columns(\n", - " (nw.col(\"l_extendedprice\") * nw.col(\"l_discount\")).alias(\"revenue\")\n", " )\n", + " .with_columns((nw.col(\"l_extendedprice\") * nw.col(\"l_discount\")).alias(\"revenue\"))\n", " .select(nw.sum(\"revenue\"))\n", " )\n", - " return nw.to_native(result)\n" + " return nw.to_native(result)" ] }, { @@ -118,10 +118,6 @@ "metadata": {}, "outputs": [], "source": [ - "from typing import Any\n", - 
"from datetime import datetime\n", - "import narwhals as nw\n", - "\n", "def q6_ibis(lineitem, *, tool: str) -> None:\n", " var1 = datetime(1994, 1, 1)\n", " var2 = datetime(1995, 1, 1)\n", @@ -138,12 +134,12 @@ " .mutate(revenue=ibis._[\"l_extendedprice\"] * (ibis._[\"l_discount\"]))\n", " .agg(revenue=ibis._[\"revenue\"].sum())\n", " )\n", - " \n", - " if tool == 'pandas':\n", + "\n", + " if tool == \"pandas\":\n", " return q_final.to_pandas()\n", - " if tool == 'polars':\n", + " if tool == \"polars\":\n", " return q_final.to_polars()\n", - " raise ValueError(\"expected pandas or polars\")\n" + " raise ValueError(\"expected pandas or polars\")" ] }, { @@ -163,14 +159,14 @@ "outputs": [], "source": [ "dir_ = \"/kaggle/input/tpc-h-data-parquet-s-2/\"\n", - "region = dir_ + 'region.parquet'\n", - "nation = dir_ + 'nation.parquet'\n", - "customer = dir_ + 'customer.parquet'\n", - "lineitem = dir_ + 'lineitem.parquet'\n", - "orders = dir_ + 'orders.parquet'\n", - "supplier = dir_ + 'supplier.parquet'\n", - "part = dir_ + 'part.parquet'\n", - "partsupp = dir_ + 'partsupp.parquet'" + "region = dir_ + \"region.parquet\"\n", + "nation = dir_ + \"nation.parquet\"\n", + "customer = dir_ + \"customer.parquet\"\n", + "lineitem = dir_ + \"lineitem.parquet\"\n", + "orders = dir_ + \"orders.parquet\"\n", + "supplier = dir_ + \"supplier.parquet\"\n", + "part = dir_ + \"part.parquet\"\n", + "partsupp = dir_ + \"partsupp.parquet\"" ] }, { @@ -195,12 +191,16 @@ "con_pl = ibis.polars.connect()\n", "\n", "IO_FUNCS = {\n", - " 'pandas': lambda x: pd.read_parquet(x, engine='pyarrow'),\n", - " 'pandas[pyarrow]': lambda x: pd.read_parquet(x, engine='pyarrow', dtype_backend='pyarrow'),\n", - " 'pandas[pyarrow][ibis]': lambda x: con_pd.read_parquet(x, engine='pyarrow', dtype_backend='pyarrow'),\n", - " 'polars[eager]': lambda x: pl.read_parquet(x),\n", - " 'polars[lazy]': lambda x: pl.scan_parquet(x),\n", - " 'polars[lazy][ibis]': lambda x: con_pl.read_parquet(x),\n", + " \"pandas\": lambda x: pd.read_parquet(x, engine=\"pyarrow\"),\n", + " \"pandas[pyarrow]\": lambda x: pd.read_parquet(\n", + " x, engine=\"pyarrow\", dtype_backend=\"pyarrow\"\n", + " ),\n", + " \"pandas[pyarrow][ibis]\": lambda x: con_pd.read_parquet(\n", + " x, engine=\"pyarrow\", dtype_backend=\"pyarrow\"\n", + " ),\n", + " \"polars[eager]\": lambda x: pl.read_parquet(x),\n", + " \"polars[lazy]\": lambda x: pl.scan_parquet(x),\n", + " \"polars[lazy][ibis]\": lambda x: con_pl.read_parquet(x),\n", "}" ] }, @@ -229,7 +229,7 @@ "metadata": {}, "outputs": [], "source": [ - "tool = 'pandas[pyarrow][ibis]'\n", + "tool = \"pandas[pyarrow][ibis]\"\n", "fn = IO_FUNCS[tool]\n", "timings = %timeit -o -q q6_ibis(fn(lineitem), tool='pandas')\n", "results[tool] = timings.all_runs" @@ -250,7 +250,7 @@ "metadata": {}, "outputs": [], "source": [ - "tool = 'polars[lazy][ibis]'\n", + "tool = \"polars[lazy][ibis]\"\n", "fn = IO_FUNCS[tool]\n", "timings = %timeit -o -q q6_ibis(fn(lineitem), tool='polars')\n", "results[tool] = timings.all_runs" @@ -271,10 +271,10 @@ "metadata": {}, "outputs": [], "source": [ - "tool = 'pandas[pyarrow]'\n", + "tool = \"pandas[pyarrow]\"\n", "fn = IO_FUNCS[tool]\n", "timings = %timeit -o -q q6_pandas_native(fn(lineitem))\n", - "results[tool+'[native]'] = timings.all_runs" + "results[tool + \"[native]\"] = timings.all_runs" ] }, { @@ -310,7 +310,7 @@ }, "outputs": [], "source": [ - "tool = 'pandas'\n", + "tool = \"pandas\"\n", "fn = IO_FUNCS[tool]\n", "timings = %timeit -o -q q6(fn(lineitem))\n", "results[tool] = timings.all_runs" @@ 
-349,7 +349,7 @@ }, "outputs": [], "source": [ - "tool = 'pandas[pyarrow]'\n", + "tool = \"pandas[pyarrow]\"\n", "fn = IO_FUNCS[tool]\n", "timings = %timeit -o -q q6(fn(lineitem))\n", "results[tool] = timings.all_runs" @@ -388,7 +388,7 @@ }, "outputs": [], "source": [ - "tool = 'polars[eager]'\n", + "tool = \"polars[eager]\"\n", "fn = IO_FUNCS[tool]\n", "timings = %timeit -o -q q6(fn(lineitem))\n", "results[tool] = timings.all_runs" @@ -427,7 +427,7 @@ }, "outputs": [], "source": [ - "tool = 'polars[lazy]'\n", + "tool = \"polars[lazy]\"\n", "fn = IO_FUNCS[tool]\n", "timings = %timeit -o -q q6(fn(lineitem)).collect()\n", "results[tool] = timings.all_runs" @@ -449,8 +449,9 @@ "outputs": [], "source": [ "import json\n", - "with open('results.json', 'w') as fd:\n", - " json.dump(results, fd)\n" + "\n", + "with open(\"results.json\", \"w\") as fd:\n", + " json.dump(results, fd)" ] } ], diff --git a/tpch/notebooks/q7/execute.ipynb b/tpch/notebooks/q7/execute.ipynb index 1213043b0..8711d7505 100755 --- a/tpch/notebooks/q7/execute.ipynb +++ b/tpch/notebooks/q7/execute.ipynb @@ -49,10 +49,13 @@ "metadata": {}, "outputs": [], "source": [ + "from datetime import date\n", + "from datetime import datetime\n", "from typing import Any\n", - "from datetime import datetime, date\n", + "\n", "import narwhals as nw\n", "\n", + "\n", "def q7_pandas_native(\n", " nation_ds,\n", " customer_ds,\n", @@ -96,9 +99,7 @@ " gb = total.groupby([\"supp_nation\", \"cust_nation\", \"l_year\"], as_index=False)\n", " agg = gb.agg(revenue=pd.NamedAgg(column=\"volume\", aggfunc=\"sum\"))\n", "\n", - " result_df = agg.sort_values(by=[\"supp_nation\", \"cust_nation\", \"l_year\"])\n", - "\n", - " return result_df # type: ignore[no-any-return]" + " return agg.sort_values(by=[\"supp_nation\", \"cust_nation\", \"l_year\"]) # type: ignore[no-any-return]" ] }, { @@ -117,10 +118,6 @@ }, "outputs": [], "source": [ - "from typing import Any\n", - "from datetime import datetime\n", - "import narwhals as nw\n", - "\n", "def q7(\n", " nation_ds,\n", " customer_ds,\n", @@ -171,7 +168,7 @@ " .agg(nw.sum(\"volume\").alias(\"revenue\"))\n", " .sort(by=[\"supp_nation\", \"cust_nation\", \"l_year\"])\n", " )\n", - " return nw.to_native(result)\n" + " return nw.to_native(result)" ] }, { @@ -181,18 +178,11 @@ "metadata": {}, "outputs": [], "source": [ - "from typing import Any\n", - "from datetime import datetime\n", "import ibis\n", "\n", + "\n", "def q7_ibis(\n", - " nation: Any,\n", - " customer: Any,\n", - " lineitem: Any,\n", - " orders: Any,\n", - " supplier: Any,\n", - " *,\n", - " tool: str\n", + " nation: Any, customer: Any, lineitem: Any, orders: Any, supplier: Any, *, tool: str\n", ") -> None:\n", " var1 = \"FRANCE\"\n", " var2 = \"GERMANY\"\n", @@ -234,9 +224,9 @@ " .order_by(\"supp_nation\", \"cust_nation\", \"l_year\")\n", " )\n", "\n", - " if tool == 'pandas':\n", + " if tool == \"pandas\":\n", " return q_final.to_pandas()\n", - " if tool == 'polars':\n", + " if tool == \"polars\":\n", " return q_final.to_polars()\n", " raise ValueError(\"expected pandas or polars\")" ] @@ -258,14 +248,14 @@ "outputs": [], "source": [ "dir_ = \"/kaggle/input/tpc-h-data-parquet-s-2/\"\n", - "region = dir_ + 'region.parquet'\n", - "nation = dir_ + 'nation.parquet'\n", - "customer = dir_ + 'customer.parquet'\n", - "lineitem = dir_ + 'lineitem.parquet'\n", - "orders = dir_ + 'orders.parquet'\n", - "supplier = dir_ + 'supplier.parquet'\n", - "part = dir_ + 'part.parquet'\n", - "partsupp = dir_ + 'partsupp.parquet'" + "region = dir_ + 
\"region.parquet\"\n", + "nation = dir_ + \"nation.parquet\"\n", + "customer = dir_ + \"customer.parquet\"\n", + "lineitem = dir_ + \"lineitem.parquet\"\n", + "orders = dir_ + \"orders.parquet\"\n", + "supplier = dir_ + \"supplier.parquet\"\n", + "part = dir_ + \"part.parquet\"\n", + "partsupp = dir_ + \"partsupp.parquet\"" ] }, { @@ -284,18 +274,20 @@ }, "outputs": [], "source": [ - "import ibis\n", - "\n", "con_pd = ibis.pandas.connect()\n", "con_pl = ibis.polars.connect()\n", "\n", "IO_FUNCS = {\n", - " 'pandas': lambda x: pd.read_parquet(x, engine='pyarrow'),\n", - " 'pandas[pyarrow]': lambda x: pd.read_parquet(x, engine='pyarrow', dtype_backend='pyarrow'),\n", - " 'pandas[pyarrow][ibis]': lambda x: con_pd.read_parquet(x, engine='pyarrow', dtype_backend='pyarrow'),\n", - " 'polars[eager]': lambda x: pl.read_parquet(x),\n", - " 'polars[lazy]': lambda x: pl.scan_parquet(x),\n", - " 'polars[lazy][ibis]': lambda x: con_pl.read_parquet(x),\n", + " \"pandas\": lambda x: pd.read_parquet(x, engine=\"pyarrow\"),\n", + " \"pandas[pyarrow]\": lambda x: pd.read_parquet(\n", + " x, engine=\"pyarrow\", dtype_backend=\"pyarrow\"\n", + " ),\n", + " \"pandas[pyarrow][ibis]\": lambda x: con_pd.read_parquet(\n", + " x, engine=\"pyarrow\", dtype_backend=\"pyarrow\"\n", + " ),\n", + " \"polars[eager]\": lambda x: pl.read_parquet(x),\n", + " \"polars[lazy]\": lambda x: pl.scan_parquet(x),\n", + " \"polars[lazy][ibis]\": lambda x: con_pl.read_parquet(x),\n", "}" ] }, @@ -324,7 +316,7 @@ "metadata": {}, "outputs": [], "source": [ - "tool = 'pandas[pyarrow][ibis]'\n", + "tool = \"pandas[pyarrow][ibis]\"\n", "fn = IO_FUNCS[tool]\n", "timings = %timeit -o -q q7_ibis(fn(nation), fn(customer), fn(lineitem), fn(orders), fn(supplier), tool='pandas')\n", "results[tool] = timings.all_runs" @@ -345,7 +337,7 @@ "metadata": {}, "outputs": [], "source": [ - "tool = 'polars[lazy][ibis]'\n", + "tool = \"polars[lazy][ibis]\"\n", "fn = IO_FUNCS[tool]\n", "timings = %timeit -o -q q7_ibis(fn(nation), fn(customer), fn(lineitem), fn(orders), fn(supplier), tool='polars')\n", "results[tool] = timings.all_runs" @@ -366,10 +358,10 @@ "metadata": {}, "outputs": [], "source": [ - "tool = 'pandas[pyarrow]'\n", + "tool = \"pandas[pyarrow]\"\n", "fn = IO_FUNCS[tool]\n", "timings = %timeit -o -q q7_pandas_native(fn(nation), fn(customer), fn(lineitem), fn(orders), fn(supplier))\n", - "results[tool+'[native]'] = timings.all_runs" + "results[tool + \"[native]\"] = timings.all_runs" ] }, { @@ -405,7 +397,7 @@ }, "outputs": [], "source": [ - "tool = 'pandas'\n", + "tool = \"pandas\"\n", "fn = IO_FUNCS[tool]\n", "timings = %timeit -o -q q7(fn(nation), fn(customer), fn(lineitem), fn(orders), fn(supplier))\n", "results[tool] = timings.all_runs" @@ -444,7 +436,7 @@ }, "outputs": [], "source": [ - "tool = 'pandas[pyarrow]'\n", + "tool = \"pandas[pyarrow]\"\n", "fn = IO_FUNCS[tool]\n", "timings = %timeit -o -q q7(fn(nation), fn(customer), fn(lineitem), fn(orders), fn(supplier))\n", "results[tool] = timings.all_runs" @@ -483,7 +475,7 @@ }, "outputs": [], "source": [ - "tool = 'polars[eager]'\n", + "tool = \"polars[eager]\"\n", "fn = IO_FUNCS[tool]\n", "timings = %timeit -o -q q7(fn(nation), fn(customer), fn(lineitem), fn(orders), fn(supplier))\n", "results[tool] = timings.all_runs" @@ -522,7 +514,7 @@ }, "outputs": [], "source": [ - "tool = 'polars[lazy]'\n", + "tool = \"polars[lazy]\"\n", "fn = IO_FUNCS[tool]\n", "timings = %timeit -o -q q7(fn(nation), fn(customer), fn(lineitem), fn(orders), fn(supplier)).collect()\n", "results[tool] = 
timings.all_runs" @@ -544,8 +536,9 @@ "outputs": [], "source": [ "import json\n", - "with open('results.json', 'w') as fd:\n", - " json.dump(results, fd)\n" + "\n", + "with open(\"results.json\", \"w\") as fd:\n", + " json.dump(results, fd)" ] } ], diff --git a/tpch/notebooks/q9/execute.ipynb b/tpch/notebooks/q9/execute.ipynb index 86417e180..802799a01 100644 --- a/tpch/notebooks/q9/execute.ipynb +++ b/tpch/notebooks/q9/execute.ipynb @@ -15,7 +15,7 @@ }, "outputs": [], "source": [ - "!pip uninstall apache-beam -y && pip install -U pandas polars pyarrow narwhals " + "!pip uninstall apache-beam -y && pip install -U pandas polars pyarrow narwhals" ] }, { @@ -56,8 +56,10 @@ "outputs": [], "source": [ "from typing import Any\n", + "\n", "import narwhals as nw\n", "\n", + "\n", "def q9(\n", " part_ds_raw: Any,\n", " partsupp_ds_raw: Any,\n", @@ -66,7 +68,6 @@ " orders_ds_raw: Any,\n", " supplier_ds_raw: Any,\n", ") -> Any:\n", - "\n", " part_ds = nw.from_native(part_ds_raw)\n", " nation_ds = nw.from_native(nation_ds_raw)\n", " partsupp_ds = nw.from_native(partsupp_ds_raw)\n", @@ -91,7 +92,7 @@ " (\n", " nw.col(\"l_extendedprice\") * (1 - nw.col(\"l_discount\"))\n", " - nw.col(\"ps_supplycost\") * nw.col(\"l_quantity\")\n", - " ).alias(\"amount\")\n", + " ).alias(\"amount\"),\n", " )\n", " .group_by(\"nation\", \"o_year\")\n", " .agg(nw.sum(\"amount\").alias(\"sum_profit\"))\n", @@ -117,12 +118,12 @@ "outputs": [], "source": [ "dir_ = \"/kaggle/input/tpc-h-data-parquet-s-2/\"\n", - "nation = dir_ + 'nation.parquet'\n", - "lineitem = dir_ + 'lineitem.parquet'\n", - "orders = dir_ + 'orders.parquet'\n", - "supplier = dir_ + 'supplier.parquet'\n", - "part = dir_ + 'part.parquet'\n", - "partsupp = dir_ + 'partsupp.parquet'" + "nation = dir_ + \"nation.parquet\"\n", + "lineitem = dir_ + \"lineitem.parquet\"\n", + "orders = dir_ + \"orders.parquet\"\n", + "supplier = dir_ + \"supplier.parquet\"\n", + "part = dir_ + \"part.parquet\"\n", + "partsupp = dir_ + \"partsupp.parquet\"" ] }, { @@ -141,10 +142,12 @@ "outputs": [], "source": [ "IO_FUNCS = {\n", - " 'pandas': lambda x: pd.read_parquet(x, engine='pyarrow'),\n", - " 'pandas[pyarrow]': lambda x: pd.read_parquet(x, engine='pyarrow', dtype_backend='pyarrow'),\n", - " 'polars[eager]': lambda x: pl.read_parquet(x),\n", - " 'polars[lazy]': lambda x: pl.scan_parquet(x),\n", + " \"pandas\": lambda x: pd.read_parquet(x, engine=\"pyarrow\"),\n", + " \"pandas[pyarrow]\": lambda x: pd.read_parquet(\n", + " x, engine=\"pyarrow\", dtype_backend=\"pyarrow\"\n", + " ),\n", + " \"polars[eager]\": lambda x: pl.read_parquet(x),\n", + " \"polars[lazy]\": lambda x: pl.scan_parquet(x),\n", "}" ] }, @@ -188,7 +191,7 @@ }, "outputs": [], "source": [ - "tool = 'pandas'\n", + "tool = \"pandas\"\n", "fn = IO_FUNCS[tool]\n", "timings = %timeit -o -q q9(fn(part), fn(partsupp), fn(nation), fn(lineitem), fn(orders), fn(supplier))\n", "results[tool] = timings.all_runs" @@ -225,7 +228,7 @@ }, "outputs": [], "source": [ - "tool = 'pandas[pyarrow]'\n", + "tool = \"pandas[pyarrow]\"\n", "fn = IO_FUNCS[tool]\n", "timings = %timeit -o -q q9(fn(part), fn(partsupp), fn(nation), fn(lineitem), fn(orders), fn(supplier))\n", "results[tool] = timings.all_runs" @@ -262,7 +265,7 @@ }, "outputs": [], "source": [ - "tool = 'polars[eager]'\n", + "tool = \"polars[eager]\"\n", "fn = IO_FUNCS[tool]\n", "timings = %timeit -o -q q9(fn(part), fn(partsupp), fn(nation), fn(lineitem), fn(orders), fn(supplier))\n", "results[tool] = timings.all_runs" @@ -299,7 +302,7 @@ }, "outputs": [], "source": [ - "tool = 
'polars[lazy]'\n", + "tool = \"polars[lazy]\"\n", "fn = IO_FUNCS[tool]\n", "timings = %timeit -o -q q9(fn(part), fn(partsupp), fn(nation), fn(lineitem), fn(orders), fn(supplier)).collect()\n", "results[tool] = timings.all_runs" @@ -319,8 +322,9 @@ "outputs": [], "source": [ "import json\n", - "with open('results.json', 'w') as fd:\n", - " json.dump(results, fd)\n" + "\n", + "with open(\"results.json\", \"w\") as fd:\n", + " json.dump(results, fd)" ] } ], From 956274c9fc6eb901d95cc2af788658c66e6c3c87 Mon Sep 17 00:00:00 2001 From: raisadz <34237447+raisadz@users.noreply.github.com> Date: Sat, 7 Sep 2024 13:39:59 +0100 Subject: [PATCH 13/30] add `by` argument to join_asof (#921) --- narwhals/_arrow/dataframe.py | 3 + narwhals/_dask/dataframe.py | 6 + narwhals/_pandas_like/dataframe.py | 6 + narwhals/dataframe.py | 222 ++++++++++++++++++++++++++++- tests/frame/join_test.py | 59 ++++++++ 5 files changed, 292 insertions(+), 4 deletions(-) diff --git a/narwhals/_arrow/dataframe.py b/narwhals/_arrow/dataframe.py index f01ada158..960d833a5 100644 --- a/narwhals/_arrow/dataframe.py +++ b/narwhals/_arrow/dataframe.py @@ -322,6 +322,9 @@ def join_asof( left_on: str | None = None, right_on: str | None = None, on: str | None = None, + by_left: str | list[str] | None = None, + by_right: str | list[str] | None = None, + by: str | list[str] | None = None, strategy: Literal["backward", "forward", "nearest"] = "backward", ) -> Self: msg = "join_asof is not yet supported on PyArrow tables" diff --git a/narwhals/_dask/dataframe.py b/narwhals/_dask/dataframe.py index 8f11ccaad..5ef8c5a9d 100644 --- a/narwhals/_dask/dataframe.py +++ b/narwhals/_dask/dataframe.py @@ -305,6 +305,9 @@ def join_asof( left_on: str | None = None, right_on: str | None = None, on: str | None = None, + by_left: str | list[str] | None = None, + by_right: str | list[str] | None = None, + by: str | list[str] | None = None, strategy: Literal["backward", "forward", "nearest"] = "backward", ) -> Self: plx = self.__native_namespace__() @@ -315,6 +318,9 @@ def join_asof( left_on=left_on, right_on=right_on, on=on, + left_by=by_left, + right_by=by_right, + by=by, direction=strategy, suffixes=("", "_right"), ), diff --git a/narwhals/_pandas_like/dataframe.py b/narwhals/_pandas_like/dataframe.py index 9750cd9d4..3040adda0 100644 --- a/narwhals/_pandas_like/dataframe.py +++ b/narwhals/_pandas_like/dataframe.py @@ -516,6 +516,9 @@ def join_asof( left_on: str | None = None, right_on: str | None = None, on: str | None = None, + by_left: str | list[str] | None = None, + by_right: str | list[str] | None = None, + by: str | list[str] | None = None, strategy: Literal["backward", "forward", "nearest"] = "backward", ) -> Self: plx = self.__native_namespace__() @@ -526,6 +529,9 @@ def join_asof( left_on=left_on, right_on=right_on, on=on, + left_by=by_left, + right_by=by_right, + by=by, direction=strategy, suffixes=("", "_right"), ), diff --git a/narwhals/dataframe.py b/narwhals/dataframe.py index 440856eb4..165b65981 100644 --- a/narwhals/dataframe.py +++ b/narwhals/dataframe.py @@ -221,6 +221,9 @@ def join_asof( left_on: str | None = None, right_on: str | None = None, on: str | None = None, + by_left: str | list[str] | None = None, + by_right: str | list[str] | None = None, + by: str | list[str] | None = None, strategy: Literal["backward", "forward", "nearest"] = "backward", ) -> Self: _supported_strategies = ("backward", "forward", "nearest") @@ -232,12 +235,30 @@ def join_asof( if left_on is not None and right_on is not None and on is not None: msg = 
"Either (`left_on` and `right_on`) or `on` keys should be specified." raise ValueError(msg) + if by_left is not None and by_right is not None and by is not None: + msg = "Can not specify `by_left`, `by_right`, and `by` keys at the same time." + raise ValueError(msg) + if by_left is not None and by_right is None and by is None: + msg = "`by_right` can not be None if `by_left` is specified." + raise ValueError(msg) + if by_left is None and by_right is not None and by is None: + msg = "`by_left` can not be None if `by_right` is specified." + raise ValueError(msg) + if ( + (by_left is None and by_right is not None) + or (by_left is not None and by_right is None) + ) and by is not None: + msg = "Either (`by_left` and `by_right_`) or `by` keys should be specified." + raise ValueError(msg) if left_on is not None and right_on is not None: return self._from_compliant_dataframe( self._compliant_frame.join_asof( self._extract_compliant(other), left_on=left_on, right_on=right_on, + by_left=by_left, + by_right=by_right, + by=by, strategy=strategy, ) ) @@ -246,6 +267,9 @@ def join_asof( self._compliant_frame.join_asof( self._extract_compliant(other), on=on, + by_left=by_left, + by_right=by_right, + by=by, strategy=strategy, ) ) @@ -1885,6 +1909,9 @@ def join_asof( left_on: str | None = None, right_on: str | None = None, on: str | None = None, + by_left: str | list[str] | None = None, + by_right: str | list[str] | None = None, + by: str | list[str] | None = None, strategy: Literal["backward", "forward", "nearest"] = "backward", ) -> Self: """ @@ -1903,6 +1930,12 @@ def join_asof( on: Join column of both DataFrames. If set, left_on and right_on should be None. + by_left: join on these columns before doing asof join + + by_right: join on these columns before doing asof join + + by: join on these columns before doing asof join + strategy: Join strategy. The default is "backward". * *backward*: selects the last row in the right DataFrame whose "on" key is less than or equal to the left's key. @@ -1946,7 +1979,9 @@ def join_asof( >>> @nw.narwhalify ... def join_asof_datetime(df, other_any, strategy): ... return df.join_asof(other_any, on="datetime", strategy=strategy) - >>> # We can now pass either pandas or Polars to the function: + + We can now pass either pandas or Polars to the function: + >>> join_asof_datetime(population_pd, gdp_pd, strategy="backward") datetime population gdp 0 2016-03-01 82.19 4164 @@ -1964,9 +1999,93 @@ def join_asof( │ 2018-08-01 00:00:00 ┆ 82.66 ┆ 4566 │ │ 2019-01-01 00:00:00 ┆ 83.12 ┆ 4696 │ └─────────────────────┴────────────┴──────┘ + + Here is a real-world times-series example that uses `by` argument. + + >>> from datetime import datetime + >>> import narwhals as nw + >>> import pandas as pd + >>> import polars as pl + >>> data_quotes = { + ... "datetime": [ + ... datetime(2016, 5, 25, 13, 30, 0, 23), + ... datetime(2016, 5, 25, 13, 30, 0, 23), + ... datetime(2016, 5, 25, 13, 30, 0, 30), + ... datetime(2016, 5, 25, 13, 30, 0, 41), + ... datetime(2016, 5, 25, 13, 30, 0, 48), + ... datetime(2016, 5, 25, 13, 30, 0, 49), + ... datetime(2016, 5, 25, 13, 30, 0, 72), + ... datetime(2016, 5, 25, 13, 30, 0, 75), + ... ], + ... "ticker": [ + ... "GOOG", + ... "MSFT", + ... "MSFT", + ... "MSFT", + ... "GOOG", + ... "AAPL", + ... "GOOG", + ... "MSFT", + ... ], + ... "bid": [720.50, 51.95, 51.97, 51.99, 720.50, 97.99, 720.50, 52.01], + ... "ask": [720.93, 51.96, 51.98, 52.00, 720.93, 98.01, 720.88, 52.03], + ... } + >>> data_trades = { + ... "datetime": [ + ... 
datetime(2016, 5, 25, 13, 30, 0, 23), + ... datetime(2016, 5, 25, 13, 30, 0, 38), + ... datetime(2016, 5, 25, 13, 30, 0, 48), + ... datetime(2016, 5, 25, 13, 30, 0, 48), + ... datetime(2016, 5, 25, 13, 30, 0, 48), + ... ], + ... "ticker": ["MSFT", "MSFT", "GOOG", "GOOG", "AAPL"], + ... "price": [51.95, 51.95, 720.77, 720.92, 98.0], + ... "quantity": [75, 155, 100, 100, 100], + ... } + >>> quotes_pd = pd.DataFrame(data_quotes) + >>> trades_pd = pd.DataFrame(data_trades) + >>> quotes_pl = pl.DataFrame(data_quotes).sort("datetime") + >>> trades_pl = pl.DataFrame(data_trades).sort("datetime") + + Let's define a dataframe-agnostic function in which we join over "datetime" and by "ticker" columns: + + >>> @nw.narwhalify + ... def join_asof_datetime_by_ticker(df, other_any): + ... return df.join_asof(other_any, on="datetime", by="ticker") + + We can now pass either pandas or Polars to the function: + + >>> join_asof_datetime_by_ticker(trades_pd, quotes_pd) + datetime ticker price quantity bid ask + 0 2016-05-25 13:30:00.000023 MSFT 51.95 75 51.95 51.96 + 1 2016-05-25 13:30:00.000038 MSFT 51.95 155 51.97 51.98 + 2 2016-05-25 13:30:00.000048 GOOG 720.77 100 720.50 720.93 + 3 2016-05-25 13:30:00.000048 GOOG 720.92 100 720.50 720.93 + 4 2016-05-25 13:30:00.000048 AAPL 98.00 100 NaN NaN + + >>> join_asof_datetime_by_ticker(trades_pl, quotes_pl) + shape: (5, 6) + ┌────────────────────────────┬────────┬────────┬──────────┬───────┬────────┐ + │ datetime ┆ ticker ┆ price ┆ quantity ┆ bid ┆ ask │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ str ┆ f64 ┆ i64 ┆ f64 ┆ f64 │ + ╞════════════════════════════╪════════╪════════╪══════════╪═══════╪════════╡ + │ 2016-05-25 13:30:00.000023 ┆ MSFT ┆ 51.95 ┆ 75 ┆ 51.95 ┆ 51.96 │ + │ 2016-05-25 13:30:00.000038 ┆ MSFT ┆ 51.95 ┆ 155 ┆ 51.97 ┆ 51.98 │ + │ 2016-05-25 13:30:00.000048 ┆ GOOG ┆ 720.77 ┆ 100 ┆ 720.5 ┆ 720.93 │ + │ 2016-05-25 13:30:00.000048 ┆ GOOG ┆ 720.92 ┆ 100 ┆ 720.5 ┆ 720.93 │ + │ 2016-05-25 13:30:00.000048 ┆ AAPL ┆ 98.0 ┆ 100 ┆ null ┆ null │ + └────────────────────────────┴────────┴────────┴──────────┴───────┴────────┘ """ return super().join_asof( - other, left_on=left_on, right_on=right_on, on=on, strategy=strategy + other, + left_on=left_on, + right_on=right_on, + on=on, + by_left=by_left, + by_right=by_right, + by=by, + strategy=strategy, ) # --- descriptive --- @@ -3515,6 +3634,9 @@ def join_asof( left_on: str | None = None, right_on: str | None = None, on: str | None = None, + by_left: str | list[str] | None = None, + by_right: str | list[str] | None = None, + by: str | list[str] | None = None, strategy: Literal["backward", "forward", "nearest"] = "backward", ) -> Self: """ @@ -3533,6 +3655,12 @@ def join_asof( on: Join column of both DataFrames. If set, left_on and right_on should be None. + by_left: join on these columns before doing asof join + + by_right: join on these columns before doing asof join + + by: join on these columns before doing asof join + strategy: Join strategy. The default is "backward". * *backward*: selects the last row in the right DataFrame whose "on" key is less than or equal to the left's key. @@ -3575,7 +3703,9 @@ def join_asof( >>> @nw.narwhalify ... def join_asof_datetime(df, other_any, strategy): ... 
return df.join_asof(other_any, on="datetime", strategy=strategy) - >>> # We can now pass either pandas or Polars to the function: + + We can now pass either pandas or Polars to the function: + >>> join_asof_datetime(population_pd, gdp_pd, strategy="backward") datetime population gdp 0 2016-03-01 82.19 4164 @@ -3593,9 +3723,93 @@ def join_asof( │ 2018-08-01 00:00:00 ┆ 82.66 ┆ 4566 │ │ 2019-01-01 00:00:00 ┆ 83.12 ┆ 4696 │ └─────────────────────┴────────────┴──────┘ + + Here is a real-world times-series example that uses `by` argument. + + >>> from datetime import datetime + >>> import narwhals as nw + >>> import pandas as pd + >>> import polars as pl + >>> data_quotes = { + ... "datetime": [ + ... datetime(2016, 5, 25, 13, 30, 0, 23), + ... datetime(2016, 5, 25, 13, 30, 0, 23), + ... datetime(2016, 5, 25, 13, 30, 0, 30), + ... datetime(2016, 5, 25, 13, 30, 0, 41), + ... datetime(2016, 5, 25, 13, 30, 0, 48), + ... datetime(2016, 5, 25, 13, 30, 0, 49), + ... datetime(2016, 5, 25, 13, 30, 0, 72), + ... datetime(2016, 5, 25, 13, 30, 0, 75), + ... ], + ... "ticker": [ + ... "GOOG", + ... "MSFT", + ... "MSFT", + ... "MSFT", + ... "GOOG", + ... "AAPL", + ... "GOOG", + ... "MSFT", + ... ], + ... "bid": [720.50, 51.95, 51.97, 51.99, 720.50, 97.99, 720.50, 52.01], + ... "ask": [720.93, 51.96, 51.98, 52.00, 720.93, 98.01, 720.88, 52.03], + ... } + >>> data_trades = { + ... "datetime": [ + ... datetime(2016, 5, 25, 13, 30, 0, 23), + ... datetime(2016, 5, 25, 13, 30, 0, 38), + ... datetime(2016, 5, 25, 13, 30, 0, 48), + ... datetime(2016, 5, 25, 13, 30, 0, 48), + ... datetime(2016, 5, 25, 13, 30, 0, 48), + ... ], + ... "ticker": ["MSFT", "MSFT", "GOOG", "GOOG", "AAPL"], + ... "price": [51.95, 51.95, 720.77, 720.92, 98.0], + ... "quantity": [75, 155, 100, 100, 100], + ... } + >>> quotes_pd = pd.DataFrame(data_quotes) + >>> trades_pd = pd.DataFrame(data_trades) + >>> quotes_pl = pl.LazyFrame(data_quotes).sort("datetime") + >>> trades_pl = pl.LazyFrame(data_trades).sort("datetime") + + Let's define a dataframe-agnostic function in which we join over "datetime" and by "ticker" columns: + + >>> @nw.narwhalify + ... def join_asof_datetime_by_ticker(df, other_any): + ... 
return df.join_asof(other_any, on="datetime", by="ticker") + + We can now pass either pandas or Polars to the function: + + >>> join_asof_datetime_by_ticker(trades_pd, quotes_pd) + datetime ticker price quantity bid ask + 0 2016-05-25 13:30:00.000023 MSFT 51.95 75 51.95 51.96 + 1 2016-05-25 13:30:00.000038 MSFT 51.95 155 51.97 51.98 + 2 2016-05-25 13:30:00.000048 GOOG 720.77 100 720.50 720.93 + 3 2016-05-25 13:30:00.000048 GOOG 720.92 100 720.50 720.93 + 4 2016-05-25 13:30:00.000048 AAPL 98.00 100 NaN NaN + + >>> join_asof_datetime_by_ticker(trades_pl, quotes_pl).collect() + shape: (5, 6) + ┌────────────────────────────┬────────┬────────┬──────────┬───────┬────────┐ + │ datetime ┆ ticker ┆ price ┆ quantity ┆ bid ┆ ask │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ str ┆ f64 ┆ i64 ┆ f64 ┆ f64 │ + ╞════════════════════════════╪════════╪════════╪══════════╪═══════╪════════╡ + │ 2016-05-25 13:30:00.000023 ┆ MSFT ┆ 51.95 ┆ 75 ┆ 51.95 ┆ 51.96 │ + │ 2016-05-25 13:30:00.000038 ┆ MSFT ┆ 51.95 ┆ 155 ┆ 51.97 ┆ 51.98 │ + │ 2016-05-25 13:30:00.000048 ┆ GOOG ┆ 720.77 ┆ 100 ┆ 720.5 ┆ 720.93 │ + │ 2016-05-25 13:30:00.000048 ┆ GOOG ┆ 720.92 ┆ 100 ┆ 720.5 ┆ 720.93 │ + │ 2016-05-25 13:30:00.000048 ┆ AAPL ┆ 98.0 ┆ 100 ┆ null ┆ null │ + └────────────────────────────┴────────┴────────┴──────────┴───────┴────────┘ """ return super().join_asof( - other, left_on=left_on, right_on=right_on, on=on, strategy=strategy + other, + left_on=left_on, + right_on=right_on, + on=on, + by_left=by_left, + by_right=by_right, + by=by, + strategy=strategy, ) def clone(self) -> Self: diff --git a/tests/frame/join_test.py b/tests/frame/join_test.py index 72f1304df..34a5961ef 100644 --- a/tests/frame/join_test.py +++ b/tests/frame/join_test.py @@ -324,6 +324,31 @@ def test_joinasof_time(constructor: Any, request: Any) -> None: compare_dicts(result_nearest_on, expected_nearest) +def test_joinasof_by(constructor: Any, request: Any) -> None: + if "pyarrow_table" in str(constructor): + request.applymarker(pytest.mark.xfail) + if parse_version(pd.__version__) < (2, 1) and ( + ("pandas_pyarrow" in str(constructor)) or ("pandas_nullable" in str(constructor)) + ): + request.applymarker(pytest.mark.xfail) + df = nw.from_native( + constructor({"a": [1, 5, 7, 10], "b": ["D", "D", "C", "A"], "c": [9, 2, 1, 1]}) + ).sort("a") + df_right = nw.from_native( + constructor({"a": [1, 4, 5, 8], "b": ["D", "D", "A", "F"], "d": [1, 3, 4, 1]}) + ).sort("a") + result = df.join_asof(df_right, on="a", by_left="b", by_right="b") # type: ignore[arg-type] + result_by = df.join_asof(df_right, on="a", by="b") # type: ignore[arg-type] + expected = { + "a": [1, 5, 7, 10], + "b": ["D", "D", "C", "A"], + "c": [9, 2, 1, 1], + "d": [1, 3, float("nan"), 4], + } + compare_dicts(result, expected) + compare_dicts(result_by, expected) + + @pytest.mark.parametrize("strategy", ["back", "furthest"]) def test_joinasof_not_implemented(constructor: Any, strategy: str) -> None: data = {"a": [1, 3, 2], "b": [4, 4, 6], "z": [7.0, 8, 9]} @@ -361,3 +386,37 @@ def test_joinasof_no_keys(constructor: Any) -> None: match=msg, ): df.join_asof(df, left_on="a", right_on="a", on="a") # type: ignore[arg-type] + + +def test_joinasof_by_exceptions(constructor: Any) -> None: + data = {"a": [1, 3, 2], "b": [4, 4, 6], "z": [7.0, 8, 9]} + df = nw.from_native(constructor(data)) + with pytest.raises( + ValueError, + match=r"Can not specify `by_left`, `by_right`, and `by` keys at the same time.", + ): + df.join_asof(df, on="a", by_left="b", by_right="b", by="b") # type: ignore[arg-type] + 
+ with pytest.raises( + ValueError, + match=r"`by_right` can not be None if `by_left` is specified.", + ): + df.join_asof(df, on="a", by_left="b") # type: ignore[arg-type] + + with pytest.raises( + ValueError, + match=r"`by_left` can not be None if `by_right` is specified.", + ): + df.join_asof(df, on="a", by_right="b") # type: ignore[arg-type] + + with pytest.raises( + ValueError, + match=r"Either \(`by_left` and `by_right_`\) or `by` keys should be specified.", + ): + df.join_asof(df, on="a", by_left="b", by="b") # type: ignore[arg-type] + + with pytest.raises( + ValueError, + match=r"Either \(`by_left` and `by_right_`\) or `by` keys should be specified.", + ): + df.join_asof(df, on="a", by_right="b", by="b") # type: ignore[arg-type] From b32fd4ac089c4d848eb6a92cbb2e23080f74824c Mon Sep 17 00:00:00 2001 From: Isaias Gutierrez-Cruz <64386035+IsaiasGutierrezCruz@users.noreply.github.com> Date: Sat, 7 Sep 2024 14:45:47 -0600 Subject: [PATCH 14/30] feat: add first implementation of query 8 (#922) --- tpch/execute/q8.py | 39 +++++++++++++++++++++++++++++++++++ tpch/queries/q8.py | 51 ++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 90 insertions(+) create mode 100644 tpch/execute/q8.py create mode 100644 tpch/queries/q8.py diff --git a/tpch/execute/q8.py b/tpch/execute/q8.py new file mode 100644 index 000000000..f1a8677ff --- /dev/null +++ b/tpch/execute/q8.py @@ -0,0 +1,39 @@ +from queries import q8 + +from . import IO_FUNCS +from . import customer +from . import lineitem +from . import nation +from . import orders +from . import part +from . import region +from . import supplier + +tool = "pandas[pyarrow]" +fn = IO_FUNCS[tool] +print( + q8.query( + fn(part), + fn(supplier), + fn(lineitem), + fn(orders), + fn(customer), + fn(nation), + fn(region), + ) +) + + +tool = "polars[lazy]" +fn = IO_FUNCS[tool] +print( + q8.query( + fn(part), + fn(supplier), + fn(lineitem), + fn(orders), + fn(customer), + fn(nation), + fn(region), + ).collect() +) diff --git a/tpch/queries/q8.py b/tpch/queries/q8.py new file mode 100644 index 000000000..3fba96313 --- /dev/null +++ b/tpch/queries/q8.py @@ -0,0 +1,51 @@ +from datetime import date + +import narwhals as nw +from narwhals.typing import FrameT + + +@nw.narwhalify +def query( + part_ds: FrameT, + supplier_ds: FrameT, + line_item_ds: FrameT, + orders_ds: FrameT, + customer_ds: FrameT, + nation_ds: FrameT, + region_ds: FrameT, +) -> FrameT: + nation = "BRAZIL" + region = "AMERICA" + type = "ECONOMY ANODIZED STEEL" + date1 = date(1995, 1, 1) + date2 = date(1996, 12, 31) + + n1 = nation_ds.select("n_nationkey", "n_regionkey") + n2 = nation_ds.select("n_nationkey", "n_name") + + return ( + part_ds.join(line_item_ds, left_on="p_partkey", right_on="l_partkey") + .join(supplier_ds, left_on="l_suppkey", right_on="s_suppkey") + .join(orders_ds, left_on="l_orderkey", right_on="o_orderkey") + .join(customer_ds, left_on="o_custkey", right_on="c_custkey") + .join(n1, left_on="c_nationkey", right_on="n_nationkey") + .join(region_ds, left_on="n_regionkey", right_on="r_regionkey") + .filter(nw.col("r_name") == region) + .join(n2, left_on="s_nationkey", right_on="n_nationkey") + .filter(nw.col("o_orderdate").is_between(date1, date2)) + .filter(nw.col("p_type") == type) + .select( + nw.col("o_orderdate").dt.year().alias("o_year"), + (nw.col("l_extendedprice") * (1 - nw.col("l_discount"))).alias("volume"), + nw.col("n_name").alias("nation"), + ) + .with_columns( + nw.when(nw.col("nation") == nation) + .then(nw.col("volume")) + .otherwise(0) + .alias("_tmp") + ) + 
.group_by("o_year") + .agg((nw.sum("_tmp") / nw.sum("volume")).round(2).alias("mkt_share")) + .sort("o_year") + ) From 525d92df9b644c30ae1f7371cca523bd0586a095 Mon Sep 17 00:00:00 2001 From: Alessandro Miola <37796412+AlessandroMiola@users.noreply.github.com> Date: Sun, 8 Sep 2024 12:06:13 +0200 Subject: [PATCH 15/30] docs: fix a typo (#925) --- docs/why.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/why.md b/docs/why.md index adf8f39b4..4ec605d16 100644 --- a/docs/why.md +++ b/docs/why.md @@ -27,7 +27,7 @@ pl_df_right = pl.DataFrame({"a": [1, 2, 3], "c": [4, 5, 6]}) pl_left_merge = pl_df_left.join(pl_df_right, left_on="b", right_on="c", how="left") print(pd_left_merge.columns) -print(pl_df_right.columns) +print(pl_left_merge.columns) ``` There are several such subtle difference between the libraries. Writing dataframe-agnostic code is hard! From ee8c62a4bbc8201360d0e43a7056b2f163c31918 Mon Sep 17 00:00:00 2001 From: "Thomas J. Fan" Date: Sun, 8 Sep 2024 06:49:35 -0400 Subject: [PATCH 16/30] fix: Fixes arrow support for df[:, list[int|str]] (#923) --- narwhals/_arrow/dataframe.py | 9 ++++++--- narwhals/dataframe.py | 10 +++++++++- narwhals/stable/v1.py | 4 ++++ tests/frame/slice_test.py | 6 ++++++ 4 files changed, 25 insertions(+), 4 deletions(-) diff --git a/narwhals/_arrow/dataframe.py b/narwhals/_arrow/dataframe.py index 960d833a5..064903d74 100644 --- a/narwhals/_arrow/dataframe.py +++ b/narwhals/_arrow/dataframe.py @@ -136,9 +136,12 @@ def __getitem__( and len(item) == 2 and isinstance(item[1], (list, tuple)) ): - return self._from_native_frame( - self._native_frame.take(item[0]).select(item[1]) - ) + if item[0] == slice(None): + selected_rows = self._native_frame + else: + selected_rows = self._native_frame.take(item[0]) + + return self._from_native_frame(selected_rows.select(item[1])) elif isinstance(item, tuple) and len(item) == 2: if isinstance(item[1], slice): diff --git a/narwhals/dataframe.py b/narwhals/dataframe.py index 165b65981..f6fccb274 100644 --- a/narwhals/dataframe.py +++ b/narwhals/dataframe.py @@ -585,10 +585,14 @@ def __getitem__(self, item: tuple[Sequence[int], slice]) -> Self: ... @overload def __getitem__(self, item: tuple[Sequence[int], Sequence[int]]) -> Self: ... @overload + def __getitem__(self, item: tuple[slice, Sequence[int]]) -> Self: ... + @overload def __getitem__(self, item: tuple[Sequence[int], str]) -> Series: ... # type: ignore[overload-overlap] @overload def __getitem__(self, item: tuple[Sequence[int], Sequence[str]]) -> Self: ... @overload + def __getitem__(self, item: tuple[slice, Sequence[str]]) -> Self: ... + @overload def __getitem__(self, item: tuple[Sequence[int], int]) -> Series: ... # type: ignore[overload-overlap] @overload @@ -606,7 +610,7 @@ def __getitem__( | slice | Sequence[int] | tuple[Sequence[int], str | int] - | tuple[Sequence[int], Sequence[int] | Sequence[str] | slice], + | tuple[slice | Sequence[int], Sequence[int] | Sequence[str] | slice], ) -> Series | Self: """ Extract column or slice of DataFrame. @@ -623,6 +627,10 @@ def __getitem__( a `Series`. - `df[[0, 1], [0, 1, 2]]` extracts the first two rows and the first three columns and returns a `DataFrame` + - `df[:, [0, 1, 2]]` extracts all rows from the first three columns and returns a + `DataFrame`. + - `df[:, ['a', 'c']]` extracts all rows and columns `'a'` and `'c'` and returns a + `DataFrame`. 
- `df[0: 2, ['a', 'c']]` extracts the first two rows and columns `'a'` and `'c'` and returns a `DataFrame` - `df[:, 0: 2]` extracts all rows from the first two columns and returns a `DataFrame` diff --git a/narwhals/stable/v1.py b/narwhals/stable/v1.py index 98ee12a7b..1af7a26f3 100644 --- a/narwhals/stable/v1.py +++ b/narwhals/stable/v1.py @@ -76,11 +76,15 @@ class DataFrame(NwDataFrame[IntoDataFrameT]): def __getitem__(self, item: tuple[Sequence[int], slice]) -> Self: ... @overload def __getitem__(self, item: tuple[Sequence[int], Sequence[int]]) -> Self: ... + @overload + def __getitem__(self, item: tuple[slice, Sequence[int]]) -> Self: ... @overload def __getitem__(self, item: tuple[Sequence[int], str]) -> Series: ... # type: ignore[overload-overlap] @overload def __getitem__(self, item: tuple[Sequence[int], Sequence[str]]) -> Self: ... + @overload + def __getitem__(self, item: tuple[slice, Sequence[str]]) -> Self: ... @overload def __getitem__(self, item: tuple[Sequence[int], int]) -> Series: ... # type: ignore[overload-overlap] diff --git a/tests/frame/slice_test.py b/tests/frame/slice_test.py index eea94d440..18b05bf3b 100644 --- a/tests/frame/slice_test.py +++ b/tests/frame/slice_test.py @@ -141,6 +141,12 @@ def test_slice_slice_columns(constructor_eager: Any) -> None: result = df[[0, 1], 1:] expected = {"b": [4, 5], "c": [7, 8], "d": [1, 4]} compare_dicts(result, expected) + result = df[:, ["b", "d"]] + expected = {"b": [4, 5, 6], "d": [1, 4, 2]} + compare_dicts(result, expected) + result = df[:, [0, 2]] + expected = {"a": [1, 2, 3], "c": [7, 8, 9]} + compare_dicts(result, expected) def test_slice_invalid(constructor_eager: Any) -> None: From 64b58a7daa77e83a18d3b2487c58f8e77fd7a8a3 Mon Sep 17 00:00:00 2001 From: Marco Edward Gorelli Date: Sun, 8 Sep 2024 12:02:12 +0100 Subject: [PATCH 17/30] release: Bump version to 1.6.3 (#926) --- docs/installation.md | 2 +- narwhals/__init__.py | 2 +- pyproject.toml | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/installation.md b/docs/installation.md index 796cd8708..b89aa4b69 100644 --- a/docs/installation.md +++ b/docs/installation.md @@ -11,6 +11,6 @@ Then, if you start the Python REPL and see the following: ```python >>> import narwhals >>> narwhals.__version__ -'1.6.2' +'1.6.3' ``` then installation worked correctly! 
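
Reviewer note on the `__getitem__` change in the patch above: with the fix, `df[:, cols]` on the PyArrow backend skips the row `take` entirely when the row selector is `slice(None)`. A minimal sketch of the newly covered forms, mirroring the cases in `tests/frame/slice_test.py` — this assumes narwhals >= 1.6.3 with this change installed, plus a local pyarrow; `eager_only=True` is the standard `nw.from_native` flag, not something introduced by this patch:

```python
# Sketch: slicing forms exercised by the new tests, on the PyArrow backend.
import pyarrow as pa

import narwhals as nw

tbl = pa.table({"a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]})
df = nw.from_native(tbl, eager_only=True)

# All rows, columns selected by name -> DataFrame
print(df[:, ["a", "c"]].to_native())

# All rows, columns selected by position -> DataFrame
print(df[:, [0, 2]].to_native())

# Row positions and column positions together still work as before
print(df[[0, 1], [0, 2]].to_native())
```
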
diff --git a/narwhals/__init__.py b/narwhals/__init__.py index 56c638e84..b26cf9490 100644 --- a/narwhals/__init__.py +++ b/narwhals/__init__.py @@ -53,7 +53,7 @@ from narwhals.utils import maybe_get_index from narwhals.utils import maybe_set_index -__version__ = "1.6.2" +__version__ = "1.6.3" __all__ = [ "dependencies", diff --git a/pyproject.toml b/pyproject.toml index c4a10603f..a928ae0df 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "hatchling.build" [project] name = "narwhals" -version = "1.6.2" +version = "1.6.3" authors = [ { name="Marco Gorelli", email="33491632+MarcoGorelli@users.noreply.github.com" }, ] From 693d53a0014c407e57c19854ece235d0ccb39227 Mon Sep 17 00:00:00 2001 From: raisadz <34237447+raisadz@users.noreply.github.com> Date: Sun, 8 Sep 2024 13:54:43 +0100 Subject: [PATCH 18/30] feat: add `on` key to `join` (#927) --- narwhals/_arrow/dataframe.py | 5 -- narwhals/_dask/dataframe.py | 5 -- narwhals/_pandas_like/dataframe.py | 5 -- narwhals/dataframe.py | 81 ++++++++++++++++++------------ tests/frame/join_test.py | 71 ++++++++++++++++++++------ 5 files changed, 106 insertions(+), 61 deletions(-) diff --git a/narwhals/_arrow/dataframe.py b/narwhals/_arrow/dataframe.py index 064903d74..2750f8c09 100644 --- a/narwhals/_arrow/dataframe.py +++ b/narwhals/_arrow/dataframe.py @@ -278,11 +278,6 @@ def join( left_on: str | list[str] | None, right_on: str | list[str] | None, ) -> Self: - if isinstance(left_on, str): - left_on = [left_on] - if isinstance(right_on, str): - right_on = [right_on] - how_to_join_map = { "anti": "left anti", "semi": "left semi", diff --git a/narwhals/_dask/dataframe.py b/narwhals/_dask/dataframe.py index 5ef8c5a9d..1a40d7a6c 100644 --- a/narwhals/_dask/dataframe.py +++ b/narwhals/_dask/dataframe.py @@ -209,11 +209,6 @@ def join( left_on: str | list[str] | None, right_on: str | list[str] | None, ) -> Self: - if isinstance(left_on, str): - left_on = [left_on] - if isinstance(right_on, str): - right_on = [right_on] - if how == "cross": key_token = generate_unique_token( n_bytes=8, columns=[*self.columns, *other.columns] diff --git a/narwhals/_pandas_like/dataframe.py b/narwhals/_pandas_like/dataframe.py index 3040adda0..880e2d140 100644 --- a/narwhals/_pandas_like/dataframe.py +++ b/narwhals/_pandas_like/dataframe.py @@ -404,11 +404,6 @@ def join( left_on: str | list[str] | None, right_on: str | list[str] | None, ) -> Self: - if isinstance(left_on, str): - left_on = [left_on] - if isinstance(right_on, str): - right_on = [right_on] - if how == "cross": if ( self._implementation is Implementation.MODIN diff --git a/narwhals/dataframe.py b/narwhals/dataframe.py index f6fccb274..ffd7ce36d 100644 --- a/narwhals/dataframe.py +++ b/narwhals/dataframe.py @@ -186,6 +186,7 @@ def join( how: Literal["inner", "left", "cross", "semi", "anti"] = "inner", left_on: str | list[str] | None = None, right_on: str | list[str] | None = None, + on: str | list[str] | None = None, ) -> Self: _supported_joins = ("inner", "left", "cross", "anti", "semi") @@ -193,10 +194,25 @@ def join( msg = f"Only the following join strategies are supported: {_supported_joins}; found '{how}'." 
raise NotImplementedError(msg) - if how == "cross" and (left_on or right_on): - msg = "Can not pass left_on, right_on for cross join" + if how == "cross" and ( + left_on is not None or right_on is not None or on is not None + ): + msg = "Can not pass `left_on`, `right_on` or `on` keys for cross join" + raise ValueError(msg) + + if how != "cross" and (on is None and (left_on is None or right_on is None)): + msg = f"Either (`left_on` and `right_on`) or `on` keys should be specified for {how}." raise ValueError(msg) + if how != "cross" and ( + on is not None and (left_on is not None or right_on is not None) + ): + msg = f"If `on` is specified, `left_on` and `right_on` should be None for {how}." + raise ValueError(msg) + + if on is not None: + left_on = right_on = on + return self._from_compliant_dataframe( self._compliant_frame.join( self._extract_compliant(other), @@ -232,37 +248,24 @@ def join_asof( msg = f"Only the following strategies are supported: {_supported_strategies}; found '{strategy}'." raise NotImplementedError(msg) - if left_on is not None and right_on is not None and on is not None: + if (on is None) and (left_on is None or right_on is None): msg = "Either (`left_on` and `right_on`) or `on` keys should be specified." raise ValueError(msg) - if by_left is not None and by_right is not None and by is not None: - msg = "Can not specify `by_left`, `by_right`, and `by` keys at the same time." - raise ValueError(msg) - if by_left is not None and by_right is None and by is None: - msg = "`by_right` can not be None if `by_left` is specified." - raise ValueError(msg) - if by_left is None and by_right is not None and by is None: - msg = "`by_left` can not be None if `by_right` is specified." + if (on is not None) and (left_on is not None or right_on is not None): + msg = "If `on` is specified, `left_on` and `right_on` should be None." raise ValueError(msg) - if ( + if (by is None) and ( (by_left is None and by_right is not None) or (by_left is not None and by_right is None) - ) and by is not None: - msg = "Either (`by_left` and `by_right_`) or `by` keys should be specified." - raise ValueError(msg) - if left_on is not None and right_on is not None: - return self._from_compliant_dataframe( - self._compliant_frame.join_asof( - self._extract_compliant(other), - left_on=left_on, - right_on=right_on, - by_left=by_left, - by_right=by_right, - by=by, - strategy=strategy, - ) + ): + msg = ( + "Can not specify only `by_left` or `by_right`, you need to specify both." ) - elif on is not None: + raise ValueError(msg) + if (by is not None) and (by_left is not None or by_right is not None): + msg = "If `by` is specified, `by_left` and `by_right` should be None." + raise ValueError(msg) + if on is not None: return self._from_compliant_dataframe( self._compliant_frame.join_asof( self._extract_compliant(other), @@ -273,9 +276,17 @@ def join_asof( strategy=strategy, ) ) - else: - msg = "Either (`left_on` and `right_on`) or `on` keys should be specified." - raise ValueError(msg) + return self._from_compliant_dataframe( + self._compliant_frame.join_asof( + self._extract_compliant(other), + left_on=left_on, + right_on=right_on, + by_left=by_left, + by_right=by_right, + by=by, + strategy=strategy, + ) + ) class DataFrame(BaseFrame[FrameT]): @@ -1843,6 +1854,7 @@ def join( how: Literal["inner", "left", "cross", "semi", "anti"] = "inner", left_on: str | list[str] | None = None, right_on: str | list[str] | None = None, + on: str | list[str] | None = None, ) -> Self: r""" Join in SQL-like fashion. 
@@ -1861,6 +1873,8 @@ def join( right_on: Name(s) of the right join column(s). + on: Join column of both DataFrames. If set, left_on and right_on should be None. + Returns: A new joined DataFrame @@ -1908,7 +1922,7 @@ def join( │ 2 ┆ 7.0 ┆ b ┆ y │ └─────┴─────┴─────┴───────┘ """ - return super().join(other, how=how, left_on=left_on, right_on=right_on) + return super().join(other, how=how, left_on=left_on, right_on=right_on, on=on) def join_asof( self, @@ -3568,6 +3582,7 @@ def join( how: Literal["inner", "left", "cross", "semi", "anti"] = "inner", left_on: str | list[str] | None = None, right_on: str | list[str] | None = None, + on: str | list[str] | None = None, ) -> Self: r""" Add a join operation to the Logical Plan. @@ -3586,6 +3601,8 @@ def join( right_on: Join column of the right DataFrame. + on: Join column of both DataFrames. If set, left_on and right_on should be None. + Returns: A new joined LazyFrame @@ -3633,7 +3650,7 @@ def join( │ 2 ┆ 7.0 ┆ b ┆ y │ └─────┴─────┴─────┴───────┘ """ - return super().join(other, how=how, left_on=left_on, right_on=right_on) + return super().join(other, how=how, left_on=left_on, right_on=right_on, on=on) def join_asof( self, diff --git a/tests/frame/join_test.py b/tests/frame/join_test.py index 34a5961ef..6615d5031 100644 --- a/tests/frame/join_test.py +++ b/tests/frame/join_test.py @@ -18,8 +18,9 @@ def test_inner_join_two_keys(constructor: Any) -> None: df = nw.from_native(constructor(data)) df_right = df result = df.join(df_right, left_on=["a", "b"], right_on=["a", "b"], how="inner") # type: ignore[arg-type] - result = result.sort("index") - result = result.drop("index_right") + result_on = df.join(df_right, on=["a", "b"], how="inner") # type: ignore[arg-type] + result = result.sort("index").drop("index_right") + result_on = result_on.sort("index").drop("index_right") expected = { "a": [1, 3, 2], "b": [4, 4, 6], @@ -28,6 +29,7 @@ def test_inner_join_two_keys(constructor: Any) -> None: "index": [0, 1, 2], } compare_dicts(result, expected) + compare_dicts(result_on, expected) def test_inner_join_single_key(constructor: Any) -> None: @@ -35,7 +37,9 @@ def test_inner_join_single_key(constructor: Any) -> None: df = nw.from_native(constructor(data)) df_right = df result = df.join(df_right, left_on="a", right_on="a", how="inner").sort("index") # type: ignore[arg-type] + result_on = df.join(df_right, on="a", how="inner").sort("index") # type: ignore[arg-type] result = result.drop("index_right") + result_on = result_on.drop("index_right") expected = { "a": [1, 3, 2], "b": [4, 4, 6], @@ -45,6 +49,7 @@ def test_inner_join_single_key(constructor: Any) -> None: "index": [0, 1, 2], } compare_dicts(result, expected) + compare_dicts(result_on, expected) def test_cross_join(constructor: Any) -> None: @@ -57,7 +62,9 @@ def test_cross_join(constructor: Any) -> None: } compare_dicts(result, expected) - with pytest.raises(ValueError, match="Can not pass left_on, right_on for cross join"): + with pytest.raises( + ValueError, match="Can not pass `left_on`, `right_on` or `on` keys for cross join" + ): df.join(df, how="cross", left_on="a") # type: ignore[arg-type] @@ -206,6 +213,33 @@ def test_left_join_overlapping_column(constructor: Any) -> None: compare_dicts(result, expected) +@pytest.mark.parametrize("how", ["inner", "left", "semi", "anti"]) +def test_join_keys_exceptions(constructor: Any, how: str) -> None: + data = {"a": [1, 3, 2], "b": [4, 4, 6], "z": [7.0, 8, 9]} + df = nw.from_native(constructor(data)) + + with pytest.raises( + ValueError, + match=rf"Either 
\(`left_on` and `right_on`\) or `on` keys should be specified for {how}.", + ): + df.join(df, how=how) # type: ignore[arg-type] + with pytest.raises( + ValueError, + match=rf"Either \(`left_on` and `right_on`\) or `on` keys should be specified for {how}.", + ): + df.join(df, how=how, left_on="a") # type: ignore[arg-type] + with pytest.raises( + ValueError, + match=rf"Either \(`left_on` and `right_on`\) or `on` keys should be specified for {how}.", + ): + df.join(df, how=how, right_on="a") # type: ignore[arg-type] + with pytest.raises( + ValueError, + match=f"If `on` is specified, `left_on` and `right_on` should be None for {how}.", + ): + df.join(df, how=how, on="a", right_on="a") # type: ignore[arg-type] + + def test_joinasof_numeric(constructor: Any, request: Any) -> None: if "pyarrow_table" in str(constructor): request.applymarker(pytest.mark.xfail) @@ -361,31 +395,40 @@ def test_joinasof_not_implemented(constructor: Any, strategy: str) -> None: df.join_asof(df, left_on="a", right_on="a", strategy=strategy) # type: ignore[arg-type] -def test_joinasof_no_keys(constructor: Any) -> None: +def test_joinasof_keys_exceptions(constructor: Any) -> None: data = {"a": [1, 3, 2], "b": [4, 4, 6], "z": [7.0, 8, 9]} df = nw.from_native(constructor(data)) - msg = r"Either \(`left_on` and `right_on`\) or `on` keys should be specified." with pytest.raises( ValueError, - match=msg, + match=r"Either \(`left_on` and `right_on`\) or `on` keys should be specified.", ): df.join_asof(df, left_on="a") # type: ignore[arg-type] with pytest.raises( ValueError, - match=msg, + match=r"Either \(`left_on` and `right_on`\) or `on` keys should be specified.", ): df.join_asof(df, right_on="a") # type: ignore[arg-type] with pytest.raises( ValueError, - match=msg, + match=r"Either \(`left_on` and `right_on`\) or `on` keys should be specified.", ): df.join_asof(df) # type: ignore[arg-type] with pytest.raises( ValueError, - match=msg, + match="If `on` is specified, `left_on` and `right_on` should be None.", ): df.join_asof(df, left_on="a", right_on="a", on="a") # type: ignore[arg-type] + with pytest.raises( + ValueError, + match="If `on` is specified, `left_on` and `right_on` should be None.", + ): + df.join_asof(df, left_on="a", on="a") # type: ignore[arg-type] + with pytest.raises( + ValueError, + match="If `on` is specified, `left_on` and `right_on` should be None.", + ): + df.join_asof(df, right_on="a", on="a") # type: ignore[arg-type] def test_joinasof_by_exceptions(constructor: Any) -> None: @@ -393,30 +436,30 @@ def test_joinasof_by_exceptions(constructor: Any) -> None: df = nw.from_native(constructor(data)) with pytest.raises( ValueError, - match=r"Can not specify `by_left`, `by_right`, and `by` keys at the same time.", + match="If `by` is specified, `by_left` and `by_right` should be None.", ): df.join_asof(df, on="a", by_left="b", by_right="b", by="b") # type: ignore[arg-type] with pytest.raises( ValueError, - match=r"`by_right` can not be None if `by_left` is specified.", + match="Can not specify only `by_left` or `by_right`, you need to specify both.", ): df.join_asof(df, on="a", by_left="b") # type: ignore[arg-type] with pytest.raises( ValueError, - match=r"`by_left` can not be None if `by_right` is specified.", + match="Can not specify only `by_left` or `by_right`, you need to specify both.", ): df.join_asof(df, on="a", by_right="b") # type: ignore[arg-type] with pytest.raises( ValueError, - match=r"Either \(`by_left` and `by_right_`\) or `by` keys should be specified.", + match="If `by` is specified, `by_left` 
and `by_right` should be None.", ): df.join_asof(df, on="a", by_left="b", by="b") # type: ignore[arg-type] with pytest.raises( ValueError, - match=r"Either \(`by_left` and `by_right_`\) or `by` keys should be specified.", + match="If `by` is specified, `by_left` and `by_right` should be None.", ): df.join_asof(df, on="a", by_right="b", by="b") # type: ignore[arg-type] From 7bfb7764d7ade3cf0753ee759379f090d47e6f41 Mon Sep 17 00:00:00 2001 From: Liam Connors Date: Mon, 9 Sep 2024 03:04:53 -0400 Subject: [PATCH 19/30] feat: implement `to_arrow` for cuDF (#924) * xfail to_arrow tests for cuDF * Revert "xfail to_arrow tests for cuDF" This reverts commit d695efd210fb30b2e2e5beea84a67d25836f152b. * implement to_arrow for cuDF --- narwhals/_pandas_like/dataframe.py | 3 +-- narwhals/_pandas_like/series.py | 3 +-- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/narwhals/_pandas_like/dataframe.py b/narwhals/_pandas_like/dataframe.py index 880e2d140..ce0cae8ac 100644 --- a/narwhals/_pandas_like/dataframe.py +++ b/narwhals/_pandas_like/dataframe.py @@ -672,8 +672,7 @@ def gather_every(self: Self, n: int, offset: int = 0) -> Self: def to_arrow(self: Self) -> Any: if self._implementation is Implementation.CUDF: # pragma: no cover - msg = "`to_arrow` is not implemented for CuDF backend." - raise NotImplementedError(msg) + return self._native_frame.to_arrow(preserve_index=False) import pyarrow as pa # ignore-banned-import() diff --git a/narwhals/_pandas_like/series.py b/narwhals/_pandas_like/series.py index e94c95a8c..b28a04088 100644 --- a/narwhals/_pandas_like/series.py +++ b/narwhals/_pandas_like/series.py @@ -635,8 +635,7 @@ def clip( def to_arrow(self: Self) -> Any: if self._implementation is Implementation.CUDF: # pragma: no cover - msg = "`to_arrow` is not implemented for CuDF backend." - raise NotImplementedError(msg) + return self._native_series.to_arrow() import pyarrow as pa # ignore-banned-import() From e7b3b83e8e6808ff7d6c43f4edf2a03ba5aceac5 Mon Sep 17 00:00:00 2001 From: Liam Connors Date: Mon, 9 Sep 2024 03:18:19 -0400 Subject: [PATCH 20/30] update drop_nulls docstring (#928) --- narwhals/series.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/narwhals/series.py b/narwhals/series.py index d80564d22..0b1645ea7 100644 --- a/narwhals/series.py +++ b/narwhals/series.py @@ -783,12 +783,9 @@ def drop_nulls(self) -> Self: """ Drop all null values. - See Also: - drop_nans - Notes: - A null value is not the same as a NaN value. - To drop NaN values, use :func:`drop_nans`. + pandas and Polars handle null values differently. Polars distinguishes + between NaN and Null, whereas pandas doesn't. 
Examples: >>> import pandas as pd From 767fbfb138c1efc665723e4c08ecab741d56140c Mon Sep 17 00:00:00 2001 From: Marco Edward Gorelli Date: Mon, 9 Sep 2024 14:54:32 +0100 Subject: [PATCH 21/30] fix: (#930) fix join when using string literals --- narwhals/_pandas_like/dataframe.py | 4 + tests/frame/join_test.py | 283 +++++++++++++++++++---------- 2 files changed, 191 insertions(+), 96 deletions(-) diff --git a/narwhals/_pandas_like/dataframe.py b/narwhals/_pandas_like/dataframe.py index ce0cae8ac..4ec42ef59 100644 --- a/narwhals/_pandas_like/dataframe.py +++ b/narwhals/_pandas_like/dataframe.py @@ -404,6 +404,10 @@ def join( left_on: str | list[str] | None, right_on: str | list[str] | None, ) -> Self: + if isinstance(left_on, str): + left_on = [left_on] + if isinstance(right_on, str): + right_on = [right_on] if how == "cross": if ( self._implementation is Implementation.MODIN diff --git a/tests/frame/join_test.py b/tests/frame/join_test.py index 6615d5031..6a1985f41 100644 --- a/tests/frame/join_test.py +++ b/tests/frame/join_test.py @@ -3,6 +3,7 @@ import re from datetime import datetime from typing import Any +from typing import Literal import pandas as pd import pytest @@ -14,18 +15,28 @@ def test_inner_join_two_keys(constructor: Any) -> None: - data = {"a": [1, 3, 2], "b": [4, 4, 6], "z": [7.0, 8, 9], "index": [0, 1, 2]} + data = { + "antananarivo": [1, 3, 2], + "bob": [4, 4, 6], + "zorro": [7.0, 8, 9], + "index": [0, 1, 2], + } df = nw.from_native(constructor(data)) df_right = df - result = df.join(df_right, left_on=["a", "b"], right_on=["a", "b"], how="inner") # type: ignore[arg-type] - result_on = df.join(df_right, on=["a", "b"], how="inner") # type: ignore[arg-type] + result = df.join( + df_right, # type: ignore[arg-type] + left_on=["antananarivo", "bob"], + right_on=["antananarivo", "bob"], + how="inner", + ) + result_on = df.join(df_right, on=["antananarivo", "bob"], how="inner") # type: ignore[arg-type] result = result.sort("index").drop("index_right") result_on = result_on.sort("index").drop("index_right") expected = { - "a": [1, 3, 2], - "b": [4, 4, 6], - "z": [7.0, 8, 9], - "z_right": [7.0, 8, 9], + "antananarivo": [1, 3, 2], + "bob": [4, 4, 6], + "zorro": [7.0, 8, 9], + "zorro_right": [7.0, 8, 9], "index": [0, 1, 2], } compare_dicts(result, expected) @@ -33,19 +44,29 @@ def test_inner_join_two_keys(constructor: Any) -> None: def test_inner_join_single_key(constructor: Any) -> None: - data = {"a": [1, 3, 2], "b": [4, 4, 6], "z": [7.0, 8, 9], "index": [0, 1, 2]} + data = { + "antananarivo": [1, 3, 2], + "bob": [4, 4, 6], + "zorro": [7.0, 8, 9], + "index": [0, 1, 2], + } df = nw.from_native(constructor(data)) df_right = df - result = df.join(df_right, left_on="a", right_on="a", how="inner").sort("index") # type: ignore[arg-type] - result_on = df.join(df_right, on="a", how="inner").sort("index") # type: ignore[arg-type] + result = df.join( + df_right, # type: ignore[arg-type] + left_on="antananarivo", + right_on="antananarivo", + how="inner", + ).sort("index") + result_on = df.join(df_right, on="antananarivo", how="inner").sort("index") # type: ignore[arg-type] result = result.drop("index_right") result_on = result_on.drop("index_right") expected = { - "a": [1, 3, 2], - "b": [4, 4, 6], - "b_right": [4, 4, 6], - "z": [7.0, 8, 9], - "z_right": [7.0, 8, 9], + "antananarivo": [1, 3, 2], + "bob": [4, 4, 6], + "bob_right": [4, 4, 6], + "zorro": [7.0, 8, 9], + "zorro_right": [7.0, 8, 9], "index": [0, 1, 2], } compare_dicts(result, expected) @@ -53,30 +74,30 @@ def 
test_inner_join_single_key(constructor: Any) -> None: def test_cross_join(constructor: Any) -> None: - data = {"a": [1, 3, 2]} + data = {"antananarivo": [1, 3, 2]} df = nw.from_native(constructor(data)) - result = df.join(df, how="cross").sort("a", "a_right") # type: ignore[arg-type] + result = df.join(df, how="cross").sort("antananarivo", "antananarivo_right") # type: ignore[arg-type] expected = { - "a": [1, 1, 1, 2, 2, 2, 3, 3, 3], - "a_right": [1, 2, 3, 1, 2, 3, 1, 2, 3], + "antananarivo": [1, 1, 1, 2, 2, 2, 3, 3, 3], + "antananarivo_right": [1, 2, 3, 1, 2, 3, 1, 2, 3], } compare_dicts(result, expected) with pytest.raises( ValueError, match="Can not pass `left_on`, `right_on` or `on` keys for cross join" ): - df.join(df, how="cross", left_on="a") # type: ignore[arg-type] + df.join(df, how="cross", left_on="antananarivo") # type: ignore[arg-type] def test_cross_join_non_pandas() -> None: - data = {"a": [1, 3, 2]} + data = {"antananarivo": [1, 3, 2]} df = nw.from_native(pd.DataFrame(data)) # HACK to force testing for a non-pandas codepath df._compliant_frame._implementation = Implementation.MODIN result = df.join(df, how="cross") # type: ignore[arg-type] expected = { - "a": [1, 1, 1, 3, 3, 3, 2, 2, 2], - "a_right": [1, 3, 2, 1, 3, 2, 1, 3, 2], + "antananarivo": [1, 1, 1, 3, 3, 3, 2, 2, 2], + "antananarivo_right": [1, 3, 2, 1, 3, 2, 1, 3, 2], } compare_dicts(result, expected) @@ -84,9 +105,17 @@ def test_cross_join_non_pandas() -> None: @pytest.mark.parametrize( ("join_key", "filter_expr", "expected"), [ - (["a", "b"], (nw.col("b") < 5), {"a": [2], "b": [6], "z": [9]}), - (["b"], (nw.col("b") < 5), {"a": [2], "b": [6], "z": [9]}), - (["b"], (nw.col("b") > 5), {"a": [1, 3], "b": [4, 4], "z": [7.0, 8.0]}), + ( + ["antananarivo", "bob"], + (nw.col("bob") < 5), + {"antananarivo": [2], "bob": [6], "zorro": [9]}, + ), + (["bob"], (nw.col("bob") < 5), {"antananarivo": [2], "bob": [6], "zorro": [9]}), + ( + ["bob"], + (nw.col("bob") > 5), + {"antananarivo": [1, 3], "bob": [4, 4], "zorro": [7.0, 8.0]}, + ), ], ) def test_anti_join( @@ -95,7 +124,7 @@ def test_anti_join( filter_expr: nw.Expr, expected: dict[str, list[Any]], ) -> None: - data = {"a": [1, 3, 2], "b": [4, 4, 6], "z": [7.0, 8, 9]} + data = {"antananarivo": [1, 3, 2], "bob": [4, 4, 6], "zorro": [7.0, 8, 9]} df = nw.from_native(constructor(data)) other = df.filter(filter_expr) result = df.join(other, how="anti", left_on=join_key, right_on=join_key) # type: ignore[arg-type] @@ -105,9 +134,21 @@ def test_anti_join( @pytest.mark.parametrize( ("join_key", "filter_expr", "expected"), [ - (["a"], (nw.col("b") > 5), {"a": [2], "b": [6], "z": [9]}), - (["b"], (nw.col("b") < 5), {"a": [1, 3], "b": [4, 4], "z": [7, 8]}), - (["a", "b"], (nw.col("b") < 5), {"a": [1, 3], "b": [4, 4], "z": [7, 8]}), + ( + ["antananarivo"], + (nw.col("bob") > 5), + {"antananarivo": [2], "bob": [6], "zorro": [9]}, + ), + ( + ["bob"], + (nw.col("bob") < 5), + {"antananarivo": [1, 3], "bob": [4, 4], "zorro": [7, 8]}, + ), + ( + ["antananarivo", "bob"], + (nw.col("bob") < 5), + {"antananarivo": [1, 3], "bob": [4, 4], "zorro": [7, 8]}, + ), ], ) def test_semi_join( @@ -116,16 +157,18 @@ def test_semi_join( filter_expr: nw.Expr, expected: dict[str, list[Any]], ) -> None: - data = {"a": [1, 3, 2], "b": [4, 4, 6], "z": [7.0, 8, 9]} + data = {"antananarivo": [1, 3, 2], "bob": [4, 4, 6], "zorro": [7.0, 8, 9]} df = nw.from_native(constructor(data)) other = df.filter(filter_expr) - result = df.join(other, how="semi", left_on=join_key, right_on=join_key).sort("a") # type: 
ignore[arg-type] + result = df.join(other, how="semi", left_on=join_key, right_on=join_key).sort( # type: ignore[arg-type] + "antananarivo" + ) compare_dicts(result, expected) @pytest.mark.parametrize("how", ["right", "full"]) def test_join_not_implemented(constructor: Any, how: str) -> None: - data = {"a": [1, 3, 2], "b": [4, 4, 6], "z": [7.0, 8, 9]} + data = {"antananarivo": [1, 3, 2], "bob": [4, 4, 6], "zorro": [7.0, 8, 9]} df = nw.from_native(constructor(data)) with pytest.raises( @@ -134,24 +177,28 @@ def test_join_not_implemented(constructor: Any, how: str) -> None: f"Only the following join strategies are supported: ('inner', 'left', 'cross', 'anti', 'semi'); found '{how}'." ), ): - df.join(df, left_on="a", right_on="a", how=how) # type: ignore[arg-type] + df.join(df, left_on="antananarivo", right_on="antananarivo", how=how) # type: ignore[arg-type] @pytest.mark.filterwarnings("ignore:the default coalesce behavior") def test_left_join(constructor: Any) -> None: - data_left = {"a": [1.0, 2, 3], "b": [4.0, 5, 6], "index": [0.0, 1.0, 2.0]} - data_right = {"a": [1.0, 2, 3], "c": [4.0, 5, 7], "index": [0.0, 1.0, 2.0]} + data_left = { + "antananarivo": [1.0, 2, 3], + "bob": [4.0, 5, 6], + "index": [0.0, 1.0, 2.0], + } + data_right = {"antananarivo": [1.0, 2, 3], "c": [4.0, 5, 7], "index": [0.0, 1.0, 2.0]} df_left = nw.from_native(constructor(data_left)) df_right = nw.from_native(constructor(data_right)) - result = df_left.join(df_right, left_on="b", right_on="c", how="left").select( # type: ignore[arg-type] + result = df_left.join(df_right, left_on="bob", right_on="c", how="left").select( # type: ignore[arg-type] nw.all().fill_null(float("nan")) ) result = result.sort("index") result = result.drop("index_right") expected = { - "a": [1, 2, 3], - "b": [4, 5, 6], - "a_right": [1, 2, float("nan")], + "antananarivo": [1, 2, 3], + "bob": [4, 5, 6], + "antananarivo_right": [1, 2, float("nan")], "index": [0, 1, 2], } compare_dicts(result, expected) @@ -159,54 +206,62 @@ def test_left_join(constructor: Any) -> None: @pytest.mark.filterwarnings("ignore: the default coalesce behavior") def test_left_join_multiple_column(constructor: Any) -> None: - data_left = {"a": [1, 2, 3], "b": [4, 5, 6], "index": [0, 1, 2]} - data_right = {"a": [1, 2, 3], "c": [4, 5, 6], "index": [0, 1, 2]} + data_left = {"antananarivo": [1, 2, 3], "bob": [4, 5, 6], "index": [0, 1, 2]} + data_right = {"antananarivo": [1, 2, 3], "c": [4, 5, 6], "index": [0, 1, 2]} df_left = nw.from_native(constructor(data_left)) df_right = nw.from_native(constructor(data_right)) - result = df_left.join(df_right, left_on=["a", "b"], right_on=["a", "c"], how="left") # type: ignore[arg-type] + result = df_left.join( + df_right, # type: ignore[arg-type] + left_on=["antananarivo", "bob"], + right_on=["antananarivo", "c"], + how="left", + ) result = result.sort("index") result = result.drop("index_right") - expected = {"a": [1, 2, 3], "b": [4, 5, 6], "index": [0, 1, 2]} + expected = {"antananarivo": [1, 2, 3], "bob": [4, 5, 6], "index": [0, 1, 2]} compare_dicts(result, expected) @pytest.mark.filterwarnings("ignore: the default coalesce behavior") def test_left_join_overlapping_column(constructor: Any) -> None: data_left = { - "a": [1.0, 2, 3], - "b": [4.0, 5, 6], + "antananarivo": [1.0, 2, 3], + "bob": [4.0, 5, 6], "d": [1.0, 4, 2], "index": [0.0, 1.0, 2.0], } data_right = { - "a": [1.0, 2, 3], + "antananarivo": [1.0, 2, 3], "c": [4.0, 5, 6], "d": [1.0, 4, 2], "index": [0.0, 1.0, 2.0], } df_left = nw.from_native(constructor(data_left)) df_right = 
nw.from_native(constructor(data_right)) - result = df_left.join(df_right, left_on="b", right_on="c", how="left").sort("index") # type: ignore[arg-type] + result = df_left.join(df_right, left_on="bob", right_on="c", how="left").sort("index") # type: ignore[arg-type] result = result.drop("index_right") expected: dict[str, list[Any]] = { - "a": [1, 2, 3], - "b": [4, 5, 6], + "antananarivo": [1, 2, 3], + "bob": [4, 5, 6], "d": [1, 4, 2], - "a_right": [1, 2, 3], + "antananarivo_right": [1, 2, 3], "d_right": [1, 4, 2], "index": [0, 1, 2], } compare_dicts(result, expected) - result = df_left.join(df_right, left_on="a", right_on="d", how="left").select( # type: ignore[arg-type] - nw.all().fill_null(float("nan")) - ) + result = df_left.join( + df_right, # type: ignore[arg-type] + left_on="antananarivo", + right_on="d", + how="left", + ).select(nw.all().fill_null(float("nan"))) result = result.sort("index") result = result.drop("index_right") expected = { - "a": [1, 2, 3], - "b": [4, 5, 6], + "antananarivo": [1, 2, 3], + "bob": [4, 5, 6], "d": [1, 4, 2], - "a_right": [1.0, 3.0, float("nan")], + "antananarivo_right": [1.0, 3.0, float("nan")], "c": [4.0, 6.0, float("nan")], "index": [0, 1, 2], } @@ -215,7 +270,7 @@ def test_left_join_overlapping_column(constructor: Any) -> None: @pytest.mark.parametrize("how", ["inner", "left", "semi", "anti"]) def test_join_keys_exceptions(constructor: Any, how: str) -> None: - data = {"a": [1, 3, 2], "b": [4, 4, 6], "z": [7.0, 8, 9]} + data = {"antananarivo": [1, 3, 2], "bob": [4, 4, 6], "zorro": [7.0, 8, 9]} df = nw.from_native(constructor(data)) with pytest.raises( @@ -227,17 +282,17 @@ def test_join_keys_exceptions(constructor: Any, how: str) -> None: ValueError, match=rf"Either \(`left_on` and `right_on`\) or `on` keys should be specified for {how}.", ): - df.join(df, how=how, left_on="a") # type: ignore[arg-type] + df.join(df, how=how, left_on="antananarivo") # type: ignore[arg-type] with pytest.raises( ValueError, match=rf"Either \(`left_on` and `right_on`\) or `on` keys should be specified for {how}.", ): - df.join(df, how=how, right_on="a") # type: ignore[arg-type] + df.join(df, how=how, right_on="antananarivo") # type: ignore[arg-type] with pytest.raises( ValueError, match=f"If `on` is specified, `left_on` and `right_on` should be None for {how}.", ): - df.join(df, how=how, on="a", right_on="a") # type: ignore[arg-type] + df.join(df, how=how, on="antananarivo", right_on="antananarivo") # type: ignore[arg-type] def test_joinasof_numeric(constructor: Any, request: Any) -> None: @@ -247,28 +302,44 @@ def test_joinasof_numeric(constructor: Any, request: Any) -> None: ("pandas_pyarrow" in str(constructor)) or ("pandas_nullable" in str(constructor)) ): request.applymarker(pytest.mark.xfail) - df = nw.from_native(constructor({"a": [1, 5, 10], "val": ["a", "b", "c"]})).sort("a") + df = nw.from_native( + constructor({"antananarivo": [1, 5, 10], "val": ["a", "b", "c"]}) + ).sort("antananarivo") df_right = nw.from_native( - constructor({"a": [1, 2, 3, 6, 7], "val": [1, 2, 3, 6, 7]}) - ).sort("a") - result_backward = df.join_asof(df_right, left_on="a", right_on="a") # type: ignore[arg-type] - result_forward = df.join_asof(df_right, left_on="a", right_on="a", strategy="forward") # type: ignore[arg-type] - result_nearest = df.join_asof(df_right, left_on="a", right_on="a", strategy="nearest") # type: ignore[arg-type] - result_backward_on = df.join_asof(df_right, on="a") # type: ignore[arg-type] - result_forward_on = df.join_asof(df_right, on="a", strategy="forward") # 
type: ignore[arg-type] - result_nearest_on = df.join_asof(df_right, on="a", strategy="nearest") # type: ignore[arg-type] + constructor({"antananarivo": [1, 2, 3, 6, 7], "val": [1, 2, 3, 6, 7]}) + ).sort("antananarivo") + result_backward = df.join_asof( + df_right, # type: ignore[arg-type] + left_on="antananarivo", + right_on="antananarivo", + ) + result_forward = df.join_asof( + df_right, # type: ignore[arg-type] + left_on="antananarivo", + right_on="antananarivo", + strategy="forward", + ) + result_nearest = df.join_asof( + df_right, # type: ignore[arg-type] + left_on="antananarivo", + right_on="antananarivo", + strategy="nearest", + ) + result_backward_on = df.join_asof(df_right, on="antananarivo") # type: ignore[arg-type] + result_forward_on = df.join_asof(df_right, on="antananarivo", strategy="forward") # type: ignore[arg-type] + result_nearest_on = df.join_asof(df_right, on="antananarivo", strategy="nearest") # type: ignore[arg-type] expected_backward = { - "a": [1, 5, 10], + "antananarivo": [1, 5, 10], "val": ["a", "b", "c"], "val_right": [1, 3, 7], } expected_forward = { - "a": [1, 5, 10], + "antananarivo": [1, 5, 10], "val": ["a", "b", "c"], "val_right": [1, 6, float("nan")], } expected_nearest = { - "a": [1, 5, 10], + "antananarivo": [1, 5, 10], "val": ["a", "b", "c"], "val_right": [1, 6, 7], } @@ -366,16 +437,24 @@ def test_joinasof_by(constructor: Any, request: Any) -> None: ): request.applymarker(pytest.mark.xfail) df = nw.from_native( - constructor({"a": [1, 5, 7, 10], "b": ["D", "D", "C", "A"], "c": [9, 2, 1, 1]}) - ).sort("a") + constructor( + { + "antananarivo": [1, 5, 7, 10], + "bob": ["D", "D", "C", "A"], + "c": [9, 2, 1, 1], + } + ) + ).sort("antananarivo") df_right = nw.from_native( - constructor({"a": [1, 4, 5, 8], "b": ["D", "D", "A", "F"], "d": [1, 3, 4, 1]}) - ).sort("a") - result = df.join_asof(df_right, on="a", by_left="b", by_right="b") # type: ignore[arg-type] - result_by = df.join_asof(df_right, on="a", by="b") # type: ignore[arg-type] + constructor( + {"antananarivo": [1, 4, 5, 8], "bob": ["D", "D", "A", "F"], "d": [1, 3, 4, 1]} + ) + ).sort("antananarivo") + result = df.join_asof(df_right, on="antananarivo", by_left="bob", by_right="bob") # type: ignore[arg-type] + result_by = df.join_asof(df_right, on="antananarivo", by="bob") # type: ignore[arg-type] expected = { - "a": [1, 5, 7, 10], - "b": ["D", "D", "C", "A"], + "antananarivo": [1, 5, 7, 10], + "bob": ["D", "D", "C", "A"], "c": [9, 2, 1, 1], "d": [1, 3, float("nan"), 4], } @@ -384,31 +463,38 @@ def test_joinasof_by(constructor: Any, request: Any) -> None: @pytest.mark.parametrize("strategy", ["back", "furthest"]) -def test_joinasof_not_implemented(constructor: Any, strategy: str) -> None: - data = {"a": [1, 3, 2], "b": [4, 4, 6], "z": [7.0, 8, 9]} +def test_joinasof_not_implemented( + constructor: Any, strategy: Literal["backward", "forward"] +) -> None: + data = {"antananarivo": [1, 3, 2], "bob": [4, 4, 6], "zorro": [7.0, 8, 9]} df = nw.from_native(constructor(data)) with pytest.raises( NotImplementedError, match=rf"Only the following strategies are supported: \('backward', 'forward', 'nearest'\); found '{strategy}'.", ): - df.join_asof(df, left_on="a", right_on="a", strategy=strategy) # type: ignore[arg-type] + df.join_asof( + df, # type: ignore[arg-type] + left_on="antananarivo", + right_on="antananarivo", + strategy=strategy, + ) def test_joinasof_keys_exceptions(constructor: Any) -> None: - data = {"a": [1, 3, 2], "b": [4, 4, 6], "z": [7.0, 8, 9]} + data = {"antananarivo": [1, 3, 2], "bob": [4, 4, 
6], "zorro": [7.0, 8, 9]} df = nw.from_native(constructor(data)) with pytest.raises( ValueError, match=r"Either \(`left_on` and `right_on`\) or `on` keys should be specified.", ): - df.join_asof(df, left_on="a") # type: ignore[arg-type] + df.join_asof(df, left_on="antananarivo") # type: ignore[arg-type] with pytest.raises( ValueError, match=r"Either \(`left_on` and `right_on`\) or `on` keys should be specified.", ): - df.join_asof(df, right_on="a") # type: ignore[arg-type] + df.join_asof(df, right_on="antananarivo") # type: ignore[arg-type] with pytest.raises( ValueError, match=r"Either \(`left_on` and `right_on`\) or `on` keys should be specified.", @@ -418,48 +504,53 @@ def test_joinasof_keys_exceptions(constructor: Any) -> None: ValueError, match="If `on` is specified, `left_on` and `right_on` should be None.", ): - df.join_asof(df, left_on="a", right_on="a", on="a") # type: ignore[arg-type] + df.join_asof( + df, # type: ignore[arg-type] + left_on="antananarivo", + right_on="antananarivo", + on="antananarivo", + ) with pytest.raises( ValueError, match="If `on` is specified, `left_on` and `right_on` should be None.", ): - df.join_asof(df, left_on="a", on="a") # type: ignore[arg-type] + df.join_asof(df, left_on="antananarivo", on="antananarivo") # type: ignore[arg-type] with pytest.raises( ValueError, match="If `on` is specified, `left_on` and `right_on` should be None.", ): - df.join_asof(df, right_on="a", on="a") # type: ignore[arg-type] + df.join_asof(df, right_on="antananarivo", on="antananarivo") # type: ignore[arg-type] def test_joinasof_by_exceptions(constructor: Any) -> None: - data = {"a": [1, 3, 2], "b": [4, 4, 6], "z": [7.0, 8, 9]} + data = {"antananarivo": [1, 3, 2], "bob": [4, 4, 6], "zorro": [7.0, 8, 9]} df = nw.from_native(constructor(data)) with pytest.raises( ValueError, match="If `by` is specified, `by_left` and `by_right` should be None.", ): - df.join_asof(df, on="a", by_left="b", by_right="b", by="b") # type: ignore[arg-type] + df.join_asof(df, on="antananarivo", by_left="bob", by_right="bob", by="bob") # type: ignore[arg-type] with pytest.raises( ValueError, match="Can not specify only `by_left` or `by_right`, you need to specify both.", ): - df.join_asof(df, on="a", by_left="b") # type: ignore[arg-type] + df.join_asof(df, on="antananarivo", by_left="bob") # type: ignore[arg-type] with pytest.raises( ValueError, match="Can not specify only `by_left` or `by_right`, you need to specify both.", ): - df.join_asof(df, on="a", by_right="b") # type: ignore[arg-type] + df.join_asof(df, on="antananarivo", by_right="bob") # type: ignore[arg-type] with pytest.raises( ValueError, match="If `by` is specified, `by_left` and `by_right` should be None.", ): - df.join_asof(df, on="a", by_left="b", by="b") # type: ignore[arg-type] + df.join_asof(df, on="antananarivo", by_left="bob", by="bob") # type: ignore[arg-type] with pytest.raises( ValueError, match="If `by` is specified, `by_left` and `by_right` should be None.", ): - df.join_asof(df, on="a", by_right="b", by="b") # type: ignore[arg-type] + df.join_asof(df, on="antananarivo", by_right="bob", by="bob") # type: ignore[arg-type] From 06f7b875fcfcff980f7ccd0fc2b3bce1ce096165 Mon Sep 17 00:00:00 2001 From: Francesco Bruzzesi <42817048+FBruzzesi@users.noreply.github.com> Date: Mon, 9 Sep 2024 15:55:04 +0200 Subject: [PATCH 22/30] refactor pyarrow (#931) --- narwhals/_arrow/group_by.py | 40 ++++++++++++++++++------------------- 1 file changed, 20 insertions(+), 20 deletions(-) diff --git a/narwhals/_arrow/group_by.py 
b/narwhals/_arrow/group_by.py index 78b241c9b..6c7b20485 100644 --- a/narwhals/_arrow/group_by.py +++ b/narwhals/_arrow/group_by.py @@ -16,12 +16,26 @@ from narwhals._arrow.typing import IntoArrowExpr POLARS_TO_ARROW_AGGREGATIONS = { + "len": "count", "n_unique": "count_distinct", "std": "stddev", "var": "variance", # currently unused, we don't have `var` yet } +def get_function_name_option(function_name: str) -> Any | None: + """Map specific pyarrow compute function to respective option to match polars behaviour.""" + import pyarrow.compute as pc # ignore-banned-import + + function_name_to_options = { + "count": pc.CountOptions(mode="all"), + "count_distinct": pc.CountOptions(mode="all"), + "stddev": pc.VarianceOptions(ddof=1), + "variance": pc.VarianceOptions(ddof=1), + } + return function_name_to_options.get(function_name) + + class ArrowGroupBy: def __init__(self, df: ArrowDataFrame, keys: list[str]) -> None: import pyarrow as pa # ignore-banned-import() @@ -119,27 +133,13 @@ def agg_arrow( function_name = remove_prefix(expr._function_name, "col->") function_name = POLARS_TO_ARROW_AGGREGATIONS.get(function_name, function_name) + + option = get_function_name_option(function_name) for root_name, output_name in zip(expr._root_names, expr._output_names): - if function_name == "len": - simple_aggregations[output_name] = ( - (root_name, "count", pc.CountOptions(mode="all")), - f"{root_name}_count", - ) - elif function_name == "count_distinct": - simple_aggregations[output_name] = ( - (root_name, "count_distinct", pc.CountOptions(mode="all")), - f"{root_name}_count_distinct", - ) - elif function_name == "stddev": - simple_aggregations[output_name] = ( - (root_name, "stddev", pc.VarianceOptions(ddof=1)), - f"{root_name}_stddev", - ) - else: - simple_aggregations[output_name] = ( - (root_name, function_name), - f"{root_name}_{function_name}", - ) + simple_aggregations[output_name] = ( + (root_name, function_name, option), + f"{root_name}_{function_name}", + ) aggs: list[Any] = [] name_mapping = {} From 047bb96c13924a99a865870cbd0213aec7ff3ee6 Mon Sep 17 00:00:00 2001 From: Marco Edward Gorelli Date: Mon, 9 Sep 2024 15:02:41 +0100 Subject: [PATCH 23/30] release: Bump version to 1.6.4 (#932) --- docs/installation.md | 2 +- narwhals/__init__.py | 2 +- pyproject.toml | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/installation.md b/docs/installation.md index b89aa4b69..58302d417 100644 --- a/docs/installation.md +++ b/docs/installation.md @@ -11,6 +11,6 @@ Then, if you start the Python REPL and see the following: ```python >>> import narwhals >>> narwhals.__version__ -'1.6.3' +'1.6.4' ``` then installation worked correctly! 
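
For context on the `group_by` refactor above: it collapses the per-function special cases into one `(root_name, function_name, option)` tuple per aggregation, which is exactly the shape `pyarrow.Table.group_by(...).aggregate(...)` accepts. A standalone sketch of that underlying pattern, assuming a recent pyarrow whose `TableGroupBy.aggregate` takes options-bearing tuples; output column names follow pyarrow's `{column}_{function}` convention, which the patch relies on via `f"{root_name}_{function_name}"`:

```python
# Sketch of the pyarrow aggregation pattern the refactor targets.
import pyarrow as pa
import pyarrow.compute as pc

tbl = pa.table({"key": ["x", "x", "y"], "val": [1.0, 2.0, 3.0]})

aggs = [
    # mode="all" counts nulls too, matching Polars' `len` semantics
    ("val", "count", pc.CountOptions(mode="all")),
    # ddof=1 gives the sample standard deviation, matching Polars' `std` default
    ("val", "stddev", pc.VarianceOptions(ddof=1)),
]
result = tbl.group_by(["key"]).aggregate(aggs)

# Aggregated columns come back named "val_count" and "val_stddev"
print(result)
```
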
diff --git a/narwhals/__init__.py b/narwhals/__init__.py index b26cf9490..f410a1b24 100644 --- a/narwhals/__init__.py +++ b/narwhals/__init__.py @@ -53,7 +53,7 @@ from narwhals.utils import maybe_get_index from narwhals.utils import maybe_set_index -__version__ = "1.6.3" +__version__ = "1.6.4" __all__ = [ "dependencies", diff --git a/pyproject.toml b/pyproject.toml index a928ae0df..12482a349 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "hatchling.build" [project] name = "narwhals" -version = "1.6.3" +version = "1.6.4" authors = [ { name="Marco Gorelli", email="33491632+MarcoGorelli@users.noreply.github.com" }, ] From 9246f11f647b68283d9fbd6f39b376633d93c0e7 Mon Sep 17 00:00:00 2001 From: Marco Edward Gorelli Date: Mon, 9 Sep 2024 16:26:20 +0100 Subject: [PATCH 24/30] test: always run tpch tests (#933) --- .github/workflows/check_tpch_queries.yml | 4 ++-- tpch/execute/q10.py | 4 ++++ tpch/execute/q11.py | 4 ++++ tpch/execute/q12.py | 4 ++++ tpch/execute/q13.py | 4 ++++ tpch/execute/q14.py | 4 ++++ tpch/execute/q15.py | 4 ++++ tpch/execute/q16.py | 4 ++++ tpch/execute/q17.py | 4 ++++ tpch/execute/q18.py | 4 ++++ tpch/execute/q19.py | 9 +++------ tpch/execute/q20.py | 9 +++------ tpch/execute/q21.py | 9 +++------ tpch/execute/q22.py | 4 ++++ tpch/execute/q3.py | 4 ++++ tpch/execute/q4.py | 4 ++++ tpch/execute/q5.py | 8 ++++++++ tpch/execute/q6.py | 4 ++++ tpch/execute/q7.py | 4 ++++ tpch/execute/q8.py | 14 ++++++++++++++ tpch/execute/q9.py | 6 ++++++ tpch/queries/q17.py | 5 +++-- tpch/queries/q8.py | 3 ++- 23 files changed, 100 insertions(+), 23 deletions(-) diff --git a/.github/workflows/check_tpch_queries.yml b/.github/workflows/check_tpch_queries.yml index 82a2f4aa4..46dd5df20 100644 --- a/.github/workflows/check_tpch_queries.yml +++ b/.github/workflows/check_tpch_queries.yml @@ -2,11 +2,11 @@ name: Tests for TPCH Queries on: pull_request: - types: [labeled] + push: + branches: [main] jobs: validate-queries: - if: ${{ github.event.label.name == 'full-test' }} strategy: matrix: python-version: ["3.12"] diff --git a/tpch/execute/q10.py b/tpch/execute/q10.py index 99d850f53..e1d56d36b 100644 --- a/tpch/execute/q10.py +++ b/tpch/execute/q10.py @@ -13,3 +13,7 @@ tool = "polars[lazy]" fn = IO_FUNCS[tool] print(q10.query(fn(customer), fn(nation), fn(lineitem), fn(orders)).collect()) + +tool = "pyarrow" +fn = IO_FUNCS[tool] +print(q10.query(fn(customer), fn(nation), fn(lineitem), fn(orders))) diff --git a/tpch/execute/q11.py b/tpch/execute/q11.py index 101710adb..a6b830f30 100644 --- a/tpch/execute/q11.py +++ b/tpch/execute/q11.py @@ -12,3 +12,7 @@ tool = "polars[lazy]" fn = IO_FUNCS[tool] print(q11.query(fn(nation), fn(partsupp), fn(supplier)).collect()) + +tool = "pyarrow" +fn = IO_FUNCS[tool] +print(q11.query(fn(nation), fn(partsupp), fn(supplier))) diff --git a/tpch/execute/q12.py b/tpch/execute/q12.py index b74742373..0cdc0378b 100644 --- a/tpch/execute/q12.py +++ b/tpch/execute/q12.py @@ -11,3 +11,7 @@ tool = "polars[lazy]" fn = IO_FUNCS[tool] print(q12.query(fn(line_item), fn(orders)).collect()) + +tool = "pyarrow" +fn = IO_FUNCS[tool] +print(q12.query(fn(line_item), fn(orders))) diff --git a/tpch/execute/q13.py b/tpch/execute/q13.py index 084fcca9b..b5e6c8bbe 100644 --- a/tpch/execute/q13.py +++ b/tpch/execute/q13.py @@ -11,3 +11,7 @@ tool = "polars[lazy]" fn = IO_FUNCS[tool] print(q13.query(fn(customer), fn(orders)).collect()) + +tool = "pyarrow" +fn = IO_FUNCS[tool] +print(q13.query(fn(customer), fn(orders))) diff --git a/tpch/execute/q14.py 
b/tpch/execute/q14.py index 57f83a595..1a89dbbbe 100644 --- a/tpch/execute/q14.py +++ b/tpch/execute/q14.py @@ -11,3 +11,7 @@ tool = "polars[lazy]" fn = IO_FUNCS[tool] print(q14.query(fn(line_item), fn(part)).collect()) + +tool = "pyarrow" +fn = IO_FUNCS[tool] +print(q14.query(fn(line_item), fn(part))) diff --git a/tpch/execute/q15.py b/tpch/execute/q15.py index 0d9e9f374..ac858841d 100644 --- a/tpch/execute/q15.py +++ b/tpch/execute/q15.py @@ -11,3 +11,7 @@ tool = "polars[lazy]" fn = IO_FUNCS[tool] print(q15.query(fn(lineitem), fn(supplier)).collect()) + +tool = "pyarrow" +fn = IO_FUNCS[tool] +print(q15.query(fn(lineitem), fn(supplier))) diff --git a/tpch/execute/q16.py b/tpch/execute/q16.py index 5176a5cc6..7fa6c72b0 100644 --- a/tpch/execute/q16.py +++ b/tpch/execute/q16.py @@ -12,3 +12,7 @@ tool = "polars[lazy]" fn = IO_FUNCS[tool] print(q16.query(fn(part), fn(partsupp), fn(supplier)).collect()) + +tool = "pyarrow" +fn = IO_FUNCS[tool] +print(q16.query(fn(part), fn(partsupp), fn(supplier))) diff --git a/tpch/execute/q17.py b/tpch/execute/q17.py index 2d9920c69..8eefb92dc 100644 --- a/tpch/execute/q17.py +++ b/tpch/execute/q17.py @@ -11,3 +11,7 @@ tool = "polars[lazy]" fn = IO_FUNCS[tool] print(q17.query(fn(lineitem), fn(part)).collect()) + +tool = "pyarrow" +fn = IO_FUNCS[tool] +print(q17.query(fn(lineitem), fn(part))) diff --git a/tpch/execute/q18.py b/tpch/execute/q18.py index 4092fc0d6..fdd50c095 100644 --- a/tpch/execute/q18.py +++ b/tpch/execute/q18.py @@ -12,3 +12,7 @@ tool = "polars[lazy]" fn = IO_FUNCS[tool] print(q18.query(fn(customer), fn(lineitem), fn(orders)).collect()) + +tool = "pyarrow" +fn = IO_FUNCS[tool] +print(q18.query(fn(customer), fn(lineitem), fn(orders))) diff --git a/tpch/execute/q19.py b/tpch/execute/q19.py index 87467064c..e1dff3eb5 100644 --- a/tpch/execute/q19.py +++ b/tpch/execute/q19.py @@ -4,14 +4,11 @@ from . import lineitem from . import part -fn = IO_FUNCS["pandas"] -print(q19.query(fn(lineitem), fn(part))) - fn = IO_FUNCS["pandas[pyarrow]"] print(q19.query(fn(lineitem), fn(part))) -fn = IO_FUNCS["polars[eager]"] -print(q19.query(fn(lineitem), fn(part))) - fn = IO_FUNCS["polars[lazy]"] print(q19.query(fn(lineitem), fn(part)).collect()) + +fn = IO_FUNCS["pyarrow"] +print(q19.query(fn(lineitem), fn(part))) diff --git a/tpch/execute/q20.py b/tpch/execute/q20.py index 68d18a6b5..d15f8c85f 100644 --- a/tpch/execute/q20.py +++ b/tpch/execute/q20.py @@ -7,14 +7,11 @@ from . import partsupp from . import supplier -fn = IO_FUNCS["pandas"] -print(q20.query(fn(part), fn(partsupp), fn(nation), fn(lineitem), fn(supplier))) - fn = IO_FUNCS["pandas[pyarrow]"] print(q20.query(fn(part), fn(partsupp), fn(nation), fn(lineitem), fn(supplier))) -fn = IO_FUNCS["polars[eager]"] -print(q20.query(fn(part), fn(partsupp), fn(nation), fn(lineitem), fn(supplier))) - fn = IO_FUNCS["polars[lazy]"] print(q20.query(fn(part), fn(partsupp), fn(nation), fn(lineitem), fn(supplier)).collect()) + +fn = IO_FUNCS["pyarrow"] +print(q20.query(fn(part), fn(partsupp), fn(nation), fn(lineitem), fn(supplier))) diff --git a/tpch/execute/q21.py b/tpch/execute/q21.py index 693953870..9940e6232 100644 --- a/tpch/execute/q21.py +++ b/tpch/execute/q21.py @@ -6,14 +6,11 @@ from . import orders from . 
import supplier -fn = IO_FUNCS["pandas"] -print(q21.query(fn(lineitem), fn(nation), fn(orders), fn(supplier))) - fn = IO_FUNCS["pandas[pyarrow]"] print(q21.query(fn(lineitem), fn(nation), fn(orders), fn(supplier))) -fn = IO_FUNCS["polars[eager]"] -print(q21.query(fn(lineitem), fn(nation), fn(orders), fn(supplier))) - fn = IO_FUNCS["polars[lazy]"] print(q21.query(fn(lineitem), fn(nation), fn(orders), fn(supplier)).collect()) + +fn = IO_FUNCS["pyarrow"] +print(q21.query(fn(lineitem), fn(nation), fn(orders), fn(supplier))) diff --git a/tpch/execute/q22.py b/tpch/execute/q22.py index 91ed46d9d..3b3fe523f 100644 --- a/tpch/execute/q22.py +++ b/tpch/execute/q22.py @@ -11,3 +11,7 @@ tool = "polars[lazy]" fn = IO_FUNCS[tool] print(q22.query(fn(customer), fn(orders)).collect()) + +tool = "pyarrow" +fn = IO_FUNCS[tool] +print(q22.query(fn(customer), fn(orders))) diff --git a/tpch/execute/q3.py b/tpch/execute/q3.py index a1eea74d1..f836fae27 100644 --- a/tpch/execute/q3.py +++ b/tpch/execute/q3.py @@ -12,3 +12,7 @@ tool = "polars[lazy]" fn = IO_FUNCS[tool] print(q3.query(fn(customer), fn(lineitem), fn(orders)).collect()) + +tool = "pyarrow" +fn = IO_FUNCS[tool] +print(q3.query(fn(customer), fn(lineitem), fn(orders))) diff --git a/tpch/execute/q4.py b/tpch/execute/q4.py index 79213f1ac..ca60f38ee 100644 --- a/tpch/execute/q4.py +++ b/tpch/execute/q4.py @@ -11,3 +11,7 @@ tool = "polars[lazy]" fn = IO_FUNCS[tool] print(q4.query(fn(line_item), fn(orders)).collect()) + +tool = "pyarrow" +fn = IO_FUNCS[tool] +print(q4.query(fn(line_item), fn(orders))) diff --git a/tpch/execute/q5.py b/tpch/execute/q5.py index 7a04dec1b..c343fea5d 100644 --- a/tpch/execute/q5.py +++ b/tpch/execute/q5.py @@ -23,3 +23,11 @@ fn(region), fn(nation), fn(customer), fn(line_item), fn(orders), fn(supplier) ).collect() ) + +tool = "pyarrow" +fn = IO_FUNCS[tool] +print( + q5.query( + fn(region), fn(nation), fn(customer), fn(line_item), fn(orders), fn(supplier) + ) +) diff --git a/tpch/execute/q6.py b/tpch/execute/q6.py index 402e6d452..eebf3f864 100644 --- a/tpch/execute/q6.py +++ b/tpch/execute/q6.py @@ -10,3 +10,7 @@ tool = "polars[lazy]" fn = IO_FUNCS[tool] print(q6.query(fn(lineitem)).collect()) + +tool = "pyarrow" +fn = IO_FUNCS[tool] +print(q6.query(fn(lineitem))) diff --git a/tpch/execute/q7.py b/tpch/execute/q7.py index 9f6179d23..c59f82ce7 100644 --- a/tpch/execute/q7.py +++ b/tpch/execute/q7.py @@ -16,3 +16,7 @@ print( q7.query(fn(nation), fn(customer), fn(lineitem), fn(orders), fn(supplier)).collect() ) + +tool = "pyarrow" +fn = IO_FUNCS[tool] +print(q7.query(fn(nation), fn(customer), fn(lineitem), fn(orders), fn(supplier))) diff --git a/tpch/execute/q8.py b/tpch/execute/q8.py index f1a8677ff..902a34e70 100644 --- a/tpch/execute/q8.py +++ b/tpch/execute/q8.py @@ -37,3 +37,17 @@ fn(region), ).collect() ) + +tool = "pyarrow" +fn = IO_FUNCS[tool] +print( + q8.query( + fn(part), + fn(supplier), + fn(lineitem), + fn(orders), + fn(customer), + fn(nation), + fn(region), + ) +) diff --git a/tpch/execute/q9.py b/tpch/execute/q9.py index e01dd0f2c..44d4154aa 100644 --- a/tpch/execute/q9.py +++ b/tpch/execute/q9.py @@ -21,3 +21,9 @@ fn(part), fn(partsupp), fn(nation), fn(lineitem), fn(orders), fn(supplier) ).collect() ) + +tool = "pyarrow" +fn = IO_FUNCS[tool] +print( + q9.query(fn(part), fn(partsupp), fn(nation), fn(lineitem), fn(orders), fn(supplier)) +) diff --git a/tpch/queries/q17.py b/tpch/queries/q17.py index 5d35929d1..976f476f0 100644 --- a/tpch/queries/q17.py +++ b/tpch/queries/q17.py @@ -14,8 +14,9 @@ def 
query(lineitem_ds: FrameT, part_ds: FrameT) -> FrameT: ) return ( - query1.group_by("p_partkey") - .agg((0.2 * nw.col("l_quantity").mean()).alias("avg_quantity")) + query1.with_columns(l_quantity_times_point_2=nw.col("l_quantity") * 0.2) + .group_by("p_partkey") + .agg(nw.col("l_quantity_times_point_2").mean().alias("avg_quantity")) .select(nw.col("p_partkey").alias("key"), nw.col("avg_quantity")) .join(query1, left_on="key", right_on="p_partkey") .filter(nw.col("l_quantity") < nw.col("avg_quantity")) diff --git a/tpch/queries/q8.py b/tpch/queries/q8.py index 3fba96313..ac3fa4baf 100644 --- a/tpch/queries/q8.py +++ b/tpch/queries/q8.py @@ -46,6 +46,7 @@ def query( .alias("_tmp") ) .group_by("o_year") - .agg((nw.sum("_tmp") / nw.sum("volume")).round(2).alias("mkt_share")) + .agg(_tmp_sum=nw.sum("_tmp"), volume_sum=nw.sum("volume")) + .select("o_year", mkt_share=nw.col("_tmp_sum") / nw.col("volume_sum")) .sort("o_year") ) From 5dc43000dfcd3e93c81f6a1cb90ea1d8bda38ffa Mon Sep 17 00:00:00 2001 From: Francesco Bruzzesi <42817048+FBruzzesi@users.noreply.github.com> Date: Mon, 9 Sep 2024 17:52:21 +0200 Subject: [PATCH 25/30] feat: join suffix (#934) --- narwhals/_arrow/dataframe.py | 5 ++-- narwhals/_arrow/expr.py | 4 ++- narwhals/_dask/dataframe.py | 7 +++-- narwhals/_dask/expr.py | 2 +- narwhals/_pandas_like/dataframe.py | 11 +++---- narwhals/_pandas_like/expr.py | 4 ++- narwhals/dataframe.py | 46 ++++++++++++++++-------------- tests/frame/join_test.py | 35 +++++++++++++++++++++++ 8 files changed, 80 insertions(+), 34 deletions(-) diff --git a/narwhals/_arrow/dataframe.py b/narwhals/_arrow/dataframe.py index 2750f8c09..fa5a69950 100644 --- a/narwhals/_arrow/dataframe.py +++ b/narwhals/_arrow/dataframe.py @@ -277,6 +277,7 @@ def join( how: Literal["left", "inner", "outer", "cross", "anti", "semi"] = "inner", left_on: str | list[str] | None, right_on: str | list[str] | None, + suffix: str, ) -> Self: how_to_join_map = { "anti": "left anti", @@ -298,7 +299,7 @@ def join( keys=key_token, right_keys=key_token, join_type="inner", - right_suffix="_right", + right_suffix=suffix, ) .drop([key_token]), ) @@ -309,7 +310,7 @@ def join( keys=left_on, right_keys=right_on, join_type=how_to_join_map[how], - right_suffix="_right", + right_suffix=suffix, ), ) diff --git a/narwhals/_arrow/expr.py b/narwhals/_arrow/expr.py index 593e73eb3..31052fa52 100644 --- a/narwhals/_arrow/expr.py +++ b/narwhals/_arrow/expr.py @@ -309,7 +309,9 @@ def func(df: ArrowDataFrame) -> list[ArrowSeries]: ) raise ValueError(msg) tmp = df.group_by(*keys).agg(self) - tmp = df.select(*keys).join(tmp, how="left", left_on=keys, right_on=keys) + tmp = df.select(*keys).join( + tmp, how="left", left_on=keys, right_on=keys, suffix="_right" + ) return [tmp[name] for name in self._output_names] return self.__class__( diff --git a/narwhals/_dask/dataframe.py b/narwhals/_dask/dataframe.py index 1a40d7a6c..e2a034ae2 100644 --- a/narwhals/_dask/dataframe.py +++ b/narwhals/_dask/dataframe.py @@ -208,6 +208,7 @@ def join( how: Literal["left", "inner", "outer", "cross", "anti", "semi"] = "inner", left_on: str | list[str] | None, right_on: str | list[str] | None, + suffix: str, ) -> Self: if how == "cross": key_token = generate_unique_token( @@ -221,7 +222,7 @@ def join( how="inner", left_on=key_token, right_on=key_token, - suffixes=("", "_right"), + suffixes=("", suffix), ) .drop(columns=key_token), ) @@ -273,7 +274,7 @@ def join( how="left", left_on=left_on, right_on=right_on, - suffixes=("", "_right"), + suffixes=("", suffix), ) extra = [] for 
left_key, right_key in zip(left_on, right_on): # type: ignore[arg-type] @@ -289,7 +290,7 @@ def join( left_on=left_on, right_on=right_on, how=how, - suffixes=("", "_right"), + suffixes=("", suffix), ), ) diff --git a/narwhals/_dask/expr.py b/narwhals/_dask/expr.py index e3030a787..730824508 100644 --- a/narwhals/_dask/expr.py +++ b/narwhals/_dask/expr.py @@ -629,7 +629,7 @@ def func(df: DaskLazyFrame) -> list[Any]: tmp = df.group_by(*keys).agg(self) tmp_native = ( df.select(*keys) - .join(tmp, how="left", left_on=keys, right_on=keys) + .join(tmp, how="left", left_on=keys, right_on=keys, suffix="_right") ._native_frame ) return [tmp_native[name] for name in self._output_names] diff --git a/narwhals/_pandas_like/dataframe.py b/narwhals/_pandas_like/dataframe.py index 4ec42ef59..59cff49fc 100644 --- a/narwhals/_pandas_like/dataframe.py +++ b/narwhals/_pandas_like/dataframe.py @@ -403,6 +403,7 @@ def join( how: Literal["left", "inner", "outer", "cross", "anti", "semi"] = "inner", left_on: str | list[str] | None, right_on: str | list[str] | None, + suffix: str, ) -> Self: if isinstance(left_on, str): left_on = [left_on] @@ -427,7 +428,7 @@ def join( how="inner", left_on=key_token, right_on=key_token, - suffixes=("", "_right"), + suffixes=("", suffix), ) .drop(columns=key_token), ) @@ -436,7 +437,7 @@ def join( self._native_frame.merge( other._native_frame, how="cross", - suffixes=("", "_right"), + suffixes=("", suffix), ), ) @@ -488,14 +489,14 @@ def join( how="left", left_on=left_on, right_on=right_on, - suffixes=("", "_right"), + suffixes=("", suffix), ) extra = [] for left_key, right_key in zip(left_on, right_on): # type: ignore[arg-type] if right_key != left_key and right_key not in self.columns: extra.append(right_key) elif right_key != left_key: - extra.append(f"{right_key}_right") + extra.append(f"{right_key}{suffix}") return self._from_native_frame(result_native.drop(columns=extra)) return self._from_native_frame( @@ -504,7 +505,7 @@ def join( left_on=left_on, right_on=right_on, how=how, - suffixes=("", "_right"), + suffixes=("", suffix), ), ) diff --git a/narwhals/_pandas_like/expr.py b/narwhals/_pandas_like/expr.py index 44154453d..8c3536c77 100644 --- a/narwhals/_pandas_like/expr.py +++ b/narwhals/_pandas_like/expr.py @@ -287,7 +287,9 @@ def func(df: PandasLikeDataFrame) -> list[PandasLikeSeries]: ) raise ValueError(msg) tmp = df.group_by(*keys).agg(self) - tmp = df.select(*keys).join(tmp, how="left", left_on=keys, right_on=keys) + tmp = df.select(*keys).join( + tmp, how="left", left_on=keys, right_on=keys, suffix="_right" + ) return [tmp[name] for name in self._output_names] return self.__class__( diff --git a/narwhals/dataframe.py b/narwhals/dataframe.py index ffd7ce36d..a266b73c7 100644 --- a/narwhals/dataframe.py +++ b/narwhals/dataframe.py @@ -182,11 +182,12 @@ def sort( def join( self, other: Self, - *, + on: str | list[str] | None = None, how: Literal["inner", "left", "cross", "semi", "anti"] = "inner", + *, left_on: str | list[str] | None = None, right_on: str | list[str] | None = None, - on: str | list[str] | None = None, + suffix: str = "_right", ) -> Self: _supported_joins = ("inner", "left", "cross", "anti", "semi") @@ -219,6 +220,7 @@ def join( how=how, left_on=left_on, right_on=right_on, + suffix=suffix, ) ) @@ -1850,30 +1852,29 @@ def sort( def join( self, other: Self, - *, + on: str | list[str] | None = None, how: Literal["inner", "left", "cross", "semi", "anti"] = "inner", + *, left_on: str | list[str] | None = None, right_on: str | list[str] | None = None, - on: str 
| list[str] | None = None, + suffix: str = "_right", ) -> Self: r""" Join in SQL-like fashion. Arguments: - other: DataFrame to join with. - + other: Lazy DataFrame to join with. + on: Name(s) of the join columns in both DataFrames. If set, `left_on` and + `right_on` should be None. how: Join strategy. * *inner*: Returns rows that have matching values in both tables. * *cross*: Returns the Cartesian product of rows from both tables. * *semi*: Filter rows that have a match in the right table. * *anti*: Filter rows that do not have a match in the right table. - - left_on: Name(s) of the left join column(s). - - right_on: Name(s) of the right join column(s). - - on: Join column of both DataFrames. If set, left_on and right_on should be None. + left_on: Join column of the left DataFrame. + right_on: Join column of the right DataFrame. + suffix: Suffix to append to columns with a duplicate name. Returns: A new joined DataFrame @@ -1922,7 +1923,9 @@ def join( │ 2 ┆ 7.0 ┆ b ┆ y │ └─────┴─────┴─────┴───────┘ """ - return super().join(other, how=how, left_on=left_on, right_on=right_on, on=on) + return super().join( + other, how=how, left_on=left_on, right_on=right_on, on=on, suffix=suffix + ) def join_asof( self, @@ -3578,30 +3581,29 @@ def sort( def join( self, other: Self, - *, + on: str | list[str] | None = None, how: Literal["inner", "left", "cross", "semi", "anti"] = "inner", + *, left_on: str | list[str] | None = None, right_on: str | list[str] | None = None, - on: str | list[str] | None = None, + suffix: str = "_right", ) -> Self: r""" Add a join operation to the Logical Plan. Arguments: other: Lazy DataFrame to join with. - + on: Name(s) of the join columns in both DataFrames. If set, `left_on` and + `right_on` should be None. how: Join strategy. * *inner*: Returns rows that have matching values in both tables. * *cross*: Returns the Cartesian product of rows from both tables. * *semi*: Filter rows that have a match in the right table. * *anti*: Filter rows that do not have a match in the right table. - left_on: Join column of the left DataFrame. - right_on: Join column of the right DataFrame. - - on: Join column of both DataFrames. If set, left_on and right_on should be None. + suffix: Suffix to append to columns with a duplicate name. 
Returns: A new joined LazyFrame @@ -3650,7 +3652,9 @@ def join( │ 2 ┆ 7.0 ┆ b ┆ y │ └─────┴─────┴─────┴───────┘ """ - return super().join(other, how=how, left_on=left_on, right_on=right_on, on=on) + return super().join( + other, how=how, left_on=left_on, right_on=right_on, on=on, suffix=suffix + ) def join_asof( self, diff --git a/tests/frame/join_test.py b/tests/frame/join_test.py index 6a1985f41..18e9aae64 100644 --- a/tests/frame/join_test.py +++ b/tests/frame/join_test.py @@ -89,6 +89,41 @@ def test_cross_join(constructor: Any) -> None: df.join(df, how="cross", left_on="antananarivo") # type: ignore[arg-type] +@pytest.mark.parametrize("how", ["inner", "left"]) +@pytest.mark.parametrize("suffix", ["_right", "_custom_suffix"]) +def test_suffix(constructor: Any, how: str, suffix: str) -> None: + data = { + "antananarivo": [1, 3, 2], + "bob": [4, 4, 6], + "zorro": [7.0, 8, 9], + } + df = nw.from_native(constructor(data)) + df_right = df + result = df.join( + df_right, # type: ignore[arg-type] + left_on=["antananarivo", "bob"], + right_on=["antananarivo", "bob"], + how=how, # type: ignore[arg-type] + suffix=suffix, + ) + result_cols = result.collect_schema().names() + assert result_cols == ["antananarivo", "bob", "zorro", f"zorro{suffix}"] + + +@pytest.mark.parametrize("suffix", ["_right", "_custom_suffix"]) +def test_cross_join_suffix(constructor: Any, suffix: str) -> None: + data = {"antananarivo": [1, 3, 2]} + df = nw.from_native(constructor(data)) + result = df.join(df, how="cross", suffix=suffix).sort( # type: ignore[arg-type] + "antananarivo", f"antananarivo{suffix}" + ) + expected = { + "antananarivo": [1, 1, 1, 2, 2, 2, 3, 3, 3], + f"antananarivo{suffix}": [1, 2, 3, 1, 2, 3, 1, 2, 3], + } + compare_dicts(result, expected) + + def test_cross_join_non_pandas() -> None: data = {"antananarivo": [1, 3, 2]} df = nw.from_native(pd.DataFrame(data)) From b906621bbd3672b043c1a6535a5a0a83aabb6c94 Mon Sep 17 00:00:00 2001 From: Liam Connors Date: Tue, 10 Sep 2024 00:02:58 -0400 Subject: [PATCH 26/30] docs: fix `maybe_align_index` docstring formatting (#938) --- narwhals/stable/v1.py | 2 +- narwhals/utils.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/narwhals/stable/v1.py b/narwhals/stable/v1.py index 1af7a26f3..78cfa5ba1 100644 --- a/narwhals/stable/v1.py +++ b/narwhals/stable/v1.py @@ -1541,7 +1541,7 @@ def is_ordered_categorical(series: Series) -> bool: def maybe_align_index(lhs: T, rhs: Series | DataFrame[Any] | LazyFrame[Any]) -> T: """ - Align `lhs` to the Index of `rhs, if they're both pandas-like. + Align `lhs` to the Index of `rhs`, if they're both pandas-like. Notes: This is only really intended for backwards-compatibility purposes, diff --git a/narwhals/utils.py b/narwhals/utils.py index 6c1b5c1b4..ec3c722d4 100644 --- a/narwhals/utils.py +++ b/narwhals/utils.py @@ -153,7 +153,7 @@ def validate_laziness(items: Iterable[Any]) -> None: def maybe_align_index(lhs: T, rhs: Series | BaseFrame[Any]) -> T: """ - Align `lhs` to the Index of `rhs, if they're both pandas-like. + Align `lhs` to the Index of `rhs`, if they're both pandas-like. 
Notes: This is only really intended for backwards-compatibility purposes, From be95f2e83615dd93c7a55619236e54e98be73a18 Mon Sep 17 00:00:00 2001 From: Liam Connors Date: Tue, 10 Sep 2024 03:22:55 -0400 Subject: [PATCH 27/30] fix: Update `copy` param on `to_numpy` to default to `True` for cuDF (#937) * set copy=True for cuDF * set copy=True for cuDF series --- narwhals/_pandas_like/dataframe.py | 4 ++-- narwhals/_pandas_like/series.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/narwhals/_pandas_like/dataframe.py b/narwhals/_pandas_like/dataframe.py index 59cff49fc..499777833 100644 --- a/narwhals/_pandas_like/dataframe.py +++ b/narwhals/_pandas_like/dataframe.py @@ -590,8 +590,8 @@ def to_numpy(self, dtype: Any = None, copy: bool | None = None) -> Any: from narwhals._pandas_like.series import PANDAS_TO_NUMPY_DTYPE_MISSING if copy is None: - # pandas default differs from Polars - copy = False + # pandas default differs from Polars, but cuDF default is True + copy = self._implementation is Implementation.CUDF if dtype is not None: return self._native_frame.to_numpy(dtype=dtype, copy=copy) diff --git a/narwhals/_pandas_like/series.py b/narwhals/_pandas_like/series.py index b28a04088..a0830784f 100644 --- a/narwhals/_pandas_like/series.py +++ b/narwhals/_pandas_like/series.py @@ -473,7 +473,7 @@ def __array__(self, dtype: Any = None, copy: bool | None = None) -> Any: def to_numpy(self, dtype: Any = None, copy: bool | None = None) -> Any: # the default is meant to be None, but pandas doesn't allow it? # https://numpy.org/doc/stable/reference/generated/numpy.ndarray.__array__.html - copy = copy or False + copy = copy or self._implementation is Implementation.CUDF has_missing = self._native_series.isna().any() if ( From 270adbd432259f21a24f8f9a4f3121ee19a3d646 Mon Sep 17 00:00:00 2001 From: Francesco Bruzzesi <42817048+FBruzzesi@users.noreply.github.com> Date: Tue, 10 Sep 2024 09:23:41 +0200 Subject: [PATCH 28/30] chore: ban dask boolean mask filtering (#939) --- narwhals/_dask/dataframe.py | 15 ++++++++------- tests/frame/filter_test.py | 19 ++++++++++++++++--- 2 files changed, 24 insertions(+), 10 deletions(-) diff --git a/narwhals/_dask/dataframe.py b/narwhals/_dask/dataframe.py index e2a034ae2..d4433fb39 100644 --- a/narwhals/_dask/dataframe.py +++ b/narwhals/_dask/dataframe.py @@ -79,14 +79,15 @@ def filter( and isinstance(predicates[0], list) and all(isinstance(x, bool) for x in predicates[0]) ): - mask = predicates[0] - else: - from narwhals._dask.namespace import DaskNamespace + msg = "Filtering with boolean mask is not supported for `DaskLazyFrame`" + raise NotImplementedError(msg) + + from narwhals._dask.namespace import DaskNamespace - plx = DaskNamespace(backend_version=self._backend_version) - expr = plx.all_horizontal(*predicates) - # Safety: all_horizontal's expression only returns a single column. - mask = expr._call(self)[0] + plx = DaskNamespace(backend_version=self._backend_version) + expr = plx.all_horizontal(*predicates) + # Safety: all_horizontal's expression only returns a single column. 
+ mask = expr._call(self)[0] return self._from_native_frame(self._native_frame.loc[mask]) def lazy(self) -> Self: diff --git a/tests/frame/filter_test.py b/tests/frame/filter_test.py index a8d3144aa..609f8ef91 100644 --- a/tests/frame/filter_test.py +++ b/tests/frame/filter_test.py @@ -1,5 +1,8 @@ +from contextlib import nullcontext as does_not_raise from typing import Any +import pytest + import narwhals.stable.v1 as nw from tests.utils import compare_dicts @@ -15,6 +18,16 @@ def test_filter(constructor: Any) -> None: def test_filter_with_boolean_list(constructor: Any) -> None: data = {"a": [1, 3, 2], "b": [4, 4, 6], "z": [7.0, 8, 9]} df = nw.from_native(constructor(data)) - result = df.filter([False, True, True]) - expected = {"a": [3, 2], "b": [4, 6], "z": [8.0, 9.0]} - compare_dicts(result, expected) + + context = ( + pytest.raises( + NotImplementedError, match="Filtering with boolean mask is not supported" + ) + if "dask" in str(constructor) + else does_not_raise() + ) + + with context: + result = df.filter([False, True, True]) + expected = {"a": [3, 2], "b": [4, 6], "z": [8.0, 9.0]} + compare_dicts(result, expected) From 359905b95f8c76fdb7eaaf48cb77cc08eafd6209 Mon Sep 17 00:00:00 2001 From: raisadz <34237447+raisadz@users.noreply.github.com> Date: Tue, 10 Sep 2024 14:11:27 +0100 Subject: [PATCH 29/30] support `__getitem__` with single tuple of column names (#935) --- narwhals/_arrow/dataframe.py | 9 ++++++++- narwhals/_pandas_like/dataframe.py | 17 ++++++++++++++--- narwhals/dataframe.py | 8 +++++++- narwhals/stable/v1.py | 5 +++-- tests/frame/slice_test.py | 3 +++ 5 files changed, 35 insertions(+), 7 deletions(-) diff --git a/narwhals/_arrow/dataframe.py b/narwhals/_arrow/dataframe.py index fa5a69950..f409ef735 100644 --- a/narwhals/_arrow/dataframe.py +++ b/narwhals/_arrow/dataframe.py @@ -121,7 +121,12 @@ def __getitem__(self, item: str) -> ArrowSeries: ... def __getitem__(self, item: slice) -> ArrowDataFrame: ... def __getitem__( - self, item: str | slice | Sequence[int] | tuple[Sequence[int], str | int] + self, + item: str + | slice + | Sequence[int] + | Sequence[str] + | tuple[Sequence[int], str | int], ) -> ArrowSeries | ArrowDataFrame: if isinstance(item, str): from narwhals._arrow.series import ArrowSeries @@ -191,6 +196,8 @@ def __getitem__( ) elif isinstance(item, Sequence) or (is_numpy_array(item) and item.ndim == 1): + if isinstance(item, Sequence) and all(isinstance(x, str) for x in item): + return self._from_native_frame(self._native_frame.select(item)) return self._from_native_frame(self._native_frame.take(item)) else: # pragma: no cover diff --git a/narwhals/_pandas_like/dataframe.py b/narwhals/_pandas_like/dataframe.py index 499777833..71a659998 100644 --- a/narwhals/_pandas_like/dataframe.py +++ b/narwhals/_pandas_like/dataframe.py @@ -111,13 +111,22 @@ def __getitem__(self, item: tuple[Sequence[int], str | int]) -> PandasLikeSeries def __getitem__(self, item: Sequence[int]) -> PandasLikeDataFrame: ... @overload - def __getitem__(self, item: str) -> PandasLikeSeries: ... + def __getitem__(self, item: str) -> PandasLikeSeries: ... # type: ignore[overload-overlap] + + @overload + def __getitem__(self, item: Sequence[str]) -> PandasLikeDataFrame: ... @overload def __getitem__(self, item: slice) -> PandasLikeDataFrame: ... 
def __getitem__( - self, item: str | slice | Sequence[int] | tuple[Sequence[int], str | int] + self, + item: str + | int + | slice + | Sequence[int] + | Sequence[str] + | tuple[Sequence[int], str | int], ) -> PandasLikeSeries | PandasLikeDataFrame: if isinstance(item, str): from narwhals._pandas_like.series import PandasLikeSeries @@ -174,7 +183,7 @@ def __getitem__( from narwhals._pandas_like.series import PandasLikeSeries if isinstance(item[1], str): - item = (item[0], self._native_frame.columns.get_loc(item[1])) + item = (item[0], self._native_frame.columns.get_loc(item[1])) # type: ignore[assignment] native_series = self._native_frame.iloc[item] elif isinstance(item[1], int): native_series = self._native_frame.iloc[item] @@ -191,6 +200,8 @@ def __getitem__( elif isinstance(item, (slice, Sequence)) or ( is_numpy_array(item) and item.ndim == 1 ): + if isinstance(item, Sequence) and all(isinstance(x, str) for x in item): + return self._from_native_frame(self._native_frame.loc[:, item]) return self._from_native_frame(self._native_frame.iloc[item]) else: # pragma: no cover diff --git a/narwhals/dataframe.py b/narwhals/dataframe.py index a266b73c7..1b91f0910 100644 --- a/narwhals/dataframe.py +++ b/narwhals/dataframe.py @@ -612,7 +612,10 @@ def __getitem__(self, item: tuple[Sequence[int], int]) -> Series: ... # type: i def __getitem__(self, item: Sequence[int]) -> Self: ... @overload - def __getitem__(self, item: str) -> Series: ... + def __getitem__(self, item: str) -> Series: ... # type: ignore[overload-overlap] + + @overload + def __getitem__(self, item: Sequence[str]) -> Self: ... @overload def __getitem__(self, item: slice) -> Self: ... @@ -622,6 +625,7 @@ def __getitem__( item: str | slice | Sequence[int] + | Sequence[str] | tuple[Sequence[int], str | int] | tuple[slice | Sequence[int], Sequence[int] | Sequence[str] | slice], ) -> Series | Self: @@ -644,6 +648,8 @@ def __getitem__( `DataFrame`. - `df[:, ['a', 'c']]` extracts all rows and columns `'a'` and `'c'` and returns a `DataFrame`. + - `df[['a', 'c']]` extracts all rows and columns `'a'` and `'c'` and returns a + `DataFrame`. - `df[0: 2, ['a', 'c']]` extracts the first two rows and columns `'a'` and `'c'` and returns a `DataFrame` - `df[:, 0: 2]` extracts all rows from the first two columns and returns a `DataFrame` diff --git a/narwhals/stable/v1.py b/narwhals/stable/v1.py index 78cfa5ba1..862ba5d1a 100644 --- a/narwhals/stable/v1.py +++ b/narwhals/stable/v1.py @@ -91,9 +91,10 @@ def __getitem__(self, item: tuple[Sequence[int], int]) -> Series: ... # type: i @overload def __getitem__(self, item: Sequence[int]) -> Self: ... - @overload - def __getitem__(self, item: str) -> Series: ... + def __getitem__(self, item: str) -> Series: ... # type: ignore[overload-overlap] + @overload + def __getitem__(self, item: Sequence[str]) -> Self: ... @overload def __getitem__(self, item: slice) -> Self: ... 
diff --git a/tests/frame/slice_test.py b/tests/frame/slice_test.py
index 18b05bf3b..834e88bff 100644
--- a/tests/frame/slice_test.py
+++ b/tests/frame/slice_test.py
@@ -147,6 +147,9 @@ def test_slice_slice_columns(constructor_eager: Any) -> None:
     result = df[:, [0, 2]]
     expected = {"a": [1, 2, 3], "c": [7, 8, 9]}
     compare_dicts(result, expected)
+    result = df[["b", "c"]]
+    expected = {"b": [4, 5, 6], "c": [7, 8, 9]}
+    compare_dicts(result, expected)
 
 
 def test_slice_invalid(constructor_eager: Any) -> None:

From e9afffd233ed4b4df5364dc8c16ba00e16f86871 Mon Sep 17 00:00:00 2001
From: Aidos Kanapyanov <65722512+aidoskanapyanov@users.noreply.github.com>
Date: Tue, 10 Sep 2024 20:44:45 +0500
Subject: [PATCH 30/30] docs: add `appears on` section with links to relevant podcasts/blogs (#941)

* docs: add `appears on` section with links to relevant podcasts/blogs

* add "talk python to me" as well

Co-authored-by: Marco Edward Gorelli

* Update README.md

---------

Co-authored-by: Marco Edward Gorelli
---
 README.md | 25 +++++++++++++++++++++++++
 1 file changed, 25 insertions(+)

diff --git a/README.md b/README.md
index d26107e67..74630fd03 100644
--- a/README.md
+++ b/README.md
@@ -113,6 +113,31 @@ provided some funding / development time:
 If you contribute to Narwhals on your organization's time, please let us know. We'd be
 happy to add your employer to this list!
 
+## Appears on
+
+Narwhals has been featured in several talks, podcasts, and blog posts:
+
+- [Talk Python to me Podcast](https://youtu.be/FSH7BZ0tuE0)
+  Ahoy, Narwhals are bridging the data science APIs
+
+- [Super Data Science: ML & AI Podcast](https://www.youtube.com/watch?v=TeG4U8R0U8U)
+  Narwhals: For Pandas-to-Polars DataFrame Compatibility
+
+- [Sample Space Podcast | probabl](https://youtu.be/8hYdq4sWbbQ?si=WG0QP1CZ6gkFf18b)
+  How Narwhals has many end users ... that never use it directly. - Marco Gorelli
+
+- [Pycon Lithuania](https://www.youtube.com/watch?v=-mdx7Cn6_6E)
+  Marco Gorelli - DataFrame interoperability - what's been achieved, and what comes next?
+
+- [Pycon Italy](https://www.youtube.com/watch?v=3IqUli9XsmQ)
+  How you can write a dataframe-agnostic library - Marco Gorelli
+
+- [Polars Blog Post](https://pola.rs/posts/lightweight_plotting/)
+  Polars has a new lightweight plotting backend
+
+- [Quansight Labs blog post (w/ Scikit-Lego)](https://labs.quansight.org/blog/scikit-lego-narwhals)
+  How Narwhals and scikit-lego came together to achieve dataframe-agnosticism
+
 ## Why "Narwhals"?
 
 [Coz they are so awesome](https://youtu.be/ykwqXuMPsoc?si=A-i8LdR38teYsos4).
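As a usage recap of two user-facing changes in this series, here is a small end-to-end sketch combining the `suffix` parameter for `join` (PATCH 25) and column selection via a sequence of names in `__getitem__` (PATCH 29). It assumes `pandas` plus a narwhals build that includes both patches; the data and column names are illustrative.

```python
import pandas as pd
import narwhals.stable.v1 as nw

df = nw.from_native(
    pd.DataFrame({"a": [1, 2, 3], "b": [4.0, 5.0, 6.0]}), eager_only=True
)

# PATCH 25: `suffix` controls how clashing right-hand columns are renamed.
joined = df.join(df, on="a", how="inner", suffix="_other")
print(joined.columns)  # ['a', 'b', 'b_other']

# PATCH 29: a list of column names now selects columns, returning a DataFrame;
# a single name still returns a Series.
print(joined[["a", "b_other"]].to_native())
print(joined["b"].to_native())
```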