diff --git a/.github/workflows/extremes.yml b/.github/workflows/extremes.yml index f7d40c414..654c0188d 100644 --- a/.github/workflows/extremes.yml +++ b/.github/workflows/extremes.yml @@ -31,7 +31,7 @@ jobs: - name: install-modin run: python -m pip install pandas==2.0.0 polars==0.20.13 modin[dask] - name: Run pytest - run: pytest tests --cov=narwhals --cov=tests --cov-fail-under=50 + run: pytest tests --cov=narwhals --cov=tests --cov-fail-under=50 --runslow - name: Run doctests run: pytest narwhals --doctest-modules @@ -64,7 +64,7 @@ jobs: - name: install-pandas-nightly run: python -m pip install --pre --extra-index https://pypi.anaconda.org/scientific-python-nightly-wheels/simple pandas - name: Run pytest - run: pytest tests --cov=narwhals --cov=tests --cov-fail-under=50 + run: pytest tests --cov=narwhals --cov=tests --cov-fail-under=50 --runslow - name: Run doctests run: pytest narwhals --doctest-modules diff --git a/.github/workflows/pytest.yml b/.github/workflows/pytest.yml index e9f5f9ba9..723fc827f 100644 --- a/.github/workflows/pytest.yml +++ b/.github/workflows/pytest.yml @@ -60,6 +60,6 @@ jobs: - name: install-modin run: python -m pip install --upgrade modin[dask] - name: Run pytest - run: pytest tests --cov=narwhals --cov=tests --cov-fail-under=100 + run: pytest tests --cov=narwhals --cov=tests --cov-fail-under=100 --runslow - name: Run doctests run: pytest narwhals --doctest-modules diff --git a/narwhals/_pandas_like/dataframe.py b/narwhals/_pandas_like/dataframe.py index 15f4f3d6f..4bf7271ab 100644 --- a/narwhals/_pandas_like/dataframe.py +++ b/narwhals/_pandas_like/dataframe.py @@ -108,7 +108,7 @@ def select( new_series = evaluate_into_exprs(self, *exprs, **named_exprs) new_series = validate_indices(new_series) df = horizontal_concat( - [series._series for series in new_series], + new_series, implementation=self._implementation, ) return self._from_dataframe(df) @@ -227,6 +227,17 @@ def to_dict(self, *, as_series: bool = False) -> dict[str, Any]: return self._dataframe.to_dict(orient="list") # type: ignore[no-any-return] def to_numpy(self) -> Any: + from narwhals._pandas_like.series import PANDAS_TO_NUMPY_DTYPE_MISSING + + # pandas return `object` dtype for nullable dtypes, so we cast each + # Series to numpy and let numpy find a common dtype. + # If there aren't any dtypes where `to_numpy()` is "broken" (i.e. it + # returns Object) then we just call `to_numpy()` on the DataFrame. 
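+        # For example, `pd.Series([1, None], dtype="Int64").to_numpy()` gives an
+        # `object` array holding Python ints and `pd.NA`; converting column by
+        # column lets us return a plain float64 array with NaN for missing values.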
+ for dtype in self._dataframe.dtypes: + if str(dtype) in PANDAS_TO_NUMPY_DTYPE_MISSING: + import numpy as np + + return np.hstack([self[col].to_numpy()[:, None] for col in self.columns]) return self._dataframe.to_numpy() def to_pandas(self) -> Any: diff --git a/narwhals/_pandas_like/series.py b/narwhals/_pandas_like/series.py index b4f687096..f02a86aea 100644 --- a/narwhals/_pandas_like/series.py +++ b/narwhals/_pandas_like/series.py @@ -1,5 +1,6 @@ from __future__ import annotations +import warnings from typing import TYPE_CHECKING from typing import Any from typing import Sequence @@ -16,6 +17,51 @@ from narwhals._pandas_like.namespace import PandasNamespace from narwhals.dtypes import DType +PANDAS_TO_NUMPY_DTYPE_NO_MISSING = { + "Int64": "int64", + "int64[pyarrow]": "int64", + "Int32": "int32", + "int32[pyarrow]": "int32", + "Int16": "int16", + "int16[pyarrow]": "int16", + "Int8": "int8", + "int8[pyarrow]": "int8", + "UInt64": "uint64", + "uint64[pyarrow]": "uint64", + "UInt32": "uint32", + "uint32[pyarrow]": "uint32", + "UInt16": "uint16", + "uint16[pyarrow]": "uint16", + "UInt8": "uint8", + "uint8[pyarrow]": "uint8", + "Float64": "float64", + "float64[pyarrow]": "float64", + "Float32": "float32", + "float32[pyarrow]": "float32", +} +PANDAS_TO_NUMPY_DTYPE_MISSING = { + "Int64": "float64", + "int64[pyarrow]": "float64", + "Int32": "float64", + "int32[pyarrow]": "float64", + "Int16": "float64", + "int16[pyarrow]": "float64", + "Int8": "float64", + "int8[pyarrow]": "float64", + "UInt64": "float64", + "uint64[pyarrow]": "float64", + "UInt32": "float64", + "uint32[pyarrow]": "float64", + "UInt16": "float64", + "uint16[pyarrow]": "float64", + "UInt8": "float64", + "uint8[pyarrow]": "float64", + "Float64": "float64", + "float64[pyarrow]": "float64", + "Float32": "float32", + "float32[pyarrow]": "float32", +} + class PandasSeries: def __init__( @@ -102,7 +148,14 @@ def is_in(self, other: Any) -> PandasSeries: import pandas as pd ser = self._series - res = ser.isin(other).convert_dtypes() + with warnings.catch_warnings(): + # np.find_common_type is deprecated. Please use `np.result_type` or `np.promote_types` + warnings.filterwarnings( + "ignore", + message="np.find_common_type is deprecated.*", + category=DeprecationWarning, + ) + res = ser.isin(other).convert_dtypes() res[ser.isna()] = pd.NA return self._from_series(res) @@ -317,6 +370,19 @@ def alias(self, name: str) -> Self: return self._from_series(self._rename(ser, name)) def to_numpy(self) -> Any: + has_missing = self._series.isna().any() + if has_missing and str(self._series.dtype) in PANDAS_TO_NUMPY_DTYPE_MISSING: + return self._series.to_numpy( + dtype=PANDAS_TO_NUMPY_DTYPE_MISSING[str(self._series.dtype)], + na_value=float("nan"), + ) + if ( + not has_missing + and str(self._series.dtype) in PANDAS_TO_NUMPY_DTYPE_NO_MISSING + ): + return self._series.to_numpy( + dtype=PANDAS_TO_NUMPY_DTYPE_NO_MISSING[str(self._series.dtype)] + ) return self._series.to_numpy() def to_pandas(self) -> Any: diff --git a/narwhals/_pandas_like/utils.py b/narwhals/_pandas_like/utils.py index cec39aeae..5d8518200 100644 --- a/narwhals/_pandas_like/utils.py +++ b/narwhals/_pandas_like/utils.py @@ -49,11 +49,7 @@ def validate_column_comparand(index: Any, other: Any) -> Any: # broadcast return other.item() if other._series.index is not index and not (other._series.index == index).all(): - msg = ( - "Narwhals does not support automated index alignment. " - "You may need to do a join before this operation." 
- ) - raise ValueError(msg) + return other._series.set_axis(index, axis=0) return other._series return other @@ -74,11 +70,7 @@ def validate_dataframe_comparand(index: Any, other: Any) -> Any: # broadcast return item(other._series) if other._series.index is not index and not (other._series.index == index).all(): - msg = ( - "Narwhals does not support automated index alignment. " - "You may need to do a join before this operation." - ) - raise ValueError(msg) + return other._series.set_axis(index, axis=0) return other._series raise AssertionError("Please report a bug") @@ -362,13 +354,12 @@ def reverse_translate_dtype(dtype: DType | type[DType]) -> Any: raise AssertionError(msg) -def validate_indices(series: list[PandasSeries]) -> list[PandasSeries]: +def validate_indices(series: list[PandasSeries]) -> list[Any]: idx = series[0]._series.index + reindexed = [series[0]._series] for s in series[1:]: if s._series.index is not idx and not (s._series.index == idx).all(): - msg = ( - "Narwhals does not support automated index alignment. " - "You may need to do a join before this operation." - ) - raise RuntimeError(msg) - return series + reindexed.append(s._series.set_axis(idx.rename(s._series.index.name), axis=0)) + else: + reindexed.append(s._series) + return reindexed diff --git a/narwhals/dependencies.py b/narwhals/dependencies.py index be68a4d92..8a117d5dd 100644 --- a/narwhals/dependencies.py +++ b/narwhals/dependencies.py @@ -21,7 +21,7 @@ def get_pandas() -> Any: @functools.lru_cache -def get_modin() -> Any: +def get_modin() -> Any: # pragma: no cover try: import modin.pandas as mpd except ImportError: # pragma: no cover diff --git a/tests/conftest.py b/tests/conftest.py new file mode 100644 index 000000000..f0316826a --- /dev/null +++ b/tests/conftest.py @@ -0,0 +1,23 @@ +from typing import Any + +import pytest + + +def pytest_addoption(parser: Any) -> None: + parser.addoption( + "--runslow", action="store_true", default=False, help="run slow tests" + ) + + +def pytest_configure(config: Any) -> None: + config.addinivalue_line("markers", "slow: mark test as slow to run") + + +def pytest_collection_modifyitems(config: Any, items: Any) -> Any: # pragma: no cover + if config.getoption("--runslow"): + # --runslow given in cli: do not skip slow tests + return + skip_slow = pytest.mark.skip(reason="need --runslow option to run") + for item in items: + if "slow" in item.keywords: + item.add_marker(skip_slow) diff --git a/tests/hypothesis/test_join.py b/tests/hypothesis/test_join.py index 58c5e5386..4bb2e64dd 100644 --- a/tests/hypothesis/test_join.py +++ b/tests/hypothesis/test_join.py @@ -2,6 +2,7 @@ import pandas as pd import polars as pl +import pytest from hypothesis import example from hypothesis import given from hypothesis import strategies as st @@ -34,6 +35,7 @@ unique=True, ), ) # type: ignore[misc] +@pytest.mark.slow() def test_join( integers: st.SearchStrategy[list[int]], other_integers: st.SearchStrategy[list[int]], diff --git a/tests/test_common.py b/tests/test_common.py index c13d95710..71f860b0c 100644 --- a/tests/test_common.py +++ b/tests/test_common.py @@ -322,7 +322,9 @@ def test_convert_pandas(df_raw: Any) -> None: pd.testing.assert_frame_equal(result, expected) -@pytest.mark.parametrize("df_raw", [df_polars, df_pandas, df_mpd]) +@pytest.mark.parametrize( + "df_raw", [df_polars, df_pandas, df_mpd, df_pandas_nullable, df_pandas_pyarrow] +) @pytest.mark.filterwarnings( r"ignore:np\.find_common_type is deprecated\.:DeprecationWarning" ) @@ -330,6 +332,7 @@ def 
test_convert_numpy(df_raw: Any) -> None: result = nw.DataFrame(df_raw).to_numpy() expected = np.array([[1, 3, 2], [4, 4, 6], [7.0, 8, 9]]).T np.testing.assert_array_equal(result, expected) + assert result.dtype == "float64" @pytest.mark.parametrize("df_raw", [df_polars, df_pandas, df_mpd]) @@ -569,15 +572,19 @@ def test_invalid() -> None: @pytest.mark.parametrize("df_raw", [df_pandas]) def test_reindex(df_raw: Any) -> None: df = nw.DataFrame(df_raw) - with pytest.raises(RuntimeError, match="automated index alignment"): - df.select("a", df["b"].sort(descending=True)) - with pytest.raises(RuntimeError, match="automated index alignment"): - df.select("a", nw.col("b").sort(descending=True)) + result = df.select("b", df["a"].sort(descending=True)) + expected = {"b": [4, 4, 6], "a": [3, 2, 1]} + compare_dicts(result, expected) + result = df.select("b", nw.col("a").sort(descending=True)) + compare_dicts(result, expected) s = df["a"] - with pytest.raises(ValueError, match="index alignment"): - nw.to_native(s > s.sort()) - with pytest.raises(ValueError, match="index alignment"): - nw.to_native(df.with_columns(s.sort())) + result_s = s > s.sort() + assert not result_s[0] + assert result_s[1] + assert not result_s[2] + result = df.with_columns(s.sort()) + expected = {"a": [1, 2, 3], "b": [4, 4, 6], "z": [7.0, 8.0, 9.0]} # type: ignore[list-item] + compare_dicts(result, expected) with pytest.raises(ValueError, match="Multi-output expressions are not supported"): nw.to_native(df.with_columns(nw.all() + nw.all())) diff --git a/tests/test_series.py b/tests/test_series.py index 941883e9c..ddf72f6c9 100644 --- a/tests/test_series.py +++ b/tests/test_series.py @@ -40,13 +40,14 @@ def test_len(df_raw: Any) -> None: result = len(nw.Series(df_raw["a"])) assert result == 3 - result = len(nw.to_native(nw.LazyFrame(df_raw).collect()["a"])) + result = len(nw.LazyFrame(df_raw).collect()["a"]) assert result == 3 @pytest.mark.parametrize("df_raw", [df_pandas, df_polars]) +@pytest.mark.filterwarnings("ignore:np.find_common_type is deprecated:DeprecationWarning") def test_is_in(df_raw: Any) -> None: - result = nw.to_native(nw.Series(df_raw["a"]).is_in([1, 2])) + result = nw.Series(df_raw["a"]).is_in([1, 2]) assert result[0] assert not result[1] assert result[2] @@ -55,11 +56,11 @@ def test_is_in(df_raw: Any) -> None: @pytest.mark.parametrize("df_raw", [df_pandas, df_polars]) def test_gt(df_raw: Any) -> None: s = nw.Series(df_raw["a"]) - result = nw.to_native(s > s) # noqa: PLR0124 + result = s > s # noqa: PLR0124 assert not result[0] assert not result[1] assert not result[2] - result = nw.to_native(s > 1) + result = s > 1 assert not result[0] assert result[1] assert result[2] @@ -285,3 +286,9 @@ def test_cast() -> None: n=df["m"].cast(nw.Boolean), ).schema assert result == expected + + +def test_to_numpy() -> None: + s = pd.Series([1, 2, None], dtype="Int64") + result = nw.Series(s).to_numpy() + assert result.dtype == "float64" diff --git a/tpch/notebooks/q8/execute.ipynb b/tpch/notebooks/q8/execute.ipynb new file mode 100755 index 000000000..3d7745b8b --- /dev/null +++ b/tpch/notebooks/q8/execute.ipynb @@ -0,0 +1,668 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "0385ed89", + "metadata": { + "execution": { + "iopub.execute_input": "2024-03-22T17:23:42.216843Z", + "iopub.status.busy": "2024-03-22T17:23:42.216251Z", + "iopub.status.idle": "2024-03-22T17:24:15.599245Z", + "shell.execute_reply": "2024-03-22T17:24:15.598103Z" + }, + "papermill": { + "duration": 33.390992, + "end_time": 
"2024-03-22T17:24:15.601719", + "exception": false, + "start_time": "2024-03-22T17:23:42.210727", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Requirement already satisfied: pandas in /opt/conda/lib/python3.10/site-packages (2.2.1)\r\n", + "Requirement already satisfied: polars in /opt/conda/lib/python3.10/site-packages (0.20.15)\r\n", + "Collecting polars\r\n", + " Downloading polars-0.20.16-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (15 kB)\r\n", + "Requirement already satisfied: pyarrow in /opt/conda/lib/python3.10/site-packages (15.0.2)\r\n", + "Requirement already satisfied: numpy<2,>=1.22.4 in /opt/conda/lib/python3.10/site-packages (from pandas) (1.26.4)\r\n", + "Requirement already satisfied: python-dateutil>=2.8.2 in /opt/conda/lib/python3.10/site-packages (from pandas) (2.9.0.post0)\r\n", + "Requirement already satisfied: pytz>=2020.1 in /opt/conda/lib/python3.10/site-packages (from pandas) (2023.3.post1)\r\n", + "Requirement already satisfied: tzdata>=2022.7 in /opt/conda/lib/python3.10/site-packages (from pandas) (2023.4)\r\n", + "Requirement already satisfied: six>=1.5 in /opt/conda/lib/python3.10/site-packages (from python-dateutil>=2.8.2->pandas) (1.16.0)\r\n", + "Downloading polars-0.20.16-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (26.3 MB)\r\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m26.3/26.3 MB\u001b[0m \u001b[31m67.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\r\n", + "\u001b[?25hInstalling collected packages: polars\r\n", + " Attempting uninstall: polars\r\n", + " Found existing installation: polars 0.20.15\r\n", + " Uninstalling polars-0.20.15:\r\n", + " Successfully uninstalled polars-0.20.15\r\n", + "Successfully installed polars-0.20.16\r\n" + ] + } + ], + "source": [ + "!pip install -U polars pyarrow && pip uninstall pandas -y && pip install --pre --extra-index https://pypi.anaconda.org/scientific-python-nightly-wheels/simple pandas" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "92c1bf9f", + "metadata": {}, + "outputs": [], + "source": [ + "!pip install -U narwhals" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "b969208d", + "metadata": { + "execution": { + "iopub.execute_input": "2024-03-22T17:24:38.152925Z", + "iopub.status.busy": "2024-03-22T17:24:38.152552Z", + "iopub.status.idle": "2024-03-22T17:24:39.051533Z", + "shell.execute_reply": "2024-03-22T17:24:39.050623Z" + }, + "papermill": { + "duration": 0.907754, + "end_time": "2024-03-22T17:24:39.053873", + "exception": false, + "start_time": "2024-03-22T17:24:38.146119", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import polars as pl\n", + "\n", + "pd.options.mode.copy_on_write = True\n", + "pd.options.future.infer_string = True" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3c83ec1b", + "metadata": {}, + "outputs": [], + "source": [ + "from typing import Any\n", + "from datetime import datetime, date\n", + "import narwhals as nw\n", + "\n", + "def q8_pandas_native(\n", + " nation_ds,\n", + " customer_ds,\n", + " line_item_ds,\n", + " orders_ds,\n", + " supplier_ds,\n", + ") -> None:\n", + " var1 = \"FRANCE\"\n", + " var2 = \"GERMANY\"\n", + " var3 = date(1995, 1, 1)\n", + " var4 = date(1996, 12, 31)\n", + "\n", + " n1 = nation_ds[(nation_ds[\"n_name\"] == var1)]\n", + " n2 = nation_ds[(nation_ds[\"n_name\"] == 
var2)]\n", + "\n", + " # Part 1\n", + " jn1 = customer_ds.merge(n1, left_on=\"c_nationkey\", right_on=\"n_nationkey\")\n", + " jn2 = jn1.merge(orders_ds, left_on=\"c_custkey\", right_on=\"o_custkey\")\n", + " jn2 = jn2.rename({\"n_name\": \"cust_nation\"}, axis=\"columns\")\n", + " jn3 = jn2.merge(line_item_ds, left_on=\"o_orderkey\", right_on=\"l_orderkey\")\n", + " jn4 = jn3.merge(supplier_ds, left_on=\"l_suppkey\", right_on=\"s_suppkey\")\n", + " jn5 = jn4.merge(n2, left_on=\"s_nationkey\", right_on=\"n_nationkey\")\n", + " df1 = jn5.rename({\"n_name\": \"supp_nation\"}, axis=\"columns\")\n", + "\n", + " # Part 2\n", + " jn1 = customer_ds.merge(n2, left_on=\"c_nationkey\", right_on=\"n_nationkey\")\n", + " jn2 = jn1.merge(orders_ds, left_on=\"c_custkey\", right_on=\"o_custkey\")\n", + " jn2 = jn2.rename({\"n_name\": \"cust_nation\"}, axis=\"columns\")\n", + " jn3 = jn2.merge(line_item_ds, left_on=\"o_orderkey\", right_on=\"l_orderkey\")\n", + " jn4 = jn3.merge(supplier_ds, left_on=\"l_suppkey\", right_on=\"s_suppkey\")\n", + " jn5 = jn4.merge(n1, left_on=\"s_nationkey\", right_on=\"n_nationkey\")\n", + " df2 = jn5.rename({\"n_name\": \"supp_nation\"}, axis=\"columns\")\n", + "\n", + " # Combine\n", + " total = pd.concat([df1, df2])\n", + "\n", + " total = total[(total[\"l_shipdate\"] >= var3) & (total[\"l_shipdate\"] <= var4)]\n", + " total[\"volume\"] = total[\"l_extendedprice\"] * (1.0 - total[\"l_discount\"])\n", + " total[\"l_year\"] = total[\"l_shipdate\"].dt.year\n", + "\n", + " gb = total.groupby([\"supp_nation\", \"cust_nation\", \"l_year\"], as_index=False)\n", + " agg = gb.agg(revenue=pd.NamedAgg(column=\"volume\", aggfunc=\"sum\"))\n", + "\n", + " result_df = agg.sort_values(by=[\"supp_nation\", \"cust_nation\", \"l_year\"])\n", + "\n", + " return result_df # type: ignore[no-any-return]" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "42e7f0e2", + "metadata": { + "execution": { + "iopub.execute_input": "2024-03-22T17:24:39.066341Z", + "iopub.status.busy": "2024-03-22T17:24:39.065881Z", + "iopub.status.idle": "2024-03-22T17:24:39.078875Z", + "shell.execute_reply": "2024-03-22T17:24:39.077655Z" + }, + "papermill": { + "duration": 0.021725, + "end_time": "2024-03-22T17:24:39.080999", + "exception": false, + "start_time": "2024-03-22T17:24:39.059274", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "from typing import Any\n", + "from datetime import datetime\n", + "import narwhals as nw\n", + "\n", + "def q8(\n", + " nation_ds_raw: Any,\n", + " customer_ds_raw: Any,\n", + " line_item_ds_raw: Any,\n", + " orders_ds_raw: Any,\n", + " supplier_ds_raw: Any,\n", + " part_ds_raw: Any,\n", + ") -> None:\n", + " nation_ds = nw.from_native(nation_ds_raw)\n", + " customer_ds = nw.from_native(customer_ds_raw)\n", + " line_item_ds = nw.from_native(line_item_ds_raw)\n", + " orders_ds = nw.from_native(orders_ds_raw)\n", + " supplier_ds = nw.from_native(supplier_ds_raw)\n", + " part_ds = nw.from_native(part_ds_raw)\n", + "\n", + " n1 = nation_ds.select(\"n_nationkey\", \"n_regionkey\")\n", + " n2 = nation_ds.select(\"n_nationkey\", \"n_name\")\n", + "\n", + " result = (\n", + " part_ds.join(line_item_ds, left_on=\"p_partkey\", right_on=\"l_partkey\")\n", + " .join(supplier_ds, left_on=\"l_suppkey\", right_on=\"s_suppkey\")\n", + " .join(orders_ds, left_on=\"l_orderkey\", right_on=\"o_orderkey\")\n", + " .join(customer_ds, left_on=\"o_custkey\", right_on=\"c_custkey\")\n", + " .join(n1, left_on=\"c_nationkey\", right_on=\"n_nationkey\")\n", + 
" .join(region_ds, left_on=\"n_regionkey\", right_on=\"r_regionkey\")\n", + " .filter(nw.col(\"r_name\") == \"AMERICA\")\n", + " .join(n2, left_on=\"s_nationkey\", right_on=\"n_nationkey\")\n", + " .filter(\n", + " nw.col(\"o_orderdate\")>= date(1995, 1, 1),\n", + " nw.col('o_orderdate')<=date(1996, 12, 31)\n", + " )\n", + " .filter(nw.col(\"p_type\") == \"ECONOMY ANODIZED STEEL\")\n", + " .select(\n", + " nw.col(\"o_orderdate\").dt.year().alias(\"o_year\"),\n", + " (nw.col(\"l_extendedprice\") * (1 - nw.col(\"l_discount\"))).alias(\"volume\"),\n", + " nw.col(\"n_name\").alias(\"nation\"),\n", + " )\n", + " .with_columns(\n", + " nw.when(nw.col(\"nation\") == \"BRAZIL\")\n", + " .then(nw.col(\"volume\"))\n", + " .otherwise(0)\n", + " .alias(\"_tmp\")\n", + " )\n", + " .group_by(\"o_year\")\n", + " .agg((nw.sum(\"_tmp\") / nw.sum(\"volume\")).round(2).alias(\"mkt_share\"))\n", + " .sort(\"o_year\")\n", + " )\n", + " \n", + " return nw.to_native(result)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "8d540303", + "metadata": { + "execution": { + "iopub.execute_input": "2024-03-22T17:24:39.093217Z", + "iopub.status.busy": "2024-03-22T17:24:39.092874Z", + "iopub.status.idle": "2024-03-22T17:24:39.097611Z", + "shell.execute_reply": "2024-03-22T17:24:39.096644Z" + }, + "papermill": { + "duration": 0.013325, + "end_time": "2024-03-22T17:24:39.099766", + "exception": false, + "start_time": "2024-03-22T17:24:39.086441", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "dir_ = \"/kaggle/input/tpc-h-data-parquet-s-2/\"\n", + "region = dir_ + 'region.parquet'\n", + "nation = dir_ + 'nation.parquet'\n", + "customer = dir_ + 'customer.parquet'\n", + "lineitem = dir_ + 'lineitem.parquet'\n", + "orders = dir_ + 'orders.parquet'\n", + "supplier = dir_ + 'supplier.parquet'\n", + "part = dir_ + 'part.parquet'\n", + "partsupp = dir_ + 'partsupp.parquet'" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "afc23c7d", + "metadata": { + "execution": { + "iopub.execute_input": "2024-03-22T17:24:39.112703Z", + "iopub.status.busy": "2024-03-22T17:24:39.112327Z", + "iopub.status.idle": "2024-03-22T17:24:39.117529Z", + "shell.execute_reply": "2024-03-22T17:24:39.116459Z" + }, + "papermill": { + "duration": 0.014284, + "end_time": "2024-03-22T17:24:39.119737", + "exception": false, + "start_time": "2024-03-22T17:24:39.105453", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "IO_FUNCS = {\n", + " 'pandas': lambda x: pd.read_parquet(x, engine='pyarrow'),\n", + " 'pandas[pyarrow]': lambda x: pd.read_parquet(x, engine='pyarrow', dtype_backend='pyarrow'),\n", + " 'polars[eager]': lambda x: pl.read_parquet(x),\n", + " 'polars[lazy]': lambda x: pl.scan_parquet(x),\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "7b521f62", + "metadata": {}, + "outputs": [], + "source": [ + "results = {}" + ] + }, + { + "cell_type": "markdown", + "id": "12824d5d", + "metadata": {}, + "source": [ + "## pandas, pyarrow dtypes, native" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ce229598", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "20.2 s ± 5.8 s per loop (mean ± std. dev. 
of 7 runs, 1 loop each)\n" + ] + }, + { + "data": { + "text/plain": [ + "16.42582530300001" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "tool = 'pandas[pyarrow]'\n", + "fn = IO_FUNCS[tool]\n", + "timings = %timeit -o q7_pandas_native(fn(nation), fn(customer), fn(lineitem), fn(orders), fn(supplier))\n", + "results[tool+'[native]'] = timings.best" + ] + }, + { + "cell_type": "markdown", + "id": "09249944", + "metadata": { + "papermill": { + "duration": 0.005113, + "end_time": "2024-03-22T17:24:39.130472", + "exception": false, + "start_time": "2024-03-22T17:24:39.125359", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "## pandas via Narwhals" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "e2663325", + "metadata": { + "execution": { + "iopub.execute_input": "2024-03-22T17:24:39.142769Z", + "iopub.status.busy": "2024-03-22T17:24:39.142165Z", + "iopub.status.idle": "2024-03-22T17:27:55.915660Z", + "shell.execute_reply": "2024-03-22T17:27:55.914640Z" + }, + "papermill": { + "duration": 196.786925, + "end_time": "2024-03-22T17:27:55.922832", + "exception": false, + "start_time": "2024-03-22T17:24:39.135907", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "ename": "TypeError", + "evalue": "Index(...) must be called with a collection of some kind, was passed", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mTypeError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[7], line 3\u001b[0m\n\u001b[1;32m 1\u001b[0m tool \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mpandas\u001b[39m\u001b[38;5;124m'\u001b[39m\n\u001b[1;32m 2\u001b[0m fn \u001b[38;5;241m=\u001b[39m IO_FUNCS[tool]\n\u001b[0;32m----> 3\u001b[0m \u001b[43mq7\u001b[49m\u001b[43m(\u001b[49m\u001b[43mfn\u001b[49m\u001b[43m(\u001b[49m\u001b[43mnation\u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mfn\u001b[49m\u001b[43m(\u001b[49m\u001b[43mcustomer\u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mfn\u001b[49m\u001b[43m(\u001b[49m\u001b[43mlineitem\u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mfn\u001b[49m\u001b[43m(\u001b[49m\u001b[43morders\u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mfn\u001b[49m\u001b[43m(\u001b[49m\u001b[43msupplier\u001b[49m\u001b[43m)\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 4\u001b[0m results[tool] \u001b[38;5;241m=\u001b[39m timings\u001b[38;5;241m.\u001b[39mbest\n", + "Cell \u001b[0;32mIn[2], line 12\u001b[0m, in \u001b[0;36mq7\u001b[0;34m(nation_ds, customer_ds, line_item_ds, orders_ds, supplier_ds)\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mq7\u001b[39m(\n\u001b[1;32m 6\u001b[0m nation_ds,\n\u001b[1;32m 7\u001b[0m customer_ds,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 10\u001b[0m supplier_ds,\n\u001b[1;32m 11\u001b[0m ) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[0;32m---> 12\u001b[0m n1 \u001b[38;5;241m=\u001b[39m 
\u001b[43mnation_ds\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfilter\u001b[49m\u001b[43m(\u001b[49m\u001b[43mnw\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcol\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mn_name\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m==\u001b[39;49m\u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mFRANCE\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[1;32m 13\u001b[0m n2 \u001b[38;5;241m=\u001b[39m nation_ds\u001b[38;5;241m.\u001b[39mfilter(nw\u001b[38;5;241m.\u001b[39mcol(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mn_name\u001b[39m\u001b[38;5;124m\"\u001b[39m) \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mGERMANY\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 15\u001b[0m var_1 \u001b[38;5;241m=\u001b[39m date(\u001b[38;5;241m1995\u001b[39m, \u001b[38;5;241m1\u001b[39m, \u001b[38;5;241m1\u001b[39m)\n", + "File \u001b[0;32m~/polars-api-compat-dev/.venv/lib/python3.11/site-packages/pandas/core/generic.py:5790\u001b[0m, in \u001b[0;36mNDFrame.filter\u001b[0;34m(self, items, like, regex, axis)\u001b[0m\n\u001b[1;32m 5788\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m items \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m 5789\u001b[0m name \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_get_axis_name(axis)\n\u001b[0;32m-> 5790\u001b[0m items \u001b[38;5;241m=\u001b[39m \u001b[43mIndex\u001b[49m\u001b[43m(\u001b[49m\u001b[43mitems\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241m.\u001b[39mintersection(labels)\n\u001b[1;32m 5791\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mlen\u001b[39m(items) \u001b[38;5;241m==\u001b[39m \u001b[38;5;241m0\u001b[39m:\n\u001b[1;32m 5792\u001b[0m \u001b[38;5;66;03m# Keep the dtype of labels when we are empty\u001b[39;00m\n\u001b[1;32m 5793\u001b[0m items \u001b[38;5;241m=\u001b[39m items\u001b[38;5;241m.\u001b[39mastype(labels\u001b[38;5;241m.\u001b[39mdtype)\n", + "File \u001b[0;32m~/polars-api-compat-dev/.venv/lib/python3.11/site-packages/pandas/core/indexes/base.py:532\u001b[0m, in \u001b[0;36mIndex.__new__\u001b[0;34m(cls, data, dtype, copy, name, tupleize_cols)\u001b[0m\n\u001b[1;32m 528\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mcls\u001b[39m(np\u001b[38;5;241m.\u001b[39masarray(data), dtype\u001b[38;5;241m=\u001b[39mdtype, copy\u001b[38;5;241m=\u001b[39mcopy, name\u001b[38;5;241m=\u001b[39mname)\n\u001b[1;32m 529\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m is_list_like(data) \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(data, \u001b[38;5;28mmemoryview\u001b[39m):\n\u001b[1;32m 530\u001b[0m \u001b[38;5;66;03m# 2022-11-16 the memoryview check is only necessary on some CI\u001b[39;00m\n\u001b[1;32m 531\u001b[0m \u001b[38;5;66;03m# builds, not clear why\u001b[39;00m\n\u001b[0;32m--> 532\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;28;43mcls\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_raise_scalar_data_error\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdata\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 534\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 535\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m tupleize_cols:\n\u001b[1;32m 536\u001b[0m \u001b[38;5;66;03m# GH21470: convert 
iterable to list before determining if empty\u001b[39;00m\n", + "File \u001b[0;32m~/polars-api-compat-dev/.venv/lib/python3.11/site-packages/pandas/core/indexes/base.py:5289\u001b[0m, in \u001b[0;36mIndex._raise_scalar_data_error\u001b[0;34m(cls, data)\u001b[0m\n\u001b[1;32m 5284\u001b[0m \u001b[38;5;129m@final\u001b[39m\n\u001b[1;32m 5285\u001b[0m \u001b[38;5;129m@classmethod\u001b[39m\n\u001b[1;32m 5286\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m_raise_scalar_data_error\u001b[39m(\u001b[38;5;28mcls\u001b[39m, data):\n\u001b[1;32m 5287\u001b[0m \u001b[38;5;66;03m# We return the TypeError so that we can raise it from the constructor\u001b[39;00m\n\u001b[1;32m 5288\u001b[0m \u001b[38;5;66;03m# in order to keep mypy happy\u001b[39;00m\n\u001b[0;32m-> 5289\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mTypeError\u001b[39;00m(\n\u001b[1;32m 5290\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mcls\u001b[39m\u001b[38;5;241m.\u001b[39m\u001b[38;5;18m__name__\u001b[39m\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m(...) must be called with a collection of some \u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 5291\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mkind, \u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mrepr\u001b[39m(data)\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mif\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;129;01mnot\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28misinstance\u001b[39m(data,\u001b[38;5;250m \u001b[39mnp\u001b[38;5;241m.\u001b[39mgeneric)\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01melse\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28mstr\u001b[39m(data)\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m \u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 5292\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mwas passed\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 5293\u001b[0m )\n", + "\u001b[0;31mTypeError\u001b[0m: Index(...) must be called with a collection of some kind, was passed" + ] + } + ], + "source": [ + "tool = 'pandas'\n", + "fn = IO_FUNCS[tool]\n", + "timings = %timeit -o q7(fn(nation), fn(customer), fn(lineitem), fn(orders), fn(supplier))\n", + "results[tool] = timings.best" + ] + }, + { + "cell_type": "markdown", + "id": "0b561017", + "metadata": { + "papermill": { + "duration": 0.005184, + "end_time": "2024-03-22T17:27:55.933407", + "exception": false, + "start_time": "2024-03-22T17:27:55.928223", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "## pandas, pyarrow dtypes, via Narwhals" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "32ed7477", + "metadata": { + "execution": { + "iopub.execute_input": "2024-03-22T17:27:55.947042Z", + "iopub.status.busy": "2024-03-22T17:27:55.946658Z", + "iopub.status.idle": "2024-03-22T17:30:34.673691Z", + "shell.execute_reply": "2024-03-22T17:30:34.672291Z" + }, + "papermill": { + "duration": 158.748353, + "end_time": "2024-03-22T17:30:34.688289", + "exception": false, + "start_time": "2024-03-22T17:27:55.939936", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "20.2 s ± 5.8 s per loop (mean ± std. dev. 
of 7 runs, 1 loop each)\n" + ] + }, + { + "data": { + "text/plain": [ + "16.42582530300001" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tool = 'pandas[pyarrow]'\n", + "fn = IO_FUNCS[tool]\n", + "timings = %timeit -o q7(fn(nation), fn(customer), fn(lineitem), fn(orders), fn(supplier))\n", + "results[tool] = timings.best" + ] + }, + { + "cell_type": "markdown", + "id": "a8005d7d", + "metadata": { + "papermill": { + "duration": 0.005773, + "end_time": "2024-03-22T17:30:34.700300", + "exception": false, + "start_time": "2024-03-22T17:30:34.694527", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "## Polars read_parquet" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "a5e9bfd3", + "metadata": { + "execution": { + "iopub.execute_input": "2024-03-22T17:30:34.714876Z", + "iopub.status.busy": "2024-03-22T17:30:34.714421Z", + "iopub.status.idle": "2024-03-22T17:31:12.519452Z", + "shell.execute_reply": "2024-03-22T17:31:12.518760Z" + }, + "papermill": { + "duration": 37.821116, + "end_time": "2024-03-22T17:31:12.527466", + "exception": false, + "start_time": "2024-03-22T17:30:34.706350", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "4.67 s ± 85 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n" + ] + }, + { + "data": { + "text/plain": [ + "4.574684939999997" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tool = 'polars[eager]'\n", + "fn = IO_FUNCS[tool]\n", + "timings = %timeit -o q7(fn(nation), fn(customer), fn(lineitem), fn(orders), fn(supplier))\n", + "results[tool] = timings.best" + ] + }, + { + "cell_type": "markdown", + "id": "c77a701f", + "metadata": { + "papermill": { + "duration": 0.005515, + "end_time": "2024-03-22T17:31:12.539068", + "exception": false, + "start_time": "2024-03-22T17:31:12.533553", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "## Polars scan_parquet" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "7cbe1a02", + "metadata": { + "execution": { + "iopub.execute_input": "2024-03-22T17:31:12.554281Z", + "iopub.status.busy": "2024-03-22T17:31:12.553860Z", + "iopub.status.idle": "2024-03-22T17:31:17.344303Z", + "shell.execute_reply": "2024-03-22T17:31:17.343397Z" + }, + "papermill": { + "duration": 4.800698, + "end_time": "2024-03-22T17:31:17.346813", + "exception": false, + "start_time": "2024-03-22T17:31:12.546115", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "595 ms ± 18.6 ms per loop (mean ± std. dev. 
of 7 runs, 1 loop each)\n" + ] + }, + { + "data": { + "text/plain": [ + "0.5674880569999914" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tool = 'polars[lazy]'\n", + "fn = IO_FUNCS[tool]\n", + "timings = %timeit -o q7(fn(nation), fn(customer), fn(lineitem), fn(orders), fn(supplier)).collect()\n", + "results[tool] = timings.best" + ] + }, + { + "cell_type": "markdown", + "id": "37ce6bf3", + "metadata": {}, + "source": [ + "## Save" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "14399622", + "metadata": {}, + "outputs": [], + "source": [ + "import json\n", + "with open('results.json', 'w') as fd:\n", + " json.dump(results, fd)\n" + ] + } + ], + "metadata": { + "kaggle": { + "accelerator": "none", + "dataSources": [ + { + "sourceId": 167796716, + "sourceType": "kernelVersion" + }, + { + "sourceId": 167796934, + "sourceType": "kernelVersion" + }, + { + "sourceId": 167796952, + "sourceType": "kernelVersion" + }, + { + "sourceId": 167796969, + "sourceType": "kernelVersion" + } + ], + "isGpuEnabled": false, + "isInternetEnabled": true, + "language": "python", + "sourceType": "notebook" + }, + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.8" + }, + "papermill": { + "default_parameters": {}, + "duration": 458.423327, + "end_time": "2024-03-22T17:31:18.077306", + "environment_variables": {}, + "exception": null, + "input_path": "__notebook__.ipynb", + "output_path": "__notebook__.ipynb", + "parameters": {}, + "start_time": "2024-03-22T17:23:39.653979", + "version": "2.5.0" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/tpch/notebooks/q8/kernel-metadata.json b/tpch/notebooks/q8/kernel-metadata.json new file mode 100644 index 000000000..1c67f0d53 --- /dev/null +++ b/tpch/notebooks/q8/kernel-metadata.json @@ -0,0 +1,15 @@ +{ + "id": "marcogorelli/narwhals-tpch-q8-s2", + "title": "Narwhals TPCH Q8 S2", + "code_file": "execute.ipynb", + "language": "python", + "kernel_type": "notebook", + "is_private": "false", + "enable_gpu": "false", + "enable_tpu": "false", + "enable_internet": "true", + "dataset_sources": [], + "competition_sources": [], + "kernel_sources": ["marcogorelli/tpc-h-data-parquet-s-2"], + "model_sources": [] +} \ No newline at end of file
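For reference, a minimal sketch of the behaviour this patch targets, mirroring the updated `test_convert_numpy` and `test_to_numpy` tests above; it assumes only the public `nw.DataFrame` / `nw.Series` constructors already used in those tests:

```python
import numpy as np
import pandas as pd

import narwhals as nw

# A nullable-dtype column: plain `DataFrame.to_numpy()` would return an `object` array here.
df = pd.DataFrame({"a": pd.array([1, 2, None], dtype="Int64"), "b": [4.0, 5.0, 6.0]})

# With the per-column fallback added in narwhals/_pandas_like/dataframe.py, missing
# values become NaN and the result is an ordinary float64 ndarray.
arr = nw.DataFrame(df).to_numpy()
print(arr.dtype)            # float64
print(np.isnan(arr[2, 0]))  # True

# Series conversion picks its target dtype from PANDAS_TO_NUMPY_DTYPE_MISSING when the
# column has missing values, and from PANDAS_TO_NUMPY_DTYPE_NO_MISSING otherwise.
s = nw.Series(pd.Series([1, 2, None], dtype="Int64"))
print(s.to_numpy().dtype)   # float64
```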