From eb266f838e15b5e9c9c2cd888deb670a54ad85d7 Mon Sep 17 00:00:00 2001 From: Marco Gorelli <33491632+MarcoGorelli@users.noreply.github.com> Date: Mon, 6 May 2024 22:05:39 +0100 Subject: [PATCH] expand tutorial --- README.md | 87 +------------------------------------- docs/other/pandas_index.md | 82 +++++++++++++++++++++++++++++++++++ mkdocs.yml | 2 + 3 files changed, 85 insertions(+), 86 deletions(-) create mode 100644 docs/other/pandas_index.md diff --git a/README.md b/README.md index 6a230149c..0acbec770 100644 --- a/README.md +++ b/README.md @@ -67,92 +67,7 @@ environment: ## Example -Here's an example of a dataframe agnostic function: - -```python -from typing import Any -import pandas as pd -import polars as pl - -import narwhals as nw - - -def my_agnostic_function( - suppliers_native, - parts_native, -): - suppliers = nw.from_native(suppliers_native) - parts = nw.from_native(parts_native) - - result = ( - suppliers.join(parts, left_on="city", right_on="city") - .filter(nw.col("weight") > 10) - .group_by("s") - .agg( - weight_mean=nw.col("weight").mean(), - weight_max=nw.col("weight").max(), - ) - .sort("s") - ) - - return nw.to_native(result) -``` -You can pass in a pandas or Polars dataframe, the output will be the same! 
-Let's try it out: - -```python -suppliers = { - "s": ["S1", "S2", "S3", "S4", "S5"], - "sname": ["Smith", "Jones", "Blake", "Clark", "Adams"], - "status": [20, 10, 30, 20, 30], - "city": ["London", "Paris", "Paris", "London", "Athens"], -} -parts = { - "p": ["P1", "P2", "P3", "P4", "P5", "P6"], - "pname": ["Nut", "Bolt", "Screw", "Screw", "Cam", "Cog"], - "color": ["Red", "Green", "Blue", "Red", "Blue", "Red"], - "weight": [12.0, 17.0, 17.0, 14.0, 12.0, 19.0], - "city": ["London", "Paris", "Oslo", "London", "Paris", "London"], -} - -print("pandas output:") -print( - my_agnostic_function( - pd.DataFrame(suppliers), - pd.DataFrame(parts), - ) -) -print("\nPolars output:") -print( - my_agnostic_function( - pl.LazyFrame(suppliers), - pl.LazyFrame(parts), - ).collect() -) -``` - -``` -pandas output: - s weight_mean weight_max -0 S1 15.0 19.0 -1 S2 14.5 17.0 -2 S3 14.5 17.0 -3 S4 15.0 19.0 - -Polars output: -shape: (4, 3) -┌─────┬─────────────┬────────────┐ -│ s ┆ weight_mean ┆ weight_max │ -│ --- ┆ --- ┆ --- │ -│ str ┆ f64 ┆ f64 │ -╞═════╪═════════════╪════════════╡ -│ S1 ┆ 15.0 ┆ 19.0 │ -│ S2 ┆ 14.5 ┆ 17.0 │ -│ S3 ┆ 14.5 ┆ 17.0 │ -│ S4 ┆ 15.0 ┆ 19.0 │ -└─────┴─────────────┴────────────┘ -``` -Magic! 🪄 +See the [tutorial](https://narwhals-dev.github.io/narwhals/basics/dataframe/) for several examples! ## Scope diff --git a/docs/other/pandas_index.md b/docs/other/pandas_index.md new file mode 100644 index 000000000..2d5e69514 --- /dev/null +++ b/docs/other/pandas_index.md @@ -0,0 +1,82 @@ +# What about the pandas Index? + +There are two types of pandas users: + +- The ones who make full use of the Index's power. +- The `.reset_index(drop=True)` ones, who would rather not think about the Index. + +Narwhals aims to accommodate both! + +- If you'd rather not think about the Index, then don't + worry: it's not part of the Narwhals public API, and you'll never have to worry about + resetting the index or about pandas doing funky index alignment for you. 
+- If you want your library to cater to Index power users who would be very angry if you reset + their beautiful Index on their behalf, then don't worry: Narwhals makes certain promises + with regard to the Index. + +Let's learn about what Narwhals promises. + +## 1. Narwhals will preserve your index for dataframe operations + +```python exec="1" source="above" session="ex1" +import narwhals as nw + +def my_func(df_any): + df = nw.from_native(df_any) + df = df.with_columns(a_sorted=nw.col('a').sort()) + return nw.to_native(df) +``` + +Let's start with a dataframe with an Index with values `[7, 8, 9]`. + +```python exec="true" source="material-block" result="python" session="ex1" +import pandas as pd + +df = pd.DataFrame({'a': [2, 1, 3], 'b': [3, 5, -3]}, index=[7, 8, 9]) +print(my_func(df)) +``` + +Note how the result still has the original index - Narwhals did not modify +it. + +## 2. Index alignment follows the left-hand-rule + +pandas automatically aligns indices for users. For example: + +```python exec="1" source="above" session="ex2" +import pandas as pd + +df_pd = pd.DataFrame({'a': [2, 1, 3], 'b': [4, 5, 6]}) +s_pd = df_pd['a'].sort_values() +df_pd['a_sorted'] = s_pd +``` +Reading the code, you might expect that this will create a dataframe that +looks like this: + +```python + a b a_sorted +0 2 4 1 +1 1 5 2 +2 3 6 3 +``` +**However**, here is what `df_pd` actually looks like: +```python exec="1" source="material-block" session="ex2" result="python" +print(df_pd) +``` +In other words, pandas' index alignment undid the `sort_values` operation! + +Narwhals, on the other hand, preserves the index of the left-hand-side argument. 
+Everything else will be inserted positionally, just like Polars would do: + +```python exec="1" source="material-block" session="ex2" result="python" +import narwhals as nw + +df = nw.from_native(df_pd) +s = nw.from_native(s_pd, allow_series=True) +df = df.with_columns(a_sorted=s.sort()) +print(nw.to_native(df)) +``` + +If you keep these two rules in mind, then Narwhals will both help you avoid +Index-related surprises and let you preserve the Index for your users +who consciously make great use of it. diff --git a/mkdocs.yml b/mkdocs.yml index e22fbc9e3..476a8758f 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -9,6 +9,8 @@ nav: - basics/dataframe.md - basics/column.md - basics/complete_example.md + - Other concepts: + - other/pandas_index.md - extending.md - Roadmap: roadmap.md - Related projects: related.md