-
Notifications
You must be signed in to change notification settings - Fork 348
/
Copy pathread_parquet.py
146 lines (112 loc) · 3.29 KB
/
read_parquet.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
# /// script
# requires-python = ">=3.10"
# dependencies = [
# "duckdb==1.1.1",
# "marimo",
# "polars==1.18.0",
# "pyarrow==18.1.0",
# "vega-datasets==0.9.0",
# ]
# ///
import marimo
# Marker string stored with the notebook; presumably the marimo version that
# generated this file — confirm.
__generated_with = "0.9.1"
# Application object; the cells in this file are registered on it through the
# @app.cell decorator, and app.run() is called on it at the bottom of the file.
app = marimo.App(width="medium")
@app.cell(hide_code=True)
def __(mo):
    # Notebook title plus a one-line summary of what the notebook demonstrates.
    _intro = """
# Read Parquet
This notebook shows how to read a Parquet file from a local file or a URL into an in-memory table.
"""
    mo.md(_intro)
    return
@app.cell(hide_code=True)
def __():
    import marimo as mo
    import polars as pl

    # Write a small three-row sample file; the SQL cells in this notebook
    # read it back by its relative path 'data.parquet'.
    _sample_columns = {"A": [1, 2, 3], "B": ["a", "b", "c"]}
    _sample_frame = pl.DataFrame(_sample_columns)
    _sample_frame.write_parquet("data.parquet")
    return mo, pl
@app.cell(hide_code=True)
def __(mo):
    # Short explanation of the basic "select straight from a file" query.
    _explanation = """Reading from a Parquet file is as easy as `SELECT * from "data.parquet"`, where `data.parquet` is the path or URL to your parquet file."""
    mo.md(_explanation)
    return
@app.cell(hide_code=True)
def __(mo):
    # Collapsible tip describing the two ways to add a SQL cell in the editor.
    #
    # Blank lines separate the intro sentence, the numbered list and the
    # closing paragraph so that each one is parsed as its own Markdown block.
    # Without them the list items can be folded into the surrounding
    # paragraph text instead of being rendered as a list.
    _tip_body = mo.md(
        f"""
Create a SQL cell in one of two ways:

1. Click the {mo.icon("lucide:database")} `SQL` button at the **bottom of your notebook**
2. **Right-click** the {mo.icon("lucide:circle-plus")} button to the **left of a cell**, and choose `SQL`.

In the SQL cell, you can query dataframes in your notebook as if
they were tables — just reference them by name.
"""
    )
    mo.accordion({"Tip: Creating SQL Cells": _tip_body})
    return
# SQL cell: passes the query text to mo.sql and binds the returned value to
# `result`, which is exposed to other cells through the return tuple.
# The query selects every column from the file 'data.parquet'.
# output=False is forwarded to mo.sql; presumably it suppresses this cell's own
# rendering of the query result — confirm against the marimo API.
# NOTE(review): `data` is listed as a parameter, but no cell visible in this
# file returns a name `data`; presumably it was derived from the
# 'data.parquet' path in the query — confirm it is intentional.
@app.cell
def __(data, mo):
    result = mo.sql(
        f"""
-- Tip: you can also specify the data files using a glob, such as '/path/to/*.parquet'
-- or '/path/**/to/*.parquet'
SELECT * FROM 'data.parquet'
""", output=False
    )
    return (result,)
@app.cell(hide_code=True)
def __(mo):
    # Collapsible tip describing how a SQL cell's result is handed back to
    # Python and how the output variable name controls its visibility.
    _tip_text = r"""
The query output is returned to Python as a dataframe (Polars if you have it installed, Pandas otherwise).
Choose the dataframe name via the **output variable** input in the bottom-left
of the cell. If the name starts with an underscore, it won't be made available
to other cells. In this case, we've named the output `result`.
"""
    _sections = {"Tip: Query output": mo.md(_tip_text)}
    mo.accordion(_sections)
    return
@app.cell
def __(result):
    # `result` is received as a cell parameter and evaluated as a bare
    # expression; presumably marimo renders the last expression of a cell as
    # its output — confirm.
    result
    return
@app.cell(hide_code=True)
def __(mo):
    # Section heading and introduction for loading the file into a named table.
    _section_intro = r"""
## Create an in-memory table from a Parquet file
You can also create a table from a Parquet file, so you can easily query it in subsequent cells. This table will appear in marimo's data sources panel.
"""
    mo.md(_section_intro)
    return
# SQL cell: creates (or replaces) the table `myTable` from the rows of
# 'data.parquet'. The value returned by mo.sql is kept in the
# underscore-prefixed `_df` and is not part of the return tuple.
# NOTE(review): `myTable` appears in the return tuple but is never assigned as
# a Python name in this body; presumably marimo treats the SQL-created table
# as this cell's definition — confirm.
# NOTE(review): `data` is listed as a parameter, but no cell visible in this
# file returns a name `data` — confirm it is intentional.
@app.cell
def __(data, mo):
    _df = mo.sql(
        f"""
CREATE OR REPLACE TABLE myTable AS SELECT * FROM 'data.parquet'
"""
    )
    return (myTable,)
# SQL cell: selects every row of `myTable`. `myTable` is taken as a cell
# parameter, the same name that the table-creating cell lists in its return
# tuple. The value returned by mo.sql is kept in the underscore-prefixed `_df`
# and nothing is returned to other cells.
@app.cell
def __(mo, myTable):
    _df = mo.sql(
        f"""
SELECT * FROM myTable
"""
    )
    return
@app.cell(hide_code=True)
def __(mo):
    # Heading that opens the closing "advanced" section of the notebook.
    _heading = r"""## Advanced usage"""
    mo.md(_heading)
    return
@app.cell(hide_code=True)
def __(mo):
    # Pointer to the DuckDB documentation for customised Parquet reading.
    _pointer = r"""To customize how your parquet file is read, use [duckdb's `read_parquet` function](https://duckdb.org/docs/data/parquet/overview.html)."""
    mo.md(_pointer)
    return
# Run the app when this file is executed directly as a script.
if __name__ == "__main__":
    app.run()