-
Notifications
You must be signed in to change notification settings - Fork 348
/
Copy pathconnect_to_motherduck.py
408 lines (339 loc) · 10.3 KB
/
connect_to_motherduck.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
# /// script
# requires-python = ">=3.9"
# dependencies = [
# "altair==5.4.1",
# "duckdb==1.1.0",
# "polars==1.18.0",
# "pyarrow==18.1.0",
# "marimo",
# ]
# ///
import marimo
__generated_with = "0.9.10"
app = marimo.App(width="medium")
@app.cell(hide_code=True)
def __(mo):
mo.md(
r"""
# MotherDuck 🧡 marimo
Throughout this notebook, we will explore using [MotherDuck](https://motherduck.com) inside marimo. If you’re new to marimo, check out our [GitHub](https://github.com/marimo-team/marimo) repo: marimo is free and open source.
_You can expand the code of any cells to see how the output are being created._
"""
)
return
@app.cell(hide_code=True)
def __(__file__, md_token, mo):
callout = mo.md(f"""
There is no **MotherDuck** token found in your environment. To set one up, go to the [MotherDuck's settings page](https://app.motherduck.com/settings/general), create a token, and copy it below.
And re-run this notebook:
```console
motherduck_token="YOUR_TOKEN_HERE" marimo edit {__file__}
```
""").callout()
if md_token is None:
mo.output.replace(
mo.accordion({"Tired of logging in to MotherDuck?": callout})
)
return (callout,)
@app.cell(hide_code=True)
def __():
import os
md_token = os.environ.get("motherduck_token") or os.environ.get(
"MOTHERDUCK_TOKEN"
)
return md_token, os
@app.cell(hide_code=True)
def __(mo):
mo.md("""Let's attach a remote MotherDuck database using `md:`""")
return
@app.cell
def __():
import duckdb
import marimo as mo
duckdb.sql(
"ATTACH 'md:_share/sample_data/23b0d623-1361-421d-ae77-62d701d471e6' AS sample_data"
)
# or add your own md instance
# duckdb.sql(f"ATTACH IF NOT EXISTS 'md:sample_data'")
return duckdb, mo, sample_data
@app.cell(hide_code=True)
def __(mo):
mo.md(
"""
!!! tip "Explore data sources"
If you open the "Explore data sources" panel on the left side bar (3rd icon), you will see all your tables including any news ones we will create below
"""
)
return
@app.cell
def __(mo):
mo.md(r"""## Let's make some queries 🦆""")
return
@app.cell
def __(mo):
most_shared_websites = mo.sql(
f"""
-- Most shared websites
-- This query returns the top domains being shared on Hacker News.
SELECT
regexp_extract(url, 'http[s]?://([^/]+)/', 1) AS domain,
count(*) AS count
FROM sample_data.hn.hacker_news
WHERE url IS NOT NULL AND regexp_extract(url, 'http[s]?://([^/]+)/', 1) != ''
GROUP BY domain
ORDER BY count DESC
LIMIT 20;
-- We've named the result of this dataframe to be `most_shared_websites`. Now we can use this in any downstream Python or SQL code.
"""
)
return (most_shared_websites,)
@app.cell
def __(hacker_news, mo, ranked_stories, sample_data):
most_commented_stories_each_month = mo.sql(
f"""
-- Most Commented Stories Each Month
-- This query calculates the total number of comments for each story and identifies the most commented story of each month.
WITH ranked_stories AS (
SELECT
title,
'https://news.ycombinator.com/item?id=' || id AS hn_url,
descendants AS nb_comments,
YEAR(timestamp) AS year,
MONTH(timestamp) AS month,
ROW_NUMBER()
OVER (
PARTITION BY YEAR(timestamp), MONTH(timestamp)
ORDER BY descendants DESC
)
AS rn
FROM sample_data.hn.hacker_news
WHERE type = 'story'
)
SELECT
year,
month,
title,
hn_url,
nb_comments
FROM ranked_stories
WHERE rn = 1
ORDER BY year, month;
-- This also creates a table most_commented_stories_each_month
-- Which can be used in Python to create charts
"""
)
return (most_commented_stories_each_month,)
@app.cell(hide_code=True)
def __(mo):
mo.md(
r"""
## Let's make some charts 📈
Now that we have made some queries and named the results, we can chart those resulting dataframes in Python, using our favorite charting libraries (e.g [altair](https://altair-viz.github.io/), [matplotlib](https://matplotlib.org/), or [plotly](https://plotly.com/)).
"""
)
return
@app.cell
def __(most_shared_websites):
import altair as alt
chart = (
alt.Chart(most_shared_websites)
.mark_bar()
.encode(
x=alt.X("count:Q", title="Number of Shares"),
y=alt.Y("domain:N", sort="-x", title="Domain"),
tooltip=["domain", "count"],
)
.properties(
title="Top 20 Most Shared Websites on Hacker News", width="container"
)
)
chart
return alt, chart
@app.cell(hide_code=True)
def __(mo):
mo.md(
r"""
## Adding reactivity ⚡
We can also parameterize our SQL using marimo UI elements. This not only makes our SQL reactive, but also any downstream logic, including our charts.
"""
)
return
@app.cell
def __(MONTHS, duckdb, hacker_news, mo, sample_data):
month_select = mo.ui.multiselect(
MONTHS,
label="Month",
value=MONTHS.keys(),
)
hn_types = duckdb.sql(
"""
SELECT DISTINCT type as 'HN Type'
FROM sample_data.hn.hacker_news
WHERE score NOT NULL AND descendants NOT NULL
LIMIT 10;
"""
).df()
hn_type_select = mo.ui.dropdown.from_series(hn_types["HN Type"], value="story")
return hn_type_select, hn_types, month_select
@app.cell(hide_code=True)
def __(hn_type_select, mo, month_select):
month_list = ",".join([str(month) for month in month_select.value])
mo.hstack(
[
mo.md(f"## {mo.icon('lucide:filter')}"),
month_select,
hn_type_select,
],
).left()
return (month_list,)
@app.cell(hide_code=True)
def __(
hacker_news,
hn_type_select,
mo,
month_list,
ranked_stories,
sample_data,
):
most_monthly_voted = mo.sql(
f"""
-- Most monthly voted
-- This query determines the most voted type for each month.
WITH ranked_stories AS (
SELECT
title,
'https://news.ycombinator.com/item?id=' || id AS hn_url,
score,
type,
descendants,
YEAR(timestamp) AS year,
MONTH(timestamp) AS month,
ROW_NUMBER()
OVER (PARTITION BY YEAR(timestamp), MONTH(timestamp) ORDER BY score DESC)
AS rn
FROM sample_data.hn.hacker_news
-- here we parameterize the sql statement
WHERE
type = '{hn_type_select.value}'
AND
MONTH(timestamp) in ({month_list})
AND
descendants NOT NULl
)
SELECT
month,
score,
type,
title,
hn_url,
descendants as nb_comments,
year,
FROM ranked_stories
WHERE rn = 1
ORDER BY year, month;
"""
)
return (most_monthly_voted,)
@app.cell(hide_code=True)
def __(alt, hn_type_select, most_monthly_voted):
_chart = (
alt.Chart(most_monthly_voted)
.mark_circle()
.encode(
x=alt.X("month:O", title="Month"),
y=alt.Y("score:Q", title="Score"),
size=alt.Size(
"nb_comments:Q",
scale=alt.Scale(range=[100, 1000]),
title="Number of Comments",
),
color=alt.Color("nb_comments:Q", scale=alt.Scale(scheme="viridis")),
tooltip=["title", "nb_comments", "hn_url"],
)
.properties(
title=f"Most Commented {hn_type_select.value} Each Month",
width="container",
height=400,
)
)
_chart
return
@app.cell(hide_code=True)
def __(mo):
mo.md("""## Additional Reactivity ⚡⚡""")
return
@app.cell
def __(mo):
search_input = mo.ui.text(label="Search for keywords", value="duckdb")
search_input
return (search_input,)
@app.cell
def __(hacker_news, mo, sample_data, search_value):
keyword_results = mo.sql(
f"""
SELECT
YEAR(timestamp) AS year,
MONTH(timestamp) AS month,
COUNT(*) AS keyword_mentions
FROM sample_data.hn.hacker_news
WHERE
(title LIKE '%{search_value}%' OR text LIKE '%{search_value}%')
GROUP BY year, month
ORDER BY year ASC, month ASC;
"""
)
return (keyword_results,)
@app.cell(hide_code=True)
def __(search_input):
search_value = search_input.value
return (search_value,)
@app.cell(hide_code=True)
def __(alt, keyword_results, mo, search_value):
if keyword_results.is_empty():
mo.stop(True, f"No results for {search_value}")
# Create the chart
_chart = (
alt.Chart(keyword_results)
.mark_rect()
.encode(
x=alt.X("month:O", title="Month"),
y=alt.Y("keyword_mentions:Q", title="Year"),
tooltip=["year", "month", "keyword_mentions"],
)
.properties(
title=f'Monthly Mentions of "{search_value}" in Hacker News Posts',
width="container",
height=400,
)
)
# Add text labels for the number of mentions
_text = (
alt.Chart(keyword_results)
.mark_text(baseline="bottom", dy=-5)
.encode(
x=alt.X("month:O"),
y=alt.Y("keyword_mentions:Q", title="Year"),
text=alt.Text("keyword_mentions:Q"),
)
)
_chart + _text
return
@app.cell(hide_code=True)
def __():
MONTHS = {
"January": 1,
"February": 2,
"March": 3,
"April": 4,
"May": 5,
"June": 6,
"July": 7,
"August": 8,
"September": 9,
"October": 10,
"November": 11,
"December": 12,
}
return (MONTHS,)
if __name__ == "__main__":
app.run()