### A Pluto.jl notebook ###
# v0.14.8
using Markdown
using InteractiveUtils
# This Pluto notebook uses @bind for interactivity. When running this notebook outside of Pluto, the following 'mock version' of @bind gives bound variables a default value (instead of an error).
macro bind(def, element)
quote
local el = $(esc(element))
global $(esc(def)) = Core.applicable(Base.get, el) ? Base.get(el) : missing
el
end
end
# ╔═╡ 8902a846-fbb9-42fc-8742-c9c4a84db52c
begin
import Pkg
Pkg.activate(mktempdir())
Pkg.add([
Pkg.PackageSpec(name="BenchmarkTools", version="1.0.0"),
Pkg.PackageSpec(name="CSV", version="0.8.5"),
Pkg.PackageSpec(name="Chain", version="0.4.6"),
Pkg.PackageSpec(name="DataFrames", version="1.1.1"),
Pkg.PackageSpec(name="DifferentialEquations", version="6.17.1"),
Pkg.PackageSpec(name="Distributions", version="0.24.18"),
Pkg.PackageSpec(name="LaTeXStrings", version="1.2.1"),
Pkg.PackageSpec(name="LazyArrays", version="0.21.5"),
Pkg.PackageSpec(name="Plots", version="1.16.2"),
Pkg.PackageSpec(name="PlutoUI", version="0.7.9"),
Pkg.PackageSpec(name="StatsBase", version="0.33.8"),
Pkg.PackageSpec(name="StatsPlots", version="0.14.21"),
Pkg.PackageSpec(name="Turing", version="0.16.0")
])
using BenchmarkTools
using CSV
using DataFrames
using DifferentialEquations
using Distributions
using LaTeXStrings
using LazyArrays
using LinearAlgebra
using Random
using StatsBase
using StatsPlots
using Turing
using Plots
using PlutoUI
using LinearAlgebra: qr
using Statistics: mean, std
end
# ╔═╡ 31161289-1d4c-46ba-8bd9-e687fb7da29e
begin
using InteractiveUtils
with_terminal() do
versioninfo()
end
end
# ╔═╡ 4af78efd-d484-4241-9d3c-97cc78e1dbd4
begin
Turing.setprogress!(false);
Random.seed!(1);
end
# ╔═╡ 5df4d7d2-c622-11eb-3bbd-bff9668ee5e0
md"""
# Turing Workshop
"""
# ╔═╡ 19c63110-4baa-4aff-ab91-46e5c149f3a2
Resource("https://img.shields.io/badge/License-CC%20BY--SA%204.0-lightgrey.svg", :width => 120, :display => "inline")
# ╔═╡ dceb8312-230f-4e4b-9285-4e23f219b838
Resource("https://github.com/storopoli/Turing-Workshop/blob/master/images/bayes-meme.jpg?raw=true", :width => 250, :display=>"center")
# ╔═╡ cda7dc96-d983-4e31-9298-6148205b54b1
md"""
A little bit about myself:
$(Resource("https://github.com/storopoli/Turing-Workshop/blob/master/images/profile_pic.jpg?raw=true", :width => 100, :align => "right"))
* **Jose Storopoli**, PhD 🌐 [storopoli.io](https://storopoli.io)
* Associate Professor at [**Universidade Nove de Julho** (UNINOVE)](https://uninove.br)
* Teach undergraduates [**Statistics** and **Machine Learning** (using Python)](https://storopoli.io/ciencia-de-dados) 😓, but I'm starting to migrate the content to [**Julia**](https://storopoli.io/Julia-ciencia-de-dados/) 🚀
* Teach graduate students [**Bayesian Statistics** (using `Stan`)](https://storopoli.io/Estatistica-Bayesiana) and [**Scientific Computing** (using **Julia**)](https://storopoli.io/Computacao-Cientifica) 🚀
* I've made some `Turing` tutorials, you can check them out at [storopoli.io/Bayesian-Julia](https://storopoli.io/Bayesian-Julia)
* You can find me on [Twitter](https://twitter.com/JoseStoropoli) (although I rarely use it) or on [LinkedIn](https://www.linkedin.com/in/storopoli/)
"""
# ╔═╡ 2164bf58-75ff-470c-828c-b0165f0d980d
md"""
This workshop can be found in a CreativeCommons [YouTube Video](https://youtu.be/CKSxxJ7RdAU)
"""
# ╔═╡ 55777e4a-b197-4a61-8e57-6ae9792c0564
html"""
<iframe width="560" height="315" src="https://www.youtube.com/embed/CKSxxJ7RdAU" title="YouTube video player" frameborder="0" allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture" allowfullscreen></iframe>
"""
# ╔═╡ 1436305e-37d8-44f1-88d6-4de838580360
md"""
## Bayesian Statistics?!
**Bayesian statistics** is an approach to inferential statistics based on Bayes' theorem, where available knowledge about parameters in a statistical model is updated with the information in observed data. The background knowledge is expressed as a prior distribution and combined with observational data in the form of a likelihood function to determine the posterior distribution. The posterior can also be used for making predictions about future events.
$$\underbrace{P(\theta \mid y)}_{\text{Posterior}} = \frac{\overbrace{P(y \mid \theta)}^{\text{Likelihood}} \cdot \overbrace{P(\theta)}^{\text{Prior}}}{\underbrace{P(y)}_{\text{Normalizing Constant}}}$$
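As a small worked example of this update (a textbook Beta-Bernoulli conjugate case, stated here only for intuition): with a $\text{Beta}(1,1)$ prior on a probability $p$ and $y$ successes observed in $n$ Bernoulli trials, the posterior is available in closed form:

$$P(p \mid y) = \text{Beta}(1 + y, \; 1 + n - y)$$

This is exactly the coin-flip model we will code up with `Turing` below, except that there the posterior is approximated by sampling instead of derived analytically.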
> No $p$-values! Nobody knows what they are anyway... Not $P(H_0 \mid y)$
"""
# ╔═╡ 08f508c4-233a-4bba-b313-b04c1d6c4a4c
md"""
### Recommended Books
"""
# ╔═╡ 868d8932-b108-41d9-b4e8-d62d31b5465d
md"""
We are not covering Bayesian stuff, but there are some **awesome books**:
"""
# ╔═╡ 653ec420-8de5-407e-91a9-f045e25a6395
md"""
[$(Resource("https://github.com/storopoli/Turing-Workshop/blob/master/images/BDA_book.jpg?raw=true", :width => 100.5*1.5))](https://www.routledge.com/Bayesian-Data-Analysis/Gelman-Carlin-Stern-Dunson-Vehtari-Rubin/p/book/9781439840955)
[$(Resource("https://github.com/storopoli/Turing-Workshop/blob/master/images/SR_book.jpg?raw=true", :width => 104*1.5))](https://www.routledge.com/Statistical-Rethinking-A-Bayesian-Course-with-Examples-in-R-and-STAN/McElreath/p/book/9780367139919)
[$(Resource("https://github.com/storopoli/Turing-Workshop/blob/master/images/ROS_book.jpg?raw=true", :width => 118*1.5))](https://www.cambridge.org/fi/academic/subjects/statistics-probability/statistical-theory-and-methods/regression-and-other-stories)
[$(Resource("https://github.com/storopoli/Turing-Workshop/blob/master/images/Bayes_book.jpg?raw=true", :width => 102*1.5))](https://www.amazon.com/Theory-That-Would-Not-Die/dp/0300188226/)
"""
# ╔═╡ 716cea7d-d771-46e9-ad81-687292004009
md"""
## 1. What is Turing?
"""
# ╔═╡ cb808fd4-6eb2-457e-afa4-58ae1be09aec
md"""
[**`Turing`** (Ge, Xu & Ghahramani, 2018)](http://turing.ml/) is an ecosystem of Julia packages for Bayesian Inference using [probabilistic programming](https://en.wikipedia.org/wiki/Probabilistic_programming). Models specified using `Turing` are easy to read and write -- models work the way you write them. Like everything in Julia, `Turing` is fast [(Tarek, Xu, Trapp, Ge & Ghahramani, 2020)](https://arxiv.org/abs/2002.02702).
Before we dive into how to specify models in Turing, let's discuss Turing's **ecosystem**.
We have several Julia packages under the Turing's GitHub organization [TuringLang](https://github.com/TuringLang), but I will focus on 5 of those:
* [`Turing.jl`](https://github.com/TuringLang/Turing.jl): main package that we use to **interface with all the Turing ecosystem** of packages and the backbone of the PPL Turing.
* [`MCMCChains.jl`](https://github.com/TuringLang/MCMCChains.jl): is an interface to **summarizing MCMC simulations** and has several utility functions for **diagnostics** and **visualizations**.
* [`DynamicPPL.jl`](https://github.com/TuringLang/DynamicPPL.jl): specifies the domain-specific language and backend for Turing (which itself is a PPL); it is modular and written in Julia
* [`AdvancedHMC.jl`](https://github.com/TuringLang/AdvancedHMC.jl): modular and efficient implementation of advanced HMC algorithms. The state-of-the-art HMC algorithm is the **N**o-**U**-**T**urn **S**ampling (NUTS) (Hoffman & Gelman, 2011)
* [`DistributionsAD.jl`](https://github.com/TuringLang/DistributionsAD.jl): defines the necessary functions to enable automatic differentiation (AD) of the `logpdf` function from [`Distributions.jl`](https://github.com/JuliaStats/Distributions.jl) using the packages [`Tracker.jl`](https://github.com/FluxML/Tracker.jl), [`Zygote.jl`](https://github.com/FluxML/Zygote.jl), [`ForwardDiff.jl`](https://github.com/JuliaDiff/ForwardDiff.jl) and [`ReverseDiff.jl`](https://github.com/JuliaDiff/ReverseDiff.jl). The main goal of `DistributionsAD.jl` is to make the output of `logpdf` differentiable with respect to all continuous parameters of a distribution.
"""
# ╔═╡ 0484ae7f-bd8a-4615-a760-5c4b2eef9d3f
md"""
## 2. How to Specify a Model? `@model`
"""
# ╔═╡ 1d467044-bc7d-4df7-bda6-bb8ea6ff0712
md"""
**We specify the model inside a macro** `@model` where we can assign variables in two ways:
* using `~`: which means that a variable follows some probability distribution (Normal, Binomial etc.) and its value is random under that distribution
* using `=`: which means that a variable does not follow a probability distribution and its value is deterministic (like the normal `=` assignment in programming languages)
Turing will perform automatic inference on all variables that you specify using `~`.
Just like you would write in mathematical form:
$$\begin{aligned}
p &\sim \text{Beta}(1,1) \\
\text{coin flip} &\sim \text{Bernoulli}(p)
\end{aligned}$$
> **Example**: Unfair coin with $p$ = 0.7.
"""
# ╔═╡ b1d99482-53f5-4c6b-8c20-c761ff6bdb77
coin_flips = rand(Bernoulli(0.7), 100);
# ╔═╡ 65fa382d-4ef7-432d-8630-27082977185b
@model coin(coin_flips) = begin
p ~ Beta(1, 1)
for i ∈ 1:length(coin_flips)
coin_flips[i] ~ Bernoulli(p)
end
end;
# ╔═╡ 06f93734-2315-4b36-a39a-09e8167bab1f
begin
chain_coin = sample(coin(coin_flips), MH(), 100);
summarystats(chain_coin)
end
# ╔═╡ 9f6b96a7-033d-4c7d-a853-46a0b5af4675
md"""
## 3. How to specify a MCMC sampler (`NUTS`, `HMC`, `MH` etc.)
"""
# ╔═╡ b7667fb4-6e76-4711-b61d-dae5f993531e
md"""
We have [several samplers](https://turing.ml/dev/docs/using-turing/sampler-viz) available:
* `MH()`: **M**etropolis-**H**astings
* `PG()`: **P**article **G**ibbs
* `SMC()`: **S**equential **M**onte **C**arlo
* `HMC()`: **H**amiltonian **M**onte **C**arlo
* `HMCDA()`: **H**amiltonian **M**onte **C**arlo with Nesterov's **D**ual **A**veraging
* `NUTS()`: **N**o-**U**-**T**urn **S**ampling
Just stick your desired `sampler` inside the function `sample(model, sampler, N; kwargs)`.
Play around if you want. Choose your `sampler`:
"""
# ╔═╡ cb168dc1-70e2-450f-b2cf-c8680251ab27
@bind chosen_sampler Radio([
"MH()",
"PG(Nₚ) - Number of Particles",
"SMC()",
"HMC(ϵ, L) - leaprog step size(ϵ) and number of leaprogs steps (L)",
"HMCDA(Nₐ, δ, λ) - Number of samples to use for adaptation (Nₐ), target acceptance ratio (δ), and target leapfrog length(λ)",
"NUTS(Nₐ, δ) - Number of samples to use for adaptation (Nₐ) and target acceptance ratio (δ)"], default = "MH()")
# ╔═╡ 07d408cf-d202-40b2-90c2-5e8630549339
begin
your_sampler = nothing
if chosen_sampler == "MH()"
your_sampler = MH()
elseif chosen_sampler == "PG(Nₚ) - Number of Particles"
your_sampler = PG(2)
elseif chosen_sampler == "SMC()"
your_sampler = SMC()
elseif chosen_sampler == "HMC(ϵ, L) - leaprog step size(ϵ) and number of leaprogs steps (L)"
your_sampler = HMC(0.05, 10)
elseif chosen_sampler == "HMCDA(Nₐ, δ, λ) - Number of samples to use for adaptation (Nₐ), target acceptance ratio (δ), and target leapfrog length(λ)"
your_sampler = HMCDA(10, 0.65, 0.3)
elseif chosen_sampler == "NUTS(Nₐ, δ) - Number of samples to use for adaptation (Nₐ) and target acceptance ratio (δ)"
your_sampler = NUTS(10, 0.65)
end
end
# ╔═╡ 744a8a63-647f-4550-adf7-44354fde44be
begin
chain_coin_2 = sample(coin(coin_flips), your_sampler, 100); # Here is your sampler
summarystats(chain_coin_2)
end
# ╔═╡ e6365296-cd68-430e-99c5-fb571f39aad5
md"""
### 3.1 MOAH CHAINS!!: `MCMCThreads` and `MCMCDistributed`
"""
# ╔═╡ 927ad0a4-ba68-45a6-9bde-561915503e48
md"""
There are some methods of `Turing`'s `sample()` that accept either:
* `MCMCThreads()`: uses multithreading with [`Threads.jl`](https://docs.julialang.org/en/v1/manual/multi-threading/#man-multithreading)
* `MCMCDistributed()`: uses multiprocessing with [`Distributed.jl`](https://docs.julialang.org/en/v1/manual/distributed-computing/), in the spirit of [MPI -- Message Passing Interface](https://en.wikipedia.org/wiki/Message_Passing_Interface)
> If you are using `MCMCDistributed()` don't forget the macro `@everywhere` and the `addprocs()` stuff
Just use `sample(model, sampler, MCMCThreads(), N, chains)`
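For the distributed variant, here is a minimal sketch (the worker count and chain numbers are illustrative, not part of the original workshop):

```julia
using Distributed
addprocs(4)                    # spawn 4 local worker processes
@everywhere using Turing       # packages must be loaded on every worker
# (depending on your setup, the @model definition and the data may also
#  need to be evaluated on the workers with @everywhere)
chain_dist = sample(coin(coin_flips), MH(), MCMCDistributed(), 2_000, 4)
```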
Let's revisit our biased-coin example:
"""
# ╔═╡ ab6c2ba6-4cd8-473a-88c6-b8d61551fb22
begin
chain_coin_parallel = sample(coin(coin_flips), MH(), MCMCThreads(), 2_000, 2);
summarystats(chain_coin_parallel)
end
# ╔═╡ 2ab3c34a-1cfc-4d20-becc-5902d08d03e0
md"""
### 3.2 LOOK MUM NO DATA!!: Prior Predictive Checks `Prior()`
"""
# ╔═╡ 924fcad9-75c1-4707-90ef-3e36947d64fe
md"""
It's very important that we check if our **priors make sense**. This is called **Prior Predictive Check** (Gelman et al., 2020b). Obs: I will not cover **Posterior Predictive Check** in detail because it is mostly the same procedure in `Turing`.
"""
# ╔═╡ fc8e40c3-34a1-4b2e-bd1b-893d7998d359
md"""
$(Resource("https://github.com/storopoli/Turing-Workshop/blob/master/images/bayesian_workflow.png?raw=true", :width => 700))
Based on Gelman et al. (2020b)
"""
# ╔═╡ fb366eb1-4ab0-4e7a-83ed-d531978c06a0
md"""
Predictive checks are a great way to **validate a model**. The idea is to **generate data from the model** using **parameters from draws from the prior or posterior**. *Prior predictive check* is when we simulate data using the model's parameter values drawn from the *prior* distribution, and *posterior predictive check* is when we simulate data using the model's parameter values drawn from the *posterior* distribution.
The workflow we do when specifying and sampling Bayesian models is not linear or acyclic (Gelman et al., 2020b). This means that we need to iterate several times between the different stages in order to find a model that best captures the data-generating process under the desired assumptions.
This is quite easy in `Turing`. We need to create a *prior* distribution for our model. To accomplish this, instead of supplying a MCMC sampler like `NUTS()` or `MH()`, we supply the "sampler" `Prior()` inside `Turing`'s `sample()` function:
"""
# ╔═╡ 0fe83f55-a379-49ea-ab23-9defaab05890
begin
prior_chain_coin = sample(coin(coin_flips), Prior(), 1_000)
summarystats(prior_chain_coin)
end
# ╔═╡ 3aa95b4b-aaf8-45cf-8bc5-05b65b4bcccf
md"""
Now we can perform predictive checks using either the *prior* (`prior_chain_coin`) or the *posterior* (`chain_coin`) distribution. To draw from the prior and posterior predictive distributions we instantiate a "predictive model", i.e. a `Turing` model with the observations set to `missing`, and then call `predict()` on the predictive model and the previously drawn samples.
Let's do the *prior* predictive check:
"""
# ╔═╡ dd27ee5f-e442-42d7-a39b-d76328d2e59f
begin
missing_data = Vector{Missing}(missing, length(coin_flips)); # vector of `missing`
model_predict = coin(missing_data); # instantiate the "predictive model"
prior_check = predict(model_predict, prior_chain_coin);
describe(DataFrame(summarystats(prior_check)))
end
# ╔═╡ c4808b43-bc0f-4254-abf1-1adc19135dc7
md"""
### 3.3 Posterior Predictive Checks
The *posterior* predictive check is trivial, just do the same but with the posterior `chain_coin`:
"""
# ╔═╡ 1773d8c3-4651-4128-9442-e7c858bc4a43
begin
posterior_check = predict(model_predict, chain_coin);
describe(DataFrame(summarystats(posterior_check)))
end
# ╔═╡ 5674f7aa-3205-47c7-8367-244c6419ce69
md"""
## 4. How to inspect chains and plot stuff with `MCMCChains.jl`
"""
# ╔═╡ 83cc80c1-d97e-4b82-872e-e5493d2b62ab
md"""
We can inspect and plot our model's chains and its underlying parameters with [`MCMCChains.jl`](https://turinglang.github.io/MCMCChains.jl/stable/)
1. **Inspecting Chains**
* **Summary Statistics**: just do `summarystats(chain)`
* **Quantiles** (Median, etc.): just do `quantile(chain)`
* What if I just want a **subset** of parameters?: just do `group(chain, :parameter)` or index with `chain[:, 1:6, :]` or `chain[[:parameters,...]]`
"""
# ╔═╡ 475be60f-1876-4086-9725-3bf5f52a3e43
summarystats(chain_coin_parallel)
# ╔═╡ f6bc0cfd-a1d9-48e5-833c-f33bf1b89d45
quantile(chain_coin_parallel)
# ╔═╡ ed640696-cae6-47e1-a4df-0655192e0855
quantile(group(chain_coin_parallel, :p))
# ╔═╡ bc9fa101-8854-4af5-904a-f0b683fb63b1
summarystats(chain_coin_parallel[:, 1:1, :])
# ╔═╡ c82687d1-89d0-4ecd-bed7-1708ba8b2662
md"""
2. **Plotting Chains**: Now we have several options. The default `plot()` recipe will plot a `traceplot()` side-by-side with a `mixeddensity()`.
First, we have to choose either to plot **parameters**(`:parameter`) or **chains**(`:chain`) with the keyword `colordim`.
"""
# ╔═╡ 270c0b90-cce1-4092-9e29-5f9deda2cb7d
plot(chain_coin_parallel; colordim=:chain, dpi=300)
# ╔═╡ c4146b8b-9d11-446e-9765-8d5283a6d445
plot(chain_coin_parallel; colordim=:parameter, dpi=300)
# ╔═╡ 3d09c8c3-ce95-4f26-9136-fedd601e2a70
md"""
Second, we have several plots to choose from:
* `traceplot()`: used for inspecting Markov chain **convergence**
* `meanplot()`: running average plots per iteration
* `density()`: **density** plots
* `histogram()`: **histogram** plots
* `mixeddensity()`: **mixed density** plots
* `autocorplot()`: **autocorrelation** plots
"""
# ╔═╡ 8d9bdae2-658d-45bf-9b25-50b6efbe0cdf
plot(
traceplot(chain_coin_parallel, title="traceplot"),
meanplot(chain_coin_parallel, title="meanplot"),
density(chain_coin_parallel, title="density"),
histogram(chain_coin_parallel, title="histogram"),
mixeddensity(chain_coin_parallel, title="mixeddensity"),
autocorplot(chain_coin_parallel, title="autocorplot"),
dpi=300, size=(840, 600)
)
# ╔═╡ 41b014c2-7b49-4d03-8741-51c91b95f64c
md"""
There is also the option to **construct your own plot** with `plot()` and the keyword `seriestype`:
"""
# ╔═╡ 2f08c6e4-fa7c-471c-ad9f-9d036e3027d5
plot(chain_coin_parallel, seriestype = (:meanplot, :autocorplot), dpi=300)
# ╔═╡ 5f639d2d-bb96-4a33-a78e-d5b9f0e8d274
md"""
Finally there is one special plot that makes a **cornerplot** (requires `StatsPlots`) of parameters in a chain:
> Obs: I will hijack a multi-parameter model from *below* to show the cornerplot
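A minimal, hypothetical sketch of the call (the `chain_wells` name refers to the illustrative logistic-regression chain sketched in Section 5 below; any chain with several parameters would do, and the parameter names are just an example):

```julia
corner(chain_wells, [:α, Symbol("β[1]"), Symbol("β[2]")])
```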
"""
# ╔═╡ c70ebb70-bd96-44a5-85e9-871b0e478b1a
md"""
## 5. Better tricks to avoid `for`-loops inside `@model` (`LazyArrays` and `filldist`)
"""
# ╔═╡ 36258bdd-f617-48f6-91c9-e8bbff78ebd8
md"""
**Using Logistic Regression**
"""
# ╔═╡ 6630eb47-77f6-48e9-aafe-55bda275449c
md"""
First the Naïve model *with* `for`-loops:
"""
# ╔═╡ 37e751c7-8b6c-47d9-8013-97015d1e1fb2
@model logreg(X, y; predictors=size(X, 2)) = begin
#priors
α ~ Normal(0, 2.5)
β = Vector{Float64}(undef, predictors)
for i ∈ 1:predictors
β[i] ~ Normal()
end
#likelihood
for i ∈ 1:length(y)
y[i] ~ BernoulliLogit(α + X[i, :] ⋅ β)
end
end;
# ╔═╡ 7a21e7a0-322b-4f8e-9d8b-a2f452f7e092
md"""
* `Turing`'s `BernoulliLogit()` is a logit-parameterised Bernoulli distribution that converts log-odds to probability.
"""
# ╔═╡ f8f59ebb-bb1e-401f-97b5-507634badb3f
md"""
Now a model *without* `for`-loops
"""
# ╔═╡ 15795f79-7d7b-43d2-a4b4-99ad968a7f72
@model logreg_vectorized(X, y; predictors=size(X, 2)) = begin
#priors
α ~ Normal(0, 2.5)
β ~ filldist(Normal(), predictors)
#likelihood
# y .~ BernoulliLogit.(α .+ X * β)
y ~ arraydist(LazyArray(@~ BernoulliLogit.(α .+ X * β)))
end;
# ╔═╡ dd5fbb2a-4220-4e47-945a-6870b799c50d
md"""
* `Turing`'s `filldist()` function creates a product distribution by filling an array with independent copies of a given distribution; here it places a standard-normal prior on every entry of `β` in a single vectorized statement.
* `Turing`'s `arraydist()` function wraps an array of distributions, returning a new distribution that samples from the individual distributions.
* `LazyArrays`' `LazyArray()` constructor wraps a lazy object that represents a computation producing an `array`. Last, but not least, the macro `@~` creates a broadcast and is a nice shorthand for the familiar dot `.` broadcasting operator in Julia. This is an efficient way to tell Turing that our `y` vector is distributed lazily as a `BernoulliLogit` broadcasted over `α` added to the product of the data matrix `X` and the `β` coefficient vector.
"""
# ╔═╡ 0cc8e12c-9b72-41ec-9c13-d9ae0bdc6100
md"""
For our example, I will use a famous dataset called `wells` (Gelman & Hill, 2007), which is data from a survey of 3,200 residents in a small area of Bangladesh suffering from arsenic contamination of groundwater. Respondents with elevated arsenic levels in their wells had been encouraged to switch their water source to a safe public or private well in the nearby area and the survey was conducted several years later to learn which of the affected residents had switched wells. It has 3,200 observations and the following variables:
* `switch` – binary/dummy (0 or 1) for well-switching.
* `arsenic` – arsenic level in respondent's well.
* `dist` – distance (meters) from the respondent's house to the nearest well with safe drinking water.
* `association` – binary/dummy (0 or 1) if member(s) of household participate in community organizations.
* `educ` – years of education (head of household).
"""
# ╔═╡ fce0f511-3b00-4079-85c6-9b2d2d7c04cb
begin
# Logistic Regression
wells = CSV.read(download("https://github.com/storopoli/Turing-Workshop/blob/master/data/wells.csv?raw=true"), DataFrame);
X_wells = Matrix(select(wells, Not(:switch)));
y_wells = wells[:, :switch];
end
# ╔═╡ 5ba6b247-8277-4100-abe7-8d06af04a011
md"""
Why do that?
1. Well, you'll have nice performance gains
"""
# ╔═╡ 0f000fc4-1a7b-4522-8355-8df572ee8800
with_terminal() do
@btime sample(logreg($X_wells, $y_wells), MH(), 100);
end
# ╔═╡ 8a87e324-f3d9-4162-88ab-3833a6d1fc2e
with_terminal() do
@btime sample(logreg_vectorized($X_wells, $y_wells), MH(), 100);
end
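# Beyond the benchmarks above, a sketch of actual inference on the wells data
# (illustrative settings and variable name, not part of the original workshop):
# NUTS with 1_000 adaptation steps, a 0.65 target acceptance ratio, and 1_000 draws.
begin
chain_wells = sample(logreg_vectorized(X_wells, y_wells), NUTS(1_000, 0.65), 1_000)
summarystats(chain_wells)
end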
# ╔═╡ 3c954cbc-aed7-4d22-b578-a80ce62ebb49
md"""
2. Some [autodiff backends only work without `for`-loops inside the `@model`](https://turing.ml/dev/docs/using-turing/performancetips#special-care-for-codetrackercode-and-codezygotecode):
* [`Tracker.jl`](https://github.com/FluxML/Tracker.jl)
* [`Zygote.jl`](https://github.com/FluxML/Zygote.jl)
"""
# ╔═╡ 521e2473-1aba-43be-951a-25537062891e
md"""
### 5.1 Which [autodiff backend](https://turing.ml/dev/docs/using-turing/autodiff) to use?
"""
# ╔═╡ bafc91d2-8cae-4af8-b5ed-8199eef40c4d
md"""
We have mainly two [types of autodiff](https://en.wikipedia.org/wiki/Automatic_differentiation) (both use the chain rule $\mathbb{R}^N \to \mathbb{R}^M$)
* **Forward Autodiff**: The **independent** variable is fixed and differentiation is performed in a *forward* manner. Preferred when $N < M$
* [`ForwardDiff.jl`](https://github.com/JuliaDiff/ForwardDiff.jl): Turing's current default (as of version 0.16), `:forwarddiff`
* **Reverse Autodiff**: The **dependent** variable is fixed and differentiation is performed in a *backward* manner. Preferred when $N > M$
* [`Tracker.jl`](https://github.com/FluxML/Tracker.jl): `:tracker`
* [`Zygote.jl`](https://github.com/FluxML/Zygote.jl): `:zygote`
* [`ReverseDiff.jl`](https://github.com/JuliaDiff/ReverseDiff.jl): `:reversediff`
Check out this video; it is an awesome way to learn what Automatic Differentiation is!
"""
# ╔═╡ a2292bc1-3379-450d-beb5-ae8f41b69be8
html"""<iframe width="560" height="315" src="https://www.youtube.com/embed/wG_nF1awSSY" title="YouTube video player" frameborder="0" allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture" allowfullscreen></iframe>"""
# ╔═╡ 38055b57-f983-4440-bef5-0ab6d180ff1e
md"""
To change `Turing`'s autodiff backend just type:
```julia
Turing.setadbackend(:zygote)
```
or
```julia
Turing.setadbackend(:tracker)
```
Note that you need to import the backend:
```julia
using Zygote
```
"""
# ╔═╡ 7d4d06ca-f96d-4b1e-860f-d9e0d6eb6723
md"""
## 6. Take me up! Let's get Hierarchical (Hierarchical Models)
"""
# ╔═╡ c64d355f-f5a2-46a5-86f3-2d02da98f305
md"""
Bayesian **hierarchical** models (also called **multilevel** models) are statistical models written at **multiple levels** (hierarchical form) that estimate the posterior distribution of the parameters using the Bayesian approach. The sub-models combine to form the hierarchical model, and **Bayes' theorem is used to integrate them with the observed data** and to account for all the **uncertainty** that is present.
Hierarchical modeling is used when **information is available at several different levels of observation units**. The hierarchical form of analysis and organization helps to understand multiparameter problems and also plays an important role in the development of computational strategies.
"""
# ╔═╡ 262cb245-0bc1-4a36-b0bc-de52c08ccde0
md"""
"Even though observations directly inform only a single set of parameters, the latent population model couples the individual parameters and provides a backdoor for observations to inform all of the contexts. For example the observations from the $k$th context, $y_k$, directly inform the parameters that quantify the behavior of that context, $\theta_k$. Those parameters, however, directly inform the population parameters $\phi$ which then inform all of the other contexts through the population model. Similarly observations that directly inform the other contexts indirectly inform the population parameters which then feeds back into the $k$th context."
[Betancourt (2020)](https://betanalpha.github.io/assets/case_studies/hierarchical_modeling.html)
"""
# ╔═╡ 3ecc92b8-6a10-4f51-93d7-72449e248dc2
Resource("https://github.com/storopoli/Turing-Workshop/blob/master/images/multilevel_models.png?raw=true", :width => 1_000)
# ╔═╡ a0c7ca50-3a3f-483c-ae01-fd774e0c072d
md"""
> figure adapted from [Michael Betancourt (CC-BY-SA-4.0)](https://betanalpha.github.io/assets/case_studies/hierarchical_modeling.html)
"""
# ╔═╡ cb3dd785-11ff-42fe-ab85-0dd03e45209e
md"""
### 6.1 Hyperprior
The priors of the parameters are themselves sampled from another prior on the hyperparameter (the upper level's parameter); these upper-level priors are called hyperpriors. This makes one group's estimates help the model better estimate the other groups by providing more **robust and stable estimates**.
We call the global parameters **population effects** (or population-level effects, also sometimes called fixed effects) and the parameters of each group **group effects** (or group-level effects, also sometimes called random effects). That is why multilevel models are also known as mixed models, in which we have both fixed effects and random effects.
"""
# ╔═╡ 4812f80e-79a9-4519-9e4d-a45127ca6a49
md"""
### 6.2 Three Approaches to Multilevel Models
Multilevel models generally fall into three approaches:
1. **Random-intercept model**: each group receives a **different intercept** in addition to the global intercept.
2. **Random-slope model**: each group receives **different coefficients** for each (or a subset of) independent variable(s) in addition to a global intercept.
3. **Random-intercept-slope model**: each group receives **both a different intercept and different coefficients** for each independent variable in addition to a global intercept.
"""
# ╔═╡ 318697fe-1fbc-4ac3-a2aa-5ecf775072d4
md"""
#### Random-Intercept Model
The first approach is the **random-intercept model** in which we specify a different intercept for each group,
in addition to the global intercept. These group-level intercepts are sampled from a hyperprior.
To illustrate a multilevel model, I will use a linear regression example with a Gaussian/normal likelihood function.
Mathematically a Bayesian multilevel random-intercept linear regression model is:
$$\begin{aligned}
\mathbf{y} &\sim \text{Normal}\left( \alpha + \alpha_j + \mathbf{X} \cdot \boldsymbol{\beta}, \sigma \right) \\
\alpha &\sim \text{Normal}(\mu_\alpha, \sigma_\alpha) \\
\alpha_j &\sim \text{Normal}(0, \tau) \\
\boldsymbol{\beta} &\sim \text{Normal}(\mu_{\boldsymbol{\beta}}, \sigma_{\boldsymbol{\beta}}) \\
\tau &\sim \text{Cauchy}^+(0, \psi_{\alpha})\\
\sigma &\sim \text{Exponential}(\lambda_\sigma)
\end{aligned}$$
"""
# ╔═╡ 9acc7a1c-f638-4a2e-ad67-c16cff125c86
@model varying_intercept(X, idx, y; n_gr=length(unique(idx)), predictors=size(X, 2)) = begin
# priors
α ~ Normal(mean(y), 2.5 * std(y)) # population-level intercept
β ~ filldist(Normal(0, 2), predictors) # population-level coefficients
σ ~ Exponential(1 / std(y)) # residual SD
# prior for variance of random intercepts
# usually requires thoughtful specification
τ ~ truncated(Cauchy(0, 2), 0, Inf) # group-level SDs intercepts
αⱼ ~ filldist(Normal(0, τ), n_gr) # group-level intercepts
# likelihood
ŷ = α .+ X * β .+ αⱼ[idx]
y ~ MvNormal(ŷ, σ)
end;
# ╔═╡ 885fbe97-edd6-44d2-808d-8eeb1e9cb2b4
md"""
#### Random-Slope Model
The second approach is the **random-slope model** in which we specify a different slope for each group,
in addition to the global intercept. These group-level slopes are sampled from a hyperprior.
To illustrate a multilevel model, I will use a linear regression example with a Gaussian/normal likelihood function.
Mathematically a Bayesian multilevel random-slope linear regression model is:
$$\begin{aligned}
\mathbf{y} &\sim \text{Normal}\left( \alpha + \mathbf{X} \cdot \boldsymbol{\beta}_j \cdot \boldsymbol{\tau}, \sigma \right) \\
\alpha &\sim \text{Normal}(\mu_\alpha, \sigma_\alpha) \\
\boldsymbol{\beta}_j &\sim \text{Normal}(0, 1) \\
\boldsymbol{\tau} &\sim \text{Cauchy}^+(0, \psi_{\boldsymbol{\beta}})\\
\sigma &\sim \text{Exponential}(\lambda_\sigma)
\end{aligned}$$
"""
# ╔═╡ 7f526d1f-bd56-4e51-9f7b-ce6b5a2a1853
@model varying_slope(X, idx, y; n_gr=length(unique(idx)), predictors=size(X, 2)) = begin
# priors
α ~ Normal(mean(y), 2.5 * std(y)) # population-level intercept
σ ~ Exponential(1 / std(y)) # residual SD
# prior for variance of random slopes
# usually requires thoughtful specification
τ ~ filldist(truncated(Cauchy(0, 2), 0, Inf), n_gr) # group-level slopes SDs
βⱼ ~ filldist(Normal(0, 1), predictors, n_gr) # group-level standard normal slopes
# likelihood
ŷ = α .+ X * βⱼ * τ
y ~ MvNormal(ŷ, σ)
end;
# ╔═╡ f7971da6-ead8-4679-b8cf-e3c35c93e6cf
md"""
#### Random-intercept-slope Model
The third approach is the **random-intercept-slope model** in which we specify a different intercept
and slope for each group, in addition to the global intercept.
These group-level intercepts and slopes are sampled from hyperpriors.
To illustrate a multilevel model, I will use a linear regression example with a Gaussian/normal likelihood function.
Mathematically a Bayesian multilevel random-intercept-slope linear regression model is:
$$\begin{aligned}
\mathbf{y} &\sim \text{Normal}\left( \alpha + \alpha_j + \mathbf{X} \cdot \boldsymbol{\beta}_j \cdot \boldsymbol{\tau}_{\boldsymbol{\beta}}, \sigma \right) \\
\alpha &\sim \text{Normal}(\mu_\alpha, \sigma_\alpha) \\
\alpha_j &\sim \text{Normal}(0, \tau_{\alpha}) \\
\boldsymbol{\beta}_j &\sim \text{Normal}(0, 1) \\
\tau_{\alpha} &\sim \text{Cauchy}^+(0, \psi_{\alpha})\\
\boldsymbol{\tau}_{\boldsymbol{\beta}} &\sim \text{Cauchy}^+(0, \psi_{\boldsymbol{\beta}})\\
\sigma &\sim \text{Exponential}(\lambda_\sigma)
\end{aligned}$$
"""
# ╔═╡ 546726af-5420-4a4f-8c0c-fe96a2ba43bc
@model varying_intercept_slope(X, idx, y; n_gr=length(unique(idx)), predictors=size(X, 2)) = begin
# priors
α ~ Normal(mean(y), 2.5 * std(y)) # population-level intercept
σ ~ Exponential(1 / std(y)) # residual SD
# prior for variance of random intercepts and slopes
# usually requires thoughtful specification
τₐ ~ truncated(Cauchy(0, 2), 0, Inf) # group-level SDs intercepts
τᵦ ~ filldist(truncated(Cauchy(0, 2), 0, Inf), n_gr) # group-level slopes SDs
αⱼ ~ filldist(Normal(0, τₐ), n_gr) # group-level intercepts
βⱼ ~ filldist(Normal(0, 1), predictors, n_gr) # group-level standard normal slopes
# likelihood
ŷ = α .+ αⱼ[idx] .+ X * βⱼ * τᵦ
y ~ MvNormal(ŷ, σ)
end;
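# A hypothetical usage sketch (synthetic data and illustrative variable names, not
# part of the original workshop). `idx` maps every observation to its group (1:n_gr).
begin
X_syn = randn(200, 2)                          # 200 observations, 2 predictors
idx_syn = rand(1:4, 200)                       # 4 groups, assigned at random
y_syn = X_syn * [1.0, -2.0] .+ [0.0, 1.0, 2.0, 3.0][idx_syn] .+ randn(200)
chain_vi = sample(varying_intercept(X_syn, idx_syn, y_syn), NUTS(), 1_000)
summarystats(chain_vi)
end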
# ╔═╡ 9ebac6ba-d213-4ed8-a1d5-66b841fafa00
md"""
## 7. Crazy Stuff
"""
# ╔═╡ 45c342fd-b893-46aa-b2ee-7c93e7a1d207
md"""
There is a **lot** of *crazy* stuff you can do with `Turing` and Bayesian models.
Here I will cover:
1. **Discrete Parameters (HMM)**
2. **Models with ODEs**
"""
# ╔═╡ d44c7baa-80d2-4fdb-a2de-35806477dd58
md"""
### 7.1 Discrete Parameters (HMM)
"""
# ╔═╡ c1b2d007-1004-42f5-b65c-b4e2e7ff7d8e
Resource("https://github.com/storopoli/Turing-Workshop/blob/master/images/HMM.png?raw=true", :width => 400)
# ╔═╡ c1dcfd47-9e25-470b-a1b3-ab66bfac59d6
md"""
$\mu_1$ = $(@bind μ₁_sim Slider(1:1:10, default = 1, show_value=true))
$\mu_2$ = $(@bind μ₂_sim Slider(1:1:10, default = 5, show_value=true))
"""
# ╔═╡ f1153918-0748-4400-ae8b-3b59f8c5d755
md"""
I **love** [`Stan`](https://mc-stan.org) and use it on a daily basis. But `Stan` has some quirks. In particular, the NUTS and HMC samplers **cannot tolerate discrete parameters**.
Solution? We have to **marginalize** them out.
First, I will show the `Stan` example of a Hidden Markov Model (HMM) with marginalization, and then let's see how `Turing` fares with the same problem.
"""
# ╔═╡ ad6c4533-cd56-4f6f-b10d-d7bc3145ba16
md"""
We have several ways to marginalize discrete parameters in HMM:
1. **Filtering** (a.k.a [Forward Algorithm](https://en.wikipedia.org/wiki/Forward_algorithm)) <---- we'll cover this one
2. **Smoothing** (a.k.a [Forward-Backward Algorithm](https://en.wikipedia.org/wiki/Forward%E2%80%93backward_algorithm))
3. **MAP Estimation** (a.k.a [Viterbi Algorithm](https://en.wikipedia.org/wiki/Viterbi_algorithm))
A very good reference is [Damiano, Peterson & Weylandt (2017)](https://github.com/luisdamiano/stancon18)
"""
# ╔═╡ 2ef397a6-f7fb-4fc2-b918-40ab545ce19f
md"""
#### Forward Algorithm
"""
# ╔═╡ 8d347172-2d26-4d9e-954d-b8924ed4c9e2
md"""
We define the forward variables $\boldsymbol{\alpha}_t$, beginning at time $t = 1$, as follows
$$\boldsymbol{\alpha}_1 = \boldsymbol{\delta}^{(1)} \boldsymbol{\Gamma}(y_1), \qquad \boldsymbol{\alpha}_{t} = \boldsymbol{\alpha}_{t-1} \boldsymbol{\Gamma} \boldsymbol{\delta}^{t-1}(y_{t})$$
where:
* Observed variable of interest at time $t$: $y_t$
* Unobserved transition probability matrix at time $t$: $\boldsymbol{\Gamma}^{(t)} \in \mathbb{R}^{j \times j}$
* Unobserved state distribution at time $t=1$: $\boldsymbol{\delta}^{(1)} \sim \text{Dirichlet}(\boldsymbol{\nu})$
Then, the marginal likelihood is obtained by summing over:
$$\sum^J_{j=1} \alpha_T (j) = \boldsymbol{\alpha}_T \mathbf{1}^{\top}$$
(dot product of the vector of $\alpha$s with a row vector of $1$s)
Note that one of the assumptions is that $\boldsymbol{\Gamma}$ is **full rank** (no linear dependence) and **ergodic** (it converges to a unique stationary distribution as $t \to \infty$)
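To make the recursion concrete, here is a minimal sketch of the forward algorithm on the log scale in plain Julia (the function and the small `logsumexp` helper are written here purely for illustration; they are not part of the original workshop code):

```julia
logsumexp(x) = (m = maximum(x); m + log(sum(exp.(x .- m))))   # numerically stable

function forward_loglik(y, Γ, μ, δ; σ=2)
    K = length(μ)
    lp = log.(δ) .+ logpdf.(Normal.(μ, σ), y[1])   # log α₁
    for t in 2:length(y)
        lp = [logsumexp(lp .+ log.(Γ[:, k])) + logpdf(Normal(μ[k], σ), y[t]) for k in 1:K]
    end
    return logsumexp(lp)                           # marginal log-likelihood
end
```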
"""
# ╔═╡ ca962c0e-4620-4888-b7c3-aa7f6d7899e9
md"""
As an example, I will use the one from [Leos-Barajas & Michelot (2018)](http://arxiv.org/abs/1806.10639).
It's a $2$-state HMM with Gaussian state-dependent distributions for the observation process $X_t$. That is, at each time step $(t=1,2,\dots)$, we have
$$Y_t \mid S_t = j \sim N(\mu_j,\sigma^2)$$
for $j \in \{1, 2\}$
The marginal likelihood of the model can be written with the forward algorithm using a transition matrix $\boldsymbol{\Gamma}$:
$$\boldsymbol{\Gamma}(x_t) =
\begin{pmatrix}
\phi(x_t \vert \mu_1, \sigma^2) & 0 \\
0 & \phi(x_t \vert \mu_2, \sigma^2) \\
\end{pmatrix}$$
where $\phi$ is the Gaussian PDF.
We are interested in knowing $\boldsymbol{\Gamma}$ and also $\boldsymbol{\mu}$!
"""
# ╔═╡ 6fd49295-d0e3-4b54-aeae-e9cd07a5281c
md"""
#### Random Data
"""
# ╔═╡ 58c5460f-c7f4-4a0a-9e18-71b9580e9148
begin
const T = 2 # Number of States
# Transition Probabilities
const Γ = Matrix([0.9 0.1; 0.1 0.9])
# initial distribution set to the stationary distribution
const δ = (Diagonal(ones(T)) - Γ .+ 1) \ ones(T)
# State-Dependent Gaussian means
const μ = [1, 5]
const n_obs = 1_000
S = Vector{Int64}(undef, n_obs)
y = Vector{Float64}(undef, n_obs)
# initialise state and observation
S[1] = sample(1:T, aweights(δ))
y[1] = rand(Normal(μ[S[1]], 2))
# simulate state and observation processes forward
for t in 2:n_obs
S[t] = sample(1:T, aweights(Γ[S[t - 1], :]))
y[t] = rand(Normal(μ[S[t]], 2))
end
end
# ╔═╡ 46ba21ab-bce5-4eed-bd63-aae7340c8180
begin
# State-Dependent Gaussian means
μ_sim = [μ₁_sim, μ₂_sim]
S_sim = Vector{Int64}(undef, n_obs)
y_sim = Vector{Float64}(undef, n_obs)
# initialise state and observation
S_sim[1] = sample(1:T, aweights(δ))
y_sim[1] = rand(Normal(μ_sim[S_sim[1]], 2))
# simulate state and observation processes forward
for t in 2:n_obs
S_sim[t] = sample(1:T, aweights(Γ[S_sim[t - 1], :]))
y_sim[t] = rand(Normal(μ_sim[S_sim[t]], 2))
end
Plots.gr(dpi=300)
scatter(y_sim, mc= S_sim, xlabel=L"t", ylabel=L"y", label=false, ylim=(-5,13), yticks=(vcat(0, μ_sim, 10), vcat("0", "μ₁", "μ₂", "10")))
hline!([μ₁_sim,μ₂_sim], lw=4, label=false, c=:black, style=:dash)
end
# ╔═╡ 5d3d2abb-85e3-4371-926e-61ff236253f1
md"""
Here is the `Stan` code (I've simplified it from Leos-Barajas & Michelot's original code):
> Note that we are using the `log_sum_exp()` trick
"""
# ╔═╡ 247a02e5-8599-43fd-9ee5-32ba8b827477
md"""
```cpp
data {
int<lower=1> K; // number of states
int<lower=1> T; // length of data set
real y[T]; // observations
}
parameters {
positive_ordered[K] mu; // state-dependent parameters
simplex[K] theta[K]; // N x N tpm
}
model{
// priors
mu ~ student_t(3, 0, 1);
for (k in 1:K)
theta[k] ~ dirichlet([0.5, 0.5]);
// Compute the marginal probability over possible sequences
vector[K] acc;
vector[K] lp;
// forward algorithm implementation
for(k in 1:K) // first observation
lp[k] = normal_lpdf(y[1] | mu[k], 2);
for (t in 2:T) { // looping over observations
for (k in 1:K){ // looping over states
acc[k] = log_sum_exp(log(theta[k]) + lp) +
normal_lpdf(y[t] | mu[k], 2);
}
lp = acc;
}
target += log_sum_exp(lp);
}
```
Obs: `log_sum_exp(a, b) = log(exp(a) + exp(b))`
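In practice this is evaluated in a numerically stable way by factoring out the maximum (a standard identity, stated here for completeness):

$$\text{logsumexp}(\mathbf{x}) = m + \log \sum_i \exp(x_i - m), \qquad m = \max_i x_i$$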
"""
# ╔═╡ 6db0245b-0461-4db0-9462-7a5f80f7d589
md"""
Here's how we would do it in `Turing`
> Note the Composite MCMC Sampler
"""
# ╔═╡ b5a79826-151e-416e-b0a2-1a58eec9196c
begin
@model hmm(y, K::Int64; T=length(y)) = begin
# state sequence in a Libtask.TArray
s = tzeros(Int, T)
# Transition Probability Matrix.
θ = Vector{Vector}(undef, K)
# Priors
μ ~ filldist(truncated(TDist(3), 1, 6), K)
for i = 1:K
θ[i] ~ Dirichlet([0.5, 0.5])
end
# Positive Ordered
if any(μ[i] > μ[i+1] for i in 1:(length(μ) - 1))
# update the joint log probability of the samples
# we set it to -Inf such that the samples are rejected
Turing.@addlogprob!(-Inf)
end
# first observation
s[1] ~ Categorical(K)
y[1] ~ Normal(μ[s[1]], 2)
# looping over observations
for i = 2:T
s[i] ~ Categorical(vec(θ[s[i - 1]]))
y[i] ~ Normal(μ[s[i]], 2)
end
end;
composite_sampler = Gibbs(NUTS(10, 0.65, :μ, :θ),
PG(1, :s));
hmm_chain = sample(hmm(y, 2), composite_sampler, 20);
summarystats(hmm_chain[:, 1:6, :]) #only μ and θ
end
# ╔═╡ cd410368-9022-4030-86a0-1d125e76bc62
md"""
> Obs: probably in the future we'll have better implementation for positive ordered constraints in `Turing`. It will reside in the [`Bijectors.jl`](https://github.com/TuringLang/Bijectors.jl) package. Actually check this [PR](https://github.com/TuringLang/Bijectors.jl/pull/186), it seems positive ordered is coming to `Turing`.
"""
# ╔═╡ 9b0b62cb-2c61-4d47-a6c7-09c0c1a75a24
md"""
### 7.2 ODEs in `Turing` (SIR Model)
"""
# ╔═╡ 9b020402-ea15-4f52-9fff-c70d275b97ac
Resource("https://github.com/storopoli/Turing-Workshop/blob/master/images/SIR.png?raw=true", :width => 400)
# ╔═╡ c81f4877-024f-4dc8-b7ce-e781ab6101f3
md"""
The Susceptible-Infected-Recovered (SIR) model splits the population into three time-dependent compartments: the susceptible, the infected (and infectious), and the recovered (and not infectious) compartments. When a susceptible individual comes into contact with an infectious individual, the former can become infected for some time, and then recover and become immune. The dynamics can be summarized in a system of ODEs:
"""
# ╔═╡ f2272fd5-5132-4a6e-b2ff-136dc2fb2903
md"""
$$\begin{aligned}
\frac{dS}{dt} &= -\beta S \frac{I}{N}\\
\frac{dI}{dt} &= \beta S \frac{I}{N} - \gamma I \\
\frac{dR}{dt} &= \gamma I
\end{aligned}$$
where