diff --git a/examples/notebooks/FLARE.ipynb b/examples/notebooks/FLARE.ipynb
index bf837e237..0e833aefa 100644
--- a/examples/notebooks/FLARE.ipynb
+++ b/examples/notebooks/FLARE.ipynb
@@ -53,7 +53,14 @@
{
"cell_type": "code",
"execution_count": null,
- "metadata": {},
+ "metadata": {
+ "nbmake": {
+ "post_cell_execute": [
+ "from conftest import before_notebook",
+ "before_notebook()"
+ ]
+ }
+ },
"outputs": [],
"source": [
"! pip install ragstack-ai"
@@ -699,4 +706,4 @@
},
"nbformat": 4,
"nbformat_minor": 4
-}
+}
\ No newline at end of file
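
The nbmake / post_cell_execute metadata added to the install cell above is read by the nbmake pytest plugin: the listed lines run after the cell's own code, so conftest.before_notebook() executes as soon as "pip install ragstack-ai" finishes when the notebook is run under pytest. The real before_notebook() lives in the repository's conftest and is not part of this patch; the sketch below only illustrates the kind of setup such a hook might perform, and the environment variable names are assumptions, not the repository's actual configuration.

    # conftest.py -- hypothetical sketch, not the repository's actual helper
    import os

    def before_notebook():
        # Fail fast under nbmake when the credentials the example notebooks expect
        # are missing, rather than stalling partway through a cell.
        for var in ("ASTRA_DB_APPLICATION_TOKEN", "ASTRA_DB_API_ENDPOINT", "OPENAI_API_KEY"):
            if not os.environ.get(var):
                raise RuntimeError(f"{var} must be set before running the example notebooks")
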
diff --git a/examples/notebooks/QA_with_cassio.ipynb b/examples/notebooks/QA_with_cassio.ipynb
index b488c9bb1..27b224028 100644
--- a/examples/notebooks/QA_with_cassio.ipynb
+++ b/examples/notebooks/QA_with_cassio.ipynb
@@ -19,11 +19,11 @@
"This notebook guides you through setting up [RAGStack](https://www.datastax.com/products/ragstack) using [Astra Vector Search](https://docs.datastax.com/en/astra-serverless/docs/vector-search/overview.html), [OpenAI](https://openai.com/about), and [CassIO](https://cassio.org/) to implement a generative Q&A over your own documentation.\n",
"\n",
"## Astra Vector Search\n",
- "Astra vector search enables developers to search a database by context or meaning rather than keywords or literal values. This is done by using “embeddings”. Embeddings are a type of representation used in machine learning where high-dimensional or complex data is mapped onto vectors in a lower-dimensional space. These vectors capture the semantic properties of the input data, meaning that similar data points have similar embeddings.\n",
+ "Astra vector search enables developers to search a database by context or meaning rather than keywords or literal values. This is done by using \u201cembeddings\u201d. Embeddings are a type of representation used in machine learning where high-dimensional or complex data is mapped onto vectors in a lower-dimensional space. These vectors capture the semantic properties of the input data, meaning that similar data points have similar embeddings.\n",
"Reference: [Astra Vector Search](https://docs.datastax.com/en/astra-serverless/docs/vector-search/overview.html)\n",
"\n",
"## CassIO\n",
- "CassIO is the ultimate solution for seamlessly integrating Apache Cassandra® with generative artificial intelligence and other machine learning workloads. This powerful Python library simplifies the complicated process of accessing the advanced features of the Cassandra database, including vector search capabilities. With CassIO, developers can fully concentrate on designing and perfecting their AI systems without any concerns regarding the complexities of integration with Cassandra.\n",
+ "CassIO is the ultimate solution for seamlessly integrating Apache Cassandra\u00ae with generative artificial intelligence and other machine learning workloads. This powerful Python library simplifies the complicated process of accessing the advanced features of the Cassandra database, including vector search capabilities. With CassIO, developers can fully concentrate on designing and perfecting their AI systems without any concerns regarding the complexities of integration with Cassandra.\n",
"Reference: [CassIO](https://cassio.org/)\n",
"\n",
"## OpenAI\n",
@@ -72,6 +72,12 @@
},
"editable": true,
"id": "a6d88d66",
+ "nbmake": {
+ "post_cell_execute": [
+ "from conftest import before_notebook",
+ "before_notebook()"
+ ]
+ },
"outputId": "dc543d17-3fb2-4362-cc4e-0050bd7787ba",
"slideshow": {
"slide_type": ""
@@ -543,4 +549,4 @@
},
"nbformat": 4,
"nbformat_minor": 5
-}
+}
\ No newline at end of file
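
The QA_with_cassio introduction above describes embeddings as vectors that capture meaning, with similar inputs landing close together. A self-contained toy illustration of that property (the 3-dimensional vectors are invented for the example; the notebook itself uses OpenAI embeddings of much higher dimensionality):

    # Toy check of "similar data points have similar embeddings" via cosine similarity.
    import math

    def cosine_similarity(a, b):
        dot = sum(x * y for x, y in zip(a, b))
        norm_a = math.sqrt(sum(x * x for x in a))
        norm_b = math.sqrt(sum(y * y for y in b))
        return dot / (norm_a * norm_b)

    cat = [0.9, 0.1, 0.2]        # made-up vector for "cat"
    kitten = [0.85, 0.15, 0.25]  # made-up vector for "kitten"
    invoice = [0.1, 0.9, 0.7]    # made-up vector for "invoice"

    print(cosine_similarity(cat, kitten))   # ~0.99 -> semantically close
    print(cosine_similarity(cat, invoice))  # ~0.30 -> far apart
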
diff --git a/examples/notebooks/RAG_with_cassio.ipynb b/examples/notebooks/RAG_with_cassio.ipynb
index ea07994c0..7a2164e4c 100644
--- a/examples/notebooks/RAG_with_cassio.ipynb
+++ b/examples/notebooks/RAG_with_cassio.ipynb
@@ -67,19 +67,24 @@
"base_uri": "https://localhost:8080/"
},
"id": "2953d95b",
- "outputId": "f1d1a4dc-9984-405d-bf28-74697815ea12",
"nbmake": {
- "post_cell_execute": ["from conftest import before_notebook", "before_notebook()"]
- }
+ "post_cell_execute": [
+ "from conftest import before_notebook",
+ "before_notebook()"
+ ]
+ },
+ "outputId": "f1d1a4dc-9984-405d-bf28-74697815ea12"
+ },
+ "nbmake": {
+ "post_cell_execute": [
+ "before_notebook()"
+ ]
},
"outputs": [],
"source": [
"# install required dependencies\n",
"! pip install -qU ragstack-ai datasets google-cloud-aiplatform "
- ],
- "nbmake": {
- "post_cell_execute": ["before_notebook()"]
- }
+ ]
},
{
"cell_type": "markdown",
@@ -478,4 +483,4 @@
},
"nbformat": 4,
"nbformat_minor": 5
-}
+}
\ No newline at end of file
diff --git a/examples/notebooks/advancedRAG.ipynb b/examples/notebooks/advancedRAG.ipynb
index b122b3c00..eafaebf10 100644
--- a/examples/notebooks/advancedRAG.ipynb
+++ b/examples/notebooks/advancedRAG.ipynb
@@ -55,7 +55,14 @@
{
"cell_type": "code",
"execution_count": null,
- "metadata": {},
+ "metadata": {
+ "nbmake": {
+ "post_cell_execute": [
+ "from conftest import before_notebook",
+ "before_notebook()"
+ ]
+ }
+ },
"outputs": [],
"source": [
"%pip install ragstack-ai"
@@ -1035,4 +1042,4 @@
},
"nbformat": 4,
"nbformat_minor": 4
-}
+}
\ No newline at end of file
diff --git a/examples/notebooks/astradb.ipynb b/examples/notebooks/astradb.ipynb
index 51990d67f..c1c711a72 100644
--- a/examples/notebooks/astradb.ipynb
+++ b/examples/notebooks/astradb.ipynb
@@ -13,7 +13,7 @@
"source": [
"# Astra DB\n",
"\n",
- "This page provides a quickstart for using [Astra DB](https://docs.datastax.com/en/astra/home/astra.html) and [Apache Cassandra®](https://cassandra.apache.org/) as a Vector Store.\n",
+ "This page provides a quickstart for using [Astra DB](https://docs.datastax.com/en/astra/home/astra.html) and [Apache Cassandra\u00ae](https://cassandra.apache.org/) as a Vector Store.\n",
"\n",
"_Note: in addition to access to the database, an OpenAI API Key is required to run the full example._"
]
@@ -39,7 +39,14 @@
"cell_type": "code",
"execution_count": null,
"id": "8d00fcf4-9798-4289-9214-d9734690adfc",
- "metadata": {},
+ "metadata": {
+ "nbmake": {
+ "post_cell_execute": [
+ "from conftest import before_notebook",
+ "before_notebook()"
+ ]
+ }
+ },
"outputs": [],
"source": [
"!pip install --quiet datasets pypdf"
@@ -902,4 +909,4 @@
},
"nbformat": 4,
"nbformat_minor": 5
-}
+}
\ No newline at end of file
diff --git a/examples/notebooks/langchain-unstructured-astra.ipynb b/examples/notebooks/langchain-unstructured-astra.ipynb
index b53674183..71d5d692f 100644
--- a/examples/notebooks/langchain-unstructured-astra.ipynb
+++ b/examples/notebooks/langchain-unstructured-astra.ipynb
@@ -32,6 +32,12 @@
"start_time": "2024-02-13T11:47:41.568768Z"
},
"editable": true,
+ "nbmake": {
+ "post_cell_execute": [
+ "from conftest import before_notebook",
+ "before_notebook()"
+ ]
+ },
"slideshow": {
"slide_type": ""
},
@@ -254,7 +260,7 @@
{
"data": {
"text/html": [
- "
| N | dwss | dn | b | di | Pug | as | gon | | 08 | BORT | PR | base | | 6 | 512 | 2048 | 8 | 64 | 0.1 | 01 | 100K | | 492 | 258 | 65 |
| | | | 1 | 512 | | | | 529 | 249 | |
| | | | 4 | 128 | | | | 500 | 255 | |
(A) | | | | 16 | 32 | | | | 491 | 258 | |
| | | | 32 | 16 | | | | 501 | 254 | |
| | | | | 16 | | | | 516 | 251 | 58 |
®) | | | | | 32 | | | | 501 | 254 | 60 |
©) | 2 | | | | | | | | 6.11 | 237 | 36 |
| 4 | | | | | | | | 519 | 253 | 50 |
| 8 | | | | | | | | 488 | 255 | 80 |
| | 256 | | | 32 | | | | 575 | 245 | 28 |
| | 1024 | | | 128 | | | | 4.66 | 26.0 | 168 |
| | | 1024 | | | | | | 512 | 254 | 53 |
| | | 4096 | | | | | | 475 | 262 | 90 |
| | | | | | 0.0 | | | 577 | 246 | |
| | | | | | 0.2 | | | 495 | 255 | |
D | | | | | | | 0.0 | | 461 | 253 | |
| | | | | | | 0.2 | | 547 | 257 | |
(E) | | | positional | embedding | instead | of sinusoids | | | 4.92 | 25.7 | |
big | | 6 | 1024 | 4096 | 16 | | 0.3 | | 300K | | 433 | 264 | 213 |
"
+ " | N | dwss | dn | b | di | Pug | as | gon | | 08 | BORT | PR | base | | 6 | 512 | 2048 | 8 | 64 | 0.1 | 01 | 100K | | 492 | 258 | 65 |
| | | | 1 | 512 | | | | 529 | 249 | |
| | | | 4 | 128 | | | | 500 | 255 | |
(A) | | | | 16 | 32 | | | | 491 | 258 | |
| | | | 32 | 16 | | | | 501 | 254 | |
| | | | | 16 | | | | 516 | 251 | 58 |
\u00ae) | | | | | 32 | | | | 501 | 254 | 60 |
\u00a9) | 2 | | | | | | | | 6.11 | 237 | 36 |
| 4 | | | | | | | | 519 | 253 | 50 |
| 8 | | | | | | | | 488 | 255 | 80 |
| | 256 | | | 32 | | | | 575 | 245 | 28 |
| | 1024 | | | 128 | | | | 4.66 | 26.0 | 168 |
| | | 1024 | | | | | | 512 | 254 | 53 |
| | | 4096 | | | | | | 475 | 262 | 90 |
| | | | | | 0.0 | | | 577 | 246 | |
| | | | | | 0.2 | | | 495 | 255 | |
D | | | | | | | 0.0 | | 461 | 253 | |
| | | | | | | 0.2 | | 547 | 257 | |
(E) | | | positional | embedding | instead | of sinusoids | | | 4.92 | 25.7 | |
big | | 6 | 1024 | 4096 | 16 | | 0.3 | | 300K | | 433 | 264 | 213 |
"
],
"text/plain": [
""
@@ -269,10 +275,10 @@
"text": [
"development set, newstest2013. We used beam search as described in the previous section, but no checkpoint averaging. We present these results in Table 3.\n",
"In Table 3 rows (A), we vary the number of attention heads and the attention key and value dimensions, keeping the amount of computation constant, as described in Section 3.2.2. While single-head attention is 0.9 BLEU worse than the best setting, quality also drops off with too many heads.\n",
- "In Table 3 rows (B), we observe that reducing the attention key size dk hurts model quality. This suggests that determining compatibility is not easy and that a more sophisticated compatibility function than dot product may be beneficial. We further observe in rows (C) and (D) that, as expected, bigger models are better, and dropout is very helpful in avoiding over-fitting. In row (E) we replace our sinusoidal positional encoding with learned positional embeddings [9], and observe nearly identical results to the base model.\n",
+ "In Table 3 rows (B), we observe that reducing the attention key size dk hurts model quality. This suggests that determining compatibility is not easy and that a more sophisticated compatibility function than dot product may be bene\ufb01cial. We further observe in rows (C) and (D) that, as expected, bigger models are better, and dropout is very helpful in avoiding over-\ufb01tting. In row (E) we replace our sinusoidal positional encoding with learned positional embeddings [9], and observe nearly identical results to the base model.\n",
"6.3 English Constituency Parsing\n",
- "parent: '6.3 English Constituency Parsing' content: To evaluate if the Transformer can generalize to other tasks we performed experiments on English constituency parsing. This task presents specific challenges: the output is subject to strong structural constraints and is significantly longer than the input. Furthermore, RNN sequence-to-sequence models have not been able to attain state-of-the-art results in small-data regimes [37].\n",
- "parent: '6.3 English Constituency Parsing' content: We trained a 4-layer transformer with dmodel = 1024 on the Wall Street Journal (WSJ) portion of the Penn Treebank [25], about 40K training sentences. We also trained it in a semi-supervised setting, using the larger high-confidence and BerkleyParser corpora from with approximately 17M sentences [37]. We used a vocabulary of 16K tokens for the WSJ only setting and a vocabulary of 32K tokens for the semi-supervised setting.\n",
+ "parent: '6.3 English Constituency Parsing' content: To evaluate if the Transformer can generalize to other tasks we performed experiments on English constituency parsing. This task presents speci\ufb01c challenges: the output is subject to strong structural constraints and is signi\ufb01cantly longer than the input. Furthermore, RNN sequence-to-sequence models have not been able to attain state-of-the-art results in small-data regimes [37].\n",
+ "parent: '6.3 English Constituency Parsing' content: We trained a 4-layer transformer with dmodel = 1024 on the Wall Street Journal (WSJ) portion of the Penn Treebank [25], about 40K training sentences. We also trained it in a semi-supervised setting, using the larger high-con\ufb01dence and BerkleyParser corpora from with approximately 17M sentences [37]. We used a vocabulary of 16K tokens for the WSJ only setting and a vocabulary of 32K tokens for the semi-supervised setting.\n",
"parent: '6.3 English Constituency Parsing' content: We performed only a small number of experiments to select the dropout, both attention and residual (section 5.4), learning rates and beam size on the Section 22 development set, all other parameters remained unchanged from the English-to-German base translation model. During inference, we\n",
"9\n",
"Table 4: The Transformer generalizes well to English constituency parsing (Results are on Section 23 of WSJ)\n"
@@ -294,13 +300,13 @@
"name": "stdout",
"output_type": "stream",
"text": [
- "increased the maximum output length to input length + 300. We used a beam size of 21 and ↵ = 0.3 for both WSJ only and the semi-supervised setting.\n",
- "Our results in Table 4 show that despite the lack of task-specific tuning our model performs sur- prisingly well, yielding better results than all previously reported models with the exception of the Recurrent Neural Network Grammar [8].\n",
+ "increased the maximum output length to input length + 300. We used a beam size of 21 and \u21b5 = 0.3 for both WSJ only and the semi-supervised setting.\n",
+ "Our results in Table 4 show that despite the lack of task-speci\ufb01c tuning our model performs sur- prisingly well, yielding better results than all previously reported models with the exception of the Recurrent Neural Network Grammar [8].\n",
"In contrast to RNN sequence-to-sequence models [37], the Transformer outperforms the Berkeley- Parser [29] even when training only on the WSJ training set of 40K sentences.\n",
"7 Conclusion\n",
- "parent: '7 Conclusion' content: In this work, we presented the Transformer, the first sequence transduction model based entirely on attention, replacing the recurrent layers most commonly used in encoder-decoder architectures with multi-headed self-attention.\n",
- "parent: '7 Conclusion' content: For translation tasks, the Transformer can be trained significantly faster than architectures based on recurrent or convolutional layers. On both WMT 2014 English-to-German and WMT 2014 English-to-French translation tasks, we achieve a new state of the art. In the former task our best model outperforms even all previously reported ensembles.\n",
- "parent: '7 Conclusion' content: We are excited about the future of attention-based models and plan to apply them to other tasks. We plan to extend the Transformer to problems involving input and output modalities other than text and to investigate local, restricted attention mechanisms to efficiently handle large inputs and outputs such as images, audio and video. Making generation less sequential is another research goals of ours.\n",
+ "parent: '7 Conclusion' content: In this work, we presented the Transformer, the \ufb01rst sequence transduction model based entirely on attention, replacing the recurrent layers most commonly used in encoder-decoder architectures with multi-headed self-attention.\n",
+ "parent: '7 Conclusion' content: For translation tasks, the Transformer can be trained signi\ufb01cantly faster than architectures based on recurrent or convolutional layers. On both WMT 2014 English-to-German and WMT 2014 English-to-French translation tasks, we achieve a new state of the art. In the former task our best model outperforms even all previously reported ensembles.\n",
+ "parent: '7 Conclusion' content: We are excited about the future of attention-based models and plan to apply them to other tasks. We plan to extend the Transformer to problems involving input and output modalities other than text and to investigate local, restricted attention mechanisms to ef\ufb01ciently handle large inputs and outputs such as images, audio and video. Making generation less sequential is another research goals of ours.\n",
"parent: '7 Conclusion' content: The code we used to train and evaluate our models is available at https://github.com/ tensorflow/tensor2tensor.\n",
"parent: '7 Conclusion' content: Acknowledgements We are grateful to Nal Kalchbrenner and Stephan Gouws for their fruitful comments, corrections and inspiration.\n",
"parent: '7 Conclusion' content: References [1] Jimmy Lei Ba, Jamie Ryan Kiros, and Geoffrey E Hinton. Layer normalization. arXiv preprint\n",
@@ -557,4 +563,4 @@
},
"nbformat": 4,
"nbformat_minor": 4
-}
+}
\ No newline at end of file
diff --git a/examples/notebooks/langchain_evaluation.ipynb b/examples/notebooks/langchain_evaluation.ipynb
index 8ade28b18..3ef18e5bd 100644
--- a/examples/notebooks/langchain_evaluation.ipynb
+++ b/examples/notebooks/langchain_evaluation.ipynb
@@ -47,7 +47,14 @@
{
"cell_type": "code",
"execution_count": 3,
- "metadata": {},
+ "metadata": {
+ "nbmake": {
+ "post_cell_execute": [
+ "from conftest import before_notebook",
+ "before_notebook()"
+ ]
+ }
+ },
"outputs": [],
"source": [
"! pip install -q ragstack-ai"
@@ -648,4 +655,4 @@
},
"nbformat": 4,
"nbformat_minor": 4
-}
+}
\ No newline at end of file
diff --git a/examples/notebooks/langchain_multimodal_gemini.ipynb b/examples/notebooks/langchain_multimodal_gemini.ipynb
index 8347fdd3f..42fb43667 100644
--- a/examples/notebooks/langchain_multimodal_gemini.ipynb
+++ b/examples/notebooks/langchain_multimodal_gemini.ipynb
@@ -69,6 +69,12 @@
},
"editable": true,
"id": "-7fvXorW6wdA",
+ "nbmake": {
+ "post_cell_execute": [
+ "from conftest import before_notebook",
+ "before_notebook()"
+ ]
+ },
"outputId": "a4e137ad-95b5-4601-8ad1-d82bb687d185",
"slideshow": {
"slide_type": ""
@@ -139,7 +145,7 @@
"name": "stdin",
"output_type": "stream",
"text": [
- "Enter Google JSON credentials file: ········\n"
+ "Enter Google JSON credentials file: \u00b7\u00b7\u00b7\u00b7\u00b7\u00b7\u00b7\u00b7\n"
]
}
],
@@ -652,4 +658,4 @@
},
"nbformat": 4,
"nbformat_minor": 4
-}
+}
\ No newline at end of file
diff --git a/examples/notebooks/llama-astra.ipynb b/examples/notebooks/llama-astra.ipynb
index ed76a10f9..1bde8209a 100644
--- a/examples/notebooks/llama-astra.ipynb
+++ b/examples/notebooks/llama-astra.ipynb
@@ -48,7 +48,14 @@
{
"cell_type": "code",
"execution_count": null,
- "metadata": {},
+ "metadata": {
+ "nbmake": {
+ "post_cell_execute": [
+ "from conftest import before_notebook",
+ "before_notebook()"
+ ]
+ }
+ },
"outputs": [],
"source": [
"! pip install ragstack-ai"
@@ -342,7 +349,7 @@
"outputs": [],
"source": [
"# WARNING: This will delete the collection and all documents in the collection\n",
- "# astra_db_store.delete_collection()"
+ "#\u00a0astra_db_store.delete_collection()"
]
}
],
@@ -367,4 +374,4 @@
},
"nbformat": 4,
"nbformat_minor": 4
-}
+}
\ No newline at end of file
diff --git a/examples/notebooks/llama-parse-astra.ipynb b/examples/notebooks/llama-parse-astra.ipynb
index 7b8735bbe..415c57456 100644
--- a/examples/notebooks/llama-parse-astra.ipynb
+++ b/examples/notebooks/llama-parse-astra.ipynb
@@ -30,6 +30,12 @@
"start_time": "2024-02-13T11:47:41.568768Z"
},
"editable": true,
+ "nbmake": {
+ "post_cell_execute": [
+ "from conftest import before_notebook",
+ "before_notebook()"
+ ]
+ },
"slideshow": {
"slide_type": ""
},
@@ -333,7 +339,7 @@
{
"data": {
"text/plain": [
- "'We used beam search as described in the previous section, but no\\ncheckpoint averaging. We present these results in Table 3.\\nIn Table 3 rows (A), we vary the number of attention heads and the attention key and value dimensions,\\nkeeping the amount of computation constant, as described in Section 3.2.2. While single-head\\nattention is 0.9 BLEU worse than the best setting, quality also drops off with too many heads.\\nIn Table 3 rows (B), we observe that reducing the attention key size dk hurts model quality. This\\nsuggests that determining compatibility is not easy and that a more sophisticated compatibility\\nfunction than dot product may be beneficial. We further observe in rows (C) and (D) that, as expected,\\nbigger models are better, and dropout is very helpful in avoiding over-fitting. In row (E) we replace our\\nsinusoidal positional encoding with learned positional embeddings [9], and observe nearly identical\\nresults to the base model.\\n6.3 English Constituency Parsing\\nTo evaluate if the Transformer can generalize to other tasks we performed experiments on English\\nconstituency parsing. This task presents specific challenges: the output is subject to strong structural\\nconstraints and is significantly longer than the input. Furthermore, RNN sequence-to-sequence\\nmodels have not been able to attain state-of-the-art results in small-data regimes [37].\\nWe trained a 4-layer transformer with dmodel = 1024 on the Wall Street Journal (WSJ) portion of the\\nPenn Treebank [25], about 40K training sentences. We also trained it in a semi-supervised setting,\\nusing the larger high-confidence and BerkleyParser corpora from with approximately 17M sentences\\n[37]. We used a vocabulary of 16K tokens for the WSJ only setting and a vocabulary of 32K tokens\\nfor the semi-supervised setting.\\nWe performed only a small number of experiments to select the dropout, both attention and residual\\n(section 5.4), learning rates and beam size on the Section 22 development set, all other parameters\\nremained unchanged from the English-to-German base translation model. During inference, we\\n 9\\n---\\nTable 4: The Transformer generalizes well to English constituency parsing (Results are on Section 23\\nof WSJ)\\n Parser Training WSJ 23 F1\\n Vinyals & Kaiser el al. (2014) [37] WSJ only, discriminative 88.3\\n Petrov et al. (2006) [29] WSJ only, discriminative 90.4\\n Zhu et al. (2013) [40] WSJ only, discriminative 90.4\\n Dyer et al. (2016) [8] WSJ only, discriminative 91.7\\n Transformer (4 layers) WSJ only, discriminative 91.3\\n Zhu et al. (2013) [40] semi-supervised 91.3\\n Huang & Harper (2009) [14] semi-supervised 91.3\\n McClosky et al. (2006) [26] semi-supervised 92.1\\n Vinyals & Kaiser el al. (2014) [37] semi-supervised 92.1\\n Transformer (4 layers) semi-supervised 92.7\\n Luong et al. (2015) [23] multi-task 93.0\\n Dyer et al. (2016) [8] generative 93.3\\nincreased the maximum output length to input length + 300. 
We used a beam size of 21 and α = 0.3\\nfor both WSJ only and the semi-supervised setting.\\nOur results in Table 4 show that despite the lack of task-specific tuning our model performs sur-\\nprisingly well, yielding better results than all previously reported models with the exception of the\\nRecurrent Neural Network Grammar [8].\\nIn contrast to RNN sequence-to-sequence models [37], the Transformer outperforms the Berkeley-\\nParser [29] even when training only on the WSJ training set of 40K sentences.\\n7 Conclusion\\nIn this work, we presented the Transformer, the first sequence transduction model based entirely on\\nattention, replacing the recurrent layers most commonly used in encoder-decoder architectures with\\nmulti-headed self-attention.\\nFor translation tasks, the Transformer can be trained significantly faster than architectures based\\non recurrent or convolutional layers.'"
+ "'We used beam search as described in the previous section, but no\\ncheckpoint averaging. We present these results in Table 3.\\nIn Table 3 rows (A), we vary the number of attention heads and the attention key and value dimensions,\\nkeeping the amount of computation constant, as described in Section 3.2.2. While single-head\\nattention is 0.9 BLEU worse than the best setting, quality also drops off with too many heads.\\nIn Table 3 rows (B), we observe that reducing the attention key size dk hurts model quality. This\\nsuggests that determining compatibility is not easy and that a more sophisticated compatibility\\nfunction than dot product may be beneficial. We further observe in rows (C) and (D) that, as expected,\\nbigger models are better, and dropout is very helpful in avoiding over-fitting. In row (E) we replace our\\nsinusoidal positional encoding with learned positional embeddings [9], and observe nearly identical\\nresults to the base model.\\n6.3 English Constituency Parsing\\nTo evaluate if the Transformer can generalize to other tasks we performed experiments on English\\nconstituency parsing. This task presents specific challenges: the output is subject to strong structural\\nconstraints and is significantly longer than the input. Furthermore, RNN sequence-to-sequence\\nmodels have not been able to attain state-of-the-art results in small-data regimes [37].\\nWe trained a 4-layer transformer with dmodel = 1024 on the Wall Street Journal (WSJ) portion of the\\nPenn Treebank [25], about 40K training sentences. We also trained it in a semi-supervised setting,\\nusing the larger high-confidence and BerkleyParser corpora from with approximately 17M sentences\\n[37]. We used a vocabulary of 16K tokens for the WSJ only setting and a vocabulary of 32K tokens\\nfor the semi-supervised setting.\\nWe performed only a small number of experiments to select the dropout, both attention and residual\\n(section 5.4), learning rates and beam size on the Section 22 development set, all other parameters\\nremained unchanged from the English-to-German base translation model. During inference, we\\n 9\\n---\\nTable 4: The Transformer generalizes well to English constituency parsing (Results are on Section 23\\nof WSJ)\\n Parser Training WSJ 23 F1\\n Vinyals & Kaiser el al. (2014) [37] WSJ only, discriminative 88.3\\n Petrov et al. (2006) [29] WSJ only, discriminative 90.4\\n Zhu et al. (2013) [40] WSJ only, discriminative 90.4\\n Dyer et al. (2016) [8] WSJ only, discriminative 91.7\\n Transformer (4 layers) WSJ only, discriminative 91.3\\n Zhu et al. (2013) [40] semi-supervised 91.3\\n Huang & Harper (2009) [14] semi-supervised 91.3\\n McClosky et al. (2006) [26] semi-supervised 92.1\\n Vinyals & Kaiser el al. (2014) [37] semi-supervised 92.1\\n Transformer (4 layers) semi-supervised 92.7\\n Luong et al. (2015) [23] multi-task 93.0\\n Dyer et al. (2016) [8] generative 93.3\\nincreased the maximum output length to input length + 300. 
We used a beam size of 21 and \u03b1 = 0.3\\nfor both WSJ only and the semi-supervised setting.\\nOur results in Table 4 show that despite the lack of task-specific tuning our model performs sur-\\nprisingly well, yielding better results than all previously reported models with the exception of the\\nRecurrent Neural Network Grammar [8].\\nIn contrast to RNN sequence-to-sequence models [37], the Transformer outperforms the Berkeley-\\nParser [29] even when training only on the WSJ training set of 40K sentences.\\n7 Conclusion\\nIn this work, we presented the Transformer, the first sequence transduction model based entirely on\\nattention, replacing the recurrent layers most commonly used in encoder-decoder architectures with\\nmulti-headed self-attention.\\nFor translation tasks, the Transformer can be trained significantly faster than architectures based\\non recurrent or convolutional layers.'"
]
},
"execution_count": 16,
@@ -392,4 +398,4 @@
},
"nbformat": 4,
"nbformat_minor": 4
-}
+}
\ No newline at end of file
diff --git a/examples/notebooks/nvidia.ipynb b/examples/notebooks/nvidia.ipynb
index f74e35cea..4091bfad5 100644
--- a/examples/notebooks/nvidia.ipynb
+++ b/examples/notebooks/nvidia.ipynb
@@ -50,7 +50,14 @@
{
"cell_type": "code",
"execution_count": 1,
- "metadata": {},
+ "metadata": {
+ "nbmake": {
+ "post_cell_execute": [
+ "from conftest import before_notebook",
+ "before_notebook()"
+ ]
+ }
+ },
"outputs": [],
"source": [
"! pip install -qU ragstack-ai langchain-nvidia-ai-endpoints datasets"
@@ -355,4 +362,4 @@
},
"nbformat": 4,
"nbformat_minor": 4
-}
+}
\ No newline at end of file
diff --git a/examples/notebooks/qa-maximal-marginal-relevance.ipynb b/examples/notebooks/qa-maximal-marginal-relevance.ipynb
index a92e85f9c..45a7dc1c3 100644
--- a/examples/notebooks/qa-maximal-marginal-relevance.ipynb
+++ b/examples/notebooks/qa-maximal-marginal-relevance.ipynb
@@ -52,7 +52,14 @@
"cell_type": "code",
"execution_count": 1,
"id": "7672da56",
- "metadata": {},
+ "metadata": {
+ "nbmake": {
+ "post_cell_execute": [
+ "from conftest import before_notebook",
+ "before_notebook()"
+ ]
+ }
+ },
"outputs": [],
"source": [
"from langchain.indexes.vectorstore import VectorStoreIndexWrapper\n",
@@ -706,4 +713,4 @@
},
"nbformat": 4,
"nbformat_minor": 5
-}
+}
\ No newline at end of file
diff --git a/examples/notebooks/quickstart.ipynb b/examples/notebooks/quickstart.ipynb
index 8e5d774c7..29557f912 100644
--- a/examples/notebooks/quickstart.ipynb
+++ b/examples/notebooks/quickstart.ipynb
@@ -50,6 +50,12 @@
"execution_count": 3,
"metadata": {
"editable": true,
+ "nbmake": {
+ "post_cell_execute": [
+ "from conftest import before_notebook",
+ "before_notebook()"
+ ]
+ },
"slideshow": {
"slide_type": ""
},
@@ -342,4 +348,4 @@
},
"nbformat": 4,
"nbformat_minor": 4
-}
+}
\ No newline at end of file
diff --git a/scripts/format-example-notebooks.py b/scripts/format-example-notebooks.py
index 1898b160a..c7a77fabb 100755
--- a/scripts/format-example-notebooks.py
+++ b/scripts/format-example-notebooks.py
@@ -6,8 +6,8 @@ def main():
nb_path = os.path.join("examples", "notebooks")
for root, dirs, files in os.walk(nb_path):
for file in files:
- if file.endswith(".ipynb"):
- file = os.path.join(root, file)
+ file = os.path.join(root, file)
+ if ".ipynb_checkpoints" not in file and file.endswith(".ipynb"):
print("Formatting file: ", file)
with open(file, "r") as f:
contents = f.read()
@@ -23,8 +23,9 @@ def main():
break
if not found:
raise ValueError("No code cells found in file: ", file)
- with open(file, "w") as f:
- f.write(contents)
+ with open(file, "w") as f:
+ f.write(json.dumps(as_json, indent=1, sort_keys=True))
+
if __name__ == "__main__":
main()
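
The formatting-script change above explains most of the churn in the notebook diffs: rewriting each notebook with json.dumps(as_json, indent=1, sort_keys=True) escapes every non-ASCII character (the default ensure_ascii=True turns ® into \u00ae, a curly quote into \u201c, the fi ligature into \ufb01, and so on), sorts metadata keys alphabetically (which is why outputId now follows nbmake in RAG_with_cassio.ipynb), and pretty-prints nested lists one element per line. A quick standalone check of that behaviour, using an illustrative metadata snippet:

    # Demonstrates the serialization the updated script applies (the data here is illustrative).
    import json

    metadata = {
        "outputId": "dc543d17-3fb2-4362-cc4e-0050bd7787ba",
        "nbmake": {"post_cell_execute": ["from conftest import before_notebook", "before_notebook()"]},
        "caption": "Apache Cassandra®",
    }

    # ensure_ascii defaults to True, so the registered-trademark sign becomes \u00ae;
    # sort_keys=True orders the keys: caption, nbmake, outputId; indent=1 expands lists.
    print(json.dumps(metadata, indent=1, sort_keys=True))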