From d298d28c2763e8f8dfa3089b1231b11772fad0d9 Mon Sep 17 00:00:00 2001 From: Ekaterina Aidova Date: Mon, 27 Nov 2023 13:02:54 +0400 Subject: [PATCH] return red-pajama back working in llmchatbot (#1492) --- .../254-llm-chatbot/254-llm-chatbot.ipynb | 376 ++++++------------ 1 file changed, 111 insertions(+), 265 deletions(-) diff --git a/notebooks/254-llm-chatbot/254-llm-chatbot.ipynb b/notebooks/254-llm-chatbot/254-llm-chatbot.ipynb index 275f0e73450..61158b53fcb 100644 --- a/notebooks/254-llm-chatbot/254-llm-chatbot.ipynb +++ b/notebooks/254-llm-chatbot/254-llm-chatbot.ipynb @@ -28,10 +28,10 @@ "- [Select model for inference](#Select-model-for-inference)\n", "- [login to huggingfacehub to get access to pretrained model](#login-to-huggingfacehub-to-get-access-to-pretrained-model)\n", "- [Instantiate Model using Optimum Intel](#Instantiate-Model-using-Optimum-Intel)\n", - "- [Compress model weights $\\uparrow$(#Table-of-content:)](#Compress-model-weights-\\uparrow(#Table-of-content:))\n", - " - [Weights Compression using Optimum Intel $\\uparrow$(#Table-of-content:)](#Weights-Compression-using-Optimum-Intel-\\uparrow(#Table-of-content:))\n", - " - [Weights Compression using NNCF $\\uparrow$(#Table-of-content:)](#Weights-Compression-using-NNCF-\\uparrow(#Table-of-content:))\n", - "- [Select device for inference and model variant $\\uparrow$(#Table-of-content:)](#Select-device-for-inference-and-model-variant-\\uparrow(#Table-of-content:))\n", + "- [Compress model weights](#Compress-model-weights)\n", + " - [Weights Compression using Optimum Intel](#Weights-Compression-using-Optimum-Intel)\n", + " - [Weights Compression using NNCF](#Weights-Compression-using-NNCF)\n", + "- [Select device for inference and model variant](#Select-device-for-inference-and-model-variant)\n", "- [Run Chatbot](#Run-Chatbot)\n" ] }, @@ -55,10 +55,11 @@ }, "outputs": [], "source": [ + "%pip uninstall -q -y openvino-dev openvino openvino-nightly\n", "%pip install -q --extra-index-url https://download.pytorch.org/whl/cpu\\\n", "\"git+https://github.com/huggingface/optimum-intel.git\"\\\n", - "\"openvino>=2023.2.0\"\\\n", - "\"git+https://github.com/openvinotoolkit/nncf.git\"\\\n", + "\"nncf>=2.7\"\\\n", + "\"openvino-nightly\"\\\n", "\"gradio\"\\\n", "\"onnx\" \"einops\" \"transformers_stream_generator\" \"tiktoken\" \"transformers>=4.34.0\"" ] @@ -101,7 +102,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 2, "id": "f93282b6-f1f1-4153-84af-31aac79c3ef4", "metadata": { "tags": [] @@ -114,7 +115,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 3, "id": "8d22fedb-d1f6-4306-b910-efac5b849c7c", "metadata": { "tags": [] @@ -123,15 +124,15 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "b91a151a2bc8493bad8072cf55077618", + "model_id": "a843a649f0bc4ce48ae835437cbf0852", "version_major": 2, "version_minor": 0 }, "text/plain": [ - "Dropdown(description='Model:', index=5, options=('red-pajama-3b-chat', 'llama-2-chat-7b', 'mpt-7b-chat', 'qwen…" + "Dropdown(description='Model:', options=('red-pajama-3b-chat', 'llama-2-chat-7b', 'mpt-7b-chat', 'qwen-7b-chat'…" ] }, - "execution_count": 2, + "execution_count": 3, "metadata": {}, "output_type": "execute_result" } @@ -141,7 +142,7 @@ "\n", "model_id = widgets.Dropdown(\n", " options=model_ids,\n", - " value=model_ids[-1],\n", + " value=model_ids[0],\n", " description=\"Model:\",\n", " disabled=False,\n", ")\n", @@ -151,7 +152,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 4, 
"id": "906022ec-96bf-41a9-9447-789d2e875250", "metadata": { "tags": [] @@ -161,7 +162,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "Selected model chatglm2-6b\n" + "Selected model red-pajama-3b-chat\n" ] } ], @@ -202,7 +203,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 5, "id": "8cd910c2", "metadata": {}, "outputs": [ @@ -217,15 +218,10 @@ "name": "stderr", "output_type": "stream", "text": [ - "2023-11-20 21:46:26.883173: I tensorflow/core/util/port.cc:111] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.\n", - "2023-11-20 21:46:26.885587: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.\n", - "2023-11-20 21:46:26.916596: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered\n", - "2023-11-20 21:46:26.916622: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered\n", - "2023-11-20 21:46:26.916646: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered\n", - "2023-11-20 21:46:26.922413: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.\n", - "2023-11-20 21:46:26.923979: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.\n", + "2023-11-27 12:20:34.867719: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.\n", + "2023-11-27 12:20:34.907808: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.\n", "To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.\n", - "2023-11-20 21:46:27.585064: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT\n" + "2023-11-27 12:20:35.741323: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT\n" ] } ], @@ -248,13 +244,13 @@ "id": "13694bf8-ee7b-4186-a3e0-a8705be9733c", "metadata": {}, "source": [ - "## Compress model weights [$\\uparrow$](#Table-of-content:)\n", + "## Compress model weights\n", "[back to top ⬆️](#Table-of-contents:)\n", "\n", "The Weights Compression algorithm is aimed at compressing the weights of the models and can be used to optimize the model footprint and performance of large models where the size of weights is relatively larger than the size of activations, for example, Large Language Models (LLM). 
Compared to INT8 compression, INT4 compression improves performance even more, but introduces a minor drop in prediction quality.\n",
    "\n",
    "\n",
-    "### Weights Compression using Optimum Intel [$\\uparrow$](#Table-of-content:)\n",
+    "### Weights Compression using Optimum Intel\n",
    "[back to top ⬆️](#Table-of-contents:)\n",
    "\n",
    "To enable weight compression via NNCF for models supported by Optimum Intel, the `OVQuantizer` class should be used with an `OVModelForCausalLM` model. `OVQuantizer.quantize(save_directory=save_dir, weights_only=True)` enables weight compression. We will consider how to do it using RedPajama, LLAMA, and Zephyr as examples.\n",
    "\n",
    ">**Note**: There may be no speedup for INT4/INT8 compressed models on dGPU.\n",
    "\n",
-    "### Weights Compression using NNCF [$\\uparrow$](#Table-of-content:)\n",
+    "### Weights Compression using NNCF\n",
    "[back to top ⬆️](#Table-of-contents:)\n",
    "\n",
    "You can also perform weight compression for OpenVINO models using NNCF directly. The `nncf.compress_weights` function accepts an OpenVINO model instance and compresses the weights of its Linear and Embedding layers. We will consider this variant based on the MPT model.\n",
    "\n",
@@ -274,7 +270,7 @@
 },
 {
   "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": 6,
   "id": "91eb2ccf",
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
-       "model_id": "5ce22b29dc524be8a0de838b04fcc8d1",
+       "model_id": "bff7eafbed074831a9e6fd5ef51f7dab",
       "version_major": 2,
       "version_minor": 0
      },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
-       "model_id": "9c930931da8b4070a617d8ed34022a09",
+       "model_id": "6ca4f283f8404c32ba855e3d1fba46ed",
       "version_major": 2,
       "version_minor": 0
      },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
-       "model_id": "a830f797c8a2404baa5df8648b9d8e5b",
+       "model_id": "d9f7da9cf2764dbf8d30ae706d26d936",
       "version_major": 2,
       "version_minor": 0
      },
   "source": [
    "from IPython.display import display\n",
    "\n",
-    "# TODO: red-pajama-3b-chat currently can't be compiled in INT4 or FP16 due to ticket 123973\n",
-    "is_pajama_model = model_id.value == \"red-pajama-3b-chat\"\n",
    "prepare_int4_model = widgets.Checkbox(\n",
-    "    value=True and not is_pajama_model,\n",
+    "    value=True,\n",
    "    description=\"Prepare INT4 model\",\n",
-    "    disabled=is_pajama_model,\n",
+    "    disabled=False,\n",
    ")\n",
    "prepare_int8_model = widgets.Checkbox(\n",
-    "    value=False or is_pajama_model,\n",
+    "    value=False,\n",
    "    description=\"Prepare INT8 model\",\n",
    "    disabled=False,\n",
    ")\n",
    "prepare_fp16_model = widgets.Checkbox(\n",
    "    value=False,\n",
    "    description=\"Prepare FP16 model\",\n",
-    "    disabled=is_pajama_model,\n",
+    "    disabled=False,\n",
    ")\n",
    "\n",
    "display(prepare_int4_model)\n",
@@ -362,7 +356,7 @@
 },
 {
   "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": 7,
   "id": "c4ef9112",
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [
    {
-     "data": {
-      "application/vnd.jupyter.widget-view+json": {
-       "model_id": "696946162a804d038a5f8a4bb1653a52",
-       "version_major": 2,
-       "version_minor": 0
-      },
-      "text/plain": [
-       "Loading checkpoint shards: 0%| | 0/7 [00:00 True\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "WARNING:tensorflow:Please fix your imports. Module tensorflow.python.training.tracking.base has been moved to tensorflow.python.trackable.base. 
The old module will be deleted in version 2.11.\n" + ] }, { "name": "stderr", "output_type": "stream", "text": [ - "/home/ethan/.cache/huggingface/modules/transformers_modules/THUDM/chatglm2-6b/7fabe56db91e085c9c027f56f1c654d137bdba40/modeling_chatglm.py:818: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs!\n", - " if (attention_mask is not None and not attention_mask.all()) or (past_key_values and seq_length != 1):\n", - "/home/ethan/intel/openvino_notebooks/openvino_env/lib/python3.11/site-packages/torch/jit/_trace.py:160: UserWarning: The .grad attribute of a Tensor that is not a leaf Tensor is being accessed. Its .grad attribute won't be populated during autograd.backward(). If you indeed want the .grad field to be populated for a non-leaf Tensor, use .retain_grad() on the non-leaf Tensor. If you access the non-leaf Tensor by mistake, make sure you access the leaf Tensor instead. See github.com/pytorch/pytorch/pull/30531 for more informations. (Triggered internally at aten/src/ATen/core/TensorBody.h:489.)\n", - " if a.grad is not None:\n" + "[ WARNING ] Please fix your imports. Module %s has been moved to %s. The old module will be deleted in version %s.\n", + "/home/ea/work/openvino_notebooks/test_env/lib/python3.8/site-packages/transformers/models/gpt_neox/modeling_gpt_neox.py:616: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs!\n", + " assert batch_size > 0, \"batch_size has to be defined and > 0\"\n", + "/home/ea/work/openvino_notebooks/test_env/lib/python3.8/site-packages/transformers/models/gpt_neox/modeling_gpt_neox.py:313: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs!\n", + " if seq_len > self.max_seq_len_cached:\n", + "/home/ea/work/openvino_notebooks/test_env/lib/python3.8/site-packages/transformers/models/gpt_neox/modeling_gpt_neox.py:235: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs!\n", + " if key_length > self.bias.shape[-1]:\n", + "/home/ea/work/openvino_notebooks/test_env/lib/python3.8/site-packages/nncf/torch/dynamic_graph/wrappers.py:75: TracerWarning: torch.tensor results are registered as constants in the trace. You can safely ignore this warning if you use this function to create tensors out of constant variables that would be the same every time you call this function. 
In any other case, this might cause the trace to be incorrect.\n",
    "  op1 = operator(*args, **kwargs)\n"
   ]
  },
  {
   "data": {
    "application/vnd.jupyter.widget-view+json": {
-     "model_id": "ddd891f09894430dab1b6f791aa40523",
+     "model_id": "2f2ce938515746b78bf0bcb19490ad49",
     "version_major": 2,
     "version_minor": 0
    },
@@ -435,7 +439,7 @@
  {
   "data": {
    "application/vnd.jupyter.widget-view+json": {
-     "model_id": "ec557260de1247e68c99459a3b7e5b6e",
+     "model_id": "47d4f3a44128477c8f75e00fd720d7eb",
     "version_major": 2,
     "version_minor": 0
    },
@@ -536,19 +540,30 @@
    "def convert_to_int4():\n",
    "    compression_configs = {\n",
    "        \"zephyr-7b-beta\": {\n",
-    "            \"mode\": nncf.CompressWeightsMode.INT4_ASYM,\n",
+    "            \"mode\": nncf.CompressWeightsMode.INT4_SYM,\n",
    "            \"group_size\": 64,\n",
    "            \"ratio\": 0.6,\n",
    "        },\n",
    "        \"llama-2-chat-7b\": {\n",
-    "            \"mode\": nncf.CompressWeightsMode.INT4_ASYM,\n",
+    "            \"mode\": nncf.CompressWeightsMode.INT4_SYM,\n",
    "            \"group_size\": 128,\n",
    "            \"ratio\": 0.8,\n",
    "        },\n",
    "        \"chatglm2-6b\": {\n",
-    "            \"mode\": nncf.CompressWeightsMode.INT4_ASYM,\n",
+    "            \"mode\": nncf.CompressWeightsMode.INT4_SYM,\n",
    "            \"group_size\": 128,\n",
    "            \"ratio\": 0.72,\n",
+    "            \"ignored_scope\": nncf.IgnoredScope([\"__module.transformer/aten::index_67/Gather\"]),\n",
+    "        },\n",
+    "        \"qwen-7b-chat\": {\n",
+    "            \"mode\": nncf.CompressWeightsMode.INT4_SYM,\n",
+    "            \"group_size\": 128,\n",
+    "            \"ratio\": 0.6,\n",
+    "        },\n",
+    "        \"red-pajama-3b-chat\": {\n",
+    "            \"mode\": nncf.CompressWeightsMode.INT4_ASYM,\n",
+    "            \"group_size\": 128,\n",
+    "            \"ratio\": 0.6,\n",
    "        },\n",
    "        \"default\": {\n",
    "            \"mode\": nncf.CompressWeightsMode.INT4_ASYM,\n",
@@ -564,7 +579,6 @@
    "        return\n",
    "    int4_model_dir.mkdir(parents=True, exist_ok=True)\n",
    "    if not model_configuration[\"remote\"]:\n",
-    "        # TODO: remove compression via NNCF for non-MPT models when INT4 weight compression is added to optimum-intel\n",
    "        if not fp16_model_dir.exists():\n",
    "            model = OVModelForCausalLM.from_pretrained(\n",
    "                pt_model_id, export=True, compile=False\n",
    "            )\n",
@@ -575,7 +589,7 @@
    "            gc.collect()\n",
    "        else:\n",
    "            ov_model = ov.Core().read_model(fp16_model_dir / \"openvino_model.xml\")\n",
-    "            shutil.copy(fp16_model_dir / \"config.json\", int8_model_dir / \"config.json\")\n",
+    "            shutil.copy(fp16_model_dir / \"config.json\", int4_model_dir / \"config.json\")\n",
    "\n",
    "    else:\n",
    "        convert_to_fp16()\n",
@@ -616,7 +630,7 @@
 },
 {
   "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": 8,
   "id": "281f1d07-998e-4e13-ba95-0264564ede82",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
-      "Size of FP16 model is 11912.69 MB\n",
-      "Size of model with INT4 compressed weights is 4082.52 MB\n",
-      "Compression rate for INT4 model: 2.918\n"
+      "Size of model with INT4 compressed weights is 1740.61 MB\n"
     ]
    }
   ],
@@ -653,7 +665,7 @@
   "id": "6d62f9f4-5434-4550-b372-c86b5a5089d5",
   "metadata": {},
   "source": [
-    "## Select device for inference and model variant [$\\uparrow$](#Table-of-content:)\n",
+    "## Select device for inference and model variant\n",
    "[back to top ⬆️](#Table-of-contents:)\n",
    "\n",
    ">**Note**: There may be no speedup for INT4/INT8 compressed models on dGPU."
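The per-model dictionaries added in the hunk above map one-to-one onto `nncf.compress_weights` keyword arguments. As a reference, here is a minimal sketch of how the new `red-pajama-3b-chat` entry is applied to an exported FP16 IR; the directory layout mirrors the notebook's `FP16` / `INT4_compressed_weights` convention, and the exact paths are illustrative assumptions rather than part of this patch:

```python
# Minimal sketch: apply the red-pajama-3b-chat compression config via NNCF.
# The paths below are assumptions based on the directory layout used in this
# notebook; the config values come from compression_configs above.
from pathlib import Path

import nncf
import openvino as ov

fp16_model_dir = Path("red-pajama-3b-chat/FP16")
int4_model_dir = Path("red-pajama-3b-chat/INT4_compressed_weights")
int4_model_dir.mkdir(parents=True, exist_ok=True)

core = ov.Core()
ov_model = core.read_model(fp16_model_dir / "openvino_model.xml")

# INT4_ASYM with group_size=128; ratio=0.6 sends ~60% of the weights to INT4
# and keeps the remaining ~40% in INT8 to limit the accuracy drop.
compressed_model = nncf.compress_weights(
    ov_model,
    mode=nncf.CompressWeightsMode.INT4_ASYM,
    group_size=128,
    ratio=0.6,
)
ov.save_model(compressed_model, int4_model_dir / "openvino_model.xml")
```

Broadly, `INT4_SYM` is the faster but slightly less accurate variant, which is consistent with the patch switching the larger 6B/7B models to symmetric mode while the 3B RedPajama entry stays asymmetric.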
@@ -661,7 +673,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 9, "id": "837b4a3b-ccc3-4004-9577-2b2c7b802dea", "metadata": { "tags": [] @@ -670,15 +682,15 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "90b9a35d7a5444b49f135f7af31354a2", + "model_id": "1b25cb757d934f22a07a6c5038a044d4", "version_major": 2, "version_minor": 0 }, "text/plain": [ - "Dropdown(description='Device:', options=('CPU', 'AUTO'), value='CPU')" + "Dropdown(description='Device:', options=('CPU', 'GPU', 'AUTO'), value='CPU')" ] }, - "execution_count": 8, + "execution_count": 9, "metadata": {}, "output_type": "execute_result" } @@ -705,7 +717,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 10, "id": "5333ab9b-ff5d-4a7f-bcdc-9cca5d56dc0a", "metadata": { "tags": [] @@ -727,7 +739,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 11, "id": "3536a1a7", "metadata": { "collapsed": false, @@ -739,15 +751,15 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "9ceb8b09a8a743c4876063e7e76da761", + "model_id": "3c2c23bf37364951ad74821ffef7579c", "version_major": 2, "version_minor": 0 }, "text/plain": [ - "Dropdown(description='Model to run:', options=('INT4', 'FP16'), value='INT4')" + "Dropdown(description='Model to run:', options=('INT4',), value='INT4')" ] }, - "execution_count": 10, + "execution_count": 11, "metadata": {}, "output_type": "execute_result" } @@ -773,7 +785,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 12, "id": "7a041101-7336-40fd-96c9-cd298015a0f3", "metadata": { "tags": [] @@ -783,7 +795,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "Loading model from chatglm2-6b/INT4_compressed_weights\n" + "Loading model from red-pajama-3b-chat/INT4_compressed_weights\n" ] }, { @@ -828,7 +840,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 13, "id": "8f6f7596-5677-4931-875b-aaabfa23cabc", "metadata": {}, "outputs": [ @@ -836,7 +848,8 @@ "name": "stderr", "output_type": "stream", "text": [ - "/home/ethan/intel/openvino_notebooks/notebooks/254-llm-chatbot/ovmodel.py:398: FutureWarning: `shared_memory` is deprecated and will be removed in 2024.0. Value of `shared_memory` is going to override `share_inputs` value. Please use only `share_inputs` explicitly.\n", + "Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.\n", + "/home/ea/work/openvino_notebooks/test_env/lib/python3.8/site-packages/optimum/intel/openvino/modeling_decoder.py:388: FutureWarning: `shared_memory` is deprecated and will be removed in 2024.0. Value of `shared_memory` is going to override `share_inputs` value. 
Please use only `share_inputs` explicitly.\n", " self.request.start_async(inputs, shared_memory=True)\n" ] }, @@ -844,7 +857,8 @@ "name": "stdout", "output_type": "stream", "text": [ - "2 + 2 = 4\n" + "2 + 2 = 3\n", + "\n" ] } ], @@ -901,17 +915,27 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 14, "id": "01f8f7f8-072e-45dc-b7c9-18d8c3c47754", "metadata": { "tags": [] }, "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/ea/work/openvino_notebooks/test_env/lib/python3.8/site-packages/gradio/blocks.py:890: UserWarning: api_name user already exists, using user_1\n", + " warnings.warn(f\"api_name {api_name} already exists, using {api_name_}\")\n", + "/home/ea/work/openvino_notebooks/test_env/lib/python3.8/site-packages/gradio/blocks.py:890: UserWarning: api_name bot already exists, using bot_1\n", + " warnings.warn(f\"api_name {api_name} already exists, using {api_name_}\")\n" + ] + }, { "name": "stdout", "output_type": "stream", "text": [ - "Running on local URL: http://127.0.0.1:7861\n", + "Running on local URL: http://127.0.0.1:7860\n", "\n", "To create a public link, set `share=True` in `launch()`.\n" ] @@ -919,7 +943,7 @@ { "data": { "text/html": [ - "
" + "
" ], "text/plain": [ "" @@ -932,7 +956,7 @@ "data": { "text/plain": [] }, - "execution_count": 13, + "execution_count": 14, "metadata": {}, "output_type": "execute_result" } @@ -1272,18 +1296,10 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 15, "id": "7b837f9e-4152-4a5c-880a-ed874aa64a74", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Closing server running on port: 7861\n" - ] - } - ], + "outputs": [], "source": [ "# please run this cell for stopping gradio interface\n", "# demo.close()" @@ -1306,181 +1322,11 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.4" + "version": "3.8.10" }, "widgets": { "application/vnd.jupyter.widget-state+json": { - "state": { - "0dcaad88693442a6b1f2d764ec57384b": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "2.0.0", - "model_name": "CheckboxModel", - "state": { - "description": "Prepare FP16 model", - "disabled": false, - "layout": "IPY_MODEL_ecea11ca618c4fa9837133da9c4b6a88", - "style": "IPY_MODEL_bdc7ccf323714773948b0c421392e8a0", - "value": false - } - }, - "1001a1d9d0ad4c81a819776c25ff014d": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "2.0.0", - "model_name": "CheckboxStyleModel", - "state": { - "description_width": "" - } - }, - "2beacad6a9d74671b540f82ed6a7c398": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "2.0.0", - "model_name": "LayoutModel", - "state": {} - }, - "3fe2c896dfbf4eba9f9514605fa2378c": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "2.0.0", - "model_name": "DescriptionStyleModel", - "state": { - "description_width": "" - } - }, - "43609c48dadd466a9d9f3f6aefad7044": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "2.0.0", - "model_name": "DropdownModel", - "state": { - "_options_labels": [ - "INT4" - ], - "description": "Model to run:", - "index": 0, - "layout": "IPY_MODEL_9abd09fc1c974f0f9901012e312f9bd2", - "style": "IPY_MODEL_ac5407b3423649eaa8d6996443f31ffb" - } - }, - "4d923db7f48a44e58ee781d780d7c33f": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "2.0.0", - "model_name": "CheckboxModel", - "state": { - "description": "Prepare INT8 model", - "disabled": false, - "layout": "IPY_MODEL_aac9e3150e054fe887f6294b01471620", - "style": "IPY_MODEL_60a04b4251d44c1c9923dd63d3e4abbe", - "value": false - } - }, - "5668fa59f9d24b41b52527195cec1e32": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "2.0.0", - "model_name": "CheckboxModel", - "state": { - "description": "Prepare INT4 model", - "disabled": false, - "layout": "IPY_MODEL_2beacad6a9d74671b540f82ed6a7c398", - "style": "IPY_MODEL_1001a1d9d0ad4c81a819776c25ff014d", - "value": true - } - }, - "60a04b4251d44c1c9923dd63d3e4abbe": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "2.0.0", - "model_name": "CheckboxStyleModel", - "state": { - "description_width": "" - } - }, - "9abd09fc1c974f0f9901012e312f9bd2": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "2.0.0", - "model_name": "LayoutModel", - "state": {} - }, - "aac9e3150e054fe887f6294b01471620": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "2.0.0", - "model_name": "LayoutModel", - "state": {} - }, - "ac5407b3423649eaa8d6996443f31ffb": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "2.0.0", - "model_name": 
"DescriptionStyleModel", - "state": { - "description_width": "" - } - }, - "af9cb335826f44389c2f3b56287af4bb": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "2.0.0", - "model_name": "LayoutModel", - "state": {} - }, - "ba12593b6126467ebb19401833927b8d": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "2.0.0", - "model_name": "LayoutModel", - "state": {} - }, - "bcb41169ec57464da1c679a3a7afe622": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "2.0.0", - "model_name": "DropdownModel", - "state": { - "_options_labels": [ - "red-pajama-3b-chat", - "llama-2-chat-7b", - "mpt-7b-chat", - "qwen-7b-chat", - "chatglm2-6b", - "zephyr-7b-beta" - ], - "description": "Model:", - "index": 5, - "layout": "IPY_MODEL_af9cb335826f44389c2f3b56287af4bb", - "style": "IPY_MODEL_3fe2c896dfbf4eba9f9514605fa2378c" - } - }, - "bdc7ccf323714773948b0c421392e8a0": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "2.0.0", - "model_name": "CheckboxStyleModel", - "state": { - "description_width": "" - } - }, - "dabb84314cac4774bbcb30a47270b9c8": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "2.0.0", - "model_name": "DropdownModel", - "state": { - "_options_labels": [ - "CPU", - "GPU", - "AUTO" - ], - "description": "Device:", - "index": 0, - "layout": "IPY_MODEL_ba12593b6126467ebb19401833927b8d", - "style": "IPY_MODEL_f607fb49b5b645bba103615605469ca0" - } - }, - "ecea11ca618c4fa9837133da9c4b6a88": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "2.0.0", - "model_name": "LayoutModel", - "state": {} - }, - "f607fb49b5b645bba103615605469ca0": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "2.0.0", - "model_name": "DescriptionStyleModel", - "state": { - "description_width": "" - } - } - }, + "state": {}, "version_major": 2, "version_minor": 0 }