Skip to content

Commit

Permalink
Update the method of audio extraction
Browse files Browse the repository at this point in the history
  • Loading branch information
openvino-dev-samples committed Nov 19, 2024
1 parent 655ab9c commit cce7f97
Showing 1 changed file with 20 additions and 113 deletions.
133 changes: 20 additions & 113 deletions notebooks/multimodal-rag/multimodal-rag-llamaindex.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 1,
"id": "ad6c48df",
"metadata": {},
"outputs": [
Expand All @@ -74,6 +74,8 @@
" \"transformers>=4.45\" \\\n",
" \"pytube\" \\\n",
" \"moviepy==1.0.3\" \\\n",
" \"librosa\" \\\n",
" \"python-ffmpeg<=1.0.16\" \\\n",
" \"yt-dlp\" \\\n",
" \"open_clip_torch\" \\\n",
" \"gradio>=4.44.1\" --extra-index-url https://download.pytorch.org/whl/cpu"
Expand All @@ -99,7 +101,7 @@
},
{
"cell_type": "code",
"execution_count": 1,
"execution_count": 3,
"id": "2c61cb01-9c46-46e3-bf22-20c4ca0da417",
"metadata": {},
"outputs": [],
Expand Down Expand Up @@ -134,7 +136,7 @@
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": 4,
"id": "b4d0e724",
"metadata": {},
"outputs": [],
Expand Down Expand Up @@ -162,7 +164,7 @@
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": 5,
"id": "d2ea678c",
"metadata": {},
"outputs": [],
Expand Down Expand Up @@ -190,35 +192,10 @@
},
{
"cell_type": "code",
"execution_count": 64,
"execution_count": 6,
"id": "eedcf36c",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Loading checkpoint shards: 100%|██████████████████| 2/2 [00:01<00:00, 1.56it/s]\n",
"The repository for microsoft/Phi-3.5-vision-instruct contains custom code which must be executed to correctly load the model. You can inspect the repository content at https://hf.co/microsoft/Phi-3.5-vision-instruct.\n",
"You can avoid this prompt in future by passing the argument `trust_remote_code=True`.\n",
"\n",
"Do you wish to run the custom code? [y/N] The class `optimum.bettertransformers.transformation.BetterTransformer` is deprecated and will be removed in a future release.\n",
"WARNING:root:Cannot apply model.to_bettertransformer because of the exception:\n",
"The model type phi3_v is not yet supported to be used with BetterTransformer. Feel free to open an issue at https://github.com/huggingface/optimum/issues if you would like this model type to be supported. Currently supported models are: dict_keys(['albert', 'bark', 'bart', 'bert', 'bert-generation', 'blenderbot', 'bloom', 'camembert', 'blip-2', 'clip', 'codegen', 'data2vec-text', 'deit', 'distilbert', 'electra', 'ernie', 'fsmt', 'gpt2', 'gptj', 'gpt_neo', 'gpt_neox', 'hubert', 'layoutlm', 'm2m_100', 'marian', 'markuplm', 'mbart', 'opt', 'pegasus', 'rembert', 'prophetnet', 'roberta', 'roc_bert', 'roformer', 'splinter', 'tapas', 't5', 'vilt', 'vit', 'vit_mae', 'vit_msn', 'wav2vec2', 'xlm-roberta', 'yolos']).. Usage model with stateful=True may be non-effective if model does not contain torch.functional.scaled_dot_product_attention\n",
"/home2/ethan/intel/openvino_notebooks/openvino_venv/lib/python3.10/site-packages/transformers/modeling_attn_mask_utils.py:116: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs!\n",
" if (input_shape[-1] > 1 or self.sliding_window is not None) and self.is_causal:\n",
"/home2/ethan/intel/openvino_notebooks/openvino_venv/lib/python3.10/site-packages/optimum/exporters/onnx/model_patcher.py:306: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs!\n",
" if past_key_values_length > 0:\n",
"/home/ethan/.cache/huggingface/modules/transformers_modules/microsoft/Phi-3.5-vision-instruct/4a0d683eba9f1d0cbfb6151705d1ee73c25a80ca/modeling_phi3_v.py:444: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs!\n",
" seq_len = seq_len or torch.max(position_ids) + 1\n",
"/home/ethan/.cache/huggingface/modules/transformers_modules/microsoft/Phi-3.5-vision-instruct/4a0d683eba9f1d0cbfb6151705d1ee73c25a80ca/modeling_phi3_v.py:445: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs!\n",
" if seq_len > self.original_max_position_embeddings:\n",
"/home2/ethan/intel/openvino_notebooks/openvino_venv/lib/python3.10/site-packages/nncf/torch/dynamic_graph/wrappers.py:86: TracerWarning: torch.tensor results are registered as constants in the trace. You can safely ignore this warning if you use this function to create tensors out of constant variables that would be the same every time you call this function. In any other case, this might cause the trace to be incorrect.\n",
" op1 = operator(*args, **kwargs)\n",
"Exporting tokenizers to OpenVINO is not supported for tokenizers version > 0.19 and openvino version <= 2024.4. Please downgrade to tokenizers version <= 0.19 to export tokenizers to OpenVINO.\n"
]
}
],
"outputs": [],
"source": [
"vlm_model_id = \"microsoft/Phi-3.5-vision-instruct\"\n",
"vlm_model_path = Path(vlm_model_id.split(\"/\")[-1]) / \"FP16\"\n",
Expand All @@ -229,71 +206,16 @@
},
{
"cell_type": "code",
"execution_count": 65,
"execution_count": 7,
"id": "f4e131e3-0ab4-4e9e-ab0e-e68e7793cba5",
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "5ff5e94e92bf4420b9201433439a1f2c",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Output()"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"></pre>\n"
],
"text/plain": []
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"INFO:nncf:Statistics of the bitwidth distribution:\n",
"┍━━━━━━━━━━━━━━━━┯━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┯━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┑\n",
"│ Num bits (N) │ % all parameters (layers) │ % ratio-defining parameters (layers) │\n",
"┝━━━━━━━━━━━━━━━━┿━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┿━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┥\n",
"│ 8 │ 42% (54 / 129) │ 40% (53 / 128) │\n",
"├────────────────┼─────────────────────────────┼────────────────────────────────────────┤\n",
"│ 4 │ 58% (75 / 129) │ 60% (75 / 128) │\n",
"┕━━━━━━━━━━━━━━━━┷━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┷━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┙\n"
"INFO:nncf:NNCF initialized successfully. Supported frameworks detected: torch, onnx, openvino\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "3c01b13647964d3ba085e0367d4ab897",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Output()"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"></pre>\n"
],
"text/plain": []
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
Expand Down Expand Up @@ -344,7 +266,7 @@
},
{
"cell_type": "code",
"execution_count": 66,
"execution_count": 8,
"id": "093464db-893e-4813-a6cc-19473a1a890c",
"metadata": {},
"outputs": [],
Expand Down Expand Up @@ -375,22 +297,22 @@
},
{
"cell_type": "code",
"execution_count": 67,
"execution_count": 9,
"id": "b6636cd0",
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "d789f277251b4502b2dff572e2391976",
"model_id": "c3943b9a6b844f6fa53437caff903b4c",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Dropdown(description='Device:', index=2, options=('CPU', 'GPU', 'AUTO'), value='AUTO')"
]
},
"execution_count": 67,
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
Expand All @@ -415,7 +337,7 @@
},
{
"cell_type": "code",
"execution_count": 68,
"execution_count": 14,
"id": "534c83b8-a8f4-499f-bfad-6799fdbabe8c",
"metadata": {},
"outputs": [],
Expand All @@ -431,32 +353,16 @@
},
{
"cell_type": "code",
"execution_count": 69,
"execution_count": 15,
"id": "d300f17f-bf8d-4cc2-a61a-86fbb2529b3d",
"metadata": {},
"outputs": [],
"source": [
"from pytube import YouTube\n",
"import librosa\n",
"from moviepy.video.io.VideoFileClip import VideoFileClip\n",
"\n",
"\n",
"def download_video(url, output_path):\n",
" \"\"\"\n",
" Download a video from a given url and save it to the output path.\n",
"\n",
" Params:\n",
" url (str): The url of the video to download.\n",
" output_path (str): The path to save the video to.\n",
"\n",
" Returns:\n",
" dict: A dictionary containing the metadata of the video.\n",
" \"\"\"\n",
" yt = YouTube(url)\n",
" metadata = {\"Author\": yt.author, \"Title\": yt.title, \"Views\": yt.views}\n",
" yt.streams.get_highest_resolution().download(output_path=output_path, filename=\"input_vid.mp4\")\n",
" return metadata\n",
"\n",
"\n",
"def video_to_images(video_path, output_folder):\n",
" \"\"\"\n",
" Convert a video to a sequence of images and save them to the output folder.\n",
Expand Down Expand Up @@ -495,7 +401,8 @@
"        text (str): The text recognized from the audio.\n",
"\n",
" \"\"\"\n",
" result = pipe(audio_path, return_timestamps=True)\n",
" en_raw_speech, samplerate = librosa.load(audio_path, sr=16000)\n",
" result = pipe(en_raw_speech, return_timestamps=True)\n",
"\n",
" return result[\"text\"]"
]
Expand All @@ -510,7 +417,7 @@
},
{
"cell_type": "code",
"execution_count": 28,
"execution_count": 16,
"id": "8f45641a",
"metadata": {},
"outputs": [
Expand Down

0 comments on commit cce7f97

Please sign in to comment.