Merge pull request #814 from ftnext/feature/faster-whisper
Support Faster Whisper (with less copy and paste)
ftnext authored Dec 30, 2024
2 parents 1a56fd5 + e663cb0 commit dd67b7c
Showing 10 changed files with 407 additions and 28 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/unittests.yml
@@ -44,7 +44,7 @@ jobs:
- name: Install Python dependencies (Ubuntu, <=3.12)
if: matrix.os == 'ubuntu-latest' && matrix.python-version != '3.13'
run: |
python -m pip install .[dev,audio,pocketsphinx,google-cloud,whisper-local,openai,groq]
python -m pip install .[dev,audio,pocketsphinx,google-cloud,whisper-local,faster-whisper,openai,groq]
- name: Install Python dependencies (Ubuntu, 3.13)
if: matrix.os == 'ubuntu-latest' && matrix.python-version == '3.13'
run: |
@@ -53,7 +53,7 @@
- name: Install Python dependencies (Windows)
if: matrix.os == 'windows-latest'
run: |
python -m pip install .[dev,whisper-local,google-cloud,openai,groq]
python -m pip install .[dev,whisper-local,faster-whisper,google-cloud,openai,groq]
- name: Test with unittest
run: |
pytest --doctest-modules -v speech_recognition/recognizers/ tests/
8 changes: 8 additions & 0 deletions README.rst
@@ -97,6 +97,7 @@ To use all of the functionality of the library, you should have:
* **FLAC encoder** (required only if the system is not x86-based Windows/Linux/OS X)
* **Vosk** (required only if you need to use Vosk API speech recognition ``recognizer_instance.recognize_vosk``)
* **Whisper** (required only if you need to use Whisper ``recognizer_instance.recognize_whisper``)
* **Faster Whisper** (required only if you need to use Faster Whisper ``recognizer_instance.recognize_faster_whisper``)
* **openai** (required only if you need to use OpenAI Whisper API speech recognition ``recognizer_instance.recognize_openai``)
* **groq** (required only if you need to use Groq Whisper API speech recognition ``recognizer_instance.recognize_groq``)

@@ -179,6 +180,13 @@ Whisper is **required if and only if you want to use whisper** (``recognizer_ins

You can install it with ``python3 -m pip install SpeechRecognition[whisper-local]``.

Faster Whisper (for Faster Whisper users)
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

The library `faster-whisper <https://pypi.org/project/faster-whisper/>`__ is **required if and only if you want to use Faster Whisper** (``recognizer_instance.recognize_faster_whisper``).

You can install it with ``python3 -m pip install SpeechRecognition[faster-whisper]``.
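
Once installed, a minimal usage sketch (the audio file name below is only an example):

    import speech_recognition as sr

    r = sr.Recognizer()
    with sr.AudioFile("english.wav") as source:  # any WAV/AIFF/FLAC file you have on hand
        audio = r.record(source)  # read the entire file into an AudioData instance

    # attached as Recognizer.recognize_faster_whisper when faster-whisper is importable
    print(r.recognize_faster_whisper(audio, model="base"))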

OpenAI Whisper API (for OpenAI Whisper API users)
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

9 changes: 7 additions & 2 deletions reference/library-reference.rst
@@ -290,11 +290,16 @@ Returns the most likely transcription if ``show_all`` is false (the default). Ot

Raises a ``speech_recognition.UnknownValueError`` exception if the speech is unintelligible. Raises a ``speech_recognition.RequestError`` exception if the speech recognition operation failed, if the key isn't valid, or if there is no internet connection.

``recognizer_instance.recognize_whisper(audio_data: AudioData, model: str="base", show_dict: bool=False, load_options=None, **transcribe_options):``
----------------------------------------------------------------------------------------------------------------------------------------------------
``recognizer_instance.recognize_whisper(audio_data: AudioData, model: str="base", show_dict: bool=False, load_options=None, **transcribe_options)``
---------------------------------------------------------------------------------------------------------------------------------------------------

.. autofunction:: speech_recognition.recognizers.whisper_local.whisper.recognize

``recognizer_instance.recognize_faster_whisper(audio_data: AudioData, model: str="base", show_dict: bool=False, **transcribe_options)``
---------------------------------------------------------------------------------------------------------------------------------------

.. autofunction:: speech_recognition.recognizers.whisper_local.faster_whisper.recognize
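
For illustration, a sketch of calling it with ``show_dict=True`` (``r`` and ``audio_data`` are assumed to be an existing ``Recognizer`` and ``AudioData``):

    result = r.recognize_faster_whisper(audio_data, model="base", show_dict=True)
    print(result["text"])      # full transcription
    print(result["language"])  # language detected by faster-whisper
    for segment in result["segments"]:
        # each Segment carries start/end timestamps and its own text
        print(segment.start, segment.end, segment.text)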

``recognizer_instance.recognize_openai(audio_data: AudioData, model = "whisper-1", **kwargs)``
----------------------------------------------------------------------------------------------

3 changes: 3 additions & 0 deletions setup.cfg
@@ -3,6 +3,7 @@ dev =
pytest
pytest-randomly
respx
numpy
audio =
PyAudio >= 0.2.11
pocketsphinx =
@@ -12,6 +13,8 @@ google-cloud =
whisper-local =
openai-whisper
soundfile
faster-whisper =
faster-whisper
openai =
openai
httpx < 0.28
3 changes: 2 additions & 1 deletion speech_recognition/__init__.py
@@ -1392,13 +1392,14 @@ def flush(self, *args, **kwargs):
try:
from .recognizers import google, google_cloud
from .recognizers.whisper_api import groq, openai
from .recognizers.whisper_local import whisper
from .recognizers.whisper_local import faster_whisper, whisper
except (ModuleNotFoundError, ImportError):
pass
else:
Recognizer.recognize_google = google.recognize_legacy
Recognizer.recognize_google_cloud = google_cloud.recognize
Recognizer.recognize_whisper = whisper.recognize
Recognizer.recognize_faster_whisper = faster_whisper.recognize
Recognizer.recognize_openai = openai.recognize
Recognizer.recognize_whisper_api = openai.recognize # Deprecated
Recognizer.recognize_groq = groq.recognize
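
Because the optional-backend imports above sit in a single try/except, ``recognize_faster_whisper`` only appears on ``Recognizer`` when every listed import succeeds; if any optional dependency is missing, none of these methods are attached. A quick availability check might look like this (sketch, not part of the diff):

    import speech_recognition as sr

    if hasattr(sr.Recognizer, "recognize_faster_whisper"):
        print("Faster Whisper backend is available")
    else:
        print("Install it with: pip install SpeechRecognition[faster-whisper]")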
13 changes: 7 additions & 6 deletions speech_recognition/recognizers/whisper_local/base.py
@@ -1,13 +1,18 @@
from __future__ import annotations

import io
from typing import Any, Protocol
from typing import TYPE_CHECKING, Any, Protocol

from speech_recognition.audio import AudioData

if TYPE_CHECKING:
import numpy as np


class Transcribable(Protocol):
def transcribe(self, audio_array, **kwargs) -> str | dict[str, Any]:
def transcribe(
self, audio_array: np.ndarray, **kwargs
) -> str | dict[str, Any]:
pass


@@ -32,10 +37,6 @@ def recognize(
audio_array, sampling_rate = sf.read(wav_stream)
audio_array = audio_array.astype(np.float32)

if "fp16" not in kwargs:
import torch

kwargs["fp16"] = torch.cuda.is_available()
result = self.model.transcribe(audio_array, **kwargs)

if show_dict:
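
Any object with a matching ``transcribe`` method satisfies the ``Transcribable`` protocol above, which is what lets ``WhisperCompatibleRecognizer`` drive both openai-whisper and faster-whisper through the adapters added in this PR. A hypothetical stub, e.g. for a unit test:

    import numpy as np

    class FakeTranscribable:
        """Hypothetical stand-in satisfying the Transcribable protocol."""

        def transcribe(self, audio_array: np.ndarray, **kwargs) -> dict:
            # pretend to recognize speech; just report how many samples arrived
            return {"text": f"{audio_array.shape[0]} samples", "language": "en"}

    compat = WhisperCompatibleRecognizer(FakeTranscribable())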
106 changes: 106 additions & 0 deletions speech_recognition/recognizers/whisper_local/faster_whisper.py
@@ -0,0 +1,106 @@
from __future__ import annotations

from typing import TYPE_CHECKING, Literal, TypedDict

from speech_recognition.audio import AudioData
from speech_recognition.recognizers.whisper_local.base import (
WhisperCompatibleRecognizer,
)

if TYPE_CHECKING:
import numpy as np
from faster_whisper import WhisperModel
from faster_whisper.transcribe import Segment
from typing_extensions import Unpack


class TranscribeOutput(TypedDict):
text: str
segments: list[Segment]
language: str


class TranscribableAdapter:
def __init__(self, model: WhisperModel) -> None:
self.model = model

def transcribe(
self, audio_array: np.ndarray, **kwargs
) -> TranscribeOutput:
segments_generator, info = self.model.transcribe(audio_array, **kwargs)
segments = list(segments_generator)
return {
"text": " ".join(segment.text for segment in segments),
"segments": segments,
"language": info.language,
}


class InitOptionalParameters(TypedDict, total=False):
# https://github.com/SYSTRAN/faster-whisper/blob/v1.1.0/faster_whisper/transcribe.py#L575
    device: Literal["cpu", "cuda", "auto"]  # faster-whisper/CTranslate2 expects "cuda", not "gpu", for GPU execution
compute_type: str
download_root: str
# TODO Add others


class TranscribeOptionalParameters(TypedDict, total=False):
# https://github.com/SYSTRAN/faster-whisper/blob/v1.1.0/faster_whisper/transcribe.py#L692
language: str
task: Literal["transcribe", "translate"]
beam_size: int
# TODO Add others


def recognize(
recognizer,
audio_data: AudioData,
model: str = "base",
show_dict: bool = False,
init_options: InitOptionalParameters | None = None,
**transcribe_options: Unpack[TranscribeOptionalParameters],
) -> str | TranscribeOutput:
"""Performs speech recognition on ``audio_data`` (an ``AudioData`` instance), using Whisper.
Pick ``model`` size (Same as Whisper).
If ``show_dict`` is true, returns the detailed response from Whisper, including the detected language. Otherwise returns only the transcription.
You can specify:
* ``language``: recognition language, an uncapitalized 2 letters language name like "en" or "fr".
* If not set, Faster Whisper will automatically detect the language.
* ``task``
* If you want transcribe + **translate** to english, set ``task="translate"``.
Other values are passed directly to whisper. See https://github.com/SYSTRAN/faster-whisper/blob/master/faster_whisper/transcribe.py for all options.
"""
from faster_whisper import WhisperModel

model = WhisperModel(model, **init_options or {})
whisper_recognizer = WhisperCompatibleRecognizer(
TranscribableAdapter(model)
)
return whisper_recognizer.recognize(
audio_data, show_dict=show_dict, **transcribe_options
)


if __name__ == "__main__":
import argparse

import speech_recognition as sr

parser = argparse.ArgumentParser()
parser.add_argument("audio_file")
args = parser.parse_args()

r = sr.Recognizer()
with sr.AudioFile(args.audio_file) as source:
audio_data = r.listen(source)

transcription = recognize(None, audio_data)
print(transcription)
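
Two points worth noting about the module above: faster-whisper's ``WhisperModel.transcribe`` returns a lazy segment generator plus an info object, and ``TranscribableAdapter`` eagerly joins the segments into the whisper-style dict that ``WhisperCompatibleRecognizer`` expects; the ``__main__`` block should also allow a quick manual check, roughly ``python -m speech_recognition.recognizers.whisper_local.faster_whisper path/to/audio.wav``. A sketch of exercising the adapter directly (``samples`` is assumed to be a 16 kHz float32 NumPy array):

    from faster_whisper import WhisperModel

    from speech_recognition.recognizers.whisper_local.faster_whisper import (
        TranscribableAdapter,
    )

    adapter = TranscribableAdapter(WhisperModel("base", device="cpu"))
    output = adapter.transcribe(samples, language="en", beam_size=5)
    print(output["language"], output["text"])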
28 changes: 25 additions & 3 deletions speech_recognition/recognizers/whisper_local/whisper.py
@@ -8,8 +8,10 @@
)

if TYPE_CHECKING:
import numpy as np
import torch
from typing_extensions import Unpack
from whisper import Whisper


class LoadModelOptionalParameters(TypedDict, total=False):
@@ -52,6 +54,21 @@ class TranscribeOutput(TypedDict):
language: str


class TranscribableAdapter:
def __init__(self, model: Whisper) -> None:
self.model = model

def transcribe(
self, audio_array: np.ndarray, **kwargs
) -> TranscribeOutput:
if "fp16" not in kwargs:
import torch

kwargs["fp16"] = torch.cuda.is_available()

return self.model.transcribe(audio_array, **kwargs)


def recognize(
recognizer,
audio_data: AudioData,
@@ -70,17 +87,22 @@ def recognize(
You can specify:
* ``language``: recognition language, an uncapitalized full language name like "english" or "chinese". See the full language list at https://github.com/openai/whisper/blob/main/whisper/tokenizer.py
* If not set, Whisper will automatically detect the language.
* ``task``
* If you want transcribe + **translate**, set ``task="translate"``.
* If you want transcribe + **translate** to english, set ``task="translate"``.
Other values are passed directly to whisper. See https://github.com/openai/whisper/blob/main/whisper/transcribe.py for all options
Other values are passed directly to whisper. See https://github.com/openai/whisper/blob/main/whisper/transcribe.py for all options.
"""

import whisper

whisper_model = whisper.load_model(model, **load_options or {})
whisper_recognizer = WhisperCompatibleRecognizer(whisper_model)
whisper_recognizer = WhisperCompatibleRecognizer(
TranscribableAdapter(whisper_model)
)
return whisper_recognizer.recognize(
audio_data, show_dict=show_dict, **transcribe_options
)
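
With the fp16 default now living in whisper.py's ``TranscribableAdapter`` rather than the shared base, it applies only to the openai-whisper backend and can still be overridden per call, since ``**transcribe_options`` is forwarded to ``model.transcribe``. A sketch (``r`` and ``audio_data`` as before):

    # force fp32 even on a CUDA machine by passing fp16 explicitly
    result = r.recognize_whisper(audio_data, model="base", show_dict=True, fp16=False)
    print(result["language"], result["text"])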
