Merge pull request #814 from ftnext/feature/faster-whisper
Support Faster Whisper (with less copy and paste)
ftnext authored Dec 30, 2024
2 parents 1a56fd5 + e663cb0 commit dd67b7c
Showing 10 changed files with 407 additions and 28 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/unittests.yml
@@ -44,7 +44,7 @@ jobs:
- name: Install Python dependencies (Ubuntu, <=3.12)
if: matrix.os == 'ubuntu-latest' && matrix.python-version != '3.13'
run: |
python -m pip install .[dev,audio,pocketsphinx,google-cloud,whisper-local,openai,groq]
python -m pip install .[dev,audio,pocketsphinx,google-cloud,whisper-local,faster-whisper,openai,groq]
- name: Install Python dependencies (Ubuntu, 3.13)
if: matrix.os == 'ubuntu-latest' && matrix.python-version == '3.13'
run: |
@@ -53,7 +53,7 @@
- name: Install Python dependencies (Windows)
if: matrix.os == 'windows-latest'
run: |
python -m pip install .[dev,whisper-local,google-cloud,openai,groq]
python -m pip install .[dev,whisper-local,faster-whisper,google-cloud,openai,groq]
- name: Test with unittest
run: |
pytest --doctest-modules -v speech_recognition/recognizers/ tests/
8 changes: 8 additions & 0 deletions README.rst
@@ -97,6 +97,7 @@ To use all of the functionality of the library, you should have:
* **FLAC encoder** (required only if the system is not x86-based Windows/Linux/OS X)
* **Vosk** (required only if you need to use Vosk API speech recognition ``recognizer_instance.recognize_vosk``)
* **Whisper** (required only if you need to use Whisper ``recognizer_instance.recognize_whisper``)
* **Faster Whisper** (required only if you need to use Faster Whisper ``recognizer_instance.recognize_faster_whisper``)
* **openai** (required only if you need to use OpenAI Whisper API speech recognition ``recognizer_instance.recognize_openai``)
* **groq** (required only if you need to use Groq Whisper API speech recognition ``recognizer_instance.recognize_groq``)

@@ -179,6 +180,13 @@ Whisper is **required if and only if you want to use whisper** (``recognizer_ins

You can install it with ``python3 -m pip install SpeechRecognition[whisper-local]``.

Faster Whisper (for Faster Whisper users)
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

The library `faster-whisper <https://pypi.org/project/faster-whisper/>`__ is **required if and only if you want to use Faster Whisper** (``recognizer_instance.recognize_faster_whisper``).

You can install it with ``python3 -m pip install SpeechRecognition[faster-whisper]``.
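
Once installed, a minimal usage sketch (the audio file name below is only an example):

    import speech_recognition as sr

    r = sr.Recognizer()
    with sr.AudioFile("english.wav") as source:  # any WAV/AIFF/FLAC file you have on hand
        audio = r.record(source)  # read the entire file into an AudioData instance

    # attached as Recognizer.recognize_faster_whisper when faster-whisper is importable
    print(r.recognize_faster_whisper(audio, model="base"))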

OpenAI Whisper API (for OpenAI Whisper API users)
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

9 changes: 7 additions & 2 deletions reference/library-reference.rst
@@ -290,11 +290,16 @@ Returns the most likely transcription if ``show_all`` is false (the default). Ot

Raises a ``speech_recognition.UnknownValueError`` exception if the speech is unintelligible. Raises a ``speech_recognition.RequestError`` exception if the speech recognition operation failed, if the key isn't valid, or if there is no internet connection.

``recognizer_instance.recognize_whisper(audio_data: AudioData, model: str="base", show_dict: bool=False, load_options=None, **transcribe_options):``
----------------------------------------------------------------------------------------------------------------------------------------------------
``recognizer_instance.recognize_whisper(audio_data: AudioData, model: str="base", show_dict: bool=False, load_options=None, **transcribe_options)``
---------------------------------------------------------------------------------------------------------------------------------------------------

.. autofunction:: speech_recognition.recognizers.whisper_local.whisper.recognize

``recognizer_instance.recognize_faster_whisper(audio_data: AudioData, model: str="base", show_dict: bool=False, **transcribe_options)``
---------------------------------------------------------------------------------------------------------------------------------------

.. autofunction:: speech_recognition.recognizers.whisper_local.faster_whisper.recognize
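
For illustration, a sketch of calling it with ``show_dict=True`` (``r`` and ``audio_data`` are assumed to be an existing ``Recognizer`` and ``AudioData``):

    result = r.recognize_faster_whisper(audio_data, model="base", show_dict=True)
    print(result["text"])      # full transcription
    print(result["language"])  # language detected by faster-whisper
    for segment in result["segments"]:
        # each Segment carries start/end timestamps and its own text
        print(segment.start, segment.end, segment.text)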

``recognizer_instance.recognize_openai(audio_data: AudioData, model = "whisper-1", **kwargs)``
----------------------------------------------------------------------------------------------

3 changes: 3 additions & 0 deletions setup.cfg
@@ -3,6 +3,7 @@ dev =
pytest
pytest-randomly
respx
numpy
audio =
PyAudio >= 0.2.11
pocketsphinx =
@@ -12,6 +13,8 @@ google-cloud =
whisper-local =
openai-whisper
soundfile
faster-whisper =
faster-whisper
openai =
openai
httpx < 0.28
3 changes: 2 additions & 1 deletion speech_recognition/__init__.py
@@ -1392,13 +1392,14 @@ def flush(self, *args, **kwargs):
try:
from .recognizers import google, google_cloud
from .recognizers.whisper_api import groq, openai
from .recognizers.whisper_local import whisper
from .recognizers.whisper_local import faster_whisper, whisper
except (ModuleNotFoundError, ImportError):
pass
else:
Recognizer.recognize_google = google.recognize_legacy
Recognizer.recognize_google_cloud = google_cloud.recognize
Recognizer.recognize_whisper = whisper.recognize
Recognizer.recognize_faster_whisper = faster_whisper.recognize
Recognizer.recognize_openai = openai.recognize
Recognizer.recognize_whisper_api = openai.recognize # Deprecated
Recognizer.recognize_groq = groq.recognize
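
Because the optional-backend imports above sit in a single try/except, ``recognize_faster_whisper`` only appears on ``Recognizer`` when every listed import succeeds; if any optional dependency is missing, none of these methods are attached. A quick availability check might look like this (sketch, not part of the diff):

    import speech_recognition as sr

    if hasattr(sr.Recognizer, "recognize_faster_whisper"):
        print("Faster Whisper backend is available")
    else:
        print("Install it with: pip install SpeechRecognition[faster-whisper]")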
13 changes: 7 additions & 6 deletions speech_recognition/recognizers/whisper_local/base.py
@@ -1,13 +1,18 @@
from __future__ import annotations

import io
from typing import Any, Protocol
from typing import TYPE_CHECKING, Any, Protocol

from speech_recognition.audio import AudioData

if TYPE_CHECKING:
import numpy as np


class Transcribable(Protocol):
def transcribe(self, audio_array, **kwargs) -> str | dict[str, Any]:
def transcribe(
self, audio_array: np.ndarray, **kwargs
) -> str | dict[str, Any]:
pass


@@ -32,10 +37,6 @@ def recognize(
audio_array, sampling_rate = sf.read(wav_stream)
audio_array = audio_array.astype(np.float32)

if "fp16" not in kwargs:
import torch

kwargs["fp16"] = torch.cuda.is_available()
result = self.model.transcribe(audio_array, **kwargs)

if show_dict:
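
Any object with a matching ``transcribe`` method satisfies the ``Transcribable`` protocol above, which is what lets ``WhisperCompatibleRecognizer`` drive both openai-whisper and faster-whisper through the adapters added in this PR. A hypothetical stub, e.g. for a unit test:

    import numpy as np

    class FakeTranscribable:
        """Hypothetical stand-in satisfying the Transcribable protocol."""

        def transcribe(self, audio_array: np.ndarray, **kwargs) -> dict:
            # pretend to recognize speech; just report how many samples arrived
            return {"text": f"{audio_array.shape[0]} samples", "language": "en"}

    compat = WhisperCompatibleRecognizer(FakeTranscribable())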
106 changes: 106 additions & 0 deletions speech_recognition/recognizers/whisper_local/faster_whisper.py
@@ -0,0 +1,106 @@
from __future__ import annotations

from typing import TYPE_CHECKING, Literal, TypedDict

from speech_recognition.audio import AudioData
from speech_recognition.recognizers.whisper_local.base import (
WhisperCompatibleRecognizer,
)

if TYPE_CHECKING:
import numpy as np
from faster_whisper import WhisperModel
from faster_whisper.transcribe import Segment
from typing_extensions import Unpack


class TranscribeOutput(TypedDict):
text: str
segments: list[Segment]
language: str


class TranscribableAdapter:
def __init__(self, model: WhisperModel) -> None:
self.model = model

def transcribe(
self, audio_array: np.ndarray, **kwargs
) -> TranscribeOutput:
segments_generator, info = self.model.transcribe(audio_array, **kwargs)
segments = list(segments_generator)
return {
"text": " ".join(segment.text for segment in segments),
"segments": segments,
"language": info.language,
}


class InitOptionalParameters(TypedDict, total=False):
# https://github.com/SYSTRAN/faster-whisper/blob/v1.1.0/faster_whisper/transcribe.py#L575
    device: Literal["cpu", "cuda", "auto"]  # faster-whisper/CTranslate2 expects "cuda", not "gpu", for GPU execution
compute_type: str
download_root: str
# TODO Add others


class TranscribeOptionalParameters(TypedDict, total=False):
# https://github.com/SYSTRAN/faster-whisper/blob/v1.1.0/faster_whisper/transcribe.py#L692
language: str
task: Literal["transcribe", "translate"]
beam_size: int
# TODO Add others


def recognize(
recognizer,
audio_data: AudioData,
model: str = "base",
show_dict: bool = False,
init_options: InitOptionalParameters | None = None,
**transcribe_options: Unpack[TranscribeOptionalParameters],
) -> str | TranscribeOutput:
"""Performs speech recognition on ``audio_data`` (an ``AudioData`` instance), using Whisper.
Pick ``model`` size (Same as Whisper).
If ``show_dict`` is true, returns the detailed response from Whisper, including the detected language. Otherwise returns only the transcription.
You can specify:
* ``language``: recognition language, an uncapitalized 2 letters language name like "en" or "fr".
* If not set, Faster Whisper will automatically detect the language.
* ``task``
* If you want transcribe + **translate** to english, set ``task="translate"``.
Other values are passed directly to whisper. See https://github.com/SYSTRAN/faster-whisper/blob/master/faster_whisper/transcribe.py for all options.
"""
from faster_whisper import WhisperModel

model = WhisperModel(model, **init_options or {})
whisper_recognizer = WhisperCompatibleRecognizer(
TranscribableAdapter(model)
)
return whisper_recognizer.recognize(
audio_data, show_dict=show_dict, **transcribe_options
)


if __name__ == "__main__":
import argparse

import speech_recognition as sr

parser = argparse.ArgumentParser()
parser.add_argument("audio_file")
args = parser.parse_args()

r = sr.Recognizer()
with sr.AudioFile(args.audio_file) as source:
audio_data = r.listen(source)

transcription = recognize(None, audio_data)
print(transcription)
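
Two points worth noting about the module above: faster-whisper's ``WhisperModel.transcribe`` returns a lazy segment generator plus an info object, and ``TranscribableAdapter`` eagerly joins the segments into the whisper-style dict that ``WhisperCompatibleRecognizer`` expects; the ``__main__`` block should also allow a quick manual check, roughly ``python -m speech_recognition.recognizers.whisper_local.faster_whisper path/to/audio.wav``. A sketch of exercising the adapter directly (``samples`` is assumed to be a 16 kHz float32 NumPy array):

    from faster_whisper import WhisperModel

    from speech_recognition.recognizers.whisper_local.faster_whisper import (
        TranscribableAdapter,
    )

    adapter = TranscribableAdapter(WhisperModel("base", device="cpu"))
    output = adapter.transcribe(samples, language="en", beam_size=5)
    print(output["language"], output["text"])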
28 changes: 25 additions & 3 deletions speech_recognition/recognizers/whisper_local/whisper.py
@@ -8,8 +8,10 @@
)

if TYPE_CHECKING:
import numpy as np
import torch
from typing_extensions import Unpack
from whisper import Whisper


class LoadModelOptionalParameters(TypedDict, total=False):
@@ -52,6 +54,21 @@ class TranscribeOutput(TypedDict):
language: str


class TranscribableAdapter:
def __init__(self, model: Whisper) -> None:
self.model = model

def transcribe(
self, audio_array: np.ndarray, **kwargs
) -> TranscribeOutput:
if "fp16" not in kwargs:
import torch

kwargs["fp16"] = torch.cuda.is_available()

return self.model.transcribe(audio_array, **kwargs)


def recognize(
recognizer,
audio_data: AudioData,
@@ -70,17 +87,22 @@ def recognize(
You can specify:
* ``language``: recognition language, an uncapitalized full language name like "english" or "chinese". See the full language list at https://github.com/openai/whisper/blob/main/whisper/tokenizer.py
* If not set, Whisper will automatically detect the language.
* ``task``
* If you want transcribe + **translate**, set ``task="translate"``.
* If you want transcribe + **translate** to english, set ``task="translate"``.
Other values are passed directly to whisper. See https://github.com/openai/whisper/blob/main/whisper/transcribe.py for all options
Other values are passed directly to whisper. See https://github.com/openai/whisper/blob/main/whisper/transcribe.py for all options.
"""

import whisper

whisper_model = whisper.load_model(model, **load_options or {})
whisper_recognizer = WhisperCompatibleRecognizer(whisper_model)
whisper_recognizer = WhisperCompatibleRecognizer(
TranscribableAdapter(whisper_model)
)
return whisper_recognizer.recognize(
audio_data, show_dict=show_dict, **transcribe_options
)
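
With the fp16 default now living in whisper.py's ``TranscribableAdapter`` rather than the shared base, it applies only to the openai-whisper backend and can still be overridden per call, since ``**transcribe_options`` is forwarded to ``model.transcribe``. A sketch (``r`` and ``audio_data`` as before):

    # force fp32 even on a CUDA machine by passing fp16 explicitly
    result = r.recognize_whisper(audio_data, model="base", show_dict=True, fp16=False)
    print(result["language"], result["text"])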
