From 6761719387c6effc2db0e297433cd7010164ffaa Mon Sep 17 00:00:00 2001 From: ftnext Date: Mon, 30 Dec 2024 13:48:37 +0900 Subject: [PATCH 01/22] [feat] Support faster-whisper with adapter --- .../recognizers/whisper_local/base.py | 4 - .../whisper_local/faster_whisper.py | 45 +++++++++++ .../recognizers/whisper_local/whisper.py | 18 ++++- .../whisper_local/test_faster_whisper.py | 75 +++++++++++++++++++ 4 files changed, 137 insertions(+), 5 deletions(-) create mode 100644 speech_recognition/recognizers/whisper_local/faster_whisper.py create mode 100644 tests/recognizers/whisper_local/test_faster_whisper.py diff --git a/speech_recognition/recognizers/whisper_local/base.py b/speech_recognition/recognizers/whisper_local/base.py index 7bf458b6..12e04e65 100644 --- a/speech_recognition/recognizers/whisper_local/base.py +++ b/speech_recognition/recognizers/whisper_local/base.py @@ -32,10 +32,6 @@ def recognize( audio_array, sampling_rate = sf.read(wav_stream) audio_array = audio_array.astype(np.float32) - if "fp16" not in kwargs: - import torch - - kwargs["fp16"] = torch.cuda.is_available() result = self.model.transcribe(audio_array, **kwargs) if show_dict: diff --git a/speech_recognition/recognizers/whisper_local/faster_whisper.py b/speech_recognition/recognizers/whisper_local/faster_whisper.py new file mode 100644 index 00000000..58d4b860 --- /dev/null +++ b/speech_recognition/recognizers/whisper_local/faster_whisper.py @@ -0,0 +1,45 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING, TypedDict + +from speech_recognition.audio import AudioData +from speech_recognition.recognizers.whisper_local.base import ( + WhisperCompatibleRecognizer, +) + +if TYPE_CHECKING: + from faster_whisper import WhisperModel + from faster_whisper.transcribe import Segment + + +class TranscribeOutput(TypedDict): + text: str + segments: list[Segment] + language: str + + +class TranscribableAdapter: + def __init__(self, model: WhisperModel) -> None: + self.model = model + + def transcribe(self, audio_array, **kwargs) -> TranscribeOutput: + segments_generator, info = self.model.transcribe(audio_array) + segments = list(segments_generator) + return { + "text": " ".join(segment.text for segment in segments), + "segments": segments, + "language": info.language, + } + + +def recognize(recognizer, audio_data: AudioData) -> str: + import torch + from faster_whisper import WhisperModel + + device = "cuda" if torch.cuda.is_available() else "cpu" + + model = WhisperModel("base", device=device) + whisper_recognizer = WhisperCompatibleRecognizer( + TranscribableAdapter(model) + ) + return whisper_recognizer.recognize(audio_data) diff --git a/speech_recognition/recognizers/whisper_local/whisper.py b/speech_recognition/recognizers/whisper_local/whisper.py index 1c8299bb..7e05b94a 100644 --- a/speech_recognition/recognizers/whisper_local/whisper.py +++ b/speech_recognition/recognizers/whisper_local/whisper.py @@ -10,6 +10,7 @@ if TYPE_CHECKING: import torch from typing_extensions import Unpack + from whisper import Whisper class LoadModelOptionalParameters(TypedDict, total=False): @@ -52,6 +53,19 @@ class TranscribeOutput(TypedDict): language: str +class TranscribableAdapter: + def __init__(self, model: Whisper) -> None: + self.model = model + + def transcribe(self, audio_array, **kwargs) -> TranscribeOutput: + if "fp16" not in kwargs: + import torch + + kwargs["fp16"] = torch.cuda.is_available() + + return self.model.transcribe(audio_array, **kwargs) + + def recognize( recognizer, audio_data: AudioData, @@ 
-80,7 +94,9 @@ def recognize( import whisper whisper_model = whisper.load_model(model, **load_options or {}) - whisper_recognizer = WhisperCompatibleRecognizer(whisper_model) + whisper_recognizer = WhisperCompatibleRecognizer( + TranscribableAdapter(whisper_model) + ) return whisper_recognizer.recognize( audio_data, show_dict=show_dict, **transcribe_options ) diff --git a/tests/recognizers/whisper_local/test_faster_whisper.py b/tests/recognizers/whisper_local/test_faster_whisper.py new file mode 100644 index 00000000..4d4f04fd --- /dev/null +++ b/tests/recognizers/whisper_local/test_faster_whisper.py @@ -0,0 +1,75 @@ +import sys +from unittest.mock import ANY, MagicMock, patch + +import numpy as np +import pytest +from faster_whisper.transcribe import ( + Segment, + TranscriptionInfo, + TranscriptionOptions, + VadOptions, +) + +from speech_recognition import Recognizer +from speech_recognition.audio import AudioData +from speech_recognition.recognizers.whisper_local.faster_whisper import ( + recognize, +) + + +@pytest.mark.skipif( + sys.version_info >= (3, 13), reason="skip on Python 3.13 or later" +) +@patch("soundfile.read") +@patch("torch.cuda.is_available", return_value=False) +@patch("faster_whisper.WhisperModel") +def test_transcribe_with_default_parameters( + WhisperModel, is_available, sf_read +): + def segments(): + yield Segment( + id=1, + seek=0, + start=0.0, + end=2.64, + text=" 1, 2, 3", + tokens=[50364, 502, 11, 568, 11, 805, 50496], + avg_logprob=-0.5378808751702309, + compression_ratio=0.4666666666666667, + no_speech_prob=0.17316274344921112, + words=None, + temperature=0.0, + ) + + info = TranscriptionInfo( + language="en", + language_probability=0.9314374923706055, + duration=2.7449375, + duration_after_vad=2.7449375, + all_language_probs=[("en", 0.9314374923706055)], + transcription_options=MagicMock(spec=TranscriptionOptions), + vad_options=MagicMock(spec=VadOptions), + ) + + whisper_model = WhisperModel.return_value + whisper_model.transcribe.return_value = segments(), info + + audio_data = MagicMock(spec=AudioData) + audio_data.get_wav_data.return_value = b"audio data" + + audio_array = MagicMock(spec=np.ndarray) + dummy_sampling_rate = 99_999 + sf_read.return_value = (audio_array, dummy_sampling_rate) + + actual = recognize(MagicMock(spec=Recognizer), audio_data) + + assert actual == " 1, 2, 3" + is_available.assert_called_once_with() + WhisperModel.assert_called_once_with("base", device="cpu") + audio_data.get_wav_data.assert_called_once_with(convert_rate=16_000) + sf_read.assert_called_once_with(ANY) + assert sf_read.call_args[0][0].read() == b"audio data" + audio_array.astype.assert_called_once_with(np.float32) + whisper_model.transcribe.assert_called_once_with( + audio_array.astype.return_value + ) From cbca8b994bbed0817a71447857e03d16743ab5ad Mon Sep 17 00:00:00 2001 From: ftnext Date: Mon, 30 Dec 2024 14:00:29 +0900 Subject: [PATCH 02/22] [refactor] Reduce mock in whisper tests --- .../recognizers/whisper_local/test_whisper.py | 28 +++++++++---------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/tests/recognizers/whisper_local/test_whisper.py b/tests/recognizers/whisper_local/test_whisper.py index be9c305b..5fbb6f4c 100644 --- a/tests/recognizers/whisper_local/test_whisper.py +++ b/tests/recognizers/whisper_local/test_whisper.py @@ -1,47 +1,45 @@ import sys from unittest import TestCase, skipIf -from unittest.mock import MagicMock, patch +from unittest.mock import ANY, MagicMock, patch + +import numpy as np from speech_recognition import 
AudioData, Recognizer from speech_recognition.recognizers.whisper_local.whisper import recognize @skipIf(sys.version_info >= (3, 13), "skip on Python 3.13") -@patch("speech_recognition.recognizers.whisper_local.base.io.BytesIO") @patch("soundfile.read") @patch("torch.cuda.is_available") @patch("whisper.load_model") class RecognizeWhisperTestCase(TestCase): - def test_default_parameters( - self, load_model, is_available, sf_read, BytesIO - ): - import numpy as np - + def test_default_parameters(self, load_model, is_available, sf_read): whisper_model = load_model.return_value whisper_model.transcribe.return_value = { "text": "Transcription by Whisper model", "language": "en", # Omit "segments" } - audio_array = MagicMock() + audio_array = MagicMock(spec=np.ndarray) dummy_sampling_rate = 99_999 sf_read.return_value = (audio_array, dummy_sampling_rate) audio_data = MagicMock(spec=AudioData) + audio_data.get_wav_data.return_value = b"wav_data" actual = recognize(MagicMock(spec=Recognizer), audio_data) assert actual == "Transcription by Whisper model" load_model.assert_called_once_with("base") audio_data.get_wav_data.assert_called_once_with(convert_rate=16000) - BytesIO.assert_called_once_with(audio_data.get_wav_data.return_value) - sf_read.assert_called_once_with(BytesIO.return_value) + sf_read.assert_called_once_with(ANY) + assert sf_read.call_args[0][0].read() == b"wav_data" audio_array.astype.assert_called_once_with(np.float32) whisper_model.transcribe.assert_called_once_with( audio_array.astype.return_value, fp16=is_available.return_value, ) - def test_return_as_dict(self, load_model, is_available, sf_read, BytesIO): + def test_return_as_dict(self, load_model, is_available, sf_read): whisper_model = load_model.return_value whisper_model.transcribe.return_value = { "text": " 1, 2, 3", @@ -61,11 +59,12 @@ def test_return_as_dict(self, load_model, is_available, sf_read, BytesIO): ], "language": "en", } - audio_array = MagicMock() + audio_array = MagicMock(spec=np.ndarray) dummy_sampling_rate = 99_999 sf_read.return_value = (audio_array, dummy_sampling_rate) audio_data = MagicMock(spec=AudioData) + audio_data.get_wav_data.return_value = b"" actual = recognize( MagicMock(spec=Recognizer), audio_data, show_dict=True ) @@ -91,13 +90,14 @@ def test_return_as_dict(self, load_model, is_available, sf_read, BytesIO): assert actual == expected - def test_pass_parameters(self, load_model, is_available, sf_read, BytesIO): + def test_pass_parameters(self, load_model, is_available, sf_read): whisper_model = load_model.return_value - audio_array = MagicMock() + audio_array = MagicMock(spec=np.ndarray) dummy_sampling_rate = 99_999 sf_read.return_value = (audio_array, dummy_sampling_rate) audio_data = MagicMock(spec=AudioData) + audio_data.get_wav_data.return_value = b"" _ = recognize( MagicMock(spec=Recognizer), audio_data, From 7df12536e7f940ffe6993a7123bb422b2fb75016 Mon Sep 17 00:00:00 2001 From: ftnext Date: Mon, 30 Dec 2024 14:06:58 +0900 Subject: [PATCH 03/22] [refactor] Test class to add other case easier --- .../whisper_local/test_faster_whisper.py | 89 +++++++++---------- 1 file changed, 44 insertions(+), 45 deletions(-) diff --git a/tests/recognizers/whisper_local/test_faster_whisper.py b/tests/recognizers/whisper_local/test_faster_whisper.py index 4d4f04fd..ddd98352 100644 --- a/tests/recognizers/whisper_local/test_faster_whisper.py +++ b/tests/recognizers/whisper_local/test_faster_whisper.py @@ -21,55 +21,54 @@ sys.version_info >= (3, 13), reason="skip on Python 3.13 or later" ) 
@patch("soundfile.read") -@patch("torch.cuda.is_available", return_value=False) @patch("faster_whisper.WhisperModel") -def test_transcribe_with_default_parameters( - WhisperModel, is_available, sf_read -): - def segments(): - yield Segment( - id=1, - seek=0, - start=0.0, - end=2.64, - text=" 1, 2, 3", - tokens=[50364, 502, 11, 568, 11, 805, 50496], - avg_logprob=-0.5378808751702309, - compression_ratio=0.4666666666666667, - no_speech_prob=0.17316274344921112, - words=None, - temperature=0.0, - ) +class TestTranscribe: + @patch("torch.cuda.is_available", return_value=False) + def test_default_parameters(self, is_available, WhisperModel, sf_read): + def segments(): + yield Segment( + id=1, + seek=0, + start=0.0, + end=2.64, + text=" 1, 2, 3", + tokens=[50364, 502, 11, 568, 11, 805, 50496], + avg_logprob=-0.5378808751702309, + compression_ratio=0.4666666666666667, + no_speech_prob=0.17316274344921112, + words=None, + temperature=0.0, + ) - info = TranscriptionInfo( - language="en", - language_probability=0.9314374923706055, - duration=2.7449375, - duration_after_vad=2.7449375, - all_language_probs=[("en", 0.9314374923706055)], - transcription_options=MagicMock(spec=TranscriptionOptions), - vad_options=MagicMock(spec=VadOptions), - ) + info = TranscriptionInfo( + language="en", + language_probability=0.9314374923706055, + duration=2.7449375, + duration_after_vad=2.7449375, + all_language_probs=[("en", 0.9314374923706055)], + transcription_options=MagicMock(spec=TranscriptionOptions), + vad_options=MagicMock(spec=VadOptions), + ) - whisper_model = WhisperModel.return_value - whisper_model.transcribe.return_value = segments(), info + whisper_model = WhisperModel.return_value + whisper_model.transcribe.return_value = segments(), info - audio_data = MagicMock(spec=AudioData) - audio_data.get_wav_data.return_value = b"audio data" + audio_data = MagicMock(spec=AudioData) + audio_data.get_wav_data.return_value = b"audio data" - audio_array = MagicMock(spec=np.ndarray) - dummy_sampling_rate = 99_999 - sf_read.return_value = (audio_array, dummy_sampling_rate) + audio_array = MagicMock(spec=np.ndarray) + dummy_sampling_rate = 99_999 + sf_read.return_value = (audio_array, dummy_sampling_rate) - actual = recognize(MagicMock(spec=Recognizer), audio_data) + actual = recognize(MagicMock(spec=Recognizer), audio_data) - assert actual == " 1, 2, 3" - is_available.assert_called_once_with() - WhisperModel.assert_called_once_with("base", device="cpu") - audio_data.get_wav_data.assert_called_once_with(convert_rate=16_000) - sf_read.assert_called_once_with(ANY) - assert sf_read.call_args[0][0].read() == b"audio data" - audio_array.astype.assert_called_once_with(np.float32) - whisper_model.transcribe.assert_called_once_with( - audio_array.astype.return_value - ) + assert actual == " 1, 2, 3" + is_available.assert_called_once_with() + WhisperModel.assert_called_once_with("base", device="cpu") + audio_data.get_wav_data.assert_called_once_with(convert_rate=16_000) + sf_read.assert_called_once_with(ANY) + assert sf_read.call_args[0][0].read() == b"audio data" + audio_array.astype.assert_called_once_with(np.float32) + whisper_model.transcribe.assert_called_once_with( + audio_array.astype.return_value + ) From 36b046493205b44db2fad9f6641f4186932a4bb6 Mon Sep 17 00:00:00 2001 From: ftnext Date: Mon, 30 Dec 2024 14:22:56 +0900 Subject: [PATCH 04/22] [test] GPU available case --- .../whisper_local/test_faster_whisper.py | 23 +++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git 
a/tests/recognizers/whisper_local/test_faster_whisper.py b/tests/recognizers/whisper_local/test_faster_whisper.py index ddd98352..e2ce46e1 100644 --- a/tests/recognizers/whisper_local/test_faster_whisper.py +++ b/tests/recognizers/whisper_local/test_faster_whisper.py @@ -72,3 +72,26 @@ def segments(): whisper_model.transcribe.assert_called_once_with( audio_array.astype.return_value ) + + @patch("torch.cuda.is_available", return_value=True) + def test_gpu_available(self, is_available, WhisperModel, sf_read): + def segments_generator(): + mocked_segment = MagicMock(spec=Segment(*[None] * 11)) + mocked_segment.text = "" + yield mocked_segment + + WhisperModel.return_value.transcribe.return_value = ( + segments_generator(), + MagicMock(spec=TranscriptionInfo(*[None] * 7)), + ) + + audio_data = MagicMock(spec=AudioData) + audio_data.get_wav_data.return_value = b"" + + audio_array = MagicMock(spec=np.ndarray) + dummy_sampling_rate = 99_999 + sf_read.return_value = (audio_array, dummy_sampling_rate) + + _ = recognize(MagicMock(spec=Recognizer), audio_data) + + WhisperModel.assert_called_once_with("base", device="cuda") From 3dbbe173c430b88d4be40ea997643e4556c048a8 Mon Sep 17 00:00:00 2001 From: ftnext Date: Mon, 30 Dec 2024 14:39:45 +0900 Subject: [PATCH 05/22] [feat] Specify transcribe's parameters --- .../whisper_local/faster_whisper.py | 19 +++++++--- .../whisper_local/test_faster_whisper.py | 38 +++++++++++++++++++ 2 files changed, 52 insertions(+), 5 deletions(-) diff --git a/speech_recognition/recognizers/whisper_local/faster_whisper.py b/speech_recognition/recognizers/whisper_local/faster_whisper.py index 58d4b860..c512bccf 100644 --- a/speech_recognition/recognizers/whisper_local/faster_whisper.py +++ b/speech_recognition/recognizers/whisper_local/faster_whisper.py @@ -8,6 +8,7 @@ ) if TYPE_CHECKING: + import numpy as np from faster_whisper import WhisperModel from faster_whisper.transcribe import Segment @@ -22,8 +23,10 @@ class TranscribableAdapter: def __init__(self, model: WhisperModel) -> None: self.model = model - def transcribe(self, audio_array, **kwargs) -> TranscribeOutput: - segments_generator, info = self.model.transcribe(audio_array) + def transcribe( + self, audio_array: np.ndarray, **kwargs + ) -> TranscribeOutput: + segments_generator, info = self.model.transcribe(audio_array, **kwargs) segments = list(segments_generator) return { "text": " ".join(segment.text for segment in segments), @@ -32,14 +35,20 @@ def transcribe(self, audio_array, **kwargs) -> TranscribeOutput: } -def recognize(recognizer, audio_data: AudioData) -> str: +def recognize( + recognizer, + audio_data: AudioData, + model: str = "base", + show_dict: bool = False, + **transcribe_options, +) -> str: import torch from faster_whisper import WhisperModel device = "cuda" if torch.cuda.is_available() else "cpu" - model = WhisperModel("base", device=device) + model = WhisperModel(model, device=device) whisper_recognizer = WhisperCompatibleRecognizer( TranscribableAdapter(model) ) - return whisper_recognizer.recognize(audio_data) + return whisper_recognizer.recognize(audio_data, **transcribe_options) diff --git a/tests/recognizers/whisper_local/test_faster_whisper.py b/tests/recognizers/whisper_local/test_faster_whisper.py index e2ce46e1..d47ff702 100644 --- a/tests/recognizers/whisper_local/test_faster_whisper.py +++ b/tests/recognizers/whisper_local/test_faster_whisper.py @@ -95,3 +95,41 @@ def segments_generator(): _ = recognize(MagicMock(spec=Recognizer), audio_data) 
WhisperModel.assert_called_once_with("base", device="cuda") + + @patch("torch.cuda.is_available", return_value=False) + def test_pass_parameters(self, is_available, WhisperModel, sf_read): + def segments_generator(): + mocked_segment = MagicMock(spec=Segment(*[None] * 11)) + mocked_segment.text = "" + yield mocked_segment + + whisper_model = WhisperModel.return_value + whisper_model.transcribe.return_value = ( + segments_generator(), + MagicMock(spec=TranscriptionInfo(*[None] * 7)), + ) + + audio_data = MagicMock(spec=AudioData) + audio_data.get_wav_data.return_value = b"" + + audio_array = MagicMock(spec=np.ndarray) + dummy_sampling_rate = 99_999 + sf_read.return_value = (audio_array, dummy_sampling_rate) + + _ = recognize( + MagicMock(spec=Recognizer), + audio_data, + model="small", + show_dict=True, + language="fr", + task="translate", + beam_size=5, + ) + + WhisperModel.assert_called_once_with("small", device="cpu") + whisper_model.transcribe.assert_called_once_with( + audio_array.astype.return_value, + language="fr", + task="translate", + beam_size=5, + ) From 2ee6044fa57aba46b9f808c75070bbb3ba9c4b59 Mon Sep 17 00:00:00 2001 From: ftnext Date: Mon, 30 Dec 2024 14:43:58 +0900 Subject: [PATCH 06/22] [refactor] Extract AudioData fixture --- .../whisper_local/test_faster_whisper.py | 26 ++++++++++++------- 1 file changed, 16 insertions(+), 10 deletions(-) diff --git a/tests/recognizers/whisper_local/test_faster_whisper.py b/tests/recognizers/whisper_local/test_faster_whisper.py index d47ff702..3bcb7a6b 100644 --- a/tests/recognizers/whisper_local/test_faster_whisper.py +++ b/tests/recognizers/whisper_local/test_faster_whisper.py @@ -17,6 +17,13 @@ ) +@pytest.fixture +def audio_data() -> AudioData: + audio = MagicMock(spec=AudioData) + audio.get_wav_data.return_value = b"" + return audio + + @pytest.mark.skipif( sys.version_info >= (3, 13), reason="skip on Python 3.13 or later" ) @@ -24,7 +31,9 @@ @patch("faster_whisper.WhisperModel") class TestTranscribe: @patch("torch.cuda.is_available", return_value=False) - def test_default_parameters(self, is_available, WhisperModel, sf_read): + def test_default_parameters( + self, is_available, WhisperModel, sf_read, audio_data + ): def segments(): yield Segment( id=1, @@ -53,7 +62,6 @@ def segments(): whisper_model = WhisperModel.return_value whisper_model.transcribe.return_value = segments(), info - audio_data = MagicMock(spec=AudioData) audio_data.get_wav_data.return_value = b"audio data" audio_array = MagicMock(spec=np.ndarray) @@ -74,7 +82,9 @@ def segments(): ) @patch("torch.cuda.is_available", return_value=True) - def test_gpu_available(self, is_available, WhisperModel, sf_read): + def test_gpu_available( + self, is_available, WhisperModel, sf_read, audio_data + ): def segments_generator(): mocked_segment = MagicMock(spec=Segment(*[None] * 11)) mocked_segment.text = "" @@ -85,9 +95,6 @@ def segments_generator(): MagicMock(spec=TranscriptionInfo(*[None] * 7)), ) - audio_data = MagicMock(spec=AudioData) - audio_data.get_wav_data.return_value = b"" - audio_array = MagicMock(spec=np.ndarray) dummy_sampling_rate = 99_999 sf_read.return_value = (audio_array, dummy_sampling_rate) @@ -97,7 +104,9 @@ def segments_generator(): WhisperModel.assert_called_once_with("base", device="cuda") @patch("torch.cuda.is_available", return_value=False) - def test_pass_parameters(self, is_available, WhisperModel, sf_read): + def test_pass_parameters( + self, is_available, WhisperModel, sf_read, audio_data + ): def segments_generator(): mocked_segment = 
MagicMock(spec=Segment(*[None] * 11)) mocked_segment.text = "" @@ -109,9 +118,6 @@ def segments_generator(): MagicMock(spec=TranscriptionInfo(*[None] * 7)), ) - audio_data = MagicMock(spec=AudioData) - audio_data.get_wav_data.return_value = b"" - audio_array = MagicMock(spec=np.ndarray) dummy_sampling_rate = 99_999 sf_read.return_value = (audio_array, dummy_sampling_rate) From 4dd2985bc1e9c4ffbc40c6ad2160456d3772b337 Mon Sep 17 00:00:00 2001 From: ftnext Date: Mon, 30 Dec 2024 14:54:37 +0900 Subject: [PATCH 07/22] [refactor] Patch sf.read as fixture --- .../whisper_local/test_faster_whisper.py | 35 ++++++++++--------- 1 file changed, 18 insertions(+), 17 deletions(-) diff --git a/tests/recognizers/whisper_local/test_faster_whisper.py b/tests/recognizers/whisper_local/test_faster_whisper.py index 3bcb7a6b..87a3732d 100644 --- a/tests/recognizers/whisper_local/test_faster_whisper.py +++ b/tests/recognizers/whisper_local/test_faster_whisper.py @@ -1,3 +1,4 @@ +from collections.abc import Generator import sys from unittest.mock import ANY, MagicMock, patch @@ -24,16 +25,27 @@ def audio_data() -> AudioData: return audio +@pytest.fixture +def soundfile_read() -> Generator[tuple[MagicMock, np.ndarray], None, None]: + audio_array = MagicMock(spec=np.ndarray) + dummy_sampling_rate = 99_999 + + with patch("soundfile.read") as mock_read: + mock_read.return_value = (audio_array, dummy_sampling_rate) + yield mock_read, audio_array + + @pytest.mark.skipif( sys.version_info >= (3, 13), reason="skip on Python 3.13 or later" ) -@patch("soundfile.read") @patch("faster_whisper.WhisperModel") class TestTranscribe: @patch("torch.cuda.is_available", return_value=False) def test_default_parameters( - self, is_available, WhisperModel, sf_read, audio_data + self, is_available, WhisperModel, audio_data, soundfile_read ): + sf_read, audio_array = soundfile_read + def segments(): yield Segment( id=1, @@ -63,11 +75,6 @@ def segments(): whisper_model.transcribe.return_value = segments(), info audio_data.get_wav_data.return_value = b"audio data" - - audio_array = MagicMock(spec=np.ndarray) - dummy_sampling_rate = 99_999 - sf_read.return_value = (audio_array, dummy_sampling_rate) - actual = recognize(MagicMock(spec=Recognizer), audio_data) assert actual == " 1, 2, 3" @@ -83,7 +90,7 @@ def segments(): @patch("torch.cuda.is_available", return_value=True) def test_gpu_available( - self, is_available, WhisperModel, sf_read, audio_data + self, is_available, WhisperModel, audio_data, soundfile_read ): def segments_generator(): mocked_segment = MagicMock(spec=Segment(*[None] * 11)) @@ -95,18 +102,16 @@ def segments_generator(): MagicMock(spec=TranscriptionInfo(*[None] * 7)), ) - audio_array = MagicMock(spec=np.ndarray) - dummy_sampling_rate = 99_999 - sf_read.return_value = (audio_array, dummy_sampling_rate) - _ = recognize(MagicMock(spec=Recognizer), audio_data) WhisperModel.assert_called_once_with("base", device="cuda") @patch("torch.cuda.is_available", return_value=False) def test_pass_parameters( - self, is_available, WhisperModel, sf_read, audio_data + self, is_available, WhisperModel, audio_data, soundfile_read ): + _, audio_array = soundfile_read + def segments_generator(): mocked_segment = MagicMock(spec=Segment(*[None] * 11)) mocked_segment.text = "" @@ -118,10 +123,6 @@ def segments_generator(): MagicMock(spec=TranscriptionInfo(*[None] * 7)), ) - audio_array = MagicMock(spec=np.ndarray) - dummy_sampling_rate = 99_999 - sf_read.return_value = (audio_array, dummy_sampling_rate) - _ = recognize( 
MagicMock(spec=Recognizer), audio_data, From 8b2d91c302dbfe5e987268d93ce5c8c618564f7e Mon Sep 17 00:00:00 2001 From: ftnext Date: Mon, 30 Dec 2024 14:59:48 +0900 Subject: [PATCH 08/22] [refactor] Extract mock creation into fixtures --- .../whisper_local/test_faster_whisper.py | 44 ++++++++++++++----- 1 file changed, 32 insertions(+), 12 deletions(-) diff --git a/tests/recognizers/whisper_local/test_faster_whisper.py b/tests/recognizers/whisper_local/test_faster_whisper.py index 87a3732d..750ddd29 100644 --- a/tests/recognizers/whisper_local/test_faster_whisper.py +++ b/tests/recognizers/whisper_local/test_faster_whisper.py @@ -1,5 +1,5 @@ -from collections.abc import Generator import sys +from collections.abc import Generator from unittest.mock import ANY, MagicMock, patch import numpy as np @@ -25,6 +25,18 @@ def audio_data() -> AudioData: return audio +@pytest.fixture +def segment() -> Segment: + mocked_segment = MagicMock(spec=Segment(*[None] * 11)) + mocked_segment.text = "" + return mocked_segment + + +@pytest.fixture +def transcription_info() -> TranscriptionInfo: + return MagicMock(spec=TranscriptionInfo(*[None] * 7)) + + @pytest.fixture def soundfile_read() -> Generator[tuple[MagicMock, np.ndarray], None, None]: audio_array = MagicMock(spec=np.ndarray) @@ -66,7 +78,7 @@ def segments(): language_probability=0.9314374923706055, duration=2.7449375, duration_after_vad=2.7449375, - all_language_probs=[("en", 0.9314374923706055)], + all_language_probs=[("en", 0.9314374923706055)], # Omitted transcription_options=MagicMock(spec=TranscriptionOptions), vad_options=MagicMock(spec=VadOptions), ) @@ -90,16 +102,20 @@ def segments(): @patch("torch.cuda.is_available", return_value=True) def test_gpu_available( - self, is_available, WhisperModel, audio_data, soundfile_read + self, + is_available, + WhisperModel, + audio_data, + segment, + transcription_info, + soundfile_read, ): def segments_generator(): - mocked_segment = MagicMock(spec=Segment(*[None] * 11)) - mocked_segment.text = "" - yield mocked_segment + yield segment WhisperModel.return_value.transcribe.return_value = ( segments_generator(), - MagicMock(spec=TranscriptionInfo(*[None] * 7)), + transcription_info, ) _ = recognize(MagicMock(spec=Recognizer), audio_data) @@ -108,19 +124,23 @@ def segments_generator(): @patch("torch.cuda.is_available", return_value=False) def test_pass_parameters( - self, is_available, WhisperModel, audio_data, soundfile_read + self, + is_available, + WhisperModel, + audio_data, + segment, + transcription_info, + soundfile_read, ): _, audio_array = soundfile_read def segments_generator(): - mocked_segment = MagicMock(spec=Segment(*[None] * 11)) - mocked_segment.text = "" - yield mocked_segment + yield segment whisper_model = WhisperModel.return_value whisper_model.transcribe.return_value = ( segments_generator(), - MagicMock(spec=TranscriptionInfo(*[None] * 7)), + transcription_info, ) _ = recognize( From d502a2e7a0efde6153203d5f74efa73c41e3c1b9 Mon Sep 17 00:00:00 2001 From: ftnext Date: Mon, 30 Dec 2024 15:00:24 +0900 Subject: [PATCH 09/22] [docs] Add type hints --- speech_recognition/recognizers/whisper_local/base.py | 9 +++++++-- speech_recognition/recognizers/whisper_local/whisper.py | 5 ++++- 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/speech_recognition/recognizers/whisper_local/base.py b/speech_recognition/recognizers/whisper_local/base.py index 12e04e65..ad6ee101 100644 --- a/speech_recognition/recognizers/whisper_local/base.py +++ 
b/speech_recognition/recognizers/whisper_local/base.py @@ -1,13 +1,18 @@ from __future__ import annotations import io -from typing import Any, Protocol +from typing import TYPE_CHECKING, Any, Protocol from speech_recognition.audio import AudioData +if TYPE_CHECKING: + import numpy as np + class Transcribable(Protocol): - def transcribe(self, audio_array, **kwargs) -> str | dict[str, Any]: + def transcribe( + self, audio_array: np.ndarray, **kwargs + ) -> str | dict[str, Any]: pass diff --git a/speech_recognition/recognizers/whisper_local/whisper.py b/speech_recognition/recognizers/whisper_local/whisper.py index 7e05b94a..c4642e22 100644 --- a/speech_recognition/recognizers/whisper_local/whisper.py +++ b/speech_recognition/recognizers/whisper_local/whisper.py @@ -8,6 +8,7 @@ ) if TYPE_CHECKING: + import numpy as np import torch from typing_extensions import Unpack from whisper import Whisper @@ -57,7 +58,9 @@ class TranscribableAdapter: def __init__(self, model: Whisper) -> None: self.model = model - def transcribe(self, audio_array, **kwargs) -> TranscribeOutput: + def transcribe( + self, audio_array: np.ndarray, **kwargs + ) -> TranscribeOutput: if "fp16" not in kwargs: import torch From 5da7fdffa87c5c338c5385ba74057574d6dc4563 Mon Sep 17 00:00:00 2001 From: ftnext Date: Mon, 30 Dec 2024 15:03:29 +0900 Subject: [PATCH 10/22] [chore] faster-whisper extra --- .github/workflows/unittests.yml | 4 ++-- setup.cfg | 2 ++ 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/.github/workflows/unittests.yml b/.github/workflows/unittests.yml index 170b78f8..9b460c2f 100644 --- a/.github/workflows/unittests.yml +++ b/.github/workflows/unittests.yml @@ -44,7 +44,7 @@ jobs: - name: Install Python dependencies (Ubuntu, <=3.12) if: matrix.os == 'ubuntu-latest' && matrix.python-version != '3.13' run: | - python -m pip install .[dev,audio,pocketsphinx,google-cloud,whisper-local,openai,groq] + python -m pip install .[dev,audio,pocketsphinx,google-cloud,whisper-local,faster-whisper,openai,groq] - name: Install Python dependencies (Ubuntu, 3.13) if: matrix.os == 'ubuntu-latest' && matrix.python-version == '3.13' run: | @@ -53,7 +53,7 @@ jobs: - name: Install Python dependencies (Windows) if: matrix.os == 'windows-latest' run: | - python -m pip install .[dev,whisper-local,google-cloud,openai,groq] + python -m pip install .[dev,whisper-local,faster-whisper,google-cloud,openai,groq] - name: Test with unittest run: | pytest --doctest-modules -v speech_recognition/recognizers/ tests/ diff --git a/setup.cfg b/setup.cfg index 9d1def40..f3848d4d 100644 --- a/setup.cfg +++ b/setup.cfg @@ -12,6 +12,8 @@ google-cloud = whisper-local = openai-whisper soundfile +faster-whisper = + faster-whisper openai = openai httpx < 0.28 From 5c340845fc4eff6de2ef46a405eba4fd6979a1ad Mon Sep 17 00:00:00 2001 From: ftnext Date: Mon, 30 Dec 2024 15:09:31 +0900 Subject: [PATCH 11/22] [docs] Add parameters' type hint --- .../recognizers/whisper_local/faster_whisper.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/speech_recognition/recognizers/whisper_local/faster_whisper.py b/speech_recognition/recognizers/whisper_local/faster_whisper.py index c512bccf..a4fe2105 100644 --- a/speech_recognition/recognizers/whisper_local/faster_whisper.py +++ b/speech_recognition/recognizers/whisper_local/faster_whisper.py @@ -1,6 +1,6 @@ from __future__ import annotations -from typing import TYPE_CHECKING, TypedDict +from typing import TYPE_CHECKING, Literal, TypedDict from speech_recognition.audio import 
AudioData from speech_recognition.recognizers.whisper_local.base import ( @@ -11,6 +11,7 @@ import numpy as np from faster_whisper import WhisperModel from faster_whisper.transcribe import Segment + from typing_extensions import Unpack class TranscribeOutput(TypedDict): @@ -35,12 +36,20 @@ def transcribe( } +class TranscribeOptionalParameters(TypedDict, total=False): + # https://github.com/SYSTRAN/faster-whisper/blob/v1.1.0/faster_whisper/transcribe.py#L692 + language: str + task: Literal["transcribe", "translate"] + beam_size: int + # TODO Add others + + def recognize( recognizer, audio_data: AudioData, model: str = "base", show_dict: bool = False, - **transcribe_options, + **transcribe_options: Unpack[TranscribeOptionalParameters], ) -> str: import torch from faster_whisper import WhisperModel From 5f55e920c6bf102b45492e903c3b7847bfe1904b Mon Sep 17 00:00:00 2001 From: ftnext Date: Mon, 30 Dec 2024 15:14:44 +0900 Subject: [PATCH 12/22] [bugfix] show_dict=True --- .../whisper_local/faster_whisper.py | 4 +- .../whisper_local/test_faster_whisper.py | 59 +++++++++++++++++++ 2 files changed, 62 insertions(+), 1 deletion(-) diff --git a/speech_recognition/recognizers/whisper_local/faster_whisper.py b/speech_recognition/recognizers/whisper_local/faster_whisper.py index a4fe2105..1593d4e3 100644 --- a/speech_recognition/recognizers/whisper_local/faster_whisper.py +++ b/speech_recognition/recognizers/whisper_local/faster_whisper.py @@ -60,4 +60,6 @@ def recognize( whisper_recognizer = WhisperCompatibleRecognizer( TranscribableAdapter(model) ) - return whisper_recognizer.recognize(audio_data, **transcribe_options) + return whisper_recognizer.recognize( + audio_data, show_dict=show_dict, **transcribe_options + ) diff --git a/tests/recognizers/whisper_local/test_faster_whisper.py b/tests/recognizers/whisper_local/test_faster_whisper.py index 750ddd29..06a6b0e6 100644 --- a/tests/recognizers/whisper_local/test_faster_whisper.py +++ b/tests/recognizers/whisper_local/test_faster_whisper.py @@ -122,6 +122,65 @@ def segments_generator(): WhisperModel.assert_called_once_with("base", device="cuda") + @patch("torch.cuda.is_available", return_value=False) + def test_show_dict( + self, is_available, WhisperModel, audio_data, soundfile_read + ): + sf_read, audio_array = soundfile_read + + def segments(): + yield Segment( + id=1, + seek=0, + start=0.0, + end=2.64, + text=" 1, 2, 3", + tokens=[50364, 502, 11, 568, 11, 805, 50496], + avg_logprob=-0.5378808751702309, + compression_ratio=0.4666666666666667, + no_speech_prob=0.17316274344921112, + words=None, + temperature=0.0, + ) + + info = TranscriptionInfo( + language="en", + language_probability=0.9314374923706055, + duration=2.7449375, + duration_after_vad=2.7449375, + all_language_probs=[("en", 0.9314374923706055)], # Omitted + transcription_options=MagicMock(spec=TranscriptionOptions), + vad_options=MagicMock(spec=VadOptions), + ) + + whisper_model = WhisperModel.return_value + whisper_model.transcribe.return_value = segments(), info + + actual = recognize( + MagicMock(spec=Recognizer), audio_data, show_dict=True + ) + + expected = { + "text": " 1, 2, 3", + "language": "en", + "segments": [ + Segment( + id=1, + seek=0, + start=0.0, + end=2.64, + text=" 1, 2, 3", + tokens=[50364, 502, 11, 568, 11, 805, 50496], + avg_logprob=-0.5378808751702309, + compression_ratio=0.4666666666666667, + no_speech_prob=0.17316274344921112, + words=None, + temperature=0.0, + ) + ], + } + assert actual == expected + @patch("torch.cuda.is_available", return_value=False) def 
test_pass_parameters(
        self,

From a422db06e00bac1bf8ba3879a615f5660593f898 Mon Sep 17 00:00:00 2001
From: ftnext
Date: Mon, 30 Dec 2024 15:28:17 +0900
Subject: [PATCH 13/22] [docs] Add docstring

---
 .../whisper_local/faster_whisper.py           | 18 ++++++++++++++++++
 .../recognizers/whisper_local/whisper.py      |  7 +++++--
 2 files changed, 23 insertions(+), 2 deletions(-)

diff --git a/speech_recognition/recognizers/whisper_local/faster_whisper.py b/speech_recognition/recognizers/whisper_local/faster_whisper.py
index 1593d4e3..ffc858b3 100644
--- a/speech_recognition/recognizers/whisper_local/faster_whisper.py
+++ b/speech_recognition/recognizers/whisper_local/faster_whisper.py
@@ -51,6 +51,24 @@ def recognize(
     show_dict: bool = False,
     **transcribe_options: Unpack[TranscribeOptionalParameters],
 ) -> str:
+    """Performs speech recognition on ``audio_data`` (an ``AudioData`` instance), using Whisper.
+
+    Pick ``model`` size (Same as Whisper).
+
+    If ``show_dict`` is true, returns the detailed response from Whisper, including the detected language. Otherwise returns only the transcription.
+
+    You can specify:
+
+    * ``language``: recognition language, a lowercase two-letter language code like "en" or "fr".
+
+        * If not set, Faster Whisper will automatically detect the language.
+
+    * ``task``
+
+        * If you want to transcribe and **translate** to English, set ``task="translate"``.
+
+    Other values are passed directly to whisper. See https://github.com/SYSTRAN/faster-whisper/blob/master/faster_whisper/transcribe.py for all options.
+    """
     import torch
     from faster_whisper import WhisperModel
 
diff --git a/speech_recognition/recognizers/whisper_local/whisper.py b/speech_recognition/recognizers/whisper_local/whisper.py
index c4642e22..622ee071 100644
--- a/speech_recognition/recognizers/whisper_local/whisper.py
+++ b/speech_recognition/recognizers/whisper_local/whisper.py
@@ -87,11 +87,14 @@ def recognize(
     You can specify:
 
     * ``language``: recognition language, an uncapitalized full language name like "english" or "chinese". See the full language list at https://github.com/openai/whisper/blob/main/whisper/tokenizer.py
+
+        * If not set, Whisper will automatically detect the language.
+
     * ``task``
 
-        * If you want transcribe + **translate**, set ``task="translate"``.
+        * If you want to transcribe and **translate** to English, set ``task="translate"``.
 
-    Other values are passed directly to whisper. See https://github.com/openai/whisper/blob/main/whisper/transcribe.py for all options
+    Other values are passed directly to whisper. See https://github.com/openai/whisper/blob/main/whisper/transcribe.py for all options.
     """
     import whisper
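The options documented above pass straight through to the model. A minimal sketch of a call against this module, assuming faster-whisper is installed and ``speech.wav`` is a placeholder recording — the first argument of ``recognize`` exists only for ``Recognizer`` compatibility and is unused, so ``None`` works (the ``__main__`` block added in PATCH 20 below relies on the same fact)::

    import speech_recognition as sr
    from speech_recognition.recognizers.whisper_local import faster_whisper

    r = sr.Recognizer()
    with sr.AudioFile("speech.wav") as source:  # placeholder file name
        audio_data = r.record(source)

    # Pin the language instead of letting Faster Whisper detect it
    print(faster_whisper.recognize(None, audio_data, language="fr"))

    # Transcribe + translate to English, returning the detailed dict
    result = faster_whisper.recognize(None, audio_data, task="translate", show_dict=True)
    print(result["language"], result["text"])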
From ec8d05f9afb108d04f41e32ab82d1e99e4b1e3f2 Mon Sep 17 00:00:00 2001
From: ftnext
Date: Mon, 30 Dec 2024 15:31:07 +0900
Subject: [PATCH 14/22] [docs] Add faster_whisper in reference

---
 reference/library-reference.rst | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/reference/library-reference.rst b/reference/library-reference.rst
index dd5c2169..32ee973a 100644
--- a/reference/library-reference.rst
+++ b/reference/library-reference.rst
@@ -290,11 +290,16 @@
 Returns the most likely transcription if ``show_all`` is false (the default). Otherwise, returns the raw API response as a JSON dictionary.
 
 Raises a ``speech_recognition.UnknownValueError`` exception if the speech is unintelligible. Raises a ``speech_recognition.RequestError`` exception if the speech recognition operation failed, if the key isn't valid, or if there is no internet connection.
 
-``recognizer_instance.recognize_whisper(audio_data: AudioData, model: str="base", show_dict: bool=False, load_options=None, **transcribe_options):``
-----------------------------------------------------------------------------------------------------------------------------------------------------
+``recognizer_instance.recognize_whisper(audio_data: AudioData, model: str="base", show_dict: bool=False, load_options=None, **transcribe_options)``
+---------------------------------------------------------------------------------------------------------------------------------------------------
 
 .. autofunction:: speech_recognition.recognizers.whisper_local.whisper.recognize
 
+``recognizer_instance.recognize_faster_whisper(audio_data: AudioData, model: str="base", show_dict: bool=False, **transcribe_options)``
+---------------------------------------------------------------------------------------------------------------------------------------
+
+.. autofunction:: speech_recognition.recognizers.whisper_local.faster_whisper.recognize
+
 ``recognizer_instance.recognize_openai(audio_data: AudioData, model = "whisper-1", **kwargs)``
 ----------------------------------------------------------------------------------------------

From 4f2506aef5fb5205ef7c370f4ae286659c161a6e Mon Sep 17 00:00:00 2001
From: ftnext
Date: Mon, 30 Dec 2024 15:31:53 +0900
Subject: [PATCH 15/22] [feat] Users can call Recognizer.recognize_faster_whisper()

---
 speech_recognition/__init__.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/speech_recognition/__init__.py b/speech_recognition/__init__.py
index 96d6d74f..140d8cba 100644
--- a/speech_recognition/__init__.py
+++ b/speech_recognition/__init__.py
@@ -1392,13 +1392,14 @@ def flush(self, *args, **kwargs):
 try:
     from .recognizers import google, google_cloud
     from .recognizers.whisper_api import groq, openai
-    from .recognizers.whisper_local import whisper
+    from .recognizers.whisper_local import faster_whisper, whisper
 except (ModuleNotFoundError, ImportError):
     pass
 else:
     Recognizer.recognize_google = google.recognize_legacy
     Recognizer.recognize_google_cloud = google_cloud.recognize
     Recognizer.recognize_whisper = whisper.recognize
+    Recognizer.recognize_faster_whisper = faster_whisper.recognize
     Recognizer.recognize_openai = openai.recognize
     Recognizer.recognize_whisper_api = openai.recognize  # Deprecated
     Recognizer.recognize_groq = groq.recognize

From e23338aa26a99f1a34b8632d563cfa552350e539 Mon Sep 17 00:00:00 2001
From: ftnext
Date: Mon, 30 Dec 2024 15:32:39 +0900
Subject: [PATCH 16/22] [docs] faster-whisper installation

---
 README.rst | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/README.rst b/README.rst
index a05f0670..83a05a4d 100644
--- a/README.rst
+++ b/README.rst
@@ -97,6 +97,7 @@ To use all of the functionality of the library, you should have:
 * **FLAC encoder** (required only if the system is not x86-based Windows/Linux/OS X)
 * **Vosk** (required only if you need to use Vosk API speech recognition ``recognizer_instance.recognize_vosk``)
 * **Whisper** (required only if you need to use Whisper ``recognizer_instance.recognize_whisper``)
+* **Faster Whisper** (required only if you need to use Faster Whisper ``recognizer_instance.recognize_faster_whisper``)
 * **openai** (required only if you need to use OpenAI Whisper API speech recognition ``recognizer_instance.recognize_openai``)
 * **groq** (required only if you need to use Groq Whisper API speech recognition ``recognizer_instance.recognize_groq``)
 
@@ -179,6 +180,13 @@
Whisper is
**required if and only if you want to use whisper** (``recognizer_ins You can install it with ``python3 -m pip install SpeechRecognition[whisper-local]``. +Faster Whisper (for Faster Whisper users) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The library `faster-whisper `__ is **required if and only if you want to use Faster Whisper** (``recognizer_instance.recognize_faster_whisper``). + +You can install it with ``python3 -m pip install SpeechRecognition[faster-whisper]``. + OpenAI Whisper API (for OpenAI Whisper API users) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ From 3fea94819113143e7856ca87e1a701ce4b9873bc Mon Sep 17 00:00:00 2001 From: ftnext Date: Mon, 30 Dec 2024 15:42:53 +0900 Subject: [PATCH 17/22] [bugfix] Install numpy in dev environment * Fix Ubuntu 3.13 failed: ModuleNotFoundError: No module named 'numpy' --- setup.cfg | 1 + 1 file changed, 1 insertion(+) diff --git a/setup.cfg b/setup.cfg index f3848d4d..9a4a4ce8 100644 --- a/setup.cfg +++ b/setup.cfg @@ -3,6 +3,7 @@ dev = pytest pytest-randomly respx + numpy audio = PyAudio >= 0.2.11 pocketsphinx = From 070d02f98650b24e3b36b496e9b33f8891509f6c Mon Sep 17 00:00:00 2001 From: ftnext Date: Mon, 30 Dec 2024 18:08:15 +0900 Subject: [PATCH 18/22] [bugfix] Run (=skip) tests without faster-whisper --- .../whisper_local/test_faster_whisper.py | 30 +++++++++++++++---- 1 file changed, 24 insertions(+), 6 deletions(-) diff --git a/tests/recognizers/whisper_local/test_faster_whisper.py b/tests/recognizers/whisper_local/test_faster_whisper.py index 06a6b0e6..a2d2ffc8 100644 --- a/tests/recognizers/whisper_local/test_faster_whisper.py +++ b/tests/recognizers/whisper_local/test_faster_whisper.py @@ -1,15 +1,12 @@ +from __future__ import annotations + import sys from collections.abc import Generator +from typing import TYPE_CHECKING from unittest.mock import ANY, MagicMock, patch import numpy as np import pytest -from faster_whisper.transcribe import ( - Segment, - TranscriptionInfo, - TranscriptionOptions, - VadOptions, -) from speech_recognition import Recognizer from speech_recognition.audio import AudioData @@ -17,6 +14,9 @@ recognize, ) +if TYPE_CHECKING: + from faster_whisper.transcribe import Segment, TranscriptionInfo + @pytest.fixture def audio_data() -> AudioData: @@ -27,6 +27,8 @@ def audio_data() -> AudioData: @pytest.fixture def segment() -> Segment: + from faster_whisper.transcribe import Segment + mocked_segment = MagicMock(spec=Segment(*[None] * 11)) mocked_segment.text = "" return mocked_segment @@ -34,6 +36,8 @@ def segment() -> Segment: @pytest.fixture def transcription_info() -> TranscriptionInfo: + from faster_whisper.transcribe import TranscriptionInfo + return MagicMock(spec=TranscriptionInfo(*[None] * 7)) @@ -56,6 +60,13 @@ class TestTranscribe: def test_default_parameters( self, is_available, WhisperModel, audio_data, soundfile_read ): + from faster_whisper.transcribe import ( + Segment, + TranscriptionInfo, + TranscriptionOptions, + VadOptions, + ) + sf_read, audio_array = soundfile_read def segments(): @@ -126,6 +137,13 @@ def segments_generator(): def test_show_dict( self, is_available, WhisperModel, audio_data, soundfile_read ): + from faster_whisper.transcribe import ( + Segment, + TranscriptionInfo, + TranscriptionOptions, + VadOptions, + ) + sf_read, audio_array = soundfile_read def segments(): From 0edcc439fcfef609b33422ebf8ce53c7453b8aa8 Mon Sep 17 00:00:00 2001 From: ftnext Date: Tue, 31 Dec 2024 00:08:22 +0900 Subject: [PATCH 19/22] [feat] Use device="auto" (default) --- 
.../whisper_local/faster_whisper.py | 5 +-- .../whisper_local/test_faster_whisper.py | 37 ++----------------- 2 files changed, 5 insertions(+), 37 deletions(-) diff --git a/speech_recognition/recognizers/whisper_local/faster_whisper.py b/speech_recognition/recognizers/whisper_local/faster_whisper.py index ffc858b3..e8a41664 100644 --- a/speech_recognition/recognizers/whisper_local/faster_whisper.py +++ b/speech_recognition/recognizers/whisper_local/faster_whisper.py @@ -69,12 +69,9 @@ def recognize( Other values are passed directly to whisper. See https://github.com/SYSTRAN/faster-whisper/blob/master/faster_whisper/transcribe.py for all options. """ - import torch from faster_whisper import WhisperModel - device = "cuda" if torch.cuda.is_available() else "cpu" - - model = WhisperModel(model, device=device) + model = WhisperModel(model) whisper_recognizer = WhisperCompatibleRecognizer( TranscribableAdapter(model) ) diff --git a/tests/recognizers/whisper_local/test_faster_whisper.py b/tests/recognizers/whisper_local/test_faster_whisper.py index a2d2ffc8..05883250 100644 --- a/tests/recognizers/whisper_local/test_faster_whisper.py +++ b/tests/recognizers/whisper_local/test_faster_whisper.py @@ -56,9 +56,8 @@ def soundfile_read() -> Generator[tuple[MagicMock, np.ndarray], None, None]: ) @patch("faster_whisper.WhisperModel") class TestTranscribe: - @patch("torch.cuda.is_available", return_value=False) def test_default_parameters( - self, is_available, WhisperModel, audio_data, soundfile_read + self, WhisperModel, audio_data, soundfile_read ): from faster_whisper.transcribe import ( Segment, @@ -101,8 +100,7 @@ def segments(): actual = recognize(MagicMock(spec=Recognizer), audio_data) assert actual == " 1, 2, 3" - is_available.assert_called_once_with() - WhisperModel.assert_called_once_with("base", device="cpu") + WhisperModel.assert_called_once_with("base") audio_data.get_wav_data.assert_called_once_with(convert_rate=16_000) sf_read.assert_called_once_with(ANY) assert sf_read.call_args[0][0].read() == b"audio data" @@ -111,32 +109,7 @@ def segments(): audio_array.astype.return_value ) - @patch("torch.cuda.is_available", return_value=True) - def test_gpu_available( - self, - is_available, - WhisperModel, - audio_data, - segment, - transcription_info, - soundfile_read, - ): - def segments_generator(): - yield segment - - WhisperModel.return_value.transcribe.return_value = ( - segments_generator(), - transcription_info, - ) - - _ = recognize(MagicMock(spec=Recognizer), audio_data) - - WhisperModel.assert_called_once_with("base", device="cuda") - - @patch("torch.cuda.is_available", return_value=False) - def test_show_dict( - self, is_available, WhisperModel, audio_data, soundfile_read - ): + def test_show_dict(self, WhisperModel, audio_data, soundfile_read): from faster_whisper.transcribe import ( Segment, TranscriptionInfo, @@ -199,10 +172,8 @@ def segments(): } assert actual == expected - @patch("torch.cuda.is_available", return_value=False) def test_pass_parameters( self, - is_available, WhisperModel, audio_data, segment, @@ -230,7 +201,7 @@ def segments_generator(): beam_size=5, ) - WhisperModel.assert_called_once_with("small", device="cpu") + WhisperModel.assert_called_once_with("small") whisper_model.transcribe.assert_called_once_with( audio_array.astype.return_value, language="fr", From ef69c4a382f58d1a859522786bd5c453a7036995 Mon Sep 17 00:00:00 2001 From: ftnext Date: Tue, 31 Dec 2024 00:15:58 +0900 Subject: [PATCH 20/22] [feat] Run manually as e2e python -m 
speech_recognition.recognizers.whisper_local.faster_whisper

---
 .../recognizers/whisper_local/faster_whisper.py | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

diff --git a/speech_recognition/recognizers/whisper_local/faster_whisper.py b/speech_recognition/recognizers/whisper_local/faster_whisper.py
index e8a41664..a6d59ae6 100644
--- a/speech_recognition/recognizers/whisper_local/faster_whisper.py
+++ b/speech_recognition/recognizers/whisper_local/faster_whisper.py
@@ -78,3 +78,20 @@ def recognize(
     return whisper_recognizer.recognize(
         audio_data, show_dict=show_dict, **transcribe_options
     )
+
+
+if __name__ == "__main__":
+    import argparse
+
+    import speech_recognition as sr
+
+    parser = argparse.ArgumentParser()
+    parser.add_argument("audio_file")
+    args = parser.parse_args()
+
+    r = sr.Recognizer()
+    with sr.AudioFile(args.audio_file) as source:
+        audio_data = r.listen(source)
+
+    transcription = recognize(None, audio_data)
+    print(transcription)
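With the block above in place, the module doubles as a manual end-to-end check; the audio path is whatever file you have at hand::

    python -m speech_recognition.recognizers.whisper_local.faster_whisper path/to/audio.wav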
From ac556c15d6323e3654673364478754e9d2a3f904 Mon Sep 17 00:00:00 2001
From: ftnext
Date: Tue, 31 Dec 2024 00:27:03 +0900
Subject: [PATCH 21/22] [feat] Allow specifying initialization parameters

---
 .../whisper_local/faster_whisper.py           | 11 +++++++++-
 .../whisper_local/test_faster_whisper.py      | 27 +++++++++++++++++--
 2 files changed, 35 insertions(+), 3 deletions(-)

diff --git a/speech_recognition/recognizers/whisper_local/faster_whisper.py b/speech_recognition/recognizers/whisper_local/faster_whisper.py
index a6d59ae6..48a2e696 100644
--- a/speech_recognition/recognizers/whisper_local/faster_whisper.py
+++ b/speech_recognition/recognizers/whisper_local/faster_whisper.py
@@ -36,6 +36,14 @@ def transcribe(
         }
 
 
+class InitOptionalParameters(TypedDict, total=False):
+    # https://github.com/SYSTRAN/faster-whisper/blob/v1.1.0/faster_whisper/transcribe.py#L575
+    device: Literal["cpu", "cuda", "auto"]
+    compute_type: str
+    download_root: str
+    # TODO Add others
+
+
 class TranscribeOptionalParameters(TypedDict, total=False):
     # https://github.com/SYSTRAN/faster-whisper/blob/v1.1.0/faster_whisper/transcribe.py#L692
     language: str
@@ -49,6 +57,7 @@ def recognize(
     audio_data: AudioData,
     model: str = "base",
     show_dict: bool = False,
+    init_options: InitOptionalParameters | None = None,
     **transcribe_options: Unpack[TranscribeOptionalParameters],
 ) -> str:
     """Performs speech recognition on ``audio_data`` (an ``AudioData`` instance), using Whisper.
@@ -71,7 +80,7 @@ def recognize(
     """
     from faster_whisper import WhisperModel
 
-    model = WhisperModel(model)
+    model = WhisperModel(model, **init_options or {})
     whisper_recognizer = WhisperCompatibleRecognizer(
         TranscribableAdapter(model)
     )
diff --git a/tests/recognizers/whisper_local/test_faster_whisper.py b/tests/recognizers/whisper_local/test_faster_whisper.py
index 05883250..e2eb7632 100644
--- a/tests/recognizers/whisper_local/test_faster_whisper.py
+++ b/tests/recognizers/whisper_local/test_faster_whisper.py
@@ -117,8 +117,6 @@ def test_show_dict(self, WhisperModel, audio_data, soundfile_read):
             VadOptions,
         )
 
-        sf_read, audio_array = soundfile_read
-
         def segments():
             yield Segment(
@@ -208,3 +206,28 @@ def segments_generator():
             task="translate",
             beam_size=5,
         )
+
+    def test_init_parameters(
+        self,
+        WhisperModel,
+        audio_data,
+        segment,
+        transcription_info,
+        soundfile_read,
+    ):
+        def segments_generator():
+            yield segment
+
+        whisper_model = WhisperModel.return_value
+        whisper_model.transcribe.return_value = (
+            segments_generator(),
+            transcription_info,
+        )
+
+        _ = recognize(
+            MagicMock(spec=Recognizer),
+            audio_data,
+            init_options={"compute_type": "int8"},
+        )
+
+        WhisperModel.assert_called_once_with("base", compute_type="int8")

From e663cb0609b76afc64ee90883d1cc72ab6a951f7 Mon Sep 17 00:00:00 2001
From: ftnext
Date: Tue, 31 Dec 2024 00:41:37 +0900
Subject: [PATCH 22/22] [docs] Fix type hint

---
 speech_recognition/recognizers/whisper_local/faster_whisper.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/speech_recognition/recognizers/whisper_local/faster_whisper.py b/speech_recognition/recognizers/whisper_local/faster_whisper.py
index 48a2e696..e7ce10fb 100644
--- a/speech_recognition/recognizers/whisper_local/faster_whisper.py
+++ b/speech_recognition/recognizers/whisper_local/faster_whisper.py
@@ -59,7 +59,7 @@ def recognize(
     show_dict: bool = False,
     init_options: InitOptionalParameters | None = None,
     **transcribe_options: Unpack[TranscribeOptionalParameters],
-) -> str:
+) -> str | TranscribeOutput:
     """Performs speech recognition on ``audio_data`` (an ``AudioData`` instance), using Whisper.
 
     Pick ``model`` size (Same as Whisper).
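Taken together, the series leaves ``recognize_faster_whisper`` accepting a model size, ``show_dict``, ``init_options``, and transcription options. A closing sketch, not canonical usage — the option values are illustrative, ``speech.wav`` is a placeholder, and ``init_options`` keys are forwarded verbatim to ``faster_whisper.WhisperModel``::

    import speech_recognition as sr

    r = sr.Recognizer()
    with sr.AudioFile("speech.wav") as source:  # placeholder file name
        audio_data = r.record(source)

    result = r.recognize_faster_whisper(
        audio_data,
        model="small",
        show_dict=True,  # return the TranscribeOutput dict, not just the text
        init_options={"device": "cpu", "compute_type": "int8"},
        language="en",
        beam_size=5,
    )
    print(result["text"])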