Skip to content

Commit

Permalink
Merge pull request #812 from ftnext/feat/google-cloud-parameters
Browse files Browse the repository at this point in the history
Refine google_cloud recognizer's parameters
  • Loading branch information
ftnext authored Dec 28, 2024
2 parents 0fb14ff + 53dccb5 commit 346ea8c
Show file tree
Hide file tree
Showing 8 changed files with 97 additions and 60 deletions.
5 changes: 3 additions & 2 deletions README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -160,8 +160,9 @@ You can install it with :command:`python3 -m pip install SpeechRecognition[googl

**Prerequisite**: Create local authentication credentials for your Google account

* `Before you begin (Transcribe speech to text by using client libraries) <https://cloud.google.com/speech-to-text/docs/transcribe-client-libraries#before-you-begin>`__
* Detail: `User credentials (Set up ADC for a local development environment) <https://cloud.google.com/docs/authentication/set-up-adc-local-dev-environment#local-user-cred>`__
* Digest: `Before you begin (Transcribe speech to text by using client libraries) <https://cloud.google.com/speech-to-text/docs/transcribe-client-libraries#before-you-begin>`__
* `Set up Speech-to-Text <https://cloud.google.com/speech-to-text/docs/before-you-begin>`__
* `User credentials (Set up ADC for a local development environment) <https://cloud.google.com/docs/authentication/set-up-adc-local-dev-environment#local-user-cred>`__

Currently only `V1 <https://cloud.google.com/speech-to-text/docs/quickstart>`__ is supported. (`V2 <https://cloud.google.com/speech-to-text/v2/docs/quickstart>`__ is not supported)

Expand Down
4 changes: 2 additions & 2 deletions examples/audio_transcribe.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,9 +33,9 @@
print("Could not request results from Google Speech Recognition service; {0}".format(e))

# recognize speech using Google Cloud Speech
GOOGLE_CLOUD_SPEECH_CREDENTIALS = r"""INSERT THE CONTENTS OF THE GOOGLE CLOUD SPEECH JSON CREDENTIALS FILE HERE"""
# Before running, create local authentication credentials (``gcloud auth application-default login``)
try:
print("Google Cloud Speech thinks you said " + r.recognize_google_cloud(audio, credentials_json=GOOGLE_CLOUD_SPEECH_CREDENTIALS))
print("Google Cloud Speech thinks you said " + r.recognize_google_cloud(audio))
except sr.UnknownValueError:
print("Google Cloud Speech could not understand audio")
except sr.RequestError as e:
Expand Down
4 changes: 2 additions & 2 deletions examples/extended_results.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,10 +37,10 @@
print("Could not request results from Google Speech Recognition service; {0}".format(e))

# recognize speech using Google Cloud Speech
GOOGLE_CLOUD_SPEECH_CREDENTIALS = r"""INSERT THE CONTENTS OF THE GOOGLE CLOUD SPEECH JSON CREDENTIALS FILE HERE"""
# Before running, create local authentication credentials (``gcloud auth application-default login``)
try:
print("Google Cloud Speech recognition results:")
pprint(r.recognize_google_cloud(audio, credentials_json=GOOGLE_CLOUD_SPEECH_CREDENTIALS, show_all=True)) # pretty-print the recognition result
pprint(r.recognize_google_cloud(audio, show_all=True)) # pretty-print the recognition result
except sr.UnknownValueError:
print("Google Cloud Speech could not understand audio")
except sr.RequestError as e:
Expand Down
4 changes: 2 additions & 2 deletions examples/microphone_recognition.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,9 +32,9 @@
print("Could not request results from Google Speech Recognition service; {0}".format(e))

# recognize speech using Google Cloud Speech
GOOGLE_CLOUD_SPEECH_CREDENTIALS = r"""INSERT THE CONTENTS OF THE GOOGLE CLOUD SPEECH JSON CREDENTIALS FILE HERE"""
# Before running, create local authentication credentials (``gcloud auth application-default login``)
try:
print("Google Cloud Speech thinks you said " + r.recognize_google_cloud(audio, credentials_json=GOOGLE_CLOUD_SPEECH_CREDENTIALS))
print("Google Cloud Speech thinks you said " + r.recognize_google_cloud(audio))
except sr.UnknownValueError:
print("Google Cloud Speech could not understand audio")
except sr.RequestError as e:
Expand Down
6 changes: 3 additions & 3 deletions examples/special_recognizer_features.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,11 +35,11 @@


# recognize preferred phrases using Google Cloud Speech
GOOGLE_CLOUD_SPEECH_CREDENTIALS = r"""INSERT THE CONTENTS OF THE GOOGLE CLOUD SPEECH JSON CREDENTIALS FILE HERE"""
# Before running, create local authentication credentials (``gcloud auth application-default login``)
try:
print("Google Cloud Speech recognition for \"numero\" with different sets of preferred phrases:")
print(r.recognize_google_cloud(audio_fr, credentials_json=GOOGLE_CLOUD_SPEECH_CREDENTIALS, preferred_phrases=["noomarow"]))
print(r.recognize_google_cloud(audio_fr, credentials_json=GOOGLE_CLOUD_SPEECH_CREDENTIALS, preferred_phrases=["newmarrow"]))
print(r.recognize_google_cloud(audio_fr, preferred_phrases=["noomarow"]))
print(r.recognize_google_cloud(audio_fr, preferred_phrases=["newmarrow"]))
except sr.UnknownValueError:
print("Google Cloud Speech could not understand audio")
except sr.RequestError as e:
Expand Down
4 changes: 2 additions & 2 deletions reference/library-reference.rst
Original file line number Diff line number Diff line change
Expand Up @@ -227,8 +227,8 @@ Returns the most likely transcription if ``show_all`` is false (the default). Ot

Raises a ``speech_recognition.UnknownValueError`` exception if the speech is unintelligible. Raises a ``speech_recognition.RequestError`` exception if the speech recognition operation failed, if the key isn't valid, or if there is no internet connection.

``recognizer_instance.recognize_google_cloud(audio_data: AudioData, credentials_json_path: Union[str, None] = None, language: str = "en-US", preferred_phrases: Union[Iterable[str], None] = None, show_all: bool = False, **api_params) -> Union[str, Dict[str, Any]]``
------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
``recognizer_instance.recognize_google_cloud(audio_data: AudioData, credentials_json_path: Union[str, None] = None, **kwargs) -> Union[str, Dict[str, Any]]``
-------------------------------------------------------------------------------------------------------------------------------------------------------------

.. autofunction:: speech_recognition.recognizers.google_cloud.recognize

Expand Down
125 changes: 79 additions & 46 deletions speech_recognition/recognizers/google_cloud.py
Original file line number Diff line number Diff line change
@@ -1,52 +1,99 @@
from __future__ import annotations

from typing import TYPE_CHECKING, TypedDict
from urllib.error import URLError

from speech_recognition.audio import AudioData
from speech_recognition.exceptions import RequestError, UnknownValueError

if TYPE_CHECKING:
from google.cloud.speech import (
RecognitionConfig,
RecognizeResponse,
SpeechContext,
)
from typing_extensions import Required

def recognize(
recognizer,
audio_data: AudioData,
credentials_json_path: str | None = None,
language: str = "en-US",
preferred_phrases=None,
show_all: bool = False,
**api_params,
):
"""Performs speech recognition on ``audio_data`` (an ``AudioData`` instance), using the Google Cloud Speech-to-Text V1 API.

This function requires a Google Cloud Platform account; see the `Google Cloud Speech API Quickstart <https://cloud.google.com/speech/docs/getting-started>`__ for details and instructions. Basically, create a project, enable billing for the project, enable the Google Cloud Speech API for the project, and set up Service Account Key credentials for the project. The result is a JSON file containing the API credentials. The text content of this JSON file is specified by ``credentials_json``. If not specified, the library will try to automatically `find the default API credentials JSON file <https://developers.google.com/identity/protocols/application-default-credentials>`__.
class GoogleCloudRecognizerParameters(TypedDict, total=False):
"""Optional parameters.
The recognition language is determined by ``language_code``, which is a BCP-47 language tag like ``"en-US"`` (US English). Default: ``"en-US"``.
A list of supported language tags can be found in the `Speech-to-Text supported languages <https://cloud.google.com/speech/docs/languages>`__.
If ``preferred_phrases`` is an iterable of phrase strings, those given phrases will be more likely to be recognized over similar-sounding alternatives.
This is useful for things like keyword/command recognition or adding new phrases that aren't in Google's vocabulary.
Note that the API imposes certain `restrictions on the list of phrase strings <https://cloud.google.com/speech/limits#content>`__.
``show_all``: See :py:func:`recognize`.
``model``: You can select the model to get best results. (See `RecognitionConfig's documentation <https://cloud.google.com/python/docs/reference/speech/latest/google.cloud.speech_v1.types.RecognitionConfig>`__ for detail)
The recognition language is determined by ``language``, which is a BCP-47 language tag like ``"en-US"`` (US English). A list of supported language tags can be found in the `Google Cloud Speech API documentation <https://cloud.google.com/speech/docs/languages>`__.
``use_enhanced``: Set to true to use an enhanced model for speech recognition.
"""

# SpeechRecognition specific parameters
preferred_phrases: list[str]
show_all: bool

# Speech-to-Text V1 API's parameters
language_code: str
model: str
use_enhanced: bool
# TODO: Add support for other RecognitionConfig parameters


class GoogleCloudSpeechV1Parameters(TypedDict, total=False):
"""Speech-to-Text V1 API's parameters.
If ``preferred_phrases`` is an iterable of phrase strings, those given phrases will be more likely to be recognized over similar-sounding alternatives. This is useful for things like keyword/command recognition or adding new phrases that aren't in Google's vocabulary. Note that the API imposes certain `restrictions on the list of phrase strings <https://cloud.google.com/speech/limits#content>`__.
https://cloud.google.com/python/docs/reference/speech/latest/google.cloud.speech_v1.types.RecognitionConfig
"""

``api_params`` are Cloud Speech API-specific parameters as dict (optional). For more information see <https://cloud.google.com/python/docs/reference/speech/latest/google.cloud.speech_v1.types.RecognitionConfig>
encoding: Required[RecognitionConfig.AudioEncoding]
sample_rate_hertz: Required[int]
language_code: Required[str]
speech_contexts: list[SpeechContext]
enable_word_time_offsets: bool
model: str
use_enhanced: bool

The ``use_enhanced`` is a boolean option. If use_enhanced is set to true and the model field is not set,
then an appropriate enhanced model is chosen if an enhanced model exists for the audio.
If use_enhanced is true and an enhanced version of the specified model does not exist,
then the speech is recognized using the standard version of the specified model.

Furthermore, if the option ``use_enhanced`` has not been set the option ``model`` can be used, which can be used to select the model best
suited to your domain to get best results. If a model is not explicitly specified,
then we auto-select a model based on the other parameters of this method.
def _build_config(
    audio_data: AudioData, recognizer_params: GoogleCloudRecognizerParameters
) -> RecognitionConfig:
    """Build the ``RecognitionConfig`` sent to the Speech-to-Text V1 API.

    The audio is always declared as FLAC at ``audio_data.sample_rate`` Hz.
    SpeechRecognition-specific options in ``recognizer_params`` are translated
    to their API counterparts:

    * ``language_code`` falls back to ``"en-US"`` when it is absent.
    * A non-empty ``preferred_phrases`` becomes a single ``SpeechContext``
      stored under ``speech_contexts``.
    * A truthy ``show_all`` turns on ``enable_word_time_offsets``.

    Every remaining entry is forwarded to ``RecognitionConfig`` as-is and
    takes precedence over the values computed here.

    ``recognizer_params`` itself is never modified: the options are consumed
    from a local shallow copy, so callers may pass their own mapping directly.

    Returns the populated ``google.cloud.speech.RecognitionConfig``.
    """
    # Imported inside the function so that importing this module does not
    # itself require the google-cloud-speech package (the module-level import
    # is guarded by ``TYPE_CHECKING``).
    from google.cloud import speech

    # Consume options from a copy; popping from the caller's dict would
    # silently strip keys (e.g. ``show_all``) the caller may still read.
    remaining = dict(recognizer_params)

    parameters: GoogleCloudSpeechV1Parameters = {
        "encoding": speech.RecognitionConfig.AudioEncoding.FLAC,
        "sample_rate_hertz": audio_data.sample_rate,
        "language_code": remaining.pop("language_code", "en-US"),
    }
    # ``None`` and an empty list are both treated as "no phrase hints".
    if preferred_phrases := remaining.pop("preferred_phrases", None):
        parameters["speech_contexts"] = [
            speech.SpeechContext(phrases=preferred_phrases)
        ]
    if remaining.pop("show_all", False):
        # ref: https://cloud.google.com/speech-to-text/docs/async-time-offsets
        parameters["enable_word_time_offsets"] = True
    # Right-hand operand wins, so explicitly passed API parameters override
    # the defaults assembled above.
    return speech.RecognitionConfig(**(parameters | remaining))


def recognize(
recognizer,
audio_data: AudioData,
credentials_json_path: str | None = None,
**kwargs: GoogleCloudRecognizerParameters,
) -> str | RecognizeResponse:
"""Performs speech recognition on ``audio_data`` (an ``AudioData`` instance), using the Google Cloud Speech-to-Text V1 API.
This function requires a Google Cloud Platform account; see `Set up Speech-to-Text <https://cloud.google.com/speech-to-text/docs/before-you-begin>`__ for details and instructions. Basically, create a project, enable billing for the project, and enable the Google Cloud Speech API for the project.
Then create local authentication credentials for your user account. The result is a JSON file containing the API credentials. You can specify the JSON file by ``credentials_json_path``. If not specified, the library will try to automatically `find the default API credentials JSON file <https://developers.google.com/identity/protocols/application-default-credentials>`__.
Returns the most likely transcription if ``show_all`` is False (the default). Otherwise, returns the raw API response as a ``RecognizeResponse`` object.
For other parameters, see :py:class:`GoogleCloudRecognizerParameters`.
Raises a ``speech_recognition.UnknownValueError`` exception if the speech is unintelligible. Raises a ``speech_recognition.RequestError`` exception if the speech recognition operation failed, if the credentials aren't valid, or if there is no Internet connection.
"""
assert isinstance(
audio_data, AudioData
), "``audio_data`` must be audio data"
assert isinstance(language, str), "``language`` must be a string"
assert preferred_phrases is None or all(
isinstance(preferred_phrases, (type(""), type("")))
for preferred_phrases in preferred_phrases
), "``preferred_phrases`` must be a list of strings"

try:
from google.api_core.exceptions import GoogleAPICallError
from google.cloud import speech
Expand All @@ -72,21 +119,7 @@ def recognize(
)
audio = speech.RecognitionAudio(content=flac_data)

config = {
"encoding": speech.RecognitionConfig.AudioEncoding.FLAC,
"sample_rate_hertz": audio_data.sample_rate,
"language_code": language,
**api_params,
}
if preferred_phrases is not None:
config["speechContexts"] = [
speech.SpeechContext(phrases=preferred_phrases)
]
if show_all:
# ref: https://cloud.google.com/speech-to-text/docs/async-time-offsets
config["enable_word_time_offsets"] = True

config = speech.RecognitionConfig(**config)
config = _build_config(audio_data, kwargs.copy())

try:
response = client.recognize(config=config, audio=audio)
Expand All @@ -97,7 +130,7 @@ def recognize(
"recognition connection failed: {0}".format(e.reason)
)

if show_all:
if kwargs.get("show_all"):
return response
if len(response.results) == 0:
raise UnknownValueError()
Expand Down
5 changes: 4 additions & 1 deletion tests/recognizers/test_google_cloud.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
RecognitionAudio,
RecognitionConfig,
RecognizeResponse,
SpeechContext,
SpeechRecognitionAlternative,
SpeechRecognitionResult,
WordInfo,
Expand Down Expand Up @@ -164,7 +165,8 @@ def test_transcribe_with_specified_api_parameters(SpeechClient):
_ = recognize(
MagicMock(spec=Recognizer),
audio_data,
language="ja-JP",
language_code="ja-JP",
preferred_phrases=["numero", "hoge"],
use_enhanced=True,
)

Expand All @@ -173,6 +175,7 @@ def test_transcribe_with_specified_api_parameters(SpeechClient):
encoding=RecognitionConfig.AudioEncoding.FLAC,
sample_rate_hertz=16_000,
language_code="ja-JP",
speech_contexts=[SpeechContext(phrases=["numero", "hoge"])],
use_enhanced=True,
),
audio=RecognitionAudio(content=b"flac_data"),
Expand Down

0 comments on commit 346ea8c

Please sign in to comment.