Merge pull request #812 from ftnext/feat/google-cloud-parameters

Refine google_cloud recognizer's parameters
Uberi · Dec 28, 2024 · 346ea8c · 346ea8c
2 parents 0fb14ff + 53dccb5
commit 346ea8c
Show file tree

Hide file tree

Showing 8 changed files with 97 additions and 60 deletions.
diff --git a/README.rst b/README.rst
@@ -160,8 +160,9 @@ You can install it with :command:`python3 -m pip install SpeechRecognition[googl
 
 **Prerequisite**: Create local authentication credentials for your Google account
 
-* `Before you begin (Transcribe speech to text by using client libraries) <https://cloud.google.com/speech-to-text/docs/transcribe-client-libraries#before-you-begin>`__
-* Detail: `User credentials (Set up ADC for a local development environment) <https://cloud.google.com/docs/authentication/set-up-adc-local-dev-environment#local-user-cred>`__
+* Digest: `Before you begin (Transcribe speech to text by using client libraries) <https://cloud.google.com/speech-to-text/docs/transcribe-client-libraries#before-you-begin>`__
+* `Set up Speech-to-Text <https://cloud.google.com/speech-to-text/docs/before-you-begin>`__
+* `User credentials (Set up ADC for a local development environment) <https://cloud.google.com/docs/authentication/set-up-adc-local-dev-environment#local-user-cred>`__
 
 Currently only `V1 <https://cloud.google.com/speech-to-text/docs/quickstart>`__ is supported. (`V2 <https://cloud.google.com/speech-to-text/v2/docs/quickstart>`__ is not supported)
 

diff --git a/examples/audio_transcribe.py b/examples/audio_transcribe.py
@@ -33,9 +33,9 @@
     print("Could not request results from Google Speech Recognition service; {0}".format(e))
 
 # recognize speech using Google Cloud Speech
-GOOGLE_CLOUD_SPEECH_CREDENTIALS = r"""INSERT THE CONTENTS OF THE GOOGLE CLOUD SPEECH JSON CREDENTIALS FILE HERE"""
+# Before running, create local authentication credentials (``gcloud auth application-default login``)
 try:
-    print("Google Cloud Speech thinks you said " + r.recognize_google_cloud(audio, credentials_json=GOOGLE_CLOUD_SPEECH_CREDENTIALS))
+    print("Google Cloud Speech thinks you said " + r.recognize_google_cloud(audio))
 except sr.UnknownValueError:
     print("Google Cloud Speech could not understand audio")
 except sr.RequestError as e:

diff --git a/examples/extended_results.py b/examples/extended_results.py
@@ -37,10 +37,10 @@
     print("Could not request results from Google Speech Recognition service; {0}".format(e))
 
 # recognize speech using Google Cloud Speech
-GOOGLE_CLOUD_SPEECH_CREDENTIALS = r"""INSERT THE CONTENTS OF THE GOOGLE CLOUD SPEECH JSON CREDENTIALS FILE HERE"""
+# Before running, create local authentication credentials (``gcloud auth application-default login``)
 try:
     print("Google Cloud Speech recognition results:")
-    pprint(r.recognize_google_cloud(audio, credentials_json=GOOGLE_CLOUD_SPEECH_CREDENTIALS, show_all=True))  # pretty-print the recognition result
+    pprint(r.recognize_google_cloud(audio, show_all=True))  # pretty-print the recognition result
 except sr.UnknownValueError:
     print("Google Cloud Speech could not understand audio")
 except sr.RequestError as e:

diff --git a/examples/microphone_recognition.py b/examples/microphone_recognition.py
@@ -32,9 +32,9 @@
     print("Could not request results from Google Speech Recognition service; {0}".format(e))
 
 # recognize speech using Google Cloud Speech
-GOOGLE_CLOUD_SPEECH_CREDENTIALS = r"""INSERT THE CONTENTS OF THE GOOGLE CLOUD SPEECH JSON CREDENTIALS FILE HERE"""
+# Before running, create local authentication credentials (``gcloud auth application-default login``)
 try:
-    print("Google Cloud Speech thinks you said " + r.recognize_google_cloud(audio, credentials_json=GOOGLE_CLOUD_SPEECH_CREDENTIALS))
+    print("Google Cloud Speech thinks you said " + r.recognize_google_cloud(audio))
 except sr.UnknownValueError:
     print("Google Cloud Speech could not understand audio")
 except sr.RequestError as e:

diff --git a/examples/special_recognizer_features.py b/examples/special_recognizer_features.py
@@ -35,11 +35,11 @@
 
 
 # recognize preferred phrases using Google Cloud Speech
-GOOGLE_CLOUD_SPEECH_CREDENTIALS = r"""INSERT THE CONTENTS OF THE GOOGLE CLOUD SPEECH JSON CREDENTIALS FILE HERE"""
+# Before running, create local authentication credentials (``gcloud auth application-default login``)
 try:
     print("Google Cloud Speech recognition for \"numero\" with different sets of preferred phrases:")
-    print(r.recognize_google_cloud(audio_fr, credentials_json=GOOGLE_CLOUD_SPEECH_CREDENTIALS, preferred_phrases=["noomarow"]))
-    print(r.recognize_google_cloud(audio_fr, credentials_json=GOOGLE_CLOUD_SPEECH_CREDENTIALS, preferred_phrases=["newmarrow"]))
+    print(r.recognize_google_cloud(audio_fr, preferred_phrases=["noomarow"]))
+    print(r.recognize_google_cloud(audio_fr, preferred_phrases=["newmarrow"]))
 except sr.UnknownValueError:
     print("Google Cloud Speech could not understand audio")
 except sr.RequestError as e:

diff --git a/reference/library-reference.rst b/reference/library-reference.rst
@@ -227,8 +227,8 @@ Returns the most likely transcription if ``show_all`` is false (the default). Ot
 
 Raises a ``speech_recognition.UnknownValueError`` exception if the speech is unintelligible. Raises a ``speech_recognition.RequestError`` exception if the speech recognition operation failed, if the key isn't valid, or if there is no internet connection.
 
-``recognizer_instance.recognize_google_cloud(audio_data: AudioData, credentials_json_path: Union[str, None] = None, language: str = "en-US", preferred_phrases: Union[Iterable[str], None] = None, show_all: bool = False, **api_params) -> Union[str, Dict[str, Any]]``
-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+``recognizer_instance.recognize_google_cloud(audio_data: AudioData, credentials_json_path: Union[str, None] = None, **kwargs) -> Union[str, Dict[str, Any]]``
+-------------------------------------------------------------------------------------------------------------------------------------------------------------
 
 .. autofunction:: speech_recognition.recognizers.google_cloud.recognize
 

diff --git a/speech_recognition/recognizers/google_cloud.py b/speech_recognition/recognizers/google_cloud.py
@@ -1,52 +1,99 @@
 from __future__ import annotations
 
+from typing import TYPE_CHECKING, TypedDict
 from urllib.error import URLError
 
 from speech_recognition.audio import AudioData
 from speech_recognition.exceptions import RequestError, UnknownValueError
 
+if TYPE_CHECKING:
+    from google.cloud.speech import (
+        RecognitionConfig,
+        RecognizeResponse,
+        SpeechContext,
+    )
+    from typing_extensions import Required
 
-def recognize(
-    recognizer,
-    audio_data: AudioData,
-    credentials_json_path: str | None = None,
-    language: str = "en-US",
-    preferred_phrases=None,
-    show_all: bool = False,
-    **api_params,
-):
-    """Performs speech recognition on ``audio_data`` (an ``AudioData`` instance), using the Google Cloud Speech-to-Text V1 API.
 
-    This function requires a Google Cloud Platform account; see the `Google Cloud Speech API Quickstart <https://cloud.google.com/speech/docs/getting-started>`__ for details and instructions. Basically, create a project, enable billing for the project, enable the Google Cloud Speech API for the project, and set up Service Account Key credentials for the project. The result is a JSON file containing the API credentials. The text content of this JSON file is specified by ``credentials_json``. If not specified, the library will try to automatically `find the default API credentials JSON file <https://developers.google.com/identity/protocols/application-default-credentials>`__.
+class GoogleCloudRecognizerParameters(TypedDict, total=False):
+    """Optional parameters.
+
+    The recognition language is determined by ``language_code``, which is a BCP-47 language tag like ``"en-US"`` (US English). Default: ``"en-US"``.
+    A list of supported language tags can be found in the `Speech-to-Text supported languages <https://cloud.google.com/speech/docs/languages>`__.
+
+    If ``preferred_phrases`` is an iterable of phrase strings, those given phrases will be more likely to be recognized over similar-sounding alternatives.
+    This is useful for things like keyword/command recognition or adding new phrases that aren't in Google's vocabulary.
+    Note that the API imposes certain `restrictions on the list of phrase strings <https://cloud.google.com/speech/limits#content>`__.
+
+    ``show_all``: See :py:func:`recognize`.
+
+    ``model``: You can select the model to get the best results. (See `RecognitionConfig's documentation <https://cloud.google.com/python/docs/reference/speech/latest/google.cloud.speech_v1.types.RecognitionConfig>`__ for details.)
 
-    The recognition language is determined by ``language``, which is a BCP-47 language tag like ``"en-US"`` (US English). A list of supported language tags can be found in the `Google Cloud Speech API documentation <https://cloud.google.com/speech/docs/languages>`__.
+    ``use_enhanced``: Set to true to use an enhanced model for speech recognition.
+    """
+
+    # SpeechRecognition specific parameters
+    preferred_phrases: list[str]
+    show_all: bool
+
+    # Speech-to-Text V1 API's parameters
+    language_code: str
+    model: str
+    use_enhanced: bool
+    # TODO: Add support for other Speech-to-Text V1 API parameters
+
+
+class GoogleCloudSpeechV1Parameters(TypedDict, total=False):
+    """Speech-to-Text V1 API's parameters.
 
-    If ``preferred_phrases`` is an iterable of phrase strings, those given phrases will be more likely to be recognized over similar-sounding alternatives. This is useful for things like keyword/command recognition or adding new phrases that aren't in Google's vocabulary. Note that the API imposes certain `restrictions on the list of phrase strings <https://cloud.google.com/speech/limits#content>`__.
+    https://cloud.google.com/python/docs/reference/speech/latest/google.cloud.speech_v1.types.RecognitionConfig
+    """
 
-    ``api_params`` are Cloud Speech API-specific parameters as dict (optional). For more information see <https://cloud.google.com/python/docs/reference/speech/latest/google.cloud.speech_v1.types.RecognitionConfig>
+    encoding: Required[RecognitionConfig.AudioEncoding]
+    sample_rate_hertz: Required[int]
+    language_code: Required[str]
+    speech_contexts: list[SpeechContext]
+    enable_word_time_offsets: bool
+    model: str
+    use_enhanced: bool
 
-        The ``use_enhanced`` is a boolean option. If use_enhanced is set to true and the model field is not set,
-        then an appropriate enhanced model is chosen if an enhanced model exists for the audio.
-        If use_enhanced is true and an enhanced version of the specified model does not exist,
-        then the speech is recognized using the standard version of the specified model.
 
-        Furthermore, if the option ``use_enhanced`` has not been set the option ``model`` can be used, which can be used to select the model best
-        suited to your domain to get best results. If a model is not explicitly specified,
-        then we auto-select a model based on the other parameters of this method.
+def _build_config(
+    audio_data: AudioData, recognizer_params: GoogleCloudRecognizerParameters
+) -> RecognitionConfig:
+    from google.cloud import speech
+
+    parameters: GoogleCloudSpeechV1Parameters = {
+        "encoding": speech.RecognitionConfig.AudioEncoding.FLAC,
+        "sample_rate_hertz": audio_data.sample_rate,
+        "language_code": recognizer_params.pop("language_code", "en-US"),
+    }
+    if preferred_phrases := recognizer_params.pop("preferred_phrases", None):
+        parameters["speech_contexts"] = [
+            speech.SpeechContext(phrases=preferred_phrases)
+        ]
+    if recognizer_params.pop("show_all", False):
+        # ref: https://cloud.google.com/speech-to-text/docs/async-time-offsets
+        parameters["enable_word_time_offsets"] = True
+    return speech.RecognitionConfig(**(parameters | recognizer_params))
+
+
+def recognize(
+    recognizer,
+    audio_data: AudioData,
+    credentials_json_path: str | None = None,
+    **kwargs: GoogleCloudRecognizerParameters,
+) -> str | RecognizeResponse:
+    """Performs speech recognition on ``audio_data`` (an ``AudioData`` instance), using the Google Cloud Speech-to-Text V1 API.
+
+    This function requires a Google Cloud Platform account; see `Set up Speech-to-Text <https://cloud.google.com/speech-to-text/docs/before-you-begin>`__ for details and instructions. Basically, create a project, enable billing for the project, and enable the Google Cloud Speech API for the project.
+    Then create local authentication credentials for your user account. The result is a JSON file containing the API credentials. You can specify the path to this JSON file with ``credentials_json_path``. If it is not specified, the library will try to automatically `find the default API credentials JSON file <https://developers.google.com/identity/protocols/application-default-credentials>`__.
 
     Returns the most likely transcription if ``show_all`` is False (the default). Otherwise, returns the raw API response (a ``RecognizeResponse`` object).
+    For other parameters, see :py:class:`GoogleCloudRecognizerParameters`.
 
     Raises a ``speech_recognition.UnknownValueError`` exception if the speech is unintelligible. Raises a ``speech_recognition.RequestError`` exception if the speech recognition operation failed, if the credentials aren't valid, or if there is no Internet connection.
     """
-    assert isinstance(
-        audio_data, AudioData
-    ), "``audio_data`` must be audio data"
-    assert isinstance(language, str), "``language`` must be a string"
-    assert preferred_phrases is None or all(
-        isinstance(preferred_phrases, (type(""), type("")))
-        for preferred_phrases in preferred_phrases
-    ), "``preferred_phrases`` must be a list of strings"
-
     try:
         from google.api_core.exceptions import GoogleAPICallError
         from google.cloud import speech
@@ -72,21 +119,7 @@ def recognize(
     )
     audio = speech.RecognitionAudio(content=flac_data)
 
-    config = {
-        "encoding": speech.RecognitionConfig.AudioEncoding.FLAC,
-        "sample_rate_hertz": audio_data.sample_rate,
-        "language_code": language,
-        **api_params,
-    }
-    if preferred_phrases is not None:
-        config["speechContexts"] = [
-            speech.SpeechContext(phrases=preferred_phrases)
-        ]
-    if show_all:
-        # ref: https://cloud.google.com/speech-to-text/docs/async-time-offsets
-        config["enable_word_time_offsets"] = True
-
-    config = speech.RecognitionConfig(**config)
+    config = _build_config(audio_data, kwargs.copy())
 
     try:
         response = client.recognize(config=config, audio=audio)
@@ -97,7 +130,7 @@ def recognize(
             "recognition connection failed: {0}".format(e.reason)
         )
 
-    if show_all:
+    if kwargs.get("show_all"):
         return response
     if len(response.results) == 0:
         raise UnknownValueError()

diff --git a/tests/recognizers/test_google_cloud.py b/tests/recognizers/test_google_cloud.py
@@ -4,6 +4,7 @@
     RecognitionAudio,
     RecognitionConfig,
     RecognizeResponse,
+    SpeechContext,
     SpeechRecognitionAlternative,
     SpeechRecognitionResult,
     WordInfo,
@@ -164,7 +165,8 @@ def test_transcribe_with_specified_api_parameters(SpeechClient):
     _ = recognize(
         MagicMock(spec=Recognizer),
         audio_data,
-        language="ja-JP",
+        language_code="ja-JP",
+        preferred_phrases=["numero", "hoge"],
         use_enhanced=True,
     )
 
@@ -173,6 +175,7 @@ def test_transcribe_with_specified_api_parameters(SpeechClient):
             encoding=RecognitionConfig.AudioEncoding.FLAC,
             sample_rate_hertz=16_000,
             language_code="ja-JP",
+            speech_contexts=[SpeechContext(phrases=["numero", "hoge"])],
             use_enhanced=True,
         ),
         audio=RecognitionAudio(content=b"flac_data"),