first commit

2025-04-04 13:23:15 -06:00
commit 216064f731
2103 changed files with 522593 additions and 0 deletions


@@ -0,0 +1,262 @@
from __future__ import annotations
import json
from typing import Dict, Literal, TypedDict
from urllib.error import HTTPError, URLError
from urllib.parse import urlencode
from urllib.request import Request, urlopen
from typing_extensions import NotRequired
from speech_recognition.audio import AudioData
from speech_recognition.exceptions import RequestError, UnknownValueError
class Alternative(TypedDict):
transcript: str
confidence: float
class Result(TypedDict):
alternative: list[Alternative]
final: bool
class GoogleResponse(TypedDict):
result: list[Result]
result_index: NotRequired[int]
ProfanityFilterLevel = Literal[0, 1]
RequestHeaders = Dict[str, str]
ENDPOINT = "http://www.google.com/speech-api/v2/recognize"
class RequestBuilder:
def __init__(
self,
*,
endpoint: str,
key: str,
language: str,
filter_level: ProfanityFilterLevel,
) -> None:
self.endpoint = endpoint
self.key = key
self.language = language
self.filter_level = filter_level
def build(self, audio_data: AudioData) -> Request:
if not isinstance(audio_data, AudioData):
raise ValueError("``audio_data`` must be an ``AudioData`` instance")
url = self.build_url()
headers = self.build_headers(audio_data)
flac_data = self.build_data(audio_data)
request = Request(url, data=flac_data, headers=headers)
return request
def build_url(self) -> str:
"""
>>> builder = RequestBuilder(endpoint="http://www.google.com/speech-api/v2/recognize", key="awesome-key", language="en-US", filter_level=0)
>>> builder.build_url()
'http://www.google.com/speech-api/v2/recognize?client=chromium&lang=en-US&key=awesome-key&pFilter=0'
"""
params = urlencode(
{
"client": "chromium",
"lang": self.language,
"key": self.key,
"pFilter": self.filter_level,
}
)
return f"{self.endpoint}?{params}"
def build_headers(self, audio_data: AudioData) -> RequestHeaders:
"""
>>> builder = RequestBuilder(endpoint="", key="", language="", filter_level=1)
>>> audio_data = AudioData(b"", 16_000, 1)
>>> builder.build_headers(audio_data)
{'Content-Type': 'audio/x-flac; rate=16000'}
"""
rate = audio_data.sample_rate
headers = {"Content-Type": f"audio/x-flac; rate={rate}"}
return headers
def build_data(self, audio_data: AudioData) -> bytes:
flac_data = audio_data.get_flac_data(
convert_rate=self.to_convert_rate(audio_data.sample_rate),
convert_width=2, # audio samples must be 16-bit
)
return flac_data
@staticmethod
def to_convert_rate(sample_rate: int) -> int | None:
"""Audio sample rate must be at least 8 kHz; returns ``None`` when no conversion is needed.
>>> RequestBuilder.to_convert_rate(16_000)
>>> RequestBuilder.to_convert_rate(8_000)
>>> RequestBuilder.to_convert_rate(7_999)
8000
"""
return None if sample_rate >= 8000 else 8000
def create_request_builder(
*,
endpoint: str,
key: str | None = None,
language: str = "en-US",
filter_level: ProfanityFilterLevel = 0,
) -> RequestBuilder:
if not isinstance(language, str):
raise ValueError("``language`` must be a string")
if key is not None and not isinstance(key, str):
raise ValueError("``key`` must be ``None`` or a string")
if key is None:
key = "AIzaSyBOti4mM-6x9WDnZIjIeyEU21OpBXqWBgw"
return RequestBuilder(
endpoint=endpoint,
key=key,
language=language,
filter_level=filter_level,
)
class OutputParser:
def __init__(self, *, show_all: bool, with_confidence: bool) -> None:
self.show_all = show_all
self.with_confidence = with_confidence
def parse(self, response_text: str):
actual_result = self.convert_to_result(response_text)
if self.show_all:
return actual_result
best_hypothesis = self.find_best_hypothesis(
actual_result["alternative"]
)
# https://cloud.google.com/speech-to-text/docs/basics#confidence-values
# "Your code should not require the confidence field as it is not guaranteed to be accurate, or even set, in any of the results."
confidence = best_hypothesis.get("confidence", 0.5)
if self.with_confidence:
return best_hypothesis["transcript"], confidence
return best_hypothesis["transcript"]
@staticmethod
def convert_to_result(response_text: str) -> Result:
r"""
>>> response_text = '''{"result":[]}
... {"result":[{"alternative":[{"transcript":"one two three","confidence":0.49585345},{"transcript":"1 2","confidence":0.42899391}],"final":true}],"result_index":0}
... '''
>>> OutputParser.convert_to_result(response_text)
{'alternative': [{'transcript': 'one two three', 'confidence': 0.49585345}, {'transcript': '1 2', 'confidence': 0.42899391}], 'final': True}
>>> OutputParser.convert_to_result("")
Traceback (most recent call last):
...
speech_recognition.exceptions.UnknownValueError
>>> OutputParser.convert_to_result('\n{"result":[]}')
Traceback (most recent call last):
...
speech_recognition.exceptions.UnknownValueError
>>> OutputParser.convert_to_result('{"result":[{"foo": "bar"}]}')
Traceback (most recent call last):
...
speech_recognition.exceptions.UnknownValueError
>>> OutputParser.convert_to_result('{"result":[{"alternative": []}]}')
Traceback (most recent call last):
...
speech_recognition.exceptions.UnknownValueError
"""
# ignore any blank blocks
for line in response_text.split("\n"):
if not line:
continue
result: list[Result] = json.loads(line)["result"]
if len(result) != 0:
if len(result[0].get("alternative", [])) == 0:
raise UnknownValueError()
return result[0]
raise UnknownValueError()
@staticmethod
def find_best_hypothesis(alternatives: list[Alternative]) -> Alternative:
"""
>>> alternatives = [{"transcript": "one two three", "confidence": 0.42899391}, {"transcript": "1 2", "confidence": 0.49585345}]
>>> OutputParser.find_best_hypothesis(alternatives)
{'transcript': 'one two three', 'confidence': 0.42899391}
>>> alternatives = [{"confidence": 0.49585345}]
>>> OutputParser.find_best_hypothesis(alternatives)
Traceback (most recent call last):
...
speech_recognition.exceptions.UnknownValueError
"""
if "confidence" in alternatives:
# BUG: actual_result["alternative"] (=alternatives) is list, not dict
# return alternative with highest confidence score
best_hypothesis: Alternative = max(
alternatives,
key=lambda alternative: alternative["confidence"],
)
else:
# when there is no confidence available, we arbitrarily choose the first hypothesis.
best_hypothesis: Alternative = alternatives[0]
if "transcript" not in best_hypothesis:
raise UnknownValueError()
return best_hypothesis
def obtain_transcription(request: Request, timeout: int) -> str:
try:
response = urlopen(request, timeout=timeout)
except HTTPError as e:
raise RequestError("recognition request failed: {}".format(e.reason))
except URLError as e:
raise RequestError(
"recognition connection failed: {}".format(e.reason)
)
return response.read().decode("utf-8")
def recognize_legacy(
recognizer,
audio_data: AudioData,
key: str | None = None,
language: str = "en-US",
pfilter: ProfanityFilterLevel = 0,
show_all: bool = False,
with_confidence: bool = False,
*,
endpoint: str = ENDPOINT,
):
"""Performs speech recognition on ``audio_data`` (an ``AudioData`` instance), using the Google Speech Recognition API.
The Google Speech Recognition API key is specified by ``key``. If not specified, it uses a generic key that works out of the box. This should generally be used for personal or testing purposes only, as it **may be revoked by Google at any time**.
To obtain your own API key, simply follow the steps on the `API Keys <http://www.chromium.org/developers/how-tos/api-keys>`__ page at the Chromium Developers site. In the Google Developers Console, Google Speech Recognition is listed as "Speech API".
The recognition language is determined by ``language``, an RFC5646 language tag like ``"en-US"`` (US English) or ``"fr-FR"`` (International French), defaulting to US English. A list of supported language tags can be found in this `StackOverflow answer <http://stackoverflow.com/a/14302134>`__.
The profanity filter level can be adjusted with ``pfilter``: 0 - No filter, 1 - Only shows the first character and replaces the rest with asterisks. The default is level 0.
Returns the most likely transcription if ``show_all`` is false (the default). Otherwise, returns the raw API response as a JSON dictionary.
Raises a ``speech_recognition.UnknownValueError`` exception if the speech is unintelligible. Raises a ``speech_recognition.RequestError`` exception if the speech recognition operation failed, if the key isn't valid, or if there is no internet connection.
"""
request_builder = create_request_builder(
endpoint=endpoint, key=key, language=language, filter_level=pfilter
)
request = request_builder.build(audio_data)
response_text = obtain_transcription(
request, timeout=recognizer.operation_timeout
)
output_parser = OutputParser(
show_all=show_all, with_confidence=with_confidence
)
return output_parser.parse(response_text)
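
A minimal usage sketch for the recognizer above. The audio file name is a placeholder and the import path is an assumption (the diff does not show file names); with no ``key`` argument the built-in default key is used:

import speech_recognition as sr
from speech_recognition.recognizers.google import recognize_legacy  # import path assumed

r = sr.Recognizer()
with sr.AudioFile("english.wav") as source:  # placeholder audio file
    audio = r.record(source)

# best hypothesis only; pass show_all=True to get the raw API response instead
print(recognize_legacy(r, audio, language="en-US"))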


@@ -0,0 +1,142 @@
from __future__ import annotations
from typing import TYPE_CHECKING, TypedDict
from urllib.error import URLError
from speech_recognition.audio import AudioData
from speech_recognition.exceptions import RequestError, UnknownValueError
if TYPE_CHECKING:
from google.cloud.speech import (
RecognitionConfig,
RecognizeResponse,
SpeechContext,
)
from typing_extensions import Required
class GoogleCloudRecognizerParameters(TypedDict, total=False):
"""Optional parameters.
The recognition language is determined by ``language_code``, which is a BCP-47 language tag like ``"en-US"`` (US English). Default: ``"en-US"``.
A list of supported language tags can be found in the `Speech-to-Text supported languages <https://cloud.google.com/speech/docs/languages>`__.
If ``preferred_phrases`` is an iterable of phrase strings, those given phrases will be more likely to be recognized over similar-sounding alternatives.
This is useful for things like keyword/command recognition or adding new phrases that aren't in Google's vocabulary.
Note that the API imposes certain `restrictions on the list of phrase strings <https://cloud.google.com/speech/limits#content>`__.
``show_all``: See :py:func:`recognize`.
``model``: You can select the model to get the best results. (See `RecognitionConfig's documentation <https://cloud.google.com/python/docs/reference/speech/latest/google.cloud.speech_v1.types.RecognitionConfig>`__ for details.)
``use_enhanced``: Set to true to use an enhanced model for speech recognition.
"""
# SpeechRecognition specific parameters
preferred_phrases: list[str]
show_all: bool
# Speech-to-Text V1 API's parameters
language_code: str
model: str
use_enhanced: bool
# TODO: Add support for other parameters
class GoogleCloudSpeechV1Parameters(TypedDict, total=False):
"""Speech-to-Text V1 API's parameters.
https://cloud.google.com/python/docs/reference/speech/latest/google.cloud.speech_v1.types.RecognitionConfig
"""
encoding: Required[RecognitionConfig.AudioEncoding]
sample_rate_hertz: Required[int]
language_code: Required[str]
speech_contexts: list[SpeechContext]
enable_word_time_offsets: bool
model: str
use_enhanced: bool
def _build_config(
audio_data: AudioData, recognizer_params: GoogleCloudRecognizerParameters
) -> RecognitionConfig:
from google.cloud import speech
parameters: GoogleCloudSpeechV1Parameters = {
"encoding": speech.RecognitionConfig.AudioEncoding.FLAC,
"sample_rate_hertz": audio_data.sample_rate,
"language_code": recognizer_params.pop("language_code", "en-US"),
}
if preferred_phrases := recognizer_params.pop("preferred_phrases", None):
parameters["speech_contexts"] = [
speech.SpeechContext(phrases=preferred_phrases)
]
if recognizer_params.pop("show_all", False):
# ref: https://cloud.google.com/speech-to-text/docs/async-time-offsets
parameters["enable_word_time_offsets"] = True
return speech.RecognitionConfig(**(parameters | recognizer_params))
def recognize(
recognizer,
audio_data: AudioData,
credentials_json_path: str | None = None,
**kwargs: GoogleCloudRecognizerParameters,
) -> str | RecognizeResponse:
"""Performs speech recognition on ``audio_data`` (an ``AudioData`` instance), using the Google Cloud Speech-to-Text V1 API.
This function requires a Google Cloud Platform account; see `Set up Speech-to-Text <https://cloud.google.com/speech-to-text/docs/before-you-begin>`__ for details and instructions. Basically: create a project, enable billing for the project, and enable the Google Cloud Speech API for the project.
Then create local authentication credentials for your user account. The result is a JSON file containing the API credentials. You can specify the JSON file with ``credentials_json_path``. If not specified, the library will try to automatically `find the default API credentials JSON file <https://developers.google.com/identity/protocols/application-default-credentials>`__.
Returns the most likely transcription if ``show_all`` is False (the default). Otherwise, returns the raw API response (a ``RecognizeResponse``).
For other parameters, see :py:class:`GoogleCloudRecognizerParameters`.
Raises a ``speech_recognition.UnknownValueError`` exception if the speech is unintelligible. Raises a ``speech_recognition.RequestError`` exception if the speech recognition operation failed, if the credentials aren't valid, or if there is no Internet connection.
"""
try:
from google.api_core.exceptions import GoogleAPICallError
from google.cloud import speech
except ImportError:
raise RequestError(
"missing google-cloud-speech module: ensure that google-cloud-speech is set up correctly."
)
client = (
speech.SpeechClient.from_service_account_json(credentials_json_path)
if credentials_json_path
else speech.SpeechClient()
)
flac_data = audio_data.get_flac_data(
# audio sample rate must be between 8 kHz and 48 kHz inclusive - clamp sample rate into this range
convert_rate=(
None
if 8000 <= audio_data.sample_rate <= 48000
else max(8000, min(audio_data.sample_rate, 48000))
),
convert_width=2, # audio samples must be 16-bit
)
audio = speech.RecognitionAudio(content=flac_data)
config = _build_config(audio_data, kwargs.copy())
try:
response = client.recognize(config=config, audio=audio)
except GoogleAPICallError as e:
raise RequestError(e)
except URLError as e:
raise RequestError(
"recognition connection failed: {0}".format(e.reason)
)
if kwargs.get("show_all"):
return response
if len(response.results) == 0:
raise UnknownValueError()
transcript = " ".join(
result.alternatives[0].transcript.strip()
for result in response.results
)
return transcript
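
A hedged usage sketch for the Cloud recognizer above; the credentials path, audio file, phrase list, and import path are placeholders/assumptions (the diff omits file names):

import speech_recognition as sr
from speech_recognition.recognizers import google_cloud  # import path assumed

r = sr.Recognizer()
with sr.AudioFile("meeting.wav") as source:  # placeholder audio file
    audio = r.record(source)

text = google_cloud.recognize(
    r,
    audio,
    credentials_json_path="service-account.json",  # placeholder path
    preferred_phrases=["SpeechRecognition"],
    language_code="en-US",
)
print(text)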


@@ -0,0 +1,111 @@
from __future__ import annotations
import os
from collections.abc import Sequence
from speech_recognition import PortableNamedTemporaryFile
from speech_recognition.audio import AudioData
from speech_recognition.exceptions import RequestError, UnknownValueError
AcousticParametersDirectoryPath = str
LanguageModelFilePath = str
PhonemeDictionaryFilePath = str
SphinxDataFilePaths = tuple[AcousticParametersDirectoryPath, LanguageModelFilePath, PhonemeDictionaryFilePath]
Keyword = str
Sensitivity = float
KeywordEntry = tuple[Keyword, Sensitivity]
def recognize(
recognizer,
audio_data: AudioData,
language: str | SphinxDataFilePaths = "en-US",
keyword_entries: Sequence[KeywordEntry] | None = None,
grammar: str | None = None,
show_all: bool = False,
):
"""
Performs speech recognition on ``audio_data`` (an ``AudioData`` instance), using CMU Sphinx.
The recognition language is determined by ``language``, an RFC5646 language tag like ``"en-US"`` or ``"en-GB"``, defaulting to US English. Out of the box, only ``en-US`` is supported. See `Notes on using PocketSphinx <https://github.com/Uberi/speech_recognition/blob/master/reference/pocketsphinx.rst>`__ for information about installing other languages. This document is also included under ``reference/pocketsphinx.rst``. The ``language`` parameter can also be a tuple of filesystem paths, of the form ``(acoustic_parameters_directory, language_model_file, phoneme_dictionary_file)`` - this allows you to load arbitrary Sphinx models.
If specified, the keywords to search for are determined by ``keyword_entries``, an iterable of tuples of the form ``(keyword, sensitivity)``, where ``keyword`` is a phrase, and ``sensitivity`` is how sensitive to this phrase the recognizer should be, on a scale of 0 (very insensitive, more false negatives) to 1 (very sensitive, more false positives) inclusive. If not specified or ``None``, no keywords are used and Sphinx will simply transcribe whatever words it recognizes. Specifying ``keyword_entries`` is more accurate than just looking for those same keywords in non-keyword-based transcriptions, because Sphinx knows specifically what sounds to look for.
Sphinx can also handle FSG or JSGF grammars. The parameter ``grammar`` expects a path to the grammar file. Note that if a JSGF grammar is passed, an FSG grammar will be created at the same location to speed up execution in the next run. If ``keyword_entries`` are passed, content of ``grammar`` will be ignored.
Returns the most likely transcription if ``show_all`` is false (the default). Otherwise, returns the Sphinx ``pocketsphinx.pocketsphinx.Decoder`` object resulting from the recognition.
Raises a ``speech_recognition.UnknownValueError`` exception if the speech is unintelligible. Raises a ``speech_recognition.RequestError`` exception if there are any issues with the Sphinx installation.
"""
# TODO Move this validation into KeywordEntry initialization
assert keyword_entries is None or all(isinstance(keyword, str) and 0 <= sensitivity <= 1 for keyword, sensitivity in keyword_entries), "``keyword_entries`` must be ``None`` or a list of pairs of strings and numbers between 0 and 1"
try:
from pocketsphinx import FsgModel, Jsgf, pocketsphinx
except ImportError:
raise RequestError("missing PocketSphinx module: ensure that PocketSphinx is set up correctly.")
if isinstance(language, str): # directory containing language data
language_directory = os.path.join(os.path.dirname(os.path.dirname(os.path.realpath(__file__))), "pocketsphinx-data", language)
if not os.path.isdir(language_directory):
raise RequestError("missing PocketSphinx language data directory: \"{}\"".format(language_directory))
acoustic_parameters_directory = os.path.join(language_directory, "acoustic-model")
language_model_file = os.path.join(language_directory, "language-model.lm.bin")
phoneme_dictionary_file = os.path.join(language_directory, "pronounciation-dictionary.dict")
else: # 3-tuple of Sphinx data file paths
acoustic_parameters_directory, language_model_file, phoneme_dictionary_file = language
if not os.path.isdir(acoustic_parameters_directory):
raise RequestError("missing PocketSphinx language model parameters directory: \"{}\"".format(acoustic_parameters_directory))
if not os.path.isfile(language_model_file):
raise RequestError("missing PocketSphinx language model file: \"{}\"".format(language_model_file))
if not os.path.isfile(phoneme_dictionary_file):
raise RequestError("missing PocketSphinx phoneme dictionary file: \"{}\"".format(phoneme_dictionary_file))
# create decoder object
config = pocketsphinx.Config()
config.set_string("-hmm", acoustic_parameters_directory) # set the path of the hidden Markov model (HMM) parameter files
config.set_string("-lm", language_model_file)
config.set_string("-dict", phoneme_dictionary_file)
config.set_string("-logfn", os.devnull) # disable logging (logging causes unwanted output in terminal)
decoder = pocketsphinx.Decoder(config)
# obtain audio data
raw_data = audio_data.get_raw_data(convert_rate=16000, convert_width=2) # the included language models require audio to be 16-bit mono 16 kHz in little-endian format
# obtain recognition results
if keyword_entries is not None: # explicitly specified set of keywords
with PortableNamedTemporaryFile("w") as f:
# generate a keywords file - Sphinx documentation recommends sensitivities between 1e-50 and 1e-5
f.writelines("{} /1e{}/\n".format(keyword, 100 * sensitivity - 110) for keyword, sensitivity in keyword_entries)
f.flush()
# perform the speech recognition with the keywords file (this is inside the context manager so the file isn't deleted until we're done)
decoder.add_kws("keywords", f.name)
decoder.activate_search("keywords")
elif grammar is not None: # a path to an FSG or JSGF grammar
if not os.path.exists(grammar):
raise ValueError("Grammar '{0}' does not exist.".format(grammar))
grammar_path = os.path.abspath(os.path.dirname(grammar))
grammar_name = os.path.splitext(os.path.basename(grammar))[0]
fsg_path = "{0}/{1}.fsg".format(grammar_path, grammar_name)
if not os.path.exists(fsg_path): # create FSG grammar if not available
jsgf = Jsgf(grammar)
rule = jsgf.get_rule("{0}.{0}".format(grammar_name))
fsg = jsgf.build_fsg(rule, decoder.get_logmath(), 7.5)
fsg.writefile(fsg_path)
else:
fsg = FsgModel(fsg_path, decoder.get_logmath(), 7.5)
decoder.set_fsg(grammar_name, fsg)
decoder.set_search(grammar_name)
decoder.start_utt() # begin utterance processing
decoder.process_raw(raw_data, False, True) # process audio data with recognition enabled (no_search = False), as a full utterance (full_utt = True)
decoder.end_utt() # stop utterance processing
if show_all: return decoder
# return results
hypothesis = decoder.hyp()
if hypothesis is not None: return hypothesis.hypstr
raise UnknownValueError() # no transcriptions available
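
A short offline usage sketch for the Sphinx recognizer above, using the keyword-spotting mode described in the docstring (audio file, keywords, sensitivities, and the import path are placeholders/assumptions):

import speech_recognition as sr
from speech_recognition.recognizers import pocketsphinx as sphinx  # import path assumed

r = sr.Recognizer()
with sr.AudioFile("command.wav") as source:  # placeholder audio file
    audio = r.record(source)

# spot only these phrases instead of producing a full transcription
print(sphinx.recognize(r, audio, keyword_entries=[("hello", 0.8), ("stop", 0.5)]))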


@@ -0,0 +1,22 @@
from io import BytesIO
from speech_recognition.audio import AudioData
class OpenAICompatibleRecognizer:
def __init__(self, client) -> None:
self.client = client
def recognize(self, audio_data: "AudioData", model: str, **kwargs) -> str:
if not isinstance(audio_data, AudioData):
raise ValueError(
"``audio_data`` must be an ``AudioData`` instance"
)
wav_data = BytesIO(audio_data.get_wav_data())
wav_data.name = "SpeechRecognition_audio.wav"
transcript = self.client.audio.transcriptions.create(
file=wav_data, model=model, **kwargs
)
return transcript.text
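
The adapter above only needs a client exposing ``client.audio.transcriptions.create(file=..., model=..., **kwargs)`` that returns an object with a ``text`` attribute. A sketch using the OpenAI client, as the OpenAI and Groq modules later in this commit do (``OPENAI_API_KEY`` must be set; the audio file name is a placeholder):

import openai
import speech_recognition as sr
from speech_recognition.recognizers.whisper_api.base import OpenAICompatibleRecognizer

r = sr.Recognizer()
with sr.AudioFile("note.wav") as source:  # placeholder audio file
    audio_data = r.record(source)

recognizer = OpenAICompatibleRecognizer(openai.OpenAI())  # client reads OPENAI_API_KEY
print(recognizer.recognize(audio_data, "whisper-1", language="en"))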


@@ -0,0 +1,54 @@
from __future__ import annotations
from typing import Literal, TypedDict
from typing_extensions import Unpack
from speech_recognition.audio import AudioData
from speech_recognition.exceptions import SetupError
from speech_recognition.recognizers.whisper_api.base import (
OpenAICompatibleRecognizer,
)
# https://console.groq.com/docs/speech-text#supported-models
GroqModel = Literal[
"whisper-large-v3-turbo", "whisper-large-v3", "distil-whisper-large-v3-en"
]
class GroqOptionalParameters(TypedDict):
"""Groq speech transcription's optional parameters.
https://console.groq.com/docs/speech-text#transcription-endpoint-usage
"""
prompt: str
response_format: str
temperature: float
language: str
def recognize(
recognizer,
audio_data: "AudioData",
*,
model: GroqModel = "whisper-large-v3-turbo",
**kwargs: Unpack[GroqOptionalParameters],
) -> str:
"""Performs speech recognition on ``audio_data`` (an ``AudioData`` instance), using the Groq Whisper API.
This function requires a Groq account; visit https://console.groq.com/login, then generate an API key in the `API Keys <https://console.groq.com/keys>`__ menu.
Details: https://console.groq.com/docs/speech-text
Set the environment variable ``GROQ_API_KEY``; otherwise the groq library will raise a ``groq.GroqError``.
"""
try:
import groq
except ImportError:
raise SetupError(
"missing groq module: ensure that groq is set up correctly."
)
groq_recognizer = OpenAICompatibleRecognizer(groq.Groq())
return groq_recognizer.recognize(audio_data, model, **kwargs)
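
Usage sketch for the Groq recognizer above (``GROQ_API_KEY`` must be set; the audio file name is a placeholder and the import path is assumed). The first argument is unused by this function, so ``None`` is passed:

import speech_recognition as sr
from speech_recognition.recognizers.whisper_api import groq as groq_recognizer  # import path assumed

r = sr.Recognizer()
with sr.AudioFile("podcast.wav") as source:  # placeholder audio file
    audio = r.record(source)

print(groq_recognizer.recognize(None, audio, model="distil-whisper-large-v3-en"))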


@@ -0,0 +1,83 @@
from __future__ import annotations
from typing import Literal, TypedDict
from typing_extensions import Unpack
from speech_recognition.audio import AudioData
from speech_recognition.exceptions import SetupError
from speech_recognition.recognizers.whisper_api.base import (
OpenAICompatibleRecognizer,
)
# https://platform.openai.com/docs/api-reference/audio/createTranscription#audio-createtranscription-model
WhisperModel = Literal[
"whisper-1", "gpt-4o-transcribe", "gpt-4o-mini-transcribe"
]
class OpenAIOptionalParameters(TypedDict, total=False):
"""OpenAI speech transcription's optional parameters.
https://platform.openai.com/docs/api-reference/audio/createTranscription
"""
language: str
prompt: str
# TODO Add support `Literal["text", "srt", "verbose_json", "vtt"]`
response_format: Literal["json"]
temperature: float
# timestamp_granularities # TODO support
def recognize(
recognizer,
audio_data: "AudioData",
*,
model: WhisperModel = "whisper-1",
**kwargs: Unpack[OpenAIOptionalParameters],
) -> str:
"""Performs speech recognition on ``audio_data`` (an ``AudioData`` instance), using the OpenAI Whisper API.
This function requires an OpenAI account; visit https://platform.openai.com/signup, then generate an API key in `User settings <https://platform.openai.com/account/api-keys>`__.
Details: https://platform.openai.com/docs/guides/speech-to-text
Set the environment variable ``OPENAI_API_KEY``; otherwise the openai library will raise an ``openai.OpenAIError``.
"""
try:
import openai
except ImportError:
raise SetupError(
"missing openai module: ensure that openai is set up correctly."
)
openai_recognizer = OpenAICompatibleRecognizer(openai.OpenAI())
return openai_recognizer.recognize(audio_data, model, **kwargs)
if __name__ == "__main__":
import argparse
from typing import get_args
import speech_recognition as sr
parser = argparse.ArgumentParser()
parser.add_argument("audio_file")
parser.add_argument(
"--model", choices=get_args(WhisperModel), default="whisper-1"
)
parser.add_argument("-l", "--language")
args = parser.parse_args()
r = sr.Recognizer()
with sr.AudioFile(args.audio_file) as source:
audio_data = r.listen(source)
if args.language:
transcription = recognize(
None, audio_data, model=args.model, language=args.language
)
else:
transcription = recognize(None, audio_data, model=args.model)
print(transcription)


@@ -0,0 +1,45 @@
from __future__ import annotations
import io
from typing import TYPE_CHECKING, Any, Protocol
from speech_recognition.audio import AudioData
if TYPE_CHECKING:
import numpy as np
class Transcribable(Protocol):
def transcribe(
self, audio_array: np.ndarray, **kwargs
) -> str | dict[str, Any]:
pass
class WhisperCompatibleRecognizer:
def __init__(self, model: Transcribable) -> None:
self.model = model
def recognize(
self, audio_data: AudioData, show_dict: bool = False, **kwargs
):
if not isinstance(audio_data, AudioData):
raise ValueError(
"``audio_data`` must be an ``AudioData`` instance"
)
import numpy as np
import soundfile as sf
# 16 kHz https://github.com/openai/whisper/blob/28769fcfe50755a817ab922a7bc83483159600a9/whisper/audio.py#L98-L99
wav_bytes = audio_data.get_wav_data(convert_rate=16000)
wav_stream = io.BytesIO(wav_bytes)
audio_array, sampling_rate = sf.read(wav_stream)
audio_array = audio_array.astype(np.float32)
result = self.model.transcribe(audio_array, **kwargs)
if show_dict:
return result
else:
return result["text"]
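
Any object satisfying the ``Transcribable`` protocol can be plugged into the recognizer above. A minimal sketch assuming the openai-whisper package is installed (its ``Whisper.transcribe`` accepts a NumPy array and returns a dict with a ``"text"`` key, which matches the protocol); the base-module import path is taken from the imports later in this commit, and the audio file name is a placeholder:

import whisper  # openai-whisper package, assumed installed
import speech_recognition as sr
from speech_recognition.recognizers.whisper_local.base import WhisperCompatibleRecognizer

r = sr.Recognizer()
with sr.AudioFile("memo.wav") as source:  # placeholder audio file
    audio = r.record(source)

recognizer = WhisperCompatibleRecognizer(whisper.load_model("base"))
print(recognizer.recognize(audio))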


@@ -0,0 +1,106 @@
from __future__ import annotations
from typing import TYPE_CHECKING, Literal, TypedDict
from speech_recognition.audio import AudioData
from speech_recognition.recognizers.whisper_local.base import (
WhisperCompatibleRecognizer,
)
if TYPE_CHECKING:
import numpy as np
from faster_whisper import WhisperModel
from faster_whisper.transcribe import Segment
from typing_extensions import Unpack
class TranscribeOutput(TypedDict):
text: str
segments: list[Segment]
language: str
class TranscribableAdapter:
def __init__(self, model: WhisperModel) -> None:
self.model = model
def transcribe(
self, audio_array: np.ndarray, **kwargs
) -> TranscribeOutput:
segments_generator, info = self.model.transcribe(audio_array, **kwargs)
segments = list(segments_generator)
return {
"text": " ".join(segment.text for segment in segments),
"segments": segments,
"language": info.language,
}
class InitOptionalParameters(TypedDict, total=False):
# https://github.com/SYSTRAN/faster-whisper/blob/v1.1.0/faster_whisper/transcribe.py#L575
device: Literal["cpu", "cuda", "auto"]
compute_type: str
download_root: str
# TODO Add others
class TranscribeOptionalParameters(TypedDict, total=False):
# https://github.com/SYSTRAN/faster-whisper/blob/v1.1.0/faster_whisper/transcribe.py#L692
language: str
task: Literal["transcribe", "translate"]
beam_size: int
# TODO Add others
def recognize(
recognizer,
audio_data: AudioData,
model: str = "base",
show_dict: bool = False,
init_options: InitOptionalParameters | None = None,
**transcribe_options: Unpack[TranscribeOptionalParameters],
) -> str | TranscribeOutput:
"""Performs speech recognition on ``audio_data`` (an ``AudioData`` instance), using Whisper.
Pick the ``model`` size (same sizes as Whisper).
If ``show_dict`` is true, returns the detailed response from Whisper, including the detected language. Otherwise returns only the transcription.
You can specify:
* ``language``: recognition language, a lowercase two-letter language code like "en" or "fr".
* If not set, Faster Whisper will automatically detect the language.
* ``task``
* If you want transcribe + **translate** to english, set ``task="translate"``.
Other values are passed directly to Faster Whisper. See https://github.com/SYSTRAN/faster-whisper/blob/master/faster_whisper/transcribe.py for all options.
"""
from faster_whisper import WhisperModel
model = WhisperModel(model, **init_options or {})
whisper_recognizer = WhisperCompatibleRecognizer(
TranscribableAdapter(model)
)
return whisper_recognizer.recognize(
audio_data, show_dict=show_dict, **transcribe_options
)
if __name__ == "__main__":
import argparse
import speech_recognition as sr
parser = argparse.ArgumentParser()
parser.add_argument("audio_file")
args = parser.parse_args()
r = sr.Recognizer()
with sr.AudioFile(args.audio_file) as source:
audio_data = r.listen(source)
transcription = recognize(None, audio_data)
print(transcription)
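
A sketch of the translate path described in the docstring above; model size, device, compute type, audio file, and the import path are placeholders/assumptions:

import speech_recognition as sr
from speech_recognition.recognizers.whisper_local import faster_whisper as fw  # import path assumed

r = sr.Recognizer()
with sr.AudioFile("french.wav") as source:  # placeholder audio file
    audio = r.record(source)

# transcribe French audio and translate it to English, forcing CPU int8 inference
print(fw.recognize(
    None,
    audio,
    model="small",
    init_options={"device": "cpu", "compute_type": "int8"},
    language="fr",
    task="translate",
))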


@@ -0,0 +1,108 @@
from __future__ import annotations
from typing import TYPE_CHECKING, Literal, TypedDict
from speech_recognition.audio import AudioData
from speech_recognition.recognizers.whisper_local.base import (
WhisperCompatibleRecognizer,
)
if TYPE_CHECKING:
import numpy as np
import torch
from typing_extensions import Unpack
from whisper import Whisper
class LoadModelOptionalParameters(TypedDict, total=False):
# ref: https://github.com/openai/whisper/blob/v20240930/whisper/__init__.py#L103
device: str | torch.device
download_root: str
in_memory: bool
class TranscribeOptionalParameters(TypedDict, total=False):
"""Transcribe optional parameters & DecodingOptions parameters."""
# ref: https://github.com/openai/whisper/blob/v20240930/whisper/transcribe.py#L38
temperature: float | tuple[float, ...]
# TODO Add others
# ref: https://github.com/openai/whisper/blob/v20240930/whisper/decoding.py#L81
task: Literal["transcribe", "translate"]
language: str
fp16: bool
# TODO Add others
class Segment(TypedDict):
id: int
seek: int
start: float
end: float
text: str
tokens: list[int]
temperature: float
avg_logprob: float
compression_ratio: float
no_speech_prob: float
class TranscribeOutput(TypedDict):
text: str
segments: list[Segment]
language: str
class TranscribableAdapter:
def __init__(self, model: Whisper) -> None:
self.model = model
def transcribe(
self, audio_array: np.ndarray, **kwargs
) -> TranscribeOutput:
if "fp16" not in kwargs:
import torch
kwargs["fp16"] = torch.cuda.is_available()
return self.model.transcribe(audio_array, **kwargs)
def recognize(
recognizer,
audio_data: AudioData,
model: str = "base",
show_dict: bool = False,
load_options: LoadModelOptionalParameters | None = None,
**transcribe_options: Unpack[TranscribeOptionalParameters],
) -> str | TranscribeOutput:
"""Performs speech recognition on ``audio_data`` (an ``AudioData`` instance), using Whisper.
Pick ``model`` from output of :command:`python -c 'import whisper; print(whisper.available_models())'`.
See also https://github.com/openai/whisper?tab=readme-ov-file#available-models-and-languages.
If ``show_dict`` is true, returns the full dict response from Whisper, including the detected language. Otherwise returns only the transcription.
You can specify:
* ``language``: recognition language, an uncapitalized full language name like "english" or "chinese". See the full language list at https://github.com/openai/whisper/blob/main/whisper/tokenizer.py
* If not set, Whisper will automatically detect the language.
* ``task``
* If you want transcribe + **translate** to english, set ``task="translate"``.
Other values are passed directly to whisper. See https://github.com/openai/whisper/blob/main/whisper/transcribe.py for all options.
"""
import whisper
whisper_model = whisper.load_model(model, **load_options or {})
whisper_recognizer = WhisperCompatibleRecognizer(
TranscribableAdapter(whisper_model)
)
return whisper_recognizer.recognize(
audio_data, show_dict=show_dict, **transcribe_options
)
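
Usage sketch for the Whisper recognizer above, using ``show_dict`` to get the detected language alongside the text (the audio file name is a placeholder and the import path is assumed):

import speech_recognition as sr
from speech_recognition.recognizers.whisper_local import whisper as whisper_local  # import path assumed

r = sr.Recognizer()
with sr.AudioFile("interview.wav") as source:  # placeholder audio file
    audio = r.record(source)

result = whisper_local.recognize(None, audio, model="base", show_dict=True)
print(result["language"], result["text"])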