first commit
1342
venv/lib/python3.11/site-packages/speech_recognition/__init__.py
Normal file
File diff suppressed because it is too large
@@ -0,0 +1,24 @@
import speech_recognition as sr

r = sr.Recognizer()
m = sr.Microphone()

try:
    print("A moment of silence, please...")
    with m as source: r.adjust_for_ambient_noise(source)
    print("Set minimum energy threshold to {}".format(r.energy_threshold))
    while True:
        print("Say something!")
        with m as source: audio = r.listen(source)
        print("Got it! Now to recognize it...")
        try:
            # recognize speech using Google Speech Recognition
            value = r.recognize_google(audio)

            print("You said {}".format(value))
        except sr.UnknownValueError:
            print("Oops! Didn't catch that")
        except sr.RequestError as e:
            print("Uh oh! Couldn't request results from Google Speech Recognition service; {0}".format(e))
except KeyboardInterrupt:
    pass
318
venv/lib/python3.11/site-packages/speech_recognition/audio.py
Normal file
@@ -0,0 +1,318 @@
import aifc
import audioop
import io
import os
import platform
import stat
import subprocess
import sys
import wave


class AudioData(object):
    """
    Creates a new ``AudioData`` instance, which represents mono audio data.

    The raw audio data is specified by ``frame_data``, which is a sequence of bytes representing audio samples. This is the frame data structure used by the PCM WAV format.

    The width of each sample, in bytes, is specified by ``sample_width``. Each group of ``sample_width`` bytes represents a single audio sample.

    The audio data is assumed to have a sample rate of ``sample_rate`` samples per second (Hertz).

    Usually, instances of this class are obtained from ``recognizer_instance.record`` or ``recognizer_instance.listen``, or in the callback for ``recognizer_instance.listen_in_background``, rather than instantiating them directly.
    """

    def __init__(self, frame_data, sample_rate, sample_width):
        assert sample_rate > 0, "Sample rate must be a positive integer"
        assert (
            sample_width % 1 == 0 and 1 <= sample_width <= 4
        ), "Sample width must be between 1 and 4 inclusive"
        self.frame_data = frame_data
        self.sample_rate = sample_rate
        self.sample_width = int(sample_width)

    def get_segment(self, start_ms=None, end_ms=None):
        """
        Returns a new ``AudioData`` instance, trimmed to a given time interval. In other words, an ``AudioData`` instance with the same audio data except starting at ``start_ms`` milliseconds in and ending ``end_ms`` milliseconds in.

        If not specified, ``start_ms`` defaults to the beginning of the audio, and ``end_ms`` defaults to the end.
        """
        assert (
            start_ms is None or start_ms >= 0
        ), "``start_ms`` must be a non-negative number"
        assert end_ms is None or end_ms >= (
            0 if start_ms is None else start_ms
        ), "``end_ms`` must be a non-negative number greater or equal to ``start_ms``"
        if start_ms is None:
            start_byte = 0
        else:
            start_byte = int(
                (start_ms * self.sample_rate * self.sample_width) // 1000
            )
        if end_ms is None:
            end_byte = len(self.frame_data)
        else:
            end_byte = int(
                (end_ms * self.sample_rate * self.sample_width) // 1000
            )
        return AudioData(
            self.frame_data[start_byte:end_byte],
            self.sample_rate,
            self.sample_width,
        )

    def get_raw_data(self, convert_rate=None, convert_width=None):
        """
        Returns a byte string representing the raw frame data for the audio represented by the ``AudioData`` instance.

        If ``convert_rate`` is specified and the audio sample rate is not ``convert_rate`` Hz, the resulting audio is resampled to match.

        If ``convert_width`` is specified and the audio samples are not ``convert_width`` bytes each, the resulting audio is converted to match.

        Writing these bytes directly to a file results in a valid `RAW/PCM audio file <https://en.wikipedia.org/wiki/Raw_audio_format>`__.
        """
        assert (
            convert_rate is None or convert_rate > 0
        ), "Sample rate to convert to must be a positive integer"
        assert convert_width is None or (
            convert_width % 1 == 0 and 1 <= convert_width <= 4
        ), "Sample width to convert to must be between 1 and 4 inclusive"

        raw_data = self.frame_data

        # make sure unsigned 8-bit audio (which uses unsigned samples) is handled like higher sample width audio (which uses signed samples)
        if self.sample_width == 1:
            raw_data = audioop.bias(
                raw_data, 1, -128
            )  # subtract 128 from every sample to make them act like signed samples

        # resample audio at the desired rate if specified
        if convert_rate is not None and self.sample_rate != convert_rate:
            raw_data, _ = audioop.ratecv(
                raw_data,
                self.sample_width,
                1,
                self.sample_rate,
                convert_rate,
                None,
            )

        # convert samples to desired sample width if specified
        if convert_width is not None and self.sample_width != convert_width:
            if (
                convert_width == 3
            ):  # we're converting the audio into 24-bit (workaround for https://bugs.python.org/issue12866)
                raw_data = audioop.lin2lin(
                    raw_data, self.sample_width, 4
                )  # convert audio into 32-bit first, which is always supported
                try:
                    audioop.bias(
                        b"", 3, 0
                    )  # test whether 24-bit audio is supported (for example, ``audioop`` in Python 3.3 and below doesn't support sample width 3, while Python 3.4+ does)
                except (
                    audioop.error
                ):  # this version of audioop doesn't support 24-bit audio (probably Python 3.3 or less)
                    raw_data = b"".join(
                        raw_data[i + 1: i + 4]
                        for i in range(0, len(raw_data), 4)
                    )  # since we're in little endian, we discard the first byte from each 32-bit sample to get a 24-bit sample
                else:  # 24-bit audio fully supported, we don't need to shim anything
                    raw_data = audioop.lin2lin(
                        raw_data, self.sample_width, convert_width
                    )
            else:
                raw_data = audioop.lin2lin(
                    raw_data, self.sample_width, convert_width
                )

        # if the output is 8-bit audio with unsigned samples, convert the samples we've been treating as signed to unsigned again
        if convert_width == 1:
            raw_data = audioop.bias(
                raw_data, 1, 128
            )  # add 128 to every sample to make them act like unsigned samples again

        return raw_data

    def get_wav_data(self, convert_rate=None, convert_width=None):
        """
        Returns a byte string representing the contents of a WAV file containing the audio represented by the ``AudioData`` instance.

        If ``convert_width`` is specified and the audio samples are not ``convert_width`` bytes each, the resulting audio is converted to match.

        If ``convert_rate`` is specified and the audio sample rate is not ``convert_rate`` Hz, the resulting audio is resampled to match.

        Writing these bytes directly to a file results in a valid `WAV file <https://en.wikipedia.org/wiki/WAV>`__.
        """
        raw_data = self.get_raw_data(convert_rate, convert_width)
        sample_rate = (
            self.sample_rate if convert_rate is None else convert_rate
        )
        sample_width = (
            self.sample_width if convert_width is None else convert_width
        )

        # generate the WAV file contents
        with io.BytesIO() as wav_file:
            wav_writer = wave.open(wav_file, "wb")
            try:  # note that we can't use context manager, since that was only added in Python 3.4
                wav_writer.setframerate(sample_rate)
                wav_writer.setsampwidth(sample_width)
                wav_writer.setnchannels(1)
                wav_writer.writeframes(raw_data)
                wav_data = wav_file.getvalue()
            finally:  # make sure resources are cleaned up
                wav_writer.close()
        return wav_data

    def get_aiff_data(self, convert_rate=None, convert_width=None):
        """
        Returns a byte string representing the contents of an AIFF-C file containing the audio represented by the ``AudioData`` instance.

        If ``convert_width`` is specified and the audio samples are not ``convert_width`` bytes each, the resulting audio is converted to match.

        If ``convert_rate`` is specified and the audio sample rate is not ``convert_rate`` Hz, the resulting audio is resampled to match.

        Writing these bytes directly to a file results in a valid `AIFF-C file <https://en.wikipedia.org/wiki/Audio_Interchange_File_Format>`__.
        """
        raw_data = self.get_raw_data(convert_rate, convert_width)
        sample_rate = (
            self.sample_rate if convert_rate is None else convert_rate
        )
        sample_width = (
            self.sample_width if convert_width is None else convert_width
        )

        # the AIFF format is big-endian, so we need to convert the little-endian raw data to big-endian
        if hasattr(
            audioop, "byteswap"
        ):  # ``audioop.byteswap`` was only added in Python 3.4
            raw_data = audioop.byteswap(raw_data, sample_width)
        else:  # manually reverse the bytes of each sample, which is slower but works well enough as a fallback
            raw_data = raw_data[sample_width - 1:: -1] + b"".join(
                raw_data[i + sample_width: i: -1]
                for i in range(sample_width - 1, len(raw_data), sample_width)
            )

        # generate the AIFF-C file contents
        with io.BytesIO() as aiff_file:
            aiff_writer = aifc.open(aiff_file, "wb")
            try:  # note that we can't use context manager, since that was only added in Python 3.4
                aiff_writer.setframerate(sample_rate)
                aiff_writer.setsampwidth(sample_width)
                aiff_writer.setnchannels(1)
                aiff_writer.writeframes(raw_data)
                aiff_data = aiff_file.getvalue()
            finally:  # make sure resources are cleaned up
                aiff_writer.close()
        return aiff_data

    def get_flac_data(self, convert_rate=None, convert_width=None):
        """
        Returns a byte string representing the contents of a FLAC file containing the audio represented by the ``AudioData`` instance.

        Note that 32-bit FLAC is not supported. If the audio data is 32-bit and ``convert_width`` is not specified, then the resulting FLAC will be a 24-bit FLAC.

        If ``convert_rate`` is specified and the audio sample rate is not ``convert_rate`` Hz, the resulting audio is resampled to match.

        If ``convert_width`` is specified and the audio samples are not ``convert_width`` bytes each, the resulting audio is converted to match.

        Writing these bytes directly to a file results in a valid `FLAC file <https://en.wikipedia.org/wiki/FLAC>`__.
        """
        assert convert_width is None or (
            convert_width % 1 == 0 and 1 <= convert_width <= 3
        ), "Sample width to convert to must be between 1 and 3 inclusive"

        if (
            self.sample_width > 3 and convert_width is None
        ):  # resulting WAV data would be 32-bit, which is not convertible to FLAC using our encoder
            convert_width = 3  # the largest supported sample width is 24-bit, so we'll limit the sample width to that

        # run the FLAC converter with the WAV data to get the FLAC data
        wav_data = self.get_wav_data(convert_rate, convert_width)
        flac_converter = get_flac_converter()
        if (
            os.name == "nt"
        ):  # on Windows, specify that the process is to be started without showing a console window
            startup_info = subprocess.STARTUPINFO()
            startup_info.dwFlags |= (
                subprocess.STARTF_USESHOWWINDOW
            )  # specify that the wShowWindow field of `startup_info` contains a value
            startup_info.wShowWindow = (
                subprocess.SW_HIDE
            )  # specify that the console window should be hidden
        else:
            startup_info = None  # default startupinfo
        process = subprocess.Popen(
            [
                flac_converter,
                "--stdout",
                "--totally-silent",  # put the resulting FLAC file in stdout, and make sure it's not mixed with any program output
                "--best",  # highest level of compression available
                "-",  # the input FLAC file contents will be given in stdin
            ],
            stdin=subprocess.PIPE,
            stdout=subprocess.PIPE,
            startupinfo=startup_info,
        )
        flac_data, stderr = process.communicate(wav_data)
        return flac_data


def get_flac_converter():
    """Returns the absolute path of a FLAC converter executable, or raises an OSError if none can be found."""
    flac_converter = shutil_which("flac")  # check for installed version first
    if flac_converter is None:  # flac utility is not installed
        base_path = os.path.dirname(
            os.path.abspath(__file__)
        )  # directory of the current module file, where all the FLAC bundled binaries are stored
        system, machine = platform.system(), platform.machine()
        if system == "Windows" and machine in {
            "i686",
            "i786",
            "x86",
            "x86_64",
            "AMD64",
        }:
            flac_converter = os.path.join(base_path, "flac-win32.exe")
        elif system == "Darwin" and machine in {
            "i686",
            "i786",
            "x86",
            "x86_64",
            "AMD64",
            "arm64",
        }:
            flac_converter = os.path.join(base_path, "flac-mac")
        elif system == "Linux" and machine in {"i686", "i786", "x86"}:
            flac_converter = os.path.join(base_path, "flac-linux-x86")
        elif system == "Linux" and machine in {"x86_64", "AMD64"}:
            flac_converter = os.path.join(base_path, "flac-linux-x86_64")
        else:  # no FLAC converter available
            raise OSError(
                "FLAC conversion utility not available - consider installing the FLAC command line application by running `apt-get install flac` or your operating system's equivalent"
            )

    # mark FLAC converter as executable if possible
    try:
        # handle known issue when running on docker:
        # running the executable right after chmod() may result in OSError "Text file busy"
        # fix: flush FS with sync
        if not os.access(flac_converter, os.X_OK):
            stat_info = os.stat(flac_converter)
            os.chmod(flac_converter, stat_info.st_mode | stat.S_IEXEC)
            if "Linux" in platform.system():
                os.sync() if sys.version_info >= (3, 3) else os.system("sync")

    except OSError:
        pass

    return flac_converter


def shutil_which(pgm):
    """Python 2 compatibility: backport of ``shutil.which()`` from Python 3"""
    path = os.getenv("PATH")
    for p in path.split(os.path.pathsep):
        p = os.path.join(p, pgm)
        if os.path.exists(p) and os.access(p, os.X_OK):
            return p
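For reference, a minimal usage sketch of the ``AudioData`` class in the file above; the raw byte buffer, the 16 kHz/16-bit parameters, and the output file name are illustrative assumptions, not part of this commit:

    import speech_recognition as sr

    # assume raw_bytes holds mono 16-bit little-endian PCM samples at 16 kHz (illustrative values)
    audio = sr.AudioData(raw_bytes, sample_rate=16000, sample_width=2)
    first_second = audio.get_segment(start_ms=0, end_ms=1000)  # trim to the first second
    with open("clip.wav", "wb") as f:  # hypothetical output path
        f.write(first_second.get_wav_data())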
@@ -0,0 +1,22 @@
class SetupError(Exception):
    pass


class WaitTimeoutError(Exception):
    pass


class RequestError(Exception):
    pass


class UnknownValueError(Exception):
    pass


class TranscriptionNotReady(Exception):
    pass


class TranscriptionFailed(Exception):
    pass
BIN
venv/lib/python3.11/site-packages/speech_recognition/flac-linux-x86
Executable file
Binary file not shown.
BIN
venv/lib/python3.11/site-packages/speech_recognition/flac-linux-x86_64
Executable file
Binary file not shown.
BIN
venv/lib/python3.11/site-packages/speech_recognition/flac-mac
Executable file
Binary file not shown.
BIN
venv/lib/python3.11/site-packages/speech_recognition/flac-win32.exe
Executable file
Binary file not shown.
@@ -0,0 +1,31 @@
Copyright (c) 1999-2015 Carnegie Mellon University. All rights
reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:

1. Redistributions of source code must retain the above copyright
   notice, this list of conditions and the following disclaimer.

2. Redistributions in binary form must reproduce the above copyright
   notice, this list of conditions and the following disclaimer in
   the documentation and/or other materials provided with the
   distribution.

This work was supported in part by funding from the Defense Advanced
Research Projects Agency and the National Science Foundation of the
United States of America, and the CMU Sphinx Speech Consortium.

THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND
ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY
NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
@@ -0,0 +1,34 @@
/* ====================================================================
 * Copyright (c) 2015 Alpha Cephei Inc. All rights
 * reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY ALPHA CEPHEI INC. ``AS IS'' AND.
 * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,.
 * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL ALPHA CEPHEI INC.
 * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT.
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,.
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY.
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT.
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE.
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 * ====================================================================
 *
 */

This directory contains generic US english acoustic model trained with
latest sphinxtrain.
@@ -0,0 +1,12 @@
-lowerf 130
-upperf 6800
-nfilt 25
-transform dct
-lifter 22
-feat 1s_c_d_dd
-svspec 0-12/13-25/26-38
-agc none
-cmn current
-varnorm no
-model ptm
-cmninit 40,3,-1
@@ -0,0 +1,5 @@
<s> SIL
</s> SIL
<sil> SIL
[NOISE] +NSN+
[SPEECH] +SPN+
File diff suppressed because it is too large
@@ -0,0 +1,262 @@
from __future__ import annotations

import json
from typing import Dict, Literal, TypedDict
from urllib.error import HTTPError, URLError
from urllib.parse import urlencode
from urllib.request import Request, urlopen

from typing_extensions import NotRequired

from speech_recognition.audio import AudioData
from speech_recognition.exceptions import RequestError, UnknownValueError


class Alternative(TypedDict):
    transcript: str
    confidence: float


class Result(TypedDict):
    alternative: list[Alternative]
    final: bool


class GoogleResponse(TypedDict):
    result: list[Result]
    result_index: NotRequired[int]


ProfanityFilterLevel = Literal[0, 1]
RequestHeaders = Dict[str, str]

ENDPOINT = "http://www.google.com/speech-api/v2/recognize"


class RequestBuilder:
    def __init__(
        self,
        *,
        endpoint: str,
        key: str,
        language: str,
        filter_level: ProfanityFilterLevel,
    ) -> None:
        self.endpoint = endpoint
        self.key = key
        self.language = language
        self.filter_level = filter_level

    def build(self, audio_data: AudioData) -> Request:
        if not isinstance(audio_data, AudioData):
            raise ValueError("``audio_data`` must be audio data")

        url = self.build_url()
        headers = self.build_headers(audio_data)
        flac_data = self.build_data(audio_data)
        request = Request(url, data=flac_data, headers=headers)
        return request

    def build_url(self) -> str:
        """
        >>> builder = RequestBuilder(endpoint="http://www.google.com/speech-api/v2/recognize", key="awesome-key", language="en-US", filter_level=0)
        >>> builder.build_url()
        'http://www.google.com/speech-api/v2/recognize?client=chromium&lang=en-US&key=awesome-key&pFilter=0'
        """
        params = urlencode(
            {
                "client": "chromium",
                "lang": self.language,
                "key": self.key,
                "pFilter": self.filter_level,
            }
        )
        return f"{self.endpoint}?{params}"

    def build_headers(self, audio_data: AudioData) -> RequestHeaders:
        """
        >>> builder = RequestBuilder(endpoint="", key="", language="", filter_level=1)
        >>> audio_data = AudioData(b"", 16_000, 1)
        >>> builder.build_headers(audio_data)
        {'Content-Type': 'audio/x-flac; rate=16000'}
        """
        rate = audio_data.sample_rate
        headers = {"Content-Type": f"audio/x-flac; rate={rate}"}
        return headers

    def build_data(self, audio_data: AudioData) -> bytes:
        flac_data = audio_data.get_flac_data(
            convert_rate=self.to_convert_rate(audio_data.sample_rate),
            convert_width=2,  # audio samples must be 16-bit
        )
        return flac_data

    @staticmethod
    def to_convert_rate(sample_rate: int) -> int:
        """Audio samples must be at least 8 kHz

        >>> RequestBuilder.to_convert_rate(16_000)
        >>> RequestBuilder.to_convert_rate(8_000)
        >>> RequestBuilder.to_convert_rate(7_999)
        8000
        """
        return None if sample_rate >= 8000 else 8000


def create_request_builder(
    *,
    endpoint: str,
    key: str | None = None,
    language: str = "en-US",
    filter_level: ProfanityFilterLevel = 0,
) -> RequestBuilder:
    if not isinstance(language, str):
        raise ValueError("``language`` must be a string")
    if key is not None and not isinstance(key, str):
        raise ValueError("``key`` must be ``None`` or a string")

    if key is None:
        key = "AIzaSyBOti4mM-6x9WDnZIjIeyEU21OpBXqWBgw"
    return RequestBuilder(
        endpoint=endpoint,
        key=key,
        language=language,
        filter_level=filter_level,
    )


class OutputParser:
    def __init__(self, *, show_all: bool, with_confidence: bool) -> None:
        self.show_all = show_all
        self.with_confidence = with_confidence

    def parse(self, response_text: str):
        actual_result = self.convert_to_result(response_text)
        if self.show_all:
            return actual_result

        best_hypothesis = self.find_best_hypothesis(
            actual_result["alternative"]
        )
        # https://cloud.google.com/speech-to-text/docs/basics#confidence-values
        # "Your code should not require the confidence field as it is not guaranteed to be accurate, or even set, in any of the results."
        confidence = best_hypothesis.get("confidence", 0.5)
        if self.with_confidence:
            return best_hypothesis["transcript"], confidence
        return best_hypothesis["transcript"]

    @staticmethod
    def convert_to_result(response_text: str) -> Result:
        r"""
        >>> response_text = '''{"result":[]}
        ... {"result":[{"alternative":[{"transcript":"one two three","confidence":0.49585345},{"transcript":"1 2","confidence":0.42899391}],"final":true}],"result_index":0}
        ... '''
        >>> OutputParser.convert_to_result(response_text)
        {'alternative': [{'transcript': 'one two three', 'confidence': 0.49585345}, {'transcript': '1 2', 'confidence': 0.42899391}], 'final': True}

        >>> OutputParser.convert_to_result("")
        Traceback (most recent call last):
        ...
        speech_recognition.exceptions.UnknownValueError
        >>> OutputParser.convert_to_result('\n{"result":[]}')
        Traceback (most recent call last):
        ...
        speech_recognition.exceptions.UnknownValueError
        >>> OutputParser.convert_to_result('{"result":[{"foo": "bar"}]}')
        Traceback (most recent call last):
        ...
        speech_recognition.exceptions.UnknownValueError
        >>> OutputParser.convert_to_result('{"result":[{"alternative": []}]}')
        Traceback (most recent call last):
        ...
        speech_recognition.exceptions.UnknownValueError
        """
        # ignore any blank blocks
        for line in response_text.split("\n"):
            if not line:
                continue
            result: list[Result] = json.loads(line)["result"]
            if len(result) != 0:
                if len(result[0].get("alternative", [])) == 0:
                    raise UnknownValueError()
                return result[0]
        raise UnknownValueError()

    @staticmethod
    def find_best_hypothesis(alternatives: list[Alternative]) -> Alternative:
        """
        >>> alternatives = [{"transcript": "one two three", "confidence": 0.42899391}, {"transcript": "1 2", "confidence": 0.49585345}]
        >>> OutputParser.find_best_hypothesis(alternatives)
        {'transcript': 'one two three', 'confidence': 0.42899391}

        >>> alternatives = [{"confidence": 0.49585345}]
        >>> OutputParser.find_best_hypothesis(alternatives)
        Traceback (most recent call last):
        ...
        speech_recognition.exceptions.UnknownValueError
        """
        if "confidence" in alternatives:
            # BUG: actual_result["alternative"] (=alternatives) is list, not dict
            # return alternative with highest confidence score
            best_hypothesis: Alternative = max(
                alternatives,
                key=lambda alternative: alternative["confidence"],
            )
        else:
            # when there is no confidence available, we arbitrarily choose the first hypothesis.
            best_hypothesis: Alternative = alternatives[0]
        if "transcript" not in best_hypothesis:
            raise UnknownValueError()
        return best_hypothesis


def obtain_transcription(request: Request, timeout: int) -> str:
    try:
        response = urlopen(request, timeout=timeout)
    except HTTPError as e:
        raise RequestError("recognition request failed: {}".format(e.reason))
    except URLError as e:
        raise RequestError(
            "recognition connection failed: {}".format(e.reason)
        )
    return response.read().decode("utf-8")


def recognize_legacy(
    recognizer,
    audio_data: AudioData,
    key: str | None = None,
    language: str = "en-US",
    pfilter: ProfanityFilterLevel = 0,
    show_all: bool = False,
    with_confidence: bool = False,
    *,
    endpoint: str = ENDPOINT,
):
    """Performs speech recognition on ``audio_data`` (an ``AudioData`` instance), using the Google Speech Recognition API.

    The Google Speech Recognition API key is specified by ``key``. If not specified, it uses a generic key that works out of the box. This should generally be used for personal or testing purposes only, as it **may be revoked by Google at any time**.

    To obtain your own API key, simply follow the steps on the `API Keys <http://www.chromium.org/developers/how-tos/api-keys>`__ page at the Chromium Developers site. In the Google Developers Console, Google Speech Recognition is listed as "Speech API".

    The recognition language is determined by ``language``, an RFC5646 language tag like ``"en-US"`` (US English) or ``"fr-FR"`` (International French), defaulting to US English. A list of supported language tags can be found in this `StackOverflow answer <http://stackoverflow.com/a/14302134>`__.

    The profanity filter level can be adjusted with ``pfilter``: 0 - No filter, 1 - Only shows the first character and replaces the rest with asterisks. The default is level 0.

    Returns the most likely transcription if ``show_all`` is false (the default). Otherwise, returns the raw API response as a JSON dictionary.

    Raises a ``speech_recognition.UnknownValueError`` exception if the speech is unintelligible. Raises a ``speech_recognition.RequestError`` exception if the speech recognition operation failed, if the key isn't valid, or if there is no internet connection.
    """
    request_builder = create_request_builder(
        endpoint=endpoint, key=key, language=language, filter_level=pfilter
    )
    request = request_builder.build(audio_data)

    response_text = obtain_transcription(
        request, timeout=recognizer.operation_timeout
    )

    output_parser = OutputParser(
        show_all=show_all, with_confidence=with_confidence
    )
    return output_parser.parse(response_text)
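For reference, a hedged usage sketch of the recognizer above; it assumes that ``recognize_legacy`` is reached through ``Recognizer.recognize_google``, as in the example at the top of this commit, and ``demo.wav`` is an illustrative file name:

    import speech_recognition as sr

    r = sr.Recognizer()
    with sr.AudioFile("demo.wav") as source:  # hypothetical input file
        audio = r.record(source)
    try:
        print(r.recognize_google(audio, language="en-US"))
    except sr.UnknownValueError:
        print("speech was unintelligible")
    except sr.RequestError as e:
        print("API request failed: {}".format(e))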
@@ -0,0 +1,142 @@
from __future__ import annotations

from typing import TYPE_CHECKING, TypedDict
from urllib.error import URLError

from speech_recognition.audio import AudioData
from speech_recognition.exceptions import RequestError, UnknownValueError

if TYPE_CHECKING:
    from google.cloud.speech import (
        RecognitionConfig,
        RecognizeResponse,
        SpeechContext,
    )
    from typing_extensions import Required


class GoogleCloudRecognizerParameters(TypedDict, total=False):
    """Optional parameters.

    The recognition language is determined by ``language_code``, which is a BCP-47 language tag like ``"en-US"`` (US English). Default: ``"en-US"``.
    A list of supported language tags can be found in the `Speech-to-Text supported languages <https://cloud.google.com/speech/docs/languages>`__.

    If ``preferred_phrases`` is an iterable of phrase strings, those given phrases will be more likely to be recognized over similar-sounding alternatives.
    This is useful for things like keyword/command recognition or adding new phrases that aren't in Google's vocabulary.
    Note that the API imposes certain `restrictions on the list of phrase strings <https://cloud.google.com/speech/limits#content>`__.

    ``show_all``: See :py:func:`recognize`.

    ``model``: You can select the model to get best results. (See `RecognitionConfig's documentation <https://cloud.google.com/python/docs/reference/speech/latest/google.cloud.speech_v1.types.RecognitionConfig>`__ for detail)

    ``use_enhanced``: Set to true to use an enhanced model for speech recognition.
    """

    # SpeechRecognition specific parameters
    preferred_phrases: list[str]
    show_all: bool

    # Speech-to-Text V1 API's parameters
    language_code: str
    model: str
    use_enhanced: bool
    # TODO Add others support


class GoogleCloudSpeechV1Parameters(TypedDict, total=False):
    """Speech-to-Text V1 API's parameters.

    https://cloud.google.com/python/docs/reference/speech/latest/google.cloud.speech_v1.types.RecognitionConfig
    """

    encoding: Required[RecognitionConfig.AudioEncoding]
    sample_rate_hertz: Required[int]
    language_code: Required[str]
    speech_contexts: list[SpeechContext]
    enable_word_time_offsets: bool
    model: str
    use_enhanced: bool


def _build_config(
    audio_data: AudioData, recognizer_params: GoogleCloudRecognizerParameters
) -> RecognitionConfig:
    from google.cloud import speech

    parameters: GoogleCloudSpeechV1Parameters = {
        "encoding": speech.RecognitionConfig.AudioEncoding.FLAC,
        "sample_rate_hertz": audio_data.sample_rate,
        "language_code": recognizer_params.pop("language_code", "en-US"),
    }
    if preferred_phrases := recognizer_params.pop("preferred_phrases", None):
        parameters["speech_contexts"] = [
            speech.SpeechContext(phrases=preferred_phrases)
        ]
    if recognizer_params.pop("show_all", False):
        # ref: https://cloud.google.com/speech-to-text/docs/async-time-offsets
        parameters["enable_word_time_offsets"] = True
    return speech.RecognitionConfig(**(parameters | recognizer_params))


def recognize(
    recognizer,
    audio_data: AudioData,
    credentials_json_path: str | None = None,
    **kwargs: GoogleCloudRecognizerParameters,
) -> str | RecognizeResponse:
    """Performs speech recognition on ``audio_data`` (an ``AudioData`` instance), using the Google Cloud Speech-to-Text V1 API.

    This function requires a Google Cloud Platform account; see the `Set up Speech-to-Text <https://cloud.google.com/speech-to-text/docs/before-you-begin>`__ page for details and instructions. Basically, create a project, enable billing for the project, enable the Google Cloud Speech API for the project, and create local authentication credentials for your user account. The result is a JSON file containing the API credentials. You can specify the JSON file by ``credentials_json_path``. If not specified, the library will try to automatically `find the default API credentials JSON file <https://developers.google.com/identity/protocols/application-default-credentials>`__.

    Returns the most likely transcription if ``show_all`` is False (the default). Otherwise, returns the raw API response as a JSON dictionary.
    For other parameters, see :py:class:`GoogleCloudRecognizerParameters`.

    Raises a ``speech_recognition.UnknownValueError`` exception if the speech is unintelligible. Raises a ``speech_recognition.RequestError`` exception if the speech recognition operation failed, if the credentials aren't valid, or if there is no Internet connection.
    """
    try:
        from google.api_core.exceptions import GoogleAPICallError
        from google.cloud import speech
    except ImportError:
        raise RequestError(
            "missing google-cloud-speech module: ensure that google-cloud-speech is set up correctly."
        )

    client = (
        speech.SpeechClient.from_service_account_json(credentials_json_path)
        if credentials_json_path
        else speech.SpeechClient()
    )

    flac_data = audio_data.get_flac_data(
        # audio sample rate must be between 8 kHz and 48 kHz inclusive - clamp sample rate into this range
        convert_rate=(
            None
            if 8000 <= audio_data.sample_rate <= 48000
            else max(8000, min(audio_data.sample_rate, 48000))
        ),
        convert_width=2,  # audio samples must be 16-bit
    )
    audio = speech.RecognitionAudio(content=flac_data)

    config = _build_config(audio_data, kwargs.copy())

    try:
        response = client.recognize(config=config, audio=audio)
    except GoogleAPICallError as e:
        raise RequestError(e)
    except URLError as e:
        raise RequestError(
            "recognition connection failed: {0}".format(e.reason)
        )

    if kwargs.get("show_all"):
        return response
    if len(response.results) == 0:
        raise UnknownValueError()

    transcript = " ".join(
        result.alternatives[0].transcript.strip()
        for result in response.results
    )
    return transcript
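For reference, a minimal usage sketch of the module above; it assumes the ``recognize()`` function is exposed as ``Recognizer.recognize_google_cloud``, that application-default credentials are already configured, and that ``meeting.flac`` is an illustrative file name:

    import speech_recognition as sr

    r = sr.Recognizer()
    with sr.AudioFile("meeting.flac") as source:  # hypothetical input file
        audio = r.record(source)
    # language_code and preferred_phrases are the optional parameters documented above
    text = r.recognize_google_cloud(
        audio,
        language_code="en-US",
        preferred_phrases=["SpeechRecognition"],
    )
    print(text)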
@@ -0,0 +1,111 @@
from __future__ import annotations

import os
from collections.abc import Sequence

from speech_recognition import PortableNamedTemporaryFile
from speech_recognition.audio import AudioData
from speech_recognition.exceptions import RequestError, UnknownValueError

AcousticParametersDirectoryPath = str
LanguageModelFilePath = str
PhonemeDictionaryFilePath = str
SphinxDataFilePaths = tuple[AcousticParametersDirectoryPath, LanguageModelFilePath, PhonemeDictionaryFilePath]

Keyword = str
Sensitivity = float
KeywordEntry = tuple[Keyword, Sensitivity]


def recognize(
    recognizer,
    audio_data: AudioData,
    language: str | SphinxDataFilePaths = "en-US",
    keyword_entries: Sequence[KeywordEntry] | None = None,
    grammar: str | None = None,
    show_all: bool = False,
):
    """
    Performs speech recognition on ``audio_data`` (an ``AudioData`` instance), using CMU Sphinx.

    The recognition language is determined by ``language``, an RFC5646 language tag like ``"en-US"`` or ``"en-GB"``, defaulting to US English. Out of the box, only ``en-US`` is supported. See `Notes on using PocketSphinx <https://github.com/Uberi/speech_recognition/blob/master/reference/pocketsphinx.rst>`__ for information about installing other languages. This document is also included under ``reference/pocketsphinx.rst``. The ``language`` parameter can also be a tuple of filesystem paths, of the form ``(acoustic_parameters_directory, language_model_file, phoneme_dictionary_file)`` - this allows you to load arbitrary Sphinx models.

    If specified, the keywords to search for are determined by ``keyword_entries``, an iterable of tuples of the form ``(keyword, sensitivity)``, where ``keyword`` is a phrase, and ``sensitivity`` is how sensitive to this phrase the recognizer should be, on a scale of 0 (very insensitive, more false negatives) to 1 (very sensitive, more false positives) inclusive. If not specified or ``None``, no keywords are used and Sphinx will simply transcribe whatever words it recognizes. Specifying ``keyword_entries`` is more accurate than just looking for those same keywords in non-keyword-based transcriptions, because Sphinx knows specifically what sounds to look for.

    Sphinx can also handle FSG or JSGF grammars. The parameter ``grammar`` expects a path to the grammar file. Note that if a JSGF grammar is passed, an FSG grammar will be created at the same location to speed up execution in the next run. If ``keyword_entries`` are passed, the content of ``grammar`` will be ignored.

    Returns the most likely transcription if ``show_all`` is false (the default). Otherwise, returns the Sphinx ``pocketsphinx.pocketsphinx.Decoder`` object resulting from the recognition.

    Raises a ``speech_recognition.UnknownValueError`` exception if the speech is unintelligible. Raises a ``speech_recognition.RequestError`` exception if there are any issues with the Sphinx installation.
    """
    # TODO Move this validation into KeywordEntry initialization
    assert keyword_entries is None or all(isinstance(keyword, (type(""), type(u""))) and 0 <= sensitivity <= 1 for keyword, sensitivity in keyword_entries), "``keyword_entries`` must be ``None`` or a list of pairs of strings and numbers between 0 and 1"

    try:
        from pocketsphinx import FsgModel, Jsgf, pocketsphinx
    except ImportError:
        raise RequestError("missing PocketSphinx module: ensure that PocketSphinx is set up correctly.")

    if isinstance(language, str):  # directory containing language data
        language_directory = os.path.join(os.path.dirname(os.path.dirname(os.path.realpath(__file__))), "pocketsphinx-data", language)
        if not os.path.isdir(language_directory):
            raise RequestError("missing PocketSphinx language data directory: \"{}\"".format(language_directory))
        acoustic_parameters_directory = os.path.join(language_directory, "acoustic-model")
        language_model_file = os.path.join(language_directory, "language-model.lm.bin")
        phoneme_dictionary_file = os.path.join(language_directory, "pronounciation-dictionary.dict")
    else:  # 3-tuple of Sphinx data file paths
        acoustic_parameters_directory, language_model_file, phoneme_dictionary_file = language
        if not os.path.isdir(acoustic_parameters_directory):
            raise RequestError("missing PocketSphinx language model parameters directory: \"{}\"".format(acoustic_parameters_directory))
        if not os.path.isfile(language_model_file):
            raise RequestError("missing PocketSphinx language model file: \"{}\"".format(language_model_file))
        if not os.path.isfile(phoneme_dictionary_file):
            raise RequestError("missing PocketSphinx phoneme dictionary file: \"{}\"".format(phoneme_dictionary_file))

    # create decoder object
    config = pocketsphinx.Config()
    config.set_string("-hmm", acoustic_parameters_directory)  # set the path of the hidden Markov model (HMM) parameter files
    config.set_string("-lm", language_model_file)
    config.set_string("-dict", phoneme_dictionary_file)
    config.set_string("-logfn", os.devnull)  # disable logging (logging causes unwanted output in terminal)
    decoder = pocketsphinx.Decoder(config)

    # obtain audio data
    raw_data = audio_data.get_raw_data(convert_rate=16000, convert_width=2)  # the included language models require audio to be 16-bit mono 16 kHz in little-endian format

    # obtain recognition results
    if keyword_entries is not None:  # explicitly specified set of keywords
        with PortableNamedTemporaryFile("w") as f:
            # generate a keywords file - Sphinx documentation recommends sensitivities between 1e-50 and 1e-5
            f.writelines("{} /1e{}/\n".format(keyword, 100 * sensitivity - 110) for keyword, sensitivity in keyword_entries)
            f.flush()

            # perform the speech recognition with the keywords file (this is inside the context manager so the file isn't deleted until we're done)
            decoder.add_kws("keywords", f.name)
            decoder.activate_search("keywords")
    elif grammar is not None:  # a path to a FSG or JSGF grammar
        if not os.path.exists(grammar):
            raise ValueError("Grammar '{0}' does not exist.".format(grammar))
        grammar_path = os.path.abspath(os.path.dirname(grammar))
        grammar_name = os.path.splitext(os.path.basename(grammar))[0]
        fsg_path = "{0}/{1}.fsg".format(grammar_path, grammar_name)
        if not os.path.exists(fsg_path):  # create FSG grammar if not available
            jsgf = Jsgf(grammar)
            rule = jsgf.get_rule("{0}.{0}".format(grammar_name))
            fsg = jsgf.build_fsg(rule, decoder.get_logmath(), 7.5)
            fsg.writefile(fsg_path)
        else:
            fsg = FsgModel(fsg_path, decoder.get_logmath(), 7.5)
        decoder.set_fsg(grammar_name, fsg)
        decoder.set_search(grammar_name)

    decoder.start_utt()  # begin utterance processing
    decoder.process_raw(raw_data, False, True)  # process audio data with recognition enabled (no_search = False), as a full utterance (full_utt = True)
    decoder.end_utt()  # stop utterance processing

    if show_all: return decoder

    # return results
    hypothesis = decoder.hyp()
    if hypothesis is not None: return hypothesis.hypstr
    raise UnknownValueError()  # no transcriptions available
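For reference, a minimal offline usage sketch of the Sphinx recognizer above; it assumes the ``recognize()`` function is exposed as ``Recognizer.recognize_sphinx``, and the keywords, sensitivities, and ``command.wav`` file name are illustrative values:

    import speech_recognition as sr

    r = sr.Recognizer()
    with sr.AudioFile("command.wav") as source:  # hypothetical input file
        audio = r.record(source)
    # look only for a couple of keywords, with moderate sensitivities between 0 and 1
    print(r.recognize_sphinx(audio, keyword_entries=[("stop", 0.8), ("go", 0.5)]))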
@@ -0,0 +1,22 @@
from io import BytesIO

from speech_recognition.audio import AudioData


class OpenAICompatibleRecognizer:
    def __init__(self, client) -> None:
        self.client = client

    def recognize(self, audio_data: "AudioData", model: str, **kwargs) -> str:
        if not isinstance(audio_data, AudioData):
            raise ValueError(
                "``audio_data`` must be an ``AudioData`` instance"
            )

        wav_data = BytesIO(audio_data.get_wav_data())
        wav_data.name = "SpeechRecognition_audio.wav"

        transcript = self.client.audio.transcriptions.create(
            file=wav_data, model=model, **kwargs
        )
        return transcript.text
@@ -0,0 +1,54 @@
from __future__ import annotations

from typing import Literal, TypedDict

from typing_extensions import Unpack

from speech_recognition.audio import AudioData
from speech_recognition.exceptions import SetupError
from speech_recognition.recognizers.whisper_api.base import (
    OpenAICompatibleRecognizer,
)

# https://console.groq.com/docs/speech-text#supported-models
GroqModel = Literal[
    "whisper-large-v3-turbo", "whisper-large-v3", "distil-whisper-large-v3-en"
]


class GroqOptionalParameters(TypedDict):
    """Groq speech transcription's optional parameters.

    https://console.groq.com/docs/speech-text#transcription-endpoint-usage
    """

    prompt: str
    response_format: str
    temperature: float
    language: str


def recognize(
    recognizer,
    audio_data: "AudioData",
    *,
    model: GroqModel = "whisper-large-v3-turbo",
    **kwargs: Unpack[GroqOptionalParameters],
) -> str:
    """Performs speech recognition on ``audio_data`` (an ``AudioData`` instance), using the Groq Whisper API.

    This function requires a Groq login; visit https://console.groq.com/login, then generate an API Key in the `API Keys <https://console.groq.com/keys>`__ menu.

    Detail: https://console.groq.com/docs/speech-text

    Set the environment variable ``GROQ_API_KEY``; otherwise the groq library will raise a ``groq.GroqError``.
    """
    try:
        import groq
    except ImportError:
        raise SetupError(
            "missing groq module: ensure that groq is set up correctly."
        )

    groq_recognizer = OpenAICompatibleRecognizer(groq.Groq())
    return groq_recognizer.recognize(audio_data, model, **kwargs)
@@ -0,0 +1,83 @@
from __future__ import annotations

from typing import Literal

from typing_extensions import Unpack

from speech_recognition.audio import AudioData
from speech_recognition.exceptions import SetupError
from speech_recognition.recognizers.whisper_api.base import (
    OpenAICompatibleRecognizer,
)

# https://platform.openai.com/docs/api-reference/audio/createTranscription#audio-createtranscription-model
WhisperModel = Literal[
    "whisper-1", "gpt-4o-transcribe", "gpt-4o-mini-transcribe"
]


class OpenAIOptionalParameters:
    """OpenAI speech transcription's optional parameters.

    https://platform.openai.com/docs/api-reference/audio/createTranscription
    """

    language: str
    prompt: str
    # TODO Add support `Literal["text", "srt", "verbose_json", "vtt"]`
    response_format: Literal["json"]
    temperature: float
    # timestamp_granularities  # TODO support


def recognize(
    recognizer,
    audio_data: "AudioData",
    *,
    model: WhisperModel = "whisper-1",
    **kwargs: Unpack[OpenAIOptionalParameters],
) -> str:
    """Performs speech recognition on ``audio_data`` (an ``AudioData`` instance), using the OpenAI Whisper API.

    This function requires an OpenAI account; visit https://platform.openai.com/signup, then generate an API Key in `User settings <https://platform.openai.com/account/api-keys>`__.

    Detail: https://platform.openai.com/docs/guides/speech-to-text

    Set the environment variable ``OPENAI_API_KEY``; otherwise the openai library will raise an ``openai.OpenAIError``.
    """
    try:
        import openai
    except ImportError:
        raise SetupError(
            "missing openai module: ensure that openai is set up correctly."
        )

    openai_recognizer = OpenAICompatibleRecognizer(openai.OpenAI())
    return openai_recognizer.recognize(audio_data, model, **kwargs)


if __name__ == "__main__":
    import argparse
    from typing import get_args

    import speech_recognition as sr

    parser = argparse.ArgumentParser()
    parser.add_argument("audio_file")
    parser.add_argument(
        "--model", choices=get_args(WhisperModel), default="whisper-1"
    )
    parser.add_argument("-l", "--language")
    args = parser.parse_args()

    r = sr.Recognizer()
    with sr.AudioFile(args.audio_file) as source:
        audio_data = r.listen(source)

    if args.language:
        transcription = recognize(
            None, audio_data, model=args.model, language=args.language
        )
    else:
        transcription = recognize(None, audio_data, model=args.model)
    print(transcription)
@@ -0,0 +1,45 @@
from __future__ import annotations

import io
from typing import TYPE_CHECKING, Any, Protocol

from speech_recognition.audio import AudioData

if TYPE_CHECKING:
    import numpy as np


class Transcribable(Protocol):
    def transcribe(
        self, audio_array: np.ndarray, **kwargs
    ) -> str | dict[str, Any]:
        pass


class WhisperCompatibleRecognizer:
    def __init__(self, model: Transcribable) -> None:
        self.model = model

    def recognize(
        self, audio_data: AudioData, show_dict: bool = False, **kwargs
    ):
        if not isinstance(audio_data, AudioData):
            raise ValueError(
                "``audio_data`` must be an ``AudioData`` instance"
            )

        import numpy as np
        import soundfile as sf

        # 16 kHz https://github.com/openai/whisper/blob/28769fcfe50755a817ab922a7bc83483159600a9/whisper/audio.py#L98-L99
        wav_bytes = audio_data.get_wav_data(convert_rate=16000)
        wav_stream = io.BytesIO(wav_bytes)
        audio_array, sampling_rate = sf.read(wav_stream)
        audio_array = audio_array.astype(np.float32)

        result = self.model.transcribe(audio_array, **kwargs)

        if show_dict:
            return result
        else:
            return result["text"]
@@ -0,0 +1,106 @@
from __future__ import annotations

from typing import TYPE_CHECKING, Literal, TypedDict

from speech_recognition.audio import AudioData
from speech_recognition.recognizers.whisper_local.base import (
    WhisperCompatibleRecognizer,
)

if TYPE_CHECKING:
    import numpy as np
    from faster_whisper import WhisperModel
    from faster_whisper.transcribe import Segment
    from typing_extensions import Unpack


class TranscribeOutput(TypedDict):
    text: str
    segments: list[Segment]
    language: str


class TranscribableAdapter:
    def __init__(self, model: WhisperModel) -> None:
        self.model = model

    def transcribe(
        self, audio_array: np.ndarray, **kwargs
    ) -> TranscribeOutput:
        segments_generator, info = self.model.transcribe(audio_array, **kwargs)
        segments = list(segments_generator)
        return {
            "text": " ".join(segment.text for segment in segments),
            "segments": segments,
            "language": info.language,
        }


class InitOptionalParameters(TypedDict, total=False):
    # https://github.com/SYSTRAN/faster-whisper/blob/v1.1.0/faster_whisper/transcribe.py#L575
    device: Literal["cpu", "gpu", "auto"]
    compute_type: str
    download_root: str
    # TODO Add others


class TranscribeOptionalParameters(TypedDict, total=False):
    # https://github.com/SYSTRAN/faster-whisper/blob/v1.1.0/faster_whisper/transcribe.py#L692
    language: str
    task: Literal["transcribe", "translate"]
    beam_size: int
    # TODO Add others


def recognize(
    recognizer,
    audio_data: AudioData,
    model: str = "base",
    show_dict: bool = False,
    init_options: InitOptionalParameters | None = None,
    **transcribe_options: Unpack[TranscribeOptionalParameters],
) -> str | TranscribeOutput:
    """Performs speech recognition on ``audio_data`` (an ``AudioData`` instance), using Whisper.

    Pick a ``model`` size (same as Whisper).

    If ``show_dict`` is true, returns the detailed response from Whisper, including the detected language. Otherwise returns only the transcription.

    You can specify:

        * ``language``: recognition language, an uncapitalized 2-letter language name like "en" or "fr".

            * If not set, Faster Whisper will automatically detect the language.

        * ``task``

            * If you want transcribe + **translate** to English, set ``task="translate"``.

    Other values are passed directly to whisper. See https://github.com/SYSTRAN/faster-whisper/blob/master/faster_whisper/transcribe.py for all options.
    """
    from faster_whisper import WhisperModel

    model = WhisperModel(model, **init_options or {})
    whisper_recognizer = WhisperCompatibleRecognizer(
        TranscribableAdapter(model)
    )
    return whisper_recognizer.recognize(
        audio_data, show_dict=show_dict, **transcribe_options
    )


if __name__ == "__main__":
    import argparse

    import speech_recognition as sr

    parser = argparse.ArgumentParser()
    parser.add_argument("audio_file")
    args = parser.parse_args()

    r = sr.Recognizer()
    with sr.AudioFile(args.audio_file) as source:
        audio_data = r.listen(source)

    transcription = recognize(None, audio_data)
    print(transcription)
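For reference, a minimal local-transcription sketch of the module above; it assumes the ``recognize()`` function is wired up as ``Recognizer.recognize_faster_whisper`` and that ``interview.wav`` is an illustrative file name:

    import speech_recognition as sr

    r = sr.Recognizer()
    with sr.AudioFile("interview.wav") as source:  # hypothetical input file
        audio = r.record(source)
    # show_dict=True returns the TranscribeOutput dict documented above instead of plain text
    result = r.recognize_faster_whisper(audio, model="base", show_dict=True, language="en")
    print(result["text"], result["language"])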
@@ -0,0 +1,108 @@
from __future__ import annotations

from typing import TYPE_CHECKING, Literal, TypedDict

from speech_recognition.audio import AudioData
from speech_recognition.recognizers.whisper_local.base import (
    WhisperCompatibleRecognizer,
)

if TYPE_CHECKING:
    import numpy as np
    import torch
    from typing_extensions import Unpack
    from whisper import Whisper


class LoadModelOptionalParameters(TypedDict, total=False):
    # ref: https://github.com/openai/whisper/blob/v20240930/whisper/__init__.py#L103
    device: str | torch.device
    download_root: str
    in_memory: bool


class TranscribeOptionalParameters(TypedDict, total=False):
    """Transcribe optional parameters & DecodingOptions parameters."""

    # ref: https://github.com/openai/whisper/blob/v20240930/whisper/transcribe.py#L38
    temperature: float | tuple[float, ...]
    # TODO Add others

    # ref: https://github.com/openai/whisper/blob/v20240930/whisper/decoding.py#L81
    task: Literal["transcribe", "translate"]
    language: str
    fp16: bool
    # TODO Add others


class Segment(TypedDict):
    id: int
    seek: int
    start: float
    end: float
    text: str
    tokens: list[int]
    temperature: float
    avg_logprob: float
    compression_ratio: float
    no_speech_prob: float


class TranscribeOutput(TypedDict):
    text: str
    segments: list[Segment]
    language: str


class TranscribableAdapter:
    def __init__(self, model: Whisper) -> None:
        self.model = model

    def transcribe(
        self, audio_array: np.ndarray, **kwargs
    ) -> TranscribeOutput:
        if "fp16" not in kwargs:
            import torch

            kwargs["fp16"] = torch.cuda.is_available()

        return self.model.transcribe(audio_array, **kwargs)


def recognize(
    recognizer,
    audio_data: AudioData,
    model: str = "base",
    show_dict: bool = False,
    load_options: LoadModelOptionalParameters | None = None,
    **transcribe_options: Unpack[TranscribeOptionalParameters],
) -> str | TranscribeOutput:
    """Performs speech recognition on ``audio_data`` (an ``AudioData`` instance), using Whisper.

    Pick ``model`` from the output of :command:`python -c 'import whisper; print(whisper.available_models())'`.
    See also https://github.com/openai/whisper?tab=readme-ov-file#available-models-and-languages.

    If ``show_dict`` is true, returns the full dict response from Whisper, including the detected language. Otherwise returns only the transcription.

    You can specify:

        * ``language``: recognition language, an uncapitalized full language name like "english" or "chinese". See the full language list at https://github.com/openai/whisper/blob/main/whisper/tokenizer.py

            * If not set, Whisper will automatically detect the language.

        * ``task``

            * If you want transcribe + **translate** to English, set ``task="translate"``.

    Other values are passed directly to whisper. See https://github.com/openai/whisper/blob/main/whisper/transcribe.py for all options.
    """

    import whisper

    whisper_model = whisper.load_model(model, **load_options or {})
    whisper_recognizer = WhisperCompatibleRecognizer(
        TranscribableAdapter(whisper_model)
    )
    return whisper_recognizer.recognize(
        audio_data, show_dict=show_dict, **transcribe_options
    )