first commit

commit 216064f731
2025-04-04 13:23:15 -06:00
2103 changed files with 522593 additions and 0 deletions

File diff suppressed because it is too large.

View File

@@ -0,0 +1,24 @@
import speech_recognition as sr
r = sr.Recognizer()
m = sr.Microphone()
try:
print("A moment of silence, please...")
with m as source: r.adjust_for_ambient_noise(source)
print("Set minimum energy threshold to {}".format(r.energy_threshold))
while True:
print("Say something!")
with m as source: audio = r.listen(source)
print("Got it! Now to recognize it...")
try:
# recognize speech using Google Speech Recognition
value = r.recognize_google(audio)
print("You said {}".format(value))
except sr.UnknownValueError:
print("Oops! Didn't catch that")
except sr.RequestError as e:
print("Uh oh! Couldn't request results from Google Speech Recognition service; {0}".format(e))
except KeyboardInterrupt:
pass
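The loop above blocks on each ``listen`` call. As a non-blocking alternative, here is a minimal sketch using the library's ``Recognizer.listen_in_background`` helper (defined in the suppressed main-module diff above); it mirrors the error handling of the loop, and the ``callback`` name is illustrative.

import speech_recognition as sr

def callback(recognizer, audio):  # runs on a background thread each time a phrase is captured
    try:
        print("You said {}".format(recognizer.recognize_google(audio)))
    except sr.UnknownValueError:
        print("Oops! Didn't catch that")
    except sr.RequestError as e:
        print("Uh oh! Couldn't request results from Google Speech Recognition service; {0}".format(e))

r = sr.Recognizer()
m = sr.Microphone()
with m as source:
    r.adjust_for_ambient_noise(source)
stop_listening = r.listen_in_background(m, callback)  # returns a function that stops the listener
# ... do other work here; later, call stop_listening(wait_for_stop=False) to stop the background thread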

View File

@@ -0,0 +1,318 @@
import aifc
import audioop
import io
import os
import platform
import stat
import subprocess
import sys
import wave
class AudioData(object):
"""
Creates a new ``AudioData`` instance, which represents mono audio data.
The raw audio data is specified by ``frame_data``, which is a sequence of bytes representing audio samples. This is the frame data structure used by the PCM WAV format.
The width of each sample, in bytes, is specified by ``sample_width``. Each group of ``sample_width`` bytes represents a single audio sample.
The audio data is assumed to have a sample rate of ``sample_rate`` samples per second (Hertz).
Usually, instances of this class are obtained from ``recognizer_instance.record`` or ``recognizer_instance.listen``, or in the callback for ``recognizer_instance.listen_in_background``, rather than instantiating them directly.
"""
def __init__(self, frame_data, sample_rate, sample_width):
assert sample_rate > 0, "Sample rate must be a positive integer"
assert (
sample_width % 1 == 0 and 1 <= sample_width <= 4
), "Sample width must be between 1 and 4 inclusive"
self.frame_data = frame_data
self.sample_rate = sample_rate
self.sample_width = int(sample_width)
def get_segment(self, start_ms=None, end_ms=None):
"""
Returns a new ``AudioData`` instance, trimmed to a given time interval. In other words, an ``AudioData`` instance with the same audio data except starting at ``start_ms`` milliseconds in and ending ``end_ms`` milliseconds in.
If not specified, ``start_ms`` defaults to the beginning of the audio, and ``end_ms`` defaults to the end.
"""
assert (
start_ms is None or start_ms >= 0
), "``start_ms`` must be a non-negative number"
assert end_ms is None or end_ms >= (
0 if start_ms is None else start_ms
), "``end_ms`` must be a non-negative number greater or equal to ``start_ms``"
if start_ms is None:
start_byte = 0
else:
start_byte = int(
(start_ms * self.sample_rate * self.sample_width) // 1000
)
if end_ms is None:
end_byte = len(self.frame_data)
else:
end_byte = int(
(end_ms * self.sample_rate * self.sample_width) // 1000
)
return AudioData(
self.frame_data[start_byte:end_byte],
self.sample_rate,
self.sample_width,
)
def get_raw_data(self, convert_rate=None, convert_width=None):
"""
Returns a byte string representing the raw frame data for the audio represented by the ``AudioData`` instance.
If ``convert_rate`` is specified and the audio sample rate is not ``convert_rate`` Hz, the resulting audio is resampled to match.
If ``convert_width`` is specified and the audio samples are not ``convert_width`` bytes each, the resulting audio is converted to match.
Writing these bytes directly to a file results in a valid `RAW/PCM audio file <https://en.wikipedia.org/wiki/Raw_audio_format>`__.
"""
assert (
convert_rate is None or convert_rate > 0
), "Sample rate to convert to must be a positive integer"
assert convert_width is None or (
convert_width % 1 == 0 and 1 <= convert_width <= 4
), "Sample width to convert to must be between 1 and 4 inclusive"
raw_data = self.frame_data
# make sure unsigned 8-bit audio (which uses unsigned samples) is handled like higher sample width audio (which uses signed samples)
if self.sample_width == 1:
raw_data = audioop.bias(
raw_data, 1, -128
) # subtract 128 from every sample to make them act like signed samples
# resample audio at the desired rate if specified
if convert_rate is not None and self.sample_rate != convert_rate:
raw_data, _ = audioop.ratecv(
raw_data,
self.sample_width,
1,
self.sample_rate,
convert_rate,
None,
)
# convert samples to desired sample width if specified
if convert_width is not None and self.sample_width != convert_width:
if (
convert_width == 3
): # we're converting the audio into 24-bit (workaround for https://bugs.python.org/issue12866)
raw_data = audioop.lin2lin(
raw_data, self.sample_width, 4
) # convert audio into 32-bit first, which is always supported
try:
audioop.bias(
b"", 3, 0
) # test whether 24-bit audio is supported (for example, ``audioop`` in Python 3.3 and below doesn't support sample width 3, while Python 3.4+ does)
except (
audioop.error
): # this version of audioop doesn't support 24-bit audio (probably Python 3.3 or less)
raw_data = b"".join(
raw_data[i + 1: i + 4]
for i in range(0, len(raw_data), 4)
) # since we're in little endian, we discard the first byte from each 32-bit sample to get a 24-bit sample
else: # 24-bit audio fully supported, we don't need to shim anything
raw_data = audioop.lin2lin(
raw_data, self.sample_width, convert_width
)
else:
raw_data = audioop.lin2lin(
raw_data, self.sample_width, convert_width
)
# if the output is 8-bit audio with unsigned samples, convert the samples we've been treating as signed to unsigned again
if convert_width == 1:
raw_data = audioop.bias(
raw_data, 1, 128
) # add 128 to every sample to make them act like unsigned samples again
return raw_data
def get_wav_data(self, convert_rate=None, convert_width=None):
"""
Returns a byte string representing the contents of a WAV file containing the audio represented by the ``AudioData`` instance.
If ``convert_width`` is specified and the audio samples are not ``convert_width`` bytes each, the resulting audio is converted to match.
If ``convert_rate`` is specified and the audio sample rate is not ``convert_rate`` Hz, the resulting audio is resampled to match.
Writing these bytes directly to a file results in a valid `WAV file <https://en.wikipedia.org/wiki/WAV>`__.
"""
raw_data = self.get_raw_data(convert_rate, convert_width)
sample_rate = (
self.sample_rate if convert_rate is None else convert_rate
)
sample_width = (
self.sample_width if convert_width is None else convert_width
)
# generate the WAV file contents
with io.BytesIO() as wav_file:
wav_writer = wave.open(wav_file, "wb")
try: # note that we can't use context manager, since that was only added in Python 3.4
wav_writer.setframerate(sample_rate)
wav_writer.setsampwidth(sample_width)
wav_writer.setnchannels(1)
wav_writer.writeframes(raw_data)
wav_data = wav_file.getvalue()
finally: # make sure resources are cleaned up
wav_writer.close()
return wav_data
def get_aiff_data(self, convert_rate=None, convert_width=None):
"""
Returns a byte string representing the contents of an AIFF-C file containing the audio represented by the ``AudioData`` instance.
If ``convert_width`` is specified and the audio samples are not ``convert_width`` bytes each, the resulting audio is converted to match.
If ``convert_rate`` is specified and the audio sample rate is not ``convert_rate`` Hz, the resulting audio is resampled to match.
Writing these bytes directly to a file results in a valid `AIFF-C file <https://en.wikipedia.org/wiki/Audio_Interchange_File_Format>`__.
"""
raw_data = self.get_raw_data(convert_rate, convert_width)
sample_rate = (
self.sample_rate if convert_rate is None else convert_rate
)
sample_width = (
self.sample_width if convert_width is None else convert_width
)
# the AIFF format is big-endian, so we need to convert the little-endian raw data to big-endian
if hasattr(
audioop, "byteswap"
): # ``audioop.byteswap`` was only added in Python 3.4
raw_data = audioop.byteswap(raw_data, sample_width)
else: # manually reverse the bytes of each sample, which is slower but works well enough as a fallback
raw_data = raw_data[sample_width - 1:: -1] + b"".join(
raw_data[i + sample_width: i: -1]
for i in range(sample_width - 1, len(raw_data), sample_width)
)
# generate the AIFF-C file contents
with io.BytesIO() as aiff_file:
aiff_writer = aifc.open(aiff_file, "wb")
try: # note that we can't use context manager, since that was only added in Python 3.4
aiff_writer.setframerate(sample_rate)
aiff_writer.setsampwidth(sample_width)
aiff_writer.setnchannels(1)
aiff_writer.writeframes(raw_data)
aiff_data = aiff_file.getvalue()
finally: # make sure resources are cleaned up
aiff_writer.close()
return aiff_data
def get_flac_data(self, convert_rate=None, convert_width=None):
"""
Returns a byte string representing the contents of a FLAC file containing the audio represented by the ``AudioData`` instance.
Note that 32-bit FLAC is not supported. If the audio data is 32-bit and ``convert_width`` is not specified, then the resulting FLAC will be a 24-bit FLAC.
If ``convert_rate`` is specified and the audio sample rate is not ``convert_rate`` Hz, the resulting audio is resampled to match.
If ``convert_width`` is specified and the audio samples are not ``convert_width`` bytes each, the resulting audio is converted to match.
Writing these bytes directly to a file results in a valid `FLAC file <https://en.wikipedia.org/wiki/FLAC>`__.
"""
assert convert_width is None or (
convert_width % 1 == 0 and 1 <= convert_width <= 3
), "Sample width to convert to must be between 1 and 3 inclusive"
if (
self.sample_width > 3 and convert_width is None
): # resulting WAV data would be 32-bit, which is not convertible to FLAC using our encoder
convert_width = 3 # the largest supported sample width is 24-bit, so we'll limit the sample width to that
# run the FLAC converter with the WAV data to get the FLAC data
wav_data = self.get_wav_data(convert_rate, convert_width)
flac_converter = get_flac_converter()
if (
os.name == "nt"
): # on Windows, specify that the process is to be started without showing a console window
startup_info = subprocess.STARTUPINFO()
startup_info.dwFlags |= (
subprocess.STARTF_USESHOWWINDOW
) # specify that the wShowWindow field of `startup_info` contains a value
startup_info.wShowWindow = (
subprocess.SW_HIDE
) # specify that the console window should be hidden
else:
startup_info = None # default startupinfo
process = subprocess.Popen(
[
flac_converter,
"--stdout",
"--totally-silent", # put the resulting FLAC file in stdout, and make sure it's not mixed with any program output
"--best", # highest level of compression available
"-", # the input FLAC file contents will be given in stdin
],
stdin=subprocess.PIPE,
stdout=subprocess.PIPE,
startupinfo=startup_info,
)
flac_data, stderr = process.communicate(wav_data)
return flac_data
def get_flac_converter():
"""Returns the absolute path of a FLAC converter executable, or raises an OSError if none can be found."""
flac_converter = shutil_which("flac") # check for installed version first
if flac_converter is None: # flac utility is not installed
base_path = os.path.dirname(
os.path.abspath(__file__)
) # directory of the current module file, where all the FLAC bundled binaries are stored
system, machine = platform.system(), platform.machine()
if system == "Windows" and machine in {
"i686",
"i786",
"x86",
"x86_64",
"AMD64",
}:
flac_converter = os.path.join(base_path, "flac-win32.exe")
elif system == "Darwin" and machine in {
"i686",
"i786",
"x86",
"x86_64",
"AMD64",
"arm64",
}:
flac_converter = os.path.join(base_path, "flac-mac")
elif system == "Linux" and machine in {"i686", "i786", "x86"}:
flac_converter = os.path.join(base_path, "flac-linux-x86")
elif system == "Linux" and machine in {"x86_64", "AMD64"}:
flac_converter = os.path.join(base_path, "flac-linux-x86_64")
else: # no FLAC converter available
raise OSError(
"FLAC conversion utility not available - consider installing the FLAC command line application by running `apt-get install flac` or your operating system's equivalent"
)
# mark FLAC converter as executable if possible
try:
# handle known issue when running on docker:
# run executable right after chmod() may result in OSError "Text file busy"
# fix: flush FS with sync
if not os.access(flac_converter, os.X_OK):
stat_info = os.stat(flac_converter)
os.chmod(flac_converter, stat_info.st_mode | stat.S_IEXEC)
if "Linux" in platform.system():
os.sync() if sys.version_info >= (3, 3) else os.system("sync")
except OSError:
pass
return flac_converter
def shutil_which(pgm):
"""Python 2 compatibility: backport of ``shutil.which()`` from Python 3"""
path = os.getenv("PATH")
for p in path.split(os.path.pathsep):
p = os.path.join(p, pgm)
if os.path.exists(p) and os.access(p, os.X_OK):
return p
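A brief usage sketch of ``AudioData`` as defined above, assuming the package-level ``Recognizer`` and ``AudioFile`` helpers from the suppressed main-module diff; ``speech.wav`` and the one-second trim window are illustrative.

import speech_recognition as sr

r = sr.Recognizer()
with sr.AudioFile("speech.wav") as source:  # hypothetical input file
    audio = r.record(source)  # returns an AudioData instance
first_second = audio.get_segment(start_ms=0, end_ms=1000)  # trim to the first second
with open("first_second.wav", "wb") as f:
    f.write(first_second.get_wav_data(convert_rate=16000, convert_width=2))  # 16 kHz, 16-bit mono WAV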

View File

@@ -0,0 +1,22 @@
class SetupError(Exception):
pass
class WaitTimeoutError(Exception):
pass
class RequestError(Exception):
pass
class UnknownValueError(Exception):
pass
class TranscriptionNotReady(Exception):
pass
class TranscriptionFailed(Exception):
pass

View File

@@ -0,0 +1,31 @@
Copyright (c) 1999-2015 Carnegie Mellon University. All rights
reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
This work was supported in part by funding from the Defense Advanced
Research Projects Agency and the National Science Foundation of the
United States of America, and the CMU Sphinx Speech Consortium.
THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND
ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY
NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

View File

@@ -0,0 +1,34 @@
/* ====================================================================
* Copyright (c) 2015 Alpha Cephei Inc. All rights
* reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* THIS SOFTWARE IS PROVIDED BY ALPHA CEPHEI INC. ``AS IS'' AND.
* ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,.
* THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL ALPHA CEPHEI INC.
* NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT.
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,.
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY.
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT.
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE.
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
* ====================================================================
*
*/
This directory contains a generic US English acoustic model trained with the latest sphinxtrain.

View File

@@ -0,0 +1,12 @@
-lowerf 130
-upperf 6800
-nfilt 25
-transform dct
-lifter 22
-feat 1s_c_d_dd
-svspec 0-12/13-25/26-38
-agc none
-cmn current
-varnorm no
-model ptm
-cmninit 40,3,-1

Binary file not shown (size: 2.8 MiB).

View File

@@ -0,0 +1,5 @@
<s> SIL
</s> SIL
<sil> SIL
[NOISE] +NSN+
[SPEECH] +SPN+

View File

@@ -0,0 +1,262 @@
from __future__ import annotations
import json
from typing import Dict, Literal, TypedDict
from urllib.error import HTTPError, URLError
from urllib.parse import urlencode
from urllib.request import Request, urlopen
from typing_extensions import NotRequired
from speech_recognition.audio import AudioData
from speech_recognition.exceptions import RequestError, UnknownValueError
class Alternative(TypedDict):
transcript: str
confidence: float
class Result(TypedDict):
alternative: list[Alternative]
final: bool
class GoogleResponse(TypedDict):
result: list[Result]
result_index: NotRequired[int]
ProfanityFilterLevel = Literal[0, 1]
RequestHeaders = Dict[str, str]
ENDPOINT = "http://www.google.com/speech-api/v2/recognize"
class RequestBuilder:
def __init__(
self,
*,
endpoint: str,
key: str,
language: str,
filter_level: ProfanityFilterLevel,
) -> None:
self.endpoint = endpoint
self.key = key
self.language = language
self.filter_level = filter_level
def build(self, audio_data: AudioData) -> Request:
if not isinstance(audio_data, AudioData):
raise ValueError("``audio_data`` must be audio data")
url = self.build_url()
headers = self.build_headers(audio_data)
flac_data = self.build_data(audio_data)
request = Request(url, data=flac_data, headers=headers)
return request
def build_url(self) -> str:
"""
>>> builder = RequestBuilder(endpoint="http://www.google.com/speech-api/v2/recognize", key="awesome-key", language="en-US", filter_level=0)
>>> builder.build_url()
'http://www.google.com/speech-api/v2/recognize?client=chromium&lang=en-US&key=awesome-key&pFilter=0'
"""
params = urlencode(
{
"client": "chromium",
"lang": self.language,
"key": self.key,
"pFilter": self.filter_level,
}
)
return f"{self.endpoint}?{params}"
def build_headers(self, audio_data: AudioData) -> RequestHeaders:
"""
>>> builder = RequestBuilder(endpoint="", key="", language="", filter_level=1)
>>> audio_data = AudioData(b"", 16_000, 1)
>>> builder.build_headers(audio_data)
{'Content-Type': 'audio/x-flac; rate=16000'}
"""
rate = audio_data.sample_rate
headers = {"Content-Type": f"audio/x-flac; rate={rate}"}
return headers
def build_data(self, audio_data: AudioData) -> bytes:
flac_data = audio_data.get_flac_data(
convert_rate=self.to_convert_rate(audio_data.sample_rate),
convert_width=2, # audio samples must be 16-bit
)
return flac_data
@staticmethod
def to_convert_rate(sample_rate: int) -> int | None:
"""Audio samples must be at least 8 kHz
>>> RequestBuilder.to_convert_rate(16_000)
>>> RequestBuilder.to_convert_rate(8_000)
>>> RequestBuilder.to_convert_rate(7_999)
8000
"""
return None if sample_rate >= 8000 else 8000
def create_request_builder(
*,
endpoint: str,
key: str | None = None,
language: str = "en-US",
filter_level: ProfanityFilterLevel = 0,
) -> RequestBuilder:
if not isinstance(language, str):
raise ValueError("``language`` must be a string")
if key is not None and not isinstance(key, str):
raise ValueError("``key`` must be ``None`` or a string")
if key is None:
key = "AIzaSyBOti4mM-6x9WDnZIjIeyEU21OpBXqWBgw"
return RequestBuilder(
endpoint=endpoint,
key=key,
language=language,
filter_level=filter_level,
)
class OutputParser:
def __init__(self, *, show_all: bool, with_confidence: bool) -> None:
self.show_all = show_all
self.with_confidence = with_confidence
def parse(self, response_text: str):
actual_result = self.convert_to_result(response_text)
if self.show_all:
return actual_result
best_hypothesis = self.find_best_hypothesis(
actual_result["alternative"]
)
# https://cloud.google.com/speech-to-text/docs/basics#confidence-values
# "Your code should not require the confidence field as it is not guaranteed to be accurate, or even set, in any of the results."
confidence = best_hypothesis.get("confidence", 0.5)
if self.with_confidence:
return best_hypothesis["transcript"], confidence
return best_hypothesis["transcript"]
@staticmethod
def convert_to_result(response_text: str) -> Result:
r"""
>>> response_text = '''{"result":[]}
... {"result":[{"alternative":[{"transcript":"one two three","confidence":0.49585345},{"transcript":"1 2","confidence":0.42899391}],"final":true}],"result_index":0}
... '''
>>> OutputParser.convert_to_result(response_text)
{'alternative': [{'transcript': 'one two three', 'confidence': 0.49585345}, {'transcript': '1 2', 'confidence': 0.42899391}], 'final': True}
>>> OutputParser.convert_to_result("")
Traceback (most recent call last):
...
speech_recognition.exceptions.UnknownValueError
>>> OutputParser.convert_to_result('\n{"result":[]}')
Traceback (most recent call last):
...
speech_recognition.exceptions.UnknownValueError
>>> OutputParser.convert_to_result('{"result":[{"foo": "bar"}]}')
Traceback (most recent call last):
...
speech_recognition.exceptions.UnknownValueError
>>> OutputParser.convert_to_result('{"result":[{"alternative": []}]}')
Traceback (most recent call last):
...
speech_recognition.exceptions.UnknownValueError
"""
# ignore any blank blocks
for line in response_text.split("\n"):
if not line:
continue
result: list[Result] = json.loads(line)["result"]
if len(result) != 0:
if len(result[0].get("alternative", [])) == 0:
raise UnknownValueError()
return result[0]
raise UnknownValueError()
@staticmethod
def find_best_hypothesis(alternatives: list[Alternative]) -> Alternative:
"""
>>> alternatives = [{"transcript": "one two three", "confidence": 0.42899391}, {"transcript": "1 2", "confidence": 0.49585345}]
>>> OutputParser.find_best_hypothesis(alternatives)
{'transcript': 'one two three', 'confidence': 0.42899391}
>>> alternatives = [{"confidence": 0.49585345}]
>>> OutputParser.find_best_hypothesis(alternatives)
Traceback (most recent call last):
...
speech_recognition.exceptions.UnknownValueError
"""
if "confidence" in alternatives:
# BUG: ``alternatives`` is a list, not a dict, so this membership test is effectively always False and the ``max`` branch below never runs; the first hypothesis is returned instead
# return the alternative with the highest confidence score
best_hypothesis: Alternative = max(
alternatives,
key=lambda alternative: alternative["confidence"],
)
else:
# when there is no confidence available, we arbitrarily choose the first hypothesis.
best_hypothesis: Alternative = alternatives[0]
if "transcript" not in best_hypothesis:
raise UnknownValueError()
return best_hypothesis
def obtain_transcription(request: Request, timeout: int) -> str:
try:
response = urlopen(request, timeout=timeout)
except HTTPError as e:
raise RequestError("recognition request failed: {}".format(e.reason))
except URLError as e:
raise RequestError(
"recognition connection failed: {}".format(e.reason)
)
return response.read().decode("utf-8")
def recognize_legacy(
recognizer,
audio_data: AudioData,
key: str | None = None,
language: str = "en-US",
pfilter: ProfanityFilterLevel = 0,
show_all: bool = False,
with_confidence: bool = False,
*,
endpoint: str = ENDPOINT,
):
"""Performs speech recognition on ``audio_data`` (an ``AudioData`` instance), using the Google Speech Recognition API.
The Google Speech Recognition API key is specified by ``key``. If not specified, it uses a generic key that works out of the box. This should generally be used for personal or testing purposes only, as it **may be revoked by Google at any time**.
To obtain your own API key, simply follow the steps on the `API Keys <http://www.chromium.org/developers/how-tos/api-keys>`__ page at the Chromium Developers site. In the Google Developers Console, Google Speech Recognition is listed as "Speech API".
The recognition language is determined by ``language``, an RFC5646 language tag like ``"en-US"`` (US English) or ``"fr-FR"`` (International French), defaulting to US English. A list of supported language tags can be found in this `StackOverflow answer <http://stackoverflow.com/a/14302134>`__.
The profanity filter level can be adjusted with ``pfilter``: 0 - No filter, 1 - Only shows the first character and replaces the rest with asterisks. The default is level 0.
Returns the most likely transcription if ``show_all`` is false (the default). Otherwise, returns the raw API response as a JSON dictionary.
Raises a ``speech_recognition.UnknownValueError`` exception if the speech is unintelligible. Raises a ``speech_recognition.RequestError`` exception if the speech recognition operation failed, if the key isn't valid, or if there is no internet connection.
"""
request_builder = create_request_builder(
endpoint=endpoint, key=key, language=language, filter_level=pfilter
)
request = request_builder.build(audio_data)
response_text = obtain_transcription(
request, timeout=recognizer.operation_timeout
)
output_parser = OutputParser(
show_all=show_all, with_confidence=with_confidence
)
return output_parser.parse(response_text)
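A brief usage sketch of the legacy recognizer above, assuming the package wires it up as ``Recognizer.recognize_google`` (the main module's diff is suppressed above); ``speech.wav`` is a hypothetical input file.

import speech_recognition as sr

r = sr.Recognizer()
with sr.AudioFile("speech.wav") as source:  # hypothetical input file
    audio = r.record(source)
# with_confidence=True returns a (transcript, confidence) pair; the bundled default key is used unless key=... is given
text, confidence = r.recognize_google(audio, language="en-US", pfilter=0, with_confidence=True)
print(text, confidence)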

View File

@@ -0,0 +1,142 @@
from __future__ import annotations
from typing import TYPE_CHECKING, TypedDict
from urllib.error import URLError
from speech_recognition.audio import AudioData
from speech_recognition.exceptions import RequestError, UnknownValueError
if TYPE_CHECKING:
from google.cloud.speech import (
RecognitionConfig,
RecognizeResponse,
SpeechContext,
)
from typing_extensions import Required
class GoogleCloudRecognizerParameters(TypedDict, total=False):
"""Optional parameters.
The recognition language is determined by ``language_code``, which is a BCP-47 language tag like ``"en-US"`` (US English). Default: ``"en-US"``.
A list of supported language tags can be found in the `Speech-to-Text supported languages <https://cloud.google.com/speech/docs/languages>`__.
If ``preferred_phrases`` is an iterable of phrase strings, those given phrases will be more likely to be recognized over similar-sounding alternatives.
This is useful for things like keyword/command recognition or adding new phrases that aren't in Google's vocabulary.
Note that the API imposes certain `restrictions on the list of phrase strings <https://cloud.google.com/speech/limits#content>`__.
``show_all``: See :py:func:`recognize`.
``model``: You can select the model to get the best results. (See `RecognitionConfig's documentation <https://cloud.google.com/python/docs/reference/speech/latest/google.cloud.speech_v1.types.RecognitionConfig>`__ for details.)
``use_enhanced``: Set to true to use an enhanced model for speech recognition.
"""
# SpeechRecognition specific parameters
preferred_phrases: list[str]
show_all: bool
# Speech-to-Text V1 API's parameters
language_code: str
model: str
use_enhanced: bool
# TODO Add others support
class GoogleCloudSpeechV1Parameters(TypedDict, total=False):
"""Speech-to-Text V1 API's parameters.
https://cloud.google.com/python/docs/reference/speech/latest/google.cloud.speech_v1.types.RecognitionConfig
"""
encoding: Required[RecognitionConfig.AudioEncoding]
sample_rate_hertz: Required[int]
language_code: Required[str]
speech_contexts: list[SpeechContext]
enable_word_time_offsets: bool
model: str
use_enhanced: bool
def _build_config(
audio_data: AudioData, recognizer_params: GoogleCloudRecognizerParameters
) -> RecognitionConfig:
from google.cloud import speech
parameters: GoogleCloudSpeechV1Parameters = {
"encoding": speech.RecognitionConfig.AudioEncoding.FLAC,
"sample_rate_hertz": audio_data.sample_rate,
"language_code": recognizer_params.pop("language_code", "en-US"),
}
if preferred_phrases := recognizer_params.pop("preferred_phrases", None):
parameters["speech_contexts"] = [
speech.SpeechContext(phrases=preferred_phrases)
]
if recognizer_params.pop("show_all", False):
# ref: https://cloud.google.com/speech-to-text/docs/async-time-offsets
parameters["enable_word_time_offsets"] = True
return speech.RecognitionConfig(**(parameters | recognizer_params))
def recognize(
recognizer,
audio_data: AudioData,
credentials_json_path: str | None = None,
**kwargs: GoogleCloudRecognizerParameters,
) -> str | RecognizeResponse:
"""Performs speech recognition on ``audio_data`` (an ``AudioData`` instance), using the Google Cloud Speech-to-Text V1 API.
This function requires a Google Cloud Platform account; see `Set up Speech-to-Text <https://cloud.google.com/speech-to-text/docs/before-you-begin>`__ for details and instructions. Basically, create a project, enable billing for the project, and enable the Google Cloud Speech-to-Text API for the project.
Then create local authentication credentials for your user account. The result is a JSON file containing the API credentials. You can specify the JSON file with ``credentials_json_path``. If not specified, the library will try to automatically `find the default API credentials JSON file <https://developers.google.com/identity/protocols/application-default-credentials>`__.
Returns the most likely transcription if ``show_all`` is False (the default). Otherwise, returns the raw API response as a JSON dictionary.
For other parameters, see :py:class:`GoogleCloudRecognizerParameters`.
Raises a ``speech_recognition.UnknownValueError`` exception if the speech is unintelligible. Raises a ``speech_recognition.RequestError`` exception if the speech recognition operation failed, if the credentials aren't valid, or if there is no Internet connection.
"""
try:
from google.api_core.exceptions import GoogleAPICallError
from google.cloud import speech
except ImportError:
raise RequestError(
"missing google-cloud-speech module: ensure that google-cloud-speech is set up correctly."
)
client = (
speech.SpeechClient.from_service_account_json(credentials_json_path)
if credentials_json_path
else speech.SpeechClient()
)
flac_data = audio_data.get_flac_data(
# audio sample rate must be between 8 kHz and 48 kHz inclusive - clamp sample rate into this range
convert_rate=(
None
if 8000 <= audio_data.sample_rate <= 48000
else max(8000, min(audio_data.sample_rate, 48000))
),
convert_width=2, # audio samples must be 16-bit
)
audio = speech.RecognitionAudio(content=flac_data)
config = _build_config(audio_data, kwargs.copy())
try:
response = client.recognize(config=config, audio=audio)
except GoogleAPICallError as e:
raise RequestError(e)
except URLError as e:
raise RequestError(
"recognition connection failed: {0}".format(e.reason)
)
if kwargs.get("show_all"):
return response
if len(response.results) == 0:
raise UnknownValueError()
transcript = " ".join(
result.alternatives[0].transcript.strip()
for result in response.results
)
return transcript
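A hedged sketch of calling the Cloud recognizer above, assuming it is exposed as ``Recognizer.recognize_google_cloud``; ``service-account.json`` and ``speech.wav`` are illustrative file names.

import speech_recognition as sr

r = sr.Recognizer()
with sr.AudioFile("speech.wav") as source:  # hypothetical input file
    audio = r.record(source)
transcript = r.recognize_google_cloud(
    audio,
    credentials_json_path="service-account.json",  # omit to fall back to application default credentials
    language_code="en-US",
    preferred_phrases=["SpeechRecognition"],  # bias recognition toward project-specific terms
)
print(transcript)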

View File

@@ -0,0 +1,111 @@
from __future__ import annotations
import os
from collections.abc import Sequence
from speech_recognition import PortableNamedTemporaryFile
from speech_recognition.audio import AudioData
from speech_recognition.exceptions import RequestError, UnknownValueError
AcousticParametersDirectoryPath = str
LanguageModelFilePath = str
PhonemeDictionaryFilePath = str
SphinxDataFilePaths = tuple[AcousticParametersDirectoryPath, LanguageModelFilePath, PhonemeDictionaryFilePath]
Keyword = str
Sensitivity = float
KeywordEntry = tuple[Keyword, Sensitivity]
def recognize(
recognizer,
audio_data: AudioData,
language: str | SphinxDataFilePaths = "en-US",
keyword_entries: Sequence[KeywordEntry] | None = None,
grammar: str | None = None,
show_all: bool = False,
):
"""
Performs speech recognition on ``audio_data`` (an ``AudioData`` instance), using CMU Sphinx.
The recognition language is determined by ``language``, an RFC5646 language tag like ``"en-US"`` or ``"en-GB"``, defaulting to US English. Out of the box, only ``en-US`` is supported. See `Notes on using PocketSphinx <https://github.com/Uberi/speech_recognition/blob/master/reference/pocketsphinx.rst>`__ for information about installing other languages. This document is also included under ``reference/pocketsphinx.rst``. The ``language`` parameter can also be a tuple of filesystem paths, of the form ``(acoustic_parameters_directory, language_model_file, phoneme_dictionary_file)`` - this allows you to load arbitrary Sphinx models.
If specified, the keywords to search for are determined by ``keyword_entries``, an iterable of tuples of the form ``(keyword, sensitivity)``, where ``keyword`` is a phrase, and ``sensitivity`` is how sensitive to this phrase the recognizer should be, on a scale of 0 (very insensitive, more false negatives) to 1 (very sensitive, more false positives) inclusive. If not specified or ``None``, no keywords are used and Sphinx will simply transcribe whatever words it recognizes. Specifying ``keyword_entries`` is more accurate than just looking for those same keywords in non-keyword-based transcriptions, because Sphinx knows specifically what sounds to look for.
Sphinx can also handle FSG or JSGF grammars. The parameter ``grammar`` expects a path to the grammar file. Note that if a JSGF grammar is passed, an FSG grammar will be created at the same location to speed up execution in the next run. If ``keyword_entries`` are passed, content of ``grammar`` will be ignored.
Returns the most likely transcription if ``show_all`` is false (the default). Otherwise, returns the Sphinx ``pocketsphinx.pocketsphinx.Decoder`` object resulting from the recognition.
Raises a ``speech_recognition.UnknownValueError`` exception if the speech is unintelligible. Raises a ``speech_recognition.RequestError`` exception if there are any issues with the Sphinx installation.
"""
# TODO Move this validation into KeywordEntry initialization
assert keyword_entries is None or all(isinstance(keyword, (type(""), type(u""))) and 0 <= sensitivity <= 1 for keyword, sensitivity in keyword_entries), "``keyword_entries`` must be ``None`` or a list of pairs of strings and numbers between 0 and 1"
try:
from pocketsphinx import FsgModel, Jsgf, pocketsphinx
except ImportError:
raise RequestError("missing PocketSphinx module: ensure that PocketSphinx is set up correctly.")
if isinstance(language, str): # directory containing language data
language_directory = os.path.join(os.path.dirname(os.path.dirname(os.path.realpath(__file__))), "pocketsphinx-data", language)
if not os.path.isdir(language_directory):
raise RequestError("missing PocketSphinx language data directory: \"{}\"".format(language_directory))
acoustic_parameters_directory = os.path.join(language_directory, "acoustic-model")
language_model_file = os.path.join(language_directory, "language-model.lm.bin")
phoneme_dictionary_file = os.path.join(language_directory, "pronounciation-dictionary.dict")
else: # 3-tuple of Sphinx data file paths
acoustic_parameters_directory, language_model_file, phoneme_dictionary_file = language
if not os.path.isdir(acoustic_parameters_directory):
raise RequestError("missing PocketSphinx language model parameters directory: \"{}\"".format(acoustic_parameters_directory))
if not os.path.isfile(language_model_file):
raise RequestError("missing PocketSphinx language model file: \"{}\"".format(language_model_file))
if not os.path.isfile(phoneme_dictionary_file):
raise RequestError("missing PocketSphinx phoneme dictionary file: \"{}\"".format(phoneme_dictionary_file))
# create decoder object
config = pocketsphinx.Config()
config.set_string("-hmm", acoustic_parameters_directory) # set the path of the hidden Markov model (HMM) parameter files
config.set_string("-lm", language_model_file)
config.set_string("-dict", phoneme_dictionary_file)
config.set_string("-logfn", os.devnull) # disable logging (logging causes unwanted output in terminal)
decoder = pocketsphinx.Decoder(config)
# obtain audio data
raw_data = audio_data.get_raw_data(convert_rate=16000, convert_width=2) # the included language models require audio to be 16-bit mono 16 kHz in little-endian format
# obtain recognition results
if keyword_entries is not None: # explicitly specified set of keywords
with PortableNamedTemporaryFile("w") as f:
# generate a keywords file - Sphinx documentation recommends sensitivities between 1e-50 and 1e-5
f.writelines("{} /1e{}/\n".format(keyword, 100 * sensitivity - 110) for keyword, sensitivity in keyword_entries)
f.flush()
# perform the speech recognition with the keywords file (this is inside the context manager so the file isn't deleted until we're done)
decoder.add_kws("keywords", f.name)
decoder.activate_search("keywords")
elif grammar is not None: # a path to a FSG or JSGF grammar
if not os.path.exists(grammar):
raise ValueError("Grammar '{0}' does not exist.".format(grammar))
grammar_path = os.path.abspath(os.path.dirname(grammar))
grammar_name = os.path.splitext(os.path.basename(grammar))[0]
fsg_path = "{0}/{1}.fsg".format(grammar_path, grammar_name)
if not os.path.exists(fsg_path): # create FSG grammar if not available
jsgf = Jsgf(grammar)
rule = jsgf.get_rule("{0}.{0}".format(grammar_name))
fsg = jsgf.build_fsg(rule, decoder.get_logmath(), 7.5)
fsg.writefile(fsg_path)
else:
fsg = FsgModel(fsg_path, decoder.get_logmath(), 7.5)
decoder.set_fsg(grammar_name, fsg)
decoder.set_search(grammar_name)
decoder.start_utt() # begin utterance processing
decoder.process_raw(raw_data, False, True) # process audio data with recognition enabled (no_search = False), as a full utterance (full_utt = True)
decoder.end_utt() # stop utterance processing
if show_all: return decoder
# return results
hypothesis = decoder.hyp()
if hypothesis is not None: return hypothesis.hypstr
raise UnknownValueError() # no transcriptions available
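A short sketch of both plain transcription and keyword spotting with the function above, assuming it is exposed as ``Recognizer.recognize_sphinx``; the keywords and sensitivities are illustrative.

import speech_recognition as sr

r = sr.Recognizer()
with sr.AudioFile("speech.wav") as source:  # hypothetical input file
    audio = r.record(source)
# plain transcription with the bundled en-US model
print(r.recognize_sphinx(audio))
# keyword spotting: only the listed phrases are searched for, each with a sensitivity in [0, 1]
print(r.recognize_sphinx(audio, keyword_entries=[("hello", 0.8), ("stop", 1.0)]))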

View File

@@ -0,0 +1,22 @@
from io import BytesIO
from speech_recognition.audio import AudioData
class OpenAICompatibleRecognizer:
def __init__(self, client) -> None:
self.client = client
def recognize(self, audio_data: "AudioData", model: str, **kwargs) -> str:
if not isinstance(audio_data, AudioData):
raise ValueError(
"``audio_data`` must be an ``AudioData`` instance"
)
wav_data = BytesIO(audio_data.get_wav_data())
wav_data.name = "SpeechRecognition_audio.wav"
transcript = self.client.audio.transcriptions.create(
file=wav_data, model=model, **kwargs
)
return transcript.text

View File

@@ -0,0 +1,54 @@
from __future__ import annotations
from typing import Literal, TypedDict
from typing_extensions import Unpack
from speech_recognition.audio import AudioData
from speech_recognition.exceptions import SetupError
from speech_recognition.recognizers.whisper_api.base import (
OpenAICompatibleRecognizer,
)
# https://console.groq.com/docs/speech-text#supported-models
GroqModel = Literal[
"whisper-large-v3-turbo", "whisper-large-v3", "distil-whisper-large-v3-en"
]
class GroqOptionalParameters(TypedDict, total=False):
"""Groq speech transcription's optional parameters.
https://console.groq.com/docs/speech-text#transcription-endpoint-usage
"""
prompt: str
response_format: str
temperature: float
language: str
def recognize(
recognizer,
audio_data: "AudioData",
*,
model: GroqModel = "whisper-large-v3-turbo",
**kwargs: Unpack[GroqOptionalParameters],
) -> str:
"""Performs speech recognition on ``audio_data`` (an ``AudioData`` instance), using the Groq Whisper API.
This function requires a Groq account; visit https://console.groq.com/login, then generate an API key in the `API Keys <https://console.groq.com/keys>`__ menu.
Details: https://console.groq.com/docs/speech-text
Set the environment variable ``GROQ_API_KEY``; otherwise the groq library will raise a ``groq.GroqError``.
"""
try:
import groq
except ImportError:
raise SetupError(
"missing groq module: ensure that groq is set up correctly."
)
groq_recognizer = OpenAICompatibleRecognizer(groq.Groq())
return groq_recognizer.recognize(audio_data, model, **kwargs)
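A minimal sketch, assuming ``GROQ_API_KEY`` is set in the environment and the package exposes the function above as ``Recognizer.recognize_groq``; ``speech.wav`` is a hypothetical input file.

import speech_recognition as sr

r = sr.Recognizer()
with sr.AudioFile("speech.wav") as source:  # hypothetical input file
    audio = r.record(source)
# optional parameters such as ``language`` are forwarded to the Groq transcription endpoint
print(r.recognize_groq(audio, model="whisper-large-v3-turbo", language="en"))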

View File

@@ -0,0 +1,83 @@
from __future__ import annotations
from typing import Literal, TypedDict
from typing_extensions import Unpack
from speech_recognition.audio import AudioData
from speech_recognition.exceptions import SetupError
from speech_recognition.recognizers.whisper_api.base import (
OpenAICompatibleRecognizer,
)
# https://platform.openai.com/docs/api-reference/audio/createTranscription#audio-createtranscription-model
WhisperModel = Literal[
"whisper-1", "gpt-4o-transcribe", "gpt-4o-mini-transcribe"
]
class OpenAIOptionalParameters(TypedDict, total=False):
"""OpenAI speech transcription's optional parameters.
https://platform.openai.com/docs/api-reference/audio/createTranscription
"""
language: str
prompt: str
# TODO Add support `Literal["text", "srt", "verbose_json", "vtt"]`
response_format: Literal["json"]
temperature: float
# timestamp_granularities # TODO support
def recognize(
recognizer,
audio_data: "AudioData",
*,
model: WhisperModel = "whisper-1",
**kwargs: Unpack[OpenAIOptionalParameters],
) -> str:
"""Performs speech recognition on ``audio_data`` (an ``AudioData`` instance), using the OpenAI Whisper API.
This function requires an OpenAI account; visit https://platform.openai.com/signup, then generate an API key under `User settings <https://platform.openai.com/account/api-keys>`__.
Details: https://platform.openai.com/docs/guides/speech-to-text
Set the environment variable ``OPENAI_API_KEY``; otherwise the openai library will raise an ``openai.OpenAIError``.
"""
try:
import openai
except ImportError:
raise SetupError(
"missing openai module: ensure that openai is set up correctly."
)
openai_recognizer = OpenAICompatibleRecognizer(openai.OpenAI())
return openai_recognizer.recognize(audio_data, model, **kwargs)
if __name__ == "__main__":
import argparse
from typing import get_args
import speech_recognition as sr
parser = argparse.ArgumentParser()
parser.add_argument("audio_file")
parser.add_argument(
"--model", choices=get_args(WhisperModel), default="whisper-1"
)
parser.add_argument("-l", "--language")
args = parser.parse_args()
r = sr.Recognizer()
with sr.AudioFile(args.audio_file) as source:
audio_data = r.listen(source)
if args.language:
transcription = recognize(
None, audio_data, model=args.model, language=args.language
)
else:
transcription = recognize(None, audio_data, model=args.model)
print(transcription)

View File

@@ -0,0 +1,45 @@
from __future__ import annotations
import io
from typing import TYPE_CHECKING, Any, Protocol
from speech_recognition.audio import AudioData
if TYPE_CHECKING:
import numpy as np
class Transcribable(Protocol):
def transcribe(
self, audio_array: np.ndarray, **kwargs
) -> str | dict[str, Any]:
pass
class WhisperCompatibleRecognizer:
def __init__(self, model: Transcribable) -> None:
self.model = model
def recognize(
self, audio_data: AudioData, show_dict: bool = False, **kwargs
):
if not isinstance(audio_data, AudioData):
raise ValueError(
"``audio_data`` must be an ``AudioData`` instance"
)
import numpy as np
import soundfile as sf
# 16 kHz https://github.com/openai/whisper/blob/28769fcfe50755a817ab922a7bc83483159600a9/whisper/audio.py#L98-L99
wav_bytes = audio_data.get_wav_data(convert_rate=16000)
wav_stream = io.BytesIO(wav_bytes)
audio_array, sampling_rate = sf.read(wav_stream)
audio_array = audio_array.astype(np.float32)
result = self.model.transcribe(audio_array, **kwargs)
if show_dict:
return result
else:
return result["text"]

View File

@@ -0,0 +1,106 @@
from __future__ import annotations
from typing import TYPE_CHECKING, Literal, TypedDict
from speech_recognition.audio import AudioData
from speech_recognition.recognizers.whisper_local.base import (
WhisperCompatibleRecognizer,
)
if TYPE_CHECKING:
import numpy as np
from faster_whisper import WhisperModel
from faster_whisper.transcribe import Segment
from typing_extensions import Unpack
class TranscribeOutput(TypedDict):
text: str
segments: list[Segment]
language: str
class TranscribableAdapter:
def __init__(self, model: WhisperModel) -> None:
self.model = model
def transcribe(
self, audio_array: np.ndarray, **kwargs
) -> TranscribeOutput:
segments_generator, info = self.model.transcribe(audio_array, **kwargs)
segments = list(segments_generator)
return {
"text": " ".join(segment.text for segment in segments),
"segments": segments,
"language": info.language,
}
class InitOptionalParameters(TypedDict, total=False):
# https://github.com/SYSTRAN/faster-whisper/blob/v1.1.0/faster_whisper/transcribe.py#L575
device: Literal["cpu", "cuda", "auto"]
compute_type: str
download_root: str
# TODO Add others
class TranscribeOptionalParameters(TypedDict, total=False):
# https://github.com/SYSTRAN/faster-whisper/blob/v1.1.0/faster_whisper/transcribe.py#L692
language: str
task: Literal["transcribe", "translate"]
beam_size: int
# TODO Add others
def recognize(
recognizer,
audio_data: AudioData,
model: str = "base",
show_dict: bool = False,
init_options: InitOptionalParameters | None = None,
**transcribe_options: Unpack[TranscribeOptionalParameters],
) -> str | TranscribeOutput:
"""Performs speech recognition on ``audio_data`` (an ``AudioData`` instance), using Whisper.
Pick the ``model`` size (same as Whisper).
If ``show_dict`` is true, returns the detailed response from Whisper, including the detected language. Otherwise returns only the transcription.
You can specify:
* ``language``: recognition language, a lowercase two-letter language code like ``"en"`` or ``"fr"``.
* If not set, Faster Whisper will automatically detect the language.
* ``task``
* If you want to transcribe and **translate** to English, set ``task="translate"``.
Other values are passed directly to Faster Whisper. See https://github.com/SYSTRAN/faster-whisper/blob/master/faster_whisper/transcribe.py for all options.
"""
from faster_whisper import WhisperModel
model = WhisperModel(model, **init_options or {})
whisper_recognizer = WhisperCompatibleRecognizer(
TranscribableAdapter(model)
)
return whisper_recognizer.recognize(
audio_data, show_dict=show_dict, **transcribe_options
)
if __name__ == "__main__":
import argparse
import speech_recognition as sr
parser = argparse.ArgumentParser()
parser.add_argument("audio_file")
args = parser.parse_args()
r = sr.Recognizer()
with sr.AudioFile(args.audio_file) as source:
audio_data = r.listen(source)
transcription = recognize(None, audio_data)
print(transcription)
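Beyond the command-line block above, here is a hedged sketch of the translation path described in the docstring; the module path is assumed from the package layout, and the ``init_options`` values are illustrative and depend on the local Faster Whisper installation.

import speech_recognition as sr
from speech_recognition.recognizers.whisper_local.faster_whisper import recognize  # assumed module path

r = sr.Recognizer()
with sr.AudioFile("speech.wav") as source:  # hypothetical input file
    audio = r.record(source)
result = recognize(
    None,
    audio,
    model="base",
    show_dict=True,
    init_options={"device": "auto", "compute_type": "int8"},  # illustrative values
    task="translate",  # transcribe and translate to English
    beam_size=5,
)
print(result["language"], result["text"])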

View File

@@ -0,0 +1,108 @@
from __future__ import annotations
from typing import TYPE_CHECKING, Literal, TypedDict
from speech_recognition.audio import AudioData
from speech_recognition.recognizers.whisper_local.base import (
WhisperCompatibleRecognizer,
)
if TYPE_CHECKING:
import numpy as np
import torch
from typing_extensions import Unpack
from whisper import Whisper
class LoadModelOptionalParameters(TypedDict, total=False):
# ref: https://github.com/openai/whisper/blob/v20240930/whisper/__init__.py#L103
device: str | torch.device
download_root: str
in_memory: bool
class TranscribeOptionalParameters(TypedDict, total=False):
"""Transcribe optional parameters & DecodingOptions parameters."""
# ref: https://github.com/openai/whisper/blob/v20240930/whisper/transcribe.py#L38
temperature: float | tuple[float, ...]
# TODO Add others
# ref: https://github.com/openai/whisper/blob/v20240930/whisper/decoding.py#L81
task: Literal["transcribe", "translate"]
language: str
fp16: bool
# TODO Add others
class Segment(TypedDict):
id: int
seek: int
start: float
end: float
text: str
tokens: list[int]
temperature: float
avg_logprob: float
compression_ratio: float
no_speech_prob: float
class TranscribeOutput(TypedDict):
text: str
segments: list[Segment]
language: str
class TranscribableAdapter:
def __init__(self, model: Whisper) -> None:
self.model = model
def transcribe(
self, audio_array: np.ndarray, **kwargs
) -> TranscribeOutput:
if "fp16" not in kwargs:
import torch
kwargs["fp16"] = torch.cuda.is_available()
return self.model.transcribe(audio_array, **kwargs)
def recognize(
recognizer,
audio_data: AudioData,
model: str = "base",
show_dict: bool = False,
load_options: LoadModelOptionalParameters | None = None,
**transcribe_options: Unpack[TranscribeOptionalParameters],
) -> str | TranscribeOutput:
"""Performs speech recognition on ``audio_data`` (an ``AudioData`` instance), using Whisper.
Pick ``model`` from the output of :command:`python -c 'import whisper; print(whisper.available_models())'`.
See also https://github.com/openai/whisper?tab=readme-ov-file#available-models-and-languages.
If ``show_dict`` is true, returns the full dict response from Whisper, including the detected language. Otherwise returns only the transcription.
You can specify:
* ``language``: recognition language, a lowercase full language name like ``"english"`` or ``"chinese"``. See the full language list at https://github.com/openai/whisper/blob/main/whisper/tokenizer.py
* If not set, Whisper will automatically detect the language.
* ``task``
* If you want to transcribe and **translate** to English, set ``task="translate"``.
Other values are passed directly to whisper. See https://github.com/openai/whisper/blob/main/whisper/transcribe.py for all options.
"""
import whisper
whisper_model = whisper.load_model(model, **load_options or {})
whisper_recognizer = WhisperCompatibleRecognizer(
TranscribableAdapter(whisper_model)
)
return whisper_recognizer.recognize(
audio_data, show_dict=show_dict, **transcribe_options
)
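For symmetry with the Faster Whisper module above, a minimal usage sketch of the function above; the module path is assumed from the package layout and ``speech.wav`` is a hypothetical input file.

import speech_recognition as sr
from speech_recognition.recognizers.whisper_local.whisper import recognize  # assumed module path

r = sr.Recognizer()
with sr.AudioFile("speech.wav") as source:  # hypothetical input file
    audio = r.record(source)
result = recognize(None, audio, model="base", show_dict=True, language="english")
print(result["language"], result["text"])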