first commit
1342
venv/lib/python3.11/site-packages/speech_recognition/__init__.py
Normal file
File diff suppressed because it is too large
@@ -0,0 +1,24 @@
import speech_recognition as sr

r = sr.Recognizer()
m = sr.Microphone()

try:
    print("A moment of silence, please...")
    with m as source: r.adjust_for_ambient_noise(source)
    print("Set minimum energy threshold to {}".format(r.energy_threshold))
    while True:
        print("Say something!")
        with m as source: audio = r.listen(source)
        print("Got it! Now to recognize it...")
        try:
            # recognize speech using Google Speech Recognition
            value = r.recognize_google(audio)

            print("You said {}".format(value))
        except sr.UnknownValueError:
            print("Oops! Didn't catch that")
        except sr.RequestError as e:
            print("Uh oh! Couldn't request results from Google Speech Recognition service; {0}".format(e))
except KeyboardInterrupt:
    pass
318
venv/lib/python3.11/site-packages/speech_recognition/audio.py
Normal file
@@ -0,0 +1,318 @@
import aifc
import audioop
import io
import os
import platform
import stat
import subprocess
import sys
import wave


class AudioData(object):
    """
    Creates a new ``AudioData`` instance, which represents mono audio data.

    The raw audio data is specified by ``frame_data``, which is a sequence of bytes representing audio samples. This is the frame data structure used by the PCM WAV format.

    The width of each sample, in bytes, is specified by ``sample_width``. Each group of ``sample_width`` bytes represents a single audio sample.

    The audio data is assumed to have a sample rate of ``sample_rate`` samples per second (Hertz).

    Usually, instances of this class are obtained from ``recognizer_instance.record`` or ``recognizer_instance.listen``, or in the callback for ``recognizer_instance.listen_in_background``, rather than instantiating them directly.
    """

    def __init__(self, frame_data, sample_rate, sample_width):
        assert sample_rate > 0, "Sample rate must be a positive integer"
        assert (
            sample_width % 1 == 0 and 1 <= sample_width <= 4
        ), "Sample width must be between 1 and 4 inclusive"
        self.frame_data = frame_data
        self.sample_rate = sample_rate
        self.sample_width = int(sample_width)

    def get_segment(self, start_ms=None, end_ms=None):
        """
        Returns a new ``AudioData`` instance, trimmed to a given time interval. In other words, an ``AudioData`` instance with the same audio data except starting at ``start_ms`` milliseconds in and ending ``end_ms`` milliseconds in.

        If not specified, ``start_ms`` defaults to the beginning of the audio, and ``end_ms`` defaults to the end.
        """
        assert (
            start_ms is None or start_ms >= 0
        ), "``start_ms`` must be a non-negative number"
        assert end_ms is None or end_ms >= (
            0 if start_ms is None else start_ms
        ), "``end_ms`` must be a non-negative number greater or equal to ``start_ms``"
        if start_ms is None:
            start_byte = 0
        else:
            start_byte = int(
                (start_ms * self.sample_rate * self.sample_width) // 1000
            )
        if end_ms is None:
            end_byte = len(self.frame_data)
        else:
            end_byte = int(
                (end_ms * self.sample_rate * self.sample_width) // 1000
            )
        return AudioData(
            self.frame_data[start_byte:end_byte],
            self.sample_rate,
            self.sample_width,
        )

    def get_raw_data(self, convert_rate=None, convert_width=None):
        """
        Returns a byte string representing the raw frame data for the audio represented by the ``AudioData`` instance.

        If ``convert_rate`` is specified and the audio sample rate is not ``convert_rate`` Hz, the resulting audio is resampled to match.

        If ``convert_width`` is specified and the audio samples are not ``convert_width`` bytes each, the resulting audio is converted to match.

        Writing these bytes directly to a file results in a valid `RAW/PCM audio file <https://en.wikipedia.org/wiki/Raw_audio_format>`__.
        """
        assert (
            convert_rate is None or convert_rate > 0
        ), "Sample rate to convert to must be a positive integer"
        assert convert_width is None or (
            convert_width % 1 == 0 and 1 <= convert_width <= 4
        ), "Sample width to convert to must be between 1 and 4 inclusive"

        raw_data = self.frame_data

        # make sure unsigned 8-bit audio (which uses unsigned samples) is handled like higher sample width audio (which uses signed samples)
        if self.sample_width == 1:
            raw_data = audioop.bias(
                raw_data, 1, -128
            )  # subtract 128 from every sample to make them act like signed samples

        # resample audio at the desired rate if specified
        if convert_rate is not None and self.sample_rate != convert_rate:
            raw_data, _ = audioop.ratecv(
                raw_data,
                self.sample_width,
                1,
                self.sample_rate,
                convert_rate,
                None,
            )

        # convert samples to desired sample width if specified
        if convert_width is not None and self.sample_width != convert_width:
            if (
                convert_width == 3
            ):  # we're converting the audio into 24-bit (workaround for https://bugs.python.org/issue12866)
                raw_data = audioop.lin2lin(
                    raw_data, self.sample_width, 4
                )  # convert audio into 32-bit first, which is always supported
                try:
                    audioop.bias(
                        b"", 3, 0
                    )  # test whether 24-bit audio is supported (for example, ``audioop`` in Python 3.3 and below doesn't support sample width 3, while Python 3.4+ does)
                except (
                    audioop.error
                ):  # this version of audioop doesn't support 24-bit audio (probably Python 3.3 or less)
                    raw_data = b"".join(
                        raw_data[i + 1: i + 4]
                        for i in range(0, len(raw_data), 4)
                    )  # since we're in little endian, we discard the first byte from each 32-bit sample to get a 24-bit sample
                else:  # 24-bit audio fully supported, we don't need to shim anything
                    raw_data = audioop.lin2lin(
                        raw_data, self.sample_width, convert_width
                    )
            else:
                raw_data = audioop.lin2lin(
                    raw_data, self.sample_width, convert_width
                )

        # if the output is 8-bit audio with unsigned samples, convert the samples we've been treating as signed to unsigned again
        if convert_width == 1:
            raw_data = audioop.bias(
                raw_data, 1, 128
            )  # add 128 to every sample to make them act like unsigned samples again

        return raw_data

    def get_wav_data(self, convert_rate=None, convert_width=None):
        """
        Returns a byte string representing the contents of a WAV file containing the audio represented by the ``AudioData`` instance.

        If ``convert_width`` is specified and the audio samples are not ``convert_width`` bytes each, the resulting audio is converted to match.

        If ``convert_rate`` is specified and the audio sample rate is not ``convert_rate`` Hz, the resulting audio is resampled to match.

        Writing these bytes directly to a file results in a valid `WAV file <https://en.wikipedia.org/wiki/WAV>`__.
        """
        raw_data = self.get_raw_data(convert_rate, convert_width)
        sample_rate = (
            self.sample_rate if convert_rate is None else convert_rate
        )
        sample_width = (
            self.sample_width if convert_width is None else convert_width
        )

        # generate the WAV file contents
        with io.BytesIO() as wav_file:
            wav_writer = wave.open(wav_file, "wb")
            try:  # note that we can't use context manager, since that was only added in Python 3.4
                wav_writer.setframerate(sample_rate)
                wav_writer.setsampwidth(sample_width)
                wav_writer.setnchannels(1)
                wav_writer.writeframes(raw_data)
                wav_data = wav_file.getvalue()
            finally:  # make sure resources are cleaned up
                wav_writer.close()
        return wav_data

    def get_aiff_data(self, convert_rate=None, convert_width=None):
        """
        Returns a byte string representing the contents of an AIFF-C file containing the audio represented by the ``AudioData`` instance.

        If ``convert_width`` is specified and the audio samples are not ``convert_width`` bytes each, the resulting audio is converted to match.

        If ``convert_rate`` is specified and the audio sample rate is not ``convert_rate`` Hz, the resulting audio is resampled to match.

        Writing these bytes directly to a file results in a valid `AIFF-C file <https://en.wikipedia.org/wiki/Audio_Interchange_File_Format>`__.
        """
        raw_data = self.get_raw_data(convert_rate, convert_width)
        sample_rate = (
            self.sample_rate if convert_rate is None else convert_rate
        )
        sample_width = (
            self.sample_width if convert_width is None else convert_width
        )

        # the AIFF format is big-endian, so we need to convert the little-endian raw data to big-endian
        if hasattr(
            audioop, "byteswap"
        ):  # ``audioop.byteswap`` was only added in Python 3.4
            raw_data = audioop.byteswap(raw_data, sample_width)
        else:  # manually reverse the bytes of each sample, which is slower but works well enough as a fallback
            raw_data = raw_data[sample_width - 1:: -1] + b"".join(
                raw_data[i + sample_width: i: -1]
                for i in range(sample_width - 1, len(raw_data), sample_width)
            )

        # generate the AIFF-C file contents
        with io.BytesIO() as aiff_file:
            aiff_writer = aifc.open(aiff_file, "wb")
            try:  # note that we can't use context manager, since that was only added in Python 3.4
                aiff_writer.setframerate(sample_rate)
                aiff_writer.setsampwidth(sample_width)
                aiff_writer.setnchannels(1)
                aiff_writer.writeframes(raw_data)
                aiff_data = aiff_file.getvalue()
            finally:  # make sure resources are cleaned up
                aiff_writer.close()
        return aiff_data

    def get_flac_data(self, convert_rate=None, convert_width=None):
        """
        Returns a byte string representing the contents of a FLAC file containing the audio represented by the ``AudioData`` instance.

        Note that 32-bit FLAC is not supported. If the audio data is 32-bit and ``convert_width`` is not specified, then the resulting FLAC will be a 24-bit FLAC.

        If ``convert_rate`` is specified and the audio sample rate is not ``convert_rate`` Hz, the resulting audio is resampled to match.

        If ``convert_width`` is specified and the audio samples are not ``convert_width`` bytes each, the resulting audio is converted to match.

        Writing these bytes directly to a file results in a valid `FLAC file <https://en.wikipedia.org/wiki/FLAC>`__.
        """
        assert convert_width is None or (
            convert_width % 1 == 0 and 1 <= convert_width <= 3
        ), "Sample width to convert to must be between 1 and 3 inclusive"

        if (
            self.sample_width > 3 and convert_width is None
        ):  # resulting WAV data would be 32-bit, which is not convertible to FLAC using our encoder
            convert_width = 3  # the largest supported sample width is 24-bit, so we'll limit the sample width to that

        # run the FLAC converter with the WAV data to get the FLAC data
        wav_data = self.get_wav_data(convert_rate, convert_width)
        flac_converter = get_flac_converter()
        if (
            os.name == "nt"
        ):  # on Windows, specify that the process is to be started without showing a console window
            startup_info = subprocess.STARTUPINFO()
            startup_info.dwFlags |= (
                subprocess.STARTF_USESHOWWINDOW
            )  # specify that the wShowWindow field of `startup_info` contains a value
            startup_info.wShowWindow = (
                subprocess.SW_HIDE
            )  # specify that the console window should be hidden
        else:
            startup_info = None  # default startupinfo
        process = subprocess.Popen(
            [
                flac_converter,
                "--stdout",
                "--totally-silent",  # put the resulting FLAC file in stdout, and make sure it's not mixed with any program output
                "--best",  # highest level of compression available
                "-",  # the input FLAC file contents will be given in stdin
            ],
            stdin=subprocess.PIPE,
            stdout=subprocess.PIPE,
            startupinfo=startup_info,
        )
        flac_data, stderr = process.communicate(wav_data)
        return flac_data


def get_flac_converter():
    """Returns the absolute path of a FLAC converter executable, or raises an OSError if none can be found."""
    flac_converter = shutil_which("flac")  # check for installed version first
    if flac_converter is None:  # flac utility is not installed
        base_path = os.path.dirname(
            os.path.abspath(__file__)
        )  # directory of the current module file, where all the FLAC bundled binaries are stored
        system, machine = platform.system(), platform.machine()
        if system == "Windows" and machine in {
            "i686",
            "i786",
            "x86",
            "x86_64",
            "AMD64",
        }:
            flac_converter = os.path.join(base_path, "flac-win32.exe")
        elif system == "Darwin" and machine in {
            "i686",
            "i786",
            "x86",
            "x86_64",
            "AMD64",
            "arm64",
        }:
            flac_converter = os.path.join(base_path, "flac-mac")
        elif system == "Linux" and machine in {"i686", "i786", "x86"}:
            flac_converter = os.path.join(base_path, "flac-linux-x86")
        elif system == "Linux" and machine in {"x86_64", "AMD64"}:
            flac_converter = os.path.join(base_path, "flac-linux-x86_64")
        else:  # no FLAC converter available
            raise OSError(
                "FLAC conversion utility not available - consider installing the FLAC command line application by running `apt-get install flac` or your operating system's equivalent"
            )

    # mark FLAC converter as executable if possible
    try:
        # handle known issue when running on docker:
        # running the executable right after chmod() may result in OSError "Text file busy"
        # fix: flush FS with sync
        if not os.access(flac_converter, os.X_OK):
            stat_info = os.stat(flac_converter)
            os.chmod(flac_converter, stat_info.st_mode | stat.S_IEXEC)
            if "Linux" in platform.system():
                os.sync() if sys.version_info >= (3, 3) else os.system("sync")

    except OSError:
        pass

    return flac_converter


def shutil_which(pgm):
    """Python 2 compatibility: backport of ``shutil.which()`` from Python 3"""
    path = os.getenv("PATH")
    for p in path.split(os.path.pathsep):
        p = os.path.join(p, pgm)
        if os.path.exists(p) and os.access(p, os.X_OK):
            return p
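For reference, a minimal usage sketch of the ``AudioData`` class in the file above; the raw byte buffer, the 16 kHz/16-bit parameters, and the output file name are illustrative assumptions, not part of this commit:

    import speech_recognition as sr

    # assume raw_bytes holds mono 16-bit little-endian PCM samples at 16 kHz (illustrative values)
    audio = sr.AudioData(raw_bytes, sample_rate=16000, sample_width=2)
    first_second = audio.get_segment(start_ms=0, end_ms=1000)  # trim to the first second
    with open("clip.wav", "wb") as f:  # hypothetical output path
        f.write(first_second.get_wav_data())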
@@ -0,0 +1,22 @@
class SetupError(Exception):
    pass


class WaitTimeoutError(Exception):
    pass


class RequestError(Exception):
    pass


class UnknownValueError(Exception):
    pass


class TranscriptionNotReady(Exception):
    pass


class TranscriptionFailed(Exception):
    pass
BIN
venv/lib/python3.11/site-packages/speech_recognition/flac-linux-x86
Executable file
Binary file not shown.
BIN
venv/lib/python3.11/site-packages/speech_recognition/flac-linux-x86_64
Executable file
Binary file not shown.
BIN
venv/lib/python3.11/site-packages/speech_recognition/flac-mac
Executable file
Binary file not shown.
BIN
venv/lib/python3.11/site-packages/speech_recognition/flac-win32.exe
Executable file
Binary file not shown.
@@ -0,0 +1,31 @@
Copyright (c) 1999-2015 Carnegie Mellon University. All rights
reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:

1. Redistributions of source code must retain the above copyright
   notice, this list of conditions and the following disclaimer.

2. Redistributions in binary form must reproduce the above copyright
   notice, this list of conditions and the following disclaimer in
   the documentation and/or other materials provided with the
   distribution.

This work was supported in part by funding from the Defense Advanced
Research Projects Agency and the National Science Foundation of the
United States of America, and the CMU Sphinx Speech Consortium.

THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND
ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY
NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
@@ -0,0 +1,34 @@
/* ====================================================================
 * Copyright (c) 2015 Alpha Cephei Inc. All rights
 * reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY ALPHA CEPHEI INC. ``AS IS'' AND.
 * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,.
 * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL ALPHA CEPHEI INC.
 * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT.
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,.
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY.
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT.
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE.
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 * ====================================================================
 *
 */

This directory contains generic US english acoustic model trained with
latest sphinxtrain.
@@ -0,0 +1,12 @@
-lowerf 130
-upperf 6800
-nfilt 25
-transform dct
-lifter 22
-feat 1s_c_d_dd
-svspec 0-12/13-25/26-38
-agc none
-cmn current
-varnorm no
-model ptm
-cmninit 40,3,-1
@@ -0,0 +1,5 @@
<s> SIL
</s> SIL
<sil> SIL
[NOISE] +NSN+
[SPEECH] +SPN+
File diff suppressed because it is too large
@@ -0,0 +1,262 @@
from __future__ import annotations

import json
from typing import Dict, Literal, TypedDict
from urllib.error import HTTPError, URLError
from urllib.parse import urlencode
from urllib.request import Request, urlopen

from typing_extensions import NotRequired

from speech_recognition.audio import AudioData
from speech_recognition.exceptions import RequestError, UnknownValueError


class Alternative(TypedDict):
    transcript: str
    confidence: float


class Result(TypedDict):
    alternative: list[Alternative]
    final: bool


class GoogleResponse(TypedDict):
    result: list[Result]
    result_index: NotRequired[int]


ProfanityFilterLevel = Literal[0, 1]
RequestHeaders = Dict[str, str]

ENDPOINT = "http://www.google.com/speech-api/v2/recognize"


class RequestBuilder:
    def __init__(
        self,
        *,
        endpoint: str,
        key: str,
        language: str,
        filter_level: ProfanityFilterLevel,
    ) -> None:
        self.endpoint = endpoint
        self.key = key
        self.language = language
        self.filter_level = filter_level

    def build(self, audio_data: AudioData) -> Request:
        if not isinstance(audio_data, AudioData):
            raise ValueError("``audio_data`` must be audio data")

        url = self.build_url()
        headers = self.build_headers(audio_data)
        flac_data = self.build_data(audio_data)
        request = Request(url, data=flac_data, headers=headers)
        return request

    def build_url(self) -> str:
        """
        >>> builder = RequestBuilder(endpoint="http://www.google.com/speech-api/v2/recognize", key="awesome-key", language="en-US", filter_level=0)
        >>> builder.build_url()
        'http://www.google.com/speech-api/v2/recognize?client=chromium&lang=en-US&key=awesome-key&pFilter=0'
        """
        params = urlencode(
            {
                "client": "chromium",
                "lang": self.language,
                "key": self.key,
                "pFilter": self.filter_level,
            }
        )
        return f"{self.endpoint}?{params}"

    def build_headers(self, audio_data: AudioData) -> RequestHeaders:
        """
        >>> builder = RequestBuilder(endpoint="", key="", language="", filter_level=1)
        >>> audio_data = AudioData(b"", 16_000, 1)
        >>> builder.build_headers(audio_data)
        {'Content-Type': 'audio/x-flac; rate=16000'}
        """
        rate = audio_data.sample_rate
        headers = {"Content-Type": f"audio/x-flac; rate={rate}"}
        return headers

    def build_data(self, audio_data: AudioData) -> bytes:
        flac_data = audio_data.get_flac_data(
            convert_rate=self.to_convert_rate(audio_data.sample_rate),
            convert_width=2,  # audio samples must be 16-bit
        )
        return flac_data

    @staticmethod
    def to_convert_rate(sample_rate: int) -> int:
        """Audio samples must be at least 8 kHz

        >>> RequestBuilder.to_convert_rate(16_000)
        >>> RequestBuilder.to_convert_rate(8_000)
        >>> RequestBuilder.to_convert_rate(7_999)
        8000
        """
        return None if sample_rate >= 8000 else 8000


def create_request_builder(
    *,
    endpoint: str,
    key: str | None = None,
    language: str = "en-US",
    filter_level: ProfanityFilterLevel = 0,
) -> RequestBuilder:
    if not isinstance(language, str):
        raise ValueError("``language`` must be a string")
    if key is not None and not isinstance(key, str):
        raise ValueError("``key`` must be ``None`` or a string")

    if key is None:
        key = "AIzaSyBOti4mM-6x9WDnZIjIeyEU21OpBXqWBgw"
    return RequestBuilder(
        endpoint=endpoint,
        key=key,
        language=language,
        filter_level=filter_level,
    )


class OutputParser:
    def __init__(self, *, show_all: bool, with_confidence: bool) -> None:
        self.show_all = show_all
        self.with_confidence = with_confidence

    def parse(self, response_text: str):
        actual_result = self.convert_to_result(response_text)
        if self.show_all:
            return actual_result

        best_hypothesis = self.find_best_hypothesis(
            actual_result["alternative"]
        )
        # https://cloud.google.com/speech-to-text/docs/basics#confidence-values
        # "Your code should not require the confidence field as it is not guaranteed to be accurate, or even set, in any of the results."
        confidence = best_hypothesis.get("confidence", 0.5)
        if self.with_confidence:
            return best_hypothesis["transcript"], confidence
        return best_hypothesis["transcript"]

    @staticmethod
    def convert_to_result(response_text: str) -> Result:
        r"""
        >>> response_text = '''{"result":[]}
        ... {"result":[{"alternative":[{"transcript":"one two three","confidence":0.49585345},{"transcript":"1 2","confidence":0.42899391}],"final":true}],"result_index":0}
        ... '''
        >>> OutputParser.convert_to_result(response_text)
        {'alternative': [{'transcript': 'one two three', 'confidence': 0.49585345}, {'transcript': '1 2', 'confidence': 0.42899391}], 'final': True}

        >>> OutputParser.convert_to_result("")
        Traceback (most recent call last):
        ...
        speech_recognition.exceptions.UnknownValueError
        >>> OutputParser.convert_to_result('\n{"result":[]}')
        Traceback (most recent call last):
        ...
        speech_recognition.exceptions.UnknownValueError
        >>> OutputParser.convert_to_result('{"result":[{"foo": "bar"}]}')
        Traceback (most recent call last):
        ...
        speech_recognition.exceptions.UnknownValueError
        >>> OutputParser.convert_to_result('{"result":[{"alternative": []}]}')
        Traceback (most recent call last):
        ...
        speech_recognition.exceptions.UnknownValueError
        """
        # ignore any blank blocks
        for line in response_text.split("\n"):
            if not line:
                continue
            result: list[Result] = json.loads(line)["result"]
            if len(result) != 0:
                if len(result[0].get("alternative", [])) == 0:
                    raise UnknownValueError()
                return result[0]
        raise UnknownValueError()

    @staticmethod
    def find_best_hypothesis(alternatives: list[Alternative]) -> Alternative:
        """
        >>> alternatives = [{"transcript": "one two three", "confidence": 0.42899391}, {"transcript": "1 2", "confidence": 0.49585345}]
        >>> OutputParser.find_best_hypothesis(alternatives)
        {'transcript': 'one two three', 'confidence': 0.42899391}

        >>> alternatives = [{"confidence": 0.49585345}]
        >>> OutputParser.find_best_hypothesis(alternatives)
        Traceback (most recent call last):
        ...
        speech_recognition.exceptions.UnknownValueError
        """
        if "confidence" in alternatives:
            # BUG: actual_result["alternative"] (=alternatives) is list, not dict
            # return alternative with highest confidence score
            best_hypothesis: Alternative = max(
                alternatives,
                key=lambda alternative: alternative["confidence"],
            )
        else:
            # when there is no confidence available, we arbitrarily choose the first hypothesis.
            best_hypothesis: Alternative = alternatives[0]
        if "transcript" not in best_hypothesis:
            raise UnknownValueError()
        return best_hypothesis


def obtain_transcription(request: Request, timeout: int) -> str:
    try:
        response = urlopen(request, timeout=timeout)
    except HTTPError as e:
        raise RequestError("recognition request failed: {}".format(e.reason))
    except URLError as e:
        raise RequestError(
            "recognition connection failed: {}".format(e.reason)
        )
    return response.read().decode("utf-8")


def recognize_legacy(
    recognizer,
    audio_data: AudioData,
    key: str | None = None,
    language: str = "en-US",
    pfilter: ProfanityFilterLevel = 0,
    show_all: bool = False,
    with_confidence: bool = False,
    *,
    endpoint: str = ENDPOINT,
):
    """Performs speech recognition on ``audio_data`` (an ``AudioData`` instance), using the Google Speech Recognition API.

    The Google Speech Recognition API key is specified by ``key``. If not specified, it uses a generic key that works out of the box. This should generally be used for personal or testing purposes only, as it **may be revoked by Google at any time**.

    To obtain your own API key, simply follow the steps on the `API Keys <http://www.chromium.org/developers/how-tos/api-keys>`__ page at the Chromium Developers site. In the Google Developers Console, Google Speech Recognition is listed as "Speech API".

    The recognition language is determined by ``language``, an RFC5646 language tag like ``"en-US"`` (US English) or ``"fr-FR"`` (International French), defaulting to US English. A list of supported language tags can be found in this `StackOverflow answer <http://stackoverflow.com/a/14302134>`__.

    The profanity filter level can be adjusted with ``pfilter``: 0 - No filter, 1 - Only shows the first character and replaces the rest with asterisks. The default is level 0.

    Returns the most likely transcription if ``show_all`` is false (the default). Otherwise, returns the raw API response as a JSON dictionary.

    Raises a ``speech_recognition.UnknownValueError`` exception if the speech is unintelligible. Raises a ``speech_recognition.RequestError`` exception if the speech recognition operation failed, if the key isn't valid, or if there is no internet connection.
    """
    request_builder = create_request_builder(
        endpoint=endpoint, key=key, language=language, filter_level=pfilter
    )
    request = request_builder.build(audio_data)

    response_text = obtain_transcription(
        request, timeout=recognizer.operation_timeout
    )

    output_parser = OutputParser(
        show_all=show_all, with_confidence=with_confidence
    )
    return output_parser.parse(response_text)
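For reference, a hedged usage sketch of the recognizer above; it assumes that ``recognize_legacy`` is reached through ``Recognizer.recognize_google``, as in the example at the top of this commit, and ``demo.wav`` is an illustrative file name:

    import speech_recognition as sr

    r = sr.Recognizer()
    with sr.AudioFile("demo.wav") as source:  # hypothetical input file
        audio = r.record(source)
    try:
        print(r.recognize_google(audio, language="en-US"))
    except sr.UnknownValueError:
        print("speech was unintelligible")
    except sr.RequestError as e:
        print("API request failed: {}".format(e))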
@@ -0,0 +1,142 @@
from __future__ import annotations

from typing import TYPE_CHECKING, TypedDict
from urllib.error import URLError

from speech_recognition.audio import AudioData
from speech_recognition.exceptions import RequestError, UnknownValueError

if TYPE_CHECKING:
    from google.cloud.speech import (
        RecognitionConfig,
        RecognizeResponse,
        SpeechContext,
    )
    from typing_extensions import Required


class GoogleCloudRecognizerParameters(TypedDict, total=False):
    """Optional parameters.

    The recognition language is determined by ``language_code``, which is a BCP-47 language tag like ``"en-US"`` (US English). Default: ``"en-US"``.
    A list of supported language tags can be found in the `Speech-to-Text supported languages <https://cloud.google.com/speech/docs/languages>`__.

    If ``preferred_phrases`` is an iterable of phrase strings, those given phrases will be more likely to be recognized over similar-sounding alternatives.
    This is useful for things like keyword/command recognition or adding new phrases that aren't in Google's vocabulary.
    Note that the API imposes certain `restrictions on the list of phrase strings <https://cloud.google.com/speech/limits#content>`__.

    ``show_all``: See :py:func:`recognize`.

    ``model``: You can select the model to get best results. (See `RecognitionConfig's documentation <https://cloud.google.com/python/docs/reference/speech/latest/google.cloud.speech_v1.types.RecognitionConfig>`__ for detail)

    ``use_enhanced``: Set to true to use an enhanced model for speech recognition.
    """

    # SpeechRecognition specific parameters
    preferred_phrases: list[str]
    show_all: bool

    # Speech-to-Text V1 API's parameters
    language_code: str
    model: str
    use_enhanced: bool
    # TODO Add others support


class GoogleCloudSpeechV1Parameters(TypedDict, total=False):
    """Speech-to-Text V1 API's parameters.

    https://cloud.google.com/python/docs/reference/speech/latest/google.cloud.speech_v1.types.RecognitionConfig
    """

    encoding: Required[RecognitionConfig.AudioEncoding]
    sample_rate_hertz: Required[int]
    language_code: Required[str]
    speech_contexts: list[SpeechContext]
    enable_word_time_offsets: bool
    model: str
    use_enhanced: bool


def _build_config(
    audio_data: AudioData, recognizer_params: GoogleCloudRecognizerParameters
) -> RecognitionConfig:
    from google.cloud import speech

    parameters: GoogleCloudSpeechV1Parameters = {
        "encoding": speech.RecognitionConfig.AudioEncoding.FLAC,
        "sample_rate_hertz": audio_data.sample_rate,
        "language_code": recognizer_params.pop("language_code", "en-US"),
    }
    if preferred_phrases := recognizer_params.pop("preferred_phrases", None):
        parameters["speech_contexts"] = [
            speech.SpeechContext(phrases=preferred_phrases)
        ]
    if recognizer_params.pop("show_all", False):
        # ref: https://cloud.google.com/speech-to-text/docs/async-time-offsets
        parameters["enable_word_time_offsets"] = True
    return speech.RecognitionConfig(**(parameters | recognizer_params))


def recognize(
    recognizer,
    audio_data: AudioData,
    credentials_json_path: str | None = None,
    **kwargs: GoogleCloudRecognizerParameters,
) -> str | RecognizeResponse:
    """Performs speech recognition on ``audio_data`` (an ``AudioData`` instance), using the Google Cloud Speech-to-Text V1 API.

    This function requires a Google Cloud Platform account; see the `Set up Speech-to-Text <https://cloud.google.com/speech-to-text/docs/before-you-begin>`__ page for details and instructions. Basically, create a project, enable billing for the project, enable the Google Cloud Speech API for the project, and create local authentication credentials for your user account. The result is a JSON file containing the API credentials. You can specify the JSON file by ``credentials_json_path``. If not specified, the library will try to automatically `find the default API credentials JSON file <https://developers.google.com/identity/protocols/application-default-credentials>`__.

    Returns the most likely transcription if ``show_all`` is False (the default). Otherwise, returns the raw API response as a JSON dictionary.
    For other parameters, see :py:class:`GoogleCloudRecognizerParameters`.

    Raises a ``speech_recognition.UnknownValueError`` exception if the speech is unintelligible. Raises a ``speech_recognition.RequestError`` exception if the speech recognition operation failed, if the credentials aren't valid, or if there is no Internet connection.
    """
    try:
        from google.api_core.exceptions import GoogleAPICallError
        from google.cloud import speech
    except ImportError:
        raise RequestError(
            "missing google-cloud-speech module: ensure that google-cloud-speech is set up correctly."
        )

    client = (
        speech.SpeechClient.from_service_account_json(credentials_json_path)
        if credentials_json_path
        else speech.SpeechClient()
    )

    flac_data = audio_data.get_flac_data(
        # audio sample rate must be between 8 kHz and 48 kHz inclusive - clamp sample rate into this range
        convert_rate=(
            None
            if 8000 <= audio_data.sample_rate <= 48000
            else max(8000, min(audio_data.sample_rate, 48000))
        ),
        convert_width=2,  # audio samples must be 16-bit
    )
    audio = speech.RecognitionAudio(content=flac_data)

    config = _build_config(audio_data, kwargs.copy())

    try:
        response = client.recognize(config=config, audio=audio)
    except GoogleAPICallError as e:
        raise RequestError(e)
    except URLError as e:
        raise RequestError(
            "recognition connection failed: {0}".format(e.reason)
        )

    if kwargs.get("show_all"):
        return response
    if len(response.results) == 0:
        raise UnknownValueError()

    transcript = " ".join(
        result.alternatives[0].transcript.strip()
        for result in response.results
    )
    return transcript
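For reference, a minimal usage sketch of the module above; it assumes the ``recognize()`` function is exposed as ``Recognizer.recognize_google_cloud``, that application-default credentials are already configured, and that ``meeting.flac`` is an illustrative file name:

    import speech_recognition as sr

    r = sr.Recognizer()
    with sr.AudioFile("meeting.flac") as source:  # hypothetical input file
        audio = r.record(source)
    # language_code and preferred_phrases are the optional parameters documented above
    text = r.recognize_google_cloud(
        audio,
        language_code="en-US",
        preferred_phrases=["SpeechRecognition"],
    )
    print(text)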
@@ -0,0 +1,111 @@
from __future__ import annotations

import os
from collections.abc import Sequence

from speech_recognition import PortableNamedTemporaryFile
from speech_recognition.audio import AudioData
from speech_recognition.exceptions import RequestError, UnknownValueError

AcousticParametersDirectoryPath = str
LanguageModelFilePath = str
PhonemeDictionaryFilePath = str
SphinxDataFilePaths = tuple[AcousticParametersDirectoryPath, LanguageModelFilePath, PhonemeDictionaryFilePath]

Keyword = str
Sensitivity = float
KeywordEntry = tuple[Keyword, Sensitivity]


def recognize(
    recognizer,
    audio_data: AudioData,
    language: str | SphinxDataFilePaths = "en-US",
    keyword_entries: Sequence[KeywordEntry] | None = None,
    grammar: str | None = None,
    show_all: bool = False,
):
    """
    Performs speech recognition on ``audio_data`` (an ``AudioData`` instance), using CMU Sphinx.

    The recognition language is determined by ``language``, an RFC5646 language tag like ``"en-US"`` or ``"en-GB"``, defaulting to US English. Out of the box, only ``en-US`` is supported. See `Notes on using PocketSphinx <https://github.com/Uberi/speech_recognition/blob/master/reference/pocketsphinx.rst>`__ for information about installing other languages. This document is also included under ``reference/pocketsphinx.rst``. The ``language`` parameter can also be a tuple of filesystem paths, of the form ``(acoustic_parameters_directory, language_model_file, phoneme_dictionary_file)`` - this allows you to load arbitrary Sphinx models.

    If specified, the keywords to search for are determined by ``keyword_entries``, an iterable of tuples of the form ``(keyword, sensitivity)``, where ``keyword`` is a phrase, and ``sensitivity`` is how sensitive to this phrase the recognizer should be, on a scale of 0 (very insensitive, more false negatives) to 1 (very sensitive, more false positives) inclusive. If not specified or ``None``, no keywords are used and Sphinx will simply transcribe whatever words it recognizes. Specifying ``keyword_entries`` is more accurate than just looking for those same keywords in non-keyword-based transcriptions, because Sphinx knows specifically what sounds to look for.

    Sphinx can also handle FSG or JSGF grammars. The parameter ``grammar`` expects a path to the grammar file. Note that if a JSGF grammar is passed, an FSG grammar will be created at the same location to speed up execution in the next run. If ``keyword_entries`` are passed, the content of ``grammar`` will be ignored.

    Returns the most likely transcription if ``show_all`` is false (the default). Otherwise, returns the Sphinx ``pocketsphinx.pocketsphinx.Decoder`` object resulting from the recognition.

    Raises a ``speech_recognition.UnknownValueError`` exception if the speech is unintelligible. Raises a ``speech_recognition.RequestError`` exception if there are any issues with the Sphinx installation.
    """
    # TODO Move this validation into KeywordEntry initialization
    assert keyword_entries is None or all(isinstance(keyword, (type(""), type(u""))) and 0 <= sensitivity <= 1 for keyword, sensitivity in keyword_entries), "``keyword_entries`` must be ``None`` or a list of pairs of strings and numbers between 0 and 1"

    try:
        from pocketsphinx import FsgModel, Jsgf, pocketsphinx
    except ImportError:
        raise RequestError("missing PocketSphinx module: ensure that PocketSphinx is set up correctly.")

    if isinstance(language, str):  # directory containing language data
        language_directory = os.path.join(os.path.dirname(os.path.dirname(os.path.realpath(__file__))), "pocketsphinx-data", language)
        if not os.path.isdir(language_directory):
            raise RequestError("missing PocketSphinx language data directory: \"{}\"".format(language_directory))
        acoustic_parameters_directory = os.path.join(language_directory, "acoustic-model")
        language_model_file = os.path.join(language_directory, "language-model.lm.bin")
        phoneme_dictionary_file = os.path.join(language_directory, "pronounciation-dictionary.dict")
    else:  # 3-tuple of Sphinx data file paths
        acoustic_parameters_directory, language_model_file, phoneme_dictionary_file = language
        if not os.path.isdir(acoustic_parameters_directory):
            raise RequestError("missing PocketSphinx language model parameters directory: \"{}\"".format(acoustic_parameters_directory))
        if not os.path.isfile(language_model_file):
            raise RequestError("missing PocketSphinx language model file: \"{}\"".format(language_model_file))
        if not os.path.isfile(phoneme_dictionary_file):
            raise RequestError("missing PocketSphinx phoneme dictionary file: \"{}\"".format(phoneme_dictionary_file))

    # create decoder object
    config = pocketsphinx.Config()
    config.set_string("-hmm", acoustic_parameters_directory)  # set the path of the hidden Markov model (HMM) parameter files
    config.set_string("-lm", language_model_file)
    config.set_string("-dict", phoneme_dictionary_file)
    config.set_string("-logfn", os.devnull)  # disable logging (logging causes unwanted output in terminal)
    decoder = pocketsphinx.Decoder(config)

    # obtain audio data
    raw_data = audio_data.get_raw_data(convert_rate=16000, convert_width=2)  # the included language models require audio to be 16-bit mono 16 kHz in little-endian format

    # obtain recognition results
    if keyword_entries is not None:  # explicitly specified set of keywords
        with PortableNamedTemporaryFile("w") as f:
            # generate a keywords file - Sphinx documentation recommends sensitivities between 1e-50 and 1e-5
            f.writelines("{} /1e{}/\n".format(keyword, 100 * sensitivity - 110) for keyword, sensitivity in keyword_entries)
            f.flush()

            # perform the speech recognition with the keywords file (this is inside the context manager so the file isn't deleted until we're done)
            decoder.add_kws("keywords", f.name)
            decoder.activate_search("keywords")
    elif grammar is not None:  # a path to a FSG or JSGF grammar
        if not os.path.exists(grammar):
            raise ValueError("Grammar '{0}' does not exist.".format(grammar))
        grammar_path = os.path.abspath(os.path.dirname(grammar))
        grammar_name = os.path.splitext(os.path.basename(grammar))[0]
        fsg_path = "{0}/{1}.fsg".format(grammar_path, grammar_name)
        if not os.path.exists(fsg_path):  # create FSG grammar if not available
            jsgf = Jsgf(grammar)
            rule = jsgf.get_rule("{0}.{0}".format(grammar_name))
            fsg = jsgf.build_fsg(rule, decoder.get_logmath(), 7.5)
            fsg.writefile(fsg_path)
        else:
            fsg = FsgModel(fsg_path, decoder.get_logmath(), 7.5)
        decoder.set_fsg(grammar_name, fsg)
        decoder.set_search(grammar_name)

    decoder.start_utt()  # begin utterance processing
    decoder.process_raw(raw_data, False, True)  # process audio data with recognition enabled (no_search = False), as a full utterance (full_utt = True)
    decoder.end_utt()  # stop utterance processing

    if show_all: return decoder

    # return results
    hypothesis = decoder.hyp()
    if hypothesis is not None: return hypothesis.hypstr
    raise UnknownValueError()  # no transcriptions available
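For reference, a minimal offline usage sketch of the Sphinx recognizer above; it assumes the ``recognize()`` function is exposed as ``Recognizer.recognize_sphinx``, and the keywords, sensitivities, and ``command.wav`` file name are illustrative values:

    import speech_recognition as sr

    r = sr.Recognizer()
    with sr.AudioFile("command.wav") as source:  # hypothetical input file
        audio = r.record(source)
    # look only for a couple of keywords, with moderate sensitivities between 0 and 1
    print(r.recognize_sphinx(audio, keyword_entries=[("stop", 0.8), ("go", 0.5)]))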
@@ -0,0 +1,22 @@
from io import BytesIO

from speech_recognition.audio import AudioData


class OpenAICompatibleRecognizer:
    def __init__(self, client) -> None:
        self.client = client

    def recognize(self, audio_data: "AudioData", model: str, **kwargs) -> str:
        if not isinstance(audio_data, AudioData):
            raise ValueError(
                "``audio_data`` must be an ``AudioData`` instance"
            )

        wav_data = BytesIO(audio_data.get_wav_data())
        wav_data.name = "SpeechRecognition_audio.wav"

        transcript = self.client.audio.transcriptions.create(
            file=wav_data, model=model, **kwargs
        )
        return transcript.text
@@ -0,0 +1,54 @@
from __future__ import annotations

from typing import Literal, TypedDict

from typing_extensions import Unpack

from speech_recognition.audio import AudioData
from speech_recognition.exceptions import SetupError
from speech_recognition.recognizers.whisper_api.base import (
    OpenAICompatibleRecognizer,
)

# https://console.groq.com/docs/speech-text#supported-models
GroqModel = Literal[
    "whisper-large-v3-turbo", "whisper-large-v3", "distil-whisper-large-v3-en"
]


class GroqOptionalParameters(TypedDict):
    """Groq speech transcription's optional parameters.

    https://console.groq.com/docs/speech-text#transcription-endpoint-usage
    """

    prompt: str
    response_format: str
    temperature: float
    language: str


def recognize(
    recognizer,
    audio_data: "AudioData",
    *,
    model: GroqModel = "whisper-large-v3-turbo",
    **kwargs: Unpack[GroqOptionalParameters],
) -> str:
    """Performs speech recognition on ``audio_data`` (an ``AudioData`` instance), using the Groq Whisper API.

    This function requires a Groq login; visit https://console.groq.com/login, then generate an API Key in the `API Keys <https://console.groq.com/keys>`__ menu.

    Detail: https://console.groq.com/docs/speech-text

    Set the environment variable ``GROQ_API_KEY``; otherwise the groq library will raise a ``groq.GroqError``.
    """
    try:
        import groq
    except ImportError:
        raise SetupError(
            "missing groq module: ensure that groq is set up correctly."
        )

    groq_recognizer = OpenAICompatibleRecognizer(groq.Groq())
    return groq_recognizer.recognize(audio_data, model, **kwargs)
@@ -0,0 +1,83 @@
from __future__ import annotations

from typing import Literal

from typing_extensions import Unpack

from speech_recognition.audio import AudioData
from speech_recognition.exceptions import SetupError
from speech_recognition.recognizers.whisper_api.base import (
    OpenAICompatibleRecognizer,
)

# https://platform.openai.com/docs/api-reference/audio/createTranscription#audio-createtranscription-model
WhisperModel = Literal[
    "whisper-1", "gpt-4o-transcribe", "gpt-4o-mini-transcribe"
]


class OpenAIOptionalParameters:
    """OpenAI speech transcription's optional parameters.

    https://platform.openai.com/docs/api-reference/audio/createTranscription
    """

    language: str
    prompt: str
    # TODO Add support `Literal["text", "srt", "verbose_json", "vtt"]`
    response_format: Literal["json"]
    temperature: float
    # timestamp_granularities  # TODO support


def recognize(
    recognizer,
    audio_data: "AudioData",
    *,
    model: WhisperModel = "whisper-1",
    **kwargs: Unpack[OpenAIOptionalParameters],
) -> str:
    """Performs speech recognition on ``audio_data`` (an ``AudioData`` instance), using the OpenAI Whisper API.

    This function requires an OpenAI account; visit https://platform.openai.com/signup, then generate an API Key in `User settings <https://platform.openai.com/account/api-keys>`__.

    Detail: https://platform.openai.com/docs/guides/speech-to-text

    Set the environment variable ``OPENAI_API_KEY``; otherwise the openai library will raise an ``openai.OpenAIError``.
    """
    try:
        import openai
    except ImportError:
        raise SetupError(
            "missing openai module: ensure that openai is set up correctly."
        )

    openai_recognizer = OpenAICompatibleRecognizer(openai.OpenAI())
    return openai_recognizer.recognize(audio_data, model, **kwargs)


if __name__ == "__main__":
    import argparse
    from typing import get_args

    import speech_recognition as sr

    parser = argparse.ArgumentParser()
    parser.add_argument("audio_file")
    parser.add_argument(
        "--model", choices=get_args(WhisperModel), default="whisper-1"
    )
    parser.add_argument("-l", "--language")
    args = parser.parse_args()

    r = sr.Recognizer()
    with sr.AudioFile(args.audio_file) as source:
        audio_data = r.listen(source)

    if args.language:
        transcription = recognize(
            None, audio_data, model=args.model, language=args.language
        )
    else:
        transcription = recognize(None, audio_data, model=args.model)
    print(transcription)
@@ -0,0 +1,45 @@
from __future__ import annotations

import io
from typing import TYPE_CHECKING, Any, Protocol

from speech_recognition.audio import AudioData

if TYPE_CHECKING:
    import numpy as np


class Transcribable(Protocol):
    def transcribe(
        self, audio_array: np.ndarray, **kwargs
    ) -> str | dict[str, Any]:
        pass


class WhisperCompatibleRecognizer:
    def __init__(self, model: Transcribable) -> None:
        self.model = model

    def recognize(
        self, audio_data: AudioData, show_dict: bool = False, **kwargs
    ):
        if not isinstance(audio_data, AudioData):
            raise ValueError(
                "``audio_data`` must be an ``AudioData`` instance"
            )

        import numpy as np
        import soundfile as sf

        # 16 kHz https://github.com/openai/whisper/blob/28769fcfe50755a817ab922a7bc83483159600a9/whisper/audio.py#L98-L99
        wav_bytes = audio_data.get_wav_data(convert_rate=16000)
        wav_stream = io.BytesIO(wav_bytes)
        audio_array, sampling_rate = sf.read(wav_stream)
        audio_array = audio_array.astype(np.float32)

        result = self.model.transcribe(audio_array, **kwargs)

        if show_dict:
            return result
        else:
            return result["text"]
@@ -0,0 +1,106 @@
from __future__ import annotations

from typing import TYPE_CHECKING, Literal, TypedDict

from speech_recognition.audio import AudioData
from speech_recognition.recognizers.whisper_local.base import (
    WhisperCompatibleRecognizer,
)

if TYPE_CHECKING:
    import numpy as np
    from faster_whisper import WhisperModel
    from faster_whisper.transcribe import Segment
    from typing_extensions import Unpack


class TranscribeOutput(TypedDict):
    text: str
    segments: list[Segment]
    language: str


class TranscribableAdapter:
    def __init__(self, model: WhisperModel) -> None:
        self.model = model

    def transcribe(
        self, audio_array: np.ndarray, **kwargs
    ) -> TranscribeOutput:
        segments_generator, info = self.model.transcribe(audio_array, **kwargs)
        segments = list(segments_generator)
        return {
            "text": " ".join(segment.text for segment in segments),
            "segments": segments,
            "language": info.language,
        }


class InitOptionalParameters(TypedDict, total=False):
    # https://github.com/SYSTRAN/faster-whisper/blob/v1.1.0/faster_whisper/transcribe.py#L575
    device: Literal["cpu", "gpu", "auto"]
    compute_type: str
    download_root: str
    # TODO Add others


class TranscribeOptionalParameters(TypedDict, total=False):
    # https://github.com/SYSTRAN/faster-whisper/blob/v1.1.0/faster_whisper/transcribe.py#L692
    language: str
    task: Literal["transcribe", "translate"]
    beam_size: int
    # TODO Add others


def recognize(
    recognizer,
    audio_data: AudioData,
    model: str = "base",
    show_dict: bool = False,
    init_options: InitOptionalParameters | None = None,
    **transcribe_options: Unpack[TranscribeOptionalParameters],
) -> str | TranscribeOutput:
    """Performs speech recognition on ``audio_data`` (an ``AudioData`` instance), using Whisper.

    Pick a ``model`` size (same as Whisper).

    If ``show_dict`` is true, returns the detailed response from Whisper, including the detected language. Otherwise returns only the transcription.

    You can specify:

        * ``language``: recognition language, an uncapitalized 2-letter language name like "en" or "fr".

            * If not set, Faster Whisper will automatically detect the language.

        * ``task``

            * If you want transcribe + **translate** to English, set ``task="translate"``.

    Other values are passed directly to whisper. See https://github.com/SYSTRAN/faster-whisper/blob/master/faster_whisper/transcribe.py for all options.
    """
    from faster_whisper import WhisperModel

    model = WhisperModel(model, **init_options or {})
    whisper_recognizer = WhisperCompatibleRecognizer(
        TranscribableAdapter(model)
    )
    return whisper_recognizer.recognize(
        audio_data, show_dict=show_dict, **transcribe_options
    )


if __name__ == "__main__":
    import argparse

    import speech_recognition as sr

    parser = argparse.ArgumentParser()
    parser.add_argument("audio_file")
    args = parser.parse_args()

    r = sr.Recognizer()
    with sr.AudioFile(args.audio_file) as source:
        audio_data = r.listen(source)

    transcription = recognize(None, audio_data)
    print(transcription)
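For reference, a minimal local-transcription sketch of the module above; it assumes the ``recognize()`` function is wired up as ``Recognizer.recognize_faster_whisper`` and that ``interview.wav`` is an illustrative file name:

    import speech_recognition as sr

    r = sr.Recognizer()
    with sr.AudioFile("interview.wav") as source:  # hypothetical input file
        audio = r.record(source)
    # show_dict=True returns the TranscribeOutput dict documented above instead of plain text
    result = r.recognize_faster_whisper(audio, model="base", show_dict=True, language="en")
    print(result["text"], result["language"])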
@@ -0,0 +1,108 @@
from __future__ import annotations

from typing import TYPE_CHECKING, Literal, TypedDict

from speech_recognition.audio import AudioData
from speech_recognition.recognizers.whisper_local.base import (
    WhisperCompatibleRecognizer,
)

if TYPE_CHECKING:
    import numpy as np
    import torch
    from typing_extensions import Unpack
    from whisper import Whisper


class LoadModelOptionalParameters(TypedDict, total=False):
    # ref: https://github.com/openai/whisper/blob/v20240930/whisper/__init__.py#L103
    device: str | torch.device
    download_root: str
    in_memory: bool


class TranscribeOptionalParameters(TypedDict, total=False):
    """Transcribe optional parameters & DecodingOptions parameters."""

    # ref: https://github.com/openai/whisper/blob/v20240930/whisper/transcribe.py#L38
    temperature: float | tuple[float, ...]
    # TODO Add others

    # ref: https://github.com/openai/whisper/blob/v20240930/whisper/decoding.py#L81
    task: Literal["transcribe", "translate"]
    language: str
    fp16: bool
    # TODO Add others


class Segment(TypedDict):
    id: int
    seek: int
    start: float
    end: float
    text: str
    tokens: list[int]
    temperature: float
    avg_logprob: float
    compression_ratio: float
    no_speech_prob: float


class TranscribeOutput(TypedDict):
    text: str
    segments: list[Segment]
    language: str


class TranscribableAdapter:
    def __init__(self, model: Whisper) -> None:
        self.model = model

    def transcribe(
        self, audio_array: np.ndarray, **kwargs
    ) -> TranscribeOutput:
        if "fp16" not in kwargs:
            import torch

            kwargs["fp16"] = torch.cuda.is_available()

        return self.model.transcribe(audio_array, **kwargs)


def recognize(
    recognizer,
    audio_data: AudioData,
    model: str = "base",
    show_dict: bool = False,
    load_options: LoadModelOptionalParameters | None = None,
    **transcribe_options: Unpack[TranscribeOptionalParameters],
) -> str | TranscribeOutput:
    """Performs speech recognition on ``audio_data`` (an ``AudioData`` instance), using Whisper.

    Pick ``model`` from the output of :command:`python -c 'import whisper; print(whisper.available_models())'`.
    See also https://github.com/openai/whisper?tab=readme-ov-file#available-models-and-languages.

    If ``show_dict`` is true, returns the full dict response from Whisper, including the detected language. Otherwise returns only the transcription.

    You can specify:

        * ``language``: recognition language, an uncapitalized full language name like "english" or "chinese". See the full language list at https://github.com/openai/whisper/blob/main/whisper/tokenizer.py

            * If not set, Whisper will automatically detect the language.

        * ``task``

            * If you want transcribe + **translate** to English, set ``task="translate"``.

    Other values are passed directly to whisper. See https://github.com/openai/whisper/blob/main/whisper/transcribe.py for all options.
    """

    import whisper

    whisper_model = whisper.load_model(model, **load_options or {})
    whisper_recognizer = WhisperCompatibleRecognizer(
        TranscribableAdapter(whisper_model)
    )
    return whisper_recognizer.recognize(
        audio_data, show_dict=show_dict, **transcribe_options
    )