first commit
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,262 @@
from __future__ import annotations

import json
from typing import Dict, Literal, TypedDict
from urllib.error import HTTPError, URLError
from urllib.parse import urlencode
from urllib.request import Request, urlopen

from typing_extensions import NotRequired

from speech_recognition.audio import AudioData
from speech_recognition.exceptions import RequestError, UnknownValueError


class Alternative(TypedDict):
    transcript: str
    confidence: float


class Result(TypedDict):
    alternative: list[Alternative]
    final: bool


class GoogleResponse(TypedDict):
    result: list[Result]
    result_index: NotRequired[int]


ProfanityFilterLevel = Literal[0, 1]
RequestHeaders = Dict[str, str]

ENDPOINT = "http://www.google.com/speech-api/v2/recognize"


class RequestBuilder:
    def __init__(
        self,
        *,
        endpoint: str,
        key: str,
        language: str,
        filter_level: ProfanityFilterLevel,
    ) -> None:
        self.endpoint = endpoint
        self.key = key
        self.language = language
        self.filter_level = filter_level

    def build(self, audio_data: AudioData) -> Request:
        if not isinstance(audio_data, AudioData):
            raise ValueError("``audio_data`` must be audio data")

        url = self.build_url()
        headers = self.build_headers(audio_data)
        flac_data = self.build_data(audio_data)
        request = Request(url, data=flac_data, headers=headers)
        return request

    def build_url(self) -> str:
        """
        >>> builder = RequestBuilder(endpoint="http://www.google.com/speech-api/v2/recognize", key="awesome-key", language="en-US", filter_level=0)
        >>> builder.build_url()
        'http://www.google.com/speech-api/v2/recognize?client=chromium&lang=en-US&key=awesome-key&pFilter=0'
        """
        params = urlencode(
            {
                "client": "chromium",
                "lang": self.language,
                "key": self.key,
                "pFilter": self.filter_level,
            }
        )
        return f"{self.endpoint}?{params}"

    def build_headers(self, audio_data: AudioData) -> RequestHeaders:
        """
        >>> builder = RequestBuilder(endpoint="", key="", language="", filter_level=1)
        >>> audio_data = AudioData(b"", 16_000, 1)
        >>> builder.build_headers(audio_data)
        {'Content-Type': 'audio/x-flac; rate=16000'}
        """
        rate = audio_data.sample_rate
        headers = {"Content-Type": f"audio/x-flac; rate={rate}"}
        return headers

    def build_data(self, audio_data: AudioData) -> bytes:
        flac_data = audio_data.get_flac_data(
            convert_rate=self.to_convert_rate(audio_data.sample_rate),
            convert_width=2,  # audio samples must be 16-bit
        )
        return flac_data

    @staticmethod
    def to_convert_rate(sample_rate: int) -> int | None:
        """Audio samples must be at least 8 kHz

        >>> RequestBuilder.to_convert_rate(16_000)
        >>> RequestBuilder.to_convert_rate(8_000)
        >>> RequestBuilder.to_convert_rate(7_999)
        8000
        """
        return None if sample_rate >= 8000 else 8000


def create_request_builder(
    *,
    endpoint: str,
    key: str | None = None,
    language: str = "en-US",
    filter_level: ProfanityFilterLevel = 0,
) -> RequestBuilder:
    if not isinstance(language, str):
        raise ValueError("``language`` must be a string")
    if key is not None and not isinstance(key, str):
        raise ValueError("``key`` must be ``None`` or a string")

    if key is None:
        key = "AIzaSyBOti4mM-6x9WDnZIjIeyEU21OpBXqWBgw"
    return RequestBuilder(
        endpoint=endpoint,
        key=key,
        language=language,
        filter_level=filter_level,
    )


class OutputParser:
    def __init__(self, *, show_all: bool, with_confidence: bool) -> None:
        self.show_all = show_all
        self.with_confidence = with_confidence

    def parse(self, response_text: str):
        actual_result = self.convert_to_result(response_text)
        if self.show_all:
            return actual_result

        best_hypothesis = self.find_best_hypothesis(
            actual_result["alternative"]
        )
        # https://cloud.google.com/speech-to-text/docs/basics#confidence-values
        # "Your code should not require the confidence field as it is not guaranteed to be accurate, or even set, in any of the results."
        confidence = best_hypothesis.get("confidence", 0.5)
        if self.with_confidence:
            return best_hypothesis["transcript"], confidence
        return best_hypothesis["transcript"]

    @staticmethod
    def convert_to_result(response_text: str) -> Result:
        r"""
        >>> response_text = '''{"result":[]}
        ... {"result":[{"alternative":[{"transcript":"one two three","confidence":0.49585345},{"transcript":"1 2","confidence":0.42899391}],"final":true}],"result_index":0}
        ... '''
        >>> OutputParser.convert_to_result(response_text)
        {'alternative': [{'transcript': 'one two three', 'confidence': 0.49585345}, {'transcript': '1 2', 'confidence': 0.42899391}], 'final': True}

        >>> OutputParser.convert_to_result("")
        Traceback (most recent call last):
        ...
        speech_recognition.exceptions.UnknownValueError
        >>> OutputParser.convert_to_result('\n{"result":[]}')
        Traceback (most recent call last):
        ...
        speech_recognition.exceptions.UnknownValueError
        >>> OutputParser.convert_to_result('{"result":[{"foo": "bar"}]}')
        Traceback (most recent call last):
        ...
        speech_recognition.exceptions.UnknownValueError
        >>> OutputParser.convert_to_result('{"result":[{"alternative": []}]}')
        Traceback (most recent call last):
        ...
        speech_recognition.exceptions.UnknownValueError
        """
        # ignore any blank blocks
        for line in response_text.split("\n"):
            if not line:
                continue
            result: list[Result] = json.loads(line)["result"]
            if len(result) != 0:
                if len(result[0].get("alternative", [])) == 0:
                    raise UnknownValueError()
                return result[0]
        raise UnknownValueError()

    @staticmethod
    def find_best_hypothesis(alternatives: list[Alternative]) -> Alternative:
        """
        >>> alternatives = [{"transcript": "one two three", "confidence": 0.42899391}, {"transcript": "1 2", "confidence": 0.49585345}]
        >>> OutputParser.find_best_hypothesis(alternatives)
        {'transcript': 'one two three', 'confidence': 0.42899391}

        >>> alternatives = [{"confidence": 0.49585345}]
        >>> OutputParser.find_best_hypothesis(alternatives)
        Traceback (most recent call last):
        ...
        speech_recognition.exceptions.UnknownValueError
        """
        if "confidence" in alternatives:
            # BUG: actual_result["alternative"] (= alternatives) is a list, not a dict,
            # so this membership test is never true and the branch below is never taken.
            # return alternative with highest confidence score
            best_hypothesis: Alternative = max(
                alternatives,
                key=lambda alternative: alternative["confidence"],
            )
        else:
            # when there is no confidence available, we arbitrarily choose the first hypothesis.
            best_hypothesis: Alternative = alternatives[0]
        if "transcript" not in best_hypothesis:
            raise UnknownValueError()
        return best_hypothesis


def obtain_transcription(request: Request, timeout: int) -> str:
    try:
        response = urlopen(request, timeout=timeout)
    except HTTPError as e:
        raise RequestError("recognition request failed: {}".format(e.reason))
    except URLError as e:
        raise RequestError(
            "recognition connection failed: {}".format(e.reason)
        )
    return response.read().decode("utf-8")


def recognize_legacy(
    recognizer,
    audio_data: AudioData,
    key: str | None = None,
    language: str = "en-US",
    pfilter: ProfanityFilterLevel = 0,
    show_all: bool = False,
    with_confidence: bool = False,
    *,
    endpoint: str = ENDPOINT,
):
    """Performs speech recognition on ``audio_data`` (an ``AudioData`` instance), using the Google Speech Recognition API.

    The Google Speech Recognition API key is specified by ``key``. If not specified, it uses a generic key that works out of the box. This should generally be used for personal or testing purposes only, as it **may be revoked by Google at any time**.

    To obtain your own API key, simply follow the steps on the `API Keys <http://www.chromium.org/developers/how-tos/api-keys>`__ page at the Chromium Developers site. In the Google Developers Console, Google Speech Recognition is listed as "Speech API".

    The recognition language is determined by ``language``, an RFC5646 language tag like ``"en-US"`` (US English) or ``"fr-FR"`` (International French), defaulting to US English. A list of supported language tags can be found in this `StackOverflow answer <http://stackoverflow.com/a/14302134>`__.

    The profanity filter level can be adjusted with ``pfilter``: 0 - No filter, 1 - Only shows the first character and replaces the rest with asterisks. The default is level 0.

    Returns the most likely transcription if ``show_all`` is false (the default). Otherwise, returns the raw API response as a JSON dictionary.

    Raises a ``speech_recognition.UnknownValueError`` exception if the speech is unintelligible. Raises a ``speech_recognition.RequestError`` exception if the speech recognition operation failed, if the key isn't valid, or if there is no internet connection.
    """
    request_builder = create_request_builder(
        endpoint=endpoint, key=key, language=language, filter_level=pfilter
    )
    request = request_builder.build(audio_data)

    response_text = obtain_transcription(
        request, timeout=recognizer.operation_timeout
    )

    output_parser = OutputParser(
        show_all=show_all, with_confidence=with_confidence
    )
    return output_parser.parse(response_text)
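A minimal usage sketch for this recognizer, assuming the module is importable as ``speech_recognition.recognizers.google`` (file paths are not shown in this diff) and that a 16-bit WAV file named ``audio.wav`` exists:

# Usage sketch; the module path and audio file name are assumptions.
import speech_recognition as sr
from speech_recognition.recognizers import google

r = sr.Recognizer()
with sr.AudioFile("audio.wav") as source:
    audio = r.record(source)

# recognize_legacy reads ``operation_timeout`` from the recognizer it is given.
print(google.recognize_legacy(r, audio, language="en-US"))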
@@ -0,0 +1,142 @@
from __future__ import annotations

from typing import TYPE_CHECKING, TypedDict
from urllib.error import URLError

from speech_recognition.audio import AudioData
from speech_recognition.exceptions import RequestError, UnknownValueError

if TYPE_CHECKING:
    from google.cloud.speech import (
        RecognitionConfig,
        RecognizeResponse,
        SpeechContext,
    )
    from typing_extensions import Required


class GoogleCloudRecognizerParameters(TypedDict, total=False):
    """Optional parameters.

    The recognition language is determined by ``language_code``, which is a BCP-47 language tag like ``"en-US"`` (US English). Default: ``"en-US"``.
    A list of supported language tags can be found in the `Speech-to-Text supported languages <https://cloud.google.com/speech/docs/languages>`__.

    If ``preferred_phrases`` is an iterable of phrase strings, those given phrases will be more likely to be recognized over similar-sounding alternatives.
    This is useful for things like keyword/command recognition or adding new phrases that aren't in Google's vocabulary.
    Note that the API imposes certain `restrictions on the list of phrase strings <https://cloud.google.com/speech/limits#content>`__.

    ``show_all``: See :py:func:`recognize`.

    ``model``: You can select a model to get the best results. (See `RecognitionConfig's documentation <https://cloud.google.com/python/docs/reference/speech/latest/google.cloud.speech_v1.types.RecognitionConfig>`__ for detail.)

    ``use_enhanced``: Set to true to use an enhanced model for speech recognition.
    """

    # SpeechRecognition specific parameters
    preferred_phrases: list[str]
    show_all: bool

    # Speech-to-Text V1 API's parameters
    language_code: str
    model: str
    use_enhanced: bool
    # TODO Add support for other parameters


class GoogleCloudSpeechV1Parameters(TypedDict, total=False):
    """Speech-to-Text V1 API's parameters.

    https://cloud.google.com/python/docs/reference/speech/latest/google.cloud.speech_v1.types.RecognitionConfig
    """

    encoding: Required[RecognitionConfig.AudioEncoding]
    sample_rate_hertz: Required[int]
    language_code: Required[str]
    speech_contexts: list[SpeechContext]
    enable_word_time_offsets: bool
    model: str
    use_enhanced: bool


def _build_config(
    audio_data: AudioData, recognizer_params: GoogleCloudRecognizerParameters
) -> RecognitionConfig:
    from google.cloud import speech

    parameters: GoogleCloudSpeechV1Parameters = {
        "encoding": speech.RecognitionConfig.AudioEncoding.FLAC,
        "sample_rate_hertz": audio_data.sample_rate,
        "language_code": recognizer_params.pop("language_code", "en-US"),
    }
    if preferred_phrases := recognizer_params.pop("preferred_phrases", None):
        parameters["speech_contexts"] = [
            speech.SpeechContext(phrases=preferred_phrases)
        ]
    if recognizer_params.pop("show_all", False):
        # ref: https://cloud.google.com/speech-to-text/docs/async-time-offsets
        parameters["enable_word_time_offsets"] = True
    return speech.RecognitionConfig(**(parameters | recognizer_params))


def recognize(
    recognizer,
    audio_data: AudioData,
    credentials_json_path: str | None = None,
    **kwargs: GoogleCloudRecognizerParameters,
) -> str | RecognizeResponse:
    """Performs speech recognition on ``audio_data`` (an ``AudioData`` instance), using the Google Cloud Speech-to-Text V1 API.

    This function requires a Google Cloud Platform account; see the `Set up Speech-to-Text <https://cloud.google.com/speech-to-text/docs/before-you-begin>`__ guide for details and instructions. Basically, create a project, enable billing for the project, and enable the Google Cloud Speech API for the project.
    Then create local authentication credentials for your user account. The result is a JSON file containing the API credentials. You can specify the JSON file by ``credentials_json_path``. If not specified, the library will try to automatically `find the default API credentials JSON file <https://developers.google.com/identity/protocols/application-default-credentials>`__.

    Returns the most likely transcription if ``show_all`` is False (the default). Otherwise, returns the raw API response as a JSON dictionary.
    For other parameters, see :py:class:`GoogleCloudRecognizerParameters`.

    Raises a ``speech_recognition.UnknownValueError`` exception if the speech is unintelligible. Raises a ``speech_recognition.RequestError`` exception if the speech recognition operation failed, if the credentials aren't valid, or if there is no Internet connection.
    """
    try:
        from google.api_core.exceptions import GoogleAPICallError
        from google.cloud import speech
    except ImportError:
        raise RequestError(
            "missing google-cloud-speech module: ensure that google-cloud-speech is set up correctly."
        )

    client = (
        speech.SpeechClient.from_service_account_json(credentials_json_path)
        if credentials_json_path
        else speech.SpeechClient()
    )

    flac_data = audio_data.get_flac_data(
        # audio sample rate must be between 8 kHz and 48 kHz inclusive - clamp sample rate into this range
        convert_rate=(
            None
            if 8000 <= audio_data.sample_rate <= 48000
            else max(8000, min(audio_data.sample_rate, 48000))
        ),
        convert_width=2,  # audio samples must be 16-bit
    )
    audio = speech.RecognitionAudio(content=flac_data)

    config = _build_config(audio_data, kwargs.copy())

    try:
        response = client.recognize(config=config, audio=audio)
    except GoogleAPICallError as e:
        raise RequestError(e)
    except URLError as e:
        raise RequestError(
            "recognition connection failed: {0}".format(e.reason)
        )

    if kwargs.get("show_all"):
        return response
    if len(response.results) == 0:
        raise UnknownValueError()

    transcript = " ".join(
        result.alternatives[0].transcript.strip()
        for result in response.results
    )
    return transcript
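A usage sketch for the Cloud recognizer above, assuming the module is importable as ``speech_recognition.recognizers.google_cloud`` and that ``google-cloud-speech`` is installed; the credentials path and audio file name are placeholders:

# Usage sketch; module path, credential path, and file name are assumptions.
import speech_recognition as sr
from speech_recognition.recognizers import google_cloud

r = sr.Recognizer()
with sr.AudioFile("audio.wav") as source:
    audio = r.record(source)

transcript = google_cloud.recognize(
    r,
    audio,
    credentials_json_path="service-account.json",  # or None to use application default credentials
    preferred_phrases=["speech recognition"],
)
print(transcript)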
@@ -0,0 +1,111 @@
from __future__ import annotations

import os
from collections.abc import Sequence

from speech_recognition import PortableNamedTemporaryFile
from speech_recognition.audio import AudioData
from speech_recognition.exceptions import RequestError, UnknownValueError

AcousticParametersDirectoryPath = str
LanguageModelFilePath = str
PhonemeDictionaryFilePath = str
SphinxDataFilePaths = tuple[AcousticParametersDirectoryPath, LanguageModelFilePath, PhonemeDictionaryFilePath]

Keyword = str
Sensitivity = float
KeywordEntry = tuple[Keyword, Sensitivity]


def recognize(
    recognizer,
    audio_data: AudioData,
    language: str | SphinxDataFilePaths = "en-US",
    keyword_entries: Sequence[KeywordEntry] | None = None,
    grammar: str | None = None,
    show_all: bool = False,
):
    """
    Performs speech recognition on ``audio_data`` (an ``AudioData`` instance), using CMU Sphinx.

    The recognition language is determined by ``language``, an RFC5646 language tag like ``"en-US"`` or ``"en-GB"``, defaulting to US English. Out of the box, only ``en-US`` is supported. See `Notes on using PocketSphinx <https://github.com/Uberi/speech_recognition/blob/master/reference/pocketsphinx.rst>`__ for information about installing other languages. This document is also included under ``reference/pocketsphinx.rst``. The ``language`` parameter can also be a tuple of filesystem paths, of the form ``(acoustic_parameters_directory, language_model_file, phoneme_dictionary_file)`` - this allows you to load arbitrary Sphinx models.

    If specified, the keywords to search for are determined by ``keyword_entries``, an iterable of tuples of the form ``(keyword, sensitivity)``, where ``keyword`` is a phrase, and ``sensitivity`` is how sensitive to this phrase the recognizer should be, on a scale of 0 (very insensitive, more false negatives) to 1 (very sensitive, more false positives) inclusive. If not specified or ``None``, no keywords are used and Sphinx will simply transcribe whatever words it recognizes. Specifying ``keyword_entries`` is more accurate than just looking for those same keywords in non-keyword-based transcriptions, because Sphinx knows specifically what sounds to look for.

    Sphinx can also handle FSG or JSGF grammars. The parameter ``grammar`` expects a path to the grammar file. Note that if a JSGF grammar is passed, an FSG grammar will be created at the same location to speed up execution in the next run. If ``keyword_entries`` are passed, the content of ``grammar`` will be ignored.

    Returns the most likely transcription if ``show_all`` is false (the default). Otherwise, returns the Sphinx ``pocketsphinx.pocketsphinx.Decoder`` object resulting from the recognition.

    Raises a ``speech_recognition.UnknownValueError`` exception if the speech is unintelligible. Raises a ``speech_recognition.RequestError`` exception if there are any issues with the Sphinx installation.
    """
    # TODO Move this validation into KeywordEntry initialization
    assert keyword_entries is None or all(isinstance(keyword, (type(""), type(u""))) and 0 <= sensitivity <= 1 for keyword, sensitivity in keyword_entries), "``keyword_entries`` must be ``None`` or a list of pairs of strings and numbers between 0 and 1"

    try:
        from pocketsphinx import FsgModel, Jsgf, pocketsphinx
    except ImportError:
        raise RequestError("missing PocketSphinx module: ensure that PocketSphinx is set up correctly.")

    if isinstance(language, str):  # directory containing language data
        language_directory = os.path.join(os.path.dirname(os.path.dirname(os.path.realpath(__file__))), "pocketsphinx-data", language)
        if not os.path.isdir(language_directory):
            raise RequestError("missing PocketSphinx language data directory: \"{}\"".format(language_directory))
        acoustic_parameters_directory = os.path.join(language_directory, "acoustic-model")
        language_model_file = os.path.join(language_directory, "language-model.lm.bin")
        phoneme_dictionary_file = os.path.join(language_directory, "pronounciation-dictionary.dict")
    else:  # 3-tuple of Sphinx data file paths
        acoustic_parameters_directory, language_model_file, phoneme_dictionary_file = language
    if not os.path.isdir(acoustic_parameters_directory):
        raise RequestError("missing PocketSphinx language model parameters directory: \"{}\"".format(acoustic_parameters_directory))
    if not os.path.isfile(language_model_file):
        raise RequestError("missing PocketSphinx language model file: \"{}\"".format(language_model_file))
    if not os.path.isfile(phoneme_dictionary_file):
        raise RequestError("missing PocketSphinx phoneme dictionary file: \"{}\"".format(phoneme_dictionary_file))

    # create decoder object
    config = pocketsphinx.Config()
    config.set_string("-hmm", acoustic_parameters_directory)  # set the path of the hidden Markov model (HMM) parameter files
    config.set_string("-lm", language_model_file)
    config.set_string("-dict", phoneme_dictionary_file)
    config.set_string("-logfn", os.devnull)  # disable logging (logging causes unwanted output in terminal)
    decoder = pocketsphinx.Decoder(config)

    # obtain audio data
    raw_data = audio_data.get_raw_data(convert_rate=16000, convert_width=2)  # the included language models require audio to be 16-bit mono 16 kHz in little-endian format

    # obtain recognition results
    if keyword_entries is not None:  # explicitly specified set of keywords
        with PortableNamedTemporaryFile("w") as f:
            # generate a keywords file - Sphinx documentation recommends sensitivities between 1e-50 and 1e-5
            f.writelines("{} /1e{}/\n".format(keyword, 100 * sensitivity - 110) for keyword, sensitivity in keyword_entries)
            f.flush()

            # perform the speech recognition with the keywords file (this is inside the context manager so the file isn't deleted until we're done)
            decoder.add_kws("keywords", f.name)
            decoder.activate_search("keywords")
    elif grammar is not None:  # a path to an FSG or JSGF grammar
        if not os.path.exists(grammar):
            raise ValueError("Grammar '{0}' does not exist.".format(grammar))
        grammar_path = os.path.abspath(os.path.dirname(grammar))
        grammar_name = os.path.splitext(os.path.basename(grammar))[0]
        fsg_path = "{0}/{1}.fsg".format(grammar_path, grammar_name)
        if not os.path.exists(fsg_path):  # create FSG grammar if not available
            jsgf = Jsgf(grammar)
            rule = jsgf.get_rule("{0}.{0}".format(grammar_name))
            fsg = jsgf.build_fsg(rule, decoder.get_logmath(), 7.5)
            fsg.writefile(fsg_path)
        else:
            fsg = FsgModel(fsg_path, decoder.get_logmath(), 7.5)
        decoder.set_fsg(grammar_name, fsg)
        decoder.set_search(grammar_name)

    decoder.start_utt()  # begin utterance processing
    decoder.process_raw(raw_data, False, True)  # process audio data with recognition enabled (no_search = False), as a full utterance (full_utt = True)
    decoder.end_utt()  # stop utterance processing

    if show_all:
        return decoder

    # return results
    hypothesis = decoder.hyp()
    if hypothesis is not None:
        return hypothesis.hypstr
    raise UnknownValueError()  # no transcriptions available
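A keyword-spotting sketch for the Sphinx recognizer above, assuming the module is importable as ``speech_recognition.recognizers.pocketsphinx`` and that the ``pocketsphinx`` package plus the bundled ``en-US`` data are installed:

# Usage sketch; module path and audio file name are assumptions.
import speech_recognition as sr
from speech_recognition.recognizers import pocketsphinx

r = sr.Recognizer()
with sr.AudioFile("audio.wav") as source:
    audio = r.record(source)

# Sensitivities are on the 0..1 scale described in the docstring above.
transcript = pocketsphinx.recognize(
    r, audio, keyword_entries=[("hello", 0.8), ("stop", 0.5)]
)
print(transcript)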
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,22 @@
from io import BytesIO

from speech_recognition.audio import AudioData


class OpenAICompatibleRecognizer:
    def __init__(self, client) -> None:
        self.client = client

    def recognize(self, audio_data: "AudioData", model: str, **kwargs) -> str:
        if not isinstance(audio_data, AudioData):
            raise ValueError(
                "``audio_data`` must be an ``AudioData`` instance"
            )

        wav_data = BytesIO(audio_data.get_wav_data())
        wav_data.name = "SpeechRecognition_audio.wav"

        transcript = self.client.audio.transcriptions.create(
            file=wav_data, model=model, **kwargs
        )
        return transcript.text
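The class only depends on the ``client.audio.transcriptions.create(...)`` interface, so any OpenAI-compatible SDK client can be plugged in. A sketch, assuming the ``openai`` package is installed and ``OPENAI_API_KEY`` is set:

# Sketch only; assumes openai is installed and OPENAI_API_KEY is set.
import openai
import speech_recognition as sr
from speech_recognition.recognizers.whisper_api.base import OpenAICompatibleRecognizer

r = sr.Recognizer()
with sr.AudioFile("audio.wav") as source:
    audio = r.record(source)

recognizer = OpenAICompatibleRecognizer(openai.OpenAI())
print(recognizer.recognize(audio, "whisper-1"))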
@@ -0,0 +1,54 @@
from __future__ import annotations

from typing import Literal, TypedDict

from typing_extensions import Unpack

from speech_recognition.audio import AudioData
from speech_recognition.exceptions import SetupError
from speech_recognition.recognizers.whisper_api.base import (
    OpenAICompatibleRecognizer,
)

# https://console.groq.com/docs/speech-text#supported-models
GroqModel = Literal[
    "whisper-large-v3-turbo", "whisper-large-v3", "distil-whisper-large-v3-en"
]


class GroqOptionalParameters(TypedDict, total=False):
    """Groq speech transcription's optional parameters.

    https://console.groq.com/docs/speech-text#transcription-endpoint-usage
    """

    prompt: str
    response_format: str
    temperature: float
    language: str


def recognize(
    recognizer,
    audio_data: "AudioData",
    *,
    model: GroqModel = "whisper-large-v3-turbo",
    **kwargs: Unpack[GroqOptionalParameters],
) -> str:
    """Performs speech recognition on ``audio_data`` (an ``AudioData`` instance), using the Groq Whisper API.

    This function requires a Groq account; visit https://console.groq.com/login, then generate an API key on the `API Keys <https://console.groq.com/keys>`__ page.

    Details: https://console.groq.com/docs/speech-text

    Set the environment variable ``GROQ_API_KEY``; otherwise the groq library will raise a ``groq.GroqError``.
    """
    try:
        import groq
    except ImportError:
        raise SetupError(
            "missing groq module: ensure that groq is set up correctly."
        )

    groq_recognizer = OpenAICompatibleRecognizer(groq.Groq())
    return groq_recognizer.recognize(audio_data, model, **kwargs)
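A usage sketch for the Groq recognizer above, assuming this module is importable as ``speech_recognition.recognizers.whisper_api.groq`` and that ``GROQ_API_KEY`` is set:

# Sketch only; module path and audio file name are assumptions.
import speech_recognition as sr
from speech_recognition.recognizers.whisper_api import groq as groq_whisper

r = sr.Recognizer()
with sr.AudioFile("audio.wav") as source:
    audio = r.record(source)

# The recognizer argument is unused by this function, so None is acceptable.
print(groq_whisper.recognize(None, audio, model="whisper-large-v3"))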
@@ -0,0 +1,83 @@
from __future__ import annotations

from typing import Literal, TypedDict

from typing_extensions import Unpack

from speech_recognition.audio import AudioData
from speech_recognition.exceptions import SetupError
from speech_recognition.recognizers.whisper_api.base import (
    OpenAICompatibleRecognizer,
)

# https://platform.openai.com/docs/api-reference/audio/createTranscription#audio-createtranscription-model
WhisperModel = Literal[
    "whisper-1", "gpt-4o-transcribe", "gpt-4o-mini-transcribe"
]


class OpenAIOptionalParameters(TypedDict, total=False):
    """OpenAI speech transcription's optional parameters.

    https://platform.openai.com/docs/api-reference/audio/createTranscription
    """

    language: str
    prompt: str
    # TODO Add support for `Literal["text", "srt", "verbose_json", "vtt"]`
    response_format: Literal["json"]
    temperature: float
    # timestamp_granularities  # TODO support


def recognize(
    recognizer,
    audio_data: "AudioData",
    *,
    model: WhisperModel = "whisper-1",
    **kwargs: Unpack[OpenAIOptionalParameters],
) -> str:
    """Performs speech recognition on ``audio_data`` (an ``AudioData`` instance), using the OpenAI Whisper API.

    This function requires an OpenAI account; visit https://platform.openai.com/signup, then generate an API key in `User settings <https://platform.openai.com/account/api-keys>`__.

    Details: https://platform.openai.com/docs/guides/speech-to-text

    Set the environment variable ``OPENAI_API_KEY``; otherwise the openai library will raise an ``openai.OpenAIError``.
    """
    try:
        import openai
    except ImportError:
        raise SetupError(
            "missing openai module: ensure that openai is set up correctly."
        )

    openai_recognizer = OpenAICompatibleRecognizer(openai.OpenAI())
    return openai_recognizer.recognize(audio_data, model, **kwargs)


if __name__ == "__main__":
    import argparse
    from typing import get_args

    import speech_recognition as sr

    parser = argparse.ArgumentParser()
    parser.add_argument("audio_file")
    parser.add_argument(
        "--model", choices=get_args(WhisperModel), default="whisper-1"
    )
    parser.add_argument("-l", "--language")
    args = parser.parse_args()

    r = sr.Recognizer()
    with sr.AudioFile(args.audio_file) as source:
        audio_data = r.listen(source)

    if args.language:
        transcription = recognize(
            None, audio_data, model=args.model, language=args.language
        )
    else:
        transcription = recognize(None, audio_data, model=args.model)
    print(transcription)
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,45 @@
from __future__ import annotations

import io
from typing import TYPE_CHECKING, Any, Protocol

from speech_recognition.audio import AudioData

if TYPE_CHECKING:
    import numpy as np


class Transcribable(Protocol):
    def transcribe(
        self, audio_array: np.ndarray, **kwargs
    ) -> str | dict[str, Any]:
        pass


class WhisperCompatibleRecognizer:
    def __init__(self, model: Transcribable) -> None:
        self.model = model

    def recognize(
        self, audio_data: AudioData, show_dict: bool = False, **kwargs
    ):
        if not isinstance(audio_data, AudioData):
            raise ValueError(
                "``audio_data`` must be an ``AudioData`` instance"
            )

        import numpy as np
        import soundfile as sf

        # 16 kHz https://github.com/openai/whisper/blob/28769fcfe50755a817ab922a7bc83483159600a9/whisper/audio.py#L98-L99
        wav_bytes = audio_data.get_wav_data(convert_rate=16000)
        wav_stream = io.BytesIO(wav_bytes)
        audio_array, sampling_rate = sf.read(wav_stream)
        audio_array = audio_array.astype(np.float32)

        result = self.model.transcribe(audio_array, **kwargs)

        if show_dict:
            return result
        else:
            return result["text"]
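Any object with a matching ``transcribe`` method satisfies the ``Transcribable`` protocol. A toy sketch showing the adapter contract (the dummy model below is purely illustrative, not part of the library):

# Toy sketch: a fake model that satisfies the Transcribable protocol.
import numpy as np
from speech_recognition.recognizers.whisper_local.base import WhisperCompatibleRecognizer

class EchoModel:
    def transcribe(self, audio_array: np.ndarray, **kwargs) -> dict:
        # A real model would run inference here; this just reports the sample count.
        return {"text": f"<{audio_array.shape[0]} samples>"}

recognizer = WhisperCompatibleRecognizer(EchoModel())
# recognizer.recognize(audio_data) would return "<N samples>" for any AudioData.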
@@ -0,0 +1,106 @@
from __future__ import annotations

from typing import TYPE_CHECKING, Literal, TypedDict

from speech_recognition.audio import AudioData
from speech_recognition.recognizers.whisper_local.base import (
    WhisperCompatibleRecognizer,
)

if TYPE_CHECKING:
    import numpy as np
    from faster_whisper import WhisperModel
    from faster_whisper.transcribe import Segment
    from typing_extensions import Unpack


class TranscribeOutput(TypedDict):
    text: str
    segments: list[Segment]
    language: str


class TranscribableAdapter:
    def __init__(self, model: WhisperModel) -> None:
        self.model = model

    def transcribe(
        self, audio_array: np.ndarray, **kwargs
    ) -> TranscribeOutput:
        segments_generator, info = self.model.transcribe(audio_array, **kwargs)
        segments = list(segments_generator)
        return {
            "text": " ".join(segment.text for segment in segments),
            "segments": segments,
            "language": info.language,
        }


class InitOptionalParameters(TypedDict, total=False):
    # https://github.com/SYSTRAN/faster-whisper/blob/v1.1.0/faster_whisper/transcribe.py#L575
    device: Literal["cpu", "cuda", "auto"]
    compute_type: str
    download_root: str
    # TODO Add others


class TranscribeOptionalParameters(TypedDict, total=False):
    # https://github.com/SYSTRAN/faster-whisper/blob/v1.1.0/faster_whisper/transcribe.py#L692
    language: str
    task: Literal["transcribe", "translate"]
    beam_size: int
    # TODO Add others


def recognize(
    recognizer,
    audio_data: AudioData,
    model: str = "base",
    show_dict: bool = False,
    init_options: InitOptionalParameters | None = None,
    **transcribe_options: Unpack[TranscribeOptionalParameters],
) -> str | TranscribeOutput:
    """Performs speech recognition on ``audio_data`` (an ``AudioData`` instance), using Whisper.

    Pick a ``model`` size (same as Whisper).

    If ``show_dict`` is true, returns the detailed response from Whisper, including the detected language. Otherwise returns only the transcription.

    You can specify:

    * ``language``: recognition language, an uncapitalized 2-letter language code like "en" or "fr".

      * If not set, Faster Whisper will automatically detect the language.

    * ``task``

      * If you want transcribe + **translate** to English, set ``task="translate"``.

    Other values are passed directly to whisper. See https://github.com/SYSTRAN/faster-whisper/blob/master/faster_whisper/transcribe.py for all options.
    """
    from faster_whisper import WhisperModel

    model = WhisperModel(model, **init_options or {})
    whisper_recognizer = WhisperCompatibleRecognizer(
        TranscribableAdapter(model)
    )
    return whisper_recognizer.recognize(
        audio_data, show_dict=show_dict, **transcribe_options
    )


if __name__ == "__main__":
    import argparse

    import speech_recognition as sr

    parser = argparse.ArgumentParser()
    parser.add_argument("audio_file")
    args = parser.parse_args()

    r = sr.Recognizer()
    with sr.AudioFile(args.audio_file) as source:
        audio_data = r.listen(source)

    transcription = recognize(None, audio_data)
    print(transcription)
@@ -0,0 +1,108 @@
from __future__ import annotations

from typing import TYPE_CHECKING, Literal, TypedDict

from speech_recognition.audio import AudioData
from speech_recognition.recognizers.whisper_local.base import (
    WhisperCompatibleRecognizer,
)

if TYPE_CHECKING:
    import numpy as np
    import torch
    from typing_extensions import Unpack
    from whisper import Whisper


class LoadModelOptionalParameters(TypedDict, total=False):
    # ref: https://github.com/openai/whisper/blob/v20240930/whisper/__init__.py#L103
    device: str | torch.device
    download_root: str
    in_memory: bool


class TranscribeOptionalParameters(TypedDict, total=False):
    """Transcribe optional parameters & DecodingOptions parameters."""

    # ref: https://github.com/openai/whisper/blob/v20240930/whisper/transcribe.py#L38
    temperature: float | tuple[float, ...]
    # TODO Add others

    # ref: https://github.com/openai/whisper/blob/v20240930/whisper/decoding.py#L81
    task: Literal["transcribe", "translate"]
    language: str
    fp16: bool
    # TODO Add others


class Segment(TypedDict):
    id: int
    seek: int
    start: float
    end: float
    text: str
    tokens: list[int]
    temperature: float
    avg_logprob: float
    compression_ratio: float
    no_speech_prob: float


class TranscribeOutput(TypedDict):
    text: str
    segments: list[Segment]
    language: str


class TranscribableAdapter:
    def __init__(self, model: Whisper) -> None:
        self.model = model

    def transcribe(
        self, audio_array: np.ndarray, **kwargs
    ) -> TranscribeOutput:
        if "fp16" not in kwargs:
            import torch

            kwargs["fp16"] = torch.cuda.is_available()

        return self.model.transcribe(audio_array, **kwargs)


def recognize(
    recognizer,
    audio_data: AudioData,
    model: str = "base",
    show_dict: bool = False,
    load_options: LoadModelOptionalParameters | None = None,
    **transcribe_options: Unpack[TranscribeOptionalParameters],
) -> str | TranscribeOutput:
    """Performs speech recognition on ``audio_data`` (an ``AudioData`` instance), using Whisper.

    Pick ``model`` from the output of :command:`python -c 'import whisper; print(whisper.available_models())'`.
    See also https://github.com/openai/whisper?tab=readme-ov-file#available-models-and-languages.

    If ``show_dict`` is true, returns the full dict response from Whisper, including the detected language. Otherwise returns only the transcription.

    You can specify:

    * ``language``: recognition language, an uncapitalized full language name like "english" or "chinese". See the full language list at https://github.com/openai/whisper/blob/main/whisper/tokenizer.py

      * If not set, Whisper will automatically detect the language.

    * ``task``

      * If you want transcribe + **translate** to English, set ``task="translate"``.

    Other values are passed directly to whisper. See https://github.com/openai/whisper/blob/main/whisper/transcribe.py for all options.
    """

    import whisper

    whisper_model = whisper.load_model(model, **load_options or {})
    whisper_recognizer = WhisperCompatibleRecognizer(
        TranscribableAdapter(whisper_model)
    )
    return whisper_recognizer.recognize(
        audio_data, show_dict=show_dict, **transcribe_options
    )
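A usage sketch for the recognize function above, mirroring the command-line block in the faster-whisper module and assuming the openai-whisper package is installed:

# Sketch only; the audio file name is a placeholder.
import speech_recognition as sr

r = sr.Recognizer()
with sr.AudioFile("audio.wav") as source:
    audio_data = r.listen(source)

print(recognize(None, audio_data, model="base", language="english"))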