first commit

2025-04-04 13:23:15 -06:00
commit 216064f731
2103 changed files with 522593 additions and 0 deletions


@@ -0,0 +1,262 @@
from __future__ import annotations
import json
from typing import Dict, Literal, TypedDict
from urllib.error import HTTPError, URLError
from urllib.parse import urlencode
from urllib.request import Request, urlopen
from typing_extensions import NotRequired
from speech_recognition.audio import AudioData
from speech_recognition.exceptions import RequestError, UnknownValueError
class Alternative(TypedDict):
transcript: str
confidence: float
class Result(TypedDict):
alternative: list[Alternative]
final: bool
class GoogleResponse(TypedDict):
result: list[Result]
result_index: NotRequired[int]
ProfanityFilterLevel = Literal[0, 1]
RequestHeaders = Dict[str, str]
ENDPOINT = "http://www.google.com/speech-api/v2/recognize"
class RequestBuilder:
def __init__(
self,
*,
endpoint: str,
key: str,
language: str,
filter_level: ProfanityFilterLevel,
) -> None:
self.endpoint = endpoint
self.key = key
self.language = language
self.filter_level = filter_level
def build(self, audio_data: AudioData) -> Request:
if not isinstance(audio_data, AudioData):
raise ValueError("``audio_data`` must be an ``AudioData`` instance")
url = self.build_url()
headers = self.build_headers(audio_data)
flac_data = self.build_data(audio_data)
request = Request(url, data=flac_data, headers=headers)
return request
def build_url(self) -> str:
"""
>>> builder = RequestBuilder(endpoint="http://www.google.com/speech-api/v2/recognize", key="awesome-key", language="en-US", filter_level=0)
>>> builder.build_url()
'http://www.google.com/speech-api/v2/recognize?client=chromium&lang=en-US&key=awesome-key&pFilter=0'
"""
params = urlencode(
{
"client": "chromium",
"lang": self.language,
"key": self.key,
"pFilter": self.filter_level,
}
)
return f"{self.endpoint}?{params}"
def build_headers(self, audio_data: AudioData) -> RequestHeaders:
"""
>>> builder = RequestBuilder(endpoint="", key="", language="", filter_level=1)
>>> audio_data = AudioData(b"", 16_000, 1)
>>> builder.build_headers(audio_data)
{'Content-Type': 'audio/x-flac; rate=16000'}
"""
rate = audio_data.sample_rate
headers = {"Content-Type": f"audio/x-flac; rate={rate}"}
return headers
def build_data(self, audio_data: AudioData) -> bytes:
flac_data = audio_data.get_flac_data(
convert_rate=self.to_convert_rate(audio_data.sample_rate),
convert_width=2, # audio samples must be 16-bit
)
return flac_data
@staticmethod
def to_convert_rate(sample_rate: int) -> int | None:
"""Audio sample rate must be at least 8 kHz; returns ``None`` when no conversion is needed.
>>> RequestBuilder.to_convert_rate(16_000)
>>> RequestBuilder.to_convert_rate(8_000)
>>> RequestBuilder.to_convert_rate(7_999)
8000
"""
return None if sample_rate >= 8000 else 8000
def create_request_builder(
*,
endpoint: str,
key: str | None = None,
language: str = "en-US",
filter_level: ProfanityFilterLevel = 0,
) -> RequestBuilder:
if not isinstance(language, str):
raise ValueError("``language`` must be a string")
if key is not None and not isinstance(key, str):
raise ValueError("``key`` must be ``None`` or a string")
if key is None:
key = "AIzaSyBOti4mM-6x9WDnZIjIeyEU21OpBXqWBgw"
return RequestBuilder(
endpoint=endpoint,
key=key,
language=language,
filter_level=filter_level,
)
class OutputParser:
def __init__(self, *, show_all: bool, with_confidence: bool) -> None:
self.show_all = show_all
self.with_confidence = with_confidence
def parse(self, response_text: str):
actual_result = self.convert_to_result(response_text)
if self.show_all:
return actual_result
best_hypothesis = self.find_best_hypothesis(
actual_result["alternative"]
)
# https://cloud.google.com/speech-to-text/docs/basics#confidence-values
# "Your code should not require the confidence field as it is not guaranteed to be accurate, or even set, in any of the results."
confidence = best_hypothesis.get("confidence", 0.5)
if self.with_confidence:
return best_hypothesis["transcript"], confidence
return best_hypothesis["transcript"]
@staticmethod
def convert_to_result(response_text: str) -> Result:
r"""
>>> response_text = '''{"result":[]}
... {"result":[{"alternative":[{"transcript":"one two three","confidence":0.49585345},{"transcript":"1 2","confidence":0.42899391}],"final":true}],"result_index":0}
... '''
>>> OutputParser.convert_to_result(response_text)
{'alternative': [{'transcript': 'one two three', 'confidence': 0.49585345}, {'transcript': '1 2', 'confidence': 0.42899391}], 'final': True}
>>> OutputParser.convert_to_result("")
Traceback (most recent call last):
...
speech_recognition.exceptions.UnknownValueError
>>> OutputParser.convert_to_result('\n{"result":[]}')
Traceback (most recent call last):
...
speech_recognition.exceptions.UnknownValueError
>>> OutputParser.convert_to_result('{"result":[{"foo": "bar"}]}')
Traceback (most recent call last):
...
speech_recognition.exceptions.UnknownValueError
>>> OutputParser.convert_to_result('{"result":[{"alternative": []}]}')
Traceback (most recent call last):
...
speech_recognition.exceptions.UnknownValueError
"""
# ignore any blank blocks
for line in response_text.split("\n"):
if not line:
continue
result: list[Result] = json.loads(line)["result"]
if len(result) != 0:
if len(result[0].get("alternative", [])) == 0:
raise UnknownValueError()
return result[0]
raise UnknownValueError()
@staticmethod
def find_best_hypothesis(alternatives: list[Alternative]) -> Alternative:
"""
>>> alternatives = [{"transcript": "one two three", "confidence": 0.42899391}, {"transcript": "1 2", "confidence": 0.49585345}]
>>> OutputParser.find_best_hypothesis(alternatives)
{'transcript': 'one two three', 'confidence': 0.42899391}
>>> alternatives = [{"confidence": 0.49585345}]
>>> OutputParser.find_best_hypothesis(alternatives)
Traceback (most recent call last):
...
speech_recognition.exceptions.UnknownValueError
"""
if "confidence" in alternatives:
# BUG: actual_result["alternative"] (=alternatives) is list, not dict
# return alternative with highest confidence score
best_hypothesis: Alternative = max(
alternatives,
key=lambda alternative: alternative["confidence"],
)
else:
# when there is no confidence available, we arbitrarily choose the first hypothesis.
best_hypothesis: Alternative = alternatives[0]
if "transcript" not in best_hypothesis:
raise UnknownValueError()
return best_hypothesis
def obtain_transcription(request: Request, timeout: int) -> str:
try:
response = urlopen(request, timeout=timeout)
except HTTPError as e:
raise RequestError("recognition request failed: {}".format(e.reason))
except URLError as e:
raise RequestError(
"recognition connection failed: {}".format(e.reason)
)
return response.read().decode("utf-8")
def recognize_legacy(
recognizer,
audio_data: AudioData,
key: str | None = None,
language: str = "en-US",
pfilter: ProfanityFilterLevel = 0,
show_all: bool = False,
with_confidence: bool = False,
*,
endpoint: str = ENDPOINT,
):
"""Performs speech recognition on ``audio_data`` (an ``AudioData`` instance), using the Google Speech Recognition API.
The Google Speech Recognition API key is specified by ``key``. If not specified, it uses a generic key that works out of the box. This should generally be used for personal or testing purposes only, as it **may be revoked by Google at any time**.
To obtain your own API key, simply follow the steps on the `API Keys <http://www.chromium.org/developers/how-tos/api-keys>`__ page at the Chromium Developers site. In the Google Developers Console, Google Speech Recognition is listed as "Speech API".
The recognition language is determined by ``language``, an RFC5646 language tag like ``"en-US"`` (US English) or ``"fr-FR"`` (International French), defaulting to US English. A list of supported language tags can be found in this `StackOverflow answer <http://stackoverflow.com/a/14302134>`__.
The profanity filter level can be adjusted with ``pfilter``: 0 - No filter, 1 - Only shows the first character and replaces the rest with asterisks. The default is level 0.
Returns the most likely transcription if ``show_all`` is false (the default). Otherwise, returns the raw API response as a JSON dictionary.
Raises a ``speech_recognition.UnknownValueError`` exception if the speech is unintelligible. Raises a ``speech_recognition.RequestError`` exception if the speech recognition operation failed, if the key isn't valid, or if there is no internet connection.
"""
request_builder = create_request_builder(
endpoint=endpoint, key=key, language=language, filter_level=pfilter
)
request = request_builder.build(audio_data)
response_text = obtain_transcription(
request, timeout=recognizer.operation_timeout
)
output_parser = OutputParser(
show_all=show_all, with_confidence=with_confidence
)
return output_parser.parse(response_text)
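
A minimal usage sketch for the recognizer above. The audio file name is a placeholder and the import path is an assumption (the diff does not show file names); with no ``key`` argument the built-in default key is used:

import speech_recognition as sr
from speech_recognition.recognizers.google import recognize_legacy  # import path assumed

r = sr.Recognizer()
with sr.AudioFile("english.wav") as source:  # placeholder audio file
    audio = r.record(source)

# best hypothesis only; pass show_all=True to get the raw API response instead
print(recognize_legacy(r, audio, language="en-US"))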


@@ -0,0 +1,142 @@
from __future__ import annotations
from typing import TYPE_CHECKING, TypedDict
from urllib.error import URLError
from speech_recognition.audio import AudioData
from speech_recognition.exceptions import RequestError, UnknownValueError
if TYPE_CHECKING:
from google.cloud.speech import (
RecognitionConfig,
RecognizeResponse,
SpeechContext,
)
from typing_extensions import Required
class GoogleCloudRecognizerParameters(TypedDict, total=False):
"""Optional parameters.
The recognition language is determined by ``language_code``, which is a BCP-47 language tag like ``"en-US"`` (US English). Default: ``"en-US"``.
A list of supported language tags can be found in the `Speech-to-Text supported languages <https://cloud.google.com/speech/docs/languages>`__.
If ``preferred_phrases`` is an iterable of phrase strings, those given phrases will be more likely to be recognized over similar-sounding alternatives.
This is useful for things like keyword/command recognition or adding new phrases that aren't in Google's vocabulary.
Note that the API imposes certain `restrictions on the list of phrase strings <https://cloud.google.com/speech/limits#content>`__.
``show_all``: See :py:func:`recognize`.
``model``: You can select the model to get the best results. (See `RecognitionConfig's documentation <https://cloud.google.com/python/docs/reference/speech/latest/google.cloud.speech_v1.types.RecognitionConfig>`__ for details.)
``use_enhanced``: Set to true to use an enhanced model for speech recognition.
"""
# SpeechRecognition specific parameters
preferred_phrases: list[str]
show_all: bool
# Speech-to-Text V1 API's parameters
language_code: str
model: str
use_enhanced: bool
# TODO: Add support for other parameters
class GoogleCloudSpeechV1Parameters(TypedDict, total=False):
"""Speech-to-Text V1 API's parameters.
https://cloud.google.com/python/docs/reference/speech/latest/google.cloud.speech_v1.types.RecognitionConfig
"""
encoding: Required[RecognitionConfig.AudioEncoding]
sample_rate_hertz: Required[int]
language_code: Required[str]
speech_contexts: list[SpeechContext]
enable_word_time_offsets: bool
model: str
use_enhanced: bool
def _build_config(
audio_data: AudioData, recognizer_params: GoogleCloudRecognizerParameters
) -> RecognitionConfig:
from google.cloud import speech
parameters: GoogleCloudSpeechV1Parameters = {
"encoding": speech.RecognitionConfig.AudioEncoding.FLAC,
"sample_rate_hertz": audio_data.sample_rate,
"language_code": recognizer_params.pop("language_code", "en-US"),
}
if preferred_phrases := recognizer_params.pop("preferred_phrases", None):
parameters["speech_contexts"] = [
speech.SpeechContext(phrases=preferred_phrases)
]
if recognizer_params.pop("show_all", False):
# ref: https://cloud.google.com/speech-to-text/docs/async-time-offsets
parameters["enable_word_time_offsets"] = True
return speech.RecognitionConfig(**(parameters | recognizer_params))
def recognize(
recognizer,
audio_data: AudioData,
credentials_json_path: str | None = None,
**kwargs: GoogleCloudRecognizerParameters,
) -> str | RecognizeResponse:
"""Performs speech recognition on ``audio_data`` (an ``AudioData`` instance), using the Google Cloud Speech-to-Text V1 API.
This function requires a Google Cloud Platform account; see `Set up Speech-to-Text <https://cloud.google.com/speech-to-text/docs/before-you-begin>`__ for details and instructions. Basically: create a project, enable billing for the project, and enable the Google Cloud Speech API for the project.
Then create local authentication credentials for your user account. The result is a JSON file containing the API credentials. You can specify the JSON file with ``credentials_json_path``. If not specified, the library will try to automatically `find the default API credentials JSON file <https://developers.google.com/identity/protocols/application-default-credentials>`__.
Returns the most likely transcription if ``show_all`` is False (the default). Otherwise, returns the raw API response (a ``RecognizeResponse``).
For other parameters, see :py:class:`GoogleCloudRecognizerParameters`.
Raises a ``speech_recognition.UnknownValueError`` exception if the speech is unintelligible. Raises a ``speech_recognition.RequestError`` exception if the speech recognition operation failed, if the credentials aren't valid, or if there is no Internet connection.
"""
try:
from google.api_core.exceptions import GoogleAPICallError
from google.cloud import speech
except ImportError:
raise RequestError(
"missing google-cloud-speech module: ensure that google-cloud-speech is set up correctly."
)
client = (
speech.SpeechClient.from_service_account_json(credentials_json_path)
if credentials_json_path
else speech.SpeechClient()
)
flac_data = audio_data.get_flac_data(
# audio sample rate must be between 8 kHz and 48 kHz inclusive - clamp sample rate into this range
convert_rate=(
None
if 8000 <= audio_data.sample_rate <= 48000
else max(8000, min(audio_data.sample_rate, 48000))
),
convert_width=2, # audio samples must be 16-bit
)
audio = speech.RecognitionAudio(content=flac_data)
config = _build_config(audio_data, kwargs.copy())
try:
response = client.recognize(config=config, audio=audio)
except GoogleAPICallError as e:
raise RequestError(e)
except URLError as e:
raise RequestError(
"recognition connection failed: {0}".format(e.reason)
)
if kwargs.get("show_all"):
return response
if len(response.results) == 0:
raise UnknownValueError()
transcript = " ".join(
result.alternatives[0].transcript.strip()
for result in response.results
)
return transcript
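
A hedged usage sketch for the Cloud recognizer above; the credentials path, audio file, phrase list, and import path are placeholders/assumptions (the diff omits file names):

import speech_recognition as sr
from speech_recognition.recognizers import google_cloud  # import path assumed

r = sr.Recognizer()
with sr.AudioFile("meeting.wav") as source:  # placeholder audio file
    audio = r.record(source)

text = google_cloud.recognize(
    r,
    audio,
    credentials_json_path="service-account.json",  # placeholder path
    preferred_phrases=["SpeechRecognition"],
    language_code="en-US",
)
print(text)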


@@ -0,0 +1,111 @@
from __future__ import annotations
import os
from collections.abc import Sequence
from speech_recognition import PortableNamedTemporaryFile
from speech_recognition.audio import AudioData
from speech_recognition.exceptions import RequestError, UnknownValueError
AcousticParametersDirectoryPath = str
LanguageModelFilePath = str
PhonemeDictionaryFilePath = str
SphinxDataFilePaths = tuple[AcousticParametersDirectoryPath, LanguageModelFilePath, PhonemeDictionaryFilePath]
Keyword = str
Sensitivity = float
KeywordEntry = tuple[Keyword, Sensitivity]
def recognize(
recognizer,
audio_data: AudioData,
language: str | SphinxDataFilePaths = "en-US",
keyword_entries: Sequence[KeywordEntry] | None = None,
grammar: str | None = None,
show_all: bool = False,
):
"""
Performs speech recognition on ``audio_data`` (an ``AudioData`` instance), using CMU Sphinx.
The recognition language is determined by ``language``, an RFC5646 language tag like ``"en-US"`` or ``"en-GB"``, defaulting to US English. Out of the box, only ``en-US`` is supported. See `Notes on using PocketSphinx <https://github.com/Uberi/speech_recognition/blob/master/reference/pocketsphinx.rst>`__ for information about installing other languages. This document is also included under ``reference/pocketsphinx.rst``. The ``language`` parameter can also be a tuple of filesystem paths, of the form ``(acoustic_parameters_directory, language_model_file, phoneme_dictionary_file)`` - this allows you to load arbitrary Sphinx models.
If specified, the keywords to search for are determined by ``keyword_entries``, an iterable of tuples of the form ``(keyword, sensitivity)``, where ``keyword`` is a phrase, and ``sensitivity`` is how sensitive to this phrase the recognizer should be, on a scale of 0 (very insensitive, more false negatives) to 1 (very sensitive, more false positives) inclusive. If not specified or ``None``, no keywords are used and Sphinx will simply transcribe whatever words it recognizes. Specifying ``keyword_entries`` is more accurate than just looking for those same keywords in non-keyword-based transcriptions, because Sphinx knows specifically what sounds to look for.
Sphinx can also handle FSG or JSGF grammars. The parameter ``grammar`` expects a path to the grammar file. Note that if a JSGF grammar is passed, an FSG grammar will be created at the same location to speed up execution in the next run. If ``keyword_entries`` are passed, content of ``grammar`` will be ignored.
Returns the most likely transcription if ``show_all`` is false (the default). Otherwise, returns the Sphinx ``pocketsphinx.pocketsphinx.Decoder`` object resulting from the recognition.
Raises a ``speech_recognition.UnknownValueError`` exception if the speech is unintelligible. Raises a ``speech_recognition.RequestError`` exception if there are any issues with the Sphinx installation.
"""
# TODO Move this validation into KeywordEntry initialization
assert keyword_entries is None or all(isinstance(keyword, str) and 0 <= sensitivity <= 1 for keyword, sensitivity in keyword_entries), "``keyword_entries`` must be ``None`` or a list of pairs of strings and numbers between 0 and 1"
try:
from pocketsphinx import FsgModel, Jsgf, pocketsphinx
except ImportError:
raise RequestError("missing PocketSphinx module: ensure that PocketSphinx is set up correctly.")
if isinstance(language, str): # directory containing language data
language_directory = os.path.join(os.path.dirname(os.path.dirname(os.path.realpath(__file__))), "pocketsphinx-data", language)
if not os.path.isdir(language_directory):
raise RequestError("missing PocketSphinx language data directory: \"{}\"".format(language_directory))
acoustic_parameters_directory = os.path.join(language_directory, "acoustic-model")
language_model_file = os.path.join(language_directory, "language-model.lm.bin")
phoneme_dictionary_file = os.path.join(language_directory, "pronounciation-dictionary.dict")
else: # 3-tuple of Sphinx data file paths
acoustic_parameters_directory, language_model_file, phoneme_dictionary_file = language
if not os.path.isdir(acoustic_parameters_directory):
raise RequestError("missing PocketSphinx language model parameters directory: \"{}\"".format(acoustic_parameters_directory))
if not os.path.isfile(language_model_file):
raise RequestError("missing PocketSphinx language model file: \"{}\"".format(language_model_file))
if not os.path.isfile(phoneme_dictionary_file):
raise RequestError("missing PocketSphinx phoneme dictionary file: \"{}\"".format(phoneme_dictionary_file))
# create decoder object
config = pocketsphinx.Config()
config.set_string("-hmm", acoustic_parameters_directory) # set the path of the hidden Markov model (HMM) parameter files
config.set_string("-lm", language_model_file)
config.set_string("-dict", phoneme_dictionary_file)
config.set_string("-logfn", os.devnull) # disable logging (logging causes unwanted output in terminal)
decoder = pocketsphinx.Decoder(config)
# obtain audio data
raw_data = audio_data.get_raw_data(convert_rate=16000, convert_width=2) # the included language models require audio to be 16-bit mono 16 kHz in little-endian format
# obtain recognition results
if keyword_entries is not None: # explicitly specified set of keywords
with PortableNamedTemporaryFile("w") as f:
# generate a keywords file - Sphinx documentation recommends sensitivities between 1e-50 and 1e-5
f.writelines("{} /1e{}/\n".format(keyword, 100 * sensitivity - 110) for keyword, sensitivity in keyword_entries)
f.flush()
# perform the speech recognition with the keywords file (this is inside the context manager so the file isn't deleted until we're done)
decoder.add_kws("keywords", f.name)
decoder.activate_search("keywords")
elif grammar is not None: # a path to an FSG or JSGF grammar
if not os.path.exists(grammar):
raise ValueError("Grammar '{0}' does not exist.".format(grammar))
grammar_path = os.path.abspath(os.path.dirname(grammar))
grammar_name = os.path.splitext(os.path.basename(grammar))[0]
fsg_path = "{0}/{1}.fsg".format(grammar_path, grammar_name)
if not os.path.exists(fsg_path): # create FSG grammar if not available
jsgf = Jsgf(grammar)
rule = jsgf.get_rule("{0}.{0}".format(grammar_name))
fsg = jsgf.build_fsg(rule, decoder.get_logmath(), 7.5)
fsg.writefile(fsg_path)
else:
fsg = FsgModel(fsg_path, decoder.get_logmath(), 7.5)
decoder.set_fsg(grammar_name, fsg)
decoder.set_search(grammar_name)
decoder.start_utt() # begin utterance processing
decoder.process_raw(raw_data, False, True) # process audio data with recognition enabled (no_search = False), as a full utterance (full_utt = True)
decoder.end_utt() # stop utterance processing
if show_all: return decoder
# return results
hypothesis = decoder.hyp()
if hypothesis is not None: return hypothesis.hypstr
raise UnknownValueError() # no transcriptions available
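
A short offline usage sketch for the Sphinx recognizer above, using the keyword-spotting mode described in the docstring (audio file, keywords, sensitivities, and the import path are placeholders/assumptions):

import speech_recognition as sr
from speech_recognition.recognizers import pocketsphinx as sphinx  # import path assumed

r = sr.Recognizer()
with sr.AudioFile("command.wav") as source:  # placeholder audio file
    audio = r.record(source)

# spot only these phrases instead of producing a full transcription
print(sphinx.recognize(r, audio, keyword_entries=[("hello", 0.8), ("stop", 0.5)]))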


@@ -0,0 +1,22 @@
from io import BytesIO
from speech_recognition.audio import AudioData
class OpenAICompatibleRecognizer:
def __init__(self, client) -> None:
self.client = client
def recognize(self, audio_data: "AudioData", model: str, **kwargs) -> str:
if not isinstance(audio_data, AudioData):
raise ValueError(
"``audio_data`` must be an ``AudioData`` instance"
)
wav_data = BytesIO(audio_data.get_wav_data())
wav_data.name = "SpeechRecognition_audio.wav"
transcript = self.client.audio.transcriptions.create(
file=wav_data, model=model, **kwargs
)
return transcript.text
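
The adapter above only needs a client exposing ``client.audio.transcriptions.create(file=..., model=..., **kwargs)`` that returns an object with a ``text`` attribute. A sketch using the OpenAI client, as the OpenAI and Groq modules later in this commit do (``OPENAI_API_KEY`` must be set; the audio file name is a placeholder):

import openai
import speech_recognition as sr
from speech_recognition.recognizers.whisper_api.base import OpenAICompatibleRecognizer

r = sr.Recognizer()
with sr.AudioFile("note.wav") as source:  # placeholder audio file
    audio_data = r.record(source)

recognizer = OpenAICompatibleRecognizer(openai.OpenAI())  # client reads OPENAI_API_KEY
print(recognizer.recognize(audio_data, "whisper-1", language="en"))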


@@ -0,0 +1,54 @@
from __future__ import annotations
from typing import Literal, TypedDict
from typing_extensions import Unpack
from speech_recognition.audio import AudioData
from speech_recognition.exceptions import SetupError
from speech_recognition.recognizers.whisper_api.base import (
OpenAICompatibleRecognizer,
)
# https://console.groq.com/docs/speech-text#supported-models
GroqModel = Literal[
"whisper-large-v3-turbo", "whisper-large-v3", "distil-whisper-large-v3-en"
]
class GroqOptionalParameters(TypedDict):
"""Groq speech transcription's optional parameters.
https://console.groq.com/docs/speech-text#transcription-endpoint-usage
"""
prompt: str
response_format: str
temperature: float
language: str
def recognize(
recognizer,
audio_data: "AudioData",
*,
model: GroqModel = "whisper-large-v3-turbo",
**kwargs: Unpack[GroqOptionalParameters],
) -> str:
"""Performs speech recognition on ``audio_data`` (an ``AudioData`` instance), using the Groq Whisper API.
This function requires a Groq account; visit https://console.groq.com/login, then generate an API key in the `API Keys <https://console.groq.com/keys>`__ menu.
Details: https://console.groq.com/docs/speech-text
Set the environment variable ``GROQ_API_KEY``; otherwise the groq library will raise a ``groq.GroqError``.
"""
try:
import groq
except ImportError:
raise SetupError(
"missing groq module: ensure that groq is set up correctly."
)
groq_recognizer = OpenAICompatibleRecognizer(groq.Groq())
return groq_recognizer.recognize(audio_data, model, **kwargs)
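
Usage sketch for the Groq recognizer above (``GROQ_API_KEY`` must be set; the audio file name is a placeholder and the import path is assumed). The first argument is unused by this function, so ``None`` is passed:

import speech_recognition as sr
from speech_recognition.recognizers.whisper_api import groq as groq_recognizer  # import path assumed

r = sr.Recognizer()
with sr.AudioFile("podcast.wav") as source:  # placeholder audio file
    audio = r.record(source)

print(groq_recognizer.recognize(None, audio, model="distil-whisper-large-v3-en"))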


@@ -0,0 +1,83 @@
from __future__ import annotations
from typing import Literal, TypedDict
from typing_extensions import Unpack
from speech_recognition.audio import AudioData
from speech_recognition.exceptions import SetupError
from speech_recognition.recognizers.whisper_api.base import (
OpenAICompatibleRecognizer,
)
# https://platform.openai.com/docs/api-reference/audio/createTranscription#audio-createtranscription-model
WhisperModel = Literal[
"whisper-1", "gpt-4o-transcribe", "gpt-4o-mini-transcribe"
]
class OpenAIOptionalParameters(TypedDict, total=False):
"""OpenAI speech transcription's optional parameters.
https://platform.openai.com/docs/api-reference/audio/createTranscription
"""
language: str
prompt: str
# TODO Add support `Literal["text", "srt", "verbose_json", "vtt"]`
response_format: Literal["json"]
temperature: float
# timestamp_granularities # TODO support
def recognize(
recognizer,
audio_data: "AudioData",
*,
model: WhisperModel = "whisper-1",
**kwargs: Unpack[OpenAIOptionalParameters],
) -> str:
"""Performs speech recognition on ``audio_data`` (an ``AudioData`` instance), using the OpenAI Whisper API.
This function requires an OpenAI account; visit https://platform.openai.com/signup, then generate an API key in `User settings <https://platform.openai.com/account/api-keys>`__.
Details: https://platform.openai.com/docs/guides/speech-to-text
Set the environment variable ``OPENAI_API_KEY``; otherwise the openai library will raise an ``openai.OpenAIError``.
"""
try:
import openai
except ImportError:
raise SetupError(
"missing openai module: ensure that openai is set up correctly."
)
openai_recognizer = OpenAICompatibleRecognizer(openai.OpenAI())
return openai_recognizer.recognize(audio_data, model, **kwargs)
if __name__ == "__main__":
import argparse
from typing import get_args
import speech_recognition as sr
parser = argparse.ArgumentParser()
parser.add_argument("audio_file")
parser.add_argument(
"--model", choices=get_args(WhisperModel), default="whisper-1"
)
parser.add_argument("-l", "--language")
args = parser.parse_args()
r = sr.Recognizer()
with sr.AudioFile(args.audio_file) as source:
audio_data = r.listen(source)
if args.language:
transcription = recognize(
None, audio_data, model=args.model, language=args.language
)
else:
transcription = recognize(None, audio_data, model=args.model)
print(transcription)


@@ -0,0 +1,45 @@
from __future__ import annotations
import io
from typing import TYPE_CHECKING, Any, Protocol
from speech_recognition.audio import AudioData
if TYPE_CHECKING:
import numpy as np
class Transcribable(Protocol):
def transcribe(
self, audio_array: np.ndarray, **kwargs
) -> str | dict[str, Any]:
pass
class WhisperCompatibleRecognizer:
def __init__(self, model: Transcribable) -> None:
self.model = model
def recognize(
self, audio_data: AudioData, show_dict: bool = False, **kwargs
):
if not isinstance(audio_data, AudioData):
raise ValueError(
"``audio_data`` must be an ``AudioData`` instance"
)
import numpy as np
import soundfile as sf
# 16 kHz https://github.com/openai/whisper/blob/28769fcfe50755a817ab922a7bc83483159600a9/whisper/audio.py#L98-L99
wav_bytes = audio_data.get_wav_data(convert_rate=16000)
wav_stream = io.BytesIO(wav_bytes)
audio_array, sampling_rate = sf.read(wav_stream)
audio_array = audio_array.astype(np.float32)
result = self.model.transcribe(audio_array, **kwargs)
if show_dict:
return result
else:
return result["text"]
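
Any object satisfying the ``Transcribable`` protocol can be plugged into the recognizer above. A minimal sketch assuming the openai-whisper package is installed (its ``Whisper.transcribe`` accepts a NumPy array and returns a dict with a ``"text"`` key, which matches the protocol); the base-module import path is taken from the imports later in this commit, and the audio file name is a placeholder:

import whisper  # openai-whisper package, assumed installed
import speech_recognition as sr
from speech_recognition.recognizers.whisper_local.base import WhisperCompatibleRecognizer

r = sr.Recognizer()
with sr.AudioFile("memo.wav") as source:  # placeholder audio file
    audio = r.record(source)

recognizer = WhisperCompatibleRecognizer(whisper.load_model("base"))
print(recognizer.recognize(audio))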


@@ -0,0 +1,106 @@
from __future__ import annotations
from typing import TYPE_CHECKING, Literal, TypedDict
from speech_recognition.audio import AudioData
from speech_recognition.recognizers.whisper_local.base import (
WhisperCompatibleRecognizer,
)
if TYPE_CHECKING:
import numpy as np
from faster_whisper import WhisperModel
from faster_whisper.transcribe import Segment
from typing_extensions import Unpack
class TranscribeOutput(TypedDict):
text: str
segments: list[Segment]
language: str
class TranscribableAdapter:
def __init__(self, model: WhisperModel) -> None:
self.model = model
def transcribe(
self, audio_array: np.ndarray, **kwargs
) -> TranscribeOutput:
segments_generator, info = self.model.transcribe(audio_array, **kwargs)
segments = list(segments_generator)
return {
"text": " ".join(segment.text for segment in segments),
"segments": segments,
"language": info.language,
}
class InitOptionalParameters(TypedDict, total=False):
# https://github.com/SYSTRAN/faster-whisper/blob/v1.1.0/faster_whisper/transcribe.py#L575
device: Literal["cpu", "cuda", "auto"]
compute_type: str
download_root: str
# TODO Add others
class TranscribeOptionalParameters(TypedDict, total=False):
# https://github.com/SYSTRAN/faster-whisper/blob/v1.1.0/faster_whisper/transcribe.py#L692
language: str
task: Literal["transcribe", "translate"]
beam_size: int
# TODO Add others
def recognize(
recognizer,
audio_data: AudioData,
model: str = "base",
show_dict: bool = False,
init_options: InitOptionalParameters | None = None,
**transcribe_options: Unpack[TranscribeOptionalParameters],
) -> str | TranscribeOutput:
"""Performs speech recognition on ``audio_data`` (an ``AudioData`` instance), using Whisper.
Pick the ``model`` size (same sizes as Whisper).
If ``show_dict`` is true, returns the detailed response from Whisper, including the detected language. Otherwise returns only the transcription.
You can specify:
* ``language``: recognition language, a lowercase two-letter language code like "en" or "fr".
* If not set, Faster Whisper will automatically detect the language.
* ``task``
* If you want transcribe + **translate** to english, set ``task="translate"``.
Other values are passed directly to Faster Whisper. See https://github.com/SYSTRAN/faster-whisper/blob/master/faster_whisper/transcribe.py for all options.
"""
from faster_whisper import WhisperModel
model = WhisperModel(model, **init_options or {})
whisper_recognizer = WhisperCompatibleRecognizer(
TranscribableAdapter(model)
)
return whisper_recognizer.recognize(
audio_data, show_dict=show_dict, **transcribe_options
)
if __name__ == "__main__":
import argparse
import speech_recognition as sr
parser = argparse.ArgumentParser()
parser.add_argument("audio_file")
args = parser.parse_args()
r = sr.Recognizer()
with sr.AudioFile(args.audio_file) as source:
audio_data = r.listen(source)
transcription = recognize(None, audio_data)
print(transcription)
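
A sketch of the translate path described in the docstring above; model size, device, compute type, audio file, and the import path are placeholders/assumptions:

import speech_recognition as sr
from speech_recognition.recognizers.whisper_local import faster_whisper as fw  # import path assumed

r = sr.Recognizer()
with sr.AudioFile("french.wav") as source:  # placeholder audio file
    audio = r.record(source)

# transcribe French audio and translate it to English, forcing CPU int8 inference
print(fw.recognize(
    None,
    audio,
    model="small",
    init_options={"device": "cpu", "compute_type": "int8"},
    language="fr",
    task="translate",
))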


@@ -0,0 +1,108 @@
from __future__ import annotations
from typing import TYPE_CHECKING, Literal, TypedDict
from speech_recognition.audio import AudioData
from speech_recognition.recognizers.whisper_local.base import (
WhisperCompatibleRecognizer,
)
if TYPE_CHECKING:
import numpy as np
import torch
from typing_extensions import Unpack
from whisper import Whisper
class LoadModelOptionalParameters(TypedDict, total=False):
# ref: https://github.com/openai/whisper/blob/v20240930/whisper/__init__.py#L103
device: str | torch.device
download_root: str
in_memory: bool
class TranscribeOptionalParameters(TypedDict, total=False):
"""Transcribe optional parameters & DecodingOptions parameters."""
# ref: https://github.com/openai/whisper/blob/v20240930/whisper/transcribe.py#L38
temperature: float | tuple[float, ...]
# TODO Add others
# ref: https://github.com/openai/whisper/blob/v20240930/whisper/decoding.py#L81
task: Literal["transcribe", "translate"]
language: str
fp16: bool
# TODO Add others
class Segment(TypedDict):
id: int
seek: int
start: float
end: float
text: str
tokens: list[int]
temperature: float
avg_logprob: float
compression_ratio: float
no_speech_prob: float
class TranscribeOutput(TypedDict):
text: str
segments: list[Segment]
language: str
class TranscribableAdapter:
def __init__(self, model: Whisper) -> None:
self.model = model
def transcribe(
self, audio_array: np.ndarray, **kwargs
) -> TranscribeOutput:
if "fp16" not in kwargs:
import torch
kwargs["fp16"] = torch.cuda.is_available()
return self.model.transcribe(audio_array, **kwargs)
def recognize(
recognizer,
audio_data: AudioData,
model: str = "base",
show_dict: bool = False,
load_options: LoadModelOptionalParameters | None = None,
**transcribe_options: Unpack[TranscribeOptionalParameters],
) -> str | TranscribeOutput:
"""Performs speech recognition on ``audio_data`` (an ``AudioData`` instance), using Whisper.
Pick ``model`` from output of :command:`python -c 'import whisper; print(whisper.available_models())'`.
See also https://github.com/openai/whisper?tab=readme-ov-file#available-models-and-languages.
If ``show_dict`` is true, returns the full dict response from Whisper, including the detected language. Otherwise returns only the transcription.
You can specify:
* ``language``: recognition language, an uncapitalized full language name like "english" or "chinese". See the full language list at https://github.com/openai/whisper/blob/main/whisper/tokenizer.py
* If not set, Whisper will automatically detect the language.
* ``task``
* If you want transcribe + **translate** to english, set ``task="translate"``.
Other values are passed directly to whisper. See https://github.com/openai/whisper/blob/main/whisper/transcribe.py for all options.
"""
import whisper
whisper_model = whisper.load_model(model, **load_options or {})
whisper_recognizer = WhisperCompatibleRecognizer(
TranscribableAdapter(whisper_model)
)
return whisper_recognizer.recognize(
audio_data, show_dict=show_dict, **transcribe_options
)
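
Usage sketch for the Whisper recognizer above, using ``show_dict`` to get the detected language alongside the text (the audio file name is a placeholder and the import path is assumed):

import speech_recognition as sr
from speech_recognition.recognizers.whisper_local import whisper as whisper_local  # import path assumed

r = sr.Recognizer()
with sr.AudioFile("interview.wav") as source:  # placeholder audio file
    audio = r.record(source)

result = whisper_local.recognize(None, audio, model="base", show_dict=True)
print(result["language"], result["text"])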