first commit
							| @@ -0,0 +1,262 @@ | ||||
| from __future__ import annotations | ||||
|  | ||||
| import json | ||||
| from typing import Dict, Literal, TypedDict | ||||
| from urllib.error import HTTPError, URLError | ||||
| from urllib.parse import urlencode | ||||
| from urllib.request import Request, urlopen | ||||
|  | ||||
| from typing_extensions import NotRequired | ||||
|  | ||||
| from speech_recognition.audio import AudioData | ||||
| from speech_recognition.exceptions import RequestError, UnknownValueError | ||||
|  | ||||
|  | ||||
| class Alternative(TypedDict): | ||||
|     transcript: str | ||||
|     confidence: float | ||||
|  | ||||
|  | ||||
| class Result(TypedDict): | ||||
|     alternative: list[Alternative] | ||||
|     final: bool | ||||
|  | ||||
|  | ||||
| class GoogleResponse(TypedDict): | ||||
|     result: list[Result] | ||||
|     result_index: NotRequired[int] | ||||
|  | ||||
|  | ||||
| ProfanityFilterLevel = Literal[0, 1] | ||||
| RequestHeaders = Dict[str, str] | ||||
|  | ||||
| ENDPOINT = "http://www.google.com/speech-api/v2/recognize" | ||||
|  | ||||
|  | ||||
| class RequestBuilder: | ||||
|     def __init__( | ||||
|         self, | ||||
|         *, | ||||
|         endpoint: str, | ||||
|         key: str, | ||||
|         language: str, | ||||
|         filter_level: ProfanityFilterLevel, | ||||
|     ) -> None: | ||||
|         self.endpoint = endpoint | ||||
|         self.key = key | ||||
|         self.language = language | ||||
|         self.filter_level = filter_level | ||||
|  | ||||
|     def build(self, audio_data: AudioData) -> Request: | ||||
|         if not isinstance(audio_data, AudioData): | ||||
|             raise ValueError("``audio_data`` must be an ``AudioData`` instance") | ||||
|  | ||||
|         url = self.build_url() | ||||
|         headers = self.build_headers(audio_data) | ||||
|         flac_data = self.build_data(audio_data) | ||||
|         request = Request(url, data=flac_data, headers=headers) | ||||
|         return request | ||||
|  | ||||
|     def build_url(self) -> str: | ||||
|         """ | ||||
|         >>> builder = RequestBuilder(endpoint="http://www.google.com/speech-api/v2/recognize", key="awesome-key", language="en-US", filter_level=0) | ||||
|         >>> builder.build_url() | ||||
|         'http://www.google.com/speech-api/v2/recognize?client=chromium&lang=en-US&key=awesome-key&pFilter=0' | ||||
|         """ | ||||
|         params = urlencode( | ||||
|             { | ||||
|                 "client": "chromium", | ||||
|                 "lang": self.language, | ||||
|                 "key": self.key, | ||||
|                 "pFilter": self.filter_level, | ||||
|             } | ||||
|         ) | ||||
|         return f"{self.endpoint}?{params}" | ||||
|  | ||||
|     def build_headers(self, audio_data: AudioData) -> RequestHeaders: | ||||
|         """ | ||||
|         >>> builder = RequestBuilder(endpoint="", key="", language="", filter_level=1) | ||||
|         >>> audio_data = AudioData(b"", 16_000, 1) | ||||
|         >>> builder.build_headers(audio_data) | ||||
|         {'Content-Type': 'audio/x-flac; rate=16000'} | ||||
|         """ | ||||
|         rate = audio_data.sample_rate | ||||
|         headers = {"Content-Type": f"audio/x-flac; rate={rate}"} | ||||
|         return headers | ||||
|  | ||||
|     def build_data(self, audio_data: AudioData) -> bytes: | ||||
|         flac_data = audio_data.get_flac_data( | ||||
|             convert_rate=self.to_convert_rate(audio_data.sample_rate), | ||||
|             convert_width=2,  # audio samples must be 16-bit | ||||
|         ) | ||||
|         return flac_data | ||||
|  | ||||
|     @staticmethod | ||||
|     def to_convert_rate(sample_rate: int) -> int | None: | ||||
|         """Return the sample rate to convert to, or ``None`` if no conversion is needed (audio samples must be at least 8 kHz). | ||||
|  | ||||
|         >>> RequestBuilder.to_convert_rate(16_000) | ||||
|         >>> RequestBuilder.to_convert_rate(8_000) | ||||
|         >>> RequestBuilder.to_convert_rate(7_999) | ||||
|         8000 | ||||
|         """ | ||||
|         return None if sample_rate >= 8000 else 8000 | ||||
|  | ||||
|  | ||||
| def create_request_builder( | ||||
|     *, | ||||
|     endpoint: str, | ||||
|     key: str | None = None, | ||||
|     language: str = "en-US", | ||||
|     filter_level: ProfanityFilterLevel = 0, | ||||
| ) -> RequestBuilder: | ||||
|     if not isinstance(language, str): | ||||
|         raise ValueError("``language`` must be a string") | ||||
|     if key is not None and not isinstance(key, str): | ||||
|         raise ValueError("``key`` must be ``None`` or a string") | ||||
|  | ||||
|     if key is None: | ||||
|         key = "AIzaSyBOti4mM-6x9WDnZIjIeyEU21OpBXqWBgw" | ||||
|     return RequestBuilder( | ||||
|         endpoint=endpoint, | ||||
|         key=key, | ||||
|         language=language, | ||||
|         filter_level=filter_level, | ||||
|     ) | ||||
|  | ||||
|  | ||||
| class OutputParser: | ||||
|     def __init__(self, *, show_all: bool, with_confidence: bool) -> None: | ||||
|         self.show_all = show_all | ||||
|         self.with_confidence = with_confidence | ||||
|  | ||||
|     def parse(self, response_text: str): | ||||
|         actual_result = self.convert_to_result(response_text) | ||||
|         if self.show_all: | ||||
|             return actual_result | ||||
|  | ||||
|         best_hypothesis = self.find_best_hypothesis( | ||||
|             actual_result["alternative"] | ||||
|         ) | ||||
|         # https://cloud.google.com/speech-to-text/docs/basics#confidence-values | ||||
|         # "Your code should not require the confidence field as it is not guaranteed to be accurate, or even set, in any of the results." | ||||
|         confidence = best_hypothesis.get("confidence", 0.5) | ||||
|         if self.with_confidence: | ||||
|             return best_hypothesis["transcript"], confidence | ||||
|         return best_hypothesis["transcript"] | ||||
|  | ||||
|     @staticmethod | ||||
|     def convert_to_result(response_text: str) -> Result: | ||||
|         r""" | ||||
|         >>> response_text = '''{"result":[]} | ||||
|         ... {"result":[{"alternative":[{"transcript":"one two three","confidence":0.49585345},{"transcript":"1 2","confidence":0.42899391}],"final":true}],"result_index":0} | ||||
|         ... ''' | ||||
|         >>> OutputParser.convert_to_result(response_text) | ||||
|         {'alternative': [{'transcript': 'one two three', 'confidence': 0.49585345}, {'transcript': '1 2', 'confidence': 0.42899391}], 'final': True} | ||||
|  | ||||
|         >>> OutputParser.convert_to_result("") | ||||
|         Traceback (most recent call last): | ||||
|           ... | ||||
|         speech_recognition.exceptions.UnknownValueError | ||||
|         >>> OutputParser.convert_to_result('\n{"result":[]}') | ||||
|         Traceback (most recent call last): | ||||
|           ... | ||||
|         speech_recognition.exceptions.UnknownValueError | ||||
|         >>> OutputParser.convert_to_result('{"result":[{"foo": "bar"}]}') | ||||
|         Traceback (most recent call last): | ||||
|           ... | ||||
|         speech_recognition.exceptions.UnknownValueError | ||||
|         >>> OutputParser.convert_to_result('{"result":[{"alternative": []}]}') | ||||
|         Traceback (most recent call last): | ||||
|           ... | ||||
|         speech_recognition.exceptions.UnknownValueError | ||||
|         """ | ||||
|         # ignore any blank blocks | ||||
|         for line in response_text.split("\n"): | ||||
|             if not line: | ||||
|                 continue | ||||
|             result: list[Result] = json.loads(line)["result"] | ||||
|             if len(result) != 0: | ||||
|                 if len(result[0].get("alternative", [])) == 0: | ||||
|                     raise UnknownValueError() | ||||
|                 return result[0] | ||||
|         raise UnknownValueError() | ||||
|  | ||||
|     @staticmethod | ||||
|     def find_best_hypothesis(alternatives: list[Alternative]) -> Alternative: | ||||
|         """ | ||||
|         >>> alternatives = [{"transcript": "one two three", "confidence": 0.42899391}, {"transcript": "1 2", "confidence": 0.49585345}] | ||||
|         >>> OutputParser.find_best_hypothesis(alternatives) | ||||
|         {'transcript': 'one two three', 'confidence': 0.42899391} | ||||
|  | ||||
|         >>> alternatives = [{"confidence": 0.49585345}] | ||||
|         >>> OutputParser.find_best_hypothesis(alternatives) | ||||
|         Traceback (most recent call last): | ||||
|           ... | ||||
|         speech_recognition.exceptions.UnknownValueError | ||||
|         """ | ||||
|         if "confidence" in alternatives: | ||||
|             # BUG: actual_result["alternative"] (=alternatives) is list, not dict | ||||
|             # return alternative with highest confidence score | ||||
|             best_hypothesis: Alternative = max( | ||||
|                 alternatives, | ||||
|                 key=lambda alternative: alternative["confidence"], | ||||
|             ) | ||||
|         else: | ||||
|             # when there is no confidence available, we arbitrarily choose the first hypothesis. | ||||
|             best_hypothesis: Alternative = alternatives[0] | ||||
|         if "transcript" not in best_hypothesis: | ||||
|             raise UnknownValueError() | ||||
|         return best_hypothesis | ||||
|  | ||||
|  | ||||
| def obtain_transcription(request: Request, timeout: int) -> str: | ||||
|     try: | ||||
|         response = urlopen(request, timeout=timeout) | ||||
|     except HTTPError as e: | ||||
|         raise RequestError("recognition request failed: {}".format(e.reason)) | ||||
|     except URLError as e: | ||||
|         raise RequestError( | ||||
|             "recognition connection failed: {}".format(e.reason) | ||||
|         ) | ||||
|     return response.read().decode("utf-8") | ||||
|  | ||||
|  | ||||
| def recognize_legacy( | ||||
|     recognizer, | ||||
|     audio_data: AudioData, | ||||
|     key: str | None = None, | ||||
|     language: str = "en-US", | ||||
|     pfilter: ProfanityFilterLevel = 0, | ||||
|     show_all: bool = False, | ||||
|     with_confidence: bool = False, | ||||
|     *, | ||||
|     endpoint: str = ENDPOINT, | ||||
| ): | ||||
|     """Performs speech recognition on ``audio_data`` (an ``AudioData`` instance), using the Google Speech Recognition API. | ||||
|  | ||||
|     The Google Speech Recognition API key is specified by ``key``. If not specified, it uses a generic key that works out of the box. This should generally be used for personal or testing purposes only, as it **may be revoked by Google at any time**. | ||||
|  | ||||
|     To obtain your own API key, simply follow the steps on the `API Keys <http://www.chromium.org/developers/how-tos/api-keys>`__ page at the Chromium Developers site. In the Google Developers Console, Google Speech Recognition is listed as "Speech API". | ||||
|  | ||||
|     The recognition language is determined by ``language``, an RFC5646 language tag like ``"en-US"`` (US English) or ``"fr-FR"`` (International French), defaulting to US English. A list of supported language tags can be found in this `StackOverflow answer <http://stackoverflow.com/a/14302134>`__. | ||||
|  | ||||
|     The profanity filter level can be adjusted with ``pfilter``: 0 - No filter, 1 - Only shows the first character and replaces the rest with asterisks. The default is level 0. | ||||
|  | ||||
|     Returns the most likely transcription if ``show_all`` is false (the default). Otherwise, returns the raw API response as a JSON dictionary. | ||||
|  | ||||
|     Raises a ``speech_recognition.UnknownValueError`` exception if the speech is unintelligible. Raises a ``speech_recognition.RequestError`` exception if the speech recognition operation failed, if the key isn't valid, or if there is no internet connection. | ||||
|     """ | ||||
|     request_builder = create_request_builder( | ||||
|         endpoint=endpoint, key=key, language=language, filter_level=pfilter | ||||
|     ) | ||||
|     request = request_builder.build(audio_data) | ||||
|  | ||||
|     response_text = obtain_transcription( | ||||
|         request, timeout=recognizer.operation_timeout | ||||
|     ) | ||||
|  | ||||
|     output_parser = OutputParser( | ||||
|         show_all=show_all, with_confidence=with_confidence | ||||
|     ) | ||||
|     return output_parser.parse(response_text) | ||||
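|  | ||||
|  | ||||
| # A minimal usage sketch in the style of the ``__main__`` demos in the other | ||||
| # recognizer modules: the audio file path comes from the command line, and the | ||||
| # built-in default API key and ``en-US`` language are assumed. | ||||
| if __name__ == "__main__": | ||||
|     import argparse | ||||
|  | ||||
|     import speech_recognition as sr | ||||
|  | ||||
|     parser = argparse.ArgumentParser() | ||||
|     parser.add_argument("audio_file") | ||||
|     args = parser.parse_args() | ||||
|  | ||||
|     r = sr.Recognizer() | ||||
|     with sr.AudioFile(args.audio_file) as source: | ||||
|         audio_data = r.record(source) | ||||
|  | ||||
|     transcript, confidence = recognize_legacy( | ||||
|         r, audio_data, with_confidence=True | ||||
|     ) | ||||
|     print(f"{transcript} (confidence: {confidence})") | ||||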
| @@ -0,0 +1,142 @@ | ||||
| from __future__ import annotations | ||||
|  | ||||
| from typing import TYPE_CHECKING, TypedDict | ||||
| from urllib.error import URLError | ||||
|  | ||||
| from speech_recognition.audio import AudioData | ||||
| from speech_recognition.exceptions import RequestError, UnknownValueError | ||||
|  | ||||
| if TYPE_CHECKING: | ||||
|     from google.cloud.speech import ( | ||||
|         RecognitionConfig, | ||||
|         RecognizeResponse, | ||||
|         SpeechContext, | ||||
|     ) | ||||
|     from typing_extensions import Required | ||||
|  | ||||
|  | ||||
| class GoogleCloudRecognizerParameters(TypedDict, total=False): | ||||
|     """Optional parameters. | ||||
|  | ||||
|     The recognition language is determined by ``language_code``, which is a BCP-47 language tag like ``"en-US"`` (US English). Default: ``"en-US"``. | ||||
|     A list of supported language tags can be found in the `Speech-to-Text supported languages <https://cloud.google.com/speech/docs/languages>`__. | ||||
|  | ||||
|     If ``preferred_phrases`` is an iterable of phrase strings, those given phrases will be more likely to be recognized over similar-sounding alternatives. | ||||
|     This is useful for things like keyword/command recognition or adding new phrases that aren't in Google's vocabulary. | ||||
|     Note that the API imposes certain `restrictions on the list of phrase strings <https://cloud.google.com/speech/limits#content>`__. | ||||
|  | ||||
|     ``show_all``: See :py:func:`recognize`. | ||||
|  | ||||
|     ``model``: You can select the model to get the best results. (See `RecognitionConfig's documentation <https://cloud.google.com/python/docs/reference/speech/latest/google.cloud.speech_v1.types.RecognitionConfig>`__ for details.) | ||||
|  | ||||
|     ``use_enhanced``: Set to true to use an enhanced model for speech recognition. | ||||
|     """ | ||||
|  | ||||
|     # SpeechRecognition specific parameters | ||||
|     preferred_phrases: list[str] | ||||
|     show_all: bool | ||||
|  | ||||
|     # Speech-to-Text V1 API's parameters | ||||
|     language_code: str | ||||
|     model: str | ||||
|     use_enhanced: bool | ||||
|     # TODO Add others support | ||||
|  | ||||
|  | ||||
| class GoogleCloudSpeechV1Parameters(TypedDict, total=False): | ||||
|     """Speech-to-Text V1 API's parameters. | ||||
|  | ||||
|     https://cloud.google.com/python/docs/reference/speech/latest/google.cloud.speech_v1.types.RecognitionConfig | ||||
|     """ | ||||
|  | ||||
|     encoding: Required[RecognitionConfig.AudioEncoding] | ||||
|     sample_rate_hertz: Required[int] | ||||
|     language_code: Required[str] | ||||
|     speech_contexts: list[SpeechContext] | ||||
|     enable_word_time_offsets: bool | ||||
|     model: str | ||||
|     use_enhanced: bool | ||||
|  | ||||
|  | ||||
| def _build_config( | ||||
|     audio_data: AudioData, recognizer_params: GoogleCloudRecognizerParameters | ||||
| ) -> RecognitionConfig: | ||||
|     from google.cloud import speech | ||||
|  | ||||
|     parameters: GoogleCloudSpeechV1Parameters = { | ||||
|         "encoding": speech.RecognitionConfig.AudioEncoding.FLAC, | ||||
|         "sample_rate_hertz": audio_data.sample_rate, | ||||
|         "language_code": recognizer_params.pop("language_code", "en-US"), | ||||
|     } | ||||
|     if preferred_phrases := recognizer_params.pop("preferred_phrases", None): | ||||
|         parameters["speech_contexts"] = [ | ||||
|             speech.SpeechContext(phrases=preferred_phrases) | ||||
|         ] | ||||
|     if recognizer_params.pop("show_all", False): | ||||
|         # ref: https://cloud.google.com/speech-to-text/docs/async-time-offsets | ||||
|         parameters["enable_word_time_offsets"] = True | ||||
|     return speech.RecognitionConfig(**(parameters | recognizer_params)) | ||||
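|  | ||||
|  | ||||
| # A rough sketch of the mapping performed above (keys and phrases are illustrative): | ||||
| # ``{"language_code": "en-GB", "preferred_phrases": ["kale"], "use_enhanced": True}`` | ||||
| # becomes a ``RecognitionConfig`` with FLAC encoding, the audio's sample rate, | ||||
| # ``language_code="en-GB"``, one ``SpeechContext(phrases=["kale"])`` and | ||||
| # ``use_enhanced=True``. | ||||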
|  | ||||
|  | ||||
| def recognize( | ||||
|     recognizer, | ||||
|     audio_data: AudioData, | ||||
|     credentials_json_path: str | None = None, | ||||
|     **kwargs: GoogleCloudRecognizerParameters, | ||||
| ) -> str | RecognizeResponse: | ||||
|     """Performs speech recognition on ``audio_data`` (an ``AudioData`` instance), using the Google Cloud Speech-to-Text V1 API. | ||||
|  | ||||
|     This function requires a Google Cloud Platform account; see the `Set up Speech-to-Text <https://cloud.google.com/speech-to-text/docs/before-you-begin>`__ guide for details and instructions. Basically, create a project, enable billing for the project, enable the Google Cloud Speech API for the project, and create local authentication credentials for your user account. | ||||
|     The result is a JSON file containing the API credentials. You can specify this JSON file with ``credentials_json_path``. If not specified, the library will try to automatically `find the default API credentials JSON file <https://developers.google.com/identity/protocols/application-default-credentials>`__. | ||||
|  | ||||
|     Returns the most likely transcription if ``show_all`` is False (the default). Otherwise, returns the raw API response as a JSON dictionary. | ||||
|     For other parameters, see :py:class:`GoogleCloudRecognizerParameters`. | ||||
|  | ||||
|     Raises a ``speech_recognition.UnknownValueError`` exception if the speech is unintelligible. Raises a ``speech_recognition.RequestError`` exception if the speech recognition operation failed, if the credentials aren't valid, or if there is no Internet connection. | ||||
|     """ | ||||
|     try: | ||||
|         from google.api_core.exceptions import GoogleAPICallError | ||||
|         from google.cloud import speech | ||||
|     except ImportError: | ||||
|         raise RequestError( | ||||
|             "missing google-cloud-speech module: ensure that google-cloud-speech is set up correctly." | ||||
|         ) | ||||
|  | ||||
|     client = ( | ||||
|         speech.SpeechClient.from_service_account_json(credentials_json_path) | ||||
|         if credentials_json_path | ||||
|         else speech.SpeechClient() | ||||
|     ) | ||||
|  | ||||
|     flac_data = audio_data.get_flac_data( | ||||
|         # audio sample rate must be between 8 kHz and 48 kHz inclusive - clamp sample rate into this range | ||||
|         convert_rate=( | ||||
|             None | ||||
|             if 8000 <= audio_data.sample_rate <= 48000 | ||||
|             else max(8000, min(audio_data.sample_rate, 48000)) | ||||
|         ), | ||||
|         convert_width=2,  # audio samples must be 16-bit | ||||
|     ) | ||||
|     audio = speech.RecognitionAudio(content=flac_data) | ||||
|  | ||||
|     config = _build_config(audio_data, kwargs.copy()) | ||||
|  | ||||
|     try: | ||||
|         response = client.recognize(config=config, audio=audio) | ||||
|     except GoogleAPICallError as e: | ||||
|         raise RequestError(e) | ||||
|     except URLError as e: | ||||
|         raise RequestError( | ||||
|             "recognition connection failed: {0}".format(e.reason) | ||||
|         ) | ||||
|  | ||||
|     if kwargs.get("show_all"): | ||||
|         return response | ||||
|     if len(response.results) == 0: | ||||
|         raise UnknownValueError() | ||||
|  | ||||
|     transcript = " ".join( | ||||
|         result.alternatives[0].transcript.strip() | ||||
|         for result in response.results | ||||
|     ) | ||||
|     return transcript | ||||
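|  | ||||
|  | ||||
| # A minimal usage sketch, given an ``AudioData`` instance ``audio_data`` and | ||||
| # Application Default Credentials already configured; the phrases below are | ||||
| # illustrative. | ||||
| # | ||||
| #     text = recognize( | ||||
| #         None, | ||||
| #         audio_data, | ||||
| #         language_code="en-US", | ||||
| #         preferred_phrases=["SpeechRecognition", "PyAudio"], | ||||
| #     ) | ||||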
| @@ -0,0 +1,111 @@ | ||||
| from __future__ import annotations | ||||
|  | ||||
| import os | ||||
| from collections.abc import Sequence | ||||
|  | ||||
| from speech_recognition import PortableNamedTemporaryFile | ||||
| from speech_recognition.audio import AudioData | ||||
| from speech_recognition.exceptions import RequestError, UnknownValueError | ||||
|  | ||||
| AcousticParametersDirectoryPath = str | ||||
| LanguageModelFilePath = str | ||||
| PhonemeDictionaryFilePath = str | ||||
| SphinxDataFilePaths = tuple[AcousticParametersDirectoryPath, LanguageModelFilePath, PhonemeDictionaryFilePath] | ||||
|  | ||||
| Keyword = str | ||||
| Sensitivity = float | ||||
| KeywordEntry = tuple[Keyword, Sensitivity] | ||||
|  | ||||
|  | ||||
| def recognize( | ||||
|     recognizer, | ||||
|     audio_data: AudioData, | ||||
|     language: str | SphinxDataFilePaths = "en-US", | ||||
|     keyword_entries: Sequence[KeywordEntry] | None = None, | ||||
|     grammar: str | None = None, | ||||
|     show_all: bool = False, | ||||
| ): | ||||
|     """ | ||||
|     Performs speech recognition on ``audio_data`` (an ``AudioData`` instance), using CMU Sphinx. | ||||
|  | ||||
|     The recognition language is determined by ``language``, an RFC5646 language tag like ``"en-US"`` or ``"en-GB"``, defaulting to US English. Out of the box, only ``en-US`` is supported. See `Notes on using PocketSphinx <https://github.com/Uberi/speech_recognition/blob/master/reference/pocketsphinx.rst>`__ for information about installing other languages. This document is also included under ``reference/pocketsphinx.rst``. The ``language`` parameter can also be a tuple of filesystem paths, of the form ``(acoustic_parameters_directory, language_model_file, phoneme_dictionary_file)`` - this allows you to load arbitrary Sphinx models. | ||||
|  | ||||
|     If specified, the keywords to search for are determined by ``keyword_entries``, an iterable of tuples of the form ``(keyword, sensitivity)``, where ``keyword`` is a phrase, and ``sensitivity`` is how sensitive to this phrase the recognizer should be, on a scale of 0 (very insensitive, more false negatives) to 1 (very sensitive, more false positives) inclusive. If not specified or ``None``, no keywords are used and Sphinx will simply transcribe whatever words it recognizes. Specifying ``keyword_entries`` is more accurate than just looking for those same keywords in non-keyword-based transcriptions, because Sphinx knows specifically what sounds to look for. | ||||
|  | ||||
|     Sphinx can also handle FSG or JSGF grammars. The parameter ``grammar`` expects a path to the grammar file. Note that if a JSGF grammar is passed, an FSG grammar will be created at the same location to speed up execution in the next run. If ``keyword_entries`` is passed, the content of ``grammar`` is ignored. | ||||
|  | ||||
|     Returns the most likely transcription if ``show_all`` is false (the default). Otherwise, returns the Sphinx ``pocketsphinx.pocketsphinx.Decoder`` object resulting from the recognition. | ||||
|  | ||||
|     Raises a ``speech_recognition.UnknownValueError`` exception if the speech is unintelligible. Raises a ``speech_recognition.RequestError`` exception if there are any issues with the Sphinx installation. | ||||
|     """ | ||||
|     # TODO Move this validation into KeywordEntry initialization | ||||
|     assert keyword_entries is None or all(isinstance(keyword, str) and 0 <= sensitivity <= 1 for keyword, sensitivity in keyword_entries), "``keyword_entries`` must be ``None`` or a list of pairs of strings and numbers between 0 and 1" | ||||
|  | ||||
|     try: | ||||
|         from pocketsphinx import FsgModel, Jsgf, pocketsphinx | ||||
|     except ImportError: | ||||
|         raise RequestError("missing PocketSphinx module: ensure that PocketSphinx is set up correctly.") | ||||
|  | ||||
|     if isinstance(language, str):  # directory containing language data | ||||
|         language_directory = os.path.join(os.path.dirname(os.path.dirname(os.path.realpath(__file__))), "pocketsphinx-data", language) | ||||
|         if not os.path.isdir(language_directory): | ||||
|             raise RequestError("missing PocketSphinx language data directory: \"{}\"".format(language_directory)) | ||||
|         acoustic_parameters_directory = os.path.join(language_directory, "acoustic-model") | ||||
|         language_model_file = os.path.join(language_directory, "language-model.lm.bin") | ||||
|         phoneme_dictionary_file = os.path.join(language_directory, "pronounciation-dictionary.dict") | ||||
|     else:  # 3-tuple of Sphinx data file paths | ||||
|         acoustic_parameters_directory, language_model_file, phoneme_dictionary_file = language | ||||
|     if not os.path.isdir(acoustic_parameters_directory): | ||||
|         raise RequestError("missing PocketSphinx language model parameters directory: \"{}\"".format(acoustic_parameters_directory)) | ||||
|     if not os.path.isfile(language_model_file): | ||||
|         raise RequestError("missing PocketSphinx language model file: \"{}\"".format(language_model_file)) | ||||
|     if not os.path.isfile(phoneme_dictionary_file): | ||||
|         raise RequestError("missing PocketSphinx phoneme dictionary file: \"{}\"".format(phoneme_dictionary_file)) | ||||
|  | ||||
|     # create decoder object | ||||
|     config = pocketsphinx.Config() | ||||
|     config.set_string("-hmm", acoustic_parameters_directory)  # set the path of the hidden Markov model (HMM) parameter files | ||||
|     config.set_string("-lm", language_model_file) | ||||
|     config.set_string("-dict", phoneme_dictionary_file) | ||||
|     config.set_string("-logfn", os.devnull)  # disable logging (logging causes unwanted output in terminal) | ||||
|     decoder = pocketsphinx.Decoder(config) | ||||
|  | ||||
|     # obtain audio data | ||||
|     raw_data = audio_data.get_raw_data(convert_rate=16000, convert_width=2)  # the included language models require audio to be 16-bit mono 16 kHz in little-endian format | ||||
|  | ||||
|     # obtain recognition results | ||||
|     if keyword_entries is not None:  # explicitly specified set of keywords | ||||
|         with PortableNamedTemporaryFile("w") as f: | ||||
|             # generate a keywords file - Sphinx documentation recommends sensitivities between 1e-50 and 1e-5 | ||||
|             f.writelines("{} /1e{}/\n".format(keyword, 100 * sensitivity - 110) for keyword, sensitivity in keyword_entries) | ||||
|             f.flush() | ||||
|  | ||||
|             # perform the speech recognition with the keywords file (this is inside the context manager so the file isn't deleted until we're done) | ||||
|             decoder.add_kws("keywords", f.name) | ||||
|             decoder.activate_search("keywords") | ||||
|     elif grammar is not None:  # a path to a FSG or JSGF grammar | ||||
|         if not os.path.exists(grammar): | ||||
|             raise ValueError("Grammar '{0}' does not exist.".format(grammar)) | ||||
|         grammar_path = os.path.abspath(os.path.dirname(grammar)) | ||||
|         grammar_name = os.path.splitext(os.path.basename(grammar))[0] | ||||
|         fsg_path = "{0}/{1}.fsg".format(grammar_path, grammar_name) | ||||
|         if not os.path.exists(fsg_path):  # create FSG grammar if not available | ||||
|             jsgf = Jsgf(grammar) | ||||
|             rule = jsgf.get_rule("{0}.{0}".format(grammar_name)) | ||||
|             fsg = jsgf.build_fsg(rule, decoder.get_logmath(), 7.5) | ||||
|             fsg.writefile(fsg_path) | ||||
|         else: | ||||
|             fsg = FsgModel(fsg_path, decoder.get_logmath(), 7.5) | ||||
|         decoder.set_fsg(grammar_name, fsg) | ||||
|         decoder.set_search(grammar_name) | ||||
|  | ||||
|     decoder.start_utt()  # begin utterance processing | ||||
|     decoder.process_raw(raw_data, False, True)  # process audio data with recognition enabled (no_search = False), as a full utterance (full_utt = True) | ||||
|     decoder.end_utt()  # stop utterance processing | ||||
|  | ||||
|     if show_all: return decoder | ||||
|  | ||||
|     # return results | ||||
|     hypothesis = decoder.hyp() | ||||
|     if hypothesis is not None: return hypothesis.hypstr | ||||
|     raise UnknownValueError()  # no transcriptions available | ||||
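|  | ||||
|  | ||||
| # A minimal usage sketch, given an ``AudioData`` instance ``audio_data``; the | ||||
| # keywords, sensitivities and grammar path below are illustrative. | ||||
| # | ||||
| #     text = recognize(None, audio_data)  # plain transcription | ||||
| #     text = recognize(None, audio_data, keyword_entries=[("hello", 0.7), ("stop", 0.9)]) | ||||
| #     text = recognize(None, audio_data, grammar="commands.gram")  # FSG/JSGF grammar file | ||||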
| @@ -0,0 +1,22 @@ | ||||
| from io import BytesIO | ||||
|  | ||||
| from speech_recognition.audio import AudioData | ||||
|  | ||||
|  | ||||
| class OpenAICompatibleRecognizer: | ||||
|     def __init__(self, client) -> None: | ||||
|         self.client = client | ||||
|  | ||||
|     def recognize(self, audio_data: "AudioData", model: str, **kwargs) -> str: | ||||
|         if not isinstance(audio_data, AudioData): | ||||
|             raise ValueError( | ||||
|                 "``audio_data`` must be an ``AudioData`` instance" | ||||
|             ) | ||||
|  | ||||
|         wav_data = BytesIO(audio_data.get_wav_data()) | ||||
|         wav_data.name = "SpeechRecognition_audio.wav" | ||||
|  | ||||
|         transcript = self.client.audio.transcriptions.create( | ||||
|             file=wav_data, model=model, **kwargs | ||||
|         ) | ||||
|         return transcript.text | ||||
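|  | ||||
|  | ||||
| # A minimal sketch of how this wrapper is meant to be used: any client object | ||||
| # exposing the OpenAI-style ``audio.transcriptions.create`` interface can be | ||||
| # passed in. The ``base_url``, API key and model name below are illustrative | ||||
| # placeholders for a self-hosted OpenAI-compatible server. | ||||
| # | ||||
| #     import openai | ||||
| # | ||||
| #     client = openai.OpenAI(base_url="http://localhost:8000/v1", api_key="dummy") | ||||
| #     recognizer = OpenAICompatibleRecognizer(client) | ||||
| #     text = recognizer.recognize(audio_data, "whisper-1")  # given an ``AudioData`` instance | ||||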
| @@ -0,0 +1,54 @@ | ||||
| from __future__ import annotations | ||||
|  | ||||
| from typing import Literal, TypedDict | ||||
|  | ||||
| from typing_extensions import Unpack | ||||
|  | ||||
| from speech_recognition.audio import AudioData | ||||
| from speech_recognition.exceptions import SetupError | ||||
| from speech_recognition.recognizers.whisper_api.base import ( | ||||
|     OpenAICompatibleRecognizer, | ||||
| ) | ||||
|  | ||||
| # https://console.groq.com/docs/speech-text#supported-models | ||||
| GroqModel = Literal[ | ||||
|     "whisper-large-v3-turbo", "whisper-large-v3", "distil-whisper-large-v3-en" | ||||
| ] | ||||
|  | ||||
|  | ||||
| class GroqOptionalParameters(TypedDict, total=False): | ||||
|     """Groq speech transcription's optional parameters. | ||||
|  | ||||
|     https://console.groq.com/docs/speech-text#transcription-endpoint-usage | ||||
|     """ | ||||
|  | ||||
|     prompt: str | ||||
|     response_format: str | ||||
|     temperature: float | ||||
|     language: str | ||||
|  | ||||
|  | ||||
| def recognize( | ||||
|     recognizer, | ||||
|     audio_data: "AudioData", | ||||
|     *, | ||||
|     model: GroqModel = "whisper-large-v3-turbo", | ||||
|     **kwargs: Unpack[GroqOptionalParameters], | ||||
| ) -> str: | ||||
|     """Performs speech recognition on ``audio_data`` (an ``AudioData`` instance), using the Groq Whisper API. | ||||
|  | ||||
|     This function requires a Groq account; visit https://console.groq.com/login, then generate an API key in the `API Keys <https://console.groq.com/keys>`__ menu. | ||||
|  | ||||
|     Detail: https://console.groq.com/docs/speech-text | ||||
|  | ||||
|     Set the environment variable ``GROQ_API_KEY``; otherwise the groq library will raise a ``groq.GroqError``. | ||||
|     """ | ||||
|     try: | ||||
|         import groq | ||||
|     except ImportError: | ||||
|         raise SetupError( | ||||
|             "missing groq module: ensure that groq is set up correctly." | ||||
|         ) | ||||
|  | ||||
|     groq_recognizer = OpenAICompatibleRecognizer(groq.Groq()) | ||||
|     return groq_recognizer.recognize(audio_data, model, **kwargs) | ||||
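|  | ||||
|  | ||||
| # A minimal usage sketch, given an ``AudioData`` instance ``audio_data`` and | ||||
| # ``GROQ_API_KEY`` set in the environment; the language hint is illustrative. | ||||
| # | ||||
| #     text = recognize(None, audio_data, model="whisper-large-v3", language="en") | ||||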
| @@ -0,0 +1,83 @@ | ||||
| from __future__ import annotations | ||||
|  | ||||
| from typing import Literal, TypedDict | ||||
|  | ||||
| from typing_extensions import Unpack | ||||
|  | ||||
| from speech_recognition.audio import AudioData | ||||
| from speech_recognition.exceptions import SetupError | ||||
| from speech_recognition.recognizers.whisper_api.base import ( | ||||
|     OpenAICompatibleRecognizer, | ||||
| ) | ||||
|  | ||||
| # https://platform.openai.com/docs/api-reference/audio/createTranscription#audio-createtranscription-model | ||||
| WhisperModel = Literal[ | ||||
|     "whisper-1", "gpt-4o-transcribe", "gpt-4o-mini-transcribe" | ||||
| ] | ||||
|  | ||||
|  | ||||
| class OpenAIOptionalParameters(TypedDict, total=False): | ||||
|     """OpenAI speech transcription's optional parameters. | ||||
|  | ||||
|     https://platform.openai.com/docs/api-reference/audio/createTranscription | ||||
|     """ | ||||
|  | ||||
|     language: str | ||||
|     prompt: str | ||||
|     # TODO Add support `Literal["text", "srt", "verbose_json", "vtt"]` | ||||
|     response_format: Literal["json"] | ||||
|     temperature: float | ||||
|     # timestamp_granularities  # TODO support | ||||
|  | ||||
|  | ||||
| def recognize( | ||||
|     recognizer, | ||||
|     audio_data: "AudioData", | ||||
|     *, | ||||
|     model: WhisperModel = "whisper-1", | ||||
|     **kwargs: Unpack[OpenAIOptionalParameters], | ||||
| ) -> str: | ||||
|     """Performs speech recognition on ``audio_data`` (an ``AudioData`` instance), using the OpenAI Whisper API. | ||||
|  | ||||
|     This function requires an OpenAI account; visit https://platform.openai.com/signup, then generate an API key under `User settings <https://platform.openai.com/account/api-keys>`__. | ||||
|  | ||||
|     Detail: https://platform.openai.com/docs/guides/speech-to-text | ||||
|  | ||||
|     Set the environment variable ``OPENAI_API_KEY``; otherwise the openai library will raise an ``openai.OpenAIError``. | ||||
|     """ | ||||
|     try: | ||||
|         import openai | ||||
|     except ImportError: | ||||
|         raise SetupError( | ||||
|             "missing openai module: ensure that openai is set up correctly." | ||||
|         ) | ||||
|  | ||||
|     openai_recognizer = OpenAICompatibleRecognizer(openai.OpenAI()) | ||||
|     return openai_recognizer.recognize(audio_data, model, **kwargs) | ||||
|  | ||||
|  | ||||
| if __name__ == "__main__": | ||||
|     import argparse | ||||
|     from typing import get_args | ||||
|  | ||||
|     import speech_recognition as sr | ||||
|  | ||||
|     parser = argparse.ArgumentParser() | ||||
|     parser.add_argument("audio_file") | ||||
|     parser.add_argument( | ||||
|         "--model", choices=get_args(WhisperModel), default="whisper-1" | ||||
|     ) | ||||
|     parser.add_argument("-l", "--language") | ||||
|     args = parser.parse_args() | ||||
|  | ||||
|     r = sr.Recognizer() | ||||
|     with sr.AudioFile(args.audio_file) as source: | ||||
|         audio_data = r.listen(source) | ||||
|  | ||||
|     if args.language: | ||||
|         transcription = recognize( | ||||
|             None, audio_data, model=args.model, language=args.language | ||||
|         ) | ||||
|     else: | ||||
|         transcription = recognize(None, audio_data, model=args.model) | ||||
|     print(transcription) | ||||
| @@ -0,0 +1,45 @@ | ||||
| from __future__ import annotations | ||||
|  | ||||
| import io | ||||
| from typing import TYPE_CHECKING, Any, Protocol | ||||
|  | ||||
| from speech_recognition.audio import AudioData | ||||
|  | ||||
| if TYPE_CHECKING: | ||||
|     import numpy as np | ||||
|  | ||||
|  | ||||
| class Transcribable(Protocol): | ||||
|     def transcribe( | ||||
|         self, audio_array: np.ndarray, **kwargs | ||||
|     ) -> str | dict[str, Any]: | ||||
|         pass | ||||
|  | ||||
|  | ||||
| class WhisperCompatibleRecognizer: | ||||
|     def __init__(self, model: Transcribable) -> None: | ||||
|         self.model = model | ||||
|  | ||||
|     def recognize( | ||||
|         self, audio_data: AudioData, show_dict: bool = False, **kwargs | ||||
|     ): | ||||
|         if not isinstance(audio_data, AudioData): | ||||
|             raise ValueError( | ||||
|                 "``audio_data`` must be an ``AudioData`` instance" | ||||
|             ) | ||||
|  | ||||
|         import numpy as np | ||||
|         import soundfile as sf | ||||
|  | ||||
|         # 16 kHz https://github.com/openai/whisper/blob/28769fcfe50755a817ab922a7bc83483159600a9/whisper/audio.py#L98-L99 | ||||
|         wav_bytes = audio_data.get_wav_data(convert_rate=16000) | ||||
|         wav_stream = io.BytesIO(wav_bytes) | ||||
|         audio_array, sampling_rate = sf.read(wav_stream) | ||||
|         audio_array = audio_array.astype(np.float32) | ||||
|  | ||||
|         result = self.model.transcribe(audio_array, **kwargs) | ||||
|  | ||||
|         if show_dict: | ||||
|             return result | ||||
|         else: | ||||
|             return result["text"] | ||||
| @@ -0,0 +1,106 @@ | ||||
| from __future__ import annotations | ||||
|  | ||||
| from typing import TYPE_CHECKING, Literal, TypedDict | ||||
|  | ||||
| from speech_recognition.audio import AudioData | ||||
| from speech_recognition.recognizers.whisper_local.base import ( | ||||
|     WhisperCompatibleRecognizer, | ||||
| ) | ||||
|  | ||||
| if TYPE_CHECKING: | ||||
|     import numpy as np | ||||
|     from faster_whisper import WhisperModel | ||||
|     from faster_whisper.transcribe import Segment | ||||
|     from typing_extensions import Unpack | ||||
|  | ||||
|  | ||||
| class TranscribeOutput(TypedDict): | ||||
|     text: str | ||||
|     segments: list[Segment] | ||||
|     language: str | ||||
|  | ||||
|  | ||||
| class TranscribableAdapter: | ||||
|     def __init__(self, model: WhisperModel) -> None: | ||||
|         self.model = model | ||||
|  | ||||
|     def transcribe( | ||||
|         self, audio_array: np.ndarray, **kwargs | ||||
|     ) -> TranscribeOutput: | ||||
|         segments_generator, info = self.model.transcribe(audio_array, **kwargs) | ||||
|         segments = list(segments_generator) | ||||
|         return { | ||||
|             "text": " ".join(segment.text for segment in segments), | ||||
|             "segments": segments, | ||||
|             "language": info.language, | ||||
|         } | ||||
|  | ||||
|  | ||||
| class InitOptionalParameters(TypedDict, total=False): | ||||
|     # https://github.com/SYSTRAN/faster-whisper/blob/v1.1.0/faster_whisper/transcribe.py#L575 | ||||
|     device: Literal["cpu", "gpu", "auto"] | ||||
|     compute_type: str | ||||
|     download_root: str | ||||
|     # TODO Add others | ||||
|  | ||||
|  | ||||
| class TranscribeOptionalParameters(TypedDict, total=False): | ||||
|     # https://github.com/SYSTRAN/faster-whisper/blob/v1.1.0/faster_whisper/transcribe.py#L692 | ||||
|     language: str | ||||
|     task: Literal["transcribe", "translate"] | ||||
|     beam_size: int | ||||
|     # TODO Add others | ||||
|  | ||||
|  | ||||
| def recognize( | ||||
|     recognizer, | ||||
|     audio_data: AudioData, | ||||
|     model: str = "base", | ||||
|     show_dict: bool = False, | ||||
|     init_options: InitOptionalParameters | None = None, | ||||
|     **transcribe_options: Unpack[TranscribeOptionalParameters], | ||||
| ) -> str | TranscribeOutput: | ||||
|     """Performs speech recognition on ``audio_data`` (an ``AudioData`` instance), using Whisper. | ||||
|  | ||||
|     Pick the ``model`` size (same sizes as Whisper). | ||||
|  | ||||
|     If ``show_dict`` is true, returns the detailed response from Whisper, including the detected language. Otherwise returns only the transcription. | ||||
|  | ||||
|     You can specify: | ||||
|  | ||||
|         * ``language``: recognition language, an uncapitalized 2 letters language name like "en" or "fr". | ||||
|  | ||||
|             * If not set, Faster Whisper will automatically detect the language. | ||||
|  | ||||
|         * ``task`` | ||||
|  | ||||
|             * If you want transcribe + **translate** to english, set ``task="translate"``. | ||||
|  | ||||
|     Other values are passed directly to whisper. See https://github.com/SYSTRAN/faster-whisper/blob/master/faster_whisper/transcribe.py for all options. | ||||
|     """ | ||||
|     from faster_whisper import WhisperModel | ||||
|  | ||||
|     whisper_model = WhisperModel(model, **init_options or {}) | ||||
|     whisper_recognizer = WhisperCompatibleRecognizer( | ||||
|         TranscribableAdapter(whisper_model) | ||||
|     ) | ||||
|     return whisper_recognizer.recognize( | ||||
|         audio_data, show_dict=show_dict, **transcribe_options | ||||
|     ) | ||||
|  | ||||
|  | ||||
| if __name__ == "__main__": | ||||
|     import argparse | ||||
|  | ||||
|     import speech_recognition as sr | ||||
|  | ||||
|     parser = argparse.ArgumentParser() | ||||
|     parser.add_argument("audio_file") | ||||
|     args = parser.parse_args() | ||||
|  | ||||
|     r = sr.Recognizer() | ||||
|     with sr.AudioFile(args.audio_file) as source: | ||||
|         audio_data = r.listen(source) | ||||
|  | ||||
|     transcription = recognize(None, audio_data) | ||||
|     print(transcription) | ||||
| @@ -0,0 +1,108 @@ | ||||
| from __future__ import annotations | ||||
|  | ||||
| from typing import TYPE_CHECKING, Literal, TypedDict | ||||
|  | ||||
| from speech_recognition.audio import AudioData | ||||
| from speech_recognition.recognizers.whisper_local.base import ( | ||||
|     WhisperCompatibleRecognizer, | ||||
| ) | ||||
|  | ||||
| if TYPE_CHECKING: | ||||
|     import numpy as np | ||||
|     import torch | ||||
|     from typing_extensions import Unpack | ||||
|     from whisper import Whisper | ||||
|  | ||||
|  | ||||
| class LoadModelOptionalParameters(TypedDict, total=False): | ||||
|     # ref: https://github.com/openai/whisper/blob/v20240930/whisper/__init__.py#L103 | ||||
|     device: str | torch.device | ||||
|     download_root: str | ||||
|     in_memory: bool | ||||
|  | ||||
|  | ||||
| class TranscribeOptionalParameters(TypedDict, total=False): | ||||
|     """Transcribe optional parameters & DecodingOptions parameters.""" | ||||
|  | ||||
|     # ref: https://github.com/openai/whisper/blob/v20240930/whisper/transcribe.py#L38 | ||||
|     temperature: float | tuple[float, ...] | ||||
|     # TODO Add others | ||||
|  | ||||
|     # ref: https://github.com/openai/whisper/blob/v20240930/whisper/decoding.py#L81 | ||||
|     task: Literal["transcribe", "translate"] | ||||
|     language: str | ||||
|     fp16: bool | ||||
|     # TODO Add others | ||||
|  | ||||
|  | ||||
| class Segment(TypedDict): | ||||
|     id: int | ||||
|     seek: int | ||||
|     start: float | ||||
|     end: float | ||||
|     text: str | ||||
|     tokens: list[int] | ||||
|     temperature: float | ||||
|     avg_logprob: float | ||||
|     compression_ratio: float | ||||
|     no_speech_prob: float | ||||
|  | ||||
|  | ||||
| class TranscribeOutput(TypedDict): | ||||
|     text: str | ||||
|     segments: list[Segment] | ||||
|     language: str | ||||
|  | ||||
|  | ||||
| class TranscribableAdapter: | ||||
|     def __init__(self, model: Whisper) -> None: | ||||
|         self.model = model | ||||
|  | ||||
|     def transcribe( | ||||
|         self, audio_array: np.ndarray, **kwargs | ||||
|     ) -> TranscribeOutput: | ||||
|         if "fp16" not in kwargs: | ||||
|             import torch | ||||
|  | ||||
|             kwargs["fp16"] = torch.cuda.is_available() | ||||
|  | ||||
|         return self.model.transcribe(audio_array, **kwargs) | ||||
|  | ||||
|  | ||||
| def recognize( | ||||
|     recognizer, | ||||
|     audio_data: AudioData, | ||||
|     model: str = "base", | ||||
|     show_dict: bool = False, | ||||
|     load_options: LoadModelOptionalParameters | None = None, | ||||
|     **transcribe_options: Unpack[TranscribeOptionalParameters], | ||||
| ) -> str | TranscribeOutput: | ||||
|     """Performs speech recognition on ``audio_data`` (an ``AudioData`` instance), using Whisper. | ||||
|  | ||||
|     Pick ``model`` from output of :command:`python -c 'import whisper; print(whisper.available_models())'`. | ||||
|     See also https://github.com/openai/whisper?tab=readme-ov-file#available-models-and-languages. | ||||
|  | ||||
|     If ``show_dict`` is true, returns the full dict response from Whisper, including the detected language. Otherwise returns only the transcription. | ||||
|  | ||||
|     You can specify: | ||||
|  | ||||
|         * ``language``: recognition language, an uncapitalized full language name like "english" or "chinese". See the full language list at https://github.com/openai/whisper/blob/main/whisper/tokenizer.py | ||||
|  | ||||
|             * If not set, Whisper will automatically detect the language. | ||||
|  | ||||
|         * ``task`` | ||||
|  | ||||
|             * If you want transcribe + **translate** to english, set ``task="translate"``. | ||||
|  | ||||
|     Other values are passed directly to whisper. See https://github.com/openai/whisper/blob/main/whisper/transcribe.py for all options. | ||||
|     """ | ||||
|  | ||||
|     import whisper | ||||
|  | ||||
|     whisper_model = whisper.load_model(model, **load_options or {}) | ||||
|     whisper_recognizer = WhisperCompatibleRecognizer( | ||||
|         TranscribableAdapter(whisper_model) | ||||
|     ) | ||||
|     return whisper_recognizer.recognize( | ||||
|         audio_data, show_dict=show_dict, **transcribe_options | ||||
|     ) | ||||
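|  | ||||
|  | ||||
| # A minimal usage sketch, given an ``AudioData`` instance ``audio_data``; the | ||||
| # model size and language below are illustrative. | ||||
| # | ||||
| #     text = recognize(None, audio_data, model="base", language="english") | ||||
| #     # transcribe and translate any speech to English, returning the full dict: | ||||
| #     result = recognize(None, audio_data, task="translate", show_dict=True) | ||||
| #     print(result["language"], result["text"]) | ||||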