first commit

2025-04-04 13:23:15 -06:00
commit 216064f731
2103 changed files with 522593 additions and 0 deletions

View File

@@ -0,0 +1,23 @@
'''
Utility functions to help with Python 2/3 compatibility
'''
from .. import six
def toUtf8(value):
'''
Takes in a value and converts it to a text (unicode) type, then encodes that
text as a utf-8 byte string. In 2.X the resulting object will be a str and in
3.X it will be bytes. In both 2.X and 3.X any object can be passed in; if it
is not already a text type, its __str__ (or __repr__ if __str__ is not
defined) is used.
'''
return six.text_type(value).encode('utf-8')
def fromUtf8(value):
'''
Takes in a byte array encoded as utf-8 and returns a text (unicode) type. In
2.X we expect a str type and return a unicode type. In 3.X we expect a bytes
type and return a str type.
'''
return value.decode('utf-8')
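
A quick illustrative round trip through these helpers (a sketch for the reader, not part of the module's behaviour):

encoded = toUtf8('hello')            # bytes on 3.X, str on 2.X
assert fromUtf8(encoded) == 'hello'
assert toUtf8(42) == b'42'           # non-text values go through __str__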

View File

@@ -0,0 +1,500 @@
from __future__ import print_function
import time
from ctypes import (CFUNCTYPE, POINTER, Structure, Union, c_char_p, c_int,
c_long, c_short, c_ubyte, c_uint, c_ulong, c_void_p,
c_wchar, cdll)
def cfunc(name, dll, result, *args):
"""build and apply a ctypes prototype complete with parameter flags"""
atypes = []
aflags = []
for arg in args:
atypes.append(arg[1])
aflags.append((arg[2], arg[0]) + arg[3:])
return CFUNCTYPE(result, *atypes)((name, dll), tuple(aflags))
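# For illustration: a call such as
#     Info = cfunc('espeak_Info', dll, c_char_p, ('ptr', c_void_p, 1, None))
# expands to roughly
#     prototype = CFUNCTYPE(c_char_p, c_void_p)
#     Info = prototype(('espeak_Info', dll), ((1, 'ptr', None),))
# i.e. each (name, type, direction[, default]) tuple contributes one argument
# type to the CFUNCTYPE and one (direction, name[, default]) paramflags entry.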
dll = None
def load_library():
global dll
paths = [
# macOS paths
'/usr/local/lib/libespeak-ng.1.dylib',
'/usr/local/lib/libespeak.dylib',
# Linux paths
'libespeak-ng.so.1',
'/usr/local/lib/libespeak-ng.so.1',
'libespeak.so.1',
# Windows paths
r'C:\Program Files\eSpeak NG\libespeak-ng.dll',
r'C:\Program Files (x86)\eSpeak NG\libespeak-ng.dll'
]
for path in paths:
try:
dll = cdll.LoadLibrary(path)
return True
except Exception:
continue # Try the next path
return False
if not load_library():
    raise RuntimeError(
        'Could not load the eSpeak or eSpeak NG shared library; '
        'is eSpeak NG installed?')
# constants and such from speak_lib.h
EVENT_LIST_TERMINATED = 0
EVENT_WORD = 1
EVENT_SENTENCE = 2
EVENT_MARK = 3
EVENT_PLAY = 4
EVENT_END = 5
EVENT_MSG_TERMINATED = 6
class numberORname(Union):
_fields_ = [
('number', c_int),
('name', c_char_p)
]
class EVENT(Structure):
_fields_ = [
('type', c_int),
('unique_identifier', c_uint),
('text_position', c_int),
('length', c_int),
('audio_position', c_int),
('sample', c_int),
('user_data', c_void_p),
('id', numberORname)
]
AUDIO_OUTPUT_PLAYBACK = 0
AUDIO_OUTPUT_RETRIEVAL = 1
AUDIO_OUTPUT_SYNCHRONOUS = 2
AUDIO_OUTPUT_SYNCH_PLAYBACK = 3
EE_OK = 0
EE_INTERNAL_ERROR = -1
EE_BUFFER_FULL = 1
EE_NOT_FOUND = 2
Initialize = cfunc('espeak_Initialize', dll, c_int,
('output', c_int, 1, AUDIO_OUTPUT_PLAYBACK),
('bufflength', c_int, 1, 100),
('path', c_char_p, 1, None),
('option', c_int, 1, 0))
Initialize.__doc__ = """Must be called before any synthesis functions are called.
output: the audio data can either be played by eSpeak or passed back by the SynthCallback function.
buflength: The length in mS of sound buffers passed to the SynthCallback function.
path: The directory which contains the espeak-data directory, or NULL for the default location.
options: bit 0: 1=allow espeakEVENT_PHONEME events.
Returns: sample rate in Hz, or -1 (EE_INTERNAL_ERROR)."""
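# Example (this is how the espeak driver later in this package initializes it):
#     sample_rate = Initialize(AUDIO_OUTPUT_RETRIEVAL, 1000)
#     if sample_rate == EE_INTERNAL_ERROR:
#         raise RuntimeError('could not initialize espeak')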
t_espeak_callback = CFUNCTYPE(c_int, POINTER(c_short), c_int, POINTER(EVENT))
cSetSynthCallback = cfunc('espeak_SetSynthCallback', dll, None,
('SynthCallback', t_espeak_callback, 1))
SynthCallback = None
def SetSynthCallback(cb):
global SynthCallback
SynthCallback = t_espeak_callback(cb)
cSetSynthCallback(SynthCallback)
SetSynthCallback.__doc__ = """Must be called before any synthesis functions are called.
This specifies a function in the calling program which is called when a buffer of
speech sound data has been produced.
The callback function is of the form:
int SynthCallback(short *wav, int numsamples, espeak_EVENT *events);
wav: is the speech sound data which has been produced.
NULL indicates that the synthesis has been completed.
numsamples: is the number of entries in wav. This number may vary, may be less than
the value implied by the buflength parameter given in espeak_Initialize, and may
sometimes be zero (which does NOT indicate end of synthesis).
events: an array of espeak_EVENT items which indicate word and sentence events, and
also the occurrence of <mark> and <audio> elements within the text.
Callback returns: 0=continue synthesis, 1=abort synthesis."""
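# Python-side callback matching the C form documented above (see also the
# demo under __main__ at the bottom of this module):
#     def on_synth(wav, numsamples, events):
#         # wav: POINTER(c_short) audio buffer, or NULL when synthesis is done
#         # events: array of EVENT, terminated by EVENT_LIST_TERMINATED
#         return 0        # 0 = continue synthesis, 1 = abort
#     SetSynthCallback(on_synth)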
t_UriCallback = CFUNCTYPE(c_int, c_int, c_char_p, c_char_p)
cSetUriCallback = cfunc('espeak_SetUriCallback', dll, None,
('UriCallback', t_UriCallback, 1))
UriCallback = None
def SetUriCallback(cb):
global UriCallback
UriCallback = t_UriCallback(cb)
cSetUriCallback(UriCallback)
SetUriCallback.__doc__ = """This function must be called before synthesis functions are used, in order to deal with
<audio> tags. It specifies a callback function which is called when an <audio> element is
encountered and allows the calling program to indicate whether the sound file which
is specified in the <audio> element is available and is to be played.
The callback function is of the form:
int UriCallback(int type, const char *uri, const char *base);
type: type of callback event. Currently only 1= <audio> element
uri: the "src" attribute from the <audio> element
base: the "xml:base" attribute (if any) from the <speak> element
Return: 1=don't play the sound, but speak the text alternative.
0=place a PLAY event in the event list at the point where the <audio> element
occurs. The calling program can then play the sound at that point."""
# a few manifest constants
CHARS_AUTO = 0
CHARS_UTF8 = 1
CHARS_8BIT = 2
CHARS_WCHAR = 3
SSML = 0x10
PHONEMES = 0x100
ENDPAUSE = 0x1000
KEEP_NAMEDATA = 0x2000
POS_CHARACTER = 1
POS_WORD = 2
POS_SENTENCE = 3
def Synth(text, position=0, position_type=POS_CHARACTER, end_position=0, flags=0, user_data=None):
return cSynth(text, len(text) * 10, position, position_type, end_position, flags, None, user_data)
cSynth = cfunc('espeak_Synth', dll, c_int,
('text', c_char_p, 1),
('size', c_long, 1),
('position', c_uint, 1, 0),
('position_type', c_int, 1, POS_CHARACTER),
('end_position', c_uint, 1, 0),
('flags', c_uint, 1, CHARS_AUTO),
('unique_identifier', POINTER(c_uint), 1, None),
('user_data', c_void_p, 1, None))
Synth.__doc__ = """Synthesize speech for the specified text. The speech sound data is passed to the calling
program in buffers by means of the callback function specified by espeak_SetSynthCallback().
The command is asynchronous: it is internally buffered and returns as soon as possible.
If espeak_Initialize was previously called with AUDIO_OUTPUT_PLAYBACK as argument,
the sound data are played by eSpeak.
text: The text to be spoken, terminated by a zero character. It may be either 8-bit characters,
wide characters (wchar_t), or UTF8 encoding. Which of these is determined by the "flags"
parameter.
size: Equal to (or greater than) the size of the text data, in bytes. This is used in order
to allocate internal storage space for the text. This value is not used for
AUDIO_OUTPUT_SYNCHRONOUS mode.
position: The position in the text where speaking starts. Zero indicates speak from the
start of the text.
position_type: Determines whether "position" is a number of characters, words, or sentences.
Values: espeak.POS_CHARACTER, espeak.POS_WORD, espeak.POS_SENTENCE.
end_position: If set, this gives a character position at which speaking will stop. A value
of zero indicates no end position.
flags: These may be OR'd together:
Type of character codes, one of:
espeak.CHARS_UTF8 UTF8 encoding
espeak.CHARS_8BIT The 8 bit ISO-8859 character set for the particular language.
espeak.CHARS_AUTO 8 bit or UTF8 (this is the default)
espeak.CHARS_WCHAR Wide characters (wchar_t)
espeak.SSML Elements within < > are treated as SSML elements, or if not recognised are ignored.
espeak.PHONEMES Text within [[ ]] is treated as phoneme codes (in eSpeak's Kirshenbaum encoding).
espeak.ENDPAUSE If set then a sentence pause is added at the end of the text. If not set then
this pause is suppressed.
unique_identifier: message identifier; helpful for identifying later
data supplied to the callback.
user_data: pointer which will be passed to the callback function.
Return: EE_OK: operation achieved
EE_BUFFER_FULL: the command can not be buffered;
you may try after a while to call the function again.
EE_INTERNAL_ERROR."""
def Synth_Mark(text, index_mark, end_position=0, flags=CHARS_AUTO):
cSynth_Mark(text, len(text) + 1, index_mark, end_position, flags)
cSynth_Mark = cfunc('espeak_Synth_Mark', dll, c_int,
('text', c_char_p, 1),
('size', c_ulong, 1),
('index_mark', c_char_p, 1),
('end_position', c_uint, 1, 0),
('flags', c_uint, 1, CHARS_AUTO),
('unique_identifier', POINTER(c_uint), 1, None),
('user_data', c_void_p, 1, None))
Synth_Mark.__doc__ = """Synthesize speech for the specified text. Similar to espeak_Synth() but the start position is
specified by the name of a <mark> element in the text.
index_mark: The "name" attribute of a <mark> element within the text which specified the
point at which synthesis starts. UTF8 string.
For the other parameters, see espeak_Synth()
Return: EE_OK: operation achieved
EE_BUFFER_FULL: the command can not be buffered;
you may try after a while to call the function again.
EE_INTERNAL_ERROR."""
Key = cfunc('espeak_Key', dll, c_int,
('key_name', c_char_p, 1))
Key.__doc__ = """Speak the name of a keyboard key.
Currently this just speaks the "key_name" as given
Return: EE_OK: operation achieved
EE_BUFFER_FULL: the command can not be buffered;
you may try after a while to call the function again.
EE_INTERNAL_ERROR."""
Char = cfunc('espeak_Char', dll, c_int,
('character', c_wchar, 1))
Char.__doc__ = """Speak the name of the given character
Return: EE_OK: operation achieved
EE_BUFFER_FULL: the command can not be buffered;
you may try after a while to call the function again.
EE_INTERNAL_ERROR."""
# Speech Parameters
SILENCE = 0 # internal use
RATE = 1
VOLUME = 2
PITCH = 3
RANGE = 4
PUNCTUATION = 5
CAPITALS = 6
EMPHASIS = 7 # internal use
LINELENGTH = 8 # internal use
PUNCT_NONE = 0
PUNCT_ALL = 1
PUNCT_SOME = 2
SetParameter = cfunc('espeak_SetParameter', dll, c_int,
('parameter', c_int, 1),
('value', c_int, 1),
('relative', c_int, 1, 0))
SetParameter.__doc__ = """Sets the value of the specified parameter.
relative=0 Sets the absolute value of the parameter.
relative=1 Sets a relative value of the parameter.
parameter:
espeak.RATE: speaking speed in words per minute.
espeak.VOLUME: volume in range 0-100, 0=silence
espeak.PITCH: base pitch, range 0-100. 50=normal
espeak.RANGE: pitch range, range 0-100. 0=monotone, 50=normal
espeak.PUNCTUATION: which punctuation characters to announce:
value in espeak_PUNCT_TYPE (none, all, some),
see espeak_GetParameter() to specify which characters are announced.
espeak.CAPITALS: announce capital letters by:
0=none,
1=sound icon,
2=spelling,
3 or higher, by raising pitch. This value gives the amount in Hz by which the pitch
of a word is raised to indicate it has a capital letter.
Return: EE_OK: operation achieved
EE_BUFFER_FULL: the command can not be buffered;
you may try after a while to call the function again.
EE_INTERNAL_ERROR."""
GetParameter = cfunc('espeak_GetParameter', dll, c_int,
    ('parameter', c_int, 1),
    ('current', c_int, 1, 1))
GetParameter.__doc__ = """current=0 Returns the default value of the specified parameter.
current=1 Returns the current value of the specified parameter, as set by SetParameter()"""
SetPunctuationList = cfunc('espeak_SetPunctuationList', dll, c_int,
('punctlist', c_wchar, 1))
SetPunctuationList.__doc__ = """Specified a list of punctuation characters whose names are
to be spoken when the value of the Punctuation parameter is set to "some".
punctlist: A list of character codes, terminated by a zero character.
Return: EE_OK: operation achieved
EE_BUFFER_FULL: the command can not be buffered;
you may try after a while to call the function again.
EE_INTERNAL_ERROR."""
SetPhonemeTrace = cfunc('espeak_SetPhonemeTrace', dll, None,
('value', c_int, 1),
('stream', c_void_p, 1))
SetPhonemeTrace.__doc__ = """Controls the output of phoneme symbols for the text
value=0 No phoneme output (default)
value=1 Output the translated phoneme symbols for the text
value=2 as (1), but also output a trace of how the translation was done (matching rules and list entries)
stream output stream for the phoneme symbols (and trace). If stream=NULL then it uses stdout."""
CompileDictionary = cfunc('espeak_CompileDictionary', dll, None,
('path', c_char_p, 1),
('log', c_void_p, 1))
CompileDictionary.__doc__ = """Compile pronunciation dictionary for a language which corresponds to the currently
selected voice. The required voice should be selected before calling this function.
path: The directory which contains the language's '_rules' and '_list' files.
'path' should end with a path separator character ('/').
log: Stream for error reports and statistics information. If log=NULL then stderr will be used."""
class VOICE(Structure):
_fields_ = [
('name', c_char_p),
('languages', c_char_p),
('identifier', c_char_p),
('gender', c_ubyte),
('age', c_ubyte),
('variant', c_ubyte),
('xx1', c_ubyte),
('score', c_int),
('spare', c_void_p),
]
def __repr__(self):
"""Print the fields"""
res = []
for field in self._fields_:
res.append('%s=%s' % (field[0], repr(getattr(self, field[0]))))
return self.__class__.__name__ + '(' + ','.join(res) + ')'
cListVoices = cfunc('espeak_ListVoices', dll, POINTER(POINTER(VOICE)),
('voice_spec', POINTER(VOICE), 1))
cListVoices.__doc__ = """Reads the voice files from espeak-data/voices and creates an array of espeak_VOICE pointers.
The list is terminated by a NULL pointer
If voice_spec is NULL then all voices are listed.
If voice_spec is given, then only the voices which are compatible with the voice_spec
are listed, and they are listed in preference order."""
def ListVoices(voice_spec=None):
"""Reads the voice files from espeak-data/voices and returns a list of VOICE objects.
If voice_spec is None then all voices are listed.
If voice_spec is given, then only the voices which are compatible with the voice_spec
are listed, and they are listed in preference order."""
ppv = cListVoices(voice_spec)
res = []
i = 0
while ppv[i]:
res.append(ppv[i][0])
i += 1
return res
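# Example:
#     for v in ListVoices():
#         print(v.name, v.languages)      # both fields are byte strings (c_char_p)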
SetVoiceByName = cfunc('espeak_SetVoiceByName', dll, c_int,
('name', c_char_p, 1))
SetVoiceByName.__doc__ = """Searches for a voice with a matching "name" field. Language is not considered.
"name" is a UTF8 string.
Return: EE_OK: operation achieved
EE_BUFFER_FULL: the command can not be buffered;
you may try after a while to call the function again.
EE_INTERNAL_ERROR."""
SetVoiceByProperties = cfunc('espeak_SetVoiceByProperties', dll, c_int,
('voice_spec', POINTER(VOICE), 1))
SetVoiceByProperties.__doc__ = """An espeak_VOICE structure is used to pass criteria to select a voice. Any of the following
fields may be set:
name NULL, or a voice name
languages NULL, or a single language string (with optional dialect), eg. "en-uk", or "en"
gender 0=not specified, 1=male, 2=female
age 0=not specified, or an age in years
variant After a list of candidates is produced, scored and sorted, "variant" is used to index
that list and choose a voice.
variant=0 takes the top voice (i.e. best match). variant=1 takes the next voice, etc"""
GetCurrentVoice = cfunc('espeak_GetCurrentVoice', dll, POINTER(VOICE),
)
GetCurrentVoice.__doc__ = """Returns the espeak_VOICE data for the currently selected voice.
This is not affected by temporary voice changes caused by SSML elements such as <voice> and <s>"""
Cancel = cfunc('espeak_Cancel', dll, c_int)
Cancel.__doc__ = """Stop immediately synthesis and audio output of the current text. When this
function returns, the audio output is fully stopped and the synthesizer is ready to
synthesize a new message.
Return: EE_OK: operation achieved
EE_INTERNAL_ERROR."""
IsPlaying = cfunc('espeak_IsPlaying', dll, c_int)
IsPlaying.__doc__ = """Returns 1 if audio is played, 0 otherwise."""
Synchronize = cfunc('espeak_Synchronize', dll, c_int)
Synchronize.__doc__ = """This function returns when all data have been spoken.
Return: EE_OK: operation achieved
EE_INTERNAL_ERROR."""
Terminate = cfunc('espeak_Terminate', dll, c_int)
Terminate.__doc__ = """last function to be called.
Return: EE_OK: operation achieved
EE_INTERNAL_ERROR."""
Info = cfunc('espeak_Info', dll, c_char_p, ('ptr', c_void_p, 1, 0))
Info.__doc__ = """Returns the version number string.
The parameter is for future use, and should be set to NULL"""
if __name__ == '__main__':
def synth_cb(wav, numsample, events):
print(numsample, end="")
i = 0
while True:
if events[i].type == EVENT_LIST_TERMINATED:
break
print(events[i].type, end="")
i += 1
return 0
samplerate = Initialize(output=AUDIO_OUTPUT_PLAYBACK)
SetSynthCallback(synth_cb)
s = b'This is a test, only a test. '  # Synth expects a byte string (c_char_p)
uid = c_uint(0)
# print 'pitch=',GetParameter(PITCH)
# SetParameter(PITCH, 50, 0)
print(Synth(s))
while IsPlaying():
time.sleep(0.1)

View File

@@ -0,0 +1,175 @@
from ..voice import Voice
import time
def buildDriver(proxy):
'''
Builds a new instance of a driver and returns it for use by the driver
proxy.
@param proxy: Proxy creating the driver
@type proxy: L{driver.DriverProxy}
'''
return DummyDriver(proxy)
class DummyDriver(object):
'''
Dummy speech engine implementation. Documents the interface, notifications,
properties, and sequencing responsibilities of a driver implementation.
@ivar _proxy: Driver proxy that manages this instance
@type _proxy: L{driver.DriverProxy}
@ivar _config: Dummy configuration
@type _config: dict
@ivar _looping: True when in the dummy event loop, False when not
@type _looping: bool
'''
def __init__(self, proxy):
'''
Constructs the driver.
@param proxy: Proxy creating the driver
@type proxy: L{driver.DriverProxy}
'''
self._proxy = proxy
self._looping = False
# hold config values as if we had a real tts implementation that
# supported them
voices = [
Voice('dummy.voice1', 'John Doe', ['en-US', 'en-GB'], 'male', 'adult'),
Voice('dummy.voice2', 'Jane Doe', ['en-US', 'en-GB'], 'female', 'adult'),
Voice('dummy.voice3', 'Jimmy Doe', ['en-US', 'en-GB'], 'male', 10)
]
self._config = {
'rate' : 200,
'volume' : 1.0,
'voice' : voices[0],
'voices' : voices
}
def destroy(self):
'''
Optional method that will be called when the driver proxy is being
destroyed. Can cleanup any resources to make sure the engine terminates
properly.
'''
pass
def startLoop(self):
'''
Starts a blocking run loop in which driver callbacks are properly
invoked.
@precondition: There was no previous successful call to L{startLoop}
without an intervening call to L{stopLoop}.
'''
first = True
self._looping = True
while self._looping:
if first:
self._proxy.setBusy(False)
first = False
time.sleep(0.5)
def endLoop(self):
'''
Stops a previously started run loop.
@precondition: A previous call to L{startLoop} succeeded and there was
no intervening call to L{endLoop}.
'''
self._looping = False
def iterate(self):
'''
Iterates from within an external run loop.
'''
self._proxy.setBusy(False)
yield
def say(self, text):
'''
Speaks the given text. Generates the following notifications during
output:
started-utterance: When speech output has started
started-word: When a word is about to be spoken. Includes the character
"location" of the start of the word in the original utterance text
and the "length" of the word in characters.
finished-utterance: When speech output has finished. Includes a flag
indicating if the entire utterance was "completed" or not.
The proxy automatically adds any "name" associated with the utterance
to the notifications on behalf of the driver.
When starting to output an utterance, the driver must inform its proxy
that it is busy by invoking L{driver.DriverProxy.setBusy} with a flag
of True. When the utterance completes or is interrupted, the driver
informs the proxy that it is no longer busy by invoking
L{driver.DriverProxy.setBusy} with a flag of False.
@param text: Unicode text to speak
@type text: unicode
'''
self._proxy.setBusy(True)
self._proxy.notify('started-utterance')
i = 0
for word in text.split(' '):
self._proxy.notify('started-word', location=i, length=len(word))
try:
i = text.index(' ', i+1)+1
except Exception:
pass
self._proxy.notify('finished-utterance', completed=True)
self._proxy.setBusy(False)
def stop(self):
'''
Stops any current output. If an utterance was being spoken, the driver
is still responsible for sending the closing finished-utterance
notification documented above and resetting the busy state of the
proxy.
'''
pass
def getProperty(self, name):
'''
Gets a property value of the speech engine. The supported properties
and their values are:
voices: List of L{voice.Voice} objects supported by the driver
voice: String ID of the current voice
rate: Integer speech rate in words per minute
volume: Floating point volume of speech in the range [0.0, 1.0]
@param name: Property name
@type name: str
@raise KeyError: When the property name is unknown
'''
try:
return self._config[name]
except KeyError:
raise KeyError('unknown property %s' % name)
def setProperty(self, name, value):
'''
Sets one of the supported property values of the speech engine listed
above. If a value is invalid, attempts to clip or coerce it so it is
valid before giving up and raising an exception.
@param name: Property name
@type name: str
@param value: Property value
@type value: object
@raise KeyError: When the property name is unknown
@raise ValueError: When the value cannot be coerced to fit the property
'''
if name == 'voice':
    matches = [v for v in self._config['voices'] if v.id == value]
    if not matches:
        raise ValueError('unknown voice %s' % value)
    self._config['voice'] = matches[0]
elif name == 'rate':
self._config['rate'] = value
elif name == 'volume':
self._config['volume'] = value
else:
raise KeyError('unknown property %s' % name)
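
For illustration only, a minimal sketch of how a driver is exercised, using a hypothetical stand-in for the driver.DriverProxy object (the stub class and its printed output are assumptions; the Voice .id attribute is the one setProperty above relies on):

class _StubProxy(object):
    '''Hypothetical stand-in for driver.DriverProxy, for illustration only.'''
    def setBusy(self, busy):
        print('busy -> %s' % busy)
    def notify(self, topic, **kwargs):
        print('notify: %s %s' % (topic, kwargs))
    def isBusy(self):
        return False

driver = buildDriver(_StubProxy())
driver.setProperty('rate', 150)
driver.say('hello dummy world')          # started-word per word, then finished-utterance
print(driver.getProperty('voice').id)    # 'dummy.voice1'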

View File

@@ -0,0 +1,233 @@
import os
import wave
import platform
import ctypes
import time
import subprocess
from tempfile import NamedTemporaryFile
if platform.system() == 'Windows':
import winsound
from ..voice import Voice
from . import _espeak, fromUtf8, toUtf8
# noinspection PyPep8Naming
def buildDriver(proxy):
return EspeakDriver(proxy)
# noinspection PyPep8Naming
class EspeakDriver(object):
_moduleInitialized = False
_defaultVoice = ''
def __init__(self, proxy):
if not EspeakDriver._moduleInitialized:
# espeak cannot initialize more than once per process and has
# issues when terminating from python (assert error on close)
# so just keep it alive and init once
rate = _espeak.Initialize(_espeak.AUDIO_OUTPUT_RETRIEVAL, 1000)
if rate == -1:
raise RuntimeError('could not initialize espeak')
EspeakDriver._defaultVoice = 'default'
EspeakDriver._moduleInitialized = True
self._proxy = proxy
self._looping = False
self._stopping = False
self._speaking = False
self._text_to_say = None
self._data_buffer = b''
self._numerise_buffer = []
_espeak.SetSynthCallback(self._onSynth)
self.setProperty('voice', EspeakDriver._defaultVoice)
self.setProperty('rate', 200)
self.setProperty('volume', 1.0)
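# numerise()/decode_numeric() smuggle a Python object (here: the output
# filename passed to save_to_file) through espeak's opaque user_data pointer:
# the object is stored in a list and only its 1-based index crosses the C
# boundary as a c_void_p; decode_numeric() maps the index back to the object.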
def numerise(self, data):
self._numerise_buffer.append(data)
return ctypes.c_void_p(len(self._numerise_buffer))
def decode_numeric(self, data):
return self._numerise_buffer[int(data) - 1]
@staticmethod
def destroy():
_espeak.SetSynthCallback(None)
def stop(self):
if _espeak.IsPlaying():
self._stopping = True
_espeak.Cancel()
@staticmethod
def getProperty(name: str):
if name == 'voices':
voices = []
for v in _espeak.ListVoices(None):
kwargs = {'id': fromUtf8(v.name), 'name': fromUtf8(v.name)}
if v.languages:
try:
language_code_bytes = v.languages[1:]
language_code = language_code_bytes.decode('utf-8', errors='ignore')
kwargs['languages'] = [language_code]
except UnicodeDecodeError as e:
kwargs['languages'] = ["Unknown"]
genders = [None, 'male', 'female']
kwargs['gender'] = genders[v.gender]
kwargs['age'] = v.age or None
voices.append(Voice(**kwargs))
return voices
elif name == 'voice':
voice = _espeak.GetCurrentVoice()
return fromUtf8(voice.contents.name)
elif name == 'rate':
return _espeak.GetParameter(_espeak.RATE)
elif name == 'volume':
return _espeak.GetParameter(_espeak.VOLUME) / 100.0
elif name == 'pitch':
return _espeak.GetParameter(_espeak.PITCH)
else:
raise KeyError('unknown property %s' % name)
@staticmethod
def setProperty(name: str, value):
if name == 'voice':
if value is None:
return
try:
utf8Value = toUtf8(value)
_espeak.SetVoiceByName(utf8Value)
except ctypes.ArgumentError as e:
raise ValueError(str(e))
elif name == 'rate':
try:
_espeak.SetParameter(_espeak.RATE, value, 0)
except ctypes.ArgumentError as e:
raise ValueError(str(e))
elif name == 'volume':
try:
_espeak.SetParameter(
_espeak.VOLUME, int(round(value * 100, 2)), 0)
except TypeError as e:
raise ValueError(str(e))
elif name == 'pitch':
try:
_espeak.SetParameter(
_espeak.PITCH, int(value), 0
)
except TypeError as e:
raise ValueError(str(e))
else:
raise KeyError('unknown property %s' % name)
def save_to_file(self, text, filename):
code = self.numerise(filename)
_espeak.Synth(toUtf8(text), flags=_espeak.ENDPAUSE | _espeak.CHARS_UTF8, user_data=code)
def _start_synthesis(self, text):
self._proxy.setBusy(True)
self._proxy.notify('started-utterance')
self._speaking = True
self._data_buffer = b'' # Ensure buffer is cleared before starting
try:
_espeak.Synth(toUtf8(text), flags=_espeak.ENDPAUSE | _espeak.CHARS_UTF8)
except Exception as e:
self._proxy.setBusy(False)
self._proxy.notify('error', exception=e)
raise
def _onSynth(self, wav, numsamples, events):
i = 0
while True:
event = events[i]
if event.type == _espeak.EVENT_LIST_TERMINATED:
break
if event.type == _espeak.EVENT_WORD:
if self._text_to_say:
start_index = event.text_position-1
end_index = start_index + event.length
word = self._text_to_say[start_index:end_index]
else:
word = "Unknown"
self._proxy.notify('started-word', name=word, location=event.text_position, length=event.length)
elif event.type == _espeak.EVENT_END:
stream = NamedTemporaryFile(delete=False, suffix='.wav')
try:
with wave.open(stream, 'wb') as f:
f.setnchannels(1)
f.setsampwidth(2)
f.setframerate(22050.0)
f.writeframes(self._data_buffer)
self._data_buffer = b''
if event.user_data:
os.system(f'ffmpeg -y -i {stream.name} {self.decode_numeric(event.user_data)} -loglevel quiet')
else:
if platform.system() == 'Darwin': # macOS
try:
result = subprocess.run(['afplay', stream.name], check=True, capture_output=True, text=True)
except subprocess.CalledProcessError as e:
raise RuntimeError(f"[EspeakDriver._onSynth] Mac afplay failed with error: {e}")
elif platform.system() == 'Linux':
os.system(f'aplay {stream.name} -q')
elif platform.system() == 'Windows':
winsound.PlaySound(stream.name, winsound.SND_FILENAME) # Blocking playback
except Exception as e:
raise RuntimeError(f"Error during playback: {e}")
finally:
try:
stream.close() # Ensure the file is closed
os.remove(stream.name)
except Exception as e:
raise RuntimeError(f"Error deleting temporary WAV file: {e}")
self._proxy.notify('finished-utterance', completed=True)
self._proxy.setBusy(False)
self.endLoop() # End the loop here
break # Exit the loop after handling the termination event
i += 1
if numsamples > 0:
self._data_buffer += ctypes.string_at(wav, numsamples * ctypes.sizeof(ctypes.c_short))
return 0
def endLoop(self):
self._looping = False
def startLoop(self):
first = True
self._looping = True
while self._looping:
if not self._looping:
break
if first:
self._proxy.setBusy(False)
first = False
if self._text_to_say:
self._start_synthesis(self._text_to_say)
self.iterate()
time.sleep(0.01)
def iterate(self):
if not self._looping:
return
if self._stopping:
_espeak.Cancel()
self._stopping = False
self._proxy.notify('finished-utterance', completed=False)
self._proxy.setBusy(False)
self.endLoop()
def say(self, text):
self._text_to_say = text

View File

@@ -0,0 +1,165 @@
# noinspection PyUnresolvedReferences
import objc
from AppKit import NSSpeechSynthesizer
from Foundation import *
from PyObjCTools import AppHelper
# noinspection PyProtectedMember
from PyObjCTools.AppHelper import PyObjCAppHelperRunLoopStopper
from ..voice import Voice
# noinspection PyUnresolvedReferences
class RunLoopStopper(PyObjCAppHelperRunLoopStopper):
"""
Overrides PyObjCAppHelperRunLoopStopper to terminate after endLoop.
"""
def __init__(self):
self.shouldStop = False
def init(self):
return objc.super(RunLoopStopper, self).init()
def stop(self):
self.shouldStop = True
# noinspection PyPep8Naming
def buildDriver(proxy):
return NSSpeechDriver.alloc().initWithProxy(proxy)
# noinspection PyUnresolvedReferences,PyPep8Naming,PyUnusedLocal
class NSSpeechDriver(NSObject):
def __init__(self):
self._proxy = None
self._tts = None
self._completed = False
self._current_text = ''
@objc.python_method
def initWithProxy(self, proxy):
try:
proxy_attr = objc.super(NSSpeechDriver, self).init()
except AttributeError:
proxy_attr = self
if proxy_attr:
self._proxy = proxy
self._tts = NSSpeechSynthesizer.alloc().initWithVoice_(None)
self._tts.setDelegate_(self)
# default rate
self._tts.setRate_(200)
self._completed = True
return self
def destroy(self):
self._tts.setDelegate_(None)
del self._tts
def onPumpFirst_(self, timer):
self._proxy.setBusy(False)
def startLoop(self):
# https://github.com/ronaldoussoren/pyobjc/blob/mater/pyobjc-framework-Cocoa/Lib/PyObjCTools/AppHelper.py#L243C25-L243C25 # noqa
NSTimer.scheduledTimerWithTimeInterval_target_selector_userInfo_repeats_(
0.0, self, 'onPumpFirst:', None, False
)
runLoop = NSRunLoop.currentRunLoop()
stopper = RunLoopStopper.alloc().init()
PyObjCAppHelperRunLoopStopper.addRunLoopStopper_toRunLoop_(stopper, runLoop)
while stopper.shouldRun():
nextfire = runLoop.limitDateForMode_(NSDefaultRunLoopMode)
soon = NSDate.dateWithTimeIntervalSinceNow_(0) # maxTimeout in runConsoleEventLoop
if nextfire is not None:
nextfire = soon.earlierDate_(nextfire)
if not runLoop.runMode_beforeDate_(NSDefaultRunLoopMode, nextfire):
stopper.stop()
break
PyObjCAppHelperRunLoopStopper.removeRunLoopStopperFromRunLoop_(runLoop)
@staticmethod
def endLoop():
AppHelper.stopEventLoop()
def iterate(self):
self._proxy.setBusy(False)
yield
@objc.python_method
def say(self, text):
self._proxy.setBusy(True)
self._completed = True
self._proxy.notify('started-utterance')
self._current_text = text
self._tts.startSpeakingString_(text)
def stop(self):
if self._proxy.isBusy():
self._completed = False
self._tts.stopSpeaking()
@objc.python_method
def _toVoice(self, attr):
return Voice(attr.get('VoiceIdentifier'), attr.get('VoiceName'),
[attr.get('VoiceLocaleIdentifier', attr.get('VoiceLanguage'))], attr.get('VoiceGender'),
attr.get('VoiceAge'))
@objc.python_method
def getProperty(self, name):
if name == 'voices':
return [self._toVoice(NSSpeechSynthesizer.attributesForVoice_(v))
for v in list(NSSpeechSynthesizer.availableVoices())]
elif name == 'voice':
return self._tts.voice()
elif name == 'rate':
return self._tts.rate()
elif name == 'volume':
return self._tts.volume()
elif name == "pitch":
print("Pitch adjustment not supported when using NSSS")
else:
raise KeyError('unknown property %s' % name)
@objc.python_method
def setProperty(self, name, value):
if name == 'voice':
# vol/rate gets reset, so store and restore it
vol = self._tts.volume()
rate = self._tts.rate()
self._tts.setVoice_(value)
self._tts.setRate_(rate)
self._tts.setVolume_(vol)
elif name == 'rate':
self._tts.setRate_(value)
elif name == 'volume':
self._tts.setVolume_(value)
elif name == 'pitch':
print("Pitch adjustment not supported when using NSSS")
else:
raise KeyError('unknown property %s' % name)
@objc.python_method
def save_to_file(self, text, filename):
self._proxy.setBusy(True)
self._completed = True
url = NSURL.fileURLWithPath_(filename)  # NSURL comes from the Foundation star import
self._tts.startSpeakingString_toURL_(text, url)
def speechSynthesizer_didFinishSpeaking_(self, tts, success):
if not self._completed:
success = False
else:
success = True
self._proxy.notify('finished-utterance', completed=success)
self._proxy.setBusy(False)
def speechSynthesizer_willSpeakWord_ofString_(self, tts, rng, text):
if self._current_text:
current_word = self._current_text[rng.location:rng.location + rng.length]
else:
current_word = "Unknown"
self._proxy.notify('started-word', name=current_word, location=rng.location,
length=rng.length)

View File

@@ -0,0 +1,187 @@
# noinspection PyUnresolvedReferences
import comtypes.client  # Importing comtypes.client will make the gen subpackage available
try:
from comtypes.gen import SpeechLib # comtypes
except ImportError:
# Generate the SpeechLib lib and any associated files
engine = comtypes.client.CreateObject("SAPI.SpVoice")
stream = comtypes.client.CreateObject("SAPI.SpFileStream")
# noinspection PyUnresolvedReferences
from comtypes.gen import SpeechLib
# noinspection PyUnresolvedReferences
import math
import os
import time
import weakref
import pythoncom
from ..voice import Voice
from . import fromUtf8, toUtf8
# common voices
MSSAM = 'HKEY_LOCAL_MACHINE\\SOFTWARE\\Microsoft\\Speech\\Voices\\Tokens\\MSSam'
MSMARY = 'HKEY_LOCAL_MACHINE\\SOFTWARE\\Microsoft\\Speech\\Voices\\Tokens\\MSMary'
MSMIKE = 'HKEY_LOCAL_MACHINE\\SOFTWARE\\Microsoft\\Speech\\Voices\\Tokens\\MSMike'
# coeffs for wpm conversion
E_REG = {MSSAM: (137.89, 1.11),
MSMARY: (156.63, 1.11),
MSMIKE: (154.37, 1.11)}
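# These coefficients assume wpm ~= a * (b ** sapi_rate), so setProperty('rate')
# below recovers the SAPI 5 rate value as int(math.log(wpm / a, b)).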
# noinspection PyPep8Naming
def buildDriver(proxy):
return SAPI5Driver(proxy)
# noinspection PyPep8Naming,PyShadowingNames
class SAPI5Driver(object):
def __init__(self, proxy):
self._tts = comtypes.client.CreateObject('SAPI.SPVoice')
# all events
self._tts.EventInterests = 33790
self._event_sink = SAPI5DriverEventSink()
self._event_sink.setDriver(weakref.proxy(self))
self._advise = comtypes.client.GetEvents(self._tts, self._event_sink)
self._proxy = proxy
self._looping = False
self._speaking = False
self._stopping = False
self._current_text = ''
# initial rate
self._rateWpm = 200
self.setProperty('voice', self.getProperty('voice'))
def destroy(self):
self._tts.EventInterests = 0
def say(self, text):
self._proxy.setBusy(True)
self._proxy.notify('started-utterance')
self._speaking = True
self._current_text = text
# call this async otherwise this blocks the callbacks
# see SpeechVoiceSpeakFlags: https://docs.microsoft.com/en-us/previous-versions/windows/desktop/ms720892%28v%3dvs.85%29
# and Speak : https://docs.microsoft.com/en-us/previous-versions/windows/desktop/ms723609(v=vs.85)
self._tts.Speak(fromUtf8(toUtf8(text)), 1) # -> stream_number as described in the remarks of the documentation
def stop(self):
if not self._speaking:
return
self._proxy.setBusy(True)
self._stopping = True
self._tts.Speak('', 3)
def save_to_file(self, text, filename):
cwd = os.getcwd()
stream = comtypes.client.CreateObject('SAPI.SPFileStream')
stream.Open(filename, SpeechLib.SSFMCreateForWrite)
temp_stream = self._tts.AudioOutputStream
self._tts.AudioOutputStream = stream
self._tts.Speak(fromUtf8(toUtf8(text)))
self._tts.AudioOutputStream = temp_stream
stream.close()
os.chdir(cwd)
@staticmethod
def _toVoice(attr):
return Voice(attr.Id, attr.GetDescription())
def _tokenFromId(self, id_):
tokens = self._tts.GetVoices()
for token in tokens:
if token.Id == id_:
return token
raise ValueError('unknown voice id %s' % id_)
def getProperty(self, name):
if name == 'voices':
return [self._toVoice(attr) for attr in self._tts.GetVoices()]
elif name == 'voice':
return self._tts.Voice.Id
elif name == 'rate':
return self._rateWpm
elif name == 'volume':
return self._tts.Volume / 100.0
elif name == 'pitch':
print("Pitch adjustment not supported when using SAPI5")
else:
raise KeyError('unknown property %s' % name)
def setProperty(self, name, value):
if name == 'voice':
token = self._tokenFromId(value)
self._tts.Voice = token
a, b = E_REG.get(value, E_REG[MSMARY])
self._tts.Rate = int(math.log(self._rateWpm / a, b))
elif name == 'rate':
id_ = self._tts.Voice.Id
a, b = E_REG.get(id_, E_REG[MSMARY])
try:
self._tts.Rate = int(math.log(value / a, b))
except TypeError as e:
raise ValueError(str(e))
self._rateWpm = value
elif name == 'volume':
try:
self._tts.Volume = int(round(value * 100, 2))
except TypeError as e:
raise ValueError(str(e))
elif name == 'pitch':
print("Pitch adjustment not supported when using SAPI5")
else:
raise KeyError('unknown property %s' % name)
def startLoop(self):
first = True
self._looping = True
while self._looping:
if first:
self._proxy.setBusy(False)
first = False
pythoncom.PumpWaitingMessages()
time.sleep(0.05)
def endLoop(self):
self._looping = False
def iterate(self):
self._proxy.setBusy(False)
while 1:
pythoncom.PumpWaitingMessages()
yield
# noinspection PyPep8Naming,PyProtectedMember,PyUnusedLocal,PyShadowingNames
class SAPI5DriverEventSink(object):
def __init__(self):
self._driver = None
def setDriver(self, driver):
self._driver = driver
def _ISpeechVoiceEvents_StartStream(self, stream_number, stream_position):
self._driver._proxy.notify(
'started-word', location=stream_number, length=stream_position)
def _ISpeechVoiceEvents_EndStream(self, stream_number, stream_position):
d = self._driver
if d._speaking:
d._proxy.notify('finished-utterance', completed=not d._stopping)
d._speaking = False
d._stopping = False
d._proxy.setBusy(False)
d.endLoop()  # hangs if you don't have this
def _ISpeechVoiceEvents_Word(self, stream_number, stream_position, char, length):
current_text = self._driver._current_text
if current_text:
current_word = current_text[char:char + length]
else:
current_word = "Unknown"
self._driver._proxy.notify(
'started-word', name=current_word, location=char, length=length)