500 lines
18 KiB
Python
500 lines
18 KiB
Python
from __future__ import print_function

import time
from ctypes import (CFUNCTYPE, POINTER, Structure, Union, c_char_p, c_int,
                    c_long, c_short, c_ubyte, c_uint, c_ulong, c_void_p,
                    c_wchar, c_wchar_p, cdll)
|
|
|
|
|
|
def cfunc(name, dll, result, *args):
    """Build a ctypes foreign function for ``name`` exported by ``dll``.

    name:   exported symbol name.
    dll:    loaded ctypes library object.
    result: ctypes return type (or None for void).
    args:   one tuple per parameter, laid out as
            ``(param_name, ctypes_type, direction_flag[, default])``.

    Returns the callable created from the prototype, with parameter
    names and defaults attached through ctypes paramflags.
    """
    arg_types = [spec[1] for spec in args]
    # ctypes expects each paramflag as (direction, name[, default]).
    param_flags = tuple((spec[2], spec[0]) + spec[3:] for spec in args)
    prototype = CFUNCTYPE(result, *arg_types)
    return prototype((name, dll), param_flags)
|
|
|
|
|
|
# Handle of the loaded eSpeak shared library; set by load_library().
dll = None


def load_library():
    """Load the first eSpeak / eSpeak NG shared library that can be found.

    Candidate locations are tried in order (macOS, then Linux, then
    Windows). The first one that loads is stored in the module-level
    ``dll``.

    Returns True when a library was loaded, False when every candidate failed.
    """
    global dll

    macos_candidates = [
        '/usr/local/lib/libespeak-ng.1.dylib',
        '/usr/local/lib/libespeak.dylib',
    ]
    linux_candidates = [
        'libespeak-ng.so.1',
        '/usr/local/lib/libespeak-ng.so.1',
        'libespeak.so.1',
    ]
    windows_candidates = [
        r'C:\Program Files\eSpeak NG\libespeak-ng.dll',
        r'C:\Program Files (x86)\eSpeak NG\libespeak-ng.dll',
    ]

    for candidate in macos_candidates + linux_candidates + windows_candidates:
        try:
            handle = cdll.LoadLibrary(candidate)
        except Exception:
            # Could not be loaded from this location; try the next one.
            continue
        dll = handle
        return True
    return False
|
|
|
|
# Fail at import time when no eSpeak library could be loaded: every
# prototype below needs a valid ``dll``.
if not load_library():
    raise RuntimeError("This means you probably do not have eSpeak or eSpeak-ng installed!")
|
|
|
|
# constants and such from speak_lib.h

# Event type codes, as stored in EVENT.type.
EVENT_LIST_TERMINATED = 0  # sentinel entry that ends an events array
EVENT_WORD = 1
EVENT_SENTENCE = 2
EVENT_MARK = 3
EVENT_PLAY = 4
EVENT_END = 5
EVENT_MSG_TERMINATED = 6
|
|
|
|
|
|
class numberORname(Union):
    """C union holding either an integer or a C string pointer.

    Used as the type of the ``id`` member of EVENT.
    """

    _fields_ = [
        ('number', c_int),
        ('name', c_char_p),
    ]
|
|
|
|
|
|
class EVENT(Structure):
    """ctypes layout of one entry of the events array given to the synth callback.

    ``type`` holds one of the EVENT_* codes; an entry whose type is
    EVENT_LIST_TERMINATED ends the array.
    """

    _fields_ = [
        ('type', c_int),
        ('unique_identifier', c_uint),
        ('text_position', c_int),
        ('length', c_int),
        ('audio_position', c_int),
        ('sample', c_int),
        ('user_data', c_void_p),
        ('id', numberORname),
    ]
|
|
|
|
|
|
# Audio output modes accepted by Initialize(output=...).
AUDIO_OUTPUT_PLAYBACK = 0
AUDIO_OUTPUT_RETRIEVAL = 1
AUDIO_OUTPUT_SYNCHRONOUS = 2
AUDIO_OUTPUT_SYNCH_PLAYBACK = 3

# Status codes returned by the library calls.
EE_OK = 0
EE_INTERNAL_ERROR = -1
EE_BUFFER_FULL = 1
EE_NOT_FOUND = 2
|
|
|
|
Initialize = cfunc(
    'espeak_Initialize', dll, c_int,
    ('output', c_int, 1, AUDIO_OUTPUT_PLAYBACK),
    ('bufflength', c_int, 1, 100),
    ('path', c_char_p, 1, None),
    ('option', c_int, 1, 0),
)
# The parameter names below match the keyword names declared in the
# prototype above (``bufflength`` and ``option``), so they can be used
# directly as keyword arguments.
Initialize.__doc__ = """Must be called before any synthesis functions are called.

   output: the audio data can either be played by eSpeak or passed back by the
      SynthCallback function (one of the AUDIO_OUTPUT_* constants).
   bufflength: The length in mS of sound buffers passed to the SynthCallback function.
   path: The directory which contains the espeak-data directory, or NULL (None)
      for the default location.
   option: bit 0: 1=allow espeakEVENT_PHONEME events.

   Returns: sample rate in Hz, or -1 (EE_INTERNAL_ERROR)."""
|
|
|
|
# C type of the synthesis callback:
#   int SynthCallback(short *wav, int numsamples, espeak_EVENT *events);
t_espeak_callback = CFUNCTYPE(c_int, POINTER(c_short), c_int, POINTER(EVENT))

cSetSynthCallback = cfunc(
    'espeak_SetSynthCallback', dll, None,
    ('SynthCallback', t_espeak_callback, 1),
)

# Reference to the ctypes wrapper of the registered callback. ctypes requires
# callback objects to stay referenced for as long as C code may call them.
SynthCallback = None


def SetSynthCallback(cb):
    """Must be called before any synthesis functions are called.

    This specifies a function in the calling program which is called when a
    buffer of speech sound data has been produced.

    The callback function is of the form:

        int SynthCallback(short *wav, int numsamples, espeak_EVENT *events);

    wav: is the speech sound data which has been produced.
        NULL indicates that the synthesis has been completed.

    numsamples: is the number of entries in wav. This number may vary, may be
        less than the value implied by the buflength parameter given in
        espeak_Initialize, and may sometimes be zero (which does NOT indicate
        end of synthesis).

    events: an array of espeak_EVENT items which indicate word and sentence
        events, and also the occurrence of <mark> and <audio> elements within
        the text.

    Callback returns: 0=continue synthesis, 1=abort synthesis.
    """
    global SynthCallback
    wrapped = t_espeak_callback(cb)
    SynthCallback = wrapped
    cSetSynthCallback(wrapped)
|
|
|
|
# C type of the URI callback:
#   int UriCallback(int type, const char *uri, const char *base);
t_UriCallback = CFUNCTYPE(c_int, c_int, c_char_p, c_char_p)

cSetUriCallback = cfunc(
    'espeak_SetUriCallback', dll, None,
    ('UriCallback', t_UriCallback, 1),
)

# Reference to the ctypes wrapper of the registered callback. ctypes requires
# callback objects to stay referenced for as long as C code may call them.
UriCallback = None


def SetUriCallback(cb):
    """Register the callback used to deal with <audio> tags.

    This function must be called before synthesis functions are used, in order
    to deal with <audio> tags. It specifies a callback function which is called
    when an <audio> element is encountered and allows the calling program to
    indicate whether the sound file which is specified in the <audio> element
    is available and is to be played.

    The callback function is of the form:

        int UriCallback(int type, const char *uri, const char *base);

    type: type of callback event. Currently only 1= <audio> element

    uri: the "src" attribute from the <audio> element

    base: the "xml:base" attribute (if any) from the <speak> element

    Return: 1=don't play the sound, but speak the text alternative.
            0=place a PLAY event in the event list at the point where the
              <audio> element occurs. The calling program can then play the
              sound at that point.
    """
    global UriCallback
    # Wrap the callable supplied by the caller. Wrapping the module-level
    # ``UriCallback`` (initially None) would register a NULL function pointer.
    UriCallback = t_UriCallback(cb)
    cSetUriCallback(UriCallback)
|
|
|
|
# a few manifest constants

# Character encoding of the text handed to Synth() (part of ``flags``).
CHARS_AUTO = 0
CHARS_UTF8 = 1
CHARS_8BIT = 2
CHARS_WCHAR = 3

# Option bits that may be OR'd into ``flags``.
SSML = 0x10
PHONEMES = 0x100
ENDPAUSE = 0x1000
KEEP_NAMEDATA = 0x2000

# Units of the ``position`` argument of Synth().
POS_CHARACTER = 1
POS_WORD = 2
POS_SENTENCE = 3
|
|
|
|
|
|
cSynth = cfunc(
    'espeak_Synth', dll, c_int,
    ('text', c_char_p, 1),
    ('size', c_long, 1),
    ('position', c_uint, 1, 0),
    ('position_type', c_int, 1, POS_CHARACTER),
    ('end_position', c_uint, 1, 0),
    ('flags', c_uint, 1, CHARS_AUTO),
    ('unique_identifier', POINTER(c_uint), 1, None),
    ('user_data', c_void_p, 1, None),
)


def Synth(text, position=0, position_type=POS_CHARACTER, end_position=0, flags=0, user_data=None):
    """Synthesize speech for the specified text.

    The speech sound data is passed to the calling program in buffers by means
    of the callback function specified by espeak_SetSynthCallback(). The
    command is asynchronous: it is internally buffered and returns as soon as
    possible. If espeak_Initialize was previously called with
    AUDIO_OUTPUT_PLAYBACK as argument, the sound data are played by eSpeak.

    text: The text to be spoken. A byte string is passed to the library
        unchanged; how it is interpreted is determined by the "flags"
        parameter. A text (unicode) string is encoded as UTF-8 first, because
        the underlying ``char *`` argument only accepts bytes.

    position: The position in the text where speaking starts. Zero indicates
        speak from the start of the text.

    position_type: Determines whether "position" is a number of characters,
        words, or sentences.
        Values: espeak.POS_CHARACTER, espeak.POS_WORD, espeak.POS_SENTENCE

    end_position: If set, this gives a character position at which speaking
        will stop. A value of zero indicates no end position.

    flags: These may be OR'd together:
        Type of character codes, one of:
            espeak.CHARS_UTF8   UTF8 encoding
            espeak.CHARS_8BIT   The 8 bit ISO-8859 character set for the
                                particular language.
            espeak.CHARS_AUTO   8 bit or UTF8 (this is the default)
            espeak.CHARS_WCHAR  Wide characters (wchar_t)

        espeak.SSML      Elements within < > are treated as SSML elements, or
                         if not recognised are ignored.

        espeak.PHONEMES  Text within [[ ]] is treated as phonemes codes (in
                         espeak's Hirschenbaum encoding).

        espeak.ENDPAUSE  If set then a sentence pause is added at the end of
                         the text. If not set then this pause is suppressed.

    user_data: pointer which will be passed to the callback function.

    The library's ``size`` argument (equal to or greater than the size of the
    text data in bytes) is computed here as ``len(text) * 10``. Its
    ``unique_identifier`` argument is always passed as NULL.

    Return: EE_OK: operation achieved
            EE_BUFFER_FULL: the command can not be buffered;
              you may try after a while to call the function again.
            EE_INTERNAL_ERROR.
    """
    # type(u'') is ``unicode`` on Python 2 and ``str`` on Python 3.
    if isinstance(text, type(u'')):
        text = text.encode('utf-8')
    return cSynth(text, len(text) * 10, position, position_type, end_position, flags, None, user_data)
|
|
|
|
|
|
cSynth_Mark = cfunc(
    'espeak_Synth_Mark', dll, c_int,
    ('text', c_char_p, 1),
    ('size', c_ulong, 1),
    ('index_mark', c_char_p, 1),
    ('end_position', c_uint, 1, 0),
    ('flags', c_uint, 1, CHARS_AUTO),
    ('unique_identifier', POINTER(c_uint), 1, None),
    ('user_data', c_void_p, 1, None),
)


def Synth_Mark(text, index_mark, end_position=0, flags=CHARS_AUTO):
    """Synthesize speech for the specified text.

    Similar to espeak_Synth() but the start position is specified by the name
    of a <mark> element in the text.

    text: The text to be spoken. A text (unicode) string is encoded as UTF-8
        before being passed to the library; a byte string is passed unchanged.

    index_mark: The "name" attribute of a <mark> element within the text which
        specified the point at which synthesis starts. UTF8 string (a text
        string is encoded as UTF-8 here).

    For the other parameters, see espeak_Synth()

    Return: EE_OK: operation achieved
            EE_BUFFER_FULL: the command can not be buffered;
              you may try after a while to call the function again.
            EE_INTERNAL_ERROR.
    """
    # type(u'') is ``unicode`` on Python 2 and ``str`` on Python 3.
    text_type = type(u'')
    if isinstance(text, text_type):
        text = text.encode('utf-8')
    if isinstance(index_mark, text_type):
        index_mark = index_mark.encode('utf-8')
    # Hand the library's status code back to the caller, as documented.
    return cSynth_Mark(text, len(text) + 1, index_mark, end_position, flags)
|
|
|
|
Key = cfunc(
    'espeak_Key', dll, c_int,
    ('key_name', c_char_p, 1),
)
Key.__doc__ = """Speak the name of a keyboard key.
   Currently this just speaks the "key_name" as given

   Return: EE_OK: operation achieved
           EE_BUFFER_FULL: the command can not be buffered;
             you may try after a while to call the function again.
           EE_INTERNAL_ERROR."""
|
|
|
|
Char = cfunc(
    'espeak_Char', dll, c_int,
    ('character', c_wchar, 1),
)
Char.__doc__ = """Speak the name of the given character

   Return: EE_OK: operation achieved
           EE_BUFFER_FULL: the command can not be buffered;
             you may try after a while to call the function again.
           EE_INTERNAL_ERROR."""
|
|
|
|
# Speech Parameters
# Identifiers accepted as the ``parameter`` argument of SetParameter() and
# GetParameter().
SILENCE = 0  # internal use
RATE = 1
VOLUME = 2
PITCH = 3
RANGE = 4
PUNCTUATION = 5
CAPITALS = 6
EMPHASIS = 7  # internal use
LINELENGTH = 8  # internal use

# Values for the PUNCTUATION parameter.
PUNCT_NONE = 0
PUNCT_ALL = 1
PUNCT_SOME = 2
|
|
|
|
SetParameter = cfunc(
    'espeak_SetParameter', dll, c_int,
    ('parameter', c_int, 1),
    ('value', c_int, 1),
    ('relative', c_int, 1, 0),
)
SetParameter.__doc__ = """Sets the value of the specified parameter.
   relative=0   Sets the absolute value of the parameter.
   relative=1   Sets a relative value of the parameter.

   parameter:
      espeak.RATE:    speaking speed in word per minute.

      espeak.VOLUME:  volume in range 0-100    0=silence

      espeak.PITCH:   base pitch, range 0-100.  50=normal

      espeak.RANGE:   pitch range, range 0-100. 0-monotone, 50=normal

      espeak.PUNCTUATION:  which punctuation characters to announce:
         value in espeak_PUNCT_TYPE (none, all, some),
         see espeak_GetParameter() to specify which characters are announced.

      espeak.CAPITALS: announce capital letters by:
         0=none,
         1=sound icon,
         2=spelling,
         3 or higher, by raising pitch.  This values gives the amount in Hz by which the pitch
            of a word raised to indicate it has a capital letter.

   Return: EE_OK: operation achieved
           EE_BUFFER_FULL: the command can not be buffered;
             you may try after a while to call the function again.
           EE_INTERNAL_ERROR."""
|
|
|
|
# espeak_GetParameter() takes two arguments in C: (parameter, current).
# Declaring only the first leaves ``current`` undefined at call time, so it
# is declared here with a default of 1; existing one-argument callers keep
# working and now get the current value.
GetParameter = cfunc(
    'espeak_GetParameter', dll, c_int,
    ('parameter', c_int, 1),
    ('current', c_int, 1, 1),
)
GetParameter.__doc__ = """Returns the value of the specified parameter.

   parameter: one of the speech parameter constants (espeak.RATE, espeak.VOLUME, ...).

   current=0  Returns the default value of the specified parameter.
   current=1  Returns the current value of the specified parameter, as set by SetParameter()
              (this is the default when "current" is omitted)."""
|
|
|
|
# The C argument is ``const wchar_t *punctlist`` (a zero-terminated wide
# string), so it is declared as c_wchar_p. A by-value c_wchar would pass a
# single character where the library expects a pointer.
SetPunctuationList = cfunc(
    'espeak_SetPunctuationList', dll, c_int,
    ('punctlist', c_wchar_p, 1),
)
SetPunctuationList.__doc__ = """Specified a list of punctuation characters whose names are
   to be spoken when the value of the Punctuation parameter is set to "some".

   punctlist:  A list of character codes, terminated by a zero character
      (pass a text string; ctypes supplies the terminating zero).

   Return: EE_OK: operation achieved
           EE_BUFFER_FULL: the command can not be buffered;
             you may try after a while to call the function again.
           EE_INTERNAL_ERROR."""
|
|
|
|
SetPhonemeTrace = cfunc(
    'espeak_SetPhonemeTrace', dll, None,
    ('value', c_int, 1),
    ('stream', c_void_p, 1),
)
SetPhonemeTrace.__doc__ = """Controls the output of phoneme symbols for the text
   value=0  No phoneme output (default)
   value=1  Output the translated phoneme symbols for the text
   value=2  as (1), but also output a trace of how the translation was done (matching rules and list entries)

   stream   output stream for the phoneme symbols (and trace).  If stream=NULL then it uses stdout."""
|
|
|
|
# espeak_CompileDictionary() takes a trailing ``int flags`` argument in
# speak_lib.h. It is declared here with a default of 0 so that it is never
# left undefined and existing two-argument callers keep working.
CompileDictionary = cfunc(
    'espeak_CompileDictionary', dll, None,
    ('path', c_char_p, 1),
    ('log', c_void_p, 1),
    ('flags', c_int, 1, 0),
)
CompileDictionary.__doc__ = """Compile pronunciation dictionary for a language which corresponds to the currently
   selected voice.  The required voice should be selected before calling this function.

   path:  The directory which contains the language's '_rules' and '_list' files.
          'path' should end with a path separator character ('/').
   log:   Stream for error reports and statistics information. If log=NULL then stderr will be used.
   flags: Option bits passed through to the library (default 0); see speak_lib.h
          for their meaning."""
|
|
|
|
|
|
class VOICE(Structure):
    """ctypes layout of a voice description.

    Returned by ListVoices() and GetCurrentVoice(), and accepted as the
    selection criteria of SetVoiceByProperties().
    """

    _fields_ = [
        ('name', c_char_p),
        ('languages', c_char_p),
        ('identifier', c_char_p),
        ('gender', c_ubyte),
        ('age', c_ubyte),
        ('variant', c_ubyte),
        ('xx1', c_ubyte),
        ('score', c_int),
        ('spare', c_void_p),
    ]

    def __repr__(self):
        """Return ``ClassName(field=value,...)`` listing every declared field."""
        shown = ','.join(
            '%s=%r' % (entry[0], getattr(self, entry[0]))
            for entry in self._fields_
        )
        return '%s(%s)' % (self.__class__.__name__, shown)
|
|
|
|
|
|
cListVoices = cfunc(
    'espeak_ListVoices', dll, POINTER(POINTER(VOICE)),
    ('voice_spec', POINTER(VOICE), 1),
)
cListVoices.__doc__ = """Reads the voice files from espeak-data/voices and creates an array of espeak_VOICE pointers.
   The list is terminated by a NULL pointer

   If voice_spec is NULL then all voices are listed.
   If voice spec is given, then only the voices which are compatible with the voice_spec
   are listed, and they are listed in preference order."""
|
|
|
|
|
|
def ListVoices(voice_spec=None):
    """Return the available voices as a Python list of VOICE objects.

    Reads the voice files from espeak-data/voices.

    If voice_spec is None then all voices are listed.
    If voice spec is given, then only the voices which are compatible with
    the voice_spec are listed, and they are listed in preference order.
    """
    voice_ptrs = cListVoices(voice_spec)
    voices = []
    index = 0
    # The C array has no length; it ends at the first NULL pointer.
    while True:
        entry = voice_ptrs[index]
        if not entry:
            break
        voices.append(entry[0])  # dereference the VOICE pointer
        index += 1
    return voices
|
|
|
|
|
|
SetVoiceByName = cfunc(
    'espeak_SetVoiceByName', dll, c_int,
    ('name', c_char_p, 1),
)
SetVoiceByName.__doc__ = """Searches for a voice with a matching "name" field.  Language is not considered.
   "name" is a UTF8 string.

   Return: EE_OK: operation achieved
           EE_BUFFER_FULL: the command can not be buffered;
             you may try after a while to call the function again.
           EE_INTERNAL_ERROR."""
|
|
|
|
SetVoiceByProperties = cfunc(
    'espeak_SetVoiceByProperties', dll, c_int,
    ('voice_spec', POINTER(VOICE), 1),
)
SetVoiceByProperties.__doc__ = """An espeak_VOICE structure is used to pass criteria to select a voice.  Any of the following
   fields may be set:

   name        NULL, or a voice name

   languages   NULL, or a single language string (with optional dialect), eg. "en-uk", or "en"

   gender      0=not specified, 1=male, 2=female

   age         0=not specified, or an age in years

   variant     After a list of candidates is produced, scored and sorted, "variant" is used to index
               that list and choose a voice.
               variant=0 takes the top voice (i.e. best match). variant=1 takes the next voice, etc"""
|
|
|
|
# Takes no arguments; returns a pointer to a VOICE structure.
GetCurrentVoice = cfunc('espeak_GetCurrentVoice', dll, POINTER(VOICE))
GetCurrentVoice.__doc__ = """Returns the espeak_VOICE data for the currently selected voice.
   This is not affected by temporary voice changes caused by SSML elements such as <voice> and <s>"""
|
|
|
|
# Takes no arguments; returns an EE_* status code.
Cancel = cfunc(
    'espeak_Cancel', dll, c_int,
)
Cancel.__doc__ = """Stop immediately synthesis and audio output of the current text. When this
   function returns, the audio output is fully stopped and the synthesizer is ready to
   synthesize a new message.

   Return: EE_OK: operation achieved
           EE_INTERNAL_ERROR."""
|
|
|
|
# Takes no arguments; returns an int used as a boolean.
IsPlaying = cfunc(
    'espeak_IsPlaying', dll, c_int,
)
IsPlaying.__doc__ = """Returns 1 if audio is played, 0 otherwise."""
|
|
|
|
# Takes no arguments; returns an EE_* status code.
Synchronize = cfunc(
    'espeak_Synchronize', dll, c_int,
)
Synchronize.__doc__ = """This function returns when all data have been spoken.
   Return: EE_OK: operation achieved
           EE_INTERNAL_ERROR."""
|
|
|
|
# Takes no arguments; returns an EE_* status code.
Terminate = cfunc(
    'espeak_Terminate', dll, c_int,
)
Terminate.__doc__ = """last function to be called.
   Return: EE_OK: operation achieved
           EE_INTERNAL_ERROR."""
|
|
|
|
Info = cfunc(
    'espeak_Info', dll, c_char_p,
    ('ptr', c_void_p, 1, 0),
)
Info.__doc__ = """Returns the version number string.
   The parameter is for future use, and should be set to NULL"""
|
|
|
|
if __name__ == '__main__':
    # Small manual demo: speak one sentence and print what the synthesis
    # callback receives.

    def synth_cb(wav, numsample, events):
        """Print the sample count followed by the type of every event received."""
        print(numsample, end="")
        i = 0
        # The events array ends with an EVENT_LIST_TERMINATED entry.
        while events[i].type != EVENT_LIST_TERMINATED:
            print(events[i].type, end="")
            i += 1
        return 0  # 0 = continue synthesis

    samplerate = Initialize(output=AUDIO_OUTPUT_PLAYBACK)
    SetSynthCallback(synth_cb)
    # A byte string is used because the ``text`` argument is declared as
    # c_char_p, which does not accept text strings on Python 3.
    s = b'This is a test, only a test. '
    # print 'pitch=',GetParameter(PITCH)
    # SetParameter(PITCH, 50, 0)
    print(Synth(s))
    # Poll until playback has finished.
    while IsPlaying():
        time.sleep(0.1)