talk2me/venv/lib/python3.11/site-packages/pyttsx3/drivers/espeak.py
2025-04-04 13:23:15 -06:00

233 lines
8.3 KiB
Python

import os
import wave
import platform
import ctypes
import time
import subprocess
from tempfile import NamedTemporaryFile
if platform.system() == 'Windows':
import winsound
from ..voice import Voice
from . import _espeak, fromUtf8, toUtf8
# noinspection PyPep8Naming
def buildDriver(proxy):
    """Create and return the eSpeak driver bound to *proxy*.

    Args:
        proxy: Object handed unchanged to ``EspeakDriver``.

    Returns:
        A new ``EspeakDriver`` instance.
    """
    driver = EspeakDriver(proxy)
    return driver
# noinspection PyPep8Naming
class EspeakDriver(object):
    """Speech driver built on the ``_espeak`` bindings.

    eSpeak is initialised in retrieval mode, so synthesized samples are
    delivered to ``_onSynth`` instead of being played by the library.  The
    samples are buffered in memory and, when the end event arrives, written
    to a temporary WAV file that is either played with a platform tool or
    converted to the requested output file with ``ffmpeg``.
    """

    # True once _espeak.Initialize has succeeded in this process.
    _moduleInitialized = False
    # Voice name applied to every newly constructed driver.
    _defaultVoice = ''

    def __init__(self, proxy):
        """Initialise eSpeak (once per process) and apply default settings.

        Args:
            proxy: Object whose ``setBusy`` and ``notify`` methods are used
                to report driver state and events.

        Raises:
            RuntimeError: If ``_espeak.Initialize`` returns -1.
        """
        if not EspeakDriver._moduleInitialized:
            # espeak cannot initialize more than once per process and has
            # issues when terminating from python (assert error on close)
            # so just keep it alive and init once
            rate = _espeak.Initialize(_espeak.AUDIO_OUTPUT_RETRIEVAL, 1000)
            if rate == -1:
                raise RuntimeError('could not initialize espeak')
            EspeakDriver._defaultVoice = 'default'
            EspeakDriver._moduleInitialized = True
        self._proxy = proxy
        self._looping = False
        self._stopping = False
        self._speaking = False
        self._text_to_say = None
        self._data_buffer = b''
        self._numerise_buffer = []
        _espeak.SetSynthCallback(self._onSynth)
        self.setProperty('voice', EspeakDriver._defaultVoice)
        self.setProperty('rate', 200)
        self.setProperty('volume', 1.0)

    def numerise(self, data):
        """Store *data* and return a 1-based handle to it as a ``c_void_p``.

        The handle is what ``save_to_file`` passes to eSpeak as ``user_data``;
        ``decode_numeric`` maps it back to the stored object.
        """
        self._numerise_buffer.append(data)
        return ctypes.c_void_p(len(self._numerise_buffer))

    def decode_numeric(self, data):
        """Return the object stored by ``numerise`` for the 1-based handle *data*."""
        return self._numerise_buffer[int(data) - 1]

    @staticmethod
    def destroy():
        """Detach the synthesis callback from eSpeak."""
        _espeak.SetSynthCallback(None)

    def stop(self):
        """Cancel the current utterance if eSpeak reports it is playing."""
        if _espeak.IsPlaying():
            self._stopping = True
            _espeak.Cancel()

    @staticmethod
    def getProperty(name: str):
        """Return the current value of the driver property *name*.

        Supported names are ``'voices'`` (list of ``Voice`` objects),
        ``'voice'``, ``'rate'``, ``'volume'`` (eSpeak's value divided by 100)
        and ``'pitch'``.

        Raises:
            KeyError: If *name* is not one of the supported properties.
        """
        if name == 'voices':
            genders = (None, 'male', 'female')
            voices = []
            for v in _espeak.ListVoices(None):
                kwargs = {'id': fromUtf8(v.name), 'name': fromUtf8(v.name)}
                if v.languages:
                    try:
                        # Skip the first byte before decoding.
                        # NOTE(review): presumably eSpeak's language priority
                        # byte - confirm against speak_lib.h.
                        language_code_bytes = v.languages[1:]
                        language_code = language_code_bytes.decode('utf-8', errors='ignore')
                        kwargs['languages'] = [language_code]
                    except UnicodeDecodeError:
                        kwargs['languages'] = ["Unknown"]
                # Guard the lookup so an unexpected gender code yields None
                # instead of an IndexError.
                kwargs['gender'] = genders[v.gender] if v.gender < len(genders) else None
                kwargs['age'] = v.age or None
                voices.append(Voice(**kwargs))
            return voices
        elif name == 'voice':
            voice = _espeak.GetCurrentVoice()
            return fromUtf8(voice.contents.name)
        elif name == 'rate':
            return _espeak.GetParameter(_espeak.RATE)
        elif name == 'volume':
            return _espeak.GetParameter(_espeak.VOLUME) / 100.0
        elif name == 'pitch':
            return _espeak.GetParameter(_espeak.PITCH)
        else:
            raise KeyError('unknown property %s' % name)

    @staticmethod
    def setProperty(name: str, value):
        """Set the driver property *name* to *value*.

        Supported names are ``'voice'`` (ignored when *value* is ``None``),
        ``'rate'``, ``'volume'`` (scaled by 100 before being passed to eSpeak)
        and ``'pitch'``.

        Raises:
            ValueError: If *value* has a type the underlying call rejects.
            KeyError: If *name* is not one of the supported properties.
        """
        if name == 'voice':
            if value is None:
                return
            try:
                utf8Value = toUtf8(value)
                _espeak.SetVoiceByName(utf8Value)
            except ctypes.ArgumentError as e:
                raise ValueError(str(e)) from e
        elif name == 'rate':
            try:
                _espeak.SetParameter(_espeak.RATE, value, 0)
            except ctypes.ArgumentError as e:
                raise ValueError(str(e)) from e
        elif name == 'volume':
            try:
                _espeak.SetParameter(
                    _espeak.VOLUME, int(round(value * 100, 2)), 0)
            except TypeError as e:
                raise ValueError(str(e)) from e
        elif name == 'pitch':
            try:
                _espeak.SetParameter(
                    _espeak.PITCH, int(value), 0
                )
            except TypeError as e:
                raise ValueError(str(e)) from e
        else:
            raise KeyError('unknown property %s' % name)

    def save_to_file(self, text, filename):
        """Synthesize *text*, tagging the request so the audio goes to *filename*.

        The filename is registered with ``numerise`` and its handle is passed
        to eSpeak as ``user_data``; ``_onSynth`` uses that handle to choose
        file export over playback.
        """
        code = self.numerise(filename)
        _espeak.Synth(toUtf8(text), flags=_espeak.ENDPAUSE | _espeak.CHARS_UTF8, user_data=code)

    def _start_synthesis(self, text):
        """Mark the driver busy, announce the utterance and start synthesis.

        Raises:
            Exception: Whatever ``_espeak.Synth`` raises, re-raised after the
                proxy has been marked idle and sent an ``'error'`` notification.
        """
        self._proxy.setBusy(True)
        self._proxy.notify('started-utterance')
        self._speaking = True
        self._data_buffer = b''  # Ensure buffer is cleared before starting
        try:
            _espeak.Synth(toUtf8(text), flags=_espeak.ENDPAUSE | _espeak.CHARS_UTF8)
        except Exception as e:
            self._speaking = False
            self._proxy.setBusy(False)
            self._proxy.notify('error', exception=e)
            raise

    def _onSynth(self, wav, numsamples, events):
        """Synthesis callback: relay events and accumulate audio samples.

        Args:
            wav: Address of the sample data, read with ``ctypes.string_at``.
            numsamples: Number of ``c_short``-sized samples available at *wav*.
            events: Indexable sequence of event records ending with an entry
                of type ``EVENT_LIST_TERMINATED``.

        Returns:
            Always 0.
            NOTE(review): presumably tells eSpeak to continue synthesis -
            confirm against speak_lib.h.
        """
        i = 0
        while True:
            event = events[i]
            if event.type == _espeak.EVENT_LIST_TERMINATED:
                break
            if event.type == _espeak.EVENT_WORD:
                self._notify_word(event)
            elif event.type == _espeak.EVENT_END:
                self._finish_utterance(event.user_data)
                break  # Exit the loop after handling the termination event
            i += 1
        if numsamples > 0:
            self._data_buffer += ctypes.string_at(wav, numsamples * ctypes.sizeof(ctypes.c_short))
        return 0

    def _notify_word(self, event):
        """Send a 'started-word' notification for a word event."""
        if self._text_to_say:
            # text_position is treated as 1-based, hence the -1.
            start_index = event.text_position - 1
            end_index = start_index + event.length
            word = self._text_to_say[start_index:end_index]
        else:
            word = "Unknown"
        self._proxy.notify('started-word', name=word, location=event.text_position, length=event.length)

    def _finish_utterance(self, user_data):
        """Emit the buffered audio, then always release the driver loop."""
        completed = False
        try:
            self._emit_audio(user_data)
            completed = True
        finally:
            # Run even when playback/export fails; otherwise startLoop would
            # spin forever with the proxy still marked busy.
            self._speaking = False
            self._proxy.notify('finished-utterance', completed=completed)
            self._proxy.setBusy(False)
            self.endLoop()  # End the loop here

    def _emit_audio(self, user_data):
        """Write the sample buffer to a temporary WAV and play or export it.

        Raises:
            RuntimeError: If writing, playback/export, or temp-file cleanup fails.
        """
        stream = NamedTemporaryFile(delete=False, suffix='.wav')
        try:
            with wave.open(stream, 'wb') as f:
                f.setnchannels(1)
                f.setsampwidth(2)
                f.setframerate(22050.0)
                f.writeframes(self._data_buffer)
            self._data_buffer = b''
            # Release our handle before another program opens the file.
            stream.close()
            if user_data:
                self._export_wav(stream.name, self.decode_numeric(user_data))
            else:
                self._play_wav(stream.name)
        except Exception as e:
            raise RuntimeError(f"Error during playback: {e}") from e
        finally:
            try:
                stream.close()  # Ensure the file is closed
                os.remove(stream.name)
            except Exception as e:
                raise RuntimeError(f"Error deleting temporary WAV file: {e}") from e

    @staticmethod
    def _export_wav(wav_path, filename):
        """Convert *wav_path* to *filename* with ffmpeg, without using a shell."""
        # An argument list keeps spaces and shell metacharacters in the
        # caller-supplied filename from being interpreted by a shell.
        subprocess.run(
            ['ffmpeg', '-y', '-i', wav_path, str(filename), '-loglevel', 'quiet'],
            check=False,
        )

    @staticmethod
    def _play_wav(wav_path):
        """Play *wav_path* with the current platform's audio tool."""
        system = platform.system()
        if system == 'Darwin':  # macOS
            try:
                subprocess.run(['afplay', wav_path], check=True, capture_output=True, text=True)
            except subprocess.CalledProcessError as e:
                raise RuntimeError(f"[EspeakDriver._onSynth] Mac afplay failed with error: {e}") from e
        elif system == 'Linux':
            subprocess.run(['aplay', wav_path, '-q'], check=False)
        elif system == 'Windows':
            winsound.PlaySound(wav_path, winsound.SND_FILENAME)  # Blocking playback

    def endLoop(self):
        """Ask ``startLoop`` to exit."""
        self._looping = False

    def startLoop(self):
        """Run the driver loop until ``endLoop`` is called.

        The proxy is marked idle, any text queued by ``say`` is synthesized,
        and ``iterate`` is then polled every 10 ms to service stop requests.
        """
        self._looping = True
        self._proxy.setBusy(False)
        if self._text_to_say:
            self._start_synthesis(self._text_to_say)
        while self._looping:
            self.iterate()
            time.sleep(0.01)

    def iterate(self):
        """Service a pending stop request; do nothing when the loop is not running."""
        if not self._looping:
            return
        if self._stopping:
            _espeak.Cancel()
            self._stopping = False
            self._speaking = False
            self._proxy.notify('finished-utterance', completed=False)
            self._proxy.setBusy(False)
            self.endLoop()

    def say(self, text):
        """Queue *text* to be synthesized when ``startLoop`` runs."""
        self._text_to_say = text