From d46df7939a282bd0cd09b4be4e019ff454b4d858 Mon Sep 17 00:00:00 2001
From: adelorenzo
Date: Sat, 12 Apr 2025 00:39:16 -0600
Subject: [PATCH] Add real-time language interpreter feature

This commit adds a real-time interpreter mode to the Voice Language Translator:

- Implements continuous speech-to-translation streaming over WebSockets (Socket.IO)
- Adds background worker threads for transcription, translation, and TTS
- Adds client-side speech detection and audio streaming
- Updates the UI with real-time interpreter controls
- Adds the required dependencies (flask-socketio, eventlet)
---
 app.py               | 249 ++++++++++++++++++++-
 requirements.txt     |   2 +
 static/js/app.js     | 513 ++++++++++++++++++++++++++++++++----------
 templates/index.html | 310 ++++-----------------------
 4 files changed, 690 insertions(+), 384 deletions(-)
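
Reviewer notes (placed after the '---' cut line, so not part of the commit
message): the interpreter protocol can be exercised without a browser. Below is
a rough sketch using the python-socketio client package -- the event names and
payload shapes come from the handlers added to app.py in this patch, but the
snippet itself is illustrative and not part of the change; sample.wav is a
hypothetical pre-recorded test file:

    import socketio  # pip install "python-socketio[client]"

    sio = socketio.Client()

    @sio.on('translation_result')
    def on_translation(data):
        print('Translated:', data['text'])

    sio.connect('http://localhost:5005')
    sio.emit('start_interpreter_session',
             {'source_lang': 'English', 'target_lang': 'Spanish'})
    with open('sample.wav', 'rb') as f:       # hypothetical test recording
        sio.emit('audio_chunk', {'audio': f.read()})
    sio.sleep(5)                              # wait for results to stream back
    sio.disconnect()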
diff --git a/app.py b/app.py
index 39c74a7..0161455 100644
--- a/app.py
+++ b/app.py
@@ -4,7 +4,11 @@ import tempfile
 import requests
 import json
 import logging
+import time
+import threading
+import queue
 from flask import Flask, render_template, request, jsonify, Response, send_file, send_from_directory
+from flask_socketio import SocketIO, emit
 import whisper
 import torch
 import ollama
@@ -18,6 +21,9 @@ app.config['UPLOAD_FOLDER'] = tempfile.mkdtemp()
 app.config['TTS_SERVER'] = os.environ.get('TTS_SERVER_URL', 'http://localhost:5050/v1/audio/speech')
 app.config['TTS_API_KEY'] = os.environ.get('TTS_API_KEY', '56461d8b44607f2cfcb8030dee313a8e')
 
+# Initialize Flask-SocketIO
+socketio = SocketIO(app, cors_allowed_origins="*")
+
 @app.route('/')
 def root_files(filename):
     # Check if requested file is one of the common icon filenames
@@ -393,5 +399,245 @@ def get_audio(filename):
         logger.error(f"Audio retrieval error: {str(e)}")
         return jsonify({'error': f'Audio retrieval failed: {str(e)}'}), 500
 
+# Real-time interpreter functions
+def setup_realtime_interpreter():
+    global whisper_model
+    logger.info("Setting up realtime interpreter...")
+
+    # Create processing queues
+    audio_queue = queue.Queue()
+    transcription_queue = queue.Queue()
+    translation_queue = queue.Queue()
+
+    # Audio processing thread function
+    def process_audio_chunks():
+        while True:
+            try:
+                audio_data, source_lang, session_id = audio_queue.get(timeout=1)
+                if audio_data is None:  # Poison pill to stop thread
+                    break
+
+                # Save audio chunk to a temporary file
+                temp_path = os.path.join(app.config['UPLOAD_FOLDER'], f'chunk_{session_id}_{int(time.time())}.wav')
+                with open(temp_path, 'wb') as f:
+                    f.write(audio_data)
+
+                # Use whisper to transcribe
+                try:
+                    result = whisper_model.transcribe(
+                        temp_path,
+                        language=LANGUAGE_TO_CODE.get(source_lang, None)
+                    )
+
+                    # If we got a meaningful result, send it for translation
+                    if result and result["text"].strip():
+                        transcription_queue.put((result["text"], source_lang, session_id))
+                        # Emit transcription result to client
+                        socketio.emit('transcription_result', {
+                            'text': result["text"],
+                            'session_id': session_id
+                        })
+                except Exception as e:
+                    logger.error(f"Error transcribing audio chunk: {str(e)}")
+
+                # Clean up temp file
+                if os.path.exists(temp_path):
+                    os.remove(temp_path)
+
+                audio_queue.task_done()
+            except queue.Empty:
+                continue
+            except Exception as e:
+                logger.error(f"Error in audio processing thread: {str(e)}")
+
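+    # NOTE: each worker in this pipeline follows the same shape: block on its
+    # input queue with a 1-second timeout, treat a None payload as a shutdown
+    # signal, and hand results to the next queue. Whisper inference is assumed
+    # to be safe here because it only ever runs on this one dedicated thread.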
+    # Translation processing thread function
+    def process_transcriptions():
+        while True:
+            try:
+                text, source_lang, session_id = transcription_queue.get(timeout=1)
+                if text is None:  # Poison pill to stop thread
+                    break
+
+                # Get the target language from the active sessions
+                target_lang = active_sessions.get(session_id, {}).get('target_lang', 'English')
+
+                # Create a prompt for Gemma 3 translation
+                prompt = f"""
+                Translate the following text from {source_lang} to {target_lang}:
+
+                "{text}"
+
+                Provide only the translation without any additional text.
+                """
+
+                # Use Ollama to interact with Gemma 3
+                try:
+                    response = ollama.chat(
+                        model="gemma3:27b",
+                        messages=[
+                            {
+                                "role": "user",
+                                "content": prompt
+                            }
+                        ]
+                    )
+
+                    translated_text = response['message']['content'].strip()
+                    translation_queue.put((translated_text, target_lang, session_id))
+
+                    # Emit translation result to client
+                    socketio.emit('translation_result', {
+                        'text': translated_text,
+                        'session_id': session_id
+                    })
+                except Exception as e:
+                    logger.error(f"Error translating text: {str(e)}")
+
+                transcription_queue.task_done()
+            except queue.Empty:
+                continue
+            except Exception as e:
+                logger.error(f"Error in translation thread: {str(e)}")
+
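+    # NOTE: the worker above assumes the gemma3:27b model has already been
+    # pulled into the local Ollama instance (`ollama pull gemma3:27b`);
+    # substitute a smaller tag if that model does not fit in memory.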
+    # TTS processing thread function
+    def process_translations():
+        while True:
+            try:
+                text, language, session_id = translation_queue.get(timeout=1)
+                if text is None:  # Poison pill to stop thread
+                    break
+
+                voice = LANGUAGE_TO_VOICE.get(language, 'echo')
+
+                # Get TTS server URL and API key from config
+                tts_server_url = app.config['TTS_SERVER']
+                tts_api_key = app.config['TTS_API_KEY']
+
+                # Request TTS from the OpenAI Edge TTS server
+                try:
+                    headers = {
+                        "Content-Type": "application/json",
+                        "Authorization": f"Bearer {tts_api_key}"
+                    }
+
+                    # Proper OpenAI TTS payload
+                    payload = {
+                        "input": text,
+                        "voice": voice,
+                        "response_format": "mp3",
+                        "speed": 1.0
+                    }
+
+                    tts_response = requests.post(
+                        tts_server_url,
+                        headers=headers,
+                        json=payload,
+                        timeout=15
+                    )
+
+                    if tts_response.status_code == 200:
+                        # Save audio temporarily
+                        temp_audio_path = os.path.join(app.config['UPLOAD_FOLDER'], f'output_{session_id}_{int(time.time())}.mp3')
+                        with open(temp_audio_path, 'wb') as f:
+                            f.write(tts_response.content)
+
+                        # Emit audio URL to client
+                        audio_url = f'/get_audio/{os.path.basename(temp_audio_path)}'
+                        socketio.emit('audio_ready', {
+                            'audio_url': audio_url,
+                            'session_id': session_id
+                        })
+                except Exception as e:
+                    logger.error(f"Error generating TTS: {str(e)}")
+
+                translation_queue.task_done()
+            except queue.Empty:
+                continue
+            except Exception as e:
+                logger.error(f"Error in TTS thread: {str(e)}")
+
+    # Start worker threads
+    threading.Thread(target=process_audio_chunks, daemon=True).start()
+    threading.Thread(target=process_transcriptions, daemon=True).start()
+    threading.Thread(target=process_translations, daemon=True).start()
+
+    logger.info("Realtime interpreter threads started")
+
+    return audio_queue
+
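+# NOTE: the workers are daemon threads, so they exit with the main process;
+# the None "poison pill" checks above are an escape hatch for a future
+# graceful shutdown and are not triggered anywhere yet.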
+# Dictionary to store active interpretation sessions
+active_sessions = {}
+
+# Socket.IO event handlers
+@socketio.on('connect')
+def handle_connect():
+    logger.info(f"Client connected: {request.sid}")
+
+@socketio.on('disconnect')
+def handle_disconnect():
+    logger.info(f"Client disconnected: {request.sid}")
+    # Clean up any session data
+    if request.sid in active_sessions:
+        del active_sessions[request.sid]
+
+@socketio.on('start_interpreter_session')
+def handle_start_session(data):
+    source_lang = data.get('source_lang', 'English')
+    target_lang = data.get('target_lang', 'Spanish')
+
+    # Store session info
+    active_sessions[request.sid] = {
+        'source_lang': source_lang,
+        'target_lang': target_lang,
+        'start_time': time.time()
+    }
+
+    logger.info(f"Started interpreter session {request.sid}: {source_lang} -> {target_lang}")
+    emit('session_started', {'session_id': request.sid})
+
+@socketio.on('end_interpreter_session')
+def handle_end_session():
+    if request.sid in active_sessions:
+        logger.info(f"Ended interpreter session {request.sid}")
+        del active_sessions[request.sid]
+        emit('session_ended')
+
+@socketio.on('audio_chunk')
+def handle_audio_chunk(data):
+    if request.sid not in active_sessions:
+        emit('error', {'message': 'No active session'})
+        return
+
+    # Get audio data from the client
+    audio_data = data.get('audio')
+    if not audio_data:
+        emit('error', {'message': 'No audio data received'})
+        return
+
+    # Get session details
+    source_lang = active_sessions[request.sid]['source_lang']
+
+    # Add audio chunk to processing queue
+    audio_processing_queue.put((audio_data, source_lang, request.sid))
+
+    emit('chunk_received')
+
+# Initialize the audio processing queue
+audio_processing_queue = None
+
 if __name__ == '__main__':
-    app.run(host='0.0.0.0', port=5005, debug=True)
+    # Setup real-time interpreter
+    audio_processing_queue = setup_realtime_interpreter()
+
+    # Run with SocketIO instead of regular Flask
+    socketio.run(app, host='0.0.0.0', port=5005, debug=True)
diff --git a/requirements.txt b/requirements.txt
index f619df1..92a3237 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -3,3 +3,5 @@ requests
 openai-whisper
 torch
 ollama
+flask-socketio
+eventlet
\ No newline at end of file
diff --git a/static/js/app.js b/static/js/app.js
index c904d79..7139a46 100644
--- a/static/js/app.js
+++ b/static/js/app.js
@@ -8,6 +8,9 @@ document.addEventListener('DOMContentLoaded', function() {
     // Initialize app
     initApp();
 
+    // Initialize the real-time interpreter
+    initRealtimeInterpreter();
+
     // Check for PWA installation prompts
     initInstallPrompt();
 });
@@ -64,11 +67,6 @@ function initApp() {
     const progressContainer = document.getElementById('progressContainer');
     const progressBar = document.getElementById('progressBar');
     const audioPlayer = document.getElementById('audioPlayer');
-    const ttsServerAlert = document.getElementById('ttsServerAlert');
-    const ttsServerMessage = document.getElementById('ttsServerMessage');
-    const ttsServerUrl = document.getElementById('ttsServerUrl');
-    const ttsApiKey = document.getElementById('ttsApiKey');
-    const updateTtsServer = document.getElementById('updateTtsServer');
 
     // Set initial values
     let isRecording = false;
@@ -76,52 +74,6 @@ function initApp() {
     let audioChunks = [];
     let currentSourceText = '';
     let currentTranslationText = '';
-    let currentTtsServerUrl = '';
-
-    // Check TTS server status on page load
-    checkTtsServer();
-
-    // Check for saved translations in IndexedDB
-    loadSavedTranslations();
-
-    // Update TTS server URL and API key
-    updateTtsServer.addEventListener('click', function() {
-        const newUrl = ttsServerUrl.value.trim();
-        const newApiKey = ttsApiKey.value.trim();
-
-        if (!newUrl && !newApiKey) {
-            alert('Please provide at least one value to update');
-            return;
-        }
-
-        const updateData = {};
-        if (newUrl) updateData.server_url = newUrl;
-        if (newApiKey) updateData.api_key = newApiKey;
-
-        fetch('/update_tts_config', {
-            method: 'POST',
-            headers: {
-                'Content-Type': 'application/json'
-            },
-            body: JSON.stringify(updateData)
-        })
-        .then(response => response.json())
-        .then(data => {
-            if (data.success) {
-                statusIndicator.textContent = 'TTS configuration updated';
-                // Save URL to localStorage but not the API key for security
-                if (newUrl) localStorage.setItem('ttsServerUrl', newUrl);
-                // Check TTS server with new configuration
-                checkTtsServer();
-            } else {
-                alert('Failed to update TTS configuration: ' + data.error);
-            }
-        })
-        .catch(error => {
-            console.error('Failed to update TTS config:', error);
-            alert('Failed to update TTS configuration. See console for details.');
-        });
-    });
 
     // Make sure target language is different from source
     if (targetLanguage.options[0].value === sourceLanguage.value) {
@@ -337,29 +289,13 @@ function initApp() {
                 audioPlayer.play();
             } else {
                 statusIndicator.textContent = 'TTS failed';
-
-                // Show TTS server alert with error message
-                ttsServerAlert.classList.remove('d-none');
-                ttsServerAlert.classList.remove('alert-success');
-                ttsServerAlert.classList.add('alert-warning');
-                ttsServerMessage.textContent = data.error;
-
                 alert('Failed to play audio: ' + data.error);
-
-                // Check TTS server status again
-                checkTtsServer();
             }
         })
         .catch(error => {
             hideProgress();
             console.error('TTS error:', error);
             statusIndicator.textContent = 'TTS failed';
-
-            // Show TTS server alert
-            ttsServerAlert.classList.remove('d-none');
-            ttsServerAlert.classList.remove('alert-success');
-            ttsServerAlert.classList.add('alert-warning');
-            ttsServerMessage.textContent = 'Failed to connect to TTS server';
         });
     }
 
@@ -377,48 +313,6 @@ function initApp() {
         playTranslation.disabled = true;
     });
 
-    // Function to check TTS server status
-    function checkTtsServer() {
-        fetch('/check_tts_server')
-        .then(response => response.json())
-        .then(data => {
-            currentTtsServerUrl = data.url;
-            ttsServerUrl.value = currentTtsServerUrl;
-
-            // Load saved API key if available
-            const savedApiKey = localStorage.getItem('ttsApiKeySet');
-            if (savedApiKey === 'true') {
-                ttsApiKey.placeholder = '••••••• (API key saved)';
-            }
-
-            if (data.status === 'error' || data.status === 'auth_error') {
-                ttsServerAlert.classList.remove('d-none');
-                ttsServerAlert.classList.remove('alert-success');
-                ttsServerAlert.classList.add('alert-warning');
-                ttsServerMessage.textContent = data.message;
-
-                if (data.status === 'auth_error') {
-                    ttsServerMessage.textContent = 'Authentication error with TTS server. Please check your API key.';
-                }
-            } else {
-                ttsServerAlert.classList.remove('d-none');
-                ttsServerAlert.classList.remove('alert-warning');
-                ttsServerAlert.classList.add('alert-success');
-                ttsServerMessage.textContent = 'TTS server is online and ready.';
-                setTimeout(() => {
-                    ttsServerAlert.classList.add('d-none');
-                }, 3000);
-            }
-        })
-        .catch(error => {
-            console.error('Failed to check TTS server:', error);
-            ttsServerAlert.classList.remove('d-none');
-            ttsServerAlert.classList.remove('alert-success');
-            ttsServerAlert.classList.add('alert-warning');
-            ttsServerMessage.textContent = 'Failed to check TTS server status.';
-        });
-    }
-
     // Progress indicator functions
     function showProgress() {
         progressContainer.classList.remove('d-none');
@@ -444,6 +338,407 @@ function initApp() {
             progressBar.style.width = '0%';
         }, 500);
     }
+
+    // Check TTS server status on page load if the alert element exists
+    const ttsServerAlert = document.getElementById('ttsServerAlert');
+    if (ttsServerAlert) {
+        checkTtsServer();
+    }
+
+    // Function to check TTS server status
+    function checkTtsServer() {
+        const ttsServerMessage = document.getElementById('ttsServerMessage');
+        const ttsServerUrl = document.getElementById('ttsServerUrl');
+
+        fetch('/check_tts_server')
+        .then(response => response.json())
+        .then(data => {
+            let currentTtsServerUrl = data.url;
+            if (ttsServerUrl) ttsServerUrl.value = currentTtsServerUrl;
+
+            // Load saved API key if available
+            const savedApiKey = localStorage.getItem('ttsApiKeySet');
+            const ttsApiKey = document.getElementById('ttsApiKey');
+            if (ttsApiKey && savedApiKey === 'true') {
+                ttsApiKey.placeholder = '••••••• (API key saved)';
+            }
+
+            if (ttsServerAlert && ttsServerMessage) {
+                if (data.status === 'error' || data.status === 'auth_error') {
+                    ttsServerAlert.classList.remove('d-none');
+                    ttsServerAlert.classList.remove('alert-success');
+                    ttsServerAlert.classList.add('alert-warning');
+                    ttsServerMessage.textContent = data.message;
+
+                    if (data.status === 'auth_error') {
+                        ttsServerMessage.textContent = 'Authentication error with TTS server. Please check your API key.';
+                    }
+                } else {
+                    ttsServerAlert.classList.remove('d-none');
+                    ttsServerAlert.classList.remove('alert-warning');
+                    ttsServerAlert.classList.add('alert-success');
+                    ttsServerMessage.textContent = 'TTS server is online and ready.';
+                    setTimeout(() => {
+                        ttsServerAlert.classList.add('d-none');
+                    }, 3000);
+                }
+            }
+        })
+        .catch(error => {
+            console.error('Failed to check TTS server:', error);
+            if (ttsServerAlert && ttsServerMessage) {
+                ttsServerAlert.classList.remove('d-none');
+                ttsServerAlert.classList.remove('alert-success');
+                ttsServerAlert.classList.add('alert-warning');
+                ttsServerMessage.textContent = 'Failed to check TTS server status.';
+            }
+        });
+    }
+}
+
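+// Client side of the interpreter protocol. The browser emits
+// 'start_interpreter_session', a stream of 'audio_chunk' events (small
+// standalone WAV buffers), and 'end_interpreter_session'; the server answers
+// with 'transcription_result', 'translation_result' and 'audio_ready',
+// each tagged with a session_id so results from a stale session can be
+// ignored.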
+// Real-time interpreter module
+function initRealtimeInterpreter() {
+    // DOM elements
+    const realtimeBtn = document.getElementById('realtimeBtn');
+    const realtimeStatusIndicator = document.getElementById('realtimeStatusIndicator');
+    const sourceLanguage = document.getElementById('sourceLanguage');
+    const targetLanguage = document.getElementById('targetLanguage');
+    const sourceText = document.getElementById('sourceText');
+    const translatedText = document.getElementById('translatedText');
+    const audioPlayer = document.getElementById('audioPlayer');
+
+    // SocketIO connection
+    let socket = null;
+    let mediaRecorder = null;
+    let audioContext = null;
+    let isInterpreting = false;
+    let sessionId = null;
+
+    // Audio processing variables
+    const bufferSize = 4096;
+    let audioProcessor = null;
+    let micStream = null;
+
+    // Initialize the audio context
+    function initAudioContext() {
+        try {
+            window.AudioContext = window.AudioContext || window.webkitAudioContext;
+            audioContext = new AudioContext();
+            return true;
+        } catch (e) {
+            console.error('Web Audio API is not supported in this browser', e);
+            return false;
+        }
+    }
+
+    // Connect to Socket.IO server
+    function connectSocket() {
+        if (socket) {
+            return; // Already connected
+        }
+
+        // Connect to the same host where the Flask app is running
+        socket = io.connect(window.location.origin, {
+            forceNew: true
+        });
+
+        // Socket event handlers
+        socket.on('connect', () => {
+            console.log('Socket connected');
+        });
+
+        socket.on('disconnect', () => {
+            console.log('Socket disconnected');
+            stopInterpreting();
+        });
+
+        socket.on('error', (data) => {
+            console.error('Socket error:', data.message);
+            realtimeStatusIndicator.textContent = `Error: ${data.message}`;
+        });
+
+        socket.on('session_started', (data) => {
+            sessionId = data.session_id;
+            console.log('Interpreter session started:', sessionId);
+            realtimeStatusIndicator.textContent = 'Interpreter active - listening...';
+        });
+
+        socket.on('session_ended', () => {
+            console.log('Interpreter session ended');
+            sessionId = null;
+            realtimeStatusIndicator.textContent = 'Interpreter stopped';
+        });
+
+        socket.on('chunk_received', () => {
+            // This is a confirmation that the server received our audio chunk
+            // We can use this to update UI if needed
+        });
+
+        socket.on('transcription_result', (data) => {
+            if (data.session_id === sessionId) {
+                // Update source text with transcription
+                if (sourceText.querySelector('p.text-muted')) {
+                    sourceText.innerHTML = ''; // Clear placeholder
+                }
+
+                const p = document.createElement('p');
+                p.textContent = data.text;
+                sourceText.appendChild(p);
+
+                // Auto-scroll to bottom
+                sourceText.scrollTop = sourceText.scrollHeight;
+            }
+        });
+
+        socket.on('translation_result', (data) => {
+            if (data.session_id === sessionId) {
+                // Update translated text
+                if (translatedText.querySelector('p.text-muted')) {
+                    translatedText.innerHTML = ''; // Clear placeholder
+                }
+
+                const p = document.createElement('p');
+                p.textContent = data.text;
+                translatedText.appendChild(p);
+
+                // Auto-scroll to bottom
+                translatedText.scrollTop = translatedText.scrollHeight;
+            }
+        });
+
+        socket.on('audio_ready', (data) => {
+            if (data.session_id === sessionId) {
+                // Play the translated audio
+                audioPlayer.src = data.audio_url;
+                audioPlayer.play();
+            }
+        });
+    }
+
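+    // NOTE: startInterpreting() below relies on ScriptProcessorNode, which is
+    // deprecated in favor of AudioWorklet but still widely supported; moving
+    // to AudioWorklet would take the silence detection off the main thread.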
+    // Start the real-time interpreter
+    function startInterpreting() {
+        if (!initAudioContext()) {
+            alert('Your browser does not support the Web Audio API required for the real-time interpreter.');
+            return;
+        }
+
+        connectSocket();
+
+        // Request microphone access
+        navigator.mediaDevices.getUserMedia({ audio: true, video: false })
+        .then((stream) => {
+            micStream = stream;
+
+            // Start a new interpreter session
+            socket.emit('start_interpreter_session', {
+                source_lang: sourceLanguage.value,
+                target_lang: targetLanguage.value
+            });
+
+            // Setup audio processing
+            const audioInput = audioContext.createMediaStreamSource(stream);
+            audioProcessor = audioContext.createScriptProcessor(bufferSize, 1, 1);
+
+            // Connect the audio processing node
+            audioInput.connect(audioProcessor);
+            audioProcessor.connect(audioContext.destination);
+
+            // Process audio data
+            let silenceStart = performance.now();
+            let isSilent = true;
+            const silenceThreshold = 0.01; // Adjust based on testing
+            const silenceDelay = 1000; // 1 second of silence before stopping
+
+            audioProcessor.onaudioprocess = function(e) {
+                if (!isInterpreting) return;
+
+                // Get audio data
+                const inputData = e.inputBuffer.getChannelData(0);
+
+                // Check for silence
+                let sum = 0;
+                for (let i = 0; i < inputData.length; i++) {
+                    sum += Math.abs(inputData[i]);
+                }
+                const average = sum / inputData.length;
+
+                // Detect speech vs silence
+                if (average > silenceThreshold) {
+                    if (isSilent) {
+                        isSilent = false;
+                        realtimeStatusIndicator.textContent = 'Interpreting...';
+                    }
+                    silenceStart = performance.now(); // Reset silence timer
+                } else if (!isSilent && (performance.now() - silenceStart > silenceDelay)) {
+                    isSilent = true;
+                    realtimeStatusIndicator.textContent = 'Waiting for speech...';
+                }
+
+                // Convert buffer to WAV format
+                const wavBuffer = convertToWav(inputData);
+
+                // Send to server if not silent or within silence delay
+                if (!isSilent || (performance.now() - silenceStart <= silenceDelay)) {
+                    socket.emit('audio_chunk', {
+                        audio: wavBuffer
+                    });
+                }
+            };
+
+            // Update UI
+            isInterpreting = true;
+            realtimeBtn.textContent = 'Stop Interpreter';
+            realtimeBtn.classList.replace('btn-primary', 'btn-danger');
+            realtimeStatusIndicator.textContent = 'Interpreter active - listening...';
+
+            // Disable language selectors during interpretation
+            sourceLanguage.disabled = true;
+            targetLanguage.disabled = true;
+        })
+        .catch((error) => {
+            console.error('Error accessing microphone:', error);
+            realtimeStatusIndicator.textContent = 'Error: Microphone access denied';
+            alert('Error accessing microphone. Please make sure you have given permission for microphone access.');
+        });
+    }
+
+    // Stop the real-time interpreter
+    function stopInterpreting() {
+        if (!isInterpreting) return;
+
+        // Stop audio processing
+        if (audioProcessor) {
+            audioProcessor.disconnect();
+            audioProcessor = null;
+        }
+
+        // Stop microphone stream
+        if (micStream) {
+            micStream.getTracks().forEach(track => track.stop());
+            micStream = null;
+        }
+
+        // End the interpreter session
+        if (socket && socket.connected) {
+            socket.emit('end_interpreter_session');
+        }
+
+        // Update UI
+        isInterpreting = false;
+        realtimeBtn.textContent = 'Start Interpreter';
+        realtimeBtn.classList.replace('btn-danger', 'btn-primary');
+        realtimeStatusIndicator.textContent = 'Interpreter ready';
+
+        // Re-enable language selectors
+        sourceLanguage.disabled = false;
+        targetLanguage.disabled = false;
+    }
+
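+    // NOTE: each onaudioprocess buffer (4096 samples, roughly 85 ms at a
+    // typical 48 kHz AudioContext) is wrapped in its own standalone WAV file
+    // below; Whisper resamples to 16 kHz server-side, so the context's native
+    // rate is simply written into the header.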
+    // Convert audio buffer to WAV format
+    function convertToWav(audioBuffer) {
+        // Simple WAV header creation
+        const createWavHeader = function(sampleRate, bitsPerSample, channels, dataLength) {
+            const headerLength = 44;
+            const wavHeader = new ArrayBuffer(headerLength);
+            const view = new DataView(wavHeader);
+
+            // RIFF identifier
+            writeString(view, 0, 'RIFF');
+            // File length minus the 8-byte RIFF preamble (36 + data bytes)
+            view.setUint32(4, 36 + dataLength, true);
+            // RIFF type
+            writeString(view, 8, 'WAVE');
+            // Format chunk identifier
+            writeString(view, 12, 'fmt ');
+            // Format chunk length
+            view.setUint32(16, 16, true);
+            // Sample format (1 = PCM)
+            view.setUint16(20, 1, true);
+            // Channel count
+            view.setUint16(22, channels, true);
+            // Sample rate
+            view.setUint32(24, sampleRate, true);
+            // Byte rate (sample rate * block align)
+            view.setUint32(28, sampleRate * channels * (bitsPerSample / 8), true);
+            // Block align (channel count * bytes per sample)
+            view.setUint16(32, channels * (bitsPerSample / 8), true);
+            // Bits per sample
+            view.setUint16(34, bitsPerSample, true);
+            // Data chunk identifier
+            writeString(view, 36, 'data');
+            // Data chunk length
+            view.setUint32(40, dataLength, true);
+
+            return wavHeader;
+        };
+
+        const writeString = function(view, offset, string) {
+            for (let i = 0; i < string.length; i++) {
+                view.setUint8(offset + i, string.charCodeAt(i));
+            }
+        };
+
+        // Sample parameters
+        const sampleRate = audioContext.sampleRate;
+        const bitsPerSample = 16;
+        const channels = 1;
+
+        // Create the audio data buffer
+        const dataLength = audioBuffer.length * 2; // 16-bit = 2 bytes per sample
+        const audioData = new ArrayBuffer(dataLength);
+        const dataView = new DataView(audioData);
+
+        // Convert float32 to int16
+        let offset = 0;
+        for (let i = 0; i < audioBuffer.length; i++) {
+            const s = Math.max(-1, Math.min(1, audioBuffer[i]));
+            dataView.setInt16(offset, s < 0 ? s * 0x8000 : s * 0x7FFF, true);
+            offset += 2;
+        }
+
+        // Create the WAV header
+        const header = createWavHeader(sampleRate, bitsPerSample, channels, dataLength);
+
+        // Combine header and audio data
+        const wavBuffer = new Uint8Array(header.byteLength + audioData.byteLength);
+        wavBuffer.set(new Uint8Array(header), 0);
+        wavBuffer.set(new Uint8Array(audioData), header.byteLength);
+
+        return wavBuffer;
+    }
+
+    // Toggle interpreter on button click
+    realtimeBtn.addEventListener('click', function() {
+        if (isInterpreting) {
+            stopInterpreting();
+        } else {
+            startInterpreting();
+        }
+    });
+
+    // Cleanup function for when the page is unloaded
+    window.addEventListener('beforeunload', function() {
+        stopInterpreting();
+        if (socket) {
+            socket.disconnect();
+        }
+    });
+
+    // Initialize status
+    realtimeStatusIndicator.textContent = 'Interpreter ready';
+}
 
 // IndexedDB functions for offline data storage
diff --git a/templates/index.html b/templates/index.html
index 3188d0b..5573896 100644
--- a/templates/index.html
+++ b/templates/index.html
@@ -11,6 +11,8 @@
 
 
+
+