From 378f976b76b2d08b3fde25332e2a59d85b73e02e Mon Sep 17 00:00:00 2001 From: Adolfo Delorenzo Date: Sat, 12 Apr 2025 01:50:39 -0600 Subject: [PATCH] Fix real-time interpreter audio processing - Add better logging for audio chunk processing - Increase audio buffer size for better speech detection - Improve WAV format creation for better Whisper compatibility - Lower silence threshold for better speech detection - Extend silence timeout from 1 to 2 seconds --- app.py | 18 ++++++-- static/js/app.js | 104 ++++++++++++++++++++--------------------------- 2 files changed, 58 insertions(+), 64 deletions(-) diff --git a/app.py b/app.py index eed89cf..f3f466c 100644 --- a/app.py +++ b/app.py @@ -417,11 +417,16 @@ def setup_realtime_interpreter(): if audio_data is None: # Poison pill to stop thread break + # Log received audio chunk size + logger.info(f"Processing audio chunk: {len(audio_data)} bytes for session {session_id}") + # Save audio chunk to a temporary file temp_path = os.path.join(app.config['UPLOAD_FOLDER'], f'chunk_{session_id}_{int(time.time())}.wav') with open(temp_path, 'wb') as f: f.write(audio_data) + logger.info(f"Saved audio chunk to {temp_path}, starting transcription") + # Use whisper to transcribe try: result = whisper_model.transcribe( @@ -429,14 +434,21 @@ def setup_realtime_interpreter(): language=LANGUAGE_TO_CODE.get(source_lang, None) ) + # Log the result length to see if we got anything + text_result = result["text"].strip() + logger.info(f"Transcription result: {len(text_result)} characters") + # If we got a meaningful result, send it for translation - if result and result["text"].strip(): - transcription_queue.put((result["text"], source_lang, session_id)) + if text_result: + logger.info(f"Transcription: '{text_result}'") + transcription_queue.put((text_result, source_lang, session_id)) # Emit transcription result to client socketio.emit('transcription_result', { - 'text': result["text"], + 'text': text_result, 'session_id': session_id }) + else: + logger.info("No transcription text detected in the audio") except Exception as e: logger.error(f"Error transcribing audio chunk: {str(e)}") diff --git a/static/js/app.js b/static/js/app.js index 7139a46..2752c66 100644 --- a/static/js/app.js +++ b/static/js/app.js @@ -415,7 +415,7 @@ function initRealtimeInterpreter() { let sessionId = null; // Audio processing variables - const bufferSize = 4096; + const bufferSize = 16384; // Increased from 4096 for better speech capture let audioProcessor = null; let micStream = null; @@ -546,8 +546,8 @@ function initRealtimeInterpreter() { // Process audio data let silenceStart = performance.now(); let isSilent = true; - const silenceThreshold = 0.01; // Adjust based on testing - const silenceDelay = 1000; // 1 second of silence before stopping + const silenceThreshold = 0.005; // Lower threshold to be more sensitive to speech + const silenceDelay = 2000; // 2 seconds of silence before stopping audioProcessor.onaudioprocess = function(e) { if (!isInterpreting) return; @@ -582,6 +582,7 @@ function initRealtimeInterpreter() { socket.emit('audio_chunk', { audio: wavBuffer }); + console.log(`Sent audio chunk: ${wavBuffer.length} bytes`); } }; @@ -636,75 +637,56 @@ function initRealtimeInterpreter() { // Convert audio buffer to WAV format function convertToWav(audioBuffer) { - // Simple WAV header creation - const createWavHeader = function(sampleRate, bitsPerSample, channels, dataLength) { - const headerLength = 44; - const wavHeader = new ArrayBuffer(headerLength); - const view = new DataView(wavHeader); - - // RIFF identifier - writeString(view, 0, 'RIFF'); - // File length - view.setUint32(4, 32 + dataLength, true); - // RIFF type - writeString(view, 8, 'WAVE'); - // Format chunk identifier - writeString(view, 12, 'fmt '); - // Format chunk length - view.setUint32(16, 16, true); - // Sample format (raw) - view.setUint16(20, 1, true); - // Channel count - view.setUint16(22, channels, true); - // Sample rate - view.setUint32(24, sampleRate, true); - // Byte rate (sample rate * block align) - view.setUint32(28, sampleRate * channels * (bitsPerSample / 8), true); - // Block align (channel count * bytes per sample) - view.setUint16(32, channels * (bitsPerSample / 8), true); - // Bits per sample - view.setUint16(34, bitsPerSample, true); - // Data chunk identifier - writeString(view, 36, 'data'); - // Data chunk length - view.setUint32(40, dataLength, true); - - return wavHeader; - }; - - const writeString = function(view, offset, string) { - for (let i = 0; i < string.length; i++) { - view.setUint8(offset + i, string.charCodeAt(i)); - } - }; - - // Sample parameters const sampleRate = audioContext.sampleRate; + const numberOfChannels = 1; const bitsPerSample = 16; - const channels = 1; - // Create the audio data buffer - const dataLength = audioBuffer.length * 2; // 16-bit = 2 bytes per sample - const audioData = new ArrayBuffer(dataLength); - const dataView = new DataView(audioData); + // Log audio properties for debugging + console.log(`Recording audio: ${sampleRate}Hz, ${numberOfChannels} channel(s), ${audioBuffer.length} samples`); - // Convert float32 to int16 - let offset = 0; + // Calculate sizes and create ArrayBuffers + const dataSize = audioBuffer.length * 2; // 16-bit samples + const totalSize = 44 + dataSize; // 44 bytes for header + data size + + const buffer = new ArrayBuffer(totalSize); + const view = new DataView(buffer); + + // Write WAV header + writeString(view, 0, 'RIFF'); + view.setUint32(4, 36 + dataSize, true); // File size - 8 + writeString(view, 8, 'WAVE'); + writeString(view, 12, 'fmt '); + view.setUint32(16, 16, true); // Format chunk length + view.setUint16(20, 1, true); // PCM format + view.setUint16(22, numberOfChannels, true); + view.setUint32(24, sampleRate, true); + view.setUint32(28, sampleRate * numberOfChannels * (bitsPerSample / 8), true); // Byte rate + view.setUint16(32, numberOfChannels * (bitsPerSample / 8), true); // Block align + view.setUint16(34, bitsPerSample, true); + writeString(view, 36, 'data'); + view.setUint32(40, dataSize, true); + + // Write audio data + let offset = 44; for (let i = 0; i < audioBuffer.length; i++) { + // Convert float to int16 const s = Math.max(-1, Math.min(1, audioBuffer[i])); - dataView.setInt16(offset, s < 0 ? s * 0x8000 : s * 0x7FFF, true); + const val = s < 0 ? s * 0x8000 : s * 0x7FFF; + view.setInt16(offset, val, true); offset += 2; } - // Create the WAV header - const header = createWavHeader(sampleRate, bitsPerSample, channels, dataLength); + // Helper function to write strings to DataView + function writeString(view, offset, string) { + for (let i = 0; i < string.length; i++) { + view.setUint8(offset + i, string.charCodeAt(i)); + } + } - // Combine header and audio data - const wavBuffer = new Uint8Array(header.byteLength + audioData.byteLength); - wavBuffer.set(new Uint8Array(header), 0); - wavBuffer.set(new Uint8Array(audioData), header.byteLength); + // Log the created buffer size + console.log(`Created WAV buffer: ${buffer.byteLength} bytes`); - return wavBuffer; + return new Uint8Array(buffer); } // Toggle interpreter on button click