Compare commits

No commits in common. "4bd56cbe5b5a504f1e8908d9b9d163e32ba8c1c6" and "ae8eaaacc07f220c9e3b079795d37b62115923c2" have entirely different histories.

2 changed files with 68 additions and 90 deletions

app.py

@@ -417,50 +417,26 @@ def setup_realtime_interpreter():
             if audio_data is None:  # Poison pill to stop thread
                 break
-            # Log received audio chunk size
-            logger.info(f"Processing audio chunk: {len(audio_data)} bytes for session {session_id}")
             # Save audio chunk to a temporary file
             temp_path = os.path.join(app.config['UPLOAD_FOLDER'], f'chunk_{session_id}_{int(time.time())}.wav')
             with open(temp_path, 'wb') as f:
                 f.write(audio_data)
-            logger.info(f"Saved audio chunk to {temp_path}, starting transcription")
             # Use whisper to transcribe
             try:
-                # Check if file is valid
-                file_size = os.path.getsize(temp_path)
-                logger.info(f"File size: {file_size} bytes")
-                # Enforce .wav extension for compatibility
-                new_temp_path = temp_path
-                if not temp_path.lower().endswith('.wav'):
-                    new_temp_path = temp_path + '.wav'
-                    os.rename(temp_path, new_temp_path)
-                    logger.info(f"Renamed audio file to ensure .wav extension: {new_temp_path}")
-                temp_path = new_temp_path
                 result = whisper_model.transcribe(
                     temp_path,
                     language=LANGUAGE_TO_CODE.get(source_lang, None)
                 )
-                # Log the result length to see if we got anything
-                text_result = result["text"].strip()
-                logger.info(f"Transcription result: {len(text_result)} characters")
                 # If we got a meaningful result, send it for translation
-                if text_result:
-                    logger.info(f"Transcription: '{text_result}'")
-                    transcription_queue.put((text_result, source_lang, session_id))
+                if result and result["text"].strip():
+                    transcription_queue.put((result["text"], source_lang, session_id))
                     # Emit transcription result to client
                     socketio.emit('transcription_result', {
-                        'text': text_result,
+                        'text': result["text"],
                         'session_id': session_id
                     })
-                else:
-                    logger.info("No transcription text detected in the audio")
             except Exception as e:
                 logger.error(f"Error transcribing audio chunk: {str(e)}")
@@ -638,22 +614,6 @@ def handle_audio_chunk(data):
         emit('error', {'message': 'No audio data received'})
         return
-    # Convert audio data to proper format if needed
-    if isinstance(audio_data, list):
-        # Convert list to bytes
-        audio_data = bytes(audio_data)
-        logger.info(f"Converted audio data from list to bytes: {len(audio_data)} bytes")
-    elif isinstance(audio_data, str):
-        # This might be base64 encoded data
-        try:
-            import base64
-            audio_data = base64.b64decode(audio_data.split(',')[1] if ',' in audio_data else audio_data)
-            logger.info(f"Decoded base64 audio data: {len(audio_data)} bytes")
-        except Exception as e:
-            logger.error(f"Error decoding audio data: {str(e)}")
-            emit('error', {'message': 'Invalid audio data format'})
-            return
     # Get session details
     source_lang = active_sessions[request.sid]['source_lang']
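
Note that with the list/base64 normalization removed, handle_audio_chunk now assumes the Socket.IO payload arrives as raw bytes, which is what a browser client produces by emitting an ArrayBuffer or Uint8Array. If string payloads (for example a base64 data URL) can still reach the server, a guard along these lines would be needed; coerce_to_bytes is a hypothetical helper, not part of this codebase:

import base64

def coerce_to_bytes(audio_data):
    if isinstance(audio_data, (bytes, bytearray)):
        return bytes(audio_data)
    if isinstance(audio_data, list):  # JSON array of integers
        return bytes(audio_data)
    if isinstance(audio_data, str):   # possibly a base64 data URL
        payload = audio_data.split(',', 1)[1] if ',' in audio_data else audio_data
        return base64.b64decode(payload)
    raise TypeError(f"Unsupported audio payload type: {type(audio_data)!r}")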

(client-side JavaScript file; path not captured in this view)

@@ -415,7 +415,7 @@ function initRealtimeInterpreter() {
     let sessionId = null;

     // Audio processing variables
-    const bufferSize = 16384; // Increased from 4096 for better speech capture
+    const bufferSize = 4096;
     let audioProcessor = null;
     let micStream = null;
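
Reverting bufferSize to 4096 shortens each ScriptProcessor callback: at a typical 44.1 kHz AudioContext, 4096 samples is about 93 ms of audio per chunk (4096 / 44100 ≈ 0.093 s), versus about 372 ms at 16384, trading per-chunk speech context for lower latency.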
@@ -546,8 +546,8 @@ function initRealtimeInterpreter() {
     // Process audio data
     let silenceStart = performance.now();
     let isSilent = true;
-    const silenceThreshold = 0.005; // Lower threshold to be more sensitive to speech
-    const silenceDelay = 2000; // 2 seconds of silence before stopping
+    const silenceThreshold = 0.01; // Adjust based on testing
+    const silenceDelay = 1000; // 1 second of silence before stopping
     audioProcessor.onaudioprocess = function(e) {
         if (!isInterpreting) return;
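
This hunk only captures the constants; how the level is actually compared against silenceThreshold is outside the shown context. Assuming the usual approach of measuring each buffer's RMS level (an assumption, since the comparison code is not shown), a quick offline calibration could look like:

import math

def rms(samples):
    # samples are normalized floats in [-1, 1], matching Web Audio buffers
    return math.sqrt(sum(s * s for s in samples) / len(samples)) if samples else 0.0

quiet_room = [0.003, -0.002, 0.004, -0.001]
speech = [0.12, -0.25, 0.31, -0.18]
print(rms(quiet_room))  # ~0.003, below the 0.01 threshold -> silence
print(rms(speech))      # ~0.23, well above 0.01 -> speech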
@@ -582,7 +582,6 @@ function initRealtimeInterpreter() {
             socket.emit('audio_chunk', {
                 audio: wavBuffer
             });
-            console.log(`Sent audio chunk: ${wavBuffer.length} bytes`);
         }
     };
@@ -637,56 +636,75 @@ function initRealtimeInterpreter() {
     // Convert audio buffer to WAV format
     function convertToWav(audioBuffer) {
-        const sampleRate = audioContext.sampleRate;
-        const numberOfChannels = 1;
-        const bitsPerSample = 16;
-
-        // Log audio properties for debugging
-        console.log(`Recording audio: ${sampleRate}Hz, ${numberOfChannels} channel(s), ${audioBuffer.length} samples`);
-
-        // Calculate sizes and create ArrayBuffers
-        const dataSize = audioBuffer.length * 2; // 16-bit samples
-        const totalSize = 44 + dataSize; // 44 bytes for header + data size
-        const buffer = new ArrayBuffer(totalSize);
-        const view = new DataView(buffer);
-
-        // Write WAV header
-        writeString(view, 0, 'RIFF');
-        view.setUint32(4, 36 + dataSize, true); // File size - 8
-        writeString(view, 8, 'WAVE');
-        writeString(view, 12, 'fmt ');
-        view.setUint32(16, 16, true); // Format chunk length
-        view.setUint16(20, 1, true); // PCM format
-        view.setUint16(22, numberOfChannels, true);
-        view.setUint32(24, sampleRate, true);
-        view.setUint32(28, sampleRate * numberOfChannels * (bitsPerSample / 8), true); // Byte rate
-        view.setUint16(32, numberOfChannels * (bitsPerSample / 8), true); // Block align
-        view.setUint16(34, bitsPerSample, true);
-        writeString(view, 36, 'data');
-        view.setUint32(40, dataSize, true);
-
-        // Write audio data
-        let offset = 44;
-        for (let i = 0; i < audioBuffer.length; i++) {
-            // Convert float to int16
-            const s = Math.max(-1, Math.min(1, audioBuffer[i]));
-            const val = s < 0 ? s * 0x8000 : s * 0x7FFF;
-            view.setInt16(offset, val, true);
-            offset += 2;
-        }
-
-        // Helper function to write strings to DataView
-        function writeString(view, offset, string) {
-            for (let i = 0; i < string.length; i++) {
-                view.setUint8(offset + i, string.charCodeAt(i));
-            }
-        }
-
-        // Log the created buffer size
-        console.log(`Created WAV buffer: ${buffer.byteLength} bytes`);
-        return new Uint8Array(buffer);
+        // Simple WAV header creation
+        const createWavHeader = function(sampleRate, bitsPerSample, channels, dataLength) {
+            const headerLength = 44;
+            const wavHeader = new ArrayBuffer(headerLength);
+            const view = new DataView(wavHeader);
+
+            // RIFF identifier
+            writeString(view, 0, 'RIFF');
+            // File length
+            view.setUint32(4, 32 + dataLength, true);
+            // RIFF type
+            writeString(view, 8, 'WAVE');
+            // Format chunk identifier
+            writeString(view, 12, 'fmt ');
+            // Format chunk length
+            view.setUint32(16, 16, true);
+            // Sample format (raw)
+            view.setUint16(20, 1, true);
+            // Channel count
+            view.setUint16(22, channels, true);
+            // Sample rate
+            view.setUint32(24, sampleRate, true);
+            // Byte rate (sample rate * block align)
+            view.setUint32(28, sampleRate * channels * (bitsPerSample / 8), true);
+            // Block align (channel count * bytes per sample)
+            view.setUint16(32, channels * (bitsPerSample / 8), true);
+            // Bits per sample
+            view.setUint16(34, bitsPerSample, true);
+            // Data chunk identifier
+            writeString(view, 36, 'data');
+            // Data chunk length
+            view.setUint32(40, dataLength, true);
+
+            return wavHeader;
+        };
+
+        const writeString = function(view, offset, string) {
+            for (let i = 0; i < string.length; i++) {
+                view.setUint8(offset + i, string.charCodeAt(i));
+            }
+        };
+
+        // Sample parameters
+        const sampleRate = audioContext.sampleRate;
+        const bitsPerSample = 16;
+        const channels = 1;
+
+        // Create the audio data buffer
+        const dataLength = audioBuffer.length * 2; // 16-bit = 2 bytes per sample
+        const audioData = new ArrayBuffer(dataLength);
+        const dataView = new DataView(audioData);
+
+        // Convert float32 to int16
+        let offset = 0;
+        for (let i = 0; i < audioBuffer.length; i++) {
+            const s = Math.max(-1, Math.min(1, audioBuffer[i]));
+            dataView.setInt16(offset, s < 0 ? s * 0x8000 : s * 0x7FFF, true);
+            offset += 2;
+        }
+
+        // Create the WAV header
+        const header = createWavHeader(sampleRate, bitsPerSample, channels, dataLength);
+
+        // Combine header and audio data
+        const wavBuffer = new Uint8Array(header.byteLength + audioData.byteLength);
+        wavBuffer.set(new Uint8Array(header), 0);
+        wavBuffer.set(new Uint8Array(audioData), header.byteLength);
+
+        return wavBuffer;
     }

     // Toggle interpreter on button click
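
One detail worth flagging in the rewritten helper: for a canonical 44-byte PCM WAV header, the RIFF size field at byte offset 4 is the total file size minus 8, i.e. 36 + dataLength, which is what the old code wrote; the new code writes 32 + dataLength, under-reporting by 4 bytes. Many decoders ignore this field, but a server-side check makes the discrepancy visible. A sketch (validate_wav_header is hypothetical, not in the repository):

import struct

def validate_wav_header(buf: bytes) -> dict:
    # Assumes the simple 44-byte header layout the client writes.
    if len(buf) < 44 or buf[:4] != b'RIFF' or buf[8:12] != b'WAVE':
        raise ValueError("not a RIFF/WAVE buffer")
    riff_size, = struct.unpack_from('<I', buf, 4)
    channels, sample_rate = struct.unpack_from('<HI', buf, 22)
    bits_per_sample, = struct.unpack_from('<H', buf, 34)
    data_length, = struct.unpack_from('<I', buf, 40)
    if riff_size != 36 + data_length:  # canonical value: file size minus 8
        print(f"RIFF size field off by {36 + data_length - riff_size} bytes")
    return {"channels": channels, "sample_rate": sample_rate,
            "bits_per_sample": bits_per_sample, "data_length": data_length}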