Compare commits
2 Commits
ae8eaaacc0
...
4bd56cbe5b
Author | SHA1 | Date | |
---|---|---|---|
4bd56cbe5b | |||
378f976b76 |
46
app.py
46
app.py
@ -417,26 +417,50 @@ def setup_realtime_interpreter():
|
|||||||
if audio_data is None: # Poison pill to stop thread
|
if audio_data is None: # Poison pill to stop thread
|
||||||
break
|
break
|
||||||
|
|
||||||
|
# Log received audio chunk size
|
||||||
|
logger.info(f"Processing audio chunk: {len(audio_data)} bytes for session {session_id}")
|
||||||
|
|
||||||
# Save audio chunk to a temporary file
|
# Save audio chunk to a temporary file
|
||||||
temp_path = os.path.join(app.config['UPLOAD_FOLDER'], f'chunk_{session_id}_{int(time.time())}.wav')
|
temp_path = os.path.join(app.config['UPLOAD_FOLDER'], f'chunk_{session_id}_{int(time.time())}.wav')
|
||||||
with open(temp_path, 'wb') as f:
|
with open(temp_path, 'wb') as f:
|
||||||
f.write(audio_data)
|
f.write(audio_data)
|
||||||
|
|
||||||
|
logger.info(f"Saved audio chunk to {temp_path}, starting transcription")
|
||||||
|
|
||||||
# Use whisper to transcribe
|
# Use whisper to transcribe
|
||||||
try:
|
try:
|
||||||
|
# Check if file is valid
|
||||||
|
file_size = os.path.getsize(temp_path)
|
||||||
|
logger.info(f"File size: {file_size} bytes")
|
||||||
|
|
||||||
|
# Enforce .wav extension for compatibility
|
||||||
|
new_temp_path = temp_path
|
||||||
|
if not temp_path.lower().endswith('.wav'):
|
||||||
|
new_temp_path = temp_path + '.wav'
|
||||||
|
os.rename(temp_path, new_temp_path)
|
||||||
|
logger.info(f"Renamed audio file to ensure .wav extension: {new_temp_path}")
|
||||||
|
temp_path = new_temp_path
|
||||||
|
|
||||||
result = whisper_model.transcribe(
|
result = whisper_model.transcribe(
|
||||||
temp_path,
|
temp_path,
|
||||||
language=LANGUAGE_TO_CODE.get(source_lang, None)
|
language=LANGUAGE_TO_CODE.get(source_lang, None)
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# Log the result length to see if we got anything
|
||||||
|
text_result = result["text"].strip()
|
||||||
|
logger.info(f"Transcription result: {len(text_result)} characters")
|
||||||
|
|
||||||
# If we got a meaningful result, send it for translation
|
# If we got a meaningful result, send it for translation
|
||||||
if result and result["text"].strip():
|
if text_result:
|
||||||
transcription_queue.put((result["text"], source_lang, session_id))
|
logger.info(f"Transcription: '{text_result}'")
|
||||||
|
transcription_queue.put((text_result, source_lang, session_id))
|
||||||
# Emit transcription result to client
|
# Emit transcription result to client
|
||||||
socketio.emit('transcription_result', {
|
socketio.emit('transcription_result', {
|
||||||
'text': result["text"],
|
'text': text_result,
|
||||||
'session_id': session_id
|
'session_id': session_id
|
||||||
})
|
})
|
||||||
|
else:
|
||||||
|
logger.info("No transcription text detected in the audio")
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"Error transcribing audio chunk: {str(e)}")
|
logger.error(f"Error transcribing audio chunk: {str(e)}")
|
||||||
|
|
||||||
@ -614,6 +638,22 @@ def handle_audio_chunk(data):
|
|||||||
emit('error', {'message': 'No audio data received'})
|
emit('error', {'message': 'No audio data received'})
|
||||||
return
|
return
|
||||||
|
|
||||||
|
# Convert audio data to proper format if needed
|
||||||
|
if isinstance(audio_data, list):
|
||||||
|
# Convert list to bytes
|
||||||
|
audio_data = bytes(audio_data)
|
||||||
|
logger.info(f"Converted audio data from list to bytes: {len(audio_data)} bytes")
|
||||||
|
elif isinstance(audio_data, str):
|
||||||
|
# This might be base64 encoded data
|
||||||
|
try:
|
||||||
|
import base64
|
||||||
|
audio_data = base64.b64decode(audio_data.split(',')[1] if ',' in audio_data else audio_data)
|
||||||
|
logger.info(f"Decoded base64 audio data: {len(audio_data)} bytes")
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"Error decoding audio data: {str(e)}")
|
||||||
|
emit('error', {'message': 'Invalid audio data format'})
|
||||||
|
return
|
||||||
|
|
||||||
# Get session details
|
# Get session details
|
||||||
source_lang = active_sessions[request.sid]['source_lang']
|
source_lang = active_sessions[request.sid]['source_lang']
|
||||||
|
|
||||||
|
104
static/js/app.js
104
static/js/app.js
@ -415,7 +415,7 @@ function initRealtimeInterpreter() {
|
|||||||
let sessionId = null;
|
let sessionId = null;
|
||||||
|
|
||||||
// Audio processing variables
|
// Audio processing variables
|
||||||
const bufferSize = 4096;
|
const bufferSize = 16384; // Increased from 4096 for better speech capture
|
||||||
let audioProcessor = null;
|
let audioProcessor = null;
|
||||||
let micStream = null;
|
let micStream = null;
|
||||||
|
|
||||||
@ -546,8 +546,8 @@ function initRealtimeInterpreter() {
|
|||||||
// Process audio data
|
// Process audio data
|
||||||
let silenceStart = performance.now();
|
let silenceStart = performance.now();
|
||||||
let isSilent = true;
|
let isSilent = true;
|
||||||
const silenceThreshold = 0.01; // Adjust based on testing
|
const silenceThreshold = 0.005; // Lower threshold to be more sensitive to speech
|
||||||
const silenceDelay = 1000; // 1 second of silence before stopping
|
const silenceDelay = 2000; // 2 seconds of silence before stopping
|
||||||
|
|
||||||
audioProcessor.onaudioprocess = function(e) {
|
audioProcessor.onaudioprocess = function(e) {
|
||||||
if (!isInterpreting) return;
|
if (!isInterpreting) return;
|
||||||
@ -582,6 +582,7 @@ function initRealtimeInterpreter() {
|
|||||||
socket.emit('audio_chunk', {
|
socket.emit('audio_chunk', {
|
||||||
audio: wavBuffer
|
audio: wavBuffer
|
||||||
});
|
});
|
||||||
|
console.log(`Sent audio chunk: ${wavBuffer.length} bytes`);
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
@ -636,75 +637,56 @@ function initRealtimeInterpreter() {
|
|||||||
|
|
||||||
// Convert audio buffer to WAV format
|
// Convert audio buffer to WAV format
|
||||||
function convertToWav(audioBuffer) {
|
function convertToWav(audioBuffer) {
|
||||||
// Simple WAV header creation
|
|
||||||
const createWavHeader = function(sampleRate, bitsPerSample, channels, dataLength) {
|
|
||||||
const headerLength = 44;
|
|
||||||
const wavHeader = new ArrayBuffer(headerLength);
|
|
||||||
const view = new DataView(wavHeader);
|
|
||||||
|
|
||||||
// RIFF identifier
|
|
||||||
writeString(view, 0, 'RIFF');
|
|
||||||
// File length
|
|
||||||
view.setUint32(4, 32 + dataLength, true);
|
|
||||||
// RIFF type
|
|
||||||
writeString(view, 8, 'WAVE');
|
|
||||||
// Format chunk identifier
|
|
||||||
writeString(view, 12, 'fmt ');
|
|
||||||
// Format chunk length
|
|
||||||
view.setUint32(16, 16, true);
|
|
||||||
// Sample format (raw)
|
|
||||||
view.setUint16(20, 1, true);
|
|
||||||
// Channel count
|
|
||||||
view.setUint16(22, channels, true);
|
|
||||||
// Sample rate
|
|
||||||
view.setUint32(24, sampleRate, true);
|
|
||||||
// Byte rate (sample rate * block align)
|
|
||||||
view.setUint32(28, sampleRate * channels * (bitsPerSample / 8), true);
|
|
||||||
// Block align (channel count * bytes per sample)
|
|
||||||
view.setUint16(32, channels * (bitsPerSample / 8), true);
|
|
||||||
// Bits per sample
|
|
||||||
view.setUint16(34, bitsPerSample, true);
|
|
||||||
// Data chunk identifier
|
|
||||||
writeString(view, 36, 'data');
|
|
||||||
// Data chunk length
|
|
||||||
view.setUint32(40, dataLength, true);
|
|
||||||
|
|
||||||
return wavHeader;
|
|
||||||
};
|
|
||||||
|
|
||||||
const writeString = function(view, offset, string) {
|
|
||||||
for (let i = 0; i < string.length; i++) {
|
|
||||||
view.setUint8(offset + i, string.charCodeAt(i));
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
// Sample parameters
|
|
||||||
const sampleRate = audioContext.sampleRate;
|
const sampleRate = audioContext.sampleRate;
|
||||||
|
const numberOfChannels = 1;
|
||||||
const bitsPerSample = 16;
|
const bitsPerSample = 16;
|
||||||
const channels = 1;
|
|
||||||
|
|
||||||
// Create the audio data buffer
|
// Log audio properties for debugging
|
||||||
const dataLength = audioBuffer.length * 2; // 16-bit = 2 bytes per sample
|
console.log(`Recording audio: ${sampleRate}Hz, ${numberOfChannels} channel(s), ${audioBuffer.length} samples`);
|
||||||
const audioData = new ArrayBuffer(dataLength);
|
|
||||||
const dataView = new DataView(audioData);
|
|
||||||
|
|
||||||
// Convert float32 to int16
|
// Calculate sizes and create ArrayBuffers
|
||||||
let offset = 0;
|
const dataSize = audioBuffer.length * 2; // 16-bit samples
|
||||||
|
const totalSize = 44 + dataSize; // 44 bytes for header + data size
|
||||||
|
|
||||||
|
const buffer = new ArrayBuffer(totalSize);
|
||||||
|
const view = new DataView(buffer);
|
||||||
|
|
||||||
|
// Write WAV header
|
||||||
|
writeString(view, 0, 'RIFF');
|
||||||
|
view.setUint32(4, 36 + dataSize, true); // File size - 8
|
||||||
|
writeString(view, 8, 'WAVE');
|
||||||
|
writeString(view, 12, 'fmt ');
|
||||||
|
view.setUint32(16, 16, true); // Format chunk length
|
||||||
|
view.setUint16(20, 1, true); // PCM format
|
||||||
|
view.setUint16(22, numberOfChannels, true);
|
||||||
|
view.setUint32(24, sampleRate, true);
|
||||||
|
view.setUint32(28, sampleRate * numberOfChannels * (bitsPerSample / 8), true); // Byte rate
|
||||||
|
view.setUint16(32, numberOfChannels * (bitsPerSample / 8), true); // Block align
|
||||||
|
view.setUint16(34, bitsPerSample, true);
|
||||||
|
writeString(view, 36, 'data');
|
||||||
|
view.setUint32(40, dataSize, true);
|
||||||
|
|
||||||
|
// Write audio data
|
||||||
|
let offset = 44;
|
||||||
for (let i = 0; i < audioBuffer.length; i++) {
|
for (let i = 0; i < audioBuffer.length; i++) {
|
||||||
|
// Convert float to int16
|
||||||
const s = Math.max(-1, Math.min(1, audioBuffer[i]));
|
const s = Math.max(-1, Math.min(1, audioBuffer[i]));
|
||||||
dataView.setInt16(offset, s < 0 ? s * 0x8000 : s * 0x7FFF, true);
|
const val = s < 0 ? s * 0x8000 : s * 0x7FFF;
|
||||||
|
view.setInt16(offset, val, true);
|
||||||
offset += 2;
|
offset += 2;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Create the WAV header
|
// Helper function to write strings to DataView
|
||||||
const header = createWavHeader(sampleRate, bitsPerSample, channels, dataLength);
|
function writeString(view, offset, string) {
|
||||||
|
for (let i = 0; i < string.length; i++) {
|
||||||
|
view.setUint8(offset + i, string.charCodeAt(i));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// Combine header and audio data
|
// Log the created buffer size
|
||||||
const wavBuffer = new Uint8Array(header.byteLength + audioData.byteLength);
|
console.log(`Created WAV buffer: ${buffer.byteLength} bytes`);
|
||||||
wavBuffer.set(new Uint8Array(header), 0);
|
|
||||||
wavBuffer.set(new Uint8Array(audioData), header.byteLength);
|
|
||||||
|
|
||||||
return wavBuffer;
|
return new Uint8Array(buffer);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Toggle interpreter on button click
|
// Toggle interpreter on button click
|
||||||
|
Loading…
Reference in New Issue
Block a user