Improve audio handling compatibility

- Add support for different audio data formats
- Ensure proper .wav extension for Whisper compatibility
- Add file size logging for debugging
- Fix data format conversion between client and server
2025-04-12 01:51:15 -06:00 · 2025-04-12 01:51:15 -06:00 · 4bd56cbe5b
commit 4bd56cbe5b
parent 378f976b76
1 changed files with 28 additions and 0 deletions
--- a/app.py
+++ b/app.py
@@ -429,6 +429,18 @@ def setup_realtime_interpreter():
                
                # Use whisper to transcribe
                try:
+                    # Check if file is valid
+                    file_size = os.path.getsize(temp_path)
+                    logger.info(f"File size: {file_size} bytes")
+                    
+                    # Enforce .wav extension for compatibility
+                    new_temp_path = temp_path
+                    if not temp_path.lower().endswith('.wav'):
+                        new_temp_path = temp_path + '.wav'
+                        os.rename(temp_path, new_temp_path)
+                        logger.info(f"Renamed audio file to ensure .wav extension: {new_temp_path}")
+                        temp_path = new_temp_path
+                    
                    result = whisper_model.transcribe(
                        temp_path,
                        language=LANGUAGE_TO_CODE.get(source_lang, None)
@@ -626,6 +638,22 @@ def handle_audio_chunk(data):
        emit('error', {'message': 'No audio data received'})
        return
    
+    # Convert audio data to proper format if needed
+    if isinstance(audio_data, list):
+        # Convert list to bytes
+        audio_data = bytes(audio_data)
+        logger.info(f"Converted audio data from list to bytes: {len(audio_data)} bytes")
+    elif isinstance(audio_data, str):
+        # This might be base64 encoded data
+        try:
+            import base64
+            audio_data = base64.b64decode(audio_data.split(',')[1] if ',' in audio_data else audio_data)
+            logger.info(f"Decoded base64 audio data: {len(audio_data)} bytes")
+        except Exception as e:
+            logger.error(f"Error decoding audio data: {str(e)}")
+            emit('error', {'message': 'Invalid audio data format'})
+            return
+    
    # Get session details
    source_lang = active_sessions[request.sid]['source_lang']