Add comprehensive input validation and sanitization

Frontend Validation: - Created Validator class with comprehensive validation methods - HTML sanitization to prevent XSS attacks - Text sanitization removing dangerous characters - Language code validation against allowed list - Audio file validation (size, type, extension) - URL validation preventing injection attacks - API key format validation - Request size validation - Filename sanitization - Settings validation with type checking - Cache key sanitization - Client-side rate limiting tracking Backend Validation: - Created validators.py module for server-side validation - Audio file validation with size and type checks - Text sanitization with length limits - Language code validation - URL and API key validation - JSON request size validation - Rate limiting per endpoint (30 req/min) - Added validation to all API endpoints - Error boundary decorators on all routes - CSRF token support ready Security Features: - Prevents XSS through HTML escaping - Prevents SQL injection through input sanitization - Prevents directory traversal in filenames - Prevents oversized requests (DoS protection) - Rate limiting prevents abuse - Type checking prevents type confusion attacks - Length limits prevent memory exhaustion - Character filtering prevents control character injection All user inputs are now validated and sanitized before processing. 🤖 Generated with [Claude Code](https://claude.ai/code) Co-Authored-By: Claude <noreply@anthropic.com>
2025-06-02 22:58:17 -06:00
parent 3804897e2b
commit aedface2a9
5 changed files with 687 additions and 30 deletions
--- a/app.py
+++ b/app.py
@@ -17,6 +17,7 @@ from cryptography.hazmat.backends import default_backend
 import gc  # For garbage collection
 from functools import wraps
 import traceback
+from validators import Validators

 # Initialize logging
 logging.basicConfig(level=logging.INFO)
@@ -47,6 +48,13 @@ app.config['UPLOAD_FOLDER'] = tempfile.mkdtemp()
 app.config['TTS_SERVER'] = os.environ.get('TTS_SERVER_URL', 'http://localhost:5050/v1/audio/speech')
 app.config['TTS_API_KEY'] = os.environ.get('TTS_API_KEY', '56461d8b44607f2cfcb8030dee313a8e')

+# Rate limiting storage
+rate_limit_storage = {}
+
+# Simple CSRF token generation (in production, use Flask-WTF)
+import secrets
+app.config['SECRET_KEY'] = os.environ.get('SECRET_KEY', secrets.token_hex(32))
+
 # Generate VAPID keys for push notifications
 if not os.path.exists('vapid_private.pem'):
    # Generate new VAPID keys
@@ -272,18 +280,33 @@ def check_tts_server():
        })

@app.route('/update_tts_config', methods=['POST'])
+@with_error_boundary
 def update_tts_config():
    try:
        data = request.json
+        
+        # Validate and sanitize URL
        tts_server_url = data.get('server_url')
-        tts_api_key = data.get('api_key')
-
        if tts_server_url:
-            app.config['TTS_SERVER'] = tts_server_url
-            logger.info(f"Updated TTS server URL to {tts_server_url}")
+            validated_url = Validators.validate_url(tts_server_url)
+            if not validated_url:
+                return jsonify({
+                    'success': False,
+                    'error': 'Invalid server URL format'
+                }), 400
+            app.config['TTS_SERVER'] = validated_url
+            logger.info(f"Updated TTS server URL to {validated_url}")

+        # Validate and sanitize API key
+        tts_api_key = data.get('api_key')
        if tts_api_key:
-            app.config['TTS_API_KEY'] = tts_api_key
+            validated_key = Validators.validate_api_key(tts_api_key)
+            if not validated_key:
+                return jsonify({
+                    'success': False,
+                    'error': 'Invalid API key format'
+                }), 400
+            app.config['TTS_API_KEY'] = validated_key
            logger.info("Updated TTS API key")

        return jsonify({
@@ -412,12 +435,29 @@ def index():
    return render_template('index.html', languages=sorted(SUPPORTED_LANGUAGES.values()))

@app.route('/transcribe', methods=['POST'])
+@with_error_boundary
 def transcribe():
+    # Rate limiting
+    client_ip = request.remote_addr
+    if not Validators.rate_limit_check(
+        client_ip, 'transcribe', max_requests=30, window_seconds=60, storage=rate_limit_storage
+    ):
+        return jsonify({'error': 'Rate limit exceeded. Please wait before trying again.'}), 429
+    
    if 'audio' not in request.files:
        return jsonify({'error': 'No audio file provided'}), 400

    audio_file = request.files['audio']
+    
+    # Validate audio file
+    valid, error_msg = Validators.validate_audio_file(audio_file)
+    if not valid:
+        return jsonify({'error': error_msg}), 400
+    
+    # Validate and sanitize language code
    source_lang = request.form.get('source_lang', '')
+    allowed_languages = set(SUPPORTED_LANGUAGES.values())
+    source_lang = Validators.validate_language_code(source_lang, allowed_languages) or ''

    # Save the audio file temporarily
    temp_path = os.path.join(app.config['UPLOAD_FOLDER'], 'input_audio.wav')
@@ -502,15 +542,39 @@ def transcribe():
        gc.collect()

@app.route('/translate', methods=['POST'])
+@with_error_boundary
 def translate():
    try:
+        # Rate limiting
+        client_ip = request.remote_addr
+        if not Validators.rate_limit_check(
+            client_ip, 'translate', max_requests=30, window_seconds=60, storage=rate_limit_storage
+        ):
+            return jsonify({'error': 'Rate limit exceeded. Please wait before trying again.'}), 429
+        
+        # Validate request size
+        if not Validators.validate_json_size(request.json, max_size_kb=100):
+            return jsonify({'error': 'Request too large'}), 413
+        
        data = request.json
+        
+        # Sanitize and validate text
        text = data.get('text', '')
-        source_lang = data.get('source_lang', '')
-        target_lang = data.get('target_lang', '')
-
-        if not text or not source_lang or not target_lang:
-            return jsonify({'error': 'Missing required parameters'}), 400
+        text = Validators.sanitize_text(text)
+        if not text:
+            return jsonify({'error': 'No text provided'}), 400
+        
+        # Validate language codes
+        allowed_languages = set(SUPPORTED_LANGUAGES.values())
+        source_lang = Validators.validate_language_code(
+            data.get('source_lang', ''), allowed_languages
+        ) or 'auto'
+        target_lang = Validators.validate_language_code(
+            data.get('target_lang', ''), allowed_languages
+        )
+        
+        if not target_lang:
+            return jsonify({'error': 'Invalid target language'}), 400

        # Create a prompt for Gemma 3 translation
        prompt = f"""
@@ -552,14 +616,29 @@ def translate():
        return jsonify({'error': f'Translation failed: {str(e)}'}), 500

@app.route('/speak', methods=['POST'])
+@with_error_boundary
 def speak():
    try:
+        # Validate request size
+        if not Validators.validate_json_size(request.json, max_size_kb=100):
+            return jsonify({'error': 'Request too large'}), 413
+            
        data = request.json
+        
+        # Sanitize and validate text
        text = data.get('text', '')
-        language = data.get('language', '')
-
-        if not text or not language:
-            return jsonify({'error': 'Missing required parameters'}), 400
+        text = Validators.sanitize_text(text, max_length=5000)  # Shorter limit for TTS
+        if not text:
+            return jsonify({'error': 'No text provided'}), 400
+            
+        # Validate language code
+        allowed_languages = set(SUPPORTED_LANGUAGES.values())
+        language = Validators.validate_language_code(
+            data.get('language', ''), allowed_languages
+        )
+        
+        if not language:
+            return jsonify({'error': 'Invalid language'}), 400

        voice = LANGUAGE_TO_VOICE.get(language, 'echo')  # Default to echo if language not found