From 17f6d7cb4bb6b713f525e602a047f89b8cc8fbb7 Mon Sep 17 00:00:00 2001
From: Adolfo Delorenzo
Date: Wed, 25 Mar 2026 18:39:32 -0600
Subject: [PATCH] fix: streaming timeout + WebSocket close guard

- Streaming httpx client uses 300s read timeout (cloud LLMs can take
  30-60s for first token). Was using 120s general timeout.
- Guard all WebSocket sends with try/except for client disconnect.
  Prevents "Cannot send once close message has been sent" crash.

Co-Authored-By: Claude Opus 4.6 (1M context)
---
 packages/gateway/gateway/channels/web.py | 28 +++++++++++--------
 .../orchestrator/agents/runner.py        |  4 ++-
 2 files changed, 19 insertions(+), 13 deletions(-)

diff --git a/packages/gateway/gateway/channels/web.py b/packages/gateway/gateway/channels/web.py
index cf51f66..fda505e 100644
--- a/packages/gateway/gateway/channels/web.py
+++ b/packages/gateway/gateway/channels/web.py
@@ -362,21 +362,25 @@ async def _handle_websocket_connection(
             current_tenant_id.reset(rls_token2)
 
             # Signal stream completion to the client
-            await websocket.send_json({
-                "type": "done",
-                "text": response_text,
-                "conversation_id": saved_conversation_id,
-            })
+            try:
+                await websocket.send_json({
+                    "type": "done",
+                    "text": response_text,
+                    "conversation_id": saved_conversation_id,
+                })
+            except Exception:
+                pass  # Client already disconnected
         else:
             logger.warning(
-                "No response received within %ds for conversation=%s",
-                _RESPONSE_TIMEOUT_SECONDS,
-                saved_conversation_id,
+                "No response received for conversation=%s", saved_conversation_id,
             )
-            await websocket.send_json({
-                "type": "error",
-                "message": "Agent did not respond in time. Please try again.",
-            })
+            try:
+                await websocket.send_json({
+                    "type": "error",
+                    "message": "I'm having trouble responding right now. Please try again.",
+                })
+            except Exception:
+                pass  # Client already disconnected
 
 
 @web_chat_router.websocket("/chat/ws/{conversation_id}")
diff --git a/packages/orchestrator/orchestrator/agents/runner.py b/packages/orchestrator/orchestrator/agents/runner.py
index c53f881..d293014 100644
--- a/packages/orchestrator/orchestrator/agents/runner.py
+++ b/packages/orchestrator/orchestrator/agents/runner.py
@@ -52,6 +52,8 @@ _FALLBACK_RESPONSE = (
 
 # Timeout for LLM pool HTTP requests — generous to allow slow local inference
 _LLM_TIMEOUT = httpx.Timeout(timeout=120.0, connect=10.0)
+# Streaming needs a longer read timeout — first token can take 30-60s with cloud models
+_LLM_STREAM_TIMEOUT = httpx.Timeout(timeout=300.0, connect=10.0, read=300.0)
 
 # Maximum number of tool-call iterations before breaking the loop
 _MAX_TOOL_ITERATIONS = 5
@@ -321,7 +323,7 @@ async def run_agent_streaming(
     }
 
     try:
-        async with httpx.AsyncClient(timeout=_LLM_TIMEOUT) as client:
+        async with httpx.AsyncClient(timeout=_LLM_STREAM_TIMEOUT) as client:
             async with client.stream("POST", llm_stream_url, json=payload) as response:
                 if response.status_code != 200:
                     logger.error(