From 17f6d7cb4bb6b713f525e602a047f89b8cc8fbb7 Mon Sep 17 00:00:00 2001
From: Adolfo Delorenzo
Date: Wed, 25 Mar 2026 18:39:32 -0600
Subject: [PATCH] fix: streaming timeout + WebSocket close guard

- Streaming httpx client uses 300s read timeout (cloud LLMs can take
  30-60s for first token). Was using 120s general timeout.
- Guard all WebSocket sends with try/except for client disconnect.
  Prevents "Cannot send once close message has been sent" crash.

Co-Authored-By: Claude Opus 4.6 (1M context)
---
 packages/gateway/gateway/channels/web.py | 28 +++++++++++--------
 .../orchestrator/agents/runner.py        |  4 ++-
 2 files changed, 19 insertions(+), 13 deletions(-)

diff --git a/packages/gateway/gateway/channels/web.py b/packages/gateway/gateway/channels/web.py
index cf51f66..fda505e 100644
--- a/packages/gateway/gateway/channels/web.py
+++ b/packages/gateway/gateway/channels/web.py
@@ -362,21 +362,25 @@ async def _handle_websocket_connection(
             current_tenant_id.reset(rls_token2)
 
             # Signal stream completion to the client
-            await websocket.send_json({
-                "type": "done",
-                "text": response_text,
-                "conversation_id": saved_conversation_id,
-            })
+            try:
+                await websocket.send_json({
+                    "type": "done",
+                    "text": response_text,
+                    "conversation_id": saved_conversation_id,
+                })
+            except Exception:
+                pass  # Client already disconnected
         else:
             logger.warning(
-                "No response received within %ds for conversation=%s",
-                _RESPONSE_TIMEOUT_SECONDS,
-                saved_conversation_id,
+                "No response received for conversation=%s", saved_conversation_id,
             )
-            await websocket.send_json({
-                "type": "error",
-                "message": "Agent did not respond in time. Please try again.",
-            })
+            try:
+                await websocket.send_json({
+                    "type": "error",
+                    "message": "I'm having trouble responding right now. Please try again.",
+                })
+            except Exception:
+                pass  # Client already disconnected
 
 
 @web_chat_router.websocket("/chat/ws/{conversation_id}")
diff --git a/packages/orchestrator/orchestrator/agents/runner.py b/packages/orchestrator/orchestrator/agents/runner.py
index c53f881..d293014 100644
--- a/packages/orchestrator/orchestrator/agents/runner.py
+++ b/packages/orchestrator/orchestrator/agents/runner.py
@@ -52,6 +52,8 @@ _FALLBACK_RESPONSE = (
 
 # Timeout for LLM pool HTTP requests — generous to allow slow local inference
 _LLM_TIMEOUT = httpx.Timeout(timeout=120.0, connect=10.0)
+# Streaming needs a longer read timeout — first token can take 30-60s with cloud models
+_LLM_STREAM_TIMEOUT = httpx.Timeout(timeout=300.0, connect=10.0, read=300.0)
 
 # Maximum number of tool-call iterations before breaking the loop
 _MAX_TOOL_ITERATIONS = 5
@@ -321,7 +323,7 @@ async def run_agent_streaming(
     }
 
     try:
-        async with httpx.AsyncClient(timeout=_LLM_TIMEOUT) as client:
+        async with httpx.AsyncClient(timeout=_LLM_STREAM_TIMEOUT) as client:
             async with client.stream("POST", llm_stream_url, json=payload) as response:
                 if response.status_code != 200:
                     logger.error(