fix: streaming timeout + WebSocket close guard
- The streaming httpx client now uses a 300s read timeout (cloud LLMs can take 30-60s to produce the first token); previously it used the general 120s timeout. - All WebSocket sends are now guarded with try/except to handle client disconnects, preventing the "Cannot send once close message has been sent" crash. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -362,21 +362,25 @@ async def _handle_websocket_connection(
|
|||||||
current_tenant_id.reset(rls_token2)
|
current_tenant_id.reset(rls_token2)
|
||||||
|
|
||||||
# Signal stream completion to the client
|
# Signal stream completion to the client
|
||||||
|
try:
|
||||||
await websocket.send_json({
|
await websocket.send_json({
|
||||||
"type": "done",
|
"type": "done",
|
||||||
"text": response_text,
|
"text": response_text,
|
||||||
"conversation_id": saved_conversation_id,
|
"conversation_id": saved_conversation_id,
|
||||||
})
|
})
|
||||||
|
except Exception:
|
||||||
|
pass # Client already disconnected
|
||||||
else:
|
else:
|
||||||
logger.warning(
|
logger.warning(
|
||||||
"No response received within %ds for conversation=%s",
|
"No response received for conversation=%s", saved_conversation_id,
|
||||||
_RESPONSE_TIMEOUT_SECONDS,
|
|
||||||
saved_conversation_id,
|
|
||||||
)
|
)
|
||||||
|
try:
|
||||||
await websocket.send_json({
|
await websocket.send_json({
|
||||||
"type": "error",
|
"type": "error",
|
||||||
"message": "Agent did not respond in time. Please try again.",
|
"message": "I'm having trouble responding right now. Please try again.",
|
||||||
})
|
})
|
||||||
|
except Exception:
|
||||||
|
pass # Client already disconnected
|
||||||
|
|
||||||
|
|
||||||
@web_chat_router.websocket("/chat/ws/{conversation_id}")
|
@web_chat_router.websocket("/chat/ws/{conversation_id}")
|
||||||
|
|||||||
@@ -52,6 +52,8 @@ _FALLBACK_RESPONSE = (
|
|||||||
|
|
||||||
# Timeout for LLM pool HTTP requests — generous to allow slow local inference
|
# Timeout for LLM pool HTTP requests — generous to allow slow local inference
|
||||||
_LLM_TIMEOUT = httpx.Timeout(timeout=120.0, connect=10.0)
|
_LLM_TIMEOUT = httpx.Timeout(timeout=120.0, connect=10.0)
|
||||||
|
# Streaming needs a longer read timeout — first token can take 30-60s with cloud models
|
||||||
|
_LLM_STREAM_TIMEOUT = httpx.Timeout(timeout=300.0, connect=10.0, read=300.0)
|
||||||
|
|
||||||
# Maximum number of tool-call iterations before breaking the loop
|
# Maximum number of tool-call iterations before breaking the loop
|
||||||
_MAX_TOOL_ITERATIONS = 5
|
_MAX_TOOL_ITERATIONS = 5
|
||||||
@@ -321,7 +323,7 @@ async def run_agent_streaming(
|
|||||||
}
|
}
|
||||||
|
|
||||||
try:
|
try:
|
||||||
async with httpx.AsyncClient(timeout=_LLM_TIMEOUT) as client:
|
async with httpx.AsyncClient(timeout=_LLM_STREAM_TIMEOUT) as client:
|
||||||
async with client.stream("POST", llm_stream_url, json=payload) as response:
|
async with client.stream("POST", llm_stream_url, json=payload) as response:
|
||||||
if response.status_code != 200:
|
if response.status_code != 200:
|
||||||
logger.error(
|
logger.error(
|
||||||
|
|||||||
Reference in New Issue
Block a user