feat(streaming): add complete_stream() generator and POST /complete/stream NDJSON endpoint to llm-pool

- complete_stream() in router.py yields token strings via acompletion(stream=True)
- POST /complete/stream returns NDJSON: chunk lines then a done line
- Streaming path does not support tool calls (plain text only)
- Non-streaming POST /complete endpoint unchanged
This commit is contained in:
2026-03-25 17:56:56 -06:00
parent b6c8da8cca
commit f3e358b418
2 changed files with 106 additions and 2 deletions

View File

@@ -2,19 +2,23 @@
LLM Backend Pool — FastAPI service on port 8004.
Endpoints:
POST /complete — route a completion request through the LiteLLM Router.
GET /health — liveness probe.
POST /complete — route a completion request through the LiteLLM Router.
POST /complete/stream — streaming variant; returns NDJSON token chunks.
GET /health — liveness probe.
"""
from __future__ import annotations
import json
import logging
from typing import Any
from fastapi import FastAPI
from fastapi.responses import StreamingResponse
from pydantic import BaseModel
from llm_pool.router import complete as router_complete
from llm_pool.router import complete_stream as router_complete_stream
logger = logging.getLogger(__name__)
@@ -69,6 +73,19 @@ class HealthResponse(BaseModel):
status: str
class StreamCompleteRequest(BaseModel):
    """
    Request body for POST /complete/stream.

    Streaming requests carry only the plain-completion fields — the
    streaming endpoint does not support tool calls (see
    complete_stream_endpoint), so no tool-related fields appear here.
    """

    model: str
    """Model group name: "quality" or "fast"."""

    messages: list[dict]
    """OpenAI-format message list."""

    tenant_id: str
    """Konstruct tenant UUID for cost tracking."""
# ---------------------------------------------------------------------------
# Routes
# ---------------------------------------------------------------------------
@@ -123,3 +140,44 @@ async def complete_endpoint(request: CompleteRequest) -> CompleteResponse:
status_code=503,
content={"error": "All providers unavailable"},
)
@app.post("/complete/stream")
async def complete_stream_endpoint(request: StreamCompleteRequest) -> StreamingResponse:
"""
Stream a completion through the LiteLLM Router using NDJSON.
Each line of the response body is a JSON object:
{"type": "chunk", "text": "<token>"} — zero or more times
{"type": "done"} — final line, signals end of stream
On provider failure, yields:
{"type": "error", "message": "All providers unavailable"}
The caller (orchestrator runner) reads line-by-line and forwards chunks
to Redis pub-sub for the web WebSocket handler.
NOTE: Tool calls are NOT supported in this endpoint — only plain text
streaming. Use POST /complete for tool-call responses.
"""
async def _generate() -> Any:
try:
async for token in router_complete_stream(
model_group=request.model,
messages=request.messages,
tenant_id=request.tenant_id,
):
yield json.dumps({"type": "chunk", "text": token}) + "\n"
yield json.dumps({"type": "done"}) + "\n"
except Exception:
logger.exception(
"Streaming LLM failed for tenant=%s model=%s",
request.tenant_id,
request.model,
)
yield json.dumps({"type": "error", "message": "All providers unavailable"}) + "\n"
return StreamingResponse(
_generate(),
media_type="application/x-ndjson",
)