feat(streaming): add complete_stream() generator and POST /complete/stream NDJSON endpoint to llm-pool

- complete_stream() in router.py yields token strings via acompletion(stream=True)
- POST /complete/stream returns NDJSON: chunk lines then a done line
- Streaming path does not support tool calls (plain text only)
- Non-streaming POST /complete endpoint unchanged
This commit is contained in:
2026-03-25 17:56:56 -06:00
parent b6c8da8cca
commit f3e358b418
2 changed files with 106 additions and 2 deletions

View File

@@ -2,19 +2,23 @@
LLM Backend Pool — FastAPI service on port 8004.
Endpoints:
POST /complete — route a completion request through the LiteLLM Router.
GET /health — liveness probe.
POST /complete — route a completion request through the LiteLLM Router.
POST /complete/stream — streaming variant; returns NDJSON token chunks.
GET /health — liveness probe.
"""
from __future__ import annotations
import json
import logging
from typing import Any
from fastapi import FastAPI
from fastapi.responses import StreamingResponse
from pydantic import BaseModel
from llm_pool.router import complete as router_complete
from llm_pool.router import complete_stream as router_complete_stream
logger = logging.getLogger(__name__)
@@ -69,6 +73,19 @@ class HealthResponse(BaseModel):
status: str
class StreamCompleteRequest(BaseModel):
    """
    Request body for POST /complete/stream.

    Streaming requests carry only the plain-completion fields — the
    streaming endpoint does not support tool calls (see
    complete_stream_endpoint), so no tool-related fields appear here.
    """

    model: str
    """Model group name: "quality" or "fast"."""

    messages: list[dict]
    """OpenAI-format message list."""

    tenant_id: str
    """Konstruct tenant UUID for cost tracking."""
# ---------------------------------------------------------------------------
# Routes
# ---------------------------------------------------------------------------
@@ -123,3 +140,44 @@ async def complete_endpoint(request: CompleteRequest) -> CompleteResponse:
status_code=503,
content={"error": "All providers unavailable"},
)
@app.post("/complete/stream")
async def complete_stream_endpoint(request: StreamCompleteRequest) -> StreamingResponse:
"""
Stream a completion through the LiteLLM Router using NDJSON.
Each line of the response body is a JSON object:
{"type": "chunk", "text": "<token>"} — zero or more times
{"type": "done"} — final line, signals end of stream
On provider failure, yields:
{"type": "error", "message": "All providers unavailable"}
The caller (orchestrator runner) reads line-by-line and forwards chunks
to Redis pub-sub for the web WebSocket handler.
NOTE: Tool calls are NOT supported in this endpoint — only plain text
streaming. Use POST /complete for tool-call responses.
"""
async def _generate() -> Any:
try:
async for token in router_complete_stream(
model_group=request.model,
messages=request.messages,
tenant_id=request.tenant_id,
):
yield json.dumps({"type": "chunk", "text": token}) + "\n"
yield json.dumps({"type": "done"}) + "\n"
except Exception:
logger.exception(
"Streaming LLM failed for tenant=%s model=%s",
request.tenant_id,
request.model,
)
yield json.dumps({"type": "error", "message": "All providers unavailable"}) + "\n"
return StreamingResponse(
_generate(),
media_type="application/x-ndjson",
)