feat(streaming): add complete_stream() generator and POST /complete/stream NDJSON endpoint to llm-pool

- complete_stream() in router.py yields token strings via acompletion(stream=True)
- POST /complete/stream returns NDJSON: chunk lines then a done line
- Streaming path does not support tool calls (plain text only)
- Non-streaming POST /complete endpoint unchanged
This commit is contained in:
2026-03-25 17:56:56 -06:00
parent b6c8da8cca
commit f3e358b418
2 changed files with 106 additions and 2 deletions

View File

@@ -16,6 +16,7 @@ NOTE: LiteLLM is pinned to ==1.82.5 in pyproject.toml.
from __future__ import annotations
import logging
from collections.abc import AsyncGenerator
from typing import Any
from litellm import Router
@@ -173,3 +174,48 @@ async def complete(
content: str = message.content or ""
return LLMResponse(content=content, tool_calls=tool_calls)
async def complete_stream(
    model_group: str,
    messages: list[dict],
    tenant_id: str,
) -> AsyncGenerator[str, None]:
    """
    Yield token strings from a streaming LiteLLM Router completion.

    Used only by the web channel's streaming path. Tool calls are NOT
    supported here (tool-call responses are never streamed); the caller
    assembles the full reply by concatenating the yielded chunks.

    Args:
        model_group: Provider group selector, e.g. "quality" or "fast".
        messages: Message list in OpenAI chat format.
        tenant_id: Konstruct tenant UUID, forwarded as cost-tracking metadata.

    Yields:
        Each non-empty token string as the LLM produces it.

    Raises:
        Exception: Propagated when every provider (including fallbacks) fails.
    """
    logger.info(
        "LLM stream request",
        extra={"model_group": model_group, "tenant_id": tenant_id},
    )

    stream = await llm_router.acompletion(
        model=model_group,
        messages=messages,
        metadata={"tenant_id": tenant_id},
        stream=True,
    )

    async for chunk in stream:
        # Chunks without choices or without a delta are silently skipped —
        # some providers emit keep-alive / bookkeeping frames mid-stream.
        try:
            piece = getattr(chunk.choices[0].delta, "content", None)
        except (IndexError, AttributeError):
            continue
        if piece:
            yield piece