feat(streaming): add complete_stream() generator and POST /complete/stream NDJSON endpoint to llm-pool
- complete_stream() in router.py yields token strings via acompletion(stream=True)
- POST /complete/stream returns NDJSON: chunk lines then a done line
- Streaming path does not support tool calls (plain text only)
- Non-streaming POST /complete endpoint unchanged
This commit is contained in:
@@ -16,6 +16,7 @@ NOTE: LiteLLM is pinned to ==1.82.5 in pyproject.toml.
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
from collections.abc import AsyncGenerator
|
||||
from typing import Any
|
||||
|
||||
from litellm import Router
|
||||
@@ -173,3 +174,48 @@ async def complete(
|
||||
|
||||
content: str = message.content or ""
|
||||
return LLMResponse(content=content, tool_calls=tool_calls)
|
||||
|
||||
|
||||
async def complete_stream(
    model_group: str,
    messages: list[dict[str, Any]],
    tenant_id: str,
) -> AsyncGenerator[str, None]:
    """
    Stream a completion from the LiteLLM Router, yielding token strings.

    Only used for the web channel streaming path — does NOT support tool calls
    (tool-call responses are not streamed). The caller is responsible for
    assembling the full response from the yielded chunks.

    Args:
        model_group: "quality", "fast", etc. — selects the provider group.
        messages: OpenAI-format message list.
        tenant_id: Konstruct tenant UUID for cost tracking metadata.

    Yields:
        Token strings as they are generated by the LLM.

    Raises:
        Exception: Propagated if all providers (and fallbacks) fail.
    """
    logger.info(
        "LLM stream request",
        extra={"model_group": model_group, "tenant_id": tenant_id},
    )

    response = await llm_router.acompletion(
        model=model_group,
        messages=messages,
        metadata={"tenant_id": tenant_id},
        stream=True,
    )

    async for chunk in response:
        # Some providers emit keep-alive/metadata chunks with empty or missing
        # choices. Guard with cheap explicit checks instead of a per-chunk
        # try/except — same skip-on-malformed behavior, no exception machinery
        # in the hot loop.
        choices = getattr(chunk, "choices", None)
        if not choices:
            continue
        delta = getattr(choices[0], "delta", None)
        # getattr on a None delta safely falls through to the default.
        token = getattr(delta, "content", None)
        if token:
            yield token
|
||||
|
||||
Reference in New Issue
Block a user