feat(streaming): add complete_stream() generator and POST /complete/stream NDJSON endpoint to llm-pool

- complete_stream() in router.py yields token strings via acompletion(stream=True)
- POST /complete/stream returns NDJSON: chunk lines then a done line
- Streaming path does not support tool calls (plain text only)
- Non-streaming POST /complete endpoint unchanged
This commit is contained in:
2026-03-25 17:56:56 -06:00
parent b6c8da8cca
commit f3e358b418
2 changed files with 106 additions and 2 deletions

View File

@@ -16,6 +16,7 @@ NOTE: LiteLLM is pinned to ==1.82.5 in pyproject.toml.
from __future__ import annotations
import logging
from collections.abc import AsyncGenerator
from typing import Any
from litellm import Router
@@ -173,3 +174,48 @@ async def complete(
content: str = message.content or ""
return LLMResponse(content=content, tool_calls=tool_calls)
async def complete_stream(
    model_group: str,
    messages: list[dict],
    tenant_id: str,
) -> AsyncGenerator[str, None]:
    """
    Yield token strings from a streaming LiteLLM Router completion.

    Used only by the web channel's streaming path. Tool calls are NOT
    supported here (tool-call responses are never streamed); the caller
    assembles the full reply by concatenating the yielded chunks.

    Args:
        model_group: Provider group selector, e.g. "quality" or "fast".
        messages: Message list in OpenAI chat format.
        tenant_id: Konstruct tenant UUID, forwarded as cost-tracking metadata.

    Yields:
        Each non-empty token string as the LLM produces it.

    Raises:
        Exception: Propagated when every provider (including fallbacks) fails.
    """
    logger.info(
        "LLM stream request",
        extra={"model_group": model_group, "tenant_id": tenant_id},
    )

    stream = await llm_router.acompletion(
        model=model_group,
        messages=messages,
        metadata={"tenant_id": tenant_id},
        stream=True,
    )

    async for chunk in stream:
        # Chunks without choices or without a delta are silently skipped —
        # some providers emit keep-alive / bookkeeping frames mid-stream.
        try:
            piece = getattr(chunk.choices[0].delta, "content", None)
        except (IndexError, AttributeError):
            continue
        if piece:
            yield piece