konstruct/packages/llm-pool/llm_pool/router.py
Adolfo Delorenzo ee2f88e13b feat(01-02): LLM Backend Pool — LiteLLM Router with Ollama + Anthropic + OpenAI fallback
- Create llm_pool/router.py: LiteLLM Router with fast (Ollama) and quality (Anthropic/OpenAI) model groups
- Configure fallback chain: quality providers fail -> fast group
- Pin LiteLLM to ==1.82.5 (avoid September 2025 OOM regression in later releases)
- Create llm_pool/main.py: FastAPI service on port 8004 with /complete and /health endpoints
- Add providers/__init__.py: reserved for future per-provider customization
- Update docker-compose.yml: add llm-pool and celery-worker service stubs
2026-03-23 10:03:05 -06:00
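The commit also mentions llm_pool/main.py, a FastAPI service on port 8004 with /complete and /health endpoints; that file is not shown on this page. A minimal sketch of how it could wire the router's complete() helper into those endpoints, assuming a hypothetical Pydantic request model and request-body field names (model_group, messages, tenant_id) that the source does not confirm:

# Sketch of llm_pool/main.py (not shown in this commit view).
# The CompletionRequest field names are assumptions, not confirmed by the source.
from fastapi import FastAPI
from pydantic import BaseModel

from llm_pool.router import complete

app = FastAPI(title="llm-pool")


class CompletionRequest(BaseModel):
    model_group: str = "fast"   # "fast" or "quality"
    messages: list[dict]        # OpenAI-format message list
    tenant_id: str              # Konstruct tenant UUID


@app.get("/health")
async def health() -> dict:
    return {"status": "ok"}


@app.post("/complete")
async def complete_endpoint(req: CompletionRequest) -> dict:
    content = await complete(req.model_group, req.messages, req.tenant_id)
    return {"content": content}


if __name__ == "__main__":
    import uvicorn

    uvicorn.run(app, host="0.0.0.0", port=8004)

Only the port number and the two endpoint paths come from the commit message; the rest is illustrative.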


"""
LiteLLM Router — multi-provider LLM backend with automatic fallback.
Provider groups:
"fast" → Ollama (local, low-latency, no cost)
"quality" → Anthropic claude-sonnet-4 (primary), OpenAI gpt-4o (fallback)
Fallback chain:
quality providers fail → fall back to "fast" group
NOTE: LiteLLM is pinned to ==1.82.5 in pyproject.toml.
Do NOT upgrade without testing — a September 2025 OOM regression exists
in releases between 1.83.x and later versions.
"""
from __future__ import annotations
import logging
from litellm import Router
from shared.config import settings
logger = logging.getLogger(__name__)
# ---------------------------------------------------------------------------
# Model list — three entries across two groups
# ---------------------------------------------------------------------------
_model_list: list[dict] = [
    # fast group — local Ollama, no API cost
    {
        "model_name": "fast",
        "litellm_params": {
            "model": "ollama/qwen3:8b",
            "api_base": settings.ollama_base_url,
        },
    },
    # quality group — Anthropic primary
    {
        "model_name": "quality",
        "litellm_params": {
            "model": "anthropic/claude-sonnet-4-20250514",
            "api_key": settings.anthropic_api_key,
        },
    },
    # quality group — OpenAI fallback (within the same group)
    {
        "model_name": "quality",
        "litellm_params": {
            "model": "openai/gpt-4o",
            "api_key": settings.openai_api_key,
        },
    },
]
# ---------------------------------------------------------------------------
# Router — latency-based, 2 retries per provider, then cross-group fallback
# ---------------------------------------------------------------------------
llm_router = Router(
    model_list=_model_list,
    # If all quality providers fail, fall back to the fast group
    fallbacks=[{"quality": ["fast"]}],
    routing_strategy="latency-based-routing",
    num_retries=2,
    set_verbose=False,
)
async def complete(
    model_group: str,
    messages: list[dict],
    tenant_id: str,
) -> str:
    """
    Request a completion from the LiteLLM Router.

    Args:
        model_group: "quality" or "fast" — selects the provider group.
        messages: OpenAI-format message list, e.g.
            [{"role": "system", "content": "..."}, {"role": "user", "content": "..."}]
        tenant_id: Konstruct tenant UUID, attached to LiteLLM metadata for
            per-tenant cost tracking.

    Returns:
        The model's response content as a plain string.

    Raises:
        Exception: Propagated if all providers in the group (and fallbacks) fail.
    """
    logger.info("LLM request", extra={"model_group": model_group, "tenant_id": tenant_id})

    response = await llm_router.acompletion(
        model=model_group,
        messages=messages,
        metadata={"tenant_id": tenant_id},
    )

    content: str = response.choices[0].message.content or ""
    return content
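
router.py reads ollama_base_url, anthropic_api_key, and openai_api_key from shared.config.settings, a module that is not part of this commit view. A minimal sketch of the assumed shape, using pydantic-settings purely as an assumption about how that module is built:

# Sketch of the assumed shared/config.py — the real module is not shown here.
# Only the three attribute names are taken from router.py; everything else is an assumption.
from pydantic_settings import BaseSettings


class Settings(BaseSettings):
    ollama_base_url: str = "http://localhost:11434"  # assumed default
    anthropic_api_key: str = ""
    openai_api_key: str = ""


settings = Settings()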