""" LiteLLM Router — multi-provider LLM backend with automatic fallback. Provider groups: "fast" → Ollama (local, low-latency, no cost) "quality" → Anthropic claude-sonnet-4 (primary), OpenAI gpt-4o (fallback) Fallback chain: quality providers fail → fall back to "fast" group NOTE: LiteLLM is pinned to ==1.82.5 in pyproject.toml. Do NOT upgrade without testing — a September 2025 OOM regression exists in releases between 1.83.x and later versions. """ from __future__ import annotations import logging from litellm import Router from shared.config import settings logger = logging.getLogger(__name__) # --------------------------------------------------------------------------- # Model list — three entries across two groups # --------------------------------------------------------------------------- _model_list: list[dict] = [ # fast group — local Ollama, no API cost { "model_name": "fast", "litellm_params": { "model": "ollama/qwen3:8b", "api_base": settings.ollama_base_url, }, }, # quality group — Anthropic primary { "model_name": "quality", "litellm_params": { "model": "anthropic/claude-sonnet-4-20250514", "api_key": settings.anthropic_api_key, }, }, # quality group — OpenAI fallback (within the same group) { "model_name": "quality", "litellm_params": { "model": "openai/gpt-4o", "api_key": settings.openai_api_key, }, }, ] # --------------------------------------------------------------------------- # Router — latency-based, 2 retries per provider, then cross-group fallback # --------------------------------------------------------------------------- llm_router = Router( model_list=_model_list, # If all quality providers fail, fall back to the fast group fallbacks=[{"quality": ["fast"]}], routing_strategy="latency-based-routing", num_retries=2, set_verbose=False, ) async def complete( model_group: str, messages: list[dict], tenant_id: str, ) -> str: """ Request a completion from the LiteLLM Router. Args: model_group: "quality" or "fast" — selects the provider group. messages: OpenAI-format message list, e.g. [{"role": "system", "content": "..."}, {"role": "user", "content": "..."}] tenant_id: Konstruct tenant UUID, attached to LiteLLM metadata for per-tenant cost tracking. Returns: The model's response content as a plain string. Raises: Exception: Propagated if all providers in the group (and fallbacks) fail. """ logger.info("LLM request", extra={"model_group": model_group, "tenant_id": tenant_id}) response = await llm_router.acompletion( model=model_group, messages=messages, metadata={"tenant_id": tenant_id}, ) content: str = response.choices[0].message.content or "" return content