""" LiteLLM Router — multi-provider LLM backend with automatic fallback. Provider groups: "fast" → Ollama (local, low-latency, no cost) "quality" → Anthropic claude-sonnet-4 (primary), OpenAI gpt-4o (fallback) Fallback chain: quality providers fail → fall back to "fast" group NOTE: LiteLLM is pinned to ==1.82.5 in pyproject.toml. Do NOT upgrade without testing — a September 2025 OOM regression exists in releases between 1.83.x and later versions. """ from __future__ import annotations import logging from typing import Any from litellm import Router from shared.config import settings logger = logging.getLogger(__name__) # --------------------------------------------------------------------------- # Model list — three entries across two groups # --------------------------------------------------------------------------- _model_list: list[dict] = [ # ── local group — Ollama, no API cost ── { "model_name": "local", "litellm_params": { "model": "ollama/qwen3:32b", "api_base": settings.ollama_base_url, }, }, # ── fast group — same as local (aliases for preference mapping) ── { "model_name": "fast", "litellm_params": { "model": "ollama/qwen3:32b", "api_base": settings.ollama_base_url, }, }, # ── economy group — local model, cheaper than commercial ── { "model_name": "economy", "litellm_params": { "model": "ollama/qwen3:32b", "api_base": settings.ollama_base_url, }, }, # ── balanced group — Ollama primary, commercial fallback ── { "model_name": "balanced", "litellm_params": { "model": "ollama/qwen3:32b", "api_base": settings.ollama_base_url, }, }, { "model_name": "balanced", "litellm_params": { "model": "anthropic/claude-sonnet-4-20250514", "api_key": settings.anthropic_api_key, }, }, # ── quality group — Anthropic primary, OpenAI fallback ── { "model_name": "quality", "litellm_params": { "model": "anthropic/claude-sonnet-4-20250514", "api_key": settings.anthropic_api_key, }, }, { "model_name": "quality", "litellm_params": { "model": "openai/gpt-4o", "api_key": settings.openai_api_key, }, }, ] # --------------------------------------------------------------------------- # Router — latency-based, 2 retries per provider, then cross-group fallback # --------------------------------------------------------------------------- llm_router = Router( model_list=_model_list, # If all quality providers fail, fall back to the fast group fallbacks=[{"quality": ["fast"]}, {"balanced": ["fast"]}], routing_strategy="latency-based-routing", num_retries=2, set_verbose=False, ) class LLMResponse: """ Container for LLM completion response. Attributes: content: Text content of the response (empty string if tool_calls present). tool_calls: List of tool call dicts in OpenAI format, or empty list. """ def __init__(self, content: str, tool_calls: list[dict[str, Any]]) -> None: self.content = content self.tool_calls = tool_calls async def complete( model_group: str, messages: list[dict], tenant_id: str, tools: list[dict] | None = None, ) -> LLMResponse: """ Request a completion from the LiteLLM Router. Args: model_group: "quality" or "fast" — selects the provider group. messages: OpenAI-format message list, e.g. [{"role": "system", "content": "..."}, {"role": "user", "content": "..."}] tenant_id: Konstruct tenant UUID, attached to LiteLLM metadata for per-tenant cost tracking. tools: Optional list of OpenAI function-calling tool dicts. When provided, the LLM may return tool_calls instead of text content. Returns: LLMResponse with content (text) and tool_calls (list of tool call dicts). - If LLM returns text: content is non-empty, tool_calls is empty. - If LLM returns tool calls: content is empty, tool_calls contains calls. Raises: Exception: Propagated if all providers in the group (and fallbacks) fail. """ logger.info("LLM request", extra={"model_group": model_group, "tenant_id": tenant_id}) kwargs: dict[str, Any] = { "model": model_group, "messages": messages, "metadata": {"tenant_id": tenant_id}, } if tools: kwargs["tools"] = tools response = await llm_router.acompletion(**kwargs) choice = response.choices[0] message = choice.message # Extract tool_calls if present raw_tool_calls = getattr(message, "tool_calls", None) or [] tool_calls: list[dict[str, Any]] = [] for tc in raw_tool_calls: # LiteLLM returns tool calls as objects with .id, .function.name, .function.arguments try: tool_calls.append({ "id": tc.id, "type": "function", "function": { "name": tc.function.name, "arguments": tc.function.arguments, }, }) except AttributeError: # Fallback: if it's already a dict (some providers) if isinstance(tc, dict): tool_calls.append(tc) content: str = message.content or "" return LLMResponse(content=content, tool_calls=tool_calls)