feat(01-02): LLM Backend Pool — LiteLLM Router with Ollama + Anthropic + OpenAI fallback
- Create llm_pool/router.py: LiteLLM Router with fast (Ollama) and quality (Anthropic/OpenAI) model groups - Configure fallback chain: quality providers fail -> fast group - Pin LiteLLM to ==1.82.5 (avoid September 2025 OOM regression in later releases) - Create llm_pool/main.py: FastAPI service on port 8004 with /complete and /health endpoints - Add providers/__init__.py: reserved for future per-provider customization - Update docker-compose.yml: add llm-pool and celery-worker service stubs
This commit is contained in:
99
packages/llm-pool/llm_pool/router.py
Normal file
99
packages/llm-pool/llm_pool/router.py
Normal file
@@ -0,0 +1,99 @@
|
||||
"""
|
||||
LiteLLM Router — multi-provider LLM backend with automatic fallback.
|
||||
|
||||
Provider groups:
|
||||
"fast" → Ollama (local, low-latency, no cost)
|
||||
"quality" → Anthropic claude-sonnet-4 (primary), OpenAI gpt-4o (fallback)
|
||||
|
||||
Fallback chain:
|
||||
quality providers fail → fall back to "fast" group
|
||||
|
||||
NOTE: LiteLLM is pinned to ==1.82.5 in pyproject.toml.
|
||||
Do NOT upgrade without testing — a September 2025 OOM regression exists
|
||||
in releases between 1.83.x and later versions.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
|
||||
from litellm import Router
|
||||
|
||||
from shared.config import settings
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Model list — three entries across two groups
|
||||
# ---------------------------------------------------------------------------
|
||||
_model_list: list[dict] = [
|
||||
# fast group — local Ollama, no API cost
|
||||
{
|
||||
"model_name": "fast",
|
||||
"litellm_params": {
|
||||
"model": "ollama/qwen3:8b",
|
||||
"api_base": settings.ollama_base_url,
|
||||
},
|
||||
},
|
||||
# quality group — Anthropic primary
|
||||
{
|
||||
"model_name": "quality",
|
||||
"litellm_params": {
|
||||
"model": "anthropic/claude-sonnet-4-20250514",
|
||||
"api_key": settings.anthropic_api_key,
|
||||
},
|
||||
},
|
||||
# quality group — OpenAI fallback (within the same group)
|
||||
{
|
||||
"model_name": "quality",
|
||||
"litellm_params": {
|
||||
"model": "openai/gpt-4o",
|
||||
"api_key": settings.openai_api_key,
|
||||
},
|
||||
},
|
||||
]
|
||||
|
||||
# ---------------------------------------------------------------------------
# Router — latency-based, 2 retries per provider, then cross-group fallback
# ---------------------------------------------------------------------------
# Once every "quality" provider has failed, route the request to "fast".
_fallback_chain: list[dict] = [{"quality": ["fast"]}]

llm_router = Router(
    model_list=_model_list,
    fallbacks=_fallback_chain,
    routing_strategy="latency-based-routing",
    num_retries=2,
    set_verbose=False,
)
|
||||
|
||||
|
||||
async def complete(
    model_group: str,
    messages: list[dict],
    tenant_id: str,
) -> str:
    """
    Request a completion from the LiteLLM Router.

    Args:
        model_group: "quality" or "fast" — selects the provider group.
        messages: OpenAI-format message list, e.g.
            [{"role": "system", "content": "..."}, {"role": "user", "content": "..."}]
        tenant_id: Konstruct tenant UUID, attached to LiteLLM metadata for
            per-tenant cost tracking.

    Returns:
        The model's response content as a plain string.

    Raises:
        Exception: Propagated if all providers in the group (and fallbacks) fail.
    """
    log_fields = {"model_group": model_group, "tenant_id": tenant_id}
    logger.info("LLM request", extra=log_fields)

    result = await llm_router.acompletion(
        model=model_group,
        messages=messages,
        metadata={"tenant_id": tenant_id},
    )

    # Coerce a null content field (e.g. a response with no text part) to "".
    return result.choices[0].message.content or ""
|
||||
Reference in New Issue
Block a user