feat(01-02): LLM Backend Pool — LiteLLM Router with Ollama + Anthropic + OpenAI fallback
- Create llm_pool/router.py: LiteLLM Router with fast (Ollama) and quality (Anthropic/OpenAI) model groups - Configure fallback chain: quality providers fail -> fast group - Pin LiteLLM to ==1.82.5 (avoid September 2025 OOM regression in later releases) - Create llm_pool/main.py: FastAPI service on port 8004 with /complete and /health endpoints - Add providers/__init__.py: reserved for future per-provider customization - Update docker-compose.yml: add llm-pool and celery-worker service stubs
This commit is contained in:
107
packages/llm-pool/llm_pool/main.py
Normal file
107
packages/llm-pool/llm_pool/main.py
Normal file
@@ -0,0 +1,107 @@
|
||||
"""
|
||||
LLM Backend Pool — FastAPI service on port 8004.
|
||||
|
||||
Endpoints:
|
||||
POST /complete — route a completion request through the LiteLLM Router.
|
||||
GET /health — liveness probe.
|
||||
"""
|
||||
|
||||
from __future__ import annotations

import logging

from fastapi import FastAPI
from fastapi.responses import JSONResponse
from pydantic import BaseModel

from llm_pool.router import complete as router_complete
# Module-level logger, named after this module per logging convention.
logger = logging.getLogger(__name__)

# FastAPI application object exposing the LLM pool over HTTP.
app = FastAPI(
    title="Konstruct LLM Pool",
    version="0.1.0",
    description="LiteLLM Router — Ollama + Anthropic + OpenAI with automatic fallback",
)
# ---------------------------------------------------------------------------
# Request / Response schemas
# ---------------------------------------------------------------------------
class CompleteRequest(BaseModel):
    """Body for POST /complete.

    Attributes:
        model: Model group name: "quality" or "fast".
        messages: OpenAI-format message list.
        tenant_id: Konstruct tenant UUID for cost tracking.
    """

    # NOTE(review): the bare string literals that followed each field were
    # no-op statements — pydantic only reads attribute docstrings when
    # `use_attribute_docstrings` is enabled, which it is not here. The field
    # documentation now lives in the class docstring and these comments.
    model: str  # model group name: "quality" or "fast"
    messages: list[dict]  # OpenAI-format message list
    tenant_id: str  # Konstruct tenant UUID for cost tracking
class UsageInfo(BaseModel):
    """Token counts for a single completion call (zeroed until cost tracking lands)."""

    prompt_tokens: int = 0
    completion_tokens: int = 0
class CompleteResponse(BaseModel):
    """Success body for POST /complete."""

    content: str  # completion text returned by the router
    model: str  # the model group the caller requested
    usage: UsageInfo  # token counts — currently always zeroed (see /complete)
class HealthResponse(BaseModel):
    """Body for GET /health."""

    status: str  # "ok" while the service is up
# ---------------------------------------------------------------------------
# Routes
# ---------------------------------------------------------------------------
@app.get("/health", response_model=HealthResponse)
async def health() -> HealthResponse:
    """Liveness probe — answers immediately without touching any provider."""
    return HealthResponse(status="ok")
@app.post("/complete", response_model=CompleteResponse)
async def complete_endpoint(
    request: CompleteRequest,
) -> CompleteResponse | JSONResponse:
    """
    Route a completion request through the LiteLLM Router.

    The `model` field selects the provider group ("quality" or "fast").
    LiteLLM handles provider selection, retries, and cross-group fallback
    automatically.

    Returns:
        CompleteResponse on success. Usage is zeroed for now — LiteLLM Router
        doesn't expose per-call usage easily via acompletion on all provider
        paths; real token counts will be wired in a follow-up plan when cost
        tracking is added.

        A 503 JSON body ({"error": "All providers unavailable"}) if all
        providers, including fallbacks, are unavailable.
    """
    # Keep the try body minimal: only the router call can raise here.
    try:
        content = await router_complete(
            model_group=request.model,
            messages=request.messages,
            tenant_id=request.tenant_id,
        )
    except Exception:
        # Top-level boundary handler: a failure that survives LiteLLM's own
        # retries and fallbacks means no provider could serve the request.
        logger.exception(
            "All LLM providers unavailable for tenant=%s model=%s",
            request.tenant_id,
            request.model,
        )
        return JSONResponse(
            status_code=503,
            content={"error": "All providers unavailable"},
        )
    # The union return annotation makes the error path type-correct, so the
    # previous `# type: ignore[return-value]` is no longer needed; the local
    # JSONResponse import now lives at module level.
    return CompleteResponse(
        content=content,
        model=request.model,
        usage=UsageInfo(),
    )
|
||||
Reference in New Issue
Block a user