konstruct/packages/llm-pool/llm_pool/main.py
Adolfo Delorenzo ee2f88e13b feat(01-02): LLM Backend Pool — LiteLLM Router with Ollama + Anthropic + OpenAI fallback
- Create llm_pool/router.py: LiteLLM Router with fast (Ollama) and quality (Anthropic/OpenAI) model groups
- Configure fallback chain: quality providers fail -> fast group
- Pin LiteLLM to ==1.82.5 (avoid September 2025 OOM regression in later releases)
- Create llm_pool/main.py: FastAPI service on port 8004 with /complete and /health endpoints
- Add providers/__init__.py: reserved for future per-provider customization
- Update docker-compose.yml: add llm-pool and celery-worker service stubs
2026-03-23 10:03:05 -06:00

108 lines
2.9 KiB
Python

"""
LLM Backend Pool — FastAPI service on port 8004.
Endpoints:
POST /complete — route a completion request through the LiteLLM Router.
GET /health — liveness probe.
"""
from __future__ import annotations
import logging
from fastapi import FastAPI
from pydantic import BaseModel
from llm_pool.router import complete as router_complete
logger = logging.getLogger(__name__)
app = FastAPI(
title="Konstruct LLM Pool",
description="LiteLLM Router — Ollama + Anthropic + OpenAI with automatic fallback",
version="0.1.0",
)
# ---------------------------------------------------------------------------
# Request / Response schemas
# ---------------------------------------------------------------------------
class CompleteRequest(BaseModel):
"""Body for POST /complete."""
model: str
"""Model group name: "quality" or "fast"."""
messages: list[dict]
"""OpenAI-format message list."""
tenant_id: str
"""Konstruct tenant UUID for cost tracking."""
class UsageInfo(BaseModel):
prompt_tokens: int = 0
completion_tokens: int = 0
class CompleteResponse(BaseModel):
content: str
model: str
usage: UsageInfo
class HealthResponse(BaseModel):
status: str
# ---------------------------------------------------------------------------
# Routes
# ---------------------------------------------------------------------------
@app.get("/health", response_model=HealthResponse)
async def health() -> HealthResponse:
"""Liveness probe — returns immediately."""
return HealthResponse(status="ok")
@app.post("/complete", response_model=CompleteResponse)
async def complete_endpoint(request: CompleteRequest) -> CompleteResponse:
"""
Route a completion request through the LiteLLM Router.
The `model` field selects the provider group ("quality" or "fast").
LiteLLM handles provider selection, retries, and cross-group fallback
automatically.
Returns 503 JSON if all providers (including fallbacks) are unavailable.
"""
from fastapi.responses import JSONResponse
try:
content = await router_complete(
model_group=request.model,
messages=request.messages,
tenant_id=request.tenant_id,
)
# LiteLLM Router doesn't expose per-call usage easily via acompletion
# on all provider paths; we return zeroed usage for now and will wire
# real token counts in a follow-up plan when cost tracking is added.
return CompleteResponse(
content=content,
model=request.model,
usage=UsageInfo(),
)
except Exception:
logger.exception(
"All LLM providers unavailable for tenant=%s model=%s",
request.tenant_id,
request.model,
)
return JSONResponse( # type: ignore[return-value]
status_code=503,
content={"error": "All providers unavailable"},
)