"""
LLM Backend Pool — FastAPI service on port 8004.

Endpoints:
    POST /complete — route a completion request through the LiteLLM Router.
    GET  /health   — liveness probe.
"""

from __future__ import annotations

import logging

from fastapi import FastAPI
from fastapi.responses import JSONResponse
from pydantic import BaseModel

from llm_pool.router import complete as router_complete

logger = logging.getLogger(__name__)

app = FastAPI(
    title="Konstruct LLM Pool",
    description="LiteLLM Router — Ollama + Anthropic + OpenAI with automatic fallback",
    version="0.1.0",
)


# ---------------------------------------------------------------------------
# Request / Response schemas
# ---------------------------------------------------------------------------


class CompleteRequest(BaseModel):
    """Body for POST /complete."""

    model: str
    """Model group name: "quality" or "fast"."""

    messages: list[dict]
    """OpenAI-format message list."""

    tenant_id: str
    """Konstruct tenant UUID for cost tracking."""


class UsageInfo(BaseModel):
    """Token counts attached to a completion response; both default to 0."""

    prompt_tokens: int = 0
    completion_tokens: int = 0


class CompleteResponse(BaseModel):
    """Successful body for POST /complete."""

    # Text returned by the router for this request.
    content: str
    # Echo of the model group name the caller asked for.
    model: str
    usage: UsageInfo


class HealthResponse(BaseModel):
    """Body for GET /health."""

    status: str


# ---------------------------------------------------------------------------
# Routes
# ---------------------------------------------------------------------------


@app.get("/health", response_model=HealthResponse)
async def health() -> HealthResponse:
    """Liveness probe — returns immediately."""
    return HealthResponse(status="ok")


@app.post("/complete", response_model=CompleteResponse)
async def complete_endpoint(request: CompleteRequest) -> CompleteResponse:
    """
    Route a completion request through the LiteLLM Router.

    The `model` field selects the provider group ("quality" or "fast").
    LiteLLM handles provider selection, retries, and cross-group fallback
    automatically.

    Returns 503 JSON if all providers (including fallbacks) are unavailable.

    Args:
        request: Validated request body carrying the model group name, the
            message list, and the tenant id.

    Returns:
        A ``CompleteResponse`` with the router's content, the requested model
        group name, and zeroed usage. On failure, a ``JSONResponse`` with
        status 503 and body ``{"error": "All providers unavailable"}``.
    """
    try:
        content = await router_complete(
            model_group=request.model,
            messages=request.messages,
            tenant_id=request.tenant_id,
        )
        # LiteLLM Router doesn't expose per-call usage easily via acompletion
        # on all provider paths; we return zeroed usage for now and will wire
        # real token counts in a follow-up plan when cost tracking is added.
        return CompleteResponse(
            content=content,
            model=request.model,
            usage=UsageInfo(),
        )
    except Exception:
        # Service-boundary catch-all: any exception raised above (not only a
        # provider outage) is logged with its traceback and reported as 503.
        logger.exception(
            "All LLM providers unavailable for tenant=%s model=%s",
            request.tenant_id,
            request.model,
        )
        # The error body does not match CompleteResponse, so it is returned as
        # an explicit JSONResponse rather than a model instance.
        return JSONResponse(  # type: ignore[return-value]
            status_code=503,
            content={"error": "All providers unavailable"},
        )