feat(01-02): LLM Backend Pool — LiteLLM Router with Ollama + Anthropic + OpenAI fallback
- Create llm_pool/router.py: LiteLLM Router with fast (Ollama) and quality (Anthropic/OpenAI) model groups - Configure fallback chain: quality providers fail -> fast group - Pin LiteLLM to ==1.82.5 (avoid September 2025 OOM regression in later releases) - Create llm_pool/main.py: FastAPI service on port 8004 with /complete and /health endpoints - Add providers/__init__.py: reserved for future per-provider customization - Update docker-compose.yml: add llm-pool and celery-worker service stubs
This commit is contained in:
107
packages/llm-pool/llm_pool/main.py
Normal file
107
packages/llm-pool/llm_pool/main.py
Normal file
@@ -0,0 +1,107 @@
|
||||
"""
|
||||
LLM Backend Pool — FastAPI service on port 8004.
|
||||
|
||||
Endpoints:
|
||||
POST /complete — route a completion request through the LiteLLM Router.
|
||||
GET /health — liveness probe.
|
||||
"""
|
||||
|
||||
from __future__ import annotations

import logging

from fastapi import FastAPI
from fastapi.responses import JSONResponse
from pydantic import BaseModel

from llm_pool.router import complete as router_complete
# Module-level logger, named after this module per logging convention.
logger = logging.getLogger(__name__)

# FastAPI application object exposing the LLM pool over HTTP.
app = FastAPI(
    title="Konstruct LLM Pool",
    version="0.1.0",
    description="LiteLLM Router — Ollama + Anthropic + OpenAI with automatic fallback",
)
# ---------------------------------------------------------------------------
# Request / Response schemas
# ---------------------------------------------------------------------------
class CompleteRequest(BaseModel):
    """Body for POST /complete.

    Attributes:
        model: Model group name: "quality" or "fast".
        messages: OpenAI-format message list.
        tenant_id: Konstruct tenant UUID for cost tracking.
    """

    # NOTE(review): the bare string literals that followed each field were
    # no-op statements — pydantic only reads attribute docstrings when
    # `use_attribute_docstrings` is enabled, which it is not here. The field
    # documentation now lives in the class docstring and these comments.
    model: str  # model group name: "quality" or "fast"
    messages: list[dict]  # OpenAI-format message list
    tenant_id: str  # Konstruct tenant UUID for cost tracking
class UsageInfo(BaseModel):
    """Token counts for a single completion call (zeroed until cost tracking lands)."""

    prompt_tokens: int = 0
    completion_tokens: int = 0
class CompleteResponse(BaseModel):
    """Success body for POST /complete."""

    content: str  # completion text returned by the router
    model: str  # the model group the caller requested
    usage: UsageInfo  # token counts — currently always zeroed (see /complete)
class HealthResponse(BaseModel):
    """Body for GET /health."""

    status: str  # "ok" while the service is up
# ---------------------------------------------------------------------------
# Routes
# ---------------------------------------------------------------------------
@app.get("/health", response_model=HealthResponse)
async def health() -> HealthResponse:
    """Liveness probe — answers immediately without touching any provider."""
    return HealthResponse(status="ok")
@app.post("/complete", response_model=CompleteResponse)
async def complete_endpoint(
    request: CompleteRequest,
) -> CompleteResponse | JSONResponse:
    """
    Route a completion request through the LiteLLM Router.

    The `model` field selects the provider group ("quality" or "fast").
    LiteLLM handles provider selection, retries, and cross-group fallback
    automatically.

    Returns:
        CompleteResponse on success. Usage is zeroed for now — LiteLLM Router
        doesn't expose per-call usage easily via acompletion on all provider
        paths; real token counts will be wired in a follow-up plan when cost
        tracking is added.

        A 503 JSON body ({"error": "All providers unavailable"}) if all
        providers, including fallbacks, are unavailable.
    """
    # Keep the try body minimal: only the router call can raise here.
    try:
        content = await router_complete(
            model_group=request.model,
            messages=request.messages,
            tenant_id=request.tenant_id,
        )
    except Exception:
        # Top-level boundary handler: a failure that survives LiteLLM's own
        # retries and fallbacks means no provider could serve the request.
        logger.exception(
            "All LLM providers unavailable for tenant=%s model=%s",
            request.tenant_id,
            request.model,
        )
        return JSONResponse(
            status_code=503,
            content={"error": "All providers unavailable"},
        )
    # The union return annotation makes the error path type-correct, so the
    # previous `# type: ignore[return-value]` is no longer needed; the local
    # JSONResponse import now lives at module level.
    return CompleteResponse(
        content=content,
        model=request.model,
        usage=UsageInfo(),
    )
|
||||
Reference in New Issue
Block a user