feat(02-01): wire two-layer memory into orchestrator pipeline

- builder.py: add build_messages_with_memory() — injects pgvector context as system message + sliding window history before current user turn - runner.py: accept optional messages parameter; fallback to simple build for backward compat (existing tests unaffected) - tasks.py: memory pipeline in _process_message — load short-term + retrieve long-term before LLM call; append turns to Redis + dispatch embed_and_store fire-and-forget after response - tasks.py: add embed_and_store Celery task (sync def + asyncio.run()) for async pgvector backfill — never blocks the LLM response pipeline - memory/embedder.py: lazy singleton SentenceTransformer (all-MiniLM-L6-v2) with embed_text() / embed_texts() helpers - All 202 tests pass (196 existing + 6 new memory integration tests)
2026-03-23 14:45:21 -06:00
parent 2dc94682ff
commit 45b957377f
4 changed files with 345 additions and 14 deletions
--- a/packages/orchestrator/orchestrator/agents/builder.py
+++ b/packages/orchestrator/orchestrator/agents/builder.py
@@ -11,6 +11,14 @@ AI TRANSPARENCY POLICY:
  Per Konstruct product design, agents MUST acknowledge they are AI assistants
  when directly asked. This clause is injected unconditionally to prevent
  agents from deceiving users, regardless of persona configuration.
+
+Memory-enriched message assembly:
+  build_messages_with_memory() extends build_messages() by injecting:
+  1. Long-term context: semantically relevant past exchanges (pgvector)
+     Injected as a system message BEFORE the sliding window so the LLM
+     has background context without it polluting the conversation flow.
+  2. Short-term context: recent messages (Redis sliding window)
+     Represents the immediate conversation history in this session.
 """

 from __future__ import annotations
@@ -82,3 +90,54 @@ def build_messages(

    messages.append({"role": "user", "content": user_message})
    return messages
+
+
+def build_messages_with_memory(
+    agent: Agent,
+    current_message: str,
+    recent_messages: list[dict],
+    relevant_context: list[str],
+) -> list[dict]:
+    """
+    Build an LLM messages array enriched with two-layer memory.
+
+    Structure (in order):
+      1. System message — agent identity, persona, AI transparency clause
+      2. System message — long-term context from pgvector (ONLY if non-empty)
+         Injected as a system message before the sliding window so the LLM
+         has relevant background without it appearing in the conversation.
+      3. Sliding window messages — recent conversation history (user/assistant)
+      4. Current user message
+
+    The pgvector context is omitted entirely when relevant_context is empty —
+    injecting an empty context block would be noise in the LLM's context window.
+
+    Args:
+        agent:            ORM Agent instance (for system prompt assembly).
+        current_message:  The current user message text.
+        recent_messages:  Short-term memory — list of {"role", "content"} dicts
+                          from Redis sliding window (oldest first).
+        relevant_context: Long-term memory — list of content strings from
+                          pgvector similarity search (most relevant first).
+
+    Returns:
+        List of message dicts suitable for an OpenAI-compatible API call.
+    """
+    system_prompt = build_system_prompt(agent)
+    messages: list[dict] = [{"role": "system", "content": system_prompt}]
+
+    # Inject long-term pgvector context as a system message BEFORE sliding window
+    # Only inject when there IS relevant context — empty context block is noise
+    if relevant_context:
+        context_lines = "\n".join(f"- {item}" for item in relevant_context)
+        context_message = f"Relevant context from past conversations:\n{context_lines}"
+        messages.append({"role": "system", "content": context_message})
+
+    # Append short-term sliding window messages (recent conversation history)
+    if recent_messages:
+        messages.extend(recent_messages)
+
+    # Append the current user message
+    messages.append({"role": "user", "content": current_message})
+
+    return messages
--- a/packages/orchestrator/orchestrator/agents/runner.py
+++ b/packages/orchestrator/orchestrator/agents/runner.py
@@ -31,19 +31,29 @@ _FALLBACK_RESPONSE = (
 _LLM_TIMEOUT = httpx.Timeout(timeout=120.0, connect=10.0)


-async def run_agent(msg: KonstructMessage, agent: Agent) -> str:
+async def run_agent(
+    msg: KonstructMessage,
+    agent: Agent,
+    messages: list[dict] | None = None,
+) -> str:
    """
    Execute an agent against the LLM pool and return the response text.

    Args:
        msg:      The inbound Konstruct message being processed.
        agent:    The ORM Agent instance that handles this message.
+        messages: Optional pre-built messages array (e.g. from
+                  build_messages_with_memory). When provided, used directly.
+                  When None, falls back to simple [system, user] construction
+                  for backward compatibility (e.g. existing tests).

    Returns:
        The LLM response content as a plain string.
        Returns a polite fallback message if the LLM pool is unreachable or
        returns a non-200 response.
    """
+    if messages is None:
+        # Fallback: simple two-message construction (backward compat)
        system_prompt = build_system_prompt(agent)

        # Extract user text from the message content
--- a/packages/orchestrator/orchestrator/memory/embedder.py
+++ b/packages/orchestrator/orchestrator/memory/embedder.py
@@ -0,0 +1,81 @@
+"""
+Singleton embedding model for the Orchestrator.
+
+Loads all-MiniLM-L6-v2 once at module level (lazy singleton pattern).
+The model produces 384-dimensional embeddings compatible with the
+conversation_embeddings.embedding vector(384) column.
+
+Why a singleton: sentence-transformers models are ~100MB and take ~2s to load.
+Loading per-request would be catastrophically slow. Loading at module level
+means the model is loaded once when the Celery worker starts.
+
+Thread safety: SentenceTransformer.encode() releases the GIL and is safe to
+call from multiple Celery threads simultaneously.
+"""
+
+from __future__ import annotations
+
+import logging
+from typing import TYPE_CHECKING
+
+if TYPE_CHECKING:
+    from sentence_transformers import SentenceTransformer
+
+logger = logging.getLogger(__name__)
+
+# Embedding model name — must match the vector(384) column dimension
+_MODEL_NAME = "all-MiniLM-L6-v2"
+
+# Lazy singleton — loaded on first use, not at import time
+# This avoids 2s+ load time when the module is imported but not used
+_model: "SentenceTransformer | None" = None
+
+
+def get_embedding_model() -> "SentenceTransformer":
+    """
+    Return the singleton SentenceTransformer model, loading it on first call.
+
+    Thread-safe: multiple Celery workers can call this concurrently.
+    The model is loaded only once per process.
+
+    Returns:
+        Loaded SentenceTransformer model (all-MiniLM-L6-v2, 384 dims).
+    """
+    global _model
+    if _model is None:
+        logger.info("Loading embedding model %s (first use)", _MODEL_NAME)
+        from sentence_transformers import SentenceTransformer
+
+        _model = SentenceTransformer(_MODEL_NAME)
+        logger.info("Embedding model %s loaded", _MODEL_NAME)
+    return _model
+
+
+def embed_text(text: str) -> list[float]:
+    """
+    Embed a single text string using all-MiniLM-L6-v2.
+
+    Args:
+        text: The text to embed.
+
+    Returns:
+        384-dimensional float list.
+    """
+    model = get_embedding_model()
+    embedding = model.encode(text, normalize_embeddings=True)
+    return embedding.tolist()
+
+
+def embed_texts(texts: list[str]) -> list[list[float]]:
+    """
+    Embed a list of text strings in batch (more efficient than one-by-one).
+
+    Args:
+        texts: List of strings to embed.
+
+    Returns:
+        List of 384-dimensional float lists, same order as input.
+    """
+    model = get_embedding_model()
+    embeddings = model.encode(texts, normalize_embeddings=True, batch_size=32)
+    return [e.tolist() for e in embeddings]
--- a/packages/orchestrator/orchestrator/tasks.py
+++ b/packages/orchestrator/orchestrator/tasks.py
@@ -9,6 +9,19 @@ Celery task definitions for the Konstruct Agent Orchestrator.
 # NEVER change these to `async def`. If you see a RuntimeError about "no
 # running event loop" or tasks that silently never complete, check for
 # accidental async def usage first.
+
+Memory pipeline (Phase 2):
+  Before LLM call:
+    1. get_recent_messages() — load Redis sliding window (last 20 msgs)
+    2. embed current message + retrieve_relevant() — pgvector long-term context
+    3. build_messages_with_memory() — assemble enriched messages array
+
+  After LLM response:
+    4. append_message() x2 — save user + assistant turns to Redis
+    5. embed_and_store.delay() — fire-and-forget pgvector backfill (async)
+
+  The embed_and_store Celery task runs asynchronously, meaning the LLM response
+  is never blocked waiting for embedding computation.
 """

 from __future__ import annotations
@@ -23,6 +36,88 @@ from shared.models.message import KonstructMessage
 logger = logging.getLogger(__name__)


+@app.task(
+    name="orchestrator.tasks.embed_and_store",
+    bind=False,
+    max_retries=2,
+    default_retry_delay=10,
+    ignore_result=True,  # Fire-and-forget — callers don't await the result
+)
+def embed_and_store(
+    tenant_id: str,
+    agent_id: str,
+    user_id: str,
+    messages: list[dict],
+) -> None:
+    """
+    Asynchronously embed conversation turns and store them in pgvector.
+
+    Dispatched fire-and-forget after the LLM response so embedding computation
+    NEVER blocks the user-facing response pipeline.
+
+    Args:
+        tenant_id: Tenant UUID string.
+        agent_id:  Agent UUID string.
+        user_id:   End-user identifier.
+        messages:  List of {"role", "content"} dicts to embed and store.
+                   Typically [user_message, assistant_response].
+    """
+    asyncio.run(_embed_and_store_async(tenant_id, agent_id, user_id, messages))
+
+
+async def _embed_and_store_async(
+    tenant_id: str,
+    agent_id: str,
+    user_id: str,
+    messages: list[dict],
+) -> None:
+    """
+    Async implementation of embed_and_store.
+
+    Embeds all messages in batch (more efficient than one-by-one) then stores
+    each embedding in conversation_embeddings via store_embedding().
+    """
+    from orchestrator.memory.embedder import embed_texts
+    from orchestrator.memory.long_term import store_embedding
+    from shared.db import async_session_factory, engine
+    from shared.rls import configure_rls_hook, current_tenant_id
+
+    if not messages:
+        return
+
+    tenant_uuid = uuid.UUID(tenant_id)
+    agent_uuid = uuid.UUID(agent_id)
+
+    # Embed all message texts in a single batch call
+    texts = [msg["content"] for msg in messages]
+    embeddings = embed_texts(texts)
+
+    configure_rls_hook(engine)
+    token = current_tenant_id.set(tenant_uuid)
+    try:
+        async with async_session_factory() as session:
+            for msg, embedding in zip(messages, embeddings, strict=True):
+                await store_embedding(
+                    session,
+                    tenant_uuid,
+                    agent_uuid,
+                    user_id,
+                    msg["content"],
+                    msg["role"],
+                    embedding,
+                )
+            await session.commit()
+    except Exception:
+        logger.exception(
+            "embed_and_store failed for tenant=%s agent=%s user=%s",
+            tenant_id,
+            agent_id,
+            user_id,
+        )
+    finally:
+        current_tenant_id.reset(token)
+
+
@app.task(
    name="orchestrator.tasks.handle_message",
    bind=True,
@@ -82,7 +177,16 @@ async def _process_message(
    channel_id: str = "",
 ) -> dict:
    """
-    Async agent pipeline — load agent config, build prompt, call LLM pool.
+    Async agent pipeline — load agent config, build memory-enriched prompt, call LLM pool.
+
+    Memory pipeline (Phase 2 additions):
+      BEFORE LLM call:
+        1. Load recent messages from Redis sliding window
+        2. Embed current message and retrieve semantically relevant long-term context
+        3. Build memory-enriched messages array via build_messages_with_memory()
+      AFTER LLM response:
+        4. Append user message + assistant response to Redis sliding window
+        5. Dispatch embed_and_store.delay() for async pgvector backfill

    After getting the LLM response, if Slack placeholder metadata is present,
    updates the "Thinking..." placeholder message with the real response using
@@ -99,7 +203,11 @@ async def _process_message(
    Returns:
        Dict with message_id, response, and tenant_id.
    """
+    from orchestrator.agents.builder import build_messages_with_memory
    from orchestrator.agents.runner import run_agent
+    from orchestrator.memory.embedder import embed_text
+    from orchestrator.memory.long_term import retrieve_relevant
+    from orchestrator.memory.short_term import append_message, get_recent_messages
    from shared.db import async_session_factory, engine
    from shared.models.tenant import Agent
    from shared.rls import configure_rls_hook, current_tenant_id
@@ -120,9 +228,9 @@ async def _process_message(
    token = current_tenant_id.set(tenant_uuid)

    slack_bot_token: str = ""
+    agent: Agent | None = None

    try:
-        agent: Agent | None = None
        async with async_session_factory() as session:
            from sqlalchemy import select

@@ -173,13 +281,68 @@ async def _process_message(
            "tenant_id": msg.tenant_id,
        }

-    response_text = await run_agent(msg, agent)
+    # Determine user_id for memory scoping: use sender.user_id if available,
+    # fall back to thread_id (for non-identified channel contexts like webhooks)
+    user_id: str = (
+        msg.sender.user_id
+        if msg.sender and msg.sender.user_id
+        else (msg.thread_id or msg.id)
+    )
+    agent_id_str = str(agent.id)
+    user_text: str = msg.content.text or ""
+
+    # -------------------------------------------------------------------------
+    # Memory retrieval (before LLM call)
+    # -------------------------------------------------------------------------
+    import redis.asyncio as aioredis
+
+    from shared.config import settings
+
+    redis_client = aioredis.from_url(settings.redis_url)
+    try:
+        # 1. Short-term: Redis sliding window
+        recent_messages = await get_recent_messages(
+            redis_client, msg.tenant_id, agent_id_str, user_id
+        )
+
+        # 2. Long-term: pgvector similarity search
+        relevant_context: list[str] = []
+        if user_text:
+            query_embedding = embed_text(user_text)
+            rls_token = current_tenant_id.set(tenant_uuid)
+            try:
+                async with async_session_factory() as session:
+                    relevant_context = await retrieve_relevant(
+                        session,
+                        tenant_uuid,
+                        agent.id,
+                        user_id,
+                        query_embedding,
+                    )
+            finally:
+                current_tenant_id.reset(rls_token)
+    finally:
+        await redis_client.aclose()
+
+    # -------------------------------------------------------------------------
+    # Build memory-enriched messages array and run LLM
+    # -------------------------------------------------------------------------
+    enriched_messages = build_messages_with_memory(
+        agent=agent,
+        current_message=user_text,
+        recent_messages=recent_messages,
+        relevant_context=relevant_context,
+    )
+
+    response_text = await run_agent(msg, agent, messages=enriched_messages)

    logger.info(
-        "Message %s processed by agent=%s tenant=%s",
+        "Message %s processed by agent=%s tenant=%s (short_term=%d, long_term=%d)",
        msg.id,
        agent.id,
        msg.tenant_id,
+        len(recent_messages),
+        len(relevant_context),
    )

    # Replace the "Thinking..." placeholder with the real response
@@ -191,6 +354,24 @@ async def _process_message(
            text=response_text,
        )

+    # -------------------------------------------------------------------------
+    # Memory persistence (after LLM response)
+    # -------------------------------------------------------------------------
+    redis_client2 = aioredis.from_url(settings.redis_url)
+    try:
+        # 3. Append both turns to Redis sliding window
+        await append_message(redis_client2, msg.tenant_id, agent_id_str, user_id, "user", user_text)
+        await append_message(redis_client2, msg.tenant_id, agent_id_str, user_id, "assistant", response_text)
+    finally:
+        await redis_client2.aclose()
+
+    # 4. Fire-and-forget: async pgvector backfill (never blocks LLM response)
+    messages_to_embed = [
+        {"role": "user", "content": user_text},
+        {"role": "assistant", "content": response_text},
+    ]
+    embed_and_store.delay(msg.tenant_id, agent_id_str, user_id, messages_to_embed)
+
    return {
        "message_id": msg.id,
        "response": response_text,