feat(02-01): add two-layer memory system — Redis sliding window + pgvector long-term

- ConversationEmbedding ORM model with Vector(384) column (pgvector)
- memory_short_key, escalation_status_key, pending_tool_confirm_key in redis_keys.py
- orchestrator/memory/short_term.py: RPUSH/LTRIM sliding window (get_recent_messages, append_message)
- orchestrator/memory/long_term.py: pgvector HNSW cosine search (retrieve_relevant, store_embedding)
- Migration 002: conversation_embeddings table, HNSW index, RLS with FORCE, SELECT/INSERT only
- 10 unit tests (fakeredis), 6 integration tests (pgvector) — all passing
- Auto-fix [Rule 3]: postgres image updated to pgvector/pgvector:pg16 (extension required)
This commit is contained in:
2026-03-23 14:41:57 -06:00
parent 370a860622
commit 28a5ee996e
11 changed files with 998 additions and 1 deletions

View File

@@ -19,6 +19,7 @@ dependencies = [
"httpx>=0.28.0",
"slowapi>=0.1.9",
"bcrypt>=4.0.0",
"pgvector>=0.3.0",
]
[tool.hatch.build.targets.wheel]

View File

@@ -0,0 +1,96 @@
"""
SQLAlchemy 2.0 ORM models for conversational memory.
ConversationEmbedding stores pgvector embeddings of past conversation turns
for long-term semantic retrieval across sessions. This is the persistence layer
for the long-term memory module in the Agent Orchestrator.
IMPORTANT:
- Embeddings are immutable (no UPDATE) — like audit records. We store and read
but never modify. This simplifies the data model and prevents mutation bugs.
- RLS is ENABLED with FORCE — tenant_id isolation is enforced at the DB level.
- The vector dimension (384) corresponds to all-MiniLM-L6-v2 output size.
"""
from __future__ import annotations
import uuid
from datetime import datetime
from pgvector.sqlalchemy import Vector
from sqlalchemy import DateTime, ForeignKey, Text, func
from sqlalchemy.dialects.postgresql import UUID
from sqlalchemy.orm import Mapped, mapped_column
from shared.models.tenant import Base
class ConversationEmbedding(Base):
    """
    A single embedded conversation turn stored for long-term recall.

    Each row represents one message (user or assistant) converted to a
    384-dimensional embedding via all-MiniLM-L6-v2. The Agent Orchestrator
    queries this table at prompt assembly time to inject relevant past context.

    Rows are write-once: the application stores and reads embeddings but never
    updates them (see module docstring) — treat instances as immutable.

    Scoped by:
        - tenant_id: RLS enforced isolation between tenants
        - agent_id: isolation between agents within a tenant
        - user_id: isolation between end-users of the same agent

    RLS policy enforces:
        tenant_id = current_setting('app.current_tenant', TRUE)::uuid
    FORCE ROW LEVEL SECURITY ensures even the table owner cannot bypass this.
    """
    __tablename__ = "conversation_embeddings"
    # Surrogate primary key; generated client-side so rows can be referenced
    # before the INSERT round-trips.
    id: Mapped[uuid.UUID] = mapped_column(
        UUID(as_uuid=True),
        primary_key=True,
        default=uuid.uuid4,
    )
    # Owning tenant; CASCADE delete so purging a tenant removes its memory.
    # Indexed because every query filters by tenant (in addition to RLS).
    tenant_id: Mapped[uuid.UUID] = mapped_column(
        UUID(as_uuid=True),
        ForeignKey("tenants.id", ondelete="CASCADE"),
        nullable=False,
        index=True,
    )
    # No FK here — agent rows may live in another service's schema;
    # referential integrity for agents is enforced at the application layer.
    agent_id: Mapped[uuid.UUID] = mapped_column(
        UUID(as_uuid=True),
        nullable=False,
        index=True,
    )
    # Text (not UUID): identifiers are channel-native and vary in format.
    user_id: Mapped[str] = mapped_column(
        Text,
        nullable=False,
        comment="Channel-native user identifier (e.g. Slack user ID U12345)",
    )
    # Raw text kept alongside the vector so retrieved matches can be injected
    # into prompts without a reverse-embedding lookup.
    content: Mapped[str] = mapped_column(
        Text,
        nullable=False,
        comment="Original message text that was embedded",
    )
    role: Mapped[str] = mapped_column(
        Text,
        nullable=False,
        comment="Message role: 'user' or 'assistant'",
    )
    # pgvector column; dimension must match the embedding model's output
    # (all-MiniLM-L6-v2 → 384). Searched via the HNSW cosine index created
    # in migration 002.
    embedding: Mapped[list[float]] = mapped_column(
        Vector(384),
        nullable=False,
        comment="all-MiniLM-L6-v2 embedding (384 dimensions)",
    )
    # DB-side timestamp (func.now()) so ordering is consistent across app
    # instances regardless of clock skew.
    created_at: Mapped[datetime] = mapped_column(
        DateTime(timezone=True),
        nullable=False,
        server_default=func.now(),
    )
    def __repr__(self) -> str:
        """Debug representation; includes scoping IDs but never the content/vector."""
        return (
            f"<ConversationEmbedding id={self.id} "
            f"tenant_id={self.tenant_id} agent_id={self.agent_id} "
            f"user_id={self.user_id!r} role={self.role!r}>"
        )

View File

@@ -86,3 +86,61 @@ def engaged_thread_key(tenant_id: str, thread_id: str) -> str:
Namespaced Redis key: "{tenant_id}:engaged:{thread_id}"
"""
return f"{tenant_id}:engaged:{thread_id}"
def memory_short_key(tenant_id: str, agent_id: str, user_id: str) -> str:
    """
    Build the Redis key for the short-term conversational memory window.

    The key addresses a sliding window of the last N messages (JSON-serialized)
    for one tenant + agent + user combination; the Agent Orchestrator reads it
    to inject recent history into every LLM prompt.

    All three discriminators appear in the key so that:
    - two users of the same agent keep separate histories,
    - one user talking to two agents keeps separate histories,
    - tenants with colliding agent/user IDs remain fully isolated.

    Args:
        tenant_id: Konstruct tenant identifier.
        agent_id: Agent identifier (UUID string).
        user_id: End-user identifier (channel-native, e.g. Slack user ID).

    Returns:
        Namespaced Redis key: "{tenant_id}:memory:short:{agent_id}:{user_id}"
    """
    # Assemble the colon-delimited namespace segments explicitly.
    segments = (tenant_id, "memory", "short", agent_id, user_id)
    return ":".join(segments)
def escalation_status_key(tenant_id: str, thread_id: str) -> str:
    """
    Build the Redis key tracking a thread's escalation status.

    The value at this key records whether the conversation thread has been
    escalated to a human or another agent.

    Args:
        tenant_id: Konstruct tenant identifier.
        thread_id: Thread identifier.

    Returns:
        Namespaced Redis key: "{tenant_id}:escalation:{thread_id}"
    """
    # Tenant prefix first, then the fixed namespace, then the thread.
    return ":".join((tenant_id, "escalation", thread_id))
def pending_tool_confirm_key(tenant_id: str, thread_id: str) -> str:
    """
    Build the Redis key for a thread's pending tool-confirmation request.

    The value at this key holds the tool invocation awaiting explicit user
    confirmation before execution (e.g. destructive operations).

    Args:
        tenant_id: Konstruct tenant identifier.
        thread_id: Thread identifier.

    Returns:
        Namespaced Redis key: "{tenant_id}:tool_confirm:{thread_id}"
    """
    # Same tenant-prefixed namespacing convention as the other key builders.
    return ":".join((tenant_id, "tool_confirm", thread_id))