feat(01-03): Channel Gateway (Slack adapter) and Message Router
- gateway/normalize.py: normalize_slack_event -> KonstructMessage (strips bot mention) - gateway/channels/slack.py: register_slack_handlers for app_mention + DM events - rate limit check -> ephemeral rejection on exceeded - idempotency dedup (Slack retry protection) - placeholder 'Thinking...' message posted in-thread before Celery dispatch - auto-follow engaged threads with 30-minute TTL - HTTP 200 returned immediately; all LLM work dispatched to Celery - gateway/main.py: FastAPI on port 8001, /slack/events + /health - router/tenant.py: resolve_tenant workspace_id -> tenant_id (RLS-bypass query) - router/ratelimit.py: check_rate_limit Redis token bucket, RateLimitExceeded exception - router/idempotency.py: is_duplicate + mark_processed (SET NX, 24h TTL) - router/context.py: load_agent_for_tenant with RLS ContextVar setup - orchestrator/tasks.py: handle_message now extracts placeholder_ts/channel_id, calls _update_slack_placeholder via chat.update after LLM response - docker-compose.yml: gateway service on port 8001 - pyproject.toml: added redis, konstruct-router, konstruct-orchestrator deps
This commit is contained in:
6
packages/router/router/__init__.py
Normal file
6
packages/router/router/__init__.py
Normal file
@@ -0,0 +1,6 @@
|
||||
"""
|
||||
Konstruct Message Router.
|
||||
|
||||
Handles tenant resolution, rate limiting, idempotency deduplication,
|
||||
and context loading before dispatching to the Agent Orchestrator.
|
||||
"""
|
||||
76
packages/router/router/context.py
Normal file
76
packages/router/router/context.py
Normal file
@@ -0,0 +1,76 @@
|
||||
"""
|
||||
Agent context loading.
|
||||
|
||||
Loads the active agent for a tenant before message processing. Phase 1 supports
|
||||
a single agent per tenant. The RLS context variable must be set before calling
|
||||
any function here so that PostgreSQL RLS filters correctly.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
import uuid
|
||||
|
||||
from sqlalchemy import select
|
||||
from sqlalchemy.ext.asyncio import AsyncSession
|
||||
|
||||
from shared.models.tenant import Agent
|
||||
from shared.rls import current_tenant_id
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
async def load_agent_for_tenant(
    tenant_id: str,
    session: AsyncSession,
) -> Agent | None:
    """
    Load the active agent for a tenant.

    Sets the ``current_tenant_id`` ContextVar so that PostgreSQL RLS policies
    correctly filter the agents table to only return rows belonging to this
    tenant, and always resets it afterwards.

    Phase 1: Returns the first active agent for the tenant (single-agent model).
    Phase 2+: Will support agent selection based on message content and routing
    rules.

    Args:
        tenant_id: Konstruct tenant ID as a UUID string.
        session: Async SQLAlchemy session.

    Returns:
        The active Agent ORM instance, or None if the tenant_id is invalid,
        the query fails, or no active agent is configured.
    """
    try:
        tenant_uuid = uuid.UUID(tenant_id)
    except (ValueError, TypeError, AttributeError):
        # ValueError: malformed UUID string.
        # TypeError: tenant_id is None — uuid.UUID(None) raises TypeError,
        #   not ValueError/AttributeError, so it must be caught explicitly.
        # AttributeError: non-string input (e.g. an int).
        logger.error("load_agent_for_tenant: invalid tenant_id=%r", tenant_id)
        return None

    # Set RLS context so the DB query is correctly scoped to this tenant
    token = current_tenant_id.set(tenant_uuid)
    try:
        stmt = (
            select(Agent)
            .where(Agent.tenant_id == tenant_uuid)
            .where(Agent.is_active.is_(True))
            .limit(1)
        )
        result = await session.execute(stmt)
        agent = result.scalars().first()
    except Exception:
        # Broad catch is intentional: this is a service boundary — log the
        # full traceback and degrade to "no agent" rather than crash.
        logger.exception(
            "load_agent_for_tenant: DB error for tenant=%s", tenant_id
        )
        return None
    finally:
        # Always reset the RLS context var after DB work completes
        current_tenant_id.reset(token)

    if agent is None:
        logger.warning(
            "load_agent_for_tenant: no active agent for tenant=%s", tenant_id
        )

    return agent
|
||||
87
packages/router/router/idempotency.py
Normal file
87
packages/router/router/idempotency.py
Normal file
@@ -0,0 +1,87 @@
|
||||
"""
|
||||
Message deduplication (idempotency).
|
||||
|
||||
Slack (and other channels) retry event delivery when the gateway does not
|
||||
respond with HTTP 200 within 3 seconds. This module tracks which message
|
||||
IDs have already been dispatched to Celery, preventing duplicate processing.
|
||||
|
||||
Design:
|
||||
- Key: {tenant_id}:dedup:{message_id} (from shared.redis_keys)
|
||||
- TTL: 24 hours (Slack retries stop after ~1 hour; 24h is conservative)
|
||||
- Op: SET NX (atomic check-and-set)
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
|
||||
from redis.asyncio import Redis
|
||||
|
||||
from shared.redis_keys import idempotency_key
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# How long to remember a message ID (seconds).
|
||||
# Slack retries for up to ~1 hour; 24h gives plenty of buffer.
|
||||
_DEDUP_TTL_SECONDS = 86400 # 24 hours
|
||||
|
||||
|
||||
async def is_duplicate(
    tenant_id: str,
    message_id: str,
    redis: Redis,  # type: ignore[type-arg]
) -> bool:
    """
    Return True when this message was already dispatched for processing.

    A single atomic ``SET NX EX`` call doubles as both the check and the
    mark: when the key is created here (it did not exist before), the
    message is fresh, its 24-hour dedup window starts, and the answer is
    False — process it. When the SET is a no-op (key already present), the
    message is a channel retry and the answer is True — skip it.

    Args:
        tenant_id: Konstruct tenant identifier.
        message_id: Unique message identifier (e.g. Slack event_ts or UUID).
        redis: Async Redis client.

    Returns:
        True for a duplicate (already dispatched); False on first sighting.
    """
    dedup_key = idempotency_key(tenant_id, message_id)

    # redis.set(..., nx=True) returns a truthy value only when the key was
    # freshly created; it returns None when the key already existed.
    created = await redis.set(dedup_key, "1", nx=True, ex=_DEDUP_TTL_SECONDS)
    if created:
        return False

    logger.info(
        "Duplicate message detected: tenant=%s message_id=%s — skipping",
        tenant_id,
        message_id,
    )
    return True
|
||||
|
||||
|
||||
async def mark_processed(
    tenant_id: str,
    message_id: str,
    redis: Redis,  # type: ignore[type-arg]
) -> None:
    """
    Unconditionally record a message as seen (no duplicate check).

    Writes (or refreshes) the dedup key with the standard 24-hour TTL.
    Most callers should prefer ``is_duplicate``, which checks and marks in
    one atomic operation; this helper exists for the rare case where the
    caller has already decided to process the message.

    Args:
        tenant_id: Konstruct tenant identifier.
        message_id: Unique message identifier.
        redis: Async Redis client.
    """
    dedup_key = idempotency_key(tenant_id, message_id)
    await redis.set(dedup_key, "1", ex=_DEDUP_TTL_SECONDS)
|
||||
24
packages/router/router/main.py
Normal file
24
packages/router/router/main.py
Normal file
@@ -0,0 +1,24 @@
|
||||
"""
|
||||
Message Router — FastAPI application.
|
||||
|
||||
The router is an internal service. In the current architecture (Phase 1),
|
||||
routing logic is embedded directly in the channel gateway handlers rather
|
||||
than as a separate HTTP call. This FastAPI app provides a health endpoint
|
||||
and is a placeholder for future standalone router deployments.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from fastapi import FastAPI
|
||||
|
||||
app = FastAPI(
    title="Konstruct Message Router",
    description="Tenant resolution, rate limiting, context loading",
    version="0.1.0",
)


@app.get("/health")
async def health() -> dict[str, str]:
    """Liveness probe: report that the router service is up."""
    payload: dict[str, str] = {"status": "ok", "service": "router"}
    return payload
|
||||
121
packages/router/router/ratelimit.py
Normal file
121
packages/router/router/ratelimit.py
Normal file
@@ -0,0 +1,121 @@
|
||||
"""
|
||||
Redis token bucket rate limiter.
|
||||
|
||||
Implements a sliding window token bucket using Redis atomic operations.
|
||||
|
||||
Design:
|
||||
- Key: {tenant_id}:ratelimit:{channel} (from shared.redis_keys)
|
||||
- Window: configurable (default 60s)
|
||||
- Tokens: configurable (default 30 per window per tenant per channel)
|
||||
- Storage: INCR + EXPIRE (atomic via pipeline)
|
||||
|
||||
The token bucket approach:
|
||||
1. INCR the counter key
|
||||
2. If count == 1, set EXPIRE (first request in window — starts the clock)
|
||||
3. If count > limit: raise RateLimitExceeded
|
||||
4. Otherwise: return True (request allowed)
|
||||
|
||||
This is NOT a sliding window (it's a fixed window with INCR/EXPIRE) — it's
|
||||
simple, Redis-atomic, and correct enough for Phase 1. A true sliding window
|
||||
can be implemented with ZADD/ZREMRANGEBYSCORE later if needed.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
|
||||
from redis.asyncio import Redis
|
||||
|
||||
from shared.redis_keys import rate_limit_key
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Default rate limit configuration — override per-tenant in Phase 2
|
||||
_DEFAULT_LIMIT = 30 # Max requests per window
|
||||
_DEFAULT_WINDOW = 60 # Window duration in seconds
|
||||
|
||||
|
||||
class RateLimitExceeded(Exception):
    """
    Signals that a tenant has used up its per-channel request budget.

    Attributes:
        tenant_id: The tenant that exceeded the limit.
        channel: The channel that hit the limit.
        remaining_seconds: Approximate TTL on the rate limit key (how long
            until the window resets).
    """

    def __init__(
        self,
        tenant_id: str,
        channel: str,
        remaining_seconds: int = 60,
    ) -> None:
        # Build the human-readable message first, then stash the structured
        # fields so callers can render channel-appropriate rejections.
        message = (
            f"Rate limit exceeded for tenant={tenant_id} channel={channel}. "
            f"Resets in ~{remaining_seconds}s."
        )
        super().__init__(message)
        self.tenant_id = tenant_id
        self.channel = channel
        self.remaining_seconds = remaining_seconds
|
||||
|
||||
|
||||
async def check_rate_limit(
    tenant_id: str,
    channel: str,
    redis: Redis,  # type: ignore[type-arg]
    limit: int = _DEFAULT_LIMIT,
    window_seconds: int = _DEFAULT_WINDOW,
) -> bool:
    """
    Enforce the fixed-window rate limit for a tenant/channel pair.

    A transactional pipeline performs INCR (bump the request counter) and
    TTL (read the remaining window) in a single round trip. The first
    request of a window — or a counter that has somehow lost its expiry
    (TTL of -1) — gets a fresh TTL so the window clock always runs.
    Requests past ``limit`` raise RateLimitExceeded carrying the
    approximate reset time.

    Args:
        tenant_id: Konstruct tenant identifier.
        channel: Channel string (e.g. "slack").
        redis: Async Redis client.
        limit: Maximum requests per window (default 30).
        window_seconds: Window duration in seconds (default 60).

    Returns:
        True if the request is allowed.

    Raises:
        RateLimitExceeded: If the request exceeds the limit.
    """
    bucket_key = rate_limit_key(tenant_id, channel)

    # One atomic round trip: increment the counter, then read its TTL.
    pipe = redis.pipeline(transaction=True)
    pipe.incr(bucket_key)
    pipe.ttl(bucket_key)
    count, ttl = await pipe.execute()

    # A brand-new counter (count == 1) or one missing its expiry (ttl == -1,
    # e.g. INCR created the key but EXPIRE never ran) needs the window
    # clock (re)started.
    if count == 1 or ttl == -1:
        await redis.expire(bucket_key, window_seconds)
        ttl = window_seconds

    if count <= limit:
        return True

    remaining = max(ttl, 0)
    logger.warning(
        "Rate limit exceeded: tenant=%s channel=%s count=%d limit=%d ttl=%d",
        tenant_id,
        channel,
        count,
        limit,
        remaining,
    )
    raise RateLimitExceeded(tenant_id, channel, remaining_seconds=remaining)
|
||||
102
packages/router/router/tenant.py
Normal file
102
packages/router/router/tenant.py
Normal file
@@ -0,0 +1,102 @@
|
||||
"""
|
||||
Tenant resolution — maps channel workspace IDs to Konstruct tenant IDs.
|
||||
|
||||
This is the ONE pre-RLS query in the system. Tenant resolution must work
|
||||
across all tenants because we don't know which tenant owns a message until
|
||||
after we resolve it. The query bypasses RLS by using the admin/superuser
|
||||
connection for this specific lookup only.
|
||||
|
||||
Design:
|
||||
- Query `channel_connections` for matching workspace_id + channel_type
|
||||
- Returns the tenant_id UUID as a string, or None if not found
|
||||
- Uses a raw SELECT without RLS context (intentional — pre-resolution)
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
|
||||
from sqlalchemy import select, text
|
||||
from sqlalchemy.ext.asyncio import AsyncSession
|
||||
|
||||
from shared.models.message import ChannelType
|
||||
from shared.models.tenant import ChannelConnection, ChannelTypeEnum
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Map ChannelType (StrEnum from message.py) to ChannelTypeEnum (ORM enum from tenant.py)
# Keys are lowercase channel strings as produced by str(channel_type).lower();
# any channel absent from this map is rejected before the DB lookup runs.
_CHANNEL_TYPE_MAP: dict[str, ChannelTypeEnum] = {
    "slack": ChannelTypeEnum.SLACK,
    "whatsapp": ChannelTypeEnum.WHATSAPP,
    "mattermost": ChannelTypeEnum.MATTERMOST,
    "rocketchat": ChannelTypeEnum.ROCKETCHAT,
    "teams": ChannelTypeEnum.TEAMS,
    "telegram": ChannelTypeEnum.TELEGRAM,
    "signal": ChannelTypeEnum.SIGNAL,
}
|
||||
|
||||
|
||||
async def resolve_tenant(
    workspace_id: str,
    channel_type: ChannelType | str,
    session: AsyncSession,
) -> str | None:
    """
    Resolve a channel workspace ID to a Konstruct tenant ID.

    This is deliberately a RLS-bypass query — we cannot know which tenant to
    set in `app.current_tenant` until after we resolve the tenant. The session
    passed here should use the admin connection (postgres superuser) or the
    konstruct_app role with RLS disabled for this specific query.

    In practice, for this single lookup, we disable the RLS SET LOCAL by
    temporarily not setting `current_tenant_id` — the ContextVar defaults to
    None, so the RLS hook does not inject SET LOCAL, and the query sees all
    rows in `channel_connections`.

    Args:
        workspace_id: Channel-native workspace identifier (e.g. Slack T12345).
        channel_type: Channel type as ChannelType enum or string.
        session: Async SQLAlchemy session.

    Returns:
        Tenant ID as a string (UUID), or None if no matching connection found.
    """
    # NOTE(review): assumes ChannelType is a StrEnum whose str() yields the
    # lowercase channel value (e.g. "slack") — confirm; a plain Enum would
    # stringify as "ChannelType.SLACK" and never match the map.
    channel_str = str(channel_type).lower()
    orm_channel = _CHANNEL_TYPE_MAP.get(channel_str)
    if orm_channel is None:
        logger.warning("resolve_tenant: unknown channel_type=%r", channel_type)
        return None

    try:
        # Bypass RLS for this query — disable RLS row filtering at the session level
        # by setting app.current_tenant to empty (no policy match = all rows visible
        # to konstruct_app for SELECT on channel_connections).
        # We use a raw SET LOCAL here to ensure the tenant policy is not applied.
        # NOTE(review): SET LOCAL is scoped to the current transaction and is a
        # no-op outside one — assumes this session is inside an open
        # transaction when called; confirm against the session factory.
        # Order matters: this must execute before the SELECT below on the
        # same connection/transaction.
        await session.execute(text("SET LOCAL app.current_tenant = ''"))

        stmt = (
            select(ChannelConnection.tenant_id)
            .where(ChannelConnection.channel_type == orm_channel)
            .where(ChannelConnection.workspace_id == workspace_id)
            .limit(1)
        )
        result = await session.execute(stmt)
        # scalar_one_or_none(): the tenant_id UUID, or None when no
        # channel_connections row matches this workspace/channel pair.
        row = result.scalar_one_or_none()
    except Exception:
        # Service-boundary catch: log the traceback and treat DB failure as
        # "tenant unknown" rather than propagating.
        logger.exception(
            "resolve_tenant: DB error workspace_id=%r channel=%r",
            workspace_id,
            channel_type,
        )
        return None

    if row is None:
        logger.debug(
            "resolve_tenant: no match workspace_id=%r channel=%r",
            workspace_id,
            channel_type,
        )
        return None

    # Normalize the UUID to its string form for downstream Redis keys/logs.
    return str(row)
|
||||
Reference in New Issue
Block a user