feat(02-02): audit model, KB model, migration, and audit logger

- AuditEvent ORM model with tenant_id, action_type, latency_ms, metadata
- KnowledgeBaseDocument and KBChunk ORM models for vector KB
- Migration 003: audit_events (immutable via REVOKE), kb_documents, kb_chunks
  with HNSW index and RLS on all tables
- AuditLogger with log_llm_call, log_tool_call, log_escalation methods
- audit_events immutability enforced at DB level (UPDATE/DELETE rejected)
- [Rule 1 - Bug] Fixed CAST(:metadata AS jsonb) for asyncpg compatibility
This commit is contained in:
2026-03-23 14:50:51 -06:00
parent d489551130
commit 30b9f60668
5 changed files with 696 additions and 0 deletions

View File

@@ -0,0 +1 @@
"""Audit logging subsystem for the Konstruct Agent Orchestrator."""

View File

@@ -0,0 +1,224 @@
"""
Immutable audit event writer for the Konstruct Agent Orchestrator.
AuditLogger writes to the audit_events table. The table is protected at the DB
level: the konstruct_app role can only SELECT and INSERT — UPDATE and DELETE are
revoked. This ensures the audit trail is tamper-proof even if application code
contains a bug that would otherwise modify audit records.
Usage:
from shared.db import async_session_factory
from orchestrator.audit.logger import AuditLogger
audit_logger = AuditLogger(session_factory=async_session_factory)
await audit_logger.log_llm_call(
tenant_id=tenant_uuid,
agent_id=agent_uuid,
user_id="U12345",
input_summary="What is the weather?",
output_summary="It is sunny today.",
latency_ms=350,
)
"""
from __future__ import annotations
import json
import logging
import uuid
from typing import Any
from sqlalchemy import text
from sqlalchemy.ext.asyncio import AsyncSession, async_sessionmaker
from shared.rls import current_tenant_id
logger = logging.getLogger(__name__)
# Maximum length for input/output summaries stored in audit trail
_SUMMARY_MAX_LEN = 500


def _truncate(value: str, max_len: int = _SUMMARY_MAX_LEN) -> str:
    """Return *value* shortened to at most *max_len* characters.

    Strings that already fit are returned unchanged. Longer strings are cut
    and terminated with a single ellipsis character so that a reader of the
    audit trail can tell the summary is incomplete. The ellipsis counts
    towards the limit, so the result never exceeds *max_len* characters.

    Args:
        value: Text to shorten.
        max_len: Maximum length of the returned string, ellipsis included.

    Returns:
        The original string, a truncated copy ending in an ellipsis, or an
        empty string when *max_len* leaves no room for any character.
    """
    if len(value) <= max_len:
        return value
    if max_len <= 0:
        # No room for even the ellipsis marker.
        return ""
    # Written as an escape so the marker survives encoding-unaware tooling.
    return value[: max_len - 1] + "\u2026"
class AuditLogger:
    """
    Writes immutable audit events to the audit_events table.

    Every log method funnels into ``_write_event``, which sets the
    ``current_tenant_id`` ContextVar to the event's tenant for the duration
    of the write and restores the previous value afterwards. Callers
    therefore do not need to set the tenant context themselves.
    NOTE(review): the ContextVar is presumably what the session/engine hooks
    use to apply row-level security — confirm against ``shared.rls``.

    Args:
        session_factory: An async_sessionmaker configured with the application
                         engine. The logger creates a fresh session per write
                         to avoid transaction entanglement with the caller.
    """

    def __init__(self, session_factory: async_sessionmaker[AsyncSession]) -> None:
        self._session_factory = session_factory

    @staticmethod
    def _serialize_metadata(metadata: dict[str, Any]) -> str:
        """Serialize audit metadata to a JSON string for the jsonb column.

        Values that JSON cannot represent natively (UUID, datetime, ...) are
        stored as their ``str()`` form. Metadata is diagnostic context, so a
        stringified value is preferable to losing the whole audit event.
        """
        return json.dumps(metadata, default=str)

    async def _write_event(
        self,
        tenant_id: uuid.UUID,
        agent_id: uuid.UUID | None,
        user_id: str | None,
        action_type: str,
        input_summary: str | None,
        output_summary: str | None,
        latency_ms: int | None,
        metadata: dict[str, Any],
    ) -> None:
        """
        Internal: write a single audit event row.

        Uses raw INSERT to avoid SQLAlchemy ORM session tracking — audit events
        should never be accidentally queued for update/delete by the ORM.

        Args:
            tenant_id: Tenant the event belongs to; also set as the tenant
                       context while the insert runs.
            agent_id: Agent associated with the event, if any.
            user_id: End-user identifier, if any.
            action_type: Event category stored in the action_type column.
            input_summary: Short description of the input, or None.
            output_summary: Short description of the output, or None.
            latency_ms: Duration in milliseconds, or None when not applicable.
            metadata: Extra context, stored as jsonb.

        Raises:
            Exception: Any serialization or database error is logged and
                       re-raised to the caller.
        """
        tenant_token = current_tenant_id.set(tenant_id)
        try:
            params = {
                "tenant_id": str(tenant_id),
                # Explicit None check: only a missing agent maps to NULL.
                "agent_id": str(agent_id) if agent_id is not None else None,
                "user_id": user_id,
                "action_type": action_type,
                "input_summary": input_summary,
                "output_summary": output_summary,
                "latency_ms": latency_ms,
                "metadata": self._serialize_metadata(metadata),
            }
            async with self._session_factory() as session:
                await session.execute(
                    text("""
                        INSERT INTO audit_events
                            (tenant_id, agent_id, user_id, action_type,
                             input_summary, output_summary, latency_ms, metadata)
                        VALUES
                            (:tenant_id, :agent_id, :user_id, :action_type,
                             :input_summary, :output_summary, :latency_ms, CAST(:metadata AS jsonb))
                    """),
                    params,
                )
                await session.commit()
        except Exception:
            logger.exception(
                "Failed to write audit event: action=%s tenant=%s",
                action_type,
                tenant_id,
            )
            raise
        finally:
            # Always restore the previous tenant context, even on failure.
            current_tenant_id.reset(tenant_token)

    async def log_llm_call(
        self,
        tenant_id: uuid.UUID,
        agent_id: uuid.UUID | None,
        user_id: str | None,
        input_summary: str,
        output_summary: str,
        latency_ms: int,
        metadata: dict[str, Any] | None = None,
    ) -> None:
        """
        Record an LLM completion call.

        Both summaries are passed through ``_truncate`` before being stored.

        Args:
            tenant_id: Tenant this call belongs to.
            agent_id: Agent that made the LLM call.
            user_id: End-user identifier from the channel.
            input_summary: Representation of the messages sent to the LLM.
            output_summary: LLM response content.
            latency_ms: Round-trip duration in milliseconds.
            metadata: Optional dict (model name, token counts, etc.).
        """
        await self._write_event(
            tenant_id=tenant_id,
            agent_id=agent_id,
            user_id=user_id,
            action_type="llm_call",
            input_summary=_truncate(input_summary),
            output_summary=_truncate(output_summary),
            latency_ms=latency_ms,
            metadata=metadata or {},
        )

    async def log_tool_call(
        self,
        tool_name: str,
        args: dict[str, Any],
        result: str | None,
        tenant_id: uuid.UUID,
        agent_id: uuid.UUID | None,
        latency_ms: int,
        error: str | None = None,
    ) -> None:
        """
        Record a tool invocation.

        The input summary is ``tool_name(<args as JSON>)`` and the output
        summary is the error message if one is given, otherwise the result;
        both are truncated. The event is written with no user_id.

        Args:
            tool_name: Name of the invoked tool (e.g. "web_search").
            args: Arguments passed to the tool (will be JSON-serialized).
            result: Tool result string, or None if an error occurred.
            tenant_id: Tenant this invocation belongs to.
            agent_id: Agent that invoked the tool.
            latency_ms: Tool execution duration in milliseconds.
            error: Error message if the tool failed, else None.
        """
        # Summarize args as truncated JSON for the audit trail. This is
        # deliberately best-effort: arguments that cannot be serialized fall
        # back to repr() rather than preventing the event from being logged.
        try:
            args_str = json.dumps(args, ensure_ascii=False)
        except Exception:
            args_str = repr(args)

        input_summary = _truncate(f"{tool_name}({args_str})")
        output_summary = _truncate(error or result or "")

        metadata: dict[str, Any] = {"tool_name": tool_name}
        if error:
            # Keep the full, untruncated error text alongside the summary.
            metadata["error"] = error

        await self._write_event(
            tenant_id=tenant_id,
            agent_id=agent_id,
            user_id=None,
            action_type="tool_invocation",
            input_summary=input_summary,
            output_summary=output_summary,
            latency_ms=latency_ms,
            metadata=metadata,
        )

    async def log_escalation(
        self,
        tenant_id: uuid.UUID,
        agent_id: uuid.UUID | None,
        user_id: str | None,
        trigger_reason: str,
        metadata: dict[str, Any] | None = None,
    ) -> None:
        """
        Record an agent escalation (handoff to human or another agent).

        The truncated trigger reason is stored as the input summary; the
        output summary and latency are stored as NULL.

        Args:
            tenant_id: Tenant this escalation belongs to.
            agent_id: Agent that triggered the escalation.
            user_id: End-user identifier.
            trigger_reason: Human-readable description of why escalation occurred.
            metadata: Optional additional context.
        """
        await self._write_event(
            tenant_id=tenant_id,
            agent_id=agent_id,
            user_id=user_id,
            action_type="escalation",
            input_summary=_truncate(trigger_reason),
            output_summary=None,
            latency_ms=None,
            metadata=metadata or {},
        )