Files
konstruct/packages/shared/shared/models/kb.py
Adolfo Delorenzo e8d3e8a108 feat(10-01): KB ingestion pipeline - migration, extractors, API router
- Migration 014: add status/error_message/chunk_count to kb_documents, make agent_id nullable
- Add GOOGLE_CALENDAR to ChannelTypeEnum in tenant.py
- Add brave_api_key, firecrawl_api_key, google_client_id/secret, minio_kb_bucket to config
- Add text extractors for PDF, DOCX, PPTX, XLSX/XLS, CSV, TXT, MD
- Add KB management API router with upload, list, delete, URL ingest, reindex endpoints
- Install pypdf, python-docx, python-pptx, openpyxl, pandas, firecrawl-py, youtube-transcript-api
- Update .env.example with new env vars
- Unit tests: test_extractors.py (10 tests) and test_kb_upload.py (7 tests) all pass
2026-03-26 09:05:29 -06:00

156 lines
4.7 KiB
Python

"""
SQLAlchemy 2.0 ORM models for the Knowledge Base tables.
Tables:
kb_documents — uploaded documents belonging to a tenant/agent
kb_chunks — text chunks with vector embeddings for semantic search
The embedding column uses pgvector's vector(384) type, matching the
all-MiniLM-L6-v2 model used for embeddings (same as conversation_embeddings).
RLS is applied to both tables — each tenant's KB is completely isolated.
"""
from __future__ import annotations
import uuid
from datetime import datetime
from sqlalchemy import DateTime, ForeignKey, Integer, Text, func
from sqlalchemy.dialects.postgresql import UUID
from sqlalchemy.orm import DeclarativeBase, Mapped, mapped_column, relationship
# Valid status values for KnowledgeBaseDocument.status
# (KB_STATUS_PROCESSING is also the server-side default for that column).
KB_STATUS_PROCESSING = "processing"  # document ingestion still in progress
KB_STATUS_READY = "ready"  # ingestion finished; chunk_count records the result
KB_STATUS_ERROR = "error"  # ingestion failed; details land in error_message
class KBBase(DeclarativeBase):
    """Dedicated declarative base for the knowledge-base models.

    Keeping a separate base gives kb_documents/kb_chunks their own
    metadata registry, independent of any other model hierarchy.
    """
class KnowledgeBaseDocument(KBBase):
    """A single document in a tenant's knowledge base.

    Each document is split into KBChunk rows, which hold the vector
    embeddings used for semantic search. Row-level security guarantees
    that every tenant sees only its own documents.
    """

    __tablename__ = "kb_documents"

    # Surrogate primary key, generated server-side by Postgres.
    id: Mapped[uuid.UUID] = mapped_column(
        UUID(as_uuid=True),
        server_default=func.gen_random_uuid(),
        primary_key=True,
    )

    # Owning tenant; indexed because every query is tenant-scoped (RLS).
    tenant_id: Mapped[uuid.UUID] = mapped_column(
        UUID(as_uuid=True),
        index=True,
        nullable=False,
    )

    # Optional agent association — the KB itself is per-tenant.
    agent_id: Mapped[uuid.UUID | None] = mapped_column(
        UUID(as_uuid=True),
        index=True,
        nullable=True,
        comment="Agent this document is associated with (nullable — KB is per-tenant)",
    )

    # Ingestion lifecycle; defaults to "processing" at the database level.
    status: Mapped[str] = mapped_column(
        Text,
        server_default=KB_STATUS_PROCESSING,
        nullable=False,
        comment="Ingestion status: processing | ready | error",
    )

    # Populated only when ingestion fails.
    error_message: Mapped[str | None] = mapped_column(
        Text,
        nullable=True,
        comment="Error details when status='error'",
    )

    # Set once ingestion succeeds.
    chunk_count: Mapped[int | None] = mapped_column(
        Integer,
        nullable=True,
        comment="Number of chunks created after successful ingestion",
    )

    # Provenance: exactly one of filename / source_url is typically set,
    # depending on whether the document was uploaded or crawled.
    filename: Mapped[str | None] = mapped_column(
        Text,
        nullable=True,
        comment="Original filename if uploaded as a file",
    )
    source_url: Mapped[str | None] = mapped_column(
        Text,
        nullable=True,
        comment="Source URL if ingested from the web",
    )
    content_type: Mapped[str | None] = mapped_column(
        Text,
        nullable=True,
        comment="MIME type: text/plain, application/pdf, etc.",
    )

    # Timestamp assigned by the database at insert time.
    created_at: Mapped[datetime] = mapped_column(
        DateTime(timezone=True),
        server_default=func.now(),
        nullable=False,
    )

    # Child chunks; deleting a document removes its chunks as well.
    chunks: Mapped[list[KBChunk]] = relationship(
        "KBChunk",
        back_populates="document",
        cascade="all, delete-orphan",
    )

    def __repr__(self) -> str:
        return f"<KnowledgeBaseDocument id={self.id} tenant={self.tenant_id}>"
class KBChunk(KBBase):
    """One text chunk of a knowledge-base document, with its embedding.

    The embedding column is vector(384), matching the output dimension of
    the all-MiniLM-L6-v2 model; the migration's HNSW index provides fast
    cosine-similarity search. Row-level security keeps each tenant's
    chunks invisible to every other tenant.
    """

    __tablename__ = "kb_chunks"

    # Surrogate primary key, generated server-side by Postgres.
    id: Mapped[uuid.UUID] = mapped_column(
        UUID(as_uuid=True),
        server_default=func.gen_random_uuid(),
        primary_key=True,
    )

    # Denormalized tenant id so RLS can filter chunks directly.
    tenant_id: Mapped[uuid.UUID] = mapped_column(
        UUID(as_uuid=True),
        index=True,
        nullable=False,
    )

    # Parent document; DB-level CASCADE mirrors the ORM delete-orphan cascade.
    document_id: Mapped[uuid.UUID] = mapped_column(
        UUID(as_uuid=True),
        ForeignKey("kb_documents.id", ondelete="CASCADE"),
        index=True,
        nullable=False,
    )

    content: Mapped[str] = mapped_column(
        Text,
        nullable=False,
        comment="The text content of this chunk",
    )

    # embedding is vector(384) — raw DDL in migration, not mapped here
    # because SQLAlchemy doesn't natively know the pgvector type.

    chunk_index: Mapped[int | None] = mapped_column(
        Integer,
        nullable=True,
        comment="Position of this chunk within its source document (0-indexed)",
    )

    # Timestamp assigned by the database at insert time.
    created_at: Mapped[datetime] = mapped_column(
        DateTime(timezone=True),
        server_default=func.now(),
        nullable=False,
    )

    # Back-reference to the owning document.
    document: Mapped[KnowledgeBaseDocument] = relationship(
        "KnowledgeBaseDocument",
        back_populates="chunks",
    )

    def __repr__(self) -> str:
        return f"<KBChunk id={self.id} document={self.document_id} idx={self.chunk_index}>"