Files
konstruct/packages/shared/shared/models/kb.py
Adolfo Delorenzo e8d3e8a108 feat(10-01): KB ingestion pipeline - migration, extractors, API router
- Migration 014: add status/error_message/chunk_count to kb_documents, make agent_id nullable
- Add GOOGLE_CALENDAR to ChannelTypeEnum in tenant.py
- Add brave_api_key, firecrawl_api_key, google_client_id/secret, minio_kb_bucket to config
- Add text extractors for PDF, DOCX, PPTX, XLSX/XLS, CSV, TXT, MD
- Add KB management API router with upload, list, delete, URL ingest, reindex endpoints
- Install pypdf, python-docx, python-pptx, openpyxl, pandas, firecrawl-py, youtube-transcript-api
- Update .env.example with new env vars
- Unit tests: test_extractors.py (10 tests) and test_kb_upload.py (7 tests) all pass
2026-03-26 09:05:29 -06:00

156 lines
4.7 KiB
Python

"""
SQLAlchemy 2.0 ORM models for the Knowledge Base tables.
Tables:
kb_documents — uploaded documents belonging to a tenant/agent
kb_chunks — text chunks with vector embeddings for semantic search
The embedding column uses pgvector's vector(384) type, matching the
all-MiniLM-L6-v2 model used for embeddings (same as conversation_embeddings).
RLS is applied to both tables — each tenant's KB is completely isolated.
"""
from __future__ import annotations
import uuid
from datetime import datetime
from sqlalchemy import DateTime, ForeignKey, Integer, Text, func
from sqlalchemy.dialects.postgresql import UUID
from sqlalchemy.orm import DeclarativeBase, Mapped, mapped_column, relationship
# Valid status values for KnowledgeBaseDocument.status
# (KB_STATUS_PROCESSING is also the server-side default for that column).
KB_STATUS_PROCESSING = "processing"  # document ingestion still in progress
KB_STATUS_READY = "ready"  # ingestion finished; chunk_count records the result
KB_STATUS_ERROR = "error"  # ingestion failed; details land in error_message
class KBBase(DeclarativeBase):
    """Dedicated declarative base for the knowledge-base models.

    Keeping a separate base gives kb_documents/kb_chunks their own
    metadata registry, independent of any other model hierarchy.
    """
class KnowledgeBaseDocument(KBBase):
    """A single document in a tenant's knowledge base.

    Each document is split into KBChunk rows, which hold the vector
    embeddings used for semantic search. Row-level security guarantees
    that every tenant sees only its own documents.
    """

    __tablename__ = "kb_documents"

    # Surrogate primary key, generated server-side by Postgres.
    id: Mapped[uuid.UUID] = mapped_column(
        UUID(as_uuid=True),
        server_default=func.gen_random_uuid(),
        primary_key=True,
    )

    # Owning tenant; indexed because every query is tenant-scoped (RLS).
    tenant_id: Mapped[uuid.UUID] = mapped_column(
        UUID(as_uuid=True),
        index=True,
        nullable=False,
    )

    # Optional agent association — the KB itself is per-tenant.
    agent_id: Mapped[uuid.UUID | None] = mapped_column(
        UUID(as_uuid=True),
        index=True,
        nullable=True,
        comment="Agent this document is associated with (nullable — KB is per-tenant)",
    )

    # Ingestion lifecycle; defaults to "processing" at the database level.
    status: Mapped[str] = mapped_column(
        Text,
        server_default=KB_STATUS_PROCESSING,
        nullable=False,
        comment="Ingestion status: processing | ready | error",
    )

    # Populated only when ingestion fails.
    error_message: Mapped[str | None] = mapped_column(
        Text,
        nullable=True,
        comment="Error details when status='error'",
    )

    # Set once ingestion succeeds.
    chunk_count: Mapped[int | None] = mapped_column(
        Integer,
        nullable=True,
        comment="Number of chunks created after successful ingestion",
    )

    # Provenance: exactly one of filename / source_url is typically set,
    # depending on whether the document was uploaded or crawled.
    filename: Mapped[str | None] = mapped_column(
        Text,
        nullable=True,
        comment="Original filename if uploaded as a file",
    )
    source_url: Mapped[str | None] = mapped_column(
        Text,
        nullable=True,
        comment="Source URL if ingested from the web",
    )
    content_type: Mapped[str | None] = mapped_column(
        Text,
        nullable=True,
        comment="MIME type: text/plain, application/pdf, etc.",
    )

    # Timestamp assigned by the database at insert time.
    created_at: Mapped[datetime] = mapped_column(
        DateTime(timezone=True),
        server_default=func.now(),
        nullable=False,
    )

    # Child chunks; deleting a document removes its chunks as well.
    chunks: Mapped[list[KBChunk]] = relationship(
        "KBChunk",
        back_populates="document",
        cascade="all, delete-orphan",
    )

    def __repr__(self) -> str:
        return f"<KnowledgeBaseDocument id={self.id} tenant={self.tenant_id}>"
class KBChunk(KBBase):
    """One text chunk of a knowledge-base document, with its embedding.

    The embedding column is vector(384), matching the output dimension of
    the all-MiniLM-L6-v2 model; the migration's HNSW index provides fast
    cosine-similarity search. Row-level security keeps each tenant's
    chunks invisible to every other tenant.
    """

    __tablename__ = "kb_chunks"

    # Surrogate primary key, generated server-side by Postgres.
    id: Mapped[uuid.UUID] = mapped_column(
        UUID(as_uuid=True),
        server_default=func.gen_random_uuid(),
        primary_key=True,
    )

    # Denormalized tenant id so RLS can filter chunks directly.
    tenant_id: Mapped[uuid.UUID] = mapped_column(
        UUID(as_uuid=True),
        index=True,
        nullable=False,
    )

    # Parent document; DB-level CASCADE mirrors the ORM delete-orphan cascade.
    document_id: Mapped[uuid.UUID] = mapped_column(
        UUID(as_uuid=True),
        ForeignKey("kb_documents.id", ondelete="CASCADE"),
        index=True,
        nullable=False,
    )

    content: Mapped[str] = mapped_column(
        Text,
        nullable=False,
        comment="The text content of this chunk",
    )

    # embedding is vector(384) — raw DDL in migration, not mapped here
    # because SQLAlchemy doesn't natively know the pgvector type.

    chunk_index: Mapped[int | None] = mapped_column(
        Integer,
        nullable=True,
        comment="Position of this chunk within its source document (0-indexed)",
    )

    # Timestamp assigned by the database at insert time.
    created_at: Mapped[datetime] = mapped_column(
        DateTime(timezone=True),
        server_default=func.now(),
        nullable=False,
    )

    # Back-reference to the owning document.
    document: Mapped[KnowledgeBaseDocument] = relationship(
        "KnowledgeBaseDocument",
        back_populates="chunks",
    )

    def __repr__(self) -> str:
        return f"<KBChunk id={self.id} document={self.document_id} idx={self.chunk_index}>"