- Migration 014: add status/error_message/chunk_count to kb_documents, make agent_id nullable
- Add GOOGLE_CALENDAR to ChannelTypeEnum in tenant.py
- Add brave_api_key, firecrawl_api_key, google_client_id/secret, minio_kb_bucket to config
- Add text extractors for PDF, DOCX, PPTX, XLSX/XLS, CSV, TXT, MD
- Add KB management API router with upload, list, delete, URL ingest, reindex endpoints
- Install pypdf, python-docx, python-pptx, openpyxl, pandas, firecrawl-py, youtube-transcript-api
- Update .env.example with new env vars
- Unit tests: test_extractors.py (10 tests) and test_kb_upload.py (7 tests) all pass
156 lines
4.7 KiB
Python
156 lines
4.7 KiB
Python
"""
|
|
SQLAlchemy 2.0 ORM models for the Knowledge Base tables.
|
|
|
|
Tables:
|
|
kb_documents — uploaded documents belonging to a tenant/agent
|
|
kb_chunks — text chunks with vector embeddings for semantic search
|
|
|
|
The embedding column uses pgvector's vector(384) type, matching the
|
|
all-MiniLM-L6-v2 model used for embeddings (same as conversation_embeddings).
|
|
|
|
RLS is applied to both tables — each tenant's KB is completely isolated.
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import uuid
|
|
from datetime import datetime
|
|
|
|
from sqlalchemy import DateTime, ForeignKey, Integer, Text, func
|
|
from sqlalchemy.dialects.postgresql import UUID
|
|
from sqlalchemy.orm import DeclarativeBase, Mapped, mapped_column, relationship
|
|
|
|
# Ingestion lifecycle states stored in KnowledgeBaseDocument.status.
# KB_STATUS_PROCESSING doubles as the column's server-side default, so a freshly
# inserted document starts in that state unless a status is given explicitly.
KB_STATUS_PROCESSING = "processing"
KB_STATUS_READY = "ready"
KB_STATUS_ERROR = "error"
|
|
|
|
|
|
class KBBase(DeclarativeBase):
    """Separate declarative base for KB models.

    Both ORM classes in this module inherit from this base, so they share a
    single SQLAlchemy registry/metadata that is distinct from any other
    declarative base used elsewhere in the project.
    """
|
|
|
|
|
|
class KnowledgeBaseDocument(KBBase):
    """
    A document uploaded to a tenant's knowledge base.

    Each document is split into KBChunk rows (see ``chunks``) that are used
    for vector search. Row-level security, configured outside this module,
    keeps every tenant's documents invisible to other tenants.

    Columns:
        id: UUID primary key, generated server-side via ``gen_random_uuid()``.
        tenant_id: Owning tenant (required, indexed).
        agent_id: Optional agent association (nullable, indexed).
        status: Ingestion state; one of the ``KB_STATUS_*`` constants, with
            ``KB_STATUS_PROCESSING`` as the server-side default.
        error_message: Failure details, expected when ``status`` is "error".
        chunk_count: Number of chunks produced by a successful ingestion.
        filename: Original filename for file uploads.
        source_url: Origin URL for web-ingested documents.
        content_type: MIME type of the source content.
        created_at: Timezone-aware creation timestamp, set by the server.
    """

    __tablename__ = "kb_documents"

    id: Mapped[uuid.UUID] = mapped_column(
        UUID(as_uuid=True),
        primary_key=True,
        server_default=func.gen_random_uuid(),
    )
    tenant_id: Mapped[uuid.UUID] = mapped_column(
        UUID(as_uuid=True),
        nullable=False,
        index=True,
    )
    agent_id: Mapped[uuid.UUID | None] = mapped_column(
        UUID(as_uuid=True),
        nullable=True,
        index=True,
        comment="Agent this document is associated with (nullable — KB is per-tenant)",
    )
    status: Mapped[str] = mapped_column(
        Text,
        nullable=False,
        server_default=KB_STATUS_PROCESSING,
        comment="Ingestion status: processing | ready | error",
    )
    error_message: Mapped[str | None] = mapped_column(
        Text,
        nullable=True,
        comment="Error details when status='error'",
    )
    chunk_count: Mapped[int | None] = mapped_column(
        Integer,
        nullable=True,
        comment="Number of chunks created after successful ingestion",
    )
    filename: Mapped[str | None] = mapped_column(
        Text,
        nullable=True,
        comment="Original filename if uploaded as a file",
    )
    source_url: Mapped[str | None] = mapped_column(
        Text,
        nullable=True,
        comment="Source URL if ingested from the web",
    )
    content_type: Mapped[str | None] = mapped_column(
        Text,
        nullable=True,
        comment="MIME type: text/plain, application/pdf, etc.",
    )
    created_at: Mapped[datetime] = mapped_column(
        DateTime(timezone=True),
        nullable=False,
        server_default=func.now(),
    )

    # One-to-many link to the document's chunks. "delete-orphan" means a chunk
    # removed from this collection is deleted rather than left parentless.
    # NOTE(review): kb_chunks.document_id declares ON DELETE CASCADE; if the
    # migration creates the constraint the same way, passive_deletes=True would
    # let the database remove chunks without loading them first — confirm
    # against the migration before changing.
    chunks: Mapped[list[KBChunk]] = relationship(
        "KBChunk",
        back_populates="document",
        cascade="all, delete-orphan",
    )

    def __repr__(self) -> str:
        """Return a short debug representation with the document and tenant ids."""
        return f"<KnowledgeBaseDocument id={self.id} tenant={self.tenant_id}>"
|
|
|
|
|
|
class KBChunk(KBBase):
    """
    A text chunk from a knowledge base document, with a vector embedding.

    The embedding itself is a vector(384) column — matching the output
    dimensions of all-MiniLM-L6-v2 — that exists only in the migration DDL and
    is deliberately not mapped on this class. The HNSW index created in the
    migration enables fast cosine similarity search.

    Row-level security, configured outside this module, keeps every tenant's
    chunks invisible to other tenants.

    Columns:
        id: UUID primary key, generated server-side via ``gen_random_uuid()``.
        tenant_id: Owning tenant (required, indexed).
        document_id: Parent ``kb_documents.id``; the foreign key is declared
            with ON DELETE CASCADE (required, indexed).
        content: The chunk's text (required).
        chunk_index: 0-based position of the chunk within its source document.
        created_at: Timezone-aware creation timestamp, set by the server.
    """

    __tablename__ = "kb_chunks"

    id: Mapped[uuid.UUID] = mapped_column(
        UUID(as_uuid=True),
        primary_key=True,
        server_default=func.gen_random_uuid(),
    )
    tenant_id: Mapped[uuid.UUID] = mapped_column(
        UUID(as_uuid=True),
        nullable=False,
        index=True,
    )
    document_id: Mapped[uuid.UUID] = mapped_column(
        UUID(as_uuid=True),
        ForeignKey("kb_documents.id", ondelete="CASCADE"),
        nullable=False,
        index=True,
    )
    content: Mapped[str] = mapped_column(
        Text,
        nullable=False,
        comment="The text content of this chunk",
    )
    # embedding is vector(384) — raw DDL in migration, not mapped here
    # because SQLAlchemy doesn't natively know the pgvector type
    chunk_index: Mapped[int | None] = mapped_column(
        Integer,
        nullable=True,
        comment="Position of this chunk within its source document (0-indexed)",
    )
    created_at: Mapped[datetime] = mapped_column(
        DateTime(timezone=True),
        nullable=False,
        server_default=func.now(),
    )

    # Many-to-one link back to the owning document; mirrors
    # KnowledgeBaseDocument.chunks via back_populates.
    document: Mapped[KnowledgeBaseDocument] = relationship(
        "KnowledgeBaseDocument",
        back_populates="chunks",
    )

    def __repr__(self) -> str:
        """Return a short debug representation with chunk id, document id and index."""
        return f"<KBChunk id={self.id} document={self.document_id} idx={self.chunk_index}>"
|