feat(10-01): KB ingestion pipeline - migration, extractors, API router
- Migration 014: add status/error_message/chunk_count to kb_documents, make agent_id nullable
- Add GOOGLE_CALENDAR to ChannelTypeEnum in tenant.py
- Add brave_api_key, firecrawl_api_key, google_client_id/secret, minio_kb_bucket to config
- Add text extractors for PDF, DOCX, PPTX, XLSX/XLS, CSV, TXT, MD
- Add KB management API router with upload, list, delete, URL ingest, reindex endpoints
- Install pypdf, python-docx, python-pptx, openpyxl, pandas, firecrawl-py, youtube-transcript-api
- Update .env.example with new env vars
- Unit tests: test_extractors.py (10 tests) and test_kb_upload.py (7 tests) all pass
This commit is contained in:
84
migrations/versions/014_kb_status.py
Normal file
84
migrations/versions/014_kb_status.py
Normal file
@@ -0,0 +1,84 @@
|
||||
"""KB document status columns and agent_id nullable
|
||||
|
||||
Revision ID: 014
|
||||
Revises: 013
|
||||
Create Date: 2026-03-26
|
||||
|
||||
Changes:
|
||||
- kb_documents.status TEXT NOT NULL DEFAULT 'processing' (CHECK constraint)
|
||||
- kb_documents.error_message TEXT NULL
|
||||
- kb_documents.chunk_count INTEGER NULL
|
||||
- kb_documents.agent_id DROP NOT NULL (make nullable — KB is per-tenant, not per-agent)
|
||||
|
||||
Note: google_calendar channel type was added in migration 013.
|
||||
This migration is numbered 014 and depends on 013.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import Sequence, Union
|
||||
|
||||
import sqlalchemy as sa
|
||||
from alembic import op
|
||||
|
||||
# Alembic revision identifiers: this migration is "014" and follows "013".
# It declares no branch labels and no extra dependencies.
revision: str = "014"
down_revision: Union[str, None] = "013"
branch_labels: Union[str, Sequence[str], None] = None
depends_on: Union[str, Sequence[str], None] = None
|
||||
|
||||
|
||||
def upgrade() -> None:
    """Apply revision 014 to the ``kb_documents`` table.

    Steps, in order:

    1. Add the ingestion-tracking columns ``status`` (NOT NULL, server
       default ``'processing'``), ``error_message`` (nullable) and
       ``chunk_count`` (nullable).
    2. Create the ``ck_kb_documents_status`` CHECK constraint limiting
       ``status`` to ``processing``, ``ready`` or ``error``.
    3. Drop the NOT NULL requirement on ``agent_id`` (KB is per-tenant,
       not per-agent).
    """
    table = "kb_documents"

    # Column definitions, listed in the order they are added.
    tracking_columns = (
        sa.Column(
            "status",
            sa.Text(),
            nullable=False,
            server_default="processing",
            comment="Document ingestion status: processing | ready | error",
        ),
        sa.Column(
            "error_message",
            sa.Text(),
            nullable=True,
            comment="Error details when status='error'",
        ),
        sa.Column(
            "chunk_count",
            sa.Integer(),
            nullable=True,
            comment="Number of chunks created after ingestion",
        ),
    )
    for column in tracking_columns:
        op.add_column(table, column)

    # Restrict status to the three values named in the column comment.
    op.create_check_constraint(
        "ck_kb_documents_status",
        table,
        "status IN ('processing', 'ready', 'error')",
    )

    # Documents no longer have to belong to a specific agent.
    op.alter_column(table, "agent_id", nullable=True)
|
||||
|
||||
|
||||
def downgrade() -> None:
    """Revert revision 014 on the ``kb_documents`` table.

    Restores NOT NULL on ``agent_id``, drops the ``ck_kb_documents_status``
    CHECK constraint, then removes the ``chunk_count``, ``error_message``
    and ``status`` columns (reverse of the order they were added).
    """
    table = "kb_documents"

    # NOTE(review): presumably this fails if any row has a NULL agent_id,
    # which the upgrade permits; confirm how such rows should be handled
    # before running a downgrade on real data.
    op.alter_column(table, "agent_id", nullable=False)

    # The constraint references status, so it is dropped before that column.
    op.drop_constraint("ck_kb_documents_status", table, type_="check")

    for column_name in ("chunk_count", "error_message", "status"):
        op.drop_column(table, column_name)
|
||||
Reference in New Issue
Block a user