feat(10-01): KB ingestion pipeline - migration, extractors, API router

- Migration 014: add status/error_message/chunk_count to kb_documents, make agent_id nullable
- Add GOOGLE_CALENDAR to ChannelTypeEnum in tenant.py
- Add brave_api_key, firecrawl_api_key, google_client_id/secret, minio_kb_bucket to config
- Add text extractors for PDF, DOCX, PPTX, XLSX/XLS, CSV, TXT, MD
- Add KB management API router with upload, list, delete, URL ingest, reindex endpoints
- Install pypdf, python-docx, python-pptx, openpyxl, pandas, firecrawl-py, youtube-transcript-api
- Update .env.example with new env vars
- Unit tests: test_extractors.py (10 tests) and test_kb_upload.py (7 tests) all pass
This commit is contained in:
2026-03-26 09:05:29 -06:00
parent eae4b0324d
commit e8d3e8a108
11 changed files with 1745 additions and 28 deletions

View File

@@ -0,0 +1,84 @@
"""KB document status columns and agent_id nullable
Revision ID: 014
Revises: 013
Create Date: 2026-03-26
Changes:
- kb_documents.status TEXT NOT NULL DEFAULT 'processing' (CHECK constraint: status IN 'processing' | 'ready' | 'error')
- kb_documents.error_message TEXT NULL
- kb_documents.chunk_count INTEGER NULL
- kb_documents.agent_id DROP NOT NULL (make nullable — KB is per-tenant, not per-agent)
Note: google_calendar channel type was added in migration 013.
This migration is numbered 014 and depends on 013.
"""
from __future__ import annotations
from typing import Sequence, Union
import sqlalchemy as sa
from alembic import op
# Alembic revision identifiers. Alembic reads these module attributes to place
# this script in the migration chain. The annotations are never evaluated at
# runtime because this module enables `from __future__ import annotations`.
revision: str = "014"  # identifier of this migration
down_revision: str | None = "013"  # parent migration this one is applied after
branch_labels: str | Sequence[str] | None = None
depends_on: str | Sequence[str] | None = None
def upgrade() -> None:
    """Apply revision 014 to the ``kb_documents`` table.

    Steps, in order:

    1. Add the ``status``, ``error_message`` and ``chunk_count`` columns.
    2. Add a CHECK constraint restricting ``status`` to
       ``'processing'``, ``'ready'`` or ``'error'``.
    3. Drop the NOT NULL requirement on ``agent_id``.
    """
    table_name = "kb_documents"

    # Column definitions are built up front and added in declaration order.
    # NOTE(review): because ``status`` is NOT NULL with a server default, rows
    # that already exist will receive 'processing' — confirm that is the
    # intended status for documents ingested before this migration.
    new_columns = (
        sa.Column(
            "status",
            sa.Text(),
            nullable=False,
            server_default="processing",
            comment="Document ingestion status: processing | ready | error",
        ),
        sa.Column(
            "error_message",
            sa.Text(),
            nullable=True,
            comment="Error details when status='error'",
        ),
        sa.Column(
            "chunk_count",
            sa.Integer(),
            nullable=True,
            comment="Number of chunks created after ingestion",
        ),
    )
    for column in new_columns:
        op.add_column(table_name, column)

    # Restrict status to the three known ingestion states.
    op.create_check_constraint(
        "ck_kb_documents_status",
        table_name,
        "status IN ('processing', 'ready', 'error')",
    )

    # KB documents are per-tenant rather than per-agent, so agent_id becomes
    # optional.
    op.alter_column(table_name, "agent_id", nullable=True)
def downgrade() -> None:
    """Revert revision 014 on the ``kb_documents`` table.

    Restores NOT NULL on ``agent_id``, removes the status CHECK constraint,
    then drops the ``chunk_count``, ``error_message`` and ``status`` columns.
    """
    table_name = "kb_documents"

    # NOTE(review): re-imposing NOT NULL is expected to fail if any row was
    # stored with agent_id NULL while this revision was applied — such rows
    # would need to be back-filled or removed first; confirm the intended
    # handling.
    op.alter_column(table_name, "agent_id", nullable=False)

    # The constraint references ``status``, so it is removed before the column.
    op.drop_constraint("ck_kb_documents_status", table_name, type_="check")

    # Columns are dropped in the reverse of the order they were added.
    for column_name in ("chunk_count", "error_message", "status"):
        op.drop_column(table_name, column_name)