feat(10-01): KB ingestion pipeline - migration, extractors, API router

- Migration 014: add status/error_message/chunk_count to kb_documents, make agent_id nullable
- Add GOOGLE_CALENDAR to ChannelTypeEnum in tenant.py
- Add brave_api_key, firecrawl_api_key, google_client_id/secret, minio_kb_bucket to config
- Add text extractors for PDF, DOCX, PPTX, XLSX/XLS, CSV, TXT, MD
- Add KB management API router with upload, list, delete, URL ingest, reindex endpoints
- Install pypdf, python-docx, python-pptx, openpyxl, pandas, firecrawl-py, youtube-transcript-api
- Update .env.example with new env vars
- Unit tests: test_extractors.py (10 tests) and test_kb_upload.py (7 tests) all pass
2026-03-26 09:05:29 -06:00
parent eae4b0324d
commit e8d3e8a108
11 changed files with 1745 additions and 28 deletions
--- a/migrations/versions/014_kb_status.py
+++ b/migrations/versions/014_kb_status.py
@@ -0,0 +1,84 @@
+"""KB document status columns and agent_id nullable
+
+Revision ID: 014
+Revises: 013
+Create Date: 2026-03-26
+
+Changes:
+  - kb_documents.status TEXT NOT NULL DEFAULT 'processing' (CHECK constraint)
+  - kb_documents.error_message TEXT NULL
+  - kb_documents.chunk_count INTEGER NULL
+  - kb_documents.agent_id DROP NOT NULL (make nullable — KB is per-tenant, not per-agent)
+
+Note: google_calendar channel type was added in migration 013.
+      This migration is numbered 014 and depends on 013.
+"""
+
+from __future__ import annotations
+
+from typing import Sequence, Union
+
+import sqlalchemy as sa
+from alembic import op
+
+# Alembic revision identifiers: this revision ("014") and the one it follows ("013").
+revision: str = "014"
+down_revision: Union[str, None] = "013"
+# No branch labels and no extra revision dependencies are declared here.
+branch_labels: Union[str, Sequence[str], None] = None
+depends_on: Union[str, Sequence[str], None] = None
+
+
+def upgrade() -> None:
+    """Apply revision 014 to ``kb_documents``.
+
+    Adds the ``status``, ``error_message`` and ``chunk_count`` columns, then a
+    CHECK constraint restricting ``status`` to its three allowed values, and
+    finally drops the NOT NULL requirement on ``agent_id``.
+    """
+    table = "kb_documents"
+
+    # New bookkeeping columns, added in this order. ``status`` is NOT NULL
+    # with a server-side default of "processing"; the other two are nullable.
+    # NOTE(review): rows already in the table will presumably pick up
+    # status='processing' through that default -- confirm this is intended.
+    new_columns = (
+        sa.Column(
+            "status",
+            sa.Text(),
+            nullable=False,
+            server_default="processing",
+            comment="Document ingestion status: processing | ready | error",
+        ),
+        sa.Column(
+            "error_message",
+            sa.Text(),
+            nullable=True,
+            comment="Error details when status='error'",
+        ),
+        sa.Column(
+            "chunk_count",
+            sa.Integer(),
+            nullable=True,
+            comment="Number of chunks created after ingestion",
+        ),
+    )
+    for column in new_columns:
+        op.add_column(table, column)
+
+    # Only the three documented states may be stored in ``status``.
+    allowed_status_sql = "status IN ('processing', 'ready', 'error')"
+    op.create_check_constraint(
+        "ck_kb_documents_status",
+        table,
+        allowed_status_sql,
+    )
+
+    # KB is per-tenant, not per-agent, so a document no longer needs an agent.
+    op.alter_column(table, "agent_id", nullable=True)
+
+
+def downgrade() -> None:
+    """Revert revision 014 on ``kb_documents``.
+
+    Re-imposes NOT NULL on ``agent_id``, removes the ``status`` CHECK
+    constraint, and drops the ``chunk_count``, ``error_message`` and
+    ``status`` columns, in that order.
+    """
+    table = "kb_documents"
+
+    # agent_id becomes mandatory again.
+    # NOTE(review): this presumably fails if any row has a NULL agent_id by
+    # now -- confirm such rows are handled before downgrading.
+    op.alter_column(table, "agent_id", nullable=False)
+
+    # Remove the CHECK constraint first, then the columns themselves.
+    op.drop_constraint("ck_kb_documents_status", table, type_="check")
+
+    for column_name in ("chunk_count", "error_message", "status"):
+        op.drop_column(table, column_name)