feat(10-01): KB ingestion pipeline - migration, extractors, API router
- Migration 014: add status/error_message/chunk_count to kb_documents, make agent_id nullable - Add GOOGLE_CALENDAR to ChannelTypeEnum in tenant.py - Add brave_api_key, firecrawl_api_key, google_client_id/secret, minio_kb_bucket to config - Add text extractors for PDF, DOCX, PPTX, XLSX/XLS, CSV, TXT, MD - Add KB management API router with upload, list, delete, URL ingest, reindex endpoints - Install pypdf, python-docx, python-pptx, openpyxl, pandas, firecrawl-py, youtube-transcript-api - Update .env.example with new env vars - Unit tests: test_extractors.py (10 tests) and test_kb_upload.py (7 tests) all pass
This commit is contained in:
141
packages/orchestrator/orchestrator/tools/extractors.py
Normal file
141
packages/orchestrator/orchestrator/tools/extractors.py
Normal file
@@ -0,0 +1,141 @@
|
||||
"""
|
||||
Text extraction functions for knowledge base document ingestion.
|
||||
|
||||
Supports: PDF, DOCX, PPTX, XLSX/XLS, CSV, TXT, MD
|
||||
|
||||
Usage:
|
||||
text = extract_text("document.pdf", pdf_bytes)
|
||||
text = extract_text("report.docx", docx_bytes)
|
||||
|
||||
Raises:
|
||||
ValueError: If the file extension is not supported.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import io
|
||||
import logging
|
||||
import os
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Supported extensions grouped by extraction method
|
||||
_PDF_EXTENSIONS = {".pdf"}
|
||||
_DOCX_EXTENSIONS = {".docx"}
|
||||
_PPTX_EXTENSIONS = {".pptx"}
|
||||
_SPREADSHEET_EXTENSIONS = {".xlsx", ".xls"}
|
||||
_TEXT_EXTENSIONS = {".csv", ".txt", ".md"}
|
||||
|
||||
_ALL_SUPPORTED = (
|
||||
_PDF_EXTENSIONS
|
||||
| _DOCX_EXTENSIONS
|
||||
| _PPTX_EXTENSIONS
|
||||
| _SPREADSHEET_EXTENSIONS
|
||||
| _TEXT_EXTENSIONS
|
||||
)
|
||||
|
||||
# Minimum characters for a PDF to be considered successfully extracted
|
||||
# Below this threshold the PDF likely needs OCR (scanned/image-only PDF)
|
||||
_PDF_MIN_CHARS = 100
|
||||
|
||||
|
||||
def extract_text(filename: str, file_bytes: bytes) -> str:
|
||||
"""
|
||||
Extract plain text from a document given its filename and raw bytes.
|
||||
|
||||
Args:
|
||||
filename: Original filename including extension (e.g., "report.pdf").
|
||||
The extension determines which parser to use.
|
||||
file_bytes: Raw bytes of the document.
|
||||
|
||||
Returns:
|
||||
Extracted plain text as a string.
|
||||
|
||||
Raises:
|
||||
ValueError: If the file extension is not in the supported set.
|
||||
"""
|
||||
_, ext = os.path.splitext(filename.lower())
|
||||
|
||||
if ext in _PDF_EXTENSIONS:
|
||||
return _extract_pdf(file_bytes)
|
||||
elif ext in _DOCX_EXTENSIONS:
|
||||
return _extract_docx(file_bytes)
|
||||
elif ext in _PPTX_EXTENSIONS:
|
||||
return _extract_pptx(file_bytes)
|
||||
elif ext in _SPREADSHEET_EXTENSIONS:
|
||||
return _extract_spreadsheet(file_bytes)
|
||||
elif ext in _TEXT_EXTENSIONS:
|
||||
return _extract_text_plain(file_bytes)
|
||||
else:
|
||||
raise ValueError(
|
||||
f"Unsupported file extension: '{ext}'. "
|
||||
f"Supported formats: {', '.join(sorted(_ALL_SUPPORTED))}"
|
||||
)
|
||||
|
||||
|
||||
def _extract_pdf(file_bytes: bytes) -> str:
    """
    Extract text from a PDF file using pypdf.

    Pages that yield no text are skipped. When the combined result is
    shorter than ``_PDF_MIN_CHARS``, the PDF is assumed to be scanned or
    image-only and an explanatory placeholder string is returned instead
    of the (near-empty) extracted text.
    """
    from pypdf import PdfReader

    reader = PdfReader(io.BytesIO(file_bytes))
    per_page = (page.extract_text() or "" for page in reader.pages)
    text = "\n".join(chunk for chunk in per_page if chunk.strip())

    stripped_length = len(text.strip())
    if stripped_length < _PDF_MIN_CHARS:
        # Likely a scanned/image-only PDF; OCR is out of scope for now.
        logger.warning(
            "PDF text extraction yielded < %d chars — PDF may be image-only",
            _PDF_MIN_CHARS,
        )
        return (
            f"This PDF appears to be image-only or contains very little extractable text "
            f"({stripped_length} characters). OCR is not supported in the current version. "
            f"Please provide a text-based PDF or convert it to a text document first."
        )

    return text
|
||||
|
||||
|
||||
def _extract_docx(file_bytes: bytes) -> str:
    """
    Extract text from a DOCX file using python-docx.

    Returns one line per non-empty paragraph, joined with newlines.
    Paragraph text is kept verbatim; only whitespace-only paragraphs
    are dropped.
    """
    from docx import Document

    document = Document(io.BytesIO(file_bytes))
    lines: list[str] = []
    for paragraph in document.paragraphs:
        if paragraph.text.strip():
            lines.append(paragraph.text)
    return "\n".join(lines)
|
||||
|
||||
|
||||
def _extract_pptx(file_bytes: bytes) -> str:
    """
    Extract text from a PPTX file using python-pptx.

    Each slide containing any text becomes a "[Slide N]" section holding one
    line per non-empty paragraph (run texts concatenated, then stripped).
    Slides with no text are omitted; sections are joined by blank lines.
    """
    # NOTE: the previously-present `from pptx.util import Pt` import was
    # unused (self-admitted via noqa: F401) and has been removed.
    from pptx import Presentation

    prs = Presentation(io.BytesIO(file_bytes))
    slide_texts: list[str] = []

    for slide_num, slide in enumerate(prs.slides, start=1):
        texts: list[str] = []
        for shape in slide.shapes:
            # Only shapes with a text frame (text boxes, placeholders) carry text.
            if shape.has_text_frame:
                for para in shape.text_frame.paragraphs:
                    line = "".join(run.text for run in para.runs).strip()
                    if line:
                        texts.append(line)
        if texts:
            slide_texts.append(f"[Slide {slide_num}]\n" + "\n".join(texts))

    return "\n\n".join(slide_texts)
|
||||
|
||||
|
||||
def _extract_spreadsheet(file_bytes: bytes) -> str:
    """
    Extract text from XLSX/XLS files as CSV-formatted text using pandas.

    Reads every sheet in the workbook — the previous implementation used
    pandas' default of reading only the first sheet, silently dropping all
    other sheets' data from the knowledge base. A single-sheet workbook
    renders exactly as before (its plain CSV text); multi-sheet workbooks
    render as "[Sheet: name]" sections joined by blank lines.
    """
    import pandas as pd

    # sheet_name=None -> dict of {sheet_name: DataFrame} covering all sheets.
    sheets = pd.read_excel(io.BytesIO(file_bytes), sheet_name=None)

    if len(sheets) == 1:
        # Preserve the original single-sheet output format (no section header).
        return next(iter(sheets.values())).to_csv(index=False)

    return "\n\n".join(
        f"[Sheet: {name}]\n{frame.to_csv(index=False)}"
        for name, frame in sheets.items()
    )
|
||||
|
||||
|
||||
def _extract_text_plain(file_bytes: bytes) -> str:
|
||||
"""Decode a plain text file (CSV, TXT, MD) as UTF-8."""
|
||||
return file_bytes.decode("utf-8", errors="replace")
|
||||
@@ -14,6 +14,15 @@ dependencies = [
|
||||
"httpx>=0.28.0",
|
||||
"sentence-transformers>=3.0.0",
|
||||
"jsonschema>=4.26.0",
|
||||
"pypdf>=6.9.2",
|
||||
"python-docx>=1.2.0",
|
||||
"python-pptx>=1.0.2",
|
||||
"openpyxl>=3.1.5",
|
||||
"pandas>=3.0.1",
|
||||
"firecrawl-py>=4.21.0",
|
||||
"youtube-transcript-api>=1.2.4",
|
||||
"google-api-python-client>=2.193.0",
|
||||
"google-auth-oauthlib>=1.3.0",
|
||||
]
|
||||
|
||||
[tool.uv.sources]
|
||||
|
||||
Reference in New Issue
Block a user