feat(10-01): KB ingestion pipeline - migration, extractors, API router

- Migration 014: add status/error_message/chunk_count to kb_documents, make agent_id nullable
- Add GOOGLE_CALENDAR to ChannelTypeEnum in tenant.py
- Add brave_api_key, firecrawl_api_key, google_client_id/secret, minio_kb_bucket to config
- Add text extractors for PDF, DOCX, PPTX, XLSX/XLS, CSV, TXT, MD
- Add KB management API router with upload, list, delete, URL ingest, reindex endpoints
- Install pypdf, python-docx, python-pptx, openpyxl, pandas, firecrawl-py, youtube-transcript-api
- Update .env.example with new env vars
- Unit tests: test_extractors.py (10 tests) and test_kb_upload.py (7 tests) all pass
This commit is contained in:
2026-03-26 09:05:29 -06:00
parent eae4b0324d
commit e8d3e8a108
11 changed files with 1745 additions and 28 deletions

View File

@@ -0,0 +1,141 @@
"""
Text extraction functions for knowledge base document ingestion.
Supports: PDF, DOCX, PPTX, XLSX/XLS, CSV, TXT, MD
Usage:
text = extract_text("document.pdf", pdf_bytes)
text = extract_text("report.docx", docx_bytes)
Raises:
ValueError: If the file extension is not supported.
"""
from __future__ import annotations
import io
import logging
import os
logger = logging.getLogger(__name__)
# File extensions (lower-case, leading dot included) keyed by the parser
# that handles them.
_PDF_EXTENSIONS = {".pdf"}
_DOCX_EXTENSIONS = {".docx"}
_PPTX_EXTENSIONS = {".pptx"}
_SPREADSHEET_EXTENSIONS = {".xlsx", ".xls"}
_TEXT_EXTENSIONS = {".csv", ".txt", ".md"}

# Every extension the module accepts; used to build the "unsupported
# extension" error message.
_ALL_SUPPORTED = set().union(
    _PDF_EXTENSIONS,
    _DOCX_EXTENSIONS,
    _PPTX_EXTENSIONS,
    _SPREADSHEET_EXTENSIONS,
    _TEXT_EXTENSIONS,
)

# A PDF whose extracted text is shorter than this many characters is treated
# as scanned/image-only, i.e. it would need OCR to yield usable text.
_PDF_MIN_CHARS = 100
def extract_text(filename: str, file_bytes: bytes) -> str:
    """
    Turn a document's raw bytes into plain text, choosing the parser from
    the filename's extension.

    Args:
        filename: Original filename including its extension (for example
            "report.pdf"). The extension is matched case-insensitively.
        file_bytes: Raw bytes of the document.

    Returns:
        The text produced by the parser registered for the extension.

    Raises:
        ValueError: If the extension is not one of the supported formats.
    """
    suffix = os.path.splitext(filename.lower())[1]

    # Ordered (extension set, parser) pairs; the first match wins.
    parsers = (
        (_PDF_EXTENSIONS, _extract_pdf),
        (_DOCX_EXTENSIONS, _extract_docx),
        (_PPTX_EXTENSIONS, _extract_pptx),
        (_SPREADSHEET_EXTENSIONS, _extract_spreadsheet),
        (_TEXT_EXTENSIONS, _extract_text_plain),
    )
    for extensions, parser in parsers:
        if suffix in extensions:
            return parser(file_bytes)

    supported = ", ".join(sorted(_ALL_SUPPORTED))
    raise ValueError(
        f"Unsupported file extension: '{suffix}'. "
        f"Supported formats: {supported}"
    )
def _extract_pdf(file_bytes: bytes) -> str:
    """
    Pull the text layer out of a PDF with pypdf.

    Pages whose extracted text is empty or whitespace-only are skipped; the
    remaining pages are joined with newlines.

    Args:
        file_bytes: Raw bytes of the PDF.

    Returns:
        The joined page text, or — when fewer than ``_PDF_MIN_CHARS``
        non-whitespace-trimmed characters were found — an explanatory
        message saying the PDF looks image-only and OCR is unsupported.
    """
    from pypdf import PdfReader

    document = PdfReader(io.BytesIO(file_bytes))
    # extract_text() may return None/empty for pages without a text layer.
    raw_pages = (page.extract_text() or "" for page in document.pages)
    text = "\n".join(chunk for chunk in raw_pages if chunk.strip())

    usable_chars = len(text.strip())
    if usable_chars >= _PDF_MIN_CHARS:
        return text

    # Too little text: report the problem in-band instead of returning
    # near-empty content.
    logger.warning("PDF text extraction yielded < %d chars — PDF may be image-only", _PDF_MIN_CHARS)
    return (
        f"This PDF appears to be image-only or contains very little extractable text "
        f"({usable_chars} characters). OCR is not supported in the current version. "
        f"Please provide a text-based PDF or convert it to a text document first."
    )
def _extract_docx(file_bytes: bytes) -> str:
    """
    Read the body paragraphs of a DOCX file with python-docx.

    Args:
        file_bytes: Raw bytes of the DOCX document.

    Returns:
        The text of every non-blank paragraph in ``Document.paragraphs``,
        one paragraph per line.
    """
    from docx import Document

    document = Document(io.BytesIO(file_bytes))
    paragraph_texts = (paragraph.text for paragraph in document.paragraphs)
    # Whitespace-only paragraphs are dropped so they do not become empty lines.
    return "\n".join(content for content in paragraph_texts if content.strip())
def _extract_pptx(file_bytes: bytes) -> str:
    """
    Extract slide text from a PPTX file using python-pptx.

    For each slide, the runs of every paragraph in every shape that has a
    text frame are concatenated into one line. Slides that yield no text are
    omitted; the rest are emitted as a ``[Slide N]`` header followed by their
    lines, with a blank line between slides. ``N`` is the 1-based position of
    the slide in the deck, so numbering stays correct when slides are skipped.

    Args:
        file_bytes: Raw bytes of the PPTX presentation.

    Returns:
        The per-slide text sections joined by blank lines, or an empty
        string when no slide contains text.
    """
    from pptx import Presentation

    prs = Presentation(io.BytesIO(file_bytes))
    slide_texts: list[str] = []
    for slide_num, slide in enumerate(prs.slides, start=1):
        texts: list[str] = []
        for shape in slide.shapes:
            # Only shapes exposing a text frame are read here.
            if not shape.has_text_frame:
                continue
            for para in shape.text_frame.paragraphs:
                line = "".join(run.text for run in para.runs).strip()
                if line:
                    texts.append(line)
        if texts:
            slide_texts.append(f"[Slide {slide_num}]\n" + "\n".join(texts))
    return "\n\n".join(slide_texts)
def _extract_spreadsheet(file_bytes: bytes) -> str:
    """
    Extract text from an XLSX/XLS workbook as CSV-formatted text using pandas.

    Every worksheet is read. A workbook with a single sheet is returned as
    plain CSV (no header line added). A workbook with several sheets is
    returned as one CSV section per sheet, each introduced by a
    ``[Sheet <name>]`` line and separated from the next by a blank line.

    Args:
        file_bytes: Raw bytes of the workbook.

    Returns:
        CSV-formatted text covering all worksheets in the workbook.
    """
    import pandas as pd

    # sheet_name=None loads all worksheets as {sheet name: DataFrame}; the
    # default (sheet_name=0) would silently drop every sheet but the first.
    sheets = pd.read_excel(io.BytesIO(file_bytes), sheet_name=None)

    if len(sheets) == 1:
        # Keep the plain-CSV output for the common single-sheet case.
        (only_sheet,) = sheets.values()
        return only_sheet.to_csv(index=False)

    # to_csv() output already ends with a newline, so joining on "\n" leaves
    # exactly one blank line between consecutive sheet sections.
    return "\n".join(
        f"[Sheet {sheet_name}]\n{frame.to_csv(index=False)}"
        for sheet_name, frame in sheets.items()
    )
def _extract_text_plain(file_bytes: bytes) -> str:
    """
    Decode a plain text file (CSV, TXT, MD) as UTF-8.

    A leading UTF-8 byte-order mark, if present, is stripped so it does not
    end up as a stray U+FEFF character at the start of the extracted text.
    Bytes that are not valid UTF-8 are replaced with U+FFFD rather than
    raising.

    Args:
        file_bytes: Raw bytes of the text file.

    Returns:
        The decoded text.
    """
    # "utf-8-sig" behaves like "utf-8" but drops a leading BOM when there is one.
    return file_bytes.decode("utf-8-sig", errors="replace")

View File

@@ -14,6 +14,15 @@ dependencies = [
"httpx>=0.28.0",
"sentence-transformers>=3.0.0",
"jsonschema>=4.26.0",
"pypdf>=6.9.2",
"python-docx>=1.2.0",
"python-pptx>=1.0.2",
"openpyxl>=3.1.5",
"pandas>=3.0.1",
"firecrawl-py>=4.21.0",
"youtube-transcript-api>=1.2.4",
"google-api-python-client>=2.193.0",
"google-auth-oauthlib>=1.3.0",
]
[tool.uv.sources]