feat(10-01): KB ingestion pipeline - migration, extractors, API router
- Migration 014: add status/error_message/chunk_count to kb_documents, make agent_id nullable - Add GOOGLE_CALENDAR to ChannelTypeEnum in tenant.py - Add brave_api_key, firecrawl_api_key, google_client_id/secret, minio_kb_bucket to config - Add text extractors for PDF, DOCX, PPTX, XLSX/XLS, CSV, TXT, MD - Add KB management API router with upload, list, delete, URL ingest, reindex endpoints - Install pypdf, python-docx, python-pptx, openpyxl, pandas, firecrawl-py, youtube-transcript-api - Update .env.example with new env vars - Unit tests: test_extractors.py (10 tests) and test_kb_upload.py (7 tests) all pass
This commit is contained in:
141
packages/orchestrator/orchestrator/tools/extractors.py
Normal file
141
packages/orchestrator/orchestrator/tools/extractors.py
Normal file
@@ -0,0 +1,141 @@
|
||||
"""
|
||||
Text extraction functions for knowledge base document ingestion.
|
||||
|
||||
Supports: PDF, DOCX, PPTX, XLSX/XLS, CSV, TXT, MD
|
||||
|
||||
Usage:
|
||||
text = extract_text("document.pdf", pdf_bytes)
|
||||
text = extract_text("report.docx", docx_bytes)
|
||||
|
||||
Raises:
|
||||
ValueError: If the file extension is not supported.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import io
|
||||
import logging
|
||||
import os
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Supported extensions grouped by extraction method
|
||||
_PDF_EXTENSIONS = {".pdf"}
|
||||
_DOCX_EXTENSIONS = {".docx"}
|
||||
_PPTX_EXTENSIONS = {".pptx"}
|
||||
_SPREADSHEET_EXTENSIONS = {".xlsx", ".xls"}
|
||||
_TEXT_EXTENSIONS = {".csv", ".txt", ".md"}
|
||||
|
||||
_ALL_SUPPORTED = (
|
||||
_PDF_EXTENSIONS
|
||||
| _DOCX_EXTENSIONS
|
||||
| _PPTX_EXTENSIONS
|
||||
| _SPREADSHEET_EXTENSIONS
|
||||
| _TEXT_EXTENSIONS
|
||||
)
|
||||
|
||||
# Minimum characters for a PDF to be considered successfully extracted
|
||||
# Below this threshold the PDF likely needs OCR (scanned/image-only PDF)
|
||||
_PDF_MIN_CHARS = 100
|
||||
|
||||
|
||||
def extract_text(filename: str, file_bytes: bytes) -> str:
|
||||
"""
|
||||
Extract plain text from a document given its filename and raw bytes.
|
||||
|
||||
Args:
|
||||
filename: Original filename including extension (e.g., "report.pdf").
|
||||
The extension determines which parser to use.
|
||||
file_bytes: Raw bytes of the document.
|
||||
|
||||
Returns:
|
||||
Extracted plain text as a string.
|
||||
|
||||
Raises:
|
||||
ValueError: If the file extension is not in the supported set.
|
||||
"""
|
||||
_, ext = os.path.splitext(filename.lower())
|
||||
|
||||
if ext in _PDF_EXTENSIONS:
|
||||
return _extract_pdf(file_bytes)
|
||||
elif ext in _DOCX_EXTENSIONS:
|
||||
return _extract_docx(file_bytes)
|
||||
elif ext in _PPTX_EXTENSIONS:
|
||||
return _extract_pptx(file_bytes)
|
||||
elif ext in _SPREADSHEET_EXTENSIONS:
|
||||
return _extract_spreadsheet(file_bytes)
|
||||
elif ext in _TEXT_EXTENSIONS:
|
||||
return _extract_text_plain(file_bytes)
|
||||
else:
|
||||
raise ValueError(
|
||||
f"Unsupported file extension: '{ext}'. "
|
||||
f"Supported formats: {', '.join(sorted(_ALL_SUPPORTED))}"
|
||||
)
|
||||
|
||||
|
||||
def _extract_pdf(file_bytes: bytes) -> str:
    """
    Extract text from a PDF file using pypdf.

    Pages that yield no text are skipped. When the combined result is
    shorter than ``_PDF_MIN_CHARS``, the PDF is assumed to be scanned or
    image-only and an explanatory placeholder string is returned instead
    of the (near-empty) extracted text.
    """
    from pypdf import PdfReader

    reader = PdfReader(io.BytesIO(file_bytes))
    per_page = (page.extract_text() or "" for page in reader.pages)
    text = "\n".join(chunk for chunk in per_page if chunk.strip())

    stripped_length = len(text.strip())
    if stripped_length < _PDF_MIN_CHARS:
        # Likely a scanned/image-only PDF; OCR is out of scope for now.
        logger.warning(
            "PDF text extraction yielded < %d chars — PDF may be image-only",
            _PDF_MIN_CHARS,
        )
        return (
            f"This PDF appears to be image-only or contains very little extractable text "
            f"({stripped_length} characters). OCR is not supported in the current version. "
            f"Please provide a text-based PDF or convert it to a text document first."
        )

    return text
|
||||
|
||||
|
||||
def _extract_docx(file_bytes: bytes) -> str:
    """
    Extract text from a DOCX file using python-docx.

    Returns one line per non-empty paragraph, joined with newlines.
    Paragraph text is kept verbatim; only whitespace-only paragraphs
    are dropped.
    """
    from docx import Document

    document = Document(io.BytesIO(file_bytes))
    lines: list[str] = []
    for paragraph in document.paragraphs:
        if paragraph.text.strip():
            lines.append(paragraph.text)
    return "\n".join(lines)
|
||||
|
||||
|
||||
def _extract_pptx(file_bytes: bytes) -> str:
    """
    Extract text from a PPTX file using python-pptx.

    Each slide containing any text becomes a "[Slide N]" section holding one
    line per non-empty paragraph (run texts concatenated, then stripped).
    Slides with no text are omitted; sections are joined by blank lines.
    """
    # NOTE: the previously-present `from pptx.util import Pt` import was
    # unused (self-admitted via noqa: F401) and has been removed.
    from pptx import Presentation

    prs = Presentation(io.BytesIO(file_bytes))
    slide_texts: list[str] = []

    for slide_num, slide in enumerate(prs.slides, start=1):
        texts: list[str] = []
        for shape in slide.shapes:
            # Only shapes with a text frame (text boxes, placeholders) carry text.
            if shape.has_text_frame:
                for para in shape.text_frame.paragraphs:
                    line = "".join(run.text for run in para.runs).strip()
                    if line:
                        texts.append(line)
        if texts:
            slide_texts.append(f"[Slide {slide_num}]\n" + "\n".join(texts))

    return "\n\n".join(slide_texts)
|
||||
|
||||
|
||||
def _extract_spreadsheet(file_bytes: bytes) -> str:
    """
    Extract text from XLSX/XLS files as CSV-formatted text using pandas.

    Reads every sheet in the workbook — the previous implementation used
    pandas' default of reading only the first sheet, silently dropping all
    other sheets' data from the knowledge base. A single-sheet workbook
    renders exactly as before (its plain CSV text); multi-sheet workbooks
    render as "[Sheet: name]" sections joined by blank lines.
    """
    import pandas as pd

    # sheet_name=None -> dict of {sheet_name: DataFrame} covering all sheets.
    sheets = pd.read_excel(io.BytesIO(file_bytes), sheet_name=None)

    if len(sheets) == 1:
        # Preserve the original single-sheet output format (no section header).
        return next(iter(sheets.values())).to_csv(index=False)

    return "\n\n".join(
        f"[Sheet: {name}]\n{frame.to_csv(index=False)}"
        for name, frame in sheets.items()
    )
|
||||
|
||||
|
||||
def _extract_text_plain(file_bytes: bytes) -> str:
|
||||
"""Decode a plain text file (CSV, TXT, MD) as UTF-8."""
|
||||
return file_bytes.decode("utf-8", errors="replace")
|
||||
@@ -14,6 +14,15 @@ dependencies = [
|
||||
"httpx>=0.28.0",
|
||||
"sentence-transformers>=3.0.0",
|
||||
"jsonschema>=4.26.0",
|
||||
"pypdf>=6.9.2",
|
||||
"python-docx>=1.2.0",
|
||||
"python-pptx>=1.0.2",
|
||||
"openpyxl>=3.1.5",
|
||||
"pandas>=3.0.1",
|
||||
"firecrawl-py>=4.21.0",
|
||||
"youtube-transcript-api>=1.2.4",
|
||||
"google-api-python-client>=2.193.0",
|
||||
"google-auth-oauthlib>=1.3.0",
|
||||
]
|
||||
|
||||
[tool.uv.sources]
|
||||
|
||||
Reference in New Issue
Block a user