Files
konstruct/tests/unit/test_extractors.py
Adolfo Delorenzo e8d3e8a108 feat(10-01): KB ingestion pipeline - migration, extractors, API router
- Migration 014: add status/error_message/chunk_count to kb_documents, make agent_id nullable
- Add GOOGLE_CALENDAR to ChannelTypeEnum in tenant.py
- Add brave_api_key, firecrawl_api_key, google_client_id/secret, minio_kb_bucket to config
- Add text extractors for PDF, DOCX, PPTX, XLSX/XLS, CSV, TXT, MD
- Add KB management API router with upload, list, delete, URL ingest, reindex endpoints
- Install pypdf, python-docx, python-pptx, openpyxl, pandas, firecrawl-py, youtube-transcript-api
- Update .env.example with new env vars
- Unit tests: test_extractors.py (10 tests) and test_kb_upload.py (7 tests) all pass
2026-03-26 09:05:29 -06:00

202 lines
6.4 KiB
Python

"""
Unit tests for orchestrator.tools.extractors.
Tests that each document format produces expected text output, and that
unsupported formats raise ValueError.
All test fixtures are constructed in-memory using the same libraries that
the extractor uses — no external files needed.
"""
from __future__ import annotations
import csv
import io
import pytest
# ---------------------------------------------------------------------------
# Helpers to build minimal valid files in memory
# ---------------------------------------------------------------------------
def _make_pdf_bytes(text: str) -> bytes:
"""Create a minimal valid PDF with one page containing the given text."""
from pypdf import PdfWriter
writer = PdfWriter()
page = writer.add_blank_page(width=200, height=200)
writer.add_page(page)
buf = io.BytesIO()
writer.write(buf)
# Build a simple PDF manually since pypdf cannot add text without a font
# Instead, use reportlab if available, fall back to a minimal hand-crafted PDF
try:
from reportlab.pdfgen import canvas as rl_canvas
buf2 = io.BytesIO()
c = rl_canvas.Canvas(buf2)
c.drawString(10, 100, text)
c.save()
return buf2.getvalue()
except ImportError:
pass
# Hand-crafted minimal PDF with embedded text stream
content_stream = f"BT /F1 12 Tf 50 700 Td ({text}) Tj ET"
stream_bytes = content_stream.encode()
pdf = (
b"%PDF-1.4\n"
b"1 0 obj\n<< /Type /Catalog /Pages 2 0 R >>\nendobj\n"
b"2 0 obj\n<< /Type /Pages /Kids [3 0 R] /Count 1 >>\nendobj\n"
b"3 0 obj\n<< /Type /Page /Parent 2 0 R /MediaBox [0 0 612 792]"
b" /Contents 4 0 R /Resources << /Font << /F1 << /Type /Font"
b" /Subtype /Type1 /BaseFont /Helvetica >> >> >> >>\nendobj\n"
b"4 0 obj\n<< /Length " + str(len(stream_bytes)).encode() + b" >>\n"
b"stream\n" + stream_bytes + b"\nendstream\nendobj\n"
b"xref\n0 5\n0000000000 65535 f \n"
b"trailer\n<< /Size 5 /Root 1 0 R >>\nstartxref\n0\n%%EOF"
)
return pdf
def _make_docx_bytes(paragraphs: list[str]) -> bytes:
"""Create a minimal DOCX with the given paragraph texts."""
from docx import Document
doc = Document()
for p in paragraphs:
doc.add_paragraph(p)
buf = io.BytesIO()
doc.save(buf)
return buf.getvalue()
def _make_pptx_bytes(slide_texts: list[str]) -> bytes:
"""Create a PPTX with one text box per slide."""
from pptx import Presentation
from pptx.util import Inches
prs = Presentation()
blank_layout = prs.slide_layouts[6] # blank layout
for text in slide_texts:
slide = prs.slides.add_slide(blank_layout)
txBox = slide.shapes.add_textbox(Inches(1), Inches(1), Inches(4), Inches(2))
txBox.text_frame.text = text
buf = io.BytesIO()
prs.save(buf)
return buf.getvalue()
def _make_xlsx_bytes(rows: list[list[str]]) -> bytes:
"""Create an XLSX with the given rows."""
import openpyxl
wb = openpyxl.Workbook()
ws = wb.active
for row in rows:
ws.append(row)
buf = io.BytesIO()
wb.save(buf)
return buf.getvalue()
# ---------------------------------------------------------------------------
# Tests
# ---------------------------------------------------------------------------
class TestExtractTextDocx:
    """Checks extract_text on DOCX input built in memory."""

    def test_extracts_paragraph_text(self) -> None:
        """Each paragraph written to the document shows up in the output."""
        from orchestrator.tools.extractors import extract_text

        expected_paragraphs = ["Hello world", "Second paragraph"]
        extracted = extract_text(
            "document.docx", _make_docx_bytes(expected_paragraphs)
        )
        for paragraph in expected_paragraphs:
            assert paragraph in extracted

    def test_empty_docx_returns_string(self) -> None:
        """A document with no paragraphs still yields a str result."""
        from orchestrator.tools.extractors import extract_text

        extracted = extract_text("empty.docx", _make_docx_bytes([]))
        assert isinstance(extracted, str)
class TestExtractTextPptx:
    """Checks extract_text on PPTX input built in memory."""

    def test_extracts_slide_text(self) -> None:
        """Text from every slide's text box shows up in the output."""
        from orchestrator.tools.extractors import extract_text

        expected_slides = ["Slide one content", "Slide two content"]
        extracted = extract_text("slides.pptx", _make_pptx_bytes(expected_slides))
        for slide_text in expected_slides:
            assert slide_text in extracted
class TestExtractTextXlsx:
    """Checks extract_text on XLSX input built in memory."""

    def test_extracts_cell_data_as_csv(self) -> None:
        """Header and data cell values show up in the extracted text."""
        from orchestrator.tools.extractors import extract_text

        table = [["Name", "Age"], ["Alice", "30"], ["Bob", "25"]]
        extracted = extract_text("data.xlsx", _make_xlsx_bytes(table))
        for expected_cell in ("Name", "Alice", "Bob"):
            assert expected_cell in extracted
class TestExtractTextCsv:
    """Checks extract_text on CSV input, including undecodable bytes."""

    def test_extracts_csv_text(self) -> None:
        """Header and value cells of a UTF-8 CSV show up in the output."""
        from orchestrator.tools.extractors import extract_text

        payload = "col1,col2\nval1,val2\n".encode("utf-8")
        extracted = extract_text("data.csv", payload)
        for expected_cell in ("col1", "val1"):
            assert expected_cell in extracted

    def test_handles_non_utf8_gracefully(self) -> None:
        """A stray 0xFF byte (invalid UTF-8) must not lose the readable text."""
        from orchestrator.tools.extractors import extract_text

        extracted = extract_text("data.csv", b"hello\xff world")
        assert "hello" in extracted
class TestExtractTextTxt:
    """Checks extract_text on plain-text input."""

    def test_extracts_plain_text(self) -> None:
        """The full sentence from a .txt payload shows up in the output."""
        from orchestrator.tools.extractors import extract_text

        extracted = extract_text("notes.txt", b"Hello, this is a plain text file.")
        assert "Hello, this is a plain text file." in extracted
class TestExtractTextMarkdown:
    """Checks extract_text on Markdown input."""

    def test_extracts_markdown_text(self) -> None:
        """Heading text and paragraph text both show up in the output."""
        from orchestrator.tools.extractors import extract_text

        extracted = extract_text("notes.md", b"# Heading\n\nSome paragraph text here.")
        for expected_fragment in ("Heading", "Some paragraph text here."):
            assert expected_fragment in extracted
class TestExtractTextUnsupported:
    """Checks that extract_text rejects file extensions it does not handle."""

    def test_raises_value_error_for_unsupported_extension(self) -> None:
        """A .exe filename raises ValueError with the expected message."""
        from orchestrator.tools.extractors import extract_text

        filename, payload = "file.exe", b"some bytes"
        with pytest.raises(ValueError, match="Unsupported file extension"):
            extract_text(filename, payload)

    def test_raises_for_zip(self) -> None:
        """A .zip filename raises ValueError with the expected message."""
        from orchestrator.tools.extractors import extract_text

        filename, payload = "archive.zip", b"PK\x03\x04"
        with pytest.raises(ValueError, match="Unsupported file extension"):
            extract_text(filename, payload)