# konstruct/tests/unit/test_extractors.py

"""
Unit tests for orchestrator.tools.extractors.

Tests that each document format produces expected text output, and that
unsupported formats raise ValueError.

All test fixtures are constructed in-memory using the same libraries that
the extractor uses — no external files needed.
"""

from __future__ import annotations

import csv
import io

import pytest


# ---------------------------------------------------------------------------
# Helpers to build minimal valid files in memory
# ---------------------------------------------------------------------------


def _make_pdf_bytes(text: str) -> bytes:
    """Create a minimal valid PDF with one page containing the given text."""
    from pypdf import PdfWriter

    writer = PdfWriter()
    page = writer.add_blank_page(width=200, height=200)
    writer.add_page(page)
    buf = io.BytesIO()
    writer.write(buf)

    # Build a simple PDF manually since pypdf cannot add text without a font
    # Instead, use reportlab if available, fall back to a minimal hand-crafted PDF
    try:
        from reportlab.pdfgen import canvas as rl_canvas

        buf2 = io.BytesIO()
        c = rl_canvas.Canvas(buf2)
        c.drawString(10, 100, text)
        c.save()
        return buf2.getvalue()
    except ImportError:
        pass

    # Hand-crafted minimal PDF with embedded text stream
    content_stream = f"BT /F1 12 Tf 50 700 Td ({text}) Tj ET"
    stream_bytes = content_stream.encode()
    pdf = (
        b"%PDF-1.4\n"
        b"1 0 obj\n<< /Type /Catalog /Pages 2 0 R >>\nendobj\n"
        b"2 0 obj\n<< /Type /Pages /Kids [3 0 R] /Count 1 >>\nendobj\n"
        b"3 0 obj\n<< /Type /Page /Parent 2 0 R /MediaBox [0 0 612 792]"
        b" /Contents 4 0 R /Resources << /Font << /F1 << /Type /Font"
        b" /Subtype /Type1 /BaseFont /Helvetica >> >> >> >>\nendobj\n"
        b"4 0 obj\n<< /Length " + str(len(stream_bytes)).encode() + b" >>\n"
        b"stream\n" + stream_bytes + b"\nendstream\nendobj\n"
        b"xref\n0 5\n0000000000 65535 f \n"
        b"trailer\n<< /Size 5 /Root 1 0 R >>\nstartxref\n0\n%%EOF"
    )
    return pdf


def _make_docx_bytes(paragraphs: list[str]) -> bytes:
    """Build an in-memory DOCX with one paragraph per entry of ``paragraphs``.

    Returns the serialized document bytes; an empty list yields a document
    to which no paragraphs were added.
    """
    from docx import Document

    document = Document()
    for paragraph_text in paragraphs:
        document.add_paragraph(paragraph_text)

    with io.BytesIO() as stream:
        document.save(stream)
        return stream.getvalue()


def _make_pptx_bytes(slide_texts: list[str]) -> bytes:
    """Build an in-memory PPTX with one slide per entry of ``slide_texts``.

    Each slide is created from layout index 6 of the default presentation
    and receives a single text box whose text frame holds the entry.
    Returns the serialized presentation bytes.
    """
    from pptx import Presentation
    from pptx.util import Inches

    presentation = Presentation()
    layout = presentation.slide_layouts[6]  # blank layout
    for slide_text in slide_texts:
        shapes = presentation.slides.add_slide(layout).shapes
        text_box = shapes.add_textbox(Inches(1), Inches(1), Inches(4), Inches(2))
        text_box.text_frame.text = slide_text

    with io.BytesIO() as stream:
        presentation.save(stream)
        return stream.getvalue()


def _make_xlsx_bytes(rows: list[list[str]]) -> bytes:
    """Build an in-memory XLSX whose active sheet contains ``rows`` in order.

    Returns the serialized workbook bytes.
    """
    import openpyxl

    workbook = openpyxl.Workbook()
    sheet = workbook.active
    for cells in rows:
        sheet.append(cells)

    with io.BytesIO() as stream:
        workbook.save(stream)
        return stream.getvalue()


# ---------------------------------------------------------------------------
# Tests
# ---------------------------------------------------------------------------


class TestExtractTextDocx:
    """Checks ``extract_text`` on DOCX input built in memory."""

    def test_extracts_paragraph_text(self) -> None:
        """Every paragraph written to the document appears in the output."""
        from orchestrator.tools.extractors import extract_text

        paragraphs = ["Hello world", "Second paragraph"]
        extracted = extract_text("document.docx", _make_docx_bytes(paragraphs))

        for paragraph in paragraphs:
            assert paragraph in extracted

    def test_empty_docx_returns_string(self) -> None:
        """A document with no added paragraphs still yields a ``str``."""
        from orchestrator.tools.extractors import extract_text

        extracted = extract_text("empty.docx", _make_docx_bytes([]))
        assert isinstance(extracted, str)


class TestExtractTextPptx:
    """Checks ``extract_text`` on PPTX input built in memory."""

    def test_extracts_slide_text(self) -> None:
        """The text box content of every slide appears in the output."""
        from orchestrator.tools.extractors import extract_text

        slide_texts = ["Slide one content", "Slide two content"]
        extracted = extract_text("slides.pptx", _make_pptx_bytes(slide_texts))

        for slide_text in slide_texts:
            assert slide_text in extracted


class TestExtractTextXlsx:
    """Checks ``extract_text`` on XLSX input built in memory."""

    def test_extracts_cell_data_as_csv(self) -> None:
        """Header and first-column cell values appear in the output."""
        from orchestrator.tools.extractors import extract_text

        table = [["Name", "Age"], ["Alice", "30"], ["Bob", "25"]]
        extracted = extract_text("data.xlsx", _make_xlsx_bytes(table))

        for expected_cell in ("Name", "Alice", "Bob"):
            assert expected_cell in extracted


class TestExtractTextCsv:
    """Checks ``extract_text`` on raw CSV bytes."""

    def test_extracts_csv_text(self) -> None:
        """Header and data values of UTF-8 CSV content appear in the output."""
        from orchestrator.tools.extractors import extract_text

        payload = "col1,col2\nval1,val2\n".encode("utf-8")
        extracted = extract_text("data.csv", payload)

        for expected_value in ("col1", "val1"):
            assert expected_value in extracted

    def test_handles_non_utf8_gracefully(self) -> None:
        """Input with a byte that is invalid UTF-8 still yields the text before it."""
        from orchestrator.tools.extractors import extract_text

        # 0xFF can never occur in well-formed UTF-8.
        payload = b"hello\xff world"
        extracted = extract_text("data.csv", payload)
        assert "hello" in extracted


class TestExtractTextTxt:
    """Checks ``extract_text`` on plain-text bytes."""

    def test_extracts_plain_text(self) -> None:
        """The full sentence from the input appears in the output."""
        from orchestrator.tools.extractors import extract_text

        extracted = extract_text("notes.txt", b"Hello, this is a plain text file.")
        assert "Hello, this is a plain text file." in extracted


class TestExtractTextMarkdown:
    """Checks ``extract_text`` on Markdown bytes."""

    def test_extracts_markdown_text(self) -> None:
        """Heading text and paragraph text both appear in the output."""
        from orchestrator.tools.extractors import extract_text

        extracted = extract_text("notes.md", b"# Heading\n\nSome paragraph text here.")

        for expected_fragment in ("Heading", "Some paragraph text here."):
            assert expected_fragment in extracted


class TestExtractTextUnsupported:
    """Checks that ``extract_text`` rejects extensions it does not handle."""

    def test_raises_value_error_for_unsupported_extension(self) -> None:
        """An ``.exe`` filename raises ``ValueError`` with the expected message."""
        from orchestrator.tools.extractors import extract_text

        rejection = pytest.raises(ValueError, match="Unsupported file extension")
        with rejection:
            extract_text("file.exe", b"some bytes")

    def test_raises_for_zip(self) -> None:
        """A ``.zip`` filename raises ``ValueError`` with the expected message."""
        from orchestrator.tools.extractors import extract_text

        rejection = pytest.raises(ValueError, match="Unsupported file extension")
        with rejection:
            extract_text("archive.zip", b"PK\x03\x04")