feat(10-01): Celery ingestion task, executor injection, KB search wiring
- Add ingest_document Celery task (sync def + asyncio.run per arch constraint)
- Add ingest_document_pipeline: MinIO download, extract, chunk, embed, store
- Add chunk_text sliding-window chunker (500 chars default, 50 overlap)
- Update execute_tool to inject tenant_id/agent_id into all tool handler kwargs
- Update web_search to use settings.brave_api_key (shared config) instead of os.getenv
- Unit tests: test_ingestion.py (9 tests) and test_executor_injection.py (5 tests) all pass
This commit is contained in:
183
tests/unit/test_ingestion.py
Normal file
183
tests/unit/test_ingestion.py
Normal file
@@ -0,0 +1,183 @@
|
||||
"""
|
||||
Unit tests for the KB ingestion pipeline.
|
||||
|
||||
Tests:
|
||||
- chunk_text: sliding window chunker produces correctly-sized, overlapping chunks
|
||||
- ingest_document_pipeline: downloads file from MinIO, extracts, chunks, embeds, stores
|
||||
- ingest_document_pipeline: sets status='error' on failure
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import uuid
|
||||
from unittest.mock import AsyncMock, MagicMock, patch
|
||||
|
||||
import pytest
|
||||
|
||||
|
||||
class TestChunkText:
    """Unit tests for the sliding-window chunker `chunk_text`."""

    def test_basic_chunking(self) -> None:
        """No chunk may exceed the requested chunk_size."""
        from orchestrator.tools.ingest import chunk_text

        pieces = chunk_text("a" * 1000, chunk_size=100, overlap=10)

        assert pieces
        assert all(len(piece) <= 100 for piece in pieces)

    def test_overlap_between_chunks(self) -> None:
        """With a non-zero overlap, long text yields multiple chunks."""
        from orchestrator.tools.ingest import chunk_text

        # Text with identifiable halves; 400 chars total.
        sample = "AAAA" * 50 + "BBBB" * 50

        # overlap=50 means consecutive chunks share characters.
        assert len(chunk_text(sample, chunk_size=200, overlap=50)) >= 2

    def test_short_text_returns_one_chunk(self) -> None:
        """Text shorter than chunk_size comes back unchanged as a single chunk."""
        from orchestrator.tools.ingest import chunk_text

        assert chunk_text("Hello world", chunk_size=500, overlap=50) == ["Hello world"]

    def test_empty_text_returns_empty_list(self) -> None:
        """Empty input produces no chunks."""
        from orchestrator.tools.ingest import chunk_text

        assert chunk_text("", chunk_size=500, overlap=50) == []

    def test_whitespace_only_returns_empty_list(self) -> None:
        """Whitespace-only input produces no chunks."""
        from orchestrator.tools.ingest import chunk_text

        assert chunk_text("   \n  ", chunk_size=500, overlap=50) == []

    def test_default_parameters(self) -> None:
        """Defaults (chunk_size=500) split long text into several bounded chunks."""
        from orchestrator.tools.ingest import chunk_text

        pieces = chunk_text("word " * 500)  # 2500 chars

        assert len(pieces) > 1
        assert all(len(piece) <= 500 for piece in pieces)
||||
class TestIngestDocumentPipeline:
    """Tests for ingest_document_pipeline with DB, MinIO, and embeddings mocked."""

    @staticmethod
    def _make_document(document_id: str, tenant_id: str) -> MagicMock:
        # Stand-in for a KB document row that starts in 'processing' state.
        doc = MagicMock()
        doc.id = uuid.UUID(document_id)
        doc.tenant_id = uuid.UUID(tenant_id)
        doc.filename = "test.txt"
        doc.source_url = None
        doc.status = "processing"
        return doc

    @staticmethod
    def _make_session(found_doc) -> AsyncMock:
        # Async-context-manager DB session whose execute() result resolves
        # scalar_one_or_none() to `found_doc` (or None for "not found").
        session = AsyncMock()
        query_result = MagicMock()
        query_result.scalar_one_or_none.return_value = found_doc
        session.execute = AsyncMock(return_value=query_result)
        session.commit = AsyncMock()
        session.__aenter__ = AsyncMock(return_value=session)
        session.__aexit__ = AsyncMock(return_value=False)
        return session

    @pytest.mark.asyncio
    async def test_file_upload_sets_status_ready(self) -> None:
        """Pipeline downloads file, extracts, chunks, embeds, stores, sets ready."""
        from orchestrator.tools.ingest import ingest_document_pipeline

        tenant_id = str(uuid.uuid4())
        document_id = str(uuid.uuid4())
        doc = self._make_document(document_id, tenant_id)

        with (
            patch("orchestrator.tools.ingest.async_session_factory") as mock_sf,
            patch("orchestrator.tools.ingest.engine"),
            patch("orchestrator.tools.ingest.configure_rls_hook"),
            patch("orchestrator.tools.ingest.current_tenant_id"),
            patch("orchestrator.tools.ingest._get_minio_client") as mock_minio,
            patch("orchestrator.tools.ingest.extract_text", return_value="Test content " * 50),
            patch("orchestrator.tools.ingest.embed_texts", return_value=[[0.1] * 384]),
        ):
            mock_sf.return_value = self._make_session(doc)

            # MinIO hands back the raw file bytes.
            storage = MagicMock()
            stored_object = MagicMock()
            stored_object.read.return_value = b"Test content " * 50
            storage.get_object.return_value = stored_object
            mock_minio.return_value = storage

            await ingest_document_pipeline(document_id, tenant_id)

            # A successful run marks the document ready and records chunk_count.
            assert doc.status == "ready"
            assert doc.chunk_count is not None

    @pytest.mark.asyncio
    async def test_pipeline_sets_error_on_exception(self) -> None:
        """Pipeline marks document as error when extraction fails."""
        from orchestrator.tools.ingest import ingest_document_pipeline

        tenant_id = str(uuid.uuid4())
        document_id = str(uuid.uuid4())
        doc = self._make_document(document_id, tenant_id)

        with (
            patch("orchestrator.tools.ingest.async_session_factory") as mock_sf,
            patch("orchestrator.tools.ingest.engine"),
            patch("orchestrator.tools.ingest.configure_rls_hook"),
            patch("orchestrator.tools.ingest.current_tenant_id"),
            patch("orchestrator.tools.ingest._get_minio_client") as mock_minio,
        ):
            mock_sf.return_value = self._make_session(doc)

            # The storage download blows up, forcing the error path.
            storage = MagicMock()
            storage.get_object.side_effect = Exception("MinIO connection failed")
            mock_minio.return_value = storage

            await ingest_document_pipeline(document_id, tenant_id)

            assert doc.status == "error"
            assert doc.error_message is not None

    @pytest.mark.asyncio
    async def test_document_not_found_is_no_op(self) -> None:
        """If document doesn't exist, pipeline exits gracefully."""
        from orchestrator.tools.ingest import ingest_document_pipeline

        tenant_id = str(uuid.uuid4())
        document_id = str(uuid.uuid4())

        with (
            patch("orchestrator.tools.ingest.async_session_factory") as mock_sf,
            patch("orchestrator.tools.ingest.engine"),
            patch("orchestrator.tools.ingest.configure_rls_hook"),
            patch("orchestrator.tools.ingest.current_tenant_id"),
        ):
            # Lookup finds nothing; the pipeline must not raise.
            mock_sf.return_value = self._make_session(None)

            await ingest_document_pipeline(document_id, tenant_id)
Reference in New Issue
Block a user