feat(10-01): Celery ingestion task, executor injection, KB search wiring

- Add ingest_document Celery task (sync def + asyncio.run per arch constraint)
- Add ingest_document_pipeline: MinIO download, extract, chunk, embed, store
- Add chunk_text sliding window chunker (500 chars default, 50 overlap)
- Update execute_tool to inject tenant_id/agent_id into all tool handler kwargs
- Update web_search to use settings.brave_api_key (shared config) not os.getenv
- Unit tests: test_ingestion.py (9 tests) and test_executor_injection.py (5 tests) all pass
This commit is contained in:
2026-03-26 09:09:36 -06:00
parent 08572fcc40
commit 9c7686a7b4
6 changed files with 748 additions and 4 deletions

View File

@@ -0,0 +1,183 @@
"""
Unit tests for the KB ingestion pipeline.
Tests:
- chunk_text: sliding window chunker produces correctly-sized, overlapping chunks
- ingest_document_pipeline: downloads file from MinIO, extracts, chunks, embeds, stores
- ingest_document_pipeline: sets status='error' on failure
"""
from __future__ import annotations
import uuid
from unittest.mock import AsyncMock, MagicMock, patch
import pytest
class TestChunkText:
def test_basic_chunking(self) -> None:
from orchestrator.tools.ingest import chunk_text
text = "a" * 1000
chunks = chunk_text(text, chunk_size=100, overlap=10)
assert len(chunks) > 0
for chunk in chunks:
assert len(chunk) <= 100
def test_overlap_between_chunks(self) -> None:
from orchestrator.tools.ingest import chunk_text
# Create text with identifiable segments
text = "AAAA" * 50 + "BBBB" * 50 # 400 chars
chunks = chunk_text(text, chunk_size=200, overlap=50)
# With overlap=50, consecutive chunks should share chars
assert len(chunks) >= 2
def test_short_text_returns_one_chunk(self) -> None:
from orchestrator.tools.ingest import chunk_text
text = "Hello world"
chunks = chunk_text(text, chunk_size=500, overlap=50)
assert len(chunks) == 1
assert chunks[0] == "Hello world"
def test_empty_text_returns_empty_list(self) -> None:
from orchestrator.tools.ingest import chunk_text
chunks = chunk_text("", chunk_size=500, overlap=50)
assert chunks == []
def test_whitespace_only_returns_empty_list(self) -> None:
from orchestrator.tools.ingest import chunk_text
chunks = chunk_text(" \n ", chunk_size=500, overlap=50)
assert chunks == []
def test_default_parameters(self) -> None:
from orchestrator.tools.ingest import chunk_text
text = "word " * 500 # 2500 chars
chunks = chunk_text(text)
assert len(chunks) > 1
# Default chunk_size is 500
for chunk in chunks:
assert len(chunk) <= 500
class TestIngestDocumentPipeline:
@pytest.mark.asyncio
async def test_file_upload_sets_status_ready(self) -> None:
"""Pipeline downloads file, extracts, chunks, embeds, stores, sets ready."""
from orchestrator.tools.ingest import ingest_document_pipeline
tenant_id = str(uuid.uuid4())
document_id = str(uuid.uuid4())
mock_doc = MagicMock()
mock_doc.id = uuid.UUID(document_id)
mock_doc.tenant_id = uuid.UUID(tenant_id)
mock_doc.filename = "test.txt"
mock_doc.source_url = None
mock_doc.status = "processing"
with (
patch("orchestrator.tools.ingest.async_session_factory") as mock_sf,
patch("orchestrator.tools.ingest.engine"),
patch("orchestrator.tools.ingest.configure_rls_hook"),
patch("orchestrator.tools.ingest.current_tenant_id"),
patch("orchestrator.tools.ingest._get_minio_client") as mock_minio,
patch("orchestrator.tools.ingest.extract_text", return_value="Test content " * 50) as mock_extract,
patch("orchestrator.tools.ingest.embed_texts", return_value=[[0.1] * 384]) as mock_embed,
):
mock_session = AsyncMock()
mock_result = MagicMock()
mock_result.scalar_one_or_none.return_value = mock_doc
mock_session.execute = AsyncMock(return_value=mock_result)
mock_session.commit = AsyncMock()
mock_session.__aenter__ = AsyncMock(return_value=mock_session)
mock_session.__aexit__ = AsyncMock(return_value=False)
mock_sf.return_value = mock_session
# MinIO returns file bytes
minio_client = MagicMock()
response_obj = MagicMock()
response_obj.read.return_value = b"Test content " * 50
minio_client.get_object.return_value = response_obj
mock_minio.return_value = minio_client
await ingest_document_pipeline(document_id, tenant_id)
# Status should be set to 'ready' on the document
assert mock_doc.status == "ready"
assert mock_doc.chunk_count is not None
@pytest.mark.asyncio
async def test_pipeline_sets_error_on_exception(self) -> None:
"""Pipeline marks document as error when extraction fails."""
from orchestrator.tools.ingest import ingest_document_pipeline
tenant_id = str(uuid.uuid4())
document_id = str(uuid.uuid4())
mock_doc = MagicMock()
mock_doc.id = uuid.UUID(document_id)
mock_doc.tenant_id = uuid.UUID(tenant_id)
mock_doc.filename = "test.txt"
mock_doc.source_url = None
mock_doc.status = "processing"
with (
patch("orchestrator.tools.ingest.async_session_factory") as mock_sf,
patch("orchestrator.tools.ingest.engine"),
patch("orchestrator.tools.ingest.configure_rls_hook"),
patch("orchestrator.tools.ingest.current_tenant_id"),
patch("orchestrator.tools.ingest._get_minio_client") as mock_minio,
):
mock_session = AsyncMock()
mock_result = MagicMock()
mock_result.scalar_one_or_none.return_value = mock_doc
mock_session.execute = AsyncMock(return_value=mock_result)
mock_session.commit = AsyncMock()
mock_session.__aenter__ = AsyncMock(return_value=mock_session)
mock_session.__aexit__ = AsyncMock(return_value=False)
mock_sf.return_value = mock_session
# MinIO raises an error
minio_client = MagicMock()
minio_client.get_object.side_effect = Exception("MinIO connection failed")
mock_minio.return_value = minio_client
await ingest_document_pipeline(document_id, tenant_id)
assert mock_doc.status == "error"
assert mock_doc.error_message is not None
@pytest.mark.asyncio
async def test_document_not_found_is_no_op(self) -> None:
"""If document doesn't exist, pipeline exits gracefully."""
from orchestrator.tools.ingest import ingest_document_pipeline
tenant_id = str(uuid.uuid4())
document_id = str(uuid.uuid4())
with (
patch("orchestrator.tools.ingest.async_session_factory") as mock_sf,
patch("orchestrator.tools.ingest.engine"),
patch("orchestrator.tools.ingest.configure_rls_hook"),
patch("orchestrator.tools.ingest.current_tenant_id"),
):
mock_session = AsyncMock()
mock_result = MagicMock()
mock_result.scalar_one_or_none.return_value = None # Not found
mock_session.execute = AsyncMock(return_value=mock_result)
mock_session.__aenter__ = AsyncMock(return_value=mock_session)
mock_session.__aexit__ = AsyncMock(return_value=False)
mock_sf.return_value = mock_session
# Should not raise
await ingest_document_pipeline(document_id, tenant_id)