"""
Unit tests for the KB ingestion pipeline.

Tests:
  - chunk_text: sliding window chunker produces correctly-sized, overlapping chunks
  - ingest_document_pipeline: downloads file from MinIO, extracts, chunks, embeds, stores
  - ingest_document_pipeline: sets status='error' on failure
  - ingest_document_pipeline: returns without raising when the document is not found
"""

from __future__ import annotations

import uuid
from unittest.mock import AsyncMock, MagicMock, patch

import pytest


def _make_mock_document(document_id: str, tenant_id: str) -> MagicMock:
    """Build a stand-in document row in the 'processing' state.

    ``chunk_count`` and ``error_message`` are explicitly set to ``None``.
    A bare MagicMock would auto-create both attributes as non-None child
    mocks on first access, which would make any ``is not None`` assertion
    pass even if the pipeline never wrote to them.
    """
    doc = MagicMock()
    doc.id = uuid.UUID(document_id)
    doc.tenant_id = uuid.UUID(tenant_id)
    doc.filename = "test.txt"
    doc.source_url = None
    doc.status = "processing"
    doc.chunk_count = None
    doc.error_message = None
    return doc


def _make_mock_session(document: MagicMock | None) -> AsyncMock:
    """Build an async-context-manager session whose query yields *document*.

    ``execute`` resolves to a result object whose ``scalar_one_or_none()``
    returns *document* (pass ``None`` to simulate a missing row). The session
    returns itself from ``__aenter__`` so ``async with factory() as s`` works.
    """
    query_result = MagicMock()
    query_result.scalar_one_or_none.return_value = document

    session = AsyncMock()
    session.execute = AsyncMock(return_value=query_result)
    session.commit = AsyncMock()
    session.__aenter__ = AsyncMock(return_value=session)
    session.__aexit__ = AsyncMock(return_value=False)
    return session


class TestChunkText:
    """Behaviour of the ``chunk_text`` chunker."""

    def test_basic_chunking(self) -> None:
        """Every produced chunk respects the requested maximum size."""
        from orchestrator.tools.ingest import chunk_text

        text = "a" * 1000
        chunks = chunk_text(text, chunk_size=100, overlap=10)

        assert len(chunks) > 0
        for chunk in chunks:
            assert len(chunk) <= 100

    def test_overlap_between_chunks(self) -> None:
        """Consecutive chunks share characters when overlap > 0."""
        from orchestrator.tools.ingest import chunk_text

        # Create text with identifiable segments
        text = "AAAA" * 50 + "BBBB" * 50  # 400 chars
        chunks = chunk_text(text, chunk_size=200, overlap=50)

        assert len(chunks) >= 2
        # Overlapping windows repeat characters, so the chunks taken together
        # must be longer than the source text. This relies on the chunker
        # being the sliding-window one described in the module docstring.
        assert sum(len(chunk) for chunk in chunks) > len(text)

    def test_short_text_returns_one_chunk(self) -> None:
        """Text shorter than chunk_size comes back as a single, unmodified chunk."""
        from orchestrator.tools.ingest import chunk_text

        text = "Hello world"
        chunks = chunk_text(text, chunk_size=500, overlap=50)

        assert len(chunks) == 1
        assert chunks[0] == "Hello world"

    def test_empty_text_returns_empty_list(self) -> None:
        """An empty string produces no chunks."""
        from orchestrator.tools.ingest import chunk_text

        chunks = chunk_text("", chunk_size=500, overlap=50)

        assert chunks == []

    def test_whitespace_only_returns_empty_list(self) -> None:
        """Whitespace-only input produces no chunks."""
        from orchestrator.tools.ingest import chunk_text

        chunks = chunk_text(" \n ", chunk_size=500, overlap=50)

        assert chunks == []

    def test_default_parameters(self) -> None:
        """Calling without explicit sizes still splits long text into bounded chunks."""
        from orchestrator.tools.ingest import chunk_text

        text = "word " * 500  # 2500 chars
        chunks = chunk_text(text)

        assert len(chunks) > 1
        # Default chunk_size is 500
        for chunk in chunks:
            assert len(chunk) <= 500


class TestIngestDocumentPipeline:
    """Behaviour of ``ingest_document_pipeline`` with all I/O collaborators patched."""

    @pytest.mark.asyncio
    async def test_file_upload_sets_status_ready(self) -> None:
        """Pipeline downloads file, extracts, chunks, embeds, stores, sets ready."""
        from orchestrator.tools.ingest import ingest_document_pipeline

        tenant_id = str(uuid.uuid4())
        document_id = str(uuid.uuid4())
        mock_doc = _make_mock_document(document_id, tenant_id)

        with (
            patch("orchestrator.tools.ingest.async_session_factory") as mock_sf,
            patch("orchestrator.tools.ingest.engine"),
            patch("orchestrator.tools.ingest.configure_rls_hook"),
            patch("orchestrator.tools.ingest.current_tenant_id"),
            patch("orchestrator.tools.ingest._get_minio_client") as mock_minio,
            patch("orchestrator.tools.ingest.extract_text", return_value="Test content " * 50),
            patch("orchestrator.tools.ingest.embed_texts", return_value=[[0.1] * 384]),
        ):
            mock_sf.return_value = _make_mock_session(mock_doc)

            # MinIO returns file bytes
            minio_client = MagicMock()
            response_obj = MagicMock()
            response_obj.read.return_value = b"Test content " * 50
            minio_client.get_object.return_value = response_obj
            mock_minio.return_value = minio_client

            await ingest_document_pipeline(document_id, tenant_id)

        # Status should be set to 'ready' on the document
        assert mock_doc.status == "ready"
        # chunk_count starts as None, so this only passes if the pipeline set it.
        assert mock_doc.chunk_count is not None

    @pytest.mark.asyncio
    async def test_pipeline_sets_error_on_exception(self) -> None:
        """Pipeline marks document as error when the MinIO download fails."""
        from orchestrator.tools.ingest import ingest_document_pipeline

        tenant_id = str(uuid.uuid4())
        document_id = str(uuid.uuid4())
        mock_doc = _make_mock_document(document_id, tenant_id)

        with (
            patch("orchestrator.tools.ingest.async_session_factory") as mock_sf,
            patch("orchestrator.tools.ingest.engine"),
            patch("orchestrator.tools.ingest.configure_rls_hook"),
            patch("orchestrator.tools.ingest.current_tenant_id"),
            patch("orchestrator.tools.ingest._get_minio_client") as mock_minio,
        ):
            mock_sf.return_value = _make_mock_session(mock_doc)

            # MinIO raises an error
            minio_client = MagicMock()
            minio_client.get_object.side_effect = Exception("MinIO connection failed")
            mock_minio.return_value = minio_client

            await ingest_document_pipeline(document_id, tenant_id)

        assert mock_doc.status == "error"
        # error_message starts as None, so this only passes if the pipeline set it.
        assert mock_doc.error_message is not None

    @pytest.mark.asyncio
    async def test_document_not_found_is_no_op(self) -> None:
        """If document doesn't exist, pipeline exits gracefully."""
        from orchestrator.tools.ingest import ingest_document_pipeline

        tenant_id = str(uuid.uuid4())
        document_id = str(uuid.uuid4())

        with (
            patch("orchestrator.tools.ingest.async_session_factory") as mock_sf,
            patch("orchestrator.tools.ingest.engine"),
            patch("orchestrator.tools.ingest.configure_rls_hook"),
            patch("orchestrator.tools.ingest.current_tenant_id"),
        ):
            # Lookup yields no row (document not found)
            mock_sf.return_value = _make_mock_session(None)

            # Should not raise
            await ingest_document_pipeline(document_id, tenant_id)