"""
Unit tests for multimodal LLM interpretation — image_url content block injection.

Tests:
- message with IMAGE MediaAttachment + vision model produces image_url content block
- message with IMAGE MediaAttachment + non-vision model produces text fallback
  "[Image attached: ...]"
- message with DOCUMENT MediaAttachment produces text reference with presigned URL
- message with no media produces standard text-only content (no regression)
- supports_vision returns True for "claude-3-sonnet", "gpt-4o", False for "gpt-3.5-turbo"
- presigned URL has correct format and expiry (mock boto3)
"""

from __future__ import annotations

from unittest.mock import MagicMock, patch

import pytest

from shared.models.message import MediaAttachment, MediaType


# ---------------------------------------------------------------------------
# Test fixtures
# ---------------------------------------------------------------------------


def make_image_attachment(
    storage_key: str = "tenant-1/agent-1/msg-1/photo.jpg",
    filename: str = "photo.jpg",
    mime_type: str = "image/jpeg",
) -> MediaAttachment:
    """Create a sample IMAGE MediaAttachment (10 KiB) with overridable key/name/MIME."""
    return MediaAttachment(
        media_type=MediaType.IMAGE,
        storage_key=storage_key,
        mime_type=mime_type,
        filename=filename,
        size_bytes=10240,
    )


def make_document_attachment(
    storage_key: str = "tenant-1/agent-1/msg-1/report.pdf",
    filename: str = "report.pdf",
    mime_type: str = "application/pdf",
) -> MediaAttachment:
    """Create a sample DOCUMENT MediaAttachment (200 KiB) with overridable key/name/MIME."""
    return MediaAttachment(
        media_type=MediaType.DOCUMENT,
        storage_key=storage_key,
        mime_type=mime_type,
        filename=filename,
        size_bytes=204800,
    )


def _presign_call_arg(mock_s3: MagicMock, name: str, position: int) -> object:
    """Return one argument of the most recent ``generate_presigned_url`` call.

    The argument is looked up by keyword first and by position second, so the
    tests do not depend on which calling style the code under test uses.

    Args:
        mock_s3: The mocked S3 client whose ``generate_presigned_url`` was called.
        name: Keyword name of the argument (e.g. ``"ExpiresIn"``).
        position: Positional index of the same argument. With boto3's documented
            signature ``(ClientMethod, Params, ExpiresIn, HttpMethod)`` that is
            1 for ``Params`` and 2 for ``ExpiresIn``.

    Returns:
        The argument value, or ``None`` when it was not passed at all.
    """
    call = mock_s3.generate_presigned_url.call_args
    assert call is not None, "generate_presigned_url was never called"
    args, kwargs = call
    if name in kwargs:
        return kwargs[name]
    if len(args) > position:
        return args[position]
    return None


def _message_text(content: str | list[dict]) -> str:
    """Flatten message content to plain text.

    String content is returned unchanged; for multipart content the ``text``
    fields of all ``"text"`` blocks are joined with single spaces.
    """
    if isinstance(content, list):
        return " ".join(block["text"] for block in content if block["type"] == "text")
    return content


# ---------------------------------------------------------------------------
# Tests: supports_vision()
# ---------------------------------------------------------------------------


class TestSupportsVision:
    """Tests for the supports_vision model detection function."""

    def test_claude_3_sonnet_supports_vision(self) -> None:
        """claude-3-sonnet is a vision-capable model."""
        from orchestrator.agents.builder import supports_vision

        assert supports_vision("claude-3-sonnet-20240229") is True

    def test_claude_3_haiku_supports_vision(self) -> None:
        """claude-3-haiku is a vision-capable model."""
        from orchestrator.agents.builder import supports_vision

        assert supports_vision("claude-3-haiku-20240307") is True

    def test_claude_3_opus_supports_vision(self) -> None:
        """claude-3-opus is a vision-capable model."""
        from orchestrator.agents.builder import supports_vision

        assert supports_vision("claude-3-opus-20240229") is True

    def test_claude_3_5_sonnet_supports_vision(self) -> None:
        """claude-3-5-sonnet is a vision-capable model."""
        from orchestrator.agents.builder import supports_vision

        assert supports_vision("claude-3-5-sonnet-20241022") is True

    def test_gpt_4o_supports_vision(self) -> None:
        """gpt-4o is a vision-capable model."""
        from orchestrator.agents.builder import supports_vision

        assert supports_vision("gpt-4o") is True

    def test_gpt_4o_mini_supports_vision(self) -> None:
        """gpt-4o-mini is a vision-capable model."""
        from orchestrator.agents.builder import supports_vision

        assert supports_vision("gpt-4o-mini") is True

    def test_gpt_4_vision_supports_vision(self) -> None:
        """gpt-4-vision-preview is a vision-capable model."""
        from orchestrator.agents.builder import supports_vision

        assert supports_vision("gpt-4-vision-preview") is True

    def test_gemini_pro_vision_supports_vision(self) -> None:
        """gemini-pro-vision is a vision-capable model."""
        from orchestrator.agents.builder import supports_vision

        assert supports_vision("gemini-pro-vision") is True

    def test_gemini_1_5_pro_supports_vision(self) -> None:
        """gemini-1.5-pro is a vision-capable model."""
        from orchestrator.agents.builder import supports_vision

        assert supports_vision("gemini-1.5-pro") is True

    def test_gpt_3_5_turbo_does_not_support_vision(self) -> None:
        """gpt-3.5-turbo is NOT a vision-capable model."""
        from orchestrator.agents.builder import supports_vision

        assert supports_vision("gpt-3.5-turbo") is False

    def test_gpt_4_does_not_support_vision(self) -> None:
        """gpt-4 (without vision suffix) is NOT a vision-capable model."""
        from orchestrator.agents.builder import supports_vision

        assert supports_vision("gpt-4") is False

    def test_ollama_llama_does_not_support_vision(self) -> None:
        """ollama/llama3 is NOT a vision-capable model."""
        from orchestrator.agents.builder import supports_vision

        assert supports_vision("ollama/llama3") is False

    def test_empty_string_does_not_support_vision(self) -> None:
        """Empty model string is not a vision model."""
        from orchestrator.agents.builder import supports_vision

        assert supports_vision("") is False

    def test_provider_prefixed_claude_3_supports_vision(self) -> None:
        """anthropic/claude-3-sonnet-20240229 should still be detected as vision."""
        from orchestrator.agents.builder import supports_vision

        assert supports_vision("anthropic/claude-3-sonnet-20240229") is True


# ---------------------------------------------------------------------------
# Tests: generate_presigned_url()
# ---------------------------------------------------------------------------


class TestGeneratePresignedUrl:
    """Tests for the MinIO presigned URL generator."""

    def test_generate_presigned_url_returns_string(self) -> None:
        """generate_presigned_url returns a non-empty URL string."""
        from orchestrator.agents.builder import generate_presigned_url

        mock_s3 = MagicMock()
        mock_s3.generate_presigned_url.return_value = (
            "http://minio:9000/konstruct-media/key?X-Amz-Signature=abc&X-Amz-Expires=3600"
        )

        with patch("boto3.client", return_value=mock_s3):
            url = generate_presigned_url("tenant-1/agent-1/msg-1/photo.jpg")

        assert isinstance(url, str)
        assert len(url) > 0

    def test_generate_presigned_url_default_expiry_1_hour(self) -> None:
        """Default expiry is 3600 seconds (1 hour)."""
        from orchestrator.agents.builder import generate_presigned_url

        mock_s3 = MagicMock()
        mock_s3.generate_presigned_url.return_value = "http://minio:9000/key?sig=x"

        with patch("boto3.client", return_value=mock_s3):
            generate_presigned_url("some/key")

        # The expiry must be passed explicitly, either by keyword or position.
        assert _presign_call_arg(mock_s3, "ExpiresIn", position=2) == 3600

    def test_generate_presigned_url_custom_expiry(self) -> None:
        """Custom expiry is passed through to the presigned URL generator."""
        from orchestrator.agents.builder import generate_presigned_url

        mock_s3 = MagicMock()
        mock_s3.generate_presigned_url.return_value = "http://minio:9000/key?sig=x"

        with patch("boto3.client", return_value=mock_s3):
            generate_presigned_url("some/key", expiry=7200)

        # Accept the expiry whether it was forwarded by keyword or positionally.
        assert _presign_call_arg(mock_s3, "ExpiresIn", position=2) == 7200

    def test_generate_presigned_url_uses_correct_storage_key(self) -> None:
        """The correct storage key is passed to boto3 generate_presigned_url."""
        from orchestrator.agents.builder import generate_presigned_url

        mock_s3 = MagicMock()
        mock_s3.generate_presigned_url.return_value = "http://minio:9000/key"
        storage_key = "tenant-abc/agent-xyz/msg-123/document.pdf"

        with patch("boto3.client", return_value=mock_s3):
            generate_presigned_url(storage_key)

        params = _presign_call_arg(mock_s3, "Params", position=1) or {}
        assert params.get("Key") == storage_key


# ---------------------------------------------------------------------------
# Tests: build_messages_with_media()
# ---------------------------------------------------------------------------


class TestBuildMessagesWithMedia:
    """Tests for the multimodal message building function."""

    def _make_agent_mock(self, model: str = "claude-3-sonnet-20240229") -> MagicMock:
        """Create a minimal Agent mock whose ``model_preference`` is *model*."""
        agent = MagicMock()
        agent.name = "Test Agent"
        agent.role = "Support"
        agent.persona = ""
        agent.system_prompt = "You are a helpful assistant."
        agent.model_preference = model
        return agent

    def test_no_media_returns_text_only_messages(self) -> None:
        """Message with no media produces standard text-only content — no regression."""
        from orchestrator.agents.builder import build_messages_with_media

        agent = self._make_agent_mock()
        mock_s3 = MagicMock()
        mock_s3.generate_presigned_url.return_value = "http://minio/key"

        with patch("boto3.client", return_value=mock_s3):
            messages = build_messages_with_media(
                agent=agent,
                current_message="What is the weather?",
                media_attachments=[],
                recent_messages=[],
                relevant_context=[],
            )

        # Last message should be user role with plain string content
        user_message = messages[-1]
        assert user_message["role"] == "user"
        assert isinstance(user_message["content"], str)
        assert "What is the weather?" in user_message["content"]

    def test_image_attachment_with_vision_model_produces_image_url_block(self) -> None:
        """IMAGE attachment + vision model injects image_url content block."""
        from orchestrator.agents.builder import build_messages_with_media

        agent = self._make_agent_mock(model="claude-3-sonnet-20240229")
        attachment = make_image_attachment()
        mock_s3 = MagicMock()
        presigned = "http://minio:9000/konstruct-media/tenant-1/agent-1/msg-1/photo.jpg?sig=abc"
        mock_s3.generate_presigned_url.return_value = presigned

        with patch("boto3.client", return_value=mock_s3):
            messages = build_messages_with_media(
                agent=agent,
                current_message="What is in this image?",
                media_attachments=[attachment],
                recent_messages=[],
                relevant_context=[],
            )

        user_message = messages[-1]
        assert user_message["role"] == "user"
        assert isinstance(user_message["content"], list), (
            "Expected multipart content list, got: " + repr(user_message["content"])
        )

        content_types = {block["type"] for block in user_message["content"]}
        assert "text" in content_types
        assert "image_url" in content_types

        image_block = next(b for b in user_message["content"] if b["type"] == "image_url")
        assert image_block["image_url"]["url"] == presigned

    def test_image_attachment_with_non_vision_model_produces_text_fallback(self) -> None:
        """IMAGE attachment + non-vision model produces text fallback, no image_url block."""
        from orchestrator.agents.builder import build_messages_with_media

        agent = self._make_agent_mock(model="gpt-3.5-turbo")
        attachment = make_image_attachment(filename="screenshot.png")
        mock_s3 = MagicMock()
        mock_s3.generate_presigned_url.return_value = "http://minio/key"

        with patch("boto3.client", return_value=mock_s3):
            messages = build_messages_with_media(
                agent=agent,
                current_message="Check this out",
                media_attachments=[attachment],
                recent_messages=[],
                relevant_context=[],
            )

        user_message = messages[-1]
        assert user_message["role"] == "user"
        # For non-vision: content is plain string, NOT a list
        assert isinstance(user_message["content"], str)
        assert "[Image attached: screenshot.png]" in user_message["content"]

    def test_document_attachment_produces_text_reference_with_presigned_url(self) -> None:
        """DOCUMENT attachment produces text reference including presigned URL."""
        from orchestrator.agents.builder import build_messages_with_media

        agent = self._make_agent_mock(model="claude-3-sonnet-20240229")
        attachment = make_document_attachment(filename="report.pdf")
        mock_s3 = MagicMock()
        presigned = "http://minio:9000/key/report.pdf?sig=xyz"
        mock_s3.generate_presigned_url.return_value = presigned

        with patch("boto3.client", return_value=mock_s3):
            messages = build_messages_with_media(
                agent=agent,
                current_message="Summarize this document",
                media_attachments=[attachment],
                recent_messages=[],
                relevant_context=[],
            )

        user_message = messages[-1]
        assert user_message["role"] == "user"

        # Documents are always text-referenced (no image_url blocks).
        # Content can be a string or a list containing text blocks.
        full_text = _message_text(user_message["content"])
        assert "[Document attached: report.pdf" in full_text
        assert presigned in full_text

    def test_document_attachment_on_non_vision_model_produces_text_reference(self) -> None:
        """DOCUMENT attachment on non-vision model also produces text reference."""
        from orchestrator.agents.builder import build_messages_with_media

        agent = self._make_agent_mock(model="gpt-3.5-turbo")
        attachment = make_document_attachment(filename="specs.pdf")
        mock_s3 = MagicMock()
        presigned = "http://minio:9000/key/specs.pdf?sig=abc"
        mock_s3.generate_presigned_url.return_value = presigned

        with patch("boto3.client", return_value=mock_s3):
            messages = build_messages_with_media(
                agent=agent,
                current_message="Review these specs",
                media_attachments=[attachment],
                recent_messages=[],
                relevant_context=[],
            )

        user_message = messages[-1]
        full_text = _message_text(user_message["content"])
        assert "[Document attached: specs.pdf" in full_text

    def test_image_without_storage_key_skipped_gracefully(self) -> None:
        """Image attachment with no storage_key is not injected as image_url block."""
        from orchestrator.agents.builder import build_messages_with_media

        agent = self._make_agent_mock(model="claude-3-sonnet-20240229")
        attachment = MediaAttachment(
            media_type=MediaType.IMAGE,
            storage_key=None,  # No storage key — file wasn't stored
            filename="orphaned.jpg",
            mime_type="image/jpeg",
        )
        mock_s3 = MagicMock()
        mock_s3.generate_presigned_url.return_value = "http://minio/key"

        with patch("boto3.client", return_value=mock_s3):
            messages = build_messages_with_media(
                agent=agent,
                current_message="Here is an image",
                media_attachments=[attachment],
                recent_messages=[],
                relevant_context=[],
            )

        # Should not crash — just produce a message (possibly with fallback text)
        user_message = messages[-1]
        assert user_message["role"] == "user"

        # Whatever shape the content takes, an unstored image must never be
        # surfaced as an image_url block (there is nothing to point the URL at).
        content = user_message["content"]
        if isinstance(content, list):
            assert all(block["type"] != "image_url" for block in content), (
                "Image without storage_key must not produce an image_url block"
            )

    def test_image_url_block_includes_detail_field(self) -> None:
        """image_url content block includes 'detail' field set to 'auto'."""
        from orchestrator.agents.builder import build_messages_with_media

        agent = self._make_agent_mock(model="gpt-4o")
        attachment = make_image_attachment()
        mock_s3 = MagicMock()
        mock_s3.generate_presigned_url.return_value = "http://minio:9000/key/photo.jpg"

        with patch("boto3.client", return_value=mock_s3):
            messages = build_messages_with_media(
                agent=agent,
                current_message="Analyze this image",
                media_attachments=[attachment],
                recent_messages=[],
                relevant_context=[],
            )

        user_message = messages[-1]
        content = user_message["content"]
        assert isinstance(content, list)

        image_block = next(b for b in content if b["type"] == "image_url")
        assert "detail" in image_block["image_url"]
        assert image_block["image_url"]["detail"] == "auto"

    def test_multiple_images_all_injected_for_vision_model(self) -> None:
        """Multiple image attachments all get image_url blocks for vision models."""
        from orchestrator.agents.builder import build_messages_with_media

        agent = self._make_agent_mock(model="gpt-4o")
        attachments = [
            make_image_attachment(storage_key="t/a/m/img1.png", filename="img1.png"),
            make_image_attachment(storage_key="t/a/m/img2.jpg", filename="img2.jpg"),
        ]
        mock_s3 = MagicMock()
        # Return different presigned URLs for each call
        mock_s3.generate_presigned_url.side_effect = [
            "http://minio/key/img1.png",
            "http://minio/key/img2.jpg",
        ]

        with patch("boto3.client", return_value=mock_s3):
            messages = build_messages_with_media(
                agent=agent,
                current_message="Compare these images",
                media_attachments=attachments,
                recent_messages=[],
                relevant_context=[],
            )

        user_message = messages[-1]
        content = user_message["content"]
        assert isinstance(content, list)

        image_blocks = [b for b in content if b["type"] == "image_url"]
        assert len(image_blocks) == 2

    def test_memory_context_still_injected_with_media(self) -> None:
        """Memory context (recent + relevant) is still injected when media is present."""
        from orchestrator.agents.builder import build_messages_with_media

        agent = self._make_agent_mock(model="claude-3-sonnet-20240229")
        attachment = make_image_attachment()
        mock_s3 = MagicMock()
        mock_s3.generate_presigned_url.return_value = "http://minio/key/img.jpg"

        recent = [
            {"role": "user", "content": "Hi"},
            {"role": "assistant", "content": "Hello!"},
        ]
        relevant = ["Past context item 1"]

        with patch("boto3.client", return_value=mock_s3):
            messages = build_messages_with_media(
                agent=agent,
                current_message="Here is an image",
                media_attachments=[attachment],
                recent_messages=recent,
                relevant_context=relevant,
            )

        # Should have: system, context system, recent[0], recent[1], current user
        assert len(messages) >= 4
        roles = [m["role"] for m in messages]
        assert roles.count("system") >= 2  # main system + pgvector context
        assert "user" in roles
        assert "assistant" in roles