- Add supports_vision(model_name) to builder.py — detects vision-capable models (claude-3*, gpt-4o*, gpt-4-vision*, gemini-pro-vision*, gemini-1.5*, gemini-2*) with provider prefix stripping support
- Add generate_presigned_url(storage_key, expiry=3600) to builder.py — generates 1-hour MinIO presigned URLs via boto3 S3 client
- Add build_messages_with_media() to builder.py — extends build_messages_with_memory() with media injection: IMAGE -> image_url blocks for vision models / text fallback for non-vision models, DOCUMENT -> text reference with presigned URL
- image_url blocks use 'detail: auto' per OpenAI/LiteLLM multipart format
- Add 27 unit tests in test_multimodal_messages.py (TDD)
504 lines
19 KiB
Python
504 lines
19 KiB
Python
"""
Unit tests for multimodal LLM interpretation — image_url content block injection.

Tests:
- message with IMAGE MediaAttachment + vision model produces image_url content block
- message with IMAGE MediaAttachment + non-vision model produces text fallback "[Image attached: ...]"
- message with DOCUMENT MediaAttachment produces text reference with presigned URL
- message with no media produces standard text-only content (no regression)
- supports_vision returns True for "claude-3-sonnet", "gpt-4o", False for "gpt-3.5-turbo"
- presigned URL has correct format and expiry (mock boto3)
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
from unittest.mock import MagicMock, patch
|
|
|
|
import pytest
|
|
|
|
from shared.models.message import MediaAttachment, MediaType
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Test fixtures
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
def make_image_attachment(
    storage_key: str = "tenant-1/agent-1/msg-1/photo.jpg",
    filename: str = "photo.jpg",
    mime_type: str = "image/jpeg",
) -> MediaAttachment:
    """Build an IMAGE-typed MediaAttachment fixture.

    Args:
        storage_key: Object-store key recorded on the attachment.
        filename: File name recorded on the attachment.
        mime_type: MIME type recorded on the attachment.

    Returns:
        A MediaAttachment with media_type IMAGE and a fixed size of 10240 bytes.
    """
    # Collect the fields first so the fixture's shape is visible at a glance.
    fields = {
        "media_type": MediaType.IMAGE,
        "storage_key": storage_key,
        "mime_type": mime_type,
        "filename": filename,
        "size_bytes": 10240,
    }
    return MediaAttachment(**fields)
|
|
|
|
|
|
def make_document_attachment(
    storage_key: str = "tenant-1/agent-1/msg-1/report.pdf",
    filename: str = "report.pdf",
    mime_type: str = "application/pdf",
) -> MediaAttachment:
    """Build a DOCUMENT-typed MediaAttachment fixture.

    Args:
        storage_key: Object-store key recorded on the attachment.
        filename: File name recorded on the attachment.
        mime_type: MIME type recorded on the attachment.

    Returns:
        A MediaAttachment with media_type DOCUMENT and a fixed size of 204800 bytes.
    """
    # Collect the fields first so the fixture's shape is visible at a glance.
    fields = {
        "media_type": MediaType.DOCUMENT,
        "storage_key": storage_key,
        "mime_type": mime_type,
        "filename": filename,
        "size_bytes": 204800,
    }
    return MediaAttachment(**fields)
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Tests: supports_vision()
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
class TestSupportsVision:
    """Checks how supports_vision() classifies individual model names."""

    @staticmethod
    def _detect(model_name: str) -> bool:
        """Evaluate supports_vision() for *model_name*.

        The import happens inside the call (as each test originally did), so a
        missing builder module fails the individual test rather than collection.
        """
        from orchestrator.agents.builder import supports_vision

        return supports_vision(model_name)

    # --- models expected to be reported as vision-capable -------------------

    def test_claude_3_sonnet_supports_vision(self):
        """A dated claude-3-sonnet name is reported as vision-capable."""
        detected = self._detect("claude-3-sonnet-20240229")
        assert detected is True

    def test_claude_3_haiku_supports_vision(self):
        """A dated claude-3-haiku name is reported as vision-capable."""
        detected = self._detect("claude-3-haiku-20240307")
        assert detected is True

    def test_claude_3_opus_supports_vision(self):
        """A dated claude-3-opus name is reported as vision-capable."""
        detected = self._detect("claude-3-opus-20240229")
        assert detected is True

    def test_claude_3_5_sonnet_supports_vision(self):
        """A dated claude-3-5-sonnet name is reported as vision-capable."""
        detected = self._detect("claude-3-5-sonnet-20241022")
        assert detected is True

    def test_gpt_4o_supports_vision(self):
        """The bare gpt-4o name is reported as vision-capable."""
        detected = self._detect("gpt-4o")
        assert detected is True

    def test_gpt_4o_mini_supports_vision(self):
        """The gpt-4o-mini name is reported as vision-capable."""
        detected = self._detect("gpt-4o-mini")
        assert detected is True

    def test_gpt_4_vision_supports_vision(self):
        """The gpt-4-vision-preview name is reported as vision-capable."""
        detected = self._detect("gpt-4-vision-preview")
        assert detected is True

    def test_gemini_pro_vision_supports_vision(self):
        """The gemini-pro-vision name is reported as vision-capable."""
        detected = self._detect("gemini-pro-vision")
        assert detected is True

    def test_gemini_1_5_pro_supports_vision(self):
        """The gemini-1.5-pro name is reported as vision-capable."""
        detected = self._detect("gemini-1.5-pro")
        assert detected is True

    # --- models expected to be reported as text-only -------------------------

    def test_gpt_3_5_turbo_does_not_support_vision(self):
        """gpt-3.5-turbo must not be reported as vision-capable."""
        detected = self._detect("gpt-3.5-turbo")
        assert detected is False

    def test_gpt_4_does_not_support_vision(self):
        """Plain gpt-4 (no vision suffix) must not be reported as vision-capable."""
        detected = self._detect("gpt-4")
        assert detected is False

    def test_ollama_llama_does_not_support_vision(self):
        """ollama/llama3 must not be reported as vision-capable."""
        detected = self._detect("ollama/llama3")
        assert detected is False

    def test_empty_string_does_not_support_vision(self):
        """An empty model name must not be reported as vision-capable."""
        detected = self._detect("")
        assert detected is False

    # --- provider-prefixed names ---------------------------------------------

    def test_provider_prefixed_claude_3_supports_vision(self):
        """A provider-prefixed name (anthropic/claude-3-...) is still detected."""
        detected = self._detect("anthropic/claude-3-sonnet-20240229")
        assert detected is True
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Tests: generate_presigned_url()
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
class TestGeneratePresignedUrl:
    """Tests for the MinIO presigned URL generator.

    Every test patches ``boto3.client`` so no real S3/MinIO endpoint is
    contacted; assertions inspect the call recorded on the fake client's
    ``generate_presigned_url`` method.
    """

    # Positional slots in boto3's
    # ``generate_presigned_url(ClientMethod, Params=None, ExpiresIn=3600, ...)``.
    _PARAMS_POSITION = 1
    _EXPIRES_IN_POSITION = 2

    @staticmethod
    def _invoke(storage_key: str, return_url: str, **call_kwargs):
        """Call generate_presigned_url() against a mocked boto3 client.

        Args:
            storage_key: Key forwarded to the function under test.
            return_url: URL the fake S3 client hands back.
            **call_kwargs: Extra keyword arguments for the function under test
                (e.g. ``expiry=7200``).

        Returns:
            A ``(url, mock_s3)`` tuple: the function's return value and the
            fake client, for call inspection.
        """
        from orchestrator.agents.builder import generate_presigned_url

        mock_s3 = MagicMock()
        mock_s3.generate_presigned_url.return_value = return_url
        with patch("boto3.client", return_value=mock_s3):
            url = generate_presigned_url(storage_key, **call_kwargs)
        return url, mock_s3

    @staticmethod
    def _boto_argument(mock_s3: MagicMock, keyword: str, position: int):
        """Return one argument of the recorded boto3 call, however it was passed.

        Looks for *keyword* among the keyword arguments first, then falls back
        to the positional slot *position*. Returns None when the argument was
        not supplied either way.
        """
        # Fail with a clear message instead of a TypeError on a None call_args.
        mock_s3.generate_presigned_url.assert_called()
        args, kwargs = mock_s3.generate_presigned_url.call_args
        if keyword in kwargs:
            return kwargs[keyword]
        if len(args) > position:
            return args[position]
        return None

    def test_generate_presigned_url_returns_string(self):
        """generate_presigned_url returns a non-empty URL string."""
        url, _ = self._invoke(
            "tenant-1/agent-1/msg-1/photo.jpg",
            "http://minio:9000/konstruct-media/key?X-Amz-Signature=abc&X-Amz-Expires=3600",
        )

        assert isinstance(url, str)
        assert url

    def test_generate_presigned_url_default_expiry_1_hour(self):
        """Default expiry is 3600 seconds (1 hour)."""
        _, mock_s3 = self._invoke("some/key", "http://minio:9000/key?sig=x")

        expires_in = self._boto_argument(
            mock_s3, "ExpiresIn", self._EXPIRES_IN_POSITION
        )
        assert expires_in == 3600

    def test_generate_presigned_url_custom_expiry(self):
        """Custom expiry is passed through to the presigned URL generator."""
        _, mock_s3 = self._invoke(
            "some/key", "http://minio:9000/key?sig=x", expiry=7200
        )

        # Accept ExpiresIn as keyword OR positional; the previous fallback
        # re-read the keyword with a 3600 default and so could never succeed.
        expires_in = self._boto_argument(
            mock_s3, "ExpiresIn", self._EXPIRES_IN_POSITION
        )
        assert expires_in == 7200

    def test_generate_presigned_url_uses_correct_storage_key(self):
        """The correct storage key is passed to boto3 generate_presigned_url."""
        storage_key = "tenant-abc/agent-xyz/msg-123/document.pdf"

        _, mock_s3 = self._invoke(storage_key, "http://minio:9000/key")

        params = self._boto_argument(mock_s3, "Params", self._PARAMS_POSITION) or {}
        assert params.get("Key") == storage_key
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Tests: build_messages_with_media()
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
class TestBuildMessagesWithMedia:
    """Tests for the multimodal message building function.

    Each test builds messages through build_messages_with_media() with
    ``boto3.client`` patched to a fake S3 client, then inspects the final
    (current user) message.
    """

    def _make_agent_mock(self, model: str = "claude-3-sonnet-20240229") -> MagicMock:
        """Create a minimal Agent mock whose model_preference is *model*."""
        agent = MagicMock()
        agent.name = "Test Agent"
        agent.role = "Support"
        agent.persona = ""
        agent.system_prompt = "You are a helpful assistant."
        agent.model_preference = model
        return agent

    @staticmethod
    def _fake_s3(*presigned_urls: str) -> MagicMock:
        """Create a fake S3 client that hands out *presigned_urls*.

        A single URL is returned for every call; several URLs are returned one
        per call, in order.
        """
        s3_client = MagicMock()
        if len(presigned_urls) == 1:
            s3_client.generate_presigned_url.return_value = presigned_urls[0]
        else:
            s3_client.generate_presigned_url.side_effect = list(presigned_urls)
        return s3_client

    @staticmethod
    def _build(
        agent,
        current_message: str,
        media_attachments: list,
        s3_client: MagicMock,
        recent_messages: list | None = None,
        relevant_context: list | None = None,
    ) -> list:
        """Run build_messages_with_media() with boto3.client patched.

        Args:
            agent: Agent (mock) passed through unchanged.
            current_message: Text of the current user turn.
            media_attachments: Attachments for the current turn.
            s3_client: Object returned by the patched ``boto3.client``.
            recent_messages: Prior conversation turns; defaults to an empty list.
            relevant_context: Retrieved context strings; defaults to an empty list.

        Returns:
            The message list produced by build_messages_with_media().
        """
        from orchestrator.agents.builder import build_messages_with_media

        with patch("boto3.client", return_value=s3_client):
            return build_messages_with_media(
                agent=agent,
                current_message=current_message,
                media_attachments=media_attachments,
                recent_messages=[] if recent_messages is None else recent_messages,
                relevant_context=[] if relevant_context is None else relevant_context,
            )

    @staticmethod
    def _flatten_text(content) -> str:
        """Return message text whether *content* is a string or a block list."""
        if isinstance(content, list):
            return " ".join(block["text"] for block in content if block["type"] == "text")
        return content

    def test_no_media_returns_text_only_messages(self):
        """Message with no media produces standard text-only content — no regression."""
        messages = self._build(
            self._make_agent_mock(),
            "What is the weather?",
            [],
            self._fake_s3("http://minio/key"),
        )

        # Last message should be user role with plain string content
        user_message = messages[-1]
        assert user_message["role"] == "user"
        assert isinstance(user_message["content"], str)
        assert "What is the weather?" in user_message["content"]

    def test_image_attachment_with_vision_model_produces_image_url_block(self):
        """IMAGE attachment + vision model injects image_url content block."""
        presigned = "http://minio:9000/konstruct-media/tenant-1/agent-1/msg-1/photo.jpg?sig=abc"

        messages = self._build(
            self._make_agent_mock(model="claude-3-sonnet-20240229"),
            "What is in this image?",
            [make_image_attachment()],
            self._fake_s3(presigned),
        )

        user_message = messages[-1]
        assert user_message["role"] == "user"
        assert isinstance(user_message["content"], list), (
            "Expected multipart content list, got: " + repr(user_message["content"])
        )

        content_types = {block["type"] for block in user_message["content"]}
        assert "text" in content_types
        assert "image_url" in content_types

        image_block = next(b for b in user_message["content"] if b["type"] == "image_url")
        assert image_block["image_url"]["url"] == presigned

    def test_image_attachment_with_non_vision_model_produces_text_fallback(self):
        """IMAGE attachment + non-vision model produces text fallback, no image_url block."""
        messages = self._build(
            self._make_agent_mock(model="gpt-3.5-turbo"),
            "Check this out",
            [make_image_attachment(filename="screenshot.png")],
            self._fake_s3("http://minio/key"),
        )

        user_message = messages[-1]
        assert user_message["role"] == "user"
        # For non-vision: content is plain string, NOT a list
        assert isinstance(user_message["content"], str)
        assert "[Image attached: screenshot.png]" in user_message["content"]

    def test_document_attachment_produces_text_reference_with_presigned_url(self):
        """DOCUMENT attachment produces text reference including presigned URL."""
        presigned = "http://minio:9000/key/report.pdf?sig=xyz"

        messages = self._build(
            self._make_agent_mock(model="claude-3-sonnet-20240229"),
            "Summarize this document",
            [make_document_attachment(filename="report.pdf")],
            self._fake_s3(presigned),
        )

        user_message = messages[-1]
        assert user_message["role"] == "user"
        # Documents are always text-referenced (no image_url blocks);
        # content can be a string or a list containing text blocks.
        full_text = self._flatten_text(user_message["content"])

        assert "[Document attached: report.pdf" in full_text
        assert presigned in full_text

    def test_document_attachment_on_non_vision_model_produces_text_reference(self):
        """DOCUMENT attachment on non-vision model also produces text reference."""
        messages = self._build(
            self._make_agent_mock(model="gpt-3.5-turbo"),
            "Review these specs",
            [make_document_attachment(filename="specs.pdf")],
            self._fake_s3("http://minio:9000/key/specs.pdf?sig=abc"),
        )

        full_text = self._flatten_text(messages[-1]["content"])

        assert "[Document attached: specs.pdf" in full_text

    def test_image_without_storage_key_skipped_gracefully(self):
        """Image attachment with no storage_key is not injected as image_url block."""
        attachment = MediaAttachment(
            media_type=MediaType.IMAGE,
            storage_key=None,  # No storage key — file wasn't stored
            filename="orphaned.jpg",
            mime_type="image/jpeg",
        )

        messages = self._build(
            self._make_agent_mock(model="claude-3-sonnet-20240229"),
            "Here is an image",
            [attachment],
            self._fake_s3("http://minio/key"),
        )

        # Should not crash — just produce a message (possibly with fallback text)
        user_message = messages[-1]
        assert user_message["role"] == "user"

        # Verify the documented claim: with nothing stored there is no URL to
        # point at, so no image_url block may appear in the content.
        content = user_message["content"]
        if isinstance(content, list):
            assert all(block["type"] != "image_url" for block in content)

    def test_image_url_block_includes_detail_field(self):
        """image_url content block includes 'detail' field set to 'auto'."""
        messages = self._build(
            self._make_agent_mock(model="gpt-4o"),
            "Analyze this image",
            [make_image_attachment()],
            self._fake_s3("http://minio:9000/key/photo.jpg"),
        )

        content = messages[-1]["content"]
        assert isinstance(content, list)

        image_block = next(b for b in content if b["type"] == "image_url")
        assert "detail" in image_block["image_url"]
        assert image_block["image_url"]["detail"] == "auto"

    def test_multiple_images_all_injected_for_vision_model(self):
        """Multiple image attachments all get image_url blocks for vision models."""
        attachments = [
            make_image_attachment(storage_key="t/a/m/img1.png", filename="img1.png"),
            make_image_attachment(storage_key="t/a/m/img2.jpg", filename="img2.jpg"),
        ]
        # Return different presigned URLs for each call
        s3_client = self._fake_s3(
            "http://minio/key/img1.png",
            "http://minio/key/img2.jpg",
        )

        messages = self._build(
            self._make_agent_mock(model="gpt-4o"),
            "Compare these images",
            attachments,
            s3_client,
        )

        content = messages[-1]["content"]
        assert isinstance(content, list)

        image_blocks = [b for b in content if b["type"] == "image_url"]
        assert len(image_blocks) == 2

    def test_memory_context_still_injected_with_media(self):
        """Memory context (recent + relevant) is still injected when media is present."""
        recent = [
            {"role": "user", "content": "Hi"},
            {"role": "assistant", "content": "Hello!"},
        ]
        relevant = ["Past context item 1"]

        messages = self._build(
            self._make_agent_mock(model="claude-3-sonnet-20240229"),
            "Here is an image",
            [make_image_attachment()],
            self._fake_s3("http://minio/key/img.jpg"),
            recent_messages=recent,
            relevant_context=relevant,
        )

        # Expected layout: system, context system, recent[0], recent[1],
        # current user. The length check below is a lower bound only.
        assert len(messages) >= 4
        roles = [m["role"] for m in messages]
        assert roles.count("system") >= 2  # main system + pgvector context
        assert "user" in roles
        assert "assistant" in roles
|