diff --git a/packages/orchestrator/orchestrator/agents/builder.py b/packages/orchestrator/orchestrator/agents/builder.py index 27f57b0..09fd98e 100644 --- a/packages/orchestrator/orchestrator/agents/builder.py +++ b/packages/orchestrator/orchestrator/agents/builder.py @@ -19,12 +19,119 @@ Memory-enriched message assembly: has background context without it polluting the conversation flow. 2. Short-term context: recent messages (Redis sliding window) Represents the immediate conversation history in this session. + +Multimodal message assembly: + build_messages_with_media() extends build_messages_with_memory() by injecting: + 1. IMAGE attachments: generates MinIO presigned URLs and injects image_url + content blocks for vision-capable models (Claude 3+, GPT-4o, Gemini 1.5+). + For non-vision models: falls back to "[Image attached: {filename}]" text. + 2. DOCUMENT attachments: always text-referenced as "[Document attached: {filename} - {url}]" + (PDFs cannot be passed as image_url blocks). + +supports_vision(model_name) returns True for known vision-capable models: + claude-3*, gpt-4o*, gpt-4-vision*, gemini-pro-vision*, gemini-1.5*, gemini-2* + +generate_presigned_url(storage_key) generates a 1-hour MinIO presigned URL + for the given object key. """ from __future__ import annotations +import logging +import re + +from shared.models.message import MediaAttachment, MediaType from shared.models.tenant import Agent +logger = logging.getLogger(__name__) + +# --------------------------------------------------------------------------- +# Vision model detection +# --------------------------------------------------------------------------- + +# Regex patterns for vision-capable model names (case-insensitive). +# Matches on the model name AFTER stripping any provider prefix (e.g. "anthropic/"). 
_VISION_PATTERNS: list[re.Pattern[str]] = [
    re.compile(r"claude-3", re.IGNORECASE),
    # Claude 4-family names put the tier before the version number
    # (e.g. "claude-sonnet-4-20250514"), so "claude-3" alone never matches them.
    re.compile(r"claude-(?:opus|sonnet|haiku)-4", re.IGNORECASE),
    re.compile(r"gpt-4o", re.IGNORECASE),
    re.compile(r"gpt-4-vision", re.IGNORECASE),
    re.compile(r"gemini-pro-vision", re.IGNORECASE),
    re.compile(r"gemini-1\.5", re.IGNORECASE),
    re.compile(r"gemini-2", re.IGNORECASE),
]


def supports_vision(model_name: str) -> bool:
    """
    Return True if the given model name is known to support vision (image input).

    The name is tested (case-insensitively, anywhere in the string) against
    the patterns in ``_VISION_PATTERNS``:
      - claude-3*                      (e.g. claude-3-sonnet, claude-3-5-sonnet)
      - claude-{opus,sonnet,haiku}-4*  (e.g. claude-sonnet-4-20250514)
      - gpt-4o*                        (gpt-4o, gpt-4o-mini)
      - gpt-4-vision*                  (gpt-4-vision-preview)
      - gemini-pro-vision
      - gemini-1.5*                    (gemini-1.5-pro, gemini-1.5-flash)
      - gemini-2*

    Everything up to and including the first "/" is dropped before matching,
    so provider-prefixed names such as "anthropic/claude-3-sonnet-20240229"
    are handled.

    Args:
        model_name: Model name string (e.g. "claude-3-sonnet-20240229",
                    "anthropic/claude-3-sonnet-20240229", "gpt-4o").

    Returns:
        True if any pattern matches, False otherwise (including for an
        empty model name).
    """
    if not model_name:
        return False

    # Keep only the part after the first "/"; names without a "/" are used as-is.
    _, separator, remainder = model_name.partition("/")
    name = remainder if separator else model_name

    return any(pattern.search(name) for pattern in _VISION_PATTERNS)
def generate_presigned_url(storage_key: str, expiry: int = 3600) -> str:
    """
    Return a presigned GET URL for an object in the MinIO media bucket.

    A boto3 S3 client is created on every call, pointed at the MinIO endpoint
    and credentials found in ``shared.config.settings``; the bucket is
    ``settings.minio_media_bucket``.

    Args:
        storage_key: MinIO object key (e.g. "tenant-1/agent-1/msg-1/photo.jpg").
        expiry: Lifetime of the URL in seconds (default: 3600 = 1 hour).

    Returns:
        The presigned URL string produced by boto3.
    """
    import boto3  # type: ignore[import-untyped]

    from shared.config import settings

    connection_options = {
        "endpoint_url": settings.minio_endpoint,
        "aws_access_key_id": settings.minio_access_key,
        "aws_secret_access_key": settings.minio_secret_key,
        # MinIO ignores region but boto3 requires it
        "region_name": "us-east-1",
    }
    client = boto3.client("s3", **connection_options)

    object_ref = {"Bucket": settings.minio_media_bucket, "Key": storage_key}
    return client.generate_presigned_url(
        "get_object",
        Params=object_ref,
        ExpiresIn=expiry,
    )
def build_messages_with_media(
    agent: Agent,
    current_message: str,
    media_attachments: list[MediaAttachment],
    recent_messages: list[dict],
    relevant_context: list[str],
) -> list[dict]:
    """
    Build an LLM messages array with memory enrichment AND media injection.

    Starts from build_messages_with_memory() and rewrites only the final
    (current user) message so that it references the supplied attachments.

    IMAGE attachments:
      - Model is vision-capable (supports_vision(agent.model_preference)):
        an image_url content block with a presigned URL and detail "auto".
      - Model is not vision-capable, or presigning raises:
        the text "[Image attached: {filename}]".

    DOCUMENT attachments:
      - "[Document attached: {filename} - {presigned_url}]", or
        "[Document attached: {filename}]" if presigning raises.

    Any other media type:
      - "[{Type} attached: {filename}]" (text reference only).

    Attachments without a storage_key are skipped with a debug log. When the
    attachment has no filename, the last "/"-separated segment of its
    storage_key is used instead.

    Args:
        agent: ORM Agent instance (for system prompt and model detection).
        current_message: The current user message text. May be empty when the
            user sent only attachments.
        media_attachments: MediaAttachment objects to reference in the prompt.
            An empty list returns the build_messages_with_memory() result as-is.
        recent_messages: Short-term memory — list of {"role", "content"} dicts.
        relevant_context: Long-term memory — list of content strings.

    Returns:
        The messages list. The last message's content is a plain string when
        only text references were added, or a multipart list when at least one
        image_url block was produced. In the multipart case the text block is
        omitted if there is no text to send (blank message and no text
        references), so no empty text block is ever emitted.
    """
    # Base array: system + memory + current text message.
    messages = build_messages_with_memory(
        agent=agent,
        current_message=current_message,
        recent_messages=recent_messages,
        relevant_context=relevant_context,
    )

    if not media_attachments:
        return messages

    vision_capable = supports_vision(agent.model_preference or "")

    # Text references and image blocks are collected separately and merged
    # into the user message at the end.
    text_additions: list[str] = []
    image_url_blocks: list[dict] = []

    for attachment in media_attachments:
        if not attachment.storage_key:
            logger.debug(
                "build_messages_with_media: skipping attachment with no storage_key "
                "(filename=%r) — not yet stored in MinIO",
                attachment.filename,
            )
            continue

        filename = attachment.filename or attachment.storage_key.split("/")[-1]

        if attachment.media_type == MediaType.IMAGE:
            if not vision_capable:
                text_additions.append(f"[Image attached: {filename}]")
                continue
            # Best effort: a presigning failure degrades to a text reference
            # instead of failing the whole prompt build.
            try:
                presigned_url = generate_presigned_url(attachment.storage_key)
            except Exception:
                logger.exception(
                    "build_messages_with_media: failed to generate presigned URL "
                    "for storage_key=%r — using text fallback",
                    attachment.storage_key,
                )
                text_additions.append(f"[Image attached: {filename}]")
            else:
                image_url_blocks.append({
                    "type": "image_url",
                    "image_url": {
                        "url": presigned_url,
                        "detail": "auto",
                    },
                })

        elif attachment.media_type == MediaType.DOCUMENT:
            # Documents are always text-referenced (no image_url blocks).
            try:
                presigned_url = generate_presigned_url(attachment.storage_key)
            except Exception:
                logger.exception(
                    "build_messages_with_media: failed to generate presigned URL "
                    "for document storage_key=%r",
                    attachment.storage_key,
                )
                text_additions.append(f"[Document attached: {filename}]")
            else:
                text_additions.append(f"[Document attached: {filename} - {presigned_url}]")

        else:
            # Remaining media types: text reference only.
            text_additions.append(f"[{attachment.media_type.value.capitalize()} attached: {filename}]")

    # Every attachment was skipped: nothing to enrich.
    if not text_additions and not image_url_blocks:
        return messages

    # The last message is the current user message; build_messages_with_memory
    # stores its content as a plain string.
    base_user_text: str = messages[-1]["content"]

    # Drop an empty caption before joining so an attachment-only message does
    # not start with a stray newline.
    full_text = "\n".join(part for part in (base_user_text, *text_additions) if part)

    enriched_content: list[dict] | str
    if image_url_blocks:
        enriched_content = list(image_url_blocks)
        # Only include the text block when there is text to send: some
        # providers reject multipart messages containing an empty text block.
        if full_text.strip():
            enriched_content.insert(0, {"type": "text", "text": full_text})
    else:
        enriched_content = full_text

    messages[-1] = {"role": "user", "content": enriched_content}

    return messages
def make_image_attachment(
    storage_key: str = "tenant-1/agent-1/msg-1/photo.jpg",
    filename: str = "photo.jpg",
    mime_type: str = "image/jpeg",
) -> MediaAttachment:
    """Return an IMAGE MediaAttachment (10240 bytes) for use in tests."""
    fields = {
        "media_type": MediaType.IMAGE,
        "storage_key": storage_key,
        "mime_type": mime_type,
        "filename": filename,
        "size_bytes": 10240,
    }
    return MediaAttachment(**fields)


def make_document_attachment(
    storage_key: str = "tenant-1/agent-1/msg-1/report.pdf",
    filename: str = "report.pdf",
    mime_type: str = "application/pdf",
) -> MediaAttachment:
    """Return a DOCUMENT MediaAttachment (204800 bytes) for use in tests."""
    fields = {
        "media_type": MediaType.DOCUMENT,
        "storage_key": storage_key,
        "mime_type": mime_type,
        "filename": filename,
        "size_bytes": 204800,
    }
    return MediaAttachment(**fields)
class TestSupportsVision:
    """Tests for the supports_vision model detection function."""

    @staticmethod
    def _detect(model_name: str) -> bool:
        """Import supports_vision at call time and evaluate it for model_name."""
        from orchestrator.agents.builder import supports_vision

        return supports_vision(model_name)

    def test_claude_3_sonnet_supports_vision(self):
        """claude-3-sonnet is a vision-capable model."""
        assert self._detect("claude-3-sonnet-20240229") is True

    def test_claude_3_haiku_supports_vision(self):
        """claude-3-haiku is a vision-capable model."""
        assert self._detect("claude-3-haiku-20240307") is True

    def test_claude_3_opus_supports_vision(self):
        """claude-3-opus is a vision-capable model."""
        assert self._detect("claude-3-opus-20240229") is True

    def test_claude_3_5_sonnet_supports_vision(self):
        """claude-3-5-sonnet is a vision-capable model."""
        assert self._detect("claude-3-5-sonnet-20241022") is True

    def test_gpt_4o_supports_vision(self):
        """gpt-4o is a vision-capable model."""
        assert self._detect("gpt-4o") is True

    def test_gpt_4o_mini_supports_vision(self):
        """gpt-4o-mini is a vision-capable model."""
        assert self._detect("gpt-4o-mini") is True

    def test_gpt_4_vision_supports_vision(self):
        """gpt-4-vision-preview is a vision-capable model."""
        assert self._detect("gpt-4-vision-preview") is True

    def test_gemini_pro_vision_supports_vision(self):
        """gemini-pro-vision is a vision-capable model."""
        assert self._detect("gemini-pro-vision") is True

    def test_gemini_1_5_pro_supports_vision(self):
        """gemini-1.5-pro is a vision-capable model."""
        assert self._detect("gemini-1.5-pro") is True

    def test_gemini_2_supports_vision(self):
        """Names containing gemini-2 are covered by the gemini-2 pattern."""
        assert self._detect("gemini-2.0-flash") is True

    def test_model_name_matching_is_case_insensitive(self):
        """Patterns are compiled with IGNORECASE, so upper-case names match."""
        assert self._detect("GPT-4O") is True

    def test_gpt_3_5_turbo_does_not_support_vision(self):
        """gpt-3.5-turbo is NOT a vision-capable model."""
        assert self._detect("gpt-3.5-turbo") is False

    def test_gpt_4_does_not_support_vision(self):
        """gpt-4 (without vision suffix) is NOT a vision-capable model."""
        assert self._detect("gpt-4") is False

    def test_ollama_llama_does_not_support_vision(self):
        """ollama/llama3 is NOT a vision-capable model."""
        assert self._detect("ollama/llama3") is False

    def test_provider_prefixed_non_vision_model_does_not_support_vision(self):
        """A provider prefix alone must not make a non-vision model match."""
        assert self._detect("openai/gpt-3.5-turbo") is False

    def test_empty_string_does_not_support_vision(self):
        """Empty model string is not a vision model."""
        assert self._detect("") is False

    def test_provider_prefixed_claude_3_supports_vision(self):
        """anthropic/claude-3-sonnet-20240229 should still be detected as vision."""
        assert self._detect("anthropic/claude-3-sonnet-20240229") is True
class TestGeneratePresignedUrl:
    """Tests for the MinIO presigned URL generator."""

    @staticmethod
    def _presign(*args, return_url: str = "http://minio:9000/key?sig=x", **kwargs):
        """
        Call generate_presigned_url with boto3.client patched.

        Returns a ``(url, mock_s3)`` tuple, where ``mock_s3`` is the fake S3
        client whose generate_presigned_url call can be inspected.
        """
        from orchestrator.agents.builder import generate_presigned_url

        mock_s3 = MagicMock()
        mock_s3.generate_presigned_url.return_value = return_url

        with patch("boto3.client", return_value=mock_s3):
            url = generate_presigned_url(*args, **kwargs)

        return url, mock_s3

    def test_generate_presigned_url_returns_string(self):
        """generate_presigned_url returns the URL string produced by the S3 client."""
        expected = "http://minio:9000/konstruct-media/key?X-Amz-Signature=abc&X-Amz-Expires=3600"

        url, _ = self._presign("tenant-1/agent-1/msg-1/photo.jpg", return_url=expected)

        assert isinstance(url, str)
        assert url == expected

    def test_generate_presigned_url_default_expiry_1_hour(self):
        """Default expiry is 3600 seconds (1 hour)."""
        _, mock_s3 = self._presign("some/key")

        # ExpiresIn is passed by keyword, so it is read from the call's kwargs.
        assert mock_s3.generate_presigned_url.call_args.kwargs["ExpiresIn"] == 3600

    def test_generate_presigned_url_custom_expiry(self):
        """Custom expiry is passed through to the presigned URL generator."""
        _, mock_s3 = self._presign("some/key", expiry=7200)

        assert mock_s3.generate_presigned_url.call_args.kwargs["ExpiresIn"] == 7200

    def test_generate_presigned_url_uses_correct_storage_key(self):
        """The correct operation and storage key are passed to boto3."""
        storage_key = "tenant-abc/agent-xyz/msg-123/document.pdf"

        _, mock_s3 = self._presign(storage_key, return_url="http://minio:9000/key")

        call = mock_s3.generate_presigned_url.call_args
        assert call.args[0] == "get_object"
        assert call.kwargs["Params"]["Key"] == storage_key
class TestBuildMessagesWithMedia:
    """Tests for the multimodal message building function."""

    def _make_agent_mock(self, model: str = "claude-3-sonnet-20240229") -> MagicMock:
        """Create a minimal Agent mock."""
        agent = MagicMock()
        agent.name = "Test Agent"
        agent.role = "Support"
        agent.persona = ""
        agent.system_prompt = "You are a helpful assistant."
        agent.model_preference = model
        return agent

    @staticmethod
    def _build(agent, text, attachments, presigned="http://minio/key", recent=None, relevant=None):
        """
        Run build_messages_with_media with boto3.client patched.

        ``presigned`` is either one URL returned for every presign call, or a
        sequence of URLs returned one per call. Returns ``(messages, mock_s3)``.
        """
        from orchestrator.agents.builder import build_messages_with_media

        mock_s3 = MagicMock()
        if isinstance(presigned, str):
            mock_s3.generate_presigned_url.return_value = presigned
        else:
            mock_s3.generate_presigned_url.side_effect = list(presigned)

        with patch("boto3.client", return_value=mock_s3):
            messages = build_messages_with_media(
                agent=agent,
                current_message=text,
                media_attachments=attachments,
                recent_messages=recent or [],
                relevant_context=relevant or [],
            )
        return messages, mock_s3

    @staticmethod
    def _text_of(content) -> str:
        """Return the text of a user message, whether it is a string or multipart."""
        if isinstance(content, list):
            return " ".join(block["text"] for block in content if block["type"] == "text")
        return content

    def test_no_media_returns_text_only_messages(self):
        """Message with no media produces standard text-only content — no regression."""
        messages, mock_s3 = self._build(self._make_agent_mock(), "What is the weather?", [])

        # Last message should be user role with plain string content
        user_message = messages[-1]
        assert user_message["role"] == "user"
        assert isinstance(user_message["content"], str)
        assert "What is the weather?" in user_message["content"]
        mock_s3.generate_presigned_url.assert_not_called()

    def test_image_attachment_with_vision_model_produces_image_url_block(self):
        """IMAGE attachment + vision model injects image_url content block."""
        agent = self._make_agent_mock(model="claude-3-sonnet-20240229")
        presigned = "http://minio:9000/konstruct-media/tenant-1/agent-1/msg-1/photo.jpg?sig=abc"

        messages, _ = self._build(
            agent, "What is in this image?", [make_image_attachment()], presigned=presigned
        )

        user_message = messages[-1]
        assert user_message["role"] == "user"
        assert isinstance(user_message["content"], list), (
            "Expected multipart content list, got: " + repr(user_message["content"])
        )

        content_types = {block["type"] for block in user_message["content"]}
        assert "text" in content_types
        assert "image_url" in content_types

        image_block = next(b for b in user_message["content"] if b["type"] == "image_url")
        assert image_block["image_url"]["url"] == presigned

    def test_image_attachment_with_non_vision_model_produces_text_fallback(self):
        """IMAGE attachment + non-vision model produces text fallback, no image_url block."""
        agent = self._make_agent_mock(model="gpt-3.5-turbo")
        attachment = make_image_attachment(filename="screenshot.png")

        messages, mock_s3 = self._build(agent, "Check this out", [attachment])

        user_message = messages[-1]
        assert user_message["role"] == "user"
        # For non-vision: content is plain string, NOT a list
        assert isinstance(user_message["content"], str)
        assert "[Image attached: screenshot.png]" in user_message["content"]
        # The fallback path never needs a URL, so nothing should be presigned.
        mock_s3.generate_presigned_url.assert_not_called()

    def test_document_attachment_produces_text_reference_with_presigned_url(self):
        """DOCUMENT attachment produces text reference including presigned URL."""
        agent = self._make_agent_mock(model="claude-3-sonnet-20240229")
        attachment = make_document_attachment(filename="report.pdf")
        presigned = "http://minio:9000/key/report.pdf?sig=xyz"

        messages, _ = self._build(agent, "Summarize this document", [attachment], presigned=presigned)

        user_message = messages[-1]
        assert user_message["role"] == "user"
        # Documents are always text-referenced (no image_url blocks)
        full_text = self._text_of(user_message["content"])
        assert "[Document attached: report.pdf" in full_text
        assert presigned in full_text

    def test_document_attachment_on_non_vision_model_produces_text_reference(self):
        """DOCUMENT attachment on non-vision model also produces text reference."""
        agent = self._make_agent_mock(model="gpt-3.5-turbo")
        attachment = make_document_attachment(filename="specs.pdf")
        presigned = "http://minio:9000/key/specs.pdf?sig=abc"

        messages, _ = self._build(agent, "Review these specs", [attachment], presigned=presigned)

        full_text = self._text_of(messages[-1]["content"])
        assert "[Document attached: specs.pdf" in full_text
        assert presigned in full_text

    def test_image_without_storage_key_skipped_gracefully(self):
        """Image attachment with no storage_key is not injected as image_url block."""
        agent = self._make_agent_mock(model="claude-3-sonnet-20240229")
        attachment = MediaAttachment(
            media_type=MediaType.IMAGE,
            storage_key=None,  # No storage key — file wasn't stored
            filename="orphaned.jpg",
            mime_type="image/jpeg",
        )

        messages, mock_s3 = self._build(agent, "Here is an image", [attachment])

        # Should not crash, and the skipped attachment must leave no trace.
        user_message = messages[-1]
        assert user_message["role"] == "user"
        assert isinstance(user_message["content"], str)
        assert "Here is an image" in user_message["content"]
        assert "orphaned.jpg" not in user_message["content"]
        mock_s3.generate_presigned_url.assert_not_called()

    def test_image_url_block_includes_detail_field(self):
        """image_url content block includes 'detail' field set to 'auto'."""
        agent = self._make_agent_mock(model="gpt-4o")

        messages, _ = self._build(
            agent,
            "Analyze this image",
            [make_image_attachment()],
            presigned="http://minio:9000/key/photo.jpg",
        )

        content = messages[-1]["content"]
        assert isinstance(content, list)

        image_block = next(b for b in content if b["type"] == "image_url")
        assert "detail" in image_block["image_url"]
        assert image_block["image_url"]["detail"] == "auto"

    def test_multiple_images_all_injected_for_vision_model(self):
        """Multiple image attachments all get image_url blocks for vision models."""
        agent = self._make_agent_mock(model="gpt-4o")
        attachments = [
            make_image_attachment(storage_key="t/a/m/img1.png", filename="img1.png"),
            make_image_attachment(storage_key="t/a/m/img2.jpg", filename="img2.jpg"),
        ]
        # A different presigned URL is returned for each call
        urls = ["http://minio/key/img1.png", "http://minio/key/img2.jpg"]

        messages, _ = self._build(agent, "Compare these images", attachments, presigned=urls)

        content = messages[-1]["content"]
        assert isinstance(content, list)

        image_blocks = [b for b in content if b["type"] == "image_url"]
        assert len(image_blocks) == 2
        # Blocks are appended in attachment order.
        assert [b["image_url"]["url"] for b in image_blocks] == urls

    def test_memory_context_still_injected_with_media(self):
        """Memory context (recent + relevant) is still injected when media is present."""
        agent = self._make_agent_mock(model="claude-3-sonnet-20240229")
        recent = [
            {"role": "user", "content": "Hi"},
            {"role": "assistant", "content": "Hello!"},
        ]
        relevant = ["Past context item 1"]

        messages, _ = self._build(
            agent,
            "Here is an image",
            [make_image_attachment()],
            presigned="http://minio/key/img.jpg",
            recent=recent,
            relevant=relevant,
        )

        # Should have: system, context system, recent[0], recent[1], current user
        assert len(messages) >= 4
        roles = [m["role"] for m in messages]
        assert roles.count("system") >= 2  # main system + pgvector context
        assert "user" in roles
        assert "assistant" in roles
        # The media-enriched message stays last and is multipart.
        assert messages[-1]["role"] == "user"
        assert isinstance(messages[-1]["content"], list)