feat(02-05): multimodal LLM interpretation with image_url content blocks

- Add supports_vision(model_name) to builder.py — detects vision-capable models (claude-3*, gpt-4o*, gpt-4-vision*, gemini-pro-vision*, gemini-1.5*, gemini-2*) with provider prefix stripping support - Add generate_presigned_url(storage_key, expiry=3600) to builder.py — generates 1-hour MinIO presigned URLs via boto3 S3 client - Add build_messages_with_media() to builder.py — extends build_messages_with_memory() with media injection: IMAGE -> image_url blocks for vision models / text fallback for non-vision models, DOCUMENT -> text reference with presigned URL - image_url blocks use 'detail: auto' per OpenAI/LiteLLM multipart format - Add 27 unit tests in test_multimodal_messages.py (TDD)
2026-03-23 15:09:18 -06:00
parent 9dd7c481a3
commit 669c0b52b3
2 changed files with 753 additions and 0 deletions
--- a/packages/orchestrator/orchestrator/agents/builder.py
+++ b/packages/orchestrator/orchestrator/agents/builder.py
@@ -19,12 +19,119 @@ Memory-enriched message assembly:
     has background context without it polluting the conversation flow.
  2. Short-term context: recent messages (Redis sliding window)
     Represents the immediate conversation history in this session.
+
+Multimodal message assembly:
+  build_messages_with_media() extends build_messages_with_memory() by injecting:
+  1. IMAGE attachments: generates MinIO presigned URLs and injects image_url
+     content blocks for vision-capable models (Claude 3+, GPT-4o, Gemini 1.5+).
+     For non-vision models: falls back to "[Image attached: {filename}]" text.
+  2. DOCUMENT attachments: always text-referenced as "[Document attached: {filename} - {url}]"
+     (PDFs cannot be passed as image_url blocks).
+
+supports_vision(model_name) returns True for known vision-capable models:
+  claude-3*, gpt-4o*, gpt-4-vision*, gemini-pro-vision*, gemini-1.5*, gemini-2*
+
+generate_presigned_url(storage_key) generates a 1-hour MinIO presigned URL
+  for the given object key.
 """

 from __future__ import annotations

+import logging
+import re
+
+from shared.models.message import MediaAttachment, MediaType
 from shared.models.tenant import Agent

+logger = logging.getLogger(__name__)
+
+# ---------------------------------------------------------------------------
+# Vision model detection
+# ---------------------------------------------------------------------------
+
+# Case-insensitive regexes for vision-capable model families; supports_vision()
+# applies them with search() after stripping any provider prefix (e.g. "anthropic/").
+_VISION_PATTERNS: list[re.Pattern[str]] = [
+    re.compile(r"claude-3", re.IGNORECASE),
+    re.compile(r"gpt-4o", re.IGNORECASE),
+    re.compile(r"gpt-4-vision", re.IGNORECASE),
+    re.compile(r"gemini-pro-vision", re.IGNORECASE),
+    re.compile(r"gemini-1\.5", re.IGNORECASE),
+    re.compile(r"gemini-2", re.IGNORECASE),
+]
+
+
+def supports_vision(model_name: str) -> bool:
+    """
+    Return True if the given model name is known to support vision (image input).
+
+    Checks against known vision-capable model name patterns:
+      - claude-3* (Claude 3 family, including claude-3-5-sonnet)
+      - gpt-4o* (gpt-4o, gpt-4o-mini)
+      - gpt-4-vision* (gpt-4-vision-preview)
+      - gemini-pro-vision
+      - gemini-1.5* (gemini-1.5-pro, gemini-1.5-flash)
+      - gemini-2* (Gemini 2.x series)
+
+    Provider prefixes (e.g. "anthropic/", "openai/") are stripped before
+    pattern matching to support LiteLLM-prefixed model names.
+
+    Args:
+        model_name: Model name string (e.g. "claude-3-sonnet-20240229",
+                    "anthropic/claude-3-sonnet-20240229", "gpt-4o").
+
+    Returns:
+        True if any pattern occurs in the name; False otherwise, including for an empty name.
+    """
+    if not model_name:
+        return False
+
+    # Drop everything up to the first "/" (e.g. "anthropic/claude-3-..." -> "claude-3-...")
+    name = model_name
+    if "/" in name:
+        name = name.split("/", 1)[-1]
+
+    return any(pattern.search(name) for pattern in _VISION_PATTERNS)
+
+
+# ---------------------------------------------------------------------------
+# MinIO presigned URL generation
+# ---------------------------------------------------------------------------
+
+
+def generate_presigned_url(storage_key: str, expiry: int = 3600) -> str:
+    """
+    Generate a MinIO presigned GET URL for the given object key.
+
+    Builds a new boto3 S3 client on each call from the MinIO endpoint and
+    credentials in shared settings; the URL targets settings.minio_media_bucket.
+
+    Args:
+        storage_key: MinIO object key (e.g. "tenant-1/agent-1/msg-1/photo.jpg").
+        expiry:      URL validity in seconds (default: 3600 = 1 hour).
+
+    Returns:
+        A presigned URL string.
+    """
+    import boto3  # type: ignore[import-untyped]
+
+    from shared.config import settings
+
+    s3_client = boto3.client(
+        "s3",
+        endpoint_url=settings.minio_endpoint,
+        aws_access_key_id=settings.minio_access_key,
+        aws_secret_access_key=settings.minio_secret_key,
+        region_name="us-east-1",  # MinIO ignores region but boto3 requires it
+    )
+
+    presigned_url: str = s3_client.generate_presigned_url(
+        "get_object",
+        Params={"Bucket": settings.minio_media_bucket, "Key": storage_key},
+        ExpiresIn=expiry,
+    )
+    return presigned_url
+

 def build_system_prompt(agent: Agent) -> str:
    """
@@ -141,3 +248,146 @@ def build_messages_with_memory(
    messages.append({"role": "user", "content": current_message})

    return messages
+
+
+def build_messages_with_media(
+    agent: Agent,
+    current_message: str,
+    media_attachments: list[MediaAttachment],
+    recent_messages: list[dict],
+    relevant_context: list[str],
+) -> list[dict]:
+    """
+    Build an LLM messages array with memory enrichment AND multimodal media injection.
+
+    Extends build_messages_with_memory() by converting media attachments into
+    appropriate LLM content blocks:
+
+    For IMAGE attachments:
+      - If the agent's model supports vision: converts the user message to multipart
+        format with image_url content blocks (each block gets a presigned MinIO URL).
+      - If the model does NOT support vision: appends "[Image attached: {filename}]"
+        text instead (graceful degradation — no error, no dropped image).
+
+    For DOCUMENT attachments:
+      - Always text-referenced as "[Document attached: {filename} - {presigned_url}]"
+      - PDFs and documents cannot be passed as image_url blocks in the OpenAI format.
+
+    Other media types get a "[<Type> attached: {filename}]" text reference; attachments
+    without a storage_key (not yet stored in MinIO) are skipped without raising an error.
+
+    Structure (in order):
+      1. System message — agent identity, persona, AI transparency clause
+      2. System message — long-term context (ONLY if non-empty)
+      3. Sliding window messages — recent history
+      4. Current user message (plain string or multipart content list)
+
+    Args:
+        agent:             ORM Agent instance (for system prompt and model detection).
+        current_message:   The current user message text.
+        media_attachments: List of MediaAttachment objects to inject into the prompt.
+                           Empty list produces the same output as build_messages_with_memory().
+        recent_messages:   Short-term memory — list of {"role", "content"} dicts.
+        relevant_context:  Long-term memory — list of content strings from pgvector.
+
+    Returns:
+        List of message dicts suitable for an OpenAI/LiteLLM-compatible API call.
+        User message content is a plain string (no media / text-only references)
+        or a multipart list: a text block (omitted when empty) then image_url blocks.
+    """
+    # Build the base messages array (system + memory + current text message)
+    messages = build_messages_with_memory(
+        agent=agent,
+        current_message=current_message,
+        recent_messages=recent_messages,
+        relevant_context=relevant_context,
+    )
+
+    # If no media attachments, return the base messages unchanged
+    if not media_attachments:
+        return messages
+
+    # Determine if this agent's model supports vision
+    model_name: str = agent.model_preference or ""
+    vision_capable = supports_vision(model_name)
+
+    # Start building the enriched user message content
+    # We'll accumulate text additions and image_url blocks separately
+    text_additions: list[str] = []
+    image_url_blocks: list[dict] = []
+
+    for attachment in media_attachments:
+        # Skip attachments without a storage key (not yet stored in MinIO)
+        if not attachment.storage_key:
+            logger.debug(
+                "build_messages_with_media: skipping attachment with no storage_key "
+                "(filename=%r) — not yet stored in MinIO",
+                attachment.filename,
+            )
+            continue
+
+        filename = attachment.filename or attachment.storage_key.split("/")[-1]
+
+        if attachment.media_type == MediaType.IMAGE:
+            if vision_capable:
+                # Generate presigned URL and add as image_url block
+                try:
+                    presigned_url = generate_presigned_url(attachment.storage_key)
+                    image_url_blocks.append({
+                        "type": "image_url",
+                        "image_url": {
+                            "url": presigned_url,
+                            "detail": "auto",
+                        },
+                    })
+                except Exception:
+                    logger.exception(
+                        "build_messages_with_media: failed to generate presigned URL "
+                        "for storage_key=%r — using text fallback",
+                        attachment.storage_key,
+                    )
+                    text_additions.append(f"[Image attached: {filename}]")
+            else:
+                # Non-vision model: text fallback
+                text_additions.append(f"[Image attached: {filename}]")
+
+        elif attachment.media_type == MediaType.DOCUMENT:
+            # Documents are always text-referenced (no image_url blocks for PDFs)
+            try:
+                presigned_url = generate_presigned_url(attachment.storage_key)
+                text_additions.append(f"[Document attached: {filename} - {presigned_url}]")
+            except Exception:
+                logger.exception(
+                    "build_messages_with_media: failed to generate presigned URL "
+                    "for document storage_key=%r",
+                    attachment.storage_key,
+                )
+                text_additions.append(f"[Document attached: {filename}]")
+
+        else:
+            # AUDIO/VIDEO: text reference only for now
+            text_additions.append(f"[{attachment.media_type.value.capitalize()} attached: {filename}]")
+
+    # If we have no enrichments, return the base messages unchanged
+    if not text_additions and not image_url_blocks:
+        return messages
+
+    # The last message is the current user message; build_messages_with_memory
+    # always appends it with plain-string content.
+    base_user_text: str = messages[-1]["content"]
+    # Drop an empty caption so a media-only message does not start with a blank line.
+    text_parts = [base_user_text, *text_additions] if base_user_text else text_additions
+    full_text = "\n".join(text_parts)
+
+    if image_url_blocks:
+        # Multipart format: text block (omitted when empty) + image_url blocks
+        text_blocks = [{"type": "text", "text": full_text}] if full_text else []
+        enriched_content: list[dict] | str = text_blocks + image_url_blocks
+    else:
+        # Text-only enrichment (non-vision model or documents only)
+        enriched_content = full_text
+
+    # Replace the last message's content with the enriched version
+    messages[-1] = {"role": "user", "content": enriched_content}
+
+    return messages
--- a/tests/unit/test_multimodal_messages.py
+++ b/tests/unit/test_multimodal_messages.py
@@ -0,0 +1,503 @@
+"""
+Unit tests for multimodal LLM interpretation — image_url content block injection.
+
+Tests:
+- message with IMAGE MediaAttachment + vision model produces image_url content block
+- message with IMAGE MediaAttachment + non-vision model produces text fallback "[Image attached: ...]"
+- message with DOCUMENT MediaAttachment produces text reference with presigned URL
+- message with no media produces standard text-only content (no regression)
+- supports_vision returns True for "claude-3-sonnet", "gpt-4o", False for "gpt-3.5-turbo"
+- presigned URL has correct format and expiry (mock boto3)
+"""
+
+from __future__ import annotations
+
+from unittest.mock import MagicMock, patch
+
+import pytest
+
+from shared.models.message import MediaAttachment, MediaType
+
+
+# ---------------------------------------------------------------------------
+# Test fixtures
+# ---------------------------------------------------------------------------
+
+
+def make_image_attachment(
+    storage_key: str = "tenant-1/agent-1/msg-1/photo.jpg",
+    filename: str = "photo.jpg",
+    mime_type: str = "image/jpeg",
+) -> MediaAttachment:
+    """Build an IMAGE MediaAttachment (size_bytes=10240) with overridable key, filename, MIME type."""
+    return MediaAttachment(
+        media_type=MediaType.IMAGE,
+        storage_key=storage_key,
+        mime_type=mime_type,
+        filename=filename,
+        size_bytes=10240,
+    )
+
+
+def make_document_attachment(
+    storage_key: str = "tenant-1/agent-1/msg-1/report.pdf",
+    filename: str = "report.pdf",
+    mime_type: str = "application/pdf",
+) -> MediaAttachment:
+    """Build a DOCUMENT MediaAttachment (size_bytes=204800) with overridable key, filename, MIME type."""
+    return MediaAttachment(
+        media_type=MediaType.DOCUMENT,
+        storage_key=storage_key,
+        mime_type=mime_type,
+        filename=filename,
+        size_bytes=204800,
+    )
+
+
+# ---------------------------------------------------------------------------
+# Tests: supports_vision()
+# ---------------------------------------------------------------------------
+
+
+class TestSupportsVision:
+    """Tests for supports_vision(): vision, non-vision, empty and provider-prefixed model names."""
+
+    def test_claude_3_sonnet_supports_vision(self):
+        """claude-3-sonnet is a vision-capable model."""
+        from orchestrator.agents.builder import supports_vision
+
+        assert supports_vision("claude-3-sonnet-20240229") is True
+
+    def test_claude_3_haiku_supports_vision(self):
+        """claude-3-haiku is a vision-capable model."""
+        from orchestrator.agents.builder import supports_vision
+
+        assert supports_vision("claude-3-haiku-20240307") is True
+
+    def test_claude_3_opus_supports_vision(self):
+        """claude-3-opus is a vision-capable model."""
+        from orchestrator.agents.builder import supports_vision
+
+        assert supports_vision("claude-3-opus-20240229") is True
+
+    def test_claude_3_5_sonnet_supports_vision(self):
+        """claude-3-5-sonnet is a vision-capable model."""
+        from orchestrator.agents.builder import supports_vision
+
+        assert supports_vision("claude-3-5-sonnet-20241022") is True
+
+    def test_gpt_4o_supports_vision(self):
+        """gpt-4o is a vision-capable model."""
+        from orchestrator.agents.builder import supports_vision
+
+        assert supports_vision("gpt-4o") is True
+
+    def test_gpt_4o_mini_supports_vision(self):
+        """gpt-4o-mini is a vision-capable model."""
+        from orchestrator.agents.builder import supports_vision
+
+        assert supports_vision("gpt-4o-mini") is True
+
+    def test_gpt_4_vision_supports_vision(self):
+        """gpt-4-vision-preview is a vision-capable model."""
+        from orchestrator.agents.builder import supports_vision
+
+        assert supports_vision("gpt-4-vision-preview") is True
+
+    def test_gemini_pro_vision_supports_vision(self):
+        """gemini-pro-vision is a vision-capable model."""
+        from orchestrator.agents.builder import supports_vision
+
+        assert supports_vision("gemini-pro-vision") is True
+
+    def test_gemini_1_5_pro_supports_vision(self):
+        """gemini-1.5-pro is a vision-capable model."""
+        from orchestrator.agents.builder import supports_vision
+
+        assert supports_vision("gemini-1.5-pro") is True
+
+    def test_gpt_3_5_turbo_does_not_support_vision(self):
+        """gpt-3.5-turbo is NOT a vision-capable model."""
+        from orchestrator.agents.builder import supports_vision
+
+        assert supports_vision("gpt-3.5-turbo") is False
+
+    def test_gpt_4_does_not_support_vision(self):
+        """gpt-4 (without vision suffix) is NOT a vision-capable model."""
+        from orchestrator.agents.builder import supports_vision
+
+        assert supports_vision("gpt-4") is False
+
+    def test_ollama_llama_does_not_support_vision(self):
+        """ollama/llama3 is NOT a vision-capable model."""
+        from orchestrator.agents.builder import supports_vision
+
+        assert supports_vision("ollama/llama3") is False
+
+    def test_empty_string_does_not_support_vision(self):
+        """Empty model string is not a vision model."""
+        from orchestrator.agents.builder import supports_vision
+
+        assert supports_vision("") is False
+
+    def test_provider_prefixed_claude_3_supports_vision(self):
+        """anthropic/claude-3-sonnet-20240229 should still be detected as vision."""
+        from orchestrator.agents.builder import supports_vision
+
+        assert supports_vision("anthropic/claude-3-sonnet-20240229") is True
+
+
+# ---------------------------------------------------------------------------
+# Tests: generate_presigned_url()
+# ---------------------------------------------------------------------------
+
+
+class TestGeneratePresignedUrl:
+    """Tests for generate_presigned_url() with boto3.client patched to a MagicMock S3 client."""
+
+    def test_generate_presigned_url_returns_string(self):
+        """generate_presigned_url returns a non-empty URL string."""
+        from orchestrator.agents.builder import generate_presigned_url
+
+        mock_s3 = MagicMock()
+        mock_s3.generate_presigned_url.return_value = (
+            "http://minio:9000/konstruct-media/key?X-Amz-Signature=abc&X-Amz-Expires=3600"
+        )
+
+        with patch("boto3.client", return_value=mock_s3):
+            url = generate_presigned_url("tenant-1/agent-1/msg-1/photo.jpg")
+
+        assert isinstance(url, str)
+        assert len(url) > 0
+
+    def test_generate_presigned_url_default_expiry_1_hour(self):
+        """Default expiry is 3600 seconds (1 hour)."""
+        from orchestrator.agents.builder import generate_presigned_url
+
+        mock_s3 = MagicMock()
+        mock_s3.generate_presigned_url.return_value = "http://minio:9000/key?sig=x"
+
+        with patch("boto3.client", return_value=mock_s3):
+            generate_presigned_url("some/key")
+
+        call_kwargs = mock_s3.generate_presigned_url.call_args
+        assert call_kwargs[1].get("ExpiresIn") == 3600 or (
+            len(call_kwargs[0]) >= 3 and call_kwargs[0][2] == 3600
+        )
+
+    def test_generate_presigned_url_custom_expiry(self):
+        """Custom expiry is passed through to the presigned URL generator."""
+        from orchestrator.agents.builder import generate_presigned_url
+
+        mock_s3 = MagicMock()
+        mock_s3.generate_presigned_url.return_value = "http://minio:9000/key?sig=x"
+
+        with patch("boto3.client", return_value=mock_s3):
+            generate_presigned_url("some/key", expiry=7200)
+
+        call_kwargs = mock_s3.generate_presigned_url.call_args
+        expires_in = call_kwargs[1].get("ExpiresIn")
+        if expires_in is None and len(call_kwargs[0]) >= 3:
+            # Not passed by keyword: fall back to the third positional argument,
+            # mirroring the check in the default-expiry test above.
+            expires_in = call_kwargs[0][2]
+        assert expires_in == 7200
+
+    def test_generate_presigned_url_uses_correct_storage_key(self):
+        """The correct storage key is passed to boto3 generate_presigned_url."""
+        from orchestrator.agents.builder import generate_presigned_url
+
+        mock_s3 = MagicMock()
+        mock_s3.generate_presigned_url.return_value = "http://minio:9000/key"
+
+        storage_key = "tenant-abc/agent-xyz/msg-123/document.pdf"
+
+        with patch("boto3.client", return_value=mock_s3):
+            generate_presigned_url(storage_key)
+
+        call_args = mock_s3.generate_presigned_url.call_args
+        params = call_args[1].get("Params", {})
+        assert params.get("Key") == storage_key
+
+
+# ---------------------------------------------------------------------------
+# Tests: build_messages_with_media()
+# ---------------------------------------------------------------------------
+
+
+class TestBuildMessagesWithMedia:
+    """Tests for build_messages_with_media() using a MagicMock agent and a patched boto3 client."""
+
+    def _make_agent_mock(self, model: str = "claude-3-sonnet-20240229") -> MagicMock:
+        """Create a minimal Agent mock."""
+        agent = MagicMock()
+        agent.name = "Test Agent"
+        agent.role = "Support"
+        agent.persona = ""
+        agent.system_prompt = "You are a helpful assistant."
+        agent.model_preference = model
+        return agent
+
+    def test_no_media_returns_text_only_messages(self):
+        """Message with no media produces standard text-only content — no regression."""
+        from orchestrator.agents.builder import build_messages_with_media
+
+        agent = self._make_agent_mock()
+        mock_s3 = MagicMock()
+        mock_s3.generate_presigned_url.return_value = "http://minio/key"
+
+        with patch("boto3.client", return_value=mock_s3):
+            messages = build_messages_with_media(
+                agent=agent,
+                current_message="What is the weather?",
+                media_attachments=[],
+                recent_messages=[],
+                relevant_context=[],
+            )
+
+        # Last message should be user role with plain string content
+        user_message = messages[-1]
+        assert user_message["role"] == "user"
+        assert isinstance(user_message["content"], str)
+        assert "What is the weather?" in user_message["content"]
+
+    def test_image_attachment_with_vision_model_produces_image_url_block(self):
+        """IMAGE attachment + vision model injects image_url content block."""
+        from orchestrator.agents.builder import build_messages_with_media
+
+        agent = self._make_agent_mock(model="claude-3-sonnet-20240229")
+        attachment = make_image_attachment()
+
+        mock_s3 = MagicMock()
+        presigned = "http://minio:9000/konstruct-media/tenant-1/agent-1/msg-1/photo.jpg?sig=abc"
+        mock_s3.generate_presigned_url.return_value = presigned
+
+        with patch("boto3.client", return_value=mock_s3):
+            messages = build_messages_with_media(
+                agent=agent,
+                current_message="What is in this image?",
+                media_attachments=[attachment],
+                recent_messages=[],
+                relevant_context=[],
+            )
+
+        user_message = messages[-1]
+        assert user_message["role"] == "user"
+        assert isinstance(user_message["content"], list), (
+            "Expected multipart content list, got: " + repr(user_message["content"])
+        )
+
+        content_types = {block["type"] for block in user_message["content"]}
+        assert "text" in content_types
+        assert "image_url" in content_types
+
+        image_block = next(b for b in user_message["content"] if b["type"] == "image_url")
+        assert image_block["image_url"]["url"] == presigned
+
+    def test_image_attachment_with_non_vision_model_produces_text_fallback(self):
+        """IMAGE attachment + non-vision model produces text fallback, no image_url block."""
+        from orchestrator.agents.builder import build_messages_with_media
+
+        agent = self._make_agent_mock(model="gpt-3.5-turbo")
+        attachment = make_image_attachment(filename="screenshot.png")
+
+        mock_s3 = MagicMock()
+        mock_s3.generate_presigned_url.return_value = "http://minio/key"
+
+        with patch("boto3.client", return_value=mock_s3):
+            messages = build_messages_with_media(
+                agent=agent,
+                current_message="Check this out",
+                media_attachments=[attachment],
+                recent_messages=[],
+                relevant_context=[],
+            )
+
+        user_message = messages[-1]
+        assert user_message["role"] == "user"
+        # For non-vision: content is plain string, NOT a list
+        assert isinstance(user_message["content"], str)
+        assert "[Image attached: screenshot.png]" in user_message["content"]
+
+    def test_document_attachment_produces_text_reference_with_presigned_url(self):
+        """DOCUMENT attachment produces text reference including presigned URL."""
+        from orchestrator.agents.builder import build_messages_with_media
+
+        agent = self._make_agent_mock(model="claude-3-sonnet-20240229")
+        attachment = make_document_attachment(filename="report.pdf")
+
+        mock_s3 = MagicMock()
+        presigned = "http://minio:9000/key/report.pdf?sig=xyz"
+        mock_s3.generate_presigned_url.return_value = presigned
+
+        with patch("boto3.client", return_value=mock_s3):
+            messages = build_messages_with_media(
+                agent=agent,
+                current_message="Summarize this document",
+                media_attachments=[attachment],
+                recent_messages=[],
+                relevant_context=[],
+            )
+
+        user_message = messages[-1]
+        assert user_message["role"] == "user"
+        # Documents are always text-referenced (no image_url blocks)
+        # Content can be string or list containing text block
+        content = user_message["content"]
+        if isinstance(content, list):
+            text_parts = [b["text"] for b in content if b["type"] == "text"]
+            full_text = " ".join(text_parts)
+        else:
+            full_text = content
+
+        assert "[Document attached: report.pdf" in full_text
+        assert presigned in full_text
+
+    def test_document_attachment_on_non_vision_model_produces_text_reference(self):
+        """DOCUMENT attachment on non-vision model also produces text reference."""
+        from orchestrator.agents.builder import build_messages_with_media
+
+        agent = self._make_agent_mock(model="gpt-3.5-turbo")
+        attachment = make_document_attachment(filename="specs.pdf")
+
+        mock_s3 = MagicMock()
+        presigned = "http://minio:9000/key/specs.pdf?sig=abc"
+        mock_s3.generate_presigned_url.return_value = presigned
+
+        with patch("boto3.client", return_value=mock_s3):
+            messages = build_messages_with_media(
+                agent=agent,
+                current_message="Review these specs",
+                media_attachments=[attachment],
+                recent_messages=[],
+                relevant_context=[],
+            )
+
+        user_message = messages[-1]
+        content = user_message["content"]
+        if isinstance(content, list):
+            full_text = " ".join(b["text"] for b in content if b["type"] == "text")
+        else:
+            full_text = content
+
+        assert "[Document attached: specs.pdf" in full_text
+
+    def test_image_without_storage_key_skipped_gracefully(self):
+        """Image attachment with no storage_key is not injected as image_url block."""
+        from orchestrator.agents.builder import build_messages_with_media
+
+        agent = self._make_agent_mock(model="claude-3-sonnet-20240229")
+        attachment = MediaAttachment(
+            media_type=MediaType.IMAGE,
+            storage_key=None,  # No storage key — file wasn't stored
+            filename="orphaned.jpg",
+            mime_type="image/jpeg",
+        )
+
+        mock_s3 = MagicMock()
+        mock_s3.generate_presigned_url.return_value = "http://minio/key"
+
+        with patch("boto3.client", return_value=mock_s3):
+            messages = build_messages_with_media(
+                agent=agent,
+                current_message="Here is an image",
+                media_attachments=[attachment],
+                recent_messages=[],
+                relevant_context=[],
+            )
+
+        user_message = messages[-1]
+        assert user_message["role"] == "user"
+        assert isinstance(user_message["content"], str)  # skipped: no image_url block injected
+
+    def test_image_url_block_includes_detail_field(self):
+        """image_url content block includes 'detail' field set to 'auto'."""
+        from orchestrator.agents.builder import build_messages_with_media
+
+        agent = self._make_agent_mock(model="gpt-4o")
+        attachment = make_image_attachment()
+
+        mock_s3 = MagicMock()
+        mock_s3.generate_presigned_url.return_value = "http://minio:9000/key/photo.jpg"
+
+        with patch("boto3.client", return_value=mock_s3):
+            messages = build_messages_with_media(
+                agent=agent,
+                current_message="Analyze this image",
+                media_attachments=[attachment],
+                recent_messages=[],
+                relevant_context=[],
+            )
+
+        user_message = messages[-1]
+        content = user_message["content"]
+        assert isinstance(content, list)
+
+        image_block = next(b for b in content if b["type"] == "image_url")
+        assert "detail" in image_block["image_url"]
+        assert image_block["image_url"]["detail"] == "auto"
+
+    def test_multiple_images_all_injected_for_vision_model(self):
+        """Multiple image attachments all get image_url blocks for vision models."""
+        from orchestrator.agents.builder import build_messages_with_media
+
+        agent = self._make_agent_mock(model="gpt-4o")
+        attachments = [
+            make_image_attachment(storage_key="t/a/m/img1.png", filename="img1.png"),
+            make_image_attachment(storage_key="t/a/m/img2.jpg", filename="img2.jpg"),
+        ]
+
+        mock_s3 = MagicMock()
+        # Return different presigned URLs for each call
+        mock_s3.generate_presigned_url.side_effect = [
+            "http://minio/key/img1.png",
+            "http://minio/key/img2.jpg",
+        ]
+
+        with patch("boto3.client", return_value=mock_s3):
+            messages = build_messages_with_media(
+                agent=agent,
+                current_message="Compare these images",
+                media_attachments=attachments,
+                recent_messages=[],
+                relevant_context=[],
+            )
+
+        user_message = messages[-1]
+        content = user_message["content"]
+        assert isinstance(content, list)
+
+        image_blocks = [b for b in content if b["type"] == "image_url"]
+        assert len(image_blocks) == 2
+
+    def test_memory_context_still_injected_with_media(self):
+        """Memory context (recent + relevant) is still injected when media is present."""
+        from orchestrator.agents.builder import build_messages_with_media
+
+        agent = self._make_agent_mock(model="claude-3-sonnet-20240229")
+        attachment = make_image_attachment()
+
+        mock_s3 = MagicMock()
+        mock_s3.generate_presigned_url.return_value = "http://minio/key/img.jpg"
+
+        recent = [
+            {"role": "user", "content": "Hi"},
+            {"role": "assistant", "content": "Hello!"},
+        ]
+        relevant = ["Past context item 1"]
+
+        with patch("boto3.client", return_value=mock_s3):
+            messages = build_messages_with_media(
+                agent=agent,
+                current_message="Here is an image",
+                media_attachments=[attachment],
+                recent_messages=recent,
+                relevant_context=relevant,
+            )
+
+        # Should have: system, context system, recent[0], recent[1], current user
+        assert len(messages) >= 4
+        roles = [m["role"] for m in messages]
+        assert roles.count("system") >= 2  # main system + pgvector context
+        assert "user" in roles
+        assert "assistant" in roles