feat(02-05): multimodal LLM interpretation with image_url content blocks
- Add supports_vision(model_name) to builder.py — detects vision-capable models (claude-3*, gpt-4o*, gpt-4-vision*, gemini-pro-vision*, gemini-1.5*, gemini-2*) with provider prefix stripping support - Add generate_presigned_url(storage_key, expiry=3600) to builder.py — generates 1-hour MinIO presigned URLs via boto3 S3 client - Add build_messages_with_media() to builder.py — extends build_messages_with_memory() with media injection: IMAGE -> image_url blocks for vision models / text fallback for non-vision models, DOCUMENT -> text reference with presigned URL - image_url blocks use 'detail: auto' per OpenAI/LiteLLM multipart format - Add 27 unit tests in test_multimodal_messages.py (TDD)
This commit is contained in:
@@ -19,12 +19,119 @@ Memory-enriched message assembly:
|
||||
has background context without it polluting the conversation flow.
|
||||
2. Short-term context: recent messages (Redis sliding window)
|
||||
Represents the immediate conversation history in this session.
|
||||
|
||||
Multimodal message assembly:
|
||||
build_messages_with_media() extends build_messages_with_memory() by injecting:
|
||||
1. IMAGE attachments: generates MinIO presigned URLs and injects image_url
|
||||
content blocks for vision-capable models (Claude 3+, GPT-4o, Gemini 1.5+).
|
||||
For non-vision models: falls back to "[Image attached: {filename}]" text.
|
||||
2. DOCUMENT attachments: always text-referenced as "[Document attached: {filename} - {url}]"
|
||||
(PDFs cannot be passed as image_url blocks).
|
||||
|
||||
supports_vision(model_name) returns True for known vision-capable models:
|
||||
claude-3*, gpt-4o*, gpt-4-vision*, gemini-pro-vision*, gemini-1.5*, gemini-2*
|
||||
|
||||
generate_presigned_url(storage_key) generates a 1-hour MinIO presigned URL
|
||||
for the given object key.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
import re
|
||||
|
||||
from shared.models.message import MediaAttachment, MediaType
|
||||
from shared.models.tenant import Agent
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Vision model detection
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
# Regex patterns for vision-capable model names (case-insensitive).
|
||||
# Matches on the model name AFTER stripping any provider prefix (e.g. "anthropic/").
|
||||
_VISION_PATTERNS: list[re.Pattern[str]] = [
    re.compile(fragment, re.IGNORECASE)
    for fragment in (
        r"claude-3",
        r"gpt-4o",
        r"gpt-4-vision",
        r"gemini-pro-vision",
        r"gemini-1\.5",
        r"gemini-2",
    )
]


def supports_vision(model_name: str) -> bool:
    """
    Report whether a model name matches one of the known vision-capable families.

    The name is compared (case-insensitively, unanchored) against
    ``_VISION_PATTERNS``:
      - claude-3*          (e.g. claude-3-sonnet, claude-3-5-sonnet)
      - gpt-4o*            (e.g. gpt-4o, gpt-4o-mini)
      - gpt-4-vision*      (e.g. gpt-4-vision-preview)
      - gemini-pro-vision
      - gemini-1.5*        (e.g. gemini-1.5-pro, gemini-1.5-flash)
      - gemini-2*

    Everything up to and including the first "/" is dropped before matching,
    so provider-prefixed names such as "anthropic/claude-3-sonnet-20240229"
    are handled.

    Args:
        model_name: Model identifier, with or without a provider prefix.

    Returns:
        True if any pattern matches, False otherwise (including for an
        empty or falsy name).
    """
    if not model_name:
        return False

    # partition() yields an empty separator when there is no "/" at all,
    # in which case the name is used as given.
    _prefix, slash, remainder = model_name.partition("/")
    candidate = remainder if slash else model_name

    for vision_pattern in _VISION_PATTERNS:
        if vision_pattern.search(candidate):
            return True
    return False
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# MinIO presigned URL generation
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def generate_presigned_url(storage_key: str, expiry: int = 3600) -> str:
    """
    Return a presigned GET URL for an object in the MinIO media bucket.

    A boto3 S3 client is created on each call from the shared settings
    (endpoint, access key, secret key) and asked to presign a ``get_object``
    request for ``storage_key`` in ``settings.minio_media_bucket``.

    Args:
        storage_key: MinIO object key (e.g. "tenant-1/agent-1/msg-1/photo.jpg").
        expiry: Value passed as ``ExpiresIn``, in seconds (default: 3600).

    Returns:
        The presigned URL string produced by boto3.
    """
    # Imported lazily, inside the function, as in the original implementation.
    import boto3  # type: ignore[import-untyped]

    from shared.config import settings

    connection_options = {
        "endpoint_url": settings.minio_endpoint,
        "aws_access_key_id": settings.minio_access_key,
        "aws_secret_access_key": settings.minio_secret_key,
        # MinIO ignores region but boto3 requires it
        "region_name": "us-east-1",
    }
    client = boto3.client("s3", **connection_options)

    object_ref = {"Bucket": settings.minio_media_bucket, "Key": storage_key}
    signed_url: str = client.generate_presigned_url(
        "get_object",
        Params=object_ref,
        ExpiresIn=expiry,
    )
    return signed_url
|
||||
|
||||
|
||||
def build_system_prompt(agent: Agent) -> str:
|
||||
"""
|
||||
@@ -141,3 +248,146 @@ def build_messages_with_memory(
|
||||
messages.append({"role": "user", "content": current_message})
|
||||
|
||||
return messages
|
||||
|
||||
|
||||
def build_messages_with_media(
    agent: Agent,
    current_message: str,
    media_attachments: list[MediaAttachment],
    recent_messages: list[dict],
    relevant_context: list[str],
) -> list[dict]:
    """
    Build an LLM messages array with memory enrichment AND multimodal media injection.

    Starts from build_messages_with_memory() and rewrites the final (current
    user) message so that it reflects the supplied media attachments.

    For IMAGE attachments:
      - If the agent's model supports vision: the user message becomes a
        multipart content list with one image_url block per image (each using
        a presigned URL, ``detail: auto``).
      - If the model does NOT support vision, or presigning fails:
        "[Image attached: {filename}]" is appended as text instead.

    For DOCUMENT attachments:
      - Always text-referenced as "[Document attached: {filename} - {presigned_url}]",
        or "[Document attached: {filename}]" if presigning fails.

    Any other media type is text-referenced as "[{Type} attached: {filename}]".

    Attachments without a storage_key are skipped (logged at debug level).

    When the user's text is empty or whitespace-only (e.g. an image-only
    message), no empty text part is emitted: the multipart list then contains
    only image_url blocks, and text-only content has no leading newline.
    Some providers reject empty text content blocks.

    Structure (in order):
      1. Whatever build_messages_with_memory() produces ahead of the user message
      2. Current user message (plain string or multipart content list)

    Args:
        agent: ORM Agent instance (its ``model_preference`` drives vision detection).
        current_message: The current user message text.
        media_attachments: MediaAttachment objects to inject into the prompt.
            An empty list returns the build_messages_with_memory() output unchanged.
        recent_messages: Short-term memory, passed through to build_messages_with_memory().
        relevant_context: Long-term memory, passed through to build_messages_with_memory().

    Returns:
        List of message dicts. The last message's content is a plain string
        when no image_url blocks were produced, otherwise a multipart list.
    """
    # Base array: everything except media handling is delegated.
    messages = build_messages_with_memory(
        agent=agent,
        current_message=current_message,
        recent_messages=recent_messages,
        relevant_context=relevant_context,
    )

    if not media_attachments:
        return messages

    vision_capable = supports_vision(agent.model_preference or "")

    # Text references and image blocks are collected separately because they
    # end up in different parts of the final content.
    text_additions: list[str] = []
    image_url_blocks: list[dict] = []

    for attachment in media_attachments:
        if not attachment.storage_key:
            logger.debug(
                "build_messages_with_media: skipping attachment with no storage_key "
                "(filename=%r) — not yet stored in MinIO",
                attachment.filename,
            )
            continue

        # Fall back to the last path segment of the key when no filename is set.
        filename = attachment.filename or attachment.storage_key.split("/")[-1]

        if attachment.media_type == MediaType.IMAGE:
            if not vision_capable:
                text_additions.append(f"[Image attached: {filename}]")
                continue
            # Only the presign call is guarded; a failure degrades to text.
            try:
                presigned_url = generate_presigned_url(attachment.storage_key)
            except Exception:
                logger.exception(
                    "build_messages_with_media: failed to generate presigned URL "
                    "for storage_key=%r — using text fallback",
                    attachment.storage_key,
                )
                text_additions.append(f"[Image attached: {filename}]")
            else:
                image_url_blocks.append({
                    "type": "image_url",
                    "image_url": {
                        "url": presigned_url,
                        "detail": "auto",
                    },
                })

        elif attachment.media_type == MediaType.DOCUMENT:
            try:
                presigned_url = generate_presigned_url(attachment.storage_key)
            except Exception:
                logger.exception(
                    "build_messages_with_media: failed to generate presigned URL "
                    "for document storage_key=%r",
                    attachment.storage_key,
                )
                text_additions.append(f"[Document attached: {filename}]")
            else:
                text_additions.append(f"[Document attached: {filename} - {presigned_url}]")

        else:
            # Any other media type: text reference only.
            text_additions.append(f"[{attachment.media_type.value.capitalize()} attached: {filename}]")

    # Every attachment was skipped: nothing to change.
    if not text_additions and not image_url_blocks:
        return messages

    # The last message in the array is the current user message.
    base_user_text: str = messages[-1]["content"]

    # Drop an empty/whitespace-only base text so the result neither starts
    # with a stray newline nor yields an empty text block.
    text_parts = [base_user_text, *text_additions] if base_user_text.strip() else text_additions
    full_text = "\n".join(text_parts)

    enriched_content: list[dict] | str
    if image_url_blocks:
        # Multipart format: optional text block followed by the image blocks.
        text_blocks: list[dict] = [{"type": "text", "text": full_text}] if full_text else []
        enriched_content = text_blocks + image_url_blocks
    else:
        # Text-only enrichment; text_additions is non-empty here.
        enriched_content = full_text

    messages[-1] = {"role": "user", "content": enriched_content}

    return messages
|
||||
|
||||
Reference in New Issue
Block a user