From 962b3840e6604b71b2c054a7db325caa218bc63f Mon Sep 17 00:00:00 2001 From: Michael Lam Date: Sat, 16 May 2026 03:54:28 -0700 Subject: [PATCH] fix: strip historical images in text mode --- CHANGELOG.md | 4 +++ api/streaming.py | 42 +++++++++++++++++++++++--- tests/test_native_image_attachments.py | 36 ++++++++++++++++++++++ 3 files changed, 78 insertions(+), 4 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 6fa25bab..3e582476 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,10 @@ ## [Unreleased] +### Fixed + +- **PR #2378** by @Michaelyklam (closes #2297) — Text-mode image handling now strips historical native `image_url` parts from provider-facing conversation replay. Current-turn uploads already respected `agent.image_input_mode: text`; this closes the remaining gap where an older image in the saved transcript could keep making text-only providers such as DeepSeek reject every later turn with `unknown variant image_url, expected text`. + ## [v0.51.74] — 2026-05-16 — Release AX (stage-367 — 4-PR safe-lane batch — #2362 table-cell spacing + #2363 run-state-consistency RFC + #2365 custom_providers list-format + #2367 settings sidebar i18n) ### Added diff --git a/api/streaming.py b/api/streaming.py index 18a32fc2..7ef8dc4c 100644 --- a/api/streaming.py +++ b/api/streaming.py @@ -1831,7 +1831,32 @@ def _maybe_schedule_title_refresh(session, put_event, agent): ).start() -def _sanitize_messages_for_api(messages): +def _strip_native_image_parts_from_content(content): + """Return provider-safe content with native image parts removed. + + Text-only provider endpoints (for example DeepSeek/OpenAI-compatible text + models) reject historical OpenAI-style ``image_url`` parts before the agent + can recover. When WebUI is configured for text-mode image handling, preserve + textual content from mixed content arrays and drop only the native image + blocks from replayed history. 
+ """ + if not isinstance(content, list): + return content + clean_parts = [] + for part in content: + if not isinstance(part, dict): + continue + if part.get('type') == 'image_url' or 'image_url' in part: + continue + clean_parts.append(copy.deepcopy(part)) + if not clean_parts: + return '' + if len(clean_parts) == 1 and clean_parts[0].get('type') == 'text': + return str(clean_parts[0].get('text') or '') + return clean_parts + + +def _sanitize_messages_for_api(messages, *, cfg: dict = None): """Return a deep copy of messages with only API-safe fields. The webui stores extra metadata on messages (attachments, timestamp, _ts) @@ -1843,7 +1868,14 @@ def _sanitize_messages_for_api(messages): (Mercury-2/Inception, newer OpenAI models) reject histories containing dangling tool results with a 400 error: "Message has tool role, but there was no previous assistant message with a tool call." + + If ``agent.image_input_mode`` resolves to ``text``, native historical + ``image_url`` content parts are stripped too. Current-turn uploads already + respect text mode in ``_build_native_multimodal_message``; this closes the + remaining replay gap where an older native image in the saved transcript kept + causing 400s on every later text-only turn (#2297). """ + strip_native_images = cfg is not None and _resolve_image_input_mode(cfg) == "text" # First pass: collect all tool_call_ids declared by assistant messages. # Handles both OpenAI ('id') and Anthropic ('call_id') field names. valid_tool_call_ids: set = set() @@ -1872,6 +1904,8 @@ def _sanitize_messages_for_api(messages): # Orphaned tool result — skip to avoid 400 from strict providers. 
def test_text_image_mode_strips_historical_image_url_parts(self):
    """#2297: in text mode, replayed history must carry no native image parts."""
    image_part = {'type': 'image_url', 'image_url': {'url': 'data:image/png;base64,AAA='}}
    user_turn = {
        'role': 'user',
        'content': [{'type': 'text', 'text': 'what is in this image?'}, image_part],
        # WebUI-only metadata that the sanitizer is expected to drop as well.
        'attachments': [{'name': 'photo.png'}],
        'timestamp': 123,
    }
    assistant_turn = {'role': 'assistant', 'content': 'It is a chart.'}

    sanitized = _sanitize_messages_for_api(
        [user_turn, assistant_turn],
        cfg={'agent': {'image_input_mode': 'text'}},
    )

    # The mixed content array collapses to the surviving text.
    assert sanitized[0] == {'role': 'user', 'content': 'what is in this image?'}
    assert 'image_url' not in str(sanitized)
    assert 'attachments' not in sanitized[0]
    assert sanitized[1] == {'role': 'assistant', 'content': 'It is a chart.'}


def test_native_image_mode_keeps_historical_image_url_parts(self):
    """Native mode must leave existing multimodal history untouched."""
    parts = [
        {'type': 'text', 'text': 'describe'},
        {'type': 'image_url', 'image_url': {'url': 'data:image/png;base64,AAA='}},
    ]
    message = {'role': 'user', 'content': parts}

    result = _sanitize_messages_for_api(
        [message],
        cfg={'agent': {'image_input_mode': 'native'}},
    )

    assert result == [{'role': 'user', 'content': parts}]