fix: strip historical images in text mode

2026-05-26 11:40:26 +00:00 · 2026-05-16 03:54:28 -07:00
parent e3035b3e40
commit 962b3840e6
3 changed files with 78 additions and 4 deletions
@@ -2,6 +2,10 @@

 ## [Unreleased]

+### Fixed
+
+- **PR #2378** by @Michaelyklam (closes #2297) — In text mode (`agent.image_input_mode: text`), historical native `image_url` parts are now stripped from the conversation history replayed to the provider. Current-turn uploads already respected text mode; this closes the remaining gap where an older image in the saved transcript made text-only providers such as DeepSeek reject every later turn with `unknown variant image_url, expected text`.
+
 ## [v0.51.74] — 2026-05-16 — Release AX (stage-367 — 4-PR safe-lane batch — #2362 table-cell spacing + #2363 run-state-consistency RFC + #2365 custom_providers list-format + #2367 settings sidebar i18n)

 ### Added
@@ -1831,7 +1831,32 @@ def _maybe_schedule_title_refresh(session, put_event, agent):
    ).start()


-def _sanitize_messages_for_api(messages):
+def _strip_native_image_parts_from_content(content):
+    """Return provider-safe content with native image parts removed.
+
+    Text-only provider endpoints (for example DeepSeek/OpenAI-compatible text
+    models) reject historical OpenAI-style ``image_url`` parts before the agent
+    can recover.  When WebUI is configured for text-mode image handling, preserve
+    textual content from mixed content arrays and drop only the native image
+    blocks from replayed history.
+    """
+    if not isinstance(content, list):
+        return content
+    clean_parts = []
+    for part in content:
+        if not isinstance(part, dict):
+            continue
+        if part.get('type') == 'image_url' or 'image_url' in part:
+            continue
+        clean_parts.append(copy.deepcopy(part))
+    if not clean_parts:
+        return ''
+    if len(clean_parts) == 1 and clean_parts[0].get('type') == 'text':
+        return str(clean_parts[0].get('text') or '')
+    return clean_parts
+
+
+def _sanitize_messages_for_api(messages, *, cfg: dict = None):
    """Return a deep copy of messages with only API-safe fields.

    The webui stores extra metadata on messages (attachments, timestamp, _ts)
@@ -1843,7 +1868,14 @@ def _sanitize_messages_for_api(messages):
    (Mercury-2/Inception, newer OpenAI models) reject histories containing dangling
    tool results with a 400 error: "Message has tool role, but there was no previous
    assistant message with a tool call."
+
+    If ``agent.image_input_mode`` resolves to ``text``, native historical
+    ``image_url`` content parts are stripped too.  Current-turn uploads already
+    respect text mode in ``_build_native_multimodal_message``; this closes the
+    remaining replay gap where an older native image in the saved transcript kept
+    causing 400s on every later text-only turn (#2297).
    """
+    strip_native_images = cfg is not None and _resolve_image_input_mode(cfg) == "text"
    # First pass: collect all tool_call_ids declared by assistant messages.
    # Handles both OpenAI ('id') and Anthropic ('call_id') field names.
    valid_tool_call_ids: set = set()
@@ -1872,6 +1904,8 @@ def _sanitize_messages_for_api(messages):
                # Orphaned tool result — skip to avoid 400 from strict providers.
                continue
        sanitized = {k: v for k, v in msg.items() if k in _API_SAFE_MSG_KEYS}
+        if strip_native_images and 'content' in sanitized:
+            sanitized['content'] = _strip_native_image_parts_from_content(sanitized.get('content'))
        if sanitized.get('role'):
            clean.append(sanitized)
    return clean
@@ -3515,7 +3549,7 @@ def _run_agent_streaming(
            result = agent.run_conversation(
                user_message=user_message,
                system_message=workspace_system_msg,
-                conversation_history=_sanitize_messages_for_api(_previous_context_messages),
+                conversation_history=_sanitize_messages_for_api(_previous_context_messages, cfg=_cfg),
                task_id=session_id,
                persist_user_message=msg_text,
            )
@@ -3726,7 +3760,7 @@ def _run_agent_streaming(
                                _heal_result = agent.run_conversation(
                                    user_message=user_message,
                                    system_message=workspace_system_msg,
-                                    conversation_history=_sanitize_messages_for_api(_previous_context_messages),
+                                    conversation_history=_sanitize_messages_for_api(_previous_context_messages, cfg=_cfg),
                                    task_id=session_id,
                                    persist_user_message=msg_text,
                                )
@@ -4505,7 +4539,7 @@ def _run_agent_streaming(
                        _heal_result = _heal_agent.run_conversation(
                            user_message=user_message,
                            system_message=workspace_system_msg,
-                            conversation_history=_sanitize_messages_for_api(_previous_context_messages),
+                            conversation_history=_sanitize_messages_for_api(_previous_context_messages, cfg=_cfg),
                            task_id=session_id,
                            persist_user_message=msg_text,
                        )
@@ -17,6 +17,7 @@ from api.streaming import (
    _attachment_name,
    _build_native_multimodal_message,
    _NATIVE_IMAGE_MAX_BYTES,
+    _sanitize_messages_for_api,
 )
 from api.routes import _normalize_chat_attachments

@@ -318,6 +319,41 @@ class TestBuildNativeMultimodalMessage:
            assert data_url.startswith('data:image/png;base64,')
            assert len(result) == 2

+    def test_text_image_mode_strips_historical_image_url_parts(self):
+        """#2297: text-only providers must not replay old native image parts."""
+        history = [
+            {
+                'role': 'user',
+                'content': [
+                    {'type': 'text', 'text': 'what is in this image?'},
+                    {'type': 'image_url', 'image_url': {'url': 'data:image/png;base64,AAA='}},
+                ],
+                'attachments': [{'name': 'photo.png'}],
+                'timestamp': 123,
+            },
+            {'role': 'assistant', 'content': 'It is a chart.'},
+        ]
+        cfg = {'agent': {'image_input_mode': 'text'}}
+
+        sanitized = _sanitize_messages_for_api(history, cfg=cfg)
+
+        assert sanitized[0] == {'role': 'user', 'content': 'what is in this image?'}
+        assert 'image_url' not in str(sanitized)
+        assert 'attachments' not in sanitized[0]
+        assert sanitized[1] == {'role': 'assistant', 'content': 'It is a chart.'}
+
+    def test_native_image_mode_keeps_historical_image_url_parts(self):
+        """Vision-capable/native mode keeps existing multimodal history intact."""
+        content = [
+            {'type': 'text', 'text': 'describe'},
+            {'type': 'image_url', 'image_url': {'url': 'data:image/png;base64,AAA='}},
+        ]
+        cfg = {'agent': {'image_input_mode': 'native'}}
+
+        sanitized = _sanitize_messages_for_api([{'role': 'user', 'content': content}], cfg=cfg)
+
+        assert sanitized == [{'role': 'user', 'content': content}]
+
    def test_fake_png_rejected_by_magic_bytes(self):
        """A file named .png that is not actually an image must be rejected."""
        with TemporaryDirectory() as d: