From 962b3840e6604b71b2c054a7db325caa218bc63f Mon Sep 17 00:00:00 2001
From: Michael Lam <michael@example.local>
Date: Sat, 16 May 2026 03:54:28 -0700
Subject: [PATCH] fix: strip historical images in text mode

---
 CHANGELOG.md                           |  4 +++
 api/streaming.py                       | 42 +++++++++++++++++++++++---
 tests/test_native_image_attachments.py | 36 ++++++++++++++++++++++
 3 files changed, 78 insertions(+), 4 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 6fa25bab..3e582476 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -2,6 +2,10 @@
 
 ## [Unreleased]
 
+### Fixed
+
+- **PR #2378** by @Michaelyklam (closes #2297) — Text-mode image handling now strips historical native `image_url` parts from provider-facing conversation replay. Current-turn uploads already respected `agent.image_input_mode: text`; this closes the remaining gap where an older image in the saved transcript could keep making text-only providers such as DeepSeek reject every later turn with `unknown variant image_url, expected text`.
+
 ## [v0.51.74] — 2026-05-16 — Release AX (stage-367 — 4-PR safe-lane batch — #2362 table-cell spacing + #2363 run-state-consistency RFC + #2365 custom_providers list-format + #2367 settings sidebar i18n)
 
 ### Added
diff --git a/api/streaming.py b/api/streaming.py
index 18a32fc2..7ef8dc4c 100644
--- a/api/streaming.py
+++ b/api/streaming.py
@@ -1831,7 +1831,32 @@ def _maybe_schedule_title_refresh(session, put_event, agent):
     ).start()
 
 
-def _sanitize_messages_for_api(messages):
+def _strip_native_image_parts_from_content(content):
+    """Return provider-safe content with native ``image_url`` parts removed.
+
+    Text-only endpoints (e.g. DeepSeek) reject OpenAI-style ``image_url`` parts
+    replayed from history.  Non-list content is returned unchanged.  For a list,
+    non-dict entries and entries typed ``image_url`` (or carrying an
+    ``image_url`` key) are dropped and the rest deep-copied.  Returns ``''`` if
+    nothing remains, the bare text if one ``text`` part remains, else the list.
+    """
+    if not isinstance(content, list):  # plain string / None content passes through as-is
+        return content
+    clean_parts = []
+    for part in content:
+        if not isinstance(part, dict):  # NOTE(review): bare-string entries are discarded too
+            continue
+        if part.get('type') == 'image_url' or 'image_url' in part:  # key check catches untyped parts
+            continue
+        clean_parts.append(copy.deepcopy(part))  # copy so the result never aliases the input parts
+    if not clean_parts:  # image-only (or empty) list degrades to an empty string
+        return ''
+    if len(clean_parts) == 1 and clean_parts[0].get('type') == 'text':  # lone text part -> plain str
+        return str(clean_parts[0].get('text') or '')  # missing/None text becomes ''
+    return clean_parts
+
+
+def _sanitize_messages_for_api(messages, *, cfg: dict = None):
     """Return a deep copy of messages with only API-safe fields.
 
     The webui stores extra metadata on messages (attachments, timestamp, _ts)
@@ -1843,7 +1868,14 @@ def _sanitize_messages_for_api(messages):
     (Mercury-2/Inception, newer OpenAI models) reject histories containing dangling
     tool results with a 400 error: "Message has tool role, but there was no previous
     assistant message with a tool call."
+
+    If ``agent.image_input_mode`` resolves to ``text``, native historical
+    ``image_url`` content parts are stripped too.  Current-turn uploads already
+    respect text mode in ``_build_native_multimodal_message``; this closes the
+    remaining replay gap where an older native image in the saved transcript kept
+    causing 400s on every later text-only turn (#2297).
     """
+    strip_native_images = cfg is not None and _resolve_image_input_mode(cfg) == "text"
     # First pass: collect all tool_call_ids declared by assistant messages.
     # Handles both OpenAI ('id') and Anthropic ('call_id') field names.
     valid_tool_call_ids: set = set()
@@ -1872,6 +1904,8 @@ def _sanitize_messages_for_api(messages):
                 # Orphaned tool result — skip to avoid 400 from strict providers.
                 continue
         sanitized = {k: v for k, v in msg.items() if k in _API_SAFE_MSG_KEYS}
+        if strip_native_images and 'content' in sanitized:
+            sanitized['content'] = _strip_native_image_parts_from_content(sanitized.get('content'))
         if sanitized.get('role'):
             clean.append(sanitized)
     return clean
@@ -3515,7 +3549,7 @@ def _run_agent_streaming(
             result = agent.run_conversation(
                 user_message=user_message,
                 system_message=workspace_system_msg,
-                conversation_history=_sanitize_messages_for_api(_previous_context_messages),
+                conversation_history=_sanitize_messages_for_api(_previous_context_messages, cfg=_cfg),
                 task_id=session_id,
                 persist_user_message=msg_text,
             )
@@ -3726,7 +3760,7 @@ def _run_agent_streaming(
                                 _heal_result = agent.run_conversation(
                                     user_message=user_message,
                                     system_message=workspace_system_msg,
-                                    conversation_history=_sanitize_messages_for_api(_previous_context_messages),
+                                    conversation_history=_sanitize_messages_for_api(_previous_context_messages, cfg=_cfg),
                                     task_id=session_id,
                                     persist_user_message=msg_text,
                                 )
@@ -4505,7 +4539,7 @@ def _run_agent_streaming(
                         _heal_result = _heal_agent.run_conversation(
                             user_message=user_message,
                             system_message=workspace_system_msg,
-                            conversation_history=_sanitize_messages_for_api(_previous_context_messages),
+                            conversation_history=_sanitize_messages_for_api(_previous_context_messages, cfg=_cfg),
                             task_id=session_id,
                             persist_user_message=msg_text,
                         )
diff --git a/tests/test_native_image_attachments.py b/tests/test_native_image_attachments.py
index f6b04166..8c38aea5 100644
--- a/tests/test_native_image_attachments.py
+++ b/tests/test_native_image_attachments.py
@@ -17,6 +17,7 @@ from api.streaming import (
     _attachment_name,
     _build_native_multimodal_message,
     _NATIVE_IMAGE_MAX_BYTES,
+    _sanitize_messages_for_api,
 )
 from api.routes import _normalize_chat_attachments
 
@@ -318,6 +319,41 @@ class TestBuildNativeMultimodalMessage:
             assert data_url.startswith('data:image/png;base64,')
             assert len(result) == 2
 
+    def test_text_image_mode_strips_historical_image_url_parts(self):
+        """#2297: text-only providers must not replay old native image parts."""
+        history = [
+            {
+                'role': 'user',
+                'content': [
+                    {'type': 'text', 'text': 'what is in this image?'},
+                    {'type': 'image_url', 'image_url': {'url': 'data:image/png;base64,AAA='}},
+                ],
+                'attachments': [{'name': 'photo.png'}],  # extra key; asserted absent after sanitizing
+                'timestamp': 123,  # extra key; excluded by the exact-dict equality below
+            },
+            {'role': 'assistant', 'content': 'It is a chart.'},
+        ]
+        cfg = {'agent': {'image_input_mode': 'text'}}
+
+        sanitized = _sanitize_messages_for_api(history, cfg=cfg)
+
+        assert sanitized[0] == {'role': 'user', 'content': 'what is in this image?'}  # list collapsed to str
+        assert 'image_url' not in str(sanitized)  # no image part survives anywhere in the replay
+        assert 'attachments' not in sanitized[0]
+        assert sanitized[1] == {'role': 'assistant', 'content': 'It is a chart.'}  # str content untouched
+
+    def test_native_image_mode_keeps_historical_image_url_parts(self):
+        """Vision-capable/native mode keeps existing multimodal history intact."""
+        content = [
+            {'type': 'text', 'text': 'describe'},
+            {'type': 'image_url', 'image_url': {'url': 'data:image/png;base64,AAA='}},
+        ]
+        cfg = {'agent': {'image_input_mode': 'native'}}  # 'native' must not trigger stripping
+
+        sanitized = _sanitize_messages_for_api([{'role': 'user', 'content': content}], cfg=cfg)
+
+        assert sanitized == [{'role': 'user', 'content': content}]  # list content, image included, preserved
+
     def test_fake_png_rejected_by_magic_bytes(self):
         """A file named .png that is not actually an image must be rejected."""
         with TemporaryDirectory() as d: