mistralai · juliendenize · Jun 16, 2026 · Jun 4, 2026 · Jun 4, 2026 · Jun 4, 2026
diff --git a/src/mistral_common/experimental/app/routers.py b/src/mistral_common/experimental/app/routers.py
@@ -14,7 +14,7 @@
 )
 from mistral_common.experimental.think import _split_content_and_think_chunks
 from mistral_common.experimental.tools import _decode_tool_calls, _split_content_and_tool_calls
-from mistral_common.protocol.instruct.chunk import TextChunk, ThinkChunk
+from mistral_common.protocol.instruct.chunk import ContentChunk, TextChunk, ThinkChunk
 from mistral_common.protocol.instruct.messages import AssistantMessage
 from mistral_common.protocol.instruct.request import ChatCompletionRequest
 from mistral_common.tokens.tokenizers.base import SpecialTokenPolicy, Tokenized, TokenizerVersion
@@ -100,7 +100,7 @@ async def detokenize_to_assistant_message(
     else:
         content_tokens, tool_calls_tokens = tokens, ()
 
-    content: str | list[TextChunk | ThinkChunk] | None = None
+    content: str | list[ContentChunk] | None = None
 
     if settings.tokenizer.instruct_tokenizer.tokenizer.version >= TokenizerVersion.v13:
         assert isinstance(settings.tokenizer.instruct_tokenizer, InstructTokenizerV13)

diff --git a/src/mistral_common/integrations/chat_templates/template_generator.py b/src/mistral_common/integrations/chat_templates/template_generator.py
@@ -216,6 +216,30 @@ def validates_assistant_non_empty(self) -> bool:
         r"""Whether to validate that assistant messages have non-empty content or tool calls."""
         return self.version >= TokenizerVersion.v7 or (self.version >= TokenizerVersion.v3 and not self.spm)
 
+    @property
+    def tool_supports_multimodal(self) -> bool:
+        r"""Whether tool messages can contain non-text content chunks (tokenizer version v15 and later)."""
+        return self.version >= TokenizerVersion.v15
+
+    @property
+    def system_supports_audio(self) -> bool:
+        r"""Whether system messages can contain audio: requires audio support and tokenizer version v15+."""
+        return self.audio_support and self.version >= TokenizerVersion.v15
+
+
+def _join_types_desc(parts: list[str]) -> str:
+    r"""Join type names into a human-readable description string.
+
+    Args:
+        parts: List of type names (e.g. ["text", "thinking", "image"]); may be empty.
+
+    Returns:
+        "" for no parts, else e.g. "text", "text and thinking", or "text, thinking and image".
+    """
+    if len(parts) <= 1:  # zero or one name: nothing to join (avoids IndexError on [])
+        return "".join(parts)
+    return ", ".join(parts[:-1]) + " and " + parts[-1]
+
 
 def _generate_header(config: TemplateConfig) -> str:
     r"""Generate template header with default system message.
@@ -872,6 +896,8 @@ def _generate_system_message_handling(config: TemplateConfig) -> str:
     if has_extra_types:
         if config.system_supports_thinking:
             rc_args += ", supported_types_desc='text and thinking'"
+        elif config.system_supports_audio:
+            rc_args += ", supported_types_desc='text and audio'"
         else:
             rc_args += ", supported_types_desc='text'"
     if config.any_thinking_support:
@@ -882,7 +908,7 @@ def _generate_system_message_handling(config: TemplateConfig) -> str:
     if config.image_support:
         rc_args += ", support_images=false"
     if config.audio_support:
-        rc_args += ", support_audio=false"
+        rc_args += f", support_audio={'true' if config.system_supports_audio else 'false'}"
     lines.append("        {{- render_content(" + rc_args + ") -}}")
 
     lines.append("        {{- '" + _END_SYSTEM + "' -}}")
@@ -1204,10 +1230,10 @@ def _generate_assistant_message_handling(config: TemplateConfig) -> str:
     """
     lines = []
 
+    comment_parts = ["text"]
     if config.any_thinking_support:
-        chunk_types = "text and thinking"
-    else:
-        chunk_types = "text"
+        comment_parts.append("thinking")
+    chunk_types = _join_types_desc(comment_parts)
 
     comment = f"{{#- Assistant messages supports {chunk_types} content. #}}"
     lines.append("")
@@ -1235,10 +1261,10 @@ def _generate_assistant_message_handling(config: TemplateConfig) -> str:
     has_extra_types = config.any_thinking_support or config.image_support or config.audio_support
     rc_call_args = "message['content'], 'assistant message contents'"
     if has_extra_types:
+        desc_parts = ["text"]
         if config.any_thinking_support:
-            rc_call_args += ", supported_types_desc='text and thinking'"
-        else:
-            rc_call_args += ", supported_types_desc='text'"
+            desc_parts.append("thinking")
+        rc_call_args += f", supported_types_desc='{_join_types_desc(desc_parts)}'"
     if config.any_thinking_support:
         rc_call_args += ", support_thinking=true"
     if config.image_support:
@@ -1423,7 +1449,10 @@ def _generate_tool_message_handling(config: TemplateConfig) -> str:
         lines.append("    {#- Tool messages supports int, float or text content. #}")
         lines.append("    {%- elif message['role'] == 'tool' and ns.index > ns.max_idx_user %}")
     else:
-        lines.append("    {#- Tool messages only supports text content. #}")
+        if config.tool_supports_multimodal:
+            lines.append("    {#- Tool messages (multimodal). #}")
+        else:
+            lines.append("    {#- Tool messages only supports text content. #}")
         lines.append("    {%- elif message['role'] == 'tool' %}")
 
     if config.uses_spm_space_tracking:
@@ -1484,9 +1513,26 @@ def _generate_tool_message_handling(config: TemplateConfig) -> str:
                 + "' }}"  # noqa: E501
             )
     elif config.uses_simple_tool_results:
-        lines.append(
-            "        {{- '" + _BEGIN_TOOL_RESULTS + "' + message['content']|string + '" + _END_TOOL_RESULTS + "' }}"
-        )  # noqa: E501
+        if config.tool_supports_multimodal:
+            tool_rc_args = "message['content'], 'tool message contents'"
+            if config.image_support or config.audio_support:
+                desc_parts = ["text"]
+                if config.image_support:
+                    desc_parts.append("image")
+                if config.audio_support:
+                    desc_parts.append("audio")
+                tool_rc_args += f", supported_types_desc='{_join_types_desc(desc_parts)}'"
+            if config.image_support:
+                tool_rc_args += ", support_images=true"
+            if config.audio_support:
+                tool_rc_args += ", support_audio=true"
+            lines.append("        {{- '" + _BEGIN_TOOL_RESULTS + "' -}}")
+            lines.append("        {{- render_content(" + tool_rc_args + ") -}}")
+            lines.append("        {{- '" + _END_TOOL_RESULTS + "' }}")
+        else:
+            lines.append(
+                "        {{- '" + _BEGIN_TOOL_RESULTS + "' + message['content']|string + '" + _END_TOOL_RESULTS + "' }}"
+            )  # noqa: E501
     else:
         # v3 non-spm style
         lines.extend(_emit_int_float_parsing("        "))

diff --git a/src/mistral_common/protocol/instruct/chunk.py b/src/mistral_common/protocol/instruct/chunk.py
@@ -451,9 +451,6 @@ def from_openai(cls, openai_chunk: dict[str, Any]) -> "ThinkChunk":
 ContentChunk = Annotated[
     TextChunk | ImageChunk | ImageURLChunk | AudioChunk | AudioURLChunk | ThinkChunk, Field(discriminator="type")
 ]
-UserContentChunk = Annotated[
-    TextChunk | ImageChunk | ImageURLChunk | AudioChunk | AudioURLChunk, Field(discriminator="type")
-]
 
 
 def _convert_openai_content_chunks(openai_content_chunks: dict[str, Any]) -> ContentChunk: