From 445db76eda14875797852b18a91e34143564e250 Mon Sep 17 00:00:00 2001 From: haoruilee Date: Mon, 15 Jun 2026 21:28:34 +0800 Subject: [PATCH 1/3] Fix AudioChunk OpenAI serialization --- src/mistral_common/protocol/instruct/chunk.py | 22 +++++++++++++-- tests/test_converters.py | 28 +++++++++++++++++++ 2 files changed, 48 insertions(+), 2 deletions(-) diff --git a/src/mistral_common/protocol/instruct/chunk.py b/src/mistral_common/protocol/instruct/chunk.py index 2ed61528..6feacd9e 100644 --- a/src/mistral_common/protocol/instruct/chunk.py +++ b/src/mistral_common/protocol/instruct/chunk.py @@ -1,4 +1,5 @@ import base64 +import binascii import io import re from enum import Enum @@ -20,6 +21,20 @@ from mistral_common.tokens.tokenizers.audio import Audio +def _strip_audio_data_url_prefix(data: str) -> str: + r"""Remove the optional base64 audio data URL prefix.""" + if re.match(r"^data:audio/\w+;base64,", data): + return data.split(",", 1)[1] + return data + + +def _audio_input_to_base64(data: str | bytes) -> str: + r"""Convert raw audio bytes or base64 audio text to base64 audio text.""" + if isinstance(data, bytes): + return base64.b64encode(data).decode("utf-8") + return _strip_audio_data_url_prefix(data) + + def _detect_audio_format(data: str | bytes) -> str: r"""Detect audio format from base64-encoded string or raw bytes. @@ -37,7 +52,10 @@ def _detect_audio_format(data: str | bytes) -> str: assert_soundfile_installed() if isinstance(data, str): - audio_bytes = base64.b64decode(data) + try: + audio_bytes = base64.b64decode(_strip_audio_data_url_prefix(data)) + except (binascii.Error, ValueError) as e: + raise ValueError("Failed to detect audio format. Verify that the given file is valid wav or mp3.") from e else: audio_bytes = data @@ -379,7 +397,7 @@ def to_openai(self) -> dict[str, Any]: Returns: A dictionary representing the audio chunk in the OpenAI format. """ - content = self.input_audio.decode("utf-8") if isinstance(self.input_audio, bytes) else self.input_audio + content = _audio_input_to_base64(self.input_audio) fmt = _detect_audio_format(self.input_audio) return { "type": self.type, diff --git a/tests/test_converters.py b/tests/test_converters.py index d093d945..e8f175da 100644 --- a/tests/test_converters.py +++ b/tests/test_converters.py @@ -1,3 +1,4 @@ +import base64 import copy import io import warnings @@ -1197,6 +1198,33 @@ def test_audio_chunk_to_openai_format_detection(fmt: str) -> None: assert AudioChunk.from_openai(result).input_audio == b64 +@pytest.mark.parametrize("fmt", ["wav", "flac"]) +def test_audio_chunk_to_openai_raw_bytes_format_detection(fmt: str) -> None: + audio = _make_fake_audio(0.5) + buffer = io.BytesIO() + sf.write(buffer, audio.audio_array, audio.sampling_rate, format=fmt) + raw_bytes = buffer.getvalue() + + result = AudioChunk(input_audio=raw_bytes).to_openai() + + assert result["input_audio"]["format"] == fmt + assert result["input_audio"]["data"] == base64.b64encode(raw_bytes).decode("utf-8") + assert AudioChunk.from_openai(result).input_audio == result["input_audio"]["data"] + + +@pytest.mark.parametrize("fmt", ["wav", "flac"]) +def test_audio_chunk_to_openai_strips_base64_data_url_prefix(fmt: str) -> None: + audio = _make_fake_audio(0.5) + b64 = audio.to_base64(fmt) + chunk = AudioChunk(input_audio=f"data:audio/{fmt};base64,{b64}") + + result = chunk.to_openai() + + assert result["input_audio"]["format"] == fmt + assert result["input_audio"]["data"] == b64 + assert AudioChunk.from_openai(result).input_audio == b64 + + @pytest.mark.parametrize("fmt", ["wav", "flac"]) def test_transcription_to_openai_format_detection(fmt: str) -> None: audio = _make_fake_audio(0.5) From 7d34afb40a641ed16dc1e113f96a2f0c53be01f8 Mon Sep 17 00:00:00 2001 From: Julien Denize <40604584+juliendenize@users.noreply.github.com> Date: Tue, 16 Jun 2026 15:28:52 +0200 Subject: [PATCH 2/3] Remove redundant try/except in _detect_audio_format --- src/mistral_common/protocol/instruct/chunk.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/src/mistral_common/protocol/instruct/chunk.py b/src/mistral_common/protocol/instruct/chunk.py index 6feacd9e..cfb84050 100644 --- a/src/mistral_common/protocol/instruct/chunk.py +++ b/src/mistral_common/protocol/instruct/chunk.py @@ -1,5 +1,4 @@ import base64 -import binascii import io import re from enum import Enum @@ -52,10 +51,7 @@ def _detect_audio_format(data: str | bytes) -> str: assert_soundfile_installed() if isinstance(data, str): - try: - audio_bytes = base64.b64decode(_strip_audio_data_url_prefix(data)) - except (binascii.Error, ValueError) as e: - raise ValueError("Failed to detect audio format. Verify that the given file is valid wav or mp3.") from e + audio_bytes = base64.b64decode(_strip_audio_data_url_prefix(data)) else: audio_bytes = data From d1300fb68d57a43f3f2fc48050b0f1678dea4023 Mon Sep 17 00:00:00 2001 From: Julien Denize <40604584+juliendenize@users.noreply.github.com> Date: Tue, 16 Jun 2026 16:23:24 +0200 Subject: [PATCH 3/3] Inline _audio_input_to_base64 into AudioChunk.to_openai --- src/mistral_common/protocol/instruct/chunk.py | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/src/mistral_common/protocol/instruct/chunk.py b/src/mistral_common/protocol/instruct/chunk.py index cfb84050..c616425b 100644 --- a/src/mistral_common/protocol/instruct/chunk.py +++ b/src/mistral_common/protocol/instruct/chunk.py @@ -27,13 +27,6 @@ def _strip_audio_data_url_prefix(data: str) -> str: return data -def _audio_input_to_base64(data: str | bytes) -> str: - r"""Convert raw audio bytes or base64 audio text to base64 audio text.""" - if isinstance(data, bytes): - return base64.b64encode(data).decode("utf-8") - return _strip_audio_data_url_prefix(data) - - def _detect_audio_format(data: str | bytes) -> str: r"""Detect audio format from base64-encoded string or raw bytes. @@ -393,7 +386,10 @@ def to_openai(self) -> dict[str, Any]: Returns: A dictionary representing the audio chunk in the OpenAI format. """ - content = _audio_input_to_base64(self.input_audio) + if isinstance(self.input_audio, bytes): + content = base64.b64encode(self.input_audio).decode("utf-8") + else: + content = _strip_audio_data_url_prefix(self.input_audio) fmt = _detect_audio_format(self.input_audio) return { "type": self.type,