(inworld tts): add language param (#5723)

tinalenguyen · web-flow · commit 51be60eb5c55 · 2026-05-12T20:48:00.000-04:00
diff --git a/livekit-plugins/livekit-plugins-inworld/livekit/plugins/inworld/tts.py b/livekit-plugins/livekit-plugins-inworld/livekit/plugins/inworld/tts.py
@@ -95,6 +95,7 @@ class _TTSOptions:
     bit_rate: int
     speaking_rate: float
     temperature: float
+    language: NotGivenOr[str] = NOT_GIVEN
     timestamp_type: NotGivenOr[TimestampType] = NOT_GIVEN
     text_normalization: NotGivenOr[TextNormalization] = NOT_GIVEN
     timestamp_transport_strategy: TimestampTransportStrategy = DEFAULT_TIMESTAMP_TRANSPORT_STRATEGY
@@ -401,6 +402,8 @@ async def _send_loop(self) -> None:
                         },
                         "contextId": msg.context_id,
                     }
+                    if is_given(opts.language):
+                        pkt["create"]["language"] = opts.language
                     if is_given(opts.timestamp_type):
                         pkt["create"]["timestampType"] = opts.timestamp_type
                     if is_given(opts.text_normalization):
@@ -825,6 +828,7 @@ def __init__(
         sample_rate: NotGivenOr[int] = NOT_GIVEN,
         speaking_rate: NotGivenOr[float] = NOT_GIVEN,
         temperature: NotGivenOr[float] = NOT_GIVEN,
+        language: NotGivenOr[str] = NOT_GIVEN,
         timestamp_type: NotGivenOr[TimestampType] = NOT_GIVEN,
         text_normalization: NotGivenOr[TextNormalization] = NOT_GIVEN,
         timestamp_transport_strategy: NotGivenOr[TimestampTransportStrategy] = NOT_GIVEN,
@@ -853,6 +857,9 @@ def __init__(
                 Defaults to 1.0.
             temperature (float, optional): Determines the degree of randomness when sampling audio
                 tokens to generate the response. Range (0, 2]. Defaults to 1.0.
+            language (str, optional): BCP-47 language tag (e.g., "en-US", "fr-FR", "ja-JP")
+                specifying the language that the given voice should speak the text in.
+                If not set, the model default applies.
             timestamp_type (str, optional): Controls timestamp metadata returned with the audio.
                 Use "WORD" for word-level timestamps or "CHARACTER" for character-level.
                 Useful for karaoke-style captions, word highlighting, and lipsync.
@@ -926,6 +933,7 @@ def __init__(
             sample_rate=sample_rate if is_given(sample_rate) else DEFAULT_SAMPLE_RATE,
             speaking_rate=speaking_rate if is_given(speaking_rate) else DEFAULT_SPEAKING_RATE,
             temperature=temperature if is_given(temperature) else DEFAULT_TEMPERATURE,
+            language=language,
             timestamp_type=timestamp_type,
             text_normalization=text_normalization,
             timestamp_transport_strategy=timestamp_transport_strategy
@@ -983,6 +991,7 @@ def update_options(
         sample_rate: NotGivenOr[int] = NOT_GIVEN,
         speaking_rate: NotGivenOr[float] = NOT_GIVEN,
         temperature: NotGivenOr[float] = NOT_GIVEN,
+        language: NotGivenOr[str] = NOT_GIVEN,
         timestamp_type: NotGivenOr[TimestampType] = NOT_GIVEN,
         text_normalization: NotGivenOr[TextNormalization] = NOT_GIVEN,
         timestamp_transport_strategy: NotGivenOr[TimestampTransportStrategy] = NOT_GIVEN,
@@ -1001,6 +1010,7 @@ def update_options(
             speaking_rate (float, optional): The speed of the voice.
             temperature (float, optional): Determines the degree of randomness when sampling audio
                 tokens to generate the response.
+            language (str, optional): BCP-47 language tag (e.g., "en-US", "fr-FR").
             timestamp_type (str, optional): Controls timestamp metadata ("WORD" or "CHARACTER").
             text_normalization (str, optional): Controls text normalization ("ON" or "OFF").
             timestamp_transport_strategy (str, optional): Controls timestamp transport strategy
@@ -1023,6 +1033,8 @@ def update_options(
             self._opts.speaking_rate = speaking_rate
         if is_given(temperature):
             self._opts.temperature = temperature
+        if is_given(language):
+            self._opts.language = language
         if is_given(timestamp_type):
             _validate_str_param(timestamp_type, "timestamp_type", TimestampType)
             self._opts.timestamp_type = timestamp_type
@@ -1133,6 +1145,8 @@ async def _run(self, output_emitter: tts.AudioEmitter) -> None:
                 "modelId": self._opts.model,
                 "audioConfig": audio_config,
             }
+            if utils.is_given(self._opts.language):
+                body_params["language"] = self._opts.language
             if utils.is_given(self._opts.timestamp_type):
                 body_params["timestampType"] = self._opts.timestamp_type
             if utils.is_given(self._opts.text_normalization):