From 8fbbf8765f2c094837d29efe86bb65c85696b897 Mon Sep 17 00:00:00 2001
From: Shivank Kacker <kacker.shivank@gmail.com>
Date: Tue, 12 May 2026 16:22:59 +0530
Subject: [PATCH 1/6] add support for normal transcriptions

---
 .../migrations/0012_scribe_transcript_only.py |  98 ++++++++
 care_scribe/models/scribe.py                  |   4 +
 care_scribe/serializers/scribe.py             |   1 +
 care_scribe/settings.py                       |  45 ++--
 care_scribe/tasks/scribe.py                   | 216 +++++++++++++++---
 requirements_dev.txt                          |   1 +
 setup.py                                      |   1 +
 7 files changed, 311 insertions(+), 55 deletions(-)
 create mode 100644 care_scribe/migrations/0012_scribe_transcript_only.py

diff --git a/care_scribe/migrations/0012_scribe_transcript_only.py b/care_scribe/migrations/0012_scribe_transcript_only.py
new file mode 100644
index 0000000..7147115
--- /dev/null
+++ b/care_scribe/migrations/0012_scribe_transcript_only.py
@@ -0,0 +1,98 @@
+from django.db import migrations, models
+
+
+def rename_processing_meta_keys_forward(apps, schema_editor):
+    """Rename old processing meta keys to the new keys.
+
+    Old -> New:
+        provider     -> chat_provider
+        audio_model  -> transcribe_model
+
+    Also adds `transcribe_provider` mirroring `chat_provider` so historical
+    entries match the new shape.
+    """
+    Scribe = apps.get_model("care_scribe", "Scribe")
+    to_update = []
+    for scribe in Scribe.objects.exclude(meta={}).iterator():
+        meta = scribe.meta or {}
+        processings = meta.get("processings")
+        if not processings:
+            continue
+        changed = False
+        for processing in processings:
+            if not isinstance(processing, dict):
+                continue
+            if "provider" in processing and "chat_provider" not in processing:
+                processing["chat_provider"] = processing.pop("provider")
+                changed = True
+            if "audio_model" in processing and "transcribe_model" not in processing:
+                processing["transcribe_model"] = processing.pop("audio_model")
+                changed = True
+            if (
+                "chat_provider" in processing
+                and "transcribe_provider" not in processing
+            ):
+                processing["transcribe_provider"] = processing["chat_provider"]
+                changed = True
+        if changed:
+            scribe.meta = meta
+            to_update.append(scribe)
+            if len(to_update) >= 500:
+                Scribe.objects.bulk_update(to_update, ["meta"])
+                to_update = []
+    if to_update:
+        Scribe.objects.bulk_update(to_update, ["meta"])
+
+
+def rename_processing_meta_keys_reverse(apps, schema_editor):
+    """Revert the rename: new keys -> old keys."""
+    Scribe = apps.get_model("care_scribe", "Scribe")
+    to_update = []
+    for scribe in Scribe.objects.exclude(meta={}).iterator():
+        meta = scribe.meta or {}
+        processings = meta.get("processings")
+        if not processings:
+            continue
+        changed = False
+        for processing in processings:
+            if not isinstance(processing, dict):
+                continue
+            if "chat_provider" in processing and "provider" not in processing:
+                processing["provider"] = processing.pop("chat_provider")
+                changed = True
+            if "transcribe_model" in processing and "audio_model" not in processing:
+                processing["audio_model"] = processing.pop("transcribe_model")
+                changed = True
+            if "transcribe_provider" in processing:
+                processing.pop("transcribe_provider")
+                changed = True
+        if changed:
+            scribe.meta = meta
+            to_update.append(scribe)
+            if len(to_update) >= 500:
+                Scribe.objects.bulk_update(to_update, ["meta"])
+                to_update = []
+    if to_update:
+        Scribe.objects.bulk_update(to_update, ["meta"])
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ('care_scribe', '0011_scribefile_mime_type'),
+    ]
+
+    operations = [
+        migrations.AddField(
+            model_name='scribe',
+            name='transcript_only',
+            field=models.BooleanField(
+                default=False,
+                help_text='If True, only transcribe the audio without running any AI form-fill processing.',
+            ),
+        ),
+        migrations.RunPython(
+            rename_processing_meta_keys_forward,
+            rename_processing_meta_keys_reverse,
+        ),
+    ]
diff --git a/care_scribe/models/scribe.py b/care_scribe/models/scribe.py
index e7e070f..0d9bcc3 100644
--- a/care_scribe/models/scribe.py
+++ b/care_scribe/models/scribe.py
@@ -124,6 +124,10 @@ class Status(models.TextChoices):
     chat_model = models.CharField(max_length=100, null=True, blank=True)
     audio_model = models.CharField(max_length=100, null=True, blank=True)
     chat_model_temperature = models.FloatField(null=True, blank=True)
+    transcript_only = models.BooleanField(
+        default=False,
+        help_text="If True, only transcribe the audio without running any AI form-fill processing.",
+    )
 
     is_feedback_positive = models.BooleanField(null=True, blank=True, help_text="Whether the user has given positive feedback on the AI response")
     feedback_comments = models.TextField(null=True, blank=True, help_text="Details of the feedback provided by the user")
diff --git a/care_scribe/serializers/scribe.py b/care_scribe/serializers/scribe.py
index d093b00..e36e53b 100644
--- a/care_scribe/serializers/scribe.py
+++ b/care_scribe/serializers/scribe.py
@@ -77,6 +77,7 @@ class Meta:
             "chat_model",
             "audio_model",
             "chat_model_temperature",
+            "transcript_only",
             "is_feedback_positive",
             "feedback_comments",
         ]
diff --git a/care_scribe/settings.py b/care_scribe/settings.py
index f31199a..1b914c6 100644
--- a/care_scribe/settings.py
+++ b/care_scribe/settings.py
@@ -86,21 +86,32 @@ def validate(self) -> None:
                     f'Please set the "{setting}" in the environment or the {PLUGIN_NAME} plugin config.'
                 )
 
-        if getattr(self, "SCRIBE_API_PROVIDER") not in ("openai", "azure", "google"):
+        valid_providers = ("openai", "azure", "google")
+        providers_in_use = set()
+
+        for setting_name in ("SCRIBE_CHAT_MODEL_NAME", "SCRIBE_TRANSCRIBE_MODEL_NAME"):
+            value = getattr(self, setting_name)
+            if "/" not in value:
+                raise ImproperlyConfigured(
+                    f'Invalid value for "{setting_name}". '
+                    f'Expected format "provider/model-name" '
+                    f'(provider must be one of {valid_providers}).'
+                )
+            provider = value.split("/", 1)[0]
+            if provider not in valid_providers:
+                raise ImproperlyConfigured(
+                    f'Invalid provider "{provider}" in "{setting_name}". '
+                    f'Provider must be one of {valid_providers}.'
+                )
+            providers_in_use.add(provider)
+
+        if "openai" in providers_in_use and not getattr(self, "SCRIBE_OPENAI_API_KEY"):
             raise ImproperlyConfigured(
-                'Invalid value for "SCRIBE_API_PROVIDER". '
-                'Please set the "SCRIBE_API_PROVIDER" to "openai", "google" or "azure".'
+                'The "SCRIBE_OPENAI_API_KEY" setting is required when using OpenAI API. '
+                f'Please set it in the environment or the {PLUGIN_NAME} plugin config.'
             )
 
-        if getattr(self, "SCRIBE_API_PROVIDER") == "openai":
-            for setting in ("SCRIBE_OPENAI_API_KEY",):
-                if not getattr(self, setting):
-                    raise ImproperlyConfigured(
-                        f'The "{setting}" setting is required when using OpenAI API. '
-                        f'Please set the "{setting}" in the environment or the {PLUGIN_NAME} plugin config.'
-                    )
-
-        if getattr(self, "SCRIBE_API_PROVIDER") == "azure":
+        if "azure" in providers_in_use:
             for setting in ("SCRIBE_AZURE_API_VERSION", "SCRIBE_AZURE_ENDPOINT", "SCRIBE_AZURE_API_KEY"):
                 if not getattr(self, setting):
                     raise ImproperlyConfigured(
@@ -108,7 +119,7 @@ def validate(self) -> None:
                         f'Please set the "{setting}" in the environment or the {PLUGIN_NAME} plugin config.'
                     )
 
-        if getattr(self, "SCRIBE_API_PROVIDER") == "google":
+        if "google" in providers_in_use:
             for setting in ("SCRIBE_GOOGLE_PROJECT_ID", "SCRIBE_GOOGLE_LOCATION"):
                 if not getattr(self, setting):
                     raise ImproperlyConfigured(
@@ -129,19 +140,19 @@ def reload(self) -> None:
 
 REQUIRED_SETTINGS = {
     "SCRIBE_CHAT_MODEL_NAME",
-    "SCRIBE_API_PROVIDER",
+    "SCRIBE_TRANSCRIBE_MODEL_NAME",
 }
 
 DEFAULTS = {
     "SCRIBE_OPENAI_API_KEY": "",
     "SCRIBE_AZURE_API_KEY": "",
-    "SCRIBE_AUDIO_MODEL_NAME": "whisper-1",
-    "SCRIBE_CHAT_MODEL_NAME": "gpt-4o",
-    "SCRIBE_API_PROVIDER": "openai",
+    "SCRIBE_TRANSCRIBE_MODEL_NAME": "openai/whisper-1",
+    "SCRIBE_CHAT_MODEL_NAME": "openai/gpt-4o",
     "SCRIBE_AZURE_API_VERSION": "",
     "SCRIBE_AZURE_ENDPOINT": "",
     "SCRIBE_GOOGLE_PROJECT_ID" : "",
     "SCRIBE_GOOGLE_LOCATION" : "",
+    "SCRIBE_GOOGLE_LANGUAGE_CODES": "auto",
     "SCRIBE_TNC": "<Please add your terms and conditions here>",
 }
 
diff --git a/care_scribe/tasks/scribe.py b/care_scribe/tasks/scribe.py
index 4f2368b..21ed2cf 100644
--- a/care_scribe/tasks/scribe.py
+++ b/care_scribe/tasks/scribe.py
@@ -21,7 +21,101 @@
 
 logger = logging.getLogger(__name__)
 
-def ai_client(provider=plugin_settings.SCRIBE_API_PROVIDER):
+def _google_credentials():
+    b64_credentials = os.getenv("GOOGLE_APPLICATION_CREDENTIALS_B64")
+    if not b64_credentials:
+        return None
+    info = json.loads(base64.b64decode(b64_credentials).decode("utf-8"))
+    return service_account.Credentials.from_service_account_info(
+        info, scopes=["https://www.googleapis.com/auth/cloud-platform"]
+    )
+
+def google_stt_transcribe(audio_file_object, model_name):
+    """Transcribe a single audio file using Google Cloud Speech-to-Text v2."""
+    from google.api_core.client_options import ClientOptions
+    from google.cloud.speech_v2 import SpeechClient
+    from google.cloud.speech_v2.types import cloud_speech
+
+    location = plugin_settings.SCRIBE_GOOGLE_LOCATION or "global"
+    client_options = None
+    if location and location != "global":
+        client_options = ClientOptions(
+            api_endpoint=f"{location}-speech.googleapis.com"
+        )
+    client = SpeechClient(
+        credentials=_google_credentials(), client_options=client_options
+    )
+
+    _, audio_data = audio_file_object.files_manager.file_contents(audio_file_object)
+    recognizer = (
+        f"projects/{plugin_settings.SCRIBE_GOOGLE_PROJECT_ID}"
+        f"/locations/{location}/recognizers/_"
+    )
+    config = cloud_speech.RecognitionConfig(
+        auto_decoding_config=cloud_speech.AutoDetectDecodingConfig(),
+        language_codes=[
+            code.strip()
+            for code in (plugin_settings.SCRIBE_GOOGLE_LANGUAGE_CODES or "en-US").split(",")
+            if code.strip()
+        ],
+        model=model_name or "long",
+    )
+    response = client.recognize(
+        request=cloud_speech.RecognizeRequest(
+            recognizer=recognizer,
+            config=config,
+            content=audio_data,
+        )
+    )
+    return " ".join(
+        result.alternatives[0].transcript
+        for result in response.results
+        if result.alternatives
+    )
+
+
+def transcribe_audio_file(audio_file_object, provider, audio_model):
+    """Transcribe a single audio file using the configured provider."""
+    if provider == "google":
+        return google_stt_transcribe(audio_file_object, audio_model)
+
+    client = ai_client(provider)
+    _, audio_file_data = audio_file_object.files_manager.file_contents(
+        audio_file_object
+    )
+    fmt = audio_file_object.internal_name.split(".")[-1]
+    buffer = io.BytesIO(audio_file_data)
+    buffer.name = "file." + fmt
+    # Only whisper-1 supports the /audio/translations endpoint.
+    # Newer models (gpt-4o-transcribe, gpt-4o-mini-transcribe, etc.) are
+    # transcription-only and must use /audio/transcriptions.
+    if audio_model == "whisper-1":
+        transcription = client.audio.translations.create(
+            model=audio_model, file=buffer
+        )
+    else:
+        transcription = client.audio.transcriptions.create(
+            model=audio_model, file=buffer
+        )
+    return transcription.text
+
+
+def _parse_provider_model(value: str):
+    """Split a 'provider/model-name' string into (provider, model).
+
+    The model portion may itself contain '/' characters (kept intact).
+    """
+    if not value or "/" not in value:
+        raise ValueError(
+            f"Expected 'provider/model-name' format, got: {value!r}"
+        )
+    provider, model = value.split("/", 1)
+    if provider == "openai" and plugin_settings.SCRIBE_AZURE_API_KEY:
+        provider = "azure"
+    return provider, model
+
+
+def ai_client(provider):
     if provider == "azure":
         AiClient = AzureOpenAI(
             api_key=plugin_settings.SCRIBE_AZURE_API_KEY,
@@ -34,25 +128,18 @@ def ai_client(provider=plugin_settings.SCRIBE_API_PROVIDER):
         )
 
     elif provider == "google":
-        credentials = None
-        b64_credentials = os.getenv("GOOGLE_APPLICATION_CREDENTIALS_B64")
-
-        if b64_credentials:
-            info = json.loads(base64.b64decode(b64_credentials).decode("utf-8"))
-            credentials = service_account.Credentials.from_service_account_info(info, scopes=["https://www.googleapis.com/auth/cloud-platform"])
-
         AiClient = genai.Client(
             vertexai=True,
             project=plugin_settings.SCRIBE_GOOGLE_PROJECT_ID,
             location=plugin_settings.SCRIBE_GOOGLE_LOCATION,
-            credentials=credentials,
+            credentials=_google_credentials(),
         )
 
     else:
         raise Exception("Invalid api provider")
     return AiClient
 
-def chat_message(provider=plugin_settings.SCRIBE_API_PROVIDER, role="user", text=None, file_object=None, file_type="audio"):
+def chat_message(provider, role="user", text=None, file_object=None, file_type="audio"):
     """ Generates a chat message compatible with the given AI provider client."""
     if file_object:
         _, file_data = file_object.files_manager.file_contents(file_object)
@@ -184,35 +271,88 @@ def process_ai_form_fill(external_id):
             form.save()
             return
 
-    api_provider = plugin_settings.SCRIBE_API_PROVIDER
-    chat_model = plugin_settings.SCRIBE_CHAT_MODEL_NAME
-    audio_model = plugin_settings.SCRIBE_AUDIO_MODEL_NAME
+    chat_provider, chat_model = _parse_provider_model(
+        plugin_settings.SCRIBE_CHAT_MODEL_NAME
+    )
+    transcribe_provider, transcribe_model = _parse_provider_model(
+        plugin_settings.SCRIBE_TRANSCRIBE_MODEL_NAME
+    )
     temperature = 0
 
     if form.chat_model:
-        api_provider = form.chat_model.split("/")[0]
-        if api_provider == "openai" and plugin_settings.SCRIBE_AZURE_API_KEY is not "":
-            api_provider = "azure"
-        chat_model = form.chat_model.split("/")[1]
+        chat_provider, chat_model = _parse_provider_model(form.chat_model)
 
     if form.audio_model:
-        audio_model = form.audio_model
+        # Form override may be either "provider/model" or just a model name
+        if "/" in form.audio_model:
+            transcribe_provider, transcribe_model = _parse_provider_model(
+                form.audio_model
+            )
+        else:
+            transcribe_model = form.audio_model
 
     if form.chat_model_temperature is not None:
         temperature = form.chat_model_temperature
 
-    processing["provider"] = api_provider
+    processing["chat_provider"] = chat_provider
     processing["chat_model"] = chat_model
-    processing["audio_model"] = audio_model if api_provider != "google" else None
+    processing["transcribe_provider"] = transcribe_provider
+    processing["transcribe_model"] = (
+        transcribe_model if chat_provider != "google" else None
+    )
     processing["form_data"] = form.form_data
 
-    # Instantiate the AI client once to avoid premature closure and resource management issues,
-    # especially with the Google GenAI provider. Reuse this client instance throughout the function.
-    client = ai_client(api_provider)
-
     audio_files = ScribeFile.objects.filter(external_id__in=form.audio_file_ids)
     total_audio_duration = sum(file.meta.get("length", 0) for file in audio_files)
 
+    if form.transcript_only:
+        logger.info(f"=== Processing transcript-only Scribe {form.external_id} ===")
+        processing["transcript_only"] = True
+        processing["transcribe_model"] = transcribe_model
+        try:
+            form.status = Scribe.Status.GENERATING_TRANSCRIPT
+            form.save()
+            transcript = form.transcript or ""
+            if not transcript:
+                transcription_start = perf_counter()
+                for audio_file_object in audio_files:
+                    transcript += (
+                        transcribe_audio_file(
+                            audio_file_object=audio_file_object,
+                            provider=transcribe_provider,
+                            audio_model=transcribe_model,
+                        )
+                        or ""
+                    )
+                processing["transcription_time"] = perf_counter() - transcription_start
+            form.transcript = transcript
+            form.meta["processings"] = [
+                *form.meta.get("processings", []),
+                processing,
+            ]
+            form.status = Scribe.Status.COMPLETED
+            form.save()
+            if not is_benchmark:
+                user_quota.calculate_used()
+                facility_quota.calculate_used()
+        except Exception as e:
+            logger.error(
+                f"Transcript-only processing failed at line "
+                f"{e.__traceback__.tb_lineno}: {e}"
+            )
+            processing["error"] = str(e)
+            form.meta["processings"] = [
+                *form.meta.get("processings", []),
+                processing,
+            ]
+            form.status = Scribe.Status.FAILED
+            form.save()
+        return
+
+    # Instantiate the AI client once to avoid premature closure and resource management issues,
+    # especially with the Google GenAI provider. Reuse this client instance throughout the function.
+    client = ai_client(chat_provider)
+
     processed_fields = {}
 
     def process_fields(fields: list, indent: int = 0):
@@ -247,7 +387,7 @@ def process_fields(fields: list, indent: int = 0):
         # Asking for the full transcription on longer audio would eat up too many tokens.
         output_schema["properties"]["__scribe__transcription"]["description"] = f"A short summarized transcription of the {'image' if len(form.document_file_ids) > 0 else 'audio'} content, focusing on key points and insights in English."
 
-    if api_provider != "google" and len(form.document_file_ids) == 0:
+    if chat_provider != "google" and len(form.document_file_ids) == 0:
         # As we are transcribing using whisper, we do not need the transcription field in the output schema
         del output_schema["properties"]["__scribe__transcription"]
         output_schema["required"].remove("__scribe__transcription")
@@ -261,7 +401,7 @@ def process_fields(fields: list, indent: int = 0):
 
     messages.append(
         chat_message(
-            provider=api_provider,
+            provider=chat_provider,
             role="system",
             text=base_prompt,
         )
@@ -270,7 +410,7 @@ def process_fields(fields: list, indent: int = 0):
     if form.text:
         messages.append(
             chat_message(
-                provider=api_provider,
+                provider=chat_provider,
                 role="user",
                 text=form.text,
             )
@@ -286,10 +426,10 @@ def process_fields(fields: list, indent: int = 0):
 
             for audio_file_object in audio_files:
 
-                if api_provider == "google":
+                if chat_provider == "google":
                     messages.append(
                         chat_message(
-                            provider=api_provider,
+                            provider=chat_provider,
                             role="user",
                             file_object=audio_file_object,
                             file_type="audio",
@@ -297,13 +437,13 @@ def process_fields(fields: list, indent: int = 0):
                     )
 
                 else:
-                    _, audio_file_data = audio_file_object.files_manager.file_contents(audio_file_object)
-                    format = audio_file_object.internal_name.split(".")[-1]
-                    buffer = io.BytesIO(audio_file_data)
-                    buffer.name = "file." + format
                     logger.info(f"=== Generating transcript for AI form fill {form.external_id} ===")
                     try:
-                        transcription = client.audio.translations.create(model=audio_model, file=buffer)
+                        transcription_text = transcribe_audio_file(
+                            audio_file_object=audio_file_object,
+                            provider=transcribe_provider,
+                            audio_model=transcribe_model,
+                        )
                     except Exception as e:
                         logger.error(f"Error generating transcript: {e}")
                         processing["error"] = f"Error generating transcript: {e}"
@@ -315,7 +455,7 @@ def process_fields(fields: list, indent: int = 0):
                         form.save()
                         return
 
-                    transcript += transcription.text
+                    transcript += transcription_text or ""
                     logger.info(f"Transcript: {transcript}")
 
                     transcription_time = perf_counter() - initiation_time
@@ -333,7 +473,7 @@ def process_fields(fields: list, indent: int = 0):
         for document_file_object in document_file_objects:
             messages.append(
                 chat_message(
-                    provider=api_provider,
+                    provider=chat_provider,
                     role="user",
                     file_object=document_file_object,
                     file_type="image",
@@ -343,7 +483,7 @@ def process_fields(fields: list, indent: int = 0):
         if transcript != "":
             messages.append(
                 chat_message(
-                    provider=api_provider,
+                    provider=chat_provider,
                     role="user",
                     text=transcript,
                 )
@@ -355,7 +495,7 @@ def process_fields(fields: list, indent: int = 0):
 
         completion_start_time = perf_counter()
 
-        if api_provider == "google":
+        if chat_provider == "google":
 
             output_schema_hash = hash_string(json.dumps(output_schema, sort_keys=True))
             try:
diff --git a/requirements_dev.txt b/requirements_dev.txt
index 6e09cc8..8517411 100644
--- a/requirements_dev.txt
+++ b/requirements_dev.txt
@@ -19,3 +19,4 @@ django-filter
 jsonschema
 openai==2.2.0
 google-genai==1.41.0
+google-cloud-speech==2.39.0
diff --git a/setup.py b/setup.py
index 4466f13..b43c577 100644
--- a/setup.py
+++ b/setup.py
@@ -20,6 +20,7 @@
     "jsonschema",
     "openai==2.2.0",
     "google-genai==1.41.0",
+    "google-cloud-speech==2.27.0",
 ]
 
 test_requirements = []

From ec5cd67ec88a403db4ddc7cbf70ded4743b7db43 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Thu, 14 May 2026 11:09:37 +0000
Subject: [PATCH 2/6] Switch Google STT from transcription to translation

Agent-Logs-Url: https://github.com/ohcnetwork/care_scribe/sessions/d33ec1c6-4ec2-4750-95c8-83441bab343f

Co-authored-by: shivankacker <23238460+shivankacker@users.noreply.github.com>
---
 care_scribe/tasks/scribe.py | 22 +++++++++++++---------
 1 file changed, 13 insertions(+), 9 deletions(-)

diff --git a/care_scribe/tasks/scribe.py b/care_scribe/tasks/scribe.py
index 21ed2cf..0cafb3f 100644
--- a/care_scribe/tasks/scribe.py
+++ b/care_scribe/tasks/scribe.py
@@ -30,8 +30,8 @@ def _google_credentials():
         info, scopes=["https://www.googleapis.com/auth/cloud-platform"]
     )
 
-def google_stt_transcribe(audio_file_object, model_name):
-    """Transcribe a single audio file using Google Cloud Speech-to-Text v2."""
+def google_stt_translate(audio_file_object, model_name):
+    """Translate a single audio file to English using Google Cloud Speech-to-Text v2."""
     from google.api_core.client_options import ClientOptions
     from google.cloud.speech_v2 import SpeechClient
     from google.cloud.speech_v2.types import cloud_speech
@@ -51,14 +51,18 @@ def google_stt_transcribe(audio_file_object, model_name):
         f"projects/{plugin_settings.SCRIBE_GOOGLE_PROJECT_ID}"
         f"/locations/{location}/recognizers/_"
     )
+    language_codes = [
+        stripped
+        for code in (plugin_settings.SCRIBE_GOOGLE_LANGUAGE_CODES or "en-US").split(",")
+        if (stripped := code.strip()) and stripped.lower() != "auto"
+    ] or ["en-US"]
     config = cloud_speech.RecognitionConfig(
         auto_decoding_config=cloud_speech.AutoDetectDecodingConfig(),
-        language_codes=[
-            code.strip()
-            for code in (plugin_settings.SCRIBE_GOOGLE_LANGUAGE_CODES or "en-US").split(",")
-            if code.strip()
-        ],
+        language_codes=language_codes,
         model=model_name or "long",
+        translation_config=cloud_speech.TranslationConfig(
+            target_language="en-US",
+        ),
     )
     response = client.recognize(
         request=cloud_speech.RecognizeRequest(
@@ -68,7 +72,7 @@ def google_stt_transcribe(audio_file_object, model_name):
         )
     )
     return " ".join(
-        result.alternatives[0].transcript
+        result.alternatives[0].translation
         for result in response.results
         if result.alternatives
     )
@@ -77,7 +81,7 @@ def google_stt_transcribe(audio_file_object, model_name):
 def transcribe_audio_file(audio_file_object, provider, audio_model):
     """Transcribe a single audio file using the configured provider."""
     if provider == "google":
-        return google_stt_transcribe(audio_file_object, audio_model)
+        return google_stt_translate(audio_file_object, audio_model)
 
     client = ai_client(provider)
     _, audio_file_data = audio_file_object.files_manager.file_contents(

From 211f7542d48c66a0e46d9e483c9bcc28db58e6f8 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Thu, 14 May 2026 11:20:34 +0000
Subject: [PATCH 3/6] Rename SCRIBE_GOOGLE_LANGUAGE_CODES to
 SCRIBE_GOOGLE_LANGUAGE_CODE

Agent-Logs-Url: https://github.com/ohcnetwork/care_scribe/sessions/2bdc2018-d89a-423e-935a-171f9ffd49aa

Co-authored-by: shivankacker <23238460+shivankacker@users.noreply.github.com>
---
 care_scribe/settings.py     |  2 +-
 care_scribe/tasks/scribe.py | 16 ++++++++--------
 2 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/care_scribe/settings.py b/care_scribe/settings.py
index 1b914c6..5914457 100644
--- a/care_scribe/settings.py
+++ b/care_scribe/settings.py
@@ -152,7 +152,7 @@ def reload(self) -> None:
     "SCRIBE_AZURE_ENDPOINT": "",
     "SCRIBE_GOOGLE_PROJECT_ID" : "",
     "SCRIBE_GOOGLE_LOCATION" : "",
-    "SCRIBE_GOOGLE_LANGUAGE_CODES": "auto",
+    "SCRIBE_GOOGLE_LANGUAGE_CODE": "en-US",
     "SCRIBE_TNC": "<Please add your terms and conditions here>",
 }
 
diff --git a/care_scribe/tasks/scribe.py b/care_scribe/tasks/scribe.py
index 0cafb3f..9579cb0 100644
--- a/care_scribe/tasks/scribe.py
+++ b/care_scribe/tasks/scribe.py
@@ -31,7 +31,11 @@ def _google_credentials():
     )
 
 def google_stt_translate(audio_file_object, model_name):
-    """Translate a single audio file to English using Google Cloud Speech-to-Text v2."""
+    """Translate a single audio file using Google Cloud Speech-to-Text v2.
+
+    The audio is auto-detected as any source language and translated into the
+    language configured via ``SCRIBE_GOOGLE_LANGUAGE_CODE`` (defaults to ``en-US``).
+    """
     from google.api_core.client_options import ClientOptions
     from google.cloud.speech_v2 import SpeechClient
     from google.cloud.speech_v2.types import cloud_speech
@@ -51,17 +55,13 @@ def google_stt_translate(audio_file_object, model_name):
         f"projects/{plugin_settings.SCRIBE_GOOGLE_PROJECT_ID}"
         f"/locations/{location}/recognizers/_"
     )
-    language_codes = [
-        stripped
-        for code in (plugin_settings.SCRIBE_GOOGLE_LANGUAGE_CODES or "en-US").split(",")
-        if (stripped := code.strip()) and stripped.lower() != "auto"
-    ] or ["en-US"]
+    target_language = plugin_settings.SCRIBE_GOOGLE_LANGUAGE_CODE or "en-US"
     config = cloud_speech.RecognitionConfig(
         auto_decoding_config=cloud_speech.AutoDetectDecodingConfig(),
-        language_codes=language_codes,
+        language_codes=["auto"],
         model=model_name or "long",
         translation_config=cloud_speech.TranslationConfig(
-            target_language="en-US",
+            target_language=target_language,
         ),
     )
     response = client.recognize(

From e89b8d53fed55e3830374c6404b8e18574dd4461 Mon Sep 17 00:00:00 2001
From: Shivank Kacker <kacker.shivank@gmail.com>
Date: Fri, 15 May 2026 14:12:27 +0530
Subject: [PATCH 4/6] Update to support translation and transcription

---
 care_scribe/settings.py     |  2 +-
 care_scribe/tasks/scribe.py | 95 +++++++++++++++++++++----------------
 2 files changed, 55 insertions(+), 42 deletions(-)

diff --git a/care_scribe/settings.py b/care_scribe/settings.py
index 5914457..dbf3852 100644
--- a/care_scribe/settings.py
+++ b/care_scribe/settings.py
@@ -152,7 +152,7 @@ def reload(self) -> None:
     "SCRIBE_AZURE_ENDPOINT": "",
     "SCRIBE_GOOGLE_PROJECT_ID" : "",
     "SCRIBE_GOOGLE_LOCATION" : "",
-    "SCRIBE_GOOGLE_LANGUAGE_CODE": "en-US",
+    "SCRIBE_TRANSCRIBE_LANGUAGE": "", # only works for google. OpenAI can return source language or only translate to English.
     "SCRIBE_TNC": "<Please add your terms and conditions here>",
 }
 
diff --git a/care_scribe/tasks/scribe.py b/care_scribe/tasks/scribe.py
index 9579cb0..b03fa15 100644
--- a/care_scribe/tasks/scribe.py
+++ b/care_scribe/tasks/scribe.py
@@ -30,58 +30,71 @@ def _google_credentials():
         info, scopes=["https://www.googleapis.com/auth/cloud-platform"]
     )
 
-def google_stt_translate(audio_file_object, model_name):
-    """Translate a single audio file using Google Cloud Speech-to-Text v2.
+def _google_llm_transcribe(audio_file_object, model_name):
+    """Transcribe a single audio file using a Google Gemini model.
 
-    The audio is auto-detected as any source language and translated into the
-    language configured via ``SCRIBE_GOOGLE_LANGUAGE_CODE`` (defaults to ``en-US``).
+    The audio is sent to the configured Gemini model with a prompt instructing
+    it to return ONLY the transcribed text. If ``SCRIBE_TRANSCRIBE_LANGUAGE``
+    is set, the model is asked to translate into that language; otherwise the
+    transcript is returned in the original spoken language.
     """
-    from google.api_core.client_options import ClientOptions
-    from google.cloud.speech_v2 import SpeechClient
-    from google.cloud.speech_v2.types import cloud_speech
-
-    location = plugin_settings.SCRIBE_GOOGLE_LOCATION or "global"
-    client_options = None
-    if location and location != "global":
-        client_options = ClientOptions(
-            api_endpoint=f"{location}-speech.googleapis.com"
-        )
-    client = SpeechClient(
-        credentials=_google_credentials(), client_options=client_options
-    )
+    target_language = (plugin_settings.SCRIBE_TRANSCRIBE_LANGUAGE or "").strip()
 
     _, audio_data = audio_file_object.files_manager.file_contents(audio_file_object)
-    recognizer = (
-        f"projects/{plugin_settings.SCRIBE_GOOGLE_PROJECT_ID}"
-        f"/locations/{location}/recognizers/_"
-    )
-    target_language = plugin_settings.SCRIBE_GOOGLE_LANGUAGE_CODE or "en-US"
-    config = cloud_speech.RecognitionConfig(
-        auto_decoding_config=cloud_speech.AutoDetectDecodingConfig(),
-        language_codes=["auto"],
-        model=model_name or "long",
-        translation_config=cloud_speech.TranslationConfig(
-            target_language=target_language,
-        ),
-    )
-    response = client.recognize(
-        request=cloud_speech.RecognizeRequest(
-            recognizer=recognizer,
-            config=config,
-            content=audio_data,
+    fmt = audio_file_object.internal_name.split(".")[-1]
+
+    client = ai_client("google")
+    if target_language:
+        prompt = (
+            "You are an audio transcription engine. Transcribe the provided "
+            f"audio and translate the transcript into the language with BCP-47 "
+            f"code '{target_language}'.\n"
+            "Strict output rules:\n"
+            f"- Output ONLY the final transcript in '{target_language}'.\n"
+            "- Do NOT include the original-language transcription.\n"
+            "- Do NOT include both languages or any side-by-side text.\n"
+            "- Do NOT add explanations, labels, preambles, quotes, or markdown.\n"
+            "- If the audio is empty or unintelligible, output an empty string."
         )
+    else:
+        prompt = (
+            "You are an audio transcription engine. Transcribe the provided "
+            "audio in the original spoken language. Do not translate.\n"
+            "Strict output rules:\n"
+            "- Output ONLY the transcript text.\n"
+            "- Do NOT add explanations, labels, preambles, quotes, or markdown.\n"
+            "- If the audio is empty or unintelligible, output an empty string."
+        )
+    response = client.models.generate_content(
+        model=model_name,
+        contents=[
+            types.Content(
+                role="user",
+                parts=[
+                    types.Part.from_text(text=prompt),
+                    types.Part.from_bytes(
+                        data=audio_data,
+                        mime_type=f"audio/{fmt}",
+                    ),
+                ],
+            )
+        ],
+        config=types.GenerateContentConfig(
+            temperature=0,
+            thinking_config=(
+                types.ThinkingConfig(thinking_budget=0)
+                if "2.5" in model_name and "pro" not in model_name
+                else None
+            ),
+        ),
     )
-    return " ".join(
-        result.alternatives[0].translation
-        for result in response.results
-        if result.alternatives
-    )
+    return (response.text or "").strip()
 
 
 def transcribe_audio_file(audio_file_object, provider, audio_model):
     """Transcribe a single audio file using the configured provider."""
     if provider == "google":
-        return google_stt_translate(audio_file_object, audio_model)
+        return _google_llm_transcribe(audio_file_object, audio_model)
 
     client = ai_client(provider)
     _, audio_file_data = audio_file_object.files_manager.file_contents(

From e4e6f008822a42a2566c5ee827e52bc8712e6033 Mon Sep 17 00:00:00 2001
From: Shivank Kacker <kacker.shivank@gmail.com>
Date: Fri, 15 May 2026 15:00:23 +0530
Subject: [PATCH 5/6] remove cloud speech

---
 requirements_dev.txt | 1 -
 setup.py             | 1 -
 2 files changed, 2 deletions(-)

diff --git a/requirements_dev.txt b/requirements_dev.txt
index 8517411..6e09cc8 100644
--- a/requirements_dev.txt
+++ b/requirements_dev.txt
@@ -19,4 +19,3 @@ django-filter
 jsonschema
 openai==2.2.0
 google-genai==1.41.0
-google-cloud-speech==2.39.0
diff --git a/setup.py b/setup.py
index b43c577..4466f13 100644
--- a/setup.py
+++ b/setup.py
@@ -20,7 +20,6 @@
     "jsonschema",
     "openai==2.2.0",
     "google-genai==1.41.0",
-    "google-cloud-speech==2.27.0",
 ]
 
 test_requirements = []

From 2dd5bc54f7f6b7aee8411a8c2c5bfd32ba5c9da6 Mon Sep 17 00:00:00 2001
From: Shivank Kacker <kacker.shivank@gmail.com>
Date: Mon, 18 May 2026 12:14:44 +0530
Subject: [PATCH 6/6] prompt update

---
 care_scribe/tasks/scribe.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/care_scribe/tasks/scribe.py b/care_scribe/tasks/scribe.py
index b03fa15..a0c6202 100644
--- a/care_scribe/tasks/scribe.py
+++ b/care_scribe/tasks/scribe.py
@@ -54,7 +54,7 @@ def _google_llm_transcribe(audio_file_object, model_name):
             "- Do NOT include the original-language transcription.\n"
             "- Do NOT include both languages or any side-by-side text.\n"
             "- Do NOT add explanations, labels, preambles, quotes, or markdown.\n"
-            "- If the audio is empty or unintelligible, output an empty string."
+            "- If the audio is empty or unintelligible, or contains no speech, output an empty string."
         )
     else:
         prompt = (
@@ -63,7 +63,7 @@ def _google_llm_transcribe(audio_file_object, model_name):
             "Strict output rules:\n"
             "- Output ONLY the transcript text.\n"
             "- Do NOT add explanations, labels, preambles, quotes, or markdown.\n"
-            "- If the audio is empty or unintelligible, output an empty string."
+            "- If the audio is empty or unintelligible, or contains no speech, output an empty string."
         )
     response = client.models.generate_content(
         model=model_name,