@@ -359,47 +359,82 @@ struct OpenAIResponsesService: LLMService {
/// Largest audio payload, in bytes, that is sent as a single direct upload (23 MB safety margin).
private let transcriptionMaxUploadBytes: Int = 23 * 1024 * 1024
/// Upper bound on the duration of one exported chunk: 10 minutes, in seconds.
private let transcriptionMaxChunkDurationSeconds: Double = 10 * 60
/// Lower bound (floor) on the duration of one exported chunk: 30 seconds.
private let transcriptionMinChunkDurationSeconds: Double = 30
/// Lowercased file extensions that are converted to M4A before transcription.
private let transcodingExtensions: Set<String> = ["ogg", "oga", "opus"]
362363
/// Basic audio transcription via OpenAI Audio API, with on-disk caching.
///
/// The cache key is the SHA-256 digest of the *original* file bytes, so the lookup
/// is unaffected by any transcoding performed afterwards. Files whose extension is
/// accepted by `requiresTranscoding(ext:)` are first converted with
/// `transcodeAudioToM4A(sourceURL:)`; if any step of that conversion fails, the
/// original file (URL and bytes together) is transcribed instead.
///
/// - Parameter url: Location of the audio file to transcribe.
/// - Returns: The cached text on a cache hit; otherwise the transcript returned by
///   `transcribeAudioData(url:data:)` (untrimmed). Only a non-empty, trimmed
///   transcript is written to the cache.
/// - Throws: Errors from reading `url`, from `transcriptsDirectory()`, or from the
///   transcription itself (logged, then rethrown unchanged).
private func transcribeAudio(at url: URL) throws -> String {
    let originalData = try Data(contentsOf: url)
    let ext = url.pathExtension.lowercased()
    let name = url.lastPathComponent
    let startedAt = Date()
    print("[AudioSTT] Begin \(name) bytes=\(originalData.count) ext=\(ext.isEmpty ? "-" : ext)")

    // Compute deterministic cache key from file contents
    let digest = SHA256.hash(data: originalData)
    let hash = digest.map { String(format: "%02x", $0) }.joined()
    let cacheDir = try transcriptsDirectory()
    let cachedFile = cacheDir.appendingPathComponent("tr-\(hash).txt")
    if FileManager.default.fileExists(atPath: cachedFile.path) {
        // An unreadable or blank cache file is ignored and the audio is transcribed again.
        if let s = try? String(contentsOf: cachedFile, encoding: .utf8),
           !s.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty {
            print("[AudioSTT] Cache hit \(name) hash=\(hash.prefix(8))")
            return s
        }
    }

    var workingURL = url
    var workingData = originalData
    var tempFiles: [URL] = []
    // Registered before any temporary file can exist so every exit path hands them to cleanup.
    defer { cleanupTemporaryFiles(at: tempFiles) }

    if requiresTranscoding(ext: ext) {
        do {
            print("[AudioSTT] Transcoding \(name) from .\(ext) to .m4a")
            let convertedURL = try transcodeAudioToM4A(sourceURL: url)
            tempFiles.append(convertedURL)
            // Read the converted bytes BEFORE switching over: if the read throws, both
            // workingURL and workingData must still describe the original file, otherwise
            // the upload would pair the original bytes with an .m4a filename.
            let convertedData = try Data(contentsOf: convertedURL)
            workingURL = convertedURL
            workingData = convertedData
            print("[AudioSTT] Transcode success \(name) bytes=\(workingData.count)")
        } catch {
            // Best effort: fall back to uploading the untouched original.
            print("[AudioSTT] Transcode failed \(name) error=\(error); using original data")
        }
    }

    do {
        let transcript = try transcribeAudioData(url: workingURL, data: workingData)
        let trimmed = transcript.trimmingCharacters(in: .whitespacesAndNewlines)
        if !trimmed.isEmpty {
            // Cache write is best effort; a failure here must not fail the transcription.
            try? trimmed.write(to: cachedFile, atomically: true, encoding: .utf8)
        }
        let elapsed = Date().timeIntervalSince(startedAt)
        let durationString = String(format: "%.2f", elapsed)
        print("[AudioSTT] Success \(name) duration=\(durationString)s chars=\(trimmed.count)")
        return transcript
    } catch {
        print("[AudioSTT] Failure \(name) error=\(error)")
        throw error
    }
}
383413
384414 private func transcribeAudioData( url: URL , data: Data ) throws -> String {
385415 guard let key = ConfigService . openAIAPIKey ( ) , !key. isEmpty else { throw ServiceError . missingAPIKey }
386416 // Attempt single-shot transcription first when under the upload limit
387417 if data. count <= transcriptionMaxUploadBytes {
418+ print ( " [AudioSTT] Direct upload \( url. lastPathComponent) bytes= \( data. count) " )
388419 return try performTranscriptionRequest ( data: data, filename: url. lastPathComponent, apiKey: key, language: transcriptionLanguage)
389420 }
390421
391422 // Chunk oversized recordings and stitch transcripts
423+ print ( " [AudioSTT] Chunking \( url. lastPathComponent) bytes= \( data. count) " )
392424 let chunkURLs : [ URL ]
393425 do {
394426 chunkURLs = try exportAudioChunksIfPossible ( from: url)
395427 } catch {
428+ print ( " [AudioSTT] Chunk export failed \( error) ; falling back to full upload " )
396429 return try performTranscriptionRequest ( data: data, filename: url. lastPathComponent, apiKey: key, language: transcriptionLanguage)
397430 }
398431 if chunkURLs. isEmpty {
399432 // Fall back to single upload even though it may fail; better than giving up silently
433+ print ( " [AudioSTT] Chunk export returned empty; falling back to full upload " )
400434 return try performTranscriptionRequest ( data: data, filename: url. lastPathComponent, apiKey: key, language: transcriptionLanguage)
401435 }
402436
437+ print ( " [AudioSTT] Exported \( chunkURLs. count) chunks for \( url. lastPathComponent) " )
403438 var combined : [ String ] = [ ]
404439 defer { cleanupTemporaryFiles ( at: chunkURLs) }
405440 for (idx, chunkURL) in chunkURLs. enumerated ( ) {
@@ -412,6 +447,7 @@ struct OpenAIResponsesService: LLMService {
412447 let prefix = chunkURLs. count > 1 ? " Part \( idx + 1 ) / \( chunkURLs. count) : \n " : " "
413448 combined. append ( prefix + trimmed)
414449 }
450+ print ( " [AudioSTT] Chunk \( idx + 1 ) / \( chunkURLs. count) success bytes= \( chunkData. count) chars= \( trimmed. count) " )
415451 }
416452 return combined. joined ( separator: " \n \n " )
417453 }
@@ -442,22 +478,25 @@ struct OpenAIResponsesService: LLMService {
442478 appendFormField ( name: " model " , value: transcriptionModel)
443479 if let lang = language { appendFormField ( name: " language " , value: lang) }
444480 let mime = audioMimeType ( for: URL ( fileURLWithPath: filename) )
481+ print ( " [AudioSTT] Uploading \( filename) mime= \( mime) bytes= \( data. count) " )
445482 appendFileField ( name: " file " , filename: filename, mimeType: mime, data: data)
446483 body. append ( " -- \( boundary) -- \r \n " . data ( using: . utf8) !)
447484
448485 req. httpBody = body
449486
450- let ( data , resp) = try URLSession . shared. syncDataTask ( with: req)
487+ let ( respData , resp) = try URLSession . shared. syncDataTask ( with: req)
451488 guard let http = resp as? HTTPURLResponse , ( 200 ..< 300 ) . contains ( http. statusCode) else {
452- let txt = String ( data: data, encoding: . utf8) ?? " "
453- throw NSError ( domain: " AudioSTT " , code: ( resp as? HTTPURLResponse ) ? . statusCode ?? - 1 , userInfo: [ NSLocalizedDescriptionKey: " Transcription HTTP error: \( txt) " ] )
489+ let bodyText = String ( data: respData, encoding: . utf8) ?? " "
490+ let preview = bodyText. count > 2000 ? " \( bodyText. prefix ( 2000 ) ) … " : bodyText
491+ print ( " [AudioSTT] HTTP error \( filename) status= \( ( resp as? HTTPURLResponse ) ? . statusCode ?? - 1 ) body= \( preview) " )
492+ throw NSError ( domain: " AudioSTT " , code: ( resp as? HTTPURLResponse ) ? . statusCode ?? - 1 , userInfo: [ NSLocalizedDescriptionKey: " Transcription HTTP error: \( bodyText) " ] )
454493 }
455494 var output = " "
456- if let obj = try ? JSONSerialization . jsonObject ( with: data ) as? [ String : Any ] , let text = obj [ " text " ] as? String {
495+ if let obj = try ? JSONSerialization . jsonObject ( with: respData ) as? [ String : Any ] , let text = obj [ " text " ] as? String {
457496 output = text
458497 } else {
459498 // Some responses may include alternatives under "segments"; fallback to raw string
460- output = String ( data: data , encoding: . utf8) ?? " "
499+ output = String ( data: respData , encoding: . utf8) ?? " "
461500 }
462501 // Persist successful transcription to cache for reuse
463502 return output
@@ -548,6 +587,55 @@ struct OpenAIResponsesService: LLMService {
548587 }
549588 }
550589
/// Reports whether files with the given extension are converted before upload.
///
/// - Parameter ext: File extension without the leading dot. The lookup is an exact
///   match against `transcodingExtensions`.
/// - Returns: `true` when `ext` is a member of `transcodingExtensions`.
private func requiresTranscoding(ext: String) -> Bool {
    let needsConversion = transcodingExtensions.contains(ext)
    return needsConversion
}
593+
/// Exports the audio at `sourceURL` to a new temporary file and blocks the calling
/// thread until the export session has finished.
///
/// The Apple M4A preset is tried first, then the passthrough preset. The output
/// container is `.m4a` when the session supports it, otherwise the first file type
/// the session reports.
///
/// - Parameter sourceURL: Audio file to convert.
/// - Returns: URL of the exported file (from `temporaryTranscodeURL(originalURL:)`).
///   The caller owns the file and is responsible for deleting it.
/// - Throws: An `NSError` in domain `"AudioSTT"` (codes -7 … -12), or the export
///   session's own error. On every failure path any output left at the temporary
///   URL is removed.
private func transcodeAudioToM4A(sourceURL: URL) throws -> URL {
    let asset = AVAsset(url: sourceURL)
    guard asset.isExportable else {
        throw NSError(domain: "AudioSTT", code: -7, userInfo: [NSLocalizedDescriptionKey: "Audio asset not exportable"])
    }
    guard let exporter = AVAssetExportSession(asset: asset, presetName: AVAssetExportPresetAppleM4A)
            ?? AVAssetExportSession(asset: asset, presetName: AVAssetExportPresetPassthrough) else {
        throw NSError(domain: "AudioSTT", code: -8, userInfo: [NSLocalizedDescriptionKey: "Unable to create audio exporter"])
    }

    // The session needs an output file type before exporting; fail with a clear
    // error instead of starting an export that has none.
    let supportedTypes = exporter.supportedFileTypes
    if supportedTypes.contains(.m4a) {
        exporter.outputFileType = .m4a
    } else if let first = supportedTypes.first {
        exporter.outputFileType = first
    } else {
        throw NSError(domain: "AudioSTT", code: -12, userInfo: [NSLocalizedDescriptionKey: "No supported output file type for audio export"])
    }

    let tempURL = temporaryTranscodeURL(originalURL: sourceURL)
    exporter.outputURL = tempURL
    exporter.timeRange = CMTimeRange(start: .zero, duration: asset.duration)
    exporter.shouldOptimizeForNetworkUse = true

    // Bridge the asynchronous export to this synchronous API.
    let group = DispatchGroup()
    group.enter()
    exporter.exportAsynchronously { group.leave() }
    group.wait()

    // The caller only learns about tempURL on success, so anything left behind by a
    // failed or empty export has to be removed here.
    var exportSucceeded = false
    defer {
        if !exportSucceeded {
            try? FileManager.default.removeItem(at: tempURL)
        }
    }

    switch exporter.status {
    case .completed:
        let attrs = try? FileManager.default.attributesOfItem(atPath: tempURL.path)
        let size = (attrs?[.size] as? NSNumber)?.intValue ?? 0
        guard size > 0 else {
            throw NSError(domain: "AudioSTT", code: -9, userInfo: [NSLocalizedDescriptionKey: "Transcoded file empty"])
        }
        exportSucceeded = true
        return tempURL
    case .failed, .cancelled:
        throw exporter.error ?? NSError(domain: "AudioSTT", code: -10, userInfo: [NSLocalizedDescriptionKey: "Audio transcode failed"])
    default:
        // Any other status after the completion handler fired is unexpected.
        throw exporter.error ?? NSError(domain: "AudioSTT", code: -11, userInfo: [NSLocalizedDescriptionKey: "Audio transcode ended unexpectedly"])
    }
}
632+
/// Builds a fresh destination URL inside the temporary directory for a transcoded
/// copy of `originalURL`.
///
/// - Parameter originalURL: Source file; only its base name (extension removed) is used.
/// - Returns: `<temporary directory>/transcode-<base name>-<UUID>.m4a`. A new UUID is
///   generated on every call.
private func temporaryTranscodeURL(originalURL: URL) -> URL {
    let stem = originalURL.deletingPathExtension().lastPathComponent
    let fileName = "transcode-\(stem)-\(UUID().uuidString).m4a"
    return FileManager.default.temporaryDirectory.appendingPathComponent(fileName)
}
638+
551639 private func audioMimeType( for url: URL ) -> String {
552640 let ext = url. pathExtension. lowercased ( )
553641 switch ext {
@@ -556,6 +644,7 @@ struct OpenAIResponsesService: LLMService {
556644 case " wav " : return " audio/wav "
557645 case " aac " : return " audio/aac "
558646 case " flac " : return " audio/flac "
647+ case " ogg " , " oga " , " opus " : return " audio/ogg "
559648 default : return " application/octet-stream "
560649 }
561650 }
0 commit comments