Added audio support

Xpitfire · Xpitfire · commit c8e6b60eca46 · 2025-11-20T16:06:34.000+01:00
diff --git a/singularity/LLMService.swift b/singularity/LLMService.swift
@@ -359,47 +359,82 @@ struct OpenAIResponsesService: LLMService {
     private let transcriptionMaxUploadBytes = 23 * 1024 * 1024 // 23 MB safety margin
     private let transcriptionMaxChunkDurationSeconds: Double = 600 // 10 minutes
     private let transcriptionMinChunkDurationSeconds: Double = 30  // 30 seconds floor
+    private let transcodingExtensions: Set<String> = ["ogg", "oga", "opus"] // Lowercased extensions that are transcoded to .m4a before upload
 
     // Basic audio transcription via OpenAI Audio API, with on-disk caching
     private func transcribeAudio(at url: URL) throws -> String {
         // Compute deterministic cache key from file contents
-        let audioData = try Data(contentsOf: url)
-        let digest = SHA256.hash(data: audioData)
+        let originalData = try Data(contentsOf: url)
+        let ext = url.pathExtension.lowercased()
+        let name = url.lastPathComponent
+        let startedAt = Date() // Start of the interval reported in the Success log line
+        print("[AudioSTT] Begin \(name) bytes=\(originalData.count) ext=\(ext.isEmpty ? "-" : ext)")
+        let digest = SHA256.hash(data: originalData) // Cache key is derived from the original bytes, not the transcoded ones
         let hash = digest.map { String(format: "%02x", $0) }.joined()
         let cacheDir = try transcriptsDirectory()
         let cachedFile = cacheDir.appendingPathComponent("tr-\(hash).txt")
         if FileManager.default.fileExists(atPath: cachedFile.path) {
             if let s = try? String(contentsOf: cachedFile, encoding: .utf8), !s.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty {
+                print("[AudioSTT] Cache hit \(name) hash=\(hash.prefix(8))")
                 return s
             }
         }
-        let transcript = try transcribeAudioData(url: url, data: audioData)
-        let trimmed = transcript.trimmingCharacters(in: .whitespacesAndNewlines)
-        if !trimmed.isEmpty {
-            try? trimmed.write(to: cachedFile, atomically: true, encoding: .utf8)
+        var workingURL = url
+        var workingData = originalData
+        var tempFiles: [URL] = []
+        if requiresTranscoding(ext: ext) {
+            do {
+                print("[AudioSTT] Transcoding \(name) from .\(ext) to .m4a")
+                let convertedURL = try transcodeAudioToM4A(sourceURL: url)
+                tempFiles.append(convertedURL)
+                // Read the converted bytes before switching the URL: if the read throws, URL and data both stay on the original.
+                workingData = try Data(contentsOf: convertedURL)
+                workingURL = convertedURL
+                print("[AudioSTT] Transcode success \(name) bytes=\(workingData.count)")
+            } catch {
+                print("[AudioSTT] Transcode failed \(name) error=\(error); using original data")
+            }
+        }
+        defer { cleanupTemporaryFiles(at: tempFiles) } // Runs on every exit below, including the rethrow
+        do {
+            let transcript = try transcribeAudioData(url: workingURL, data: workingData)
+            let trimmed = transcript.trimmingCharacters(in: .whitespacesAndNewlines)
+            if !trimmed.isEmpty {
+                try? trimmed.write(to: cachedFile, atomically: true, encoding: .utf8) // Best-effort cache write; a failure is ignored
+            }
+            let durationString = String(format: "%.2f", Date().timeIntervalSince(startedAt))
+            print("[AudioSTT] Success \(name) duration=\(durationString)s chars=\(trimmed.count)")
+            return transcript
+        } catch {
+            print("[AudioSTT] Failure \(name) error=\(error)")
+            throw error
         }
-        return transcript
     }
 
     private func transcribeAudioData(url: URL, data: Data) throws -> String {
         guard let key = ConfigService.openAIAPIKey(), !key.isEmpty else { throw ServiceError.missingAPIKey }
         // Attempt single-shot transcription first when under the upload limit
         if data.count <= transcriptionMaxUploadBytes {
+            print("[AudioSTT] Direct upload \(url.lastPathComponent) bytes=\(data.count)")
             return try performTranscriptionRequest(data: data, filename: url.lastPathComponent, apiKey: key, language: transcriptionLanguage)
         }
 
         // Chunk oversized recordings and stitch transcripts
+        print("[AudioSTT] Chunking \(url.lastPathComponent) bytes=\(data.count)")
         let chunkURLs: [URL]
         do {
             chunkURLs = try exportAudioChunksIfPossible(from: url)
         } catch {
+            print("[AudioSTT] Chunk export failed \(error); falling back to full upload")
             return try performTranscriptionRequest(data: data, filename: url.lastPathComponent, apiKey: key, language: transcriptionLanguage)
         }
         if chunkURLs.isEmpty {
             // Fall back to single upload even though it may fail; better than giving up silently
+            print("[AudioSTT] Chunk export returned empty; falling back to full upload")
             return try performTranscriptionRequest(data: data, filename: url.lastPathComponent, apiKey: key, language: transcriptionLanguage)
         }
 
+        print("[AudioSTT] Exported \(chunkURLs.count) chunks for \(url.lastPathComponent)")
         var combined: [String] = []
         defer { cleanupTemporaryFiles(at: chunkURLs) }
         for (idx, chunkURL) in chunkURLs.enumerated() {
@@ -412,6 +447,7 @@ struct OpenAIResponsesService: LLMService {
                 let prefix = chunkURLs.count > 1 ? "Part \(idx + 1)/\(chunkURLs.count):\n" : ""
                 combined.append(prefix + trimmed)
             }
+            print("[AudioSTT] Chunk \(idx + 1)/\(chunkURLs.count) success bytes=\(chunkData.count) chars=\(trimmed.count)")
         }
         return combined.joined(separator: "\n\n")
     }
@@ -442,22 +478,25 @@ struct OpenAIResponsesService: LLMService {
         appendFormField(name: "model", value: transcriptionModel)
         if let lang = language { appendFormField(name: "language", value: lang) }
         let mime = audioMimeType(for: URL(fileURLWithPath: filename))
+        print("[AudioSTT] Uploading \(filename) mime=\(mime) bytes=\(data.count)")
         appendFileField(name: "file", filename: filename, mimeType: mime, data: data)
         body.append("--\(boundary)--\r\n".data(using: .utf8)!)
 
         req.httpBody = body
 
-        let (data, resp) = try URLSession.shared.syncDataTask(with: req)
+        let (respData, resp) = try URLSession.shared.syncDataTask(with: req)
         guard let http = resp as? HTTPURLResponse, (200..<300).contains(http.statusCode) else {
-            let txt = String(data: data, encoding: .utf8) ?? ""
-            throw NSError(domain: "AudioSTT", code: (resp as? HTTPURLResponse)?.statusCode ?? -1, userInfo: [NSLocalizedDescriptionKey: "Transcription HTTP error: \(txt)"])
+            let bodyText = String(data: respData, encoding: .utf8) ?? ""
+            let preview = bodyText.count > 2000 ? "\(bodyText.prefix(2000))…" : bodyText
+            print("[AudioSTT] HTTP error \(filename) status=\((resp as? HTTPURLResponse)?.statusCode ?? -1) body=\(preview)")
+            throw NSError(domain: "AudioSTT", code: (resp as? HTTPURLResponse)?.statusCode ?? -1, userInfo: [NSLocalizedDescriptionKey: "Transcription HTTP error: \(bodyText)"])
         }
         var output = ""
-        if let obj = try? JSONSerialization.jsonObject(with: data) as? [String: Any], let text = obj["text"] as? String {
+        if let obj = try? JSONSerialization.jsonObject(with: respData) as? [String: Any], let text = obj["text"] as? String {
             output = text
         } else {
             // Some responses may include alternatives under "segments"; fallback to raw string
-            output = String(data: data, encoding: .utf8) ?? ""
+            output = String(data: respData, encoding: .utf8) ?? ""
         }
         // Persist successful transcription to cache for reuse
         return output
@@ -548,6 +587,55 @@ struct OpenAIResponsesService: LLMService {
         }
     }
 
+    private func requiresTranscoding(ext: String) -> Bool {
+        return transcodingExtensions.contains(ext) // true when `ext` is a member of the transcoding extension set
+    }
+
+    /// Exports `sourceURL` via `AVAssetExportSession` to a temp file and returns its URL; on any failure it throws and removes that temp file.
+    private func transcodeAudioToM4A(sourceURL: URL) throws -> URL {
+        let asset = AVAsset(url: sourceURL)
+        guard asset.isExportable else {
+            throw NSError(domain: "AudioSTT", code: -7, userInfo: [NSLocalizedDescriptionKey: "Audio asset not exportable"])
+        }
+        guard let exporter = AVAssetExportSession(asset: asset, presetName: AVAssetExportPresetAppleM4A) ??
+                AVAssetExportSession(asset: asset, presetName: AVAssetExportPresetPassthrough) else {
+            throw NSError(domain: "AudioSTT", code: -8, userInfo: [NSLocalizedDescriptionKey: "Unable to create audio exporter"])
+        }
+        let tempURL = temporaryTranscodeURL(originalURL: sourceURL)
+        exporter.outputURL = tempURL
+        guard let fileType = exporter.supportedFileTypes.contains(.m4a) ? AVFileType.m4a : exporter.supportedFileTypes.first else {
+            throw NSError(domain: "AudioSTT", code: -12, userInfo: [NSLocalizedDescriptionKey: "No supported output file type for audio export"])
+        }
+        exporter.outputFileType = fileType // .m4a when supported, otherwise the exporter's first supported type
+        exporter.timeRange = CMTimeRange(start: .zero, duration: asset.duration)
+        exporter.shouldOptimizeForNetworkUse = true
+
+        var exportSucceeded = false
+        defer { if !exportSucceeded { try? FileManager.default.removeItem(at: tempURL) } } // Drop partial or empty output on failure
+        let finished = DispatchSemaphore(value: 0)
+        exporter.exportAsynchronously { finished.signal() }
+        finished.wait() // Block the calling thread until the export's completion handler fires
+        switch exporter.status {
+        case .completed:
+            let size = ((try? FileManager.default.attributesOfItem(atPath: tempURL.path))?[.size] as? NSNumber)?.intValue ?? 0
+            guard size > 0 else {
+                throw NSError(domain: "AudioSTT", code: -9, userInfo: [NSLocalizedDescriptionKey: "Transcoded file empty"])
+            }
+            exportSucceeded = true
+            return tempURL
+        case .failed, .cancelled:
+            throw exporter.error ?? NSError(domain: "AudioSTT", code: -10, userInfo: [NSLocalizedDescriptionKey: "Audio transcode failed"])
+        default:
+            throw exporter.error ?? NSError(domain: "AudioSTT", code: -11, userInfo: [NSLocalizedDescriptionKey: "Audio transcode ended unexpectedly"])
+        }
+    }
+
+    /// Returns a fresh `.m4a` URL in the temporary directory, named from `originalURL`'s base name plus a new UUID.
+    private func temporaryTranscodeURL(originalURL: URL) -> URL {
+        let stem = originalURL.deletingPathExtension().lastPathComponent
+        return FileManager.default.temporaryDirectory.appendingPathComponent("transcode-\(stem)-\(UUID().uuidString).m4a")
+    }
+
     private func audioMimeType(for url: URL) -> String {
         let ext = url.pathExtension.lowercased()
         switch ext {
@@ -556,6 +644,7 @@ struct OpenAIResponsesService: LLMService {
         case "wav": return "audio/wav"
         case "aac": return "audio/aac"
         case "flac": return "audio/flac"
+        case "ogg", "oga", "opus": return "audio/ogg"
         default: return "application/octet-stream"
         }
     }