Skip to content

Commit c8e6b60

Browse files
committed
Added audio support
1 parent cabcb5d commit c8e6b60

1 file changed

Lines changed: 101 additions & 12 deletions

File tree

singularity/LLMService.swift

Lines changed: 101 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -359,47 +359,82 @@ struct OpenAIResponsesService: LLMService {
359359
private let transcriptionMaxUploadBytes = 23 * 1024 * 1024 // 23 MB safety margin
360360
private let transcriptionMaxChunkDurationSeconds: Double = 600 // 10 minutes
361361
private let transcriptionMinChunkDurationSeconds: Double = 30 // 30 seconds floor
362+
// Lowercase file extensions that requiresTranscoding(ext:) checks against; transcribeAudio(at:)
// attempts an M4A conversion for files with one of these extensions before transcribing.
private let transcodingExtensions: Set<String> = ["ogg", "oga", "opus"]
362363

363364
// Basic audio transcription via OpenAI Audio API, with on-disk caching
364365
/// Transcribes the audio file at `url`, reusing an on-disk transcript cache keyed by the
/// SHA-256 digest of the original file contents.
///
/// Files whose extension is accepted by `requiresTranscoding(ext:)` are first converted with
/// `transcodeAudioToM4A(sourceURL:)`. If the conversion, or reading its output, fails, the
/// original file is transcribed unchanged.
///
/// - Parameter url: Location of the audio file to transcribe.
/// - Returns: The transcript produced by `transcribeAudioData` (returned untrimmed), or the
///   cached transcript when a non-empty cache entry exists.
/// - Throws: Errors from reading `url`, from `transcriptsDirectory()`, or from `transcribeAudioData`.
private func transcribeAudio(at url: URL) throws -> String {
    // Compute deterministic cache key from file contents
    let originalData = try Data(contentsOf: url)
    let ext = url.pathExtension.lowercased()
    let name = url.lastPathComponent
    let startedAt = Date()
    print("[AudioSTT] Begin \(name) bytes=\(originalData.count) ext=\(ext.isEmpty ? "-" : ext)")

    let digest = SHA256.hash(data: originalData)
    let hash = digest.map { String(format: "%02x", $0) }.joined()
    let cacheDir = try transcriptsDirectory()
    let cachedFile = cacheDir.appendingPathComponent("tr-\(hash).txt")
    if FileManager.default.fileExists(atPath: cachedFile.path) {
        if let s = try? String(contentsOf: cachedFile, encoding: .utf8),
           !s.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty {
            print("[AudioSTT] Cache hit \(name) hash=\(hash.prefix(8))")
            return s
        }
    }

    var workingURL = url
    var workingData = originalData
    var tempFiles: [URL] = []
    // Registered before the transcode attempt so every exit path below removes temp files.
    defer { cleanupTemporaryFiles(at: tempFiles) }

    if requiresTranscoding(ext: ext) {
        do {
            print("[AudioSTT] Transcoding \(name) from .\(ext) to .m4a")
            let convertedURL = try transcodeAudioToM4A(sourceURL: url)
            tempFiles.append(convertedURL)
            // Read the converted bytes BEFORE switching over: if this read throws we must keep
            // both the original URL and the original data, otherwise the original bytes would
            // be uploaded under the converted file's name and MIME type.
            let convertedData = try Data(contentsOf: convertedURL)
            workingURL = convertedURL
            workingData = convertedData
            print("[AudioSTT] Transcode success \(name) bytes=\(workingData.count)")
        } catch {
            // Best effort: fall back to the untouched original file.
            print("[AudioSTT] Transcode failed \(name) error=\(error); using original data")
        }
    }

    do {
        let transcript = try transcribeAudioData(url: workingURL, data: workingData)
        let trimmed = transcript.trimmingCharacters(in: .whitespacesAndNewlines)
        if !trimmed.isEmpty {
            // Cache write is best effort; a failed write only costs a re-transcription later.
            try? trimmed.write(to: cachedFile, atomically: true, encoding: .utf8)
        }
        let elapsed = Date().timeIntervalSince(startedAt)
        let durationString = String(format: "%.2f", elapsed)
        print("[AudioSTT] Success \(name) duration=\(durationString)s chars=\(trimmed.count)")
        return transcript
    } catch {
        print("[AudioSTT] Failure \(name) error=\(error)")
        throw error
    }
}
383413

384414
private func transcribeAudioData(url: URL, data: Data) throws -> String {
385415
guard let key = ConfigService.openAIAPIKey(), !key.isEmpty else { throw ServiceError.missingAPIKey }
386416
// Attempt single-shot transcription first when under the upload limit
387417
if data.count <= transcriptionMaxUploadBytes {
418+
print("[AudioSTT] Direct upload \(url.lastPathComponent) bytes=\(data.count)")
388419
return try performTranscriptionRequest(data: data, filename: url.lastPathComponent, apiKey: key, language: transcriptionLanguage)
389420
}
390421

391422
// Chunk oversized recordings and stitch transcripts
423+
print("[AudioSTT] Chunking \(url.lastPathComponent) bytes=\(data.count)")
392424
let chunkURLs: [URL]
393425
do {
394426
chunkURLs = try exportAudioChunksIfPossible(from: url)
395427
} catch {
428+
print("[AudioSTT] Chunk export failed \(error); falling back to full upload")
396429
return try performTranscriptionRequest(data: data, filename: url.lastPathComponent, apiKey: key, language: transcriptionLanguage)
397430
}
398431
if chunkURLs.isEmpty {
399432
// Fall back to single upload even though it may fail; better than giving up silently
433+
print("[AudioSTT] Chunk export returned empty; falling back to full upload")
400434
return try performTranscriptionRequest(data: data, filename: url.lastPathComponent, apiKey: key, language: transcriptionLanguage)
401435
}
402436

437+
print("[AudioSTT] Exported \(chunkURLs.count) chunks for \(url.lastPathComponent)")
403438
var combined: [String] = []
404439
defer { cleanupTemporaryFiles(at: chunkURLs) }
405440
for (idx, chunkURL) in chunkURLs.enumerated() {
@@ -412,6 +447,7 @@ struct OpenAIResponsesService: LLMService {
412447
let prefix = chunkURLs.count > 1 ? "Part \(idx + 1)/\(chunkURLs.count):\n" : ""
413448
combined.append(prefix + trimmed)
414449
}
450+
print("[AudioSTT] Chunk \(idx + 1)/\(chunkURLs.count) success bytes=\(chunkData.count) chars=\(trimmed.count)")
415451
}
416452
return combined.joined(separator: "\n\n")
417453
}
@@ -442,22 +478,25 @@ struct OpenAIResponsesService: LLMService {
442478
appendFormField(name: "model", value: transcriptionModel)
443479
if let lang = language { appendFormField(name: "language", value: lang) }
444480
let mime = audioMimeType(for: URL(fileURLWithPath: filename))
481+
print("[AudioSTT] Uploading \(filename) mime=\(mime) bytes=\(data.count)")
445482
appendFileField(name: "file", filename: filename, mimeType: mime, data: data)
446483
body.append("--\(boundary)--\r\n".data(using: .utf8)!)
447484

448485
req.httpBody = body
449486

450-
let (data, resp) = try URLSession.shared.syncDataTask(with: req)
487+
let (respData, resp) = try URLSession.shared.syncDataTask(with: req)
451488
guard let http = resp as? HTTPURLResponse, (200..<300).contains(http.statusCode) else {
452-
let txt = String(data: data, encoding: .utf8) ?? ""
453-
throw NSError(domain: "AudioSTT", code: (resp as? HTTPURLResponse)?.statusCode ?? -1, userInfo: [NSLocalizedDescriptionKey: "Transcription HTTP error: \(txt)"])
489+
let bodyText = String(data: respData, encoding: .utf8) ?? ""
490+
let preview = bodyText.count > 2000 ? "\(bodyText.prefix(2000))" : bodyText
491+
print("[AudioSTT] HTTP error \(filename) status=\((resp as? HTTPURLResponse)?.statusCode ?? -1) body=\(preview)")
492+
throw NSError(domain: "AudioSTT", code: (resp as? HTTPURLResponse)?.statusCode ?? -1, userInfo: [NSLocalizedDescriptionKey: "Transcription HTTP error: \(bodyText)"])
454493
}
455494
var output = ""
456-
if let obj = try? JSONSerialization.jsonObject(with: data) as? [String: Any], let text = obj["text"] as? String {
495+
if let obj = try? JSONSerialization.jsonObject(with: respData) as? [String: Any], let text = obj["text"] as? String {
457496
output = text
458497
} else {
459498
// Some responses may include alternatives under "segments"; fallback to raw string
460-
output = String(data: data, encoding: .utf8) ?? ""
499+
output = String(data: respData, encoding: .utf8) ?? ""
461500
}
462501
// Persist successful transcription to cache for reuse
463502
return output
@@ -548,6 +587,55 @@ struct OpenAIResponsesService: LLMService {
548587
}
549588
}
550589

590+
/// Reports whether a file with the given extension should be transcoded before transcription.
///
/// - Parameter ext: A path extension without the leading dot; matched case-insensitively
///   against `transcodingExtensions`.
/// - Returns: `true` when `ext` is one of the extensions listed in `transcodingExtensions`.
private func requiresTranscoding(ext: String) -> Bool {
    // Normalize here so callers need not pre-lowercase, matching how audioMimeType(for:) behaves.
    transcodingExtensions.contains(ext.lowercased())
}
593+
594+
/// Exports the audio at `sourceURL` to a new temporary file and returns that file's URL.
///
/// Tries the Apple M4A export preset first and falls back to the passthrough preset. The call
/// blocks the current thread until the export session finishes.
///
/// - Parameter sourceURL: The audio file to convert.
/// - Returns: URL of the exported file (path produced by `temporaryTranscodeURL(originalURL:)`).
///   The caller owns the file and is responsible for deleting it.
/// - Throws: An `NSError` in the "AudioSTT" domain (codes -7 through -12) or the export
///   session's own error. No output file is left behind when this function throws.
private func transcodeAudioToM4A(sourceURL: URL) throws -> URL {
    let asset = AVAsset(url: sourceURL)
    guard asset.isExportable else {
        throw NSError(domain: "AudioSTT", code: -7, userInfo: [NSLocalizedDescriptionKey: "Audio asset not exportable"])
    }
    guard let exporter = AVAssetExportSession(asset: asset, presetName: AVAssetExportPresetAppleM4A) ??
            AVAssetExportSession(asset: asset, presetName: AVAssetExportPresetPassthrough) else {
        throw NSError(domain: "AudioSTT", code: -8, userInfo: [NSLocalizedDescriptionKey: "Unable to create audio exporter"])
    }

    // Resolve the container type up front: starting an export with no output file type set
    // is a programmer error in AVFoundation, so surface it as a catchable Swift error instead.
    let outputType: AVFileType
    if exporter.supportedFileTypes.contains(.m4a) {
        outputType = .m4a
    } else if let first = exporter.supportedFileTypes.first {
        outputType = first
    } else {
        throw NSError(domain: "AudioSTT", code: -12, userInfo: [NSLocalizedDescriptionKey: "No supported output file type for audio export"])
    }

    let tempURL = temporaryTranscodeURL(originalURL: sourceURL)
    exporter.outputURL = tempURL
    exporter.outputFileType = outputType
    exporter.timeRange = CMTimeRange(start: .zero, duration: asset.duration)
    exporter.shouldOptimizeForNetworkUse = true

    // The caller only learns about tempURL on success, so remove any partial or empty output
    // on every other exit path to avoid leaking files in the temporary directory.
    var exportSucceeded = false
    defer {
        if !exportSucceeded {
            try? FileManager.default.removeItem(at: tempURL)
        }
    }

    // Bridge the asynchronous export to this synchronous API.
    let group = DispatchGroup()
    group.enter()
    exporter.exportAsynchronously { group.leave() }
    group.wait()

    switch exporter.status {
    case .completed:
        let attrs = try? FileManager.default.attributesOfItem(atPath: tempURL.path)
        let size = (attrs?[.size] as? NSNumber)?.intValue ?? 0
        if size <= 0 {
            throw NSError(domain: "AudioSTT", code: -9, userInfo: [NSLocalizedDescriptionKey: "Transcoded file empty"])
        }
        exportSucceeded = true
        return tempURL
    case .failed, .cancelled:
        throw exporter.error ?? NSError(domain: "AudioSTT", code: -10, userInfo: [NSLocalizedDescriptionKey: "Audio transcode failed"])
    default:
        throw exporter.error ?? NSError(domain: "AudioSTT", code: -11, userInfo: [NSLocalizedDescriptionKey: "Audio transcode ended unexpectedly"])
    }
}
632+
633+
/// Builds a unique destination URL in the temporary directory for a transcoded copy of `originalURL`.
///
/// - Parameter originalURL: The source file; only its base name (without extension) is used.
/// - Returns: `<temporary directory>/transcode-<base name>-<UUID>.m4a`.
private func temporaryTranscodeURL(originalURL: URL) -> URL {
    let stem = originalURL.deletingPathExtension().lastPathComponent
    let fileName = "transcode-\(stem)-\(UUID().uuidString).m4a"
    return FileManager.default.temporaryDirectory.appendingPathComponent(fileName)
}
638+
551639
private func audioMimeType(for url: URL) -> String {
552640
let ext = url.pathExtension.lowercased()
553641
switch ext {
@@ -556,6 +644,7 @@ struct OpenAIResponsesService: LLMService {
556644
case "wav": return "audio/wav"
557645
case "aac": return "audio/aac"
558646
case "flac": return "audio/flac"
647+
case "ogg", "oga", "opus": return "audio/ogg"
559648
default: return "application/octet-stream"
560649
}
561650
}

0 commit comments

Comments
 (0)