diff --git a/AudioType/App/AudioTypeApp.swift b/AudioType/App/AudioTypeApp.swift index fd20ece..3054019 100644 --- a/AudioType/App/AudioTypeApp.swift +++ b/AudioType/App/AudioTypeApp.swift @@ -12,6 +12,7 @@ struct AudioTypeApp: App { } } +@MainActor class AppDelegate: NSObject, NSApplicationDelegate { private var statusItem: NSStatusItem! private var menuBarController: MenuBarController! @@ -49,8 +50,8 @@ class AppDelegate: NSObject, NSApplicationDelegate { // Show onboarding if permissions are missing or no engine is usable if !micPermission || !accessibilityPermission || !EngineResolver.anyEngineAvailable { - DispatchQueue.main.async { - self.showOnboarding() + await MainActor.run { + showOnboarding() } } else { // All set — start listening for hotkey diff --git a/AudioType/App/MenuBarController.swift b/AudioType/App/MenuBarController.swift index e324448..57f99f4 100644 --- a/AudioType/App/MenuBarController.swift +++ b/AudioType/App/MenuBarController.swift @@ -17,18 +17,45 @@ extension NSImage { } } -/// Shared observable for live audio level — drives the recording waveform. +/// Shared observable for live audio level - drives the recording waveform. +/// Also carries the overlay text so the hosting view doesn't have to be +/// rebuilt on every state change. class AudioLevelMonitor: ObservableObject { static let shared = AudioLevelMonitor() @Published var level: Float = 0.0 + @Published var overlayText: String = "Recording..." } class MenuBarController: NSObject, NSWindowDelegate { private weak var statusItem: NSStatusItem? private var transcriptionManager: TranscriptionManager private var recordingWindow: NSWindow? + private var recordingHostingView: NSHostingView? private var settingsWindow: NSWindow? + // Pre-rendered status-bar icons, built once. Calling NSImage.tinted on + // every state change re-rasterizes the symbol via lockFocus/unlockFocus + // and was a steady source of bitmap allocations. + private lazy var idleIcon: NSImage? 
= { + let img = NSImage( + systemSymbolName: "waveform.circle.fill", accessibilityDescription: "Ready") + img?.isTemplate = true + return img + }() + private lazy var recordingIcon: NSImage? = { + NSImage(systemSymbolName: "waveform.circle.fill", accessibilityDescription: "Recording")? + .tinted(with: AudioTypeTheme.nsRecordingRed) + }() + private lazy var processingIcon: NSImage? = { + NSImage(systemSymbolName: "ellipsis.circle.fill", accessibilityDescription: "Processing")? + .tinted(with: AudioTypeTheme.nsAmber) + }() + private lazy var errorIcon: NSImage? = { + NSImage( + systemSymbolName: "exclamationmark.triangle.fill", accessibilityDescription: "Error")? + .tinted(with: .systemRed) + }() + init(transcriptionManager: TranscriptionManager) { self.transcriptionManager = transcriptionManager super.init() @@ -50,6 +77,10 @@ class MenuBarController: NSObject, NSWindowDelegate { ) } + deinit { + NotificationCenter.default.removeObserver(self) + } + func setupStatusItem(_ statusItem: NSStatusItem) { self.statusItem = statusItem @@ -106,38 +137,24 @@ class MenuBarController: NSObject, NSWindowDelegate { switch state { case .idle: - let img = NSImage( - systemSymbolName: "waveform.circle.fill", accessibilityDescription: "Ready") - img?.isTemplate = true - button.image = img + button.image = idleIcon AudioLevelMonitor.shared.level = 0 hideRecordingIndicator() updateStatusMenuItem("Ready") case .recording: - // Tinted coral/red — non-template so the color shows through - if let base = NSImage( - systemSymbolName: "waveform.circle.fill", accessibilityDescription: "Recording") { - button.image = base.tinted(with: AudioTypeTheme.nsRecordingRed) - } + button.image = recordingIcon showRecordingIndicator() updateStatusMenuItem("Recording...") case .processing: - // Tinted amber — "I'm thinking" - if let base = NSImage( - systemSymbolName: "ellipsis.circle.fill", accessibilityDescription: "Processing") { - button.image = base.tinted(with: AudioTypeTheme.nsAmber) - } + 
button.image = processingIcon AudioLevelMonitor.shared.level = 0 updateRecordingIndicator(text: "Processing...") updateStatusMenuItem("Processing...") case .error(let message): - let img = NSImage( - systemSymbolName: "exclamationmark.triangle.fill", accessibilityDescription: "Error") - img?.isTemplate = false - button.image = img?.tinted(with: .systemRed) + button.image = errorIcon hideRecordingIndicator() updateStatusMenuItem("Error: \(message)") } @@ -175,22 +192,27 @@ class MenuBarController: NSObject, NSWindowDelegate { recordingWindow = window } - let hostingView = NSHostingView( - rootView: RecordingOverlay(text: "Recording...") - .environmentObject(AudioLevelMonitor.shared)) - hostingView.frame = NSRect(x: 0, y: 0, width: 180, height: 50) - recordingWindow?.contentView = hostingView + // Build the hosting view once; subsequent updates just mutate the + // observable state. Re-creating NSHostingView on every state change + // was leaking the SwiftUI graph and Metal layers. + if recordingHostingView == nil { + let hosting = NSHostingView( + rootView: AnyView( + RecordingOverlay() + .environmentObject(AudioLevelMonitor.shared) + ) + ) + hosting.frame = NSRect(x: 0, y: 0, width: 180, height: 50) + recordingHostingView = hosting + recordingWindow?.contentView = hosting + } + + AudioLevelMonitor.shared.overlayText = "Recording..." 
recordingWindow?.orderFront(nil) } private func updateRecordingIndicator(text: String) { - if let window = recordingWindow { - let hostingView = NSHostingView( - rootView: RecordingOverlay(text: text) - .environmentObject(AudioLevelMonitor.shared)) - hostingView.frame = NSRect(x: 0, y: 0, width: 180, height: 50) - window.contentView = hostingView - } + AudioLevelMonitor.shared.overlayText = text } private func hideRecordingIndicator() { diff --git a/AudioType/App/TranscriptionManager.swift b/AudioType/App/TranscriptionManager.swift index 5948c4c..17dbcdb 100644 --- a/AudioType/App/TranscriptionManager.swift +++ b/AudioType/App/TranscriptionManager.swift @@ -32,6 +32,11 @@ class TranscriptionManager: ObservableObject { private var hotKeyManager: HotKeyManager? private var textInserter: TextInserter? + /// Active transcription task. Held so a new recording can cancel any + /// in-flight transcription from a previous one (e.g. user re-triggers + /// the hotkey while the network call is still pending). + private var transcriptionTask: Task? + private let logger = Logger(subsystem: "com.audiotype", category: "TranscriptionManager") private init() {} @@ -60,7 +65,7 @@ class TranscriptionManager: ObservableObject { if !EngineResolver.anyEngineAvailable { logger.warning("No transcription engine available") - setState(.error("No engine available — add a cloud API key or enable Apple Speech")) + setState(.error("No engine available - add a cloud API key or enable Apple Speech")) } else { logger.info("Transcription engine ready: \(engine.displayName)") } @@ -85,7 +90,7 @@ class TranscriptionManager: ObservableObject { audioRecorder = nil } - /// Called when the user saves an API key or changes engine preference — re-evaluate. + /// Called when the user saves an API key or changes engine preference - re-evaluate. 
func onEngineConfigChanged() { let engine = EngineResolver.resolve() activeEngineName = engine.displayName @@ -93,7 +98,7 @@ class TranscriptionManager: ObservableObject { setState(.idle) logger.info("Engine config changed, active engine: \(engine.displayName)") } else { - setState(.error("No engine available — add a cloud API key or enable Apple Speech")) + setState(.error("No engine available - add a cloud API key or enable Apple Speech")) } } @@ -111,6 +116,12 @@ class TranscriptionManager: ObservableObject { } } + /// Engine resolved at recording start and reused for the matching + /// transcription. Keeps Keychain / availability checks out of the + /// post-stop hot path and ensures the engine identity doesn't change + /// mid-recording if the user edits settings. + private var activeEngine: TranscriptionEngine? + private func startRecording() { guard state == .idle else { logger.warning("Cannot start recording: not in idle state") @@ -118,14 +129,24 @@ class TranscriptionManager: ObservableObject { } guard EngineResolver.anyEngineAvailable else { - setState(.error("No engine available — add a cloud API key or enable Apple Speech")) + setState(.error("No engine available - add a cloud API key or enable Apple Speech")) return } + // Cancel any still-pending transcription from a previous recording so + // we don't insert stale text into the user's new context. + transcriptionTask?.cancel() + transcriptionTask = nil + + // Resolve the engine once, up front. transcribeAndInsert will reuse it. 
+ let engine = EngineResolver.resolve() + activeEngine = engine + activeEngineName = engine.displayName + do { try audioRecorder?.startRecording() setState(.recording) - logger.info("Recording started") + logger.info("Recording started with engine: \(engine.displayName)") } catch { logger.error("Failed to start recording: \(error.localizedDescription)") setState(.error("Failed to start recording")) @@ -144,21 +165,22 @@ class TranscriptionManager: ObservableObject { return } + // Take the engine resolved at startRecording. Falls back to a fresh + // resolution defensively if somehow nil. + let engine = activeEngine ?? EngineResolver.resolve() + activeEngine = nil + logger.info("Recording stopped, captured \(samples.count) samples") setState(.processing) - // Transcribe in background - Task.detached { [weak self] in - await self?.transcribeAndInsert(samples: samples) + // Transcribe in background. Hold the task so the next recording can + // cancel it if it's still pending. + transcriptionTask = Task.detached { [weak self] in + await self?.transcribeAndInsert(samples: samples, engine: engine) } } - private func transcribeAndInsert(samples: [Float]) async { - let engine = EngineResolver.resolve() - - await MainActor.run { - self.activeEngineName = engine.displayName - } + private func transcribeAndInsert(samples: [Float], engine: TranscriptionEngine) async { let startTime = CFAbsoluteTimeGetCurrent() diff --git a/AudioType/Core/AudioRecorder.swift b/AudioType/Core/AudioRecorder.swift index 452148c..42ab4e5 100644 --- a/AudioType/Core/AudioRecorder.swift +++ b/AudioType/Core/AudioRecorder.swift @@ -1,8 +1,12 @@ import AVFoundation +import Accelerate import os.log class AudioRecorder { - private let audioEngine = AVAudioEngine() + // Lazily created on startRecording and torn down on stopRecording so the + // audio HAL doesn't stay warm between recordings (big idle-energy win for + // a menu-bar app). + private var audioEngine: AVAudioEngine? 
private var audioBuffer: [Float] = [] private let bufferLock = NSLock() private var isRecording = false @@ -16,8 +20,8 @@ class AudioRecorder { private let targetSampleRate: Double = 16000 init() { - // Pre-allocate buffer for ~30 seconds of audio at 16kHz - audioBuffer.reserveCapacity(Int(targetSampleRate * 30)) + // Buffer is allocated on each startRecording so the recorder has zero + // footprint when idle. } func startRecording() throws { @@ -26,12 +30,19 @@ class AudioRecorder { return } - // Clear previous buffer - bufferLock.lock() - audioBuffer.removeAll(keepingCapacity: true) - bufferLock.unlock() + // Drop the buffer entirely (don't preserve capacity — see issue 1.4). + do { + bufferLock.lock() + defer { bufferLock.unlock() } + audioBuffer = [] + audioBuffer.reserveCapacity(Int(targetSampleRate * 30)) + } - let inputNode = audioEngine.inputNode + // Lazily create the audio engine on each recording. + let engine = AVAudioEngine() + audioEngine = engine + + let inputNode = engine.inputNode let inputFormat = inputNode.outputFormat(forBus: 0) logger.info("Input format: \(inputFormat.sampleRate)Hz, \(inputFormat.channelCount) channels") @@ -66,8 +77,8 @@ class AudioRecorder { } // Start audio engine - audioEngine.prepare() - try audioEngine.start() + engine.prepare() + try engine.start() isRecording = true logger.info("Recording started") @@ -79,16 +90,25 @@ class AudioRecorder { return nil } - // Stop and remove tap - audioEngine.inputNode.removeTap(onBus: 0) - audioEngine.stop() + // Stop and tear down the engine so the audio HAL releases its resources. 
+ if let engine = audioEngine { + engine.inputNode.removeTap(onBus: 0) + engine.stop() + } + audioEngine = nil isRecording = false - // Return captured samples - bufferLock.lock() - let samples = audioBuffer - bufferLock.unlock() + // Move the buffer out of the recorder (zero-copy via COW transfer) and + // leave the recorder with a fresh empty array so it doesn't keep the + // recording's high-water capacity in memory. + let samples: [Float] + do { + bufferLock.lock() + defer { bufferLock.unlock() } + samples = audioBuffer + audioBuffer = [] + } logger.info( "Recording stopped, captured \(samples.count) samples (\(Double(samples.count) / self.targetSampleRate, format: .fixed(precision: 2))s)" @@ -100,10 +120,7 @@ class AudioRecorder { private func processAudioBuffer( _ buffer: AVAudioPCMBuffer, converter: AVAudioConverter?, targetFormat: AVAudioFormat ) { - var samplesArray: [Float] - if let converter = converter { - // Need to convert to target format let frameCount = AVAudioFrameCount( Double(buffer.frameLength) * targetSampleRate / buffer.format.sampleRate ) @@ -129,25 +146,37 @@ class AudioRecorder { } guard let channelData = convertedBuffer.floatChannelData else { return } - samplesArray = Array( - UnsafeBufferPointer(start: channelData[0], count: Int(convertedBuffer.frameLength))) + let count = Int(convertedBuffer.frameLength) + consume(samples: channelData[0], count: count) } else { - // Already in correct format guard let channelData = buffer.floatChannelData else { return } - samplesArray = Array( - UnsafeBufferPointer(start: channelData[0], count: Int(buffer.frameLength))) + let count = Int(buffer.frameLength) + consume(samples: channelData[0], count: count) } + } - // Compute RMS level for live waveform - let rms = sqrt(samplesArray.reduce(0) { $0 + $1 * $1 } / Float(max(samplesArray.count, 1))) + /// Consume a chunk of mic samples: compute RMS for the waveform and append + /// to the recording buffer — without ever materialising an intermediate + /// 
`[Float]`. Called on the audio thread. + private func consume(samples: UnsafePointer, count: Int) { + guard count > 0 else { return } + + // RMS via Accelerate (vectorised). Replaces a scalar reduce loop that + // ran on every tap callback. + var meanSquare: Float = 0 + vDSP_measqv(samples, 1, &meanSquare, vDSP_Length(count)) + let rms = sqrt(meanSquare) // Normalize: typical speech RMS is 0.01–0.15, scale aggressively to 0–1 let level = min(rms * 25, 1.0) onLevelUpdate?(level) - // Append to buffer + // Append directly from the unsafe buffer pointer; [Float] has an + // append(contentsOf:) overload that takes any Sequence, including + // UnsafeBufferPointer, so no intermediate Array is allocated. + let ptr = UnsafeBufferPointer(start: samples, count: count) bufferLock.lock() - audioBuffer.append(contentsOf: samplesArray) - bufferLock.unlock() + defer { bufferLock.unlock() } + audioBuffer.append(contentsOf: ptr) } } diff --git a/AudioType/Core/HotKeyManager.swift b/AudioType/Core/HotKeyManager.swift index 075b053..6c4c146 100644 --- a/AudioType/Core/HotKeyManager.swift +++ b/AudioType/Core/HotKeyManager.swift @@ -14,6 +14,12 @@ class HotKeyManager { private let callback: (HotKeyEvent) -> Void private var isRecording = false + // Retained pointer to self that the event-tap callback uses as refcon. + // Holding self retained for the lifetime of the tap means the tap + // callback is always safe to call back into self, even if the owner + // releases its reference. We balance the retain in stopListening. + private var refconRetained: Unmanaged? + // Track fn key state private var fnKeyWasPressed = false @@ -29,6 +35,10 @@ class HotKeyManager { // Use CGEventTap for fn key detection let eventMask: CGEventMask = (1 << CGEventType.flagsChanged.rawValue) + // Retain self for the duration of the tap. Released in stopListening. 
+ let retained = Unmanaged.passRetained(self) + refconRetained = retained + guard let tap = CGEvent.tapCreate( tap: .cgSessionEventTap, @@ -36,13 +46,17 @@ class HotKeyManager { options: .defaultTap, eventsOfInterest: eventMask, callback: { proxy, type, event, refcon in - guard let refcon = refcon else { return Unmanaged.passRetained(event) } + // The event is owned by the system; pass it back unretained. + guard let refcon = refcon else { return Unmanaged.passUnretained(event) } let manager = Unmanaged.fromOpaque(refcon).takeUnretainedValue() return manager.handleEvent(proxy: proxy, type: type, event: event) }, - userInfo: Unmanaged.passUnretained(self).toOpaque() + userInfo: retained.toOpaque() ) else { + // Tap creation failed — release the retain we just took. + retained.release() + refconRetained = nil logger.error("Failed to create event tap. Accessibility permission may be required.") return } @@ -61,6 +75,9 @@ class HotKeyManager { func stopListening() { if let tap = eventTap { CGEvent.tapEnable(tap: tap, enable: false) + // Invalidating the mach port stops further callbacks before we drop + // the run loop source. + CFMachPortInvalidate(tap) } if let source = runLoopSource { @@ -72,6 +89,12 @@ class HotKeyManager { isRecording = false fnKeyWasPressed = false + // Balance the retain taken in startListening. Done last so any + // callback already in-flight against the now-disabled tap still sees + // a live self via its own takeUnretainedValue. 
+ refconRetained?.release() + refconRetained = nil + logger.info("Hotkey listener stopped") } @@ -85,7 +108,7 @@ class HotKeyManager { if let tap = eventTap { CGEvent.tapEnable(tap: tap, enable: true) } - return Unmanaged.passRetained(event) + return Unmanaged.passUnretained(event) } let flags = event.flags @@ -120,7 +143,7 @@ class HotKeyManager { } } - return Unmanaged.passRetained(event) + return Unmanaged.passUnretained(event) } deinit { diff --git a/AudioType/Core/TextInserter.swift b/AudioType/Core/TextInserter.swift index 34186c6..84e9061 100644 --- a/AudioType/Core/TextInserter.swift +++ b/AudioType/Core/TextInserter.swift @@ -6,24 +6,41 @@ import os.log class TextInserter { private let logger = Logger(subsystem: "com.audiotype", category: "TextInserter") + /// Above this length we paste via clipboard instead of synthesising one + /// keystroke per character. Per-char synthesis costs ~1 ms each plus a + /// fresh CGEventSource per char — for long dictations that's the dominant + /// post-recording latency the user feels. + private static let clipboardPasteThreshold = 30 + func insertText(_ text: String) { guard !text.isEmpty else { return } logger.info("Inserting text: \(text.prefix(50))...") - // Use CGEvent to simulate keyboard input - for char in text { - insertCharacter(char) - // Small delay between characters for reliability - usleep(1000) // 1ms + if text.count > Self.clipboardPasteThreshold { + insertTextViaClipboard(text) + } else { + insertTextViaKeystrokes(text) } logger.info("Text insertion complete") } - private func insertCharacter(_ char: Character) { + /// Per-character keystroke synthesis. Used for short strings where + /// clipboard paste's clipboard-restore quirks aren't worth it. + private func insertTextViaKeystrokes(_ text: String) { + // Cache the event source once for the whole insertion — creating one + // per character was a measurable hot path. 
let source = CGEventSource(stateID: .hidSystemState) + for char in text { + insertCharacter(char, source: source) + // Tiny delay so target apps don't drop events under load. + usleep(1000) // 1ms + } + } + + private func insertCharacter(_ char: Character, source: CGEventSource?) { // Create key down event guard let keyDown = CGEvent(keyboardEventSource: source, virtualKey: 0, keyDown: true) else { logger.error("Failed to create keyDown event") diff --git a/AudioType/Core/TextPostProcessor.swift b/AudioType/Core/TextPostProcessor.swift index 706fd25..6aec6c4 100644 --- a/AudioType/Core/TextPostProcessor.swift +++ b/AudioType/Core/TextPostProcessor.swift @@ -106,41 +106,37 @@ class TextPostProcessor { // User-defined custom replacements private var customReplacements: [String: String] = [:] + // Cached compiled regex + lookup table. Rebuilt only when the catalog + // changes (custom replacements added/removed). The previous code rebuilt + // a merged dictionary and ran ~85 case-insensitive String scans on every + // single transcription. + private var cachedRegex: NSRegularExpression? 
+ private var cachedLookup: [String: String] = [:] + private let regexLock = NSLock() + private init() { loadCustomReplacements() + rebuildRegex() } /// Process transcribed text with corrections func process(_ text: String) -> String { - var result = text - - // Apply word replacements (case-insensitive) - let allReplacements = wordReplacements.merging(customReplacements) { _, custom in custom } - - for (pattern, replacement) in allReplacements { - result = result.replacingOccurrences( - of: pattern, - with: replacement, - options: .caseInsensitive - ) - } - - // Capitalize first letter of sentences - result = capitalizeSentences(result) - - return result + let result = applyReplacements(text) + return capitalizeSentences(result) } /// Add a custom word replacement func addCustomReplacement(from: String, to: String) { customReplacements[from.lowercased()] = to saveCustomReplacements() + rebuildRegex() } /// Remove a custom replacement func removeCustomReplacement(from: String) { customReplacements.removeValue(forKey: from.lowercased()) saveCustomReplacements() + rebuildRegex() } /// Get all custom replacements @@ -150,6 +146,73 @@ class TextPostProcessor { // MARK: - Private + /// Rebuild the compiled regex from the current built-in + custom catalogs. + /// Custom replacements override built-ins on key collision. + private func rebuildRegex() { + regexLock.lock() + defer { regexLock.unlock() } + + let merged = wordReplacements.merging(customReplacements) { _, custom in custom } + cachedLookup = [:] + cachedLookup.reserveCapacity(merged.count) + for (key, value) in merged { + cachedLookup[key.lowercased()] = value + } + + // Sort keys longest-first so e.g. "rest api" wins over "api". This also + // gives us a deterministic order independent of dictionary hashing, + // which the old implementation lacked. 
+ let keys = merged.keys.sorted { $0.count > $1.count } + let pattern = keys.map { NSRegularExpression.escapedPattern(for: $0) } + .joined(separator: "|") + + cachedRegex = try? NSRegularExpression( + pattern: pattern, + options: [.caseInsensitive] + ) + } + + /// Apply replacements in a single regex pass. + private func applyReplacements(_ text: String) -> String { + regexLock.lock() + let regex = cachedRegex + let lookup = cachedLookup + regexLock.unlock() + + guard let regex = regex, !text.isEmpty else { return text } + + let nsText = text as NSString + let range = NSRange(location: 0, length: nsText.length) + let matches = regex.matches(in: text, options: [], range: range) + if matches.isEmpty { return text } + + // Reassemble in one pass, alternating original spans and replacements. + var result = "" + result.reserveCapacity(text.count) + var cursor = 0 + for match in matches { + let r = match.range + if r.location > cursor { + result.append( + nsText.substring(with: NSRange(location: cursor, length: r.location - cursor)) + ) + } + let matched = nsText.substring(with: r).lowercased() + if let replacement = lookup[matched] { + result.append(replacement) + } else { + result.append(nsText.substring(with: r)) + } + cursor = r.location + r.length + } + if cursor < nsText.length { + result.append( + nsText.substring(with: NSRange(location: cursor, length: nsText.length - cursor)) + ) + } + return result + } + private func capitalizeSentences(_ text: String) -> String { var result = "" var capitalizeNext = true diff --git a/AudioType/Core/WAVEncoder.swift b/AudioType/Core/WAVEncoder.swift index ea1210a..4596ae8 100644 --- a/AudioType/Core/WAVEncoder.swift +++ b/AudioType/Core/WAVEncoder.swift @@ -1,3 +1,4 @@ +import Accelerate import Foundation import os.log @@ -25,7 +26,7 @@ struct WhisperAPIConfig { /// and response parsing are all handled here. class WhisperAPIEngine: TranscriptionEngine { - /// Provider configuration — subclasses must override. 
+ /// Provider configuration - subclasses must override. var config: WhisperAPIConfig { fatalError("Subclasses must override config") } @@ -87,7 +88,7 @@ class WhisperAPIEngine: TranscriptionEngine { throw WhisperAPIError.invalidURL } - let request = WAVEncoder.buildRequest( + let (request, body) = WAVEncoder.buildRequest( url: url, apiKey: apiKey, wavData: wavData, @@ -97,7 +98,10 @@ class WhisperAPIEngine: TranscriptionEngine { let (data, response): (Data, URLResponse) do { - (data, response) = try await URLSession.shared.data(for: request) + // upload(for:from:) keeps a single copy of the body; setting + // request.httpBody and calling data(for:) tends to keep the body + // resident in two places. With ~2 MB WAV bodies this matters. + (data, response) = try await URLSession.shared.upload(for: request, from: body) } catch { throw WhisperAPIError.networkError(error.localizedDescription) } @@ -178,50 +182,97 @@ enum WhisperAPIError: Error, LocalizedError { enum WAVEncoder { /// Encode Float32 PCM samples into WAV in memory (16-bit PCM, mono). + /// + /// The previous implementation allocated an intermediate `[Int16]` + /// (~960 KB for a 30 s clip), let `Data` realloc as it grew, and + /// did 480 000 individual `appendLittleEndian` calls. This version: + /// + /// - Allocates the final `Data` once at exact size (44-byte header + 2N). + /// - Writes the header in place. + /// - Uses Accelerate to clip, scale and convert Float → Int16 into the data + /// region (three vDSP passes staged through a transient Float scratch buffer).
static func encode(samples: [Float], sampleRate: Int) -> Data { - var data = Data() - - let int16Samples = samples.map { sample -> Int16 in - let clamped = max(-1.0, min(1.0, sample)) - return Int16(clamped * Float(Int16.max)) - } - let numChannels: UInt16 = 1 let bitsPerSample: UInt16 = 16 let byteRate = UInt32(sampleRate) * UInt32(numChannels) * UInt32(bitsPerSample / 8) let blockAlign = numChannels * (bitsPerSample / 8) - let dataSize = UInt32(int16Samples.count * 2) - let fileSize = 36 + dataSize - - // RIFF header - data.append(contentsOf: "RIFF".utf8) - data.appendLittleEndian(fileSize) - data.append(contentsOf: "WAVE".utf8) - - // fmt chunk - data.append(contentsOf: "fmt ".utf8) - data.appendLittleEndian(UInt32(16)) - data.appendLittleEndian(UInt16(1)) // PCM - data.appendLittleEndian(numChannels) - data.appendLittleEndian(UInt32(sampleRate)) - data.appendLittleEndian(byteRate) - data.appendLittleEndian(blockAlign) - data.appendLittleEndian(bitsPerSample) - - // data chunk - data.append(contentsOf: "data".utf8) - data.appendLittleEndian(dataSize) - - for sample in int16Samples { - data.appendLittleEndian(sample) + let dataSize = UInt32(samples.count * 2) + let fileSize: UInt32 = 36 + dataSize + let totalSize = 44 + samples.count * 2 + + var data = Data(count: totalSize) + data.withUnsafeMutableBytes { (raw: UnsafeMutableRawBufferPointer) in + guard let base = raw.baseAddress else { return } + + // --- Header --------------------------------------------------------- + func writeASCII(_ string: String, at offset: Int) { + for (i, byte) in string.utf8.enumerated() { + base.storeBytes(of: byte, toByteOffset: offset + i, as: UInt8.self) + } + } + func writeLE(_ value: T, at offset: Int) { + base.storeBytes(of: value.littleEndian, toByteOffset: offset, as: T.self) + } + + writeASCII("RIFF", at: 0) + writeLE(fileSize, at: 4) + writeASCII("WAVE", at: 8) + + writeASCII("fmt ", at: 12) + writeLE(UInt32(16), at: 16) + writeLE(UInt16(1), at: 20) // PCM + 
writeLE(numChannels, at: 22) + writeLE(UInt32(sampleRate), at: 24) + writeLE(byteRate, at: 28) + writeLE(blockAlign, at: 32) + writeLE(bitsPerSample, at: 34) + + writeASCII("data", at: 36) + writeLE(dataSize, at: 40) + + // --- PCM data ------------------------------------------------------- + // Clip to [-1, 1], scale by Int16.max, convert to Int16 — all via + // Accelerate: three passes, staged through a Float scratch buffer. + guard !samples.isEmpty else { return } + + let dst = base.advanced(by: 44).assumingMemoryBound(to: Int16.self) + let n = vDSP_Length(samples.count) + + samples.withUnsafeBufferPointer { src in + guard let srcBase = src.baseAddress else { return } + + // Scratch buffer for clip+scale; reusing src memory would mutate the + // caller's input, so allocate a transient float buffer. + let scratch = UnsafeMutablePointer.allocate(capacity: samples.count) + defer { scratch.deallocate() } + + // Clip into scratch. + var lo: Float = -1.0 + var hi: Float = 1.0 + vDSP_vclip(srcBase, 1, &lo, &hi, scratch, 1, n) + + // Scale by Int16.max in place. + var scale = Float(Int16.max) + vDSP_vsmul(scratch, 1, &scale, scratch, 1, n) + + // Convert Float → Int16 directly into dst (vDSP_vfix16 truncates toward zero). + vDSP_vfix16(scratch, 1, dst, 1, n) + + // WAV is little-endian. On Apple silicon and Intel, host order is + // already LE so no byte-swap needed. Guard with a debug-only assert + // for any future big-endian Apple platform (none exist today). + assert(1.littleEndian == 1, "WAVEncoder assumes little-endian host") + } } - return data } /// Build a multipart/form-data request for an OpenAI-compatible /// `/v1/audio/transcriptions` endpoint. + /// + /// Returns the request and body separately so callers can pass the body + /// to `URLSession.upload(for:from:)` instead of setting `httpBody`.
static func buildRequest( url: URL, apiKey: String, @@ -229,7 +280,7 @@ enum WAVEncoder { model: String, languageCode: String?, timeoutInterval: TimeInterval = 30 - ) -> URLRequest { + ) -> (URLRequest, Data) { let boundary = UUID().uuidString var request = URLRequest(url: url) @@ -266,8 +317,7 @@ enum WAVEncoder { ) body.append(Data("--\(boundary)--\r\n".utf8)) - request.httpBody = body - return request + return (request, body) } } diff --git a/AudioType/UI/OnboardingView.swift b/AudioType/UI/OnboardingView.swift index 8f862fb..8155343 100644 --- a/AudioType/UI/OnboardingView.swift +++ b/AudioType/UI/OnboardingView.swift @@ -9,7 +9,6 @@ struct OnboardingView: View { @State private var anyCloudKeyConfigured = GroqEngine.isConfigured || OpenAIEngine.isConfigured @State private var apiKeyText = "" @State private var apiKeySaveError: String? - @State private var hasAutoCompleted = false let timer = Timer.publish(every: 0.5, on: .main, in: .common).autoconnect() @@ -77,7 +76,7 @@ struct OnboardingView: View { .font(.caption) .foregroundColor(.secondary) } - Text("Cloud transcription — faster & more accurate") + Text("Cloud transcription - faster & more accurate") .font(.caption) .foregroundColor(.secondary) } @@ -165,17 +164,13 @@ struct OnboardingView: View { checkPermissions() } .onReceive(timer) { _ in - // Continuously check permissions + // Continuously refresh permission state so the UI reflects changes made + // in System Settings. The user closes the window themselves via the + // "Get Started" button once everything is ready. 
microphoneGranted = AVCaptureDevice.authorizationStatus(for: .audio) == .authorized accessibilityGranted = Permissions.checkAccessibility() speechRecognitionGranted = Permissions.isSpeechRecognitionAuthorized anyCloudKeyConfigured = GroqEngine.isConfigured || OpenAIEngine.isConfigured - - // Auto-complete when all required permissions are ready and at least one engine works - if canContinue && !hasAutoCompleted { - hasAutoCompleted = true - onComplete() - } } } diff --git a/AudioType/UI/RecordingOverlay.swift b/AudioType/UI/RecordingOverlay.swift index 39ceb5a..cc7b99e 100644 --- a/AudioType/UI/RecordingOverlay.swift +++ b/AudioType/UI/RecordingOverlay.swift @@ -1,11 +1,10 @@ import SwiftUI struct RecordingOverlay: View { - let text: String @EnvironmentObject var levelMonitor: AudioLevelMonitor private var isRecording: Bool { - text == "Recording..." + levelMonitor.overlayText == "Recording..." } var body: some View { diff --git a/AudioType/Utilities/KeychainHelper.swift b/AudioType/Utilities/KeychainHelper.swift index 5a4b4ec..956e924 100644 --- a/AudioType/Utilities/KeychainHelper.swift +++ b/AudioType/Utilities/KeychainHelper.swift @@ -12,6 +12,13 @@ enum KeychainHelper { subsystem: "com.audiotype", category: "KeychainHelper" ) + // In-memory cache of resolved values. Keychain reads aren't expensive in + // absolute terms but they were happening on every transcription (often + // multiple times) via the engines' apiKey getters. Cache entries are + // invalidated on save/delete. + private static var cache: [String: String?] = [:] + private static let cacheLock = NSLock() + // MARK: - Public API /// Save a value to the Keychain. Overwrites any existing value for the key. @@ -36,11 +43,23 @@ enum KeychainHelper { logger.error("Failed to save key \(key), status: \(status)") throw KeychainError.saveFailed(status) } + + cacheLock.lock() + cache[key] = value + cacheLock.unlock() + logger.info("Saved value for key: \(key)") } /// Retrieve a value from the Keychain. 
static func get(key: String) -> String? { + cacheLock.lock() + if let cached = cache[key] { + cacheLock.unlock() + return cached + } + cacheLock.unlock() + let query: [String: Any] = [ kSecClass as String: kSecClassGenericPassword, kSecAttrService as String: service, @@ -52,13 +71,19 @@ enum KeychainHelper { var result: AnyObject? let status = SecItemCopyMatching(query as CFDictionary, &result) - guard status == errSecSuccess, + let value: String? + if status == errSecSuccess, let data = result as? Data, - let value = String(data: data, encoding: .utf8) - else { - return nil + let decoded = String(data: data, encoding: .utf8) { + value = decoded + } else { + value = nil } + cacheLock.lock() + cache[key] = value + cacheLock.unlock() + return value } @@ -72,6 +97,11 @@ enum KeychainHelper { ] let status = SecItemDelete(query as CFDictionary) + + cacheLock.lock() + cache[key] = .some(nil) // remember "absent" too, to avoid re-querying + cacheLock.unlock() + if status == errSecSuccess || status == errSecItemNotFound { return true }