From 1acea31c6f5ff7e36e4b380f9e63f83e9a1d1c31 Mon Sep 17 00:00:00 2001
From: Utkarsh Patel <itismeutkarsh@gmail.com>
Date: Sat, 2 May 2026 11:16:57 +0530
Subject: [PATCH 01/17] Tear down AVAudioEngine when idle and free buffer on
 stop

The recorder held an AVAudioEngine for the lifetime of the process and
kept the audio buffer's high-water capacity forever. For a menu-bar app
that lives for days this kept the audio HAL warm and steadily grew RAM.

- Make audioEngine optional, create on startRecording, nil on stop
- Drop buffer capacity instead of removeAll(keepingCapacity: true)
- Hand the buffer to the caller on stop and reset the recorder's array
  (no copy), so the recorder stops retaining the recording's storage

AUDIOTYPE-1
---
 AudioType/Core/AudioRecorder.swift | 38 ++++++++++++++++++++----------
 1 file changed, 26 insertions(+), 12 deletions(-)

diff --git a/AudioType/Core/AudioRecorder.swift b/AudioType/Core/AudioRecorder.swift
index 452148c..701051f 100644
--- a/AudioType/Core/AudioRecorder.swift
+++ b/AudioType/Core/AudioRecorder.swift
@@ -2,7 +2,10 @@ import AVFoundation
 import os.log
 
 class AudioRecorder {
-  private let audioEngine = AVAudioEngine()
+  // Lazily created on startRecording and torn down on stopRecording so the
+  // audio HAL doesn't stay warm between recordings (big idle-energy win for
+  // a menu-bar app).
+  private var audioEngine: AVAudioEngine?
   private var audioBuffer: [Float] = []
   private let bufferLock = NSLock()
   private var isRecording = false
@@ -16,8 +19,8 @@ class AudioRecorder {
   private let targetSampleRate: Double = 16000
 
   init() {
-    // Pre-allocate buffer for ~30 seconds of audio at 16kHz
-    audioBuffer.reserveCapacity(Int(targetSampleRate * 30))
+    // Buffer is allocated on each startRecording so the recorder has zero
+    // footprint when idle.
   }
 
   func startRecording() throws {
@@ -26,12 +29,17 @@ class AudioRecorder {
       return
     }
 
-    // Clear previous buffer
+    // Drop the buffer entirely (don't preserve capacity — see issue 1.4).
     bufferLock.lock()
-    audioBuffer.removeAll(keepingCapacity: true)
+    audioBuffer = []
+    audioBuffer.reserveCapacity(Int(targetSampleRate * 30))
     bufferLock.unlock()
 
-    let inputNode = audioEngine.inputNode
+    // Lazily create the audio engine on each recording.
+    let engine = AVAudioEngine()
+    audioEngine = engine
+
+    let inputNode = engine.inputNode
     let inputFormat = inputNode.outputFormat(forBus: 0)
 
     logger.info("Input format: \(inputFormat.sampleRate)Hz, \(inputFormat.channelCount) channels")
@@ -66,8 +74,8 @@ class AudioRecorder {
     }
 
     // Start audio engine
-    audioEngine.prepare()
-    try audioEngine.start()
+    engine.prepare()
+    try engine.start()
 
     isRecording = true
     logger.info("Recording started")
@@ -79,15 +87,21 @@ class AudioRecorder {
       return nil
     }
 
-    // Stop and remove tap
-    audioEngine.inputNode.removeTap(onBus: 0)
-    audioEngine.stop()
+    // Stop and tear down the engine so the audio HAL releases its resources.
+    if let engine = audioEngine {
+      engine.inputNode.removeTap(onBus: 0)
+      engine.stop()
+    }
+    audioEngine = nil
 
     isRecording = false
 
-    // Return captured samples
+    // Move the buffer out of the recorder (zero-copy via COW transfer) and
+    // leave the recorder with a fresh empty array so it doesn't keep the
+    // recording's high-water capacity in memory.
     bufferLock.lock()
     let samples = audioBuffer
+    audioBuffer = []
     bufferLock.unlock()
 
     logger.info(

From cdc74604a6f71a869055f87cde4075fa145fcae1 Mon Sep 17 00:00:00 2001
From: Utkarsh Patel <itismeutkarsh@gmail.com>
Date: Sat, 2 May 2026 11:17:03 +0530
Subject: [PATCH 02/17] Use clipboard paste for long inserts; cache
 CGEventSource

Per-character keystroke synthesis with a 1 ms sleep was the dominant
post-release latency for any non-trivial transcription, and a fresh
CGEventSource was being allocated for every character.

- Route text > 30 chars through the existing clipboard paste path
- Create CGEventSource once per insertion, pass it to insertCharacter

AUDIOTYPE-1
---
 AudioType/Core/TextInserter.swift | 29 +++++++++++++++++++++++------
 1 file changed, 23 insertions(+), 6 deletions(-)

diff --git a/AudioType/Core/TextInserter.swift b/AudioType/Core/TextInserter.swift
index 34186c6..84e9061 100644
--- a/AudioType/Core/TextInserter.swift
+++ b/AudioType/Core/TextInserter.swift
@@ -6,24 +6,41 @@ import os.log
 class TextInserter {
   private let logger = Logger(subsystem: "com.audiotype", category: "TextInserter")
 
+  /// Above this length we paste via clipboard instead of synthesising one
+  /// keystroke per character. Per-char synthesis sleeps ~1 ms after every
+  /// keystroke, so for long dictations it becomes the dominant
+  /// post-recording latency the user feels.
+  private static let clipboardPasteThreshold = 30
+
   func insertText(_ text: String) {
     guard !text.isEmpty else { return }
 
     logger.info("Inserting text: \(text.prefix(50))...")
 
-    // Use CGEvent to simulate keyboard input
-    for char in text {
-      insertCharacter(char)
-      // Small delay between characters for reliability
-      usleep(1000)  // 1ms
+    if text.count > Self.clipboardPasteThreshold {
+      insertTextViaClipboard(text)
+    } else {
+      insertTextViaKeystrokes(text)
     }
 
     logger.info("Text insertion complete")
   }
 
-  private func insertCharacter(_ char: Character) {
+  /// Per-character keystroke synthesis. Used for short strings where
+  /// clipboard paste's clipboard-restore quirks aren't worth it.
+  private func insertTextViaKeystrokes(_ text: String) {
+    // Cache the event source once for the whole insertion — creating one
+    // per character was a measurable hot path.
     let source = CGEventSource(stateID: .hidSystemState)
 
+    for char in text {
+      insertCharacter(char, source: source)
+      // Tiny delay so target apps don't drop events under load.
+      usleep(1000)  // 1ms
+    }
+  }
+
+  private func insertCharacter(_ char: Character, source: CGEventSource?) {
     // Create key down event
     guard let keyDown = CGEvent(keyboardEventSource: source, virtualKey: 0, keyDown: true) else {
       logger.error("Failed to create keyDown event")

From 5f8669892eafa51e8ad8aa85d46b25437a194e07 Mon Sep 17 00:00:00 2001
From: Utkarsh Patel <itismeutkarsh@gmail.com>
Date: Sat, 2 May 2026 11:17:51 +0530
Subject: [PATCH 03/17] Cache RecordingOverlay hosting view; drive text via
 observable
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Each recording start and recording→processing transition was building a
fresh NSHostingView<RecordingOverlay>, which leaked the SwiftUI graph
and Metal layers and was a primary contributor to the +80 MB drift seen
after long sessions.

- Add overlayText to AudioLevelMonitor as @Published
- Read text from the env object inside RecordingOverlay
- Build the NSHostingView once and just mutate overlayText afterwards

AUDIOTYPE-1
---
 AudioType/App/MenuBarController.swift | 39 ++++++++++++++++-----------
 AudioType/UI/RecordingOverlay.swift   |  3 +--
 2 files changed, 25 insertions(+), 17 deletions(-)

diff --git a/AudioType/App/MenuBarController.swift b/AudioType/App/MenuBarController.swift
index e324448..26e3041 100644
--- a/AudioType/App/MenuBarController.swift
+++ b/AudioType/App/MenuBarController.swift
@@ -17,16 +17,20 @@ extension NSImage {
   }
 }
 
-/// Shared observable for live audio level — drives the recording waveform.
+/// Shared observable for live audio level - drives the recording waveform.
+/// Also carries the overlay text so the hosting view doesn't have to be
+/// rebuilt on every state change.
 class AudioLevelMonitor: ObservableObject {
   static let shared = AudioLevelMonitor()
   @Published var level: Float = 0.0
+  @Published var overlayText: String = "Recording..."
 }
 
 class MenuBarController: NSObject, NSWindowDelegate {
   private weak var statusItem: NSStatusItem?
   private var transcriptionManager: TranscriptionManager
   private var recordingWindow: NSWindow?
+  private var recordingHostingView: NSHostingView<AnyView>?
   private var settingsWindow: NSWindow?
 
   init(transcriptionManager: TranscriptionManager) {
@@ -115,7 +119,7 @@ class MenuBarController: NSObject, NSWindowDelegate {
       updateStatusMenuItem("Ready")
 
     case .recording:
-      // Tinted coral/red — non-template so the color shows through
+      // Tinted coral/red - non-template so the color shows through
       if let base = NSImage(
         systemSymbolName: "waveform.circle.fill", accessibilityDescription: "Recording") {
         button.image = base.tinted(with: AudioTypeTheme.nsRecordingRed)
@@ -124,7 +128,7 @@ class MenuBarController: NSObject, NSWindowDelegate {
       updateStatusMenuItem("Recording...")
 
     case .processing:
-      // Tinted amber — "I'm thinking"
+      // Tinted amber - "I'm thinking"
       if let base = NSImage(
         systemSymbolName: "ellipsis.circle.fill", accessibilityDescription: "Processing") {
         button.image = base.tinted(with: AudioTypeTheme.nsAmber)
@@ -175,22 +179,27 @@ class MenuBarController: NSObject, NSWindowDelegate {
       recordingWindow = window
     }
 
-    let hostingView = NSHostingView(
-      rootView: RecordingOverlay(text: "Recording...")
-        .environmentObject(AudioLevelMonitor.shared))
-    hostingView.frame = NSRect(x: 0, y: 0, width: 180, height: 50)
-    recordingWindow?.contentView = hostingView
+    // Build the hosting view once; subsequent updates just mutate the
+    // observable state. Re-creating NSHostingView on every state change
+    // was leaking the SwiftUI graph and Metal layers.
+    if recordingHostingView == nil {
+      let hosting = NSHostingView(
+        rootView: AnyView(
+          RecordingOverlay()
+            .environmentObject(AudioLevelMonitor.shared)
+        )
+      )
+      hosting.frame = NSRect(x: 0, y: 0, width: 180, height: 50)
+      recordingHostingView = hosting
+      recordingWindow?.contentView = hosting
+    }
+
+    AudioLevelMonitor.shared.overlayText = "Recording..."
     recordingWindow?.orderFront(nil)
   }
 
   private func updateRecordingIndicator(text: String) {
-    if let window = recordingWindow {
-      let hostingView = NSHostingView(
-        rootView: RecordingOverlay(text: text)
-          .environmentObject(AudioLevelMonitor.shared))
-      hostingView.frame = NSRect(x: 0, y: 0, width: 180, height: 50)
-      window.contentView = hostingView
-    }
+    AudioLevelMonitor.shared.overlayText = text
   }
 
   private func hideRecordingIndicator() {
diff --git a/AudioType/UI/RecordingOverlay.swift b/AudioType/UI/RecordingOverlay.swift
index 39ceb5a..cc7b99e 100644
--- a/AudioType/UI/RecordingOverlay.swift
+++ b/AudioType/UI/RecordingOverlay.swift
@@ -1,11 +1,10 @@
 import SwiftUI
 
 struct RecordingOverlay: View {
-  let text: String
   @EnvironmentObject var levelMonitor: AudioLevelMonitor
 
   private var isRecording: Bool {
-    text == "Recording..."
+    levelMonitor.overlayText == "Recording..."
   }
 
   var body: some View {

From f9deded48a084093bb9a73becad89112a0a518d1 Mon Sep 17 00:00:00 2001
From: Utkarsh Patel <itismeutkarsh@gmail.com>
Date: Sat, 2 May 2026 11:18:06 +0530
Subject: [PATCH 04/17] Remove notification observers in MenuBarController
 deinit

Selector-based observers on NotificationCenter.default were never
explicitly removed. Since macOS 10.11 NotificationCenter no longer
messages a deallocated observer, so the old dangling-observer crash is
not a risk, but explicit cleanup is still best practice and matters if
the controller is ever re-instantiated.

AUDIOTYPE-1
---
 AudioType/App/MenuBarController.swift | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/AudioType/App/MenuBarController.swift b/AudioType/App/MenuBarController.swift
index 26e3041..9393d58 100644
--- a/AudioType/App/MenuBarController.swift
+++ b/AudioType/App/MenuBarController.swift
@@ -54,6 +54,10 @@ class MenuBarController: NSObject, NSWindowDelegate {
     )
   }
 
+  deinit {
+    NotificationCenter.default.removeObserver(self)
+  }
+
   func setupStatusItem(_ statusItem: NSStatusItem) {
     self.statusItem = statusItem
 

From 635811dec0d2f801744475e3d441497d029ab294 Mon Sep 17 00:00:00 2001
From: Utkarsh Patel <itismeutkarsh@gmail.com>
Date: Sat, 2 May 2026 11:18:36 +0530
Subject: [PATCH 05/17] Cache tinted status-bar icons instead of re-rendering
 per state

NSImage.tinted uses lockFocus/unlockFocus, which allocates a fresh
offscreen bitmap rep on every call. With four distinct icon states
that's a fixed set; pre-render each one once and reuse.

AUDIOTYPE-1
---
 AudioType/App/MenuBarController.swift | 45 ++++++++++++++++-----------
 1 file changed, 27 insertions(+), 18 deletions(-)

diff --git a/AudioType/App/MenuBarController.swift b/AudioType/App/MenuBarController.swift
index 9393d58..57f99f4 100644
--- a/AudioType/App/MenuBarController.swift
+++ b/AudioType/App/MenuBarController.swift
@@ -33,6 +33,29 @@ class MenuBarController: NSObject, NSWindowDelegate {
   private var recordingHostingView: NSHostingView<AnyView>?
   private var settingsWindow: NSWindow?
 
+  // Pre-rendered status-bar icons, built once. Calling NSImage.tinted on
+  // every state change re-rasterizes the symbol via lockFocus/unlockFocus
+  // and was a steady source of bitmap allocations.
+  private lazy var idleIcon: NSImage? = {
+    let img = NSImage(
+      systemSymbolName: "waveform.circle.fill", accessibilityDescription: "Ready")
+    img?.isTemplate = true
+    return img
+  }()
+  private lazy var recordingIcon: NSImage? = {
+    NSImage(systemSymbolName: "waveform.circle.fill", accessibilityDescription: "Recording")?
+      .tinted(with: AudioTypeTheme.nsRecordingRed)
+  }()
+  private lazy var processingIcon: NSImage? = {
+    NSImage(systemSymbolName: "ellipsis.circle.fill", accessibilityDescription: "Processing")?
+      .tinted(with: AudioTypeTheme.nsAmber)
+  }()
+  private lazy var errorIcon: NSImage? = {
+    NSImage(
+      systemSymbolName: "exclamationmark.triangle.fill", accessibilityDescription: "Error")?
+      .tinted(with: .systemRed)
+  }()
+
   init(transcriptionManager: TranscriptionManager) {
     self.transcriptionManager = transcriptionManager
     super.init()
@@ -114,38 +137,24 @@ class MenuBarController: NSObject, NSWindowDelegate {
 
     switch state {
     case .idle:
-      let img = NSImage(
-        systemSymbolName: "waveform.circle.fill", accessibilityDescription: "Ready")
-      img?.isTemplate = true
-      button.image = img
+      button.image = idleIcon
       AudioLevelMonitor.shared.level = 0
       hideRecordingIndicator()
       updateStatusMenuItem("Ready")
 
     case .recording:
-      // Tinted coral/red - non-template so the color shows through
-      if let base = NSImage(
-        systemSymbolName: "waveform.circle.fill", accessibilityDescription: "Recording") {
-        button.image = base.tinted(with: AudioTypeTheme.nsRecordingRed)
-      }
+      button.image = recordingIcon
       showRecordingIndicator()
       updateStatusMenuItem("Recording...")
 
     case .processing:
-      // Tinted amber - "I'm thinking"
-      if let base = NSImage(
-        systemSymbolName: "ellipsis.circle.fill", accessibilityDescription: "Processing") {
-        button.image = base.tinted(with: AudioTypeTheme.nsAmber)
-      }
+      button.image = processingIcon
       AudioLevelMonitor.shared.level = 0
       updateRecordingIndicator(text: "Processing...")
       updateStatusMenuItem("Processing...")
 
     case .error(let message):
-      let img = NSImage(
-        systemSymbolName: "exclamationmark.triangle.fill", accessibilityDescription: "Error")
-      img?.isTemplate = false
-      button.image = img?.tinted(with: .systemRed)
+      button.image = errorIcon
       hideRecordingIndicator()
       updateStatusMenuItem("Error: \(message)")
     }

From b85526b2c38a02c2a68e0400a5b8dddb353938bd Mon Sep 17 00:00:00 2001
From: Utkarsh Patel <itismeutkarsh@gmail.com>
Date: Sat, 2 May 2026 11:19:11 +0530
Subject: [PATCH 06/17] Return CGEvent unretained from event tap callback
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

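The tap callback fires on every modifier-key change system-wide. Each
return value used Unmanaged.passRetained(event), which adds an extra +1
retain on every event. Per the CGEventTapCallBack contract the tap
releases only the event it passed in (plus any *different* event the
callback returns), so that retain is not balanced when the same event
is handed back. Apple's own sample code returns passUnretained because
the event is already owned by the system.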

AUDIOTYPE-1
---
 AudioType/Core/HotKeyManager.swift | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/AudioType/Core/HotKeyManager.swift b/AudioType/Core/HotKeyManager.swift
index 075b053..b6c396c 100644
--- a/AudioType/Core/HotKeyManager.swift
+++ b/AudioType/Core/HotKeyManager.swift
@@ -36,7 +36,10 @@ class HotKeyManager {
         options: .defaultTap,
         eventsOfInterest: eventMask,
         callback: { proxy, type, event, refcon in
-          guard let refcon = refcon else { return Unmanaged.passRetained(event) }
+          // The event is owned by the system; pass it back unretained.
+          // Using passRetained here added an extra +1 retain per event
+          // (i.e. on every modifier-key change system-wide).
+          guard let refcon = refcon else { return Unmanaged.passUnretained(event) }
           let manager = Unmanaged<HotKeyManager>.fromOpaque(refcon).takeUnretainedValue()
           return manager.handleEvent(proxy: proxy, type: type, event: event)
         },
@@ -85,7 +88,7 @@ class HotKeyManager {
       if let tap = eventTap {
         CGEvent.tapEnable(tap: tap, enable: true)
       }
-      return Unmanaged.passRetained(event)
+      return Unmanaged.passUnretained(event)
     }
 
     let flags = event.flags
@@ -120,7 +123,7 @@ class HotKeyManager {
       }
     }
 
-    return Unmanaged.passRetained(event)
+    return Unmanaged.passUnretained(event)
   }
 
   deinit {

From 59aa202642d4fdc090a63d070fd78a42d3e1fa28 Mon Sep 17 00:00:00 2001
From: Utkarsh Patel <itismeutkarsh@gmail.com>
Date: Sat, 2 May 2026 11:20:28 +0530
Subject: [PATCH 07/17] Vectorise RMS and stop allocating per audio tap
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The audio-tap callback fires every ~100 ms during recording. Each call
was allocating an intermediate [Float] from the converter output, then
appending it to the main buffer (a second copy), then computing RMS via
a scalar Array.reduce.

- Compute RMS with vDSP_measqv (vectorised, ~5-10× faster)
- Append directly from UnsafeBufferPointer; no intermediate Array
- Wrap all bufferLock acquires with defer { unlock() } for safety

AUDIOTYPE-1
---
 AudioType/Core/AudioRecorder.swift | 57 +++++++++++++++++++-----------
 1 file changed, 36 insertions(+), 21 deletions(-)

diff --git a/AudioType/Core/AudioRecorder.swift b/AudioType/Core/AudioRecorder.swift
index 701051f..42ab4e5 100644
--- a/AudioType/Core/AudioRecorder.swift
+++ b/AudioType/Core/AudioRecorder.swift
@@ -1,4 +1,5 @@
 import AVFoundation
+import Accelerate
 import os.log
 
 class AudioRecorder {
@@ -30,10 +31,12 @@ class AudioRecorder {
     }
 
     // Drop the buffer entirely (don't preserve capacity — see issue 1.4).
-    bufferLock.lock()
-    audioBuffer = []
-    audioBuffer.reserveCapacity(Int(targetSampleRate * 30))
-    bufferLock.unlock()
+    do {
+      bufferLock.lock()
+      defer { bufferLock.unlock() }
+      audioBuffer = []
+      audioBuffer.reserveCapacity(Int(targetSampleRate * 30))
+    }
 
     // Lazily create the audio engine on each recording.
     let engine = AVAudioEngine()
@@ -99,10 +102,13 @@ class AudioRecorder {
     // Move the buffer out of the recorder (zero-copy via COW transfer) and
     // leave the recorder with a fresh empty array so it doesn't keep the
     // recording's high-water capacity in memory.
-    bufferLock.lock()
-    let samples = audioBuffer
-    audioBuffer = []
-    bufferLock.unlock()
+    let samples: [Float]
+    do {
+      bufferLock.lock()
+      defer { bufferLock.unlock() }
+      samples = audioBuffer
+      audioBuffer = []
+    }
 
     logger.info(
       "Recording stopped, captured \(samples.count) samples (\(Double(samples.count) / self.targetSampleRate, format: .fixed(precision: 2))s)"
@@ -114,10 +120,7 @@ class AudioRecorder {
   private func processAudioBuffer(
     _ buffer: AVAudioPCMBuffer, converter: AVAudioConverter?, targetFormat: AVAudioFormat
   ) {
-    var samplesArray: [Float]
-
     if let converter = converter {
-      // Need to convert to target format
       let frameCount = AVAudioFrameCount(
         Double(buffer.frameLength) * targetSampleRate / buffer.format.sampleRate
       )
@@ -143,25 +146,37 @@ class AudioRecorder {
       }
 
       guard let channelData = convertedBuffer.floatChannelData else { return }
-      samplesArray = Array(
-        UnsafeBufferPointer(start: channelData[0], count: Int(convertedBuffer.frameLength)))
+      let count = Int(convertedBuffer.frameLength)
+      consume(samples: channelData[0], count: count)
     } else {
-      // Already in correct format
       guard let channelData = buffer.floatChannelData else { return }
-      samplesArray = Array(
-        UnsafeBufferPointer(start: channelData[0], count: Int(buffer.frameLength)))
+      let count = Int(buffer.frameLength)
+      consume(samples: channelData[0], count: count)
     }
+  }
 
-    // Compute RMS level for live waveform
-    let rms = sqrt(samplesArray.reduce(0) { $0 + $1 * $1 } / Float(max(samplesArray.count, 1)))
+  /// Consume a chunk of mic samples: compute RMS for the waveform and append
+  /// to the recording buffer — without ever materialising an intermediate
+  /// `[Float]`. Called on the audio thread.
+  private func consume(samples: UnsafePointer<Float>, count: Int) {
+    guard count > 0 else { return }
+
+    // RMS via Accelerate (vectorised). Replaces a scalar reduce loop that
+    // ran on every tap callback.
+    var meanSquare: Float = 0
+    vDSP_measqv(samples, 1, &meanSquare, vDSP_Length(count))
+    let rms = sqrt(meanSquare)
     // Normalize: typical speech RMS is 0.01–0.15, scale aggressively to 0–1
     let level = min(rms * 25, 1.0)
     onLevelUpdate?(level)
 
-    // Append to buffer
+    // Append directly from the unsafe buffer pointer; [Float] has an
+    // append(contentsOf:) overload that takes any Sequence, including
+    // UnsafeBufferPointer, so no intermediate Array is allocated.
+    let ptr = UnsafeBufferPointer(start: samples, count: count)
     bufferLock.lock()
-    audioBuffer.append(contentsOf: samplesArray)
-    bufferLock.unlock()
+    defer { bufferLock.unlock() }
+    audioBuffer.append(contentsOf: ptr)
   }
 }
 

From 5ab3b07d10f14a29cc4a3ef239276a48075943c6 Mon Sep 17 00:00:00 2001
From: Utkarsh Patel <itismeutkarsh@gmail.com>
Date: Sat, 2 May 2026 11:21:22 +0530
Subject: [PATCH 08/17] Rewrite WAVEncoder.encode with preallocated Data + vDSP
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The encoder allocated an intermediate [Int16] (~960 KB for a 30 s clip),
let Data realloc as it grew from 0, then made one appendLittleEndian
call per sample (~480 000 calls).

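- Allocate final Data once at exact size
- Write header in place via storeBytes
- Clip + scale via vDSP in a transient Float scratch buffer, then
  convert Float→Int16 (vDSP_vfix16, truncating toward zero like the old
  Int16(_:) conversion) straight into the data region

Produces byte-identical output. Encoding is faster on long recordings
and peak memory no longer includes Data's regrowth copies; note that
the scratch buffer (4 bytes/sample) is itself a transient allocation,
larger than the [Int16] it replaces.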

AUDIOTYPE-1
---
 AudioType/Core/WAVEncoder.swift | 111 ++++++++++++++++++++++----------
 1 file changed, 78 insertions(+), 33 deletions(-)

diff --git a/AudioType/Core/WAVEncoder.swift b/AudioType/Core/WAVEncoder.swift
index ea1210a..d20dacc 100644
--- a/AudioType/Core/WAVEncoder.swift
+++ b/AudioType/Core/WAVEncoder.swift
@@ -1,3 +1,4 @@
+import Accelerate
 import Foundation
 import os.log
 
@@ -25,7 +26,7 @@ struct WhisperAPIConfig {
 /// and response parsing are all handled here.
 class WhisperAPIEngine: TranscriptionEngine {
 
-  /// Provider configuration — subclasses must override.
+  /// Provider configuration - subclasses must override.
   var config: WhisperAPIConfig {
     fatalError("Subclasses must override config")
   }
@@ -178,45 +179,89 @@ enum WhisperAPIError: Error, LocalizedError {
 enum WAVEncoder {
 
   /// Encode Float32 PCM samples into WAV in memory (16-bit PCM, mono).
+  ///
+  /// The previous implementation allocated an intermediate `[Int16]`
+  /// (~960 KB for a 30 s clip), let `Data` realloc as it grew, and
+  /// did 480 000 individual `appendLittleEndian` calls. This version:
+  ///
+  /// - Allocates the final `Data` once at exact size (44-byte header + 2N).
+  /// - Writes the header in place.
+  /// - Uses Accelerate to clip and scale in a scratch buffer, then
+  ///   converts Float → Int16 directly into the data region.
   static func encode(samples: [Float], sampleRate: Int) -> Data {
-    var data = Data()
-
-    let int16Samples = samples.map { sample -> Int16 in
-      let clamped = max(-1.0, min(1.0, sample))
-      return Int16(clamped * Float(Int16.max))
-    }
-
     let numChannels: UInt16 = 1
     let bitsPerSample: UInt16 = 16
     let byteRate = UInt32(sampleRate)
       * UInt32(numChannels) * UInt32(bitsPerSample / 8)
     let blockAlign = numChannels * (bitsPerSample / 8)
-    let dataSize = UInt32(int16Samples.count * 2)
-    let fileSize = 36 + dataSize
-
-    // RIFF header
-    data.append(contentsOf: "RIFF".utf8)
-    data.appendLittleEndian(fileSize)
-    data.append(contentsOf: "WAVE".utf8)
-
-    // fmt chunk
-    data.append(contentsOf: "fmt ".utf8)
-    data.appendLittleEndian(UInt32(16))
-    data.appendLittleEndian(UInt16(1))  // PCM
-    data.appendLittleEndian(numChannels)
-    data.appendLittleEndian(UInt32(sampleRate))
-    data.appendLittleEndian(byteRate)
-    data.appendLittleEndian(blockAlign)
-    data.appendLittleEndian(bitsPerSample)
-
-    // data chunk
-    data.append(contentsOf: "data".utf8)
-    data.appendLittleEndian(dataSize)
-
-    for sample in int16Samples {
-      data.appendLittleEndian(sample)
+    let dataSize = UInt32(samples.count * 2)
+    let fileSize: UInt32 = 36 + dataSize
+    let totalSize = 44 + samples.count * 2
+
+    var data = Data(count: totalSize)
+    data.withUnsafeMutableBytes { (raw: UnsafeMutableRawBufferPointer) -> Void in
+      guard let base = raw.baseAddress else { return }
+
+      // --- Header ---------------------------------------------------------
+      func writeASCII(_ string: String, at offset: Int) {
+        for (i, byte) in string.utf8.enumerated() {
+          base.storeBytes(of: byte, toByteOffset: offset + i, as: UInt8.self)
+        }
+      }
+      func writeLE<T: FixedWidthInteger>(_ value: T, at offset: Int) {
+        base.storeBytes(of: value.littleEndian, toByteOffset: offset, as: T.self)
+      }
+
+      writeASCII("RIFF", at: 0)
+      writeLE(fileSize, at: 4)
+      writeASCII("WAVE", at: 8)
+
+      writeASCII("fmt ", at: 12)
+      writeLE(UInt32(16), at: 16)
+      writeLE(UInt16(1), at: 20)  // PCM
+      writeLE(numChannels, at: 22)
+      writeLE(UInt32(sampleRate), at: 24)
+      writeLE(byteRate, at: 28)
+      writeLE(blockAlign, at: 32)
+      writeLE(bitsPerSample, at: 34)
+
+      writeASCII("data", at: 36)
+      writeLE(dataSize, at: 40)
+
+      // --- PCM data -------------------------------------------------------
+      // Clip to [-1, 1], scale by Int16.max, convert to Int16 — all via
+      // Accelerate; only the final conversion writes into the destination.
+      guard !samples.isEmpty else { return }
+
+      let dst = base.advanced(by: 44).assumingMemoryBound(to: Int16.self)
+      let n = vDSP_Length(samples.count)
+
+      samples.withUnsafeBufferPointer { src in
+        guard let srcBase = src.baseAddress else { return }
+
+        // Scratch buffer for clip+scale; reuse src memory would mutate the
+        // caller's input, so allocate a transient float buffer.
+        let scratch = UnsafeMutablePointer<Float>.allocate(capacity: samples.count)
+        defer { scratch.deallocate() }
+
+        // Clip into scratch.
+        var lo: Float = -1.0
+        var hi: Float = 1.0
+        vDSP_vclip(srcBase, 1, &lo, &hi, scratch, 1, n)
+
+        // Scale by Int16.max in place.
+        var scale = Float(Int16.max)
+        vDSP_vsmul(scratch, 1, &scale, scratch, 1, n)
+
+        // Convert Float → Int16 (truncating toward zero) directly into dst.
+        vDSP_vfix16(scratch, 1, dst, 1, n)
+
+        // WAV is little-endian. On Apple silicon and Intel, host order is
+        // already LE so no byte-swap needed. Guard with a debug-only assert
+        // for any future big-endian Apple platform (none exist today).
+        assert(1.littleEndian == 1, "WAVEncoder assumes little-endian host")
+      }
     }
-
     return data
   }
 

From 854ab2d2661044fcc4e5835de94ff4bf9054a896 Mon Sep 17 00:00:00 2001
From: Utkarsh Patel <itismeutkarsh@gmail.com>
Date: Sat, 2 May 2026 11:21:57 +0530
Subject: [PATCH 09/17] Switch transcribe upload to
 URLSession.upload(for:from:)

Setting URLRequest.httpBody and calling URLSession.shared.data(for:)
typically holds the body in two places (the request and URLSession's
internal copy). For ~2 MB WAV bodies that's wasted memory. upload(for:
from:) takes the body once and forwards it.

- buildRequest now returns (URLRequest, Data) instead of mutating httpBody
- transcribe now uses upload(for:from:)

AUDIOTYPE-1
---
 AudioType/Core/WAVEncoder.swift | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

diff --git a/AudioType/Core/WAVEncoder.swift b/AudioType/Core/WAVEncoder.swift
index d20dacc..970f775 100644
--- a/AudioType/Core/WAVEncoder.swift
+++ b/AudioType/Core/WAVEncoder.swift
@@ -88,7 +88,7 @@ class WhisperAPIEngine: TranscriptionEngine {
       throw WhisperAPIError.invalidURL
     }
 
-    let request = WAVEncoder.buildRequest(
+    let (request, body) = WAVEncoder.buildRequest(
       url: url,
       apiKey: apiKey,
       wavData: wavData,
@@ -98,7 +98,10 @@ class WhisperAPIEngine: TranscriptionEngine {
 
     let (data, response): (Data, URLResponse)
     do {
-      (data, response) = try await URLSession.shared.data(for: request)
+      // upload(for:from:) keeps a single copy of the body; setting
+      // request.httpBody and calling data(for:) tends to keep the body
+      // resident in two places. With ~2 MB WAV bodies this matters.
+      (data, response) = try await URLSession.shared.upload(for: request, from: body)
     } catch {
       throw WhisperAPIError.networkError(error.localizedDescription)
     }
@@ -267,6 +270,9 @@ enum WAVEncoder {
 
   /// Build a multipart/form-data request for an OpenAI-compatible
   /// `/v1/audio/transcriptions` endpoint.
+  ///
+  /// Returns the request and body separately so callers can pass the body
+  /// to `URLSession.upload(for:from:)` instead of setting `httpBody`.
   static func buildRequest(
     url: URL,
     apiKey: String,
@@ -274,7 +280,7 @@ enum WAVEncoder {
     model: String,
     languageCode: String?,
     timeoutInterval: TimeInterval = 30
-  ) -> URLRequest {
+  ) -> (URLRequest, Data) {
     let boundary = UUID().uuidString
 
     var request = URLRequest(url: url)
@@ -311,8 +317,7 @@ enum WAVEncoder {
     )
     body.append(Data("--\(boundary)--\r\n".utf8))
 
-    request.httpBody = body
-    return request
+    return (request, body)
   }
 }
 

From 22e62b5e7e2bd7dbcc9afe8d4ea296727a13afca Mon Sep 17 00:00:00 2001
From: Utkarsh Patel <itismeutkarsh@gmail.com>
Date: Sat, 2 May 2026 11:22:48 +0530
Subject: [PATCH 10/17] Compile single regex for TextPostProcessor replacements

The processor was running ~85 case-insensitive replacingOccurrences
calls on the full transcription, each O(n), and rebuilding a merged
dictionary on every call. Dictionary iteration order is also undefined,
so identical inputs could produce different outputs across runs.

- Compile a single NSRegularExpression with alternation, longest-first
- Cache the compiled regex + lookup; rebuild only on catalog changes
- Apply replacements in one match-and-stitch pass
- Deterministic output ordering as a side benefit

Substring (not word-bounded) matching is preserved to match prior
behavior.

AUDIOTYPE-1
---
 AudioType/Core/TextPostProcessor.swift | 97 +++++++++++++++++++++-----
 1 file changed, 80 insertions(+), 17 deletions(-)

diff --git a/AudioType/Core/TextPostProcessor.swift b/AudioType/Core/TextPostProcessor.swift
index 706fd25..6aec6c4 100644
--- a/AudioType/Core/TextPostProcessor.swift
+++ b/AudioType/Core/TextPostProcessor.swift
@@ -106,41 +106,37 @@ class TextPostProcessor {
   // User-defined custom replacements
   private var customReplacements: [String: String] = [:]
 
+  // Cached compiled regex + lookup table. Rebuilt only when the catalog
+  // changes (custom replacements added/removed). The previous code rebuilt
+  // a merged dictionary and ran ~85 case-insensitive String scans on every
+  // single transcription.
+  private var cachedRegex: NSRegularExpression?
+  private var cachedLookup: [String: String] = [:]
+  private let regexLock = NSLock()
+
   private init() {
     loadCustomReplacements()
+    rebuildRegex()
   }
 
   /// Process transcribed text with corrections
   func process(_ text: String) -> String {
-    var result = text
-
-    // Apply word replacements (case-insensitive)
-    let allReplacements = wordReplacements.merging(customReplacements) { _, custom in custom }
-
-    for (pattern, replacement) in allReplacements {
-      result = result.replacingOccurrences(
-        of: pattern,
-        with: replacement,
-        options: .caseInsensitive
-      )
-    }
-
-    // Capitalize first letter of sentences
-    result = capitalizeSentences(result)
-
-    return result
+    let result = applyReplacements(text)
+    return capitalizeSentences(result)
   }
 
   /// Add a custom word replacement
   func addCustomReplacement(from: String, to: String) {
     customReplacements[from.lowercased()] = to
     saveCustomReplacements()
+    rebuildRegex()
   }
 
   /// Remove a custom replacement
   func removeCustomReplacement(from: String) {
     customReplacements.removeValue(forKey: from.lowercased())
     saveCustomReplacements()
+    rebuildRegex()
   }
 
   /// Get all custom replacements
@@ -150,6 +146,73 @@ class TextPostProcessor {
 
   // MARK: - Private
 
+  /// Rebuild the compiled regex from the current built-in + custom catalogs.
+  /// Custom replacements override built-ins on key collision.
+  private func rebuildRegex() {
+    regexLock.lock()
+    defer { regexLock.unlock() }
+
+    let merged = wordReplacements.merging(customReplacements) { _, custom in custom }
+    cachedLookup = [:]
+    cachedLookup.reserveCapacity(merged.count)
+    for (key, value) in merged {
+      cachedLookup[key.lowercased()] = value
+    }
+
+    // Sort keys longest-first so that, of two keys sharing a prefix, the
+    // longer one wins (e.g. "api key" over "api"). Unlike the old per-key
+    // loop, pattern order depends on hashing only among equal-length keys.
+    let keys = merged.keys.sorted { $0.count > $1.count }
+    let pattern = keys.map { NSRegularExpression.escapedPattern(for: $0) }
+      .joined(separator: "|")
+
+    cachedRegex = try? NSRegularExpression(
+      pattern: pattern,
+      options: [.caseInsensitive]
+    )
+  }
+
+  /// Apply replacements in a single regex pass.
+  private func applyReplacements(_ text: String) -> String {
+    regexLock.lock()
+    let regex = cachedRegex
+    let lookup = cachedLookup
+    regexLock.unlock()
+
+    guard let regex = regex, !text.isEmpty else { return text }
+
+    let nsText = text as NSString
+    let range = NSRange(location: 0, length: nsText.length)
+    let matches = regex.matches(in: text, options: [], range: range)
+    if matches.isEmpty { return text }
+
+    // Reassemble in one pass, alternating original spans and replacements.
+    var result = ""
+    result.reserveCapacity(text.count)
+    var cursor = 0
+    for match in matches {
+      let r = match.range
+      if r.location > cursor {
+        result.append(
+          nsText.substring(with: NSRange(location: cursor, length: r.location - cursor))
+        )
+      }
+      let matched = nsText.substring(with: r).lowercased()
+      if let replacement = lookup[matched] {
+        result.append(replacement)
+      } else {
+        result.append(nsText.substring(with: r))
+      }
+      cursor = r.location + r.length
+    }
+    if cursor < nsText.length {
+      result.append(
+        nsText.substring(with: NSRange(location: cursor, length: nsText.length - cursor))
+      )
+    }
+    return result
+  }
+
   private func capitalizeSentences(_ text: String) -> String {
     var result = ""
     var capitalizeNext = true

From 31c536a2abcc5c43cd2747e3f55847e17038b173 Mon Sep 17 00:00:00 2001
From: Utkarsh Patel <itismeutkarsh@gmail.com>
Date: Sat, 2 May 2026 11:23:27 +0530
Subject: [PATCH 11/17] Cache Keychain reads in memory; invalidate on
 save/delete

Every transcription resolved the API key via SecItemCopyMatching, often
multiple times across the engine's isAvailable/apiKey accessors. Cache
the resolved value (including the absent state) and invalidate on
save/delete.

AUDIOTYPE-1
---
 AudioType/Utilities/KeychainHelper.swift | 39 +++++++++++++++++++++---
 1 file changed, 35 insertions(+), 4 deletions(-)

diff --git a/AudioType/Utilities/KeychainHelper.swift b/AudioType/Utilities/KeychainHelper.swift
index 5a4b4ec..fb41886 100644
--- a/AudioType/Utilities/KeychainHelper.swift
+++ b/AudioType/Utilities/KeychainHelper.swift
@@ -12,6 +12,13 @@ enum KeychainHelper {
     subsystem: "com.audiotype", category: "KeychainHelper"
   )
 
+  // In-memory cache of resolved values. Keychain reads aren't expensive in
+  // absolute terms but they were happening on every transcription (often
+  // multiple times) via the engines' apiKey getters. Saving overwrites the
+  // cached entry; deleting records the key as absent.
+  private static var cache: [String: String?] = [:]
+  private static let cacheLock = NSLock()
+
   // MARK: - Public API
 
   /// Save a value to the Keychain. Overwrites any existing value for the key.
@@ -36,11 +43,23 @@ enum KeychainHelper {
       logger.error("Failed to save key \(key), status: \(status)")
       throw KeychainError.saveFailed(status)
     }
+
+    cacheLock.lock()
+    cache[key] = value
+    cacheLock.unlock()
+
     logger.info("Saved value for key: \(key)")
   }
 
   /// Retrieve a value from the Keychain.
   static func get(key: String) -> String? {
+    cacheLock.lock()
+    if let cached = cache[key] {
+      cacheLock.unlock()
+      return cached
+    }
+    cacheLock.unlock()
+
     let query: [String: Any] = [
       kSecClass as String: kSecClassGenericPassword,
       kSecAttrService as String: service,
@@ -52,13 +71,20 @@ enum KeychainHelper {
     var result: AnyObject?
     let status = SecItemCopyMatching(query as CFDictionary, &result)
 
-    guard status == errSecSuccess,
+    let value: String?
+    if status == errSecSuccess,
       let data = result as? Data,
-      let value = String(data: data, encoding: .utf8)
-    else {
-      return nil
+      let decoded = String(data: data, encoding: .utf8)
+    {
+      value = decoded
+    } else {
+      value = nil
     }
 
+    cacheLock.lock()
+    cache[key] = value
+    cacheLock.unlock()
+
     return value
   }
 
@@ -72,6 +98,11 @@ enum KeychainHelper {
     ]
 
     let status = SecItemDelete(query as CFDictionary)
+
+    cacheLock.lock()
+    cache[key] = .some(nil)  // remember "absent" too, to avoid re-querying
+    cacheLock.unlock()
+
     if status == errSecSuccess || status == errSecItemNotFound {
       return true
     }

From c9774d0db06c5029d2a5b4ac4b1ca60b767d8436 Mon Sep 17 00:00:00 2001
From: Utkarsh Patel <itismeutkarsh@gmail.com>
Date: Sat, 2 May 2026 11:24:04 +0530
Subject: [PATCH 12/17] Hold transcription Task and cancel on next recording

A new Task.detached was spawned per recording without holding the
handle. If the user fired the hotkey while a previous transcription was
still in-flight (e.g. slow network), both would race and stale text
could land in the user's new focus.

- Hold the task in transcriptionTask
- Cancel any pending task before starting a new recording

AUDIOTYPE-1
---
 AudioType/App/TranscriptionManager.swift | 23 +++++++++++++++++------
 1 file changed, 17 insertions(+), 6 deletions(-)

diff --git a/AudioType/App/TranscriptionManager.swift b/AudioType/App/TranscriptionManager.swift
index 5948c4c..10b44db 100644
--- a/AudioType/App/TranscriptionManager.swift
+++ b/AudioType/App/TranscriptionManager.swift
@@ -32,6 +32,11 @@ class TranscriptionManager: ObservableObject {
   private var hotKeyManager: HotKeyManager?
   private var textInserter: TextInserter?
 
+  /// Active transcription task. Held so a new recording can cancel any
+  /// in-flight transcription from a previous one (e.g. user re-triggers
+  /// the hotkey while the network call is still pending).
+  private var transcriptionTask: Task<Void, Never>?
+
   private let logger = Logger(subsystem: "com.audiotype", category: "TranscriptionManager")
 
   private init() {}
@@ -60,7 +65,7 @@ class TranscriptionManager: ObservableObject {
 
     if !EngineResolver.anyEngineAvailable {
       logger.warning("No transcription engine available")
-      setState(.error("No engine available — add a cloud API key or enable Apple Speech"))
+      setState(.error("No engine available - add a cloud API key or enable Apple Speech"))
     } else {
       logger.info("Transcription engine ready: \(engine.displayName)")
     }
@@ -85,7 +90,7 @@ class TranscriptionManager: ObservableObject {
     audioRecorder = nil
   }
 
-  /// Called when the user saves an API key or changes engine preference — re-evaluate.
+  /// Called when the user saves an API key or changes engine preference - re-evaluate.
   func onEngineConfigChanged() {
     let engine = EngineResolver.resolve()
     activeEngineName = engine.displayName
@@ -93,7 +98,7 @@ class TranscriptionManager: ObservableObject {
       setState(.idle)
       logger.info("Engine config changed, active engine: \(engine.displayName)")
     } else {
-      setState(.error("No engine available — add a cloud API key or enable Apple Speech"))
+      setState(.error("No engine available - add a cloud API key or enable Apple Speech"))
     }
   }
 
@@ -118,10 +123,15 @@ class TranscriptionManager: ObservableObject {
     }
 
     guard EngineResolver.anyEngineAvailable else {
-      setState(.error("No engine available — add a cloud API key or enable Apple Speech"))
+      setState(.error("No engine available - add a cloud API key or enable Apple Speech"))
       return
     }
 
+    // Cancel any still-pending transcription from a previous recording so
+    // we don't insert stale text into the user's new context.
+    transcriptionTask?.cancel()
+    transcriptionTask = nil
+
     do {
       try audioRecorder?.startRecording()
       setState(.recording)
@@ -147,8 +157,9 @@ class TranscriptionManager: ObservableObject {
     logger.info("Recording stopped, captured \(samples.count) samples")
     setState(.processing)
 
-    // Transcribe in background
-    Task.detached { [weak self] in
+    // Transcribe in background. Hold the task so the next recording can
+    // cancel it if it's still pending.
+    transcriptionTask = Task.detached { [weak self] in
       await self?.transcribeAndInsert(samples: samples)
     }
   }

From 59589ebbef134a8bd0bbe0f433fc1c9c6af3e530 Mon Sep 17 00:00:00 2001
From: Utkarsh Patel <itismeutkarsh@gmail.com>
Date: Sat, 2 May 2026 11:24:36 +0530
Subject: [PATCH 13/17] Resolve transcription engine once per recording
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

EngineResolver.resolve() was called twice — once via anyEngineAvailable
at startRecording and once at transcribeAndInsert. Each call instantiated
a fresh engine and (for cloud engines) hit the Keychain. Resolve once at
recording start and reuse for the matching transcription.

This also ensures the engine identity can't flip mid-recording if the
user edits settings during capture.

AUDIOTYPE-1
---
 AudioType/App/TranscriptionManager.swift | 27 +++++++++++++++++-------
 1 file changed, 19 insertions(+), 8 deletions(-)

diff --git a/AudioType/App/TranscriptionManager.swift b/AudioType/App/TranscriptionManager.swift
index 10b44db..17dbcdb 100644
--- a/AudioType/App/TranscriptionManager.swift
+++ b/AudioType/App/TranscriptionManager.swift
@@ -116,6 +116,12 @@ class TranscriptionManager: ObservableObject {
     }
   }
 
+  /// Engine resolved at recording start and reused for the matching
+  /// transcription. Keeps Keychain / availability checks out of the
+  /// post-stop hot path and ensures the engine identity doesn't change
+  /// mid-recording if the user edits settings.
+  private var activeEngine: TranscriptionEngine?
+
   private func startRecording() {
     guard state == .idle else {
       logger.warning("Cannot start recording: not in idle state")
@@ -132,10 +138,15 @@ class TranscriptionManager: ObservableObject {
     transcriptionTask?.cancel()
     transcriptionTask = nil
 
+    // Resolve the engine once, up front. transcribeAndInsert will reuse it.
+    let engine = EngineResolver.resolve()
+    activeEngine = engine
+    activeEngineName = engine.displayName
+
     do {
       try audioRecorder?.startRecording()
       setState(.recording)
-      logger.info("Recording started")
+      logger.info("Recording started with engine: \(engine.displayName)")
     } catch {
       logger.error("Failed to start recording: \(error.localizedDescription)")
       setState(.error("Failed to start recording"))
@@ -154,22 +165,22 @@ class TranscriptionManager: ObservableObject {
       return
     }
 
+    // Take the engine resolved at startRecording. Falls back to a fresh
+    // resolution defensively if somehow nil.
+    let engine = activeEngine ?? EngineResolver.resolve()
+    activeEngine = nil
+
     logger.info("Recording stopped, captured \(samples.count) samples")
     setState(.processing)
 
     // Transcribe in background. Hold the task so the next recording can
     // cancel it if it's still pending.
     transcriptionTask = Task.detached { [weak self] in
-      await self?.transcribeAndInsert(samples: samples)
+      await self?.transcribeAndInsert(samples: samples, engine: engine)
     }
   }
 
-  private func transcribeAndInsert(samples: [Float]) async {
-    let engine = EngineResolver.resolve()
-
-    await MainActor.run {
-      self.activeEngineName = engine.displayName
-    }
+  private func transcribeAndInsert(samples: [Float], engine: TranscriptionEngine) async {
 
     let startTime = CFAbsoluteTimeGetCurrent()
 

From 0e855f86156c5fbd8eb12fabd9c054d3605f0707 Mon Sep 17 00:00:00 2001
From: Utkarsh Patel <itismeutkarsh@gmail.com>
Date: Sat, 2 May 2026 11:25:28 +0530
Subject: [PATCH 14/17] Retain self for event-tap lifetime; invalidate port on
 stop
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The previous code passed self unretained as the tap's refcon. If self
were ever released while a callback was in-flight on another thread,
takeUnretainedValue would dereference freed memory.

- Retain self with passRetained on startListening; release on stopListening
- Invalidate the CFMachPort before tearing down the run loop source so
  no further callbacks can fire while we clean up
- Release the retain after the tap is dead so any in-flight callback
  still sees a live self

This makes deinit unreachable while listening, which is the correct
trade-off — cleanup must go through stopListening explicitly (which
TranscriptionManager.cleanup already does).

AUDIOTYPE-1
---
 AudioType/Core/HotKeyManager.swift | 26 +++++++++++++++++++++++---
 1 file changed, 23 insertions(+), 3 deletions(-)

diff --git a/AudioType/Core/HotKeyManager.swift b/AudioType/Core/HotKeyManager.swift
index b6c396c..6c4c146 100644
--- a/AudioType/Core/HotKeyManager.swift
+++ b/AudioType/Core/HotKeyManager.swift
@@ -14,6 +14,12 @@ class HotKeyManager {
   private let callback: (HotKeyEvent) -> Void
   private var isRecording = false
 
+  // Retained pointer to self that the event-tap callback uses as refcon.
+  // Holding self retained for the lifetime of the tap means the tap
+  // callback is always safe to call back into self, even if the owner
+  // releases its reference. We balance the retain in stopListening.
+  private var refconRetained: Unmanaged<HotKeyManager>?
+
   // Track fn key state
   private var fnKeyWasPressed = false
 
@@ -29,6 +35,10 @@ class HotKeyManager {
     // Use CGEventTap for fn key detection
     let eventMask: CGEventMask = (1 << CGEventType.flagsChanged.rawValue)
 
+    // Retain self for the duration of the tap. Released in stopListening.
+    let retained = Unmanaged.passRetained(self)
+    refconRetained = retained
+
     guard
       let tap = CGEvent.tapCreate(
         tap: .cgSessionEventTap,
@@ -37,15 +47,16 @@ class HotKeyManager {
         eventsOfInterest: eventMask,
         callback: { proxy, type, event, refcon in
           // The event is owned by the system; pass it back unretained.
-          // Using passRetained here added a retain/release pair per event
-          // (i.e. on every modifier-key change system-wide).
           guard let refcon = refcon else { return Unmanaged.passUnretained(event) }
           let manager = Unmanaged<HotKeyManager>.fromOpaque(refcon).takeUnretainedValue()
           return manager.handleEvent(proxy: proxy, type: type, event: event)
         },
-        userInfo: Unmanaged.passUnretained(self).toOpaque()
+        userInfo: retained.toOpaque()
       )
     else {
+      // Tap creation failed — release the retain we just took.
+      retained.release()
+      refconRetained = nil
       logger.error("Failed to create event tap. Accessibility permission may be required.")
       return
     }
@@ -64,6 +75,9 @@ class HotKeyManager {
   func stopListening() {
     if let tap = eventTap {
       CGEvent.tapEnable(tap: tap, enable: false)
+      // Invalidating the mach port stops further callbacks before we drop
+      // the run loop source.
+      CFMachPortInvalidate(tap)
     }
 
     if let source = runLoopSource {
@@ -75,6 +89,12 @@ class HotKeyManager {
     isRecording = false
     fnKeyWasPressed = false
 
+    // Balance the retain taken in startListening. Done last, after the tap
+    // is disabled and invalidated, because the callback only borrows self
+    // (takeUnretainedValue) and relies on this retain to keep it alive.
+    refconRetained?.release()
+    refconRetained = nil
+
     logger.info("Hotkey listener stopped")
   }
 

From 3430ab251c9ca598889e8e0c3f65ba6c2f081511 Mon Sep 17 00:00:00 2001
From: Utkarsh Patel <itismeutkarsh@gmail.com>
Date: Sat, 2 May 2026 11:32:08 +0530
Subject: [PATCH 15/17] Fix SwiftLint violations introduced by perf work
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- KeychainHelper: opening_brace — keep brace on the same line as the
  multi-line if-let condition
- WAVEncoder: redundant_void_return — drop explicit -> Void on the
  withUnsafeMutableBytes closure

AUDIOTYPE-1
---
 AudioType/Core/WAVEncoder.swift          | 2 +-
 AudioType/Utilities/KeychainHelper.swift | 3 +--
 2 files changed, 2 insertions(+), 3 deletions(-)

diff --git a/AudioType/Core/WAVEncoder.swift b/AudioType/Core/WAVEncoder.swift
index 970f775..4596ae8 100644
--- a/AudioType/Core/WAVEncoder.swift
+++ b/AudioType/Core/WAVEncoder.swift
@@ -202,7 +202,7 @@ enum WAVEncoder {
     let totalSize = 44 + samples.count * 2
 
     var data = Data(count: totalSize)
-    data.withUnsafeMutableBytes { (raw: UnsafeMutableRawBufferPointer) -> Void in
+    data.withUnsafeMutableBytes { (raw: UnsafeMutableRawBufferPointer) in
       guard let base = raw.baseAddress else { return }
 
       // --- Header ---------------------------------------------------------
diff --git a/AudioType/Utilities/KeychainHelper.swift b/AudioType/Utilities/KeychainHelper.swift
index fb41886..956e924 100644
--- a/AudioType/Utilities/KeychainHelper.swift
+++ b/AudioType/Utilities/KeychainHelper.swift
@@ -74,8 +74,7 @@ enum KeychainHelper {
     let value: String?
     if status == errSecSuccess,
       let data = result as? Data,
-      let decoded = String(data: data, encoding: .utf8)
-    {
+      let decoded = String(data: data, encoding: .utf8) {
       value = decoded
     } else {
       value = nil

From 89aa5dcf43466beeeba8b2f3b3daa3f8dac9552a Mon Sep 17 00:00:00 2001
From: Utkarsh Patel <itismeutkarsh@gmail.com>
Date: Sat, 2 May 2026 11:37:58 +0530
Subject: [PATCH 16/17] Mark AppDelegate @MainActor to silence Sendable warning

---
 AudioType/App/AudioTypeApp.swift | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/AudioType/App/AudioTypeApp.swift b/AudioType/App/AudioTypeApp.swift
index fd20ece..3054019 100644
--- a/AudioType/App/AudioTypeApp.swift
+++ b/AudioType/App/AudioTypeApp.swift
@@ -12,6 +12,7 @@ struct AudioTypeApp: App {
   }
 }
 
+@MainActor
 class AppDelegate: NSObject, NSApplicationDelegate {
   private var statusItem: NSStatusItem!
   private var menuBarController: MenuBarController!
@@ -49,8 +50,8 @@ class AppDelegate: NSObject, NSApplicationDelegate {
 
     // Show onboarding if permissions are missing or no engine is usable
     if !micPermission || !accessibilityPermission || !EngineResolver.anyEngineAvailable {
-      DispatchQueue.main.async {
-        self.showOnboarding()
+      await MainActor.run {
+        showOnboarding()
       }
     } else {
       // All set — start listening for hotkey

From 6252cc4d0695a76ae340b6feb5477832dcaaf15d Mon Sep 17 00:00:00 2001
From: Utkarsh Patel <itismeutkarsh@gmail.com>
Date: Sat, 2 May 2026 11:37:58 +0530
Subject: [PATCH 17/17] Stop auto-closing onboarding; let user click Get
 Started

---
 AudioType/UI/OnboardingView.swift | 13 ++++---------
 1 file changed, 4 insertions(+), 9 deletions(-)

diff --git a/AudioType/UI/OnboardingView.swift b/AudioType/UI/OnboardingView.swift
index 8f862fb..8155343 100644
--- a/AudioType/UI/OnboardingView.swift
+++ b/AudioType/UI/OnboardingView.swift
@@ -9,7 +9,6 @@ struct OnboardingView: View {
   @State private var anyCloudKeyConfigured = GroqEngine.isConfigured || OpenAIEngine.isConfigured
   @State private var apiKeyText = ""
   @State private var apiKeySaveError: String?
-  @State private var hasAutoCompleted = false
 
   let timer = Timer.publish(every: 0.5, on: .main, in: .common).autoconnect()
 
@@ -77,7 +76,7 @@ struct OnboardingView: View {
                   .font(.caption)
                   .foregroundColor(.secondary)
               }
-              Text("Cloud transcription — faster & more accurate")
+              Text("Cloud transcription - faster & more accurate")
                 .font(.caption)
                 .foregroundColor(.secondary)
             }
@@ -165,17 +164,13 @@ struct OnboardingView: View {
       checkPermissions()
     }
     .onReceive(timer) { _ in
-      // Continuously check permissions
+      // Continuously refresh permission state so the UI reflects changes made
+      // in System Settings. The user closes the window themselves via the
+      // "Get Started" button once everything is ready.
       microphoneGranted = AVCaptureDevice.authorizationStatus(for: .audio) == .authorized
       accessibilityGranted = Permissions.checkAccessibility()
       speechRecognitionGranted = Permissions.isSpeechRecognitionAuthorized
       anyCloudKeyConfigured = GroqEngine.isConfigured || OpenAIEngine.isConfigured
-
-      // Auto-complete when all required permissions are ready and at least one engine works
-      if canContinue && !hasAutoCompleted {
-        hasAutoCompleted = true
-        onComplete()
-      }
     }
   }