From a4294d3d6d87e5b2dc2644029b792055efd5c3ec Mon Sep 17 00:00:00 2001
From: Sam Gutentag <1404219+samgutentag@users.noreply.github.com>
Date: Wed, 1 Jul 2026 16:11:52 -0700
Subject: [PATCH 1/2] fix: unify scroll anchor and debounce speaking detection
 in smooth modes

Three related fixes for jumpy/lagging auto-scroll in classic and
voice-activated (silence-paused) modes:

- wordProgressAtCurrentOffset() computed the resume word from the
  viewport center, but smooth-mode scrolling anchors the active word
  near the bottom. Releasing a manual scroll therefore snapped the text
  down by roughly half the window height. Both paths now share a single
  readingAnchorY() helper.

- The smooth-mode anchor sat 20pt above the bottom edge, giving the
  speaker zero lookahead: any word past the timer position was below
  the window. The anchor now sits at 70% of the viewport height so a
  couple of upcoming lines stay visible.

- isSpeaking was a single 0.08 threshold over recent audio levels, so a
  voice hovering near it rapidly started/stopped the scroll timer. It
  now uses hysteresis (on above 0.08, off below 0.05) and resets when
  the audio tap is removed.

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
---
 Textream/Textream/MarqueeTextView.swift  | 27 ++++++++++++-------
 Textream/Textream/SpeechRecognizer.swift | 33 +++++++++++++++++++-----
 2 files changed, 44 insertions(+), 16 deletions(-)

diff --git a/Textream/Textream/MarqueeTextView.swift b/Textream/Textream/MarqueeTextView.swift
index 74e5afb..2de52c6 100644
--- a/Textream/Textream/MarqueeTextView.swift
+++ b/Textream/Textream/MarqueeTextView.swift
@@ -246,24 +246,33 @@ struct SpeechScrollView: View {
         )
     }
 
+    /// Y position in the viewport where the active word is anchored:
+    /// 70% of the height in smooth modes (classic/silence-paused), 50%
+    /// (vertical center) otherwise. At 70% read text stays visible above and
+    /// upcoming lines below; the very bottom would leave no lookahead.
+    /// wordProgressAtCurrentOffset must use the same anchor, otherwise
+    /// releasing a manual scroll snaps the text by the difference.
+    private func readingAnchorY(containerHeight: CGFloat) -> CGFloat {
+        smoothScroll ? containerHeight * 0.7 : containerHeight * 0.5
+    }
+
     private func recalcCenter(containerHeight: CGFloat) {
-        let center = containerHeight * 0.5
+        let anchor = readingAnchorY(containerHeight: containerHeight)
 
         if smoothScroll {
-            // Classic/silence-paused: anchor active word near the bottom, scrolling up
-            let bottomAnchor = containerHeight - 20
+            // Classic/silence-paused: continuous word progress, interpolated
             let wordIdx = Int(smoothWordProgress)
             let fraction = smoothWordProgress - Double(wordIdx)
             let clampedIdx = max(0, min(wordIdx, words.count - 1))
             guard let wordY = wordYPositions[clampedIdx] else { return }
             let nextY = wordYPositions[clampedIdx + 1] ?? wordY
             let interpolatedY = wordY + (nextY - wordY) * CGFloat(fraction)
-            scrollOffset = bottomAnchor - interpolatedY
+            scrollOffset = anchor - interpolatedY
         } else {
-            // Word-tracking/voice-activated: active word at vertical center
+            // Word tracking: active word at vertical center
             let wordIdx = activeWordIndex()
             if let wordY = wordYPositions[wordIdx] {
-                let target = center - wordY
+                let target = anchor - wordY
                 // Only update if it actually changed to avoid redundant animations
                 if abs(scrollOffset - target) > 1 {
                     scrollOffset = target
@@ -274,9 +283,9 @@ struct SpeechScrollView: View {
 
     /// Find the word progress at the current visual position (scrollOffset + manualOffset)
     private func wordProgressAtCurrentOffset() -> Double {
-        let center = containerHeight * 0.5
-        // The Y position currently at the center of the view
-        let targetY = center - (scrollOffset + manualOffset)
+        let anchor = readingAnchorY(containerHeight: containerHeight)
+        // The Y position currently at the reading anchor line
+        let targetY = anchor - (scrollOffset + manualOffset)
 
         // Find the closest word and interpolate
         let sorted = wordYPositions.sorted { $0.key < $1.key }
diff --git a/Textream/Textream/SpeechRecognizer.swift b/Textream/Textream/SpeechRecognizer.swift
index d1151c1..84192ff 100644
--- a/Textream/Textream/SpeechRecognizer.swift
+++ b/Textream/Textream/SpeechRecognizer.swift
@@ -80,12 +80,26 @@ class SpeechRecognizer {
     var shouldDismiss: Bool = false
     var shouldAdvancePage: Bool = false
 
-    /// True when recent audio levels indicate the user is actively speaking
-    var isSpeaking: Bool {
+    /// True when recent audio levels indicate the user is actively speaking.
+    /// Uses hysteresis: a level hovering around a single threshold would
+    /// rapidly toggle this flag and stutter the silence-paused scroll timer.
+    private(set) var isSpeaking: Bool = false
+
+    private static let speakingOnLevel: CGFloat = 0.08
+    private static let speakingOffLevel: CGFloat = 0.05
+
+    private func updateSpeakingState() {
         let recent = audioLevels.suffix(10)
-        guard !recent.isEmpty else { return false }
+        guard !recent.isEmpty else {
+            isSpeaking = false
+            return
+        }
         let avg = recent.reduce(0, +) / CGFloat(recent.count)
-        return avg > 0.08
+        if isSpeaking {
+            if avg < Self.speakingOffLevel { isSpeaking = false }
+        } else if avg > Self.speakingOnLevel {
+            isSpeaking = true
+        }
     }
 
     private var speechRecognizer: SFSpeechRecognizer?
@@ -243,6 +257,9 @@ class SpeechRecognizer {
             audioEngine.stop()
         }
         audioEngine.inputNode.removeTap(onBus: 0)
+        // The tap no longer feeds audioLevels, so the speaking state would
+        // otherwise freeze at its last value.
+        isSpeaking = false
     }
 
     private func cleanupRecognition() {
@@ -371,10 +388,12 @@ class SpeechRecognizer {
             let level = CGFloat(min(rms * 5, 1.0))
 
             DispatchQueue.main.async {
-                self?.audioLevels.append(level)
-                if (self?.audioLevels.count ?? 0) > 30 {
-                    self?.audioLevels.removeFirst()
+                guard let self else { return }
+                self.audioLevels.append(level)
+                if self.audioLevels.count > 30 {
+                    self.audioLevels.removeFirst()
                 }
+                self.updateSpeakingState()
             }
         }
 

From 2fa5959233ffb8e7e8c17772a7b77a196b24006a Mon Sep 17 00:00:00 2001
From: Sam Gutentag <1404219+samgutentag@users.noreply.github.com>
Date: Wed, 1 Jul 2026 22:02:49 -0700
Subject: [PATCH 2/2] fix: prefer word-level matcher when strategies diverge in
 word tracking

When the char-level and word-level matchers disagreed by more than the
tolerance, matchCharacters took min() of the two. The char matcher's
resync can only bridge 3 characters, so a single word-level STT
substitution (e.g. "sits" transcribed as "says") wedges it permanently.
From then on min() vetoed the word matcher's correct position forever:
the transcription bar kept updating while the highlight froze.

Log-verified against a live read: the word matcher tracked the speaker
exactly (word=187 at "following your voice") while the char matcher was
stuck at char=89 ("MacBook's"), which is where the highlight sat.

Disagreements now resolve to the word-level result. Its forward
movement requires consecutive fuzzy word matches, and the existing
2-of-3 agreement gate still filters transient false jumps, so the
runaway-jump failure mode that min() guarded against remains covered.

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
---
 Textream/Textream/SpeechRecognizer.swift | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/Textream/Textream/SpeechRecognizer.swift b/Textream/Textream/SpeechRecognizer.swift
index 84192ff..043f71f 100644
--- a/Textream/Textream/SpeechRecognizer.swift
+++ b/Textream/Textream/SpeechRecognizer.swift
@@ -601,16 +601,20 @@ class SpeechRecognizer {
         // Strategy 2: word-level match (handles STT word substitutions)
         let wordResult = wordLevelMatch(spoken: spoken)
 
-        // Use agreement-based selection instead of blind max().
         // If both strategies agree within a tolerance, use the average.
-        // If they disagree wildly, use the more conservative (lower) result
-        // to avoid false-positive jumps.
+        // If they disagree wildly, trust the word-level matcher: the char
+        // matcher's 3-char resync cannot bridge word-level substitutions
+        // ("sits" transcribed as "says"), after which it wedges permanently
+        // and taking min() would veto the word matcher forever, freezing the
+        // highlight. Word-level movement requires consecutive fuzzy word
+        // matches, and the 2-of-3 agreement gate below still filters
+        // transient false jumps.
         let best: Int
         let tolerance = 20 // characters
         if abs(charResult - wordResult) <= tolerance {
             best = (charResult + wordResult) / 2
         } else {
-            best = min(charResult, wordResult)
+            best = wordResult
         }
 
         let newCount = matchStartOffset + best