From a4294d3d6d87e5b2dc2644029b792055efd5c3ec Mon Sep 17 00:00:00 2001 From: Sam Gutentag <1404219+samgutentag@users.noreply.github.com> Date: Wed, 1 Jul 2026 16:11:52 -0700 Subject: [PATCH 1/2] fix: unify scroll anchor and debounce speaking detection in smooth modes Three related fixes for jumpy/lagging auto-scroll in classic and voice-activated (silence-paused) modes: - wordProgressAtCurrentOffset() computed the resume word from the viewport center, but smooth-mode scrolling anchors the active word near the bottom. Releasing a manual scroll therefore snapped the text down by roughly half the window height. Both paths now share a single readingAnchorY() helper. - The smooth-mode anchor sat 20pt above the bottom edge, giving the speaker zero lookahead: any word past the timer position was below the window. The anchor now sits at 70% of the viewport height so a couple of upcoming lines stay visible. - isSpeaking was a single 0.08 threshold over recent audio levels, so a voice hovering near it rapidly started/stopped the scroll timer. It now uses hysteresis (on above 0.08, off below 0.05) and resets when the audio tap is removed. Co-Authored-By: Claude Fable 5 --- Textream/Textream/MarqueeTextView.swift | 27 ++++++++++++------- Textream/Textream/SpeechRecognizer.swift | 33 +++++++++++++++++++----- 2 files changed, 44 insertions(+), 16 deletions(-) diff --git a/Textream/Textream/MarqueeTextView.swift b/Textream/Textream/MarqueeTextView.swift index 74e5afb..2de52c6 100644 --- a/Textream/Textream/MarqueeTextView.swift +++ b/Textream/Textream/MarqueeTextView.swift @@ -246,24 +246,33 @@ struct SpeechScrollView: View { ) } + /// Y position in the viewport where the active word is anchored. + /// Smooth modes (classic/silence-paused) anchor in the lower third so read + /// text stays visible above while the next lines remain visible below — + /// anchoring at the very bottom leaves the speaker no lookahead. + /// wordProgressAtCurrentOffset must use the same anchor, otherwise + /// releasing a manual scroll snaps the text by the difference. + private func readingAnchorY(containerHeight: CGFloat) -> CGFloat { + smoothScroll ? containerHeight * 0.7 : containerHeight * 0.5 + } + private func recalcCenter(containerHeight: CGFloat) { - let center = containerHeight * 0.5 + let anchor = readingAnchorY(containerHeight: containerHeight) if smoothScroll { - // Classic/silence-paused: anchor active word near the bottom, scrolling up - let bottomAnchor = containerHeight - 20 + // Classic/silence-paused: continuous word progress, interpolated let wordIdx = Int(smoothWordProgress) let fraction = smoothWordProgress - Double(wordIdx) let clampedIdx = max(0, min(wordIdx, words.count - 1)) guard let wordY = wordYPositions[clampedIdx] else { return } let nextY = wordYPositions[clampedIdx + 1] ?? wordY let interpolatedY = wordY + (nextY - wordY) * CGFloat(fraction) - scrollOffset = bottomAnchor - interpolatedY + scrollOffset = anchor - interpolatedY } else { - // Word-tracking/voice-activated: active word at vertical center + // Word tracking: active word at vertical center let wordIdx = activeWordIndex() if let wordY = wordYPositions[wordIdx] { - let target = center - wordY + let target = anchor - wordY // Only update if it actually changed to avoid redundant animations if abs(scrollOffset - target) > 1 { scrollOffset = target @@ -274,9 +283,9 @@ struct SpeechScrollView: View { /// Find the word progress at the current visual position (scrollOffset + manualOffset) private func wordProgressAtCurrentOffset() -> Double { - let center = containerHeight * 0.5 - // The Y position currently at the center of the view - let targetY = center - (scrollOffset + manualOffset) + let anchor = readingAnchorY(containerHeight: containerHeight) + // The Y position currently at the reading anchor line + let targetY = anchor - (scrollOffset + manualOffset) // Find the closest word and interpolate let sorted = wordYPositions.sorted { $0.key < $1.key } diff --git a/Textream/Textream/SpeechRecognizer.swift b/Textream/Textream/SpeechRecognizer.swift index d1151c1..84192ff 100644 --- a/Textream/Textream/SpeechRecognizer.swift +++ b/Textream/Textream/SpeechRecognizer.swift @@ -80,12 +80,26 @@ class SpeechRecognizer { var shouldDismiss: Bool = false var shouldAdvancePage: Bool = false - /// True when recent audio levels indicate the user is actively speaking - var isSpeaking: Bool { + /// True when recent audio levels indicate the user is actively speaking. + /// Uses hysteresis: a level hovering around a single threshold would + /// rapidly toggle this flag and stutter the silence-paused scroll timer. + private(set) var isSpeaking: Bool = false + + private static let speakingOnLevel: CGFloat = 0.08 + private static let speakingOffLevel: CGFloat = 0.05 + + private func updateSpeakingState() { let recent = audioLevels.suffix(10) - guard !recent.isEmpty else { return false } + guard !recent.isEmpty else { + isSpeaking = false + return + } let avg = recent.reduce(0, +) / CGFloat(recent.count) - return avg > 0.08 + if isSpeaking { + if avg < Self.speakingOffLevel { isSpeaking = false } + } else if avg > Self.speakingOnLevel { + isSpeaking = true + } } private var speechRecognizer: SFSpeechRecognizer? @@ -243,6 +257,9 @@ class SpeechRecognizer { audioEngine.stop() } audioEngine.inputNode.removeTap(onBus: 0) + // The tap no longer feeds audioLevels, so the speaking state would + // otherwise freeze at its last value. + isSpeaking = false } private func cleanupRecognition() { @@ -371,10 +388,12 @@ class SpeechRecognizer { let level = CGFloat(min(rms * 5, 1.0)) DispatchQueue.main.async { - self?.audioLevels.append(level) - if (self?.audioLevels.count ?? 0) > 30 { - self?.audioLevels.removeFirst() + guard let self else { return } + self.audioLevels.append(level) + if self.audioLevels.count > 30 { + self.audioLevels.removeFirst() } + self.updateSpeakingState() } } From 2fa5959233ffb8e7e8c17772a7b77a196b24006a Mon Sep 17 00:00:00 2001 From: Sam Gutentag <1404219+samgutentag@users.noreply.github.com> Date: Wed, 1 Jul 2026 22:02:49 -0700 Subject: [PATCH 2/2] fix: prefer word-level matcher when strategies diverge in word tracking When the char-level and word-level matchers disagreed by more than the tolerance, matchCharacters took min() of the two. The char matcher's resync can only bridge 3 characters, so a single word-level STT substitution (e.g. "sits" transcribed as "says") wedges it permanently. From then on min() vetoed the word matcher's correct position forever: the transcription bar kept updating while the highlight froze. Log-verified against a live read: the word matcher tracked the speaker exactly (word=187 at "following your voice") while the char matcher was stuck at char=89 ("MacBook's"), which is where the highlight sat. Disagreements now resolve to the word-level result. Its forward movement requires consecutive fuzzy word matches, and the existing 2-of-3 agreement gate still filters transient false jumps, so the runaway-jump failure mode that min() guarded against remains covered. Co-Authored-By: Claude Fable 5 --- Textream/Textream/SpeechRecognizer.swift | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/Textream/Textream/SpeechRecognizer.swift b/Textream/Textream/SpeechRecognizer.swift index 84192ff..043f71f 100644 --- a/Textream/Textream/SpeechRecognizer.swift +++ b/Textream/Textream/SpeechRecognizer.swift @@ -601,16 +601,20 @@ class SpeechRecognizer { // Strategy 2: word-level match (handles STT word substitutions) let wordResult = wordLevelMatch(spoken: spoken) - // Use agreement-based selection instead of blind max(). // If both strategies agree within a tolerance, use the average. - // If they disagree wildly, use the more conservative (lower) result - // to avoid false-positive jumps. + // If they disagree wildly, trust the word-level matcher: the char + // matcher's 3-char resync cannot bridge word-level substitutions + // ("sits" transcribed as "says"), after which it wedges permanently + // and taking min() would veto the word matcher forever, freezing the + // highlight. Word-level movement requires consecutive fuzzy word + // matches, and the 2-of-3 agreement gate below still filters + // transient false jumps. let best: Int let tolerance = 20 // characters if abs(charResult - wordResult) <= tolerance { best = (charResult + wordResult) / 2 } else { - best = min(charResult, wordResult) + best = wordResult } let newCount = matchStartOffset + best