dmarzzz · RonTuretzky · Jun 14, 2026 · Jun 4, 2026 · Jun 4, 2026 · Jun 4, 2026
diff --git a/.gitignore b/.gitignore
@@ -41,3 +41,6 @@ result
 
 # Claude Code local session state
 .claude/
+
+# Generated by scripts/stage-mobile.sh (the desktop GUI staged for the mobile bundle)
+mobile-pair/app/
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -7,6 +7,21 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [Unreleased]
 
+### Added
+
+- Optional cross-platform CPU **streaming ASR** backend (sherpa-onnx) behind the
+  `[streaming]` extra, with model keys `sherpa-stream-en` (zipformer-20M, ultra-fast) and
+  `sherpa-nemotron-en` (NeMo FastConformer-RNNT 0.6B, accurate). Fully additive — nothing
+  changes when the extra is absent. See `docs/streaming-asr.md` and
+  `docs/streaming-asr-benchmark.md`.
+- **Unified Android app**: the phone now runs the *same* web GUI as the desktop, with a native
+  on-device engine instead of the Python backend. A `LocalBackend` (`gui/static/backend-local.js`)
+  drives the `tauri-plugin-voxasr` plugin, which records then transcribes the clip at stop with
+  **offline Whisper** (`whisper-base.en` default, `VOXASR_MODEL=whisper-small.en` for accuracy) —
+  full punctuation, fully offline (the APK strips the `INTERNET` permission; `RECORD_AUDIO` only).
+  Python-only features (diarization, AI summarize, system audio) are hidden on-device. See
+  `tauri-plugin-voxasr/README.md`.
+
 ## [0.3.0] - 2026-06-03
 
 ### Added

diff --git a/README.md b/README.md
@@ -39,7 +39,7 @@ git clone https://github.com/dmarzzz/VoxTerm.git
 cd voxterm
 python3 -m venv .venv
 source .venv/bin/activate
-pip install -r requirements.txt
+pip install -e .                 # add ".[streaming]" for the optional streaming ASR backend
 python3 -m tui.app
 ```
 
@@ -129,6 +129,24 @@ Press `P` to manage your speaker profile library (rename, delete, wipe all data)
 
 Models download automatically on first use.
 
+### Optional: streaming ASR (cross-platform, CPU)
+
+`pip install "voxterm[streaming]"` adds a sherpa-onnx **streaming** backend (word-by-word,
+CPU-only, Linux/macOS-arm64/Windows) with two model keys — `sherpa-stream-en` (zipformer-20M,
+ultra-fast) and `sherpa-nemotron-en` (NeMo 0.6B, accurate). Fully opt-in: without the extra,
+nothing changes. See [docs/streaming-asr.md](docs/streaming-asr.md) and the
+[benchmark](docs/streaming-asr-benchmark.md).
+
+### Android: the same GUI, on-device (offline)
+
+The Android app runs the **same web GUI as the desktop**, but with a native on-device engine instead
+of the Python backend: it records, then transcribes **entirely on the phone** with offline Whisper
+via [`tauri-plugin-voxasr/`](tauri-plugin-voxasr/) — no pairing, no relay, **no network** (the APK
+strips the `INTERNET` permission). Build it with `scripts/android-dev.sh --debug` (it fetches the
+bundled model on first run). `whisper-base.en` ships by default; set
+`VOXASR_MODEL=whisper-small.en` for higher accuracy. See
+[tauri-plugin-voxasr/README.md](tauri-plugin-voxasr/README.md).
+
 ## Project Structure
 
 ```

diff --git a/audio/capture.py b/audio/capture.py
@@ -1,5 +1,6 @@
 import math
 import queue
+import sys
 import numpy as np
 import sounddevice as sd
 from scipy.signal import resample_poly
@@ -81,9 +82,23 @@ def start(self):
         # Reset filter state to avoid a click on resume.
         if self._noise_filter is not None:
             self._noise_filter.reset()
-        dev_info = sd.query_devices(kind='input')
+        try:
+            dev_info = sd.query_devices(kind='input')
+        except Exception as e:
+            if sys.platform == "darwin":
+                raise RuntimeError(
+                    "No microphone available. On macOS, grant Microphone permission to this app "
+                    "in System Settings > Privacy & Security > Microphone, then retry."
+                ) from e
+            raise
         self._device_name = dev_info['name']
         native_channels = dev_info['max_input_channels']
+        if not native_channels and sys.platform == "darwin":
+            raise RuntimeError(
+                "The input device reports 0 channels — on macOS this usually means Microphone "
+                "permission hasn't been granted. Allow it in System Settings > Privacy & "
+                "Security > Microphone, then retry."
+            )
         self._native_rate = int(dev_info['default_samplerate'])
         # Compute resample ratio as integer up/down factors
         if self._native_rate != SAMPLE_RATE:

diff --git a/audio/mix.py b/audio/mix.py
@@ -0,0 +1,24 @@
+"""Time-aligned mixing of two equal-rate mono float32 chunk streams (mic + system audio).
+
+The single home for this operation — both the TUI (`tui/app.py`) and the GUI engine
+(`gui/engine.py`) drive recording from a mic stream plus an optional system-audio stream.
+(Distinct from `audio/merger.py`, which is the P2P energy-weighted multi-peer mixer.)
+"""
+
+from __future__ import annotations
+
+import numpy as np
+
+
+def mix_chunks(mic: list, sysaud: list) -> list:
+    """Mix two chunk lists position by position, keeping the surplus of the longer one.
+
+    Chunks that share an index are added and the sum is limited to [-1, 1]. Every chunk
+    past the end of the shorter list is carried over unmodified, so nothing is lost when
+    one stream has delivered more chunks than the other.
+    """
+    out = [np.clip(a + b, -1.0, 1.0) for a, b in zip(mic, sysaud)]
+    paired = len(out)
+    out.extend(mic[paired:])
+    out.extend(sysaud[paired:])
+    return out
diff --git a/audio/transcriber.py b/audio/transcriber.py
@@ -382,11 +382,39 @@ def __init__(self, model: str = "small", language: str | None = "en"):
         self._init_dedup()
 
     def load(self):
-        """Pre-load the model (downloads on first run)."""
+        """Pre-load the model (downloads on first run) and warm the decoder.
+
+        Tuning (env-overridable via VOXTERM_FW_DEVICE / _COMPUTE / _BEAM / _CPU_THREADS; measured on
+        CPU): an explicit compute_type (CT2 'auto' already resolves to int8 on CPU; naming it makes
+        float16 on a CUDA box a stated choice, not a silent one), an explicit cpu_threads count
+        (CT2's default oversubscribes hybrid P+E CPUs), and greedy decoding (beam_size=1) on CPU —
+        ~1.3-2x faster than beam 5 with no measurable accuracy loss on the short pre-VAD'd clips
+        this feeds. A final dummy decode JITs CT2's kernels so the first real transcription isn't cold.
+        """
+        import os
         from faster_whisper import WhisperModel
-        self._model = WhisperModel(
-            self.model_size, device="auto", compute_type="auto",
-        )
+        try:
+            import torch
+            cuda = torch.cuda.is_available()
+        except Exception:  # torch missing/unusable -> cpu unless VOXTERM_FW_DEVICE says otherwise
+            cuda = False
+        device = os.environ.get("VOXTERM_FW_DEVICE") or ("cuda" if cuda else "cpu")
+        compute = os.environ.get("VOXTERM_FW_COMPUTE") or ("float16" if device == "cuda" else "int8")
+        self._beam = int(os.environ.get("VOXTERM_FW_BEAM") or (5 if device == "cuda" else 1))
+        kw = {"device": device, "compute_type": compute}
+        if device == "cpu":
+            try:
+                default_threads = max(1, min(6, (os.cpu_count() or 4) // 2))
+            except Exception:  # defensive only: fall back to a fixed 4 threads
+                default_threads = 4
+            kw["cpu_threads"] = int(os.environ.get("VOXTERM_FW_CPU_THREADS") or default_threads)
+        self._model = WhisperModel(self.model_size, **kw)
+        try:                                   # warm the decoder off the user's hot path
+            warm = np.zeros(16000, dtype=np.float32)
+            for _ in self._model.transcribe(warm, language=self._language, beam_size=1, vad_filter=False)[0]:
+                pass
+        except Exception:  # warm-up is best-effort; a failure here must not fail load()
+            pass
         self._loaded = True
 
     def transcribe(self, audio: np.ndarray, **kwargs) -> dict:
@@ -397,7 +425,7 @@ def transcribe(self, audio: np.ndarray, **kwargs) -> dict:
         segments, _info = self._model.transcribe(
             audio,
             language=self._language,
-            beam_size=5,
+            beam_size=getattr(self, "_beam", 1),
             vad_filter=False,  # we already run Silero VAD upstream
         )
         text = " ".join(seg.text.strip() for seg in segments).strip()
@@ -415,6 +443,152 @@ def is_loaded(self) -> bool:
         return self._loaded
 
 
+# --- optional cross-platform streaming backend (sherpa-onnx) -----------------
+
+_SHERPA_RELEASE = "https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models"
+# repo dir name -> download tarball. All are transducer models (encoder/decoder/joiner/tokens).
+_SHERPA_MODEL_URLS = {
+    "sherpa-onnx-streaming-zipformer-en-20M-2023-02-17":
+        f"{_SHERPA_RELEASE}/sherpa-onnx-streaming-zipformer-en-20M-2023-02-17.tar.bz2",
+    "sherpa-onnx-nemotron-speech-streaming-en-0.6b-560ms-int8-2026-04-25":
+        f"{_SHERPA_RELEASE}/sherpa-onnx-nemotron-speech-streaming-en-0.6b-560ms-int8-2026-04-25.tar.bz2",
+}
+
+
+def _model_complete(d: "Path") -> bool:
+    """Accept `d` only if it is a dir with encoder, decoder and joiner .onnx files plus tokens.txt."""
+    parts = ("*encoder*.onnx", "*decoder*.onnx", "*joiner*.onnx")  # a half-extracted dir lacks some
+    return d.is_dir() and all(any(d.glob(p)) for p in parts) and (d / "tokens.txt").exists()
+
+
+def _ensure_sherpa_model(repo: str) -> "Path":
+    """Return the local dir holding `repo`'s transducer ONNX files, downloading + extracting the
+    published tarball on first use (cache: ~/.cache/voxterm/sherpa/<repo>/). Download and extraction
+    are staged, then renamed; an incomplete dir or a tarball that won't extract is discarded."""
+    from pathlib import Path
+    import shutil
+    import tarfile
+    import urllib.request
+    cache = Path.home() / ".cache" / "voxterm" / "sherpa"
+    target = cache / repo
+    if _model_complete(target):
+        return target
+    shutil.rmtree(target, ignore_errors=True)               # wipe any partial/corrupt extraction
+    cache.mkdir(parents=True, exist_ok=True)
+    tarball = cache / (repo + ".tar.bz2")
+    if not tarball.exists():
+        tmp = tarball.with_suffix(".part")
+        try:
+            urllib.request.urlretrieve(_SHERPA_MODEL_URLS[repo], tmp)  # noqa: S310 (pinned github release URL)
+        except Exception:
+            tmp.unlink(missing_ok=True)                     # don't leak a partial download
+            raise
+        tmp.rename(tarball)
+    staging = cache / (repo + ".extracting")                # sibling dir: target only ever appears whole
+    shutil.rmtree(staging, ignore_errors=True)
+    try:
+        with tarfile.open(tarball, "r:bz2") as tf:
+            tf.extractall(staging, filter="data")           # produces staging/<repo>/ (safe filter)
+        if not _model_complete(staging / repo):
+            raise RuntimeError(f"sherpa model tarball for {repo} is incomplete after extraction")
+        (staging / repo).rename(target)
+    except (tarfile.TarError, OSError, EOFError, RuntimeError):
+        tarball.unlink(missing_ok=True)                     # unusable archive: re-download next call
+        raise
+    finally:
+        shutil.rmtree(staging, ignore_errors=True)          # never leave staging behind, pass or fail
+    return target
+
+
+# Public alias so external callers (the GUI live loop) don't import the private name.
+is_hallucination = _is_hallucination
+
+
+class SherpaStreamingTranscriber(_DeduplicatorMixin):
+    """Cross-platform CPU streaming ASR via sherpa-onnx (k2-fsa). Optional backend — only
+    reachable when sherpa-onnx is installed (config gates the model key). Per-call
+    create_stream makes it a drop-in for the existing chunked callers; the live loop can also
+    drive it as a true streaming recognizer."""
+
+    def __init__(self, model: str = "sherpa-onnx-streaming-zipformer-en-20M-2023-02-17",
+                 language: str | None = "en"):
+        self.model_id = model               # release dir name; load() resolves it to local files
+        self._language = language
+        self._rec = None                    # sherpa OnlineRecognizer, created by load()
+        self._loaded = False
+        self._init_dedup()
+
+    # Public surface for the GUI live loop, so it never reaches into underscore-privates.
+    @property
+    def recognizer(self):
+        """The underlying sherpa OnlineRecognizer (None until load() has run)."""
+        return self._rec
+
+    def reset_dedup(self):
+        """Clear consecutive-duplicate state — call when (re)starting a live stream."""
+        self._init_dedup()
+
+    def is_duplicate(self, text: str) -> bool:
+        """True if `text` repeats the immediately-preceding finalized text."""
+        return self._is_duplicate(text)
+
+    def load(self):
+        """Build the recognizer from the cached (or freshly downloaded) model; int8 files preferred."""
+        try:
+            import sherpa_onnx
+        except ImportError as e:
+            raise RuntimeError(
+                'sherpa-onnx is not installed; install it to use streaming models '
+                '(pip install "voxterm[streaming]" or pip install sherpa-onnx).'
+            ) from e
+        d = _ensure_sherpa_model(self.model_id)
+
+        def _pick(*globs):
+            for g in globs:
+                hits = sorted(d.glob(g))
+                if hits:
+                    return str(hits[0])
+            raise RuntimeError(
+                f"sherpa model dir {d} is missing a required file (looked for {globs!r}); "
+                "delete it and re-run to re-download."
+            )
+
+        enc = _pick("*encoder*.int8.onnx", "*encoder*.onnx")
+        dec = _pick("*decoder*.int8.onnx", "*decoder*.onnx")
+        joi = _pick("*joiner*.int8.onnx", "*joiner*.onnx")
+        tokens = _pick("tokens.txt")
+        self._rec = sherpa_onnx.OnlineRecognizer.from_transducer(
+            tokens=tokens, encoder=enc, decoder=dec, joiner=joi, num_threads=2, provider="cpu",
+            enable_endpoint_detection=True, rule1_min_trailing_silence=2.4,
+            rule2_min_trailing_silence=1.2, rule3_min_utterance_length=20.0,
+        )
+        self._loaded = True
+
+    def transcribe(self, audio: np.ndarray, **kwargs) -> dict:
+        """One-shot decode of a whole clip; empty text for empty/quiet audio, hallucinations, repeats."""
+        rms = float(np.sqrt(np.mean(audio ** 2))) if audio.size else 0.0  # empty clip = silence, not NaN
+        if rms < 0.005:
+            return {"text": "", "speaker": "", "speaker_id": 0}
+        s = self._rec.create_stream()
+        s.accept_waveform(16000, np.ascontiguousarray(audio, dtype=np.float32))
+        while self._rec.is_ready(s):
+            self._rec.decode_stream(s)
+        s.input_finished()                  # signal end of audio, then drain whatever is still ready
+        while self._rec.is_ready(s):
+            self._rec.decode_stream(s)
+        text = (self._rec.get_result(s) or "").strip()
+        if text and text.isupper():
+            # sentence-case ALL-CAPS output (zipformer); natively cased output (nemotron) stays as-is
+            text = text.capitalize()
+        if not text or _is_hallucination(text, self._language) or self._is_duplicate(text):
+            return {"text": "", "speaker": "", "speaker_id": 0}
+        return {"text": text, "speaker": "", "speaker_id": 0}
+
+    @property
+    def is_loaded(self) -> bool:        # @property to match every other backend's contract
+        return self._loaded
+
+
 def get_transcriber(model_name: str, *, language: str | None = "en"):
     """Construct the transcriber backend for a model key.
 
@@ -429,13 +603,16 @@ def get_transcriber(model_name: str, *, language: str | None = "en"):
         FASTER_WHISPER_MODELS,
         PARAKEET_MODELS,
         QWEN3_MODELS,
+        SHERPA_MODELS,
     )
 
     model_repo = AVAILABLE_MODELS[model_name]
     if model_name in QWEN3_MODELS:
         return Qwen3Transcriber(model=model_repo, language=language)
     if model_name in PARAKEET_MODELS:
         return ParakeetTranscriber(model=model_repo, language=language)
+    if model_name in SHERPA_MODELS:
+        return SherpaStreamingTranscriber(model=model_repo, language=language)
     if model_name in FASTER_WHISPER_MODELS:
         return FasterWhisperTranscriber(model=model_repo, language=language)
     return WhisperTranscriber(model=model_repo)
diff --git a/config.py b/config.py
@@ -2,6 +2,7 @@
 
 VERSION = "0.3.0"
 
+import importlib.util
 import sys
 import platform
 
@@ -90,6 +91,21 @@
 else:
     raise RuntimeError(f"Unsupported platform: {sys.platform}")
 
+# Optional cross-platform streaming backend (sherpa-onnx). Surfaced ONLY when the package is
+# installed AND a wheel exists for this platform (there is no Intel-macOS wheel). 100% additive:
+# if absent, SHERPA_MODELS stays empty, AVAILABLE_MODELS/DEFAULT_MODEL are byte-for-byte unchanged,
+# and the transcriber's sherpa dispatch branch is unreachable. sherpa statically links its own
+# ONNX Runtime, so it cannot collide with VoxTerm's pinned onnxruntime (Silero VAD / 3D-Speaker).
+_HAS_SHERPA = (
+    importlib.util.find_spec("sherpa_onnx") is not None
+    and not (sys.platform == "darwin" and platform.machine() != "arm64")
+)
+if _HAS_SHERPA:
+    AVAILABLE_MODELS["sherpa-stream-en"] = "sherpa-onnx-streaming-zipformer-en-20M-2023-02-17"
+    # nemotron-EN streaming (NeMo FastConformer-RNNT, 0.6B, exported for sherpa-onnx)
+    AVAILABLE_MODELS["sherpa-nemotron-en"] = "sherpa-onnx-nemotron-speech-streaming-en-0.6b-560ms-int8-2026-04-25"
+SHERPA_MODELS = {"sherpa-stream-en", "sherpa-nemotron-en"} if _HAS_SHERPA else set()
+
 # Language forcing for Qwen3-ASR (None = auto-detect)
 DEFAULT_LANGUAGE = "en"
 AVAILABLE_LANGUAGES = {

diff --git a/dev/requirements-dev.txt b/dev/requirements-dev.txt
@@ -1,2 +1,4 @@
 pytest>=8.0
 pytest-timeout>=2.0
+websocket-client>=1.0  # scripts/gui_e2e.py (CDP)
+soundfile>=0.12  # scripts/bench_asr.py (WAV loading)