From f631e1ceb4dc485bc41ab37f7416294ea41b7f55 Mon Sep 17 00:00:00 2001 From: NubsCarson <192162056+NubsCarson@users.noreply.github.com> Date: Thu, 4 Jun 2026 06:26:07 +0000 Subject: [PATCH 01/60] =?UTF-8?q?feat(gui):=20web=20control=20app=20for=20?= =?UTF-8?q?VoxTerm=20(record=20=E2=86=92=20transcribe=20=E2=86=92=20AI=20e?= =?UTF-8?q?xport=20=E2=86=92=20review)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A clean, responsive web GUI that fully drives VoxTerm's engine from the browser (desktop + phone over LAN), with a Python control backend — no reinvention of the transcription/diarization logic, it reuses VoxTerm's own AudioCapture + transcriber + Silero VAD + diarizer + EventLogger. gui/server.py stdlib http.server + SSE status stream + JSON API (loopback by default; VOXTERM_GUI_LAN=1 to reach it from a phone). CSP, nosniff, bounded request bodies, static-dir traversal guard, capped SSE. gui/engine.py control layer: start/stop recording via AudioCapture, background transcribe+export job with progress, session history, artifact reads (path-traversal guarded). gui/transcribe.py importable transcription (WAV/buffer -> faithful events.jsonl + -transcript.md) reusing VoxTerm's engine; progress callback for the UI. gui/export.py the reviewed LLM-agent exporter (events.jsonl -> -agent.md + .json), ported self-contained into the fork (+ gui/test_export.py, 23 tests). gui/static/ polished UI (index.html/style.css/app.js): record hero w/ live level ring + timer, model/language pickers, SSE-driven transcript view, client-side speaker rename (flows into copy/export), session browser, Copy-for-AI / download .md / download .json. v1 = record → stop → transcribe (robust; reuses the tested pipeline). Verified so far without a mic: API + static serving + traversal guards + the full load/view/export flow against a real 53-turn session; export tests 23/23. 
Pending a recording-finalize (mic contention): the record-from-GUI path and a Tauri v2 native/mobile wrapper. Live word-streaming, party/P2P, hivemind = labeled fast-follows. --- gui/.gitignore | 2 + gui/__init__.py | 0 gui/engine.py | 209 +++++++++++++++++++++++ gui/export.py | 373 ++++++++++++++++++++++++++++++++++++++++++ gui/server.py | 160 ++++++++++++++++++ gui/static/app.js | 234 ++++++++++++++++++++++++++ gui/static/index.html | 89 ++++++++++ gui/static/style.css | 152 +++++++++++++++++ gui/test_export.py | 253 ++++++++++++++++++++++++++++ gui/transcribe.py | 176 ++++++++++++++++++++ 10 files changed, 1648 insertions(+) create mode 100644 gui/.gitignore create mode 100644 gui/__init__.py create mode 100644 gui/engine.py create mode 100644 gui/export.py create mode 100644 gui/server.py create mode 100644 gui/static/app.js create mode 100644 gui/static/index.html create mode 100644 gui/static/style.css create mode 100644 gui/test_export.py create mode 100644 gui/transcribe.py diff --git a/gui/.gitignore b/gui/.gitignore new file mode 100644 index 0000000..7a60b85 --- /dev/null +++ b/gui/.gitignore @@ -0,0 +1,2 @@ +__pycache__/ +*.pyc diff --git a/gui/__init__.py b/gui/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/gui/engine.py b/gui/engine.py new file mode 100644 index 0000000..9a87030 --- /dev/null +++ b/gui/engine.py @@ -0,0 +1,209 @@ +"""GUI control layer over VoxTerm's engine. + +Exposes the operations the GUI drives — start/stop recording (via VoxTerm's own +``AudioCapture``), the background transcribe+export job, and session history — as a +small thread-safe object the HTTP server calls. No transcription/diarization logic +lives here; recording reuses ``audio.capture.AudioCapture`` and the heavy lifting is +``gui.transcribe`` + ``gui.export`` (the reviewed, tested pipeline). + +v1 model: record -> stop -> transcribe (robust, reuses the tested pipeline). Live +word-by-word streaming is a planned fast-follow. 
+""" +from __future__ import annotations + +import sys +import threading +import time +import wave +from datetime import datetime +from pathlib import Path + +_ROOT = str(Path(__file__).resolve().parent.parent) +if _ROOT not in sys.path: + sys.path.insert(0, _ROOT) + +import numpy as np # noqa: E402 + +import config # noqa: E402 +from gui import transcribe, export # noqa: E402 + +OUT_DIR = Path.home() / "voxterm-live" +SR = config.SAMPLE_RATE + + +def _write_wav(path: Path, audio: np.ndarray) -> None: + pcm = np.clip(audio, -1.0, 1.0) + pcm = (pcm * 32767.0).astype(" list[str]: + # faster-whisper keys only (the CPU-usable set; qwen3 default is unusable on CPU) + return sorted(config.FASTER_WHISPER_MODELS) + + def languages(self) -> dict: + return dict(config.AVAILABLE_LANGUAGES) + + # ---- recording ---- + def start_recording(self) -> dict: + with self._lock: + if self.recording: + return {"ok": True, "already": True} + from audio.capture import AudioCapture + self._cap = AudioCapture() + self._cap.start() + self._chunks = [] + self._stop.clear() + self.recording = True + self.started_at = time.time() + self.level = 0.0 + self._poll_thread = threading.Thread(target=self._poll, daemon=True, name="gui-rec-poll") + self._poll_thread.start() + return {"ok": True} + + def _poll(self): + while not self._stop.is_set(): + try: + chunks = self._cap.drain() + except Exception: + chunks = [] + for c in chunks: + if c is not None and len(c): + self._chunks.append(np.asarray(c, dtype=np.float32)) + if chunks: + last = np.asarray(chunks[-1], dtype=np.float32) + if len(last): + self.level = float(np.sqrt(np.mean(np.square(last)))) + time.sleep(0.066) # ~15 Hz + + def stop_recording(self, model: str = "fw-small", language: str = "en") -> dict: + with self._lock: + if not self.recording: + return {"ok": False, "error": "not recording"} + self._stop.set() + if self._poll_thread: + self._poll_thread.join(timeout=3) + try: + rest = self._cap.drain() + for c in rest: + if c is not None 
and len(c): + self._chunks.append(np.asarray(c, dtype=np.float32)) + self._cap.stop() + except Exception: + pass + self.recording = False + audio = np.concatenate(self._chunks).astype(np.float32) if self._chunks else np.zeros(0, dtype=np.float32) + self._chunks = [] + if len(audio) < SR // 2: # < 0.5s + self.job = {"state": "error", "error": "recording too short"} + return {"ok": False, "error": "recording too short"} + ts = datetime.now().strftime("%Y%m%d-%H%M%S") + wav = self.out_dir / f"{ts}-gui.wav" + _write_wav(wav, audio) + self.job = {"state": "transcribing", "frac": 0.0, "msg": "starting", "wav": str(wav)} + threading.Thread(target=self._do_transcribe, args=(audio, model, language, str(wav)), + daemon=True, name="gui-transcribe").start() + return {"ok": True, "wav": str(wav), "seconds": round(len(audio) / SR, 1)} + + def _do_transcribe(self, audio, model, language, wav): + try: + def prog(frac, msg): + self.job = {"state": "transcribing", "frac": round(frac, 3), "msg": msg, "wav": wav} + r = transcribe.transcribe_audio(audio, self.out_dir, model=model, language=language, progress=prog) + md_path, json_path = export.export(Path(r["events_path"]), self.out_dir) + self.job = {"state": "done", "wav": wav, **r, + "agent_md": str(md_path), "agent_json": str(json_path), + "stem": Path(r["transcript_path"]).stem.replace("-transcript", "")} + except Exception as e: + self.job = {"state": "error", "error": f"{type(e).__name__}: {e}"} + + def transcribe_existing(self, wav_path: str, model: str = "fw-small", language: str = "en") -> dict: + """Transcribe an already-recorded WAV (e.g. 
a prior capture) in the background.""" + p = Path(wav_path) + if not p.exists(): + return {"ok": False, "error": "no such file"} + self.job = {"state": "transcribing", "frac": 0.0, "msg": "starting", "wav": str(p)} + threading.Thread(target=lambda: self._do_transcribe(transcribe.load_wav_16k_mono(p), model, language, str(p)), + daemon=True, name="gui-transcribe").start() + return {"ok": True} + + def status(self) -> dict: + return { + "recording": self.recording, + "level": round(self.level, 4), + "elapsed": round(time.time() - self.started_at, 1) if (self.recording and self.started_at) else 0, + "job": self.job, + } + + # ---- session history ---- + def _session_dirs(self) -> list[Path]: + dirs = [self.out_dir] + try: + dirs.append(Path(config.SESSIONS_DIR)) + dirs.append(Path(config.LIVE_DIR)) + except Exception: + pass + seen, uniq = set(), [] + for d in dirs: + if d and d not in seen and d.exists(): + seen.add(d) + uniq.append(d) + return uniq + + def sessions(self) -> list[dict]: + """All sessions across the known dirs, newest first, with which artifacts exist.""" + out = {} + for d in self._session_dirs(): + for f in d.glob("*-transcript.md"): + stem = f.stem[: -len("-transcript")] + out.setdefault((d, stem), {"stem": stem, "dir": str(d), "mtime": f.stat().st_mtime}) + out[(d, stem)]["transcript"] = f.name + for f in d.glob("*-agent.md"): + stem = f.stem[: -len("-agent")] + e = out.setdefault((d, stem), {"stem": stem, "dir": str(d), "mtime": f.stat().st_mtime}) + e["agent_md"] = f.name + e["mtime"] = max(e.get("mtime", 0), f.stat().st_mtime) + for f in d.glob("*-agent.json"): + stem = f.stem[: -len("-agent")] + e = out.setdefault((d, stem), {"stem": stem, "dir": str(d), "mtime": f.stat().st_mtime}) + e["agent_json"] = f.name + items = sorted(out.values(), key=lambda x: x.get("mtime", 0), reverse=True) + return items + + def _resolve(self, stem: str, suffix: str) -> Path | None: + # prevent traversal: stem must be a bare name + if "/" in stem or ".." 
in stem: + return None + for d in self._session_dirs(): + p = d / f"{stem}{suffix}" + if p.exists(): + return p + return None + + def read_artifact(self, stem: str, kind: str) -> dict: + suffix = {"transcript": "-transcript.md", "agent_md": "-agent.md", "agent_json": "-agent.json"}.get(kind) + if not suffix: + return {"ok": False, "error": "bad kind"} + p = self._resolve(stem, suffix) + if not p: + return {"ok": False, "error": "not found"} + return {"ok": True, "stem": stem, "kind": kind, "path": str(p), "text": p.read_text(encoding="utf-8")} diff --git a/gui/export.py b/gui/export.py new file mode 100644 index 0000000..5df423f --- /dev/null +++ b/gui/export.py @@ -0,0 +1,373 @@ +"""Export a VoxTerm event log to an LLM-agent-optimized transcript. + +A pure, replayable function of the ``*-events.jsonl`` stream (the same stream glass +tails) — no audio, no live state. Produces two files alongside the session: + + -agent.md human + LLM readable: YAML front-matter, an orientation line, + then one speaker-attributed, timestamped turn per paragraph. + -agent.json the typed, lossless companion the .md is rendered from. + +Run: + python -m gui.export [events.jsonl] [--out-dir DIR] + # with no path, exports the newest *-events.jsonl in VoxTerm's live dir. + +Design notes: +- ``confidence`` in the event stream is a TIER STRING ("", "high", "medium", "new"), + never a float — we keep it verbatim and derive a single ``confidence_uncertain`` + boolean, avoiding the "+confidence -> NaN" trap a numeric coercion would hit. +- Timestamps prefer a turn's ``audio_offset`` (true seconds into the recording, set + by the headless file transcriber) and fall back to wall-clock ``t - session_start`` + for live TUI sessions. +- Speaker labels from diarization are voice CLUSTERS, not verified identities; that + caveat is stated once in the front-matter ``notes`` rather than marked on every + turn. Per-turn markers flag only genuinely-more-uncertain turns. 
+""" +from __future__ import annotations + +import argparse +import json +import math +import re +import sys +from collections import Counter +from datetime import datetime +from pathlib import Path + +EXPORT_VERSION = 1 +DOC_KIND = "voxterm-transcript" + +# Marker render order (bare tokens; rendered "[token]" in Markdown). +_MARKER_ORDER = ["~", "new-voice", "overlap", "peer"] +_MARKER_LEGEND = { + "[~]": "low/medium-confidence or unattributed speaker — treat the label as uncertain", + "[new-voice]": "first appearance of an unrecognized speaker", + "[overlap]": "overlapping speech in this segment", + "[peer]": "turn arrived from a remote P2P peer", +} +_CONFIDENCE_LEGEND = {"recognized": "high", "suggested": "medium", "new_voice": "new", "asserted": ""} + + +def load_events(path: Path) -> list[dict]: + """Read a JSONL event log; skip blank/garbled lines rather than failing.""" + events = [] + with path.open("r", encoding="utf-8") as fh: + for line in fh: + line = line.strip() + if not line: + continue + try: + obj = json.loads(line) + except json.JSONDecodeError: + continue + if isinstance(obj, dict) and "kind" in obj: + events.append(obj) + return events + + +def _num(v, default=0.0): + """Tolerant numeric parse from an untrusted event log: returns a FINITE float + (or ``default``) — never raises, never returns NaN/Infinity. 
Garbled or + non-numeric ``t``/``audio_offset`` values degrade to the default instead of + crashing the whole export.""" + if isinstance(v, bool) or v is None: + return default + try: + f = float(v) + except (TypeError, ValueError): + return default + return f if math.isfinite(f) else default + + +def _coerce_sid(raw) -> int: + """Speaker ids are ints in real VoxTerm but strings ("S0") in the synth harness; + normalize either to an int without crashing.""" + if isinstance(raw, bool) or raw is None: + return 0 + if isinstance(raw, int): + return raw + if isinstance(raw, float): + return int(raw) + if isinstance(raw, str): + try: + return int(raw) + except ValueError: + m = re.search(r"\d+", raw) + return int(m.group()) if m else 0 + return 0 + + +def _fmt_hms(seconds: float) -> str: + s = int(round(seconds)) + h, rem = divmod(s, 3600) + m, sec = divmod(rem, 60) + return f"{h}:{m:02d}:{sec:02d}" if h else f"{m:02d}:{sec:02d}" + + +def _iso_local(unix_ts: float) -> str: + """Local, timezone-aware ISO 8601 (e.g. 
2026-06-04T09:14:07-07:00).""" + return datetime.fromtimestamp(unix_ts).astimezone().isoformat(timespec="seconds") + + +def build(events: list[dict], *, session_id: str, source_stream: str) -> dict: + """Reduce the event stream to the typed export object (the JSON sidecar shape).""" + starts = [e for e in events if e.get("kind") == "session" and e.get("phase") == "start"] + ends = [e for e in events if e.get("kind") == "session" and e.get("phase") == "end"] + text_evs = [e for e in events if e.get("kind") in ("text", "peer_text")] + + has_anchor = bool(starts or text_evs) + t0 = _num(starts[0].get("t")) if starts else (_num(text_evs[0].get("t")) if text_evs else 0.0) + incomplete = not ends + t_end = _num(ends[-1].get("t"), t0) if ends else (_num(events[-1].get("t"), t0) if events else t0) + model = starts[0].get("model", "") if starts else "" + language = (starts[0].get("language", "") if starts else "") or "auto" + party = any(e.get("kind") == "party" and e.get("on") for e in events) + + turns = [] + has_audio_time = any("audio_offset" in e for e in text_evs) + audio_end_max = 0.0 + for idx, e in enumerate(text_evs): + is_peer = e.get("kind") == "peer_text" + if "audio_offset" in e: + t_off = _num(e.get("audio_offset")) + audio_end_max = max(audio_end_max, _num(e.get("audio_end"), t_off)) + else: + t_off = max(0.0, _num(e.get("t"), t0) - t0) + conf = "" if is_peer else e.get("confidence", "") + if isinstance(conf, bool) or conf is None: + conf = "" + + # confidence is normally a TIER STRING ("", high, medium, new). Some producers + # (the synth harness, older logs) emit a float; handle both honestly. A + # non-finite float (NaN/Inf) is sanitized to "" so it can never reach the JSON + # sidecar as an invalid literal — and is treated as uncertain, not certain. 
+ numeric = isinstance(conf, (int, float)) + non_finite = numeric and not math.isfinite(conf) + if non_finite: + conf, numeric = "", False + tier_uncertain = (conf < 0.5) if numeric else (conf in ("medium", "new") or non_finite) + + sid = 0 if is_peer else _coerce_sid(e.get("speaker_id", 0)) + overlap = bool(e.get("overlap", False)) and not is_peer + peer_name = e.get("peer") if is_peer else None + speaker = e.get("speaker", "") or ("(unattributed)" if (not is_peer and sid == 0) else "") + + markers = [] + if conf == "new": + markers.append("new-voice") + if tier_uncertain: + markers.append("~") + if not is_peer and sid == 0: + markers.append("~") + if overlap: + markers.append("overlap") + if is_peer: + markers.append("peer") + markers = [m for m in _MARKER_ORDER if m in markers] # dedup + canonical order + uncertain = tier_uncertain or (sid == 0 and not is_peer) + + turns.append({ + "index": idx, + "t_offset": round(t_off, 2), + "t_offset_hms": _fmt_hms(t_off), + "t_unix": _num(e.get("t"), None), + "speaker_id": sid, + "speaker": speaker, + "text": (e.get("text", "") or "").strip(), + "confidence": conf, + "confidence_uncertain": uncertain, + "overlap": overlap, + "peer": is_peer, + "peer_name": peer_name, + "markers": markers, + }) + + if has_audio_time: + # cover every turn, including any wall-clock-fallback turn, so a turn's + # offset can never exceed the reported duration. + duration_seconds = round(max([audio_end_max] + [t["t_offset"] for t in turns], default=0.0)) + else: + duration_seconds = round(max(0.0, t_end - t0)) + + # Distinct speakers (local by id; peers by name+label) with turn counts. 
+ local_labels: dict[int, Counter] = {} + local_counts: Counter = Counter() + peer_counts: Counter = Counter() + for t in turns: + if t["peer"]: + peer_counts[(t["peer_name"], t["speaker"])] += 1 + else: + local_counts[t["speaker_id"]] += 1 + local_labels.setdefault(t["speaker_id"], Counter())[t["speaker"]] += 1 + speakers = [] + for sid in sorted(local_counts): + label = local_labels[sid].most_common(1)[0][0] if local_labels[sid] else "" + speakers.append({"id": sid, "label": label or "(unattributed)", + "peer": False, "peer_name": None, "turns": local_counts[sid]}) + for (pname, plabel), n in sorted(peer_counts.items(), key=lambda kv: (str(kv[0][0]), str(kv[0][1]))): + speakers.append({"id": 0, "label": plabel, "peer": True, "peer_name": pname, "turns": n}) + + # notes are computed once here (the single source) and stored on the doc; render_md + # consumes doc["_notes"] and render_json strips it. Ordering: caveat first. + notes = ["Speaker labels are diarization voice-clusters (e.g. \"Speaker 1\"), not verified " + "identities; treat attribution as approximate unless a turn is otherwise marked."] + if not has_anchor: + notes.append("No parsable session or transcript events were found — this log may be " + "empty or corrupt; timestamps are unavailable.") + if incomplete: + notes.append("Session end event missing — duration/ended_at are approximate.") + if has_audio_time: + notes.append("Timestamps are true offsets into the recorded audio.") + + return { + "voxterm_export_version": EXPORT_VERSION, + "kind": DOC_KIND, + "session": { + "id": session_id, + "started_at": (_iso_local(t0) if has_anchor else None), + "started_at_unix": (t0 if has_anchor else None), + "ended_at": (_iso_local(t_end) if (has_anchor and not incomplete) else None), + "duration_seconds": duration_seconds, + "duration_hms": _fmt_hms(duration_seconds), + "source": "VoxTerm", + "source_stream": source_stream, + "model": model, + "language": language, + "party": party, + "incomplete": incomplete, + 
"audio_relative_time": has_audio_time, + }, + "speakers": speakers, + "turns": turns, + "_notes": notes, + } + + +def _yaml_scalar(v) -> str: + if isinstance(v, bool): + return "true" if v else "false" + if v is None: + return "null" + if isinstance(v, (int, float)): + return str(v) + return json.dumps(str(v), ensure_ascii=False) # JSON string == valid YAML double-quoted + + +def render_md(doc: dict) -> str: + s = doc["session"] + out = ["---"] + out.append(f"voxterm_export_version: {EXPORT_VERSION}") + out.append(f"kind: {DOC_KIND}") + out.append(f"session_id: {_yaml_scalar(s['id'])}") + out.append(f"date: {s['started_at'][:10] if s['started_at'] else 'null'}") + out.append(f"started_at: {_yaml_scalar(s['started_at'])}") + out.append(f"ended_at: {_yaml_scalar(s['ended_at'])}") + out.append(f"duration: {_yaml_scalar(s['duration_hms'])}") + out.append(f"duration_seconds: {s['duration_seconds']}") + out.append("source: VoxTerm") + out.append(f"source_stream: {_yaml_scalar(s['source_stream'])}") + out.append(f"model: {_yaml_scalar(s['model'])}") + out.append(f"language: {_yaml_scalar(s['language'])}") + out.append(f"party: {_yaml_scalar(s['party'])}") + out.append(f"audio_relative_time: {_yaml_scalar(s['audio_relative_time'])}") + out.append("speakers:") + for sp in doc["speakers"]: + bits = f"id: {sp['id']}, label: {_yaml_scalar(sp['label'])}, turns: {sp['turns']}, peer: {_yaml_scalar(sp['peer'])}" + if sp["peer"]: + bits += f", peer_name: {_yaml_scalar(sp['peer_name'])}" + out.append(f" - {{ {bits} }}") + out.append(f"turns: {len(doc['turns'])}") + out.append("confidence_legend: { " + ", ".join(f"{k}: {_yaml_scalar(v)}" for k, v in _CONFIDENCE_LEGEND.items()) + " }") + out.append("markers:") + for k, v in _MARKER_LEGEND.items(): + out.append(f" {_yaml_scalar(k)}: {_yaml_scalar(v)}") + out.append("notes:") + for n in doc.get("_notes", []): + out.append(f" - {_yaml_scalar(n)}") + out.append("---") + out.append("") + + n_sp = len(doc["speakers"]) + out.append(f"> 
VoxTerm session — {n_sp} speaker(s), {len(doc['turns'])} turns, " + f"{s['duration_hms']}. Timestamps are [mm:ss] " + f"{'into the recorded audio' if s['audio_relative_time'] else 'from session start'}. " + f"Markers: [~] uncertain attribution, [overlap] overlapping speech, " + f"[new-voice] first appearance, [peer] remote peer. " + f"Speaker labels are voice clusters, not verified identities.") + out.append("") + out.append("## Transcript") + out.append("") + for t in doc["turns"]: + if t["peer"]: + who = f"**{t['speaker']}** (peer: {t['peer_name']})" + elif t["speaker_id"]: + who = f"**{t['speaker']}** (#{t['speaker_id']})" + else: + who = "**(unattributed)**" + line = f"[{t['t_offset_hms']}] {who}: {t['text']}" + if t["markers"]: + line += " " + " ".join(f"[{m}]" for m in t["markers"]) + out.append(line) + out.append("") + return "\n".join(out).rstrip() + "\n" + + +def render_json(doc: dict) -> str: + d = {k: v for k, v in doc.items() if k != "_notes"} + # allow_nan=False: refuse to emit NaN/Infinity literals (invalid JSON for strict + # parsers). build() already sanitizes them, so this is a fail-loud backstop. 
+ return json.dumps(d, ensure_ascii=False, indent=2, allow_nan=False) + "\n" + + +def export(events_path: Path, out_dir: Path | None = None) -> tuple[Path, Path]: + events = load_events(events_path) + stem = events_path.stem + if stem.endswith("-events"): + stem = stem[: -len("-events")] + doc = build(events, session_id=stem, source_stream=events_path.name) # notes set on doc + if not doc["turns"]: + print(f"warning: no transcript turns found in {events_path.name} " + f"({len(events)} events parsed) — writing an empty export", file=sys.stderr) + + out_dir = out_dir or events_path.parent + out_dir.mkdir(parents=True, exist_ok=True) + md_path = out_dir / f"{stem}-agent.md" + json_path = out_dir / f"{stem}-agent.json" + md_path.write_text(render_md(doc), encoding="utf-8") + json_path.write_text(render_json(doc), encoding="utf-8") + return md_path, json_path + + +def main(argv=None) -> int: + ap = argparse.ArgumentParser(description="Export a VoxTerm event log to an LLM-agent transcript.") + ap.add_argument("events", nargs="?", help="path to a *-events.jsonl (default: newest in VoxTerm live dir)") + ap.add_argument("--out-dir", default=None, help="output dir (default: alongside the events file)") + args = ap.parse_args(argv) + + if args.events: + events_path = Path(args.events) + else: + # Default: newest *-events.jsonl in VoxTerm's live dir (self-contained — no glass dep). 
+ try: + import sys as _sys + _sys.path.insert(0, str(Path(__file__).resolve().parent.parent)) + from config import LIVE_DIR + live = Path(LIVE_DIR) + except Exception: + live = Path.home() / ".local" / "share" / "voxterm" / ".live" + cands = sorted(live.glob("*-events.jsonl"), key=lambda p: p.stat().st_mtime, reverse=True) if live.exists() else [] + if not cands: + print("error: no events file given and none found in the live dir", file=sys.stderr) + return 2 + events_path = cands[0] + if not events_path.exists(): + print(f"error: no such file: {events_path}", file=sys.stderr) + return 2 + + md_path, json_path = export(events_path, Path(args.out_dir) if args.out_dir else None) + print(f"agent transcript: {md_path}") + print(f"json sidecar: {json_path}") + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/gui/server.py b/gui/server.py new file mode 100644 index 0000000..1d36539 --- /dev/null +++ b/gui/server.py @@ -0,0 +1,160 @@ +"""VoxTerm GUI control server — stdlib http.server, no extra deps. + +Serves the web UI (gui/static/) + a small JSON API + an SSE status stream, all +backed by gui.engine (which reuses VoxTerm's own engine). Loopback-only by default; +set VOXTERM_GUI_LAN=1 to expose on the LAN (e.g. to drive it from your phone). 
+ + python -m gui.server # http://127.0.0.1:8740 + VOXTERM_GUI_LAN=1 python -m gui.server +""" +from __future__ import annotations + +import json +import os +import sys +import time +from http.server import BaseHTTPRequestHandler, ThreadingHTTPServer +from pathlib import Path +from urllib.parse import urlparse, parse_qs + +_ROOT = str(Path(__file__).resolve().parent.parent) +if _ROOT not in sys.path: + sys.path.insert(0, _ROOT) + +from gui.engine import Engine # noqa: E402 + +STATIC = Path(__file__).resolve().parent / "static" +DEFAULT_PORT = 8740 +MAX_BODY = 64 * 1024 # API requests are tiny; bound them +MAX_SSE = 8 # cap concurrent status streams +_sse_count = 0 + +ENGINE = Engine() + +# Strict CSP: same-origin only, no external anything (the UI is fully self-hosted). +CSP = ("default-src 'none'; script-src 'self'; style-src 'self'; img-src 'self' data:; " + "connect-src 'self'; font-src 'self'; base-uri 'none'; form-action 'none'") +_CTYPES = {".html": "text/html; charset=utf-8", ".js": "text/javascript; charset=utf-8", + ".css": "text/css; charset=utf-8", ".svg": "image/svg+xml", ".json": "application/json"} + + +class Handler(BaseHTTPRequestHandler): + server_version = "voxterm-gui" + + def _hdr(self, code=200, ctype="application/json", extra=None): + self.send_response(code) + self.send_header("Content-Type", ctype) + self.send_header("Content-Security-Policy", CSP) + self.send_header("X-Content-Type-Options", "nosniff") + self.send_header("Referrer-Policy", "no-referrer") + for k, v in (extra or {}).items(): + self.send_header(k, v) + self.end_headers() + + def _json(self, obj, code=200): + body = json.dumps(obj, ensure_ascii=False).encode("utf-8") + self._hdr(code, "application/json") + self.wfile.write(body) + + def _read_json(self) -> dict: + n = int(self.headers.get("Content-Length") or 0) + if n <= 0 or n > MAX_BODY: + return {} + try: + return json.loads(self.rfile.read(n).decode("utf-8")) or {} + except Exception: + return {} + + def log_message(self, 
*a): # quiet + pass + + # ---- GET ---- + def do_GET(self): + u = urlparse(self.path) + p, q = u.path, parse_qs(u.query) + if p == "/" or p == "/index.html": + return self._serve_static("index.html") + if p.startswith("/static/"): + return self._serve_static(p[len("/static/"):]) + if p == "/api/options": + return self._json({"models": ENGINE.models(), "languages": ENGINE.languages()}) + if p == "/api/status": + return self._json(ENGINE.status()) + if p == "/api/sessions": + return self._json({"sessions": ENGINE.sessions()}) + if p == "/api/session": + stem = (q.get("stem") or [""])[0] + kind = (q.get("kind") or ["transcript"])[0] + return self._json(ENGINE.read_artifact(stem, kind)) + if p == "/api/events": + return self._sse() + return self._json({"error": "not found"}, 404) + + # ---- POST ---- + def do_POST(self): + p = urlparse(self.path).path + if p == "/api/record/start": + return self._json(ENGINE.start_recording()) + if p == "/api/record/stop": + b = self._read_json() + return self._json(ENGINE.stop_recording(model=b.get("model", "fw-small"), + language=b.get("language", "en"))) + if p == "/api/transcribe": + b = self._read_json() + return self._json(ENGINE.transcribe_existing(b.get("wav", ""), model=b.get("model", "fw-small"), + language=b.get("language", "en"))) + return self._json({"error": "not found"}, 404) + + def _serve_static(self, rel: str): + # resolve within STATIC only (no traversal) + target = (STATIC / rel).resolve() + try: + target.relative_to(STATIC.resolve()) + except ValueError: + return self._json({"error": "forbidden"}, 403) + if not target.is_file(): + return self._json({"error": "not found"}, 404) + ctype = _CTYPES.get(target.suffix, "application/octet-stream") + data = target.read_bytes() + self._hdr(200, ctype) + self.wfile.write(data) + + def _sse(self): + global _sse_count + if _sse_count >= MAX_SSE: + return self._json({"error": "too many streams"}, 429) + _sse_count += 1 + try: + self._hdr(200, "text/event-stream", 
{"Cache-Control": "no-cache", "Connection": "keep-alive"}) + while True: + payload = json.dumps(ENGINE.status(), ensure_ascii=False) + self.wfile.write(f"data: {payload}\n\n".encode("utf-8")) + self.wfile.flush() + time.sleep(0.4) + except (BrokenPipeError, ConnectionResetError): + pass + finally: + _sse_count -= 1 + + +def main(argv=None) -> int: + lan = os.environ.get("VOXTERM_GUI_LAN") == "1" + host = "0.0.0.0" if lan else "127.0.0.1" + port = int(os.environ.get("VOXTERM_GUI_PORT", DEFAULT_PORT)) + httpd = ThreadingHTTPServer((host, port), Handler) + httpd.daemon_threads = True + where = f"http://{'' if lan else '127.0.0.1'}:{port}" + print(f"[voxterm-gui] serving {where}") + if lan: + print("[voxterm-gui] LAN-exposed (VOXTERM_GUI_LAN=1) — reachable from your phone on this wifi.") + else: + print("[voxterm-gui] loopback only. Set VOXTERM_GUI_LAN=1 to reach it from your phone.") + try: + httpd.serve_forever() + except KeyboardInterrupt: + pass + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/gui/static/app.js b/gui/static/app.js new file mode 100644 index 0000000..87414fa --- /dev/null +++ b/gui/static/app.js @@ -0,0 +1,234 @@ +"use strict"; +const $ = (id) => document.getElementById(id); +const PALETTE = ["#5eead4", "#f0566a", "#fbbf24", "#a78bfa", "#60a5fa", "#34d399", "#fb923c", "#f472b6"]; + +let OPTS = { models: [], languages: {} }; +let CUR = null; // current doc (agent_json parsed) +let RENAMES = {}; // speaker_id -> custom name (view + export) +let lastJobState = "idle"; + +// ---------- helpers ---------- +async function getJSON(url, opts) { const r = await fetch(url, opts); return r.json(); } +function toast(msg) { + const t = $("toast"); t.textContent = msg; t.classList.remove("hidden"); + clearTimeout(toast._t); toast._t = setTimeout(() => t.classList.add("hidden"), 2200); +} +function fmtClock(sec) { + sec = Math.max(0, Math.floor(sec || 0)); + const m = Math.floor(sec / 60), s = sec % 60; + const h = Math.floor(m / 
60); + return h ? `${h}:${String(m % 60).padStart(2, "0")}:${String(s).padStart(2, "0")}` + : `${String(m).padStart(2, "0")}:${String(s).padStart(2, "0")}`; +} +function colorFor(sid) { return PALETTE[((sid || 0) % PALETTE.length + PALETTE.length) % PALETTE.length]; } +function nameFor(turn) { + if (turn.peer) return turn.peer_name ? `${turn.speaker} · ${turn.peer_name}` : turn.speaker; + if (RENAMES[turn.speaker_id]) return RENAMES[turn.speaker_id]; + return turn.speaker || "(unattributed)"; +} + +// ---------- init ---------- +async function init() { + OPTS = await getJSON("/api/options"); + const mSel = $("model"), lSel = $("language"); + OPTS.models.forEach((m) => { const o = document.createElement("option"); o.value = m; o.textContent = m; if (m === "fw-small") o.selected = true; mSel.appendChild(o); }); + Object.entries(OPTS.languages).forEach(([code, name]) => { const o = document.createElement("option"); o.value = code; o.textContent = name; if (code === "en") o.selected = true; lSel.appendChild(o); }); + + $("recBtn").addEventListener("click", toggleRecord); + $("refreshSessions").addEventListener("click", loadSessions); + $("navToggle").addEventListener("click", () => document.body.classList.toggle("nav-open")); + $("copyAgent").addEventListener("click", copyForAI); + $("dlMd").addEventListener("click", () => download(buildMarkdown(), `${CUR.session.id}-agent.md`, "text/markdown")); + $("dlJson").addEventListener("click", () => download(buildJson(), `${CUR.session.id}-agent.json`, "application/json")); + + await loadSessions(); + openEvents(); +} + +// ---------- live status (SSE) ---------- +function openEvents() { + const es = new EventSource("/api/events"); + es.onmessage = (e) => { + let s; try { s = JSON.parse(e.data); } catch { return; } + applyStatus(s); + }; + es.onerror = () => {/* browser auto-reconnects */}; +} +function applyStatus(s) { + document.body.classList.toggle("recording", !!s.recording); + $("recBtn").setAttribute("aria-label", 
s.recording ? "Stop recording" : "Start recording"); + if (s.recording) { + $("timer").textContent = fmtClock(s.elapsed); + $("recState").textContent = "Recording…"; + // level ring (0..~0.3 typical) -> 0..360deg + const deg = Math.min(360, (s.level || 0) / 0.25 * 360); + $("ring").style.background = `conic-gradient(var(--rec) ${deg}deg, var(--line) ${deg}deg)`; + $("model").disabled = $("language").disabled = true; + } else { + $("ring").style.background = ""; + $("model").disabled = $("language").disabled = false; + if (lastJobState === "idle" && s.job.state === "idle") { $("recState").textContent = "Ready to record"; $("timer").textContent = "00:00"; } + } + const job = s.job || { state: "idle" }; + if (job.state === "transcribing") { + $("progress").classList.remove("hidden"); + $("progressMsg").textContent = job.msg || "Transcribing…"; + const pct = Math.round((job.frac || 0) * 100); + $("progressPct").textContent = pct + "%"; + $("barFill").style.width = pct + "%"; + $("recState").textContent = "Transcribing…"; + } + if (job.state === "done" && lastJobState !== "done") { + $("progress").classList.add("hidden"); + $("recState").textContent = "Ready to record"; + toast(`Done — ${job.n_turns} turns, ${job.n_speakers} speaker(s)`); + if (job.stem) loadSession(job.stem); + loadSessions(); + } + if (job.state === "error" && lastJobState !== "error") { + $("progress").classList.add("hidden"); + toast("Error: " + (job.error || "transcription failed")); + $("recState").textContent = "Ready to record"; + } + lastJobState = job.state; +} + +// ---------- record ---------- +async function toggleRecord() { + const recording = document.body.classList.contains("recording"); + if (!recording) { + const r = await getJSON("/api/record/start", { method: "POST" }); + if (!r.ok) toast("Could not start (mic busy?)"); + } else { + $("recBtn").disabled = true; + await getJSON("/api/record/stop", { + method: "POST", headers: { "Content-Type": "application/json" }, + body: 
JSON.stringify({ model: $("model").value, language: $("language").value }), + }); + setTimeout(() => { $("recBtn").disabled = false; }, 600); + } +} + +// ---------- sessions ---------- +async function loadSessions() { + const { sessions } = await getJSON("/api/sessions"); + const ul = $("sessions"); ul.innerHTML = ""; + if (!sessions.length) { ul.innerHTML = `
  • No sessions yet.
  • `; return; } + sessions.forEach((s) => { + const li = document.createElement("li"); li.className = "session"; li.dataset.stem = s.stem; + const has = []; if (s.agent_md) has.push("AI"); if (s.transcript) has.push("md"); + li.innerHTML = `
    ${prettyStem(s.stem)}
    +
    ${has.map((h) => `${h}`).join("")}
    `; + li.addEventListener("click", () => loadSession(s.stem)); + ul.appendChild(li); + }); +} +function prettyStem(stem) { + const m = stem.match(/(\d{4})-?(\d{2})-?(\d{2})[_-]?(\d{2})(\d{2})/); + if (m) return `${m[1]}-${m[2]}-${m[3]} ${m[4]}:${m[5]}`; + return stem; +} + +async function loadSession(stem) { + // prefer the structured JSON; fall back to the markdown if the AI export is missing + let res = await getJSON(`/api/session?stem=${encodeURIComponent(stem)}&kind=agent_json`); + if (!res.ok) { + res = await getJSON(`/api/session?stem=${encodeURIComponent(stem)}&kind=transcript`); + if (res.ok) return showRawMarkdown(stem, res.text); + return toast("Could not load session"); + } + try { CUR = JSON.parse(res.text); } catch { return toast("Bad session JSON"); } + RENAMES = {}; + render(); + document.querySelectorAll(".session").forEach((el) => el.classList.toggle("active", el.dataset.stem === stem)); + document.body.classList.remove("nav-open"); +} + +// ---------- render ---------- +function render() { + $("empty").classList.add("hidden"); + $("transcriptView").classList.remove("hidden"); + const s = CUR.session; + $("tvTitle").textContent = prettyStem(s.id); + $("tvMeta").textContent = `${CUR.turns.length} turns · ${CUR.speakers.length} speaker(s) · ${s.duration_hms || ""} · ${s.model || ""}`; + + // legend (click to rename) + const leg = $("speakerLegend"); leg.innerHTML = ""; + CUR.speakers.filter((sp) => !sp.peer).forEach((sp) => { + const el = document.createElement("button"); el.className = "lg"; + el.innerHTML = `${RENAMES[sp.id] || sp.label}`; + el.title = "Click to rename this speaker"; + el.addEventListener("click", () => renameSpeaker(sp.id)); + leg.appendChild(el); + }); + + const wrap = $("turns"); wrap.innerHTML = ""; + CUR.turns.forEach((t) => { + const row = document.createElement("div"); row.className = "turn" + (t.confidence_uncertain ? " uncertain" : ""); + const c = t.peer ? 
"#7aa2f7" : colorFor(t.speaker_id); + const mk = (t.markers || []).map((m) => `${m}`).join(""); + const spk = t.peer + ? `${escapeHtml(nameFor(t))}` + : ``; + row.innerHTML = `
    ${t.t_offset_hms}
    +
    ${spk}${mk}
    ${escapeHtml(t.text)}
    `; + const btn = row.querySelector("button[data-sid]"); + if (btn) btn.addEventListener("click", () => renameSpeaker(t.speaker_id)); + wrap.appendChild(row); + }); +} +function showRawMarkdown(stem, text) { + CUR = null; + $("empty").classList.add("hidden"); $("transcriptView").classList.remove("hidden"); + $("tvTitle").textContent = prettyStem(stem); $("tvMeta").textContent = "(no AI export — raw transcript)"; + $("speakerLegend").innerHTML = ""; + $("turns").innerHTML = `
    ${escapeHtml(text)}
    `; +} +function renameSpeaker(sid) { + const cur = RENAMES[sid] || (CUR.speakers.find((x) => x.id === sid) || {}).label || `Speaker ${sid}`; + const name = prompt("Rename speaker (applies to this view + your copy/export):", cur); + if (name && name.trim()) { RENAMES[sid] = name.trim(); render(); toast("Renamed — included when you copy/export"); } +} +function escapeHtml(s) { return String(s).replace(/[&<>"']/g, (c) => ({ "&": "&", "<": "<", ">": ">", '"': """, "'": "'" }[c])); } + +// ---------- export (rename-aware, built from the JSON source of truth) ---------- +function buildJson() { + const d = JSON.parse(JSON.stringify(CUR)); + d.turns.forEach((t) => { if (!t.peer && RENAMES[t.speaker_id]) t.speaker = RENAMES[t.speaker_id]; }); + d.speakers.forEach((sp) => { if (!sp.peer && RENAMES[sp.id]) sp.label = RENAMES[sp.id]; }); + return JSON.stringify(d, null, 2) + "\n"; +} +function buildMarkdown() { + const s = CUR.session; + const spk = CUR.speakers.map((sp) => sp.peer + ? ` - { id: 0, label: "${sp.label}", turns: ${sp.turns}, peer: true, peer_name: "${sp.peer_name}" }` + : ` - { id: ${sp.id}, label: "${RENAMES[sp.id] || sp.label}", turns: ${sp.turns}, peer: false }`).join("\n"); + const fm = ["---", "voxterm_export_version: 1", "kind: voxterm-transcript", + `session_id: "${s.id}"`, `date: ${(s.started_at || "").slice(0, 10) || "null"}`, + `duration: "${s.duration_hms || ""}"`, `model: "${s.model || ""}"`, `language: "${s.language || ""}"`, + "speakers:", spk, `turns: ${CUR.turns.length}`, + "notes:", ' - "Speaker labels are diarization clusters / your renames, not verified identities."', "---", ""].join("\n"); + const body = ["> VoxTerm session — timestamps are [mm:ss] into the recording; [~]=uncertain, [overlap], [new-voice], [peer].", "", "## Transcript", ""]; + CUR.turns.forEach((t) => { + const who = t.peer ? `**${nameFor(t)}** (peer: ${t.peer_name})` + : (t.speaker_id ? 
`**${nameFor(t)}** (#${t.speaker_id})` : "**(unattributed)**"); + let line = `[${t.t_offset_hms}] ${who}: ${t.text}`; + if (t.markers && t.markers.length) line += " " + t.markers.map((m) => `[${m}]`).join(" "); + body.push(line, ""); + }); + return fm + body.join("\n").trim() + "\n"; +} +async function copyForAI() { + if (!CUR) return toast("Load a transcript first"); + const md = buildMarkdown(); + try { await navigator.clipboard.writeText(md); toast("Copied AI transcript to clipboard"); } + catch { download(md, `${CUR.session.id}-agent.md`, "text/markdown"); toast("Clipboard blocked — downloaded instead"); } +} +function download(text, filename, mime) { + const blob = new Blob([text], { type: mime }); + const a = document.createElement("a"); + a.href = URL.createObjectURL(blob); a.download = filename; a.click(); + setTimeout(() => URL.revokeObjectURL(a.href), 1000); +} + +init().catch((e) => toast("Init failed: " + e)); diff --git a/gui/static/index.html b/gui/static/index.html new file mode 100644 index 0000000..f6afa97 --- /dev/null +++ b/gui/static/index.html @@ -0,0 +1,89 @@ + + + + + + + VoxTerm + + + + + + + +
    + +
    +
    +
    + +
    +
    +
    00:00
    +
    Ready to record
    +
    +
    +
    + + +
    +

    Pick a model, hit record, talk. Stop when done — you’ll get a clean transcript and an AI-ready export.

    +
    + + + + + + + + +
    +

    No transcript yet. Record something, or pick a past session from the left.

    +
    + + +
    + + + + diff --git a/gui/static/style.css b/gui/static/style.css new file mode 100644 index 0000000..d3830c8 --- /dev/null +++ b/gui/static/style.css @@ -0,0 +1,152 @@ +:root { + --bg: #0e0f13; + --bg-elev: #16181f; + --bg-elev-2: #1c1f28; + --line: #262a35; + --text: #e7e9ee; + --muted: #9aa1ad; + --faint: #6b7280; + --accent: #5eead4; /* calm teal */ + --accent-dim: #2dd4bf; + --rec: #f0566a; /* warm coral for record */ + --rec-glow: rgba(240, 86, 106, 0.45); + --radius: 14px; + --shadow: 0 8px 30px rgba(0,0,0,0.35); + font-synthesis: none; +} + +* { box-sizing: border-box; } +html, body { height: 100%; margin: 0; } +body { + background: radial-gradient(1200px 600px at 80% -10%, #14202a 0%, var(--bg) 55%) fixed; + color: var(--text); + font-family: ui-sans-serif, system-ui, -apple-system, "Segoe UI", Roboto, Helvetica, Arial, sans-serif; + font-size: 15px; + line-height: 1.55; + display: grid; + grid-template-columns: 288px 1fr; + -webkit-font-smoothing: antialiased; +} + +/* ---------- sidebar ---------- */ +.sidebar { + background: linear-gradient(180deg, var(--bg-elev) 0%, #121419 100%); + border-right: 1px solid var(--line); + padding: 22px 18px; + height: 100vh; + position: sticky; + top: 0; + overflow-y: auto; +} +.brand { display: flex; align-items: center; gap: 10px; } +.brand h1 { font-size: 20px; font-weight: 650; letter-spacing: -0.01em; margin: 0; } +.brand-dot { width: 11px; height: 11px; border-radius: 50%; background: var(--accent); + box-shadow: 0 0 12px var(--accent); } +.brand-sub { color: var(--faint); font-size: 12px; margin: 4px 0 22px 21px; letter-spacing: 0.02em; } +.sessions-head { display: flex; align-items: center; justify-content: space-between; margin-bottom: 8px; } +.sessions-head h2 { font-size: 12px; text-transform: uppercase; letter-spacing: 0.08em; color: var(--muted); margin: 0; font-weight: 600; } +.ghost-btn { background: none; border: none; color: var(--muted); cursor: pointer; font-size: 16px; border-radius: 8px; padding: 
2px 7px; } +.ghost-btn:hover { background: var(--bg-elev-2); color: var(--text); } +.sessions { list-style: none; padding: 0; margin: 0; display: flex; flex-direction: column; gap: 6px; } +.session { + padding: 10px 12px; border-radius: 10px; cursor: pointer; border: 1px solid transparent; + transition: background .15s, border-color .15s; +} +.session:hover { background: var(--bg-elev-2); } +.session.active { background: var(--bg-elev-2); border-color: var(--line); } +.session .s-title { font-size: 13.5px; font-weight: 550; } +.session .s-sub { font-size: 11.5px; color: var(--faint); margin-top: 2px; display: flex; gap: 6px; } +.tag { font-size: 10px; padding: 1px 6px; border-radius: 999px; background: #1f2630; color: var(--accent-dim); border: 1px solid var(--line); } + +/* ---------- main ---------- */ +.main { padding: 36px clamp(20px, 5vw, 56px); max-width: 980px; width: 100%; } +.hero { text-align: center; padding: 8px 0 28px; } +.record-wrap { display: flex; flex-direction: column; align-items: center; gap: 16px; } + +.ring { + width: 150px; height: 150px; border-radius: 50%; + display: grid; place-items: center; position: relative; + background: conic-gradient(var(--accent) 0deg, var(--line) 0deg); + transition: background .1s linear; +} +.ring-fill { position: absolute; inset: 7px; border-radius: 50%; background: var(--bg); } +.rec-btn { + position: relative; z-index: 1; + width: 112px; height: 112px; border-radius: 50%; border: none; cursor: pointer; + background: radial-gradient(circle at 50% 35%, #ff6b7e, var(--rec)); + display: grid; place-items: center; + box-shadow: 0 6px 24px var(--rec-glow); + transition: transform .12s ease, box-shadow .2s; +} +.rec-btn:hover { transform: scale(1.04); } +.rec-btn:active { transform: scale(0.97); } +.rec-icon { width: 30px; height: 30px; border-radius: 50%; background: #fff; transition: all .18s ease; } +body.recording .rec-icon { width: 26px; height: 26px; border-radius: 6px; } /* circle -> square (stop) */ 
+body.recording .rec-btn { animation: pulse 1.6s ease-in-out infinite; } +@keyframes pulse { 0%,100% { box-shadow: 0 6px 24px var(--rec-glow); } 50% { box-shadow: 0 6px 40px var(--rec-glow); } } +.rec-btn:disabled { filter: grayscale(.6) brightness(.7); cursor: not-allowed; } + +.timer { font-size: 30px; font-weight: 600; font-variant-numeric: tabular-nums; letter-spacing: 0.01em; } +.rec-state { color: var(--muted); font-size: 13px; } + +.controls { display: flex; gap: 16px; justify-content: center; margin-top: 26px; flex-wrap: wrap; } +.field { display: flex; flex-direction: column; gap: 6px; text-align: left; } +.field > span { font-size: 11px; text-transform: uppercase; letter-spacing: 0.06em; color: var(--muted); } +.select { + background: var(--bg-elev); color: var(--text); border: 1px solid var(--line); + border-radius: 10px; padding: 9px 12px; font-size: 14px; min-width: 160px; cursor: pointer; +} +.select:disabled { opacity: .5; cursor: not-allowed; } +.hint { color: var(--faint); font-size: 12.5px; max-width: 460px; margin: 22px auto 0; } + +/* ---------- progress ---------- */ +.progress { background: var(--bg-elev); border: 1px solid var(--line); border-radius: var(--radius); padding: 18px 20px; margin: 8px 0 24px; box-shadow: var(--shadow); } +.progress-row { display: flex; justify-content: space-between; font-size: 13.5px; color: var(--muted); margin-bottom: 10px; } +.bar { height: 8px; background: var(--bg-elev-2); border-radius: 999px; overflow: hidden; } +.bar-fill { height: 100%; width: 0%; background: linear-gradient(90deg, var(--accent-dim), var(--accent)); border-radius: 999px; transition: width .3s ease; } + +/* ---------- transcript ---------- */ +.transcript-view { animation: fade .3s ease; } +@keyframes fade { from { opacity: 0; transform: translateY(6px); } to { opacity: 1; transform: none; } } +.tv-head { display: flex; align-items: flex-start; justify-content: space-between; gap: 16px; flex-wrap: wrap; margin-bottom: 12px; } +.tv-head h2 
{ margin: 0; font-size: 20px; } +.tv-meta { color: var(--faint); font-size: 12.5px; margin: 4px 0 0; } +.tv-actions { display: flex; gap: 8px; flex-wrap: wrap; } +.btn { background: var(--accent); color: #06231e; border: none; border-radius: 10px; padding: 9px 15px; font-size: 13.5px; font-weight: 600; cursor: pointer; transition: filter .15s, transform .1s; } +.btn:hover { filter: brightness(1.06); } +.btn:active { transform: translateY(1px); } +.btn.ghost { background: transparent; color: var(--text); border: 1px solid var(--line); } +.btn.ghost:hover { background: var(--bg-elev-2); } + +.legend { display: flex; gap: 12px; flex-wrap: wrap; margin: 6px 0 18px; } +.legend .lg { display: inline-flex; align-items: center; gap: 7px; font-size: 12.5px; color: var(--muted); background: var(--bg-elev); border: 1px solid var(--line); padding: 4px 10px; border-radius: 999px; cursor: pointer; } +.legend .dot { width: 10px; height: 10px; border-radius: 50%; flex: none; } + +.turns { display: flex; flex-direction: column; gap: 2px; } +.turn { display: grid; grid-template-columns: 64px 1fr; gap: 14px; padding: 9px 12px; border-radius: 10px; transition: background .12s; } +.turn:hover { background: var(--bg-elev); } +.turn .t-time { color: var(--faint); font-size: 12px; font-variant-numeric: tabular-nums; padding-top: 2px; } +.turn .t-body { min-width: 0; } +.turn .t-spk { font-weight: 600; font-size: 13px; display: inline-flex; align-items: center; gap: 7px; } +.turn .t-spk .dot { width: 9px; height: 9px; border-radius: 50%; flex: none; } +.turn .t-spk button { background: none; border: none; color: inherit; font: inherit; cursor: pointer; border-bottom: 1px dotted transparent; padding: 0; } +.turn .t-spk button:hover { border-bottom-color: currentColor; } +.turn .t-text { margin-top: 2px; } +.turn .mk { font-size: 10.5px; color: var(--faint); border: 1px solid var(--line); border-radius: 6px; padding: 0 5px; margin-left: 6px; vertical-align: middle; } +.turn.uncertain .t-text 
{ color: var(--muted); } + +/* ---------- misc ---------- */ +.empty { color: var(--faint); text-align: center; padding: 60px 20px; font-size: 14px; } +.hidden { display: none !important; } +.toast { position: fixed; bottom: 22px; left: 50%; transform: translateX(-50%); background: var(--bg-elev-2); color: var(--text); border: 1px solid var(--line); padding: 10px 18px; border-radius: 10px; box-shadow: var(--shadow); font-size: 13.5px; z-index: 50; animation: fade .2s ease; } +.nav-toggle { display: none; position: fixed; top: 14px; left: 14px; z-index: 40; background: var(--bg-elev); color: var(--text); border: 1px solid var(--line); border-radius: 10px; width: 40px; height: 40px; font-size: 18px; cursor: pointer; } + +/* ---------- responsive / mobile ---------- */ +@media (max-width: 820px) { + body { grid-template-columns: 1fr; } + .sidebar { position: fixed; z-index: 30; width: 280px; transform: translateX(-100%); transition: transform .22s ease; } + body.nav-open .sidebar { transform: none; } + .nav-toggle { display: grid; place-items: center; } + .main { padding: 64px 18px 40px; } + .turn { grid-template-columns: 52px 1fr; gap: 10px; } +} diff --git a/gui/test_export.py b/gui/test_export.py new file mode 100644 index 0000000..19b2f34 --- /dev/null +++ b/gui/test_export.py @@ -0,0 +1,253 @@ +"""Tests for gui.export — the LLM-agent transcript exporter. + +Pytest-style; also runnable standalone (`python gui/test_export.py`) via the +__main__ runner at the bottom, so it works without pytest installed. 
+""" +import json +import sys +import tempfile +from pathlib import Path + +sys.path.insert(0, str(Path(__file__).resolve().parent)) + +from export import build, export, render_md, render_json, _coerce_sid, _fmt_hms + + +def _evs(): + """A realistic REAL-schema event stream (int speaker_ids, tier-string confidence).""" + t0 = 1_780_000_000.0 + return [ + {"t": t0, "kind": "session", "phase": "start", "model": "fw-base", "language": "en"}, + {"t": t0, "kind": "recording", "on": True}, + {"t": t0 + 1, "kind": "speaker", "speaker_id": 1, "label": "Speaker 1", "color": "#00ffcc"}, + {"t": t0 + 1, "kind": "text", "speaker": "Speaker 1", "speaker_id": 1, "color": "#00ffcc", + "text": "Let's ship the exporter.", "confidence": "high", "overlap": False, "audio_offset": 1.5, "audio_end": 4.0}, + {"t": t0 + 2, "kind": "text", "speaker": "Speaker 2", "speaker_id": 2, "color": "#ff8c42", + "text": "Is the new voice handled?", "confidence": "new", "overlap": False, "audio_offset": 5.0, "audio_end": 7.0}, + {"t": t0 + 3, "kind": "text", "speaker": "Speaker 1", "speaker_id": 1, "color": "#00ffcc", + "text": "Yes, and overlaps too.", "confidence": "", "overlap": True, "audio_offset": 7.5, "audio_end": 9.0}, + {"t": t0 + 4, "kind": "text", "speaker": "", "speaker_id": 0, "color": "", + "text": "mumble in the back", "confidence": "", "overlap": False, "audio_offset": 9.5, "audio_end": 10.0}, + {"t": t0 + 5, "kind": "peer_text", "peer": "laptop-2", "speaker": "Sam", "text": "Joining from the other room."}, + {"t": t0 + 6, "kind": "session", "phase": "end"}, + ] + + +def _doc(): + return build(_evs(), session_id="2026-06-04_120000", source_stream="x-events.jsonl") + + +def test_turn_count_and_kinds(): + d = _doc() + assert len(d["turns"]) == 5 # 4 text + 1 peer_text (non-content events excluded) + assert d["voxterm_export_version"] == 1 + assert d["kind"] == "voxterm-transcript" + + +def test_high_confidence_unmarked(): + t = _doc()["turns"][0] + assert t["confidence"] == "high" and 
t["markers"] == [] and t["confidence_uncertain"] is False + + +def test_new_voice_marked_uncertain(): + t = _doc()["turns"][1] + assert "new-voice" in t["markers"] and "~" in t["markers"] and t["confidence_uncertain"] is True + + +def test_overlap_marked(): + t = _doc()["turns"][2] + assert "overlap" in t["markers"] + # confidence "" with a real speaker id is NOT per-turn marked uncertain + assert t["confidence_uncertain"] is False + + +def test_unattributed_marked(): + t = _doc()["turns"][3] + assert t["speaker_id"] == 0 and t["speaker"] == "(unattributed)" + assert "~" in t["markers"] and t["confidence_uncertain"] is True + + +def test_peer_turn(): + t = _doc()["turns"][4] + assert t["peer"] is True and t["peer_name"] == "laptop-2" and t["speaker"] == "Sam" + assert t["speaker_id"] == 0 and t["markers"] == ["peer"] + + +def test_audio_offset_preferred_over_wallclock(): + d = _doc() + assert d["session"]["audio_relative_time"] is True + assert d["turns"][0]["t_offset"] == 1.5 and d["turns"][0]["t_offset_hms"] == "00:02" + # duration from max audio_end, not wall-clock session span + assert d["session"]["duration_seconds"] == 10 + + +def test_speaker_grouping(): + d = _doc() + locals_ = {s["id"]: s for s in d["speakers"] if not s["peer"]} + assert locals_[1]["turns"] == 2 and locals_[1]["label"] == "Speaker 1" + assert locals_[2]["turns"] == 1 + assert any(s["peer"] and s["peer_name"] == "laptop-2" for s in d["speakers"]) + + +def test_numeric_confidence_robust(): + # synth/older logs emit a float confidence; <0.5 must be uncertain, not crash + evs = [{"t": 1.0, "kind": "session", "phase": "start", "model": "m", "language": "en"}, + {"t": 2.0, "kind": "text", "speaker": "a", "speaker_id": 1, "text": "hi", "confidence": 0.40, "overlap": False}, + {"t": 3.0, "kind": "session", "phase": "end"}] + t = build(evs, session_id="s", source_stream="x")["turns"][0] + assert t["confidence_uncertain"] is True and "~" in t["markers"] + + +def test_coerce_sid_handles_strings(): + 
assert _coerce_sid(3) == 3 and _coerce_sid("S2") == 2 and _coerce_sid("7") == 7 + assert _coerce_sid("nope") == 0 and _coerce_sid(None) == 0 and _coerce_sid(True) == 0 + + +def test_incomplete_session(): + evs = [{"t": 1.0, "kind": "session", "phase": "start", "model": "m", "language": "en"}, + {"t": 2.0, "kind": "text", "speaker": "a", "speaker_id": 1, "text": "hi", "confidence": "", "overlap": False}] + d = build(evs, session_id="s", source_stream="x") + assert d["session"]["incomplete"] is True and d["session"]["ended_at"] is None + + +def test_render_md_structure(): + d = _doc() + d["_notes"] = ["note"] + md = render_md(d) + assert md.startswith("---\n") and "## Transcript" in md + assert "[peer]" in md and "(unattributed)" in md and "(#1)" in md + # exactly one turn per non-empty paragraph in the transcript body + body = md.split("## Transcript", 1)[1] + turn_lines = [ln for ln in body.splitlines() if ln.startswith("[")] + assert len(turn_lines) == len(d["turns"]) + + +def test_json_sidecar_valid(): + d = _doc() + parsed = json.loads(render_json(d)) + assert parsed["voxterm_export_version"] == 1 + assert len(parsed["turns"]) == 5 + assert "_notes" not in parsed # internal field must not leak into JSON + + +def test_export_round_trip_files(): + tmp = Path(tempfile.mkdtemp()) + ev = tmp / "2026-06-04_120000-events.jsonl" + ev.write_text("\n".join(json.dumps(e) for e in _evs()) + "\n", encoding="utf-8") + md_path, json_path = export(ev, tmp) + assert md_path.name == "2026-06-04_120000-agent.md" + assert json_path.name == "2026-06-04_120000-agent.json" + assert md_path.exists() and json_path.exists() + json.loads(json_path.read_text()) # valid + + +def test_malformed_lines_skipped(): + tmp = Path(tempfile.mkdtemp()) + ev = tmp / "s-events.jsonl" + ev.write_text('{"t":1,"kind":"session","phase":"start","model":"m","language":"en"}\n' + 'not json at all\n' + '\n' + '{"t":2,"kind":"text","speaker":"a","speaker_id":1,"text":"hi","confidence":"","overlap":false}\n', + 
encoding="utf-8") + md_path, _ = export(ev, tmp) + assert "hi" in md_path.read_text() + + +def test_fmt_hms(): + assert _fmt_hms(65) == "01:05" and _fmt_hms(3661) == "1:01:01" and _fmt_hms(0) == "00:00" + + +# --- regression tests for the adversarial-review findings --- + +def test_missing_t_does_not_crash(): + # garbled-but-valid-JSON lines with no 't' must not crash build() (load_events contract) + evs = [{"kind": "session", "phase": "start", "model": "m", "language": "en"}, + {"kind": "text", "speaker": "a", "speaker_id": 1, "text": "hi", "confidence": "", "overlap": False}] + d = build(evs, session_id="s", source_stream="x") + assert len(d["turns"]) == 1 + + +def test_garbled_timestamps_do_not_crash(): + evs = [{"t": "not-a-number", "kind": "session", "phase": "start", "model": "m", "language": "en"}, + {"t": "", "kind": "text", "speaker": "a", "speaker_id": 1, "text": "hi", + "confidence": "", "overlap": False, "audio_offset": "NaN", "audio_end": None}] + d = build(evs, session_id="s", source_stream="x") + assert d["turns"][0]["t_offset"] == 0.0 # degraded, not crashed + + +def test_nonfinite_confidence_safe_json(): + nan = float("nan") + evs = [{"t": 1.0, "kind": "session", "phase": "start", "model": "m", "language": "en"}, + {"t": 2.0, "kind": "text", "speaker": "a", "speaker_id": 1, "text": "hi", "confidence": nan, "overlap": False}, + {"t": 3.0, "kind": "session", "phase": "end"}] + d = build(evs, session_id="s", source_stream="x") + assert d["turns"][0]["confidence"] == "" # NaN sanitized away + assert d["turns"][0]["confidence_uncertain"] is True + out = render_json(d) + assert "NaN" not in out and "Infinity" not in out + # strict parse: raise if any NaN/Infinity constant is present + json.loads(out, parse_constant=lambda c: (_ for _ in ()).throw(ValueError(c))) + + +def test_empty_log_is_honest_not_1970(): + d = build([], session_id="s", source_stream="x") + assert d["session"]["started_at"] is None # not the Unix epoch + assert len(d["turns"]) == 0 + 
assert any("empty or corrupt" in n for n in d["_notes"]) + # renders without crashing despite null started_at + md = render_md(d) + assert "date: null" in md + + +def test_build_populates_notes(): + # notes must come from build() itself (no manual _notes injection needed) + d = build(_evs(), session_id="s", source_stream="x") + assert d.get("_notes") and any("diarization voice-clusters" in n for n in d["_notes"]) + md = render_md(d) + assert "notes:" in md and "diarization voice-clusters" in md + + +def test_duration_covers_all_turns(): + # an audio-timed session with a wall-clock-fallback turn: duration must cover it + t0 = 1000.0 + evs = [{"t": t0, "kind": "session", "phase": "start", "model": "m", "language": "en"}, + {"t": t0 + 1, "kind": "text", "speaker": "a", "speaker_id": 1, "text": "x", + "confidence": "", "overlap": False, "audio_offset": 5.0, "audio_end": 6.0}, + {"t": t0 + 30, "kind": "text", "speaker": "a", "speaker_id": 1, "text": "y", + "confidence": "", "overlap": False}, # no audio_offset -> wall-clock 30s + {"t": t0 + 31, "kind": "session", "phase": "end"}] + d = build(evs, session_id="s", source_stream="x") + assert d["session"]["duration_seconds"] >= max(t["t_offset"] for t in d["turns"]) + + +def test_yaml_frontmatter_parses_with_injection_attempt(): + try: + import yaml + except ImportError: + return # no YAML parser available; skip + evs = [{"t": 1.0, "kind": "session", "phase": "start", "model": 'weird: "model"\ninjected: x', "language": "en"}, + {"t": 2.0, "kind": "peer_text", "peer": "host: evil\nkey: 1", "speaker": 'Sam "the man"', "text": "hi"}, + {"t": 3.0, "kind": "session", "phase": "end"}] + d = build(evs, session_id="s", source_stream="x") + md = render_md(d) + front = md.split("---", 2)[1] + parsed = yaml.safe_load(front) + assert parsed["model"] == 'weird: "model"\ninjected: x' # value preserved, not split + assert "injected" not in parsed # no key injected + # peer name with a colon/newline round-trips inside the speakers list + 
peer = next(s for s in parsed["speakers"] if s.get("peer")) + assert peer["peer_name"] == "host: evil\nkey: 1" + + +if __name__ == "__main__": + fns = [v for k, v in sorted(globals().items()) if k.startswith("test_") and callable(v)] + failed = 0 + for fn in fns: + try: + fn() + print(f" ok {fn.__name__}") + except Exception as e: + failed += 1 + print(f" FAIL {fn.__name__}: {type(e).__name__}: {e}") + print(f"\n{len(fns) - failed}/{len(fns)} passed") + sys.exit(1 if failed else 0) diff --git a/gui/transcribe.py b/gui/transcribe.py new file mode 100644 index 0000000..ac93f61 --- /dev/null +++ b/gui/transcribe.py @@ -0,0 +1,176 @@ +"""Headless transcription for the GUI: a WAV (or an in-memory buffer) -> a faithful +VoxTerm ``events.jsonl`` + ``-transcript.md``, reusing VoxTerm's OWN engine +(transcriber + Silero VAD + diarizer + EventLogger). No reimplementation of the +transcription/diarization logic — this just drives the same components the TUI drives. + +Importable from the GUI backend (``gui.server``) and runnable as a CLI: + + python -m gui.transcribe ROOM.wav [--out-dir DIR] [--model fw-base] [--language en] + +Each ``text`` event carries an additive ``audio_offset``/``audio_end`` (seconds into +the recording) so the exporter shows true audio-relative timestamps; glass and other +consumers ignore the extra fields. +""" +from __future__ import annotations + +import argparse +import sys +from datetime import datetime +from pathlib import Path + +# VoxTerm package root (this file lives in /gui/). 
+_ROOT = str(Path(__file__).resolve().parent.parent) +if _ROOT not in sys.path: + sys.path.insert(0, _ROOT) + +import numpy as np # noqa: E402 + +import config # noqa: E402 +from audio.transcriber import get_transcriber # noqa: E402 +from audio.vad import SileroVAD # noqa: E402 +from audio.diarization.proxy import DiarizationProxy # noqa: E402 +from tui.events import EventLogger # noqa: E402 +from tui.app import VoxTerm # noqa: E402 (reuse only the static _split_text_by_segments) + +SR = config.SAMPLE_RATE # 16000 + + +def load_wav_16k_mono(path: Path) -> np.ndarray: + """Load any WAV as float32 mono @ 16 kHz (the live-capture format).""" + import soundfile as sf + data, sr = sf.read(str(path), dtype="float32", always_2d=False) + if getattr(data, "ndim", 1) > 1: + data = data.mean(axis=1) + if sr != SR: + from scipy.signal import resample_poly + data = resample_poly(data, SR, sr).astype(np.float32) + return np.ascontiguousarray(data, dtype=np.float32) + + +def _fmt_hms(seconds: float) -> str: + s = int(seconds) + h, rem = divmod(s, 3600) + m, sec = divmod(rem, 60) + return f"{h}:{m:02d}:{sec:02d}" if h else f"{m:02d}:{sec:02d}" + + +def transcribe_audio(audio: np.ndarray, out_dir: Path, *, model: str = "fw-base", + language: str = "en", progress=None) -> dict: + """Transcribe a float32/16k mono buffer. Returns + {events_path, transcript_path, n_turns, n_speakers}. 
``progress(frac, msg)`` is + called 0..1 as windows complete (optional, for a live UI).""" + out_dir = Path(out_dir) + out_dir.mkdir(parents=True, exist_ok=True) + + if progress: + progress(0.02, "loading engine") + tr = get_transcriber(model, language=language) + tr.load() + vad = SileroVAD() + diar = DiarizationProxy() + diar.load() + diar.reset_session() + + session_start = datetime.now() + base = session_start.strftime("%Y-%m-%d_%H%M%S") + ts, n = base, 1 + while (out_dir / f"{ts}-events.jsonl").exists() or (out_dir / f"{ts}-transcript.md").exists(): + n += 1 + ts = f"{base}-{n}" + events_path = out_dir / f"{ts}-events.jsonl" + md_path = out_dir / f"{ts}-transcript.md" + + ev = EventLogger(events_path) + ev.open() + md = md_path.open("x", encoding="utf-8") + last_sid, n_turns = 0, 0 + speakers: dict[int, str] = {} + try: + md.write("# VoxTerm Transcript\n\n") + md.write(f"- **Date:** {session_start.strftime('%A, %B %d, %Y')}\n") + md.write(f"- **Started:** {session_start.strftime('%I:%M %p')}\n") + md.write(f"- **Model:** {model}\n") + md.write(f"- **Language:** {config.AVAILABLE_LANGUAGES.get(language, language)}\n") + md.write("\n---\n\n") + ev.emit("session", phase="start", model=model, language=language) + ev.emit("recording", on=True) + + windows = vad.get_speech_segments(audio, min_speech_ms=500, min_silence_ms=300, max_speech_s=6.0) + total = max(1, len(windows)) + for wi, (s, e) in enumerate(windows): + if progress: + progress(0.05 + 0.92 * wi / total, f"segment {wi + 1}/{total}") + clip = audio[s:e] + out = tr.transcribe(clip) + text = (out.get("text") or "").strip() + if not text: + continue + ev.emit("vad", on=True) + if len(clip) >= 48000: + segments = diar.identify_segments(clip.copy()) + else: + lbl, sid = diar.identify(clip.copy()) + segments = [(lbl, sid, 0, len(clip))] + if not segments: + segments = [("", 0, 0, len(clip))] + if len(segments) > 1: + parts = VoxTerm._split_text_by_segments(text, segments) + else: + parts = [(text, 
segments[0][0], segments[0][1])] + for (seg_text, label, sid), (_l, _s, seg_start, seg_end) in zip(parts, segments): + seg_text = seg_text.strip() + if not seg_text: + continue + color = diar.get_speaker_color(sid) if sid else "" + if sid and sid not in speakers: + speakers[sid] = label or f"Speaker {sid}" + audio_offset = round((s + seg_start) / SR, 2) + audio_end = round((s + seg_end) / SR, 2) + if sid != last_sid: + ev.emit("speaker", speaker_id=sid, label=label, color=color) + last_sid = sid + ev.emit("text", speaker=label, speaker_id=sid, color=color, text=seg_text, + confidence="", overlap=False, audio_offset=audio_offset, audio_end=audio_end) + stamp = _fmt_hms(audio_offset) + md.write(f"**[{stamp}]** **{label}:** {seg_text}\n\n" if label else f"**[{stamp}]** {seg_text}\n\n") + n_turns += 1 + ev.emit("vad", on=False) + finally: + for _c in (lambda: ev.emit("recording", on=False), lambda: ev.emit("session", phase="end"), ev.close, md.close): + try: + _c() + except Exception: + pass + if progress: + progress(1.0, "done") + return {"events_path": str(events_path), "transcript_path": str(md_path), + "n_turns": n_turns, "n_speakers": len(speakers)} + + +def transcribe_wav(wav_path, out_dir, *, model="fw-base", language="en", progress=None) -> dict: + return transcribe_audio(load_wav_16k_mono(Path(wav_path)), Path(out_dir), + model=model, language=language, progress=progress) + + +def main(argv=None) -> int: + ap = argparse.ArgumentParser(description="Headless VoxTerm transcription of a WAV file.") + ap.add_argument("wav") + ap.add_argument("--out-dir", default=str(Path.home() / "voxterm-live")) + ap.add_argument("--model", default="fw-base") + ap.add_argument("--language", default="en") + args = ap.parse_args(argv) + if not Path(args.wav).exists(): + print(f"error: no such file: {args.wav}", file=sys.stderr) + return 2 + + def prog(f, m): + print(f" [{int(f*100):3d}%] {m}", flush=True) + r = transcribe_wav(args.wav, args.out_dir, model=args.model, 
language=args.language, progress=prog) + print(f"done: {r['n_turns']} turns, {r['n_speakers']} speaker(s)") + print(f"EVENTS={r['events_path']}") + print(f"TRANSCRIPT={r['transcript_path']}") + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) From 9a6de2b6a4285e0b14c8e1e9fee302d99ca85d8a Mon Sep 17 00:00:00 2001 From: NubsCarson <192162056+NubsCarson@users.noreply.github.com> Date: Thu, 4 Jun 2026 06:41:40 +0000 Subject: [PATCH 02/60] =?UTF-8?q?fix(gui):=20apply=20adversarial-review=20?= =?UTF-8?q?findings=20(11=20confirmed)=20=E2=80=94=20security=20+=20correc?= =?UTF-8?q?tness?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Review of the GUI (16 agents) found 11 real issues, all fixed + verified: - BLOCKER: strict CSP (style-src 'self', no 'unsafe-inline') silently blocked every element.style the UI sets (level ring, progress bar, speaker color dots) — the core visuals. Allow 'unsafe-inline' for style-src (all interpolated values are escaped). - MAJOR (security): LAN mode (VOXTERM_GUI_LAN=1) had zero auth — anyone on the wifi could start a recording of the room or read past transcripts. Now requires a token (generated/printed on start, or VOXTERM_GUI_TOKEN) on every /api/* call; loopback stays open. Verified: no-token/bad-token -> 401, valid -> 200. - MAJOR (perf): the transcriber/VAD/diarizer were reloaded from disk every recording. Cache them (lock-guarded) in gui.transcribe; reset the diarizer session per run. - MAJOR (xss): unescaped speaker rename/label in the legend innerHTML -> escapeHtml. - MAJOR (correctness): hand-built YAML in the client export broke / allowed key injection on a rename/peer_name with a quote or newline -> JSON.stringify scalars (mirrors the server's _yaml_scalar). - MAJOR (crash): Download .md/.json threw on a raw-markdown fallback session (CUR null) -> guard the handlers. 
- MINOR: dir-aware artifact resolution (same stem in two dirs returned the wrong file); poll-thread appends under the lock + join the thread WITHOUT holding it (avoids a deadlock) so trailing audio isn't dropped; SSE counter guarded by a lock; session-stem escaped in the sidebar; flush startup prints so the LAN token is visible immediately. - NIT: start_recording wraps mic-open in try/except -> structured {ok:false,error} so a busy/missing mic shows a real message instead of a 500. Verified: py_compile + node --check clean; export tests 23/23; CSP header correct; the loopback load flow against a real 53-turn session; LAN 401/200/401; and a live record -> stop -> transcribe -> export run through the engine (graceful 0-turn on a near-silent clip). 2 review findings correctly refuted (malformed Content-Length is cosmetic; nav a11y is enhancement). --- gui/engine.py | 49 +++++++++++++++++++++++++---------------- gui/server.py | 55 ++++++++++++++++++++++++++++++++++++----------- gui/static/app.js | 40 ++++++++++++++++++++++------------ gui/transcribe.py | 32 ++++++++++++++++++++++----- 4 files changed, 126 insertions(+), 50 deletions(-) diff --git a/gui/engine.py b/gui/engine.py index 9a87030..b5943b1 100644 --- a/gui/engine.py +++ b/gui/engine.py @@ -69,8 +69,13 @@ def start_recording(self) -> dict: if self.recording: return {"ok": True, "already": True} from audio.capture import AudioCapture - self._cap = AudioCapture() - self._cap.start() + try: + self._cap = AudioCapture() + self._cap.start() + except Exception as e: # no input device / busy / permission + self._cap = None + self.recording = False + return {"ok": False, "error": f"could not open the microphone: {e}"} self._chunks = [] self._stop.clear() self.recording = True @@ -86,31 +91,33 @@ def _poll(self): chunks = self._cap.drain() except Exception: chunks = [] - for c in chunks: - if c is not None and len(c): - self._chunks.append(np.asarray(c, dtype=np.float32)) - if chunks: - last = np.asarray(chunks[-1], 
dtype=np.float32) + fresh = [np.asarray(c, dtype=np.float32) for c in chunks if c is not None and len(c)] + if fresh: + with self._lock: # serialize with stop's concat/clear + self._chunks.extend(fresh) + last = fresh[-1] if len(last): self.level = float(np.sqrt(np.mean(np.square(last)))) time.sleep(0.066) # ~15 Hz def stop_recording(self, model: str = "fw-small", language: str = "en") -> dict: + if not self.recording: + return {"ok": False, "error": "not recording"} + # Signal + join the poll thread WITHOUT holding self._lock (the poll thread takes + # the lock to append, so holding it here would deadlock). Once joined, no more + # appends can race the final drain/concat/clear. + self._stop.set() + if self._poll_thread: + self._poll_thread.join(timeout=5) with self._lock: - if not self.recording: - return {"ok": False, "error": "not recording"} - self._stop.set() - if self._poll_thread: - self._poll_thread.join(timeout=3) + self.recording = False try: - rest = self._cap.drain() - for c in rest: + for c in self._cap.drain(): if c is not None and len(c): self._chunks.append(np.asarray(c, dtype=np.float32)) self._cap.stop() except Exception: pass - self.recording = False audio = np.concatenate(self._chunks).astype(np.float32) if self._chunks else np.zeros(0, dtype=np.float32) self._chunks = [] if len(audio) < SR // 2: # < 0.5s @@ -189,21 +196,25 @@ def sessions(self) -> list[dict]: items = sorted(out.values(), key=lambda x: x.get("mtime", 0), reverse=True) return items - def _resolve(self, stem: str, suffix: str) -> Path | None: + def _resolve(self, stem: str, suffix: str, only_dir: str | None = None) -> Path | None: # prevent traversal: stem must be a bare name if "/" in stem or ".." 
in stem: return None - for d in self._session_dirs(): + dirs = self._session_dirs() + if only_dir: # restrict to that dir IFF it's a known session dir (no traversal) + od = Path(only_dir) + dirs = [d for d in dirs if d == od] + for d in dirs: p = d / f"{stem}{suffix}" if p.exists(): return p return None - def read_artifact(self, stem: str, kind: str) -> dict: + def read_artifact(self, stem: str, kind: str, dir: str | None = None) -> dict: suffix = {"transcript": "-transcript.md", "agent_md": "-agent.md", "agent_json": "-agent.json"}.get(kind) if not suffix: return {"ok": False, "error": "bad kind"} - p = self._resolve(stem, suffix) + p = self._resolve(stem, suffix, only_dir=dir) if not p: return {"ok": False, "error": "not found"} return {"ok": True, "stem": stem, "kind": kind, "path": str(p), "text": p.read_text(encoding="utf-8")} diff --git a/gui/server.py b/gui/server.py index 1d36539..df67f92 100644 --- a/gui/server.py +++ b/gui/server.py @@ -11,7 +11,9 @@ import json import os +import secrets import sys +import threading import time from http.server import BaseHTTPRequestHandler, ThreadingHTTPServer from pathlib import Path @@ -27,13 +29,22 @@ DEFAULT_PORT = 8740 MAX_BODY = 64 * 1024 # API requests are tiny; bound them MAX_SSE = 8 # cap concurrent status streams +_sse_lock = threading.Lock() _sse_count = 0 +# When LAN-exposed (VOXTERM_GUI_LAN=1) every /api/* call must carry this token — +# without it, anyone on the wifi could start a recording of the room or read past +# transcripts. None = loopback (no token required). Set in main(). +TOKEN = None + ENGINE = Engine() # Strict CSP: same-origin only, no external anything (the UI is fully self-hosted). 
-CSP = ("default-src 'none'; script-src 'self'; style-src 'self'; img-src 'self' data:; " - "connect-src 'self'; font-src 'self'; base-uri 'none'; form-action 'none'") +# style-src allows 'unsafe-inline' because the UI sets element.style (the live level +# ring, the progress bar) and per-speaker color dots; all interpolated values are +# escaped (app.js escapeHtml) and the data is local, so the exposure is minimal. +CSP = ("default-src 'none'; script-src 'self'; style-src 'self' 'unsafe-inline'; " + "img-src 'self' data:; connect-src 'self'; font-src 'self'; base-uri 'none'; form-action 'none'") _CTYPES = {".html": "text/html; charset=utf-8", ".js": "text/javascript; charset=utf-8", ".css": "text/css; charset=utf-8", ".svg": "image/svg+xml", ".json": "application/json"} @@ -68,10 +79,21 @@ def _read_json(self) -> dict: def log_message(self, *a): # quiet pass + def _authed(self, q) -> bool: + """Token check for /api/* when LAN-exposed. Loopback (TOKEN is None) is open.""" + if TOKEN is None: + return True + given = (self.headers.get("X-VoxTerm-Token") + or (self.headers.get("Authorization") or "").removeprefix("Bearer ").strip() + or (q.get("token") or [""])[0]) + return bool(given) and secrets.compare_digest(given, TOKEN) + # ---- GET ---- def do_GET(self): u = urlparse(self.path) p, q = u.path, parse_qs(u.query) + if p.startswith("/api/") and not self._authed(q): + return self._json({"error": "unauthorized"}, 401) if p == "/" or p == "/index.html": return self._serve_static("index.html") if p.startswith("/static/"): @@ -85,14 +107,18 @@ def do_GET(self): if p == "/api/session": stem = (q.get("stem") or [""])[0] kind = (q.get("kind") or ["transcript"])[0] - return self._json(ENGINE.read_artifact(stem, kind)) + d = (q.get("dir") or [None])[0] + return self._json(ENGINE.read_artifact(stem, kind, dir=d)) if p == "/api/events": return self._sse() return self._json({"error": "not found"}, 404) # ---- POST ---- def do_POST(self): - p = urlparse(self.path).path + u = 
urlparse(self.path) + p, q = u.path, parse_qs(u.query) + if p.startswith("/api/") and not self._authed(q): + return self._json({"error": "unauthorized"}, 401) if p == "/api/record/start": return self._json(ENGINE.start_recording()) if p == "/api/record/stop": @@ -121,9 +147,10 @@ def _serve_static(self, rel: str): def _sse(self): global _sse_count - if _sse_count >= MAX_SSE: - return self._json({"error": "too many streams"}, 429) - _sse_count += 1 + with _sse_lock: + if _sse_count >= MAX_SSE: + return self._json({"error": "too many streams"}, 429) + _sse_count += 1 try: self._hdr(200, "text/event-stream", {"Cache-Control": "no-cache", "Connection": "keep-alive"}) while True: @@ -134,21 +161,25 @@ def _sse(self): except (BrokenPipeError, ConnectionResetError): pass finally: - _sse_count -= 1 + with _sse_lock: + _sse_count -= 1 def main(argv=None) -> int: + global TOKEN lan = os.environ.get("VOXTERM_GUI_LAN") == "1" host = "0.0.0.0" if lan else "127.0.0.1" port = int(os.environ.get("VOXTERM_GUI_PORT", DEFAULT_PORT)) + if lan: + # Records a real room — never expose to the wifi without a secret. + TOKEN = os.environ.get("VOXTERM_GUI_TOKEN") or secrets.token_urlsafe(24) httpd = ThreadingHTTPServer((host, port), Handler) httpd.daemon_threads = True - where = f"http://{'' if lan else '127.0.0.1'}:{port}" - print(f"[voxterm-gui] serving {where}") if lan: - print("[voxterm-gui] LAN-exposed (VOXTERM_GUI_LAN=1) — reachable from your phone on this wifi.") + print(f"[voxterm-gui] LAN-exposed (VOXTERM_GUI_LAN=1) — token REQUIRED on every /api call.", flush=True) + print(f"[voxterm-gui] open from your phone: http://:{port}/?token={TOKEN}", flush=True) else: - print("[voxterm-gui] loopback only. 
Set VOXTERM_GUI_LAN=1 to reach it from your phone.") + print(f"[voxterm-gui] serving http://127.0.0.1:{port} (loopback only; set VOXTERM_GUI_LAN=1 for phone access)", flush=True) try: httpd.serve_forever() except KeyboardInterrupt: diff --git a/gui/static/app.js b/gui/static/app.js index 87414fa..d1e5746 100644 --- a/gui/static/app.js +++ b/gui/static/app.js @@ -8,7 +8,14 @@ let RENAMES = {}; // speaker_id -> custom name (view + export) let lastJobState = "idle"; // ---------- helpers ---------- -async function getJSON(url, opts) { const r = await fetch(url, opts); return r.json(); } +// When opened via http://host/?token=… (LAN mode) every API call carries the token. +const TOKEN = new URLSearchParams(location.search).get("token") || ""; +function authUrl(u) { return TOKEN ? u + (u.includes("?") ? "&" : "?") + "token=" + encodeURIComponent(TOKEN) : u; } +async function getJSON(url, opts) { + opts = opts || {}; + if (TOKEN) opts.headers = Object.assign({ "X-VoxTerm-Token": TOKEN }, opts.headers || {}); + const r = await fetch(url, opts); return r.json(); +} function toast(msg) { const t = $("toast"); t.textContent = msg; t.classList.remove("hidden"); clearTimeout(toast._t); toast._t = setTimeout(() => t.classList.add("hidden"), 2200); @@ -38,8 +45,8 @@ async function init() { $("refreshSessions").addEventListener("click", loadSessions); $("navToggle").addEventListener("click", () => document.body.classList.toggle("nav-open")); $("copyAgent").addEventListener("click", copyForAI); - $("dlMd").addEventListener("click", () => download(buildMarkdown(), `${CUR.session.id}-agent.md`, "text/markdown")); - $("dlJson").addEventListener("click", () => download(buildJson(), `${CUR.session.id}-agent.json`, "application/json")); + $("dlMd").addEventListener("click", () => { if (!CUR) return toast("Load an AI export first"); download(buildMarkdown(), `${CUR.session.id}-agent.md`, "text/markdown"); }); + $("dlJson").addEventListener("click", () => { if (!CUR) return toast("Load an 
AI export first"); download(buildJson(), `${CUR.session.id}-agent.json`, "application/json"); }); await loadSessions(); openEvents(); @@ -47,7 +54,7 @@ async function init() { // ---------- live status (SSE) ---------- function openEvents() { - const es = new EventSource("/api/events"); + const es = new EventSource(authUrl("/api/events")); es.onmessage = (e) => { let s; try { s = JSON.parse(e.data); } catch { return; } applyStatus(s); @@ -117,9 +124,9 @@ async function loadSessions() { sessions.forEach((s) => { const li = document.createElement("li"); li.className = "session"; li.dataset.stem = s.stem; const has = []; if (s.agent_md) has.push("AI"); if (s.transcript) has.push("md"); - li.innerHTML = `
    ${prettyStem(s.stem)}
    + li.innerHTML = `
    ${escapeHtml(prettyStem(s.stem))}
    ${has.map((h) => `${h}`).join("")}
    `; - li.addEventListener("click", () => loadSession(s.stem)); + li.addEventListener("click", () => loadSession(s.stem, s.dir)); ul.appendChild(li); }); } @@ -129,11 +136,12 @@ function prettyStem(stem) { return stem; } -async function loadSession(stem) { +async function loadSession(stem, dir) { + const dq = dir ? `&dir=${encodeURIComponent(dir)}` : ""; // prefer the structured JSON; fall back to the markdown if the AI export is missing - let res = await getJSON(`/api/session?stem=${encodeURIComponent(stem)}&kind=agent_json`); + let res = await getJSON(`/api/session?stem=${encodeURIComponent(stem)}&kind=agent_json${dq}`); if (!res.ok) { - res = await getJSON(`/api/session?stem=${encodeURIComponent(stem)}&kind=transcript`); + res = await getJSON(`/api/session?stem=${encodeURIComponent(stem)}&kind=transcript${dq}`); if (res.ok) return showRawMarkdown(stem, res.text); return toast("Could not load session"); } @@ -156,7 +164,7 @@ function render() { const leg = $("speakerLegend"); leg.innerHTML = ""; CUR.speakers.filter((sp) => !sp.peer).forEach((sp) => { const el = document.createElement("button"); el.className = "lg"; - el.innerHTML = `${RENAMES[sp.id] || sp.label}`; + el.innerHTML = `${escapeHtml(RENAMES[sp.id] || sp.label)}`; el.title = "Click to rename this speaker"; el.addEventListener("click", () => renameSpeaker(sp.id)); leg.appendChild(el); @@ -200,12 +208,16 @@ function buildJson() { } function buildMarkdown() { const s = CUR.session; + // JSON.stringify of a string is a valid YAML double-quoted scalar — mirrors the + // server's _yaml_scalar so a rename/peer_name with a quote or newline can't break + // the front-matter or inject keys. + const y = (v) => JSON.stringify(String(v == null ? "" : v)); const spk = CUR.speakers.map((sp) => sp.peer - ? 
` - { id: 0, label: "${sp.label}", turns: ${sp.turns}, peer: true, peer_name: "${sp.peer_name}" }` - : ` - { id: ${sp.id}, label: "${RENAMES[sp.id] || sp.label}", turns: ${sp.turns}, peer: false }`).join("\n"); + ? ` - { id: 0, label: ${y(sp.label)}, turns: ${sp.turns}, peer: true, peer_name: ${y(sp.peer_name)} }` + : ` - { id: ${sp.id}, label: ${y(RENAMES[sp.id] || sp.label)}, turns: ${sp.turns}, peer: false }`).join("\n"); const fm = ["---", "voxterm_export_version: 1", "kind: voxterm-transcript", - `session_id: "${s.id}"`, `date: ${(s.started_at || "").slice(0, 10) || "null"}`, - `duration: "${s.duration_hms || ""}"`, `model: "${s.model || ""}"`, `language: "${s.language || ""}"`, + `session_id: ${y(s.id)}`, `date: ${(s.started_at || "").slice(0, 10) || "null"}`, + `duration: ${y(s.duration_hms || "")}`, `model: ${y(s.model || "")}`, `language: ${y(s.language || "")}`, "speakers:", spk, `turns: ${CUR.turns.length}`, "notes:", ' - "Speaker labels are diarization clusters / your renames, not verified identities."', "---", ""].join("\n"); const body = ["> VoxTerm session — timestamps are [mm:ss] into the recording; [~]=uncertain, [overlap], [new-voice], [peer].", "", "## Transcript", ""]; diff --git a/gui/transcribe.py b/gui/transcribe.py index ac93f61..cdfcbb9 100644 --- a/gui/transcribe.py +++ b/gui/transcribe.py @@ -15,6 +15,7 @@ import argparse import sys +import threading from datetime import datetime from pathlib import Path @@ -34,6 +35,31 @@ SR = config.SAMPLE_RATE # 16000 +# Loaded engines are cached so a second recording doesn't reload the model (hundreds +# of MB) from disk. Acquisition is serialized; the diarizer's per-session state is +# reset per run. (The GUI runs one transcription at a time.) 
+_ENGINE_LOCK = threading.Lock() +_TR_CACHE: dict = {} +_VAD = None +_DIAR = None + + +def _get_engines(model: str, language: str): + global _VAD, _DIAR + with _ENGINE_LOCK: + key = (model, language) + tr = _TR_CACHE.get(key) + if tr is None: + tr = get_transcriber(model, language=language) + tr.load() + _TR_CACHE[key] = tr + if _VAD is None: + _VAD = SileroVAD() + if _DIAR is None: + _DIAR = DiarizationProxy() + _DIAR.load() + return tr, _VAD, _DIAR + def load_wav_16k_mono(path: Path) -> np.ndarray: """Load any WAV as float32 mono @ 16 kHz (the live-capture format).""" @@ -64,11 +90,7 @@ def transcribe_audio(audio: np.ndarray, out_dir: Path, *, model: str = "fw-base" if progress: progress(0.02, "loading engine") - tr = get_transcriber(model, language=language) - tr.load() - vad = SileroVAD() - diar = DiarizationProxy() - diar.load() + tr, vad, diar = _get_engines(model, language) # cached across calls diar.reset_session() session_start = datetime.now() From cd1bc25887e51c9449157183614359787cf476f0 Mon Sep 17 00:00:00 2001 From: NubsCarson <192162056+NubsCarson@users.noreply.github.com> Date: Thu, 4 Jun 2026 08:43:31 +0000 Subject: [PATCH 03/60] test+feat(gui): backend tests (engine+server), docs, and UX polish MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Recording-safe hardening (built while a live mic recording ran — file-only changes): - gui/test_engine.py (16 tests): non-mic engine paths — models()/languages(), _write_wav round-trip + clipping, sessions() discovery/ordering/flags across dirs, read_artifact/_resolve text + path-traversal rejection + only_dir restriction, idle status() shape. Isolated to temp dirs; never opens the mic or a model. 
- gui/test_server.py (14 tests): in-process server on an ephemeral port — static serving + content-types, traversal blocked (403), /api/options|status|sessions, 404 unknown route, and the full LAN-auth contract (no-token 401 / valid 200 / wrong 401 / header 200 / TOKEN=None open; static stays open). No /api/record POST. - gui/README.md: honest docs — what it is, how to run, the phone/LAN token flow, the privacy/security model, files + outputs, v1 features + labeled fast-follows. - UX polish (static/* only, API unchanged): a11y (aria-expanded synced, aria-live on status/toast, :focus-visible rings), keyboard (Space / r toggle record, Escape + outside-click close the mobile drawer, without hijacking focused controls), a "Summarize for AI" button (copies transcript prefixed with a ready-to-paste LLM summarization task), real mic-error toasts, an empty-sessions state, and export buttons disabled until a transcript is loaded. Verified (light, recording-safe): py_compile + node --check clean; all three gui suites green (23+16+14 = 53 tests); serve smoke confirms the UI + new control load. --- gui/README.md | 132 ++++++++++++++++++++++ gui/static/app.js | 78 ++++++++++++- gui/static/index.html | 7 +- gui/static/style.css | 9 ++ gui/test_engine.py | 251 ++++++++++++++++++++++++++++++++++++++++++ gui/test_server.py | 242 ++++++++++++++++++++++++++++++++++++++++ 6 files changed, 710 insertions(+), 9 deletions(-) create mode 100644 gui/README.md create mode 100644 gui/test_engine.py create mode 100644 gui/test_server.py diff --git a/gui/README.md b/gui/README.md new file mode 100644 index 0000000..f724d95 --- /dev/null +++ b/gui/README.md @@ -0,0 +1,132 @@ +# VoxTerm GUI + +A small web control app over VoxTerm's own engine. Hit a button to record, stop, +transcribe, and diarize; review the result; and export an AI-ready transcript — all +from a browser tab (including your phone, on your own network). + +It is a thin control surface, not a reimplementation. 
Recording uses VoxTerm's +`audio.capture.AudioCapture`; transcription drives the same transcriber + Silero VAD + +diarizer + `EventLogger` the TUI uses; the AI export is a pure function of the same +`events.jsonl` stream that the TUI emits. Nothing about the speech pipeline is +duplicated here. + +## What it does (v1) + +A single linear flow: + +1. **Record** — pick a model + language, hit the button, talk. +2. **Stop** — captured audio is written to a WAV. +3. **Transcribe + diarize** — runs in the background; a progress bar tracks it. +4. **Export** — produces an AI-ready `-agent.md` + `-agent.json` automatically. +5. **History** — every past session is listed in the sidebar; click to reopen. +6. **Rename** — relabel a diarized speaker; the rename flows into your copy/download. + +Review extras: a speaker legend, per-turn timestamps and uncertainty markers, a +"Copy for AI" button, and `.md` / `.json` downloads built client-side from the +loaded session (so your renames are included). + +## How to run + +```bash +python -m gui.server +# -> http://127.0.0.1:8740 (loopback only) +``` + +By default it binds `127.0.0.1` — reachable only from this machine. + +Optional env: + +| Var | Default | Effect | +|-----|---------|--------| +| `VOXTERM_GUI_PORT` | `8740` | listen port | +| `VOXTERM_GUI_LAN` | unset | `=1` binds `0.0.0.0` and requires a token (see below) | +| `VOXTERM_GUI_TOKEN` | auto | set your own LAN token; otherwise one is generated | + +### Phone / LAN access + +The app records a real room, so exposing it to the network is gated behind a token +that must be present on **every** `/api/*` call. + +```bash +VOXTERM_GUI_LAN=1 python -m gui.server +``` + +On start it prints the exact URL to open from your phone: + +``` +http://:8740/?token= +``` + +Open that URL on a device on the same network. The page reads `?token=…` from the URL +and attaches it to every API request and the status stream automatically. 
Without a +valid token, every `/api/*` call returns `401`. + +## Privacy and security model + +- **Loopback by default.** No token, no network exposure — only this machine can reach it. +- **Token-gated LAN.** With `VOXTERM_GUI_LAN=1`, every `/api/*` request must carry the + token (header `X-VoxTerm-Token`, `Authorization: Bearer …`, or `?token=…`), checked with + a constant-time compare. This guards both starting a recording of the room and reading + past transcripts. +- **Transcription is fully local.** Models run on this machine via VoxTerm's engine. + Nothing audio-related leaves the host. +- **No audio in any network payload.** The API moves JSON status, option lists, and text + artifacts only — never audio. WAVs stay on disk under `~/voxterm-live/`. +- **Strict CSP.** Same-origin only; no external scripts, fonts, images, or connections. + (`style-src` allows `'unsafe-inline'` for a few computed styles — the level ring, the + progress bar, speaker color dots — all from local, escaped data.) Plus + `X-Content-Type-Options: nosniff` and `Referrer-Policy: no-referrer`. +- **No path traversal.** Static files resolve within `static/` only; session lookups + reject non-bare stems and restrict any `dir` to a known session directory. + +## Files + +| File | Role | +|------|------| +| `server.py` | stdlib `http.server` — serves the UI, a tiny JSON API, and an SSE status stream; handles the loopback/LAN + token gate and CSP. No transcription logic. | +| `engine.py` | Control layer over VoxTerm's engine: start/stop recording (via `AudioCapture`), the background transcribe+export job, live level/status, and session-history listing/reads. | +| `transcribe.py` | Headless transcription: a WAV (or in-memory buffer) → a faithful `events.jsonl` + `-transcript.md`, reusing VoxTerm's transcriber, Silero VAD, diarizer, and `EventLogger`. Also a CLI: `python -m gui.transcribe ROOM.wav`. | +| `export.py` | Pure, replayable export of an `events.jsonl` → `-agent.md` (+ `-agent.json`). 
No audio, no live state. CLI: `python -m glass.export [events.jsonl]`. | +| `static/index.html`, `static/app.js`, `static/style.css` | The self-hosted single-page UI (record hero, progress bar, transcript view, sessions sidebar). | + +### Outputs + +Recordings and their artifacts land in `~/voxterm-live/`. The history sidebar also reads +VoxTerm's own session and live dirs. Per session: + +| Artifact | What it is | +|----------|------------| +| `-gui.wav` | the captured audio (local only) | +| `-events.jsonl` | the canonical VoxTerm event stream (the same one the TUI emits / glass tails) | +| `-transcript.md` | human-readable transcript with timestamps + speaker labels | +| `-agent.md` | AI-ready transcript: YAML front-matter, marker legend, one speaker-attributed, timestamped turn per line | +| `-agent.json` | typed, lossless companion the `-agent.md` is rendered from | + +`events.jsonl` is the source of truth: each line is one JSON object +(`{"t", "kind", …}`). The exporter is a pure reduction of that stream — `text` events +carry an `audio_offset`/`audio_end` so timestamps are true offsets into the recording. + +### API surface + +`GET /api/options` · `GET /api/status` · `GET /api/sessions` · `GET /api/session` · +`GET /api/events` (SSE) · `POST /api/record/start` · `POST /api/record/stop` · +`POST /api/transcribe` (transcribe an existing WAV). + +## Models and languages + +Models offered are VoxTerm's faster-whisper keys (`fw-tiny`, `fw-base`, `fw-small`, +`fw-medium`, `fw-large-v3`, `fw-distil-large-v3`); `fw-small` is the default. Languages +come from VoxTerm's `AVAILABLE_LANGUAGES` (default `en`). On CPU, the smaller `fw-*` +models are the practical choices. + +## Scope: what this is not (yet) + +v1 is deliberately the linear flow above (record → stop → transcribe → export → +history → rename). Planned fast-follows, not built here: + +- **Live word-by-word streaming** during recording (v1 transcribes after stop). 
+- **Party / P2P** multi-device sessions (the export already understands `peer` turns). +- **Hivemind** shared/aggregated sessions. +- **Merged view** across multiple sessions. +- **Speaker profiles** (persistent cross-session identities; v1 renames are per-view). +- **Tauri native / mobile wrapper** instead of the browser tab. diff --git a/gui/static/app.js b/gui/static/app.js index d1e5746..914d411 100644 --- a/gui/static/app.js +++ b/gui/static/app.js @@ -28,6 +28,11 @@ function fmtClock(sec) { : `${String(m).padStart(2, "0")}:${String(s).padStart(2, "0")}`; } function colorFor(sid) { return PALETTE[((sid || 0) % PALETTE.length + PALETTE.length) % PALETTE.length]; } +// Sidebar drawer (mobile): keep body class + aria-expanded in sync. +function setNav(open) { + document.body.classList.toggle("nav-open", open); + $("navToggle").setAttribute("aria-expanded", open ? "true" : "false"); +} function nameFor(turn) { if (turn.peer) return turn.peer_name ? `${turn.speaker} · ${turn.peer_name}` : turn.speaker; if (RENAMES[turn.speaker_id]) return RENAMES[turn.speaker_id]; @@ -43,15 +48,46 @@ async function init() { $("recBtn").addEventListener("click", toggleRecord); $("refreshSessions").addEventListener("click", loadSessions); - $("navToggle").addEventListener("click", () => document.body.classList.toggle("nav-open")); + $("navToggle").addEventListener("click", () => setNav(!document.body.classList.contains("nav-open"))); $("copyAgent").addEventListener("click", copyForAI); - $("dlMd").addEventListener("click", () => { if (!CUR) return toast("Load an AI export first"); download(buildMarkdown(), `${CUR.session.id}-agent.md`, "text/markdown"); }); - $("dlJson").addEventListener("click", () => { if (!CUR) return toast("Load an AI export first"); download(buildJson(), `${CUR.session.id}-agent.json`, "application/json"); }); + $("summarizeAi").addEventListener("click", summarizeForAI); + $("dlMd").addEventListener("click", () => { if (!CUR) return; download(buildMarkdown(), 
`${CUR.session.id}-agent.md`, "text/markdown"); }); + $("dlJson").addEventListener("click", () => { if (!CUR) return; download(buildJson(), `${CUR.session.id}-agent.json`, "application/json"); }); + setExportEnabled(false); + + // close the mobile drawer when clicking outside it (the toggle handles its own click) + document.addEventListener("click", (e) => { + if (!document.body.classList.contains("nav-open")) return; + if ($("sidebar").contains(e.target) || $("navToggle").contains(e.target)) return; + setNav(false); + }); + // global keyboard: Escape closes drawer; Space / r toggle record (not while typing in a control) + document.addEventListener("keydown", onKeydown); await loadSessions(); openEvents(); } +function onKeydown(e) { + if (e.key === "Escape") { setNav(false); return; } + const el = document.activeElement; + const tag = el && el.tagName; + const typing = tag === "SELECT" || tag === "INPUT" || tag === "TEXTAREA" || (el && el.isContentEditable); + if (typing || e.metaKey || e.ctrlKey || e.altKey) return; + const isSpace = e.code === "Space" || e.key === " "; + // Let Space activate a focused button/session instead of hijacking it for record/stop. + if (isSpace && el && (tag === "BUTTON" || el.getAttribute("role") === "button")) return; + if (isSpace || e.key === "r" || e.key === "R") { + e.preventDefault(); + toggleRecord(); + } +} + +// Export/copy actions are meaningless without a loaded transcript — disable them outright. +function setExportEnabled(on) { + ["copyAgent", "summarizeAi", "dlJson", "dlMd"].forEach((id) => { $(id).disabled = !on; }); +} + // ---------- live status (SSE) ---------- function openEvents() { const es = new EventSource(authUrl("/api/events")); @@ -105,7 +141,7 @@ async function toggleRecord() { const recording = document.body.classList.contains("recording"); if (!recording) { const r = await getJSON("/api/record/start", { method: "POST" }); - if (!r.ok) toast("Could not start (mic busy?)"); + if (!r.ok) toast(r.error ? 
"Mic error: " + r.error : "Could not start (mic busy?)"); } else { $("recBtn").disabled = true; await getJSON("/api/record/stop", { @@ -120,13 +156,15 @@ async function toggleRecord() { async function loadSessions() { const { sessions } = await getJSON("/api/sessions"); const ul = $("sessions"); ul.innerHTML = ""; - if (!sessions.length) { ul.innerHTML = `
  • No sessions yet.
  • `; return; } + if (!sessions.length) { ul.innerHTML = `
  • No sessions yet — record one to get started.
  • `; return; } sessions.forEach((s) => { const li = document.createElement("li"); li.className = "session"; li.dataset.stem = s.stem; + li.tabIndex = 0; li.setAttribute("role", "button"); const has = []; if (s.agent_md) has.push("AI"); if (s.transcript) has.push("md"); li.innerHTML = `
    ${escapeHtml(prettyStem(s.stem))}
    ${has.map((h) => `${h}`).join("")}
    `; li.addEventListener("click", () => loadSession(s.stem, s.dir)); + li.addEventListener("keydown", (e) => { if (e.key === "Enter" || e.key === " ") { e.preventDefault(); e.stopPropagation(); loadSession(s.stem, s.dir); } }); ul.appendChild(li); }); } @@ -149,13 +187,14 @@ async function loadSession(stem, dir) { RENAMES = {}; render(); document.querySelectorAll(".session").forEach((el) => el.classList.toggle("active", el.dataset.stem === stem)); - document.body.classList.remove("nav-open"); + setNav(false); } // ---------- render ---------- function render() { $("empty").classList.add("hidden"); $("transcriptView").classList.remove("hidden"); + setExportEnabled(true); const s = CUR.session; $("tvTitle").textContent = prettyStem(s.id); $("tvMeta").textContent = `${CUR.turns.length} turns · ${CUR.speakers.length} speaker(s) · ${s.duration_hms || ""} · ${s.model || ""}`; @@ -187,6 +226,7 @@ function render() { } function showRawMarkdown(stem, text) { CUR = null; + setExportEnabled(false); // no structured JSON behind a raw-markdown view $("empty").classList.add("hidden"); $("transcriptView").classList.remove("hidden"); $("tvTitle").textContent = prettyStem(stem); $("tvMeta").textContent = "(no AI export — raw transcript)"; $("speakerLegend").innerHTML = ""; @@ -236,6 +276,32 @@ async function copyForAI() { try { await navigator.clipboard.writeText(md); toast("Copied AI transcript to clipboard"); } catch { download(md, `${CUR.session.id}-agent.md`, "text/markdown"); toast("Clipboard blocked — downloaded instead"); } } +// A ready-to-paste prompt: a strong summarization instruction followed by the transcript. +function summaryPrompt() { + return [ + "## Task", + "", + "You are given a transcript of a recorded conversation (below). Read it in full, then produce:", + "", + "1. **Summary** — a concise overview (3-5 sentences) of what the conversation was about.", + "2. **Key decisions** — a bullet list of decisions reached, or \"None\" if there were none.", + "3. 
**Action items** — a bullet list of follow-ups, each with the owner if one is identifiable.", + "4. **Per-speaker highlights** — for each speaker, 1-2 bullets on their main points or positions.", + "", + "Stick to what the transcript actually says. Do not invent details. Speaker labels are diarization", + "clusters or manual renames, not verified identities — treat them as such.", + "", + "---", + "", + buildMarkdown(), + ].join("\n"); +} +async function summarizeForAI() { + if (!CUR) return toast("Load a transcript first"); + const text = summaryPrompt(); + try { await navigator.clipboard.writeText(text); toast("Copied summary prompt to clipboard"); } + catch { download(text, `${CUR.session.id}-summarize.md`, "text/markdown"); toast("Clipboard blocked — downloaded instead"); } +} function download(text, filename, mime) { const blob = new Blob([text], { type: mime }); const a = document.createElement("a"); diff --git a/gui/static/index.html b/gui/static/index.html index f6afa97..2f51784 100644 --- a/gui/static/index.html +++ b/gui/static/index.html @@ -8,7 +8,7 @@ - + diff --git a/gui/static/style.css b/gui/static/style.css index ad44216..fc23764 100644 --- a/gui/static/style.css +++ b/gui/static/style.css @@ -63,6 +63,8 @@ body { .session .s-sub { font-size: 11.5px; color: var(--faint); margin-top: 2px; display: flex; gap: 6px; } .tag { font-size: 10px; padding: 1px 6px; border-radius: 999px; background: #1f2630; color: var(--accent-dim); border: 1px solid var(--line); } .sessions-empty { color: var(--faint); font-size: 12.5px; line-height: 1.5; padding: 14px 12px; border: 1px dashed var(--line); border-radius: 10px; text-align: center; } +.session-search { width: 100%; box-sizing: border-box; margin: 0 0 10px; background: var(--bg-elev); color: var(--text); border: 1px solid var(--line); border-radius: 9px; padding: 7px 11px; font-size: 13px; } +.session-search::placeholder { color: var(--faint); } /* ---------- main ---------- */ .main { padding: 36px 
clamp(20px, 5vw, 56px); max-width: 980px; width: 100%; } From 9086a6349b6a1ac3ca60f9db2eb0434522356888 Mon Sep 17 00:00:00 2001 From: NubsCarson <192162056+NubsCarson@users.noreply.github.com> Date: Thu, 4 Jun 2026 10:55:28 +0000 Subject: [PATCH 10/60] feat(gui): live near-real-time transcription of an in-progress recording (gui/live.py) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Tails the raw PCM of a WAV being recorded and transcribes each new speech window with VoxTerm's engine, printing '[mm:ss] text' as the conversation happens. Reads the FILE, not the mic, so it runs alongside any recorder with zero contention. Text-only + fw-base default for low latency. CLI: python -m gui.live ROOM.wav [--model] [--interval] [--max-seconds]. Proven on a live recording (transcribed the active conversation in near-real-time). NOT yet wired into the GUI browser UI — that's the next step (stream lines over SSE to a live transcript panel). --- gui/live.py | 95 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 95 insertions(+) create mode 100644 gui/live.py diff --git a/gui/live.py b/gui/live.py new file mode 100644 index 0000000..513583a --- /dev/null +++ b/gui/live.py @@ -0,0 +1,95 @@ +"""Live / near-real-time transcription of an IN-PROGRESS recording. + +Tails the raw PCM of the WAV that `arecord` (or VoxTerm) is writing and transcribes +each new *speech* window with VoxTerm's own engine, printing "[mm:ss] text" as the +conversation happens. It reads the FILE, not the microphone, so it never contends with +the recorder — you can run it alongside a live capture. + + python -m gui.live ROOM.wav [--model fw-base] [--interval 10] [--max-seconds N] + +Text-only by default (no diarization) to stay light + low-latency; the full +speaker-attributed transcript comes from the post-stop pipeline. fw-base is the default +for speed (≈realtime-capable on CPU). Stops after --max-seconds (0 = until interrupted). 
+""" +from __future__ import annotations + +import argparse +import sys +import time +from pathlib import Path + +_ROOT = str(Path(__file__).resolve().parent.parent) +if _ROOT not in sys.path: + sys.path.insert(0, _ROOT) + +import numpy as np # noqa: E402 + +import config # noqa: E402 +from gui.transcribe import _get_engines, _fmt_hms # reuse the cached engine + time fmt # noqa: E402 + +SR = config.SAMPLE_RATE +_WAV_HEADER = 44 # bytes; raw little-endian s16 PCM follows + + +def main(argv=None) -> int: + ap = argparse.ArgumentParser(description="Live transcription of an in-progress WAV.") + ap.add_argument("wav") + ap.add_argument("--model", default="fw-base") + ap.add_argument("--language", default="en") + ap.add_argument("--interval", type=float, default=10.0, help="seconds between passes") + ap.add_argument("--max-seconds", type=float, default=0.0, help="stop after N seconds (0 = until interrupted)") + args = ap.parse_args(argv) + + wav = Path(args.wav) + if not wav.exists(): + print(f"error: no such file: {wav}", file=sys.stderr) + return 2 + + print(f"[live] loading {args.model} …", flush=True) + tr, vad, _diar = _get_engines(args.model, args.language) + print(f"[live] transcribing {wav.name} every {args.interval:.0f}s — reads the file, not the mic", flush=True) + + f = open(wav, "rb") + f.seek(_WAV_HEADER) + buf = np.zeros(0, dtype=np.float32) + abs_start = 0 # absolute sample index of buf[0] + started = time.time() + n_lines = 0 + try: + while True: + time.sleep(args.interval) + data = f.read() # everything appended since last read + if data: + n = len(data) - (len(data) % 2) # whole int16 samples only + if n: + buf = np.concatenate([buf, np.frombuffer(data[:n], dtype="= SR * 2: + segs = vad.get_speech_segments(buf, min_speech_ms=500, min_silence_ms=300, max_speech_s=6.0) + tail_guard = len(buf) - int(SR * 0.6) # leave the last ~0.6s as "still talking" + consumed = 0 + for (s, e) in segs: + if e > tail_guard: + break # this segment may still be growing — 
wait + out = tr.transcribe(buf[s:e]) + txt = (out.get("text") or "").strip() + if txt: + print(f" [{_fmt_hms((abs_start + s) / SR)}] {txt}", flush=True) + n_lines += 1 + consumed = e + if consumed: + abs_start += consumed + buf = buf[consumed:] + if args.max_seconds and (time.time() - started) >= args.max_seconds: + break + except KeyboardInterrupt: + pass + finally: + f.close() + print(f"[live] stopped — {n_lines} live lines transcribed", flush=True) + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) From 38aa103e67f3bc11f4a9e10a6f036d6e34cca1af Mon Sep 17 00:00:00 2001 From: NubsCarson <192162056+NubsCarson@users.noreply.github.com> Date: Thu, 4 Jun 2026 11:02:50 +0000 Subject: [PATCH 11/60] =?UTF-8?q?feat(gui):=20live=20transcription=20IN=20?= =?UTF-8?q?the=20GUI=20=E2=80=94=20tail=20an=20in-progress=20recording,=20?= =?UTF-8?q?stream=20to=20a=20panel?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Wires gui/live.py's near-real-time transcription into the browser UI so it's actually usable, not just a CLI: - engine.py: live_start/live_stop + a background tail-transcribe thread that follows the newest in-progress recording FROM the current end (true live — no slow backlog replay), transcribes finalized speech windows with the cached fw-base engine, and appends '[mm:ss] text' lines (capped) exposed via status().live. Reads the file, not the mic, so it runs alongside any recorder with zero contention. - server.py: POST /api/live/start (optional wav, defaults to newest) + /api/live/stop. - static: a '⦿ Live transcript' toggle + a streaming, auto-scrolling live panel (calm theme, pulsing dot); applyStatus renders status().live.lines. Verified end-to-end on a real in-progress recording: start -> lines appear within ~10-16s of new speech with correct audio timestamps (e.g. [39:19]…) -> stop. server tests 14/14; node --check clean. 
(Browser render of the panel is wired but visually confirmable only in a browser; the data path is proven.) --- gui/engine.py | 71 +++++++++++++++++++++++++++++++++++++++++++ gui/server.py | 5 +++ gui/static/app.js | 26 ++++++++++++++++ gui/static/index.html | 7 +++++ gui/static/style.css | 12 ++++++++ 5 files changed, 121 insertions(+) diff --git a/gui/engine.py b/gui/engine.py index 91f59ab..895a11b 100644 --- a/gui/engine.py +++ b/gui/engine.py @@ -54,6 +54,10 @@ def __init__(self, out_dir: Path = OUT_DIR): self.level = 0.0 self.started_at = None self.job = {"state": "idle"} # idle | transcribing | done | error + # live (near-real-time) monitor of an in-progress recording (reads the file) + self._live = {"active": False, "wav": None, "lines": []} + self._live_stop = threading.Event() + self._live_thread = None # ---- static option lists for the UI ---- def models(self) -> list[str]: @@ -160,8 +164,75 @@ def status(self) -> dict: "level": round(self.level, 4), "elapsed": round(time.time() - self.started_at, 1) if (self.recording and self.started_at) else 0, "job": self.job, + "live": {"active": self._live["active"], "wav": self._live["wav"], + "lines": self._live["lines"][-120:]}, } + # ---- live (near-real-time) monitor: tail an in-progress recording's file ---- + def _newest_wav(self): + wavs = sorted(self.out_dir.glob("*.wav"), key=lambda p: p.stat().st_mtime, reverse=True) + return wavs[0] if wavs else None + + def live_start(self, wav: str | None = None) -> dict: + with self._lock: + if self._live["active"]: + return {"ok": True, "already": True, "wav": self._live["wav"]} + target = Path(wav) if wav else self._newest_wav() + if not target or not target.exists(): + return {"ok": False, "error": "no recording to monitor"} + self._live = {"active": True, "wav": str(target), "lines": []} + self._live_stop.clear() + self._live_thread = threading.Thread(target=self._live_loop, args=(target,), daemon=True, name="gui-live") + self._live_thread.start() + return 
{"ok": True, "wav": str(target)} + + def live_stop(self) -> dict: + self._live_stop.set() + if self._live_thread: + self._live_thread.join(timeout=3) + self._live["active"] = False + return {"ok": True} + + def _live_loop(self, wav: Path): + # tail raw PCM of the (still-growing) WAV, transcribe finalized speech windows + from gui.transcribe import _get_engines, _fmt_hms + try: + tr, vad, _d = _get_engines("fw-base", "en") + except Exception as e: + self._live["lines"].append({"t": "", "text": f"(live engine error: {e})"}) + self._live["active"] = False + return + f = open(wav, "rb") + f.seek(0, 2) # tail from the CURRENT end — only NEW speech (true live, no slow backlog replay) + abs_start = max(0, (f.tell() - 44) // 2) # samples already recorded before we started (for timestamps) + buf = np.zeros(0, dtype=np.float32) + try: + while not self._live_stop.is_set(): + self._live_stop.wait(8.0) + data = f.read() + if data: + n = len(data) - (len(data) % 2) + if n: + buf = np.concatenate([buf, np.frombuffer(data[:n], dtype="= SR * 2: + segs = vad.get_speech_segments(buf, min_speech_ms=500, min_silence_ms=300, max_speech_s=6.0) + guard = len(buf) - int(SR * 0.6) + consumed = 0 + for (s, e) in segs: + if e > guard: + break + txt = (tr.transcribe(buf[s:e]).get("text") or "").strip() + if txt: + self._live["lines"].append({"t": _fmt_hms((abs_start + s) / SR), "text": txt}) + self._live["lines"] = self._live["lines"][-200:] + consumed = e + if consumed: + abs_start += consumed + buf = buf[consumed:] + finally: + f.close() + self._live["active"] = False + # ---- session history ---- def _session_dirs(self) -> list[Path]: dirs = [self.out_dir] diff --git a/gui/server.py b/gui/server.py index 5e2467c..15db50f 100644 --- a/gui/server.py +++ b/gui/server.py @@ -135,6 +135,11 @@ def do_POST(self): b = self._read_json() return self._json(ENGINE.transcribe_existing(b.get("wav", ""), model=b.get("model", "fw-small"), language=b.get("language", "en"))) + if p == "/api/live/start": + b 
= self._read_json() + return self._json(ENGINE.live_start(b.get("wav"))) + if p == "/api/live/stop": + return self._json(ENGINE.live_stop()) return self._json({"error": "not found"}, 404) def _serve_static(self, rel: str): diff --git a/gui/static/app.js b/gui/static/app.js index c7a4b2e..909e9a6 100644 --- a/gui/static/app.js +++ b/gui/static/app.js @@ -68,6 +68,7 @@ async function init() { lSel.addEventListener("change", () => lsSet(LS_LANG, lSel.value)); $("recBtn").addEventListener("click", toggleRecord); + $("liveToggle").addEventListener("click", toggleLive); $("refreshSessions").addEventListener("click", loadSessions); $("sessionSearch").addEventListener("input", (e) => renderSessions(e.target.value)); $("navToggle").addEventListener("click", () => setNav(!document.body.classList.contains("nav-open"))); @@ -164,9 +165,34 @@ function applyStatus(s) { toast("Error: " + (job.error || "transcription failed")); $("recState").textContent = "Ready to record"; } + // live transcript panel (near-real-time tail of an in-progress recording) + const live = s.live || { active: false, lines: [] }; + document.body.classList.toggle("live-on", !!live.active); + $("liveToggle").textContent = live.active ? "■ Stop live" : "⦿ Live transcript"; + if (live.active) { + $("liveView").classList.remove("hidden"); + $("liveMeta").textContent = live.wav ? "· " + live.wav.split("/").pop() : ""; + const lines = live.lines || []; + const el = $("liveLines"); + const atBottom = el.scrollHeight - el.scrollTop - el.clientHeight < 40; + el.innerHTML = lines.length + ? lines.map((l) => `
    ${escapeHtml(l.t)}${escapeHtml(l.text)}
    `).join("") + : `
    listening…
    `; + if (atBottom) el.scrollTop = el.scrollHeight; // keep pinned to newest unless scrolled up + } else { + $("liveView").classList.add("hidden"); + } lastJobState = job.state; } +// ---------- live transcript ---------- +async function toggleLive() { + const on = document.body.classList.contains("live-on"); + const r = await getJSON(on ? "/api/live/stop" : "/api/live/start", + { method: "POST", headers: { "Content-Type": "application/json" }, body: "{}" }); + if (!on && r && r.ok === false) toast(r.error ? "Live: " + r.error : "Could not start live"); +} + // ---------- record ---------- async function toggleRecord() { const recording = document.body.classList.contains("recording"); diff --git a/gui/static/index.html b/gui/static/index.html index faedecf..2d7e7ce 100644 --- a/gui/static/index.html +++ b/gui/static/index.html @@ -53,6 +53,13 @@

    Sessions

    Pick a model, hit record, talk. Stop when done — you’ll get a clean transcript and an AI-ready export.

    + + + + + diff --git a/gui/static/style.css b/gui/static/style.css index fc23764..3bf83a9 100644 --- a/gui/static/style.css +++ b/gui/static/style.css @@ -157,6 +157,18 @@ body.working .rec-btn { cursor: progress; filter: grayscale(.5) brightness(.78); /* ---------- misc ---------- */ .empty { color: var(--faint); text-align: center; padding: 60px 20px; font-size: 14px; } + +/* ---------- live transcript ---------- */ +.live-toggle { margin-top: 18px; } +body.live-on .live-toggle { background: var(--rec); color: #fff; border-color: transparent; } +.live-view { background: var(--bg-elev); border: 1px solid var(--line); border-radius: var(--radius); padding: 16px 18px; margin: 8px 0 24px; box-shadow: var(--shadow); } +.live-head { display: flex; align-items: center; gap: 8px; font-size: 13px; font-weight: 600; color: var(--muted); margin-bottom: 10px; } +.live-dot { width: 9px; height: 9px; border-radius: 50%; background: var(--rec); box-shadow: 0 0 8px var(--rec); animation: pulse 1.6s ease-in-out infinite; } +.live-lines { max-height: 320px; overflow-y: auto; display: flex; flex-direction: column; gap: 4px; font-size: 14px; line-height: 1.5; } +.live-lines .ll { color: var(--text); } +.live-lines .ll-t { color: var(--faint); font-variant-numeric: tabular-nums; font-size: 12px; margin-right: 8px; } +.live-lines .ll-empty { color: var(--faint); font-style: italic; } + .hidden { display: none !important; } .toast { position: fixed; bottom: 22px; left: 50%; transform: translateX(-50%); background: var(--bg-elev-2); color: var(--text); border: 1px solid var(--line); padding: 10px 18px; border-radius: 10px; box-shadow: var(--shadow); font-size: 13.5px; z-index: 50; animation: fade .2s ease; } .nav-toggle { display: none; position: fixed; top: 14px; left: 14px; z-index: 40; background: var(--bg-elev); color: var(--text); border: 1px solid var(--line); border-radius: 10px; width: 40px; height: 40px; font-size: 18px; cursor: pointer; } From 
7455b4f9f04dcaa84f4a9924dd5da3799e6ac839 Mon Sep 17 00:00:00 2001 From: NubsCarson <192162056+NubsCarson@users.noreply.github.com> Date: Thu, 4 Jun 2026 12:36:38 +0000 Subject: [PATCH 12/60] feat(gui): session delete (traversal-safe, confirm) + fix stale status test MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - engine.delete_session(stem, dir): removes only a session's text artifacts (-transcript/-agent.{md,json,srt,vtt}/-events.jsonl) for the stem, reusing _resolve's traversal guard + _session_dirs/only_dir restriction; never touches .wav (audio kept). - POST /api/session/delete (behind the LAN-auth gate). - UI: a subtle ✕ on each session row (confirm; stopPropagation so it can't trigger open; clears the view if the open session is deleted). - test_engine: +6 delete tests (exact-files, traversal rejected, dir-restricted, .wav untouched, missing-stem ok); fixed test_status_idle_shape to expect the 'live' key added by the prior live-transcription commit. gui suites green (export 37 / engine 22 / server 14). --- gui/engine.py | 26 +++++++++++++ gui/server.py | 3 ++ gui/static/app.js | 29 ++++++++++++++- gui/static/style.css | 10 +++++ gui/test_engine.py | 87 +++++++++++++++++++++++++++++++++++++++++++- 5 files changed, 152 insertions(+), 3 deletions(-) diff --git a/gui/engine.py b/gui/engine.py index 895a11b..cdd4109 100644 --- a/gui/engine.py +++ b/gui/engine.py @@ -282,6 +282,32 @@ def _resolve(self, stem: str, suffix: str, only_dir: str | None = None) -> Path return p return None + # text artifacts a session owns (audio .wav is managed separately and never touched) + _ARTIFACT_SUFFIXES = ["-transcript.md", "-agent.md", "-agent.json", + "-agent.srt", "-agent.vtt", "-events.jsonl"] + + def delete_session(self, stem: str, dir: str | None = None) -> dict: + """Remove ONLY this session's text artifacts for ``stem``. + + Reuses _resolve's traversal guard (reject '/' or '..' 
in the stem) and resolves + strictly within _session_dirs() (honoring the optional ``dir`` like _resolve's + only_dir). Deletes only files that exist; never touches .wav audio or anything + outside a known session dir. Returns the list of deleted filenames. + """ + # SAME guard as _resolve: stem must be a bare name (no traversal) + if "/" in stem or ".." in stem: + return {"ok": False, "error": "bad stem", "deleted": []} + deleted: list[str] = [] + for suffix in self._ARTIFACT_SUFFIXES: + p = self._resolve(stem, suffix, only_dir=dir) # resolves within known dirs only + if p and p.is_file(): + try: + p.unlink() + deleted.append(p.name) + except OSError: + pass + return {"ok": True, "deleted": deleted} + def read_artifact(self, stem: str, kind: str, dir: str | None = None) -> dict: suffix = {"transcript": "-transcript.md", "agent_md": "-agent.md", "agent_json": "-agent.json", "srt": "-agent.srt", "vtt": "-agent.vtt"}.get(kind) diff --git a/gui/server.py b/gui/server.py index 15db50f..a80f5bb 100644 --- a/gui/server.py +++ b/gui/server.py @@ -140,6 +140,9 @@ def do_POST(self): return self._json(ENGINE.live_start(b.get("wav"))) if p == "/api/live/stop": return self._json(ENGINE.live_stop()) + if p == "/api/session/delete": + b = self._read_json() + return self._json(ENGINE.delete_session(b.get("stem", ""), dir=b.get("dir"))) return self._json({"error": "not found"}, 404) def _serve_static(self, rel: str): diff --git a/gui/static/app.js b/gui/static/app.js index 909e9a6..c38b3c0 100644 --- a/gui/static/app.js +++ b/gui/static/app.js @@ -226,10 +226,15 @@ function renderSessions(query) { const li = document.createElement("li"); li.className = "session"; li.dataset.stem = s.stem; li.tabIndex = 0; li.setAttribute("role", "button"); const has = []; if (s.agent_md) has.push("AI"); if (s.transcript) has.push("md"); - li.innerHTML = `
    ${escapeHtml(prettyStem(s.stem))}
    -
    ${has.map((h) => `${h}`).join("")}
    `; + li.innerHTML = `
    ${escapeHtml(prettyStem(s.stem))}
    +
    ${has.map((h) => `${h}`).join("")}
    + `; li.addEventListener("click", () => loadSession(s.stem, s.dir)); li.addEventListener("keydown", (e) => { if (e.key === "Enter" || e.key === " ") { e.preventDefault(); e.stopPropagation(); loadSession(s.stem, s.dir); } }); + const del = li.querySelector(".session-del"); + // a focused ✕ must not let Space/Enter bubble up to the row (which would loadSession) + del.addEventListener("keydown", (e) => { e.stopPropagation(); }); + del.addEventListener("click", (e) => { e.stopPropagation(); deleteSession(s.stem, s.dir); }); ul.appendChild(li); }); } @@ -255,6 +260,26 @@ async function loadSession(stem, dir) { setNav(false); } +// Delete a session's transcript files (audio is kept). Confirm, POST, then refresh +// the list and clear the transcript view if the deleted session was the one open. +async function deleteSession(stem, dir) { + if (!confirm("Delete this session's transcript files? (audio is kept)")) return; + const r = await getJSON("/api/session/delete", { + method: "POST", headers: { "Content-Type": "application/json" }, + body: JSON.stringify({ stem: stem, dir: dir || null }), + }); + if (!r || r.ok === false) return toast("Could not delete session"); + const wasOpen = CUR && CUR.session && CUR.session.id === stem; + toast(r.deleted && r.deleted.length ? 
`Deleted ${r.deleted.length} file(s)` : "Nothing to delete"); + await loadSessions(); + if (wasOpen) { + CUR = null; RENAMES = {}; + setExportEnabled(false); + $("transcriptView").classList.add("hidden"); + $("empty").classList.remove("hidden"); + } +} + // ---------- render ---------- function render() { $("empty").classList.add("hidden"); diff --git a/gui/static/style.css b/gui/static/style.css index 3bf83a9..ef054ee 100644 --- a/gui/static/style.css +++ b/gui/static/style.css @@ -56,11 +56,21 @@ body { .session { padding: 10px 12px; border-radius: 10px; cursor: pointer; border: 1px solid transparent; transition: background .15s, border-color .15s; + display: flex; align-items: center; gap: 8px; } .session:hover { background: var(--bg-elev-2); } .session.active { background: var(--bg-elev-2); border-color: var(--line); } +.session .s-main { flex: 1; min-width: 0; } .session .s-title { font-size: 13.5px; font-weight: 550; } .session .s-sub { font-size: 11.5px; color: var(--faint); margin-top: 2px; display: flex; gap: 6px; } +/* subtle ✕ — muted until the row is hovered/focused, then it pops; hover -> record color */ +.session-del { + flex: none; background: none; border: none; cursor: pointer; color: var(--muted); + font-size: 13px; line-height: 1; padding: 4px 6px; border-radius: 7px; + opacity: 0; transition: opacity .15s, color .15s, background .15s; +} +.session:hover .session-del, .session:focus-within .session-del { opacity: .55; } +.session-del:hover, .session-del:focus-visible { opacity: 1; color: var(--rec); background: var(--bg-elev); } .tag { font-size: 10px; padding: 1px 6px; border-radius: 999px; background: #1f2630; color: var(--accent-dim); border: 1px solid var(--line); } .sessions-empty { color: var(--faint); font-size: 12.5px; line-height: 1.5; padding: 14px 12px; border: 1px dashed var(--line); border-radius: 10px; text-align: center; } .session-search { width: 100%; box-sizing: border-box; margin: 0 0 10px; background: var(--bg-elev); color: 
var(--text); border: 1px solid var(--line); border-radius: 9px; padding: 7px 11px; font-size: 13px; } diff --git a/gui/test_engine.py b/gui/test_engine.py index 7b4dd98..be3a067 100644 --- a/gui/test_engine.py +++ b/gui/test_engine.py @@ -216,16 +216,101 @@ def test_session_dirs_excludes_nonexistent_and_dedups(): assert out in dirs and sess in dirs and live in dirs +# --- delete_session ---------------------------------------------------------- + +def test_delete_session_removes_only_its_artifacts(): + eng, out, *_ = _isolated_engine() + # every text artifact kind for the target stem + for suf in ["-transcript.md", "-agent.md", "-agent.json", "-agent.srt", "-agent.vtt", "-events.jsonl"]: + _touch(out / f"s1{suf}") + # things that must SURVIVE: this session's audio, and another whole session + _touch(out / "s1.wav", text="AUDIO") + _touch(out / "s2-transcript.md", text="other") + _touch(out / "s2.wav", text="OTHER AUDIO") + r = eng.delete_session("s1") + assert r["ok"] is True + assert set(r["deleted"]) == { + "s1-transcript.md", "s1-agent.md", "s1-agent.json", + "s1-agent.srt", "s1-agent.vtt", "s1-events.jsonl", + }, r["deleted"] + # all six artifacts are gone + for suf in ["-transcript.md", "-agent.md", "-agent.json", "-agent.srt", "-agent.vtt", "-events.jsonl"]: + assert not (out / f"s1{suf}").exists() + # audio + other session untouched + assert (out / "s1.wav").exists() and (out / "s1.wav").read_text() == "AUDIO" + assert (out / "s2-transcript.md").exists() and (out / "s2.wav").exists() + + +def test_delete_session_only_existing_files(): + eng, out, *_ = _isolated_engine() + _touch(out / "s1-transcript.md") + _touch(out / "s1-agent.json", text="{}") + # the other four suffixes don't exist -> only the present two are reported/removed + r = eng.delete_session("s1") + assert r["ok"] is True + assert set(r["deleted"]) == {"s1-transcript.md", "s1-agent.json"}, r["deleted"] + assert not (out / "s1-transcript.md").exists() + assert not (out / "s1-agent.json").exists() 
+ + +def test_delete_session_missing_stem_is_ok_empty(): + eng, *_ = _isolated_engine() + r = eng.delete_session("does-not-exist") + assert r["ok"] is True and r["deleted"] == [] + + +def test_delete_session_rejects_path_traversal(): + eng, out, sess, _live = _isolated_engine() + # plant a real file we must NOT be able to reach via traversal + _touch(out / "secret-transcript.md", text="SECRET") + for bad in ("../secret", "a/b", "..", "sub/secret"): + r = eng.delete_session(bad) + assert r["ok"] is False and r["deleted"] == [], (bad, r) + # the planted file is still there (never touched) + assert (out / "secret-transcript.md").exists() + + +def test_delete_session_honors_dir_restriction(): + eng, out, sess, _live = _isolated_engine() + # same stem lives in BOTH known dirs + _touch(out / "dup-transcript.md", text="from-out") + _touch(sess / "dup-transcript.md", text="from-sess") + # restricting to SESSIONS_DIR deletes only that dir's copy + r = eng.delete_session("dup", dir=str(sess)) + assert r["ok"] is True and r["deleted"] == ["dup-transcript.md"], r + assert not (sess / "dup-transcript.md").exists() + assert (out / "dup-transcript.md").exists() and (out / "dup-transcript.md").read_text() == "from-out" + # a dir that is NOT a known session dir -> nothing resolves/deleted, even though + # the file physically exists there + bogus = Path(tempfile.mkdtemp(prefix="voxeng_bogus_")) + _touch(bogus / "dup-transcript.md", text="from-bogus") + r2 = eng.delete_session("dup", dir=str(bogus)) + assert r2["ok"] is True and r2["deleted"] == [], r2 + assert (bogus / "dup-transcript.md").exists() + + +def test_delete_session_never_touches_wav(): + eng, out, *_ = _isolated_engine() + _touch(out / "rec-transcript.md") + _touch(out / "rec.wav", text="AUDIO") + _touch(out / "rec-agent.wav", text="NOT A TEXT ARTIFACT") # adversarial name + r = eng.delete_session("rec") + assert r["deleted"] == ["rec-transcript.md"], r + assert (out / "rec.wav").exists() + assert (out / 
"rec-agent.wav").exists() # .wav suffix is never in the artifact list + + # --- status (idle) ----------------------------------------------------------- def test_status_idle_shape(): eng, *_ = _isolated_engine() st = eng.status() - assert set(st) == {"recording", "level", "elapsed", "job"} + assert set(st) == {"recording", "level", "elapsed", "job", "live"} assert st["recording"] is False assert st["level"] == 0.0 assert st["elapsed"] == 0 # not recording -> zero, never time.time() assert st["job"] == {"state": "idle"} + assert st["live"] == {"active": False, "wav": None, "lines": []} def test_status_elapsed_zero_even_with_started_at(): From bf93872930ac280a37c701a37865cac72e80dc45 Mon Sep 17 00:00:00 2001 From: NubsCarson <192162056+NubsCarson@users.noreply.github.com> Date: Thu, 4 Jun 2026 13:58:08 +0000 Subject: [PATCH 13/60] =?UTF-8?q?feat(gui):=20live=20partial=20transcripti?= =?UTF-8?q?on=20=E2=80=94=20LocalAgreement=20stabilizer?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The live monitor finalized speech only on silence, so an in-progress utterance showed nothing until the speaker paused. Add a partial preview of the still-growing tail: each pass re-decodes the tail and a LocalAgreement-n stabilizer commits the longest word-prefix that has agreed across the last n hypotheses (stable) and marks the remainder volatile. As words settle they graduate stable→so the head stops flickering while the tail updates live. - gui/stabilize.py: PartialStabilizer (pure, LocalAgreement-n) + 9 unit tests - engine._live_loop: re-decode tail → stabilize → status.live.partial; reset on finalize so each utterance starts clean - app.js/style.css: render the partial (committed words solid, volatile tail dimmed + softly pulsing) Proven on a real recording: ASR revised "floor"→"hood" mid-utterance and the stabilizer held it volatile until settled (never committed the wrong word). 82 tests green. 
Idea ported from elizaOS's streaming partial-stabilizer. --- gui/engine.py | 21 +++++++-- gui/stabilize.py | 54 ++++++++++++++++++++++ gui/static/app.js | 17 +++++-- gui/static/style.css | 4 ++ gui/test_engine.py | 2 +- gui/test_stabilize.py | 102 ++++++++++++++++++++++++++++++++++++++++++ 6 files changed, 192 insertions(+), 8 deletions(-) create mode 100644 gui/stabilize.py create mode 100644 gui/test_stabilize.py diff --git a/gui/engine.py b/gui/engine.py index cdd4109..b5b6061 100644 --- a/gui/engine.py +++ b/gui/engine.py @@ -55,9 +55,10 @@ def __init__(self, out_dir: Path = OUT_DIR): self.started_at = None self.job = {"state": "idle"} # idle | transcribing | done | error # live (near-real-time) monitor of an in-progress recording (reads the file) - self._live = {"active": False, "wav": None, "lines": []} + self._live = {"active": False, "wav": None, "lines": [], "partial": None} self._live_stop = threading.Event() self._live_thread = None + self._stab = None # PartialStabilizer for the in-progress (volatile) tail # ---- static option lists for the UI ---- def models(self) -> list[str]: @@ -165,7 +166,7 @@ def status(self) -> dict: "elapsed": round(time.time() - self.started_at, 1) if (self.recording and self.started_at) else 0, "job": self.job, "live": {"active": self._live["active"], "wav": self._live["wav"], - "lines": self._live["lines"][-120:]}, + "lines": self._live["lines"][-120:], "partial": self._live.get("partial")}, } # ---- live (near-real-time) monitor: tail an in-progress recording's file ---- @@ -180,7 +181,9 @@ def live_start(self, wav: str | None = None) -> dict: target = Path(wav) if wav else self._newest_wav() if not target or not target.exists(): return {"ok": False, "error": "no recording to monitor"} - self._live = {"active": True, "wav": str(target), "lines": []} + self._live = {"active": True, "wav": str(target), "lines": [], "partial": None} + from gui.stabilize import PartialStabilizer + self._stab = PartialStabilizer() 
self._live_stop.clear() self._live_thread = threading.Thread(target=self._live_loop, args=(target,), daemon=True, name="gui-live") self._live_thread.start() @@ -191,6 +194,7 @@ def live_stop(self) -> dict: if self._live_thread: self._live_thread.join(timeout=3) self._live["active"] = False + self._live["partial"] = None return {"ok": True} def _live_loop(self, wav: Path): @@ -229,6 +233,17 @@ def _live_loop(self, wav: Path): if consumed: abs_start += consumed buf = buf[consumed:] + if self._stab: # finalized → the volatile tail starts fresh + self._stab.reset() + # Volatile preview of the still-in-progress tail (LocalAgreement-stabilized), + # so an in-progress utterance shows up live instead of only after the pause. + if self._stab is not None and len(buf) >= int(SR * 0.4): + ptxt = (tr.transcribe(buf).get("text") or "").strip() + st = self._stab.push(ptxt) + self._live["partial"] = ({"t": _fmt_hms(abs_start / SR), **st} + if (st["stable"] or st["volatile"]) else None) + else: + self._live["partial"] = None finally: f.close() self._live["active"] = False diff --git a/gui/stabilize.py b/gui/stabilize.py new file mode 100644 index 0000000..9166eae --- /dev/null +++ b/gui/stabilize.py @@ -0,0 +1,54 @@ +"""Partial-hypothesis stabilizer for live transcription (LocalAgreement-n). + +A streaming transcriber re-decodes the still-growing speech tail on every pass, so the +raw partial text keeps rewriting itself as more audio arrives. This stabilizes the +display: it commits the longest leading run of words that has agreed across the last +``n`` passes (the *stable* prefix) and marks only the trailing remainder of the newest +hypothesis as *volatile*. As a word stays put for ``n`` consecutive passes it graduates +from volatile to stable, so the head stops flickering while the tail keeps updating in +near-real-time. + +LocalAgreement-n is the standard streaming-ASR commit policy (n=2 is the common default). 
+The idea is ported from elizaOS's streaming partial-stabilizer; reimplemented here for +VoxTerm (no model needed — it operates on the transcriber's own incremental output). +""" +from __future__ import annotations + +from collections import deque + + +def common_prefix_len(seqs: list[list[str]]) -> int: + """Length of the longest leading run of words shared by ALL ``seqs``.""" + if not seqs: + return 0 + shortest = min(len(s) for s in seqs) + for i in range(shortest): + w = seqs[0][i] + if any(s[i] != w for s in seqs): + return i + return shortest + + +class PartialStabilizer: + """Commit the leading words that agree across the last ``n`` partial hypotheses. + + Call :meth:`push` with each new raw partial; it returns ``{"stable", "volatile"}``. + Call :meth:`reset` when the current utterance is finalized so the next partial starts + clean. Whitespace is the token boundary (good enough for live display; the saved + transcript still comes from the full post-stop pipeline). + """ + + def __init__(self, n: int = 2): + self.n = max(2, int(n)) + self._hist: deque[list[str]] = deque(maxlen=self.n) + + def push(self, text: str) -> dict: + words = (text or "").split() + self._hist.append(words) + # Need a full window before committing anything; until then it's all volatile. 
+ stable_len = common_prefix_len(list(self._hist)) if len(self._hist) >= self.n else 0 + return {"stable": " ".join(words[:stable_len]), + "volatile": " ".join(words[stable_len:])} + + def reset(self) -> None: + self._hist.clear() diff --git a/gui/static/app.js b/gui/static/app.js index c38b3c0..339aa0c 100644 --- a/gui/static/app.js +++ b/gui/static/app.js @@ -166,7 +166,7 @@ function applyStatus(s) { $("recState").textContent = "Ready to record"; } // live transcript panel (near-real-time tail of an in-progress recording) - const live = s.live || { active: false, lines: [] }; + const live = s.live || { active: false, lines: [], partial: null }; document.body.classList.toggle("live-on", !!live.active); $("liveToggle").textContent = live.active ? "■ Stop live" : "⦿ Live transcript"; if (live.active) { @@ -175,9 +175,18 @@ function applyStatus(s) { const lines = live.lines || []; const el = $("liveLines"); const atBottom = el.scrollHeight - el.scrollTop - el.clientHeight < 40; - el.innerHTML = lines.length - ? lines.map((l) => `
    ${escapeHtml(l.t)}${escapeHtml(l.text)}
    `).join("") - : `
    listening…
    `; + let html = lines + .map((l) => `
    ${escapeHtml(l.t)}${escapeHtml(l.text)}
    `) + .join(""); + // the still-in-progress utterance: committed words solid, the rest dimmed (volatile) + const p = live.partial; + if (p && (p.stable || p.volatile)) { + html += `
    ${escapeHtml(p.t || "")}` + + escapeHtml(p.stable || "") + + (p.stable && p.volatile ? " " : "") + + `${escapeHtml(p.volatile || "")}
    `; + } + el.innerHTML = html || `
    listening…
    `; if (atBottom) el.scrollTop = el.scrollHeight; // keep pinned to newest unless scrolled up } else { $("liveView").classList.add("hidden"); diff --git a/gui/static/style.css b/gui/static/style.css index ef054ee..63a5d1a 100644 --- a/gui/static/style.css +++ b/gui/static/style.css @@ -178,6 +178,10 @@ body.live-on .live-toggle { background: var(--rec); color: #fff; border-color: t .live-lines .ll { color: var(--text); } .live-lines .ll-t { color: var(--faint); font-variant-numeric: tabular-nums; font-size: 12px; margin-right: 8px; } .live-lines .ll-empty { color: var(--faint); font-style: italic; } +/* the in-progress utterance: committed words solid, volatile tail dimmed + softly pulsing */ +.live-lines .ll-partial { color: var(--accent-dim); } +.live-lines .ll-partial .ll-vol { color: var(--faint); font-style: italic; animation: livepulse 1.4s ease-in-out infinite; } +@keyframes livepulse { 0%, 100% { opacity: 0.55; } 50% { opacity: 0.95; } } .hidden { display: none !important; } .toast { position: fixed; bottom: 22px; left: 50%; transform: translateX(-50%); background: var(--bg-elev-2); color: var(--text); border: 1px solid var(--line); padding: 10px 18px; border-radius: 10px; box-shadow: var(--shadow); font-size: 13.5px; z-index: 50; animation: fade .2s ease; } diff --git a/gui/test_engine.py b/gui/test_engine.py index be3a067..c269e50 100644 --- a/gui/test_engine.py +++ b/gui/test_engine.py @@ -310,7 +310,7 @@ def test_status_idle_shape(): assert st["level"] == 0.0 assert st["elapsed"] == 0 # not recording -> zero, never time.time() assert st["job"] == {"state": "idle"} - assert st["live"] == {"active": False, "wav": None, "lines": []} + assert st["live"] == {"active": False, "wav": None, "lines": [], "partial": None} def test_status_elapsed_zero_even_with_started_at(): diff --git a/gui/test_stabilize.py b/gui/test_stabilize.py new file mode 100644 index 0000000..895e735 --- /dev/null +++ b/gui/test_stabilize.py @@ -0,0 +1,102 @@ +"""Tests for 
gui.stabilize.PartialStabilizer (LocalAgreement-n). + +Pure logic — no audio, no model. Verifies the commit policy: a leading word run only +becomes "stable" once it has agreed across the last n hypotheses; the tail stays +"volatile"; reset() clears the window. Pytest-style; also runnable standalone. +""" +import sys +from pathlib import Path + +_HERE = Path(__file__).resolve().parent +_ROOT = _HERE.parent +for p in (str(_ROOT), str(_HERE)): + if p not in sys.path: + sys.path.insert(0, p) + +from stabilize import PartialStabilizer, common_prefix_len + + +def test_common_prefix_len_basic(): + assert common_prefix_len([["a", "b", "c"], ["a", "b", "d"]]) == 2 + assert common_prefix_len([["a", "b"], ["x", "y"]]) == 0 + assert common_prefix_len([["a", "b", "c"], ["a", "b", "c"]]) == 3 + + +def test_common_prefix_len_edge(): + assert common_prefix_len([]) == 0 + assert common_prefix_len([["a", "b"]]) == 2 # single seq → all shared + assert common_prefix_len([["a"], []]) == 0 # one empty → nothing shared + + +def test_first_push_is_all_volatile(): + # n=2 default: with only one hypothesis there is nothing to agree against. + s = PartialStabilizer() + out = s.push("hello there friend") + assert out["stable"] == "" + assert out["volatile"] == "hello there friend" + + +def test_agreement_commits_prefix(): + s = PartialStabilizer() + s.push("the quick brown") + out = s.push("the quick brown fox") + # "the quick brown" agreed across both → stable; "fox" is new → volatile. + assert out["stable"] == "the quick brown" + assert out["volatile"] == "fox" + + +def test_divergent_tail_stays_volatile(): + s = PartialStabilizer() + s.push("i think we should") + out = s.push("i think we shall not") + # prefix "i think we" agrees; "should" vs "shall" diverge → volatile from there. 
+ assert out["stable"] == "i think we" + assert out["volatile"] == "shall not" + + +def test_reset_clears_window(): + s = PartialStabilizer() + s.push("alpha beta") + s.push("alpha beta") # would commit "alpha beta" + s.reset() + out = s.push("gamma delta") # fresh window → all volatile again + assert out["stable"] == "" + assert out["volatile"] == "gamma delta" + + +def test_empty_text(): + s = PartialStabilizer() + out = s.push("") + assert out == {"stable": "", "volatile": ""} + out = s.push(" ") # whitespace-only → no tokens + assert out == {"stable": "", "volatile": ""} + + +def test_n_three_needs_three_agreeing(): + s = PartialStabilizer(n=3) + assert s.push("a b")["stable"] == "" # 1 hyp + assert s.push("a b")["stable"] == "" # 2 hyps, n=3 not reached + assert s.push("a b")["stable"] == "a b" # 3 hyps agree → commit + + +def test_shrinking_hypothesis_does_not_overcommit(): + # If the newest hypothesis is shorter, stable can only be as long as the shortest. + s = PartialStabilizer() + s.push("one two three four") + out = s.push("one two") + assert out["stable"] == "one two" + assert out["volatile"] == "" + + +if __name__ == "__main__": + fns = [v for k, v in sorted(globals().items()) if k.startswith("test_") and callable(v)] + failed = 0 + for fn in fns: + try: + fn() + print(f" ok {fn.__name__}") + except Exception as e: + failed += 1 + print(f" FAIL {fn.__name__}: {type(e).__name__}: {e}") + print(f"\n{len(fns) - failed}/{len(fns)} passed") + sys.exit(1 if failed else 0) From f94a33be7f194fd9fa917002091f883b44c5c939 Mon Sep 17 00:00:00 2001 From: NubsCarson <192162056+NubsCarson@users.noreply.github.com> Date: Thu, 4 Jun 2026 14:24:47 +0000 Subject: [PATCH 14/60] feat(gui): live follows YOUR recording + waveform + CLI partial parity MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Gap #1 — live now tails the GUI's own recording. 
start_recording streams straight to a growing on-disk WAV (placeholder header, _poll appends s16 PCM under the lock + flush, stop patches the real header). The live monitor tails that same file, so clicking Live during a GUI recording shows your words (before, Record buffered in RAM and only wrote on stop, so Live saw nothing). Bonus: a long session no longer sits entirely in RAM; transcription loads the file off-thread. Gap #3 — small stuff: - live.py CLI: ported the LocalAgreement stabilizer (in-place updating partial line) for parity with the GUI - app.js/index.html/style.css: a scrolling live amplitude canvas during record 3 new streaming-WAV tests (header is 44B + parses, _pcm_bytes==_write_wav, growing file is tailable mid-write then finalizes valid). 85 tests green. --- gui/engine.py | 86 +++++++++++++++++++++++++++++++++++-------- gui/live.py | 25 +++++++++++++ gui/static/app.js | 26 +++++++++++++ gui/static/index.html | 1 + gui/static/style.css | 3 ++ gui/test_engine.py | 46 ++++++++++++++++++++++- 6 files changed, 170 insertions(+), 17 deletions(-) diff --git a/gui/engine.py b/gui/engine.py index b5b6061..d0b5a35 100644 --- a/gui/engine.py +++ b/gui/engine.py @@ -11,6 +11,7 @@ """ from __future__ import annotations +import struct import sys import threading import time @@ -41,12 +42,30 @@ def _write_wav(path: Path, audio: np.ndarray) -> None: w.writeframes(pcm.tobytes()) +def _wav_header(data_len: int, sr: int = SR) -> bytes: + """The canonical 44-byte PCM WAV header (mono, s16). ``data_len`` may be 0 as a + placeholder while the file is still growing — it's patched on close. 
Tailers read raw + PCM past byte 44 regardless, so a placeholder size never breaks live monitoring.""" + return (b"RIFF" + struct.pack(" bytes: + """float32 [-1,1] → little-endian s16 bytes (the same mapping as ``_write_wav``).""" + return (np.clip(chunk, -1.0, 1.0) * 32767.0).astype(" dict: self._cap = None self.recording = False return {"ok": False, "error": f"could not open the microphone: {e}"} - self._chunks = [] + # open the growing WAV now (placeholder header, patched on stop) so the live + # monitor can tail this very recording and click-Live follows what you're saying. + ts = datetime.now().strftime("%Y%m%d-%H%M%S") + self._rec_wav_path = self.out_dir / f"{ts}-gui.wav" + try: + self._rec_file = open(self._rec_wav_path, "wb") + self._rec_file.write(_wav_header(0)) + self._rec_file.flush() + except OSError as e: + try: + self._cap.stop() + except Exception: + pass + self._cap = None + self.recording = False + return {"ok": False, "error": f"could not open the recording file: {e}"} + self._rec_bytes = 0 self._stop.clear() self.recording = True self.started_at = time.time() self.level = 0.0 self._poll_thread = threading.Thread(target=self._poll, daemon=True, name="gui-rec-poll") self._poll_thread.start() - return {"ok": True} + return {"ok": True, "wav": str(self._rec_wav_path)} def _poll(self): while not self._stop.is_set(): @@ -98,8 +133,13 @@ def _poll(self): chunks = [] fresh = [np.asarray(c, dtype=np.float32) for c in chunks if c is not None and len(c)] if fresh: - with self._lock: # serialize with stop's concat/clear - self._chunks.extend(fresh) + with self._lock: # serialize with stop's finalize + if self._rec_file: + for c in fresh: + b = _pcm_bytes(c) + self._rec_file.write(b) + self._rec_bytes += len(b) + self._rec_file.flush() # make new audio visible to the live tailer last = fresh[-1] if len(last): self.level = float(np.sqrt(np.mean(np.square(last)))) @@ -117,24 +157,38 @@ def stop_recording(self, model: str = "fw-small", language: str = "en") -> 
dict: with self._lock: self.recording = False try: - for c in self._cap.drain(): + for c in self._cap.drain(): # capture any frames still queued if c is not None and len(c): - self._chunks.append(np.asarray(c, dtype=np.float32)) + b = _pcm_bytes(np.asarray(c, dtype=np.float32)) + self._rec_file.write(b) + self._rec_bytes += len(b) self._cap.stop() except Exception: pass - audio = np.concatenate(self._chunks).astype(np.float32) if self._chunks else np.zeros(0, dtype=np.float32) - self._chunks = [] - if len(audio) < SR // 2: # < 0.5s + wav = self._rec_wav_path + n_bytes = self._rec_bytes + try: # patch the header with the real size → valid WAV + self._rec_file.flush() + self._rec_file.seek(0) + self._rec_file.write(_wav_header(n_bytes)) + self._rec_file.flush() + self._rec_file.close() + except Exception: + pass + self._rec_file = None + if n_bytes < SR: # < 0.5s of s16 mono (SR*2 bytes/s → 0.5s = SR bytes) + try: + wav.unlink() + except OSError: + pass self.job = {"state": "error", "error": "recording too short"} return {"ok": False, "error": "recording too short"} - ts = datetime.now().strftime("%Y%m%d-%H%M%S") - wav = self.out_dir / f"{ts}-gui.wav" - _write_wav(wav, audio) self.job = {"state": "transcribing", "frac": 0.0, "msg": "starting", "wav": str(wav)} - threading.Thread(target=self._do_transcribe, args=(audio, model, language, str(wav)), - daemon=True, name="gui-transcribe").start() - return {"ok": True, "wav": str(wav), "seconds": round(len(audio) / SR, 1)} + # load + transcribe off the request thread (matches transcribe_existing) + threading.Thread( + target=lambda: self._do_transcribe(transcribe.load_wav_16k_mono(wav), model, language, str(wav)), + daemon=True, name="gui-transcribe").start() + return {"ok": True, "wav": str(wav), "seconds": round(n_bytes / (SR * 2), 1)} def _do_transcribe(self, audio, model, language, wav): try: diff --git a/gui/live.py b/gui/live.py index 513583a..acce88d 100644 --- a/gui/live.py +++ b/gui/live.py @@ -26,6 +26,7 @@ 
import config # noqa: E402 from gui.transcribe import _get_engines, _fmt_hms # reuse the cached engine + time fmt # noqa: E402 +from gui.stabilize import PartialStabilizer # noqa: E402 SR = config.SAMPLE_RATE _WAV_HEADER = 44 # bytes; raw little-endian s16 PCM follows @@ -55,6 +56,16 @@ def main(argv=None) -> int: abs_start = 0 # absolute sample index of buf[0] started = time.time() n_lines = 0 + stab = PartialStabilizer() # volatile preview of the still-in-progress tail + partial_len = 0 # chars of the in-place partial line currently on screen + + def clear_partial(): + nonlocal partial_len + if partial_len: + sys.stdout.write("\r" + " " * partial_len + "\r") + sys.stdout.flush() + partial_len = 0 + try: while True: time.sleep(args.interval) @@ -75,17 +86,31 @@ def main(argv=None) -> int: out = tr.transcribe(buf[s:e]) txt = (out.get("text") or "").strip() if txt: + clear_partial() # erase the in-place partial before a final line print(f" [{_fmt_hms((abs_start + s) / SR)}] {txt}", flush=True) n_lines += 1 consumed = e if consumed: abs_start += consumed buf = buf[consumed:] + stab.reset() # finalized → the volatile tail restarts clean + # in-place volatile partial of the still-in-progress tail + if len(buf) >= int(SR * 0.4): + st = stab.push((tr.transcribe(buf).get("text") or "").strip()) + line = (st["stable"] + (" " if st["stable"] and st["volatile"] else "") + st["volatile"]).strip() + if line: + s_out = f" ~ [{_fmt_hms(abs_start / SR)}] {line}" + sys.stdout.write("\r" + s_out + " " * max(0, partial_len - len(s_out))) + sys.stdout.flush() + partial_len = len(s_out) + else: + clear_partial() if args.max_seconds and (time.time() - started) >= args.max_seconds: break except KeyboardInterrupt: pass finally: + clear_partial() f.close() print(f"[live] stopped — {n_lines} live lines transcribed", flush=True) return 0 diff --git a/gui/static/app.js b/gui/static/app.js index 339aa0c..5fc8205 100644 --- a/gui/static/app.js +++ b/gui/static/app.js @@ -50,6 +50,30 @@ 
function nameFor(turn) { return turn.speaker || "(unattributed)"; } +// Live amplitude strip: push each SSE level reading and draw a scrolling bar history. +const WAVE_MAX = 120; +const _wave = []; +function drawWave(level) { + const c = $("recWave"); + if (!c || !c.getContext) return; + _wave.push(Math.min(1, Math.max(0, (level || 0) / 0.25))); // ~0.25 RMS ≈ full height + if (_wave.length > WAVE_MAX) _wave.shift(); + const ctx = c.getContext("2d"), W = c.width, H = c.height; + ctx.clearRect(0, 0, W, H); + const accent = (getComputedStyle(document.documentElement).getPropertyValue("--rec") || "#ff5d6c").trim(); + ctx.fillStyle = accent || "#ff5d6c"; + const barW = W / WAVE_MAX; + for (let i = 0; i < _wave.length; i++) { + const h = Math.max(2, _wave[i] * H); + ctx.fillRect(i * barW + barW * 0.2, (H - h) / 2, barW * 0.6, h); // centered bar + } +} +function clearWave() { + _wave.length = 0; + const c = $("recWave"); + if (c && c.getContext) c.getContext("2d").clearRect(0, 0, c.width, c.height); +} + // ---------- init ---------- async function init() { const o = await getJSON("/api/options"); @@ -131,9 +155,11 @@ function applyStatus(s) { // level ring (0..~0.3 typical) -> 0..360deg const deg = Math.min(360, (s.level || 0) / 0.25 * 360); $("ring").style.background = `conic-gradient(var(--rec) ${deg}deg, var(--line) ${deg}deg)`; + drawWave(s.level); // scrolling live amplitude strip $("model").disabled = $("language").disabled = true; } else { $("ring").style.background = ""; + if (_wave.length) clearWave(); $("model").disabled = $("language").disabled = false; if (lastJobState === "idle" && s.job.state === "idle") { $("recState").textContent = "Ready to record"; $("timer").textContent = "00:00"; } } diff --git a/gui/static/index.html b/gui/static/index.html index 2d7e7ce..f65ff35 100644 --- a/gui/static/index.html +++ b/gui/static/index.html @@ -42,6 +42,7 @@

    Sessions

    Ready to record
    +
    - -
    -

    No transcript yet. Record something, or pick a past session from the left.

    -
    + +
    + +
    + 00:00 + Ready +
    + + + +
    diff --git a/gui/static/manifest.webmanifest b/gui/static/manifest.webmanifest index 7a9dc72..af120b4 100644 --- a/gui/static/manifest.webmanifest +++ b/gui/static/manifest.webmanifest @@ -5,8 +5,8 @@ "start_url": "/", "scope": "/", "display": "standalone", - "background_color": "#0e0f13", - "theme_color": "#0e0f13", + "background_color": "#1c1c1e", + "theme_color": "#1c1c1e", "icons": [ { "src": "/static/icon-192.png", "type": "image/png", "sizes": "192x192", "purpose": "any maskable" }, { "src": "/static/icon-512.png", "type": "image/png", "sizes": "512x512", "purpose": "any maskable" }, diff --git a/gui/static/style.css b/gui/static/style.css index 95edb3e..877df99 100644 --- a/gui/static/style.css +++ b/gui/static/style.css @@ -1,213 +1,302 @@ :root { - --bg: #0e0f13; - --bg-elev: #16181f; - --bg-elev-2: #1c1f28; - --line: #262a35; - --text: #e7e9ee; - --muted: #9aa1ad; - --faint: #7d8694; /* ≥4.6:1 on --bg — was #6b7280 (~3.9:1, failed WCAG AA) */ - --accent: #5eead4; /* calm teal */ - --accent-dim: #2dd4bf; - --rec: #f0566a; /* warm coral for record */ - --rec-glow: rgba(240, 86, 106, 0.45); - --radius: 14px; - --shadow: 0 8px 30px rgba(0,0,0,0.35); + color-scheme: dark; + + /* ---- Surfaces (Apple systemBackground -> elevated, dark) ---- */ + --bg: #1c1c1e; /* main canvas */ + --bg-elev: #2c2c2e; /* sidebar, cards, inputs, popovers */ + --bg-elev-2: #3a3a3c; /* hover / pressed surface */ + + /* ---- Separators: hairlines replace nearly all the old 1px boxes ---- */ + --line: rgba(255,255,255,0.10); + --line-strong: rgba(255,255,255,0.16); + + /* ---- Translucent fills: inset controls (search, selects, segmented) ---- */ + --fill: rgba(120,120,128,0.18); + --fill-hover: rgba(120,120,128,0.26); + + /* ---- Text (Apple "label" ramp, dark) ---- */ + --text: #ffffff; + --muted: rgba(235,235,245,0.62); + --faint: rgba(235,235,245,0.55); /* legible on dark surfaces (was 0.34 — failed contrast) */ + + /* ---- "Primary"/active tint: MONOCHROME near-white, NO hue. 
(red --rec is the only color) ---- */ + --accent: #e8e8ec; + --accent-hover: #ffffff; + --accent-tint: rgba(255,255,255,0.10); + --ring: rgba(255,255,255,0.42); + + /* ---- Record red: systemRed (dark) — ONLY the recording state ---- */ + --rec: #ff453a; + --rec-glow: rgba(255,69,58,0.22); + + /* ---- Type scale: 5 steps + timer, system font ---- */ + --font: -apple-system, BlinkMacSystemFont, system-ui, "Segoe UI", Roboto, Helvetica, Arial, sans-serif; + --fs-title: 22px; + --fs-headline: 17px; + --fs-body: 15px; + --fs-sub: 13px; + --fs-caption: 11px; + --fs-timer: 17px; + + /* ---- 8pt spacing ---- */ + --sp-1:4px; --sp-2:8px; --sp-3:12px; --sp-4:16px; --sp-5:24px; --sp-6:32px; --sp-7:40px; + + /* ---- Radius ---- */ + --r-sm:8px; --r:10px; --r-lg:12px; --r-xl:16px; --r-pill:999px; + + /* ---- Borders + the one shadow (popovers + toast only) ---- */ + --hairline: 0.5px solid var(--line); + --shadow-pop: 0 10px 30px rgba(0,0,0,0.36), 0 1px 2px rgba(0,0,0,0.30); + + /* ---- Motion: fast, decelerating, never bouncy ---- */ + --dur-fast:120ms; --dur:160ms; --dur-slow:220ms; + --ease: cubic-bezier(0.2,0,0,1); + --ease-spring: cubic-bezier(0.34,1.4,0.64,1); font-synthesis: none; } * { box-sizing: border-box; } html, body { height: 100%; margin: 0; } - -/* ---------- focus (keyboard) ---------- */ -:focus-visible { outline: 2px solid var(--accent); outline-offset: 2px; border-radius: 8px; } -.rec-btn:focus-visible { outline-offset: 4px; } -.session:focus-visible { outline-offset: -2px; } body { - background: radial-gradient(1200px 600px at 80% -10%, #14202a 0%, var(--bg) 55%) fixed; + background: var(--bg); color: var(--text); - font-family: ui-sans-serif, system-ui, -apple-system, "Segoe UI", Roboto, Helvetica, Arial, sans-serif; - font-size: 15px; - line-height: 1.55; - display: grid; - grid-template-columns: 288px 1fr; + font-family: var(--font); + font-size: var(--fs-body); + line-height: 1.45; + letter-spacing: -0.006em; -webkit-font-smoothing: antialiased; + 
text-rendering: optimizeLegibility; + display: grid; + grid-template-columns: 260px 1fr; } +:focus-visible { outline: 2px solid var(--ring); outline-offset: 2px; border-radius: 6px; } + /* ---------- sidebar ---------- */ .sidebar { - background: linear-gradient(180deg, var(--bg-elev) 0%, #121419 100%); - border-right: 1px solid var(--line); - padding: 22px 18px; - height: 100vh; - position: sticky; - top: 0; - overflow-y: auto; + background: var(--bg-elev); + border-right: var(--hairline); + padding: var(--sp-4) var(--sp-2); + height: 100vh; height: 100dvh; + position: sticky; top: 0; + display: flex; flex-direction: column; gap: var(--sp-2); + overflow: hidden; } -.brand { display: flex; align-items: center; gap: 10px; } -.brand h1 { font-size: 20px; font-weight: 650; letter-spacing: -0.01em; margin: 0; } -.brand-dot { width: 11px; height: 11px; border-radius: 50%; background: var(--accent); - box-shadow: 0 0 12px var(--accent); } -.brand-sub { color: var(--faint); font-size: 12px; margin: 4px 0 22px 21px; letter-spacing: 0.02em; } -.sessions-head { display: flex; align-items: center; justify-content: space-between; margin-bottom: 8px; } -.sessions-head h2 { font-size: 12px; text-transform: uppercase; letter-spacing: 0.08em; color: var(--muted); margin: 0; font-weight: 600; } -.ghost-btn { background: none; border: none; color: var(--muted); cursor: pointer; font-size: 16px; border-radius: 8px; padding: 2px 7px; } -.ghost-btn:hover { background: var(--bg-elev-2); color: var(--text); } -.sessions { list-style: none; padding: 0; margin: 0; display: flex; flex-direction: column; gap: 6px; } +.brand { display: flex; align-items: center; gap: var(--sp-2); padding: var(--sp-1) var(--sp-2) 0; } +.brand h1 { font-size: var(--fs-headline); font-weight: 600; letter-spacing: -0.012em; margin: 0; } +.brand-dot { width: 8px; height: 8px; border-radius: 50%; background: var(--muted); } +body.recording .brand-dot { background: var(--rec); } +.session-search { + width: 100%; 
box-sizing: border-box; height: 36px; padding: 0 12px; border: 0; + border-radius: var(--r); background: var(--fill); color: var(--text); font-size: var(--fs-sub); + transition: box-shadow var(--dur) var(--ease); +} +.session-search::placeholder { color: var(--faint); } +.session-search:focus { outline: none; box-shadow: 0 0 0 3px var(--accent-tint); } +.session-search::-webkit-search-cancel-button { -webkit-appearance: none; } + +.sessions { list-style: none; padding: 0; margin: 0; flex: 1; overflow-y: auto; + display: flex; flex-direction: column; gap: 1px; } .session { - padding: 10px 12px; border-radius: 10px; cursor: pointer; border: 1px solid transparent; - transition: background .15s, border-color .15s; - display: flex; align-items: center; gap: 8px; + position: relative; display: flex; align-items: center; gap: var(--sp-2); + padding: var(--sp-2) var(--sp-3); border-radius: var(--r-sm); border: 0; cursor: pointer; + transition: background var(--dur-fast) var(--ease); } -.session:hover { background: var(--bg-elev-2); } -.session.active { background: var(--bg-elev-2); border-color: var(--line); } +.session:hover { background: var(--fill); } +.session.active { background: var(--accent-tint); } +.session.active::before { content:""; position:absolute; left:0; top:8px; bottom:8px; width:2px; + border-radius:2px; background: var(--accent); } .session .s-main { flex: 1; min-width: 0; } -.session .s-title { font-size: 13.5px; font-weight: 550; } -.session .s-sub { font-size: 11.5px; color: var(--faint); margin-top: 2px; display: flex; gap: 6px; } -/* subtle ✕ — muted until the row is hovered/focused, then it pops; hover -> record color */ +.session .s-title { font-size: var(--fs-body); font-weight: 500; color: var(--text); + white-space: nowrap; overflow: hidden; text-overflow: ellipsis; } +.session .s-sub { font-size: var(--fs-caption); color: var(--faint); margin-top: 1px; + font-variant-numeric: tabular-nums; } +.session:focus-visible { outline: 2px solid 
var(--ring); outline-offset: -2px; } .session-del { - flex: none; background: none; border: none; cursor: pointer; color: var(--muted); - font-size: 13px; line-height: 1; padding: 4px 6px; border-radius: 7px; - opacity: 0; transition: opacity .15s, color .15s, background .15s; + flex: none; background: none; border: 0; cursor: pointer; color: var(--muted); + font-size: 13px; line-height: 1; padding: 4px 6px; border-radius: 6px; + opacity: 0; transition: opacity var(--dur-fast), color var(--dur-fast), background var(--dur-fast); } -.session:hover .session-del, .session:focus-within .session-del { opacity: .55; } -.session-del:hover, .session-del:focus-visible { opacity: 1; color: var(--rec); background: var(--bg-elev); } -/* touch devices have no hover, so the ✕ would be invisible/undiscoverable — keep it shown */ -@media (hover: none) { .session-del { opacity: .55; } } -.tag { font-size: 10px; padding: 1px 6px; border-radius: 999px; background: #1f2630; color: var(--accent-dim); border: 1px solid var(--line); } -.sessions-empty { color: var(--faint); font-size: 12.5px; line-height: 1.5; padding: 14px 12px; border: 1px dashed var(--line); border-radius: 10px; text-align: center; } -.session-search { width: 100%; box-sizing: border-box; margin: 0 0 10px; background: var(--bg-elev); color: var(--text); border: 1px solid var(--line); border-radius: 9px; padding: 7px 11px; font-size: 13px; } -.session-search::placeholder { color: var(--faint); } +.session:hover .session-del, .session:focus-within .session-del { opacity: .5; } +.session-del:hover, .session-del:focus-visible { opacity: 1; color: var(--text); background: var(--fill-hover); } +@media (hover: none) { .session-del { opacity: .5; } } +.sessions-empty { color: var(--faint); font-size: var(--fs-sub); line-height: 1.5; + padding: var(--sp-4) var(--sp-3); text-align: center; } -/* ---------- main ---------- */ -.main { padding: 36px clamp(20px, 5vw, 56px); max-width: 980px; width: 100%; } -.hero { text-align: 
center; padding: 8px 0 28px; } -.record-wrap { display: flex; flex-direction: column; align-items: center; gap: 16px; } +/* ---------- main (document column + sticky dock) ---------- */ +.main { display: flex; flex-direction: column; height: 100vh; height: 100dvh; min-width: 0; } +.doc { flex: 1; overflow-y: auto; padding: var(--sp-6) clamp(20px, 5vw, 40px) var(--sp-5); + width: 100%; max-width: 800px; margin: 0 auto; } -.ring { - width: 150px; height: 150px; border-radius: 50%; - display: grid; place-items: center; position: relative; - background: conic-gradient(var(--accent) 0deg, var(--line) 0deg); - transition: background .1s linear; -} -.ring-fill { position: absolute; inset: 7px; border-radius: 50%; background: var(--bg); } -.rec-btn { - position: relative; z-index: 1; - width: 112px; height: 112px; border-radius: 50%; border: none; cursor: pointer; - background: radial-gradient(circle at 50% 35%, #ff6b7e, var(--rec)); - display: grid; place-items: center; - box-shadow: 0 6px 24px var(--rec-glow); - transition: transform .12s ease, box-shadow .2s; -} -.rec-btn:hover { transform: scale(1.04); } -.rec-btn:active { transform: scale(0.97); } -.rec-icon { width: 30px; height: 30px; border-radius: 50%; background: #fff; transition: all .18s ease; } -body.recording .rec-icon { width: 26px; height: 26px; border-radius: 6px; } /* circle -> square (stop) */ -body.recording .rec-btn { animation: pulse 1.6s ease-in-out infinite; } -@keyframes pulse { 0%,100% { box-shadow: 0 6px 24px var(--rec-glow); } 50% { box-shadow: 0 6px 40px var(--rec-glow); } } -.rec-btn:disabled { filter: grayscale(.6) brightness(.7); cursor: not-allowed; } - -/* "working" affordance while a job is transcribing: a calm breathing ring + dimmed, - non-interactive record button. The progress bar still carries the precise status. 
*/ -body.working .ring { animation: workpulse 2.2s ease-in-out infinite; } -body.working .rec-btn { cursor: progress; filter: grayscale(.5) brightness(.78); } -@keyframes workpulse { - 0%, 100% { box-shadow: 0 0 0 0 rgba(94, 234, 212, 0.0); } - 50% { box-shadow: 0 0 0 6px rgba(94, 234, 212, 0.10); } -} +/* transcript header */ +.tv-head { display: flex; align-items: flex-start; justify-content: space-between; gap: var(--sp-4); + flex-wrap: wrap; margin-bottom: var(--sp-3); padding-bottom: var(--sp-3); border-bottom: var(--hairline); } +.tv-titlewrap { min-width: 0; flex: 1; } +.tv-head h2 { margin: 0; font-size: var(--fs-title); font-weight: 600; letter-spacing: -0.019em; line-height: 1.2; } +.tv-meta { color: var(--faint); font-size: var(--fs-sub); margin: var(--sp-1) 0 0; } +.player { display: block; width: 100%; height: 32px; margin-top: var(--sp-2); } +.tv-actions { display: flex; gap: var(--sp-2); align-items: center; flex: none; } -.timer { font-size: 30px; font-weight: 600; font-variant-numeric: tabular-nums; letter-spacing: 0.01em; } -.rec-state { color: var(--muted); font-size: 13px; } - -.controls { display: flex; gap: 16px; justify-content: center; margin-top: 26px; flex-wrap: wrap; } -.field { display: flex; flex-direction: column; gap: 6px; text-align: left; } -.field > span { font-size: 11px; text-transform: uppercase; letter-spacing: 0.06em; color: var(--muted); } -.select { - background: var(--bg-elev); color: var(--text); border: 1px solid var(--line); - border-radius: 10px; padding: 9px 12px; font-size: 14px; min-width: 160px; cursor: pointer; -} -.select:disabled { opacity: .5; cursor: not-allowed; } -.hint { color: var(--faint); font-size: 12.5px; max-width: 460px; margin: 22px auto 0; } - -/* ---------- progress ---------- */ -.progress { background: var(--bg-elev); border: 1px solid var(--line); border-radius: var(--radius); padding: 18px 20px; margin: 8px 0 24px; box-shadow: var(--shadow); } -.progress-row { display: flex; justify-content: 
space-between; font-size: 13.5px; color: var(--muted); margin-bottom: 10px; } -.bar { height: 8px; background: var(--bg-elev-2); border-radius: 999px; overflow: hidden; } -.bar-fill { height: 100%; width: 0%; background: linear-gradient(90deg, var(--accent-dim), var(--accent)); border-radius: 999px; transition: width .3s ease; } - -/* ---------- transcript ---------- */ -.transcript-view { animation: fade .3s ease; } -@keyframes fade { from { opacity: 0; transform: translateY(6px); } to { opacity: 1; transform: none; } } -.tv-head { display: flex; align-items: flex-start; justify-content: space-between; gap: 16px; flex-wrap: wrap; margin-bottom: 12px; } -.tv-head h2 { margin: 0; font-size: 20px; } -.tv-meta { color: var(--faint); font-size: 12.5px; margin: 4px 0 0; } -.tv-actions { display: flex; gap: 8px; flex-wrap: wrap; } -.btn { background: var(--accent); color: #06231e; border: none; border-radius: 10px; padding: 9px 15px; font-size: 13.5px; font-weight: 600; cursor: pointer; transition: filter .15s, transform .1s; } -.btn:hover { filter: brightness(1.06); } -.btn:active { transform: translateY(1px); } -.btn.ghost { background: transparent; color: var(--text); border: 1px solid var(--line); } -.btn.ghost:hover { background: var(--bg-elev-2); } -.btn:disabled { opacity: .42; cursor: not-allowed; filter: none; transform: none; } -.btn:disabled:hover { filter: none; background: var(--accent); } -.btn.ghost:disabled:hover { background: transparent; } - -.legend { display: flex; gap: 12px; flex-wrap: wrap; margin: 6px 0 18px; } -.legend .lg { display: inline-flex; align-items: center; gap: 7px; font-size: 12.5px; color: var(--muted); background: var(--bg-elev); border: 1px solid var(--line); padding: 4px 10px; border-radius: 999px; cursor: pointer; } -.legend .dot { width: 10px; height: 10px; border-radius: 50%; flex: none; } - -.turns { display: flex; flex-direction: column; gap: 2px; } -.turn { display: grid; grid-template-columns: 64px 1fr; gap: 14px; padding: 
9px 12px; border-radius: 10px; transition: background .12s; } -.turn:hover { background: var(--bg-elev); } -.turn .t-time { color: var(--faint); font-size: 12px; font-variant-numeric: tabular-nums; padding-top: 2px; } -.turn .t-body { min-width: 0; } -.turn .t-spk { font-weight: 600; font-size: 13px; display: inline-flex; align-items: center; gap: 7px; } -.turn .t-spk .dot { width: 9px; height: 9px; border-radius: 50%; flex: none; } -.turn .t-spk button { background: none; border: none; color: inherit; font: inherit; cursor: pointer; border-bottom: 1px dotted transparent; padding: 0; } -.turn .t-spk button:hover { border-bottom-color: currentColor; } -.turn .t-text { margin-top: 2px; } -.turn .mk { font-size: 10.5px; color: var(--faint); border: 1px solid var(--line); border-radius: 6px; padding: 0 5px; margin-left: 6px; vertical-align: middle; } +/* live transcript document (while recording) + finished transcript share the turns container */ +.turns { display: flex; flex-direction: column; gap: var(--sp-1); } +.turn { padding: var(--sp-2) var(--sp-3); border-radius: var(--r-sm); transition: background var(--dur-fast) var(--ease); } +.turn:hover { background: var(--fill); } +.turn .t-head { display: flex; align-items: baseline; gap: var(--sp-2); margin-bottom: 2px; } +/* continuation of the same speaker: drop the repeated name chip but KEEP the timestamp + (still clickable to seek) so the line never looks orphaned, and tighten the spacing + so consecutive lines read as one block. 
*/ +.turn.same-speaker { padding-top: 0; } +.turn.same-speaker .t-spk { display: none; } +.turn.same-speaker .t-head { margin-bottom: 0; } +.turn .t-time { color: var(--faint); font-size: var(--fs-caption); font-variant-numeric: tabular-nums; } +.turn .t-text { font-size: var(--fs-body); } .turn.uncertain .t-text { color: var(--muted); } +.turn .mk { font-size: var(--fs-caption); color: var(--faint); background: var(--fill); + border-radius: var(--r-pill); padding: 1px 7px; margin-left: 6px; } +.t-spk { display: inline-flex; align-items: center; gap: 6px; font-weight: 600; font-size: var(--fs-sub); } +.t-spk .dot { width: 7px; height: 7px; border-radius: 50%; flex: none; } +.t-spk button { background: none; border: 0; color: inherit; font: inherit; cursor: pointer; padding: 0; border-radius: 4px; } +.t-spk button:hover { color: var(--accent); } -/* ---------- misc ---------- */ -.empty { color: var(--faint); text-align: center; padding: 60px 20px; font-size: 14px; } - -/* ---------- live transcript ---------- */ -.live-toggle { margin-top: 18px; } -body.live-on .live-toggle { background: var(--rec); color: #fff; border-color: transparent; } -.live-view { background: var(--bg-elev); border: 1px solid var(--line); border-radius: var(--radius); padding: 16px 18px; margin: 8px 0 24px; box-shadow: var(--shadow); } -.live-head { display: flex; align-items: center; gap: 8px; font-size: 13px; font-weight: 600; color: var(--muted); margin-bottom: 10px; } -.live-dot { width: 9px; height: 9px; border-radius: 50%; background: var(--rec); box-shadow: 0 0 8px var(--rec); animation: pulse 1.6s ease-in-out infinite; } -.live-lines { max-height: 320px; overflow-y: auto; display: flex; flex-direction: column; gap: 4px; font-size: 14px; line-height: 1.5; } -.live-lines .ll { color: var(--text); } -.live-lines .ll-t { color: var(--faint); font-variant-numeric: tabular-nums; font-size: 12px; margin-right: 8px; } -.rec-wave { display: none; width: 100%; height: 56px; margin: 6px 0 2px; 
} -body.recording .rec-wave { display: block; } - -.live-lines .ll-empty { color: var(--faint); font-style: italic; } -/* the in-progress utterance: committed words solid, volatile tail dimmed + softly pulsing */ -.live-lines .ll-partial { color: var(--accent-dim); } +/* live volatile tail */ +.live-lines .ll { padding: 2px var(--sp-3); } +.live-lines .ll-t { color: var(--faint); font-variant-numeric: tabular-nums; font-size: var(--fs-caption); margin-right: 8px; } +.live-lines .ll-empty { color: var(--faint); font-style: italic; padding: var(--sp-2) var(--sp-3); } +.live-lines .ll-partial { color: var(--muted); } .live-lines .ll-partial .ll-vol { color: var(--faint); font-style: italic; animation: livepulse 1.4s ease-in-out infinite; } -@keyframes livepulse { 0%, 100% { opacity: 0.55; } 50% { opacity: 0.95; } } +@keyframes livepulse { 0%,100% { opacity: 0.5; } 50% { opacity: 0.9; } } + +/* progress bar */ +.progress { margin: 0 0 var(--sp-4); } +.progress-row { display: flex; justify-content: space-between; font-size: var(--fs-sub); color: var(--muted); margin-bottom: 6px; } +.bar { height: 4px; background: var(--fill); border-radius: var(--r-pill); overflow: hidden; } +.bar-fill { height: 100%; width: 0%; background: var(--accent); border-radius: var(--r-pill); transition: width var(--dur-slow) var(--ease); } + +/* empty state */ +.empty { color: var(--faint); text-align: center; padding: var(--sp-6) var(--sp-5); min-height: 50vh; + display: flex; flex-direction: column; align-items: center; justify-content: center; + font-size: var(--fs-body); line-height: 1.6; } +.rec-ph { color: var(--muted); } +.rec-ph strong { color: var(--text); } +.rec-ph-dot { display: inline-block; width: 9px; height: 9px; border-radius: 50%; background: var(--rec); + margin-right: 8px; vertical-align: middle; animation: rec-ph-pulse 1.4s ease-in-out infinite; } +@keyframes rec-ph-pulse { 0%, 100% { opacity: 1; } 50% { opacity: 0.25; } } +@media (prefers-reduced-motion: reduce) { 
.rec-ph-dot { animation: none; } } +.empty strong { color: var(--text); font-weight: 600; } + +/* ---------- dock (record + controls, pinned bottom) ---------- */ +.dock { flex: none; display: flex; align-items: center; gap: var(--sp-3); + padding: var(--sp-2) clamp(16px, 4vw, 28px); min-height: 60px; + background: var(--bg); border-top: var(--hairline); } +.rec-btn { flex: none; width: 44px; height: 44px; border-radius: 50%; cursor: pointer; + display: grid; place-items: center; background: var(--fill); border: 1.5px solid transparent; + transition: background var(--dur-fast) var(--ease), border-color var(--dur) var(--ease); } +.rec-btn:hover { background: var(--fill-hover); } +.rec-btn:active { transform: scale(0.96); } +.rec-btn:focus-visible { outline: 2px solid var(--ring); outline-offset: 3px; } +.rec-icon { width: 18px; height: 18px; border-radius: 50%; background: var(--rec); + transition: width var(--dur) var(--ease-spring), height var(--dur) var(--ease-spring), border-radius var(--dur) var(--ease); } +body.recording .rec-icon { width: 16px; height: 16px; border-radius: 4px; } +body.recording .rec-btn { border-color: var(--rec); } +body.working .rec-btn { opacity: .55; cursor: progress; } +.rec-btn:disabled { opacity: .4; cursor: not-allowed; } + +.dock-status { flex: none; min-width: 56px; display: flex; flex-direction: column; line-height: 1.2; } +.timer { font-size: var(--fs-timer); font-weight: 600; font-variant-numeric: tabular-nums; display: none; } +body.recording .timer, body.working .timer { display: block; } +.rec-state { font-size: var(--fs-caption); color: var(--muted); } +body.low-signal .rec-state { color: var(--rec); font-weight: 600; } + +.rec-wave { flex: 1; min-width: 0; height: 28px; opacity: 0; transition: opacity var(--dur) var(--ease); } +body.recording .rec-wave { opacity: 1; } +body.low-signal .rec-wave { opacity: .45; } + +.dock .select { flex: none; } +.gear-btn { flex: none; width: 36px; height: 36px; border-radius: var(--r); 
border: 0; cursor: pointer; + display: grid; place-items: center; background: transparent; color: var(--muted); font-size: 17px; + list-style: none; transition: background var(--dur-fast) var(--ease), color var(--dur-fast) var(--ease); } +.gear-btn::-webkit-details-marker { display: none; } +.gear-btn:hover { background: var(--fill); color: var(--text); } + +/* ---------- buttons ---------- */ +.btn { display: inline-flex; align-items: center; gap: 6px; height: 32px; padding: 0 14px; + border-radius: var(--r); border: 0; font-size: var(--fs-sub); font-weight: 600; cursor: pointer; + background: var(--accent); color: #1c1c1e; + transition: background var(--dur-fast) var(--ease), transform var(--dur-fast) var(--ease); } +.btn:hover { background: var(--accent-hover); } +.btn:active { transform: translateY(0.5px); } +.btn.ghost { background: transparent; color: var(--text); font-weight: 500; } +.btn.ghost:hover { background: var(--fill); } +.btn:disabled { opacity: .4; cursor: not-allowed; transform: none; } +.btn:focus-visible { outline: 2px solid var(--ring); outline-offset: 2px; } + +/* ---------- select (native, CSS chevron — CSP-safe, no SVG) ---------- */ +.select { appearance: none; -webkit-appearance: none; height: 32px; padding: 0 30px 0 12px; + border-radius: var(--r); background-color: var(--fill); color: var(--text); border: 0; + font-size: var(--fs-sub); cursor: pointer; max-width: 100%; + background-image: linear-gradient(45deg, transparent 50%, var(--faint) 50%), + linear-gradient(135deg, var(--faint) 50%, transparent 50%); + background-position: calc(100% - 15px) center, calc(100% - 10px) center; + background-size: 5px 5px, 5px 5px; background-repeat: no-repeat; + transition: background-color var(--dur-fast) var(--ease); } +.select:hover { background-color: var(--fill-hover); } +.select:focus-visible { outline: none; box-shadow: 0 0 0 3px var(--accent-tint); } +.select:disabled { opacity: .5; cursor: not-allowed; } +.select option { background: 
var(--bg-elev); color: var(--text); } + +/* ---------- menus / popovers (native <details>
    ) ---------- */ +.menu { position: relative; } +.menu > summary { list-style: none; } +.menu > summary::-webkit-details-marker { display: none; } +.menu-pop { position: absolute; right: 0; bottom: calc(100% + 8px); z-index: 60; min-width: 220px; padding: 5px; + max-width: calc(100vw - 24px); max-height: min(60vh, 420px); overflow-y: auto; + background: var(--bg-elev); border: var(--hairline); border-radius: var(--r-lg); box-shadow: var(--shadow-pop); + display: flex; flex-direction: column; gap: 1px; animation: menu-in var(--dur) var(--ease); } +.tv-actions .menu-pop { top: calc(100% + 6px); bottom: auto; } +@keyframes menu-in { from { opacity:0; transform: translateY(4px) scale(.98); } to { opacity:1; transform:none; } } +.menu-item { display: flex; align-items: center; gap: 8px; width: 100%; padding: 8px 10px; border: 0; + border-radius: var(--r-sm); background: none; color: var(--text); font-size: var(--fs-sub); + font-weight: 500; text-align: left; cursor: pointer; } +.menu-item:hover { background: var(--fill); } +.menu-item:focus-visible { background: var(--fill); outline: 2px solid var(--ring); outline-offset: -2px; } +.menu-item:disabled { opacity: .4; cursor: not-allowed; } +.menu-item.danger { color: var(--rec); } +.menu-sep { height: 0.5px; background: var(--line); margin: 4px 6px; } +.settings-pop { gap: var(--sp-3); padding: var(--sp-3); } +.settings-pop .field { display: flex; flex-direction: column; gap: 6px; } +.settings-pop .field > span { font-size: var(--fs-caption); color: var(--faint); } +.settings-pop .select { width: 100%; } +.toggle { display: flex; align-items: center; gap: var(--sp-2); font-size: var(--fs-sub); cursor: pointer; } +.toggle input { width: 16px; height: 16px; accent-color: var(--accent); cursor: pointer; } +.kbd-hint { margin: 0; font-size: var(--fs-caption); color: var(--faint); line-height: 1.4; } + +/* ---------- toast ---------- */ +.toast { position: fixed; left: 50%; bottom: max(78px, calc(72px + 
env(safe-area-inset-bottom))); + transform: translateX(-50%); background: var(--bg-elev); color: var(--text); border: var(--hairline); + padding: 10px 16px; border-radius: var(--r-lg); box-shadow: var(--shadow-pop); font-size: var(--fs-sub); + z-index: 70; animation: menu-in var(--dur) var(--ease); } + +.nav-toggle { display: none; position: fixed; top: 14px; left: 14px; z-index: 40; background: var(--bg-elev); + color: var(--text); border: var(--hairline); border-radius: var(--r); width: 40px; height: 40px; + font-size: 17px; cursor: pointer; } .hidden { display: none !important; } -.toast { position: fixed; bottom: 22px; left: 50%; transform: translateX(-50%); background: var(--bg-elev-2); color: var(--text); border: 1px solid var(--line); padding: 10px 18px; border-radius: 10px; box-shadow: var(--shadow); font-size: 13.5px; z-index: 50; animation: fade .2s ease; } -.nav-toggle { display: none; position: fixed; top: 14px; left: 14px; z-index: 40; background: var(--bg-elev); color: var(--text); border: 1px solid var(--line); border-radius: 10px; width: 40px; height: 40px; font-size: 18px; cursor: pointer; } /* ---------- responsive / mobile ---------- */ @media (max-width: 820px) { body { grid-template-columns: 1fr; } - .sidebar { position: fixed; z-index: 30; width: 280px; transform: translateX(-100%); transition: transform .22s ease; - padding-left: env(safe-area-inset-left); } + .sidebar { position: fixed; z-index: 30; width: 280px; transform: translateX(-100%); + transition: transform var(--dur-slow) var(--ease); padding-left: env(safe-area-inset-left); } body.nav-open .sidebar { transform: none; } .nav-toggle { display: grid; place-items: center; width: 44px; height: 44px; - top: max(14px, env(safe-area-inset-top)); left: max(14px, env(safe-area-inset-left)); } - /* safe-area + comfortable tap targets on phones */ - .main { padding: max(64px, calc(40px + env(safe-area-inset-top))) max(18px, env(safe-area-inset-left)) - calc(40px + env(safe-area-inset-bottom)) 
max(18px, env(safe-area-inset-right)); } - .turn { grid-template-columns: 52px 1fr; gap: 10px; } - .btn, .select, .ghost-btn, .session-del, .lg { min-height: 44px; } - .toast { bottom: max(22px, calc(16px + env(safe-area-inset-bottom))); } + top: max(14px, env(safe-area-inset-top)); left: max(14px, env(safe-area-inset-left)); } + .doc { padding-top: max(64px, calc(40px + env(safe-area-inset-top))); } + .btn, .select, .session, .menu-item, .rec-btn, .gear-btn { min-height: 44px; } + .select, .session-search { font-size: 16px; } /* prevents iOS zoom */ + .dock { flex-wrap: wrap; row-gap: var(--sp-2); padding-bottom: max(8px, env(safe-area-inset-bottom)); } + .rec-wave { flex-basis: 100%; order: 5; } /* wave drops to its own row so controls never overflow */ + .dock .select { max-width: 50vw; } } -/* honor the OS "reduce motion" setting — kill the looping pulses/levels */ @media (prefers-reduced-motion: reduce) { *, *::before, *::after { animation-duration: .001ms !important; animation-iteration-count: 1 !important; transition-duration: .001ms !important; } diff --git a/gui/static/sw.js b/gui/static/sw.js index c284b52..0e848ca 100644 --- a/gui/static/sw.js +++ b/gui/static/sw.js @@ -5,7 +5,7 @@ * stream. Bumping CACHE drops the old shell on activate. */ "use strict"; -const CACHE = "voxterm-shell-v1"; +const CACHE = "voxterm-shell-v2"; // bump on shell changes so the redesign isn't served stale const SHELL = [ "/", "/static/backend-remote.js", From 9ebb0faaf84db5b4eb7ce41c8f0bb25072b421bf Mon Sep 17 00:00:00 2001 From: NubsCarson <192162056+NubsCarson@users.noreply.github.com> Date: Fri, 5 Jun 2026 16:13:52 +0000 Subject: [PATCH 44/60] gui: player preload=metadata (show real duration on load) + headless e2e for the redesign -