From a43aacb58e98c96fa59aabe46f73fc51e1ff5e24 Mon Sep 17 00:00:00 2001
From: Nigel Tatschner
Date: Thu, 28 May 2026 20:33:39 +0100
Subject: [PATCH] fix(tray): stop "looks connected but isn't shipping" silent
failure
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
A 10+ hour outage on 2026-05-28 surfaced three architectural bugs in
the tray's auth-loss handling. All three fed each other:
## Bug 1 — hangar worker unilaterally sabotages the sync worker
On a 401 from the hangar push (`POST /v1/me/hangar`), the hangar
worker called `clear_persisted_device_token()` AND set
`account_status.auth_lost = true` tray-wide. The intent was "if our
push 401s, the sync worker's calls will also 401, save a round-trip".
But the hangar push and sync drain are independent surfaces, and they
can race: the sync worker captures `api_url`/`access_token` at
respawn-time, so a fresh post-pair token can be live in the sync
worker's locals while the hangar worker is still using the pre-pair
config snapshot. The hangar 401 in that race wipes the just-paired
token AND flips global auth_lost — for everyone. Fix: hangar logs +
records `hangar_stats.last_error` and bails its own cycle. Tray-wide
auth_lost decisions stay with the sync worker's own 401 handlers.
## Bug 2 — sync worker with auth_lost loops silently forever
`sync.rs::spawn_lane` checked `if !auth_lost` and skipped the drain
on every tick when set. Workers stayed alive but did no work — no
HTTP, no `sync_stats` updates, no log lines. The user saw 10+ hours
of zero activity with no diagnostic signal. Fix: when the worker
sees `auth_lost`, it logs once, emits `sync-paused` so the UI can
react, and BREAKS out of the loop. The worker re-spawns on the next
`respawn()` (pair_device / save_config / set_sync_preset), so re-pair
recovers automatically.
## Bug 3 — health pill shows green forever on frozen sync_stats
The pill derived from `sync_stats.last_success_at` with no staleness
check, so a worker that died at noon would have its "last green
reading" displayed until the user restarted the app. Fix: extend
`deriveSyncHealth` with a staleness threshold (2× `bulk interval_secs`,
floored at 60s). Adds `STALE` and `PAUSED` variants. Precedence is
now OFF > PAUSED > IDLE > ERR > STALE > OK — ERR beats STALE because
a known error is more useful than "we don't know"; STALE catches
silent-failure modes like bug #2.
Plus a `sync-paused` listener in SettingsPane that surfaces a notice
("Sync paused: the server rejected this uplink's token. Re-pair the
device to resume.") and clears on toggle-back-on, mirroring the
existing `sync-revoked` notice plumbing.
## Tests
- 210 starstats-client cargo tests pass (no new tests — the sync.rs
change is a control-flow refactor exercised by the existing fixture
surface).
- 162 tray-ui vitest tests pass (was 160; added STALE and PAUSED
tests in SettingsPane.test.tsx). Existing OK test updated to use a
fresh timestamp so the new staleness check doesn't fire.
- `cargo fmt -p starstats-client --check` clean.
- `cargo clippy -p starstats-client --bin starstats-client --tests
-- -D warnings` clean.
---
Cargo.lock | 2 +-
.../src/components/SettingsPane.test.tsx | 70 ++++++++++++++-
apps/tray-ui/src/components/SettingsPane.tsx | 87 +++++++++++++++++--
crates/starstats-client/src/hangar.rs | 47 +++++-----
crates/starstats-client/src/sync.rs | 29 +++++--
5 files changed, 199 insertions(+), 36 deletions(-)
diff --git a/Cargo.lock b/Cargo.lock
index 09e1307b..8d439f48 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -6462,7 +6462,7 @@ checksum = "6ce2be8dc25455e1f91df71bfa12ad37d7af1092ae736f3a6cd0e37bc7810596"
[[package]]
name = "starstats-client"
-version = "1.8.9"
+version = "1.8.10"
dependencies = [
"anyhow",
"base64 0.22.1",
diff --git a/apps/tray-ui/src/components/SettingsPane.test.tsx b/apps/tray-ui/src/components/SettingsPane.test.tsx
index 6a17bd07..6a7fd292 100644
--- a/apps/tray-ui/src/components/SettingsPane.test.tsx
+++ b/apps/tray-ui/src/components/SettingsPane.test.tsx
@@ -240,15 +240,81 @@ describe('SettingsPane remote-sync health indicator', () => {
claimed_handle: 'U',
},
});
+ // Use a fresh timestamp so the staleness check (2× bulk interval)
+ // doesn't fire — the OK pill requires both a recent success AND
+ // a recent attempt.
+ const nowIso = new Date().toISOString();
const status = makeStatus(
makeSync({
- last_success_at: '2026-05-21T12:00:00Z',
- last_attempt_at: '2026-05-21T12:00:00Z',
+ last_success_at: nowIso,
+ last_attempt_at: nowIso,
+ }),
+ );
+ wrap();
+ expect(await screen.findByLabelText(/Sync settings/i)).toBeInTheDocument();
+ expect(within(getRemoteSyncHealth()).getByText(/^OK$/)).toBeInTheDocument();
+ });
+
+ it('shows STALE when the last attempt is older than 2× the bulk interval and there is no error', async () => {
+ // Regression for the 2026-05-28 outage: workers tripping the
+ // auth_lost guard left sync_stats frozen on the last green
+ // reading. The pill happily showed OK for 10+ hours despite
+ // no traffic. STALE surfaces that silence to the user.
+ const cfg = makeConfig({
+ remote_sync: {
+ ...makeConfig().remote_sync,
+ enabled: true,
+ access_token: 'tok',
+ claimed_handle: 'U',
+ interval_secs: 60,
+ },
+ });
+ // Bulk interval = 60s ⇒ threshold = 120s. Use 10 minutes ago.
+ const tenMinAgo = new Date(Date.now() - 10 * 60 * 1000).toISOString();
+ const status = makeStatus(
+ makeSync({
+ last_success_at: tenMinAgo,
+ last_attempt_at: tenMinAgo,
}),
);
wrap();
expect(await screen.findByLabelText(/Sync settings/i)).toBeInTheDocument();
+ expect(within(getRemoteSyncHealth()).getByText(/^STALE$/)).toBeInTheDocument();
+ });
+
+ it('shows PAUSED when the sync-paused event fires', async () => {
+ let pausedHandler: ((e: unknown) => void) | undefined;
+ (listen as ReturnType).mockImplementation((event, handler) => {
+ if (event === 'sync-paused') pausedHandler = handler;
+ return Promise.resolve(() => undefined);
+ });
+
+ const cfg = makeConfig({
+ remote_sync: {
+ ...makeConfig().remote_sync,
+ enabled: true,
+ access_token: 'tok',
+ claimed_handle: 'U',
+ },
+ });
+ const status = makeStatus(makeSync({
+ last_success_at: new Date().toISOString(),
+ last_attempt_at: new Date().toISOString(),
+ }));
+ wrap();
+
+ // Before the event: OK (recent activity).
expect(within(getRemoteSyncHealth()).getByText(/^OK$/)).toBeInTheDocument();
+
+ await act(async () => {
+ pausedHandler?.({});
+ });
+
+ // After the event: PAUSED takes precedence over OK.
+ expect(within(getRemoteSyncHealth()).getByText(/^PAUSED$/)).toBeInTheDocument();
+ expect(
+ await screen.findByText(/server rejected this uplink's token/i),
+ ).toBeInTheDocument();
});
it('shows ERR when sync.last_error is set', async () => {
diff --git a/apps/tray-ui/src/components/SettingsPane.tsx b/apps/tray-ui/src/components/SettingsPane.tsx
index e20b6f04..aad9b348 100644
--- a/apps/tray-ui/src/components/SettingsPane.tsx
+++ b/apps/tray-ui/src/components/SettingsPane.tsx
@@ -66,22 +66,55 @@ interface Props {
status?: StatusResponse | null;
}
-type SyncHealth = 'ok' | 'err' | 'idle' | 'off';
+type SyncHealth = 'ok' | 'err' | 'stale' | 'paused' | 'idle' | 'off';
+
+/// Minimum staleness threshold in seconds when the worker has fired
+/// at least once but hasn't attempted again recently. Set to 2× the
+/// bulk interval, but never below this floor — protects against a
+/// pathologically-low interval producing flapping pills.
+const STALE_FLOOR_SECS = 60;
/// Derive the Remote sync card's health pill from the device's
-/// pairing state, the per-config enabled flag, and the latest
-/// `SyncStats` snapshot from the polling hook.
+/// pairing state, the per-config enabled flag, the latest `SyncStats`
+/// snapshot from the polling hook, the bulk interval (for staleness
+/// calculation), and the `paused` flag (set when the sync worker
+/// emits `sync-paused`).
///
-/// OFF wins over ERR — if the user has disabled sync we don't want
-/// to nag them with a stale failure from before they flipped it off.
+/// Precedence: OFF > PAUSED > IDLE > ERR > STALE > OK.
+/// - OFF wins over everything — user disabled sync, don't nag.
+/// - PAUSED next — worker bailed (auth_lost); needs re-pair.
+/// - IDLE when no stats yet — never polled.
+/// - ERR when there's a recorded error — we have an explanation,
+/// show it. ERR beats STALE: a known failure is more useful
+/// information than "we don't know".
+/// - STALE when last_attempt_at is older than 2× the bulk interval
+/// AND there's no error. Catches "looks connected but isn't
+/// shipping" silent-failure modes (2026-05-28 outage: workers
+/// alive, auth_lost guard skipping drain, sync_stats frozen on
+/// its last green reading, pill happily showed OK for 10+ hours).
+/// - OK only when we've had a recent success.
function deriveSyncHealth(
isPaired: boolean,
enabled: boolean,
sync: SyncStats | null | undefined,
+ bulkIntervalSecs: number,
+ paused: boolean,
): SyncHealth {
if (!isPaired || !enabled) return 'off';
+ if (paused) return 'paused';
if (!sync) return 'idle';
if (sync.last_error) return 'err';
+ if (sync.last_attempt_at) {
+ const ageSecs =
+ (Date.now() - Date.parse(sync.last_attempt_at)) / 1000;
+ const staleThresholdSecs = Math.max(
+ bulkIntervalSecs * 2,
+ STALE_FLOOR_SECS,
+ );
+ if (Number.isFinite(ageSecs) && ageSecs > staleThresholdSecs) {
+ return 'stale';
+ }
+ }
if (sync.last_success_at) return 'ok';
return 'idle';
}
@@ -89,6 +122,8 @@ function deriveSyncHealth(
const SYNC_HEALTH_LABEL: Record = {
ok: 'OK',
err: 'ERR',
+ stale: 'STALE',
+ paused: 'PAUSED',
idle: 'IDLE',
off: 'OFF',
};
@@ -99,6 +134,8 @@ const SYNC_HEALTH_TONE: Record<
> = {
ok: 'ok',
err: 'danger',
+ stale: 'warn',
+ paused: 'warn',
idle: 'dim',
off: 'dim',
};
@@ -106,6 +143,8 @@ const SYNC_HEALTH_TONE: Record<
const SYNC_HEALTH_COLOR: Record = {
ok: 'var(--ok)',
err: 'var(--danger)',
+ stale: 'var(--warn)',
+ paused: 'var(--warn)',
idle: 'var(--fg-dim)',
off: 'var(--fg-dim)',
};
@@ -164,6 +203,14 @@ export function SettingsPane({ config, onSave, status }: Props) {
} | null>(null);
const [revoked, setRevoked] = useState(false);
+ // `paused` mirrors a `sync-paused` emit from the Rust sync worker —
+ // fired when the worker exits its loop because `auth_lost` is set.
+ // The Settings pane uses it to drive the health pill to "PAUSED"
+ // and surface an explanatory notice. Cleared on save/re-pair, same
+ // shape as `revoked`. Surfaced 2026-05-28 after the silent
+ // auth_lost-loop outage; previously the health pill stayed "OK"
+ // forever because sync_stats never updated.
+ const [paused, setPaused] = useState(false);
// Unsaved-draft guard: tracks the remote config that arrived while
// the user had uncommitted edits, and the last-saved baseline used
@@ -247,6 +294,16 @@ export function SettingsPane({ config, onSave, status }: Props) {
};
}, []);
+ useEffect(() => {
+ let cancel: (() => void) | undefined;
+ listen('sync-paused', () => setPaused(true)).then((unl) => {
+ cancel = unl;
+ });
+ return () => {
+ cancel?.();
+ };
+ }, []);
+
// The App owns the canonical Config and subscribes to the bulk-lane
// `config-changed` event there (so it fires regardless of which tab
// the user is on). When the parent replaces the config — whether
@@ -446,6 +503,8 @@ export function SettingsPane({ config, onSave, status }: Props) {
isPaired,
draft.remote_sync.enabled,
status?.sync,
+ draft.remote_sync.interval_secs,
+ paused,
);
return (
@@ -487,6 +546,7 @@ export function SettingsPane({ config, onSave, status }: Props) {
editDraft((prev) => ({ ...prev, sync_with_cloud: e.target.checked }));
if (e.target.checked) {
setRevoked(false);
+ setPaused(false);
}
}}
style={{ accentColor: 'var(--accent)' }}
@@ -519,6 +579,23 @@ export function SettingsPane({ config, onSave, status }: Props) {
Cloud sync is disabled for this uplink. Toggle Cloud sync back on to resume.
)}
+ {paused && (
+
+ Sync paused: the server rejected this uplink's token. Re-pair the
+ device (Connected Uplinks card below) to resume.
+
+ )}
,
- account_status: &Mutex,
) -> Result<()> {
{
let mut s = hangar_stats.lock();
@@ -243,23 +235,34 @@ async fn refresh_once(
let status = resp.status();
if status == StatusCode::UNAUTHORIZED || status == StatusCode::FORBIDDEN {
// Device token rejected by our server (revoked / signature
- // invalid / account deleted). Mirror the sync worker's behaviour:
- // clear the persisted token, flip auth_lost so the health
- // banner fires and other workers (sync, parser-submissions)
- // pause too, then bail so the next loop iteration sleeps.
- // Without this propagation, the hangar fetcher would re-try
- // every cycle and the user would never see an auth_lost
- // signal until a *separate* /v1/ingest 401 fired.
+ // invalid / account deleted). Record the error so the UI
+ // surfaces it, but do NOT clear the persisted token or flip
+ // global `auth_lost` — that's the sync worker's call to make.
+ //
+ // History: this used to call `clear_persisted_device_token()`
+ // + `account_status.lock().auth_lost = true`, on the theory
+ // that any 401 here implies the sync worker would also 401
+ // and we'd save a round-trip. But it conflated TWO different
+ // 401 surfaces:
+ // - sync /v1/ingest / /v1/auth/me — the canonical token-
+ // status oracle, which when it 401s should pause sync.
+ // - this hangar push — a peripheral feature whose 401 can
+ // race ahead of the sync worker's view of the world, e.g.
+ // when the sync worker has a fresh post-pair token in its
+ // captured locals but the hangar worker is still using the
+ // pre-pair config snapshot.
+ // The race shipped a real outage 2026-05-28: hangar 401 wiped
+ // the freshly-paired token + flipped auth_lost system-wide;
+ // sync workers then silently no-op'd the auth_lost guard for
+ // hours while the UI's health pill stayed green. Fix: leave
+ // it to the sync worker, which has the authoritative view.
let body = resp.text().await.unwrap_or_default();
tracing::warn!(
%status,
body = %body,
- "hangar push: device token rejected — clearing and flipping auth_lost"
+ "hangar push: device token rejected — recording and bailing this cycle \
+ (sync worker handles tray-wide auth_lost)"
);
- if let Err(e) = crate::sync::clear_persisted_device_token() {
- tracing::warn!(error = %e, "failed to clear device token after hangar auth loss");
- }
- account_status.lock().auth_lost = true;
anyhow::bail!("hangar push failed: {status} (device token rejected)");
}
if !status.is_success() {
diff --git a/crates/starstats-client/src/sync.rs b/crates/starstats-client/src/sync.rs
index 5d0db01a..48220678 100644
--- a/crates/starstats-client/src/sync.rs
+++ b/crates/starstats-client/src/sync.rs
@@ -298,12 +298,29 @@ fn spawn_lane(
) -> tauri::async_runtime::JoinHandle<()> {
tauri::async_runtime::spawn(async move {
loop {
- // If a previous iteration tripped auth_lost, skip the
- // drain entirely — the token has been cleared and we'd
- // just re-trigger the same 401. Wait for the user to
- // re-pair (which clears auth_lost and respawns this task).
- let auth_ok = !account_status.lock().auth_lost;
- if auth_ok {
+ // If a previous iteration tripped `auth_lost`, EXIT the
+ // worker entirely. Previously this loop kept spinning,
+ // skipping the drain on every tick — workers stayed alive
+ // but did no work, and `sync_stats` kept its last-known-
+ // good values, so the Settings health pill happily showed
+ // green for hours after sync had silently died. Surfaced
+ // 2026-05-28 in a tray "looks connected but isn't shipping"
+ // outage. Now: log once, emit `sync-paused` so the UI can
+ // show a banner, and break. The worker re-spawns on the
+ // next `respawn()` call (pair_device, save_config,
+ // set_sync_preset).
+ if account_status.lock().auth_lost {
+ tracing::warn!(
+ lane = lane.label(),
+ "sync worker exiting: auth_lost is set — waiting for re-pair"
+ );
+ if let Err(e) = app_handle.emit("sync-paused", "auth_lost") {
+ tracing::warn!(error = %e, "emit sync-paused failed");
+ }
+ break;
+ }
+ // Auth gate passed — proceed with the regular drain/heartbeat cycle.
+ {
let types_ref: Vec<&str> = priority_types.iter().map(|s| s.as_str()).collect();
if let Err(e) = drain_lane(
lane,