diff --git a/Cargo.lock b/Cargo.lock index 09e1307..8d439f4 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -6462,7 +6462,7 @@ checksum = "6ce2be8dc25455e1f91df71bfa12ad37d7af1092ae736f3a6cd0e37bc7810596" [[package]] name = "starstats-client" -version = "1.8.9" +version = "1.8.10" dependencies = [ "anyhow", "base64 0.22.1", diff --git a/apps/tray-ui/src/components/SettingsPane.test.tsx b/apps/tray-ui/src/components/SettingsPane.test.tsx index 6a17bd0..6a7fd29 100644 --- a/apps/tray-ui/src/components/SettingsPane.test.tsx +++ b/apps/tray-ui/src/components/SettingsPane.test.tsx @@ -240,15 +240,81 @@ describe('SettingsPane remote-sync health indicator', () => { claimed_handle: 'U', }, }); + // Use a fresh timestamp so the staleness check (2× bulk interval) + // doesn't fire — the OK pill requires both a recent success AND + // a recent attempt. + const nowIso = new Date().toISOString(); const status = makeStatus( makeSync({ - last_success_at: '2026-05-21T12:00:00Z', - last_attempt_at: '2026-05-21T12:00:00Z', + last_success_at: nowIso, + last_attempt_at: nowIso, + }), + ); + wrap(); + expect(await screen.findByLabelText(/Sync settings/i)).toBeInTheDocument(); + expect(within(getRemoteSyncHealth()).getByText(/^OK$/)).toBeInTheDocument(); + }); + + it('shows STALE when the last attempt is older than 2× the bulk interval and there is no error', async () => { + // Regression for the 2026-05-28 outage: workers tripping the + // auth_lost guard left sync_stats frozen on the last green + // reading. The pill happily showed OK for 10+ hours despite + // no traffic. STALE surfaces that silence to the user. + const cfg = makeConfig({ + remote_sync: { + ...makeConfig().remote_sync, + enabled: true, + access_token: 'tok', + claimed_handle: 'U', + interval_secs: 60, + }, + }); + // Bulk interval = 60s ⇒ threshold = 120s. Use 10 minutes ago. + const tenMinAgo = new Date(Date.now() - 10 * 60 * 1000).toISOString(); + const status = makeStatus( + makeSync({ + last_success_at: tenMinAgo, + last_attempt_at: tenMinAgo, }), ); wrap(); expect(await screen.findByLabelText(/Sync settings/i)).toBeInTheDocument(); + expect(within(getRemoteSyncHealth()).getByText(/^STALE$/)).toBeInTheDocument(); + }); + + it('shows PAUSED when the sync-paused event fires', async () => { + let pausedHandler: ((e: unknown) => void) | undefined; + (listen as ReturnType).mockImplementation((event, handler) => { + if (event === 'sync-paused') pausedHandler = handler; + return Promise.resolve(() => undefined); + }); + + const cfg = makeConfig({ + remote_sync: { + ...makeConfig().remote_sync, + enabled: true, + access_token: 'tok', + claimed_handle: 'U', + }, + }); + const status = makeStatus(makeSync({ + last_success_at: new Date().toISOString(), + last_attempt_at: new Date().toISOString(), + })); + wrap(); + + // Before the event: OK (recent activity). expect(within(getRemoteSyncHealth()).getByText(/^OK$/)).toBeInTheDocument(); + + await act(async () => { + pausedHandler?.({}); + }); + + // After the event: PAUSED takes precedence over OK. + expect(within(getRemoteSyncHealth()).getByText(/^PAUSED$/)).toBeInTheDocument(); + expect( + await screen.findByText(/server rejected this uplink's token/i), + ).toBeInTheDocument(); }); it('shows ERR when sync.last_error is set', async () => { diff --git a/apps/tray-ui/src/components/SettingsPane.tsx b/apps/tray-ui/src/components/SettingsPane.tsx index e20b6f0..aad9b34 100644 --- a/apps/tray-ui/src/components/SettingsPane.tsx +++ b/apps/tray-ui/src/components/SettingsPane.tsx @@ -66,22 +66,55 @@ interface Props { status?: StatusResponse | null; } -type SyncHealth = 'ok' | 'err' | 'idle' | 'off'; +type SyncHealth = 'ok' | 'err' | 'stale' | 'paused' | 'idle' | 'off'; + +/// Minimum staleness threshold in seconds when the worker has fired +/// at least once but hasn't attempted again recently. Set to 2× the +/// bulk interval, but never below this floor — protects against a +/// pathologically-low interval producing flapping pills. +const STALE_FLOOR_SECS = 60; /// Derive the Remote sync card's health pill from the device's -/// pairing state, the per-config enabled flag, and the latest -/// `SyncStats` snapshot from the polling hook. +/// pairing state, the per-config enabled flag, the latest `SyncStats` +/// snapshot from the polling hook, the bulk interval (for staleness +/// calculation), and the `paused` flag (set when the sync worker +/// emits `sync-paused`). /// -/// OFF wins over ERR — if the user has disabled sync we don't want -/// to nag them with a stale failure from before they flipped it off. +/// Precedence: OFF > PAUSED > IDLE > ERR > STALE > OK. +/// - OFF wins over everything — user disabled sync, don't nag. +/// - PAUSED next — worker bailed (auth_lost); needs re-pair. +/// - IDLE when no stats yet — never polled. +/// - ERR when there's a recorded error — we have an explanation, +/// show it. ERR beats STALE: a known failure is more useful +/// information than "we don't know". +/// - STALE when last_attempt_at is older than 2× the bulk interval +/// AND there's no error. Catches "looks connected but isn't +/// shipping" silent-failure modes (2026-05-28 outage: workers +/// alive, auth_lost guard skipping drain, sync_stats frozen on +/// its last green reading, pill happily showed OK for 10+ hours). +/// - OK only when we've had a recent success. function deriveSyncHealth( isPaired: boolean, enabled: boolean, sync: SyncStats | null | undefined, + bulkIntervalSecs: number, + paused: boolean, ): SyncHealth { if (!isPaired || !enabled) return 'off'; + if (paused) return 'paused'; if (!sync) return 'idle'; if (sync.last_error) return 'err'; + if (sync.last_attempt_at) { + const ageSecs = + (Date.now() - Date.parse(sync.last_attempt_at)) / 1000; + const staleThresholdSecs = Math.max( + bulkIntervalSecs * 2, + STALE_FLOOR_SECS, + ); + if (Number.isFinite(ageSecs) && ageSecs > staleThresholdSecs) { + return 'stale'; + } + } if (sync.last_success_at) return 'ok'; return 'idle'; } @@ -89,6 +122,8 @@ function deriveSyncHealth( const SYNC_HEALTH_LABEL: Record = { ok: 'OK', err: 'ERR', + stale: 'STALE', + paused: 'PAUSED', idle: 'IDLE', off: 'OFF', }; @@ -99,6 +134,8 @@ const SYNC_HEALTH_TONE: Record< > = { ok: 'ok', err: 'danger', + stale: 'warn', + paused: 'warn', idle: 'dim', off: 'dim', }; @@ -106,6 +143,8 @@ const SYNC_HEALTH_TONE: Record< const SYNC_HEALTH_COLOR: Record = { ok: 'var(--ok)', err: 'var(--danger)', + stale: 'var(--warn)', + paused: 'var(--warn)', idle: 'var(--fg-dim)', off: 'var(--fg-dim)', }; @@ -164,6 +203,14 @@ export function SettingsPane({ config, onSave, status }: Props) { } | null>(null); const [revoked, setRevoked] = useState(false); + // `paused` mirrors a `sync-paused` emit from the Rust sync worker — + // fired when the worker exits its loop because `auth_lost` is set. + // The Settings pane uses it to drive the health pill to "PAUSED" + // and surface an explanatory notice. Cleared on save/re-pair, same + // shape as `revoked`. Surfaced 2026-05-28 after the silent + // auth_lost-loop outage; previously the health pill stayed "OK" + // forever because sync_stats never updated. + const [paused, setPaused] = useState(false); // Unsaved-draft guard: tracks the remote config that arrived while // the user had uncommitted edits, and the last-saved baseline used @@ -247,6 +294,16 @@ export function SettingsPane({ config, onSave, status }: Props) { }; }, []); + useEffect(() => { + let cancel: (() => void) | undefined; + listen('sync-paused', () => setPaused(true)).then((unl) => { + cancel = unl; + }); + return () => { + cancel?.(); + }; + }, []); + // The App owns the canonical Config and subscribes to the bulk-lane // `config-changed` event there (so it fires regardless of which tab // the user is on). When the parent replaces the config — whether @@ -446,6 +503,8 @@ export function SettingsPane({ config, onSave, status }: Props) { isPaired, draft.remote_sync.enabled, status?.sync, + draft.remote_sync.interval_secs, + paused, ); return ( @@ -487,6 +546,7 @@ export function SettingsPane({ config, onSave, status }: Props) { editDraft((prev) => ({ ...prev, sync_with_cloud: e.target.checked })); if (e.target.checked) { setRevoked(false); + setPaused(false); } }} style={{ accentColor: 'var(--accent)' }} @@ -519,6 +579,23 @@ export function SettingsPane({ config, onSave, status }: Props) { Cloud sync is disabled for this uplink. Toggle Cloud sync back on to resume.

)} + {paused && ( +

+ Sync paused: the server rejected this uplink's token. Re-pair the + device (Connected Uplinks card below) to resume. +

+ )} , - account_status: &Mutex, ) -> Result<()> { { let mut s = hangar_stats.lock(); @@ -243,23 +235,34 @@ async fn refresh_once( let status = resp.status(); if status == StatusCode::UNAUTHORIZED || status == StatusCode::FORBIDDEN { // Device token rejected by our server (revoked / signature - // invalid / account deleted). Mirror the sync worker's behaviour: - // clear the persisted token, flip auth_lost so the health - // banner fires and other workers (sync, parser-submissions) - // pause too, then bail so the next loop iteration sleeps. - // Without this propagation, the hangar fetcher would re-try - // every cycle and the user would never see an auth_lost - // signal until a *separate* /v1/ingest 401 fired. + // invalid / account deleted). Record the error so the UI + // surfaces it, but do NOT clear the persisted token or flip + // global `auth_lost` — that's the sync worker's call to make. + // + // History: this used to call `clear_persisted_device_token()` + // + `account_status.lock().auth_lost = true`, on the theory + // that any 401 here implies the sync worker would also 401 + // and we'd save a round-trip. But it conflated TWO different + // 401 surfaces: + // - sync /v1/ingest / /v1/auth/me — the canonical token- + // status oracle, which when it 401s should pause sync. + // - this hangar push — a peripheral feature whose 401 can + // race ahead of the sync worker's view of the world, e.g. + // when the sync worker has a fresh post-pair token in its + // captured locals but the hangar worker is still using the + // pre-pair config snapshot. + // The race shipped a real outage 2026-05-28: hangar 401 wiped + // the freshly-paired token + flipped auth_lost system-wide; + // sync workers then silently no-op'd the auth_lost guard for + // hours while the UI's health pill stayed green. Fix: leave + // it to the sync worker, which has the authoritative view. let body = resp.text().await.unwrap_or_default(); tracing::warn!( %status, body = %body, - "hangar push: device token rejected — clearing and flipping auth_lost" + "hangar push: device token rejected — recording and bailing this cycle \ + (sync worker handles tray-wide auth_lost)" ); - if let Err(e) = crate::sync::clear_persisted_device_token() { - tracing::warn!(error = %e, "failed to clear device token after hangar auth loss"); - } - account_status.lock().auth_lost = true; anyhow::bail!("hangar push failed: {status} (device token rejected)"); } if !status.is_success() { diff --git a/crates/starstats-client/src/sync.rs b/crates/starstats-client/src/sync.rs index 5d0db01..4822067 100644 --- a/crates/starstats-client/src/sync.rs +++ b/crates/starstats-client/src/sync.rs @@ -298,12 +298,29 @@ fn spawn_lane( ) -> tauri::async_runtime::JoinHandle<()> { tauri::async_runtime::spawn(async move { loop { - // If a previous iteration tripped auth_lost, skip the - // drain entirely — the token has been cleared and we'd - // just re-trigger the same 401. Wait for the user to - // re-pair (which clears auth_lost and respawns this task). - let auth_ok = !account_status.lock().auth_lost; - if auth_ok { + // If a previous iteration tripped `auth_lost`, EXIT the + // worker entirely. Previously this loop kept spinning, + // skipping the drain on every tick — workers stayed alive + // but did no work, and `sync_stats` kept its last-known- + // good values, so the Settings health pill happily showed + // green for hours after sync had silently died. Surfaced + // 2026-05-28 in a tray "looks connected but isn't shipping" + // outage. Now: log once, emit `sync-paused` so the UI can + // show a banner, and break. The worker re-spawns on the + // next `respawn()` call (pair_device, save_config, + // set_sync_preset). + if account_status.lock().auth_lost { + tracing::warn!( + lane = lane.label(), + "sync worker exiting: auth_lost is set — waiting for re-pair" + ); + if let Err(e) = app_handle.emit("sync-paused", "auth_lost") { + tracing::warn!(error = %e, "emit sync-paused failed"); + } + break; + } + // Auth gate passed — proceed with the regular drain/heartbeat cycle. + { let types_ref: Vec<&str> = priority_types.iter().map(|s| s.as_str()).collect(); if let Err(e) = drain_lane( lane,