Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

70 changes: 68 additions & 2 deletions apps/tray-ui/src/components/SettingsPane.test.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -240,15 +240,81 @@ describe('SettingsPane remote-sync health indicator', () => {
claimed_handle: 'U',
},
});
// Use a fresh timestamp so the staleness check (2× bulk interval)
// doesn't fire — the OK pill requires both a recent success AND
// a recent attempt.
const nowIso = new Date().toISOString();
const status = makeStatus(
makeSync({
last_success_at: '2026-05-21T12:00:00Z',
last_attempt_at: '2026-05-21T12:00:00Z',
last_success_at: nowIso,
last_attempt_at: nowIso,
}),
);
wrap(<SettingsPane config={cfg} onSave={vi.fn()} status={status} />);
expect(await screen.findByLabelText(/Sync settings/i)).toBeInTheDocument();
expect(within(getRemoteSyncHealth()).getByText(/^OK$/)).toBeInTheDocument();
});

it('shows STALE when the last attempt is older than 2× the bulk interval and there is no error', async () => {
  // Regression guard for the 2026-05-28 outage: with the workers
  // tripping the auth_lost guard, sync_stats stayed frozen on its last
  // green reading and the pill reported OK for 10+ hours with no
  // traffic. STALE is how that silence reaches the user.
  //
  // A 60s bulk interval gives a 120s staleness threshold, so a
  // timestamp ten minutes in the past is comfortably beyond it.
  const TEN_MINUTES_MS = 10 * 60 * 1000;
  const staleIso = new Date(Date.now() - TEN_MINUTES_MS).toISOString();

  const baseRemoteSync = makeConfig().remote_sync;
  const config = makeConfig({
    remote_sync: {
      ...baseRemoteSync,
      enabled: true,
      access_token: 'tok',
      claimed_handle: 'U',
      interval_secs: 60,
    },
  });
  const syncStats = makeSync({
    last_success_at: staleIso,
    last_attempt_at: staleIso,
  });

  wrap(
    <SettingsPane config={config} onSave={vi.fn()} status={makeStatus(syncStats)} />,
  );
  expect(await screen.findByLabelText(/Sync settings/i)).toBeInTheDocument();

  const healthPill = within(getRemoteSyncHealth());
  expect(healthPill.getByText(/^STALE$/)).toBeInTheDocument();
});

it('shows PAUSED when the sync-paused event fires', async () => {
  // Capture the handler the component registers for `sync-paused` so
  // the test can fire the event by hand.
  let pausedHandler: ((e: unknown) => void) | undefined;
  (listen as ReturnType<typeof vi.fn>).mockImplementation((event, handler) => {
    if (event === 'sync-paused') pausedHandler = handler;
    return Promise.resolve(() => undefined);
  });

  const cfg = makeConfig({
    remote_sync: {
      ...makeConfig().remote_sync,
      enabled: true,
      access_token: 'tok',
      claimed_handle: 'U',
    },
  });
  // One fresh timestamp for both fields so the pill starts at OK and
  // the staleness check cannot fire.
  const nowIso = new Date().toISOString();
  const status = makeStatus(
    makeSync({
      last_success_at: nowIso,
      last_attempt_at: nowIso,
    }),
  );
  wrap(<SettingsPane config={cfg} onSave={vi.fn()} status={status} />);
  // Wait for the pane to render, as the sibling tests do, before
  // asserting on the pill.
  expect(await screen.findByLabelText(/Sync settings/i)).toBeInTheDocument();

  // Before the event: OK (recent activity).
  expect(within(getRemoteSyncHealth()).getByText(/^OK$/)).toBeInTheDocument();

  // Fail here, with a clear cause, if the component never subscribed —
  // otherwise the optional call below would silently do nothing and
  // the test would fail later on a missing PAUSED pill.
  expect(pausedHandler).toBeDefined();
  await act(async () => {
    pausedHandler?.({});
  });

  // After the event: PAUSED takes precedence over OK.
  expect(within(getRemoteSyncHealth()).getByText(/^PAUSED$/)).toBeInTheDocument();
  expect(
    await screen.findByText(/server rejected this uplink's token/i),
  ).toBeInTheDocument();
});

it('shows ERR when sync.last_error is set', async () => {
Expand Down
87 changes: 82 additions & 5 deletions apps/tray-ui/src/components/SettingsPane.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -66,29 +66,64 @@ interface Props {
status?: StatusResponse | null;
}

type SyncHealth = 'ok' | 'err' | 'idle' | 'off';
type SyncHealth = 'ok' | 'err' | 'stale' | 'paused' | 'idle' | 'off';

/// Floor, in seconds, for the staleness threshold. The threshold is
/// normally 2× the bulk interval, measured from the last attempt, but
/// it is never allowed below this floor — protects against a
/// pathologically-low interval producing flapping pills.
const STALE_FLOOR_SECS = 60;

/// Derive the Remote sync card's health pill from the device's
/// pairing state, the per-config enabled flag, and the latest
/// `SyncStats` snapshot from the polling hook.
/// pairing state, the per-config enabled flag, the latest `SyncStats`
/// snapshot from the polling hook, the bulk interval (for staleness
/// calculation), and the `paused` flag (set when the sync worker
/// emits `sync-paused`).
///
/// OFF wins over ERR — if the user has disabled sync we don't want
/// to nag them with a stale failure from before they flipped it off.
/// Precedence: OFF > PAUSED > IDLE > ERR > STALE > OK.
/// - OFF wins over everything — user disabled sync, don't nag.
/// - PAUSED next — worker bailed (auth_lost); needs re-pair.
/// - IDLE when no stats yet — never polled.
/// - ERR when there's a recorded error — we have an explanation,
/// show it. ERR beats STALE: a known failure is more useful
/// information than "we don't know".
/// - STALE when last_attempt_at is older than 2× the bulk interval
/// AND there's no error. Catches "looks connected but isn't
/// shipping" silent-failure modes (2026-05-28 outage: workers
/// alive, auth_lost guard skipping drain, sync_stats frozen on
/// its last green reading, pill happily showed OK for 10+ hours).
/// - OK when a success has been recorded and none of the above apply.
function deriveSyncHealth(
  isPaired: boolean,
  enabled: boolean,
  sync: SyncStats | null | undefined,
  bulkIntervalSecs: number,
  paused: boolean,
): SyncHealth {
  // Checks run in precedence order: OFF > PAUSED > IDLE > ERR > STALE > OK.
  if (!isPaired || !enabled) return 'off';
  if (paused) return 'paused';
  if (!sync) return 'idle';
  if (sync.last_error) return 'err';

  if (sync.last_attempt_at) {
    // Guard against a non-numeric interval: `Math.max(NaN, floor)` is
    // NaN and every comparison against NaN is false, which would
    // silently switch the STALE check off. Fall back to the floor.
    const doubledIntervalSecs = bulkIntervalSecs * 2;
    const staleThresholdSecs = Number.isNaN(doubledIntervalSecs)
      ? STALE_FLOOR_SECS
      : Math.max(doubledIntervalSecs, STALE_FLOOR_SECS);

    const ageSecs = (Date.now() - Date.parse(sync.last_attempt_at)) / 1000;
    // An unparseable timestamp makes `ageSecs` NaN; that is treated as
    // "age unknown" rather than stale.
    if (Number.isFinite(ageSecs) && ageSecs > staleThresholdSecs) {
      return 'stale';
    }
  }

  // No error and not stale: OK needs at least one recorded success,
  // otherwise there is nothing to report yet.
  if (sync.last_success_at) return 'ok';
  return 'idle';
}

/// Upper-case text for each `SyncHealth` state. The SettingsPane tests
/// look these exact strings up inside the remote-sync health indicator
/// (e.g. `/^STALE$/`, `/^PAUSED$/`), so changing a value here means
/// updating those tests too.
const SYNC_HEALTH_LABEL: Record<SyncHealth, string> = {
ok: 'OK',
err: 'ERR',
stale: 'STALE',
paused: 'PAUSED',
idle: 'IDLE',
off: 'OFF',
};
Expand All @@ -99,13 +134,17 @@ const SYNC_HEALTH_TONE: Record<
> = {
ok: 'ok',
err: 'danger',
stale: 'warn',
paused: 'warn',
idle: 'dim',
off: 'dim',
};

/// CSS custom-property reference for each `SyncHealth` state. `stale`
/// and `paused` share the warn colour; `idle` and `off` share the dim
/// foreground colour.
const SYNC_HEALTH_COLOR: Record<SyncHealth, string> = {
ok: 'var(--ok)',
err: 'var(--danger)',
stale: 'var(--warn)',
paused: 'var(--warn)',
idle: 'var(--fg-dim)',
off: 'var(--fg-dim)',
};
Expand Down Expand Up @@ -164,6 +203,14 @@ export function SettingsPane({ config, onSave, status }: Props) {
} | null>(null);

const [revoked, setRevoked] = useState(false);
// `paused` mirrors a `sync-paused` emit from the Rust sync worker —
// fired when the worker exits its loop because `auth_lost` is set.
// The Settings pane uses it to drive the health pill to "PAUSED"
// and surface an explanatory notice. Cleared on save/re-pair, same
// shape as `revoked`. Surfaced 2026-05-28 after the silent
// auth_lost-loop outage; previously the health pill stayed "OK"
// forever because sync_stats never updated.
const [paused, setPaused] = useState(false);

// Unsaved-draft guard: tracks the remote config that arrived while
// the user had uncommitted edits, and the last-saved baseline used
Expand Down Expand Up @@ -247,6 +294,16 @@ export function SettingsPane({ config, onSave, status }: Props) {
};
}, []);

// Subscribe to `sync-paused` for the lifetime of the component and
// flip `paused` when it fires.
useEffect(() => {
  // `listen` hands back its unlisten function asynchronously, so the
  // cleanup below can run before that function exists. Track disposal
  // so a subscription that resolves late is torn down immediately
  // instead of leaking a listener that calls `setPaused` after unmount.
  let disposed = false;
  let unlisten: (() => void) | undefined;
  listen('sync-paused', () => setPaused(true)).then((unl) => {
    if (disposed) {
      unl();
      return;
    }
    unlisten = unl;
  });
  return () => {
    disposed = true;
    unlisten?.();
  };
}, []);

// The App owns the canonical Config and subscribes to the bulk-lane
// `config-changed` event there (so it fires regardless of which tab
// the user is on). When the parent replaces the config — whether
Expand Down Expand Up @@ -446,6 +503,8 @@ export function SettingsPane({ config, onSave, status }: Props) {
isPaired,
draft.remote_sync.enabled,
status?.sync,
draft.remote_sync.interval_secs,
paused,
);

return (
Expand Down Expand Up @@ -487,6 +546,7 @@ export function SettingsPane({ config, onSave, status }: Props) {
editDraft((prev) => ({ ...prev, sync_with_cloud: e.target.checked }));
if (e.target.checked) {
setRevoked(false);
setPaused(false);
}
}}
style={{ accentColor: 'var(--accent)' }}
Expand Down Expand Up @@ -519,6 +579,23 @@ export function SettingsPane({ config, onSave, status }: Props) {
Cloud sync is disabled for this uplink. Toggle Cloud sync back on to resume.
</p>
)}
{paused && (
<p
style={{
margin: '8px 0 0',
padding: '8px 10px',
background: 'var(--bg-2)',
border: '1px solid var(--warn)',
borderRadius: 'var(--r-sm)',
fontSize: 12,
color: 'var(--warn)',
}}
role="status"
>
Sync paused: the server rejected this uplink's token. Re-pair the
device (Connected Uplinks card below) to resume.
</p>
)}
</TrayCard>

<TrayCard
Expand Down
47 changes: 25 additions & 22 deletions crates/starstats-client/src/hangar.rs
Original file line number Diff line number Diff line change
Expand Up @@ -134,15 +134,8 @@ pub fn start(
// is gated. Local-only fetch + parse without an upstream
// push has no caller today.
if !account_status.lock().auth_lost {
if let Err(e) = refresh_once(
&client,
&api_url,
&access_token,
&secret,
&hangar_stats,
&account_status,
)
.await
if let Err(e) =
refresh_once(&client, &api_url, &access_token, &secret, &hangar_stats).await
{
tracing::warn!(error = %e, "hangar refresh failed");
let mut s = hangar_stats.lock();
Expand Down Expand Up @@ -185,7 +178,6 @@ async fn refresh_once(
access_token: &str,
secret: &SecretStore,
hangar_stats: &Mutex<HangarStats>,
account_status: &Mutex<AccountStatus>,
) -> Result<()> {
{
let mut s = hangar_stats.lock();
Expand Down Expand Up @@ -243,23 +235,34 @@ async fn refresh_once(
let status = resp.status();
if status == StatusCode::UNAUTHORIZED || status == StatusCode::FORBIDDEN {
// Device token rejected by our server (revoked / signature
// invalid / account deleted). Mirror the sync worker's behaviour:
// clear the persisted token, flip auth_lost so the health
// banner fires and other workers (sync, parser-submissions)
// pause too, then bail so the next loop iteration sleeps.
// Without this propagation, the hangar fetcher would re-try
// every cycle and the user would never see an auth_lost
// signal until a *separate* /v1/ingest 401 fired.
// invalid / account deleted). Record the error so the UI
// surfaces it, but do NOT clear the persisted token or flip
// global `auth_lost` — that's the sync worker's call to make.
//
// History: this used to call `clear_persisted_device_token()`
// + `account_status.lock().auth_lost = true`, on the theory
// that any 401 here implies the sync worker would also 401
// and we'd save a round-trip. But it conflated TWO different
// 401 surfaces:
// - sync /v1/ingest / /v1/auth/me — the canonical token-
// status oracle, which when it 401s should pause sync.
// - this hangar push — a peripheral feature whose 401 can
// race ahead of the sync worker's view of the world, e.g.
// when the sync worker has a fresh post-pair token in its
// captured locals but the hangar worker is still using the
// pre-pair config snapshot.
// The race shipped a real outage 2026-05-28: hangar 401 wiped
// the freshly-paired token + flipped auth_lost system-wide;
// sync workers then silently no-op'd the auth_lost guard for
// hours while the UI's health pill stayed green. Fix: leave
// it to the sync worker, which has the authoritative view.
let body = resp.text().await.unwrap_or_default();
tracing::warn!(
%status,
body = %body,
"hangar push: device token rejected — clearing and flipping auth_lost"
"hangar push: device token rejected — recording and bailing this cycle \
(sync worker handles tray-wide auth_lost)"
);
if let Err(e) = crate::sync::clear_persisted_device_token() {
tracing::warn!(error = %e, "failed to clear device token after hangar auth loss");
}
account_status.lock().auth_lost = true;
anyhow::bail!("hangar push failed: {status} (device token rejected)");
}
if !status.is_success() {
Expand Down
29 changes: 23 additions & 6 deletions crates/starstats-client/src/sync.rs
Original file line number Diff line number Diff line change
Expand Up @@ -298,12 +298,29 @@ fn spawn_lane(
) -> tauri::async_runtime::JoinHandle<()> {
tauri::async_runtime::spawn(async move {
loop {
// If a previous iteration tripped auth_lost, skip the
// drain entirely — the token has been cleared and we'd
// just re-trigger the same 401. Wait for the user to
// re-pair (which clears auth_lost and respawns this task).
let auth_ok = !account_status.lock().auth_lost;
if auth_ok {
// If a previous iteration tripped `auth_lost`, EXIT the
// worker entirely. Previously this loop kept spinning,
// skipping the drain on every tick — workers stayed alive
// but did no work, and `sync_stats` kept its last-known-
// good values, so the Settings health pill happily showed
// green for hours after sync had silently died. Surfaced
// 2026-05-28 in a tray "looks connected but isn't shipping"
// outage. Now: log once, emit `sync-paused` so the UI can
// show a banner, and break. The worker re-spawns on the
// next `respawn()` call (pair_device, save_config,
// set_sync_preset).
if account_status.lock().auth_lost {
tracing::warn!(
lane = lane.label(),
"sync worker exiting: auth_lost is set — waiting for re-pair"
);
if let Err(e) = app_handle.emit("sync-paused", "auth_lost") {
tracing::warn!(error = %e, "emit sync-paused failed");
}
break;
}
// Auth gate passed — proceed with the regular drain/heartbeat cycle.
{
let types_ref: Vec<&str> = priority_types.iter().map(|s| s.as_str()).collect();
if let Err(e) = drain_lane(
lane,
Expand Down
Loading