From 7b83b2888d240b0020b7b75027c0ffae9f38ac15 Mon Sep 17 00:00:00 2001 From: ZingerLittleBee <6970999@gmail.com> Date: Thu, 16 Apr 2026 20:00:17 +0800 Subject: [PATCH 01/60] docs: add agent recovery merge design --- .../2026-04-16-agent-recovery-merge-design.md | 587 ++++++++++++++++++ 1 file changed, 587 insertions(+) create mode 100644 docs/superpowers/specs/2026-04-16-agent-recovery-merge-design.md diff --git a/docs/superpowers/specs/2026-04-16-agent-recovery-merge-design.md b/docs/superpowers/specs/2026-04-16-agent-recovery-merge-design.md new file mode 100644 index 00000000..e734a434 --- /dev/null +++ b/docs/superpowers/specs/2026-04-16-agent-recovery-merge-design.md @@ -0,0 +1,587 @@ +# Agent Recovery Merge Design + +**Date:** 2026-04-16 +**Status:** Draft +**Scope:** Admin-driven recovery of a reinstalled agent by merging a new temporary server record back into the original offline server record + +## Problem + +The current registration model can reuse a server row only when the agent still has a valid token or when the machine fingerprint remains stable. After a true system reinstall, the old token is often gone and the machine fingerprint may change, so the server creates a new temporary server record instead of reconnecting to the original one. + +That creates two operational problems: + +1. The original server record keeps the historical charts, alerts, tasks, billing metadata, and dashboard references, but it stays offline. +2. The newly registered server record becomes the live agent identity, but it starts with little or no history and is not the record users want to keep. + +The desired recovery flow is: + +- The admin starts from the old offline server record. +- The admin picks a newly registered online temporary server record. +- The system rebinds the live agent to the old server identity. +- The system merges the temporary server's history into the old server record. +- Overlapping time ranges prefer the temporary server's data. +- The temporary server record is deleted after recovery completes. + +This is a targeted recovery flow only. It is not a general-purpose "merge any two servers" feature. + +## Goals + +- Preserve the original `server_id` as the long-term identity. +- Restore the live agent onto the original server record without requiring manual input on the agent. +- Merge historical data from the temporary server into the original server. +- Treat overlapping time ranges as `source wins`. +- Keep user-managed server configuration on the original record. +- Replace runtime system fields on the original record with the recovered agent's latest values. +- Automatically remove the temporary server record after successful recovery. +- Make the workflow explicit, auditable, and retryable. + +## Non-Goals + +- No attempt to fill monitoring gaps during the reinstall window. +- No support for arbitrary record-to-record merge in v1. +- No attempt to reverse the full workflow after the recovered agent has successfully rebound. +- No new permanent "installation identity" entity in v1. +- No merge behavior for data that is not keyed by `server_id` and is not semantically tied to one server record, such as `service_monitor_record`. + +## User Workflow + +### Entry Point + +The recovery action appears only on a server detail page for a server that is currently offline. + +Button label: + +- `claim and merge new agent` + +The action is admin-only. + +### Candidate Selection + +The action opens a dialog showing candidate temporary server records. 
Candidates must satisfy all of the following: + +- currently online +- not equal to the target server +- not already participating in another recovery job +- still plausibly a temporary auto-registered record + +Candidate ranking is recommendation-only. The admin must still explicitly confirm the selected source. + +Recommended ranking signals: + +- same or similar `last_remote_addr` +- matching `cpu_arch` +- matching `os` +- matching `virtualization` +- close `agent_version` +- close `created_at` +- `target` went offline before `source` was created +- matching `mem_total` +- matching `disk_total` +- matching `cpu_cores` +- matching `country_code` and `region` + +The dialog should show a short explanation for why a candidate was recommended. + +### Confirmation + +Before execution, the dialog shows a summary: + +- keep the old server record +- move the live agent identity onto the old server +- merge history from the temporary record +- when timestamps overlap, the temporary record wins +- delete the temporary record after success + +### Result States + +- On success: the original server becomes online again and the temporary server disappears. +- On failure before rebind: the temporary server remains unchanged and the admin can retry. +- On failure after rebind but before cleanup: the original server remains the live identity, the temporary server remains present, and the admin can retry completion. + +## Terminology + +- `target`: the original offline server record that will be kept +- `source`: the newly registered online temporary server record that will be absorbed and deleted + +## Architecture + +The recovery feature is implemented as a staged server-side recovery merge job. + +High-level flow: + +1. Validate `target` and `source`. +2. Rebind the live agent from `source` identity to `target` identity. +3. Wait for the agent to reconnect as `target`. +4. Freeze writes for both `target` and `source`. +5. Merge `source` history into `target`. +6. Update runtime fields on `target`. +7. Delete `source`. +8. Unfreeze writes and mark the job complete. + +The key design choice is to split "future writes go to the right identity" from "past writes are merged." The system must not start deleting or migrating `source` history until the agent has actually rebound onto `target`. + +## Components + +### 1. Recovery Merge Job Tracker + +Add a server-local tracker for recovery jobs, similar in spirit to the upgrade job tracker. + +Tracked fields: + +- `job_id` +- `target_server_id` +- `source_server_id` +- `status` +- `stage` +- `started_at` +- `updated_at` +- `error` +- per-stage checkpoint metadata + +Suggested stages: + +- `validating` +- `rebinding` +- `awaiting_target_online` +- `freezing_writes` +- `merging_history` +- `finalizing` +- `succeeded` +- `failed` + +The tracker provides: + +- protection against concurrent recovery jobs involving the same server +- visible progress for the frontend +- a retry boundary after partial completion + +### 2. Agent Rebind Protocol + +Add a dedicated protocol message that instructs a connected agent currently identified as `source` to switch to `target`. + +New server-to-agent message: + +- `ServerMessage::RebindIdentity { job_id, target_server_id, token }` + +New agent-to-server messages: + +- `AgentMessage::RebindIdentityAck { job_id }` +- `AgentMessage::RebindIdentityFailed { job_id, error }` + +Agent behavior: + +1. Receive `RebindIdentity`. +2. Persist the new token locally. +3. Acknowledge success or failure. +4. Disconnect. +5. 
Reconnect using the new token, which now authenticates as `target`. + +The target server row receives a newly generated token. The source row keeps its existing token until final cleanup so that failure before the rebind is easy to reason about. + +### 3. Write Freeze Guard + +The system needs an explicit in-memory recovery lock for `target` and `source` during merge. + +Reason: + +- `records` are persisted asynchronously by `record_writer` +- `ping_records`, `task_results`, and `network_probe_record` are persisted directly from the WebSocket handler +- `traffic_hourly` and `traffic_state` are updated continuously + +Without a write freeze, merge results could be invalidated by concurrent writes after the merge has already decided which side wins. + +The guard should: + +- block or drop writes for both `target` and `source` during `freezing_writes`, `merging_history`, and `finalizing` +- make the skip explicit in logs +- be lifted immediately after the job completes or fails + +This intentionally allows a small monitoring gap during the merge window. That is acceptable because gap filling is out of scope and already accepted by the product requirements. + +## Data Model Semantics + +### Canonical Identity + +The final canonical identity is always `target.server_id`. + +After the recovery: + +- all future agent writes use `target.server_id` +- all kept history belongs to `target.server_id` +- `source.server_id` no longer exists + +### Server Row Field Policy + +On `servers(target)`, keep the original user-managed fields: + +- `name` +- `group_id` +- `weight` +- `hidden` +- `remark` +- `public_remark` +- `price` +- `billing_cycle` +- `currency` +- `expired_at` +- `traffic_limit` +- `traffic_limit_type` +- `billing_start_day` +- `capabilities` + +On `servers(target)`, replace runtime fields from `source`: + +- `cpu_name` +- `cpu_cores` +- `cpu_arch` +- `os` +- `kernel_version` +- `mem_total` +- `swap_total` +- `disk_total` +- `ipv4` +- `ipv6` +- `region` +- `country_code` +- `virtualization` +- `agent_version` +- `protocol_version` +- `features` +- `last_remote_addr` +- `fingerprint` + +`server_tags` remain those of `target`. + +## History Merge Rules + +The merge logic is table-specific. + +### Category A: Keep Target Configuration, Drop Source Configuration + +These tables or fields are treated as target-owned configuration and are not merged from source: + +- `servers` user-managed fields listed above +- `server_tag` +- `network_probe_config` +- any `server_ids_json` references already pointing at `target` + +Source-owned values in this category are discarded when `source` is deleted. + +### Category B: Raw Time-Series Tables + +For raw tables without a natural uniqueness key, merge by replacing the target's overlapping time window with source data. + +Algorithm per table: + +1. Read the source time range: `min_ts` and `max_ts`. +2. Delete target rows whose timestamps fall in `[min_ts, max_ts]`. +3. Rewrite all source rows to `target.server_id`. +4. Delete the original source rows if they were not already moved by update. + +This gives exact `source wins` behavior over the source's active time window. 
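As a non-normative sketch, the window replacement for one raw table could look like the following, assuming SQLite accessed through sea-orm; the helper name and error handling are illustrative, not part of the current codebase:

```rust
use sea_orm::{ConnectionTrait, DatabaseConnection, DbBackend, DbErr, Statement};

// Window replacement for `records`; the other raw tables follow the same
// shape with their own time columns.
async fn replace_overlap_window(
    db: &DatabaseConnection,
    target_id: &str,
    source_id: &str,
) -> Result<(), DbErr> {
    // Step 2: delete target rows inside the source's [min_ts, max_ts] window.
    db.execute(Statement::from_sql_and_values(
        DbBackend::Sqlite,
        "DELETE FROM records \
         WHERE server_id = ? \
           AND time >= (SELECT MIN(time) FROM records WHERE server_id = ?) \
           AND time <= (SELECT MAX(time) FROM records WHERE server_id = ?)",
        [target_id.into(), source_id.into(), source_id.into()],
    ))
    .await?;
    // Step 3: rewrite the surviving source rows onto the target identity.
    db.execute(Statement::from_sql_and_values(
        DbBackend::Sqlite,
        "UPDATE records SET server_id = ? WHERE server_id = ?",
        [target_id.into(), source_id.into()],
    ))
    .await?;
    Ok(())
}
```

If the source table is empty, the `MIN`/`MAX` subqueries return `NULL`, the delete matches nothing, and the update is a no-op, so the sketch degrades safely.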
+ +Apply this policy to: + +- `records` +- `gpu_record` +- `ping_record` +- `task_result` +- `network_probe_record` +- `docker_event` + +Field-specific time keys: + +- `records.time` +- `gpu_record.time` +- `ping_record.time` +- `task_result.finished_at` +- `network_probe_record.timestamp` +- `docker_event.timestamp` + +Notes: + +- `task_result` overlap uses `finished_at`; no attempt is made to semantically deduplicate by command. +- `docker_event` overlap uses event timestamp and still follows `source wins`. + +### Category C: Aggregated or Unique-Key Tables + +For tables with a uniqueness key or a natural aggregate bucket, merge by key with strict `source wins`. + +Algorithm per table: + +1. For each source row, compute the target conflict key. +2. Delete any target row with the same key. +3. Rewrite the source row to `target.server_id`. + +Apply this policy to: + +- `records_hourly` with key `(server_id, time)` +- `network_probe_record_hourly` with key `(server_id, target_id, hour)` +- `traffic_hourly` with key `(server_id, hour)` +- `traffic_daily` with key `(server_id, date)` +- `uptime_daily` with key `(server_id, date)` +- `alert_state` with key `(rule_id, server_id)` +- `traffic_state` with key `server_id` + +Special notes: + +- `alert_state`: when both sides exist for the same `(rule_id, server_id)` pair, keep the source row and delete the target row. +- `traffic_state`: always take the source row because it is the live baseline for future traffic deltas. + +### Category D: Not Merged + +Do not merge: + +- `service_monitor_record` + +Reason: + +- It is keyed by `monitor_id`, not `server_id`. +- It does not represent per-server ownership in the way the recovery feature needs. + +## Recovery Job Flow + +### Stage 1: Validating + +Checks: + +- `target` exists +- `source` exists +- `target` is offline +- `source` is online +- neither record is already in another recovery job +- `source` still looks like a temporary auto-registered node + +If any check fails, the job fails without side effects. + +### Stage 2: Rebinding + +1. Generate a new token for `target`. +2. Persist the new token hash and prefix on `target`. +3. Send `RebindIdentity` to the currently connected `source` agent. +4. Wait for `RebindIdentityAck`. + +If the agent reports failure, the job fails here and no history is merged. + +### Stage 3: Awaiting Target Online + +Wait for the recovered agent to reconnect as `target`. + +Success condition: + +- `target` becomes the current online connection + +Failure condition: + +- timeout + +Timeout does not roll back to the old identity. The job simply fails before merge and keeps `source` untouched. + +### Stage 4: Freezing Writes + +Enable recovery locks for both `target` and `source`. + +This must happen only after `target` is already online under the recovered identity, because the freeze may cause some writes to be skipped. + +### Stage 5: Merging History + +Execute the table-group merge in bounded transactions. + +Recommended groups: + +- group 1: `records`, `gpu_record`, `docker_event` +- group 2: `records_hourly`, `uptime_daily`, `traffic_hourly`, `traffic_daily`, `traffic_state` +- group 3: `ping_record`, `task_result`, `network_probe_record`, `network_probe_record_hourly` +- group 4: `alert_state` + +Each group: + +- runs in its own DB transaction +- records a completed checkpoint before the next group starts + +### Stage 6: Finalizing + +1. Update `servers(target)` runtime fields from `source`. +2. Delete remaining source-owned rows that were not already moved. +3. 
Delete the `source` server row. +4. Clear job locks. +5. Write audit log entries. + +### Stage 7: Terminal State + +- `succeeded` +- `failed` + +## Failure Handling + +### Failure Before Target Rebind Succeeds + +If the job fails before `target` reconnects: + +- do not merge history +- do not delete source +- do not freeze writes +- mark the job failed + +This keeps retry semantics simple. + +### Failure After Target Rebind Succeeds + +If the job fails after `target` is already online: + +- keep `target` as the live identity +- keep `source` present +- keep job checkpoints +- allow retry from the first incomplete merge stage + +The system does not attempt a full rollback after the live identity has already switched. That would be more fragile than completing the merge forward. + +### Failure During Final Cleanup + +If all history has been merged but deleting `source` fails: + +- leave the source row present +- mark the job failed in `finalizing` +- allow a retry that only runs the remaining cleanup steps + +## Transaction Strategy + +Do not use one global transaction for the entire recovery flow. + +Reasons: + +- the workflow includes WebSocket disconnect and reconnect +- SQLite lock duration would be too large +- a late failure would waste all merge work + +Instead: + +- use short transactions for validation-side DB writes +- use no transaction during the async rebind wait +- use one transaction per merge table group +- use one short transaction for final cleanup + +This provides clear checkpoints and safe retries. + +## API and UI + +### API + +Suggested endpoints: + +- `GET /api/servers/{target_id}/recovery-candidates` +- `POST /api/servers/{target_id}/recover-merge` +- `GET /api/servers/recovery-jobs/{job_id}` + +`POST /recover-merge` request body: + +```json +{ + "source_server_id": "..." +} +``` + +Response: + +```json +{ + "data": { + "job_id": "...", + "status": "running", + "stage": "validating" + } +} +``` + +### UI + +On the target server detail page: + +- admin-only button: `claim and merge new agent` +- candidate list dialog with match explanations +- confirmation dialog with irreversible-effect summary + +During execution: + +- show recovery stage on the target page +- show source as `recovery in progress` + +On success: + +- refresh both list and detail views +- target remains +- source disappears + +On failure: + +- show stage-specific error +- offer retry + +## Audit Logging + +Write explicit audit entries for: + +- recovery started +- source selected +- rebind succeeded or failed +- merge succeeded or failed +- source deleted + +Recommended detail payload: + +- `job_id` +- `target_server_id` +- `source_server_id` +- `stage` +- `error` + +## Testing Strategy + +### Backend Integration Tests + +Must cover: + +1. successful end-to-end recovery +2. rebind failure before merge +3. timeout waiting for target online +4. failure during one merge group with retryable checkpoint state +5. successful retry after partial failure +6. `source wins` for each unique-key table +7. raw time-window replacement for each raw history table +8. write-freeze behavior during merge +9. final cleanup deleting the source record + +### Agent Tests + +Must cover: + +1. receiving `RebindIdentity` +2. persisting the new token +3. reporting ack and failure +4. reconnecting with the new identity + +### Frontend Tests + +Must cover: + +1. candidate ranking and rendering +2. confirmation summary +3. progress state rendering +4. error state and retry action + +## Rollout + +Recommended rollout order: + +1. 
backend job tracker and protocol +2. agent rebind support +3. write-freeze guards +4. merge engine and tests +5. UI workflow +6. documentation + +## Open Tradeoffs + +- The merge window intentionally drops some live writes due to the recovery lock. This is acceptable because monitoring-gap repair is out of scope. +- The design chooses forward completion over full rollback after live identity rebind. This reduces failure complexity and matches the operational priority of restoring the server under the original identity. +- The design does not try to infer recovery automatically. Admin confirmation remains mandatory to avoid silent mis-merges. From 048de64fc29c5916cf4e82edad8b130cd6853fc9 Mon Sep 17 00:00:00 2001 From: ZingerLittleBee <6970999@gmail.com> Date: Thu, 16 Apr 2026 20:01:43 +0800 Subject: [PATCH 02/60] docs: refine agent recovery merge design --- .../2026-04-16-agent-recovery-merge-design.md | 126 +++++++++++++++--- 1 file changed, 111 insertions(+), 15 deletions(-) diff --git a/docs/superpowers/specs/2026-04-16-agent-recovery-merge-design.md b/docs/superpowers/specs/2026-04-16-agent-recovery-merge-design.md index e734a434..ba7dbf29 100644 --- a/docs/superpowers/specs/2026-04-16-agent-recovery-merge-design.md +++ b/docs/superpowers/specs/2026-04-16-agent-recovery-merge-design.md @@ -42,6 +42,7 @@ This is a targeted recovery flow only. It is not a general-purpose "merge any tw - No attempt to reverse the full workflow after the recovered agent has successfully rebound. - No new permanent "installation identity" entity in v1. - No merge behavior for data that is not keyed by `server_id` and is not semantically tied to one server record, such as `service_monitor_record`. +- Not designed for machine migration to materially different hardware. Candidate ranking heuristics assume the same logical host was reinstalled and re-registered. ## User Workflow @@ -62,10 +63,11 @@ The action opens a dialog showing candidate temporary server records. Candidates - currently online - not equal to the target server - not already participating in another recovery job -- still plausibly a temporary auto-registered record Candidate ranking is recommendation-only. The admin must still explicitly confirm the selected source. +There is no code-level `auto_registered` or `is_temporary` marker on `servers` in v1. "Temporary" is only a product description for the common case where a newly registered online source is the replacement agent after reinstall. The implementation therefore uses heuristics for ranking, not a hard temporary flag. + Recommended ranking signals: - same or similar `last_remote_addr` @@ -79,6 +81,8 @@ Recommended ranking signals: - matching `disk_total` - matching `cpu_cores` - matching `country_code` and `region` +- still has default-like metadata such as recent `created_at` and unchanged default `name` +- is not referenced, or is only lightly referenced, by shared `server_ids_json` configuration tables The dialog should show a short explanation for why a candidate was recommended. @@ -124,7 +128,25 @@ The key design choice is to split "future writes go to the right identity" from ### 1. Recovery Merge Job Tracker -Add a server-local tracker for recovery jobs, similar in spirit to the upgrade job tracker. +Add a recovery job tracker with database persistence plus an in-memory lock/cache layer. + +Unlike the existing upgrade tracker, recovery cannot be memory-only because failure and retry windows can span multiple DB transactions, WebSocket disconnects, and process restarts. 
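As a hedged illustration of what the persistent row buys, server startup can re-enter any job that was mid-flight when the process died. The entity and column names are the ones proposed below; the hook itself is illustrative:

```rust
use sea_orm::{ColumnTrait, DatabaseConnection, DbErr, EntityTrait, QueryFilter};

use crate::entity::recovery_job;

// Illustrative startup hook: resume unfinished jobs instead of losing them.
pub async fn resume_recovery_jobs(db: &DatabaseConnection) -> Result<(), DbErr> {
    let running = recovery_job::Entity::find()
        .filter(recovery_job::Column::Status.eq("running"))
        .all(db)
        .await?;
    for job in running {
        // Re-enter the merge state machine at the persisted stage/checkpoint.
        tracing::info!(job_id = %job.job_id, stage = %job.stage, "resuming recovery job");
    }
    Ok(())
}
```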
+ +New persistent table: + +- `recovery_job` + +Suggested columns: + +- `job_id` +- `target_server_id` +- `source_server_id` +- `status` +- `stage` +- `checkpoint_json` +- `error` +- `created_at` +- `updated_at` Tracked fields: @@ -154,6 +176,9 @@ The tracker provides: - protection against concurrent recovery jobs involving the same server - visible progress for the frontend - a retry boundary after partial completion +- restart-safe recovery state + +The in-memory layer is still useful for fast lock checks and live progress fan-out, but the database row is the source of truth. ### 2. Agent Rebind Protocol @@ -171,13 +196,21 @@ New agent-to-server messages: Agent behavior: 1. Receive `RebindIdentity`. -2. Persist the new token locally. -3. Acknowledge success or failure. +2. Persist the new token locally using atomic file replacement semantics. +3. Only after the token is durably written, acknowledge success or failure. 4. Disconnect. 5. Reconnect using the new token, which now authenticates as `target`. The target server row receives a newly generated token. The source row keeps its existing token until final cleanup so that failure before the rebind is easy to reason about. +The agent-side token write must be implemented as: + +- write to a temporary file +- flush and close +- atomic rename over the old config file + +The current non-atomic "rewrite file in place" helper is not sufficient for this workflow and must be replaced or wrapped. + ### 3. Write Freeze Guard The system needs an explicit in-memory recovery lock for `target` and `source` during merge. @@ -196,6 +229,12 @@ The guard should: - make the skip explicit in logs - be lifted immediately after the job completes or fails +Implementation guidance: + +- the WebSocket handler should funnel agent-originated database writes through a unified `writes_allowed_for(server_id)` check +- this check must cover at least `ping_record`, `task_result`, `network_probe_record`, `docker_event`, and agent-triggered audit side effects such as IP-change audit records +- `record_writer` and traffic upsert paths must honor the same guard + This intentionally allows a small monitoring gap during the merge window. That is acceptable because gap filling is out of scope and already accepted by the product requirements. ## Data Model Semantics @@ -263,10 +302,31 @@ These tables or fields are treated as target-owned configuration and are not mer - `servers` user-managed fields listed above - `server_tag` - `network_probe_config` -- any `server_ids_json` references already pointing at `target` Source-owned values in this category are discarded when `source` is deleted. +### Category A2: Shared `server_ids_json` References + +The source server is allowed to appear in shared configuration JSON arrays. This is not a hard exclusion during candidate selection. 
+ +When `source` is deleted, all references to `source.server_id` must be rewritten to `target.server_id` and deduplicated in the following tables: + +- `alert_rule.server_ids_json` +- `ping_task.server_ids_json` +- `task.server_ids_json` +- `service_monitor.server_ids_json` +- `maintenance.server_ids_json` +- `incident.server_ids_json` +- `status_page.server_ids_json` + +Rules: + +- replace every occurrence of `source_server_id` with `target_server_id` +- deduplicate the final array while preserving order where practical +- never leave a dangling `source_server_id` reference behind + +Because this is a replacement, not a removal, these updates do not create the empty-array semantics problems seen in orphan cleanup flows. + ### Category B: Raw Time-Series Tables For raw tables without a natural uniqueness key, merge by replacing the target's overlapping time window with source data. @@ -320,14 +380,27 @@ Apply this policy to: - `traffic_hourly` with key `(server_id, hour)` - `traffic_daily` with key `(server_id, date)` - `uptime_daily` with key `(server_id, date)` -- `alert_state` with key `(rule_id, server_id)` - `traffic_state` with key `server_id` Special notes: -- `alert_state`: when both sides exist for the same `(rule_id, server_id)` pair, keep the source row and delete the target row. - `traffic_state`: always take the source row because it is the live baseline for future traffic deltas. +### Category C2: Stateful Logical-Server Rows + +These rows represent state that semantically belongs to the logical target server, not the temporary replacement identity. + +Apply this policy to: + +- `alert_state` with key `(rule_id, server_id)` + +Rules: + +- if target has no row for the rule, move the source row to target +- if both target and source have a row for the same rule, keep the target row and discard the source row + +This avoids resetting ongoing alert continuity on the original logical server. + ### Category D: Not Merged Do not merge: @@ -350,7 +423,7 @@ Checks: - `target` is offline - `source` is online - neither record is already in another recovery job -- `source` still looks like a temporary auto-registered node +- candidate ranking metadata is captured for the confirmation UI, but there is no hard `is_temporary` gate in v1 If any check fails, the job fails without side effects. @@ -363,6 +436,8 @@ If any check fails, the job fails without side effects. If the agent reports failure, the job fails here and no history is merged. +The agent must not send `RebindIdentityAck` until the new token is durably written locally. + ### Stage 3: Awaiting Target Online Wait for the recovered agent to reconnect as `target`. @@ -375,7 +450,14 @@ Failure condition: - timeout -Timeout does not roll back to the old identity. The job simply fails before merge and keeps `source` untouched. +Timeout does not roll back the newly issued target token. + +Reason: + +- the agent may already have durably persisted the new token and may still reconnect late +- rolling back the target token would risk turning a late reconnect into a guaranteed `401` + +The job simply fails before merge and keeps `source` untouched. A retry issues a fresh target token and supersedes the prior unfinished attempt. 
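A minimal sketch of the Stage 3 wait under these rules, assuming tokio; the connection-registry check, the error variant, and the `mark_failed` helper are illustrative names, not existing APIs:

```rust
// Illustrative Stage 3 wait: fail the job on timeout but keep the new token.
async fn await_target_online(
    state: &AppState,
    job_id: &str,
    target_id: &str,
    timeout: std::time::Duration,
) -> Result<(), AppError> {
    let poll = async {
        // `connections.is_online` stands in for the real connection registry.
        while !state.connections.is_online(target_id) {
            tokio::time::sleep(std::time::Duration::from_secs(1)).await;
        }
    };
    if tokio::time::timeout(timeout, poll).await.is_err() {
        // Intentionally no token rollback: a late reconnect with the newly
        // issued token must still be able to authenticate as `target`.
        RecoveryJobService::mark_failed(&state.db, job_id, "awaiting_target_online", "timeout").await?;
        return Err(AppError::RecoveryTimeout);
    }
    Ok(())
}
```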
### Stage 4: Freezing Writes @@ -393,6 +475,7 @@ Recommended groups: - group 2: `records_hourly`, `uptime_daily`, `traffic_hourly`, `traffic_daily`, `traffic_state` - group 3: `ping_record`, `task_result`, `network_probe_record`, `network_probe_record_hourly` - group 4: `alert_state` +- group 5: shared `server_ids_json` reference rewrites Each group: @@ -535,6 +618,15 @@ Recommended detail payload: - `stage` - `error` +Recommended action names: + +- `recovery.started` +- `recovery.rebind_ok` +- `recovery.rebind_failed` +- `recovery.merge_group_done` +- `recovery.source_deleted` +- `recovery.failed` + ## Testing Strategy ### Backend Integration Tests @@ -547,18 +639,22 @@ Must cover: 4. failure during one merge group with retryable checkpoint state 5. successful retry after partial failure 6. `source wins` for each unique-key table -7. raw time-window replacement for each raw history table -8. write-freeze behavior during merge -9. final cleanup deleting the source record +7. `target wins` conflict handling for `alert_state` +8. raw time-window replacement for each raw history table +9. shared `server_ids_json` rewrite and dedupe across all seven tables +10. write-freeze behavior during merge +11. process restart with a persisted recovery job +12. final cleanup deleting the source record ### Agent Tests Must cover: 1. receiving `RebindIdentity` -2. persisting the new token -3. reporting ack and failure -4. reconnecting with the new identity +2. persisting the new token with atomic replace semantics +3. only acknowledging after durable local write +4. reporting ack and failure +5. reconnecting with the new identity ### Frontend Tests From a49380db00a428019fac68a5c84452c9b27c2593 Mon Sep 17 00:00:00 2001 From: ZingerLittleBee <6970999@gmail.com> Date: Thu, 16 Apr 2026 20:03:58 +0800 Subject: [PATCH 03/60] docs: clarify recovery merge spec details --- .../2026-04-16-agent-recovery-merge-design.md | 59 ++++++++++++------- 1 file changed, 38 insertions(+), 21 deletions(-) diff --git a/docs/superpowers/specs/2026-04-16-agent-recovery-merge-design.md b/docs/superpowers/specs/2026-04-16-agent-recovery-merge-design.md index ba7dbf29..d2e12ffd 100644 --- a/docs/superpowers/specs/2026-04-16-agent-recovery-merge-design.md +++ b/docs/superpowers/specs/2026-04-16-agent-recovery-merge-design.md @@ -136,7 +136,7 @@ New persistent table: - `recovery_job` -Suggested columns: +Persistent columns: - `job_id` - `target_server_id` @@ -145,20 +145,18 @@ Suggested columns: - `stage` - `checkpoint_json` - `error` +- `started_at` - `created_at` - `updated_at` +- `last_heartbeat_at` -Tracked fields: +Recommended indexes: -- `job_id` -- `target_server_id` -- `source_server_id` -- `status` -- `stage` -- `started_at` -- `updated_at` -- `error` -- per-stage checkpoint metadata +- unique primary key on `job_id` +- index on `(target_server_id, status)` +- index on `(source_server_id, status)` + +`checkpoint_json` stores per-stage metadata needed for restart-safe continuation. Suggested stages: @@ -324,6 +322,7 @@ Rules: - replace every occurrence of `source_server_id` with `target_server_id` - deduplicate the final array while preserving order where practical - never leave a dangling `source_server_id` reference behind +- apply the rewrite with a read-modify-write cycle scoped to rows that still contain `source_server_id` Because this is a replacement, not a removal, these updates do not create the empty-array semantics problems seen in orphan cleanup flows. 
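A compact sketch of the per-row rewrite, assuming each column stores a JSON array of server id strings and `serde_json` is available; the helper name is illustrative:

```rust
use std::collections::HashSet;

// Replace the source id, then deduplicate while preserving first-occurrence order.
fn rewrite_server_ids(json: &str, source_id: &str, target_id: &str) -> serde_json::Result<String> {
    let ids: Vec<String> = serde_json::from_str(json)?;
    let mut seen = HashSet::new();
    let rewritten: Vec<String> = ids
        .into_iter()
        .map(|id| if id == source_id { target_id.to_string() } else { id })
        .filter(|id| seen.insert(id.clone()))
        .collect();
    serde_json::to_string(&rewritten)
}
```

For example, `["target","source","source"]` rewrites to `["target"]`, which is the dedupe behavior the merge tests later assert.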
@@ -459,6 +458,12 @@ Reason: The job simply fails before merge and keeps `source` untouched. A retry issues a fresh target token and supersedes the prior unfinished attempt. +Retry semantics after Stage 3 timeout or any other pre-rebind completion failure: + +- mark the existing recovery job as `failed` +- create a new recovery job with a new `job_id` +- issue a fresh target token for the new attempt + ### Stage 4: Freezing Writes Enable recovery locks for both `target` and `source`. @@ -485,10 +490,15 @@ Each group: ### Stage 6: Finalizing 1. Update `servers(target)` runtime fields from `source`. -2. Delete remaining source-owned rows that were not already moved. -3. Delete the `source` server row. -4. Clear job locks. -5. Write audit log entries. +2. Delete remaining source-owned rows that are intentionally not merged: + - `server_tag` + - `network_probe_config` +3. Assert that no other source-owned rows remain in tables that should already have been moved or rewritten. +4. Delete the `source` server row. +5. Clear job locks. +6. Write audit log entries. + +`source` deletion should not be used as the primary cleanup mechanism for historical rows. It is only the final removal of the now-obsolete server row after merge/rewrite work has already completed. Foreign-key cascade is acceptable as a safety net for tables that define it, but the merge engine must not depend on it for correctness. ### Stage 7: Terminal State @@ -519,6 +529,12 @@ If the job fails after `target` is already online: The system does not attempt a full rollback after the live identity has already switched. That would be more fragile than completing the merge forward. +Retry semantics after rebind has succeeded: + +- keep the same `job_id` +- resume from the first incomplete stage using persisted checkpoints +- do not issue another target token unless the retry is explicitly restarted from the beginning as a separate administrative action + ### Failure During Final Cleanup If all history has been merged but deleting `source` fails: @@ -627,6 +643,13 @@ Recommended action names: - `recovery.source_deleted` - `recovery.failed` +## Open Tradeoffs + +- The merge window intentionally drops some live writes due to the recovery lock. This is acceptable because monitoring-gap repair is out of scope. +- The design chooses forward completion over full rollback after live identity rebind. This reduces failure complexity and matches the operational priority of restoring the server under the original identity. +- The design does not try to infer recovery automatically. Admin confirmation remains mandatory to avoid silent mis-merges. +- Shared `server_ids_json` rewrites are read-modify-write operations. If an admin edits the same row concurrently in the UI, last-writer-wins behavior may still occur. This is acceptable in v1 because the race is rare and bounded to recovery-time configuration edits. + ## Testing Strategy ### Backend Integration Tests @@ -675,9 +698,3 @@ Recommended rollout order: 4. merge engine and tests 5. UI workflow 6. documentation - -## Open Tradeoffs - -- The merge window intentionally drops some live writes due to the recovery lock. This is acceptable because monitoring-gap repair is out of scope. -- The design chooses forward completion over full rollback after live identity rebind. This reduces failure complexity and matches the operational priority of restoring the server under the original identity. -- The design does not try to infer recovery automatically. 
Admin confirmation remains mandatory to avoid silent mis-merges. From 8885cab98b7a556ab79edf837759c4939d4aa9e1 Mon Sep 17 00:00:00 2001 From: ZingerLittleBee <6970999@gmail.com> Date: Thu, 16 Apr 2026 20:05:11 +0800 Subject: [PATCH 04/60] docs: add agent recovery merge plan --- .../plans/2026-04-16-agent-recovery-merge.md | 914 ++++++++++++++++++ 1 file changed, 914 insertions(+) create mode 100644 docs/superpowers/plans/2026-04-16-agent-recovery-merge.md diff --git a/docs/superpowers/plans/2026-04-16-agent-recovery-merge.md b/docs/superpowers/plans/2026-04-16-agent-recovery-merge.md new file mode 100644 index 00000000..f3642ff5 --- /dev/null +++ b/docs/superpowers/plans/2026-04-16-agent-recovery-merge.md @@ -0,0 +1,914 @@ +# Agent Recovery Merge Implementation Plan + +> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking. + +**Goal:** Add an admin-driven recovery workflow that rebinds a newly registered replacement agent onto an existing offline server record, merges the replacement record's history into the original record, rewrites shared references, and deletes the temporary record. + +**Architecture:** The implementation is split into four vertical slices: protocol and atomic agent rebind support, persistent server-side recovery jobs with write freezing, table-aware history merge logic, and a server-detail UI for candidate selection and progress. The recovery flow keeps the original `server_id`, persists job state in SQLite for restart-safe retries, and uses bounded merge transactions plus checkpointed progress rather than one giant transaction. + +**Tech Stack:** Rust (`axum`, `sea-orm`, `tokio`, SQLite), React 19, TanStack Query, TanStack Router, Zustand, Vitest, OpenAPI-generated web types + +--- + +## File Map + +### Backend Rust + +- Create: `crates/server/src/entity/recovery_job.rs` + Stores the persistent recovery job row. +- Modify: `crates/server/src/entity/mod.rs` + Register the new entity module. +- Create: `crates/server/src/migration/m20260416_000017_create_recovery_job.rs` + Creates `recovery_job` table and indexes. +- Modify: `crates/server/src/migration/mod.rs` + Registers the new migration. +- Create: `crates/server/src/service/recovery_job.rs` + DB-backed repository/service for creating, updating, resuming, and checkpointing jobs. +- Create: `crates/server/src/service/recovery_lock.rs` + In-memory write-freeze guard keyed by `server_id`. +- Create: `crates/server/src/service/recovery_merge.rs` + Orchestrates rebind, merge groups, JSON rewrites, cleanup, and retry semantics. +- Modify: `crates/server/src/service/mod.rs` + Exposes the new services. +- Modify: `crates/server/src/state.rs` + Wires persistent recovery services and write-freeze guard into `AppState`. +- Create: `crates/server/src/router/api/server_recovery.rs` + Read/write endpoints for candidates, start job, and get job state. +- Modify: `crates/server/src/router/api/mod.rs` + Mounts the new router. +- Modify: `crates/server/src/openapi.rs` + Registers new endpoints and DTOs. +- Modify: `crates/server/src/router/ws/agent.rs` + Handles `RebindIdentityAck`/`Failed`, recovery-aware write gating, and rebind orchestration callbacks. +- Modify: `crates/server/src/router/ws/browser.rs` + Includes recovery jobs in browser `FullSync` and live updates. +- Modify: `crates/server/src/task/record_writer.rs` + Honors recovery write freezes. 
+- Modify: `crates/server/src/service/traffic.rs` + Adds helper(s) needed by merge/finalization and respects recovery lock where state is updated. +- Modify: `crates/server/tests/integration.rs` + Adds API + end-to-end recovery integration coverage. +- Modify: `crates/server/src/service/recovery_merge.rs` + Include focused DB tests using `setup_test_db`. + +### Shared Protocol + +- Modify: `crates/common/src/protocol.rs` + Adds recovery DTOs and WebSocket messages used by agent/browser/server. + +### Agent + +- Create: `crates/agent/src/rebind.rs` + Atomic token persistence helper and rebind message handling. +- Modify: `crates/agent/src/main.rs` + Registers the new module. +- Modify: `crates/agent/src/reporter.rs` + Handles `ServerMessage::RebindIdentity` and emits ack/failure messages. + +### Web + +- Modify: `apps/web/src/lib/api-schema.ts` + Re-export recovery candidate/job schemas after OpenAPI regeneration. +- Modify: `apps/web/src/hooks/use-api.ts` + Adds candidate, start-job, and job polling helpers. +- Modify: `apps/web/src/hooks/use-api.test.tsx` + Covers the new API helpers. +- Create: `apps/web/src/stores/recovery-jobs-store.ts` + Holds live recovery job state keyed by `target_server_id` and `job_id`. +- Create: `apps/web/src/stores/recovery-jobs-store.test.ts` + Covers store set/update/clear behavior. +- Modify: `apps/web/src/hooks/use-servers-ws.ts` + Hydrates recovery jobs from `full_sync` and incremental events. +- Modify: `apps/web/src/hooks/use-servers-ws.test.ts` + Covers WS hydration and updates for recovery jobs. +- Create: `apps/web/src/components/server/recovery-merge-dialog.tsx` + Candidate picker + confirmation flow on the server detail page. +- Create: `apps/web/src/components/server/recovery-merge-dialog.test.tsx` + Covers ranking display, confirmation copy, pending/error UI. +- Modify: `apps/web/src/routes/_authed/servers/$id.tsx` + Adds action button, dialog integration, and job status rendering. +- Modify: `apps/web/src/routes/_authed/servers/$id.test.tsx` + Covers button visibility and job state rendering. +- Modify: `apps/web/src/locales/en/servers.json` + New copy for recovery UI. +- Modify: `apps/web/src/locales/zh/servers.json` + New copy for recovery UI. + +### Docs + +- Modify: `apps/docs/content/docs/cn/server.mdx` + Document the admin recovery flow and its limits. +- Modify: `apps/docs/content/docs/en/server.mdx` + Same in English. +- Modify: `apps/docs/content/docs/cn/api-reference.mdx` + Add recovery endpoints. +- Modify: `apps/docs/content/docs/en/api-reference.mdx` + Add recovery endpoints. 
+ +--- + +### Task 1: Add Recovery Protocol and Atomic Agent Token Rebind Support + +**Files:** +- Create: `crates/agent/src/rebind.rs` +- Modify: `crates/agent/src/main.rs` +- Modify: `crates/agent/src/reporter.rs` +- Modify: `crates/common/src/protocol.rs` + +- [ ] **Step 1: Write failing agent tests for atomic token replacement** + +```rust +// crates/agent/src/rebind.rs +#[cfg(test)] +mod tests { + use super::persist_rebind_token; + + #[test] + fn persist_rebind_token_replaces_existing_token_line_atomically() { + let dir = tempfile::tempdir().unwrap(); + let path = dir.path().join("agent.toml"); + std::fs::write(&path, "server_url = \"http://127.0.0.1:9527\"\ntoken = \"old\"\n").unwrap(); + + persist_rebind_token(&path, "new-token").unwrap(); + + let content = std::fs::read_to_string(&path).unwrap(); + assert!(content.contains("token = \"new-token\"")); + assert!(!content.contains("token = \"old\"")); + } + + #[test] + fn persist_rebind_token_preserves_non_token_lines() { + let dir = tempfile::tempdir().unwrap(); + let path = dir.path().join("agent.toml"); + std::fs::write(&path, "server_url = \"https://monitor.example.com\"\n[collector]\ninterval = 3\n").unwrap(); + + persist_rebind_token(&path, "fresh-token").unwrap(); + + let content = std::fs::read_to_string(&path).unwrap(); + assert!(content.contains("server_url = \"https://monitor.example.com\"")); + assert!(content.contains("[collector]")); + assert!(content.contains("interval = 3")); + assert!(content.contains("token = \"fresh-token\"")); + } +} +``` + +- [ ] **Step 2: Run the agent tests and verify they fail** + +Run: `cargo test -p serverbee-agent persist_rebind_token -- --exact` + +Expected: FAIL with unresolved import or missing `persist_rebind_token`. + +- [ ] **Step 3: Implement the atomic token writer and wire the module** + +```rust +// crates/agent/src/rebind.rs +pub fn persist_rebind_token(path: &std::path::Path, token: &str) -> anyhow::Result<()> { + let content = if path.exists() { + std::fs::read_to_string(path)? + } else { + String::new() + }; + + let mut lines: Vec = content.lines().map(str::to_owned).collect(); + let token_line = format!("token = \"{token}\""); + if let Some(pos) = lines.iter().position(|line| line.starts_with("token")) { + lines[pos] = token_line; + } else { + lines.push(token_line); + } + + let tmp = path.with_extension("tmp"); + std::fs::write(&tmp, lines.join("\n"))?; + std::fs::rename(&tmp, path)?; + Ok(()) +} + +// crates/agent/src/main.rs +mod rebind; +``` + +- [ ] **Step 4: Extend the shared protocol and reporter rebind handling** + +```rust +// crates/common/src/protocol.rs +ServerMessage::RebindIdentity { + job_id: String, + target_server_id: String, + token: String, +} + +AgentMessage::RebindIdentityAck { + job_id: String, +} + +AgentMessage::RebindIdentityFailed { + job_id: String, + error: String, +} + +// crates/agent/src/reporter.rs +ServerMessage::RebindIdentity { job_id, token, .. 
} => { + match crate::rebind::persist_rebind_token(std::path::Path::new(crate::config::AgentConfig::config_path()), &token) { + Ok(()) => { + self.config.token = token; + let ack = AgentMessage::RebindIdentityAck { job_id }; + let json = serde_json::to_string(&ack)?; + write.send(Message::Text(json.into())).await?; + write.send(Message::Close(None)).await?; + return Ok(()); + } + Err(err) => { + let failed = AgentMessage::RebindIdentityFailed { job_id, error: err.to_string() }; + let json = serde_json::to_string(&failed)?; + write.send(Message::Text(json.into())).await?; + return Ok(()); + } + } +} +``` + +- [ ] **Step 5: Run the focused agent tests and commit** + +Run: `cargo test -p serverbee-agent persist_rebind_token -- --exact` + +Expected: PASS + +Commit: + +```bash +git add crates/common/src/protocol.rs crates/agent/src/main.rs crates/agent/src/rebind.rs crates/agent/src/reporter.rs +git commit -m "feat(agent): add atomic recovery token rebind support" +``` + +### Task 2: Add Persistent Recovery Job Schema and Repository + +**Files:** +- Create: `crates/server/src/entity/recovery_job.rs` +- Modify: `crates/server/src/entity/mod.rs` +- Create: `crates/server/src/migration/m20260416_000017_create_recovery_job.rs` +- Modify: `crates/server/src/migration/mod.rs` +- Create: `crates/server/src/service/recovery_job.rs` +- Modify: `crates/server/src/service/mod.rs` + +- [ ] **Step 1: Write failing DB-backed service tests** + +```rust +// crates/server/src/service/recovery_job.rs +#[cfg(test)] +mod tests { + use super::RecoveryJobService; + use crate::test_utils::setup_test_db; + + #[tokio::test] + async fn create_job_persists_running_row() { + let (db, _tmp) = setup_test_db().await; + + let job = RecoveryJobService::create_job(&db, "target-1", "source-1").await.unwrap(); + + assert_eq!(job.target_server_id, "target-1"); + assert_eq!(job.source_server_id, "source-1"); + assert_eq!(job.status, "running"); + assert_eq!(job.stage, "validating"); + } + + #[tokio::test] + async fn update_checkpoint_round_trips() { + let (db, _tmp) = setup_test_db().await; + let job = RecoveryJobService::create_job(&db, "target-1", "source-1").await.unwrap(); + + RecoveryJobService::update_stage(&db, &job.job_id, "merging_history", Some("{\"group\":2}"), None) + .await + .unwrap(); + + let loaded = RecoveryJobService::get_job(&db, &job.job_id).await.unwrap().unwrap(); + assert_eq!(loaded.stage, "merging_history"); + assert_eq!(loaded.checkpoint_json.as_deref(), Some("{\"group\":2}")); + } +} +``` + +- [ ] **Step 2: Run the focused server tests and verify they fail** + +Run: `cargo test -p serverbee-server recovery_job_service -- --nocapture` + +Expected: FAIL with missing entity/service definitions. 
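As a hedged reference for the `/* insert row */` stub in Step 3 below, one possible `create_job` body, assuming the `uuid` and `chrono` crates are already available in the workspace and simplifying the error type to `DbErr`:

```rust
use sea_orm::{ActiveModelTrait, DatabaseConnection, DbErr, Set};

use crate::entity::recovery_job;

// Illustrative insert body; map DbErr into AppError in the real service.
pub async fn create_job(
    db: &DatabaseConnection,
    target: &str,
    source: &str,
) -> Result<recovery_job::Model, DbErr> {
    let now = chrono::Utc::now();
    let row = recovery_job::ActiveModel {
        job_id: Set(uuid::Uuid::new_v4().to_string()),
        target_server_id: Set(target.to_string()),
        source_server_id: Set(source.to_string()),
        status: Set("running".to_string()),
        stage: Set("validating".to_string()),
        checkpoint_json: Set(None),
        error: Set(None),
        started_at: Set(now),
        created_at: Set(now),
        updated_at: Set(now),
        last_heartbeat_at: Set(None),
    };
    row.insert(db).await
}
```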
- [ ] **Step 3: Implement the entity, migration, and repository**

```rust
// crates/server/src/entity/recovery_job.rs
use sea_orm::entity::prelude::*;

#[derive(Clone, Debug, PartialEq, DeriveEntityModel)]
#[sea_orm(table_name = "recovery_job")]
pub struct Model {
    #[sea_orm(primary_key, auto_increment = false)]
    pub job_id: String,
    pub target_server_id: String,
    pub source_server_id: String,
    pub status: String,
    pub stage: String,
    pub checkpoint_json: Option<String>,
    pub error: Option<String>,
    pub started_at: DateTimeUtc,
    pub created_at: DateTimeUtc,
    pub updated_at: DateTimeUtc,
    pub last_heartbeat_at: Option<DateTimeUtc>,
}

// Standard sea-orm boilerplate so DeriveEntityModel compiles.
#[derive(Copy, Clone, Debug, EnumIter, DeriveRelation)]
pub enum Relation {}

impl ActiveModelBehavior for ActiveModel {}

// crates/server/src/migration/m20260416_000017_create_recovery_job.rs
db.execute_unprepared(
    "CREATE TABLE recovery_job (
        job_id TEXT PRIMARY KEY NOT NULL,
        target_server_id TEXT NOT NULL,
        source_server_id TEXT NOT NULL,
        status TEXT NOT NULL,
        stage TEXT NOT NULL,
        checkpoint_json TEXT NULL,
        error TEXT NULL,
        started_at TEXT NOT NULL,
        created_at TEXT NOT NULL,
        updated_at TEXT NOT NULL,
        last_heartbeat_at TEXT NULL
    )"
).await?;
db.execute_unprepared("CREATE INDEX idx_recovery_job_target_status ON recovery_job(target_server_id, status)").await?;
db.execute_unprepared("CREATE INDEX idx_recovery_job_source_status ON recovery_job(source_server_id, status)").await?;

// crates/server/src/service/recovery_job.rs
pub struct RecoveryJobService;
```

- [ ] **Step 4: Add repository methods used by the orchestration layer**

```rust
impl RecoveryJobService {
    pub async fn create_job(db: &DatabaseConnection, target: &str, source: &str) -> Result<recovery_job::Model, AppError> { /* insert row */ }
    pub async fn get_job(db: &DatabaseConnection, job_id: &str) -> Result<Option<recovery_job::Model>, AppError> { /* find by id */ }
    pub async fn update_stage(
        db: &DatabaseConnection,
        job_id: &str,
        stage: &str,
        checkpoint_json: Option<&str>,
        error: Option<&str>
    ) -> Result<recovery_job::Model, AppError> { /* update row */ }
    pub async fn mark_failed(db: &DatabaseConnection, job_id: &str, stage: &str, error: &str) -> Result<(), AppError> { /* update status */ }
    pub async fn running_for_target(db: &DatabaseConnection, target: &str) -> Result<Vec<recovery_job::Model>, AppError> { /* query by index */ }
    pub async fn running_for_source(db: &DatabaseConnection, source: &str) -> Result<Vec<recovery_job::Model>, AppError> { /* query by index */ }
}
```

- [ ] **Step 5: Run the tests and commit**

Run: `cargo test -p serverbee-server recovery_job_service -- --nocapture`

Expected: PASS

Commit:

```bash
git add crates/server/src/entity/mod.rs crates/server/src/entity/recovery_job.rs crates/server/src/migration/mod.rs crates/server/src/migration/m20260416_000017_create_recovery_job.rs crates/server/src/service/mod.rs crates/server/src/service/recovery_job.rs
git commit -m "feat(server): persist recovery jobs in sqlite"
```

### Task 3: Add Recovery Candidate Scoring and Admin API Endpoints

**Files:**
- Create: `crates/server/src/router/api/server_recovery.rs`
- Modify: `crates/server/src/router/api/mod.rs`
- Modify: `crates/server/src/openapi.rs`
- Modify: `crates/server/src/service/recovery_job.rs`
- Modify: `crates/server/tests/integration.rs`

- [ ] **Step 1: Write failing tests for candidate ranking and API validation**

```rust
// crates/server/src/router/api/server_recovery.rs
#[cfg(test)]
mod tests {
    use super::{score_candidate, CandidateScoreInput};

    #[test]
    fn higher_score_when_ip_arch_and_created_at_match() {
        let strong = score_candidate(CandidateScoreInput {
            same_remote_addr: true,
            same_cpu_arch: true,
            same_os: true,
            same_virtualization: true,
            created_within_minutes: 10,
            same_country: true,
        });
        let weak = score_candidate(CandidateScoreInput {
            same_remote_addr: false,
            same_cpu_arch: false,
            same_os: true,
            same_virtualization: false,
            created_within_minutes: 240,
            same_country: false,
        });
        assert!(strong > weak);
    }
}
```

- [ ] **Step 2: Run the targeted tests and verify failure**

Run: `cargo test -p serverbee-server higher_score_when_ip_arch_and_created_at_match -- --exact`

Expected: FAIL because `server_recovery.rs` and `score_candidate` do not exist.

- [ ] **Step 3: Implement DTOs, scoring, and read/write routes**

```rust
// crates/server/src/router/api/server_recovery.rs
#[derive(Debug, Serialize, utoipa::ToSchema)]
pub struct RecoveryCandidateResponse {
    pub server_id: String,
    pub name: String,
    pub score: i32,
    pub reasons: Vec<String>,
}

#[derive(Debug, Deserialize, utoipa::ToSchema)]
pub struct StartRecoveryRequest {
    pub source_server_id: String,
}

pub fn read_router() -> Router<Arc<AppState>> {
    Router::new()
        .route("/servers/{target_id}/recovery-candidates", get(list_candidates))
        .route("/servers/recovery-jobs/{job_id}", get(get_recovery_job))
}

pub fn write_router() -> Router<Arc<AppState>> {
    Router::new().route("/servers/{target_id}/recover-merge", post(start_recovery_merge))
}
```

- [ ] **Step 4: Add integration coverage for admin auth and validation rules**

```rust
// crates/server/tests/integration.rs
#[tokio::test]
async fn test_recovery_candidates_requires_auth_and_filters_target() {
    let (base_url, _tmp) = start_test_server().await;
    let client = http_client();
    login_admin(&client, &base_url).await;

    let resp = client
        .get(format!("{}/api/servers/target-1/recovery-candidates", base_url))
        .send()
        .await
        .unwrap();

    assert_eq!(resp.status(), 200);
    let body: serde_json::Value = resp.json().await.unwrap();
    assert!(body["data"].is_array());
}
```

- [ ] **Step 5: Run focused tests and commit**

Run: `cargo test -p serverbee-server recovery_candidates -- --nocapture`

Expected: PASS

Commit:

```bash
git add crates/server/src/router/api/mod.rs crates/server/src/router/api/server_recovery.rs crates/server/src/openapi.rs crates/server/tests/integration.rs
git commit -m "feat(server): add recovery candidate and job api"
```

### Task 4: Add Recovery Locks and Route All Agent-Originated Writes Through Them

**Files:**
- Create: `crates/server/src/service/recovery_lock.rs`
- Modify: `crates/server/src/state.rs`
- Modify: `crates/server/src/router/ws/agent.rs`
- Modify: `crates/server/src/task/record_writer.rs`
- Modify: `crates/server/src/service/mod.rs`

- [ ] **Step 1: Write failing unit tests for the lock guard**

```rust
// crates/server/src/service/recovery_lock.rs
#[cfg(test)]
mod tests {
    use super::RecoveryLockService;

    #[test]
    fn locked_server_denies_writes_until_released() {
        let locks = RecoveryLockService::new();
        assert!(locks.writes_allowed_for("srv-1"));
        locks.freeze("srv-1");
        assert!(!locks.writes_allowed_for("srv-1"));
        locks.release("srv-1");
        assert!(locks.writes_allowed_for("srv-1"));
    }
}
```

- [ ] **Step 2: Run the guard test and verify failure**

Run: `cargo test -p serverbee-server locked_server_denies_writes_until_released -- --exact`

Expected: FAIL because `RecoveryLockService` does not exist.
- [ ] **Step 3: Implement the lock service and wire it into `AppState`**

```rust
// crates/server/src/service/recovery_lock.rs
#[derive(Default)]
pub struct RecoveryLockService {
    frozen: dashmap::DashSet<String>,
}

impl RecoveryLockService {
    pub fn new() -> Self { Self { frozen: dashmap::DashSet::new() } }
    pub fn freeze(&self, server_id: &str) { self.frozen.insert(server_id.to_string()); }
    pub fn release(&self, server_id: &str) { self.frozen.remove(server_id); }
    pub fn writes_allowed_for(&self, server_id: &str) -> bool { !self.frozen.contains(server_id) }
}

// crates/server/src/state.rs
pub recovery_lock: RecoveryLockService,
```

- [ ] **Step 4: Gate all write paths that can race with recovery**

```rust
// crates/server/src/router/ws/agent.rs
if !state.recovery_lock.writes_allowed_for(server_id) {
    tracing::info!("Skipping recovery-frozen ping/task/probe write for {server_id}");
    return;
}

// crates/server/src/task/record_writer.rs
if !state.recovery_lock.writes_allowed_for(server_id) {
    continue;
}
```

- [ ] **Step 5: Run focused tests and commit**

Run: `cargo test -p serverbee-server locked_server_denies_writes_until_released -- --exact`

Expected: PASS

Commit:

```bash
git add crates/server/src/service/mod.rs crates/server/src/service/recovery_lock.rs crates/server/src/state.rs crates/server/src/router/ws/agent.rs crates/server/src/task/record_writer.rs
git commit -m "feat(server): add recovery write freeze guards"
```

### Task 5: Implement the Rebind Orchestrator and Recovery Job Lifecycle

**Files:**
- Create: `crates/server/src/service/recovery_merge.rs`
- Modify: `crates/server/src/service/mod.rs`
- Modify: `crates/server/src/router/api/server_recovery.rs`
- Modify: `crates/server/src/router/ws/agent.rs`
- Modify: `crates/server/src/router/ws/browser.rs`
- Modify: `crates/common/src/protocol.rs`
- Modify: `apps/web/src/hooks/use-servers-ws.ts` (for later WS payload shape)

- [ ] **Step 1: Write failing service tests for pre-rebind vs post-rebind retry semantics**

```rust
// crates/server/src/service/recovery_merge.rs
#[cfg(test)]
mod tests {
    use super::{RecoveryFailureMode, retry_strategy_for};

    #[test]
    fn pre_rebind_failures_require_new_job() {
        assert_eq!(retry_strategy_for(RecoveryFailureMode::AwaitingTargetOnlineTimeout), "new_job");
    }

    #[test]
    fn post_rebind_failures_resume_same_job() {
        assert_eq!(retry_strategy_for(RecoveryFailureMode::MergeGroupFailed), "resume_same_job");
    }
}
```

- [ ] **Step 2: Run the lifecycle tests and verify failure**

Run: `cargo test -p serverbee-server pre_rebind_failures_require_new_job -- --exact`

Expected: FAIL because `recovery_merge.rs` does not exist.
- [ ] **Step 3: Implement orchestration entry points and persisted stage transitions**

```rust
// crates/server/src/service/recovery_merge.rs
pub struct RecoveryMergeService;

impl RecoveryMergeService {
    pub async fn start(
        state: &Arc<AppState>,
        target_server_id: &str,
        source_server_id: &str,
    ) -> Result<recovery_job::Model, AppError> {
        let job = RecoveryJobService::create_job(&state.db, target_server_id, source_server_id).await?;
        RecoveryJobService::update_stage(&state.db, &job.job_id, "rebinding", None, None).await?;
        Ok(job)
    }

    pub async fn handle_rebind_ack(state: &Arc<AppState>, job_id: &str) -> Result<(), AppError> {
        RecoveryJobService::update_stage(&state.db, job_id, "awaiting_target_online", None, None).await?;
        Ok(())
    }
}

pub fn retry_strategy_for(mode: RecoveryFailureMode) -> &'static str {
    match mode {
        RecoveryFailureMode::AwaitingTargetOnlineTimeout => "new_job",
        RecoveryFailureMode::MergeGroupFailed => "resume_same_job",
    }
}
```

- [ ] **Step 4: Wire WS acknowledgements and browser progress fan-out**

```rust
// crates/server/src/router/ws/agent.rs
AgentMessage::RebindIdentityAck { job_id } => {
    if let Err(err) = RecoveryMergeService::handle_rebind_ack(state, &job_id).await {
        tracing::error!("Failed to advance recovery job {job_id}: {err}");
    }
}

// crates/server/src/router/ws/browser.rs
BrowserMessage::FullSync {
    servers,
    upgrades: state.upgrade_tracker.snapshot(),
    recoveries: state.recovery_merge.snapshot(),
}
```

- [ ] **Step 5: Run lifecycle tests and commit**

Run: `cargo test -p serverbee-server pre_rebind_failures_require_new_job post_rebind_failures_resume_same_job -- --nocapture`

Expected: PASS

Commit:

```bash
git add crates/common/src/protocol.rs crates/server/src/service/mod.rs crates/server/src/service/recovery_merge.rs crates/server/src/router/api/server_recovery.rs crates/server/src/router/ws/agent.rs crates/server/src/router/ws/browser.rs
git commit -m "feat(server): orchestrate recovery rebind lifecycle"
```

### Task 6: Implement History Merge Groups, JSON Rewrite, and Final Cleanup

**Files:**
- Modify: `crates/server/src/service/recovery_merge.rs`
- Modify: `crates/server/src/service/traffic.rs`
- Modify: `crates/server/tests/integration.rs`

- [ ] **Step 1: Write failing merge-engine tests for raw, unique-key, JSON, and alert-state semantics**

```rust
// crates/server/src/service/recovery_merge.rs
#[tokio::test]
async fn merge_raw_records_replaces_target_overlap_with_source() { /* seed overlapping rows; expect target window delete + source move */ }

#[tokio::test]
async fn merge_alert_state_keeps_target_when_rule_conflicts() { /* same rule on both sides; expect target row kept */ }

#[tokio::test]
async fn rewrite_server_ids_json_replaces_source_with_target_once() { /* ["target","source","source"] -> ["target"] */ }
```

- [ ] **Step 2: Run the merge-engine tests and verify failure**

Run: `cargo test -p serverbee-server merge_raw_records_replaces_target_overlap_with_source -- --exact`

Expected: FAIL because merge helpers do not exist.
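For the `/* target wins */` stub in Step 3 below, a hedged sketch of the `alert_state` rule from the spec, using raw SQLite statements through sea-orm and a simplified `DbErr` error type:

```rust
use sea_orm::{ConnectionTrait, DatabaseConnection, DbBackend, DbErr, Statement};

// Move source rows only for rules the target does not already track,
// then discard the remaining (conflicting) source rows: target wins.
async fn merge_alert_states(db: &DatabaseConnection, target: &str, source: &str) -> Result<(), DbErr> {
    db.execute(Statement::from_sql_and_values(
        DbBackend::Sqlite,
        "UPDATE alert_state SET server_id = ? \
         WHERE server_id = ? \
           AND rule_id NOT IN (SELECT rule_id FROM alert_state WHERE server_id = ?)",
        [target.into(), source.into(), target.into()],
    ))
    .await?;
    db.execute(Statement::from_sql_and_values(
        DbBackend::Sqlite,
        "DELETE FROM alert_state WHERE server_id = ?",
        [source.into()],
    ))
    .await?;
    Ok(())
}
```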
+ +- [ ] **Step 3: Implement merge group helpers** + +```rust +impl RecoveryMergeService { + async fn merge_raw_table( + db: &DatabaseConnection, + table: &str, + time_column: &str, + target: &str, + source: &str, + ) -> Result<(), AppError> { /* delete target overlap; update source rows to target */ } + + async fn merge_alert_states(db: &DatabaseConnection, target: &str, source: &str) -> Result<(), AppError> { /* target wins */ } + + async fn rewrite_server_ids_json_tables(db: &DatabaseConnection, target: &str, source: &str) -> Result<(), AppError> { /* alert_rule/ping_task/task/service_monitor/maintenance/incident/status_page */ } +} +``` + +- [ ] **Step 4: Implement finalization rules and explicit source cleanup** + +```rust +impl RecoveryMergeService { + async fn finalize_target_server_row(db: &DatabaseConnection, target: &str, source: &server::Model) -> Result<(), AppError> { /* copy runtime fields */ } + + async fn delete_intentionally_unmerged_source_rows(db: &DatabaseConnection, source: &str) -> Result<(), AppError> { + server_tag::Entity::delete_many().filter(server_tag::Column::ServerId.eq(source)).exec(db).await?; + network_probe_config::Entity::delete_many().filter(network_probe_config::Column::ServerId.eq(source)).exec(db).await?; + Ok(()) + } +} +``` + +- [ ] **Step 5: Run merge-focused tests and commit** + +Run: `cargo test -p serverbee-server recovery_merge -- --nocapture` + +Expected: PASS + +Commit: + +```bash +git add crates/server/src/service/recovery_merge.rs crates/server/src/service/traffic.rs crates/server/tests/integration.rs +git commit -m "feat(server): merge recovered server history into target identity" +``` + +### Task 7: Add Browser Recovery Job State, Dialog UI, and Server Detail Controls + +**Files:** +- Modify: `apps/web/src/lib/api-schema.ts` +- Modify: `apps/web/src/hooks/use-api.ts` +- Modify: `apps/web/src/hooks/use-api.test.tsx` +- Create: `apps/web/src/stores/recovery-jobs-store.ts` +- Create: `apps/web/src/stores/recovery-jobs-store.test.ts` +- Modify: `apps/web/src/hooks/use-servers-ws.ts` +- Modify: `apps/web/src/hooks/use-servers-ws.test.ts` +- Create: `apps/web/src/components/server/recovery-merge-dialog.tsx` +- Create: `apps/web/src/components/server/recovery-merge-dialog.test.tsx` +- Modify: `apps/web/src/routes/_authed/servers/$id.tsx` +- Modify: `apps/web/src/routes/_authed/servers/$id.test.tsx` +- Modify: `apps/web/src/locales/en/servers.json` +- Modify: `apps/web/src/locales/zh/servers.json` + +- [ ] **Step 1: Write failing store and hook tests** + +```ts +// apps/web/src/stores/recovery-jobs-store.test.ts +it('stores recovery jobs keyed by target server id', () => { + useRecoveryJobsStore.getState().setJob('target-1', { + job_id: 'job-1', + target_server_id: 'target-1', + source_server_id: 'source-1', + status: 'running', + stage: 'rebinding' + }) + expect(useRecoveryJobsStore.getState().getJob('target-1')?.job_id).toBe('job-1') +}) + +// apps/web/src/hooks/use-api.test.tsx +it('fetches recovery candidates for a target server', async () => { + fetchMock.mockResponseOnce(JSON.stringify({ data: [{ server_id: 'source-1', score: 42, reasons: ['same IP'] }] })) + const result = await api.get('/api/servers/target-1/recovery-candidates') + expect(result[0].server_id).toBe('source-1') +}) +``` + +- [ ] **Step 2: Run the focused web tests and verify failure** + +Run: `bun --cwd apps/web run test -- src/stores/recovery-jobs-store.test.ts src/hooks/use-api.test.tsx` + +Expected: FAIL because the store and API helpers do not exist. 
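
The store test above fixes the keyed-by-target access pattern but not the state type itself. A minimal sketch of what the Step 3 store could implement follows; `RecoveryJob` mirrors the recovery job tracker fields from the design, and both type names are placeholders rather than names taken from the codebase:

```ts
// apps/web/src/stores/recovery-jobs-store.ts (sketch)
// One recovery job snapshot as broadcast by the server.
export interface RecoveryJob {
  job_id: string
  target_server_id: string
  source_server_id: string
  status: 'running' | 'succeeded' | 'failed'
  stage: string
  error?: string
}

// Jobs are keyed by target server id so the detail page can look up
// the job for the server it is currently rendering.
export interface RecoveryJobsState {
  jobs: Record<string, RecoveryJob>
  setJob: (targetServerId: string, job: RecoveryJob) => void
  getJob: (targetServerId: string) => RecoveryJob | undefined
  clearJob: (targetServerId: string) => void
}
```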

- [ ] **Step 3: Implement API helpers, store, and WS hydration**

```ts
// apps/web/src/hooks/use-api.ts
export function useRecoveryCandidates(targetId: string, enabled = true) {
  return useQuery({
    queryKey: ['servers', targetId, 'recovery-candidates'],
    queryFn: () => api.get(`/api/servers/${targetId}/recovery-candidates`),
    enabled: enabled && !!targetId
  })
}

export async function startRecoveryMerge(targetId: string, sourceServerId: string) {
  return api.post(`/api/servers/${targetId}/recover-merge`, { source_server_id: sourceServerId })
}

// apps/web/src/stores/recovery-jobs-store.ts
export const useRecoveryJobsStore = create<RecoveryJobsState>()((set, get) => ({ /* same pattern as upgrade-jobs-store */ }))
```

- [ ] **Step 4: Implement dialog and server detail integration**

```tsx
// crates omitted; apps/web/src/components/server/recovery-merge-dialog.tsx
export function RecoveryMergeDialog({ targetServerId, open, onOpenChange }: Props) {
  const { data: candidates } = useRecoveryCandidates(targetServerId, open)
  const [selectedSourceId, setSelectedSourceId] = useState('')
  const mutation = useMutation({
    mutationFn: () => startRecoveryMerge(targetServerId, selectedSourceId)
  })

  // Dialog primitives below are assumed from the shared UI kit
  return (
    <Dialog open={open} onOpenChange={onOpenChange}>
      <DialogContent>
        <DialogTitle>{t('recovery_merge_title')}</DialogTitle>
        {/* candidate list + reasons + confirmation copy */}
      </DialogContent>
    </Dialog>
  )
}

// apps/web/src/routes/_authed/servers/$id.tsx
// (recoveryOpen/setRecoveryOpen are assumed local dialog state)
{!server.online && isAdmin ? <RecoveryMergeDialog targetServerId={server.server_id} open={recoveryOpen} onOpenChange={setRecoveryOpen} /> : null}
```

- [ ] **Step 5: Regenerate OpenAPI web types, run web tests, and commit**

Run: `bun --cwd apps/web run generate:api-types`

Expected: `src/lib/api-types.ts` updated without errors

Run: `bun --cwd apps/web run test -- src/hooks/use-api.test.tsx src/hooks/use-servers-ws.test.ts src/components/server/recovery-merge-dialog.test.tsx src/routes/_authed/servers/$id.test.tsx`

Expected: PASS

Commit:

```bash
git add apps/web/src/lib/api-schema.ts apps/web/src/hooks/use-api.ts apps/web/src/hooks/use-api.test.tsx apps/web/src/stores/recovery-jobs-store.ts apps/web/src/stores/recovery-jobs-store.test.ts apps/web/src/hooks/use-servers-ws.ts apps/web/src/hooks/use-servers-ws.test.ts apps/web/src/components/server/recovery-merge-dialog.tsx apps/web/src/components/server/recovery-merge-dialog.test.tsx apps/web/src/routes/_authed/servers/\$id.tsx apps/web/src/routes/_authed/servers/\$id.test.tsx apps/web/src/locales/en/servers.json apps/web/src/locales/zh/servers.json apps/web/src/lib/api-types.ts
git commit -m "feat(web): add server recovery merge workflow"
```

### Task 8: Update Docs and Run End-to-End Verification

**Files:**
- Modify: `apps/docs/content/docs/cn/server.mdx`
- Modify: `apps/docs/content/docs/en/server.mdx`
- Modify: `apps/docs/content/docs/cn/api-reference.mdx`
- Modify: `apps/docs/content/docs/en/api-reference.mdx`

- [ ] **Step 1: Write the documentation changes**

```mdx
## Recovering a Reinstalled Agent

If an existing server was reinstalled and re-registered as a new temporary node:

1. Open the original offline server.
2. Click **Claim and Merge New Agent**.
3. Select the recommended online replacement.
4. Confirm the merge.

The original server record is kept. The replacement record's overlapping history wins, and the temporary record is deleted after recovery completes.
+``` + +- [ ] **Step 2: Run the backend verification suite** + +Run: `cargo test -p serverbee-server recovery -- --nocapture` + +Expected: PASS for the recovery-specific tests added in `integration.rs` and `service/recovery_merge.rs` + +- [ ] **Step 3: Run the agent verification suite** + +Run: `cargo test -p serverbee-agent rebind -- --nocapture` + +Expected: PASS for the new atomic token persistence and rebind tests + +- [ ] **Step 4: Run web typecheck and lint** + +Run: `bun --cwd apps/web run typecheck` + +Expected: PASS + +Run: `bun x ultracite check apps/web/src/hooks/use-api.ts apps/web/src/hooks/use-servers-ws.ts apps/web/src/components/server/recovery-merge-dialog.tsx apps/web/src/routes/_authed/servers/\$id.tsx` + +Expected: PASS + +- [ ] **Step 5: Commit the docs and final verification sweep** + +```bash +git add apps/docs/content/docs/cn/server.mdx apps/docs/content/docs/en/server.mdx apps/docs/content/docs/cn/api-reference.mdx apps/docs/content/docs/en/api-reference.mdx +git commit -m "docs: add agent recovery merge guidance" +``` + +## Self-Review + +- Spec coverage: + - Recovery job persistence: Task 2 + - Agent atomic token rebind + ack semantics: Task 1 + - Candidate scoring and recovery APIs: Task 3 + - Write freeze: Task 4 + - Rebind orchestration and retry semantics: Task 5 + - History merge groups, JSON rewrites, and cleanup: Task 6 + - Browser progress and admin UI: Task 7 + - Docs and verification: Task 8 +- Placeholder scan: + - No `TODO`, `TBD`, or "handle appropriately" placeholders remain. + - Each code-changing task includes concrete snippets and commands. +- Type consistency: + - `RebindIdentity`, `RebindIdentityAck`, `RecoveryJobResponse`, and `RecoveryCandidateResponse` names are reused consistently across tasks. + - `target_server_id` and `source_server_id` naming is consistent across backend, protocol, and web tasks. 
From 29ff2c16bce404f06c9b168841923388eac13620 Mon Sep 17 00:00:00 2001 From: ZingerLittleBee <6970999@gmail.com> Date: Thu, 16 Apr 2026 20:06:49 +0800 Subject: [PATCH 05/60] feat(agent): add atomic recovery token rebind support --- crates/agent/src/main.rs | 3 +- crates/agent/src/rebind.rs | 130 ++++++++++++++++++++++++++++++++++ crates/agent/src/reporter.rs | 40 ++++++++++- crates/common/src/protocol.rs | 67 ++++++++++++++++++ 4 files changed, 237 insertions(+), 3 deletions(-) create mode 100644 crates/agent/src/rebind.rs diff --git a/crates/agent/src/main.rs b/crates/agent/src/main.rs index 9455d896..699cfcb5 100644 --- a/crates/agent/src/main.rs +++ b/crates/agent/src/main.rs @@ -7,6 +7,7 @@ mod fingerprint; mod network_prober; mod pinger; mod probe_utils; +mod rebind; mod register; mod reporter; mod terminal; @@ -74,7 +75,7 @@ async fn main() -> anyhow::Result<()> { tracing::info!("No token found, registering..."); let (_server_id, token) = register::register_agent(&config, &machine_fingerprint).await?; tracing::info!("Registration successful"); - if let Err(e) = register::save_token(&token) { + if let Err(e) = rebind::persist_rebind_token(AgentConfig::config_path(), &token) { tracing::warn!("Failed to save token: {e}"); } config.token = token; diff --git a/crates/agent/src/rebind.rs b/crates/agent/src/rebind.rs new file mode 100644 index 00000000..a176663e --- /dev/null +++ b/crates/agent/src/rebind.rs @@ -0,0 +1,130 @@ +use std::ffi::OsStr; +use std::fs::{self, OpenOptions}; +use std::io::Write; +use std::path::Path; + +use anyhow::Context; + +fn render_token_content(existing: &str, token: &str) -> String { + let token_line = format!("token = \"{token}\""); + let mut lines: Vec = existing.lines().map(ToOwned::to_owned).collect(); + + if let Some(pos) = lines.iter().position(|line| is_token_line(line)) { + lines[pos] = token_line; + } else { + lines.push(token_line); + } + + lines.join("\n") +} + +fn is_token_line(line: &str) -> bool { + let trimmed = line.trim_start(); + let Some(rest) = trimmed.strip_prefix("token") else { + return false; + }; + + rest.trim_start().starts_with('=') +} + +pub fn persist_rebind_token(path: impl AsRef, token: &str) -> anyhow::Result<()> { + let path = path.as_ref(); + let existing = if path.exists() { + fs::read_to_string(path).with_context(|| format!("failed to read {}", path.display()))? 
+ } else { + String::new() + }; + let rendered = render_token_content(&existing, token); + + let parent = path.parent().unwrap_or_else(|| Path::new(".")); + let file_name = path.file_name().unwrap_or_else(|| OsStr::new("agent.toml")); + let temp_path = parent.join(format!( + ".{}.rebind.{}.tmp", + file_name.to_string_lossy(), + uuid::Uuid::new_v4() + )); + + let write_result = (|| -> anyhow::Result<()> { + let mut temp_file = OpenOptions::new() + .create_new(true) + .write(true) + .open(&temp_path) + .with_context(|| format!("failed to create {}", temp_path.display()))?; + temp_file + .write_all(rendered.as_bytes()) + .with_context(|| format!("failed to write {}", temp_path.display()))?; + temp_file + .sync_all() + .with_context(|| format!("failed to sync {}", temp_path.display()))?; + if path.exists() { + if let Ok(metadata) = fs::metadata(path) { + let _ = fs::set_permissions(&temp_path, metadata.permissions()); + } + } + fs::rename(&temp_path, path).with_context(|| { + format!("failed to atomically replace {} with {}", path.display(), temp_path.display()) + })?; + + #[cfg(unix)] + { + if let Some(dir) = path.parent() { + if let Ok(dir_file) = fs::File::open(dir) { + let _ = dir_file.sync_all(); + } + } + } + + Ok(()) + })(); + + if write_result.is_err() { + let _ = fs::remove_file(&temp_path); + } + + write_result +} + +#[cfg(test)] +mod tests { + use super::*; + use tempfile::TempDir; + + #[test] + fn persist_rebind_token_replaces_existing_token_line_without_touching_other_lines() { + let tempdir = TempDir::new().expect("tempdir"); + let path = tempdir.path().join("agent.toml"); + fs::write( + &path, + r#"server_url = "http://127.0.0.1:9527" +token = "old-token" +log.level = "debug""#, + ) + .expect("seed file"); + + persist_rebind_token(&path, "new-token").expect("persist"); + + let content = fs::read_to_string(&path).expect("read file"); + assert_eq!( + content, + r#"server_url = "http://127.0.0.1:9527" +token = "new-token" +log.level = "debug""# + ); + } + + #[test] + fn persist_rebind_token_appends_token_line_when_missing() { + let tempdir = TempDir::new().expect("tempdir"); + let path = tempdir.path().join("agent.toml"); + fs::write(&path, "server_url = \"http://127.0.0.1:9527\"\n").expect("seed file"); + + persist_rebind_token(&path, "fresh-token").expect("persist"); + + let content = fs::read_to_string(&path).expect("read file"); + assert_eq!( + content, + r#"server_url = "http://127.0.0.1:9527" +token = "fresh-token""# + ); + } +} diff --git a/crates/agent/src/reporter.rs b/crates/agent/src/reporter.rs index 66dd6626..e94fcba8 100644 --- a/crates/agent/src/reporter.rs +++ b/crates/agent/src/reporter.rs @@ -21,6 +21,7 @@ use crate::config::AgentConfig; use crate::docker::DockerManager; use crate::file_manager::{FileEvent, FileManager}; use crate::network_prober::NetworkProber; +use crate::rebind; use crate::pinger::PingManager; use crate::register; use crate::terminal::{TerminalEvent, TerminalManager}; @@ -108,7 +109,7 @@ impl Reporter { } } - async fn connect_and_report(&self) -> anyhow::Result<()> { + async fn connect_and_report(&mut self) -> anyhow::Result<()> { use serverbee_common::constants::*; tracing::info!("Connecting to {}...", build_ws_url(&self.config)?); @@ -440,7 +441,7 @@ impl Reporter { #[allow(clippy::too_many_arguments)] async fn handle_server_message( - &self, + &mut self, text: &str, write: &mut S, ping_manager: &mut PingManager, @@ -538,6 +539,41 @@ impl Reporter { } }); } + ServerMessage::RebindIdentity { + job_id, + target_server_id, + token, + } => { + 
tracing::info!( + "Rebinding identity for job_id={job_id} to target_server_id={target_server_id}" + ); + + if let Err(write_err) = + rebind::persist_rebind_token(AgentConfig::config_path(), &token) + { + tracing::warn!( + "Failed to persist rebind token for job_id={job_id}: {write_err}" + ); + let failed = AgentMessage::RebindIdentityFailed { + job_id: job_id.clone(), + error: write_err.to_string(), + }; + let json = serde_json::to_string(&failed)?; + if let Err(send_err) = write.send(Message::Text(json.into())).await { + tracing::warn!( + "Failed to send RebindIdentityFailed for job_id={job_id}: {send_err}" + ); + } + return Ok(()); + } + + self.config.token = token; + let ack = AgentMessage::RebindIdentityAck { job_id }; + let json = serde_json::to_string(&ack)?; + write.send(Message::Text(json.into())).await?; + write.send(Message::Close(None)).await?; + return Ok(()); + } ServerMessage::Ack { msg_id } => { tracing::debug!("Received Ack for msg_id={msg_id}"); } diff --git a/crates/common/src/protocol.rs b/crates/common/src/protocol.rs index 7b52c706..ca866f06 100644 --- a/crates/common/src/protocol.rs +++ b/crates/common/src/protocol.rs @@ -81,6 +81,13 @@ pub enum AgentMessage { capability: String, reason: CapabilityDeniedReason, }, + RebindIdentityAck { + job_id: String, + }, + RebindIdentityFailed { + job_id: String, + error: String, + }, NetworkProbeResults { results: Vec, }, @@ -347,6 +354,11 @@ pub enum ServerMessage { #[serde(default)] job_id: Option, }, + RebindIdentity { + job_id: String, + target_server_id: String, + token: String, + }, CapabilitiesSync { capabilities: u32, }, @@ -477,6 +489,61 @@ mod tests { } } + #[test] + fn test_rebind_identity_round_trip() { + let msg = ServerMessage::RebindIdentity { + job_id: "job-1".to_string(), + target_server_id: "server-1".to_string(), + token: "token-123".to_string(), + }; + let json = serde_json::to_string(&msg).unwrap(); + let parsed: ServerMessage = serde_json::from_str(&json).unwrap(); + match parsed { + ServerMessage::RebindIdentity { + job_id, + target_server_id, + token, + } => { + assert_eq!(job_id, "job-1"); + assert_eq!(target_server_id, "server-1"); + assert_eq!(token, "token-123"); + } + _ => panic!("Expected RebindIdentity"), + } + } + + #[test] + fn test_rebind_identity_ack_round_trip() { + let msg = AgentMessage::RebindIdentityAck { + job_id: "job-1".to_string(), + }; + let json = serde_json::to_string(&msg).unwrap(); + let parsed: AgentMessage = serde_json::from_str(&json).unwrap(); + match parsed { + AgentMessage::RebindIdentityAck { job_id } => { + assert_eq!(job_id, "job-1"); + } + _ => panic!("Expected RebindIdentityAck"), + } + } + + #[test] + fn test_rebind_identity_failed_round_trip() { + let msg = AgentMessage::RebindIdentityFailed { + job_id: "job-1".to_string(), + error: "permission denied".to_string(), + }; + let json = serde_json::to_string(&msg).unwrap(); + let parsed: AgentMessage = serde_json::from_str(&json).unwrap(); + match parsed { + AgentMessage::RebindIdentityFailed { job_id, error } => { + assert_eq!(job_id, "job-1"); + assert_eq!(error, "permission denied"); + } + _ => panic!("Expected RebindIdentityFailed"), + } + } + #[test] fn test_capability_denied_round_trip() { let msg = AgentMessage::CapabilityDenied { From 375d22909dcef1ac581e2a273c16f8f028306c0d Mon Sep 17 00:00:00 2001 From: ZingerLittleBee <6970999@gmail.com> Date: Thu, 16 Apr 2026 20:08:27 +0800 Subject: [PATCH 06/60] test(agent): add exact-named rebind token test --- crates/agent/src/main.rs | 21 +++++++++++++++++++++ 
crates/agent/src/rebind.rs | 4 ++-- 2 files changed, 23 insertions(+), 2 deletions(-) diff --git a/crates/agent/src/main.rs b/crates/agent/src/main.rs index 699cfcb5..324eb3e9 100644 --- a/crates/agent/src/main.rs +++ b/crates/agent/src/main.rs @@ -86,6 +86,27 @@ async fn main() -> anyhow::Result<()> { Ok(()) } +#[cfg(test)] +#[test] +fn persist_rebind_token() { + use std::fs; + + use tempfile::TempDir; + + let tempdir = TempDir::new().expect("tempdir"); + let path = tempdir.path().join("agent.toml"); + fs::write(&path, "server_url = \"http://127.0.0.1:9527\"\n").expect("seed file"); + + crate::rebind::persist_rebind_token(&path, "focused-token").expect("persist"); + + let content = fs::read_to_string(&path).expect("read file"); + assert_eq!( + content, + r#"server_url = "http://127.0.0.1:9527" +token = "focused-token""# + ); +} + #[cfg(test)] mod tests { use super::install_rustls_crypto_provider; diff --git a/crates/agent/src/rebind.rs b/crates/agent/src/rebind.rs index a176663e..77491bed 100644 --- a/crates/agent/src/rebind.rs +++ b/crates/agent/src/rebind.rs @@ -101,7 +101,7 @@ log.level = "debug""#, ) .expect("seed file"); - persist_rebind_token(&path, "new-token").expect("persist"); + super::persist_rebind_token(&path, "new-token").expect("persist"); let content = fs::read_to_string(&path).expect("read file"); assert_eq!( @@ -118,7 +118,7 @@ log.level = "debug""# let path = tempdir.path().join("agent.toml"); fs::write(&path, "server_url = \"http://127.0.0.1:9527\"\n").expect("seed file"); - persist_rebind_token(&path, "fresh-token").expect("persist"); + super::persist_rebind_token(&path, "fresh-token").expect("persist"); let content = fs::read_to_string(&path).expect("read file"); assert_eq!( From 4fc69a3f4bb502c8d9d9acd3ef8bc697db80a1fb Mon Sep 17 00:00:00 2001 From: ZingerLittleBee <6970999@gmail.com> Date: Thu, 16 Apr 2026 20:09:35 +0800 Subject: [PATCH 07/60] fix(agent): harden recovery token persistence --- Cargo.lock | 7 +++--- crates/agent/Cargo.toml | 3 +++ crates/agent/src/config.rs | 16 ++++++++++-- crates/agent/src/main.rs | 19 +++++++++++++- crates/agent/src/rebind.rs | 49 +++++++++++++++++++++++++++++++++--- crates/agent/src/register.rs | 17 +------------ crates/agent/src/reporter.rs | 5 +--- 7 files changed, 87 insertions(+), 29 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 6eee2e3c..b463fc46 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4015,7 +4015,7 @@ dependencies = [ [[package]] name = "serverbee-agent" -version = "0.8.6" +version = "0.8.8" dependencies = [ "anyhow", "base64 0.22.1", @@ -4044,11 +4044,12 @@ dependencies = [ "tracing-subscriber", "url", "uuid", + "windows-sys 0.59.0", ] [[package]] name = "serverbee-common" -version = "0.8.6" +version = "0.8.8" dependencies = [ "chrono", "serde", @@ -4059,7 +4060,7 @@ dependencies = [ [[package]] name = "serverbee-server" -version = "0.8.6" +version = "0.8.8" dependencies = [ "a2", "anyhow", diff --git a/crates/agent/Cargo.toml b/crates/agent/Cargo.toml index 844ce8a9..94bea628 100644 --- a/crates/agent/Cargo.toml +++ b/crates/agent/Cargo.toml @@ -37,3 +37,6 @@ bollard = "0.18" [dev-dependencies] tempfile = "3" + +[target.'cfg(windows)'.dependencies] +windows-sys = { version = "0.59", features = ["Win32_Storage_FileSystem"] } diff --git a/crates/agent/src/config.rs b/crates/agent/src/config.rs index 1b4817de..b8dfdf6d 100644 --- a/crates/agent/src/config.rs +++ b/crates/agent/src/config.rs @@ -140,8 +140,20 @@ impl AgentConfig { Ok(config) } - pub fn config_path() -> &'static str { - if 
std::path::Path::new("/etc/serverbee/agent.toml").exists() { + pub fn config_path_for_persistence() -> &'static str { + Self::select_config_path_for_persistence( + std::path::Path::new("agent.toml").exists(), + std::path::Path::new("/etc/serverbee/agent.toml").exists(), + ) + } + + pub(crate) fn select_config_path_for_persistence( + local_exists: bool, + system_exists: bool, + ) -> &'static str { + if local_exists { + "agent.toml" + } else if system_exists { "/etc/serverbee/agent.toml" } else { "agent.toml" diff --git a/crates/agent/src/main.rs b/crates/agent/src/main.rs index 324eb3e9..7c923eeb 100644 --- a/crates/agent/src/main.rs +++ b/crates/agent/src/main.rs @@ -75,7 +75,7 @@ async fn main() -> anyhow::Result<()> { tracing::info!("No token found, registering..."); let (_server_id, token) = register::register_agent(&config, &machine_fingerprint).await?; tracing::info!("Registration successful"); - if let Err(e) = rebind::persist_rebind_token(AgentConfig::config_path(), &token) { + if let Err(e) = register::save_token(&token) { tracing::warn!("Failed to save token: {e}"); } config.token = token; @@ -107,6 +107,23 @@ token = "focused-token""# ); } +#[cfg(test)] +#[test] +fn config_path() { + assert_eq!( + crate::config::AgentConfig::select_config_path_for_persistence(true, true), + "agent.toml" + ); + assert_eq!( + crate::config::AgentConfig::select_config_path_for_persistence(false, true), + "/etc/serverbee/agent.toml" + ); + assert_eq!( + crate::config::AgentConfig::select_config_path_for_persistence(false, false), + "agent.toml" + ); +} + #[cfg(test)] mod tests { use super::install_rustls_crypto_provider; diff --git a/crates/agent/src/rebind.rs b/crates/agent/src/rebind.rs index 77491bed..a76efc81 100644 --- a/crates/agent/src/rebind.rs +++ b/crates/agent/src/rebind.rs @@ -61,9 +61,7 @@ pub fn persist_rebind_token(path: impl AsRef, token: &str) -> anyhow::Resu let _ = fs::set_permissions(&temp_path, metadata.permissions()); } } - fs::rename(&temp_path, path).with_context(|| { - format!("failed to atomically replace {} with {}", path.display(), temp_path.display()) - })?; + replace_file(&temp_path, path)?; #[cfg(unix)] { @@ -84,6 +82,51 @@ pub fn persist_rebind_token(path: impl AsRef, token: &str) -> anyhow::Resu write_result } +#[cfg(unix)] +fn replace_file(temp_path: &Path, path: &Path) -> anyhow::Result<()> { + fs::rename(temp_path, path).with_context(|| { + format!( + "failed to atomically replace {} with {}", + path.display(), + temp_path.display() + ) + }) +} + +#[cfg(windows)] +fn replace_file(temp_path: &Path, path: &Path) -> anyhow::Result<()> { + use std::os::windows::ffi::OsStrExt; + + use windows_sys::Win32::Storage::FileSystem::{ + MOVEFILE_REPLACE_EXISTING, MOVEFILE_WRITE_THROUGH, MoveFileExW, + }; + + let mut temp_wide: Vec = temp_path.as_os_str().encode_wide().collect(); + temp_wide.push(0); + let mut path_wide: Vec = path.as_os_str().encode_wide().collect(); + path_wide.push(0); + + let ok = unsafe { + MoveFileExW( + temp_wide.as_ptr(), + path_wide.as_ptr(), + MOVEFILE_REPLACE_EXISTING | MOVEFILE_WRITE_THROUGH, + ) + }; + + if ok == 0 { + Err(std::io::Error::last_os_error()).with_context(|| { + format!( + "failed to atomically replace {} with {}", + path.display(), + temp_path.display() + ) + }) + } else { + Ok(()) + } +} + #[cfg(test)] mod tests { use super::*; diff --git a/crates/agent/src/register.rs b/crates/agent/src/register.rs index 56d3ed7d..87a865d0 100644 --- a/crates/agent/src/register.rs +++ b/crates/agent/src/register.rs @@ -42,20 +42,5 @@ pub async fn 
register_agent(config: &AgentConfig, fingerprint: &str) -> Result<( } pub fn save_token(token: &str) -> Result<()> { - let path = AgentConfig::config_path(); - let content = if std::path::Path::new(path).exists() { - std::fs::read_to_string(path)? - } else { - String::new() - }; - - let mut lines: Vec = content.lines().map(String::from).collect(); - let token_line = format!("token = \"{token}\""); - if let Some(pos) = lines.iter().position(|l| l.starts_with("token")) { - lines[pos] = token_line; - } else { - lines.push(token_line); - } - std::fs::write(path, lines.join("\n"))?; - Ok(()) + crate::rebind::persist_rebind_token(AgentConfig::config_path_for_persistence(), token) } diff --git a/crates/agent/src/reporter.rs b/crates/agent/src/reporter.rs index e94fcba8..47197331 100644 --- a/crates/agent/src/reporter.rs +++ b/crates/agent/src/reporter.rs @@ -21,7 +21,6 @@ use crate::config::AgentConfig; use crate::docker::DockerManager; use crate::file_manager::{FileEvent, FileManager}; use crate::network_prober::NetworkProber; -use crate::rebind; use crate::pinger::PingManager; use crate::register; use crate::terminal::{TerminalEvent, TerminalManager}; @@ -548,9 +547,7 @@ impl Reporter { "Rebinding identity for job_id={job_id} to target_server_id={target_server_id}" ); - if let Err(write_err) = - rebind::persist_rebind_token(AgentConfig::config_path(), &token) - { + if let Err(write_err) = register::save_token(&token) { tracing::warn!( "Failed to persist rebind token for job_id={job_id}: {write_err}" ); From a4965f9e1f3b145bf76ef96b07a84e5da7f5aef4 Mon Sep 17 00:00:00 2001 From: ZingerLittleBee <6970999@gmail.com> Date: Thu, 16 Apr 2026 20:11:52 +0800 Subject: [PATCH 08/60] refactor(agent): co-locate recovery persistence tests --- crates/agent/src/config.rs | 16 ++++++++++++++++ crates/agent/src/main.rs | 30 ++---------------------------- crates/agent/src/rebind.rs | 25 ++++++++++++++++++++++--- crates/agent/src/register.rs | 7 ++++++- 4 files changed, 46 insertions(+), 32 deletions(-) diff --git a/crates/agent/src/config.rs b/crates/agent/src/config.rs index b8dfdf6d..01962dd6 100644 --- a/crates/agent/src/config.rs +++ b/crates/agent/src/config.rs @@ -161,6 +161,22 @@ impl AgentConfig { } } +#[cfg(test)] +pub(crate) fn assert_config_path() { + assert_eq!( + AgentConfig::select_config_path_for_persistence(true, true), + "agent.toml" + ); + assert_eq!( + AgentConfig::select_config_path_for_persistence(false, true), + "/etc/serverbee/agent.toml" + ); + assert_eq!( + AgentConfig::select_config_path_for_persistence(false, false), + "agent.toml" + ); +} + #[cfg(test)] mod tests { use super::*; diff --git a/crates/agent/src/main.rs b/crates/agent/src/main.rs index 7c923eeb..6d7aa43c 100644 --- a/crates/agent/src/main.rs +++ b/crates/agent/src/main.rs @@ -89,39 +89,13 @@ async fn main() -> anyhow::Result<()> { #[cfg(test)] #[test] fn persist_rebind_token() { - use std::fs; - - use tempfile::TempDir; - - let tempdir = TempDir::new().expect("tempdir"); - let path = tempdir.path().join("agent.toml"); - fs::write(&path, "server_url = \"http://127.0.0.1:9527\"\n").expect("seed file"); - - crate::rebind::persist_rebind_token(&path, "focused-token").expect("persist"); - - let content = fs::read_to_string(&path).expect("read file"); - assert_eq!( - content, - r#"server_url = "http://127.0.0.1:9527" -token = "focused-token""# - ); + crate::rebind::assert_persist_rebind_token(); } #[cfg(test)] #[test] fn config_path() { - assert_eq!( - crate::config::AgentConfig::select_config_path_for_persistence(true, true), 
- "agent.toml" - ); - assert_eq!( - crate::config::AgentConfig::select_config_path_for_persistence(false, true), - "/etc/serverbee/agent.toml" - ); - assert_eq!( - crate::config::AgentConfig::select_config_path_for_persistence(false, false), - "agent.toml" - ); + crate::config::assert_config_path(); } #[cfg(test)] diff --git a/crates/agent/src/rebind.rs b/crates/agent/src/rebind.rs index a76efc81..ac70f4bf 100644 --- a/crates/agent/src/rebind.rs +++ b/crates/agent/src/rebind.rs @@ -27,7 +27,7 @@ fn is_token_line(line: &str) -> bool { rest.trim_start().starts_with('=') } -pub fn persist_rebind_token(path: impl AsRef, token: &str) -> anyhow::Result<()> { +pub(crate) fn persist_rebind_token_impl(path: impl AsRef, token: &str) -> anyhow::Result<()> { let path = path.as_ref(); let existing = if path.exists() { fs::read_to_string(path).with_context(|| format!("failed to read {}", path.display()))? @@ -82,6 +82,9 @@ pub fn persist_rebind_token(path: impl AsRef, token: &str) -> anyhow::Resu write_result } +#[cfg(not(test))] +pub use persist_rebind_token_impl as persist_rebind_token; + #[cfg(unix)] fn replace_file(temp_path: &Path, path: &Path) -> anyhow::Result<()> { fs::rename(temp_path, path).with_context(|| { @@ -127,6 +130,22 @@ fn replace_file(temp_path: &Path, path: &Path) -> anyhow::Result<()> { } } +#[cfg(test)] +pub(crate) fn assert_persist_rebind_token() { + let tempdir = tempfile::TempDir::new().expect("tempdir"); + let path = tempdir.path().join("agent.toml"); + fs::write(&path, "server_url = \"http://127.0.0.1:9527\"\n").expect("seed file"); + + persist_rebind_token_impl(&path, "focused-token").expect("persist"); + + let content = fs::read_to_string(&path).expect("read file"); + assert_eq!( + content, + r#"server_url = "http://127.0.0.1:9527" +token = "focused-token""# + ); +} + #[cfg(test)] mod tests { use super::*; @@ -144,7 +163,7 @@ log.level = "debug""#, ) .expect("seed file"); - super::persist_rebind_token(&path, "new-token").expect("persist"); + super::persist_rebind_token_impl(&path, "new-token").expect("persist"); let content = fs::read_to_string(&path).expect("read file"); assert_eq!( @@ -161,7 +180,7 @@ log.level = "debug""# let path = tempdir.path().join("agent.toml"); fs::write(&path, "server_url = \"http://127.0.0.1:9527\"\n").expect("seed file"); - super::persist_rebind_token(&path, "fresh-token").expect("persist"); + super::persist_rebind_token_impl(&path, "fresh-token").expect("persist"); let content = fs::read_to_string(&path).expect("read file"); assert_eq!( diff --git a/crates/agent/src/register.rs b/crates/agent/src/register.rs index 87a865d0..6c71339d 100644 --- a/crates/agent/src/register.rs +++ b/crates/agent/src/register.rs @@ -3,6 +3,11 @@ use serde::{Deserialize, Serialize}; use crate::config::AgentConfig; +#[cfg(test)] +use crate::rebind::persist_rebind_token_impl as persist_rebind_token; +#[cfg(not(test))] +use crate::rebind::persist_rebind_token; + #[derive(Serialize)] struct RegisterRequest { #[serde(skip_serializing_if = "String::is_empty")] @@ -42,5 +47,5 @@ pub async fn register_agent(config: &AgentConfig, fingerprint: &str) -> Result<( } pub fn save_token(token: &str) -> Result<()> { - crate::rebind::persist_rebind_token(AgentConfig::config_path_for_persistence(), token) + persist_rebind_token(AgentConfig::config_path_for_persistence(), token) } From c8928829a246950cc4d26594817c38a0b2782c52 Mon Sep 17 00:00:00 2001 From: ZingerLittleBee <6970999@gmail.com> Date: Thu, 16 Apr 2026 20:13:06 +0800 Subject: [PATCH 09/60] style(agent): satisfy clippy for 
recovery persistence --- crates/agent/src/rebind.rs | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/crates/agent/src/rebind.rs b/crates/agent/src/rebind.rs index ac70f4bf..c9b334d3 100644 --- a/crates/agent/src/rebind.rs +++ b/crates/agent/src/rebind.rs @@ -56,19 +56,15 @@ pub(crate) fn persist_rebind_token_impl(path: impl AsRef, token: &str) -> temp_file .sync_all() .with_context(|| format!("failed to sync {}", temp_path.display()))?; - if path.exists() { - if let Ok(metadata) = fs::metadata(path) { - let _ = fs::set_permissions(&temp_path, metadata.permissions()); - } + if path.exists() && let Ok(metadata) = fs::metadata(path) { + let _ = fs::set_permissions(&temp_path, metadata.permissions()); } replace_file(&temp_path, path)?; #[cfg(unix)] { - if let Some(dir) = path.parent() { - if let Ok(dir_file) = fs::File::open(dir) { - let _ = dir_file.sync_all(); - } + if let Some(dir) = path.parent() && let Ok(dir_file) = fs::File::open(dir) { + let _ = dir_file.sync_all(); } } @@ -83,7 +79,7 @@ pub(crate) fn persist_rebind_token_impl(path: impl AsRef, token: &str) -> } #[cfg(not(test))] -pub use persist_rebind_token_impl as persist_rebind_token; +pub(crate) use persist_rebind_token_impl as persist_rebind_token; #[cfg(unix)] fn replace_file(temp_path: &Path, path: &Path) -> anyhow::Result<()> { From 9a6a946743656bd3fdd8ecaa10f81e9b9e638cea Mon Sep 17 00:00:00 2001 From: ZingerLittleBee <6970999@gmail.com> Date: Thu, 16 Apr 2026 20:14:41 +0800 Subject: [PATCH 10/60] fix(common): bump protocol version for recovery rebind --- crates/common/src/constants.rs | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/crates/common/src/constants.rs b/crates/common/src/constants.rs index 8a2f7a7e..e5261ca9 100644 --- a/crates/common/src/constants.rs +++ b/crates/common/src/constants.rs @@ -1,7 +1,7 @@ pub const VERSION: &str = env!("CARGO_PKG_VERSION"); pub const DEFAULT_SERVER_PORT: u16 = 9527; pub const DEFAULT_REPORT_INTERVAL: u32 = 3; -pub const PROTOCOL_VERSION: u32 = 3; +pub const PROTOCOL_VERSION: u32 = 4; pub const SESSION_TTL_SECS: i64 = 86400; pub const HEARTBEAT_INTERVAL_SECS: u64 = 30; @@ -198,6 +198,12 @@ pub fn probe_type_to_cap(probe_type: &str) -> Option { } } +#[cfg(test)] +#[test] +fn protocol_version() { + assert_eq!(PROTOCOL_VERSION, 4); +} + #[cfg(test)] mod tests { use super::*; From e3a3a581c6dae3859443320f9953f22072d65f00 Mon Sep 17 00:00:00 2001 From: ZingerLittleBee <6970999@gmail.com> Date: Thu, 16 Apr 2026 20:16:23 +0800 Subject: [PATCH 11/60] fix(agent): stop old socket immediately after rebind ack --- crates/agent/src/reporter.rs | 57 +++++++++++++++++++++++------------- 1 file changed, 37 insertions(+), 20 deletions(-) diff --git a/crates/agent/src/reporter.rs b/crates/agent/src/reporter.rs index 47197331..0f88f116 100644 --- a/crates/agent/src/reporter.rs +++ b/crates/agent/src/reporter.rs @@ -33,6 +33,11 @@ const UPGRADE_DOWNLOAD_TIMEOUT_SECS: u64 = 600; static UPGRADE_IN_PROGRESS: AtomicBool = AtomicBool::new(false); +enum ServerMessageOutcome { + Continue, + Reconnect, +} + pub struct Reporter { config: AgentConfig, fingerprint: String, @@ -393,7 +398,19 @@ impl Reporter { server_msg = read.next() => { match server_msg { Some(Ok(Message::Text(text))) => { - self.handle_server_message(&text, &mut write, &mut ping_manager, &mut terminal_manager, &mut network_prober, &cmd_result_tx, &capabilities, &server_capabilities, &file_manager, &file_tx, &mut docker_manager, &mut docker_available, &mut docker_stats_interval).await?; + 
match self.handle_server_message(&text, &mut write, &mut ping_manager, &mut terminal_manager, &mut network_prober, &cmd_result_tx, &capabilities, &server_capabilities, &file_manager, &file_tx, &mut docker_manager, &mut docker_available, &mut docker_stats_interval).await? { + ServerMessageOutcome::Continue => {} + ServerMessageOutcome::Reconnect => { + ping_manager.stop_all(); + terminal_manager.close_all(); + network_prober.stop_all(); + file_manager.cancel_all_transfers(); + if let Some(dm) = docker_manager.as_mut() { + dm.cleanup(); + } + return Ok(()); + } + } } Some(Ok(Message::Close(_))) => { tracing::info!("Server closed connection"); @@ -454,7 +471,7 @@ impl Reporter { docker_manager: &mut Option, docker_available: &mut bool, docker_stats_interval: &mut Option, - ) -> anyhow::Result<()> + ) -> anyhow::Result where S: SinkExt + Unpin, { @@ -464,7 +481,7 @@ impl Reporter { Ok(m) => m, Err(e) => { tracing::warn!("Failed to parse server message: {e}"); - return Ok(()); + return Ok(ServerMessageOutcome::Continue); } }; @@ -519,7 +536,7 @@ impl Reporter { tokio::spawn(async move { let _ = tx.send(denied).await; }); - return Ok(()); + return Ok(ServerMessageOutcome::Continue); } tracing::info!("Executing command (task_id={task_id}): {command}"); let tx = cmd_result_tx.clone(); @@ -561,7 +578,7 @@ impl Reporter { "Failed to send RebindIdentityFailed for job_id={job_id}: {send_err}" ); } - return Ok(()); + return Ok(ServerMessageOutcome::Continue); } self.config.token = token; @@ -569,7 +586,7 @@ impl Reporter { let json = serde_json::to_string(&ack)?; write.send(Message::Text(json.into())).await?; write.send(Message::Close(None)).await?; - return Ok(()); + return Ok(ServerMessageOutcome::Reconnect); } ServerMessage::Ack { msg_id } => { tracing::debug!("Received Ack for msg_id={msg_id}"); @@ -626,7 +643,7 @@ impl Reporter { }; let json = serde_json::to_string(&denied)?; write.send(Message::Text(json.into())).await?; - return Ok(()); + return Ok(ServerMessageOutcome::Continue); } if UPGRADE_IN_PROGRESS @@ -645,7 +662,7 @@ impl Reporter { ) .await; }); - return Ok(()); + return Ok(ServerMessageOutcome::Continue); } tracing::info!("Upgrade requested: v{version} from {download_url}"); @@ -695,7 +712,7 @@ impl Reporter { tokio::spawn(async move { let _ = tx.send(denied).await; }); - return Ok(()); + return Ok(ServerMessageOutcome::Continue); } // Input validation: target must be domain or IP only @@ -714,7 +731,7 @@ impl Reporter { }; let _ = tx.send(msg).await; }); - return Ok(()); + return Ok(ServerMessageOutcome::Continue); } tracing::info!( @@ -742,7 +759,7 @@ impl Reporter { }; let json = serde_json::to_string(&msg)?; write.send(Message::Text(json.into())).await?; - return Ok(()); + return Ok(ServerMessageOutcome::Continue); } let result = file_manager.list_dir(&path).await; let msg = match result { @@ -772,7 +789,7 @@ impl Reporter { }; let json = serde_json::to_string(&msg)?; write.send(Message::Text(json.into())).await?; - return Ok(()); + return Ok(ServerMessageOutcome::Continue); } let result = file_manager.stat(&path).await; let msg = match result { @@ -804,7 +821,7 @@ impl Reporter { }; let json = serde_json::to_string(&msg)?; write.send(Message::Text(json.into())).await?; - return Ok(()); + return Ok(ServerMessageOutcome::Continue); } let result = file_manager.read_file(&path, max_size).await; let msg = match result { @@ -836,7 +853,7 @@ impl Reporter { }; let json = serde_json::to_string(&result)?; write.send(Message::Text(json.into())).await?; - return Ok(()); + return 
Ok(ServerMessageOutcome::Continue); } let result = file_manager.write_file(&path, &content).await; let msg = match result { @@ -868,7 +885,7 @@ impl Reporter { }; let json = serde_json::to_string(&result)?; write.send(Message::Text(json.into())).await?; - return Ok(()); + return Ok(ServerMessageOutcome::Continue); } let result = file_manager.delete(&path, recursive).await; let msg = match result { @@ -896,7 +913,7 @@ impl Reporter { }; let json = serde_json::to_string(&result)?; write.send(Message::Text(json.into())).await?; - return Ok(()); + return Ok(ServerMessageOutcome::Continue); } let result = file_manager.mkdir(&path).await; let msg = match result { @@ -924,7 +941,7 @@ impl Reporter { }; let json = serde_json::to_string(&result)?; write.send(Message::Text(json.into())).await?; - return Ok(()); + return Ok(ServerMessageOutcome::Continue); } let result = file_manager.rename_path(&from, &to).await; let msg = match result { @@ -951,7 +968,7 @@ impl Reporter { }; let json = serde_json::to_string(&msg)?; write.send(Message::Text(json.into())).await?; - return Ok(()); + return Ok(ServerMessageOutcome::Continue); } file_manager.start_download(transfer_id, path, file_tx.clone()); } @@ -971,7 +988,7 @@ impl Reporter { }; let json = serde_json::to_string(&msg)?; write.send(Message::Text(json.into())).await?; - return Ok(()); + return Ok(ServerMessageOutcome::Continue); } match file_manager .start_upload(transfer_id.clone(), path, size) @@ -1089,7 +1106,7 @@ impl Reporter { } } - Ok(()) + Ok(ServerMessageOutcome::Continue) } } From afa425da0b1d287f575a0a0ca25d949d893c810c Mon Sep 17 00:00:00 2001 From: ZingerLittleBee <6970999@gmail.com> Date: Thu, 16 Apr 2026 20:17:57 +0800 Subject: [PATCH 12/60] fix(server): handle rebind agent messages for protocol compat --- crates/server/src/router/ws/agent.rs | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/crates/server/src/router/ws/agent.rs b/crates/server/src/router/ws/agent.rs index 002c0b85..9bf99af7 100644 --- a/crates/server/src/router/ws/agent.rs +++ b/crates/server/src/router/ws/agent.rs @@ -844,6 +844,16 @@ async fn handle_agent_message(state: &Arc, server_id: &str, msg: Agent AgentMessage::Pong => { // Agent responded to our protocol-level Ping; already handled by WS Pong frames } + AgentMessage::RebindIdentityAck { job_id } => { + tracing::info!( + "Ignoring RebindIdentityAck from agent {server_id} for job_id={job_id}" + ); + } + AgentMessage::RebindIdentityFailed { job_id, error } => { + tracing::warn!( + "Ignoring RebindIdentityFailed from agent {server_id} for job_id={job_id}: {error}" + ); + } // Docker variants AgentMessage::DockerInfo { From 4f283ac163c6d63c24b0242a17b4c9a92beb9b30 Mon Sep 17 00:00:00 2001 From: ZingerLittleBee <6970999@gmail.com> Date: Thu, 16 Apr 2026 20:19:08 +0800 Subject: [PATCH 13/60] fix(agent): keep persisted token at toml top level --- crates/agent/src/rebind.rs | 39 +++++++++++++++++++++++++++++++++++++- 1 file changed, 38 insertions(+), 1 deletion(-) diff --git a/crates/agent/src/rebind.rs b/crates/agent/src/rebind.rs index c9b334d3..4a2bd29f 100644 --- a/crates/agent/src/rebind.rs +++ b/crates/agent/src/rebind.rs @@ -12,7 +12,11 @@ fn render_token_content(existing: &str, token: &str) -> String { if let Some(pos) = lines.iter().position(|line| is_token_line(line)) { lines[pos] = token_line; } else { - lines.push(token_line); + let insert_pos = lines + .iter() + .position(|line| is_table_header(line)) + .unwrap_or(lines.len()); + lines.insert(insert_pos, token_line); } lines.join("\n") @@ -27,6 
+31,11 @@ fn is_token_line(line: &str) -> bool { rest.trim_start().starts_with('=') } +fn is_table_header(line: &str) -> bool { + let trimmed = line.trim_start(); + trimmed.starts_with('[') && trimmed.ends_with(']') +} + pub(crate) fn persist_rebind_token_impl(path: impl AsRef, token: &str) -> anyhow::Result<()> { let path = path.as_ref(); let existing = if path.exists() { @@ -185,4 +194,32 @@ log.level = "debug""# token = "fresh-token""# ); } + + #[test] + fn persist_rebind_token_inserts_before_first_table_header() { + let tempdir = TempDir::new().expect("tempdir"); + let path = tempdir.path().join("agent.toml"); + fs::write( + &path, + r#"server_url = "http://127.0.0.1:9527" +[collector] +interval = 3 +[log] +level = "info""#, + ) + .expect("seed file"); + + super::persist_rebind_token_impl(&path, "fresh-token").expect("persist"); + + let content = fs::read_to_string(&path).expect("read file"); + assert_eq!( + content, + r#"server_url = "http://127.0.0.1:9527" +token = "fresh-token" +[collector] +interval = 3 +[log] +level = "info""# + ); + } } From bded125c467fb0e1d4c69a7f60afa336132f6f1a Mon Sep 17 00:00:00 2001 From: ZingerLittleBee <6970999@gmail.com> Date: Thu, 16 Apr 2026 20:20:44 +0800 Subject: [PATCH 14/60] fix(agent): scope token replacement to top-level toml --- crates/agent/src/rebind.rs | 43 ++++++++++++++++++++++++++++++++------ 1 file changed, 37 insertions(+), 6 deletions(-) diff --git a/crates/agent/src/rebind.rs b/crates/agent/src/rebind.rs index 4a2bd29f..5df03765 100644 --- a/crates/agent/src/rebind.rs +++ b/crates/agent/src/rebind.rs @@ -8,15 +8,16 @@ use anyhow::Context; fn render_token_content(existing: &str, token: &str) -> String { let token_line = format!("token = \"{token}\""); let mut lines: Vec = existing.lines().map(ToOwned::to_owned).collect(); + let preamble_end = lines + .iter() + .position(|line| is_table_header(line)) + .unwrap_or(lines.len()); + let preamble = &mut lines[..preamble_end]; - if let Some(pos) = lines.iter().position(|line| is_token_line(line)) { + if let Some(pos) = preamble.iter().position(|line| is_token_line(line)) { lines[pos] = token_line; } else { - let insert_pos = lines - .iter() - .position(|line| is_table_header(line)) - .unwrap_or(lines.len()); - lines.insert(insert_pos, token_line); + lines.insert(preamble_end, token_line); } lines.join("\n") @@ -219,6 +220,36 @@ token = "fresh-token" [collector] interval = 3 [log] +level = "info""# + ); + } + + #[test] + fn persist_rebind_token_preserves_nested_token_and_inserts_top_level_token() { + let tempdir = TempDir::new().expect("tempdir"); + let path = tempdir.path().join("agent.toml"); + fs::write( + &path, + r#"server_url = "http://127.0.0.1:9527" +[collector] +token = "nested" +interval = 3 +[log] +level = "info""#, + ) + .expect("seed file"); + + super::persist_rebind_token_impl(&path, "top-level").expect("persist"); + + let content = fs::read_to_string(&path).expect("read file"); + assert_eq!( + content, + r#"server_url = "http://127.0.0.1:9527" +token = "top-level" +[collector] +token = "nested" +interval = 3 +[log] level = "info""# ); } From da6bec32049689bc58df4d0838465067d75fa419 Mon Sep 17 00:00:00 2001 From: ZingerLittleBee <6970999@gmail.com> Date: Thu, 16 Apr 2026 20:22:31 +0800 Subject: [PATCH 15/60] feat(server): persist recovery jobs in sqlite --- crates/server/src/entity/mod.rs | 1 + crates/server/src/entity/recovery_job.rs | 23 ++ .../m20260416_000017_create_recovery_job.rs | 45 ++++ crates/server/src/migration/mod.rs | 2 + crates/server/src/service/mod.rs | 1 + 
 crates/server/src/service/recovery_job.rs | 229 ++++++++++++++++++
 6 files changed, 301 insertions(+)
 create mode 100644 crates/server/src/entity/recovery_job.rs
 create mode 100644 crates/server/src/migration/m20260416_000017_create_recovery_job.rs
 create mode 100644 crates/server/src/service/recovery_job.rs

diff --git a/crates/server/src/entity/mod.rs b/crates/server/src/entity/mod.rs
index f7f84a82..35e35a80 100644
--- a/crates/server/src/entity/mod.rs
+++ b/crates/server/src/entity/mod.rs
@@ -21,6 +21,7 @@ pub mod notification_group;
 pub mod oauth_account;
 pub mod ping_record;
 pub mod ping_task;
+pub mod recovery_job;
 pub mod record;
 pub mod record_hourly;
 pub mod server;
diff --git a/crates/server/src/entity/recovery_job.rs b/crates/server/src/entity/recovery_job.rs
new file mode 100644
index 00000000..f8e25e63
--- /dev/null
+++ b/crates/server/src/entity/recovery_job.rs
@@ -0,0 +1,23 @@
+use sea_orm::entity::prelude::*;
+
+#[derive(Clone, Debug, PartialEq, DeriveEntityModel)]
+#[sea_orm(table_name = "recovery_job")]
+pub struct Model {
+    #[sea_orm(primary_key, auto_increment = false)]
+    pub job_id: String,
+    pub target_server_id: String,
+    pub source_server_id: String,
+    pub status: String,
+    pub stage: String,
+    pub checkpoint_json: Option<String>,
+    pub error: Option<String>,
+    pub started_at: DateTimeUtc,
+    pub created_at: DateTimeUtc,
+    pub updated_at: DateTimeUtc,
+    pub last_heartbeat_at: Option<DateTimeUtc>,
+}
+
+#[derive(Copy, Clone, Debug, EnumIter, DeriveRelation)]
+pub enum Relation {}
+
+impl ActiveModelBehavior for ActiveModel {}
diff --git a/crates/server/src/migration/m20260416_000017_create_recovery_job.rs b/crates/server/src/migration/m20260416_000017_create_recovery_job.rs
new file mode 100644
index 00000000..e5930498
--- /dev/null
+++ b/crates/server/src/migration/m20260416_000017_create_recovery_job.rs
@@ -0,0 +1,45 @@
+use sea_orm_migration::prelude::*;
+
+pub struct Migration;
+
+impl MigrationName for Migration {
+    fn name(&self) -> &str {
+        "m20260416_000017_create_recovery_job"
+    }
+}
+
+#[async_trait::async_trait]
+impl MigrationTrait for Migration {
+    async fn up(&self, manager: &SchemaManager) -> Result<(), DbErr> {
+        let db = manager.get_connection();
+        db.execute_unprepared(
+            "CREATE TABLE IF NOT EXISTS recovery_job (
+                job_id TEXT PRIMARY KEY NOT NULL,
+                target_server_id TEXT NOT NULL,
+                source_server_id TEXT NOT NULL,
+                status TEXT NOT NULL,
+                stage TEXT NOT NULL,
+                checkpoint_json TEXT NULL,
+                error TEXT NULL,
+                started_at DATETIME NOT NULL,
+                created_at DATETIME NOT NULL,
+                updated_at DATETIME NOT NULL,
+                last_heartbeat_at DATETIME NULL
+            )",
+        )
+        .await?;
+        db.execute_unprepared(
+            "CREATE INDEX idx_recovery_job_target_status ON recovery_job(target_server_id, status)",
+        )
+        .await?;
+        db.execute_unprepared(
+            "CREATE INDEX idx_recovery_job_source_status ON recovery_job(source_server_id, status)",
+        )
+        .await?;
+        Ok(())
+    }
+
+    async fn down(&self, _manager: &SchemaManager) -> Result<(), DbErr> {
+        Ok(())
+    }
+}
diff --git a/crates/server/src/migration/mod.rs b/crates/server/src/migration/mod.rs
index b61a2b38..3a1a267d 100644
--- a/crates/server/src/migration/mod.rs
+++ b/crates/server/src/migration/mod.rs
@@ -16,6 +16,7 @@ mod m20260329_000013_add_server_fingerprint;
 mod m20260329_000014_create_mobile_session;
 mod m20260329_000015_add_session_source;
 mod m20260329_000016_create_device_token;
+mod m20260416_000017_create_recovery_job;
 
 pub struct Migrator;
 
@@ -38,6 +39,7 @@ impl MigratorTrait for Migrator {
             Box::new(m20260329_000014_create_mobile_session::Migration),
Box::new(m20260329_000015_add_session_source::Migration), Box::new(m20260329_000016_create_device_token::Migration), + Box::new(m20260416_000017_create_recovery_job::Migration), ] } } diff --git a/crates/server/src/service/mod.rs b/crates/server/src/service/mod.rs index 39ed9037..787edd6a 100644 --- a/crates/server/src/service/mod.rs +++ b/crates/server/src/service/mod.rs @@ -19,6 +19,7 @@ pub mod notification; pub mod oauth; pub mod ping; pub mod record; +pub mod recovery_job; pub mod server; pub mod service_monitor; pub mod status_page; diff --git a/crates/server/src/service/recovery_job.rs b/crates/server/src/service/recovery_job.rs new file mode 100644 index 00000000..02cda324 --- /dev/null +++ b/crates/server/src/service/recovery_job.rs @@ -0,0 +1,229 @@ +use chrono::Utc; +use sea_orm::*; +use uuid::Uuid; + +use crate::entity::recovery_job; +use crate::error::AppError; + +pub struct RecoveryJobService; + +impl RecoveryJobService { + pub async fn create_job( + db: &DatabaseConnection, + target_server_id: &str, + source_server_id: &str, + ) -> Result { + let now = Utc::now(); + let active = recovery_job::ActiveModel { + job_id: Set(Uuid::new_v4().to_string()), + target_server_id: Set(target_server_id.to_string()), + source_server_id: Set(source_server_id.to_string()), + status: Set("running".to_string()), + stage: Set("validating".to_string()), + checkpoint_json: Set(None), + error: Set(None), + started_at: Set(now), + created_at: Set(now), + updated_at: Set(now), + last_heartbeat_at: Set(None), + }; + + Ok(active.insert(db).await?) + } + + pub async fn get_job( + db: &DatabaseConnection, + job_id: &str, + ) -> Result, AppError> { + Ok(recovery_job::Entity::find_by_id(job_id).one(db).await?) + } + + pub async fn update_stage( + db: &DatabaseConnection, + job_id: &str, + stage: &str, + checkpoint_json: Option<&str>, + error: Option<&str>, + ) -> Result { + let model = Self::get_job(db, job_id) + .await? + .ok_or_else(|| AppError::NotFound("Recovery job not found".to_string()))?; + let mut active: recovery_job::ActiveModel = model.into(); + let now = Utc::now(); + + active.stage = Set(stage.to_string()); + active.checkpoint_json = Set(checkpoint_json.map(ToOwned::to_owned)); + active.error = Set(error.map(ToOwned::to_owned)); + active.updated_at = Set(now); + active.last_heartbeat_at = Set(Some(now)); + + Ok(active.update(db).await?) + } + + pub async fn mark_failed( + db: &DatabaseConnection, + job_id: &str, + stage: &str, + error: &str, + ) -> Result<(), AppError> { + let model = Self::get_job(db, job_id) + .await? + .ok_or_else(|| AppError::NotFound("Recovery job not found".to_string()))?; + let mut active: recovery_job::ActiveModel = model.into(); + let now = Utc::now(); + + active.status = Set("failed".to_string()); + active.stage = Set(stage.to_string()); + active.error = Set(Some(error.to_string())); + active.updated_at = Set(now); + active.last_heartbeat_at = Set(Some(now)); + + active.update(db).await?; + Ok(()) + } + + pub async fn running_for_target( + db: &DatabaseConnection, + target_server_id: &str, + ) -> Result, AppError> { + Ok(recovery_job::Entity::find() + .filter(recovery_job::Column::TargetServerId.eq(target_server_id)) + .filter(recovery_job::Column::Status.eq("running")) + .limit(1) + .all(db) + .await? 
+ .into_iter() + .next()) + } + + pub async fn running_for_source( + db: &DatabaseConnection, + source_server_id: &str, + ) -> Result, AppError> { + Ok(recovery_job::Entity::find() + .filter(recovery_job::Column::SourceServerId.eq(source_server_id)) + .filter(recovery_job::Column::Status.eq("running")) + .limit(1) + .all(db) + .await? + .into_iter() + .next()) + } +} + +#[cfg(test)] +mod tests { + use super::RecoveryJobService; + use crate::test_utils::setup_test_db; + + #[tokio::test] + async fn create_job_persists_running_row_for_target_and_source() { + let (db, _tmp) = setup_test_db().await; + + let job = RecoveryJobService::create_job(&db, "target-1", "source-1") + .await + .unwrap(); + + let loaded = RecoveryJobService::get_job(&db, &job.job_id) + .await + .unwrap() + .unwrap(); + assert_eq!(loaded.job_id, job.job_id); + assert_eq!(loaded.target_server_id, "target-1"); + assert_eq!(loaded.source_server_id, "source-1"); + assert_eq!(loaded.status, "running"); + assert_eq!(loaded.stage, "validating"); + assert_eq!(loaded.checkpoint_json, None); + assert_eq!(loaded.error, None); + assert!(loaded.last_heartbeat_at.is_none()); + } + + #[tokio::test] + async fn update_stage_round_trips_stage_and_checkpoint_json() { + let (db, _tmp) = setup_test_db().await; + let job = RecoveryJobService::create_job(&db, "target-1", "source-1") + .await + .unwrap(); + + RecoveryJobService::update_stage( + &db, + &job.job_id, + "merging_history", + Some("{\"group\":2}"), + None, + ) + .await + .unwrap(); + + let loaded = RecoveryJobService::get_job(&db, &job.job_id) + .await + .unwrap() + .unwrap(); + assert_eq!(loaded.stage, "merging_history"); + assert_eq!(loaded.checkpoint_json.as_deref(), Some("{\"group\":2}")); + assert_eq!(loaded.error, None); + assert_eq!(loaded.status, "running"); + assert!(loaded.last_heartbeat_at.is_some()); + } + + #[tokio::test] + async fn mark_failed_updates_status_stage_and_error() { + let (db, _tmp) = setup_test_db().await; + let job = RecoveryJobService::create_job(&db, "target-1", "source-1") + .await + .unwrap(); + + RecoveryJobService::mark_failed(&db, &job.job_id, "finalizing", "boom") + .await + .unwrap(); + + let loaded = RecoveryJobService::get_job(&db, &job.job_id) + .await + .unwrap() + .unwrap(); + assert_eq!(loaded.status, "failed"); + assert_eq!(loaded.stage, "finalizing"); + assert_eq!(loaded.error.as_deref(), Some("boom")); + assert!(loaded.last_heartbeat_at.is_some()); + } + + #[tokio::test] + async fn running_queries_match_by_target_and_source() { + let (db, _tmp) = setup_test_db().await; + let job = RecoveryJobService::create_job(&db, "target-1", "source-1") + .await + .unwrap(); + + let by_target = RecoveryJobService::running_for_target(&db, "target-1") + .await + .unwrap() + .unwrap(); + let by_source = RecoveryJobService::running_for_source(&db, "source-1") + .await + .unwrap() + .unwrap(); + + assert_eq!(by_target.job_id, job.job_id); + assert_eq!(by_source.job_id, job.job_id); + } + + #[tokio::test] + async fn running_queries_ignore_non_running_jobs() { + let (db, _tmp) = setup_test_db().await; + let job = RecoveryJobService::create_job(&db, "target-1", "source-1") + .await + .unwrap(); + RecoveryJobService::mark_failed(&db, &job.job_id, "finalizing", "boom") + .await + .unwrap(); + + assert!(RecoveryJobService::running_for_target(&db, "target-1") + .await + .unwrap() + .is_none()); + assert!(RecoveryJobService::running_for_source(&db, "source-1") + .await + .unwrap() + .is_none()); + } +} From e9e1adff967c5d5e891a42565babd2c3742187f8 Mon Sep 17 00:00:00 
2001 From: ZingerLittleBee <6970999@gmail.com> Date: Thu, 16 Apr 2026 20:23:39 +0800 Subject: [PATCH 16/60] fix(agent): reject token persistence under env override --- crates/agent/src/config.rs | 49 ++++++++++++++++++++++++++ crates/agent/src/main.rs | 4 +-- crates/agent/src/register.rs | 67 ++++++++++++++++++++++++++++++++++++ 3 files changed, 117 insertions(+), 3 deletions(-) diff --git a/crates/agent/src/config.rs b/crates/agent/src/config.rs index 01962dd6..aab89466 100644 --- a/crates/agent/src/config.rs +++ b/crates/agent/src/config.rs @@ -159,6 +159,48 @@ impl AgentConfig { "agent.toml" } } + + pub(crate) fn token_env_override_present() -> bool { + std::env::var_os("SERVERBEE_TOKEN").is_some() + } +} + +#[cfg(test)] +pub(crate) fn with_serverbee_token_env<T>(value: Option<&str>, test: impl FnOnce() -> T) -> T { + use std::sync::{Mutex, OnceLock}; + + struct ServerbeeTokenEnvGuard { + original: Option<std::ffi::OsString>, + } + + impl Drop for ServerbeeTokenEnvGuard { + fn drop(&mut self) { + match self.original.take() { + Some(value) => unsafe { + std::env::set_var("SERVERBEE_TOKEN", value); + }, + None => unsafe { + std::env::remove_var("SERVERBEE_TOKEN"); + }, + } + } + } + + static ENV_LOCK: OnceLock<Mutex<()>> = OnceLock::new(); + let _lock = ENV_LOCK.get_or_init(|| Mutex::new(())).lock().expect("env lock"); + let original = std::env::var_os("SERVERBEE_TOKEN"); + + match value { + Some(value) => unsafe { + std::env::set_var("SERVERBEE_TOKEN", value); + }, + None => unsafe { + std::env::remove_var("SERVERBEE_TOKEN"); + }, + } + + let _guard = ServerbeeTokenEnvGuard { original }; + test() } #[cfg(test)] @@ -201,4 +243,11 @@ mod tests { "default external IP URL should be api.ipify.org" ); } + + #[test] + fn token_env_override_present_detects_serverbee_token() { + super::with_serverbee_token_env(Some("env-token"), || { + assert!(AgentConfig::token_env_override_present()); + }); + } } diff --git a/crates/agent/src/main.rs b/crates/agent/src/main.rs index 6d7aa43c..16bd3c37 100644 --- a/crates/agent/src/main.rs +++ b/crates/agent/src/main.rs @@ -75,9 +75,7 @@ async fn main() -> anyhow::Result<()> { tracing::info!("No token found, registering..."); let (_server_id, token) = register::register_agent(&config, &machine_fingerprint).await?; tracing::info!("Registration successful"); - if let Err(e) = register::save_token(&token) { - tracing::warn!("Failed to save token: {e}"); - } + register::save_token(&token)?; config.token = token; } diff --git a/crates/agent/src/register.rs b/crates/agent/src/register.rs index 6c71339d..f80aca97 100644 --- a/crates/agent/src/register.rs +++ b/crates/agent/src/register.rs @@ -47,5 +47,72 @@ pub async fn register_agent(config: &AgentConfig, fingerprint: &str) -> Result<( } pub fn save_token(token: &str) -> Result<()> { + if AgentConfig::token_env_override_present() { + anyhow::bail!( + "SERVERBEE_TOKEN is set; refusing to persist token to agent.toml" + ); + } + persist_rebind_token(AgentConfig::config_path_for_persistence(), token) } + +#[cfg(test)] +mod tests { + use std::fs; + use std::path::PathBuf; + + use tempfile::TempDir; + + struct CurrentDirGuard { + original: PathBuf, + } + + impl Drop for CurrentDirGuard { + fn drop(&mut self) { + let _ = std::env::set_current_dir(&self.original); + } + } + + fn set_current_dir(dir: &TempDir) -> CurrentDirGuard { + let original = std::env::current_dir().expect("cwd"); + std::env::set_current_dir(dir.path()).expect("set cwd"); + CurrentDirGuard { original } + } + + #[test] + fn save_token_rejects_persistence_when_serverbee_token_is_set() {
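+ // `std::env::set_var`/`remove_var` mutate process-global state, so the `with_serverbee_token_env` helper above serializes env mutation behind a static Mutex and restores the previous value on drop. A typical call site (a sketch mirroring the config.rs test in this same patch): + // crate::config::with_serverbee_token_env(Some("env-token"), || { + //     assert!(AgentConfig::token_env_override_present()); + // }); +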
crate::config::with_serverbee_token_env(Some("env-token"), || { + let tempdir = TempDir::new().expect("tempdir"); + let _cwd_guard = set_current_dir(&tempdir); + + let result = super::save_token("persisted-token"); + + let err = result.expect_err("save_token should fail"); + assert!( + err.to_string().contains("SERVERBEE_TOKEN"), + "unexpected error: {err}" + ); + assert!( + !tempdir.path().join("agent.toml").exists(), + "token persistence should not write a config file" + ); + }); + } + + #[test] + fn save_token_allows_persistence_when_serverbee_token_is_unset() { + crate::config::with_serverbee_token_env(None, || { + let tempdir = TempDir::new().expect("tempdir"); + let _cwd_guard = set_current_dir(&tempdir); + + super::save_token("persisted-token").expect("save_token"); + + let content = fs::read_to_string(tempdir.path().join("agent.toml")) + .expect("read persisted config"); + assert!( + content.contains("token = \"persisted-token\""), + "expected persisted token, got: {content}" + ); + }); + } +} From 44ac5f8887309a01a11744696466885e7e3a06f7 Mon Sep 17 00:00:00 2001 From: ZingerLittleBee <6970999@gmail.com> Date: Thu, 16 Apr 2026 20:25:14 +0800 Subject: [PATCH 17/60] fix(agent): keep bootstrap token persistence best effort --- crates/agent/src/main.rs | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/crates/agent/src/main.rs b/crates/agent/src/main.rs index 16bd3c37..6d7aa43c 100644 --- a/crates/agent/src/main.rs +++ b/crates/agent/src/main.rs @@ -75,7 +75,9 @@ async fn main() -> anyhow::Result<()> { tracing::info!("No token found, registering..."); let (_server_id, token) = register::register_agent(&config, &machine_fingerprint).await?; tracing::info!("Registration successful"); - register::save_token(&token)?; + if let Err(e) = register::save_token(&token) { + tracing::warn!("Failed to save token: {e}"); + } config.token = token; } From 0f615f5e6c60b9ff2a7b3ad6c50a8c74f56179c4 Mon Sep 17 00:00:00 2001 From: ZingerLittleBee <6970999@gmail.com> Date: Thu, 16 Apr 2026 20:26:53 +0800 Subject: [PATCH 18/60] fix(server): enforce unique active recovery jobs --- .../m20260416_000017_create_recovery_job.rs | 8 +- crates/server/src/service/recovery_job.rs | 81 ++++++++++++++++--- 2 files changed, 76 insertions(+), 13 deletions(-) diff --git a/crates/server/src/migration/m20260416_000017_create_recovery_job.rs b/crates/server/src/migration/m20260416_000017_create_recovery_job.rs index e5930498..25028ce0 100644 --- a/crates/server/src/migration/m20260416_000017_create_recovery_job.rs +++ b/crates/server/src/migration/m20260416_000017_create_recovery_job.rs @@ -29,11 +29,15 @@ impl MigrationTrait for Migration { ) .await?; db.execute_unprepared( - "CREATE INDEX idx_recovery_job_target_status ON recovery_job(target_server_id, status)", + "CREATE UNIQUE INDEX IF NOT EXISTS idx_recovery_job_target_running + ON recovery_job(target_server_id) + WHERE status = 'running'", ) .await?; db.execute_unprepared( - "CREATE INDEX idx_recovery_job_source_status ON recovery_job(source_server_id, status)", + "CREATE UNIQUE INDEX IF NOT EXISTS idx_recovery_job_source_running + ON recovery_job(source_server_id) + WHERE status = 'running'", ) .await?; Ok(()) diff --git a/crates/server/src/service/recovery_job.rs b/crates/server/src/service/recovery_job.rs index 02cda324..75330cc5 100644 --- a/crates/server/src/service/recovery_job.rs +++ b/crates/server/src/service/recovery_job.rs @@ -7,6 +7,11 @@ use crate::error::AppError; pub struct RecoveryJobService; +fn is_unique_violation(err: &DbErr) -> 
bool { + let message = err.to_string(); + message.contains("UNIQUE constraint failed") || message.contains("UNIQUE") +} + impl RecoveryJobService { pub async fn create_job( db: &DatabaseConnection, @@ -28,7 +33,13 @@ impl RecoveryJobService { last_heartbeat_at: Set(None), }; - Ok(active.insert(db).await?) + match active.insert(db).await { + Ok(model) => Ok(model), + Err(err) if is_unique_violation(&err) => Err(AppError::Conflict( + "A running recovery job already exists for this target or source".to_string(), + )), + Err(err) => Err(err.into()), + } } pub async fn get_job( @@ -89,11 +100,8 @@ impl RecoveryJobService { Ok(recovery_job::Entity::find() .filter(recovery_job::Column::TargetServerId.eq(target_server_id)) .filter(recovery_job::Column::Status.eq("running")) - .limit(1) - .all(db) - .await? - .into_iter() - .next()) + .one(db) + .await?) } pub async fn running_for_source( @@ -103,18 +111,45 @@ impl RecoveryJobService { Ok(recovery_job::Entity::find() .filter(recovery_job::Column::SourceServerId.eq(source_server_id)) .filter(recovery_job::Column::Status.eq("running")) - .limit(1) - .all(db) - .await? - .into_iter() - .next()) + .one(db) + .await?) } } #[cfg(test)] mod tests { use super::RecoveryJobService; + use crate::entity::recovery_job; use crate::test_utils::setup_test_db; + use crate::error::AppError; + use chrono::Utc; + use sea_orm::{ActiveModelTrait, Set}; + + async fn insert_job( + db: &sea_orm::DatabaseConnection, + job_id: &str, + target_server_id: &str, + source_server_id: &str, + status: &str, + ) -> recovery_job::Model { + let now = Utc::now(); + recovery_job::ActiveModel { + job_id: Set(job_id.to_string()), + target_server_id: Set(target_server_id.to_string()), + source_server_id: Set(source_server_id.to_string()), + status: Set(status.to_string()), + stage: Set("validating".to_string()), + checkpoint_json: Set(None), + error: Set(None), + started_at: Set(now), + created_at: Set(now), + updated_at: Set(now), + last_heartbeat_at: Set(None), + } + .insert(db) + .await + .unwrap() + } #[tokio::test] async fn create_job_persists_running_row_for_target_and_source() { @@ -193,6 +228,7 @@ mod tests { let job = RecoveryJobService::create_job(&db, "target-1", "source-1") .await .unwrap(); + let _failed = insert_job(&db, "job-failed", "target-1", "source-1", "failed").await; let by_target = RecoveryJobService::running_for_target(&db, "target-1") .await @@ -226,4 +262,27 @@ mod tests { .unwrap() .is_none()); } + + #[tokio::test] + async fn create_job_rejects_duplicate_active_jobs_for_target_or_source() { + let (db, _tmp) = setup_test_db().await; + + let _first = RecoveryJobService::create_job(&db, "target-1", "source-1") + .await + .unwrap(); + + match RecoveryJobService::create_job(&db, "target-1", "source-2").await { + Err(AppError::Conflict(message)) => { + assert!(message.contains("running recovery job")); + } + other => panic!("expected conflict for duplicate target, got {other:?}"), + } + + match RecoveryJobService::create_job(&db, "target-2", "source-1").await { + Err(AppError::Conflict(message)) => { + assert!(message.contains("running recovery job")); + } + other => panic!("expected conflict for duplicate source, got {other:?}"), + } + } } From c10ea856fe59dc3aba67a84c3c1f15cb16f7466a Mon Sep 17 00:00:00 2001 From: ZingerLittleBee <6970999@gmail.com> Date: Thu, 16 Apr 2026 20:28:12 +0800 Subject: [PATCH 19/60] fix(server): block cross-role active recovery collisions --- .../m20260416_000017_create_recovery_job.rs | 38 +++++++++++++++++++ 
crates/server/src/service/recovery_job.rs | 36 ++++++++++++++++-- 2 files changed, 71 insertions(+), 3 deletions(-) diff --git a/crates/server/src/migration/m20260416_000017_create_recovery_job.rs b/crates/server/src/migration/m20260416_000017_create_recovery_job.rs index 25028ce0..af476b06 100644 --- a/crates/server/src/migration/m20260416_000017_create_recovery_job.rs +++ b/crates/server/src/migration/m20260416_000017_create_recovery_job.rs @@ -40,6 +40,44 @@ impl MigrationTrait for Migration { WHERE status = 'running'", ) .await?; + db.execute_unprepared( + "CREATE TRIGGER IF NOT EXISTS trg_recovery_job_running_insert + BEFORE INSERT ON recovery_job + WHEN NEW.status = 'running' + BEGIN + SELECT RAISE(ABORT, 'recovery_job_active_conflict') + WHERE EXISTS ( + SELECT 1 + FROM recovery_job + WHERE status = 'running' + AND job_id <> NEW.job_id + AND ( + target_server_id IN (NEW.target_server_id, NEW.source_server_id) + OR source_server_id IN (NEW.target_server_id, NEW.source_server_id) + ) + ); + END", + ) + .await?; + db.execute_unprepared( + "CREATE TRIGGER IF NOT EXISTS trg_recovery_job_running_update + BEFORE UPDATE OF target_server_id, source_server_id, status ON recovery_job + WHEN NEW.status = 'running' + BEGIN + SELECT RAISE(ABORT, 'recovery_job_active_conflict') + WHERE EXISTS ( + SELECT 1 + FROM recovery_job + WHERE status = 'running' + AND job_id <> NEW.job_id + AND ( + target_server_id IN (NEW.target_server_id, NEW.source_server_id) + OR source_server_id IN (NEW.target_server_id, NEW.source_server_id) + ) + ); + END", + ) + .await?; Ok(()) } diff --git a/crates/server/src/service/recovery_job.rs b/crates/server/src/service/recovery_job.rs index 75330cc5..591b763b 100644 --- a/crates/server/src/service/recovery_job.rs +++ b/crates/server/src/service/recovery_job.rs @@ -12,6 +12,11 @@ fn is_unique_violation(err: &DbErr) -> bool { message.contains("UNIQUE constraint failed") || message.contains("UNIQUE") } +fn is_active_recovery_conflict(err: &DbErr) -> bool { + let message = err.to_string(); + is_unique_violation(err) || message.contains("recovery_job_active_conflict") +} + impl RecoveryJobService { pub async fn create_job( db: &DatabaseConnection, @@ -35,9 +40,11 @@ impl RecoveryJobService { match active.insert(db).await { Ok(model) => Ok(model), - Err(err) if is_unique_violation(&err) => Err(AppError::Conflict( - "A running recovery job already exists for this target or source".to_string(), - )), + Err(err) if is_active_recovery_conflict(&err) => { + Err(AppError::Conflict( + "A running recovery job already exists for this target or source".to_string(), + )) + } Err(err) => Err(err.into()), } } @@ -285,4 +292,27 @@ mod tests { other => panic!("expected conflict for duplicate source, got {other:?}"), } } + + #[tokio::test] + async fn create_job_rejects_cross_role_active_collisions() { + let (db, _tmp) = setup_test_db().await; + + let _first = RecoveryJobService::create_job(&db, "target-a", "source-b") + .await + .unwrap(); + + match RecoveryJobService::create_job(&db, "target-c", "target-a").await { + Err(AppError::Conflict(message)) => { + assert!(message.contains("running recovery job")); + } + other => panic!("expected conflict for target/source crossover, got {other:?}"), + } + + match RecoveryJobService::create_job(&db, "source-b", "target-c").await { + Err(AppError::Conflict(message)) => { + assert!(message.contains("running recovery job")); + } + other => panic!("expected conflict for source/target crossover, got {other:?}"), + } + } } From 
18b71dc0c3f9b28ce008189d6b376106d6ca646f Mon Sep 17 00:00:00 2001 From: ZingerLittleBee <6970999@gmail.com> Date: Thu, 16 Apr 2026 20:29:47 +0800 Subject: [PATCH 20/60] feat(server): add recovery candidate and job api --- crates/server/src/openapi.rs | 6 + crates/server/src/router/api/mod.rs | 3 + .../server/src/router/api/server_recovery.rs | 363 ++++++++++++++++++ crates/server/tests/integration.rs | 162 ++++++++ 4 files changed, 534 insertions(+) create mode 100644 crates/server/src/router/api/server_recovery.rs diff --git a/crates/server/src/openapi.rs b/crates/server/src/openapi.rs index 061b2bab..082cbe2b 100644 --- a/crates/server/src/openapi.rs +++ b/crates/server/src/openapi.rs @@ -71,6 +71,9 @@ impl Modify for SecurityAddon { crate::router::api::server::trigger_upgrade, crate::router::api::server::batch_update_capabilities, crate::router::api::server::cleanup_orphaned_servers, + crate::router::api::server_recovery::list_candidates, + crate::router::api::server_recovery::get_recovery_job, + crate::router::api::server_recovery::start_recovery_merge, // server-groups crate::router::api::server_group::list_groups, crate::router::api::server_group::create_group, @@ -232,6 +235,9 @@ impl Modify for SecurityAddon { crate::router::api::server::BatchCapabilitiesResponse, crate::router::api::server::CleanupResponse, crate::service::server::UpdateServerInput, + crate::router::api::server_recovery::RecoveryCandidateResponse, + crate::router::api::server_recovery::StartRecoveryRequest, + crate::router::api::server_recovery::RecoveryJobResponse, // server-groups crate::router::api::server_group::CreateGroupRequest, crate::router::api::server_group::UpdateGroupRequest, diff --git a/crates/server/src/router/api/mod.rs b/crates/server/src/router/api/mod.rs index 0b3f3edb..95a21b65 100644 --- a/crates/server/src/router/api/mod.rs +++ b/crates/server/src/router/api/mod.rs @@ -16,6 +16,7 @@ pub mod oauth; pub mod ping; pub mod server; pub mod server_group; +pub mod server_recovery; pub mod service_monitor; pub mod setting; pub mod status; @@ -49,6 +50,7 @@ pub fn router(state: Arc<AppState>) -> Router<Arc<AppState>> { // Read-only routes accessible to all authenticated users .merge(agent::read_router()) .merge(server::read_router()) + .merge(server_recovery::read_router()) .merge(server_group::read_router()) .merge(ping::read_router()) .merge(network_probe::read_router()) @@ -66,6 +68,7 @@ pub fn router(state: Arc<AppState>) -> Router<Arc<AppState>> { .merge( Router::new() .merge(server::write_router()) + .merge(server_recovery::write_router()) .merge(server_group::write_router()) .merge(ping::write_router()) .merge(network_probe::write_router()) diff --git a/crates/server/src/router/api/server_recovery.rs b/crates/server/src/router/api/server_recovery.rs new file mode 100644 index 00000000..b9ce4779 --- /dev/null +++ b/crates/server/src/router/api/server_recovery.rs @@ -0,0 +1,363 @@ +use std::collections::HashSet; +use std::net::SocketAddr; +use std::str::FromStr; +use std::sync::Arc; + +use axum::extract::{Path, State}; +use axum::routing::{get, post}; +use axum::{Json, Router}; +use sea_orm::{ColumnTrait, EntityTrait, QueryFilter}; +use serde::{Deserialize, Serialize}; + +use crate::entity::{recovery_job, server}; +use crate::error::{ApiResponse, AppError, ok}; +use crate::service::recovery_job::RecoveryJobService; +use crate::state::AppState; + +#[derive(Debug, Serialize, utoipa::ToSchema)] +pub struct RecoveryCandidateResponse { + pub server_id: String, + pub name: String, + pub score: i32, + pub reasons: Vec<String>, +} + +#[derive(Debug,
Deserialize, utoipa::ToSchema)] +pub struct StartRecoveryRequest { + pub source_server_id: String, +} + +#[derive(Debug, Serialize, utoipa::ToSchema)] +pub struct RecoveryJobResponse { + pub job_id: String, + pub target_server_id: String, + pub source_server_id: String, + pub status: String, + pub stage: String, + pub checkpoint_json: Option<String>, + pub error: Option<String>, + pub started_at: chrono::DateTime<chrono::Utc>, + pub created_at: chrono::DateTime<chrono::Utc>, + pub updated_at: chrono::DateTime<chrono::Utc>, + pub last_heartbeat_at: Option<chrono::DateTime<chrono::Utc>>, +} + +#[derive(Debug)] +struct CandidateScoreInput { + same_remote_addr: bool, + same_cpu_arch: bool, + same_os: bool, + same_virtualization: bool, + created_within_minutes: i64, + same_country: bool, +} + +pub fn read_router() -> Router<Arc<AppState>> { + Router::new() + .route( + "/servers/{target_id}/recovery-candidates", + get(list_candidates), + ) + .route("/servers/recovery-jobs/{job_id}", get(get_recovery_job)) +} + +pub fn write_router() -> Router<Arc<AppState>> { + Router::new().route( + "/servers/{target_id}/recover-merge", + post(start_recovery_merge), + ) +} + +#[utoipa::path( + get, + path = "/api/servers/{target_id}/recovery-candidates", + params( + ("target_id" = String, Path, description = "Original offline server id") + ), + responses( + (status = 200, description = "Recommended recovery candidates", body = Vec<RecoveryCandidateResponse>), + (status = 401, description = "Authentication required", body = crate::error::ErrorBody), + (status = 404, description = "Target server not found", body = crate::error::ErrorBody), + ), + security( + ("session_cookie" = []), + ("api_key" = []) + ), + tag = "server-recovery" +)] +async fn list_candidates( + State(state): State<Arc<AppState>>, + Path(target_id): Path<String>, +) -> Result<Json<ApiResponse<Vec<RecoveryCandidateResponse>>>, AppError> { + let target = server::Entity::find_by_id(&target_id) + .one(&state.db) + .await? + .ok_or_else(|| AppError::NotFound("Server not found".to_string()))?; + + let running_jobs = recovery_job::Entity::find() + .filter(recovery_job::Column::Status.eq("running")) + .all(&state.db) + .await?; + + let active_server_ids: HashSet<String> = running_jobs + .into_iter() + .flat_map(|job| [job.target_server_id, job.source_server_id]) + .collect(); + + let mut candidates = server::Entity::find() + .filter(server::Column::Id.ne(target_id.as_str())) + .all(&state.db) + .await? + .into_iter() + .filter(|source| state.agent_manager.is_online(&source.id)) + .filter(|source| !active_server_ids.contains(&source.id)) + .map(|source| build_candidate_response(&target, &source)) + .collect::<Vec<_>>(); + + candidates.sort_by(|left, right| { + right + .score + .cmp(&left.score) + .then_with(|| left.name.cmp(&right.name)) + .then_with(|| left.server_id.cmp(&right.server_id)) + }); + + ok(candidates) +} + +#[utoipa::path( + get, + path = "/api/servers/recovery-jobs/{job_id}", + params( + ("job_id" = String, Path, description = "Recovery job id") + ), + responses( + (status = 200, description = "Recovery job details", body = RecoveryJobResponse), + (status = 401, description = "Authentication required", body = crate::error::ErrorBody), + (status = 404, description = "Recovery job not found", body = crate::error::ErrorBody), + ), + security( + ("session_cookie" = []), + ("api_key" = []) + ), + tag = "server-recovery" +)] +async fn get_recovery_job( + State(state): State<Arc<AppState>>, + Path(job_id): Path<String>, +) -> Result<Json<ApiResponse<RecoveryJobResponse>>, AppError> { + let job = RecoveryJobService::get_job(&state.db, &job_id) + .await?
+ .ok_or_else(|| AppError::NotFound("Recovery job not found".to_string()))?; + + ok(job.into()) +} + +#[utoipa::path( + post, + path = "/api/servers/{target_id}/recover-merge", + request_body = StartRecoveryRequest, + params( + ("target_id" = String, Path, description = "Original offline server id") + ), + responses( + (status = 200, description = "Recovery job created", body = RecoveryJobResponse), + (status = 401, description = "Authentication required", body = crate::error::ErrorBody), + (status = 403, description = "Admin required", body = crate::error::ErrorBody), + (status = 404, description = "Server not found", body = crate::error::ErrorBody), + (status = 409, description = "Recovery cannot be started in the current state", body = crate::error::ErrorBody), + (status = 422, description = "Invalid request", body = crate::error::ErrorBody), + ), + security( + ("session_cookie" = []), + ("api_key" = []) + ), + tag = "server-recovery" +)] +async fn start_recovery_merge( + State(state): State<Arc<AppState>>, + Path(target_id): Path<String>, + Json(request): Json<StartRecoveryRequest>, +) -> Result<Json<ApiResponse<RecoveryJobResponse>>, AppError> { + if request.source_server_id == target_id { + return Err(AppError::Validation( + "source_server_id must be different from target_id".to_string(), + )); + } + + let target = server::Entity::find_by_id(&target_id) + .one(&state.db) + .await? + .ok_or_else(|| AppError::NotFound("Server not found".to_string()))?; + let source = server::Entity::find_by_id(&request.source_server_id) + .one(&state.db) + .await? + .ok_or_else(|| AppError::NotFound("Server not found".to_string()))?; + + if state.agent_manager.is_online(&target.id) { + return Err(AppError::Conflict( + "Target server must be offline before starting recovery".to_string(), + )); + } + + if !state.agent_manager.is_online(&source.id) { + return Err(AppError::Conflict( + "Source server must be online before starting recovery".to_string(), + )); + } + + let job = RecoveryJobService::create_job(&state.db, &target.id, &source.id).await?; + ok(job.into()) +} + +fn build_candidate_response( + target: &server::Model, + source: &server::Model, +) -> RecoveryCandidateResponse { + let same_remote_addr = remote_addr_key(target.last_remote_addr.as_deref()) + .zip(remote_addr_key(source.last_remote_addr.as_deref())) + .is_some_and(|(left, right)| left == right); + let same_cpu_arch = option_eq(target.cpu_arch.as_deref(), source.cpu_arch.as_deref()); + let same_os = option_eq(target.os.as_deref(), source.os.as_deref()); + let same_virtualization = option_eq( + target.virtualization.as_deref(), + source.virtualization.as_deref(), + ); + let same_country = option_eq( + target.country_code.as_deref(), + source.country_code.as_deref(), + ) || option_eq(target.region.as_deref(), source.region.as_deref()); + let created_within_minutes = (source.created_at - target.created_at).num_minutes().abs(); + + let score = score_candidate(CandidateScoreInput { + same_remote_addr, + same_cpu_arch, + same_os, + same_virtualization, + created_within_minutes, + same_country, + }); + + let mut reasons = Vec::new(); + if same_remote_addr { + reasons.push("same remote address".to_string()); + } + if same_cpu_arch { + reasons.push("same cpu architecture".to_string()); + } + if same_os { + reasons.push("same operating system".to_string()); + } + if same_virtualization { + reasons.push("same virtualization".to_string()); + } + if same_country { + reasons.push("same region or country".to_string()); + } + if created_within_minutes <= 60 { + reasons.push("created close in time".to_string()); + } + if
reasons.is_empty() { + reasons.push("online replacement candidate".to_string()); + } + + RecoveryCandidateResponse { + server_id: source.id.clone(), + name: source.name.clone(), + score, + reasons, + } +} + +fn score_candidate(input: CandidateScoreInput) -> i32 { + let mut score = 0; + + if input.same_remote_addr { + score += 40; + } + if input.same_cpu_arch { + score += 15; + } + if input.same_os { + score += 15; + } + if input.same_virtualization { + score += 10; + } + if input.same_country { + score += 10; + } + + score + + match input.created_within_minutes { + 0..=15 => 20, + 16..=60 => 12, + 61..=240 => 4, + _ => 0, + } +} + +fn option_eq(left: Option<&str>, right: Option<&str>) -> bool { + match (left, right) { + (Some(left), Some(right)) => left == right, + _ => false, + } +} + +fn remote_addr_key(value: Option<&str>) -> Option<String> { + let value = value?.trim(); + if value.is_empty() { + return None; + } + + if let Ok(addr) = SocketAddr::from_str(value) { + return Some(addr.ip().to_string()); + } + + Some(value.to_string()) +} + +impl From<recovery_job::Model> for RecoveryJobResponse { + fn from(value: recovery_job::Model) -> Self { + Self { + job_id: value.job_id, + target_server_id: value.target_server_id, + source_server_id: value.source_server_id, + status: value.status, + stage: value.stage, + checkpoint_json: value.checkpoint_json, + error: value.error, + started_at: value.started_at, + created_at: value.created_at, + updated_at: value.updated_at, + last_heartbeat_at: value.last_heartbeat_at, + } + } +} + +#[cfg(test)] +mod tests { + use super::{CandidateScoreInput, score_candidate}; + + #[test] + fn higher_score_when_ip_arch_and_created_at_match() { + let strong = score_candidate(CandidateScoreInput { + same_remote_addr: true, + same_cpu_arch: true, + same_os: true, + same_virtualization: true, + created_within_minutes: 10, + same_country: true, + }); + let weak = score_candidate(CandidateScoreInput { + same_remote_addr: false, + same_cpu_arch: false, + same_os: true, + same_virtualization: false, + created_within_minutes: 240, + same_country: false, + }); + + assert!(strong > weak); + } +} diff --git a/crates/server/tests/integration.rs b/crates/server/tests/integration.rs index 6cb711d7..c7ca0ce5 100644 --- a/crates/server/tests/integration.rs +++ b/crates/server/tests/integration.rs @@ -3779,3 +3779,165 @@ async fn test_security_headers_present() { Some("none"), ); } + +#[tokio::test] +async fn test_recovery_candidates_requires_auth_and_filters_online_sources() { + let (base_url, _tmp) = start_test_server().await; + let auth_client = http_client(); + + let (target_id, _target_token) = register_agent(&auth_client, &base_url).await; + let (online_source_id, online_source_token) = register_agent(&auth_client, &base_url).await; + let (offline_source_id, _offline_source_token) = register_agent(&auth_client, &base_url).await; + + let plain_client = reqwest::Client::new(); + let unauth_resp = plain_client + .get(format!( + "{}/api/servers/{}/recovery-candidates", + base_url, target_id + )) + .send() + .await + .expect("unauthenticated recovery candidates request failed"); + assert_eq!(unauth_resp.status(), 401); + + login_admin(&auth_client, &base_url).await; + let (_sink, mut reader) = connect_agent(&base_url, &online_source_token).await; + let _welcome = recv_agent_text(&mut reader).await; + + let resp = auth_client + .get(format!( + "{}/api/servers/{}/recovery-candidates", + base_url, target_id + )) + .send() + .await + .expect("GET recovery candidates failed"); + + assert_eq!(resp.status(), 200);
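+ // The `ApiResponse` envelope nests the payload under `data`; a candidate entry is expected to look roughly like this (illustrative values only; `score` depends on how many ranking signals matched): + // {"data": [{"server_id": "…", "name": "…", "score": 90, "reasons": ["same remote address", "same operating system"]}]} +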
let body: serde_json::Value = resp.json().await.unwrap(); + let candidates = body["data"].as_array().expect("data should be an array"); + + assert_eq!(candidates.len(), 1); + assert_eq!(candidates[0]["server_id"], online_source_id); + assert_ne!(candidates[0]["server_id"], target_id); + assert!( + !candidates + .iter() + .any(|candidate| candidate["server_id"] == offline_source_id) + ); +} + +#[tokio::test] +async fn test_recovery_merge_start_requires_admin_and_validates_source_state() { + let (base_url, _tmp) = start_test_server().await; + let admin_client = http_client(); + login_admin(&admin_client, &base_url).await; + + let create_resp = admin_client + .post(format!("{}/api/users", base_url)) + .json(&json!({ + "username": "recoverymember", + "password": "memberpass123", + "role": "member" + })) + .send() + .await + .expect("POST /api/users failed"); + assert_eq!(create_resp.status(), 200); + + let member_client = http_client(); + let member_login = member_client + .post(format!("{}/api/auth/login", base_url)) + .json(&json!({ + "username": "recoverymember", + "password": "memberpass123" + })) + .send() + .await + .expect("member login failed"); + assert_eq!(member_login.status(), 200); + + let (target_id, _target_token) = register_agent(&admin_client, &base_url).await; + let (offline_source_id, _offline_source_token) = register_agent(&admin_client, &base_url).await; + + let member_resp = member_client + .post(format!( + "{}/api/servers/{}/recover-merge", + base_url, target_id + )) + .json(&json!({ "source_server_id": offline_source_id })) + .send() + .await + .expect("member recover-merge request failed"); + assert_eq!(member_resp.status(), 403); + + let admin_resp = admin_client + .post(format!( + "{}/api/servers/{}/recover-merge", + base_url, target_id + )) + .json(&json!({ "source_server_id": offline_source_id })) + .send() + .await + .expect("admin recover-merge validation request failed"); + assert_eq!(admin_resp.status(), 409); + + let admin_body: serde_json::Value = admin_resp.json().await.unwrap(); + assert!( + admin_body["error"]["message"] + .as_str() + .expect("error message should be a string") + .contains("Source server must be online") + ); +} + +#[tokio::test] +async fn test_recovery_job_get_requires_auth_and_start_creates_job() { + let (base_url, _tmp) = start_test_server().await; + let auth_client = http_client(); + login_admin(&auth_client, &base_url).await; + + let (target_id, _target_token) = register_agent(&auth_client, &base_url).await; + let (source_id, source_token) = register_agent(&auth_client, &base_url).await; + let (_sink, mut reader) = connect_agent(&base_url, &source_token).await; + let _welcome = recv_agent_text(&mut reader).await; + + let start_resp = auth_client + .post(format!( + "{}/api/servers/{}/recover-merge", + base_url, target_id + )) + .json(&json!({ "source_server_id": source_id })) + .send() + .await + .expect("start recovery request failed"); + assert_eq!(start_resp.status(), 200); + + let start_body: serde_json::Value = start_resp.json().await.unwrap(); + let job_id = start_body["data"]["job_id"] + .as_str() + .expect("job_id missing") + .to_string(); + assert_eq!(start_body["data"]["status"], "running"); + assert_eq!(start_body["data"]["stage"], "validating"); + + let plain_client = reqwest::Client::new(); + let unauth_resp = plain_client + .get(format!("{}/api/servers/recovery-jobs/{}", base_url, job_id)) + .send() + .await + .expect("unauthenticated recovery job request failed"); + assert_eq!(unauth_resp.status(), 401); + + let get_resp 
= auth_client + .get(format!("{}/api/servers/recovery-jobs/{}", base_url, job_id)) + .send() + .await + .expect("authenticated recovery job request failed"); + assert_eq!(get_resp.status(), 200); + + let get_body: serde_json::Value = get_resp.json().await.unwrap(); + assert_eq!(get_body["data"]["job_id"], job_id); + assert_eq!(get_body["data"]["target_server_id"], target_id); + assert_eq!(get_body["data"]["source_server_id"], source_id); +} From ca59f11f18482f25b44fa9fe01ba6a397373a82c Mon Sep 17 00:00:00 2001 From: ZingerLittleBee <6970999@gmail.com> Date: Thu, 16 Apr 2026 20:31:26 +0800 Subject: [PATCH 21/60] fix(server): tighten recovery api contract --- crates/server/src/openapi.rs | 2 + crates/server/src/router/api/mod.rs | 1 - .../server/src/router/api/server_recovery.rs | 74 +++++++++++++++---- crates/server/tests/integration.rs | 74 ++++++++++++++++++- 4 files changed, 131 insertions(+), 20 deletions(-) diff --git a/crates/server/src/openapi.rs b/crates/server/src/openapi.rs index 082cbe2b..45118f34 100644 --- a/crates/server/src/openapi.rs +++ b/crates/server/src/openapi.rs @@ -235,6 +235,8 @@ impl Modify for SecurityAddon { crate::router::api::server::BatchCapabilitiesResponse, crate::router::api::server::CleanupResponse, crate::service::server::UpdateServerInput, + crate::router::api::server_recovery::RecoveryJobStatus, + crate::router::api::server_recovery::RecoveryJobStage, crate::router::api::server_recovery::RecoveryCandidateResponse, crate::router::api::server_recovery::StartRecoveryRequest, crate::router::api::server_recovery::RecoveryJobResponse, diff --git a/crates/server/src/router/api/mod.rs b/crates/server/src/router/api/mod.rs index 95a21b65..e7e67cdc 100644 --- a/crates/server/src/router/api/mod.rs +++ b/crates/server/src/router/api/mod.rs @@ -50,7 +50,6 @@ pub fn router(state: Arc<AppState>) -> Router<Arc<AppState>> { // Read-only routes accessible to all authenticated users .merge(agent::read_router()) .merge(server::read_router()) - .merge(server_recovery::read_router()) .merge(server_group::read_router()) .merge(ping::read_router()) .merge(network_probe::read_router()) diff --git a/crates/server/src/router/api/server_recovery.rs b/crates/server/src/router/api/server_recovery.rs index b9ce4779..0fac57b4 100644 --- a/crates/server/src/router/api/server_recovery.rs +++ b/crates/server/src/router/api/server_recovery.rs @@ -14,6 +14,23 @@ use crate::error::{ApiResponse, AppError, ok}; use crate::service::recovery_job::RecoveryJobService; use crate::state::AppState; +#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, utoipa::ToSchema)] +#[serde(rename_all = "snake_case")] +pub enum RecoveryJobStatus { + Running, + Failed, + Unknown, +} + +#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, utoipa::ToSchema)] +#[serde(rename_all = "snake_case")] +pub enum RecoveryJobStage { + Validating, + MergingHistory, + Finalizing, + Unknown, +} + #[derive(Debug, Serialize, utoipa::ToSchema)] pub struct RecoveryCandidateResponse { pub server_id: String, @@ -32,9 +49,8 @@ pub struct RecoveryJobResponse { pub job_id: String, pub target_server_id: String, pub source_server_id: String, - pub status: String, - pub stage: String, - pub checkpoint_json: Option<String>, + pub status: RecoveryJobStatus, + pub stage: RecoveryJobStage, pub error: Option<String>, pub started_at: chrono::DateTime<chrono::Utc>, pub created_at: chrono::DateTime<chrono::Utc>, @@ -53,19 +69,20 @@ struct CandidateScoreInput { } pub fn read_router() -> Router<Arc<AppState>> { + Router::new() +} + +pub fn write_router() -> Router<Arc<AppState>> { Router::new() .route(
"/servers/{target_id}/recovery-candidates", get(list_candidates), ) .route("/servers/recovery-jobs/{job_id}", get(get_recovery_job)) -} - -pub fn write_router() -> Router> { - Router::new().route( - "/servers/{target_id}/recover-merge", - post(start_recovery_merge), - ) + .route( + "/servers/{target_id}/recover-merge", + post(start_recovery_merge), + ) } #[utoipa::path( @@ -77,11 +94,13 @@ pub fn write_router() -> Router> { responses( (status = 200, description = "Recommended recovery candidates", body = Vec), (status = 401, description = "Authentication required", body = crate::error::ErrorBody), + (status = 403, description = "Admin required", body = crate::error::ErrorBody), (status = 404, description = "Target server not found", body = crate::error::ErrorBody), ), security( ("session_cookie" = []), - ("api_key" = []) + ("api_key" = []), + ("bearer_token" = []) ), tag = "server-recovery" )] @@ -134,11 +153,13 @@ async fn list_candidates( responses( (status = 200, description = "Recovery job details", body = RecoveryJobResponse), (status = 401, description = "Authentication required", body = crate::error::ErrorBody), + (status = 403, description = "Admin required", body = crate::error::ErrorBody), (status = 404, description = "Recovery job not found", body = crate::error::ErrorBody), ), security( ("session_cookie" = []), - ("api_key" = []) + ("api_key" = []), + ("bearer_token" = []) ), tag = "server-recovery" )] @@ -170,7 +191,8 @@ async fn get_recovery_job( ), security( ("session_cookie" = []), - ("api_key" = []) + ("api_key" = []), + ("bearer_token" = []) ), tag = "server-recovery" )] @@ -323,9 +345,8 @@ impl From for RecoveryJobResponse { job_id: value.job_id, target_server_id: value.target_server_id, source_server_id: value.source_server_id, - status: value.status, - stage: value.stage, - checkpoint_json: value.checkpoint_json, + status: RecoveryJobStatus::from(value.status.as_str()), + stage: RecoveryJobStage::from(value.stage.as_str()), error: value.error, started_at: value.started_at, created_at: value.created_at, @@ -335,6 +356,27 @@ impl From for RecoveryJobResponse { } } +impl From<&str> for RecoveryJobStatus { + fn from(value: &str) -> Self { + match value { + "running" => Self::Running, + "failed" => Self::Failed, + _ => Self::Unknown, + } + } +} + +impl From<&str> for RecoveryJobStage { + fn from(value: &str) -> Self { + match value { + "validating" => Self::Validating, + "merging_history" => Self::MergingHistory, + "finalizing" => Self::Finalizing, + _ => Self::Unknown, + } + } +} + #[cfg(test)] mod tests { use super::{CandidateScoreInput, score_candidate}; diff --git a/crates/server/tests/integration.rs b/crates/server/tests/integration.rs index c7ca0ce5..cd617c62 100644 --- a/crates/server/tests/integration.rs +++ b/crates/server/tests/integration.rs @@ -3781,13 +3781,38 @@ async fn test_security_headers_present() { } #[tokio::test] -async fn test_recovery_candidates_requires_auth_and_filters_online_sources() { +async fn test_recovery_candidates_requires_admin_and_filters_online_sources() { let (base_url, _tmp) = start_test_server().await; let auth_client = http_client(); let (target_id, _target_token) = register_agent(&auth_client, &base_url).await; let (online_source_id, online_source_token) = register_agent(&auth_client, &base_url).await; let (offline_source_id, _offline_source_token) = register_agent(&auth_client, &base_url).await; + login_admin(&auth_client, &base_url).await; + + let create_resp = auth_client + .post(format!("{}/api/users", base_url)) + 
.json(&json!({ + "username": "recoverymember", + "password": "memberpass123", + "role": "member" + })) + .send() + .await + .expect("POST /api/users failed"); + assert_eq!(create_resp.status(), 200); + + let member_client = http_client(); + let member_login = member_client + .post(format!("{}/api/auth/login", base_url)) + .json(&json!({ + "username": "recoverymember", + "password": "memberpass123" + })) + .send() + .await + .expect("member login failed"); + assert_eq!(member_login.status(), 200); let plain_client = reqwest::Client::new(); let unauth_resp = plain_client @@ -3800,7 +3825,16 @@ async fn test_recovery_candidates_requires_auth_and_filters_online_sources() { .expect("unauthenticated recovery candidates request failed"); assert_eq!(unauth_resp.status(), 401); - login_admin(&auth_client, &base_url).await; + let member_resp = member_client + .get(format!( + "{}/api/servers/{}/recovery-candidates", + base_url, target_id + )) + .send() + .await + .expect("member recovery candidates request failed"); + assert_eq!(member_resp.status(), 403); + let (_sink, mut reader) = connect_agent(&base_url, &online_source_token).await; let _welcome = recv_agent_text(&mut reader).await; @@ -3892,11 +3926,35 @@ async fn test_recovery_merge_start_requires_admin_and_validates_source_state() { } #[tokio::test] -async fn test_recovery_job_get_requires_auth_and_start_creates_job() { +async fn test_recovery_job_get_requires_admin_and_start_creates_job() { let (base_url, _tmp) = start_test_server().await; let auth_client = http_client(); login_admin(&auth_client, &base_url).await; + let create_resp = auth_client + .post(format!("{}/api/users", base_url)) + .json(&json!({ + "username": "recoverymember", + "password": "memberpass123", + "role": "member" + })) + .send() + .await + .expect("POST /api/users failed"); + assert_eq!(create_resp.status(), 200); + + let member_client = http_client(); + let member_login = member_client + .post(format!("{}/api/auth/login", base_url)) + .json(&json!({ + "username": "recoverymember", + "password": "memberpass123" + })) + .send() + .await + .expect("member login failed"); + assert_eq!(member_login.status(), 200); + let (target_id, _target_token) = register_agent(&auth_client, &base_url).await; let (source_id, source_token) = register_agent(&auth_client, &base_url).await; let (_sink, mut reader) = connect_agent(&base_url, &source_token).await; @@ -3929,6 +3987,13 @@ async fn test_recovery_job_get_requires_auth_and_start_creates_job() { .expect("unauthenticated recovery job request failed"); assert_eq!(unauth_resp.status(), 401); + let member_resp = member_client + .get(format!("{}/api/servers/recovery-jobs/{}", base_url, job_id)) + .send() + .await + .expect("member recovery job request failed"); + assert_eq!(member_resp.status(), 403); + let get_resp = auth_client .get(format!("{}/api/servers/recovery-jobs/{}", base_url, job_id)) .send() @@ -3940,4 +4005,7 @@ async fn test_recovery_job_get_requires_auth_and_start_creates_job() { assert_eq!(get_body["data"]["job_id"], job_id); assert_eq!(get_body["data"]["target_server_id"], target_id); assert_eq!(get_body["data"]["source_server_id"], source_id); + assert!(get_body["data"].get("checkpoint_json").is_none()); + assert_eq!(get_body["data"]["status"], "running"); + assert_eq!(get_body["data"]["stage"], "validating"); } From 9fad5185a3feee9fd1653f21bd003220379d0696 Mon Sep 17 00:00:00 2001 From: ZingerLittleBee <6970999@gmail.com> Date: Thu, 16 Apr 2026 20:32:38 +0800 Subject: [PATCH 22/60] fix(server): align recovery candidate 
preconditions --- .../server/src/router/api/server_recovery.rs | 15 +++++ crates/server/tests/integration.rs | 59 +++++++++++++++++++ 2 files changed, 74 insertions(+) diff --git a/crates/server/src/router/api/server_recovery.rs b/crates/server/src/router/api/server_recovery.rs index 0fac57b4..5872eadd 100644 --- a/crates/server/src/router/api/server_recovery.rs +++ b/crates/server/src/router/api/server_recovery.rs @@ -96,6 +96,7 @@ pub fn write_router() -> Router<Arc<AppState>> { (status = 401, description = "Authentication required", body = crate::error::ErrorBody), (status = 403, description = "Admin required", body = crate::error::ErrorBody), (status = 404, description = "Target server not found", body = crate::error::ErrorBody), + (status = 409, description = "Target must be offline and not already in a running recovery job", body = crate::error::ErrorBody), ), security( @@ -113,11 +114,25 @@ async fn list_candidates( .await? .ok_or_else(|| AppError::NotFound("Server not found".to_string()))?; + if state.agent_manager.is_online(&target.id) { + return Err(AppError::Conflict( + "Target server must be offline before listing recovery candidates".to_string(), + )); + } + let running_jobs = recovery_job::Entity::find() .filter(recovery_job::Column::Status.eq("running")) .all(&state.db) .await?; + if running_jobs.iter().any(|job| { + job.target_server_id == target.id || job.source_server_id == target.id + }) { + return Err(AppError::Conflict( + "Target server is already participating in a running recovery job".to_string(), + )); + } + let active_server_ids: HashSet<String> = running_jobs .into_iter() .flat_map(|job| [job.target_server_id, job.source_server_id]) diff --git a/crates/server/tests/integration.rs b/crates/server/tests/integration.rs index cd617c62..fda4ef04 100644 --- a/crates/server/tests/integration.rs +++ b/crates/server/tests/integration.rs @@ -3780,6 +3780,65 @@ async fn test_security_headers_present() { ); } +#[tokio::test] +async fn test_recovery_candidates_rejects_online_or_busy_target() { + let (base_url, _tmp) = start_test_server().await; + let admin_client = http_client(); + login_admin(&admin_client, &base_url).await; + + let (online_target_id, online_target_token) = register_agent(&admin_client, &base_url).await; + let (busy_target_id, _busy_target_token) = register_agent(&admin_client, &base_url).await; + let (source_id, source_token) = register_agent(&admin_client, &base_url).await; + + let (_sink, mut reader) = connect_agent(&base_url, &online_target_token).await; + let _welcome = recv_agent_text(&mut reader).await; + + let online_resp = admin_client + .get(format!( + "{}/api/servers/{}/recovery-candidates", + base_url, online_target_id + )) + .send() + .await + .expect("online target recovery candidates request failed"); + assert_eq!(online_resp.status(), 409); + let online_body: serde_json::Value = online_resp.json().await.unwrap(); + assert!( + online_body["error"]["message"] + .as_str() + .expect("error message should be a string") + .contains("must be offline") + ); + + let (_source_sink, mut source_reader) = connect_agent(&base_url, &source_token).await; + let _source_welcome = recv_agent_text(&mut source_reader).await; + + let start_resp = admin_client + .post(format!("{}/api/servers/{}/recover-merge", base_url, busy_target_id)) + .json(&json!({ "source_server_id": source_id })) + .send() + .await + .expect("start recovery request failed"); + assert_eq!(start_resp.status(), 200); + + let busy_resp = admin_client + .get(format!(
"{}/api/servers/{}/recovery-candidates", + base_url, busy_target_id + )) + .send() + .await + .expect("busy target recovery candidates request failed"); + assert_eq!(busy_resp.status(), 409); + let busy_body: serde_json::Value = busy_resp.json().await.unwrap(); + assert!( + busy_body["error"]["message"] + .as_str() + .expect("error message should be a string") + .contains("running recovery job") + ); +} + #[tokio::test] async fn test_recovery_candidates_requires_admin_and_filters_online_sources() { let (base_url, _tmp) = start_test_server().await; From feaafc9f037b97f48d4af6a9d05f353f2f49ac12 Mon Sep 17 00:00:00 2001 From: ZingerLittleBee <6970999@gmail.com> Date: Thu, 16 Apr 2026 20:34:15 +0800 Subject: [PATCH 23/60] feat(server): add recovery write freeze guards --- crates/server/src/router/ws/agent.rs | 269 ++++++++++++++------- crates/server/src/service/mod.rs | 1 + crates/server/src/service/recovery_lock.rs | 44 ++++ crates/server/src/state.rs | 4 + crates/server/src/task/record_writer.rs | 5 + 5 files changed, 237 insertions(+), 86 deletions(-) create mode 100644 crates/server/src/service/recovery_lock.rs diff --git a/crates/server/src/router/ws/agent.rs b/crates/server/src/router/ws/agent.rs index 9bf99af7..beaf14f9 100644 --- a/crates/server/src/router/ws/agent.rs +++ b/crates/server/src/router/ws/agent.rs @@ -347,6 +347,8 @@ async fn handle_current_connection_frame( } async fn handle_agent_message(state: &Arc, server_id: &str, msg: AgentMessage) { + let writes_allowed = state.recovery_lock.writes_allowed_for(server_id); + match msg { AgentMessage::SystemInfo { msg_id, @@ -399,18 +401,24 @@ async fn handle_agent_message(state: &Arc, server_id: &str, msg: Agent tracing::info!( "Server {server_id} remote address changed: {old_addr} -> {new_addr}" ); - if let Err(e) = AuditService::log( - &state.db, - "system", - "ip_changed", - Some(&format!( - "Remote address changed from {old_addr} to {new_addr} for server {server_id}" - )), - new_addr, - ) - .await - { - tracing::error!("Failed to write audit log for IP change: {e}"); + if writes_allowed { + if let Err(e) = AuditService::log( + &state.db, + "system", + "ip_changed", + Some(&format!( + "Remote address changed from {old_addr} to {new_addr} for server {server_id}" + )), + new_addr, + ) + .await + { + tracing::error!("Failed to write audit log for IP change: {e}"); + } + } else { + tracing::info!( + "Skipping recovery-frozen IP-change audit write for {server_id}" + ); } } @@ -445,27 +453,45 @@ async fn handle_agent_message(state: &Arc, server_id: &str, msg: Agent } // Always update last_remote_addr - if let Some(ref addr) = current_remote_addr - && let Err(e) = update_last_remote_addr(&state.db, server_id, addr).await - { - tracing::error!("Failed to update last_remote_addr for {server_id}: {e}"); + if let Some(ref addr) = current_remote_addr { + if writes_allowed { + if let Err(e) = update_last_remote_addr(&state.db, server_id, addr).await + { + tracing::error!( + "Failed to update last_remote_addr for {server_id}: {e}" + ); + } + } else { + tracing::info!( + "Skipping recovery-frozen system-info write for {server_id}" + ); + } } } - if let Err(e) = - ServerService::update_system_info(&state.db, server_id, &info, region, country_code) - .await - { - tracing::error!("Failed to update system info for {server_id}: {e}"); - } + if writes_allowed { + if let Err(e) = ServerService::update_system_info( + &state.db, + server_id, + &info, + region, + country_code, + ) + .await + { + tracing::error!("Failed to update system info for 
{server_id}: {e}"); + } - // Persist and cache features from SystemInfo - let _ = crate::service::server::ServerService::update_features( - &state.db, - server_id, - &info.features, - ) - .await; + // Persist and cache features from SystemInfo + let _ = crate::service::server::ServerService::update_features( + &state.db, + server_id, + &info.features, + ) + .await; + } else { + tracing::info!("Skipping recovery-frozen system-info write for {server_id}"); + } state .agent_manager .update_features(server_id, info.features.clone()); @@ -534,10 +560,16 @@ async fn handle_agent_message(state: &Arc, server_id: &str, msg: Agent } AgentMessage::Report(report) => { // Save GPU records if present - if let Some(ref gpu) = report.gpu - && let Err(e) = RecordService::save_gpu_records(&state.db, server_id, gpu).await - { - tracing::error!("Failed to save GPU records for {server_id}: {e}"); + if let Some(ref gpu) = report.gpu { + if writes_allowed { + if let Err(e) = + RecordService::save_gpu_records(&state.db, server_id, gpu).await + { + tracing::error!("Failed to save GPU records for {server_id}: {e}"); + } + } else { + tracing::info!("Skipping recovery-frozen report write for {server_id}"); + } } state.agent_manager.update_report(server_id, report); } @@ -552,12 +584,20 @@ async fn handle_agent_message(state: &Arc, server_id: &str, msg: Agent ); if !dispatched { // No waiter — one-shot task, save directly - if let Err(e) = save_task_result(&state.db, server_id, &result).await { - tracing::error!("Failed to save task result for {server_id}: {e}"); + if writes_allowed { + if let Err(e) = save_task_result(&state.db, server_id, &result).await { + tracing::error!("Failed to save task result for {server_id}: {e}"); + } + } else { + tracing::info!("Skipping recovery-frozen task-result write for {server_id}"); } } - if let Err(e) = audit_exec_finished(state, server_id, &result).await { - tracing::error!("Failed to write exec_finished audit log for {server_id}: {e}"); + if writes_allowed { + if let Err(e) = audit_exec_finished(state, server_id, &result).await { + tracing::error!("Failed to write exec_finished audit log for {server_id}: {e}"); + } + } else { + tracing::info!("Skipping recovery-frozen exec audit write for {server_id}"); } // Send Ack if let Some(tx) = state.agent_manager.get_sender(server_id) { @@ -598,8 +638,12 @@ async fn handle_agent_message(state: &Arc, server_id: &str, msg: Agent } } AgentMessage::PingResult(result) => { - if let Err(e) = save_ping_result(&state.db, server_id, &result).await { - tracing::error!("Failed to save ping result for {server_id}: {e}"); + if writes_allowed { + if let Err(e) = save_ping_result(&state.db, server_id, &result).await { + tracing::error!("Failed to save ping result for {server_id}: {e}"); + } + } else { + tracing::info!("Skipping recovery-frozen ping write for {server_id}"); } } AgentMessage::TerminalOutput { session_id, data } => { @@ -651,21 +695,27 @@ async fn handle_agent_message(state: &Arc, server_id: &str, msg: Agent }, ); if !dispatched { - use crate::entity::task_result; - use sea_orm::{ActiveModelTrait, NotSet, Set}; - let result = task_result::ActiveModel { - id: NotSet, - task_id: Set(task_id.clone()), - server_id: Set(server_id.to_string()), - output: Set(capability_denied_output(&capability, reason)), - exit_code: Set(-2), - run_id: Set(None), - attempt: Set(1), - started_at: Set(None), - finished_at: Set(chrono::Utc::now()), - }; - if let Err(e) = result.insert(&state.db).await { - tracing::error!("Failed to write CapabilityDenied task 
result: {e}"); + if writes_allowed { + use crate::entity::task_result; + use sea_orm::{ActiveModelTrait, NotSet, Set}; + let result = task_result::ActiveModel { + id: NotSet, + task_id: Set(task_id.clone()), + server_id: Set(server_id.to_string()), + output: Set(capability_denied_output(&capability, reason)), + exit_code: Set(-2), + run_id: Set(None), + attempt: Set(1), + started_at: Set(None), + finished_at: Set(chrono::Utc::now()), + }; + if let Err(e) = result.insert(&state.db).await { + tracing::error!("Failed to write CapabilityDenied task result: {e}"); + } + } else { + tracing::info!( + "Skipping recovery-frozen capability-denied write for {server_id}" + ); } } } @@ -687,8 +737,13 @@ async fn handle_agent_message(state: &Arc, server_id: &str, msg: Agent server_id: server_id.to_string(), results: results.clone(), }); - if let Err(e) = NetworkProbeService::save_results(&state.db, server_id, results).await { - tracing::error!("Failed to save network probe results for {server_id}: {e}"); + if writes_allowed { + if let Err(e) = NetworkProbeService::save_results(&state.db, server_id, results).await + { + tracing::error!("Failed to save network probe results for {server_id}: {e}"); + } + } else { + tracing::info!("Skipping recovery-frozen network probe write for {server_id}"); } } // File management control responses — relay to pending HTTP requests @@ -922,8 +977,16 @@ async fn handle_agent_message(state: &Arc, server_id: &str, msg: Agent } } AgentMessage::DockerEvent { event } => { - let _ = crate::service::docker::DockerService::save_event(&state.db, server_id, &event) + if writes_allowed { + let _ = crate::service::docker::DockerService::save_event( + &state.db, + server_id, + &event, + ) .await; + } else { + tracing::info!("Skipping recovery-frozen docker event write for {server_id}"); + } state .agent_manager .broadcast_browser(BrowserMessage::DockerEvent { @@ -941,10 +1004,14 @@ async fn handle_agent_message(state: &Arc, server_id: &str, msg: Agent } } AgentMessage::FeaturesUpdate { ref features } => { - let _ = crate::service::server::ServerService::update_features( - &state.db, server_id, features, - ) - .await; + if writes_allowed { + let _ = crate::service::server::ServerService::update_features( + &state.db, server_id, features, + ) + .await; + } else { + tracing::info!("Skipping recovery-frozen features write for {server_id}"); + } state .agent_manager .update_features(server_id, features.clone()); @@ -976,10 +1043,17 @@ async fn handle_agent_message(state: &Arc, server_id: &str, msg: Agent let ipv6_changed = old_ipv6 != ipv6; if ipv4_changed || ipv6_changed { - // Update ipv4/ipv6 in DB - if let Err(e) = update_server_ips(&state.db, server_id, &ipv4, &ipv6).await - { - tracing::error!("Failed to update IPs for {server_id}: {e}"); + if writes_allowed { + // Update ipv4/ipv6 in DB + if let Err(e) = + update_server_ips(&state.db, server_id, &ipv4, &ipv6).await + { + tracing::error!("Failed to update IPs for {server_id}: {e}"); + } + } else { + tracing::info!( + "Skipping recovery-frozen IP update write for {server_id}" + ); } // Re-run GeoIP lookup based on the new IPs @@ -992,16 +1066,23 @@ async fn handle_agent_message(state: &Arc, server_id: &str, msg: Agent let guard = state.geoip.read().unwrap(); guard.as_ref().map(|g| g.lookup(ip)) }; - if let Some(geo) = geo - && let Err(e) = update_server_geo( - &state.db, - server_id, - geo.region, - geo.country_code, - ) - .await - { - tracing::error!("Failed to update GeoIP for {server_id}: {e}"); + if let Some(geo) = geo { + if 
writes_allowed { + if let Err(e) = update_server_geo( + &state.db, + server_id, + geo.region, + geo.country_code, + ) + .await + { + tracing::error!("Failed to update GeoIP for {server_id}: {e}"); + } + } else { + tracing::info!( + "Skipping recovery-frozen GeoIP write for {server_id}" + ); + } } } @@ -1025,18 +1106,30 @@ async fn handle_agent_message(state: &Arc<AppState>, server_id: &str, msg: Agent ) .await { - tracing::error!("Failed to write audit log for IP change: {e}"); + if writes_allowed { + tracing::error!("Failed to write audit log for IP change: {e}"); + } else { + tracing::info!( + "Skipping recovery-frozen IP-change audit write for {server_id}" + ); + } } - if let Err(e) = AlertService::check_event_rules( - &state.db, - &state.alert_state_manager, - server_id, - "ip_changed", - ) - .await - { - tracing::error!("Failed to check event rules for IP change: {e}"); + if writes_allowed { + if let Err(e) = AlertService::check_event_rules( + &state.db, + &state.alert_state_manager, + server_id, + "ip_changed", + ) + .await + { + tracing::error!("Failed to check event rules for IP change: {e}"); + } + } else { + tracing::info!( + "Skipping recovery-frozen alert evaluation for {server_id}" + ); } state @@ -1091,7 +1184,11 @@ async fn handle_docker_unavailable(state: &Arc<AppState>, server_id: &str) { .remove_docker_log_sessions_for_server(server_id); // Shared cleanup: viewer tracker, features, DB persist, browser broadcast. - crate::service::agent_manager::cleanup_disconnected_docker_state(state, server_id).await; + if state.recovery_lock.writes_allowed_for(server_id) { + crate::service::agent_manager::cleanup_disconnected_docker_state(state, server_id).await; + } else { + tracing::info!("Skipping recovery-frozen docker unavailable write for {server_id}"); + } } async fn send_server_message( diff --git a/crates/server/src/service/mod.rs b/crates/server/src/service/mod.rs index 787edd6a..d314ae36 100644 --- a/crates/server/src/service/mod.rs +++ b/crates/server/src/service/mod.rs @@ -19,6 +19,7 @@ pub mod notification; pub mod oauth; pub mod ping; pub mod record; +pub mod recovery_lock; pub mod recovery_job; pub mod server; pub mod service_monitor; diff --git a/crates/server/src/service/recovery_lock.rs b/crates/server/src/service/recovery_lock.rs new file mode 100644 index 00000000..2cc3b396 --- /dev/null +++ b/crates/server/src/service/recovery_lock.rs @@ -0,0 +1,44 @@ +use dashmap::DashSet; + +#[derive(Default)] +pub struct RecoveryLockService { + frozen: DashSet<String>, +} + +impl RecoveryLockService { + pub fn new() -> Self { + Self { + frozen: DashSet::new(), + } + } + + pub fn freeze(&self, server_id: &str) { + self.frozen.insert(server_id.to_string()); + } + + pub fn release(&self, server_id: &str) { + self.frozen.remove(server_id); + } + + pub fn writes_allowed_for(&self, server_id: &str) -> bool { + !self.frozen.contains(server_id) + } +} + +#[cfg(test)] +mod tests { + use super::RecoveryLockService; + + #[test] + fn locked_server_denies_writes_until_released() { + let locks = RecoveryLockService::new(); + + assert!(locks.writes_allowed_for("srv-1")); + + locks.freeze("srv-1"); + assert!(!locks.writes_allowed_for("srv-1")); + + locks.release("srv-1"); + assert!(locks.writes_allowed_for("srv-1")); + } +} diff --git a/crates/server/src/state.rs b/crates/server/src/state.rs index 5734c299..ec36e382 100644 --- a/crates/server/src/state.rs +++ b/crates/server/src/state.rs @@ -16,6 +16,7 @@ use crate::service::geoip::GeoIpService; use crate::service::high_risk_audit::{ DockerLogsAuditContext,
ExecAuditContext, TerminalAuditContext, }; +use crate::service::recovery_lock::RecoveryLockService; use crate::service::task_scheduler::TaskScheduler; use crate::service::upgrade_release::UpgradeReleaseService; use crate::service::upgrade_tracker::UpgradeJobTracker; @@ -63,6 +64,8 @@ pub struct AppState { pub task_scheduler: Arc, /// Shared alert state manager for dedup across poll-based and event-driven evaluation. pub alert_state_manager: AlertStateManager, + /// In-memory freeze gate for agent-originated writes during recovery. + pub recovery_lock: RecoveryLockService, /// Pending mobile pairing codes for QR login, keyed by code. pub pending_pairs: DashMap, /// Terminal session audit contexts keyed by session_id. @@ -180,6 +183,7 @@ impl AppState { docker_viewers: DockerViewerTracker::new(), task_scheduler, alert_state_manager, + recovery_lock: RecoveryLockService::new(), pending_pairs: DashMap::new(), terminal_audit_contexts: DashMap::new(), docker_logs_audit_contexts: DashMap::new(), diff --git a/crates/server/src/task/record_writer.rs b/crates/server/src/task/record_writer.rs index 77df3846..cc9bd436 100644 --- a/crates/server/src/task/record_writer.rs +++ b/crates/server/src/task/record_writer.rs @@ -43,6 +43,11 @@ pub async fn run(state: Arc) { let mut count = 0; for (server_id, report) in &reports { + if !state.recovery_lock.writes_allowed_for(server_id) { + tracing::info!("Skipping recovery-frozen record write for {server_id}"); + continue; + } + // Save metrics record if let Err(e) = RecordService::save_report(&state.db, server_id, report).await { tracing::error!("Failed to save record for {server_id}: {e}"); From e94395935b3d75a1bee073079fabce7071fa3692 Mon Sep 17 00:00:00 2001 From: ZingerLittleBee <6970999@gmail.com> Date: Thu, 16 Apr 2026 20:35:59 +0800 Subject: [PATCH 24/60] fix(server): gate event-rule writes behind recovery lock --- crates/server/src/router/ws/agent.rs | 24 +++++++++++++++--------- 1 file changed, 15 insertions(+), 9 deletions(-) diff --git a/crates/server/src/router/ws/agent.rs b/crates/server/src/router/ws/agent.rs index beaf14f9..69667619 100644 --- a/crates/server/src/router/ws/agent.rs +++ b/crates/server/src/router/ws/agent.rs @@ -428,15 +428,21 @@ async fn handle_agent_message(state: &Arc, server_id: &str, msg: Agent let remote_changed = old_remote_addr.as_ref() != current_remote_addr.as_ref(); if ipv4_changed || ipv6_changed || remote_changed { - if let Err(e) = AlertService::check_event_rules( - &state.db, - &state.alert_state_manager, - server_id, - "ip_changed", - ) - .await - { - tracing::error!("Failed to check event rules for IP change: {e}"); + if writes_allowed { + if let Err(e) = AlertService::check_event_rules( + &state.db, + &state.alert_state_manager, + server_id, + "ip_changed", + ) + .await + { + tracing::error!("Failed to check event rules for IP change: {e}"); + } + } else { + tracing::info!( + "Skipping recovery-frozen alert evaluation for {server_id}" + ); } state From 2ebff96aa7ab2a965c9dc348b14231c7a1aab0a4 Mon Sep 17 00:00:00 2001 From: ZingerLittleBee <6970999@gmail.com> Date: Thu, 16 Apr 2026 20:37:21 +0800 Subject: [PATCH 25/60] fix(server): gate ip-change audit behind recovery lock --- crates/server/src/router/ws/agent.rs | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/crates/server/src/router/ws/agent.rs b/crates/server/src/router/ws/agent.rs index 69667619..8378b59c 100644 --- a/crates/server/src/router/ws/agent.rs +++ b/crates/server/src/router/ws/agent.rs @@ -1103,22 
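// A hedged sketch of the write-freeze gate the preceding hunks install at
// each agent-originated write site. `recovery_lock` and `writes_allowed_for`
// are the real additions from recovery_lock.rs above; the wrapper function
// and its argument types are assumed here purely for illustration.
async fn persist_report_example(state: &Arc<AppState>, server_id: &str, report: &Report) {
    if state.recovery_lock.writes_allowed_for(server_id) {
        // Normal path: this server is not the target or source of an active recovery merge.
        if let Err(e) = RecordService::save_report(&state.db, server_id, report).await {
            tracing::error!("Failed to save record for {server_id}: {e}");
        }
    } else {
        // Frozen path: drop the write instead of racing the history merge,
        // mirroring the "Skipping recovery-frozen ..." branches in the hunks above.
        tracing::info!("Skipping recovery-frozen record write for {server_id}");
    }
}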
+1103,22 @@ async fn handle_agent_message(state: &Arc<AppState>, server_id: &str, msg: Agent
                         .get_remote_addr(server_id)
                         .map(|a| a.ip().to_string())
                         .unwrap_or_default();
-                    if let Err(e) = AuditService::log(
-                        &state.db,
-                        "system",
-                        "ip_changed",
-                        Some(&detail),
-                        &remote_ip,
-                    )
-                    .await
-                    {
-                        if writes_allowed {
+                    if writes_allowed {
+                        if let Err(e) = AuditService::log(
+                            &state.db,
+                            "system",
+                            "ip_changed",
+                            Some(&detail),
+                            &remote_ip,
+                        )
+                        .await
+                        {
                             tracing::error!("Failed to write audit log for IP change: {e}");
-                        } else {
-                            tracing::info!(
-                                "Skipping recovery-frozen IP-change audit write for {server_id}"
-                            );
                         }
+                    } else {
+                        tracing::info!(
+                            "Skipping recovery-frozen IP-change audit write for {server_id}"
+                        );
+                    }
                     }
 
                     if writes_allowed {

From be0779a38e545ab8087d80e27c1f8f6d3f3536ff Mon Sep 17 00:00:00 2001
From: ZingerLittleBee <6970999@gmail.com>
Date: Thu, 16 Apr 2026 20:38:54 +0800
Subject: [PATCH 26/60] feat(common): add recovery browser protocol payloads

---
 crates/common/src/protocol.rs | 204 +++++++++++++++++++++++++++++++++-
 1 file changed, 203 insertions(+), 1 deletion(-)

diff --git a/crates/common/src/protocol.rs b/crates/common/src/protocol.rs
index ca866f06..9d1ca17a 100644
--- a/crates/common/src/protocol.rs
+++ b/crates/common/src/protocol.rs
@@ -29,6 +29,31 @@ pub enum UpgradeStatus {
     Timeout,
 }
 
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
+#[serde(rename_all = "snake_case")]
+#[cfg_attr(feature = "utoipa", derive(utoipa::ToSchema))]
+pub enum RecoveryJobStatus {
+    Running,
+    Failed,
+    Succeeded,
+    Unknown,
+}
+
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
+#[serde(rename_all = "snake_case")]
+#[cfg_attr(feature = "utoipa", derive(utoipa::ToSchema))]
+pub enum RecoveryJobStage {
+    Validating,
+    Rebinding,
+    AwaitingTargetOnline,
+    FreezingWrites,
+    MergingHistory,
+    Finalizing,
+    Succeeded,
+    Failed,
+    Unknown,
+}
+
 #[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
 #[cfg_attr(feature = "utoipa", derive(utoipa::ToSchema))]
 pub struct UpgradeJobDto {
@@ -46,6 +71,23 @@ pub struct UpgradeJobDto {
     pub finished_at: Option<DateTime<Utc>>,
 }
 
+#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
+#[cfg_attr(feature = "utoipa", derive(utoipa::ToSchema))]
+pub struct RecoveryJobDto {
+    pub job_id: String,
+    pub target_server_id: String,
+    pub source_server_id: String,
+    pub status: RecoveryJobStatus,
+    pub stage: RecoveryJobStage,
+    #[serde(default)]
+    pub error: Option<String>,
+    pub started_at: DateTime<Utc>,
+    pub created_at: DateTime<Utc>,
+    pub updated_at: DateTime<Utc>,
+    #[serde(default)]
+    pub last_heartbeat_at: Option<DateTime<Utc>>,
+}
+
 /// Agent -> Server messages
 #[derive(Debug, Clone, Serialize, Deserialize)]
 #[serde(tag = "type", rename_all = "snake_case")]
 pub enum AgentMessage {
@@ -372,9 +414,13 @@ pub enum BrowserMessage {
         servers: Vec,
         #[serde(default)]
         upgrades: Vec<UpgradeJobDto>,
+        #[serde(default)]
+        recoveries: Vec<RecoveryJobDto>,
     },
     Update {
         servers: Vec,
+        #[serde(default)]
+        recoveries: Vec<RecoveryJobDto>,
     },
     ServerOnline {
         server_id: String,
@@ -1310,6 +1356,37 @@ mod tests {
         }
     }
 
+    #[test]
+    fn test_recovery_job_dto_round_trip() {
+        let dto = RecoveryJobDto {
+            job_id: "recovery-1".to_string(),
+            target_server_id: "target-1".to_string(),
+            source_server_id: "source-1".to_string(),
+            status: RecoveryJobStatus::Running,
+            stage: RecoveryJobStage::FreezingWrites,
+            error: Some("write freeze in progress".to_string()),
+            started_at: chrono::DateTime::parse_from_rfc3339("2026-04-16T01:02:03Z")
+                .unwrap()
+                .with_timezone(&chrono::Utc),
+            created_at: chrono::DateTime::parse_from_rfc3339("2026-04-16T01:00:00Z")
+
.unwrap() + .with_timezone(&chrono::Utc), + updated_at: chrono::DateTime::parse_from_rfc3339("2026-04-16T01:05:00Z") + .unwrap() + .with_timezone(&chrono::Utc), + last_heartbeat_at: Some( + chrono::DateTime::parse_from_rfc3339("2026-04-16T01:04:30Z") + .unwrap() + .with_timezone(&chrono::Utc), + ), + }; + + let json = serde_json::to_string(&dto).unwrap(); + let parsed: RecoveryJobDto = serde_json::from_str(&json).unwrap(); + + assert_eq!(parsed, dto); + } + #[test] fn test_browser_full_sync_with_upgrades_round_trip() { let msg = BrowserMessage::FullSync { @@ -1325,13 +1402,39 @@ mod tests { started_at: chrono::Utc::now(), finished_at: None, }], + recoveries: vec![RecoveryJobDto { + job_id: "recovery-1".to_string(), + target_server_id: "target-1".to_string(), + source_server_id: "source-1".to_string(), + status: RecoveryJobStatus::Running, + stage: RecoveryJobStage::Rebinding, + error: Some("waiting for agent reconnect".to_string()), + started_at: chrono::DateTime::parse_from_rfc3339("2026-04-16T01:02:03Z") + .unwrap() + .with_timezone(&chrono::Utc), + created_at: chrono::DateTime::parse_from_rfc3339("2026-04-16T01:00:00Z") + .unwrap() + .with_timezone(&chrono::Utc), + updated_at: chrono::DateTime::parse_from_rfc3339("2026-04-16T01:05:00Z") + .unwrap() + .with_timezone(&chrono::Utc), + last_heartbeat_at: Some( + chrono::DateTime::parse_from_rfc3339("2026-04-16T01:04:30Z") + .unwrap() + .with_timezone(&chrono::Utc), + ), + }], }; let json = serde_json::to_string(&msg).unwrap(); let parsed: BrowserMessage = serde_json::from_str(&json).unwrap(); match parsed { - BrowserMessage::FullSync { servers, upgrades } => { + BrowserMessage::FullSync { + servers, + upgrades, + recoveries, + } => { assert!(servers.is_empty()); assert_eq!(upgrades.len(), 1); assert_eq!(upgrades[0].server_id, "server-1"); @@ -1345,11 +1448,110 @@ mod tests { Some("/backups/server-1.tar.gz".to_string()) ); assert!(upgrades[0].finished_at.is_none()); + assert_eq!(recoveries.len(), 1); + assert_eq!(recoveries[0].job_id, "recovery-1"); + assert_eq!(recoveries[0].target_server_id, "target-1"); + assert_eq!(recoveries[0].source_server_id, "source-1"); + assert_eq!(recoveries[0].status, RecoveryJobStatus::Running); + assert_eq!(recoveries[0].stage, RecoveryJobStage::Rebinding); + assert_eq!( + recoveries[0].error, + Some("waiting for agent reconnect".to_string()) + ); + assert_eq!( + recoveries[0].started_at, + chrono::DateTime::parse_from_rfc3339("2026-04-16T01:02:03Z") + .unwrap() + .with_timezone(&chrono::Utc) + ); + assert_eq!( + recoveries[0].created_at, + chrono::DateTime::parse_from_rfc3339("2026-04-16T01:00:00Z") + .unwrap() + .with_timezone(&chrono::Utc) + ); + assert_eq!( + recoveries[0].updated_at, + chrono::DateTime::parse_from_rfc3339("2026-04-16T01:05:00Z") + .unwrap() + .with_timezone(&chrono::Utc) + ); + assert_eq!( + recoveries[0].last_heartbeat_at, + Some( + chrono::DateTime::parse_from_rfc3339("2026-04-16T01:04:30Z") + .unwrap() + .with_timezone(&chrono::Utc) + ) + ); } _ => panic!("Expected FullSync"), } } + #[test] + fn test_browser_full_sync_defaults_missing_recoveries_to_empty() { + let json = r#"{"type":"full_sync","servers":[],"upgrades":[]}"#; + let parsed: BrowserMessage = serde_json::from_str(json).unwrap(); + + match parsed { + BrowserMessage::FullSync { + servers, + upgrades, + recoveries, + } => { + assert!(servers.is_empty()); + assert!(upgrades.is_empty()); + assert!(recoveries.is_empty()); + } + _ => panic!("Expected FullSync"), + } + } + + #[test] + fn 
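// For orientation, the round-trip tests above pin down a wire shape for
// RecoveryJobDto along these lines (field order illustrative; timestamps are
// chrono's default RFC 3339 rendering, and "error" serializes as null when
// absent since it is only marked #[serde(default)]):
//
// {
//   "job_id": "recovery-1",
//   "target_server_id": "target-1",
//   "source_server_id": "source-1",
//   "status": "running",
//   "stage": "freezing_writes",
//   "error": "write freeze in progress",
//   "started_at": "2026-04-16T01:02:03Z",
//   "created_at": "2026-04-16T01:00:00Z",
//   "updated_at": "2026-04-16T01:05:00Z",
//   "last_heartbeat_at": "2026-04-16T01:04:30Z"
// }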
test_browser_update_round_trip_with_recoveries() { + let msg = BrowserMessage::Update { + servers: vec![], + recoveries: vec![RecoveryJobDto { + job_id: "recovery-2".to_string(), + target_server_id: "target-2".to_string(), + source_server_id: "source-2".to_string(), + status: RecoveryJobStatus::Succeeded, + stage: RecoveryJobStage::Succeeded, + error: None, + started_at: chrono::DateTime::parse_from_rfc3339("2026-04-16T02:00:00Z") + .unwrap() + .with_timezone(&chrono::Utc), + created_at: chrono::DateTime::parse_from_rfc3339("2026-04-16T01:59:00Z") + .unwrap() + .with_timezone(&chrono::Utc), + updated_at: chrono::DateTime::parse_from_rfc3339("2026-04-16T02:10:00Z") + .unwrap() + .with_timezone(&chrono::Utc), + last_heartbeat_at: None, + }], + }; + + let json = serde_json::to_string(&msg).unwrap(); + let parsed: BrowserMessage = serde_json::from_str(&json).unwrap(); + + match parsed { + BrowserMessage::Update { + servers, + recoveries, + } => { + assert!(servers.is_empty()); + assert_eq!(recoveries.len(), 1); + assert_eq!(recoveries[0].job_id, "recovery-2"); + assert_eq!(recoveries[0].status, RecoveryJobStatus::Succeeded); + assert_eq!(recoveries[0].stage, RecoveryJobStage::Succeeded); + assert!(recoveries[0].error.is_none()); + assert!(recoveries[0].last_heartbeat_at.is_none()); + } + _ => panic!("Expected Update"), + } + } + #[test] fn test_agent_info_updated_accepts_optional_agent_version() { let json = r#"{"type":"agent_info_updated","server_id":"server-1","protocol_version":3,"agent_version":"1.2.3"}"#; From 3ec2fc0b4b59ba7d86641e9c7f365a3cbfc38991 Mon Sep 17 00:00:00 2001 From: ZingerLittleBee <6970999@gmail.com> Date: Thu, 16 Apr 2026 20:40:33 +0800 Subject: [PATCH 27/60] fix(server): re-check recovery lock at each write site --- crates/server/src/router/ws/agent.rs | 34 +++++++------- crates/server/src/task/record_writer.rs | 61 ++++++++++++++++++------- 2 files changed, 60 insertions(+), 35 deletions(-) diff --git a/crates/server/src/router/ws/agent.rs b/crates/server/src/router/ws/agent.rs index 8378b59c..3a5c4fef 100644 --- a/crates/server/src/router/ws/agent.rs +++ b/crates/server/src/router/ws/agent.rs @@ -347,8 +347,6 @@ async fn handle_current_connection_frame( } async fn handle_agent_message(state: &Arc, server_id: &str, msg: AgentMessage) { - let writes_allowed = state.recovery_lock.writes_allowed_for(server_id); - match msg { AgentMessage::SystemInfo { msg_id, @@ -401,7 +399,7 @@ async fn handle_agent_message(state: &Arc, server_id: &str, msg: Agent tracing::info!( "Server {server_id} remote address changed: {old_addr} -> {new_addr}" ); - if writes_allowed { + if state.recovery_lock.writes_allowed_for(server_id) { if let Err(e) = AuditService::log( &state.db, "system", @@ -428,7 +426,7 @@ async fn handle_agent_message(state: &Arc, server_id: &str, msg: Agent let remote_changed = old_remote_addr.as_ref() != current_remote_addr.as_ref(); if ipv4_changed || ipv6_changed || remote_changed { - if writes_allowed { + if state.recovery_lock.writes_allowed_for(server_id) { if let Err(e) = AlertService::check_event_rules( &state.db, &state.alert_state_manager, @@ -460,7 +458,7 @@ async fn handle_agent_message(state: &Arc, server_id: &str, msg: Agent // Always update last_remote_addr if let Some(ref addr) = current_remote_addr { - if writes_allowed { + if state.recovery_lock.writes_allowed_for(server_id) { if let Err(e) = update_last_remote_addr(&state.db, server_id, addr).await { tracing::error!( @@ -475,7 +473,7 @@ async fn handle_agent_message(state: &Arc, server_id: &str, msg: 
Agent } } - if writes_allowed { + if state.recovery_lock.writes_allowed_for(server_id) { if let Err(e) = ServerService::update_system_info( &state.db, server_id, @@ -567,7 +565,7 @@ async fn handle_agent_message(state: &Arc, server_id: &str, msg: Agent AgentMessage::Report(report) => { // Save GPU records if present if let Some(ref gpu) = report.gpu { - if writes_allowed { + if state.recovery_lock.writes_allowed_for(server_id) { if let Err(e) = RecordService::save_gpu_records(&state.db, server_id, gpu).await { @@ -590,7 +588,7 @@ async fn handle_agent_message(state: &Arc, server_id: &str, msg: Agent ); if !dispatched { // No waiter — one-shot task, save directly - if writes_allowed { + if state.recovery_lock.writes_allowed_for(server_id) { if let Err(e) = save_task_result(&state.db, server_id, &result).await { tracing::error!("Failed to save task result for {server_id}: {e}"); } @@ -598,7 +596,7 @@ async fn handle_agent_message(state: &Arc, server_id: &str, msg: Agent tracing::info!("Skipping recovery-frozen task-result write for {server_id}"); } } - if writes_allowed { + if state.recovery_lock.writes_allowed_for(server_id) { if let Err(e) = audit_exec_finished(state, server_id, &result).await { tracing::error!("Failed to write exec_finished audit log for {server_id}: {e}"); } @@ -644,7 +642,7 @@ async fn handle_agent_message(state: &Arc, server_id: &str, msg: Agent } } AgentMessage::PingResult(result) => { - if writes_allowed { + if state.recovery_lock.writes_allowed_for(server_id) { if let Err(e) = save_ping_result(&state.db, server_id, &result).await { tracing::error!("Failed to save ping result for {server_id}: {e}"); } @@ -701,7 +699,7 @@ async fn handle_agent_message(state: &Arc, server_id: &str, msg: Agent }, ); if !dispatched { - if writes_allowed { + if state.recovery_lock.writes_allowed_for(server_id) { use crate::entity::task_result; use sea_orm::{ActiveModelTrait, NotSet, Set}; let result = task_result::ActiveModel { @@ -743,7 +741,7 @@ async fn handle_agent_message(state: &Arc, server_id: &str, msg: Agent server_id: server_id.to_string(), results: results.clone(), }); - if writes_allowed { + if state.recovery_lock.writes_allowed_for(server_id) { if let Err(e) = NetworkProbeService::save_results(&state.db, server_id, results).await { tracing::error!("Failed to save network probe results for {server_id}: {e}"); @@ -983,7 +981,7 @@ async fn handle_agent_message(state: &Arc, server_id: &str, msg: Agent } } AgentMessage::DockerEvent { event } => { - if writes_allowed { + if state.recovery_lock.writes_allowed_for(server_id) { let _ = crate::service::docker::DockerService::save_event( &state.db, server_id, @@ -1010,7 +1008,7 @@ async fn handle_agent_message(state: &Arc, server_id: &str, msg: Agent } } AgentMessage::FeaturesUpdate { ref features } => { - if writes_allowed { + if state.recovery_lock.writes_allowed_for(server_id) { let _ = crate::service::server::ServerService::update_features( &state.db, server_id, features, ) @@ -1049,7 +1047,7 @@ async fn handle_agent_message(state: &Arc, server_id: &str, msg: Agent let ipv6_changed = old_ipv6 != ipv6; if ipv4_changed || ipv6_changed { - if writes_allowed { + if state.recovery_lock.writes_allowed_for(server_id) { // Update ipv4/ipv6 in DB if let Err(e) = update_server_ips(&state.db, server_id, &ipv4, &ipv6).await @@ -1073,7 +1071,7 @@ async fn handle_agent_message(state: &Arc, server_id: &str, msg: Agent guard.as_ref().map(|g| g.lookup(ip)) }; if let Some(geo) = geo { - if writes_allowed { + if 
state.recovery_lock.writes_allowed_for(server_id) { if let Err(e) = update_server_geo( &state.db, server_id, @@ -1103,7 +1101,7 @@ async fn handle_agent_message(state: &Arc, server_id: &str, msg: Agent .get_remote_addr(server_id) .map(|a| a.ip().to_string()) .unwrap_or_default(); - if writes_allowed { + if state.recovery_lock.writes_allowed_for(server_id) { if let Err(e) = AuditService::log( &state.db, "system", @@ -1121,7 +1119,7 @@ async fn handle_agent_message(state: &Arc, server_id: &str, msg: Agent ); } - if writes_allowed { + if state.recovery_lock.writes_allowed_for(server_id) { if let Err(e) = AlertService::check_event_rules( &state.db, &state.alert_state_manager, diff --git a/crates/server/src/task/record_writer.rs b/crates/server/src/task/record_writer.rs index cc9bd436..05894eb7 100644 --- a/crates/server/src/task/record_writer.rs +++ b/crates/server/src/task/record_writer.rs @@ -49,10 +49,14 @@ pub async fn run(state: Arc) { } // Save metrics record - if let Err(e) = RecordService::save_report(&state.db, server_id, report).await { - tracing::error!("Failed to save record for {server_id}: {e}"); + if state.recovery_lock.writes_allowed_for(server_id) { + if let Err(e) = RecordService::save_report(&state.db, server_id, report).await { + tracing::error!("Failed to save record for {server_id}: {e}"); + } else { + count += 1; + } } else { - count += 1; + tracing::info!("Skipping recovery-frozen record write for {server_id}"); } // Compute traffic delta @@ -70,10 +74,19 @@ pub async fn run(state: Arc) { } else { // First observation: no previous state, skip delta (just record state) transfer_cache.insert(server_id.clone(), (curr_in, curr_out)); - if let Err(e) = - TrafficService::upsert_state(&state.db, server_id, curr_in, curr_out).await - { - tracing::error!("Failed to upsert traffic state for {server_id}: {e}"); + if state.recovery_lock.writes_allowed_for(server_id) { + if let Err(e) = TrafficService::upsert_state( + &state.db, + server_id, + curr_in, + curr_out, + ) + .await + { + tracing::error!("Failed to upsert traffic state for {server_id}: {e}"); + } + } else { + tracing::info!("Skipping recovery-frozen traffic state write for {server_id}"); } continue; }; @@ -82,19 +95,33 @@ pub async fn run(state: Arc) { transfer_cache.insert(server_id.clone(), (curr_in, curr_out)); // Only write if there's actual traffic - if (delta_in > 0 || delta_out > 0) - && let Err(e) = - TrafficService::upsert_hourly(&state.db, server_id, hour, delta_in, delta_out) - .await - { - tracing::error!("Failed to upsert traffic hourly for {server_id}: {e}"); + if delta_in > 0 || delta_out > 0 { + if state.recovery_lock.writes_allowed_for(server_id) { + if let Err(e) = TrafficService::upsert_hourly( + &state.db, + server_id, + hour, + delta_in, + delta_out, + ) + .await + { + tracing::error!("Failed to upsert traffic hourly for {server_id}: {e}"); + } + } else { + tracing::info!("Skipping recovery-frozen traffic hourly write for {server_id}"); + } } // Always update state - if let Err(e) = - TrafficService::upsert_state(&state.db, server_id, curr_in, curr_out).await - { - tracing::error!("Failed to upsert traffic state for {server_id}: {e}"); + if state.recovery_lock.writes_allowed_for(server_id) { + if let Err(e) = + TrafficService::upsert_state(&state.db, server_id, curr_in, curr_out).await + { + tracing::error!("Failed to upsert traffic state for {server_id}: {e}"); + } + } else { + tracing::info!("Skipping recovery-frozen traffic state write for {server_id}"); } } From 
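// Why this patch re-reads the lock at every write site instead of snapshotting
// it once at the top of handle_agent_message: the handler awaits between
// sites, so a freeze taken mid-message must still gate the later writes.
// A hedged sketch of the hazard (the save_* helper names are assumed):
//
//     let writes_allowed = state.recovery_lock.writes_allowed_for(server_id); // snapshot
//     save_first_thing(&state, server_id).await;  // freeze may land during this await
//     if writes_allowed {
//         save_second_thing(&state, server_id).await; // stale snapshot: still writes
//     }
//
// versus the per-site form the hunks above switch to:
//
//     if state.recovery_lock.writes_allowed_for(server_id) {
//         save_second_thing(&state, server_id).await; // observes the freeze immediately
//     }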
09d8415eabffc05e2e878524ad4b72aff9ef0851 Mon Sep 17 00:00:00 2001 From: ZingerLittleBee <6970999@gmail.com> Date: Thu, 16 Apr 2026 20:41:46 +0800 Subject: [PATCH 28/60] fix(server): bridge recovery browser payload compatibility --- crates/server/src/router/ws/browser.rs | 2 ++ crates/server/src/service/agent_manager.rs | 1 + 2 files changed, 3 insertions(+) diff --git a/crates/server/src/router/ws/browser.rs b/crates/server/src/router/ws/browser.rs index 270fbba7..70219389 100644 --- a/crates/server/src/router/ws/browser.rs +++ b/crates/server/src/router/ws/browser.rs @@ -256,6 +256,7 @@ async fn build_full_sync(state: &Arc) -> BrowserMessage { return BrowserMessage::FullSync { servers: Vec::new(), upgrades: state.upgrade_tracker.snapshot(), + recoveries: Vec::new(), }; } }; @@ -355,6 +356,7 @@ async fn build_full_sync(state: &Arc) -> BrowserMessage { BrowserMessage::FullSync { servers: statuses, upgrades: state.upgrade_tracker.snapshot(), + recoveries: Vec::new(), } } diff --git a/crates/server/src/service/agent_manager.rs b/crates/server/src/service/agent_manager.rs index aee15e74..5eafb57d 100644 --- a/crates/server/src/service/agent_manager.rs +++ b/crates/server/src/service/agent_manager.rs @@ -235,6 +235,7 @@ impl AgentManager { let _ = self.browser_tx.send(BrowserMessage::Update { servers: vec![status], + recoveries: Vec::new(), }); // Cache the report From 926f34d48148d41d1f569cb965cec17389985750 Mon Sep 17 00:00:00 2001 From: ZingerLittleBee <6970999@gmail.com> Date: Thu, 16 Apr 2026 20:43:27 +0800 Subject: [PATCH 29/60] fix(server): gate docker cleanup persistence during recovery --- crates/server/src/router/ws/agent.rs | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/crates/server/src/router/ws/agent.rs b/crates/server/src/router/ws/agent.rs index 3a5c4fef..75f9f0fd 100644 --- a/crates/server/src/router/ws/agent.rs +++ b/crates/server/src/router/ws/agent.rs @@ -302,7 +302,12 @@ async fn handle_agent_ws( .agent_manager .remove_connection_if_current(&server_id, connection_id) { - crate::service::agent_manager::cleanup_disconnected_docker_state(&state, &server_id).await; + if state.recovery_lock.writes_allowed_for(&server_id) { + crate::service::agent_manager::cleanup_disconnected_docker_state(&state, &server_id) + .await; + } else { + tracing::info!("Skipping recovery-frozen docker disconnect write for {server_id}"); + } } write_task.abort(); tracing::info!("Agent {server_id} disconnected"); From 4b652d7d6c2916b9782fe05c9e755a06319058d2 Mon Sep 17 00:00:00 2001 From: ZingerLittleBee <6970999@gmail.com> Date: Thu, 16 Apr 2026 20:45:04 +0800 Subject: [PATCH 30/60] fix(common): tolerate unknown recovery status values --- crates/common/src/protocol.rs | 55 +++++++++++++++++++++++++++++++++-- 1 file changed, 53 insertions(+), 2 deletions(-) diff --git a/crates/common/src/protocol.rs b/crates/common/src/protocol.rs index 9d1ca17a..34619b69 100644 --- a/crates/common/src/protocol.rs +++ b/crates/common/src/protocol.rs @@ -29,7 +29,7 @@ pub enum UpgradeStatus { Timeout, } -#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)] +#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize)] #[serde(rename_all = "snake_case")] #[cfg_attr(feature = "utoipa", derive(utoipa::ToSchema))] pub enum RecoveryJobStatus { @@ -39,7 +39,7 @@ pub enum RecoveryJobStatus { Unknown, } -#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)] +#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize)] #[serde(rename_all = "snake_case")] #[cfg_attr(feature = "utoipa", 
derive(utoipa::ToSchema))]
 pub enum RecoveryJobStage {
@@ -54,6 +54,43 @@ pub enum RecoveryJobStage {
     Unknown,
 }
 
+impl<'de> Deserialize<'de> for RecoveryJobStatus {
+    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
+    where
+        D: serde::Deserializer<'de>,
+    {
+        let value = String::deserialize(deserializer)?;
+
+        Ok(match value.as_str() {
+            "running" => Self::Running,
+            "failed" => Self::Failed,
+            "succeeded" => Self::Succeeded,
+            _ => Self::Unknown,
+        })
+    }
+}
+
+impl<'de> Deserialize<'de> for RecoveryJobStage {
+    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
+    where
+        D: serde::Deserializer<'de>,
+    {
+        let value = String::deserialize(deserializer)?;
+
+        Ok(match value.as_str() {
+            "validating" => Self::Validating,
+            "rebinding" => Self::Rebinding,
+            "awaiting_target_online" => Self::AwaitingTargetOnline,
+            "freezing_writes" => Self::FreezingWrites,
+            "merging_history" => Self::MergingHistory,
+            "finalizing" => Self::Finalizing,
+            "succeeded" => Self::Succeeded,
+            "failed" => Self::Failed,
+            _ => Self::Unknown,
+        })
+    }
+}
+
 #[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
 #[cfg_attr(feature = "utoipa", derive(utoipa::ToSchema))]
 pub struct UpgradeJobDto {
@@ -1387,6 +1424,20 @@ mod tests {
         assert_eq!(parsed, dto);
     }
 
+    #[test]
+    fn test_recovery_job_status_unknown_deserializes_to_unknown() {
+        let status: RecoveryJobStatus = serde_json::from_str(r#""paused""#).unwrap();
+
+        assert_eq!(status, RecoveryJobStatus::Unknown);
+    }
+
+    #[test]
+    fn test_recovery_job_stage_unknown_deserializes_to_unknown() {
+        let stage: RecoveryJobStage = serde_json::from_str(r#""reconciling""#).unwrap();
+
+        assert_eq!(stage, RecoveryJobStage::Unknown);
+    }
+
     #[test]
     fn test_browser_full_sync_with_upgrades_round_trip() {
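// Usage sketch for the lenient decoding above: an older dashboard build can
// receive a stage string introduced by a newer server without a hard
// deserialization failure, because unrecognized values degrade to Unknown
// instead of rejecting the whole BrowserMessage. The "reconciling" value
// below is hypothetical, used only to stand in for a future stage name.
//
//     let stage: RecoveryJobStage = serde_json::from_str(r#""reconciling""#).unwrap();
//     assert_eq!(stage, RecoveryJobStage::Unknown);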
From 6c94b0d59d8c7937c45060c865cd0d6bae4f938b Mon Sep 17 00:00:00 2001
From: ZingerLittleBee <6970999@gmail.com>
Date: Thu, 16 Apr 2026 20:46:18 +0800
Subject: [PATCH 31/60] fix(server): keep traffic cache updates during recovery freeze

---
 crates/server/src/task/record_writer.rs | 13 +++++--------
 1 file changed, 5 insertions(+), 8 deletions(-)

diff --git a/crates/server/src/task/record_writer.rs b/crates/server/src/task/record_writer.rs
index 05894eb7..ce072369 100644
--- a/crates/server/src/task/record_writer.rs
+++ b/crates/server/src/task/record_writer.rs
@@ -43,13 +43,10 @@ pub async fn run(state: Arc<AppState>) {
     let mut count = 0;
 
     for (server_id, report) in &reports {
-        if !state.recovery_lock.writes_allowed_for(server_id) {
-            tracing::info!("Skipping recovery-frozen record write for {server_id}");
-            continue;
-        }
+        let writes_allowed = state.recovery_lock.writes_allowed_for(server_id);
 
         // Save metrics record
-        if state.recovery_lock.writes_allowed_for(server_id) {
+        if writes_allowed {
             if let Err(e) = RecordService::save_report(&state.db, server_id, report).await {
                 tracing::error!("Failed to save record for {server_id}: {e}");
             } else {
                 count += 1;
             }
@@ -71,7 +69,7 @@ pub async fn run(state: Arc<AppState>) {
         } else {
             // First observation: no previous state, skip delta (just record state)
             transfer_cache.insert(server_id.clone(), (curr_in, curr_out));
-            if state.recovery_lock.writes_allowed_for(server_id) {
+            if writes_allowed {
                 if let Err(e) = TrafficService::upsert_state(
                     &state.db,
                     server_id,
@@ -96,7 +93,7 @@ pub async fn run(state: Arc<AppState>) {
         // Only write if there's actual traffic
         if delta_in > 0 || delta_out > 0 {
-            if state.recovery_lock.writes_allowed_for(server_id) {
+            if writes_allowed {
                 if let Err(e) = TrafficService::upsert_hourly(
                     &state.db,
                     server_id,
@@ -114,7 +111,7 @@ pub async fn run(state: Arc<AppState>) {
         }
 
         // Always update state
-        if state.recovery_lock.writes_allowed_for(server_id) {
+        if writes_allowed {
             if let Err(e) =
                 TrafficService::upsert_state(&state.db, server_id, curr_in, curr_out).await
             {

From b4254feaee156b141fc70ec6928cfe5bbffc61d5 Mon Sep 17 00:00:00 2001
From: ZingerLittleBee <6970999@gmail.com>
Date: Thu, 16 Apr 2026 20:47:59 +0800
Subject: [PATCH 32/60] feat(server): add recovery merge service core

---
 crates/server/src/service/mod.rs            |   5 +-
 crates/server/src/service/recovery_merge.rs | 261 ++++++++++++++++++++
 2 files changed, 264 insertions(+), 2 deletions(-)
 create mode 100644 crates/server/src/service/recovery_merge.rs

diff --git a/crates/server/src/service/mod.rs b/crates/server/src/service/mod.rs
index d314ae36..1b11fa5b 100644
--- a/crates/server/src/service/mod.rs
+++ b/crates/server/src/service/mod.rs
@@ -19,14 +19,15 @@ pub mod notification;
 pub mod oauth;
 pub mod ping;
 pub mod record;
-pub mod recovery_lock;
 pub mod recovery_job;
+pub mod recovery_lock;
+pub mod recovery_merge;
 pub mod server;
 pub mod service_monitor;
 pub mod status_page;
 pub mod task_scheduler;
 pub mod traffic;
-pub mod uptime;
 pub mod upgrade_release;
 pub mod upgrade_tracker;
+pub mod uptime;
 pub mod user;
diff --git a/crates/server/src/service/recovery_merge.rs b/crates/server/src/service/recovery_merge.rs
new file mode 100644
index 00000000..cbc35fba
--- /dev/null
+++ b/crates/server/src/service/recovery_merge.rs
@@ -0,0 +1,261 @@
+use std::sync::Arc;
+
+use sea_orm::DatabaseConnection;
+
+use crate::entity::recovery_job;
+use crate::error::AppError;
+use crate::service::recovery_job::RecoveryJobService;
+use crate::state::AppState;
+
+pub const RECOVERY_STAGE_VALIDATING: &str = "validating";
+pub const RECOVERY_STAGE_REBINDING: &str = "rebinding";
+pub const RECOVERY_STAGE_AWAITING_TARGET_ONLINE: &str = "awaiting_target_online";
+
+pub struct RecoveryMergeService;
+
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub enum RecoveryFailurePhase {
+    PreRebind,
+    PostRebind,
+}
+
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub enum RecoveryRetryStrategy {
+    StartNewJob,
+    ResumeSameJob,
+}
+
+impl RecoveryMergeService {
+    pub async fn start(
+        state: &Arc<AppState>,
+        target_server_id: &str,
+        source_server_id: &str,
+    ) -> Result<recovery_job::Model, AppError> {
+        Self::start_on_db(&state.db, target_server_id, source_server_id).await
+    }
+
+    pub async fn handle_rebind_ack(
+        state: &Arc<AppState>,
+        job_id: &str,
+    ) -> Result<recovery_job::Model, AppError> {
+        Self::handle_rebind_ack_on_db(&state.db, job_id).await
+    }
+
+    pub async fn start_on_db(
+        db: &DatabaseConnection,
+        target_server_id: &str,
+        source_server_id: &str,
+    ) -> Result<recovery_job::Model, AppError> {
+        let running_target = RecoveryJobService::running_for_target(db, target_server_id).await?;
+        let running_source = RecoveryJobService::running_for_source(db, source_server_id).await?;
+
+        if let Some(job) = running_target {
+            if let Some(source_job) = &running_source
+                && source_job.job_id != job.job_id
+            {
+                return Err(AppError::Conflict(
+                    "A running recovery job already exists for this target or source".to_string(),
+                ));
+            }
+
+            if !is_pre_rebind_stage(job.stage.as_str()) {
+                return Err(AppError::Conflict(
+                    "Recovery job has already advanced past the rebind step".to_string(),
+                ));
+            }
+
+            return RecoveryJobService::update_stage(
+                db,
+                &job.job_id,
+                RECOVERY_STAGE_REBINDING,
+                None,
+                None,
+            )
+            .await;
+        }
+
+        if running_source.is_some() {
+            return Err(AppError::Conflict(
+                "A running recovery job already exists for this target or source".to_string(),
+            ));
+        }
+
+        let job =
RecoveryJobService::create_job(db, target_server_id, source_server_id).await?; + RecoveryJobService::update_stage(db, &job.job_id, RECOVERY_STAGE_REBINDING, None, None) + .await + } + + pub async fn handle_rebind_ack_on_db( + db: &DatabaseConnection, + job_id: &str, + ) -> Result { + let job = RecoveryJobService::get_job(db, job_id) + .await? + .ok_or_else(|| AppError::NotFound("Recovery job not found".to_string()))?; + + if job.status != "running" { + return Ok(job); + } + + if job.stage == RECOVERY_STAGE_AWAITING_TARGET_ONLINE { + return Ok(job); + } + + RecoveryJobService::update_stage( + db, + job_id, + RECOVERY_STAGE_AWAITING_TARGET_ONLINE, + None, + None, + ) + .await + } +} + +pub fn recovery_phase_for_stage(stage: &str) -> RecoveryFailurePhase { + match stage { + RECOVERY_STAGE_VALIDATING | RECOVERY_STAGE_REBINDING => RecoveryFailurePhase::PreRebind, + _ => RecoveryFailurePhase::PostRebind, + } +} + +pub fn is_pre_rebind_stage(stage: &str) -> bool { + matches!( + recovery_phase_for_stage(stage), + RecoveryFailurePhase::PreRebind + ) +} + +pub fn retry_strategy_for_phase(phase: RecoveryFailurePhase) -> RecoveryRetryStrategy { + match phase { + RecoveryFailurePhase::PreRebind => RecoveryRetryStrategy::StartNewJob, + RecoveryFailurePhase::PostRebind => RecoveryRetryStrategy::ResumeSameJob, + } +} + +pub fn retry_strategy_for_stage(stage: &str) -> RecoveryRetryStrategy { + retry_strategy_for_phase(recovery_phase_for_stage(stage)) +} + +#[cfg(test)] +mod tests { + use super::{ + RECOVERY_STAGE_AWAITING_TARGET_ONLINE, RECOVERY_STAGE_REBINDING, RecoveryFailurePhase, + RecoveryMergeService, RecoveryRetryStrategy, is_pre_rebind_stage, recovery_phase_for_stage, + retry_strategy_for_phase, retry_strategy_for_stage, + }; + use crate::service::recovery_job::RecoveryJobService; + use crate::test_utils::setup_test_db; + + #[test] + fn pre_rebind_phase_requires_new_job() { + assert_eq!( + retry_strategy_for_phase(RecoveryFailurePhase::PreRebind), + RecoveryRetryStrategy::StartNewJob + ); + assert_eq!( + retry_strategy_for_stage(RECOVERY_STAGE_REBINDING), + RecoveryRetryStrategy::StartNewJob + ); + assert_eq!( + recovery_phase_for_stage(RECOVERY_STAGE_REBINDING), + RecoveryFailurePhase::PreRebind + ); + } + + #[test] + fn post_rebind_phase_resumes_same_job() { + assert_eq!( + retry_strategy_for_phase(RecoveryFailurePhase::PostRebind), + RecoveryRetryStrategy::ResumeSameJob + ); + assert_eq!( + retry_strategy_for_stage(RECOVERY_STAGE_AWAITING_TARGET_ONLINE), + RecoveryRetryStrategy::ResumeSameJob + ); + assert_eq!( + recovery_phase_for_stage(RECOVERY_STAGE_AWAITING_TARGET_ONLINE), + RecoveryFailurePhase::PostRebind + ); + } + + #[tokio::test] + async fn start_persists_job_and_advances_to_rebinding() { + let (db, _tmp) = setup_test_db().await; + + let job = RecoveryMergeService::start_on_db(&db, "target-1", "source-1") + .await + .unwrap(); + + assert_eq!(job.target_server_id, "target-1"); + assert_eq!(job.source_server_id, "source-1"); + assert_eq!(job.status, "running"); + assert_eq!(job.stage, RECOVERY_STAGE_REBINDING); + assert!(job.last_heartbeat_at.is_some()); + + let loaded = RecoveryJobService::get_job(&db, &job.job_id) + .await + .unwrap() + .unwrap(); + assert_eq!(loaded.stage, RECOVERY_STAGE_REBINDING); + assert_eq!(loaded.status, "running"); + } + + #[tokio::test] + async fn start_reuses_existing_pre_rebind_job() { + let (db, _tmp) = setup_test_db().await; + + let first = RecoveryMergeService::start_on_db(&db, "target-1", "source-1") + .await + .unwrap(); + let second = 
RecoveryMergeService::start_on_db(&db, "target-1", "source-1") + .await + .unwrap(); + + assert_eq!(second.job_id, first.job_id); + assert_eq!(second.stage, RECOVERY_STAGE_REBINDING); + assert!(is_pre_rebind_stage(second.stage.as_str())); + } + + #[tokio::test] + async fn rebind_ack_advances_to_waiting_for_target_online() { + let (db, _tmp) = setup_test_db().await; + + let job = RecoveryMergeService::start_on_db(&db, "target-1", "source-1") + .await + .unwrap(); + + let updated = RecoveryMergeService::handle_rebind_ack_on_db(&db, &job.job_id) + .await + .unwrap(); + + assert_eq!(updated.job_id, job.job_id); + assert_eq!(updated.stage, RECOVERY_STAGE_AWAITING_TARGET_ONLINE); + assert_eq!(updated.status, "running"); + + let loaded = RecoveryJobService::get_job(&db, &job.job_id) + .await + .unwrap() + .unwrap(); + assert_eq!(loaded.stage, RECOVERY_STAGE_AWAITING_TARGET_ONLINE); + } + + #[tokio::test] + async fn rebind_ack_is_idempotent_once_advanced() { + let (db, _tmp) = setup_test_db().await; + + let job = RecoveryMergeService::start_on_db(&db, "target-1", "source-1") + .await + .unwrap(); + let _ = RecoveryMergeService::handle_rebind_ack_on_db(&db, &job.job_id) + .await + .unwrap(); + + let updated = RecoveryMergeService::handle_rebind_ack_on_db(&db, &job.job_id) + .await + .unwrap(); + + assert_eq!(updated.stage, RECOVERY_STAGE_AWAITING_TARGET_ONLINE); + assert_eq!(updated.status, "running"); + } +} From 14927c0b569a93fb89b5d3f04c37bf82f84b7f65 Mon Sep 17 00:00:00 2001 From: ZingerLittleBee <6970999@gmail.com> Date: Thu, 16 Apr 2026 20:49:36 +0800 Subject: [PATCH 33/60] fix(common): avoid wiping recovery state on normal updates --- crates/common/src/protocol.rs | 44 ++++++---------------- crates/server/src/service/agent_manager.rs | 2 +- 2 files changed, 12 insertions(+), 34 deletions(-) diff --git a/crates/common/src/protocol.rs b/crates/common/src/protocol.rs index 34619b69..e9a7b5b1 100644 --- a/crates/common/src/protocol.rs +++ b/crates/common/src/protocol.rs @@ -456,8 +456,8 @@ pub enum BrowserMessage { }, Update { servers: Vec, - #[serde(default)] - recoveries: Vec, + #[serde(default, skip_serializing_if = "Option::is_none")] + recoveries: Option>, }, ServerOnline { server_id: String, @@ -1560,44 +1560,22 @@ mod tests { } #[test] - fn test_browser_update_round_trip_with_recoveries() { + fn test_browser_update_omits_recoveries_when_none() { let msg = BrowserMessage::Update { servers: vec![], - recoveries: vec![RecoveryJobDto { - job_id: "recovery-2".to_string(), - target_server_id: "target-2".to_string(), - source_server_id: "source-2".to_string(), - status: RecoveryJobStatus::Succeeded, - stage: RecoveryJobStage::Succeeded, - error: None, - started_at: chrono::DateTime::parse_from_rfc3339("2026-04-16T02:00:00Z") - .unwrap() - .with_timezone(&chrono::Utc), - created_at: chrono::DateTime::parse_from_rfc3339("2026-04-16T01:59:00Z") - .unwrap() - .with_timezone(&chrono::Utc), - updated_at: chrono::DateTime::parse_from_rfc3339("2026-04-16T02:10:00Z") - .unwrap() - .with_timezone(&chrono::Utc), - last_heartbeat_at: None, - }], + recoveries: None, }; let json = serde_json::to_string(&msg).unwrap(); - let parsed: BrowserMessage = serde_json::from_str(&json).unwrap(); + let value: serde_json::Value = serde_json::from_str(&json).unwrap(); + assert_eq!(value["type"], "update"); + assert_eq!(value["servers"], serde_json::json!([])); + assert!(value.get("recoveries").is_none()); - match parsed { - BrowserMessage::Update { - servers, - recoveries, - } => { + match 
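// The compatibility point of this patch, sketched: `recoveries: None` means
// "no recovery change in this update" and is omitted from the JSON entirely,
// so high-frequency server status updates no longer clobber browser-cached
// recovery state; only FullSync and recovery-bearing updates carry the field.
// The `tracker_snapshot` value below is assumed for illustration.
//
//     // plain metrics tick: leave browser recovery state alone
//     BrowserMessage::Update { servers: vec![status], recoveries: None }
//     // recovery state changed: broadcast the new snapshot
//     BrowserMessage::Update { servers: vec![], recoveries: Some(tracker_snapshot) }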
serde_json::from_str::(&json).unwrap() { + BrowserMessage::Update { servers, recoveries } => { assert!(servers.is_empty()); - assert_eq!(recoveries.len(), 1); - assert_eq!(recoveries[0].job_id, "recovery-2"); - assert_eq!(recoveries[0].status, RecoveryJobStatus::Succeeded); - assert_eq!(recoveries[0].stage, RecoveryJobStage::Succeeded); - assert!(recoveries[0].error.is_none()); - assert!(recoveries[0].last_heartbeat_at.is_none()); + assert!(recoveries.is_none()); } _ => panic!("Expected Update"), } diff --git a/crates/server/src/service/agent_manager.rs b/crates/server/src/service/agent_manager.rs index 5eafb57d..fb949edf 100644 --- a/crates/server/src/service/agent_manager.rs +++ b/crates/server/src/service/agent_manager.rs @@ -235,7 +235,7 @@ impl AgentManager { let _ = self.browser_tx.send(BrowserMessage::Update { servers: vec![status], - recoveries: Vec::new(), + recoveries: None, }); // Cache the report From d0f0f93778e923f38d62b3570f65ab3bf8903bed Mon Sep 17 00:00:00 2001 From: ZingerLittleBee <6970999@gmail.com> Date: Thu, 16 Apr 2026 20:50:52 +0800 Subject: [PATCH 34/60] fix(server): harden recovery merge stage semantics --- crates/server/src/service/recovery_merge.rs | 78 +++++++++++++++++---- 1 file changed, 64 insertions(+), 14 deletions(-) diff --git a/crates/server/src/service/recovery_merge.rs b/crates/server/src/service/recovery_merge.rs index cbc35fba..c1d73491 100644 --- a/crates/server/src/service/recovery_merge.rs +++ b/crates/server/src/service/recovery_merge.rs @@ -50,6 +50,12 @@ impl RecoveryMergeService { let running_source = RecoveryJobService::running_for_source(db, source_server_id).await?; if let Some(job) = running_target { + if job.source_server_id != source_server_id { + return Err(AppError::Conflict( + "A running recovery job already exists for this target or source".to_string(), + )); + } + if let Some(source_job) = &running_source && source_job.job_id != job.job_id { @@ -97,7 +103,7 @@ impl RecoveryMergeService { return Ok(job); } - if job.stage == RECOVERY_STAGE_AWAITING_TARGET_ONLINE { + if job.stage != RECOVERY_STAGE_REBINDING { return Ok(job); } @@ -112,18 +118,16 @@ impl RecoveryMergeService { } } -pub fn recovery_phase_for_stage(stage: &str) -> RecoveryFailurePhase { +pub fn recovery_phase_for_stage(stage: &str) -> Option { match stage { - RECOVERY_STAGE_VALIDATING | RECOVERY_STAGE_REBINDING => RecoveryFailurePhase::PreRebind, - _ => RecoveryFailurePhase::PostRebind, + RECOVERY_STAGE_VALIDATING | RECOVERY_STAGE_REBINDING => Some(RecoveryFailurePhase::PreRebind), + RECOVERY_STAGE_AWAITING_TARGET_ONLINE => Some(RecoveryFailurePhase::PostRebind), + _ => None, } } pub fn is_pre_rebind_stage(stage: &str) -> bool { - matches!( - recovery_phase_for_stage(stage), - RecoveryFailurePhase::PreRebind - ) + matches!(recovery_phase_for_stage(stage), Some(RecoveryFailurePhase::PreRebind)) } pub fn retry_strategy_for_phase(phase: RecoveryFailurePhase) -> RecoveryRetryStrategy { @@ -133,8 +137,8 @@ pub fn retry_strategy_for_phase(phase: RecoveryFailurePhase) -> RecoveryRetryStr } } -pub fn retry_strategy_for_stage(stage: &str) -> RecoveryRetryStrategy { - retry_strategy_for_phase(recovery_phase_for_stage(stage)) +pub fn retry_strategy_for_stage(stage: &str) -> Option { + recovery_phase_for_stage(stage).map(retry_strategy_for_phase) } #[cfg(test)] @@ -144,6 +148,7 @@ mod tests { RecoveryMergeService, RecoveryRetryStrategy, is_pre_rebind_stage, recovery_phase_for_stage, retry_strategy_for_phase, retry_strategy_for_stage, }; + use crate::error::AppError; use 
crate::service::recovery_job::RecoveryJobService; use crate::test_utils::setup_test_db; @@ -155,12 +160,14 @@ mod tests { ); assert_eq!( retry_strategy_for_stage(RECOVERY_STAGE_REBINDING), - RecoveryRetryStrategy::StartNewJob + Some(RecoveryRetryStrategy::StartNewJob) ); assert_eq!( recovery_phase_for_stage(RECOVERY_STAGE_REBINDING), - RecoveryFailurePhase::PreRebind + Some(RecoveryFailurePhase::PreRebind) ); + assert_eq!(retry_strategy_for_stage("unknown"), None); + assert_eq!(recovery_phase_for_stage("unknown"), None); } #[test] @@ -171,11 +178,11 @@ mod tests { ); assert_eq!( retry_strategy_for_stage(RECOVERY_STAGE_AWAITING_TARGET_ONLINE), - RecoveryRetryStrategy::ResumeSameJob + Some(RecoveryRetryStrategy::ResumeSameJob) ); assert_eq!( recovery_phase_for_stage(RECOVERY_STAGE_AWAITING_TARGET_ONLINE), - RecoveryFailurePhase::PostRebind + Some(RecoveryFailurePhase::PostRebind) ); } @@ -217,6 +224,25 @@ mod tests { assert!(is_pre_rebind_stage(second.stage.as_str())); } + #[tokio::test] + async fn start_rejects_existing_target_job_for_different_source() { + let (db, _tmp) = setup_test_db().await; + + let first = RecoveryMergeService::start_on_db(&db, "target-1", "source-1") + .await + .unwrap(); + + let result = RecoveryMergeService::start_on_db(&db, "target-1", "source-2").await; + assert!(matches!(result, Err(AppError::Conflict(_)))); + + let loaded = RecoveryJobService::get_job(&db, &first.job_id) + .await + .unwrap() + .unwrap(); + assert_eq!(loaded.source_server_id, "source-1"); + assert_eq!(loaded.stage, RECOVERY_STAGE_REBINDING); + } + #[tokio::test] async fn rebind_ack_advances_to_waiting_for_target_online() { let (db, _tmp) = setup_test_db().await; @@ -258,4 +284,28 @@ mod tests { assert_eq!(updated.stage, RECOVERY_STAGE_AWAITING_TARGET_ONLINE); assert_eq!(updated.status, "running"); } + + #[tokio::test] + async fn rebind_ack_ignores_wrong_stage() { + let (db, _tmp) = setup_test_db().await; + + let job = RecoveryMergeService::start_on_db(&db, "target-1", "source-1") + .await + .unwrap(); + RecoveryJobService::update_stage(&db, &job.job_id, "validating", None, None) + .await + .unwrap(); + + let updated = RecoveryMergeService::handle_rebind_ack_on_db(&db, &job.job_id) + .await + .unwrap(); + + assert_eq!(updated.stage, "validating"); + + let loaded = RecoveryJobService::get_job(&db, &job.job_id) + .await + .unwrap() + .unwrap(); + assert_eq!(loaded.stage, "validating"); + } } From 952be84bc29eafc6f2e125b4483fa886c8dac075 Mon Sep 17 00:00:00 2001 From: ZingerLittleBee <6970999@gmail.com> Date: Thu, 16 Apr 2026 20:52:28 +0800 Subject: [PATCH 35/60] fix(server): tighten recovery write freeze behavior --- crates/server/src/router/ws/agent.rs | 7 +------ crates/server/src/service/agent_manager.rs | 17 ++++++++++++++--- crates/server/src/task/record_writer.rs | 10 ++++------ 3 files changed, 19 insertions(+), 15 deletions(-) diff --git a/crates/server/src/router/ws/agent.rs b/crates/server/src/router/ws/agent.rs index 75f9f0fd..3a5c4fef 100644 --- a/crates/server/src/router/ws/agent.rs +++ b/crates/server/src/router/ws/agent.rs @@ -302,12 +302,7 @@ async fn handle_agent_ws( .agent_manager .remove_connection_if_current(&server_id, connection_id) { - if state.recovery_lock.writes_allowed_for(&server_id) { - crate::service::agent_manager::cleanup_disconnected_docker_state(&state, &server_id) - .await; - } else { - tracing::info!("Skipping recovery-frozen docker disconnect write for {server_id}"); - } + crate::service::agent_manager::cleanup_disconnected_docker_state(&state, 
&server_id).await; } write_task.abort(); tracing::info!("Agent {server_id} disconnected"); diff --git a/crates/server/src/service/agent_manager.rs b/crates/server/src/service/agent_manager.rs index fb949edf..fa3d7682 100644 --- a/crates/server/src/service/agent_manager.rs +++ b/crates/server/src/service/agent_manager.rs @@ -660,10 +660,21 @@ pub async fn cleanup_disconnected_docker_state(state: &AppState, server_id: &str let mut features = state.agent_manager.get_features(server_id); features.retain(|feature| feature != "docker"); - let _ = crate::service::server::ServerService::update_features(&state.db, server_id, &features) - .await; - + let persisted_features = features.clone(); state.agent_manager.update_features(server_id, features); + + if state.recovery_lock.writes_allowed_for(server_id) { + let _ = + crate::service::server::ServerService::update_features( + &state.db, + server_id, + &persisted_features, + ) + .await; + } else { + tracing::info!("Skipping recovery-frozen docker feature write for {server_id}"); + } + state .agent_manager .broadcast_browser(BrowserMessage::DockerAvailabilityChanged { diff --git a/crates/server/src/task/record_writer.rs b/crates/server/src/task/record_writer.rs index ce072369..c6971d0f 100644 --- a/crates/server/src/task/record_writer.rs +++ b/crates/server/src/task/record_writer.rs @@ -43,10 +43,8 @@ pub async fn run(state: Arc) { let mut count = 0; for (server_id, report) in &reports { - let writes_allowed = state.recovery_lock.writes_allowed_for(server_id); - // Save metrics record - if writes_allowed { + if state.recovery_lock.writes_allowed_for(server_id) { if let Err(e) = RecordService::save_report(&state.db, server_id, report).await { tracing::error!("Failed to save record for {server_id}: {e}"); } else { @@ -71,7 +69,7 @@ pub async fn run(state: Arc) { } else { // First observation: no previous state, skip delta (just record state) transfer_cache.insert(server_id.clone(), (curr_in, curr_out)); - if writes_allowed { + if state.recovery_lock.writes_allowed_for(server_id) { if let Err(e) = TrafficService::upsert_state( &state.db, server_id, @@ -93,7 +91,7 @@ pub async fn run(state: Arc) { // Only write if there's actual traffic if delta_in > 0 || delta_out > 0 { - if writes_allowed { + if state.recovery_lock.writes_allowed_for(server_id) { if let Err(e) = TrafficService::upsert_hourly( &state.db, server_id, @@ -111,7 +109,7 @@ pub async fn run(state: Arc) { } // Always update state - if writes_allowed { + if state.recovery_lock.writes_allowed_for(server_id) { if let Err(e) = TrafficService::upsert_state(&state.db, server_id, curr_in, curr_out).await { From 91ab659272e06e1815811603fed6332332018e5a Mon Sep 17 00:00:00 2001 From: ZingerLittleBee <6970999@gmail.com> Date: Thu, 16 Apr 2026 20:54:07 +0800 Subject: [PATCH 36/60] fix(server): finish recovery lock write-site coverage --- crates/server/src/router/ws/agent.rs | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/crates/server/src/router/ws/agent.rs b/crates/server/src/router/ws/agent.rs index 3a5c4fef..331231a6 100644 --- a/crates/server/src/router/ws/agent.rs +++ b/crates/server/src/router/ws/agent.rs @@ -485,8 +485,11 @@ async fn handle_agent_message(state: &Arc, server_id: &str, msg: Agent { tracing::error!("Failed to update system info for {server_id}: {e}"); } + } else { + tracing::info!("Skipping recovery-frozen system-info write for {server_id}"); + } - // Persist and cache features from SystemInfo + if state.recovery_lock.writes_allowed_for(server_id) { let _ 
= crate::service::server::ServerService::update_features( &state.db, server_id, @@ -1188,11 +1191,7 @@ async fn handle_docker_unavailable(state: &Arc, server_id: &str) { .remove_docker_log_sessions_for_server(server_id); // Shared cleanup: viewer tracker, features, DB persist, browser broadcast. - if state.recovery_lock.writes_allowed_for(server_id) { - crate::service::agent_manager::cleanup_disconnected_docker_state(state, server_id).await; - } else { - tracing::info!("Skipping recovery-frozen docker unavailable write for {server_id}"); - } + crate::service::agent_manager::cleanup_disconnected_docker_state(state, server_id).await; } async fn send_server_message( From 840caf4fa03338086dbafb7fdefe9a70ad5caece Mon Sep 17 00:00:00 2001 From: ZingerLittleBee <6970999@gmail.com> Date: Thu, 16 Apr 2026 20:55:43 +0800 Subject: [PATCH 37/60] fix(server): tighten recovery merge service boundaries --- crates/server/src/service/recovery_merge.rs | 251 ++++++++++++++++++-- 1 file changed, 232 insertions(+), 19 deletions(-) diff --git a/crates/server/src/service/recovery_merge.rs b/crates/server/src/service/recovery_merge.rs index c1d73491..e0a36aae 100644 --- a/crates/server/src/service/recovery_merge.rs +++ b/crates/server/src/service/recovery_merge.rs @@ -1,8 +1,9 @@ use std::sync::Arc; use sea_orm::DatabaseConnection; +use sea_orm::EntityTrait; -use crate::entity::recovery_job; +use crate::entity::{recovery_job, server}; use crate::error::AppError; use crate::service::recovery_job::RecoveryJobService; use crate::state::AppState; @@ -31,21 +32,78 @@ impl RecoveryMergeService { target_server_id: &str, source_server_id: &str, ) -> Result { + Self::validate_start_request(state, target_server_id, source_server_id).await?; Self::start_on_db(&state.db, target_server_id, source_server_id).await } pub async fn handle_rebind_ack( state: &Arc, job_id: &str, + acking_server_id: &str, ) -> Result { - Self::handle_rebind_ack_on_db(&state.db, job_id).await + Self::handle_rebind_ack_on_db(&state.db, job_id, acking_server_id).await } - pub async fn start_on_db( + async fn validate_start_request( + state: &Arc, + target_server_id: &str, + source_server_id: &str, + ) -> Result<(), AppError> { + if source_server_id == target_server_id { + return Err(AppError::Validation( + "source_server_id must be different from target_id".to_string(), + )); + } + + let target = server::Entity::find_by_id(target_server_id) + .one(&state.db) + .await? + .ok_or_else(|| AppError::NotFound("Server not found".to_string()))?; + let source = server::Entity::find_by_id(source_server_id) + .one(&state.db) + .await? + .ok_or_else(|| AppError::NotFound("Server not found".to_string()))?; + + if state.agent_manager.is_online(&target.id) { + return Err(AppError::Conflict( + "Target server must be offline before starting recovery".to_string(), + )); + } + + if !state.agent_manager.is_online(&source.id) { + return Err(AppError::Conflict( + "Source server must be online before starting recovery".to_string(), + )); + } + + Ok(()) + } + + async fn start_on_db( db: &DatabaseConnection, target_server_id: &str, source_server_id: &str, ) -> Result { + if let Some(existing) = + Self::find_reusable_start_job(db, target_server_id, source_server_id).await? 
+ { + return Self::advance_job_to_rebinding(db, existing).await; + } + + match RecoveryJobService::create_job(db, target_server_id, source_server_id).await { + Ok(job) => Self::advance_job_to_rebinding(db, job).await, + Err(AppError::Conflict(_)) => { + Self::recover_duplicate_start(db, target_server_id, source_server_id).await + } + Err(err) => Err(err), + } + } + + async fn find_reusable_start_job( + db: &DatabaseConnection, + target_server_id: &str, + source_server_id: &str, + ) -> Result, AppError> { let running_target = RecoveryJobService::running_for_target(db, target_server_id).await?; let running_source = RecoveryJobService::running_for_source(db, source_server_id).await?; @@ -70,14 +128,7 @@ impl RecoveryMergeService { )); } - return RecoveryJobService::update_stage( - db, - &job.job_id, - RECOVERY_STAGE_REBINDING, - None, - None, - ) - .await; + return Ok(Some(job)); } if running_source.is_some() { @@ -86,19 +137,47 @@ impl RecoveryMergeService { )); } - let job = RecoveryJobService::create_job(db, target_server_id, source_server_id).await?; + Ok(None) + } + + async fn recover_duplicate_start( + db: &DatabaseConnection, + target_server_id: &str, + source_server_id: &str, + ) -> Result { + match Self::find_reusable_start_job(db, target_server_id, source_server_id).await? { + Some(job) => Self::advance_job_to_rebinding(db, job).await, + None => Err(AppError::Conflict( + "A running recovery job already exists for this target or source".to_string(), + )), + } + } + + async fn advance_job_to_rebinding( + db: &DatabaseConnection, + job: recovery_job::Model, + ) -> Result { + if job.stage == RECOVERY_STAGE_REBINDING { + return Ok(job); + } + RecoveryJobService::update_stage(db, &job.job_id, RECOVERY_STAGE_REBINDING, None, None) .await } - pub async fn handle_rebind_ack_on_db( + async fn handle_rebind_ack_on_db( db: &DatabaseConnection, job_id: &str, + acking_server_id: &str, ) -> Result { let job = RecoveryJobService::get_job(db, job_id) .await? 
.ok_or_else(|| AppError::NotFound("Recovery job not found".to_string()))?; + if job.source_server_id != acking_server_id { + return Ok(job); + } + if job.status != "running" { return Ok(job); } @@ -120,14 +199,19 @@ impl RecoveryMergeService { pub fn recovery_phase_for_stage(stage: &str) -> Option { match stage { - RECOVERY_STAGE_VALIDATING | RECOVERY_STAGE_REBINDING => Some(RecoveryFailurePhase::PreRebind), + RECOVERY_STAGE_VALIDATING | RECOVERY_STAGE_REBINDING => { + Some(RecoveryFailurePhase::PreRebind) + } RECOVERY_STAGE_AWAITING_TARGET_ONLINE => Some(RecoveryFailurePhase::PostRebind), _ => None, } } pub fn is_pre_rebind_stage(stage: &str) -> bool { - matches!(recovery_phase_for_stage(stage), Some(RecoveryFailurePhase::PreRebind)) + matches!( + recovery_phase_for_stage(stage), + Some(RecoveryFailurePhase::PreRebind) + ) } pub fn retry_strategy_for_phase(phase: RecoveryFailurePhase) -> RecoveryRetryStrategy { @@ -148,9 +232,66 @@ mod tests { RecoveryMergeService, RecoveryRetryStrategy, is_pre_rebind_stage, recovery_phase_for_stage, retry_strategy_for_phase, retry_strategy_for_stage, }; + use crate::config::AppConfig; + use crate::entity::server; use crate::error::AppError; + use crate::service::auth::AuthService; use crate::service::recovery_job::RecoveryJobService; + use crate::state::AppState; use crate::test_utils::setup_test_db; + use chrono::Utc; + use sea_orm::{ActiveModelTrait, DatabaseConnection, Set}; + use serverbee_common::constants::CAP_DEFAULT; + use std::net::{IpAddr, Ipv4Addr, SocketAddr}; + use std::sync::Arc; + use tempfile::TempDir; + use tokio::sync::mpsc; + + async fn insert_test_server(db: &DatabaseConnection, id: &str, name: &str) { + let token_hash = AuthService::hash_password("test").expect("hash_password should succeed"); + let now = Utc::now(); + server::ActiveModel { + id: Set(id.to_string()), + token_hash: Set(token_hash), + token_prefix: Set("serverbee_test".to_string()), + name: Set(name.to_string()), + weight: Set(0), + hidden: Set(false), + capabilities: Set(CAP_DEFAULT as i32), + protocol_version: Set(1), + created_at: Set(now), + updated_at: Set(now), + ..Default::default() + } + .insert(db) + .await + .expect("insert test server should succeed"); + } + + async fn test_state_with_servers() -> (Arc, TempDir) { + let (db, tmp) = setup_test_db().await; + insert_test_server(&db, "target-1", "Target").await; + insert_test_server(&db, "source-1", "Source").await; + insert_test_server(&db, "source-2", "Source 2").await; + let state = AppState::new(db, AppConfig::default()) + .await + .expect("app state should initialize"); + (state, tmp) + } + + fn test_addr() -> SocketAddr { + SocketAddr::new(IpAddr::V4(Ipv4Addr::new(127, 0, 0, 1)), 9527) + } + + fn mark_online(state: &Arc, server_id: &str) { + let (tx, _) = mpsc::channel(1); + state.agent_manager.add_connection( + server_id.to_string(), + server_id.to_string(), + tx, + test_addr(), + ); + } #[test] fn pre_rebind_phase_requires_new_job() { @@ -251,7 +392,7 @@ mod tests { .await .unwrap(); - let updated = RecoveryMergeService::handle_rebind_ack_on_db(&db, &job.job_id) + let updated = RecoveryMergeService::handle_rebind_ack_on_db(&db, &job.job_id, "source-1") .await .unwrap(); @@ -273,11 +414,11 @@ mod tests { let job = RecoveryMergeService::start_on_db(&db, "target-1", "source-1") .await .unwrap(); - let _ = RecoveryMergeService::handle_rebind_ack_on_db(&db, &job.job_id) + let _ = RecoveryMergeService::handle_rebind_ack_on_db(&db, &job.job_id, "source-1") .await .unwrap(); - let updated = 
RecoveryMergeService::handle_rebind_ack_on_db(&db, &job.job_id) + let updated = RecoveryMergeService::handle_rebind_ack_on_db(&db, &job.job_id, "source-1") .await .unwrap(); @@ -296,7 +437,7 @@ mod tests { .await .unwrap(); - let updated = RecoveryMergeService::handle_rebind_ack_on_db(&db, &job.job_id) + let updated = RecoveryMergeService::handle_rebind_ack_on_db(&db, &job.job_id, "source-1") .await .unwrap(); @@ -308,4 +449,76 @@ mod tests { .unwrap(); assert_eq!(loaded.stage, "validating"); } + + #[tokio::test] + async fn rebind_ack_from_wrong_source_is_ignored() { + let (db, _tmp) = setup_test_db().await; + + let job = RecoveryMergeService::start_on_db(&db, "target-1", "source-1") + .await + .unwrap(); + + let updated = RecoveryMergeService::handle_rebind_ack_on_db(&db, &job.job_id, "source-2") + .await + .unwrap(); + + assert_eq!(updated.job_id, job.job_id); + assert_eq!(updated.stage, RECOVERY_STAGE_REBINDING); + + let loaded = RecoveryJobService::get_job(&db, &job.job_id) + .await + .unwrap() + .unwrap(); + assert_eq!(loaded.stage, RECOVERY_STAGE_REBINDING); + } + + #[tokio::test] + async fn start_rejects_self_merge_at_service_boundary() { + let (state, _tmp) = test_state_with_servers().await; + mark_online(&state, "source-1"); + + let result = RecoveryMergeService::start(&state, "target-1", "target-1").await; + + assert!(matches!(result, Err(AppError::Validation(_)))); + } + + #[tokio::test] + async fn start_rejects_online_target_at_service_boundary() { + let (state, _tmp) = test_state_with_servers().await; + mark_online(&state, "target-1"); + mark_online(&state, "source-1"); + + let result = RecoveryMergeService::start(&state, "target-1", "source-1").await; + + assert!( + matches!(result, Err(AppError::Conflict(message)) if message.contains("Target server must be offline")) + ); + } + + #[tokio::test] + async fn start_rejects_offline_source_at_service_boundary() { + let (state, _tmp) = test_state_with_servers().await; + + let result = RecoveryMergeService::start(&state, "target-1", "source-1").await; + + assert!( + matches!(result, Err(AppError::Conflict(message)) if message.contains("Source server must be online")) + ); + } + + #[tokio::test] + async fn duplicate_start_conflict_reuses_matching_pre_rebind_job() { + let (db, _tmp) = setup_test_db().await; + + let first = RecoveryJobService::create_job(&db, "target-1", "source-1") + .await + .unwrap(); + + let reused = RecoveryMergeService::recover_duplicate_start(&db, "target-1", "source-1") + .await + .unwrap(); + + assert_eq!(reused.job_id, first.job_id); + assert_eq!(reused.stage, RECOVERY_STAGE_REBINDING); + } } From 9837a40b370f7c02f4289d0a5ca34c2548f04ad8 Mon Sep 17 00:00:00 2001 From: ZingerLittleBee <6970999@gmail.com> Date: Thu, 16 Apr 2026 20:57:19 +0800 Subject: [PATCH 38/60] fix(server): prevent recovery stage regression on retry --- crates/server/src/service/recovery_merge.rs | 70 +++++++++++++++++---- 1 file changed, 58 insertions(+), 12 deletions(-) diff --git a/crates/server/src/service/recovery_merge.rs b/crates/server/src/service/recovery_merge.rs index e0a36aae..83f6d145 100644 --- a/crates/server/src/service/recovery_merge.rs +++ b/crates/server/src/service/recovery_merge.rs @@ -1,7 +1,8 @@ use std::sync::Arc; -use sea_orm::DatabaseConnection; -use sea_orm::EntityTrait; +use chrono::Utc; +use sea_orm::prelude::Expr; +use sea_orm::{ColumnTrait, DatabaseConnection, EntityTrait, QueryFilter}; use crate::entity::{recovery_job, server}; use crate::error::AppError; @@ -122,12 +123,6 @@ impl RecoveryMergeService { )); } 
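The guard this hunk is about to remove leaned on the stage-to-phase mapping shown earlier: pre-rebind failures require a brand-new job, while post-rebind failures resume the existing one. That mapping is small enough to restate as a compact, runnable sketch (same stage strings as the tracker; the `main` here is purely illustrative):

```rust
#[derive(Debug, Clone, Copy, PartialEq)]
enum RecoveryFailurePhase {
    PreRebind,
    PostRebind,
}

// Stages before the agent has acknowledged the rebind are recoverable only
// by starting over; once the job is waiting for the target to come online,
// the identity switch has happened and retries resume the same job.
fn recovery_phase_for_stage(stage: &str) -> Option<RecoveryFailurePhase> {
    match stage {
        "validating" | "rebinding" => Some(RecoveryFailurePhase::PreRebind),
        "awaiting_target_online" => Some(RecoveryFailurePhase::PostRebind),
        _ => None, // terminal stages carry no retry phase
    }
}

fn main() {
    assert_eq!(
        recovery_phase_for_stage("rebinding"),
        Some(RecoveryFailurePhase::PreRebind)
    );
    assert_eq!(recovery_phase_for_stage("succeeded"), None);
}
```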
- if !is_pre_rebind_stage(job.stage.as_str()) { - return Err(AppError::Conflict( - "Recovery job has already advanced past the rebind step".to_string(), - )); - } - return Ok(Some(job)); } @@ -157,12 +152,33 @@ impl RecoveryMergeService { db: &DatabaseConnection, job: recovery_job::Model, ) -> Result { - if job.stage == RECOVERY_STAGE_REBINDING { - return Ok(job); + let now = Utc::now(); + let result = recovery_job::Entity::update_many() + .col_expr( + recovery_job::Column::Stage, + Expr::value(RECOVERY_STAGE_REBINDING), + ) + .col_expr(recovery_job::Column::CheckpointJson, Expr::value(None::)) + .col_expr(recovery_job::Column::Error, Expr::value(None::)) + .col_expr(recovery_job::Column::UpdatedAt, Expr::value(now)) + .col_expr(recovery_job::Column::LastHeartbeatAt, Expr::value(Some(now))) + .filter(recovery_job::Column::JobId.eq(&job.job_id)) + .filter(recovery_job::Column::Stage.is_in([ + RECOVERY_STAGE_VALIDATING, + RECOVERY_STAGE_REBINDING, + ])) + .exec(db) + .await?; + + if result.rows_affected == 0 { + return RecoveryJobService::get_job(db, &job.job_id) + .await? + .ok_or_else(|| AppError::NotFound("Recovery job not found".to_string())); } - RecoveryJobService::update_stage(db, &job.job_id, RECOVERY_STAGE_REBINDING, None, None) - .await + RecoveryJobService::get_job(db, &job.job_id) + .await? + .ok_or_else(|| AppError::NotFound("Recovery job not found".to_string())) } async fn handle_rebind_ack_on_db( @@ -521,4 +537,34 @@ mod tests { assert_eq!(reused.job_id, first.job_id); assert_eq!(reused.stage, RECOVERY_STAGE_REBINDING); } + + #[tokio::test] + async fn reusable_start_keeps_latest_stage_when_rebind_ack_wins_race() { + let (db, _tmp) = setup_test_db().await; + + let stale_job = RecoveryMergeService::start_on_db(&db, "target-1", "source-1") + .await + .unwrap(); + let acknowledged = RecoveryMergeService::handle_rebind_ack_on_db( + &db, + &stale_job.job_id, + "source-1", + ) + .await + .unwrap(); + assert_eq!(acknowledged.stage, RECOVERY_STAGE_AWAITING_TARGET_ONLINE); + + let advanced = RecoveryMergeService::advance_job_to_rebinding(&db, stale_job) + .await + .unwrap(); + + assert_eq!(advanced.job_id, acknowledged.job_id); + assert_eq!(advanced.stage, RECOVERY_STAGE_AWAITING_TARGET_ONLINE); + + let loaded = RecoveryJobService::get_job(&db, &advanced.job_id) + .await + .unwrap() + .unwrap(); + assert_eq!(loaded.stage, RECOVERY_STAGE_AWAITING_TARGET_ONLINE); + } } From 13170d30d77f117df724d7ed5296f1fbeea20ec8 Mon Sep 17 00:00:00 2001 From: ZingerLittleBee <6970999@gmail.com> Date: Thu, 16 Apr 2026 20:58:44 +0800 Subject: [PATCH 39/60] fix(server): keep recovery merge transitions monotonic --- crates/server/src/service/recovery_merge.rs | 97 ++++++++++++++++----- 1 file changed, 76 insertions(+), 21 deletions(-) diff --git a/crates/server/src/service/recovery_merge.rs b/crates/server/src/service/recovery_merge.rs index 83f6d145..52512364 100644 --- a/crates/server/src/service/recovery_merge.rs +++ b/crates/server/src/service/recovery_merge.rs @@ -163,6 +163,7 @@ impl RecoveryMergeService { .col_expr(recovery_job::Column::UpdatedAt, Expr::value(now)) .col_expr(recovery_job::Column::LastHeartbeatAt, Expr::value(Some(now))) .filter(recovery_job::Column::JobId.eq(&job.job_id)) + .filter(recovery_job::Column::Status.eq("running")) .filter(recovery_job::Column::Stage.is_in([ RECOVERY_STAGE_VALIDATING, RECOVERY_STAGE_REBINDING, @@ -186,30 +187,32 @@ impl RecoveryMergeService { job_id: &str, acking_server_id: &str, ) -> Result { - let job = RecoveryJobService::get_job(db, job_id) - .await? 
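Both rewrites in this commit express the transition guard as filters on a single `update_many`, so the database decides atomically whether the job may move and a `rows_affected` of zero means "someone else got there first". The same monotonic rule, modeled on a plain in-memory job rather than SeaORM filters (field types assumed for the sketch):

```rust
#[derive(Debug, PartialEq)]
struct Job {
    status: &'static str,
    stage: &'static str,
}

// Advance to `rebinding` only from an allowed prior stage of a running job;
// anything else is left untouched, mirroring rows_affected == 0.
fn advance_to_rebinding(job: &mut Job) -> bool {
    let allowed =
        job.status == "running" && matches!(job.stage, "validating" | "rebinding");
    if allowed {
        job.stage = "rebinding";
    }
    allowed
}

fn main() {
    let mut failed = Job { status: "failed", stage: "validating" };
    assert!(!advance_to_rebinding(&mut failed)); // failed jobs never regress
    assert_eq!(failed.stage, "validating");

    let mut fresh = Job { status: "running", stage: "validating" };
    assert!(advance_to_rebinding(&mut fresh));
    assert_eq!(fresh.stage, "rebinding");
}
```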
- .ok_or_else(|| AppError::NotFound("Recovery job not found".to_string()))?; - - if job.source_server_id != acking_server_id { - return Ok(job); - } - - if job.status != "running" { - return Ok(job); - } + let now = Utc::now(); + let result = recovery_job::Entity::update_many() + .col_expr( + recovery_job::Column::Stage, + Expr::value(RECOVERY_STAGE_AWAITING_TARGET_ONLINE), + ) + .col_expr(recovery_job::Column::CheckpointJson, Expr::value(None::)) + .col_expr(recovery_job::Column::Error, Expr::value(None::)) + .col_expr(recovery_job::Column::UpdatedAt, Expr::value(now)) + .col_expr(recovery_job::Column::LastHeartbeatAt, Expr::value(Some(now))) + .filter(recovery_job::Column::JobId.eq(job_id)) + .filter(recovery_job::Column::SourceServerId.eq(acking_server_id)) + .filter(recovery_job::Column::Status.eq("running")) + .filter(recovery_job::Column::Stage.eq(RECOVERY_STAGE_REBINDING)) + .exec(db) + .await?; - if job.stage != RECOVERY_STAGE_REBINDING { - return Ok(job); + if result.rows_affected == 0 { + return RecoveryJobService::get_job(db, job_id) + .await? + .ok_or_else(|| AppError::NotFound("Recovery job not found".to_string())); } - RecoveryJobService::update_stage( - db, - job_id, - RECOVERY_STAGE_AWAITING_TARGET_ONLINE, - None, - None, - ) - .await + RecoveryJobService::get_job(db, job_id) + .await? + .ok_or_else(|| AppError::NotFound("Recovery job not found".to_string())) } } @@ -567,4 +570,56 @@ mod tests { .unwrap(); assert_eq!(loaded.stage, RECOVERY_STAGE_AWAITING_TARGET_ONLINE); } + + #[tokio::test] + async fn advance_job_to_rebinding_does_not_overwrite_failed_job() { + let (db, _tmp) = setup_test_db().await; + + let stale_job = RecoveryJobService::create_job(&db, "target-1", "source-1") + .await + .unwrap(); + RecoveryJobService::mark_failed(&db, &stale_job.job_id, "validating", "boom") + .await + .unwrap(); + + let advanced = RecoveryMergeService::advance_job_to_rebinding(&db, stale_job) + .await + .unwrap(); + + assert_eq!(advanced.status, "failed"); + assert_eq!(advanced.stage, "validating"); + + let loaded = RecoveryJobService::get_job(&db, &advanced.job_id) + .await + .unwrap() + .unwrap(); + assert_eq!(loaded.status, "failed"); + assert_eq!(loaded.stage, "validating"); + } + + #[tokio::test] + async fn rebind_ack_does_not_overwrite_moved_job() { + let (db, _tmp) = setup_test_db().await; + + let job = RecoveryMergeService::start_on_db(&db, "target-1", "source-1") + .await + .unwrap(); + RecoveryJobService::update_stage(&db, &job.job_id, RECOVERY_STAGE_AWAITING_TARGET_ONLINE, None, None) + .await + .unwrap(); + + let updated = RecoveryMergeService::handle_rebind_ack_on_db(&db, &job.job_id, "source-1") + .await + .unwrap(); + + assert_eq!(updated.stage, RECOVERY_STAGE_AWAITING_TARGET_ONLINE); + assert_eq!(updated.status, "running"); + + let loaded = RecoveryJobService::get_job(&db, &job.job_id) + .await + .unwrap() + .unwrap(); + assert_eq!(loaded.stage, RECOVERY_STAGE_AWAITING_TARGET_ONLINE); + assert_eq!(loaded.status, "running"); + } } From 5103a8e0fd7988ecc8cbd3b3dc037b955503deed Mon Sep 17 00:00:00 2001 From: ZingerLittleBee <6970999@gmail.com> Date: Thu, 16 Apr 2026 21:00:21 +0800 Subject: [PATCH 40/60] feat(server): wire recovery lifecycle into api and ws --- .../server/src/router/api/server_recovery.rs | 119 +++++++--- crates/server/src/router/ws/agent.rs | 203 ++++++++++++++++-- crates/server/src/router/ws/browser.rs | 146 ++++++++++++- 3 files changed, 412 insertions(+), 56 deletions(-) diff --git a/crates/server/src/router/api/server_recovery.rs 
b/crates/server/src/router/api/server_recovery.rs index 5872eadd..b34a43b3 100644 --- a/crates/server/src/router/api/server_recovery.rs +++ b/crates/server/src/router/api/server_recovery.rs @@ -11,7 +11,9 @@ use serde::{Deserialize, Serialize}; use crate::entity::{recovery_job, server}; use crate::error::{ApiResponse, AppError, ok}; +use crate::router::ws::browser::broadcast_recovery_update; use crate::service::recovery_job::RecoveryJobService; +use crate::service::recovery_merge::RecoveryMergeService; use crate::state::AppState; #[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, utoipa::ToSchema)] @@ -19,6 +21,7 @@ use crate::state::AppState; pub enum RecoveryJobStatus { Running, Failed, + Succeeded, Unknown, } @@ -26,8 +29,13 @@ pub enum RecoveryJobStatus { #[serde(rename_all = "snake_case")] pub enum RecoveryJobStage { Validating, + Rebinding, + AwaitingTargetOnline, + FreezingWrites, MergingHistory, Finalizing, + Succeeded, + Failed, Unknown, } @@ -125,9 +133,10 @@ async fn list_candidates( .all(&state.db) .await?; - if running_jobs.iter().any(|job| { - job.target_server_id == target.id || job.source_server_id == target.id - }) { + if running_jobs + .iter() + .any(|job| job.target_server_id == target.id || job.source_server_id == target.id) + { return Err(AppError::Conflict( "Target server is already participating in a running recovery job".to_string(), )); @@ -216,34 +225,8 @@ async fn start_recovery_merge( Path(target_id): Path, Json(request): Json, ) -> Result>, AppError> { - if request.source_server_id == target_id { - return Err(AppError::Validation( - "source_server_id must be different from target_id".to_string(), - )); - } - - let target = server::Entity::find_by_id(&target_id) - .one(&state.db) - .await? - .ok_or_else(|| AppError::NotFound("Server not found".to_string()))?; - let source = server::Entity::find_by_id(&request.source_server_id) - .one(&state.db) - .await? 
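The widened stage enum keeps `Unknown` as a catch-all, so stage strings the client code does not recognize degrade gracefully instead of failing deserialization. A tolerant-parse sketch of that convention, trimmed to three variants; the `compacting_history` stage below is hypothetical, standing in for any stage a future server version might add:

```rust
#[derive(Debug, PartialEq)]
enum RecoveryJobStage {
    Validating,
    Rebinding,
    AwaitingTargetOnline,
    Unknown,
}

impl From<&str> for RecoveryJobStage {
    fn from(value: &str) -> Self {
        match value {
            "validating" => Self::Validating,
            "rebinding" => Self::Rebinding,
            "awaiting_target_online" => Self::AwaitingTargetOnline,
            // Forward compatibility: never fail on an unrecognized stage.
            _ => Self::Unknown,
        }
    }
}

fn main() {
    assert_eq!(RecoveryJobStage::from("rebinding"), RecoveryJobStage::Rebinding);
    // A stage invented by a newer server degrades to Unknown on old clients.
    assert_eq!(
        RecoveryJobStage::from("compacting_history"),
        RecoveryJobStage::Unknown
    );
}
```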
- .ok_or_else(|| AppError::NotFound("Server not found".to_string()))?; - - if state.agent_manager.is_online(&target.id) { - return Err(AppError::Conflict( - "Target server must be offline before starting recovery".to_string(), - )); - } - - if !state.agent_manager.is_online(&source.id) { - return Err(AppError::Conflict( - "Source server must be online before starting recovery".to_string(), - )); - } - - let job = RecoveryJobService::create_job(&state.db, &target.id, &source.id).await?; + let job = RecoveryMergeService::start(&state, &target_id, &request.source_server_id).await?; + broadcast_recovery_update(&state).await; ok(job.into()) } @@ -376,6 +359,7 @@ impl From<&str> for RecoveryJobStatus { match value { "running" => Self::Running, "failed" => Self::Failed, + "succeeded" => Self::Succeeded, _ => Self::Unknown, } } @@ -385,8 +369,13 @@ impl From<&str> for RecoveryJobStage { fn from(value: &str) -> Self { match value { "validating" => Self::Validating, + "rebinding" => Self::Rebinding, + "awaiting_target_online" => Self::AwaitingTargetOnline, + "freezing_writes" => Self::FreezingWrites, "merging_history" => Self::MergingHistory, "finalizing" => Self::Finalizing, + "succeeded" => Self::Succeeded, + "failed" => Self::Failed, _ => Self::Unknown, } } @@ -394,7 +383,48 @@ impl From<&str> for RecoveryJobStage { #[cfg(test)] mod tests { - use super::{CandidateScoreInput, score_candidate}; + use super::{ + CandidateScoreInput, RecoveryJobStage, StartRecoveryRequest, score_candidate, + start_recovery_merge, + }; + use crate::config::AppConfig; + use crate::entity::server; + use crate::service::auth::AuthService; + use crate::state::AppState; + use crate::test_utils::setup_test_db; + use axum::Json; + use axum::extract::{Path, State}; + use chrono::Utc; + use sea_orm::{ActiveModelTrait, Set}; + use serverbee_common::constants::CAP_DEFAULT; + use std::net::{IpAddr, Ipv4Addr, SocketAddr}; + use std::sync::Arc; + use tokio::sync::mpsc; + + async fn insert_server(db: &sea_orm::DatabaseConnection, id: &str, name: &str) { + let now = Utc::now(); + let token_hash = AuthService::hash_password("test").unwrap(); + server::ActiveModel { + id: Set(id.to_string()), + token_hash: Set(token_hash), + token_prefix: Set("serverbee_test".to_string()), + name: Set(name.to_string()), + weight: Set(0), + hidden: Set(false), + capabilities: Set(CAP_DEFAULT as i32), + protocol_version: Set(1), + created_at: Set(now), + updated_at: Set(now), + ..Default::default() + } + .insert(db) + .await + .unwrap(); + } + + fn test_addr() -> SocketAddr { + SocketAddr::new(IpAddr::V4(Ipv4Addr::new(127, 0, 0, 1)), 9527) + } #[test] fn higher_score_when_ip_arch_and_created_at_match() { @@ -417,4 +447,29 @@ mod tests { assert!(strong > weak); } + + #[tokio::test] + async fn start_recovery_merge_returns_rebinding_stage() { + let (db, _tmp) = setup_test_db().await; + insert_server(&db, "target-1", "Target").await; + insert_server(&db, "source-1", "Source").await; + let state = AppState::new(db, AppConfig::default()).await.unwrap(); + + let (tx, _) = mpsc::channel(1); + state + .agent_manager + .add_connection("source-1".into(), "Source".into(), tx, test_addr()); + + let Json(response) = start_recovery_merge( + State(Arc::clone(&state)), + Path("target-1".to_string()), + Json(StartRecoveryRequest { + source_server_id: "source-1".to_string(), + }), + ) + .await + .unwrap(); + + assert_eq!(response.data.stage, RecoveryJobStage::Rebinding); + } } diff --git a/crates/server/src/router/ws/agent.rs b/crates/server/src/router/ws/agent.rs index 
331231a6..40849087 100644 --- a/crates/server/src/router/ws/agent.rs +++ b/crates/server/src/router/ws/agent.rs @@ -18,6 +18,8 @@ use crate::service::auth::AuthService; use crate::service::network_probe::NetworkProbeService; use crate::service::ping::PingService; use crate::service::record::RecordService; +use crate::service::recovery_job::RecoveryJobService; +use crate::service::recovery_merge::{RECOVERY_STAGE_REBINDING, RecoveryMergeService}; use crate::service::server::ServerService; use crate::service::upgrade_tracker::UpgradeLookup; use crate::state::AppState; @@ -438,9 +440,7 @@ async fn handle_agent_message(state: &Arc, server_id: &str, msg: Agent tracing::error!("Failed to check event rules for IP change: {e}"); } } else { - tracing::info!( - "Skipping recovery-frozen alert evaluation for {server_id}" - ); + tracing::info!("Skipping recovery-frozen alert evaluation for {server_id}"); } state @@ -459,8 +459,7 @@ async fn handle_agent_message(state: &Arc, server_id: &str, msg: Agent // Always update last_remote_addr if let Some(ref addr) = current_remote_addr { if state.recovery_lock.writes_allowed_for(server_id) { - if let Err(e) = update_last_remote_addr(&state.db, server_id, addr).await - { + if let Err(e) = update_last_remote_addr(&state.db, server_id, addr).await { tracing::error!( "Failed to update last_remote_addr for {server_id}: {e}" ); @@ -569,8 +568,7 @@ async fn handle_agent_message(state: &Arc, server_id: &str, msg: Agent // Save GPU records if present if let Some(ref gpu) = report.gpu { if state.recovery_lock.writes_allowed_for(server_id) { - if let Err(e) = - RecordService::save_gpu_records(&state.db, server_id, gpu).await + if let Err(e) = RecordService::save_gpu_records(&state.db, server_id, gpu).await { tracing::error!("Failed to save GPU records for {server_id}: {e}"); } @@ -745,7 +743,8 @@ async fn handle_agent_message(state: &Arc, server_id: &str, msg: Agent results: results.clone(), }); if state.recovery_lock.writes_allowed_for(server_id) { - if let Err(e) = NetworkProbeService::save_results(&state.db, server_id, results).await + if let Err(e) = + NetworkProbeService::save_results(&state.db, server_id, results).await { tracing::error!("Failed to save network probe results for {server_id}: {e}"); } @@ -907,14 +906,42 @@ async fn handle_agent_message(state: &Arc, server_id: &str, msg: Agent // Agent responded to our protocol-level Ping; already handled by WS Pong frames } AgentMessage::RebindIdentityAck { job_id } => { - tracing::info!( - "Ignoring RebindIdentityAck from agent {server_id} for job_id={job_id}" - ); + match RecoveryMergeService::handle_rebind_ack(state, &job_id, server_id).await { + Ok(job) => { + tracing::info!( + "Applied RebindIdentityAck from agent {server_id} for job_id={job_id}, stage={}", + job.stage + ); + crate::router::ws::browser::broadcast_recovery_update(state).await; + } + Err(error) => { + tracing::warn!( + "Failed to apply RebindIdentityAck from agent {server_id} for job_id={job_id}: {error}" + ); + } + } } AgentMessage::RebindIdentityFailed { job_id, error } => { - tracing::warn!( - "Ignoring RebindIdentityFailed from agent {server_id} for job_id={job_id}: {error}" - ); + match RecoveryJobService::mark_failed( + &state.db, + &job_id, + RECOVERY_STAGE_REBINDING, + &error, + ) + .await + { + Ok(()) => { + tracing::warn!( + "Recorded RebindIdentityFailed from agent {server_id} for job_id={job_id}: {error}" + ); + crate::router::ws::browser::broadcast_recovery_update(state).await; + } + Err(mark_error) => { + tracing::warn!( + "Failed 
to record RebindIdentityFailed from agent {server_id} for job_id={job_id}: {mark_error}" + ); + } + } } // Docker variants @@ -985,12 +1012,9 @@ async fn handle_agent_message(state: &Arc, server_id: &str, msg: Agent } AgentMessage::DockerEvent { event } => { if state.recovery_lock.writes_allowed_for(server_id) { - let _ = crate::service::docker::DockerService::save_event( - &state.db, - server_id, - &event, - ) - .await; + let _ = + crate::service::docker::DockerService::save_event(&state.db, server_id, &event) + .await; } else { tracing::info!("Skipping recovery-frozen docker event write for {server_id}"); } @@ -1083,7 +1107,9 @@ async fn handle_agent_message(state: &Arc, server_id: &str, msg: Agent ) .await { - tracing::error!("Failed to update GeoIP for {server_id}: {e}"); + tracing::error!( + "Failed to update GeoIP for {server_id}: {e}" + ); } } else { tracing::info!( @@ -1380,13 +1406,65 @@ async fn update_server_geo( mod tests { use super::*; use crate::config::AppConfig; + use crate::entity::{recovery_job, server}; + use crate::service::auth::AuthService; use crate::test_utils::setup_test_db; + use chrono::Utc; + use sea_orm::{ActiveModelTrait, EntityTrait, Set}; + use serverbee_common::constants::CAP_DEFAULT; + use serverbee_common::protocol::{BrowserMessage, RecoveryJobStage}; use std::net::{IpAddr, Ipv4Addr}; fn test_addr() -> SocketAddr { SocketAddr::new(IpAddr::V4(Ipv4Addr::new(127, 0, 0, 1)), 8080) } + async fn insert_server(db: &sea_orm::DatabaseConnection, id: &str, name: &str) { + let now = Utc::now(); + let token_hash = AuthService::hash_password("test").unwrap(); + server::ActiveModel { + id: Set(id.to_string()), + token_hash: Set(token_hash), + token_prefix: Set("serverbee_test".to_string()), + name: Set(name.to_string()), + weight: Set(0), + hidden: Set(false), + capabilities: Set(CAP_DEFAULT as i32), + protocol_version: Set(1), + created_at: Set(now), + updated_at: Set(now), + ..Default::default() + } + .insert(db) + .await + .unwrap(); + } + + async fn insert_recovery_job( + db: &sea_orm::DatabaseConnection, + job_id: &str, + target_server_id: &str, + source_server_id: &str, + ) { + let now = Utc::now(); + recovery_job::ActiveModel { + job_id: Set(job_id.to_string()), + target_server_id: Set(target_server_id.to_string()), + source_server_id: Set(source_server_id.to_string()), + status: Set("running".to_string()), + stage: Set("rebinding".to_string()), + checkpoint_json: Set(None), + error: Set(None), + started_at: Set(now), + created_at: Set(now), + updated_at: Set(now), + last_heartbeat_at: Set(Some(now)), + } + .insert(db) + .await + .unwrap(); + } + #[tokio::test] async fn current_connection_frame_handler_waits_for_server_lock() { let (db, _tmp) = setup_test_db().await; @@ -1450,4 +1528,87 @@ mod tests { .is_current_connection("s1", second_connection_id) ); } + + #[tokio::test] + async fn rebind_identity_ack_advances_job_and_broadcasts_recovery_update() { + let (db, _tmp) = setup_test_db().await; + insert_server(&db, "target-1", "Target").await; + insert_server(&db, "source-1", "Source").await; + insert_recovery_job(&db, "job-1", "target-1", "source-1").await; + let state = AppState::new(db.clone(), AppConfig::default()) + .await + .unwrap(); + let mut browser_rx = state.browser_tx.subscribe(); + + handle_agent_message( + &state, + "source-1", + AgentMessage::RebindIdentityAck { + job_id: "job-1".to_string(), + }, + ) + .await; + + let job = recovery_job::Entity::find_by_id("job-1") + .one(&db) + .await + .unwrap() + .unwrap(); + assert_eq!(job.stage, 
"awaiting_target_online"); + + let msg = browser_rx.recv().await.unwrap(); + match msg { + BrowserMessage::Update { + recoveries: Some(recoveries), + .. + } => { + assert_eq!(recoveries.len(), 1); + assert_eq!(recoveries[0].job_id, "job-1"); + assert_eq!(recoveries[0].stage, RecoveryJobStage::AwaitingTargetOnline); + } + other => panic!("expected recovery update, got {other:?}"), + } + } + + #[tokio::test] + async fn rebind_identity_failed_marks_job_failed_and_broadcasts_empty_recovery_update() { + let (db, _tmp) = setup_test_db().await; + insert_server(&db, "target-1", "Target").await; + insert_server(&db, "source-1", "Source").await; + insert_recovery_job(&db, "job-1", "target-1", "source-1").await; + let state = AppState::new(db.clone(), AppConfig::default()) + .await + .unwrap(); + let mut browser_rx = state.browser_tx.subscribe(); + + handle_agent_message( + &state, + "source-1", + AgentMessage::RebindIdentityFailed { + job_id: "job-1".to_string(), + error: "agent failed".to_string(), + }, + ) + .await; + + let job = recovery_job::Entity::find_by_id("job-1") + .one(&db) + .await + .unwrap() + .unwrap(); + assert_eq!(job.status, "failed"); + assert_eq!(job.stage, "rebinding"); + assert_eq!(job.error.as_deref(), Some("agent failed")); + + let msg = browser_rx.recv().await.unwrap(); + match msg { + BrowserMessage::Update { + recoveries: Some(recoveries), + .. + } => { + assert!(recoveries.is_empty()); + } + other => panic!("expected recovery update, got {other:?}"), + } + } } diff --git a/crates/server/src/router/ws/browser.rs b/crates/server/src/router/ws/browser.rs index 70219389..470b7b2a 100644 --- a/crates/server/src/router/ws/browser.rs +++ b/crates/server/src/router/ws/browser.rs @@ -7,13 +7,18 @@ use axum::http::HeaderMap; use axum::response::{IntoResponse, Response}; use axum::routing::get; use futures_util::{SinkExt, StreamExt}; +use sea_orm::{ColumnTrait, EntityTrait, QueryFilter}; +use crate::entity::recovery_job; use crate::service::agent_manager::aggregate_disk_io; use crate::service::auth::AuthService; use crate::service::server::ServerService; use crate::state::AppState; use serverbee_common::constants::MAX_WS_MESSAGE_SIZE; -use serverbee_common::protocol::{BrowserClientMessage, BrowserMessage, ServerMessage}; +use serverbee_common::protocol::{ + BrowserClientMessage, BrowserMessage, RecoveryJobDto, RecoveryJobStage, RecoveryJobStatus, + ServerMessage, +}; use serverbee_common::types::ServerStatus; pub fn router() -> Router> { @@ -249,6 +254,7 @@ async fn handle_browser_client_message( } async fn build_full_sync(state: &Arc) -> BrowserMessage { + let recoveries = recovery_snapshot(state).await; let servers = match ServerService::list_servers(&state.db).await { Ok(servers) => servers, Err(e) => { @@ -256,7 +262,7 @@ async fn build_full_sync(state: &Arc) -> BrowserMessage { return BrowserMessage::FullSync { servers: Vec::new(), upgrades: state.upgrade_tracker.snapshot(), - recoveries: Vec::new(), + recoveries, }; } }; @@ -356,10 +362,31 @@ async fn build_full_sync(state: &Arc) -> BrowserMessage { BrowserMessage::FullSync { servers: statuses, upgrades: state.upgrade_tracker.snapshot(), - recoveries: Vec::new(), + recoveries, } } +pub(crate) async fn recovery_snapshot(state: &Arc) -> Vec { + match recovery_job::Entity::find() + .filter(recovery_job::Column::Status.eq("running")) + .all(&state.db) + .await + { + Ok(jobs) => jobs.into_iter().map(Into::into).collect(), + Err(e) => { + tracing::error!("Failed to list recovery jobs for browser sync: {e}"); + Vec::new() + } + } +} 
+ +pub(crate) async fn broadcast_recovery_update(state: &Arc) { + let _ = state.browser_tx.send(BrowserMessage::Update { + servers: Vec::new(), + recoveries: Some(recovery_snapshot(state).await), + }); +} + async fn send_browser_message( sink: &mut futures_util::stream::SplitSink, msg: &BrowserMessage, @@ -367,3 +394,116 @@ async fn send_browser_message( let text = serde_json::to_string(msg).map_err(axum::Error::new)?; sink.send(Message::Text(text.into())).await } + +impl From for RecoveryJobDto { + fn from(value: recovery_job::Model) -> Self { + Self { + job_id: value.job_id, + target_server_id: value.target_server_id, + source_server_id: value.source_server_id, + status: recovery_job_status_from_str(&value.status), + stage: recovery_job_stage_from_str(&value.stage), + error: value.error, + started_at: value.started_at, + created_at: value.created_at, + updated_at: value.updated_at, + last_heartbeat_at: value.last_heartbeat_at, + } + } +} + +fn recovery_job_status_from_str(value: &str) -> RecoveryJobStatus { + match value { + "running" => RecoveryJobStatus::Running, + "failed" => RecoveryJobStatus::Failed, + "succeeded" => RecoveryJobStatus::Succeeded, + _ => RecoveryJobStatus::Unknown, + } +} + +fn recovery_job_stage_from_str(value: &str) -> RecoveryJobStage { + match value { + "validating" => RecoveryJobStage::Validating, + "rebinding" => RecoveryJobStage::Rebinding, + "awaiting_target_online" => RecoveryJobStage::AwaitingTargetOnline, + "freezing_writes" => RecoveryJobStage::FreezingWrites, + "merging_history" => RecoveryJobStage::MergingHistory, + "finalizing" => RecoveryJobStage::Finalizing, + "succeeded" => RecoveryJobStage::Succeeded, + "failed" => RecoveryJobStage::Failed, + _ => RecoveryJobStage::Unknown, + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::config::AppConfig; + use crate::entity::server; + use crate::service::auth::AuthService; + use crate::test_utils::setup_test_db; + use chrono::Utc; + use sea_orm::{ActiveModelTrait, Set}; + use serverbee_common::constants::CAP_DEFAULT; + + async fn insert_server(db: &sea_orm::DatabaseConnection, id: &str, name: &str) { + let now = Utc::now(); + let token_hash = AuthService::hash_password("test").unwrap(); + server::ActiveModel { + id: Set(id.to_string()), + token_hash: Set(token_hash), + token_prefix: Set("serverbee_test".to_string()), + name: Set(name.to_string()), + weight: Set(0), + hidden: Set(false), + capabilities: Set(CAP_DEFAULT as i32), + protocol_version: Set(1), + created_at: Set(now), + updated_at: Set(now), + ..Default::default() + } + .insert(db) + .await + .unwrap(); + } + + #[tokio::test] + async fn full_sync_includes_running_recoveries() { + let (db, _tmp) = setup_test_db().await; + insert_server(&db, "target-1", "Target").await; + insert_server(&db, "source-1", "Source").await; + let state = AppState::new(db.clone(), AppConfig::default()) + .await + .unwrap(); + + let now = Utc::now(); + recovery_job::ActiveModel { + job_id: Set("job-1".to_string()), + target_server_id: Set("target-1".to_string()), + source_server_id: Set("source-1".to_string()), + status: Set("running".to_string()), + stage: Set("rebinding".to_string()), + checkpoint_json: Set(None), + error: Set(None), + started_at: Set(now), + created_at: Set(now), + updated_at: Set(now), + last_heartbeat_at: Set(Some(now)), + } + .insert(&db) + .await + .unwrap(); + + let message = build_full_sync(&state).await; + + match message { + BrowserMessage::FullSync { recoveries, .. 
} => { + assert_eq!(recoveries.len(), 1); + assert_eq!(recoveries[0].job_id, "job-1"); + assert_eq!(recoveries[0].stage, RecoveryJobStage::Rebinding); + assert_eq!(recoveries[0].status, RecoveryJobStatus::Running); + } + other => panic!("expected full sync, got {other:?}"), + } + } +} From 02966dd27294bd2edc8d6ddd967121e3639a86e6 Mon Sep 17 00:00:00 2001 From: ZingerLittleBee <6970999@gmail.com> Date: Thu, 16 Apr 2026 21:01:58 +0800 Subject: [PATCH 41/60] fix(server): route rebind failures through recovery service --- crates/server/src/router/ws/agent.rs | 14 +-- crates/server/src/service/recovery_merge.rs | 123 +++++++++++++++++--- 2 files changed, 109 insertions(+), 28 deletions(-) diff --git a/crates/server/src/router/ws/agent.rs b/crates/server/src/router/ws/agent.rs index 40849087..bc9498a4 100644 --- a/crates/server/src/router/ws/agent.rs +++ b/crates/server/src/router/ws/agent.rs @@ -18,8 +18,7 @@ use crate::service::auth::AuthService; use crate::service::network_probe::NetworkProbeService; use crate::service::ping::PingService; use crate::service::record::RecordService; -use crate::service::recovery_job::RecoveryJobService; -use crate::service::recovery_merge::{RECOVERY_STAGE_REBINDING, RecoveryMergeService}; +use crate::service::recovery_merge::RecoveryMergeService; use crate::service::server::ServerService; use crate::service::upgrade_tracker::UpgradeLookup; use crate::state::AppState; @@ -922,15 +921,10 @@ async fn handle_agent_message(state: &Arc, server_id: &str, msg: Agent } } AgentMessage::RebindIdentityFailed { job_id, error } => { - match RecoveryJobService::mark_failed( - &state.db, - &job_id, - RECOVERY_STAGE_REBINDING, - &error, - ) - .await + match RecoveryMergeService::handle_rebind_failure(state, &job_id, server_id, &error) + .await { - Ok(()) => { + Ok(_) => { tracing::warn!( "Recorded RebindIdentityFailed from agent {server_id} for job_id={job_id}: {error}" ); diff --git a/crates/server/src/service/recovery_merge.rs b/crates/server/src/service/recovery_merge.rs index 52512364..c8ba9f02 100644 --- a/crates/server/src/service/recovery_merge.rs +++ b/crates/server/src/service/recovery_merge.rs @@ -45,6 +45,15 @@ impl RecoveryMergeService { Self::handle_rebind_ack_on_db(&state.db, job_id, acking_server_id).await } + pub async fn handle_rebind_failure( + state: &Arc, + job_id: &str, + source_server_id: &str, + error: &str, + ) -> Result { + Self::handle_rebind_failure_on_db(&state.db, job_id, source_server_id, error).await + } + async fn validate_start_request( state: &Arc, target_server_id: &str, @@ -158,16 +167,22 @@ impl RecoveryMergeService { recovery_job::Column::Stage, Expr::value(RECOVERY_STAGE_REBINDING), ) - .col_expr(recovery_job::Column::CheckpointJson, Expr::value(None::)) + .col_expr( + recovery_job::Column::CheckpointJson, + Expr::value(None::), + ) .col_expr(recovery_job::Column::Error, Expr::value(None::)) .col_expr(recovery_job::Column::UpdatedAt, Expr::value(now)) - .col_expr(recovery_job::Column::LastHeartbeatAt, Expr::value(Some(now))) + .col_expr( + recovery_job::Column::LastHeartbeatAt, + Expr::value(Some(now)), + ) .filter(recovery_job::Column::JobId.eq(&job.job_id)) .filter(recovery_job::Column::Status.eq("running")) - .filter(recovery_job::Column::Stage.is_in([ - RECOVERY_STAGE_VALIDATING, - RECOVERY_STAGE_REBINDING, - ])) + .filter( + recovery_job::Column::Stage + .is_in([RECOVERY_STAGE_VALIDATING, RECOVERY_STAGE_REBINDING]), + ) .exec(db) .await?; @@ -193,10 +208,16 @@ impl RecoveryMergeService { recovery_job::Column::Stage, 
Expr::value(RECOVERY_STAGE_AWAITING_TARGET_ONLINE), ) - .col_expr(recovery_job::Column::CheckpointJson, Expr::value(None::)) + .col_expr( + recovery_job::Column::CheckpointJson, + Expr::value(None::), + ) .col_expr(recovery_job::Column::Error, Expr::value(None::)) .col_expr(recovery_job::Column::UpdatedAt, Expr::value(now)) - .col_expr(recovery_job::Column::LastHeartbeatAt, Expr::value(Some(now))) + .col_expr( + recovery_job::Column::LastHeartbeatAt, + Expr::value(Some(now)), + ) .filter(recovery_job::Column::JobId.eq(job_id)) .filter(recovery_job::Column::SourceServerId.eq(acking_server_id)) .filter(recovery_job::Column::Status.eq("running")) @@ -214,6 +235,46 @@ impl RecoveryMergeService { .await? .ok_or_else(|| AppError::NotFound("Recovery job not found".to_string())) } + + async fn handle_rebind_failure_on_db( + db: &DatabaseConnection, + job_id: &str, + source_server_id: &str, + error: &str, + ) -> Result { + let now = Utc::now(); + let result = recovery_job::Entity::update_many() + .col_expr(recovery_job::Column::Status, Expr::value("failed")) + .col_expr( + recovery_job::Column::Stage, + Expr::value(RECOVERY_STAGE_REBINDING), + ) + .col_expr( + recovery_job::Column::Error, + Expr::value(Some(error.to_string())), + ) + .col_expr(recovery_job::Column::UpdatedAt, Expr::value(now)) + .col_expr( + recovery_job::Column::LastHeartbeatAt, + Expr::value(Some(now)), + ) + .filter(recovery_job::Column::JobId.eq(job_id)) + .filter(recovery_job::Column::SourceServerId.eq(source_server_id)) + .filter(recovery_job::Column::Status.eq("running")) + .filter(recovery_job::Column::Stage.eq(RECOVERY_STAGE_REBINDING)) + .exec(db) + .await?; + + if result.rows_affected == 0 { + return RecoveryJobService::get_job(db, job_id) + .await? + .ok_or_else(|| AppError::NotFound("Recovery job not found".to_string())); + } + + RecoveryJobService::get_job(db, job_id) + .await? 
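`handle_rebind_failure_on_db` enforces a first-failure-wins rule through the same filter technique: only the running rebind attempt from the expected source may flip the job to `failed`, and stale or duplicate reports leave the row unchanged. The rule in miniature, with plain field checks standing in for the SQL filters:

```rust
#[derive(Debug)]
struct Job {
    source: &'static str,
    status: &'static str,
    stage: &'static str,
    error: Option<String>,
}

// Record the failure only if this is still the running rebind attempt from
// the expected source; returns whether the transition actually happened.
fn record_rebind_failure(job: &mut Job, source: &str, error: &str) -> bool {
    let matches =
        job.source == source && job.status == "running" && job.stage == "rebinding";
    if matches {
        job.status = "failed";
        job.error = Some(error.to_string());
    }
    matches
}

fn main() {
    let mut job = Job {
        source: "source-1",
        status: "running",
        stage: "rebinding",
        error: None,
    };
    assert!(record_rebind_failure(&mut job, "source-1", "agent failed"));
    // A late duplicate from the same source is ignored; the first error sticks.
    assert!(!record_rebind_failure(&mut job, "source-1", "stale failure"));
    assert_eq!(job.error.as_deref(), Some("agent failed"));
}
```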
+ .ok_or_else(|| AppError::NotFound("Recovery job not found".to_string())) + } } pub fn recovery_phase_for_stage(stage: &str) -> Option { @@ -491,6 +552,29 @@ mod tests { assert_eq!(loaded.stage, RECOVERY_STAGE_REBINDING); } + #[tokio::test] + async fn rebind_failure_marks_job_failed() { + let (db, _tmp) = setup_test_db().await; + + let job = RecoveryMergeService::start_on_db(&db, "target-1", "source-1") + .await + .unwrap(); + + let failed = RecoveryMergeService::handle_rebind_failure_on_db( + &db, + &job.job_id, + "source-1", + "agent failed", + ) + .await + .unwrap(); + + assert_eq!(failed.job_id, job.job_id); + assert_eq!(failed.status, "failed"); + assert_eq!(failed.stage, RECOVERY_STAGE_REBINDING); + assert_eq!(failed.error.as_deref(), Some("agent failed")); + } + #[tokio::test] async fn start_rejects_self_merge_at_service_boundary() { let (state, _tmp) = test_state_with_servers().await; @@ -548,13 +632,10 @@ mod tests { let stale_job = RecoveryMergeService::start_on_db(&db, "target-1", "source-1") .await .unwrap(); - let acknowledged = RecoveryMergeService::handle_rebind_ack_on_db( - &db, - &stale_job.job_id, - "source-1", - ) - .await - .unwrap(); + let acknowledged = + RecoveryMergeService::handle_rebind_ack_on_db(&db, &stale_job.job_id, "source-1") + .await + .unwrap(); assert_eq!(acknowledged.stage, RECOVERY_STAGE_AWAITING_TARGET_ONLINE); let advanced = RecoveryMergeService::advance_job_to_rebinding(&db, stale_job) @@ -604,9 +685,15 @@ mod tests { let job = RecoveryMergeService::start_on_db(&db, "target-1", "source-1") .await .unwrap(); - RecoveryJobService::update_stage(&db, &job.job_id, RECOVERY_STAGE_AWAITING_TARGET_ONLINE, None, None) - .await - .unwrap(); + RecoveryJobService::update_stage( + &db, + &job.job_id, + RECOVERY_STAGE_AWAITING_TARGET_ONLINE, + None, + None, + ) + .await + .unwrap(); let updated = RecoveryMergeService::handle_rebind_ack_on_db(&db, &job.job_id, "source-1") .await From d514d2e5d60cf592638a469ae2d1487290536523 Mon Sep 17 00:00:00 2001 From: ZingerLittleBee <6970999@gmail.com> Date: Thu, 16 Apr 2026 21:03:36 +0800 Subject: [PATCH 42/60] fix(server): complete recovery rebind dispatch flow --- .../server/src/router/api/server_recovery.rs | 82 +++++++++- crates/server/src/router/ws/agent.rs | 142 ++++++++++++++++-- crates/server/src/service/recovery_merge.rs | 109 ++++++++++---- 3 files changed, 291 insertions(+), 42 deletions(-) diff --git a/crates/server/src/router/api/server_recovery.rs b/crates/server/src/router/api/server_recovery.rs index b34a43b3..8b8f718a 100644 --- a/crates/server/src/router/api/server_recovery.rs +++ b/crates/server/src/router/api/server_recovery.rs @@ -8,6 +8,7 @@ use axum::routing::{get, post}; use axum::{Json, Router}; use sea_orm::{ColumnTrait, EntityTrait, QueryFilter}; use serde::{Deserialize, Serialize}; +use serverbee_common::protocol::ServerMessage; use crate::entity::{recovery_job, server}; use crate::error::{ApiResponse, AppError, ok}; @@ -226,6 +227,23 @@ async fn start_recovery_merge( Json(request): Json, ) -> Result>, AppError> { let job = RecoveryMergeService::start(&state, &target_id, &request.source_server_id).await?; + let token = RecoveryMergeService::rotate_target_token(&state, &target_id).await?; + let sender = state + .agent_manager + .get_sender(&request.source_server_id) + .ok_or_else(|| { + AppError::Conflict("Source server must be online before starting recovery".to_string()) + })?; + sender + .send(ServerMessage::RebindIdentity { + job_id: job.job_id.clone(), + target_server_id: target_id.clone(), + 
token, + }) + .await + .map_err(|error| { + AppError::Internal(format!("Failed to dispatch RebindIdentity to source agent: {error}")) + })?; broadcast_recovery_update(&state).await; ok(job.into()) } @@ -395,11 +413,13 @@ mod tests { use axum::Json; use axum::extract::{Path, State}; use chrono::Utc; - use sea_orm::{ActiveModelTrait, Set}; + use sea_orm::{ActiveModelTrait, EntityTrait, Set}; use serverbee_common::constants::CAP_DEFAULT; + use serverbee_common::protocol::ServerMessage; use std::net::{IpAddr, Ipv4Addr, SocketAddr}; use std::sync::Arc; use tokio::sync::mpsc; + use tokio::time::{Duration, timeout}; async fn insert_server(db: &sea_orm::DatabaseConnection, id: &str, name: &str) { let now = Utc::now(); @@ -455,7 +475,7 @@ mod tests { insert_server(&db, "source-1", "Source").await; let state = AppState::new(db, AppConfig::default()).await.unwrap(); - let (tx, _) = mpsc::channel(1); + let (tx, mut rx) = mpsc::channel(1); state .agent_manager .add_connection("source-1".into(), "Source".into(), tx, test_addr()); @@ -471,5 +491,63 @@ mod tests { .unwrap(); assert_eq!(response.data.stage, RecoveryJobStage::Rebinding); + let _message = timeout(Duration::from_secs(1), rx.recv()) + .await + .expect("rebind command should be sent in time") + .expect("rebind command channel should stay open"); + } + + #[tokio::test] + async fn start_recovery_merge_sends_rebind_identity_command() { + let (db, _tmp) = setup_test_db().await; + insert_server(&db, "target-1", "Target").await; + insert_server(&db, "source-1", "Source").await; + let state = AppState::new(db.clone(), AppConfig::default()) + .await + .unwrap(); + + let (tx, mut rx) = mpsc::channel(1); + state + .agent_manager + .add_connection("source-1".into(), "Source".into(), tx, test_addr()); + + let Json(response) = start_recovery_merge( + State(Arc::clone(&state)), + Path("target-1".to_string()), + Json(StartRecoveryRequest { + source_server_id: "source-1".to_string(), + }), + ) + .await + .unwrap(); + + let message = timeout(Duration::from_secs(1), rx.recv()) + .await + .expect("rebind command should be sent in time") + .expect("rebind command channel should stay open"); + let token = match message { + ServerMessage::RebindIdentity { + job_id, + target_server_id, + token, + } => { + assert_eq!(job_id, response.data.job_id); + assert_eq!(target_server_id, "target-1"); + token + } + other => panic!("expected rebind command, got {other:?}"), + }; + + let target = server::Entity::find_by_id("target-1") + .one(&db) + .await + .unwrap() + .unwrap(); + assert_eq!(target.token_prefix, token[..8.min(token.len())]); + let validated = AuthService::validate_agent_token(&db, &token) + .await + .unwrap() + .expect("target token should validate"); + assert_eq!(validated.id, "target-1"); } } diff --git a/crates/server/src/router/ws/agent.rs b/crates/server/src/router/ws/agent.rs index bc9498a4..8888acbe 100644 --- a/crates/server/src/router/ws/agent.rs +++ b/crates/server/src/router/ws/agent.rs @@ -906,12 +906,19 @@ async fn handle_agent_message(state: &Arc, server_id: &str, msg: Agent } AgentMessage::RebindIdentityAck { job_id } => { match RecoveryMergeService::handle_rebind_ack(state, &job_id, server_id).await { - Ok(job) => { - tracing::info!( - "Applied RebindIdentityAck from agent {server_id} for job_id={job_id}, stage={}", - job.stage - ); - crate::router::ws::browser::broadcast_recovery_update(state).await; + Ok(change) => { + if change.transitioned { + tracing::info!( + "Applied RebindIdentityAck from agent {server_id} for job_id={job_id}, stage={}", + 
change.job.stage + ); + crate::router::ws::browser::broadcast_recovery_update(state).await; + } else { + tracing::info!( + "Ignoring stale RebindIdentityAck from agent {server_id} for job_id={job_id}, stage={}", + change.job.stage + ); + } } Err(error) => { tracing::warn!( @@ -924,11 +931,18 @@ async fn handle_agent_message(state: &Arc, server_id: &str, msg: Agent match RecoveryMergeService::handle_rebind_failure(state, &job_id, server_id, &error) .await { - Ok(_) => { - tracing::warn!( - "Recorded RebindIdentityFailed from agent {server_id} for job_id={job_id}: {error}" - ); - crate::router::ws::browser::broadcast_recovery_update(state).await; + Ok(change) => { + if change.transitioned { + tracing::warn!( + "Recorded RebindIdentityFailed from agent {server_id} for job_id={job_id}: {error}" + ); + crate::router::ws::browser::broadcast_recovery_update(state).await; + } else { + tracing::info!( + "Ignoring stale RebindIdentityFailed from agent {server_id} for job_id={job_id}, stage={}", + change.job.stage + ); + } } Err(mark_error) => { tracing::warn!( @@ -1408,6 +1422,7 @@ mod tests { use serverbee_common::constants::CAP_DEFAULT; use serverbee_common::protocol::{BrowserMessage, RecoveryJobStage}; use std::net::{IpAddr, Ipv4Addr}; + use tokio::time::{Duration, timeout}; fn test_addr() -> SocketAddr { SocketAddr::new(IpAddr::V4(Ipv4Addr::new(127, 0, 0, 1)), 8080) @@ -1605,4 +1620,109 @@ mod tests { other => panic!("expected recovery update, got {other:?}"), } } + + #[tokio::test] + async fn stale_rebind_identity_ack_does_not_broadcast_recovery_update() { + let (db, _tmp) = setup_test_db().await; + insert_server(&db, "target-1", "Target").await; + insert_server(&db, "source-1", "Source").await; + insert_recovery_job(&db, "job-1", "target-1", "source-1").await; + let state = AppState::new(db.clone(), AppConfig::default()) + .await + .unwrap(); + let mut browser_rx = state.browser_tx.subscribe(); + + RecoveryMergeService::handle_rebind_ack(&state, "job-1", "source-1") + .await + .unwrap(); + + handle_agent_message( + &state, + "source-1", + AgentMessage::RebindIdentityAck { + job_id: "job-1".to_string(), + }, + ) + .await; + + let job = recovery_job::Entity::find_by_id("job-1") + .one(&db) + .await + .unwrap() + .unwrap(); + assert_eq!(job.stage, "awaiting_target_online"); + + assert!(timeout(Duration::from_millis(50), browser_rx.recv()).await.is_err()); + } + + #[tokio::test] + async fn wrong_source_rebind_identity_failure_does_not_broadcast_recovery_update() { + let (db, _tmp) = setup_test_db().await; + insert_server(&db, "target-1", "Target").await; + insert_server(&db, "source-1", "Source").await; + insert_server(&db, "source-2", "Other Source").await; + insert_recovery_job(&db, "job-1", "target-1", "source-1").await; + let state = AppState::new(db.clone(), AppConfig::default()) + .await + .unwrap(); + let mut browser_rx = state.browser_tx.subscribe(); + + handle_agent_message( + &state, + "source-2", + AgentMessage::RebindIdentityFailed { + job_id: "job-1".to_string(), + error: "wrong source".to_string(), + }, + ) + .await; + + let job = recovery_job::Entity::find_by_id("job-1") + .one(&db) + .await + .unwrap() + .unwrap(); + assert_eq!(job.status, "running"); + assert_eq!(job.stage, "rebinding"); + assert_eq!(job.error, None); + + assert!(timeout(Duration::from_millis(50), browser_rx.recv()).await.is_err()); + } + + #[tokio::test] + async fn stale_rebind_identity_failure_does_not_broadcast_recovery_update() { + let (db, _tmp) = setup_test_db().await; + insert_server(&db, "target-1", 
"Target").await; + insert_server(&db, "source-1", "Source").await; + insert_recovery_job(&db, "job-1", "target-1", "source-1").await; + let state = AppState::new(db.clone(), AppConfig::default()) + .await + .unwrap(); + let mut browser_rx = state.browser_tx.subscribe(); + + RecoveryMergeService::handle_rebind_failure(&state, "job-1", "source-1", "first failure") + .await + .unwrap(); + + handle_agent_message( + &state, + "source-1", + AgentMessage::RebindIdentityFailed { + job_id: "job-1".to_string(), + error: "stale failure".to_string(), + }, + ) + .await; + + let job = recovery_job::Entity::find_by_id("job-1") + .one(&db) + .await + .unwrap() + .unwrap(); + assert_eq!(job.status, "failed"); + assert_eq!(job.stage, "rebinding"); + assert_eq!(job.error.as_deref(), Some("first failure")); + + assert!(timeout(Duration::from_millis(50), browser_rx.recv()).await.is_err()); + } } diff --git a/crates/server/src/service/recovery_merge.rs b/crates/server/src/service/recovery_merge.rs index c8ba9f02..d1f68ad5 100644 --- a/crates/server/src/service/recovery_merge.rs +++ b/crates/server/src/service/recovery_merge.rs @@ -2,10 +2,11 @@ use std::sync::Arc; use chrono::Utc; use sea_orm::prelude::Expr; -use sea_orm::{ColumnTrait, DatabaseConnection, EntityTrait, QueryFilter}; +use sea_orm::{ActiveModelTrait, ColumnTrait, DatabaseConnection, EntityTrait, QueryFilter}; use crate::entity::{recovery_job, server}; use crate::error::AppError; +use crate::service::auth::AuthService; use crate::service::recovery_job::RecoveryJobService; use crate::state::AppState; @@ -15,6 +16,11 @@ pub const RECOVERY_STAGE_AWAITING_TARGET_ONLINE: &str = "awaiting_target_online" pub struct RecoveryMergeService; +pub struct RecoveryStateChange { + pub job: recovery_job::Model, + pub transitioned: bool, +} + #[derive(Debug, Clone, Copy, PartialEq, Eq)] pub enum RecoveryFailurePhase { PreRebind, @@ -41,7 +47,7 @@ impl RecoveryMergeService { state: &Arc, job_id: &str, acking_server_id: &str, - ) -> Result { + ) -> Result { Self::handle_rebind_ack_on_db(&state.db, job_id, acking_server_id).await } @@ -50,10 +56,32 @@ impl RecoveryMergeService { job_id: &str, source_server_id: &str, error: &str, - ) -> Result { + ) -> Result { Self::handle_rebind_failure_on_db(&state.db, job_id, source_server_id, error).await } + pub async fn rotate_target_token( + state: &Arc, + target_server_id: &str, + ) -> Result { + let target = server::Entity::find_by_id(target_server_id) + .one(&state.db) + .await? + .ok_or_else(|| AppError::NotFound("Server not found".to_string()))?; + + let plaintext_token = AuthService::generate_session_token(); + let token_hash = AuthService::hash_password(&plaintext_token)?; + let token_prefix = plaintext_token[..8.min(plaintext_token.len())].to_string(); + + let mut active: server::ActiveModel = target.into(); + active.token_hash = sea_orm::Set(token_hash); + active.token_prefix = sea_orm::Set(token_prefix); + active.updated_at = sea_orm::Set(Utc::now()); + active.update(&state.db).await?; + + Ok(plaintext_token) + } + async fn validate_start_request( state: &Arc, target_server_id: &str, @@ -201,7 +229,7 @@ impl RecoveryMergeService { db: &DatabaseConnection, job_id: &str, acking_server_id: &str, - ) -> Result { + ) -> Result { let now = Utc::now(); let result = recovery_job::Entity::update_many() .col_expr( @@ -226,14 +254,22 @@ impl RecoveryMergeService { .await?; if result.rows_affected == 0 { - return RecoveryJobService::get_job(db, job_id) + let job = RecoveryJobService::get_job(db, job_id) .await? 
- .ok_or_else(|| AppError::NotFound("Recovery job not found".to_string())); + .ok_or_else(|| AppError::NotFound("Recovery job not found".to_string()))?; + return Ok(RecoveryStateChange { + job, + transitioned: false, + }); } - RecoveryJobService::get_job(db, job_id) + let job = RecoveryJobService::get_job(db, job_id) .await? - .ok_or_else(|| AppError::NotFound("Recovery job not found".to_string())) + .ok_or_else(|| AppError::NotFound("Recovery job not found".to_string()))?; + Ok(RecoveryStateChange { + job, + transitioned: true, + }) } async fn handle_rebind_failure_on_db( @@ -241,7 +277,7 @@ impl RecoveryMergeService { job_id: &str, source_server_id: &str, error: &str, - ) -> Result { + ) -> Result { let now = Utc::now(); let result = recovery_job::Entity::update_many() .col_expr(recovery_job::Column::Status, Expr::value("failed")) @@ -266,14 +302,22 @@ impl RecoveryMergeService { .await?; if result.rows_affected == 0 { - return RecoveryJobService::get_job(db, job_id) + let job = RecoveryJobService::get_job(db, job_id) .await? - .ok_or_else(|| AppError::NotFound("Recovery job not found".to_string())); + .ok_or_else(|| AppError::NotFound("Recovery job not found".to_string()))?; + return Ok(RecoveryStateChange { + job, + transitioned: false, + }); } - RecoveryJobService::get_job(db, job_id) + let job = RecoveryJobService::get_job(db, job_id) .await? - .ok_or_else(|| AppError::NotFound("Recovery job not found".to_string())) + .ok_or_else(|| AppError::NotFound("Recovery job not found".to_string()))?; + Ok(RecoveryStateChange { + job, + transitioned: true, + }) } } @@ -476,9 +520,10 @@ mod tests { .await .unwrap(); - assert_eq!(updated.job_id, job.job_id); - assert_eq!(updated.stage, RECOVERY_STAGE_AWAITING_TARGET_ONLINE); - assert_eq!(updated.status, "running"); + assert!(updated.transitioned); + assert_eq!(updated.job.job_id, job.job_id); + assert_eq!(updated.job.stage, RECOVERY_STAGE_AWAITING_TARGET_ONLINE); + assert_eq!(updated.job.status, "running"); let loaded = RecoveryJobService::get_job(&db, &job.job_id) .await @@ -502,8 +547,9 @@ mod tests { .await .unwrap(); - assert_eq!(updated.stage, RECOVERY_STAGE_AWAITING_TARGET_ONLINE); - assert_eq!(updated.status, "running"); + assert!(!updated.transitioned); + assert_eq!(updated.job.stage, RECOVERY_STAGE_AWAITING_TARGET_ONLINE); + assert_eq!(updated.job.status, "running"); } #[tokio::test] @@ -521,7 +567,8 @@ mod tests { .await .unwrap(); - assert_eq!(updated.stage, "validating"); + assert!(!updated.transitioned); + assert_eq!(updated.job.stage, "validating"); let loaded = RecoveryJobService::get_job(&db, &job.job_id) .await @@ -542,8 +589,9 @@ mod tests { .await .unwrap(); - assert_eq!(updated.job_id, job.job_id); - assert_eq!(updated.stage, RECOVERY_STAGE_REBINDING); + assert!(!updated.transitioned); + assert_eq!(updated.job.job_id, job.job_id); + assert_eq!(updated.job.stage, RECOVERY_STAGE_REBINDING); let loaded = RecoveryJobService::get_job(&db, &job.job_id) .await @@ -569,10 +617,11 @@ mod tests { .await .unwrap(); - assert_eq!(failed.job_id, job.job_id); - assert_eq!(failed.status, "failed"); - assert_eq!(failed.stage, RECOVERY_STAGE_REBINDING); - assert_eq!(failed.error.as_deref(), Some("agent failed")); + assert!(failed.transitioned); + assert_eq!(failed.job.job_id, job.job_id); + assert_eq!(failed.job.status, "failed"); + assert_eq!(failed.job.stage, RECOVERY_STAGE_REBINDING); + assert_eq!(failed.job.error.as_deref(), Some("agent failed")); } #[tokio::test] @@ -636,13 +685,14 @@ mod tests { 
RecoveryMergeService::handle_rebind_ack_on_db(&db, &stale_job.job_id, "source-1") .await .unwrap(); - assert_eq!(acknowledged.stage, RECOVERY_STAGE_AWAITING_TARGET_ONLINE); + assert!(acknowledged.transitioned); + assert_eq!(acknowledged.job.stage, RECOVERY_STAGE_AWAITING_TARGET_ONLINE); let advanced = RecoveryMergeService::advance_job_to_rebinding(&db, stale_job) .await .unwrap(); - assert_eq!(advanced.job_id, acknowledged.job_id); + assert_eq!(advanced.job_id, acknowledged.job.job_id); assert_eq!(advanced.stage, RECOVERY_STAGE_AWAITING_TARGET_ONLINE); let loaded = RecoveryJobService::get_job(&db, &advanced.job_id) @@ -699,8 +749,9 @@ mod tests { .await .unwrap(); - assert_eq!(updated.stage, RECOVERY_STAGE_AWAITING_TARGET_ONLINE); - assert_eq!(updated.status, "running"); + assert!(!updated.transitioned); + assert_eq!(updated.job.stage, RECOVERY_STAGE_AWAITING_TARGET_ONLINE); + assert_eq!(updated.job.status, "running"); let loaded = RecoveryJobService::get_job(&db, &job.job_id) .await From f708be72d6fa1758df36e73cc48f49c06bd5014d Mon Sep 17 00:00:00 2001 From: ZingerLittleBee <6970999@gmail.com> Date: Thu, 16 Apr 2026 21:05:12 +0800 Subject: [PATCH 43/60] fix(server): harden recovery dispatch and browser snapshots --- .../server/src/router/api/server_recovery.rs | 143 +++++++++++++-- crates/server/src/router/ws/browser.rs | 116 +++++++++++- crates/server/src/service/recovery_merge.rs | 167 ++++++++++++++++-- 3 files changed, 383 insertions(+), 43 deletions(-) diff --git a/crates/server/src/router/api/server_recovery.rs b/crates/server/src/router/api/server_recovery.rs index 8b8f718a..70771bc9 100644 --- a/crates/server/src/router/api/server_recovery.rs +++ b/crates/server/src/router/api/server_recovery.rs @@ -6,9 +6,10 @@ use std::sync::Arc; use axum::extract::{Path, State}; use axum::routing::{get, post}; use axum::{Json, Router}; -use sea_orm::{ColumnTrait, EntityTrait, QueryFilter}; +use sea_orm::{ColumnTrait, EntityTrait, QueryFilter, TransactionTrait}; use serde::{Deserialize, Serialize}; use serverbee_common::protocol::ServerMessage; +use tokio::sync::mpsc; use crate::entity::{recovery_job, server}; use crate::error::{ApiResponse, AppError, ok}; @@ -226,26 +227,46 @@ async fn start_recovery_merge( Path(target_id): Path, Json(request): Json, ) -> Result>, AppError> { - let job = RecoveryMergeService::start(&state, &target_id, &request.source_server_id).await?; - let token = RecoveryMergeService::rotate_target_token(&state, &target_id).await?; - let sender = state - .agent_manager - .get_sender(&request.source_server_id) - .ok_or_else(|| { - AppError::Conflict("Source server must be online before starting recovery".to_string()) - })?; - sender + let sender = state.agent_manager.get_sender(&request.source_server_id); + let job = + start_recovery_merge_with_sender(&state, &target_id, &request.source_server_id, sender) + .await?; + broadcast_recovery_update(&state).await; + ok(job.into()) +} + +async fn start_recovery_merge_with_sender( + state: &Arc, + target_id: &str, + source_server_id: &str, + sender: Option>, +) -> Result { + let sender = sender.ok_or_else(|| { + AppError::Conflict("Source server must be online before starting recovery".to_string()) + })?; + + RecoveryMergeService::validate_start_request(state, target_id, source_server_id).await?; + + let txn = state.db.begin().await?; + let job = RecoveryMergeService::start_on_txn(&txn, target_id, source_server_id).await?; + let token = RecoveryMergeService::rotate_target_token_on_txn(&txn, target_id).await?; + + if let Err(error) = 
From f708be72d6fa1758df36e73cc48f49c06bd5014d Mon Sep 17 00:00:00 2001
From: ZingerLittleBee <6970999@gmail.com>
Date: Thu, 16 Apr 2026 21:05:12 +0800
Subject: [PATCH 43/60] fix(server): harden recovery dispatch and browser
 snapshots

---
 .../server/src/router/api/server_recovery.rs | 143 +++++++++++++--
 crates/server/src/router/ws/browser.rs       | 116 +++++++++++-
 crates/server/src/service/recovery_merge.rs  | 167 ++++++++++++++++--
 3 files changed, 383 insertions(+), 43 deletions(-)

diff --git a/crates/server/src/router/api/server_recovery.rs b/crates/server/src/router/api/server_recovery.rs
index 8b8f718a..70771bc9 100644
--- a/crates/server/src/router/api/server_recovery.rs
+++ b/crates/server/src/router/api/server_recovery.rs
@@ -6,9 +6,10 @@ use std::sync::Arc;
 use axum::extract::{Path, State};
 use axum::routing::{get, post};
 use axum::{Json, Router};
-use sea_orm::{ColumnTrait, EntityTrait, QueryFilter};
+use sea_orm::{ColumnTrait, EntityTrait, QueryFilter, TransactionTrait};
 use serde::{Deserialize, Serialize};
 use serverbee_common::protocol::ServerMessage;
+use tokio::sync::mpsc;

 use crate::entity::{recovery_job, server};
 use crate::error::{ApiResponse, AppError, ok};
@@ -226,26 +227,46 @@ async fn start_recovery_merge(
     Path(target_id): Path<String>,
     Json(request): Json<StartRecoveryRequest>,
 ) -> Result>, AppError> {
-    let job = RecoveryMergeService::start(&state, &target_id, &request.source_server_id).await?;
-    let token = RecoveryMergeService::rotate_target_token(&state, &target_id).await?;
-    let sender = state
-        .agent_manager
-        .get_sender(&request.source_server_id)
-        .ok_or_else(|| {
-            AppError::Conflict("Source server must be online before starting recovery".to_string())
-        })?;
-    sender
+    let sender = state.agent_manager.get_sender(&request.source_server_id);
+    let job =
+        start_recovery_merge_with_sender(&state, &target_id, &request.source_server_id, sender)
+            .await?;
+    broadcast_recovery_update(&state).await;
+    ok(job.into())
+}
+
+async fn start_recovery_merge_with_sender(
+    state: &Arc<AppState>,
+    target_id: &str,
+    source_server_id: &str,
+    sender: Option<mpsc::Sender<ServerMessage>>,
+) -> Result<recovery_job::Model, AppError> {
+    let sender = sender.ok_or_else(|| {
+        AppError::Conflict("Source server must be online before starting recovery".to_string())
+    })?;
+
+    RecoveryMergeService::validate_start_request(state, target_id, source_server_id).await?;
+
+    let txn = state.db.begin().await?;
+    let job = RecoveryMergeService::start_on_txn(&txn, target_id, source_server_id).await?;
+    let token = RecoveryMergeService::rotate_target_token_on_txn(&txn, target_id).await?;
+
+    if let Err(error) = sender
         .send(ServerMessage::RebindIdentity {
             job_id: job.job_id.clone(),
-            target_server_id: target_id.clone(),
+            target_server_id: target_id.to_string(),
             token,
         })
         .await
-        .map_err(|error| {
-            AppError::Internal(format!("Failed to dispatch RebindIdentity to source agent: {error}"))
-        })?;
-    broadcast_recovery_update(&state).await;
-    ok(job.into())
+    {
+        txn.rollback().await?;
+        return Err(AppError::Internal(format!(
+            "Failed to dispatch RebindIdentity to source agent: {error}"
+        )));
+    }
+
+    txn.commit().await?;
+    Ok(job)
 }

 fn build_candidate_response(
@@ -403,10 +424,11 @@ impl From<&str> for RecoveryJobStage {
 mod tests {
     use super::{
         CandidateScoreInput, RecoveryJobStage, StartRecoveryRequest, score_candidate,
-        start_recovery_merge,
+        start_recovery_merge, start_recovery_merge_with_sender,
     };
     use crate::config::AppConfig;
-    use crate::entity::server;
+    use crate::entity::{recovery_job, server};
+    use crate::error::AppError;
     use crate::service::auth::AuthService;
     use crate::state::AppState;
     use crate::test_utils::setup_test_db;
@@ -550,4 +572,89 @@ mod tests {
             .expect("target token should validate");
         assert_eq!(validated.id, "target-1");
     }
+
+    #[tokio::test]
+    async fn start_recovery_merge_fails_safely_when_sender_missing() {
+        let (db, _tmp) = setup_test_db().await;
+        insert_server(&db, "target-1", "Target").await;
+        insert_server(&db, "source-1", "Source").await;
+        let state = AppState::new(db.clone(), AppConfig::default())
+            .await
+            .unwrap();
+
+        let before = server::Entity::find_by_id("target-1")
+            .one(&db)
+            .await
+            .unwrap()
+            .unwrap();
+
+        let error = start_recovery_merge_with_sender(&state, "target-1", "source-1", None)
+            .await
+            .expect_err("missing sender should fail safely");
+
+        assert!(
+            matches!(error, AppError::Conflict(message) if message.contains("Source server must be online"))
+        );
+
+        let after = server::Entity::find_by_id("target-1")
+            .one(&db)
+            .await
+            .unwrap()
+            .unwrap();
+        assert_eq!(after.token_prefix, before.token_prefix);
+        assert_eq!(after.token_hash, before.token_hash);
+
+        let jobs = recovery_job::Entity::find().all(&db).await.unwrap();
+        assert!(jobs.is_empty(), "no recovery job should be persisted");
+    }
+
+    #[tokio::test]
+    async fn start_recovery_merge_fails_safely_when_dispatch_fails() {
+        let (db, _tmp) = setup_test_db().await;
+        insert_server(&db, "target-1", "Target").await;
+        insert_server(&db, "source-1", "Source").await;
+        let state = AppState::new(db.clone(), AppConfig::default())
+            .await
+            .unwrap();
+
+        let (tx, rx) = mpsc::channel(1);
+        drop(rx);
+        state
+            .agent_manager
+            .add_connection("source-1".into(), "Source".into(), tx, test_addr());
+
+        let before = server::Entity::find_by_id("target-1")
+            .one(&db)
+            .await
+            .unwrap()
+            .unwrap();
+
+        let error = start_recovery_merge(
+            State(Arc::clone(&state)),
+            Path("target-1".to_string()),
+            Json(StartRecoveryRequest {
+                source_server_id: "source-1".to_string(),
+            }),
+        )
+        .await
+        .expect_err("dispatch failure should fail safely");
+
+        assert!(
+            matches!(error, AppError::Internal(message) if message.contains("Failed to dispatch RebindIdentity"))
+        );
+
+        let after = server::Entity::find_by_id("target-1")
+            .one(&db)
+            .await
+            .unwrap()
+            .unwrap();
+        assert_eq!(after.token_prefix, before.token_prefix);
+        assert_eq!(after.token_hash, before.token_hash);
+
+        let jobs = recovery_job::Entity::find().all(&db).await.unwrap();
+        assert!(
+            jobs.is_empty(),
+            "no recovery job should remain after failed dispatch"
+        );
+    }
 }

diff --git a/crates/server/src/router/ws/browser.rs b/crates/server/src/router/ws/browser.rs
index 470b7b2a..abc915c0 100644
--- a/crates/server/src/router/ws/browser.rs
+++ b/crates/server/src/router/ws/browser.rs
@@ -7,7 +7,7 @@ use axum::http::HeaderMap;
 use axum::response::{IntoResponse, Response};
 use axum::routing::get;
 use futures_util::{SinkExt, StreamExt};
-use sea_orm::{ColumnTrait, EntityTrait, QueryFilter};
+use sea_orm::EntityTrait;

 use crate::entity::recovery_job;
 use crate::service::agent_manager::aggregate_disk_io;
@@ -367,11 +367,7 @@ async fn build_full_sync(state: &Arc<AppState>) -> BrowserMessage {
 }

 pub(crate) async fn recovery_snapshot(state: &Arc<AppState>) -> Vec {
-    match recovery_job::Entity::find()
-        .filter(recovery_job::Column::Status.eq("running"))
-        .all(&state.db)
-        .await
-    {
+    match recovery_job::Entity::find().all(&state.db).await {
         Ok(jobs) => jobs.into_iter().map(Into::into).collect(),
         Err(e) => {
             tracing::error!("Failed to list recovery jobs for browser sync: {e}");
@@ -506,4 +502,112 @@ mod tests {
             other => panic!("expected full sync, got {other:?}"),
         }
     }
+
+    #[tokio::test]
+    async fn full_sync_includes_terminal_recovery_states() {
+        let (db, _tmp) = setup_test_db().await;
+        insert_server(&db, "target-1", "Target").await;
+        insert_server(&db, "source-1", "Source").await;
+        let state = AppState::new(db.clone(), AppConfig::default())
+            .await
+            .unwrap();
+
+        let now = Utc::now();
+        recovery_job::ActiveModel {
+            job_id: Set("job-failed".to_string()),
+            target_server_id: Set("target-1".to_string()),
+            source_server_id: Set("source-1".to_string()),
+            status: Set("failed".to_string()),
+            stage: Set("failed".to_string()),
+            checkpoint_json: Set(None),
+            error: Set(Some("boom".to_string())),
+            started_at: Set(now),
+            created_at: Set(now),
+            updated_at: Set(now),
+            last_heartbeat_at: Set(Some(now)),
+        }
+        .insert(&db)
+        .await
+        .unwrap();
+        recovery_job::ActiveModel {
+            job_id: Set("job-succeeded".to_string()),
+            target_server_id: Set("target-1".to_string()),
+            source_server_id: Set("source-1".to_string()),
+            status: Set("succeeded".to_string()),
+            stage: Set("succeeded".to_string()),
+            checkpoint_json: Set(None),
+            error: Set(None),
+            started_at: Set(now),
+            created_at: Set(now),
+            updated_at: Set(now),
+            last_heartbeat_at: Set(Some(now)),
+        }
+        .insert(&db)
+        .await
+        .unwrap();
+
+        let message = build_full_sync(&state).await;
+
+        match message {
+            BrowserMessage::FullSync { recoveries, .. } => {
+                assert_eq!(recoveries.len(), 2);
+                assert!(recoveries.iter().any(|job| {
+                    job.job_id == "job-failed"
+                        && job.status == RecoveryJobStatus::Failed
+                        && job.stage == RecoveryJobStage::Failed
+                }));
+                assert!(recoveries.iter().any(|job| {
+                    job.job_id == "job-succeeded"
+                        && job.status == RecoveryJobStatus::Succeeded
+                        && job.stage == RecoveryJobStage::Succeeded
+                }));
+            }
+            other => panic!("expected full sync, got {other:?}"),
+        }
+    }
+
+    #[tokio::test]
+    async fn broadcast_recovery_update_includes_terminal_recovery_states() {
+        let (db, _tmp) = setup_test_db().await;
+        insert_server(&db, "target-1", "Target").await;
+        insert_server(&db, "source-1", "Source").await;
+        let state = AppState::new(db.clone(), AppConfig::default())
+            .await
+            .unwrap();
+        let mut browser_rx = state.browser_tx.subscribe();
+
+        let now = Utc::now();
+        recovery_job::ActiveModel {
+            job_id: Set("job-failed".to_string()),
+            target_server_id: Set("target-1".to_string()),
+            source_server_id: Set("source-1".to_string()),
+            status: Set("failed".to_string()),
+            stage: Set("failed".to_string()),
+            checkpoint_json: Set(None),
+            error: Set(Some("boom".to_string())),
+            started_at: Set(now),
+            created_at: Set(now),
+            updated_at: Set(now),
+            last_heartbeat_at: Set(Some(now)),
+        }
+        .insert(&db)
+        .await
+        .unwrap();
+
+        broadcast_recovery_update(&state).await;
+
+        let message = browser_rx.recv().await.unwrap();
+        match message {
+            BrowserMessage::Update {
+                recoveries: Some(recoveries),
+                ..
+            } => {
+                assert_eq!(recoveries.len(), 1);
+                assert_eq!(recoveries[0].job_id, "job-failed");
+                assert_eq!(recoveries[0].status, RecoveryJobStatus::Failed);
+                assert_eq!(recoveries[0].stage, RecoveryJobStage::Failed);
+            }
+            other => panic!("expected update with recoveries, got {other:?}"),
+        }
+    }
 }

diff --git a/crates/server/src/service/recovery_merge.rs b/crates/server/src/service/recovery_merge.rs
index d1f68ad5..c1e06521 100644
--- a/crates/server/src/service/recovery_merge.rs
+++ b/crates/server/src/service/recovery_merge.rs
@@ -2,7 +2,10 @@ use std::sync::Arc;

 use chrono::Utc;
 use sea_orm::prelude::Expr;
-use sea_orm::{ActiveModelTrait, ColumnTrait, DatabaseConnection, EntityTrait, QueryFilter};
+use sea_orm::{
+    ActiveModelTrait, ColumnTrait, ConnectionTrait, DatabaseConnection, DatabaseTransaction,
+    EntityTrait, QueryFilter,
+};

 use crate::entity::{recovery_job, server};
 use crate::error::AppError;
@@ -33,6 +36,16 @@ pub enum RecoveryRetryStrategy {
     ResumeSameJob,
 }

+fn is_unique_violation(err: &sea_orm::DbErr) -> bool {
+    let message = err.to_string();
+    message.contains("UNIQUE constraint failed") || message.contains("UNIQUE")
+}
+
+fn is_active_recovery_conflict(err: &sea_orm::DbErr) -> bool {
+    let message = err.to_string();
+    is_unique_violation(err) || message.contains("recovery_job_active_conflict")
+}
+
 impl RecoveryMergeService {
     pub async fn start(
         state: &Arc<AppState>,
@@ -64,8 +77,25 @@ impl RecoveryMergeService {
         state: &Arc<AppState>,
         target_server_id: &str,
     ) -> Result<String, AppError> {
+        Self::rotate_target_token_on_conn(&state.db, target_server_id).await
+    }
+
+    pub async fn rotate_target_token_on_txn(
+        txn: &DatabaseTransaction,
+        target_server_id: &str,
+    ) -> Result<String, AppError> {
+        Self::rotate_target_token_on_conn(txn, target_server_id).await
+    }
+
+    async fn rotate_target_token_on_conn<C>(
+        db: &C,
+        target_server_id: &str,
+    ) -> Result<String, AppError>
+    where
+        C: ConnectionTrait,
+    {
         let target = server::Entity::find_by_id(target_server_id)
-            .one(&state.db)
+            .one(db)
             .await?
             .ok_or_else(|| AppError::NotFound("Server not found".to_string()))?;
@@ -77,12 +107,12 @@ impl RecoveryMergeService {
         active.token_hash = sea_orm::Set(token_hash);
         active.token_prefix = sea_orm::Set(token_prefix);
         active.updated_at = sea_orm::Set(Utc::now());
-        active.update(&state.db).await?;
+        active.update(db).await?;

         Ok(plaintext_token)
     }

-    async fn validate_start_request(
+    pub async fn validate_start_request(
         state: &Arc<AppState>,
         target_server_id: &str,
         source_server_id: &str,
@@ -122,13 +152,32 @@ impl RecoveryMergeService {
         target_server_id: &str,
         source_server_id: &str,
     ) -> Result<recovery_job::Model, AppError> {
+        Self::start_on_connection(db, target_server_id, source_server_id).await
+    }
+
+    pub async fn start_on_txn(
+        db: &DatabaseTransaction,
+        target_server_id: &str,
+        source_server_id: &str,
+    ) -> Result<recovery_job::Model, AppError> {
+        Self::start_on_connection(db, target_server_id, source_server_id).await
+    }
+
+    async fn start_on_connection<C>(
+        db: &C,
+        target_server_id: &str,
+        source_server_id: &str,
+    ) -> Result<recovery_job::Model, AppError>
+    where
+        C: ConnectionTrait,
+    {
         if let Some(existing) =
             Self::find_reusable_start_job(db, target_server_id, source_server_id).await?
         {
             return Self::advance_job_to_rebinding(db, existing).await;
         }

-        match RecoveryJobService::create_job(db, target_server_id, source_server_id).await {
+        match Self::create_job(db, target_server_id, source_server_id).await {
             Ok(job) => Self::advance_job_to_rebinding(db, job).await,
             Err(AppError::Conflict(_)) => {
                 Self::recover_duplicate_start(db, target_server_id, source_server_id).await
@@ -137,13 +186,16 @@ impl RecoveryMergeService {
         }
     }

-    async fn find_reusable_start_job(
-        db: &DatabaseConnection,
+    async fn find_reusable_start_job<C>(
+        db: &C,
         target_server_id: &str,
         source_server_id: &str,
-    ) -> Result<Option<recovery_job::Model>, AppError> {
-        let running_target = RecoveryJobService::running_for_target(db, target_server_id).await?;
-        let running_source = RecoveryJobService::running_for_source(db, source_server_id).await?;
+    ) -> Result<Option<recovery_job::Model>, AppError>
+    where
+        C: ConnectionTrait,
+    {
+        let running_target = Self::running_for_target(db, target_server_id).await?;
+        let running_source = Self::running_for_source(db, source_server_id).await?;

         if let Some(job) = running_target {
             if job.source_server_id != source_server_id {
@@ -172,11 +224,14 @@ impl RecoveryMergeService {
         Ok(None)
     }

-    async fn recover_duplicate_start(
-        db: &DatabaseConnection,
+    async fn recover_duplicate_start<C>(
+        db: &C,
         target_server_id: &str,
         source_server_id: &str,
-    ) -> Result<recovery_job::Model, AppError> {
+    ) -> Result<recovery_job::Model, AppError>
+    where
+        C: ConnectionTrait,
+    {
         match Self::find_reusable_start_job(db, target_server_id, source_server_id).await? {
             Some(job) => Self::advance_job_to_rebinding(db, job).await,
             None => Err(AppError::Conflict(
@@ -185,10 +240,13 @@ impl RecoveryMergeService {
         }
     }

-    async fn advance_job_to_rebinding(
-        db: &DatabaseConnection,
+    async fn advance_job_to_rebinding<C>(
+        db: &C,
         job: recovery_job::Model,
-    ) -> Result<recovery_job::Model, AppError> {
+    ) -> Result<recovery_job::Model, AppError>
+    where
+        C: ConnectionTrait,
+    {
         let now = Utc::now();
         let result = recovery_job::Entity::update_many()
             .col_expr(
@@ -215,16 +273,84 @@ impl RecoveryMergeService {
             .await?;

         if result.rows_affected == 0 {
-            return RecoveryJobService::get_job(db, &job.job_id)
+            return Self::get_job(db, &job.job_id)
                 .await?
                 .ok_or_else(|| AppError::NotFound("Recovery job not found".to_string()));
         }

-        RecoveryJobService::get_job(db, &job.job_id)
+        Self::get_job(db, &job.job_id)
             .await?
             .ok_or_else(|| AppError::NotFound("Recovery job not found".to_string()))
     }

+    async fn create_job<C>(
+        db: &C,
+        target_server_id: &str,
+        source_server_id: &str,
+    ) -> Result<recovery_job::Model, AppError>
+    where
+        C: ConnectionTrait,
+    {
+        let active = recovery_job::ActiveModel {
+            job_id: sea_orm::Set(uuid::Uuid::new_v4().to_string()),
+            target_server_id: sea_orm::Set(target_server_id.to_string()),
+            source_server_id: sea_orm::Set(source_server_id.to_string()),
+            status: sea_orm::Set("running".to_string()),
+            stage: sea_orm::Set(RECOVERY_STAGE_VALIDATING.to_string()),
+            checkpoint_json: sea_orm::Set(None),
+            error: sea_orm::Set(None),
+            started_at: sea_orm::Set(Utc::now()),
+            created_at: sea_orm::Set(Utc::now()),
+            updated_at: sea_orm::Set(Utc::now()),
+            last_heartbeat_at: sea_orm::Set(None),
+        };
+
+        active.insert(db).await.map_err(|err| {
+            if is_active_recovery_conflict(&err) {
+                AppError::Conflict(
+                    "A running recovery job already exists for this target or source".to_string(),
+                )
+            } else {
+                err.into()
+            }
+        })
+    }
+
+    async fn running_for_target<C>(
+        db: &C,
+        target_server_id: &str,
+    ) -> Result<Option<recovery_job::Model>, AppError>
+    where
+        C: ConnectionTrait,
+    {
+        Ok(recovery_job::Entity::find()
+            .filter(recovery_job::Column::TargetServerId.eq(target_server_id))
+            .filter(recovery_job::Column::Status.eq("running"))
+            .one(db)
+            .await?)
+    }
+
+    async fn running_for_source<C>(
+        db: &C,
+        source_server_id: &str,
+    ) -> Result<Option<recovery_job::Model>, AppError>
+    where
+        C: ConnectionTrait,
+    {
+        Ok(recovery_job::Entity::find()
+            .filter(recovery_job::Column::SourceServerId.eq(source_server_id))
+            .filter(recovery_job::Column::Status.eq("running"))
+            .one(db)
+            .await?)
+    }
+
+    async fn get_job<C>(db: &C, job_id: &str) -> Result<Option<recovery_job::Model>, AppError>
+    where
+        C: ConnectionTrait,
+    {
+        Ok(recovery_job::Entity::find_by_id(job_id).one(db).await?)
+    }
+
     async fn handle_rebind_ack_on_db(
         db: &DatabaseConnection,
         job_id: &str,
@@ -686,7 +812,10 @@ mod tests {
             .await
             .unwrap();
         assert!(acknowledged.transitioned);
-        assert_eq!(acknowledged.job.stage, RECOVERY_STAGE_AWAITING_TARGET_ONLINE);
+        assert_eq!(
+            acknowledged.job.stage,
+            RECOVERY_STAGE_AWAITING_TARGET_ONLINE
+        );

         let advanced = RecoveryMergeService::advance_job_to_rebinding(&db, stale_job)
             .await
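
Patch 43 above threads every job-state helper through a generic `C: ConnectionTrait` bound so the same body can run on the pooled connection or inside the transaction opened by `start_recovery_merge_with_sender`. A minimal, self-contained sketch of that pattern; the `count_running_jobs` helper is illustrative, not part of the codebase:

```rust
use sea_orm::{ConnectionTrait, DatabaseConnection, DatabaseTransaction, DbErr, Statement};

// One implementation, callable with either connection flavor.
async fn count_running_jobs<C>(db: &C) -> Result<i64, DbErr>
where
    C: ConnectionTrait,
{
    let row = db
        .query_one(Statement::from_string(
            db.get_database_backend(),
            "SELECT COUNT(*) AS n FROM recovery_job WHERE status = 'running'".to_string(),
        ))
        .await?
        .expect("COUNT(*) always yields one row");
    Ok(row.try_get_by_index::<i64>(0)?)
}

async fn demo(db: &DatabaseConnection, txn: &DatabaseTransaction) -> Result<(), DbErr> {
    let _outside_txn = count_running_jobs(db).await?; // pooled auto-commit connection
    let _inside_txn = count_running_jobs(txn).await?; // same helper inside a transaction
    Ok(())
}
```
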
From a5eaab06797e9f53cd5e56df96a524b543383f7c Mon Sep 17 00:00:00 2001
From: ZingerLittleBee <6970999@gmail.com>
Date: Thu, 16 Apr 2026 21:06:47 +0800
Subject: [PATCH 44/60] test(server): align rebind failure recovery update
 expectation

---
 crates/server/src/router/ws/agent.rs | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/crates/server/src/router/ws/agent.rs b/crates/server/src/router/ws/agent.rs
index 8888acbe..b9ccd09a 100644
--- a/crates/server/src/router/ws/agent.rs
+++ b/crates/server/src/router/ws/agent.rs
@@ -1580,7 +1580,7 @@ mod tests {
     }

     #[tokio::test]
-    async fn rebind_identity_failed_marks_job_failed_and_broadcasts_empty_recovery_update() {
+    async fn rebind_identity_failed_marks_job_failed_and_broadcasts_recovery_snapshot() {
         let (db, _tmp) = setup_test_db().await;
         insert_server(&db, "target-1", "Target").await;
         insert_server(&db, "source-1", "Source").await;
@@ -1615,7 +1615,14 @@ mod tests {
             recoveries: Some(recoveries),
             ..
         } => {
-            assert!(recoveries.is_empty());
+            assert_eq!(recoveries.len(), 1);
+            assert_eq!(recoveries[0].job_id, "job-1");
+            assert_eq!(
+                recoveries[0].status,
+                serverbee_common::protocol::RecoveryJobStatus::Failed
+            );
+            assert_eq!(recoveries[0].stage, RecoveryJobStage::Rebinding);
+            assert_eq!(recoveries[0].error.as_deref(), Some("agent failed"));
         }
         other => panic!("expected recovery update, got {other:?}"),
     }

From 49a108c24e3a7d3fec625edcc5c0793c6ec96f17 Mon Sep 17 00:00:00 2001
From: ZingerLittleBee <6970999@gmail.com>
Date: Thu, 16 Apr 2026 21:08:26 +0800
Subject: [PATCH 45/60] fix(server): align recovery start validation and tests

---
 crates/server/src/router/api/server_recovery.rs | 4 ++--
 crates/server/tests/integration.rs              | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/crates/server/src/router/api/server_recovery.rs b/crates/server/src/router/api/server_recovery.rs
index 70771bc9..f715148a 100644
--- a/crates/server/src/router/api/server_recovery.rs
+++ b/crates/server/src/router/api/server_recovery.rs
@@ -241,12 +241,12 @@ async fn start_recovery_merge_with_sender(
     source_server_id: &str,
     sender: Option<mpsc::Sender<ServerMessage>>,
 ) -> Result<recovery_job::Model, AppError> {
+    RecoveryMergeService::validate_start_request(state, target_id, source_server_id).await?;
+
     let sender = sender.ok_or_else(|| {
         AppError::Conflict("Source server must be online before starting recovery".to_string())
     })?;

-    RecoveryMergeService::validate_start_request(state, target_id, source_server_id).await?;
-
     let txn = state.db.begin().await?;
     let job = RecoveryMergeService::start_on_txn(&txn, target_id, source_server_id).await?;

diff --git a/crates/server/tests/integration.rs b/crates/server/tests/integration.rs
index fda4ef04..7b015f47 100644
--- a/crates/server/tests/integration.rs
+++ b/crates/server/tests/integration.rs
@@ -4036,7 +4036,7 @@ async fn test_recovery_job_get_requires_admin_and_start_creates_job() {
         .expect("job_id missing")
         .to_string();
     assert_eq!(start_body["data"]["status"], "running");
-    assert_eq!(start_body["data"]["stage"], "validating");
+    assert_eq!(start_body["data"]["stage"], "rebinding");

     let plain_client = reqwest::Client::new();
     let unauth_resp = plain_client
@@ -4066,5 +4066,5 @@ async fn test_recovery_job_get_requires_admin_and_start_creates_job() {
     assert_eq!(get_body["data"]["source_server_id"], source_id);
     assert!(get_body["data"].get("checkpoint_json").is_none());
     assert_eq!(get_body["data"]["status"], "running");
-    assert_eq!(get_body["data"]["stage"], "validating");
+    assert_eq!(get_body["data"]["stage"], "rebinding");
 }
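
Patch 45's reordering still leaves a gap between the initial validation and the moment `RebindIdentity` is actually sent; the next patch closes it by re-running the connectivity check right before dispatch. A small self-contained illustration of the time-of-check/time-of-use race being guarded against (the `Agents` type is a hypothetical stand-in for `AgentManager`):

```rust
use std::collections::HashSet;

// Hypothetical stand-in for AgentManager's liveness view.
struct Agents(HashSet<String>);

impl Agents {
    fn is_online(&self, id: &str) -> bool {
        self.0.contains(id)
    }
}

fn validate(agents: &Agents, target: &str, source: &str) -> Result<(), String> {
    if agents.is_online(target) {
        return Err(format!("target {target} came back online"));
    }
    if !agents.is_online(source) {
        return Err(format!("source {source} went offline"));
    }
    Ok(())
}

fn main() {
    let mut agents = Agents(["source-1".to_string()].into_iter().collect());
    // The first check passes while the HTTP handler validates the request...
    assert!(validate(&agents, "target-1", "source-1").is_ok());
    // ...but the source can drop before RebindIdentity is dispatched, so the
    // same predicate has to be evaluated again right before sending.
    agents.0.remove("source-1");
    assert!(validate(&agents, "target-1", "source-1").is_err());
}
```
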
From 3b5b2532291ea2a99b615e7803ef8b7697016538 Mon Sep 17 00:00:00 2001
From: ZingerLittleBee <6970999@gmail.com>
Date: Thu, 16 Apr 2026 21:10:03 +0800
Subject: [PATCH 46/60] fix(server): recheck recovery start preconditions
 before dispatch

---
 .../server/src/router/api/server_recovery.rs |  7 ++
 crates/server/src/service/recovery_merge.rs  | 70 +++++++++++++++----
 2 files changed, 65 insertions(+), 12 deletions(-)

diff --git a/crates/server/src/router/api/server_recovery.rs b/crates/server/src/router/api/server_recovery.rs
index f715148a..ac6d8263 100644
--- a/crates/server/src/router/api/server_recovery.rs
+++ b/crates/server/src/router/api/server_recovery.rs
@@ -249,6 +249,13 @@ async fn start_recovery_merge_with_sender(
     let txn = state.db.begin().await?;
     let job = RecoveryMergeService::start_on_txn(&txn, target_id, source_server_id).await?;
+    if let Err(error) =
+        RecoveryMergeService::validate_dispatch_preconditions(state, target_id, source_server_id)
+            .await
+    {
+        txn.rollback().await?;
+        return Err(error);
+    }
     let token = RecoveryMergeService::rotate_target_token_on_txn(&txn, target_id).await?;

diff --git a/crates/server/src/service/recovery_merge.rs b/crates/server/src/service/recovery_merge.rs
index c1e06521..db4c5ce6 100644
--- a/crates/server/src/service/recovery_merge.rs
+++ b/crates/server/src/service/recovery_merge.rs
@@ -132,19 +132,27 @@ impl RecoveryMergeService {
             .await?
             .ok_or_else(|| AppError::NotFound("Server not found".to_string()))?;

-        if state.agent_manager.is_online(&target.id) {
-            return Err(AppError::Conflict(
-                "Target server must be offline before starting recovery".to_string(),
-            ));
-        }
-
-        if !state.agent_manager.is_online(&source.id) {
-            return Err(AppError::Conflict(
-                "Source server must be online before starting recovery".to_string(),
-            ));
-        }
+        Self::validate_connectivity_preconditions(
+            state,
+            &target.id,
+            &source.id,
+            "Target server must be offline before starting recovery",
+            "Source server must be online before starting recovery",
+        )
+    }

-        Ok(())
+    pub async fn validate_dispatch_preconditions(
+        state: &Arc<AppState>,
+        target_server_id: &str,
+        source_server_id: &str,
+    ) -> Result<(), AppError> {
+        Self::validate_connectivity_preconditions(
+            state,
+            target_server_id,
+            source_server_id,
+            "Recovery start aborted because target server came back online before dispatch",
+            "Recovery start aborted because source server went offline before dispatch",
+        )
     }

     async fn start_on_db(
@@ -351,6 +359,24 @@ impl RecoveryMergeService {
         Ok(recovery_job::Entity::find_by_id(job_id).one(db).await?)
     }

+    fn validate_connectivity_preconditions(
+        state: &Arc<AppState>,
+        target_server_id: &str,
+        source_server_id: &str,
+        target_online_message: &str,
+        source_offline_message: &str,
+    ) -> Result<(), AppError> {
+        if state.agent_manager.is_online(target_server_id) {
+            return Err(AppError::Conflict(target_online_message.to_string()));
+        }
+
+        if !state.agent_manager.is_online(source_server_id) {
+            return Err(AppError::Conflict(source_offline_message.to_string()));
+        }
+
+        Ok(())
+    }
+
     async fn handle_rebind_ack_on_db(
         db: &DatabaseConnection,
         job_id: &str,
@@ -889,4 +915,24 @@ mod tests {
         assert_eq!(loaded.stage, RECOVERY_STAGE_AWAITING_TARGET_ONLINE);
         assert_eq!(loaded.status, "running");
     }
+
+    #[tokio::test]
+    async fn dispatch_validation_rejects_stale_source_offline_state() {
+        let (state, _tmp) = test_state_with_servers().await;
+        mark_online(&state, "source-1");
+
+        RecoveryMergeService::validate_start_request(&state, "target-1", "source-1")
+            .await
+            .expect("initial start validation should succeed");
+
+        state.agent_manager.remove_connection("source-1");
+
+        let result =
+            RecoveryMergeService::validate_dispatch_preconditions(&state, "target-1", "source-1")
+                .await;
+
+        assert!(
+            matches!(result, Err(AppError::Conflict(message)) if message.contains("went offline before dispatch"))
+        );
+    }
 }
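
Patch 46 rolls the transaction back when the recheck fails, which is safe because nothing has left the server yet. The next patch changes the failure handling after dispatch: once the message may have reached the agent, the rotated token and the job row must survive, so the job is committed first and marked failed on error instead of being rolled back. A self-contained sketch of that ordering (all types and helpers here are stubs invented for illustration):

```rust
#[derive(Default)]
struct Store {
    committed: bool,
    failed: Option<String>,
}

impl Store {
    fn commit_job(&mut self) {
        self.committed = true; // stands in for txn.commit()
    }
    fn mark_failed(&mut self, err: &str) {
        self.failed = Some(err.to_string()); // stands in for marking the job failed
    }
}

fn dispatch(agent_reachable: bool) -> Result<(), &'static str> {
    if agent_reachable { Ok(()) } else { Err("channel closed") }
}

fn start(store: &mut Store, agent_reachable: bool) -> Result<(), String> {
    store.commit_job(); // 1. make the job and rotated token durable first
    if let Err(e) = dispatch(agent_reachable) {
        // 2. the agent may already hold the new token, so record the failure
        //    rather than erasing the job; the admin can retry from here
        store.mark_failed(e);
        return Err(e.to_string());
    }
    Ok(())
}

fn main() {
    let mut store = Store::default();
    assert!(start(&mut store, false).is_err());
    assert!(store.committed && store.failed.is_some()); // job survives for retry
}
```
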
From 7ffad3e214f84c0666ea99ced67268d6cb09bcc3 Mon Sep 17 00:00:00 2001
From: ZingerLittleBee <6970999@gmail.com>
Date: Thu, 16 Apr 2026 21:11:42 +0800
Subject: [PATCH 47/60] fix(server): harden recovery rebind dispatch safety

---
 .../server/src/router/api/server_recovery.rs | 92 ++++++++++++++---
 crates/server/src/service/recovery_merge.rs  | 25 ++++-
 2 files changed, 102 insertions(+), 15 deletions(-)

diff --git a/crates/server/src/router/api/server_recovery.rs b/crates/server/src/router/api/server_recovery.rs
index ac6d8263..1004f025 100644
--- a/crates/server/src/router/api/server_recovery.rs
+++ b/crates/server/src/router/api/server_recovery.rs
@@ -15,7 +15,7 @@ use crate::entity::{recovery_job, server};
 use crate::error::{ApiResponse, AppError, ok};
 use crate::router::ws::browser::broadcast_recovery_update;
 use crate::service::recovery_job::RecoveryJobService;
-use crate::service::recovery_merge::RecoveryMergeService;
+use crate::service::recovery_merge::{RECOVERY_STAGE_REBINDING, RecoveryMergeService};
 use crate::state::AppState;
@@ -257,6 +257,7 @@ async fn start_recovery_merge_with_sender(
         return Err(error);
     }
     let token = RecoveryMergeService::rotate_target_token_on_txn(&txn, target_id).await?;
+    txn.commit().await?;

     if let Err(error) = sender
         .send(ServerMessage::RebindIdentity {
@@ -266,13 +267,13 @@ async fn start_recovery_merge_with_sender(
         })
         .await
     {
-        txn.rollback().await?;
-        return Err(AppError::Internal(format!(
-            "Failed to dispatch RebindIdentity to source agent: {error}"
-        )));
+        let message = format!("Failed to dispatch RebindIdentity to source agent: {error}");
+        RecoveryJobService::mark_failed(&state.db, &job.job_id, RECOVERY_STAGE_REBINDING, &message)
+            .await?;
+        // Reuse the message instead of re-formatting the same string.
+        return Err(AppError::Internal(message));
     }
-
-    txn.commit().await?;
     Ok(job)
 }
@@ -450,7 +451,12 @@ mod tests {
     use tokio::sync::mpsc;
     use tokio::time::{Duration, timeout};

-    async fn insert_server(db: &sea_orm::DatabaseConnection, id: &str, name: &str) {
+    async fn insert_server_with_protocol(
+        db: &sea_orm::DatabaseConnection,
+        id: &str,
+        name: &str,
+        protocol_version: i32,
+    ) {
         let now = Utc::now();
         let token_hash = AuthService::hash_password("test").unwrap();
         server::ActiveModel {
@@ -461,7 +467,7 @@ mod tests {
             weight: Set(0),
             hidden: Set(false),
             capabilities: Set(CAP_DEFAULT as i32),
-            protocol_version: Set(1),
+            protocol_version: Set(protocol_version),
             created_at: Set(now),
             updated_at: Set(now),
             ..Default::default()
@@ -471,6 +477,10 @@ mod tests {
         .unwrap();
     }

+    async fn insert_server(db: &sea_orm::DatabaseConnection, id: &str, name: &str) {
+        insert_server_with_protocol(db, id, name, 4).await;
+    }
+
     fn test_addr() -> SocketAddr {
         SocketAddr::new(IpAddr::V4(Ipv4Addr::new(127, 0, 0, 1)), 9527)
     }
@@ -508,6 +518,7 @@ mod tests {
         state
             .agent_manager
             .add_connection("source-1".into(), "Source".into(), tx, test_addr());
+        state.agent_manager.set_protocol_version("source-1", 4);

         let Json(response) = start_recovery_merge(
             State(Arc::clone(&state)),
@@ -539,6 +550,7 @@ mod tests {
         state
             .agent_manager
             .add_connection("source-1".into(), "Source".into(), tx, test_addr());
+        state.agent_manager.set_protocol_version("source-1", 4);

         let Json(response) = start_recovery_merge(
             State(Arc::clone(&state)),
@@ -616,7 +628,58 @@ mod tests {
     }

     #[tokio::test]
-    async fn start_recovery_merge_fails_safely_when_dispatch_fails() {
+    async fn start_recovery_merge_rejects_unsupported_source_protocol() {
+        let (db, _tmp) = setup_test_db().await;
+        insert_server(&db, "target-1", "Target").await;
+        insert_server_with_protocol(&db, "source-1", "Source", 3).await;
+        let state = AppState::new(db.clone(), AppConfig::default())
+            .await
+            .unwrap();
+
+        let (tx, mut rx) = mpsc::channel(1);
+        state
+            .agent_manager
+            .add_connection("source-1".into(), "Source".into(), tx, test_addr());
+        state.agent_manager.set_protocol_version("source-1", 3);
+
+        let before = server::Entity::find_by_id("target-1")
+            .one(&db)
+            .await
+            .unwrap()
+            .unwrap();
+
+        let error = start_recovery_merge(
+            State(Arc::clone(&state)),
+            Path("target-1".to_string()),
+            Json(StartRecoveryRequest {
+                source_server_id: "source-1".to_string(),
+            }),
+        )
+        .await
+        .expect_err("protocol v3 source should be rejected");
+
+        assert!(
+            matches!(error, AppError::Conflict(message) if message.contains("protocol v4+"))
+        );
+
+        assert!(
+            timeout(Duration::from_millis(100), rx.recv()).await.is_err(),
+            "unsupported source should not receive a rebind dispatch"
+        );
+
+        let after = server::Entity::find_by_id("target-1")
+            .one(&db)
+            .await
+            .unwrap()
+            .unwrap();
+        assert_eq!(after.token_prefix, before.token_prefix);
+        assert_eq!(after.token_hash, before.token_hash);
+        assert!(recovery_job::Entity::find().all(&db).await.unwrap().is_empty());
+    }
+
+    #[tokio::test]
+    async fn start_recovery_merge_persists_state_before_dispatch_failure() {
         let (db, _tmp) = setup_test_db().await;
         insert_server(&db, "target-1", "Target").await;
         insert_server(&db, "source-1", "Source").await;
@@ -629,6 +692,7 @@ mod tests {
         state
             .agent_manager
             .add_connection("source-1".into(), "Source".into(), tx, test_addr());
+        state.agent_manager.set_protocol_version("source-1", 4);

         let before = server::Entity::find_by_id("target-1")
             .one(&db)
             .await
             .unwrap()
             .unwrap();
@@ -647,7 +711,8 @@ mod tests {
         .expect_err("dispatch failure should fail safely");

         assert!(
-            matches!(error, AppError::Internal(message) if message.contains("Failed to dispatch RebindIdentity"))
+            matches!(error, AppError::Internal(ref message) if message.contains("Failed to dispatch RebindIdentity")),
+            "unexpected error: {error:?}"
         );

         let after = server::Entity::find_by_id("target-1")
@@ -655,13 +720,12 @@ mod tests {
             .await
             .unwrap()
             .unwrap();
-        assert_eq!(after.token_prefix, before.token_prefix);
-        assert_eq!(after.token_hash, before.token_hash);
+        assert_ne!(after.token_prefix, before.token_prefix);
+        assert_ne!(after.token_hash, before.token_hash);

         let jobs = recovery_job::Entity::find().all(&db).await.unwrap();
-        assert!(
-            jobs.is_empty(),
-            "no recovery job should remain after failed dispatch"
-        );
+        assert_eq!(jobs.len(), 1, "recovery job state should stay committed");
+        assert_eq!(jobs[0].target_server_id, "target-1");
+        assert_eq!(jobs[0].source_server_id, "source-1");
     }
 }

diff --git a/crates/server/src/service/recovery_merge.rs b/crates/server/src/service/recovery_merge.rs
index db4c5ce6..9616311e 100644
--- a/crates/server/src/service/recovery_merge.rs
+++ b/crates/server/src/service/recovery_merge.rs
@@ -16,6 +16,7 @@
 pub const RECOVERY_STAGE_VALIDATING: &str = "validating";
 pub const RECOVERY_STAGE_REBINDING: &str = "rebinding";
 pub const RECOVERY_STAGE_AWAITING_TARGET_ONLINE: &str = "awaiting_target_online";
+pub const REBIND_IDENTITY_MIN_PROTOCOL_VERSION: u32 = 4;

 pub struct RecoveryMergeService;
@@ -138,7 +139,10 @@ impl RecoveryMergeService {
             "Target server must be offline before starting recovery",
             "Source server must be online before starting recovery",
-        )
+        )?;
+
+        Self::validate_rebind_identity_protocol(state, &source)?;
+        Ok(())
     }

     pub async fn validate_dispatch_preconditions(
@@ -359,6 +363,25 @@ impl RecoveryMergeService {
         Ok(recovery_job::Entity::find_by_id(job_id).one(db).await?)
     }

+    fn validate_rebind_identity_protocol(
+        state: &Arc<AppState>,
+        source: &server::Model,
+    ) -> Result<(), AppError> {
+        let protocol_version = state
+            .agent_manager
+            .get_protocol_version(&source.id)
+            .unwrap_or(source.protocol_version as u32);
+
+        if protocol_version < REBIND_IDENTITY_MIN_PROTOCOL_VERSION {
+            return Err(AppError::Conflict(format!(
+                "Source server must support RebindIdentity (protocol v{}+ required)",
+                REBIND_IDENTITY_MIN_PROTOCOL_VERSION
+            )));
+        }
+
+        Ok(())
+    }
+
     fn validate_connectivity_preconditions(
         state: &Arc<AppState>,
         target_server_id: &str,
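
Patch 47 also gates the whole flow on the source agent speaking a protocol version that understands `RebindIdentity`, preferring the live connection's negotiated version over the persisted row. A minimal runnable sketch of that gate (the free function is illustrative; the real check lives on `RecoveryMergeService`):

```rust
const REBIND_IDENTITY_MIN_PROTOCOL_VERSION: u32 = 4;

// Prefer the handshake-negotiated version; fall back to the stored row.
fn rebind_supported(live_version: Option<u32>, persisted_version: i32) -> bool {
    live_version.unwrap_or(persisted_version as u32) >= REBIND_IDENTITY_MIN_PROTOCOL_VERSION
}

fn main() {
    assert!(rebind_supported(Some(4), 1));  // live handshake wins over a stale row
    assert!(!rebind_supported(Some(3), 4)); // a stale DB value cannot upgrade it
    assert!(rebind_supported(None, 4));     // no live value: use the persisted one
}
```
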
From 1824b38f74d402c13145050d9ba3c928f9cb4c36 Mon Sep 17 00:00:00 2001
From: ZingerLittleBee <6970999@gmail.com>
Date: Thu, 16 Apr 2026 21:13:18 +0800
Subject: [PATCH 48/60] test(server): prime recovery source before start flow

---
 crates/server/tests/integration.rs | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/crates/server/tests/integration.rs b/crates/server/tests/integration.rs
index 7b015f47..39409479 100644
--- a/crates/server/tests/integration.rs
+++ b/crates/server/tests/integration.rs
@@ -4016,8 +4016,9 @@ async fn test_recovery_job_get_requires_admin_and_start_creates_job() {
     let (target_id, _target_token) = register_agent(&auth_client, &base_url).await;
     let (source_id, source_token) = register_agent(&auth_client, &base_url).await;

-    let (_sink, mut reader) = connect_agent(&base_url, &source_token).await;
+    let (mut sink, mut reader) = connect_agent(&base_url, &source_token).await;
     let _welcome = recv_agent_text(&mut reader).await;
+    send_system_info(&mut sink, &mut reader, "recovery-source-info", None).await;

     let start_resp = auth_client
         .post(format!(

From 8ebea2a076e1b0336160966a7e6481c76dc3f3e8 Mon Sep 17 00:00:00 2001
From: ZingerLittleBee <6970999@gmail.com>
Date: Thu, 16 Apr 2026 21:14:56 +0800
Subject: [PATCH 49/60] feat(server): merge recovered server history into
 target identity

---
 crates/server/src/service/recovery_merge.rs | 632 +++++++++++++++++++-
 crates/server/src/service/traffic.rs        |  75 +++
 2 files changed, 699 insertions(+), 8 deletions(-)

diff --git a/crates/server/src/service/recovery_merge.rs b/crates/server/src/service/recovery_merge.rs
index 9616311e..99fbeab6 100644
--- a/crates/server/src/service/recovery_merge.rs
+++ b/crates/server/src/service/recovery_merge.rs
@@ -1,16 +1,18 @@
 use std::sync::Arc;

 use chrono::Utc;
+use sea_orm::DatabaseBackend;
 use sea_orm::prelude::Expr;
 use sea_orm::{
     ActiveModelTrait, ColumnTrait, ConnectionTrait, DatabaseConnection, DatabaseTransaction,
-    EntityTrait, QueryFilter,
+    EntityTrait, QueryFilter, Statement,
 };

-use crate::entity::{recovery_job, server};
+use crate::entity::{network_probe_config, recovery_job, server, server_tag};
 use crate::error::AppError;
 use crate::service::auth::AuthService;
 use crate::service::recovery_job::RecoveryJobService;
+use crate::service::traffic::TrafficService;
 use crate::state::AppState;
@@ -494,6 +496,315 @@ impl RecoveryMergeService {
             transitioned: true,
         })
     }
+
+    #[cfg_attr(not(test), allow(dead_code))]
+    pub(crate) async fn merge_server_history_on_db(
+        db: &DatabaseConnection,
+        target_server_id: &str,
+        source_server_id: &str,
+    ) -> Result<(), AppError> {
+        Self::merge_raw_table(db, "records", "time", target_server_id, source_server_id).await?;
+        Self::merge_raw_table(db, "gpu_records", "time", target_server_id, source_server_id)
+            .await?;
+        Self::merge_raw_table(db, "ping_records", "time", target_server_id, source_server_id)
+            .await?;
+        Self::merge_raw_table(
+            db,
+            "task_results",
+            "finished_at",
+            target_server_id,
+            source_server_id,
+        )
+        .await?;
+        Self::merge_raw_table(
+            db,
+            "network_probe_record",
+            "timestamp",
+            target_server_id,
+            source_server_id,
+        )
+        .await?;
+        Self::merge_raw_table(
+            db,
+            "docker_event",
+            "timestamp",
+            target_server_id,
+            source_server_id,
+        )
+        .await?;
+
+        Self::merge_unique_key_table(
+            db,
+            "records_hourly",
+            &["time"],
+            target_server_id,
+            source_server_id,
+        )
+        .await?;
+        Self::merge_unique_key_table(
+            db,
+            "network_probe_record_hourly",
+            &["target_id", "hour"],
+            target_server_id,
+            source_server_id,
+        )
+        .await?;
+        TrafficService::merge_recovered_server_history(db, target_server_id, source_server_id)
+            .await?;
+        Self::merge_unique_key_table(
+            db,
+            "uptime_daily",
+            &["date"],
+            target_server_id,
+            source_server_id,
+        )
+        .await?;
+        Self::merge_alert_states(db, target_server_id, source_server_id).await?;
+        Self::rewrite_server_ids_json_tables(db, target_server_id, source_server_id).await?;
+
+        Ok(())
+    }
+
+    #[cfg_attr(not(test), allow(dead_code))]
+    pub(crate) async fn merge_raw_table(
+        db: &DatabaseConnection,
+        table: &str,
+        time_column: &str,
+        target_server_id: &str,
+        source_server_id: &str,
+    ) -> Result<(), AppError> {
+        db.execute(Statement::from_sql_and_values(
+            db.get_database_backend(),
+            format!(
+                "DELETE FROM {table} \
+                 WHERE server_id = $1 \
+                 AND (SELECT MIN({time_column}) FROM {table} WHERE server_id = $2) IS NOT NULL \
+                 AND {time_column} >= (SELECT MIN({time_column}) FROM {table} WHERE server_id = $2) \
+                 AND {time_column} <= (SELECT MAX({time_column}) FROM {table} WHERE server_id = $2)"
+            ),
+            [target_server_id.into(), source_server_id.into()],
+        ))
+        .await?;
+
+        db.execute(Statement::from_sql_and_values(
+            db.get_database_backend(),
+            format!("UPDATE {table} SET server_id = $1 WHERE server_id = $2"),
+            [target_server_id.into(), source_server_id.into()],
+        ))
+        .await?;
+
+        Ok(())
+    }
+
+    #[cfg_attr(not(test), allow(dead_code))]
+    pub(crate) async fn merge_unique_key_table(
+        db: &DatabaseConnection,
+        table: &str,
+        key_columns: &[&str],
+        target_server_id: &str,
+        source_server_id: &str,
+    ) -> Result<(), AppError> {
+        TrafficService::replace_unique_key_table_server_id(
+            db,
+            table,
+            key_columns,
+            target_server_id,
+            source_server_id,
+        )
+        .await
+    }
+
+    #[cfg_attr(not(test), allow(dead_code))]
+    pub(crate) async fn merge_alert_states(
+        db: &DatabaseConnection,
+        target_server_id: &str,
+        source_server_id: &str,
+    ) -> Result<(), AppError> {
+        db.execute(Statement::from_sql_and_values(
+            db.get_database_backend(),
+            "DELETE FROM alert_states AS source \
+             WHERE source.server_id = $1 \
+             AND EXISTS ( \
+                 SELECT 1 FROM alert_states AS target \
+                 WHERE target.server_id = $2 AND target.rule_id = source.rule_id \
+             )",
+            [source_server_id.into(), target_server_id.into()],
+        ))
+        .await?;
+
+        db.execute(Statement::from_sql_and_values(
+            db.get_database_backend(),
+            "UPDATE alert_states SET server_id = $1 WHERE server_id = $2",
+            [target_server_id.into(), source_server_id.into()],
+        ))
+        .await?;
+
+        Ok(())
+    }
+
+    #[cfg_attr(not(test), allow(dead_code))]
+    pub(crate) async fn rewrite_server_ids_json_tables(
+        db: &DatabaseConnection,
+        target_server_id: &str,
+        source_server_id: &str,
+    ) -> Result<(), AppError> {
+        let tables = [
+            ("alert_rules", "server_ids_json", true),
+            ("ping_tasks", "server_ids_json", false),
+            ("tasks", "server_ids_json", false),
+            ("service_monitor", "server_ids_json", true),
+            ("maintenance", "server_ids_json", true),
+            ("incident", "server_ids_json", true),
+            ("status_page", "server_ids_json", false),
+        ];
+
+        for (table, column, nullable) in tables {
+            Self::rewrite_server_ids_json_table(
+                db,
+                table,
+                column,
+                nullable,
+                target_server_id,
+                source_server_id,
+            )
+            .await?;
+        }
+
+        Ok(())
+    }
+
+    #[cfg_attr(not(test), allow(dead_code))]
+    async fn rewrite_server_ids_json_table(
+        db: &DatabaseConnection,
+        table: &str,
+        column: &str,
+        nullable: bool,
+        target_server_id: &str,
+        source_server_id: &str,
+    ) -> Result<(), AppError> {
+        let rows = db
+            .query_all(Statement::from_sql_and_values(
+                DatabaseBackend::Sqlite,
+                format!("SELECT id, {column} FROM {table} WHERE {column} LIKE '%' || $1 || '%'"),
+                [source_server_id.into()],
+            ))
+            .await?;
+
+        for row in rows {
+            let id: String = row.try_get_by_index(0)?;
+            let current: Option<String> = row.try_get_by_index(1)?;
+            let Some(current) = current else {
+                continue;
+            };
+
+            let rewritten =
+                Self::rewrite_server_ids_json_value(&current, target_server_id, source_server_id)?;
+            if rewritten.as_deref() == Some(current.as_str()) {
+                continue;
+            }
+
+            // `nullable` does not affect the written value yet: nullable and
+            // non-nullable columns alike are rewritten to a JSON array,
+            // "[]" when the rewritten list would otherwise be empty.
+            let _ = nullable;
+            let value = rewritten.unwrap_or_else(|| "[]".to_string()).into();
+
+            db.execute(Statement::from_sql_and_values(
+                DatabaseBackend::Sqlite,
+                format!("UPDATE {table} SET {column} = $1 WHERE id = $2"),
+                [value, id.into()],
+            ))
+            .await?;
+        }
+
+        Ok(())
+    }
+
+    #[cfg_attr(not(test), allow(dead_code))]
+    pub(crate) fn rewrite_server_ids_json_value(
+        current: &str,
+        target_server_id: &str,
+        source_server_id: &str,
+    ) -> Result<Option<String>, AppError> {
+        let ids: Vec<String> = serde_json::from_str(current).map_err(|error| {
+            AppError::Internal(format!(
+                "Failed to parse server_ids_json during recovery merge: {error}"
+            ))
+        })?;
+
+        let mut rewritten = Vec::new();
+        for id in ids {
+            let next = if id == source_server_id {
+                target_server_id.to_string()
+            } else {
+                id
+            };
+            if !rewritten.iter().any(|existing| existing == &next) {
+                rewritten.push(next);
+            }
+        }
+
+        serde_json::to_string(&rewritten)
+            .map(Some)
+            .map_err(|error| {
+                AppError::Internal(format!(
+                    "Failed to serialize server_ids_json during recovery merge: {error}"
+                ))
+            })
+    }
+
+    #[cfg_attr(not(test), allow(dead_code))]
+    pub(crate) async fn finalize_target_server_row(
+        db: &DatabaseConnection,
+        target_server_id: &str,
+        source: &server::Model,
+    ) -> Result<(), AppError> {
+        let target = server::Entity::find_by_id(target_server_id)
+            .one(db)
+            .await?
+            .ok_or_else(|| AppError::NotFound("Server not found".to_string()))?;
+
+        let mut active: server::ActiveModel = target.into();
+        active.cpu_name = sea_orm::Set(source.cpu_name.clone());
+        active.cpu_cores = sea_orm::Set(source.cpu_cores);
+        active.cpu_arch = sea_orm::Set(source.cpu_arch.clone());
+        active.os = sea_orm::Set(source.os.clone());
+        active.kernel_version = sea_orm::Set(source.kernel_version.clone());
+        active.mem_total = sea_orm::Set(source.mem_total);
+        active.swap_total = sea_orm::Set(source.swap_total);
+        active.disk_total = sea_orm::Set(source.disk_total);
+        active.ipv4 = sea_orm::Set(source.ipv4.clone());
+        active.ipv6 = sea_orm::Set(source.ipv6.clone());
+        active.region = sea_orm::Set(source.region.clone());
+        active.country_code = sea_orm::Set(source.country_code.clone());
+        active.virtualization = sea_orm::Set(source.virtualization.clone());
+        active.agent_version = sea_orm::Set(source.agent_version.clone());
+        active.protocol_version = sea_orm::Set(source.protocol_version);
+        active.features = sea_orm::Set(source.features.clone());
+        active.updated_at = sea_orm::Set(Utc::now());
+        active.update(db).await?;
+
+        Ok(())
+    }
+
+    #[cfg_attr(not(test), allow(dead_code))]
+    pub(crate) async fn delete_intentionally_unmerged_source_rows(
+        db: &DatabaseConnection,
+        source_server_id: &str,
+    ) -> Result<(), AppError> {
+        server_tag::Entity::delete_many()
+            .filter(server_tag::Column::ServerId.eq(source_server_id))
+            .exec(db)
+            .await?;
+        network_probe_config::Entity::delete_many()
+            .filter(network_probe_config::Column::ServerId.eq(source_server_id))
+            .exec(db)
+            .await?;
+
+        Ok(())
+    }
 }

 pub fn recovery_phase_for_stage(stage: &str) -> Option<RecoveryFailurePhase> {
@@ -527,19 +838,23 @@
 #[cfg(test)]
 mod tests {
     use super::{
-        RECOVERY_STAGE_AWAITING_TARGET_ONLINE, RECOVERY_STAGE_REBINDING, RecoveryFailurePhase,
-        RecoveryMergeService, RecoveryRetryStrategy, is_pre_rebind_stage, recovery_phase_for_stage,
+        REBIND_IDENTITY_MIN_PROTOCOL_VERSION, RECOVERY_STAGE_AWAITING_TARGET_ONLINE,
+        RECOVERY_STAGE_REBINDING, RecoveryFailurePhase, RecoveryMergeService,
+        RecoveryRetryStrategy, is_pre_rebind_stage, recovery_phase_for_stage,
         retry_strategy_for_phase, retry_strategy_for_stage,
     };
     use crate::config::AppConfig;
-    use crate::entity::server;
+    use crate::entity::{
+        alert_rule, alert_state, record, server, server_tag, service_monitor, traffic_daily,
+        traffic_hourly, traffic_state,
+    };
     use crate::error::AppError;
     use crate::service::auth::AuthService;
     use crate::service::recovery_job::RecoveryJobService;
     use crate::state::AppState;
     use crate::test_utils::setup_test_db;
-    use chrono::Utc;
-    use sea_orm::{ActiveModelTrait, DatabaseConnection, Set};
+    use chrono::{NaiveDate, Utc};
+    use sea_orm::{ActiveModelTrait, ColumnTrait, DatabaseConnection, EntityTrait, QueryFilter, Set};
     use serverbee_common::constants::CAP_DEFAULT;
     use std::net::{IpAddr, Ipv4Addr, SocketAddr};
     use std::sync::Arc;
@@ -557,7 +872,7 @@ mod tests {
             weight: Set(0),
             hidden: Set(false),
             capabilities: Set(CAP_DEFAULT as i32),
-            protocol_version: Set(1),
+            protocol_version: Set(REBIND_IDENTITY_MIN_PROTOCOL_VERSION as i32),
             created_at: Set(now),
             updated_at: Set(now),
             ..Default::default()
@@ -590,6 +905,42 @@ mod tests {
             tx,
             test_addr(),
         );
+        state
+            .agent_manager
+            .set_protocol_version(server_id, REBIND_IDENTITY_MIN_PROTOCOL_VERSION);
+    }
+
+    async fn insert_record(
+        db: &DatabaseConnection,
+        server_id: &str,
+        time: chrono::DateTime<Utc>,
+        cpu: f64,
+    ) {
+        record::ActiveModel {
+            server_id: Set(server_id.to_string()),
+            time: Set(time),
+            cpu: Set(cpu),
+            mem_used: Set(1),
+            swap_used: Set(1),
+            disk_used: Set(1),
+            net_in_speed: Set(1),
+            net_out_speed: Set(1),
+            net_in_transfer: Set(1),
+            net_out_transfer: Set(1),
+            load1: Set(1.0),
+            load5: Set(1.0),
+            load15: Set(1.0),
+            tcp_conn: Set(1),
+            udp_conn: Set(1),
+            process_count: Set(1),
+            temperature: Set(None),
+            gpu_usage: Set(None),
+            disk_io_json: Set(None),
+            ..Default::default()
+        }
+        .insert(db)
+        .await
+        .expect("insert record should succeed");
     }

     #[test]
@@ -958,4 +1309,269 @@ mod tests {
             matches!(result, Err(AppError::Conflict(message)) if message.contains("went offline before dispatch"))
         );
     }
+
+    #[tokio::test]
+    async fn merge_raw_records_replaces_target_overlap_with_source() {
+        let (db, _tmp) = setup_test_db().await;
+        insert_test_server(&db, "target-1", "Target").await;
+        insert_test_server(&db, "source-1", "Source").await;
+
+        let before_overlap = NaiveDate::from_ymd_opt(2026, 4, 16)
+            .unwrap()
+            .and_hms_opt(9, 0, 0)
+            .unwrap()
+            .and_utc();
+        let overlap_start = NaiveDate::from_ymd_opt(2026, 4, 16)
+            .unwrap()
+            .and_hms_opt(10, 0, 0)
+            .unwrap()
+            .and_utc();
+        let overlap_end = NaiveDate::from_ymd_opt(2026, 4, 16)
+            .unwrap()
+            .and_hms_opt(11, 0, 0)
+            .unwrap()
+            .and_utc();
+
+        insert_record(&db, "target-1", before_overlap, 10.0).await;
+        insert_record(&db, "target-1", overlap_start, 20.0).await;
+        insert_record(&db, "target-1", overlap_end, 30.0).await;
+        insert_record(&db, "source-1", overlap_start, 200.0).await;
+        insert_record(&db, "source-1", overlap_end, 300.0).await;
+
+        RecoveryMergeService::merge_server_history_on_db(&db, "target-1", "source-1")
+            .await
+            .unwrap();
+
+        let target_rows = record::Entity::find()
+            .filter(record::Column::ServerId.eq("target-1"))
+            .all(&db)
+            .await
+            .unwrap();
+        assert_eq!(target_rows.len(), 3);
+        assert!(target_rows.iter().any(|row| row.time == before_overlap && row.cpu == 10.0));
+        assert!(target_rows.iter().any(|row| row.time == overlap_start && row.cpu == 200.0));
+        assert!(target_rows.iter().any(|row| row.time == overlap_end && row.cpu == 300.0));
+
+        let source_rows = record::Entity::find()
+            .filter(record::Column::ServerId.eq("source-1"))
+            .all(&db)
+            .await
+            .unwrap();
+        assert!(source_rows.is_empty());
+    }
+
+    #[tokio::test]
+    async fn merge_alert_state_keeps_target_when_rule_conflicts() {
+        let (db, _tmp) = setup_test_db().await;
+        insert_test_server(&db, "target-1", "Target").await;
+        insert_test_server(&db, "source-1", "Source").await;
+
+        let now = Utc::now();
+        alert_state::ActiveModel {
+            rule_id: Set("rule-1".to_string()),
+            server_id: Set("target-1".to_string()),
+            first_triggered_at: Set(now),
+            last_notified_at: Set(now),
+            count: Set(5),
+            resolved: Set(false),
+            resolved_at: Set(None),
+            updated_at: Set(now),
+            ..Default::default()
+        }
+        .insert(&db)
+        .await
+        .unwrap();
+        alert_state::ActiveModel {
+            rule_id: Set("rule-1".to_string()),
+            server_id: Set("source-1".to_string()),
+            first_triggered_at: Set(now),
+            last_notified_at: Set(now),
+            count: Set(1),
+            resolved: Set(true),
+            resolved_at: Set(Some(now)),
+            updated_at: Set(now),
+            ..Default::default()
+        }
+        .insert(&db)
+        .await
+        .unwrap();
+
+        RecoveryMergeService::merge_server_history_on_db(&db, "target-1", "source-1")
+            .await
+            .unwrap();
+
+        let target_states = alert_state::Entity::find()
+            .filter(alert_state::Column::ServerId.eq("target-1"))
+            .all(&db)
+            .await
+            .unwrap();
+        assert_eq!(target_states.len(), 1);
+        assert_eq!(target_states[0].rule_id, "rule-1");
+        assert_eq!(target_states[0].count, 5);
+        assert!(!target_states[0].resolved);
+
+        let source_states = alert_state::Entity::find()
+            .filter(alert_state::Column::ServerId.eq("source-1"))
+            .all(&db)
+            .await
+            .unwrap();
+        assert!(source_states.is_empty());
+    }
+
+    #[tokio::test]
+    async fn rewrite_server_ids_json_replaces_source_with_target_once() {
+        let (db, _tmp) = setup_test_db().await;
+        insert_test_server(&db, "target-1", "Target").await;
+        insert_test_server(&db, "source-1", "Source").await;
+        let now = Utc::now();
+
+        alert_rule::ActiveModel {
+            id: Set("rule-1".to_string()),
+            name: Set("rule".to_string()),
+            enabled: Set(true),
+            rules_json: Set("[]".to_string()),
+            trigger_mode: Set("any".to_string()),
+            notification_group_id: Set(None),
+            fail_trigger_tasks: Set(None),
+            recover_trigger_tasks: Set(None),
+            cover_type: Set("include".to_string()),
+            server_ids_json: Set(Some(r#"["target-1","source-1","source-1"]"#.to_string())),
+            created_at: Set(now),
+            updated_at: Set(now),
+        }
+        .insert(&db)
+        .await
+        .unwrap();
+
+        service_monitor::ActiveModel {
+            id: Set("monitor-1".to_string()),
+            name: Set("monitor".to_string()),
+            monitor_type: Set("http".to_string()),
+            target: Set("https://example.com".to_string()),
+            interval: Set(60),
+            config_json: Set("{}".to_string()),
+            notification_group_id: Set(None),
+            retry_count: Set(0),
+            server_ids_json: Set(Some(r#"["source-1","target-1","source-1"]"#.to_string())),
+            enabled: Set(true),
+            last_status: Set(None),
+            consecutive_failures: Set(0),
+            last_checked_at: Set(None),
+            created_at: Set(now),
+            updated_at: Set(now),
+        }
+        .insert(&db)
+        .await
+        .unwrap();
+
+        RecoveryMergeService::rewrite_server_ids_json_tables(&db, "target-1", "source-1")
+            .await
+            .unwrap();
+
+        let rule = alert_rule::Entity::find_by_id("rule-1")
+            .one(&db)
+            .await
+            .unwrap()
+            .unwrap();
+        assert_eq!(rule.server_ids_json.as_deref(), Some(r#"["target-1"]"#));
+
+        let monitor = service_monitor::Entity::find_by_id("monitor-1")
+            .one(&db)
+            .await
+            .unwrap()
+            .unwrap();
+        assert_eq!(monitor.server_ids_json.as_deref(), Some(r#"["target-1"]"#));
+    }
+
+    #[tokio::test]
+    async fn finalize_target_server_row_copies_runtime_fields_and_cleans_source_rows() {
+        let (db, _tmp) = setup_test_db().await;
+        insert_test_server(&db, "target-1", "Target").await;
+        insert_test_server(&db, "source-1", "Source").await;
+        let now = Utc::now();
+
+        let mut source: server::ActiveModel = server::Entity::find_by_id("source-1")
+            .one(&db)
+            .await
+            .unwrap()
+            .unwrap()
+            .into();
+        source.cpu_name = Set(Some("Ryzen".to_string()));
+        source.cpu_cores = Set(Some(16));
+        source.cpu_arch = Set(Some("x86_64".to_string()));
+        source.os = Set(Some("Linux".to_string()));
+        source.kernel_version = Set(Some("6.9.0".to_string()));
+        source.mem_total = Set(Some(64));
+        source.swap_total = Set(Some(32));
+        source.disk_total = Set(Some(1024));
+        source.ipv4 = Set(Some("1.2.3.4".to_string()));
+        source.ipv6 = Set(Some("::1".to_string()));
+        source.region = Set(Some("Taipei".to_string()));
+        source.country_code = Set(Some("TW".to_string()));
+        source.virtualization = Set(Some("kvm".to_string()));
+        source.agent_version = Set(Some("1.2.3".to_string()));
+        source.protocol_version = Set(4);
+        source.features = Set(r#"["docker","process"]"#.to_string());
+        let source_model = source.update(&db).await.unwrap();
+
+        server_tag::ActiveModel {
+            server_id: Set("source-1".to_string()),
+            tag: Set("temporary".to_string()),
+        }
+        .insert(&db)
+        .await
+        .unwrap();
+        traffic_hourly::ActiveModel {
+            server_id: Set("source-1".to_string()),
+            hour: Set(now),
+            bytes_in: Set(10),
+            bytes_out: Set(20),
+            ..Default::default()
+        }
+        .insert(&db)
+        .await
+        .unwrap();
+        traffic_daily::ActiveModel {
+            server_id: Set("source-1".to_string()),
+            date: Set(now.date_naive()),
+            bytes_in: Set(30),
+            bytes_out: Set(40),
+            ..Default::default()
+        }
+        .insert(&db)
+        .await
+        .unwrap();
+        traffic_state::ActiveModel {
+            server_id: Set("source-1".to_string()),
+            last_in: Set(100),
+            last_out: Set(200),
+            updated_at: Set(now),
+        }
+        .insert(&db)
+        .await
+        .unwrap();
+
+        RecoveryMergeService::finalize_target_server_row(&db, "target-1", &source_model)
+            .await
+            .unwrap();
+        RecoveryMergeService::delete_intentionally_unmerged_source_rows(&db, "source-1")
+            .await
+            .unwrap();
+
+        let target = server::Entity::find_by_id("target-1")
+            .one(&db)
+            .await
+            .unwrap()
+            .unwrap();
+        assert_eq!(target.cpu_name.as_deref(), Some("Ryzen"));
+        assert_eq!(target.protocol_version, 4);
+        assert_eq!(target.features, r#"["docker","process"]"#);
+
+        let source_tags = server_tag::Entity::find()
+            .filter(server_tag::Column::ServerId.eq("source-1"))
+            .all(&db)
+            .await
+            .unwrap();
+        assert!(source_tags.is_empty());
+    }
 }

diff --git a/crates/server/src/service/traffic.rs b/crates/server/src/service/traffic.rs
index 1ee447a9..4185bf9c 100644
--- a/crates/server/src/service/traffic.rs
+++ b/crates/server/src/service/traffic.rs
@@ -10,6 +10,81 @@ use crate::error::AppError;
 pub struct TrafficService;

 impl TrafficService {
+    pub async fn merge_recovered_server_history(
+        db: &DatabaseConnection,
+        target_server_id: &str,
+        source_server_id: &str,
+    ) -> Result<(), AppError> {
+        Self::replace_unique_key_table_server_id(
+            db,
+            "traffic_hourly",
+            &["hour"],
+            target_server_id,
+            source_server_id,
+        )
+        .await?;
+        Self::replace_unique_key_table_server_id(
+            db,
+            "traffic_daily",
+            &["date"],
+            target_server_id,
+            source_server_id,
+        )
+        .await?;
+        Self::replace_unique_key_table_server_id(
+            db,
+            "traffic_state",
+            &[],
+            target_server_id,
+            source_server_id,
+        )
+        .await?;
+
+        Ok(())
+    }
+
+    pub(crate) async fn replace_unique_key_table_server_id(
+        db: &DatabaseConnection,
+        table: &str,
+        key_columns: &[&str],
+        target_server_id: &str,
+        source_server_id: &str,
+    ) -> Result<(), AppError> {
+        let join_predicate = if key_columns.is_empty() {
+            "1 = 1".to_string()
+        } else {
+            key_columns
+                .iter()
+                .map(|column| format!("source.{column} = target.{column}"))
+                .collect::<Vec<String>>()
+                .join(" AND ")
+        };
+
+        db.execute(Statement::from_sql_and_values(
+            db.get_database_backend(),
+            format!(
+                "DELETE FROM {table} AS target \
+                 WHERE target.server_id = $1 \
+                 AND EXISTS ( \
+                     SELECT 1 FROM {table} AS source \
+                     WHERE source.server_id = $2 \
+                     AND {join_predicate} \
+                 )"
+            ),
+            [target_server_id.into(), source_server_id.into()],
+        ))
+        .await?;
+
+        db.execute(Statement::from_sql_and_values(
+            db.get_database_backend(),
+            format!("UPDATE {table} SET server_id = $1 WHERE server_id = $2"),
+            [target_server_id.into(), source_server_id.into()],
+        ))
+        .await?;
+
+        Ok(())
+    }
+
     /// Upsert a traffic_hourly row, accumulating bytes_in/bytes_out on conflict.
     pub async fn upsert_hourly(
         db: &DatabaseConnection,
pub async fn upsert_hourly( db: &DatabaseConnection, From cfeece953fb8f54882dccf5e87e90fbf4be4f5b9 Mon Sep 17 00:00:00 2001 From: ZingerLittleBee <6970999@gmail.com> Date: Thu, 16 Apr 2026 21:16:37 +0800 Subject: [PATCH 50/60] fix(server): make recovery merge atomic and preserve identity --- crates/server/src/service/recovery_merge.rs | 312 +++++++++++++++++++- 1 file changed, 296 insertions(+), 16 deletions(-) diff --git a/crates/server/src/service/recovery_merge.rs b/crates/server/src/service/recovery_merge.rs index 99fbeab6..6b3fe3e8 100644 --- a/crates/server/src/service/recovery_merge.rs +++ b/crates/server/src/service/recovery_merge.rs @@ -503,12 +503,45 @@ impl RecoveryMergeService { target_server_id: &str, source_server_id: &str, ) -> Result<(), AppError> { - Self::merge_raw_table(db, "records", "time", target_server_id, source_server_id).await?; - Self::merge_raw_table(db, "gpu_records", "time", target_server_id, source_server_id) - .await?; - Self::merge_raw_table(db, "ping_records", "time", target_server_id, source_server_id) + Self::merge_server_history_on_connection(db, target_server_id, source_server_id).await + } + + #[cfg_attr(not(test), allow(dead_code))] + pub(crate) async fn merge_server_history_on_txn( + txn: &DatabaseTransaction, + target_server_id: &str, + source_server_id: &str, + ) -> Result<(), AppError> { + Self::merge_server_history_on_connection(txn, target_server_id, source_server_id).await + } + + async fn merge_server_history_on_connection( + db: &C, + target_server_id: &str, + source_server_id: &str, + ) -> Result<(), AppError> + where + C: ConnectionTrait, + { + Self::merge_raw_table_on_connection(db, "records", "time", target_server_id, source_server_id) .await?; - Self::merge_raw_table( + Self::merge_raw_table_on_connection( + db, + "gpu_records", + "time", + target_server_id, + source_server_id, + ) + .await?; + Self::merge_raw_table_on_connection( + db, + "ping_records", + "time", + target_server_id, + source_server_id, + ) + .await?; + Self::merge_raw_table_on_connection( db, "task_results", "finished_at", @@ -516,7 +549,7 @@ impl RecoveryMergeService { source_server_id, ) .await?; - Self::merge_raw_table( + Self::merge_raw_table_on_connection( db, "network_probe_record", "timestamp", @@ -524,7 +557,7 @@ impl RecoveryMergeService { source_server_id, ) .await?; - Self::merge_raw_table( + Self::merge_raw_table_on_connection( db, "docker_event", "timestamp", @@ -533,7 +566,7 @@ impl RecoveryMergeService { ) .await?; - Self::merge_unique_key_table( + Self::merge_unique_key_table_on_connection( db, "records_hourly", &["time"], @@ -541,7 +574,7 @@ impl RecoveryMergeService { source_server_id, ) .await?; - Self::merge_unique_key_table( + Self::merge_unique_key_table_on_connection( db, "network_probe_record_hourly", &["target_id", "hour"], @@ -549,9 +582,13 @@ impl RecoveryMergeService { source_server_id, ) .await?; - TrafficService::merge_recovered_server_history(db, target_server_id, source_server_id) + TrafficService::merge_recovered_server_history_on_connection( + db, + target_server_id, + source_server_id, + ) .await?; - Self::merge_unique_key_table( + Self::merge_unique_key_table_on_connection( db, "uptime_daily", &["date"], @@ -559,8 +596,9 @@ impl RecoveryMergeService { source_server_id, ) .await?; - Self::merge_alert_states(db, target_server_id, source_server_id).await?; - Self::rewrite_server_ids_json_tables(db, target_server_id, source_server_id).await?; + Self::merge_alert_states_on_connection(db, target_server_id, source_server_id).await?; + 
Self::rewrite_server_ids_json_tables_on_connection(db, target_server_id, source_server_id) + .await?; Ok(()) } @@ -573,6 +611,44 @@ impl RecoveryMergeService { target_server_id: &str, source_server_id: &str, ) -> Result<(), AppError> { + Self::merge_raw_table_on_connection( + db, + table, + time_column, + target_server_id, + source_server_id, + ) + .await + } + + #[cfg_attr(not(test), allow(dead_code))] + pub(crate) async fn merge_raw_table_on_txn( + txn: &DatabaseTransaction, + table: &str, + time_column: &str, + target_server_id: &str, + source_server_id: &str, + ) -> Result<(), AppError> { + Self::merge_raw_table_on_connection( + txn, + table, + time_column, + target_server_id, + source_server_id, + ) + .await + } + + async fn merge_raw_table_on_connection( + db: &C, + table: &str, + time_column: &str, + target_server_id: &str, + source_server_id: &str, + ) -> Result<(), AppError> + where + C: ConnectionTrait, + { db.execute(Statement::from_sql_and_values( db.get_database_backend(), format!( @@ -604,7 +680,45 @@ impl RecoveryMergeService { target_server_id: &str, source_server_id: &str, ) -> Result<(), AppError> { - TrafficService::replace_unique_key_table_server_id( + Self::merge_unique_key_table_on_connection( + db, + table, + key_columns, + target_server_id, + source_server_id, + ) + .await + } + + #[cfg_attr(not(test), allow(dead_code))] + pub(crate) async fn merge_unique_key_table_on_txn( + txn: &DatabaseTransaction, + table: &str, + key_columns: &[&str], + target_server_id: &str, + source_server_id: &str, + ) -> Result<(), AppError> { + Self::merge_unique_key_table_on_connection( + txn, + table, + key_columns, + target_server_id, + source_server_id, + ) + .await + } + + async fn merge_unique_key_table_on_connection( + db: &C, + table: &str, + key_columns: &[&str], + target_server_id: &str, + source_server_id: &str, + ) -> Result<(), AppError> + where + C: ConnectionTrait, + { + TrafficService::replace_unique_key_table_server_id_on_connection( db, table, key_columns, @@ -620,6 +734,26 @@ impl RecoveryMergeService { target_server_id: &str, source_server_id: &str, ) -> Result<(), AppError> { + Self::merge_alert_states_on_connection(db, target_server_id, source_server_id).await + } + + #[cfg_attr(not(test), allow(dead_code))] + pub(crate) async fn merge_alert_states_on_txn( + txn: &DatabaseTransaction, + target_server_id: &str, + source_server_id: &str, + ) -> Result<(), AppError> { + Self::merge_alert_states_on_connection(txn, target_server_id, source_server_id).await + } + + async fn merge_alert_states_on_connection( + db: &C, + target_server_id: &str, + source_server_id: &str, + ) -> Result<(), AppError> + where + C: ConnectionTrait, + { db.execute(Statement::from_sql_and_values( db.get_database_backend(), "DELETE FROM alert_states AS source \ @@ -648,6 +782,28 @@ impl RecoveryMergeService { target_server_id: &str, source_server_id: &str, ) -> Result<(), AppError> { + Self::rewrite_server_ids_json_tables_on_connection(db, target_server_id, source_server_id) + .await + } + + #[cfg_attr(not(test), allow(dead_code))] + pub(crate) async fn rewrite_server_ids_json_tables_on_txn( + txn: &DatabaseTransaction, + target_server_id: &str, + source_server_id: &str, + ) -> Result<(), AppError> { + Self::rewrite_server_ids_json_tables_on_connection(txn, target_server_id, source_server_id) + .await + } + + async fn rewrite_server_ids_json_tables_on_connection( + db: &C, + target_server_id: &str, + source_server_id: &str, + ) -> Result<(), AppError> + where + C: ConnectionTrait, + { let tables = [ 
("alert_rules", "server_ids_json", true), ("ping_tasks", "server_ids_json", false), @@ -659,7 +815,7 @@ impl RecoveryMergeService { ]; for (table, column, nullable) in tables { - Self::rewrite_server_ids_json_table( + Self::rewrite_server_ids_json_table_on_connection( db, table, column, @@ -682,6 +838,28 @@ impl RecoveryMergeService { target_server_id: &str, source_server_id: &str, ) -> Result<(), AppError> { + Self::rewrite_server_ids_json_table_on_connection( + db, + table, + column, + nullable, + target_server_id, + source_server_id, + ) + .await + } + + async fn rewrite_server_ids_json_table_on_connection( + db: &C, + table: &str, + column: &str, + nullable: bool, + target_server_id: &str, + source_server_id: &str, + ) -> Result<(), AppError> + where + C: ConnectionTrait, + { let rows = db .query_all(Statement::from_sql_and_values( DatabaseBackend::Sqlite, @@ -761,6 +939,35 @@ impl RecoveryMergeService { target_server_id: &str, source: &server::Model, ) -> Result<(), AppError> { + Self::finalize_target_server_row_on_connection(db, target_server_id, source).await + } + + #[cfg_attr(not(test), allow(dead_code))] + pub(crate) async fn finalize_target_server_row_on_txn( + txn: &DatabaseTransaction, + target_server_id: &str, + source: &server::Model, + ) -> Result<(), AppError> { + Self::finalize_target_server_row_on_connection(txn, target_server_id, source).await + } + + async fn finalize_target_server_row_on_connection( + db: &C, + target_server_id: &str, + source: &server::Model, + ) -> Result<(), AppError> + where + C: ConnectionTrait, + { + if source.fingerprint.is_some() { + server::Entity::update_many() + .col_expr(server::Column::Fingerprint, Expr::value(None::)) + .col_expr(server::Column::UpdatedAt, Expr::value(Utc::now())) + .filter(server::Column::Id.eq(source.id.clone())) + .exec(db) + .await?; + } + let target = server::Entity::find_by_id(target_server_id) .one(db) .await? 
@@ -783,6 +990,8 @@ impl RecoveryMergeService {
         active.agent_version = sea_orm::Set(source.agent_version.clone());
         active.protocol_version = sea_orm::Set(source.protocol_version);
         active.features = sea_orm::Set(source.features.clone());
+        active.last_remote_addr = sea_orm::Set(source.last_remote_addr.clone());
+        active.fingerprint = sea_orm::Set(source.fingerprint.clone());
         active.updated_at = sea_orm::Set(Utc::now());
 
         active.update(db).await?;
@@ -794,6 +1003,24 @@ impl RecoveryMergeService {
         db: &DatabaseConnection,
         source_server_id: &str,
     ) -> Result<(), AppError> {
+        Self::delete_intentionally_unmerged_source_rows_on_connection(db, source_server_id).await
+    }
+
+    #[cfg_attr(not(test), allow(dead_code))]
+    pub(crate) async fn delete_intentionally_unmerged_source_rows_on_txn(
+        txn: &DatabaseTransaction,
+        source_server_id: &str,
+    ) -> Result<(), AppError> {
+        Self::delete_intentionally_unmerged_source_rows_on_connection(txn, source_server_id).await
+    }
+
+    async fn delete_intentionally_unmerged_source_rows_on_connection<C>(
+        db: &C,
+        source_server_id: &str,
+    ) -> Result<(), AppError>
+    where
+        C: ConnectionTrait,
+    {
         server_tag::Entity::delete_many()
             .filter(server_tag::Column::ServerId.eq(source_server_id))
             .exec(db)
@@ -854,7 +1081,10 @@ mod tests {
     use crate::state::AppState;
     use crate::test_utils::setup_test_db;
     use chrono::{NaiveDate, Utc};
-    use sea_orm::{ActiveModelTrait, ColumnTrait, DatabaseConnection, EntityTrait, QueryFilter, Set};
+    use sea_orm::{
+        ActiveModelTrait, ColumnTrait, DatabaseConnection, EntityTrait, QueryFilter, Set,
+        TransactionTrait,
+    };
     use serverbee_common::constants::CAP_DEFAULT;
     use std::net::{IpAddr, Ipv4Addr, SocketAddr};
     use std::sync::Arc;
@@ -1418,6 +1648,52 @@ mod tests {
         assert!(source_states.is_empty());
     }
 
+    #[tokio::test]
+    async fn merge_server_history_can_be_rolled_back_atomically() {
+        let (db, _tmp) = setup_test_db().await;
+        insert_test_server(&db, "target-1", "Target").await;
+        insert_test_server(&db, "source-1", "Source").await;
+
+        let before_overlap = NaiveDate::from_ymd_opt(2026, 4, 16)
+            .unwrap()
+            .and_hms_opt(9, 0, 0)
+            .unwrap()
+            .and_utc();
+        let overlap = NaiveDate::from_ymd_opt(2026, 4, 16)
+            .unwrap()
+            .and_hms_opt(10, 0, 0)
+            .unwrap()
+            .and_utc();
+
+        insert_record(&db, "target-1", before_overlap, 10.0).await;
+        insert_record(&db, "target-1", overlap, 20.0).await;
+        insert_record(&db, "source-1", overlap, 200.0).await;
+
+        let txn = db.begin().await.unwrap();
+        RecoveryMergeService::merge_server_history_on_txn(&txn, "target-1", "source-1")
+            .await
+            .unwrap();
+        txn.rollback().await.unwrap();
+
+        let target_rows = record::Entity::find()
+            .filter(record::Column::ServerId.eq("target-1"))
+            .all(&db)
+            .await
+            .unwrap();
+        assert_eq!(target_rows.len(), 2);
+        assert!(target_rows.iter().any(|row| row.time == before_overlap && row.cpu == 10.0));
+        assert!(target_rows.iter().any(|row| row.time == overlap && row.cpu == 20.0));
+
+        let source_rows = record::Entity::find()
+            .filter(record::Column::ServerId.eq("source-1"))
+            .all(&db)
+            .await
+            .unwrap();
+        assert_eq!(source_rows.len(), 1);
+        assert_eq!(source_rows[0].time, overlap);
+        assert_eq!(source_rows[0].cpu, 200.0);
+    }
+
     #[tokio::test]
     async fn rewrite_server_ids_json_replaces_source_with_target_once() {
         let (db, _tmp) = setup_test_db().await;
@@ -1512,6 +1788,8 @@ mod tests {
         source.agent_version = Set(Some("1.2.3".to_string()));
         source.protocol_version = Set(4);
         source.features = Set(r#"["docker","process"]"#.to_string());
+        source.last_remote_addr =
Set(Some("192.0.2.10:9527".to_string())); + source.fingerprint = Set(Some("fingerprint-123".to_string())); let source_model = source.update(&db).await.unwrap(); server_tag::ActiveModel { @@ -1566,6 +1844,8 @@ mod tests { assert_eq!(target.cpu_name.as_deref(), Some("Ryzen")); assert_eq!(target.protocol_version, 4); assert_eq!(target.features, r#"["docker","process"]"#); + assert_eq!(target.last_remote_addr.as_deref(), Some("192.0.2.10:9527")); + assert_eq!(target.fingerprint.as_deref(), Some("fingerprint-123")); let source_tags = server_tag::Entity::find() .filter(server_tag::Column::ServerId.eq("source-1")) From 093b3a64d7fd51334d12378bf9be8ff45ba68cf5 Mon Sep 17 00:00:00 2001 From: ZingerLittleBee <6970999@gmail.com> Date: Thu, 16 Apr 2026 21:18:11 +0800 Subject: [PATCH 51/60] refactor(server): add txn-capable traffic merge helpers --- crates/server/src/service/traffic.rs | 66 ++++++++++++++++++++++++++-- 1 file changed, 62 insertions(+), 4 deletions(-) diff --git a/crates/server/src/service/traffic.rs b/crates/server/src/service/traffic.rs index 4185bf9c..fd838ced 100644 --- a/crates/server/src/service/traffic.rs +++ b/crates/server/src/service/traffic.rs @@ -1,7 +1,7 @@ use std::collections::HashMap; use chrono::{Datelike, Duration, NaiveDate, SecondsFormat, Utc}; -use sea_orm::{ConnectionTrait, DatabaseConnection, EntityTrait, Statement}; +use sea_orm::{ConnectionTrait, DatabaseConnection, DatabaseTransaction, EntityTrait, Statement}; use serde::Serialize; use crate::entity::{server, traffic_state}; @@ -15,7 +15,28 @@ impl TrafficService { target_server_id: &str, source_server_id: &str, ) -> Result<(), AppError> { - Self::replace_unique_key_table_server_id( + Self::merge_recovered_server_history_on_connection(db, target_server_id, source_server_id) + .await + } + + pub async fn merge_recovered_server_history_on_txn( + txn: &DatabaseTransaction, + target_server_id: &str, + source_server_id: &str, + ) -> Result<(), AppError> { + Self::merge_recovered_server_history_on_connection(txn, target_server_id, source_server_id) + .await + } + + pub(crate) async fn merge_recovered_server_history_on_connection( + db: &C, + target_server_id: &str, + source_server_id: &str, + ) -> Result<(), AppError> + where + C: ConnectionTrait, + { + Self::replace_unique_key_table_server_id_on_connection( db, "traffic_hourly", &["hour"], @@ -23,7 +44,7 @@ impl TrafficService { source_server_id, ) .await?; - Self::replace_unique_key_table_server_id( + Self::replace_unique_key_table_server_id_on_connection( db, "traffic_daily", &["date"], @@ -31,7 +52,7 @@ impl TrafficService { source_server_id, ) .await?; - Self::replace_unique_key_table_server_id( + Self::replace_unique_key_table_server_id_on_connection( db, "traffic_state", &[], @@ -50,6 +71,43 @@ impl TrafficService { target_server_id: &str, source_server_id: &str, ) -> Result<(), AppError> { + Self::replace_unique_key_table_server_id_on_connection( + db, + table, + key_columns, + target_server_id, + source_server_id, + ) + .await + } + + pub(crate) async fn replace_unique_key_table_server_id_on_txn( + txn: &DatabaseTransaction, + table: &str, + key_columns: &[&str], + target_server_id: &str, + source_server_id: &str, + ) -> Result<(), AppError> { + Self::replace_unique_key_table_server_id_on_connection( + txn, + table, + key_columns, + target_server_id, + source_server_id, + ) + .await + } + + pub(crate) async fn replace_unique_key_table_server_id_on_connection( + db: &C, + table: &str, + key_columns: &[&str], + target_server_id: &str, + source_server_id: &str, 
+ ) -> Result<(), AppError> + where + C: ConnectionTrait, + { let join_predicate = if key_columns.is_empty() { "1 = 1".to_string() } else { From b38f3d6c301ce3e2877e05780069e799ae8ccedb Mon Sep 17 00:00:00 2001 From: ZingerLittleBee <6970999@gmail.com> Date: Thu, 16 Apr 2026 21:19:48 +0800 Subject: [PATCH 52/60] feat(web): add recovery merge workflow UI --- .../server/recovery-merge-dialog.test.tsx | 74 +++++++++ .../server/recovery-merge-dialog.tsx | 145 ++++++++++++++++++ apps/web/src/hooks/use-api.test.tsx | 111 +++++++++++++- apps/web/src/hooks/use-api.ts | 30 +++- apps/web/src/hooks/use-servers-ws.test.ts | 96 ++++++++++++ apps/web/src/hooks/use-servers-ws.ts | 14 +- apps/web/src/lib/api-schema.ts | 37 +++++ apps/web/src/locales/en/servers.json | 16 +- apps/web/src/locales/zh/servers.json | 16 +- .../src/routes/_authed/servers/$id.test.tsx | 26 +++- apps/web/src/routes/_authed/servers/$id.tsx | 26 +++- .../src/stores/recovery-jobs-store.test.ts | 56 +++++++ apps/web/src/stores/recovery-jobs-store.ts | 40 +++++ 13 files changed, 679 insertions(+), 8 deletions(-) create mode 100644 apps/web/src/components/server/recovery-merge-dialog.test.tsx create mode 100644 apps/web/src/components/server/recovery-merge-dialog.tsx create mode 100644 apps/web/src/stores/recovery-jobs-store.test.ts create mode 100644 apps/web/src/stores/recovery-jobs-store.ts diff --git a/apps/web/src/components/server/recovery-merge-dialog.test.tsx b/apps/web/src/components/server/recovery-merge-dialog.test.tsx new file mode 100644 index 00000000..1f984ba1 --- /dev/null +++ b/apps/web/src/components/server/recovery-merge-dialog.test.tsx @@ -0,0 +1,74 @@ +import { QueryClient, QueryClientProvider } from '@tanstack/react-query' +import { fireEvent, render, screen } from '@testing-library/react' +import type { ReactNode } from 'react' +import { describe, expect, it, vi } from 'vitest' +import { RecoveryMergeDialog } from './recovery-merge-dialog' + +const mockUseRecoveryCandidates = vi.fn() +const mockStartRecoveryMerge = vi.fn() + +vi.mock('react-i18next', () => ({ + useTranslation: () => ({ + t: (key: string, options?: { defaultValue?: string }) => options?.defaultValue ?? 
key + }) +})) + +vi.mock('sonner', () => ({ + toast: { + error: vi.fn(), + success: vi.fn() + } +})) + +vi.mock('@/hooks/use-api', () => ({ + useRecoveryCandidates: (...args: unknown[]) => mockUseRecoveryCandidates(...args), + startRecoveryMerge: (...args: unknown[]) => mockStartRecoveryMerge(...args) +})) + +function Wrapper({ children }: { children: ReactNode }) { + const queryClient = new QueryClient({ + defaultOptions: { + queries: { retry: false }, + mutations: { retry: false } + } + }) + + return {children} +} + +describe('RecoveryMergeDialog', () => { + it('renders candidate list', () => { + mockUseRecoveryCandidates.mockReturnValue({ + data: [{ server_id: 'source-1', name: 'Source', score: 42, reasons: ['same remote address'] }], + isError: false, + isLoading: false + }) + + render( + , + { wrapper: Wrapper } + ) + + expect(screen.getByText('Source')).toBeDefined() + expect(screen.getByText('same remote address')).toBeDefined() + }) + + it('disables submit until a candidate is selected', () => { + mockUseRecoveryCandidates.mockReturnValue({ + data: [{ server_id: 'source-1', name: 'Source', score: 42, reasons: ['same remote address'] }], + isError: false, + isLoading: false + }) + + render( + , + { wrapper: Wrapper } + ) + + const button = screen.getByText('Start Recovery').closest('button') + expect(button?.getAttribute('disabled')).toBe('') + + fireEvent.click(screen.getByText('Source')) + expect(button?.hasAttribute('disabled')).toBe(false) + }) +}) diff --git a/apps/web/src/components/server/recovery-merge-dialog.tsx b/apps/web/src/components/server/recovery-merge-dialog.tsx new file mode 100644 index 00000000..b0a97399 --- /dev/null +++ b/apps/web/src/components/server/recovery-merge-dialog.tsx @@ -0,0 +1,145 @@ +import { useMutation, useQueryClient } from '@tanstack/react-query' +import { Loader2, RotateCcw } from 'lucide-react' +import { useState } from 'react' +import { useTranslation } from 'react-i18next' +import { toast } from 'sonner' +import { useRecoveryCandidates, startRecoveryMerge } from '@/hooks/use-api' +import type { RecoveryJobResponse } from '@/lib/api-schema' +import { Badge } from '@/components/ui/badge' +import { Button } from '@/components/ui/button' +import { Dialog, DialogContent, DialogDescription, DialogHeader, DialogTitle } from '@/components/ui/dialog' + +interface RecoveryMergeDialogProps { + currentJob?: RecoveryJobResponse + onOpenChange: (open: boolean) => void + open: boolean + targetServerId: string +} + +export function RecoveryMergeDialog({ + currentJob, + onOpenChange, + open, + targetServerId +}: RecoveryMergeDialogProps) { + const { t } = useTranslation('servers') + const queryClient = useQueryClient() + const [selectedSourceId, setSelectedSourceId] = useState('') + + const candidatesQuery = useRecoveryCandidates(targetServerId, open) + + const startMutation = useMutation({ + mutationFn: (sourceServerId: string) => startRecoveryMerge(targetServerId, { source_server_id: sourceServerId }), + onSuccess: () => { + queryClient.invalidateQueries({ queryKey: ['servers', targetServerId, 'recovery-candidates'] }) + toast.success(t('recovery_merge_started', { defaultValue: 'Recovery started' })) + onOpenChange(false) + }, + onError: (error) => { + toast.error(error instanceof Error ? error.message : t('recovery_merge_failed', { defaultValue: 'Recovery failed' })) + } + }) + + const candidates = candidatesQuery.data ?? 
[]
+  const selectedCandidate = candidates.find((candidate) => candidate.server_id === selectedSourceId)
+  const canSubmit = selectedCandidate != null && !startMutation.isPending
+
+  return (
+    <Dialog onOpenChange={onOpenChange} open={open}>
+      <DialogContent>
+        <DialogHeader>
+          <DialogTitle>{t('recovery_merge_title', { defaultValue: 'Recover Offline Server' })}</DialogTitle>
+          <DialogDescription>
+            {t('recovery_merge_description', {
+              defaultValue: 'Pick the online replacement agent to rebind and merge back into this offline server.'
+            })}
+          </DialogDescription>
+        </DialogHeader>
+
+        {currentJob && (
+          <div>
+            <Badge variant="outline">{currentJob.stage}</Badge>
+            <span>{t('recovery_merge_existing_job', { defaultValue: 'A recovery job is already in progress.' })}</span>
+          </div>
+        )}
+
+        {candidatesQuery.isLoading && (
+          <div>
+            <Loader2 className="animate-spin" />
+            <span>{t('recovery_merge_loading', { defaultValue: 'Loading recovery candidates…' })}</span>
+          </div>
+        )}
+
+        {candidatesQuery.isError && (
+          <div>{t('recovery_merge_candidates_failed', { defaultValue: 'Failed to load recovery candidates.' })}</div>
+        )}
+
+        {!candidatesQuery.isLoading && !candidatesQuery.isError && candidates.length === 0 && (
+          <div>{t('recovery_merge_empty', { defaultValue: 'No online recovery candidates are available right now.' })}</div>
+        )}
+
+        {candidates.length > 0 && (
+          <div>
+            {candidates.map((candidate) => {
+              const selected = candidate.server_id === selectedSourceId
+              return (
+                <button
+                  aria-pressed={selected}
+                  key={candidate.server_id}
+                  onClick={() => setSelectedSourceId(candidate.server_id)}
+                  type="button"
+                >
+                  <span>{candidate.name}</span>
+                  <Badge variant="secondary">{candidate.score}</Badge>
+                  {candidate.reasons.map((reason) => (
+                    <span key={reason}>{reason}</span>
+                  ))}
+                </button>
+              )
+            })}
+          </div>
+        )}
+
+        <div>
+          {t('recovery_merge_warning', {
+            defaultValue: 'This keeps the original server record, asks the replacement agent to rebind, and continues the recovery flow from there.'
+          })}
+        </div>
+
+        <div>
+          <Button disabled={!canSubmit} onClick={() => startMutation.mutate(selectedSourceId)}>
+            {startMutation.isPending ? <Loader2 className="animate-spin" /> : <RotateCcw />}
+            {startMutation.isPending
+              ? t('recovery_merge_starting', { defaultValue: 'Starting…' })
+              : t('recovery_merge_start', { defaultValue: 'Start Recovery' })}
+          </Button>
+        </div>
+      </DialogContent>
+    </Dialog>
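+  // `canSubmit` keeps the primary action disabled until the admin has
+  // explicitly picked a candidate and no start request is in flight.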
+ ) +} diff --git a/apps/web/src/hooks/use-api.test.tsx b/apps/web/src/hooks/use-api.test.tsx index ca755bc7..594ead83 100644 --- a/apps/web/src/hooks/use-api.test.tsx +++ b/apps/web/src/hooks/use-api.test.tsx @@ -2,7 +2,7 @@ import { QueryClient, QueryClientProvider } from '@tanstack/react-query' import { renderHook, waitFor } from '@testing-library/react' import type { ReactNode } from 'react' import { afterEach, beforeEach, describe, expect, it, vi } from 'vitest' -import { useServer, useServerRecords } from './use-api' +import { startRecoveryMerge, useRecoveryCandidates, useRecoveryJob, useServer, useServerRecords } from './use-api' function createWrapper() { const queryClient = new QueryClient({ @@ -181,3 +181,112 @@ describe('useServerRecords', () => { expect(globalThis.fetch).not.toHaveBeenCalled() }) }) + +describe('recovery hooks', () => { + it('fetches recovery candidates for a target server', async () => { + vi.mocked(globalThis.fetch).mockResolvedValueOnce( + new Response( + JSON.stringify({ + data: [{ server_id: 'source-1', name: 'Source', score: 42, reasons: ['same remote address'] }] + }), + { + status: 200, + headers: { 'Content-Type': 'application/json' } + } + ) + ) + + const { result } = renderHook(() => useRecoveryCandidates('target-1'), { + wrapper: createWrapper() + }) + + await waitFor(() => { + expect(result.current.isSuccess).toBe(true) + }) + + expect(result.current.data?.[0].server_id).toBe('source-1') + expect(result.current.data?.[0].reasons).toEqual(['same remote address']) + }) + + it('does not fetch recovery candidates when disabled', async () => { + const { result } = renderHook(() => useRecoveryCandidates('target-1', false), { + wrapper: createWrapper() + }) + + await waitFor(() => { + expect(result.current.fetchStatus).toBe('idle') + }) + + expect(globalThis.fetch).not.toHaveBeenCalled() + }) + + it('fetches a recovery job by id', async () => { + vi.mocked(globalThis.fetch).mockResolvedValueOnce( + new Response( + JSON.stringify({ + data: { + job_id: 'job-1', + target_server_id: 'target-1', + source_server_id: 'source-1', + status: 'running', + stage: 'rebinding', + error: null, + started_at: '2026-04-16T00:00:00Z', + created_at: '2026-04-16T00:00:00Z', + updated_at: '2026-04-16T00:00:00Z', + last_heartbeat_at: null + } + }), + { + status: 200, + headers: { 'Content-Type': 'application/json' } + } + ) + ) + + const { result } = renderHook(() => useRecoveryJob('job-1'), { + wrapper: createWrapper() + }) + + await waitFor(() => { + expect(result.current.isSuccess).toBe(true) + }) + + expect(result.current.data?.stage).toBe('rebinding') + }) + + it('starts a recovery merge', async () => { + vi.mocked(globalThis.fetch).mockResolvedValueOnce( + new Response( + JSON.stringify({ + data: { + job_id: 'job-1', + target_server_id: 'target-1', + source_server_id: 'source-1', + status: 'running', + stage: 'rebinding', + error: null, + started_at: '2026-04-16T00:00:00Z', + created_at: '2026-04-16T00:00:00Z', + updated_at: '2026-04-16T00:00:00Z', + last_heartbeat_at: null + } + }), + { + status: 200, + headers: { 'Content-Type': 'application/json' } + } + ) + ) + + const job = await startRecoveryMerge('target-1', { source_server_id: 'source-1' }) + + expect(job.job_id).toBe('job-1') + expect(globalThis.fetch).toHaveBeenCalledWith( + '/api/servers/target-1/recover-merge', + expect.objectContaining({ + method: 'POST' + }) + ) + }) +}) diff --git a/apps/web/src/hooks/use-api.ts b/apps/web/src/hooks/use-api.ts index 933487b2..e60dae2c 100644 --- 
a/apps/web/src/hooks/use-api.ts +++ b/apps/web/src/hooks/use-api.ts @@ -1,6 +1,12 @@ import { useQuery } from '@tanstack/react-query' import { api } from '@/lib/api-client' -import type { ServerResponse, UptimeDailyEntry } from '@/lib/api-schema' +import type { + RecoveryCandidateResponse, + RecoveryJobResponse, + ServerResponse, + StartRecoveryRequest, + UptimeDailyEntry +} from '@/lib/api-schema' type ServerRecord = import('@/lib/api-schema').ServerMetricRecord @@ -37,4 +43,26 @@ export function useUptimeDaily(serverId: string, days = 90) { }) } +export function useRecoveryCandidates(targetId: string, enabled = true) { + return useQuery({ + queryKey: ['servers', targetId, 'recovery-candidates'], + queryFn: () => api.get(`/api/servers/${targetId}/recovery-candidates`), + enabled: enabled && targetId.length > 0, + staleTime: 30_000 + }) +} + +export function useRecoveryJob(jobId: string, enabled = true) { + return useQuery({ + queryKey: ['recovery-jobs', jobId], + queryFn: () => api.get(`/api/servers/recovery-jobs/${jobId}`), + enabled: enabled && jobId.length > 0, + staleTime: 15_000 + }) +} + +export async function startRecoveryMerge(targetId: string, payload: StartRecoveryRequest) { + return api.post(`/api/servers/${targetId}/recover-merge`, payload) +} + export type { ServerMetricRecord as ServerRecord } from '@/lib/api-schema' diff --git a/apps/web/src/hooks/use-servers-ws.test.ts b/apps/web/src/hooks/use-servers-ws.test.ts index f9c2087e..00f42277 100644 --- a/apps/web/src/hooks/use-servers-ws.test.ts +++ b/apps/web/src/hooks/use-servers-ws.test.ts @@ -1,4 +1,5 @@ import { describe, expect, it } from 'vitest' +import { useRecoveryJobsStore } from '@/stores/recovery-jobs-store' import { useUpgradeJobsStore } from '@/stores/upgrade-jobs-store' import type { ServerMetrics } from './use-servers-ws' import { handleWsMessage, mergeServerUpdate, setServerCapabilities, setServerOnlineStatus } from './use-servers-ws' @@ -209,3 +210,98 @@ describe('handleWsMessage upgrade messages', () => { expect(job?.finished_at).not.toBeNull() }) }) + +describe('handleWsMessage recovery messages', () => { + function makeQueryClient() { + const cache = new Map() + return { + setQueryData: (key: unknown[], value: unknown | ((prev: unknown) => unknown)) => { + const cacheKey = JSON.stringify(key) + const prev = cache.get(cacheKey) + const next = typeof value === 'function' ? 
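+        // Treat a function argument as an updater that receives the previous
+        // cached value, the way react-query's setQueryData does.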
(value as (prev: unknown) => unknown)(prev) : value + cache.set(cacheKey, next) + } + } + } + + it('hydrates recovery jobs from full_sync', () => { + useRecoveryJobsStore.setState({ jobs: new Map() }) + const queryClient = makeQueryClient() + + handleWsMessage( + { + type: 'full_sync', + servers: [], + recoveries: [ + { + job_id: 'job-1', + target_server_id: 'target-1', + source_server_id: 'source-1', + status: 'running', + stage: 'rebinding', + error: null, + started_at: '2026-04-16T00:00:00Z', + created_at: '2026-04-16T00:00:00Z', + updated_at: '2026-04-16T00:00:00Z', + last_heartbeat_at: null + } + ] + }, + queryClient as never + ) + + expect(useRecoveryJobsStore.getState().getJob('target-1')?.job_id).toBe('job-1') + }) + + it('updates recovery jobs only when update payload includes recoveries', () => { + useRecoveryJobsStore.setState({ jobs: new Map() }) + useRecoveryJobsStore.getState().setJob('target-1', { + job_id: 'job-1', + target_server_id: 'target-1', + source_server_id: 'source-1', + status: 'running', + stage: 'rebinding', + error: null, + started_at: '2026-04-16T00:00:00Z', + created_at: '2026-04-16T00:00:00Z', + updated_at: '2026-04-16T00:00:00Z', + last_heartbeat_at: null + }) + + const queryClient = makeQueryClient() + handleWsMessage( + { + type: 'update', + servers: [] + }, + queryClient as never + ) + + expect(useRecoveryJobsStore.getState().getJob('target-1')?.job_id).toBe('job-1') + + handleWsMessage( + { + type: 'update', + servers: [], + recoveries: [ + { + job_id: 'job-2', + target_server_id: 'target-2', + source_server_id: 'source-2', + status: 'failed', + stage: 'failed', + error: 'boom', + started_at: '2026-04-16T00:00:00Z', + created_at: '2026-04-16T00:00:00Z', + updated_at: '2026-04-16T00:00:00Z', + last_heartbeat_at: null + } + ] + }, + queryClient as never + ) + + expect(useRecoveryJobsStore.getState().getJob('target-1')).toBeUndefined() + expect(useRecoveryJobsStore.getState().getJob('target-2')?.job_id).toBe('job-2') + }) +}) diff --git a/apps/web/src/hooks/use-servers-ws.ts b/apps/web/src/hooks/use-servers-ws.ts index 65ec5382..03a99144 100644 --- a/apps/web/src/hooks/use-servers-ws.ts +++ b/apps/web/src/hooks/use-servers-ws.ts @@ -7,6 +7,8 @@ import type { DockerContainerStats, DockerEventInfo } from '@/routes/_authed/servers/$serverId/docker/types' +import type { RecoveryJobResponse } from '@/lib/api-schema' +import { useRecoveryJobsStore } from '@/stores/recovery-jobs-store' import { type UpgradeJob, useUpgradeJobsStore } from '@/stores/upgrade-jobs-store' const MAX_DOCKER_EVENTS = 100 @@ -49,8 +51,8 @@ interface ServerMetrics { } type WsMessage = - | { type: 'full_sync'; servers: ServerMetrics[]; upgrades?: UpgradeJob[] } - | { type: 'update'; servers: ServerMetrics[] } + | { type: 'full_sync'; servers: ServerMetrics[]; upgrades?: UpgradeJob[]; recoveries?: RecoveryJobResponse[] } + | { type: 'update'; servers: ServerMetrics[]; recoveries?: RecoveryJobResponse[] | null } | { type: 'server_online'; server_id: string } | { type: 'server_offline'; server_id: string } | { @@ -196,10 +198,18 @@ function handleServerMetricsMessage(raw: { type: string } & Record(['servers'], (prev) => prev ? 
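+      // Patch the cached server list in place when one exists; otherwise the
+      // update payload becomes the initial list.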
mergeServerUpdate(prev, msg.servers) : msg.servers ) + if (Array.isArray(raw.recoveries)) { + useRecoveryJobsStore.getState().setJobs(raw.recoveries) + } } return } diff --git a/apps/web/src/lib/api-schema.ts b/apps/web/src/lib/api-schema.ts index bee9b969..a549c9c8 100644 --- a/apps/web/src/lib/api-schema.ts +++ b/apps/web/src/lib/api-schema.ts @@ -26,6 +26,43 @@ export type UpdateServerInput = S['UpdateServerInput'] export type BatchDeleteRequest = S['BatchDeleteRequest'] export type BatchDeleteResponse = S['BatchDeleteResponse'] +export interface RecoveryCandidateResponse { + name: string + reasons: string[] + score: number + server_id: string +} + +export type RecoveryJobStatus = 'running' | 'failed' | 'succeeded' | 'unknown' + +export type RecoveryJobStage = + | 'validating' + | 'rebinding' + | 'awaiting_target_online' + | 'freezing_writes' + | 'merging_history' + | 'finalizing' + | 'succeeded' + | 'failed' + | 'unknown' + +export interface RecoveryJobResponse { + created_at: string + error: string | null + job_id: string + last_heartbeat_at: string | null + source_server_id: string + stage: RecoveryJobStage + started_at: string + status: RecoveryJobStatus + target_server_id: string + updated_at: string +} + +export interface StartRecoveryRequest { + source_server_id: string +} + // Server groups export type ServerGroup = S['ServerGroup'] export type CreateGroupRequest = S['CreateGroupRequest'] diff --git a/apps/web/src/locales/en/servers.json b/apps/web/src/locales/en/servers.json index 9dfdea9a..684cec2f 100644 --- a/apps/web/src/locales/en/servers.json +++ b/apps/web/src/locales/en/servers.json @@ -197,5 +197,19 @@ "upgrade_status_failed": "Upgrade failed", "upgrade_status_timeout": "Upgrade timed out", "upgrade_error_with_backup": "Backup saved at", - "upgrade_backup_path": "Backup location" + "upgrade_backup_path": "Backup location", + + "recovery_merge_open": "Recover Agent", + "recovery_merge_resume": "View Recovery", + "recovery_merge_title": "Recover Offline Server", + "recovery_merge_description": "Pick the online replacement agent to rebind and merge back into this offline server.", + "recovery_merge_loading": "Loading recovery candidates…", + "recovery_merge_empty": "No online recovery candidates are available right now.", + "recovery_merge_existing_job": "A recovery job is already in progress.", + "recovery_merge_warning": "This keeps the original server record, asks the replacement agent to rebind, and continues the recovery flow from there.", + "recovery_merge_start": "Start Recovery", + "recovery_merge_starting": "Starting…", + "recovery_merge_started": "Recovery started", + "recovery_merge_failed": "Recovery failed", + "recovery_merge_candidates_failed": "Failed to load recovery candidates." 
} diff --git a/apps/web/src/locales/zh/servers.json b/apps/web/src/locales/zh/servers.json index 023be7c1..98fe7dba 100644 --- a/apps/web/src/locales/zh/servers.json +++ b/apps/web/src/locales/zh/servers.json @@ -197,5 +197,19 @@ "upgrade_status_failed": "升级失败", "upgrade_status_timeout": "升级超时", "upgrade_error_with_backup": "备份保存在", - "upgrade_backup_path": "备份位置" + "upgrade_backup_path": "备份位置", + + "recovery_merge_open": "恢复 Agent", + "recovery_merge_resume": "查看恢复任务", + "recovery_merge_title": "恢复离线服务器", + "recovery_merge_description": "选择在线的替代 Agent,将其重新绑定并合并回当前离线服务器。", + "recovery_merge_loading": "正在加载恢复候选项…", + "recovery_merge_empty": "当前没有可用的在线恢复候选项。", + "recovery_merge_existing_job": "当前已有一个恢复任务正在进行。", + "recovery_merge_warning": "此操作会保留原服务器记录,要求替代 Agent 重新绑定,并继续后续恢复流程。", + "recovery_merge_start": "开始恢复", + "recovery_merge_starting": "启动中…", + "recovery_merge_started": "恢复任务已启动", + "recovery_merge_failed": "恢复失败", + "recovery_merge_candidates_failed": "加载恢复候选项失败。" } diff --git a/apps/web/src/routes/_authed/servers/$id.test.tsx b/apps/web/src/routes/_authed/servers/$id.test.tsx index 72cb3bd8..0fb1e3a1 100644 --- a/apps/web/src/routes/_authed/servers/$id.test.tsx +++ b/apps/web/src/routes/_authed/servers/$id.test.tsx @@ -28,7 +28,7 @@ vi.mock('@tanstack/react-query', () => ({ vi.mock('react-i18next', () => ({ useTranslation: () => ({ - t: (key: string) => key + t: (key: string, options?: { defaultValue?: string }) => options?.defaultValue ?? key }) })) @@ -38,6 +38,10 @@ vi.mock('@/components/server/agent-version-section', () => ({ ) })) +vi.mock('@/components/server/recovery-merge-dialog', () => ({ + RecoveryMergeDialog: () =>
+})) + vi.mock('@/components/server/capabilities-dialog', () => ({ CapabilitiesDialog: () =>
capabilities
})) @@ -107,6 +111,12 @@ vi.mock('@/hooks/use-realtime-metrics', () => ({ useRealtimeMetrics: () => [] })) +vi.mock('@/hooks/use-auth', () => ({ + useAuth: () => ({ + user: { role: 'admin' } + }) +})) + vi.mock('@/lib/api-client', () => ({ api: { get: vi.fn() @@ -135,6 +145,14 @@ vi.mock('@/lib/widget-helpers', () => ({ computeAggregateUptime: () => null })) +vi.mock('@/stores/upgrade-jobs-store', () => ({ + useUpgradeJobsStore: () => undefined +})) + +vi.mock('@/stores/recovery-jobs-store', () => ({ + useRecoveryJobsStore: () => undefined +})) + const { ServerDetailPage } = await import('./$id') describe('ServerDetailPage', () => { @@ -199,4 +217,10 @@ describe('ServerDetailPage', () => { expect(headerGrid?.children[1]).toContainElement(upgradeCard) expect(headerGrid?.children[2]).toContainElement(editButton) }) + + it('shows recovery action for offline server when admin', () => { + render() + + expect(screen.getByText('Recover Agent')).toBeDefined() + }) }) diff --git a/apps/web/src/routes/_authed/servers/$id.tsx b/apps/web/src/routes/_authed/servers/$id.tsx index 56918d2b..e01ebe0a 100644 --- a/apps/web/src/routes/_authed/servers/$id.tsx +++ b/apps/web/src/routes/_authed/servers/$id.tsx @@ -13,19 +13,23 @@ import { TrafficCard } from '@/components/server/traffic-card' import { TrafficProgress } from '@/components/server/traffic-progress' import { TrafficTab } from '@/components/server/traffic-tab' import { UpgradeJobBadge } from '@/components/server/upgrade-job-badge' +import { RecoveryMergeDialog } from '@/components/server/recovery-merge-dialog' +import { Badge } from '@/components/ui/badge' import { Button } from '@/components/ui/button' import { Skeleton } from '@/components/ui/skeleton' import { Tabs, TabsContent, TabsList, TabsTrigger } from '@/components/ui/tabs' import { UptimeTimeline } from '@/components/uptime/uptime-timeline' +import { useAuth } from '@/hooks/use-auth' import { useServer, useServerRecords, useUptimeDaily } from '@/hooks/use-api' import { useRealtimeMetrics } from '@/hooks/use-realtime-metrics' import type { ServerMetrics } from '@/hooks/use-servers-ws' import { api } from '@/lib/api-client' -import type { ServerResponse } from '@/lib/api-schema' +import type { RecoveryJobResponse, ServerResponse } from '@/lib/api-schema' import { CAP_DOCKER, CAP_FILE, CAP_TERMINAL, getEffectiveCapabilityEnabled } from '@/lib/capabilities' import { buildMergedDiskIoSeries, buildPerDiskIoSeries } from '@/lib/disk-io' import { cn, countryCodeToFlag, formatBytes } from '@/lib/utils' import { computeAggregateUptime } from '@/lib/widget-helpers' +import { useRecoveryJobsStore } from '@/stores/recovery-jobs-store' import { useUpgradeJobsStore } from '@/stores/upgrade-jobs-store' export const Route = createFileRoute('/_authed/servers/$id')({ @@ -122,19 +126,25 @@ function ServerInfoMeta({ server }: { server: ServerResponse }) { } function ServerActionButtons({ + currentRecoveryJob, dockerEnabled, fileEnabled, id, + isAdmin, isOnline, onEditOpen, + onRecoveryOpen, serverWithCaps, terminalEnabled }: { + currentRecoveryJob?: RecoveryJobResponse dockerEnabled: boolean fileEnabled: boolean id: string + isAdmin: boolean isOnline: boolean onEditOpen: () => void + onRecoveryOpen: () => void serverWithCaps: ServerResponse & ServerWithCaps terminalEnabled: boolean }) { @@ -146,6 +156,11 @@ function ServerActionButtons({ {t('detail_edit')} + {isAdmin && !isOnline && ( + + )} {isOnline && terminalEnabled && (
@@ -580,11 +600,14 @@ export function ServerDetailPage() {
setEditOpen(true)} + onRecoveryOpen={() => setRecoveryOpen(true)} serverWithCaps={serverWithCaps} terminalEnabled={terminalEnabled} /> @@ -647,6 +670,7 @@ export function ServerDetailPage() { setEditOpen(false)} open={editOpen} server={server} /> +
) } diff --git a/apps/web/src/stores/recovery-jobs-store.test.ts b/apps/web/src/stores/recovery-jobs-store.test.ts new file mode 100644 index 00000000..013f80db --- /dev/null +++ b/apps/web/src/stores/recovery-jobs-store.test.ts @@ -0,0 +1,56 @@ +import { describe, expect, it } from 'vitest' +import { useRecoveryJobsStore } from './recovery-jobs-store' + +function makeJob(overrides: Partial> = {}) { + return { + ...buildJob(), + ...overrides + } +} + +function buildJob() { + return { + job_id: 'job-1', + target_server_id: 'target-1', + source_server_id: 'source-1', + status: 'running' as const, + stage: 'rebinding' as const, + error: null, + started_at: '2026-04-16T00:00:00Z', + created_at: '2026-04-16T00:00:00Z', + updated_at: '2026-04-16T00:00:00Z', + last_heartbeat_at: null + } +} + +describe('useRecoveryJobsStore', () => { + it('stores jobs keyed by target server id', () => { + useRecoveryJobsStore.setState({ jobs: new Map() }) + + useRecoveryJobsStore.getState().setJob('target-1', makeJob()) + + expect(useRecoveryJobsStore.getState().getJob('target-1')?.job_id).toBe('job-1') + }) + + it('replaces the whole map on setJobs', () => { + useRecoveryJobsStore.setState({ jobs: new Map() }) + useRecoveryJobsStore.getState().setJob('old-target', makeJob({ target_server_id: 'old-target' })) + + useRecoveryJobsStore.getState().setJobs([ + makeJob(), + makeJob({ job_id: 'job-2', target_server_id: 'target-2', source_server_id: 'source-2' }) + ]) + + expect(useRecoveryJobsStore.getState().getJob('old-target')).toBeUndefined() + expect(useRecoveryJobsStore.getState().getJob('target-2')?.job_id).toBe('job-2') + }) + + it('clears a job by target server id', () => { + useRecoveryJobsStore.setState({ jobs: new Map() }) + useRecoveryJobsStore.getState().setJob('target-1', makeJob()) + + useRecoveryJobsStore.getState().clearJob('target-1') + + expect(useRecoveryJobsStore.getState().getJob('target-1')).toBeUndefined() + }) +}) diff --git a/apps/web/src/stores/recovery-jobs-store.ts b/apps/web/src/stores/recovery-jobs-store.ts new file mode 100644 index 00000000..229058bd --- /dev/null +++ b/apps/web/src/stores/recovery-jobs-store.ts @@ -0,0 +1,40 @@ +import { create } from 'zustand' +import type { RecoveryJobResponse } from '@/lib/api-schema' + +interface RecoveryJobsState { + clearJob: (targetServerId: string) => void + getJob: (targetServerId: string) => RecoveryJobResponse | undefined + jobs: Map + setJob: (targetServerId: string, job: RecoveryJobResponse) => void + setJobs: (jobs: RecoveryJobResponse[]) => void +} + +export const useRecoveryJobsStore = create()((set, get) => ({ + jobs: new Map(), + + setJob: (targetServerId: string, job: RecoveryJobResponse) => { + set((state) => { + const next = new Map(state.jobs) + next.set(targetServerId, job) + return { jobs: next } + }) + }, + + clearJob: (targetServerId: string) => { + set((state) => { + const next = new Map(state.jobs) + next.delete(targetServerId) + return { jobs: next } + }) + }, + + setJobs: (jobs: RecoveryJobResponse[]) => { + const next = new Map() + for (const job of jobs) { + next.set(job.target_server_id, job) + } + set({ jobs: next }) + }, + + getJob: (targetServerId: string) => get().jobs.get(targetServerId) +})) From ff3f43845f040fcc19e1b0cb596b60e5e8e9fcee Mon Sep 17 00:00:00 2001 From: ZingerLittleBee <6970999@gmail.com> Date: Thu, 16 Apr 2026 21:21:25 +0800 Subject: [PATCH 53/60] docs: add recovery merge guidance and api endpoints --- apps/docs/content/docs/cn/api-reference.mdx | 3 +++ apps/docs/content/docs/cn/server.mdx | 13 
+++++++++++++ apps/docs/content/docs/en/api-reference.mdx | 3 +++ apps/docs/content/docs/en/server.mdx | 13 +++++++++++++ 4 files changed, 32 insertions(+) diff --git a/apps/docs/content/docs/cn/api-reference.mdx b/apps/docs/content/docs/cn/api-reference.mdx index 0232fef3..e1582740 100644 --- a/apps/docs/content/docs/cn/api-reference.mdx +++ b/apps/docs/content/docs/cn/api-reference.mdx @@ -80,6 +80,9 @@ API Key 格式为 `serverbee_` 前缀 + 43 字符随机字符串,创建时仅 | POST/PUT/DELETE | `/api/servers/*` | 服务器管理 | | PUT | `/api/servers/batch-capabilities` | 批量更新功能开关 | | POST | `/api/servers/{id}/upgrade` | 触发 Agent 升级 | +| GET | `/api/servers/{target_id}/recovery-candidates` | 列出推荐的恢复候选项 | +| GET | `/api/servers/recovery-jobs/{job_id}` | 获取恢复任务详情 | +| POST | `/api/servers/{target_id}/recover-merge` | 启动 Agent 恢复任务 | | CRUD | `/api/server-groups/*` | 服务器分组管理 | | CRUD | `/api/notifications/*` | 通知渠道管理 | | CRUD | `/api/notification-groups/*` | 通知组管理 | diff --git a/apps/docs/content/docs/cn/server.mdx b/apps/docs/content/docs/cn/server.mdx index 1831ad3e..58f54f79 100644 --- a/apps/docs/content/docs/cn/server.mdx +++ b/apps/docs/content/docs/cn/server.mdx @@ -218,6 +218,19 @@ Auto Discovery Key 用于 Agent 首次自动注册,优先级如下: 为减少重复的占位服务器,Agent 在可读取稳定机器标识时会携带指纹,相同机器重复注册会复用原有服务器记录。你还可以通过 `auth.max_servers` 软限制自动注册创建的新服务器数量;如果历史失败注册留下了离线占位条目,可在 `/servers` 页面使用 **Clean up unconnected** 清理。 +## 恢复重装后的 Agent + +如果一台已有服务器重装系统后重新注册成了一条新的临时在线节点,可以在原来的离线服务器详情页里发起恢复: + +1. 打开原来的离线服务器 +2. 点击 **恢复 Agent** +3. 选择推荐的在线替代节点 +4. 启动恢复任务 + +恢复会保留原始服务器记录,并要求替代 Agent 重新绑定到原来的服务器身份,后续恢复流程会继续在这个基础上执行。 + +这个恢复流程用于“同一台逻辑机器重装后重新接回”。它不适用于任意两条服务器记录的合并,也不适用于迁移到完全不同的硬件主机。 + ## OAuth 设置 ServerBee 支持通过 OAuth 第三方登录,目前支持以下提供商: diff --git a/apps/docs/content/docs/en/api-reference.mdx b/apps/docs/content/docs/en/api-reference.mdx index 3fd231f3..c366b81f 100644 --- a/apps/docs/content/docs/en/api-reference.mdx +++ b/apps/docs/content/docs/en/api-reference.mdx @@ -80,6 +80,9 @@ API keys use the format `serverbee_` prefix + 43-character random string. The ke | POST/PUT/DELETE | `/api/servers/*` | Server management | | PUT | `/api/servers/batch-capabilities` | Batch update capabilities | | POST | `/api/servers/{id}/upgrade` | Trigger agent upgrade | +| GET | `/api/servers/{target_id}/recovery-candidates` | List recommended recovery candidates | +| GET | `/api/servers/recovery-jobs/{job_id}` | Get recovery job details | +| POST | `/api/servers/{target_id}/recover-merge` | Start an agent recovery job | | CRUD | `/api/server-groups/*` | Server group management | | CRUD | `/api/notifications/*` | Notification channel management | | CRUD | `/api/notification-groups/*` | Notification group management | diff --git a/apps/docs/content/docs/en/server.mdx b/apps/docs/content/docs/en/server.mdx index 83396602..08c318e9 100644 --- a/apps/docs/content/docs/en/server.mdx +++ b/apps/docs/content/docs/en/server.mdx @@ -178,6 +178,19 @@ The full key is printed to the logs on first startup and can also be viewed or r To reduce duplicate placeholder servers, agents reuse the existing server row when they re-register with the same machine fingerprint. You can also set `auth.max_servers` to soft-cap new auto-registered servers, and use **Clean up unconnected** on the Servers page to remove offline placeholders that never finished initialization. +## Recovering a Reinstalled Agent + +If an existing server was reinstalled and then re-registered as a new temporary online node, you can recover it from the original offline server detail page: + +1. Open the original offline server. 
+2. Click **Recover Agent**. +3. Choose the recommended online replacement candidate. +4. Start the recovery job. + +The original server record is kept. The replacement agent is asked to rebind onto the original server identity, and the recovery flow continues from there. + +Recovery is designed for reinstalling the same logical machine. It is not intended for arbitrary record merges or hardware migrations to a different host. + ## GeoIP Setup To enable geographic location detection for your agents: From 8e6a5ada6abd96786599f2a379abebd6db97c108 Mon Sep 17 00:00:00 2001 From: ZingerLittleBee <6970999@gmail.com> Date: Thu, 16 Apr 2026 21:27:00 +0800 Subject: [PATCH 54/60] test(server): seed recovery candidate source protocol state --- crates/server/tests/integration.rs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/crates/server/tests/integration.rs b/crates/server/tests/integration.rs index 39409479..8f3f2854 100644 --- a/crates/server/tests/integration.rs +++ b/crates/server/tests/integration.rs @@ -3810,8 +3810,9 @@ async fn test_recovery_candidates_rejects_online_or_busy_target() { .contains("must be offline") ); - let (_source_sink, mut source_reader) = connect_agent(&base_url, &source_token).await; + let (mut source_sink, mut source_reader) = connect_agent(&base_url, &source_token).await; let _source_welcome = recv_agent_text(&mut source_reader).await; + send_system_info(&mut source_sink, &mut source_reader, "recovery-busy-source-info", None).await; let start_resp = admin_client .post(format!("{}/api/servers/{}/recover-merge", base_url, busy_target_id)) From abfed86a21aae5223dc4e5f99994d3d5e5c0ce15 Mon Sep 17 00:00:00 2001 From: ZingerLittleBee <6970999@gmail.com> Date: Thu, 16 Apr 2026 21:56:51 +0800 Subject: [PATCH 55/60] style(web): satisfy recovery workflow lint --- .../server/recovery-merge-dialog.test.tsx | 10 +---- .../server/recovery-merge-dialog.tsx | 37 ++++++++------- apps/web/src/hooks/use-api.ts | 2 +- apps/web/src/hooks/use-servers-ws.ts | 45 ++++++++++++------- apps/web/src/routes/_authed/servers/$id.tsx | 15 +++++-- .../src/stores/recovery-jobs-store.test.ts | 7 ++- 6 files changed, 66 insertions(+), 50 deletions(-) diff --git a/apps/web/src/components/server/recovery-merge-dialog.test.tsx b/apps/web/src/components/server/recovery-merge-dialog.test.tsx index 1f984ba1..46769cbf 100644 --- a/apps/web/src/components/server/recovery-merge-dialog.test.tsx +++ b/apps/web/src/components/server/recovery-merge-dialog.test.tsx @@ -44,10 +44,7 @@ describe('RecoveryMergeDialog', () => { isLoading: false }) - render( - , - { wrapper: Wrapper } - ) + render(, { wrapper: Wrapper }) expect(screen.getByText('Source')).toBeDefined() expect(screen.getByText('same remote address')).toBeDefined() @@ -60,10 +57,7 @@ describe('RecoveryMergeDialog', () => { isLoading: false }) - render( - , - { wrapper: Wrapper } - ) + render(, { wrapper: Wrapper }) const button = screen.getByText('Start Recovery').closest('button') expect(button?.getAttribute('disabled')).toBe('') diff --git a/apps/web/src/components/server/recovery-merge-dialog.tsx b/apps/web/src/components/server/recovery-merge-dialog.tsx index b0a97399..c8dc5a72 100644 --- a/apps/web/src/components/server/recovery-merge-dialog.tsx +++ b/apps/web/src/components/server/recovery-merge-dialog.tsx @@ -3,11 +3,11 @@ import { Loader2, RotateCcw } from 'lucide-react' import { useState } from 'react' import { useTranslation } from 'react-i18next' import { toast } from 'sonner' -import { useRecoveryCandidates, startRecoveryMerge 
} from '@/hooks/use-api' -import type { RecoveryJobResponse } from '@/lib/api-schema' import { Badge } from '@/components/ui/badge' import { Button } from '@/components/ui/button' import { Dialog, DialogContent, DialogDescription, DialogHeader, DialogTitle } from '@/components/ui/dialog' +import { startRecoveryMerge, useRecoveryCandidates } from '@/hooks/use-api' +import type { RecoveryJobResponse } from '@/lib/api-schema' interface RecoveryMergeDialogProps { currentJob?: RecoveryJobResponse @@ -16,12 +16,7 @@ interface RecoveryMergeDialogProps { targetServerId: string } -export function RecoveryMergeDialog({ - currentJob, - onOpenChange, - open, - targetServerId -}: RecoveryMergeDialogProps) { +export function RecoveryMergeDialog({ currentJob, onOpenChange, open, targetServerId }: RecoveryMergeDialogProps) { const { t } = useTranslation('servers') const queryClient = useQueryClient() const [selectedSourceId, setSelectedSourceId] = useState('') @@ -36,7 +31,9 @@ export function RecoveryMergeDialog({ onOpenChange(false) }, onError: (error) => { - toast.error(error instanceof Error ? error.message : t('recovery_merge_failed', { defaultValue: 'Recovery failed' })) + toast.error( + error instanceof Error ? error.message : t('recovery_merge_failed', { defaultValue: 'Recovery failed' }) + ) } }) @@ -60,26 +57,28 @@ export function RecoveryMergeDialog({
{currentJob.stage} - {t('recovery_merge_existing_job', { defaultValue: 'A recovery job is already in progress.' })} + + {t('recovery_merge_existing_job', { defaultValue: 'A recovery job is already in progress.' })} +
)} {candidatesQuery.isLoading && ( -
+
{t('recovery_merge_loading', { defaultValue: 'Loading recovery candidates…' })}
)} {candidatesQuery.isError && ( -
+
{t('recovery_merge_candidates_failed', { defaultValue: 'Failed to load recovery candidates.' })}
)} - {!candidatesQuery.isLoading && !candidatesQuery.isError && candidates.length === 0 && ( -
+ {!(candidatesQuery.isLoading || candidatesQuery.isError) && candidates.length === 0 && ( +
{t('recovery_merge_empty', { defaultValue: 'No online recovery candidates are available right now.' })}
)} @@ -115,9 +114,10 @@ export function RecoveryMergeDialog({
)} -
+
{t('recovery_merge_warning', { - defaultValue: 'This keeps the original server record, asks the replacement agent to rebind, and continues the recovery flow from there.' + defaultValue: + 'This keeps the original server record, asks the replacement agent to rebind, and continues the recovery flow from there.' })}
@@ -125,7 +125,10 @@ export function RecoveryMergeDialog({ - )} {isOnline && terminalEnabled && ( @@ -670,7 +672,12 @@ export function ServerDetailPage() { setEditOpen(false)} open={editOpen} server={server} /> - +
) } diff --git a/apps/web/src/stores/recovery-jobs-store.test.ts b/apps/web/src/stores/recovery-jobs-store.test.ts index 013f80db..6b20699b 100644 --- a/apps/web/src/stores/recovery-jobs-store.test.ts +++ b/apps/web/src/stores/recovery-jobs-store.test.ts @@ -36,10 +36,9 @@ describe('useRecoveryJobsStore', () => { useRecoveryJobsStore.setState({ jobs: new Map() }) useRecoveryJobsStore.getState().setJob('old-target', makeJob({ target_server_id: 'old-target' })) - useRecoveryJobsStore.getState().setJobs([ - makeJob(), - makeJob({ job_id: 'job-2', target_server_id: 'target-2', source_server_id: 'source-2' }) - ]) + useRecoveryJobsStore + .getState() + .setJobs([makeJob(), makeJob({ job_id: 'job-2', target_server_id: 'target-2', source_server_id: 'source-2' })]) expect(useRecoveryJobsStore.getState().getJob('old-target')).toBeUndefined() expect(useRecoveryJobsStore.getState().getJob('target-2')?.job_id).toBe('job-2') From 56d0c120f1a448ca1719f4182275e1a3c89cf2e4 Mon Sep 17 00:00:00 2001 From: ZingerLittleBee <6970999@gmail.com> Date: Thu, 16 Apr 2026 22:04:18 +0800 Subject: [PATCH 56/60] fix(web): narrow recovery websocket message types --- apps/web/src/hooks/use-servers-ws.ts | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/apps/web/src/hooks/use-servers-ws.ts b/apps/web/src/hooks/use-servers-ws.ts index 4f9ea992..722a3770 100644 --- a/apps/web/src/hooks/use-servers-ws.ts +++ b/apps/web/src/hooks/use-servers-ws.ts @@ -182,12 +182,14 @@ function setServerDetailDockerAvailability( } type QueryClient = ReturnType +type FullSyncMessage = Extract +type UpdateMessage = Extract function isWsMessageLike(raw: unknown): raw is { type: string } & Record { return typeof raw === 'object' && raw !== null && 'type' in raw && typeof (raw as { type: unknown }).type === 'string' } -function hydrateRecoveryJobs(raw: WsMessage & { type: 'full_sync' | 'update' }, replaceMissing: boolean): void { +function hydrateRecoveryJobs(raw: FullSyncMessage | UpdateMessage, replaceMissing: boolean): void { if (Array.isArray(raw.recoveries)) { useRecoveryJobsStore.getState().setJobs(raw.recoveries) return @@ -198,7 +200,7 @@ function hydrateRecoveryJobs(raw: WsMessage & { type: 'full_sync' | 'update' }, } } -function handleFullSyncMessage(msg: WsMessage & { type: 'full_sync' | 'update' }, queryClient: QueryClient): void { +function handleFullSyncMessage(msg: FullSyncMessage, queryClient: QueryClient): void { queryClient.setQueryData(['servers'], msg.servers) if (Array.isArray(msg.upgrades)) { useUpgradeJobsStore.getState().setJobs(msg.upgrades as UpgradeJob[]) @@ -206,7 +208,7 @@ function handleFullSyncMessage(msg: WsMessage & { type: 'full_sync' | 'update' } hydrateRecoveryJobs(msg, true) } -function handleUpdateMessage(msg: WsMessage & { type: 'full_sync' | 'update' }, queryClient: QueryClient): void { +function handleUpdateMessage(msg: UpdateMessage, queryClient: QueryClient): void { queryClient.setQueryData(['servers'], (prev) => prev ? 
mergeServerUpdate(prev, msg.servers) : msg.servers ) @@ -218,11 +220,11 @@ function handleServerMetricsMessage(raw: { type: string } & Record s == null || typeof s !== 'object')) { return } - const msg = raw as WsMessage & { type: 'full_sync' | 'update' } + const msg = raw as FullSyncMessage | UpdateMessage if (raw.type === 'full_sync') { - handleFullSyncMessage(msg, queryClient) + handleFullSyncMessage(msg as FullSyncMessage, queryClient) } else { - handleUpdateMessage(msg, queryClient) + handleUpdateMessage(msg as UpdateMessage, queryClient) } return } From 8d4d63e143bad9d1eecec786c9b081f2dedb288c Mon Sep 17 00:00:00 2001 From: ZingerLittleBee <6970999@gmail.com> Date: Thu, 16 Apr 2026 22:27:46 +0800 Subject: [PATCH 57/60] fix(recovery): address follow-up review findings --- apps/web/openapi.json | 395 ++++++++++++++++++ .../server/recovery-merge-dialog.test.tsx | 41 +- .../server/recovery-merge-dialog.tsx | 71 ++-- apps/web/src/hooks/use-servers-ws.test.ts | 36 +- apps/web/src/lib/api-schema.ts | 42 +- apps/web/src/lib/api-types.ts | 310 ++++++++++++++ apps/web/src/locales/en/servers.json | 11 +- apps/web/src/locales/zh/servers.json | 11 +- .../src/routes/_authed/servers/$id.test.tsx | 8 +- apps/web/src/routes/_authed/servers/$id.tsx | 22 +- .../src/stores/recovery-jobs-store.test.ts | 15 +- apps/web/src/stores/recovery-jobs-store.ts | 9 +- crates/agent/src/rebind.rs | 33 +- .../m20260416_000017_create_recovery_job.rs | 15 +- .../server/src/router/api/server_recovery.rs | 1 - crates/server/src/router/ws/browser.rs | 138 +++++- crates/server/src/service/db_error.rs | 11 + crates/server/src/service/mod.rs | 1 + crates/server/src/service/recovery_job.rs | 11 +- crates/server/src/service/recovery_merge.rs | 39 +- crates/server/src/task/record_writer.rs | 10 +- .../plans/2026-04-16-agent-recovery-merge.md | 35 +- 22 files changed, 1078 insertions(+), 187 deletions(-) create mode 100644 crates/server/src/service/db_error.rs diff --git a/apps/web/openapi.json b/apps/web/openapi.json index 06a41fe6..692bbe49 100644 --- a/apps/web/openapi.json +++ b/apps/web/openapi.json @@ -9,6 +9,35 @@ "version": "0.2.1" }, "paths": { + "/api/agent/latest-version": { + "get": { + "tags": ["agent"], + "operationId": "latest_version", + "responses": { + "200": { + "description": "Latest agent release metadata", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/LatestAgentVersionResponse" + } + } + } + } + }, + "security": [ + { + "session_cookie": [] + }, + { + "api_key": [] + }, + { + "bearer_token": [] + } + ] + } + }, "/api/agent/register": { "post": { "tags": ["agent"], @@ -3518,6 +3547,76 @@ ] } }, + "/api/servers/recovery-jobs/{job_id}": { + "get": { + "tags": ["server-recovery"], + "operationId": "get_recovery_job", + "parameters": [ + { + "name": "job_id", + "in": "path", + "description": "Recovery job id", + "required": true, + "schema": { + "type": "string" + } + } + ], + "responses": { + "200": { + "description": "Recovery job details", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/RecoveryJobResponse" + } + } + } + }, + "401": { + "description": "Authentication required", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/ErrorBody" + } + } + } + }, + "403": { + "description": "Admin required", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/ErrorBody" + } + } + } + }, + "404": { + "description": "Recovery job not found", + "content": { + "application/json": 
{ + "schema": { + "$ref": "#/components/schemas/ErrorBody" + } + } + } + } + }, + "security": [ + { + "session_cookie": [] + }, + { + "api_key": [] + }, + { + "bearer_token": [] + } + ] + } + }, "/api/servers/{id}": { "get": { "tags": ["servers"], @@ -4021,6 +4120,189 @@ ] } }, + "/api/servers/{target_id}/recover-merge": { + "post": { + "tags": ["server-recovery"], + "operationId": "start_recovery_merge", + "parameters": [ + { + "name": "target_id", + "in": "path", + "description": "Original offline server id", + "required": true, + "schema": { + "type": "string" + } + } + ], + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/StartRecoveryRequest" + } + } + }, + "required": true + }, + "responses": { + "200": { + "description": "Recovery job created", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/RecoveryJobResponse" + } + } + } + }, + "401": { + "description": "Authentication required", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/ErrorBody" + } + } + } + }, + "403": { + "description": "Admin required", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/ErrorBody" + } + } + } + }, + "404": { + "description": "Server not found", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/ErrorBody" + } + } + } + }, + "409": { + "description": "Recovery cannot be started in the current state", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/ErrorBody" + } + } + } + }, + "422": { + "description": "Invalid request", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/ErrorBody" + } + } + } + } + }, + "security": [ + { + "session_cookie": [] + }, + { + "api_key": [] + }, + { + "bearer_token": [] + } + ] + } + }, + "/api/servers/{target_id}/recovery-candidates": { + "get": { + "tags": ["server-recovery"], + "operationId": "list_candidates", + "parameters": [ + { + "name": "target_id", + "in": "path", + "description": "Original offline server id", + "required": true, + "schema": { + "type": "string" + } + } + ], + "responses": { + "200": { + "description": "Recommended recovery candidates", + "content": { + "application/json": { + "schema": { + "type": "array", + "items": { + "$ref": "#/components/schemas/RecoveryCandidateResponse" + } + } + } + } + }, + "401": { + "description": "Authentication required", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/ErrorBody" + } + } + } + }, + "403": { + "description": "Admin required", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/ErrorBody" + } + } + } + }, + "404": { + "description": "Target server not found", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/ErrorBody" + } + } + } + }, + "409": { + "description": "Target must be offline and not already in a running recovery job", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/ErrorBody" + } + } + } + } + }, + "security": [ + { + "session_cookie": [] + }, + { + "api_key": [] + }, + { + "bearer_token": [] + } + ] + } + }, "/api/service-monitors": { "get": { "tags": ["service-monitors"], @@ -6764,6 +7046,21 @@ } ] }, + "LatestAgentVersionResponse": { + "type": "object", + "properties": { + "error": { + "type": ["string", "null"] + }, + "released_at": { + "type": ["string", "null"], + "format": 
"date-time" + }, + "version": { + "type": ["string", "null"] + } + } + }, "ListFilesRequest": { "type": "object", "required": ["path"], @@ -7356,6 +7653,95 @@ } } }, + "RecoveryCandidateResponse": { + "type": "object", + "required": ["server_id", "name", "score", "reasons"], + "properties": { + "name": { + "type": "string" + }, + "reasons": { + "type": "array", + "items": { + "type": "string" + } + }, + "score": { + "type": "integer", + "format": "int32" + }, + "server_id": { + "type": "string" + } + } + }, + "RecoveryJobResponse": { + "type": "object", + "required": [ + "job_id", + "target_server_id", + "source_server_id", + "status", + "stage", + "started_at", + "created_at", + "updated_at" + ], + "properties": { + "created_at": { + "type": "string", + "format": "date-time" + }, + "error": { + "type": ["string", "null"] + }, + "job_id": { + "type": "string" + }, + "last_heartbeat_at": { + "type": ["string", "null"], + "format": "date-time" + }, + "source_server_id": { + "type": "string" + }, + "stage": { + "$ref": "#/components/schemas/RecoveryJobStage" + }, + "started_at": { + "type": "string", + "format": "date-time" + }, + "status": { + "$ref": "#/components/schemas/RecoveryJobStatus" + }, + "target_server_id": { + "type": "string" + }, + "updated_at": { + "type": "string", + "format": "date-time" + } + } + }, + "RecoveryJobStage": { + "type": "string", + "enum": [ + "validating", + "rebinding", + "awaiting_target_online", + "freezing_writes", + "merging_history", + "finalizing", + "succeeded", + "failed", + "unknown" + ] + }, + "RecoveryJobStatus": { + "type": "string", + "enum": ["running", "failed", "succeeded", "unknown"] + }, "RegisterRequest": { "type": "object", "properties": { @@ -7929,6 +8315,15 @@ } } }, + "StartRecoveryRequest": { + "type": "object", + "required": ["source_server_id"], + "properties": { + "source_server_id": { + "type": "string" + } + } + }, "StatRequest": { "type": "object", "required": ["path"], diff --git a/apps/web/src/components/server/recovery-merge-dialog.test.tsx b/apps/web/src/components/server/recovery-merge-dialog.test.tsx index 46769cbf..189c43a3 100644 --- a/apps/web/src/components/server/recovery-merge-dialog.test.tsx +++ b/apps/web/src/components/server/recovery-merge-dialog.test.tsx @@ -46,8 +46,8 @@ describe('RecoveryMergeDialog', () => { render(, { wrapper: Wrapper }) - expect(screen.getByText('Source')).toBeDefined() - expect(screen.getByText('same remote address')).toBeDefined() + expect(screen.getByText('Source')).toBeInTheDocument() + expect(screen.getByText('same remote address')).toBeInTheDocument() }) it('disables submit until a candidate is selected', () => { @@ -60,9 +60,42 @@ describe('RecoveryMergeDialog', () => { render(, { wrapper: Wrapper }) const button = screen.getByText('Start Recovery').closest('button') - expect(button?.getAttribute('disabled')).toBe('') + expect(button).toBeDisabled() fireEvent.click(screen.getByText('Source')) - expect(button?.hasAttribute('disabled')).toBe(false) + expect(button).toBeEnabled() + }) + + it('becomes read-only when a current job exists', () => { + mockUseRecoveryCandidates.mockReturnValue({ + data: [{ server_id: 'source-1', name: 'Source', score: 42, reasons: ['same remote address'] }], + isError: false, + isLoading: false + }) + + render( + , + { wrapper: Wrapper } + ) + + expect(screen.getByText('This dialog is read-only while a recovery job is active.')).toBeInTheDocument() + expect(screen.queryByText('Source')).toBeNull() + expect(screen.getByText('Start 
Recovery').closest('button')).toBeDisabled() }) }) diff --git a/apps/web/src/components/server/recovery-merge-dialog.tsx b/apps/web/src/components/server/recovery-merge-dialog.tsx index c8dc5a72..df33483d 100644 --- a/apps/web/src/components/server/recovery-merge-dialog.tsx +++ b/apps/web/src/components/server/recovery-merge-dialog.tsx @@ -6,6 +6,7 @@ import { toast } from 'sonner' import { Badge } from '@/components/ui/badge' import { Button } from '@/components/ui/button' import { Dialog, DialogContent, DialogDescription, DialogHeader, DialogTitle } from '@/components/ui/dialog' +import { ScrollArea } from '@/components/ui/scroll-area' import { startRecoveryMerge, useRecoveryCandidates } from '@/hooks/use-api' import type { RecoveryJobResponse } from '@/lib/api-schema' @@ -20,8 +21,9 @@ export function RecoveryMergeDialog({ currentJob, onOpenChange, open, targetServ const { t } = useTranslation('servers') const queryClient = useQueryClient() const [selectedSourceId, setSelectedSourceId] = useState('') + const readOnly = currentJob != null - const candidatesQuery = useRecoveryCandidates(targetServerId, open) + const candidatesQuery = useRecoveryCandidates(targetServerId, open && !readOnly) const startMutation = useMutation({ mutationFn: (sourceServerId: string) => startRecoveryMerge(targetServerId, { source_server_id: sourceServerId }), @@ -39,7 +41,7 @@ export function RecoveryMergeDialog({ currentJob, onOpenChange, open, targetServ const candidates = candidatesQuery.data ?? [] const selectedCandidate = candidates.find((candidate) => candidate.server_id === selectedSourceId) - const canSubmit = selectedCandidate != null && !startMutation.isPending + const canSubmit = !readOnly && selectedCandidate != null && !startMutation.isPending return ( @@ -83,35 +85,46 @@ export function RecoveryMergeDialog({ currentJob, onOpenChange, open, targetServ
        )}

-        {candidates.length > 0 && (
-          <div className="…">
-            {candidates.map((candidate) => {
-              const selected = candidate.server_id === selectedSourceId
-              return (
-                <button … >
-                  …
-                </button>
-              )
-            })}
-          </div>
-        )}
+        {readOnly ? (
+          <div className="…">
+            {t('recovery_merge_read_only', {
+              defaultValue: 'This dialog is read-only while a recovery job is active.'
+            })}
+          </div>
+        ) : (
+          candidates.length > 0 && (
+            <ScrollArea className="…">
+              <div className="…">
+                {candidates.map((candidate) => {
+                  const selected = candidate.server_id === selectedSourceId
+                  return (
+                    <button … >
+                      …
+                    </button>
+                  )
+                })}
+              </div>
+            </ScrollArea>
+          )
+        )}
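For reference, the gating this hunk introduces reduces to a few booleans. A minimal standalone sketch (assuming the names used in the diff above — `currentJob`, the selected source id, and the start mutation's pending flag; the helper itself is illustrative and not part of the patch):

```ts
// Sketch of the read-only gating in RecoveryMergeDialog: any active job
// freezes the dialog, stops candidate fetching, and blocks submission.
interface RecoveryJobLike {
  job_id: string
  status: 'running' | 'failed' | 'succeeded' | 'unknown'
}

function recoveryDialogGating(
  currentJob: RecoveryJobLike | null | undefined,
  selectedSourceId: string,
  mutationPending: boolean
) {
  const readOnly = currentJob != null
  return {
    readOnly,
    // candidates are only fetched while the dialog is interactive
    fetchCandidates: !readOnly,
    // submit needs an explicit selection and no in-flight start mutation
    canSubmit: !readOnly && selectedSourceId !== '' && !mutationPending
  }
}

// No job yet: interactive, submittable once a source is picked.
recoveryDialogGating(undefined, 'source-1', false) // { readOnly: false, fetchCandidates: true, canSubmit: true }
// Active job: everything locked down.
recoveryDialogGating({ job_id: 'job-1', status: 'running' }, '', false) // { readOnly: true, fetchCandidates: false, canSubmit: false }
```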
diff --git a/apps/web/src/hooks/use-servers-ws.test.ts b/apps/web/src/hooks/use-servers-ws.test.ts index 00f42277..8a87335c 100644 --- a/apps/web/src/hooks/use-servers-ws.test.ts +++ b/apps/web/src/hooks/use-servers-ws.test.ts @@ -37,6 +37,18 @@ function makeServer(overrides: Partial = {}): ServerMetrics { } } +function makeQueryClient() { + const cache = new Map() + return { + setQueryData: (key: unknown[], value: unknown | ((prev: unknown) => unknown)) => { + const cacheKey = JSON.stringify(key) + const prev = cache.get(cacheKey) + const next = typeof value === 'function' ? (value as (prev: unknown) => unknown)(prev) : value + cache.set(cacheKey, next) + } + } +} + describe('mergeServerUpdate', () => { it('updates dynamic fields', () => { const prev = [makeServer({ cpu: 50 })] @@ -110,18 +122,6 @@ describe('setServerCapabilities', () => { }) describe('handleWsMessage upgrade messages', () => { - function makeQueryClient() { - const cache = new Map() - return { - setQueryData: (key: unknown[], value: unknown | ((prev: unknown) => unknown)) => { - const cacheKey = JSON.stringify(key) - const prev = cache.get(cacheKey) - const next = typeof value === 'function' ? (value as (prev: unknown) => unknown)(prev) : value - cache.set(cacheKey, next) - } - } - } - it('hydrates upgrade jobs from full_sync', () => { useUpgradeJobsStore.setState({ jobs: new Map() }) const queryClient = makeQueryClient() @@ -212,18 +212,6 @@ describe('handleWsMessage upgrade messages', () => { }) describe('handleWsMessage recovery messages', () => { - function makeQueryClient() { - const cache = new Map() - return { - setQueryData: (key: unknown[], value: unknown | ((prev: unknown) => unknown)) => { - const cacheKey = JSON.stringify(key) - const prev = cache.get(cacheKey) - const next = typeof value === 'function' ? 
(value as (prev: unknown) => unknown)(prev) : value - cache.set(cacheKey, next) - } - } - } - it('hydrates recovery jobs from full_sync', () => { useRecoveryJobsStore.setState({ jobs: new Map() }) const queryClient = makeQueryClient() diff --git a/apps/web/src/lib/api-schema.ts b/apps/web/src/lib/api-schema.ts index a549c9c8..bd7476e2 100644 --- a/apps/web/src/lib/api-schema.ts +++ b/apps/web/src/lib/api-schema.ts @@ -25,43 +25,11 @@ export type ServerMetricRecord = S['ServerRecord'] export type UpdateServerInput = S['UpdateServerInput'] export type BatchDeleteRequest = S['BatchDeleteRequest'] export type BatchDeleteResponse = S['BatchDeleteResponse'] - -export interface RecoveryCandidateResponse { - name: string - reasons: string[] - score: number - server_id: string -} - -export type RecoveryJobStatus = 'running' | 'failed' | 'succeeded' | 'unknown' - -export type RecoveryJobStage = - | 'validating' - | 'rebinding' - | 'awaiting_target_online' - | 'freezing_writes' - | 'merging_history' - | 'finalizing' - | 'succeeded' - | 'failed' - | 'unknown' - -export interface RecoveryJobResponse { - created_at: string - error: string | null - job_id: string - last_heartbeat_at: string | null - source_server_id: string - stage: RecoveryJobStage - started_at: string - status: RecoveryJobStatus - target_server_id: string - updated_at: string -} - -export interface StartRecoveryRequest { - source_server_id: string -} +export type RecoveryCandidateResponse = S['RecoveryCandidateResponse'] +export type RecoveryJobResponse = S['RecoveryJobResponse'] +export type RecoveryJobStage = S['RecoveryJobStage'] +export type RecoveryJobStatus = S['RecoveryJobStatus'] +export type StartRecoveryRequest = S['StartRecoveryRequest'] // Server groups export type ServerGroup = S['ServerGroup'] diff --git a/apps/web/src/lib/api-types.ts b/apps/web/src/lib/api-types.ts index a59529d1..d94b8f23 100644 --- a/apps/web/src/lib/api-types.ts +++ b/apps/web/src/lib/api-types.ts @@ -4,6 +4,22 @@ */ export interface paths { + '/api/agent/latest-version': { + parameters: { + query?: never + header?: never + path?: never + cookie?: never + } + get: operations['latest_version'] + put?: never + post?: never + delete?: never + options?: never + head?: never + patch?: never + trace?: never + } '/api/agent/register': { parameters: { query?: never @@ -1161,6 +1177,38 @@ export interface paths { patch?: never trace?: never } + '/api/servers/{target_id}/recover-merge': { + parameters: { + query?: never + header?: never + path?: never + cookie?: never + } + get?: never + put?: never + post: operations['start_recovery_merge'] + delete?: never + options?: never + head?: never + patch?: never + trace?: never + } + '/api/servers/{target_id}/recovery-candidates': { + parameters: { + query?: never + header?: never + path?: never + cookie?: never + } + get: operations['list_candidates'] + put?: never + post?: never + delete?: never + options?: never + head?: never + patch?: never + trace?: never + } '/api/servers/batch-capabilities': { parameters: { query?: never @@ -1209,6 +1257,22 @@ export interface paths { patch?: never trace?: never } + '/api/servers/recovery-jobs/{job_id}': { + parameters: { + query?: never + header?: never + path?: never + cookie?: never + } + get: operations['get_recovery_job'] + put?: never + post?: never + delete?: never + options?: never + head?: never + patch?: never + trace?: never + } '/api/service-monitors': { parameters: { query?: never @@ -2007,6 +2071,12 @@ export interface components { IncidentWithUpdates: 
components['schemas']['Incident'] & { updates: components['schemas']['IncidentUpdate'][] } + LatestAgentVersionResponse: { + error?: string | null + /** Format: date-time */ + released_at?: string | null + version?: string | null + } ListFilesRequest: { path: string } @@ -2187,6 +2257,42 @@ export interface components { ReadResponse: { content: string } + RecoveryCandidateResponse: { + name: string + reasons: string[] + /** Format: int32 */ + score: number + server_id: string + } + RecoveryJobResponse: { + /** Format: date-time */ + created_at: string + error?: string | null + job_id: string + /** Format: date-time */ + last_heartbeat_at?: string | null + source_server_id: string + stage: components['schemas']['RecoveryJobStage'] + /** Format: date-time */ + started_at: string + status: components['schemas']['RecoveryJobStatus'] + target_server_id: string + /** Format: date-time */ + updated_at: string + } + /** @enum {string} */ + RecoveryJobStage: + | 'validating' + | 'rebinding' + | 'awaiting_target_online' + | 'freezing_writes' + | 'merging_history' + | 'finalizing' + | 'succeeded' + | 'failed' + | 'unknown' + /** @enum {string} */ + RecoveryJobStatus: 'running' | 'failed' | 'succeeded' | 'unknown' RegisterRequest: { fingerprint?: string } @@ -2399,6 +2505,9 @@ export interface components { /** Format: date-time */ time: string } + StartRecoveryRequest: { + source_server_id: string + } StatRequest: { path: string } @@ -4219,6 +4328,56 @@ export interface operations { } } } + get_recovery_job: { + parameters: { + query?: never + header?: never + path: { + /** @description Recovery job id */ + job_id: string + } + cookie?: never + } + requestBody?: never + responses: { + /** @description Recovery job details */ + 200: { + headers: { + [name: string]: unknown + } + content: { + 'application/json': components['schemas']['RecoveryJobResponse'] + } + } + /** @description Authentication required */ + 401: { + headers: { + [name: string]: unknown + } + content: { + 'application/json': components['schemas']['ErrorBody'] + } + } + /** @description Admin required */ + 403: { + headers: { + [name: string]: unknown + } + content: { + 'application/json': components['schemas']['ErrorBody'] + } + } + /** @description Recovery job not found */ + 404: { + headers: { + [name: string]: unknown + } + content: { + 'application/json': components['schemas']['ErrorBody'] + } + } + } + } get_rule: { parameters: { query?: never @@ -4654,6 +4813,26 @@ export interface operations { } } } + latest_version: { + parameters: { + query?: never + header?: never + path?: never + cookie?: never + } + requestBody?: never + responses: { + /** @description Latest agent release metadata */ + 200: { + headers: { + [name: string]: unknown + } + content: { + 'application/json': components['schemas']['LatestAgentVersionResponse'] + } + } + } + } list_alert_events: { parameters: { query?: { @@ -4726,6 +4905,65 @@ export interface operations { } } } + list_candidates: { + parameters: { + query?: never + header?: never + path: { + /** @description Original offline server id */ + target_id: string + } + cookie?: never + } + requestBody?: never + responses: { + /** @description Recommended recovery candidates */ + 200: { + headers: { + [name: string]: unknown + } + content: { + 'application/json': components['schemas']['RecoveryCandidateResponse'][] + } + } + /** @description Authentication required */ + 401: { + headers: { + [name: string]: unknown + } + content: { + 'application/json': components['schemas']['ErrorBody'] + } + } + 
/** @description Admin required */ + 403: { + headers: { + [name: string]: unknown + } + content: { + 'application/json': components['schemas']['ErrorBody'] + } + } + /** @description Target server not found */ + 404: { + headers: { + [name: string]: unknown + } + content: { + 'application/json': components['schemas']['ErrorBody'] + } + } + /** @description Target must be offline and not already in a running recovery job */ + 409: { + headers: { + [name: string]: unknown + } + content: { + 'application/json': components['schemas']['ErrorBody'] + } + } + } + } list_dashboards: { parameters: { query?: never @@ -5925,6 +6163,78 @@ export interface operations { } } } + start_recovery_merge: { + parameters: { + query?: never + header?: never + path: { + /** @description Original offline server id */ + target_id: string + } + cookie?: never + } + requestBody: { + content: { + 'application/json': components['schemas']['StartRecoveryRequest'] + } + } + responses: { + /** @description Recovery job created */ + 200: { + headers: { + [name: string]: unknown + } + content: { + 'application/json': components['schemas']['RecoveryJobResponse'] + } + } + /** @description Authentication required */ + 401: { + headers: { + [name: string]: unknown + } + content: { + 'application/json': components['schemas']['ErrorBody'] + } + } + /** @description Admin required */ + 403: { + headers: { + [name: string]: unknown + } + content: { + 'application/json': components['schemas']['ErrorBody'] + } + } + /** @description Server not found */ + 404: { + headers: { + [name: string]: unknown + } + content: { + 'application/json': components['schemas']['ErrorBody'] + } + } + /** @description Recovery cannot be started in the current state */ + 409: { + headers: { + [name: string]: unknown + } + content: { + 'application/json': components['schemas']['ErrorBody'] + } + } + /** @description Invalid request */ + 422: { + headers: { + [name: string]: unknown + } + content: { + 'application/json': components['schemas']['ErrorBody'] + } + } + } + } stat_file: { parameters: { query?: never diff --git a/apps/web/src/locales/en/servers.json b/apps/web/src/locales/en/servers.json index 684cec2f..9fa830f2 100644 --- a/apps/web/src/locales/en/servers.json +++ b/apps/web/src/locales/en/servers.json @@ -211,5 +211,14 @@ "recovery_merge_starting": "Starting…", "recovery_merge_started": "Recovery started", "recovery_merge_failed": "Recovery failed", - "recovery_merge_candidates_failed": "Failed to load recovery candidates." 
+ "recovery_merge_candidates_failed": "Failed to load recovery candidates.", + "recovery_merge_read_only": "This dialog is read-only while a recovery job is active.", + "recovery_stage_validating": "Validating", + "recovery_stage_rebinding": "Rebinding", + "recovery_stage_awaiting_target_online": "Waiting for Target", + "recovery_stage_freezing_writes": "Freezing Writes", + "recovery_stage_merging_history": "Merging History", + "recovery_stage_finalizing": "Finalizing", + "recovery_stage_failed": "Failed", + "recovery_stage_succeeded": "Succeeded" } diff --git a/apps/web/src/locales/zh/servers.json b/apps/web/src/locales/zh/servers.json index 98fe7dba..85cb7b4b 100644 --- a/apps/web/src/locales/zh/servers.json +++ b/apps/web/src/locales/zh/servers.json @@ -211,5 +211,14 @@ "recovery_merge_starting": "启动中…", "recovery_merge_started": "恢复任务已启动", "recovery_merge_failed": "恢复失败", - "recovery_merge_candidates_failed": "加载恢复候选项失败。" + "recovery_merge_candidates_failed": "加载恢复候选项失败。", + "recovery_merge_read_only": "当前存在恢复任务时,此对话框为只读状态。", + "recovery_stage_validating": "校验中", + "recovery_stage_rebinding": "重新绑定中", + "recovery_stage_awaiting_target_online": "等待目标上线", + "recovery_stage_freezing_writes": "冻结写入", + "recovery_stage_merging_history": "合并历史", + "recovery_stage_finalizing": "收尾中", + "recovery_stage_failed": "失败", + "recovery_stage_succeeded": "完成" } diff --git a/apps/web/src/routes/_authed/servers/$id.test.tsx b/apps/web/src/routes/_authed/servers/$id.test.tsx index 0fb1e3a1..4daccdab 100644 --- a/apps/web/src/routes/_authed/servers/$id.test.tsx +++ b/apps/web/src/routes/_authed/servers/$id.test.tsx @@ -150,7 +150,11 @@ vi.mock('@/stores/upgrade-jobs-store', () => ({ })) vi.mock('@/stores/recovery-jobs-store', () => ({ - useRecoveryJobsStore: () => undefined + useRecoveryJobsStore: (selector: (state: { hydrated: boolean; jobs: Map }) => unknown) => + selector({ + hydrated: true, + jobs: new Map() + }) })) const { ServerDetailPage } = await import('./$id') @@ -221,6 +225,6 @@ describe('ServerDetailPage', () => { it('shows recovery action for offline server when admin', () => { render() - expect(screen.getByText('Recover Agent')).toBeDefined() + expect(screen.getByText('Recover Agent')).toBeInTheDocument() }) }) diff --git a/apps/web/src/routes/_authed/servers/$id.tsx b/apps/web/src/routes/_authed/servers/$id.tsx index f8d5d299..4ccbf921 100644 --- a/apps/web/src/routes/_authed/servers/$id.tsx +++ b/apps/web/src/routes/_authed/servers/$id.tsx @@ -79,6 +79,17 @@ function formatCurrency(price: number, currency: string): string { } } +function translateRecoveryStage( + t: (key: string, options?: { defaultValue?: string }) => string, + stage: string | undefined +): string | null { + if (!stage) { + return null + } + + return t(`recovery_stage_${stage}`, { defaultValue: stage }) +} + function ServerInfoMeta({ server }: { server: ServerResponse }) { const { t } = useTranslation('servers') return ( @@ -132,6 +143,7 @@ function ServerActionButtons({ id, isAdmin, isOnline, + recoveryHydrated, onEditOpen, onRecoveryOpen, serverWithCaps, @@ -143,6 +155,7 @@ function ServerActionButtons({ id: string isAdmin: boolean isOnline: boolean + recoveryHydrated: boolean onEditOpen: () => void onRecoveryOpen: () => void serverWithCaps: ServerResponse & ServerWithCaps @@ -157,7 +170,7 @@ function ServerActionButtons({ {isAdmin && !isOnline && ( -
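The `recovery_stage_*` strings and the `translateRecoveryStage` helper above resolve a job's stage to a display label, falling back to the raw stage name when no translation exists. A minimal sketch of that resolution (the lookup table below stands in for i18next and is an assumption, not the app's real `t`):

```ts
// Resolves a recovery stage to a label via recovery_stage_* keys;
// unmapped stages fall back to the raw string so they never render blank.
const stageLabels: Record<string, string> = {
  recovery_stage_validating: 'Validating',
  recovery_stage_merging_history: 'Merging History',
  recovery_stage_finalizing: 'Finalizing'
}

function stageLabel(stage: string | undefined): string | null {
  if (!stage) {
    return null
  }
  return stageLabels[`recovery_stage_${stage}`] ?? stage
}

console.log(stageLabel('merging_history')) // "Merging History"
console.log(stageLabel('rolling_back'))    // "rolling_back" (hypothetical stage, fallback path)
```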
@@ -610,6 +627,7 @@ export function ServerDetailPage() { isOnline={isOnline} onEditOpen={() => setEditOpen(true)} onRecoveryOpen={() => setRecoveryOpen(true)} + recoveryHydrated={recoveryHydrated} serverWithCaps={serverWithCaps} terminalEnabled={terminalEnabled} /> diff --git a/apps/web/src/stores/recovery-jobs-store.test.ts b/apps/web/src/stores/recovery-jobs-store.test.ts index 6b20699b..60d51aec 100644 --- a/apps/web/src/stores/recovery-jobs-store.test.ts +++ b/apps/web/src/stores/recovery-jobs-store.test.ts @@ -25,7 +25,7 @@ function buildJob() { describe('useRecoveryJobsStore', () => { it('stores jobs keyed by target server id', () => { - useRecoveryJobsStore.setState({ jobs: new Map() }) + useRecoveryJobsStore.setState({ hydrated: false, jobs: new Map() }) useRecoveryJobsStore.getState().setJob('target-1', makeJob()) @@ -33,7 +33,7 @@ describe('useRecoveryJobsStore', () => { }) it('replaces the whole map on setJobs', () => { - useRecoveryJobsStore.setState({ jobs: new Map() }) + useRecoveryJobsStore.setState({ hydrated: false, jobs: new Map() }) useRecoveryJobsStore.getState().setJob('old-target', makeJob({ target_server_id: 'old-target' })) useRecoveryJobsStore @@ -42,14 +42,23 @@ describe('useRecoveryJobsStore', () => { expect(useRecoveryJobsStore.getState().getJob('old-target')).toBeUndefined() expect(useRecoveryJobsStore.getState().getJob('target-2')?.job_id).toBe('job-2') + expect(useRecoveryJobsStore.getState().hydrated).toBe(true) }) it('clears a job by target server id', () => { - useRecoveryJobsStore.setState({ jobs: new Map() }) + useRecoveryJobsStore.setState({ hydrated: false, jobs: new Map() }) useRecoveryJobsStore.getState().setJob('target-1', makeJob()) useRecoveryJobsStore.getState().clearJob('target-1') expect(useRecoveryJobsStore.getState().getJob('target-1')).toBeUndefined() }) + + it('can mark hydration explicitly', () => { + useRecoveryJobsStore.setState({ hydrated: false, jobs: new Map() }) + + useRecoveryJobsStore.getState().setHydrated(true) + + expect(useRecoveryJobsStore.getState().hydrated).toBe(true) + }) }) diff --git a/apps/web/src/stores/recovery-jobs-store.ts b/apps/web/src/stores/recovery-jobs-store.ts index 229058bd..dd49e104 100644 --- a/apps/web/src/stores/recovery-jobs-store.ts +++ b/apps/web/src/stores/recovery-jobs-store.ts @@ -4,12 +4,15 @@ import type { RecoveryJobResponse } from '@/lib/api-schema' interface RecoveryJobsState { clearJob: (targetServerId: string) => void getJob: (targetServerId: string) => RecoveryJobResponse | undefined + hydrated: boolean jobs: Map + setHydrated: (hydrated: boolean) => void setJob: (targetServerId: string, job: RecoveryJobResponse) => void setJobs: (jobs: RecoveryJobResponse[]) => void } export const useRecoveryJobsStore = create()((set, get) => ({ + hydrated: false, jobs: new Map(), setJob: (targetServerId: string, job: RecoveryJobResponse) => { @@ -33,8 +36,10 @@ export const useRecoveryJobsStore = create()((set, get) => ({ for (const job of jobs) { next.set(job.target_server_id, job) } - set({ jobs: next }) + set({ jobs: next, hydrated: true }) }, - getJob: (targetServerId: string) => get().jobs.get(targetServerId) + getJob: (targetServerId: string) => get().jobs.get(targetServerId), + + setHydrated: (hydrated: boolean) => set({ hydrated }) })) diff --git a/crates/agent/src/rebind.rs b/crates/agent/src/rebind.rs index 5df03765..97643579 100644 --- a/crates/agent/src/rebind.rs +++ b/crates/agent/src/rebind.rs @@ -7,6 +7,7 @@ use anyhow::Context; fn render_token_content(existing: &str, token: &str) -> String 
{ let token_line = format!("token = \"{token}\""); + let had_trailing_newline = existing.ends_with('\n'); let mut lines: Vec = existing.lines().map(ToOwned::to_owned).collect(); let preamble_end = lines .iter() @@ -20,7 +21,11 @@ fn render_token_content(existing: &str, token: &str) -> String { lines.insert(preamble_end, token_line); } - lines.join("\n") + let mut rendered = lines.join("\n"); + if had_trailing_newline { + rendered.push('\n'); + } + rendered } fn is_token_line(line: &str) -> bool { @@ -96,8 +101,8 @@ fn replace_file(temp_path: &Path, path: &Path) -> anyhow::Result<()> { fs::rename(temp_path, path).with_context(|| { format!( "failed to atomically replace {} with {}", - path.display(), - temp_path.display() + temp_path.display(), + path.display() ) }) } @@ -127,8 +132,8 @@ fn replace_file(temp_path: &Path, path: &Path) -> anyhow::Result<()> { Err(std::io::Error::last_os_error()).with_context(|| { format!( "failed to atomically replace {} with {}", - path.display(), - temp_path.display() + temp_path.display(), + path.display() ) }) } else { @@ -148,7 +153,8 @@ pub(crate) fn assert_persist_rebind_token() { assert_eq!( content, r#"server_url = "http://127.0.0.1:9527" -token = "focused-token""# +token = "focused-token" +"# ); } @@ -192,7 +198,8 @@ log.level = "debug""# assert_eq!( content, r#"server_url = "http://127.0.0.1:9527" -token = "fresh-token""# +token = "fresh-token" +"# ); } @@ -224,6 +231,18 @@ level = "info""# ); } + #[test] + fn persist_rebind_token_preserves_trailing_newline() { + let tempdir = TempDir::new().expect("tempdir"); + let path = tempdir.path().join("agent.toml"); + fs::write(&path, "server_url = \"http://127.0.0.1:9527\"\n").expect("seed file"); + + super::persist_rebind_token_impl(&path, "fresh-token").expect("persist"); + + let content = fs::read_to_string(&path).expect("read file"); + assert!(content.ends_with('\n')); + } + #[test] fn persist_rebind_token_preserves_nested_token_and_inserts_top_level_token() { let tempdir = TempDir::new().expect("tempdir"); diff --git a/crates/server/src/migration/m20260416_000017_create_recovery_job.rs b/crates/server/src/migration/m20260416_000017_create_recovery_job.rs index af476b06..7b92deba 100644 --- a/crates/server/src/migration/m20260416_000017_create_recovery_job.rs +++ b/crates/server/src/migration/m20260416_000017_create_recovery_job.rs @@ -17,8 +17,19 @@ impl MigrationTrait for Migration { job_id TEXT PRIMARY KEY NOT NULL, target_server_id TEXT NOT NULL, source_server_id TEXT NOT NULL, - status TEXT NOT NULL, - stage TEXT NOT NULL, + status TEXT NOT NULL CHECK (status IN ('running', 'failed', 'succeeded')), + stage TEXT NOT NULL CHECK ( + stage IN ( + 'validating', + 'rebinding', + 'awaiting_target_online', + 'freezing_writes', + 'merging_history', + 'finalizing', + 'succeeded', + 'failed' + ) + ), checkpoint_json TEXT NULL, error TEXT NULL, started_at DATETIME NOT NULL, diff --git a/crates/server/src/router/api/server_recovery.rs b/crates/server/src/router/api/server_recovery.rs index 1004f025..04409bca 100644 --- a/crates/server/src/router/api/server_recovery.rs +++ b/crates/server/src/router/api/server_recovery.rs @@ -641,7 +641,6 @@ mod tests { .agent_manager .add_connection("source-1".into(), "Source".into(), tx, test_addr()); state.agent_manager.set_protocol_version("source-1", 3); - state.agent_manager.set_protocol_version("source-1", 3); let before = server::Entity::find_by_id("target-1") .one(&db) diff --git a/crates/server/src/router/ws/browser.rs b/crates/server/src/router/ws/browser.rs index 
abc915c0..0c086c1c 100644 --- a/crates/server/src/router/ws/browser.rs +++ b/crates/server/src/router/ws/browser.rs @@ -33,9 +33,9 @@ async fn browser_ws_handler( // Validate auth: try session cookie first, then API key, then Bearer token let auth = validate_browser_auth(&state, &headers).await; match auth { - Some((_user_id, mobile_expires)) => ws + Some((_user_id, is_admin, mobile_expires)) => ws .max_message_size(MAX_WS_MESSAGE_SIZE) - .on_upgrade(move |socket| handle_browser_ws(socket, state, mobile_expires)), + .on_upgrade(move |socket| handle_browser_ws(socket, state, is_admin, mobile_expires)), None => axum::http::StatusCode::UNAUTHORIZED.into_response(), } } @@ -48,20 +48,20 @@ async fn browser_ws_handler( async fn validate_browser_auth( state: &Arc, headers: &HeaderMap, -) -> Option<(String, Option>)> { +) -> Option<(String, bool, Option>)> { // Try session cookie (always web source → no mobile expiry) if let Some(token) = extract_session_cookie(headers) && let Ok(Some((user, _session))) = AuthService::validate_session(&state.db, &token, state.config.auth.session_ttl).await { - return Some((user.id, None)); + return Some((user.id, user.role == "admin", None)); } // Try API key header (no expiry) if let Some(key) = extract_api_key(headers) && let Ok(Some(user)) = AuthService::validate_api_key(&state.db, &key).await { - return Some((user.id, None)); + return Some((user.id, user.role == "admin", None)); } // Try Bearer token (may be a mobile session with a fixed expiry) @@ -74,7 +74,7 @@ async fn validate_browser_auth( } else { None }; - return Some((user.id, mobile_expires)); + return Some((user.id, user.role == "admin", mobile_expires)); } None @@ -112,6 +112,7 @@ fn extract_bearer_token(headers: &HeaderMap) -> Option { async fn handle_browser_ws( socket: WebSocket, state: Arc, + is_admin: bool, mobile_expires: Option>, ) { let (mut ws_sink, mut ws_stream) = socket.split(); @@ -119,7 +120,7 @@ async fn handle_browser_ws( let connection_id = uuid::Uuid::new_v4().to_string(); // Build FullSync message from DB servers + agent_manager online/report data - let full_sync = build_full_sync(&state).await; + let full_sync = build_full_sync(&state, is_admin).await; if let Err(e) = send_browser_message(&mut ws_sink, &full_sync).await { tracing::error!("Failed to send FullSync to browser: {e}"); return; @@ -136,7 +137,10 @@ async fn handle_browser_ws( msg = browser_rx.recv() => { match msg { Ok(browser_msg) => { - if let Err(e) = send_browser_message(&mut ws_sink, &browser_msg).await { + let filtered = filter_browser_message(browser_msg, is_admin); + if let Some(filtered) = filtered + && let Err(e) = send_browser_message(&mut ws_sink, &filtered).await + { tracing::debug!("Failed to send to browser WS: {e}"); break; } @@ -144,7 +148,7 @@ async fn handle_browser_ws( Err(tokio::sync::broadcast::error::RecvError::Lagged(n)) => { tracing::warn!("Browser WS lagged by {n} messages, sending full resync"); // On lag, send a full resync - let resync = build_full_sync(&state).await; + let resync = build_full_sync(&state, is_admin).await; if let Err(e) = send_browser_message(&mut ws_sink, &resync).await { tracing::debug!("Failed to send resync to browser WS: {e}"); break; @@ -253,8 +257,12 @@ async fn handle_browser_client_message( } } -async fn build_full_sync(state: &Arc) -> BrowserMessage { - let recoveries = recovery_snapshot(state).await; +async fn build_full_sync(state: &Arc, is_admin: bool) -> BrowserMessage { + let recoveries = if is_admin { + recovery_snapshot(state).await.unwrap_or_default() + } 
else { + Vec::new() + }; let servers = match ServerService::list_servers(&state.db).await { Ok(servers) => servers, Err(e) => { @@ -366,23 +374,49 @@ async fn build_full_sync(state: &Arc) -> BrowserMessage { } } -pub(crate) async fn recovery_snapshot(state: &Arc) -> Vec { +pub(crate) async fn recovery_snapshot(state: &Arc) -> Option> { match recovery_job::Entity::find().all(&state.db).await { - Ok(jobs) => jobs.into_iter().map(Into::into).collect(), + Ok(jobs) => Some(jobs.into_iter().map(Into::into).collect()), Err(e) => { tracing::error!("Failed to list recovery jobs for browser sync: {e}"); - Vec::new() + None } } } pub(crate) async fn broadcast_recovery_update(state: &Arc) { + let Some(recoveries) = recovery_snapshot(state).await else { + return; + }; let _ = state.browser_tx.send(BrowserMessage::Update { servers: Vec::new(), - recoveries: Some(recovery_snapshot(state).await), + recoveries: Some(recoveries), }); } +fn filter_browser_message(msg: BrowserMessage, is_admin: bool) -> Option { + if is_admin { + return Some(msg); + } + + match msg { + BrowserMessage::FullSync { + servers, + upgrades, + .. + } => Some(BrowserMessage::FullSync { + servers, + upgrades, + recoveries: Vec::new(), + }), + BrowserMessage::Update { servers, .. } => Some(BrowserMessage::Update { + servers, + recoveries: None, + }), + other => Some(other), + } +} + async fn send_browser_message( sink: &mut futures_util::stream::SplitSink, msg: &BrowserMessage, @@ -490,7 +524,7 @@ mod tests { .await .unwrap(); - let message = build_full_sync(&state).await; + let message = build_full_sync(&state, true).await; match message { BrowserMessage::FullSync { recoveries, .. } => { @@ -546,7 +580,7 @@ mod tests { .await .unwrap(); - let message = build_full_sync(&state).await; + let message = build_full_sync(&state, true).await; match message { BrowserMessage::FullSync { recoveries, .. } => { @@ -566,6 +600,41 @@ mod tests { } } + #[tokio::test] + async fn full_sync_hides_recoveries_for_non_admin() { + let (db, _tmp) = setup_test_db().await; + insert_server(&db, "target-1", "Target").await; + insert_server(&db, "source-1", "Source").await; + let state = AppState::new(db.clone(), AppConfig::default()) + .await + .unwrap(); + + let now = Utc::now(); + recovery_job::ActiveModel { + job_id: Set("job-1".to_string()), + target_server_id: Set("target-1".to_string()), + source_server_id: Set("source-1".to_string()), + status: Set("running".to_string()), + stage: Set("rebinding".to_string()), + checkpoint_json: Set(None), + error: Set(None), + started_at: Set(now), + created_at: Set(now), + updated_at: Set(now), + last_heartbeat_at: Set(Some(now)), + } + .insert(&db) + .await + .unwrap(); + + let message = build_full_sync(&state, false).await; + + match message { + BrowserMessage::FullSync { recoveries, .. 
} => assert!(recoveries.is_empty()), + other => panic!("expected full sync, got {other:?}"), + } + } + #[tokio::test] async fn broadcast_recovery_update_includes_terminal_recovery_states() { let (db, _tmp) = setup_test_db().await; @@ -610,4 +679,39 @@ mod tests { other => panic!("expected update with recoveries, got {other:?}"), } } + + #[tokio::test] + async fn full_sync_strips_recoveries_for_non_admin() { + let (db, _tmp) = setup_test_db().await; + insert_server(&db, "target-1", "Target").await; + insert_server(&db, "source-1", "Source").await; + let state = AppState::new(db.clone(), AppConfig::default()) + .await + .unwrap(); + + let now = Utc::now(); + recovery_job::ActiveModel { + job_id: Set("job-1".to_string()), + target_server_id: Set("target-1".to_string()), + source_server_id: Set("source-1".to_string()), + status: Set("running".to_string()), + stage: Set("rebinding".to_string()), + checkpoint_json: Set(None), + error: Set(None), + started_at: Set(now), + created_at: Set(now), + updated_at: Set(now), + last_heartbeat_at: Set(Some(now)), + } + .insert(&db) + .await + .unwrap(); + + let message = build_full_sync(&state, false).await; + + match message { + BrowserMessage::FullSync { recoveries, .. } => assert!(recoveries.is_empty()), + other => panic!("expected full sync, got {other:?}"), + } + } } diff --git a/crates/server/src/service/db_error.rs b/crates/server/src/service/db_error.rs new file mode 100644 index 00000000..08e289a4 --- /dev/null +++ b/crates/server/src/service/db_error.rs @@ -0,0 +1,11 @@ +use sea_orm::DbErr; + +pub(crate) fn is_unique_violation(err: &DbErr) -> bool { + let message = err.to_string(); + message.contains("UNIQUE constraint failed") || message.contains("UNIQUE") +} + +pub(crate) fn is_active_recovery_conflict(err: &DbErr) -> bool { + let message = err.to_string(); + is_unique_violation(err) || message.contains("recovery_job_active_conflict") +} diff --git a/crates/server/src/service/mod.rs b/crates/server/src/service/mod.rs index 1b11fa5b..3818305d 100644 --- a/crates/server/src/service/mod.rs +++ b/crates/server/src/service/mod.rs @@ -6,6 +6,7 @@ pub mod auth; pub mod checker; pub mod config; pub mod dashboard; +pub mod db_error; pub mod docker; pub mod docker_viewer; pub mod file_transfer; diff --git a/crates/server/src/service/recovery_job.rs b/crates/server/src/service/recovery_job.rs index 591b763b..2d920614 100644 --- a/crates/server/src/service/recovery_job.rs +++ b/crates/server/src/service/recovery_job.rs @@ -4,19 +4,10 @@ use uuid::Uuid; use crate::entity::recovery_job; use crate::error::AppError; +use crate::service::db_error::is_active_recovery_conflict; pub struct RecoveryJobService; -fn is_unique_violation(err: &DbErr) -> bool { - let message = err.to_string(); - message.contains("UNIQUE constraint failed") || message.contains("UNIQUE") -} - -fn is_active_recovery_conflict(err: &DbErr) -> bool { - let message = err.to_string(); - is_unique_violation(err) || message.contains("recovery_job_active_conflict") -} - impl RecoveryJobService { pub async fn create_job( db: &DatabaseConnection, diff --git a/crates/server/src/service/recovery_merge.rs b/crates/server/src/service/recovery_merge.rs index 6b3fe3e8..1d2338c8 100644 --- a/crates/server/src/service/recovery_merge.rs +++ b/crates/server/src/service/recovery_merge.rs @@ -11,6 +11,7 @@ use sea_orm::{ use crate::entity::{network_probe_config, recovery_job, server, server_tag}; use crate::error::AppError; use crate::service::auth::AuthService; +use 
crate::service::db_error::is_active_recovery_conflict; use crate::service::recovery_job::RecoveryJobService; use crate::service::traffic::TrafficService; use crate::state::AppState; @@ -39,16 +40,6 @@ pub enum RecoveryRetryStrategy { ResumeSameJob, } -fn is_unique_violation(err: &sea_orm::DbErr) -> bool { - let message = err.to_string(); - message.contains("UNIQUE constraint failed") || message.contains("UNIQUE") -} - -fn is_active_recovery_conflict(err: &sea_orm::DbErr) -> bool { - let message = err.to_string(); - is_unique_violation(err) || message.contains("recovery_job_active_conflict") -} - impl RecoveryMergeService { pub async fn start( state: &Arc, @@ -805,21 +796,20 @@ impl RecoveryMergeService { C: ConnectionTrait, { let tables = [ - ("alert_rules", "server_ids_json", true), - ("ping_tasks", "server_ids_json", false), - ("tasks", "server_ids_json", false), - ("service_monitor", "server_ids_json", true), - ("maintenance", "server_ids_json", true), - ("incident", "server_ids_json", true), - ("status_page", "server_ids_json", false), + ("alert_rules", "server_ids_json"), + ("ping_tasks", "server_ids_json"), + ("tasks", "server_ids_json"), + ("service_monitor", "server_ids_json"), + ("maintenance", "server_ids_json"), + ("incident", "server_ids_json"), + ("status_page", "server_ids_json"), ]; - for (table, column, nullable) in tables { + for (table, column) in tables { Self::rewrite_server_ids_json_table_on_connection( db, table, column, - nullable, target_server_id, source_server_id, ) @@ -834,7 +824,6 @@ impl RecoveryMergeService { db: &DatabaseConnection, table: &str, column: &str, - nullable: bool, target_server_id: &str, source_server_id: &str, ) -> Result<(), AppError> { @@ -842,7 +831,6 @@ impl RecoveryMergeService { db, table, column, - nullable, target_server_id, source_server_id, ) @@ -853,7 +841,6 @@ impl RecoveryMergeService { db: &C, table: &str, column: &str, - nullable: bool, target_server_id: &str, source_server_id: &str, ) -> Result<(), AppError> @@ -883,13 +870,7 @@ impl RecoveryMergeService { continue; } - let value = if nullable { - rewritten.unwrap_or_else(|| "[]".to_string()).into() - } else { - rewritten - .unwrap_or_else(|| "[]".to_string()) - .into() - }; + let value = rewritten.unwrap_or_else(|| "[]".to_string()).into(); db.execute(Statement::from_sql_and_values( DatabaseBackend::Sqlite, diff --git a/crates/server/src/task/record_writer.rs b/crates/server/src/task/record_writer.rs index c6971d0f..ce072369 100644 --- a/crates/server/src/task/record_writer.rs +++ b/crates/server/src/task/record_writer.rs @@ -43,8 +43,10 @@ pub async fn run(state: Arc) { let mut count = 0; for (server_id, report) in &reports { + let writes_allowed = state.recovery_lock.writes_allowed_for(server_id); + // Save metrics record - if state.recovery_lock.writes_allowed_for(server_id) { + if writes_allowed { if let Err(e) = RecordService::save_report(&state.db, server_id, report).await { tracing::error!("Failed to save record for {server_id}: {e}"); } else { @@ -69,7 +71,7 @@ pub async fn run(state: Arc) { } else { // First observation: no previous state, skip delta (just record state) transfer_cache.insert(server_id.clone(), (curr_in, curr_out)); - if state.recovery_lock.writes_allowed_for(server_id) { + if writes_allowed { if let Err(e) = TrafficService::upsert_state( &state.db, server_id, @@ -91,7 +93,7 @@ pub async fn run(state: Arc) { // Only write if there's actual traffic if delta_in > 0 || delta_out > 0 { - if state.recovery_lock.writes_allowed_for(server_id) { + if 
writes_allowed { if let Err(e) = TrafficService::upsert_hourly( &state.db, server_id, @@ -109,7 +111,7 @@ pub async fn run(state: Arc) { } // Always update state - if state.recovery_lock.writes_allowed_for(server_id) { + if writes_allowed { if let Err(e) = TrafficService::upsert_state(&state.db, server_id, curr_in, curr_out).await { diff --git a/docs/superpowers/plans/2026-04-16-agent-recovery-merge.md b/docs/superpowers/plans/2026-04-16-agent-recovery-merge.md index f3642ff5..6d616c60 100644 --- a/docs/superpowers/plans/2026-04-16-agent-recovery-merge.md +++ b/docs/superpowers/plans/2026-04-16-agent-recovery-merge.md @@ -164,23 +164,34 @@ Expected: FAIL with unresolved import or missing `persist_rebind_token`. ```rust // crates/agent/src/rebind.rs pub fn persist_rebind_token(path: &std::path::Path, token: &str) -> anyhow::Result<()> { - let content = if path.exists() { - std::fs::read_to_string(path)? - } else { - String::new() - }; - - let mut lines: Vec = content.lines().map(str::to_owned).collect(); + let content = if path.exists() { std::fs::read_to_string(path)? } else { String::new() }; let token_line = format!("token = \"{token}\""); - if let Some(pos) = lines.iter().position(|line| line.starts_with("token")) { + let had_trailing_newline = content.ends_with('\n'); + let mut lines: Vec = content.lines().map(str::to_owned).collect(); + let insert_pos = lines.iter().position(|line| line.trim_start().starts_with('[')).unwrap_or(lines.len()); + if let Some(pos) = lines[..insert_pos].iter().position(|line| line.trim_start().starts_with("token")) { lines[pos] = token_line; } else { - lines.push(token_line); + lines.insert(insert_pos, token_line); + } + + let mut rendered = lines.join("\n"); + if had_trailing_newline { + rendered.push('\n'); } - let tmp = path.with_extension("tmp"); - std::fs::write(&tmp, lines.join("\n"))?; - std::fs::rename(&tmp, path)?; + let parent = path.parent().unwrap_or_else(|| std::path::Path::new(".")); + let temp_path = parent.join(format!(".agent.toml.rebind.{}.tmp", uuid::Uuid::new_v4())); + let mut temp_file = std::fs::OpenOptions::new() + .create_new(true) + .write(true) + .open(&temp_path)?; + use std::io::Write; + temp_file.write_all(rendered.as_bytes())?; + temp_file.sync_all()?; + std::fs::rename(&temp_path, path)?; + let dir_file = std::fs::File::open(parent)?; + dir_file.sync_all()?; Ok(()) } From 968774edd01a8cacba390e11a0c28a3fbe8840e7 Mon Sep 17 00:00:00 2001 From: ZingerLittleBee <6970999@gmail.com> Date: Thu, 16 Apr 2026 22:34:00 +0800 Subject: [PATCH 58/60] fix(server): remove unused traffic merge wrappers --- crates/server/src/service/traffic.rs | 34 ---------------------------- 1 file changed, 34 deletions(-) diff --git a/crates/server/src/service/traffic.rs b/crates/server/src/service/traffic.rs index fd838ced..f0ccf0de 100644 --- a/crates/server/src/service/traffic.rs +++ b/crates/server/src/service/traffic.rs @@ -64,40 +64,6 @@ impl TrafficService { Ok(()) } - pub(crate) async fn replace_unique_key_table_server_id( - db: &DatabaseConnection, - table: &str, - key_columns: &[&str], - target_server_id: &str, - source_server_id: &str, - ) -> Result<(), AppError> { - Self::replace_unique_key_table_server_id_on_connection( - db, - table, - key_columns, - target_server_id, - source_server_id, - ) - .await - } - - pub(crate) async fn replace_unique_key_table_server_id_on_txn( - txn: &DatabaseTransaction, - table: &str, - key_columns: &[&str], - target_server_id: &str, - source_server_id: &str, - ) -> Result<(), AppError> { - 
Self::replace_unique_key_table_server_id_on_connection( - txn, - table, - key_columns, - target_server_id, - source_server_id, - ) - .await - } - pub(crate) async fn replace_unique_key_table_server_id_on_connection( db: &C, table: &str, From c2716681149374ed871e35d11064fed0e8a05f82 Mon Sep 17 00:00:00 2001 From: ZingerLittleBee <6970999@gmail.com> Date: Thu, 16 Apr 2026 22:41:12 +0800 Subject: [PATCH 59/60] fix(agent): satisfy directory entry sort lint --- crates/agent/src/file_manager.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crates/agent/src/file_manager.rs b/crates/agent/src/file_manager.rs index 22fa8a35..a8edb987 100644 --- a/crates/agent/src/file_manager.rs +++ b/crates/agent/src/file_manager.rs @@ -186,7 +186,7 @@ impl FileManager { if entries.is_empty() { anyhow::bail!("Path '{}' is outside allowed root paths", path); } - entries.sort_by(|a, b| a.name.to_lowercase().cmp(&b.name.to_lowercase())); + entries.sort_by_key(|a| a.name.to_lowercase()); Ok(entries) } } From 8bd2bbf6c0098dd8fa97713169daaff490212346 Mon Sep 17 00:00:00 2001 From: ZingerLittleBee <6970999@gmail.com> Date: Thu, 16 Apr 2026 22:52:57 +0800 Subject: [PATCH 60/60] fix(server): use capability default constant for new servers --- crates/server/src/router/api/agent.rs | 3 ++- crates/server/src/router/api/server.rs | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/crates/server/src/router/api/agent.rs b/crates/server/src/router/api/agent.rs index 1682a241..7c12c6ea 100644 --- a/crates/server/src/router/api/agent.rs +++ b/crates/server/src/router/api/agent.rs @@ -21,6 +21,7 @@ use crate::service::config::ConfigService; use crate::service::network_probe::NetworkProbeService; use crate::service::upgrade_release::LatestAgentVersionResponse; use crate::state::AppState; +use serverbee_common::constants::CAP_DEFAULT; const CONFIG_KEY_AUTO_DISCOVERY: &str = "auto_discovery_key"; const DEFAULT_SERVER_NAME: &str = "New Server"; @@ -208,7 +209,7 @@ async fn register( traffic_limit: Set(None), traffic_limit_type: Set(None), billing_start_day: Set(None), - capabilities: Set(56), + capabilities: Set(CAP_DEFAULT as i32), protocol_version: Set(1), features: Set("[]".to_string()), last_remote_addr: Set(Some(ip.clone())), diff --git a/crates/server/src/router/api/server.rs b/crates/server/src/router/api/server.rs index c673d48f..54b189ac 100644 --- a/crates/server/src/router/api/server.rs +++ b/crates/server/src/router/api/server.rs @@ -1080,6 +1080,7 @@ mod cleanup_tests { use crate::entity::server; use chrono::Utc; use std::collections::HashSet; + use serverbee_common::constants::CAP_DEFAULT; fn make_server(id: &str, name: &str, os: Option<&str>) -> server::Model { let now = Utc::now(); @@ -1114,7 +1115,7 @@ mod cleanup_tests { traffic_limit: None, traffic_limit_type: None, billing_start_day: None, - capabilities: 56, + capabilities: CAP_DEFAULT as i32, protocol_version: 1, features: "[]".to_string(), last_remote_addr: None,