diff --git a/Cargo.lock b/Cargo.lock index 6eee2e3c..b463fc46 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4015,7 +4015,7 @@ dependencies = [ [[package]] name = "serverbee-agent" -version = "0.8.6" +version = "0.8.8" dependencies = [ "anyhow", "base64 0.22.1", @@ -4044,11 +4044,12 @@ dependencies = [ "tracing-subscriber", "url", "uuid", + "windows-sys 0.59.0", ] [[package]] name = "serverbee-common" -version = "0.8.6" +version = "0.8.8" dependencies = [ "chrono", "serde", @@ -4059,7 +4060,7 @@ dependencies = [ [[package]] name = "serverbee-server" -version = "0.8.6" +version = "0.8.8" dependencies = [ "a2", "anyhow", diff --git a/apps/docs/content/docs/cn/api-reference.mdx b/apps/docs/content/docs/cn/api-reference.mdx index 0232fef3..e1582740 100644 --- a/apps/docs/content/docs/cn/api-reference.mdx +++ b/apps/docs/content/docs/cn/api-reference.mdx @@ -80,6 +80,9 @@ API Key 格式为 `serverbee_` 前缀 + 43 字符随机字符串,创建时仅 | POST/PUT/DELETE | `/api/servers/*` | 服务器管理 | | PUT | `/api/servers/batch-capabilities` | 批量更新功能开关 | | POST | `/api/servers/{id}/upgrade` | 触发 Agent 升级 | +| GET | `/api/servers/{target_id}/recovery-candidates` | 列出推荐的恢复候选项 | +| GET | `/api/servers/recovery-jobs/{job_id}` | 获取恢复任务详情 | +| POST | `/api/servers/{target_id}/recover-merge` | 启动 Agent 恢复任务 | | CRUD | `/api/server-groups/*` | 服务器分组管理 | | CRUD | `/api/notifications/*` | 通知渠道管理 | | CRUD | `/api/notification-groups/*` | 通知组管理 | diff --git a/apps/docs/content/docs/cn/server.mdx b/apps/docs/content/docs/cn/server.mdx index 1831ad3e..58f54f79 100644 --- a/apps/docs/content/docs/cn/server.mdx +++ b/apps/docs/content/docs/cn/server.mdx @@ -218,6 +218,19 @@ Auto Discovery Key 用于 Agent 首次自动注册,优先级如下: 为减少重复的占位服务器,Agent 在可读取稳定机器标识时会携带指纹,相同机器重复注册会复用原有服务器记录。你还可以通过 `auth.max_servers` 软限制自动注册创建的新服务器数量;如果历史失败注册留下了离线占位条目,可在 `/servers` 页面使用 **Clean up unconnected** 清理。 +## 恢复重装后的 Agent + +如果一台已有服务器重装系统后重新注册成了一条新的临时在线节点,可以在原来的离线服务器详情页里发起恢复: + +1. 打开原来的离线服务器 +2. 点击 **恢复 Agent** +3. 选择推荐的在线替代节点 +4. 启动恢复任务 + +恢复会保留原始服务器记录,并要求替代 Agent 重新绑定到原来的服务器身份,后续恢复流程会继续在这个基础上执行。 + +这个恢复流程用于“同一台逻辑机器重装后重新接回”。它不适用于任意两条服务器记录的合并,也不适用于迁移到完全不同的硬件主机。 + ## OAuth 设置 ServerBee 支持通过 OAuth 第三方登录,目前支持以下提供商: diff --git a/apps/docs/content/docs/en/api-reference.mdx b/apps/docs/content/docs/en/api-reference.mdx index 3fd231f3..c366b81f 100644 --- a/apps/docs/content/docs/en/api-reference.mdx +++ b/apps/docs/content/docs/en/api-reference.mdx @@ -80,6 +80,9 @@ API keys use the format `serverbee_` prefix + 43-character random string. 
The ke | POST/PUT/DELETE | `/api/servers/*` | Server management | | PUT | `/api/servers/batch-capabilities` | Batch update capabilities | | POST | `/api/servers/{id}/upgrade` | Trigger agent upgrade | +| GET | `/api/servers/{target_id}/recovery-candidates` | List recommended recovery candidates | +| GET | `/api/servers/recovery-jobs/{job_id}` | Get recovery job details | +| POST | `/api/servers/{target_id}/recover-merge` | Start an agent recovery job | | CRUD | `/api/server-groups/*` | Server group management | | CRUD | `/api/notifications/*` | Notification channel management | | CRUD | `/api/notification-groups/*` | Notification group management | diff --git a/apps/docs/content/docs/en/server.mdx b/apps/docs/content/docs/en/server.mdx index 83396602..08c318e9 100644 --- a/apps/docs/content/docs/en/server.mdx +++ b/apps/docs/content/docs/en/server.mdx @@ -178,6 +178,19 @@ The full key is printed to the logs on first startup and can also be viewed or r To reduce duplicate placeholder servers, agents reuse the existing server row when they re-register with the same machine fingerprint. You can also set `auth.max_servers` to soft-cap new auto-registered servers, and use **Clean up unconnected** on the Servers page to remove offline placeholders that never finished initialization. +## Recovering a Reinstalled Agent + +If an existing server was reinstalled and then re-registered as a new temporary online node, you can recover it from the original offline server detail page: + +1. Open the original offline server. +2. Click **Recover Agent**. +3. Choose the recommended online replacement candidate. +4. Start the recovery job. + +The original server record is kept. The replacement agent is asked to rebind onto the original server identity, and the recovery flow continues from there. + +Recovery is designed for reinstalling the same logical machine. It is not intended for arbitrary record merges or hardware migrations to a different host. 
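+
+The same flow can be scripted against the HTTP API endpoints listed in the API reference. Below is a minimal TypeScript sketch, assuming an admin API key and the `{ "data": ... }` response envelope the web client expects; the base URL and polling interval are illustrative, not part of the product:
+
+```ts
+const base = 'https://serverbee.example.com' // illustrative host
+const headers = {
+  Authorization: `Bearer ${process.env.SERVERBEE_API_KEY}`,
+  'Content-Type': 'application/json'
+}
+
+async function recoverServer(targetId: string) {
+  // 1. List the recommended online candidates for the offline server.
+  const candidatesRes = await fetch(`${base}/api/servers/${targetId}/recovery-candidates`, { headers })
+  const candidates = (await candidatesRes.json()).data
+  if (candidates.length === 0) throw new Error('no online recovery candidates')
+
+  // 2. Start the recovery job with the first recommended candidate.
+  const startRes = await fetch(`${base}/api/servers/${targetId}/recover-merge`, {
+    method: 'POST',
+    headers,
+    body: JSON.stringify({ source_server_id: candidates[0].server_id })
+  })
+  let job = (await startRes.json()).data
+
+  // 3. Poll the job until it leaves the `running` status.
+  while (job.status === 'running') {
+    await new Promise((resolve) => setTimeout(resolve, 5_000))
+    const jobRes = await fetch(`${base}/api/servers/recovery-jobs/${job.job_id}`, { headers })
+    job = (await jobRes.json()).data
+  }
+  return job
+}
+```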
+ ## GeoIP Setup To enable geographic location detection for your agents: diff --git a/apps/web/openapi.json b/apps/web/openapi.json index 06a41fe6..692bbe49 100644 --- a/apps/web/openapi.json +++ b/apps/web/openapi.json @@ -9,6 +9,35 @@ "version": "0.2.1" }, "paths": { + "/api/agent/latest-version": { + "get": { + "tags": ["agent"], + "operationId": "latest_version", + "responses": { + "200": { + "description": "Latest agent release metadata", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/LatestAgentVersionResponse" + } + } + } + } + }, + "security": [ + { + "session_cookie": [] + }, + { + "api_key": [] + }, + { + "bearer_token": [] + } + ] + } + }, "/api/agent/register": { "post": { "tags": ["agent"], @@ -3518,6 +3547,76 @@ ] } }, + "/api/servers/recovery-jobs/{job_id}": { + "get": { + "tags": ["server-recovery"], + "operationId": "get_recovery_job", + "parameters": [ + { + "name": "job_id", + "in": "path", + "description": "Recovery job id", + "required": true, + "schema": { + "type": "string" + } + } + ], + "responses": { + "200": { + "description": "Recovery job details", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/RecoveryJobResponse" + } + } + } + }, + "401": { + "description": "Authentication required", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/ErrorBody" + } + } + } + }, + "403": { + "description": "Admin required", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/ErrorBody" + } + } + } + }, + "404": { + "description": "Recovery job not found", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/ErrorBody" + } + } + } + } + }, + "security": [ + { + "session_cookie": [] + }, + { + "api_key": [] + }, + { + "bearer_token": [] + } + ] + } + }, "/api/servers/{id}": { "get": { "tags": ["servers"], @@ -4021,6 +4120,189 @@ ] } }, + "/api/servers/{target_id}/recover-merge": { + "post": { + "tags": ["server-recovery"], + "operationId": "start_recovery_merge", + "parameters": [ + { + "name": "target_id", + "in": "path", + "description": "Original offline server id", + "required": true, + "schema": { + "type": "string" + } + } + ], + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/StartRecoveryRequest" + } + } + }, + "required": true + }, + "responses": { + "200": { + "description": "Recovery job created", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/RecoveryJobResponse" + } + } + } + }, + "401": { + "description": "Authentication required", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/ErrorBody" + } + } + } + }, + "403": { + "description": "Admin required", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/ErrorBody" + } + } + } + }, + "404": { + "description": "Server not found", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/ErrorBody" + } + } + } + }, + "409": { + "description": "Recovery cannot be started in the current state", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/ErrorBody" + } + } + } + }, + "422": { + "description": "Invalid request", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/ErrorBody" + } + } + } + } + }, + "security": [ + { + "session_cookie": [] + }, + { + "api_key": [] + }, + { + "bearer_token": [] 
+ } + ] + } + }, + "/api/servers/{target_id}/recovery-candidates": { + "get": { + "tags": ["server-recovery"], + "operationId": "list_candidates", + "parameters": [ + { + "name": "target_id", + "in": "path", + "description": "Original offline server id", + "required": true, + "schema": { + "type": "string" + } + } + ], + "responses": { + "200": { + "description": "Recommended recovery candidates", + "content": { + "application/json": { + "schema": { + "type": "array", + "items": { + "$ref": "#/components/schemas/RecoveryCandidateResponse" + } + } + } + } + }, + "401": { + "description": "Authentication required", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/ErrorBody" + } + } + } + }, + "403": { + "description": "Admin required", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/ErrorBody" + } + } + } + }, + "404": { + "description": "Target server not found", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/ErrorBody" + } + } + } + }, + "409": { + "description": "Target must be offline and not already in a running recovery job", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/ErrorBody" + } + } + } + } + }, + "security": [ + { + "session_cookie": [] + }, + { + "api_key": [] + }, + { + "bearer_token": [] + } + ] + } + }, "/api/service-monitors": { "get": { "tags": ["service-monitors"], @@ -6764,6 +7046,21 @@ } ] }, + "LatestAgentVersionResponse": { + "type": "object", + "properties": { + "error": { + "type": ["string", "null"] + }, + "released_at": { + "type": ["string", "null"], + "format": "date-time" + }, + "version": { + "type": ["string", "null"] + } + } + }, "ListFilesRequest": { "type": "object", "required": ["path"], @@ -7356,6 +7653,95 @@ } } }, + "RecoveryCandidateResponse": { + "type": "object", + "required": ["server_id", "name", "score", "reasons"], + "properties": { + "name": { + "type": "string" + }, + "reasons": { + "type": "array", + "items": { + "type": "string" + } + }, + "score": { + "type": "integer", + "format": "int32" + }, + "server_id": { + "type": "string" + } + } + }, + "RecoveryJobResponse": { + "type": "object", + "required": [ + "job_id", + "target_server_id", + "source_server_id", + "status", + "stage", + "started_at", + "created_at", + "updated_at" + ], + "properties": { + "created_at": { + "type": "string", + "format": "date-time" + }, + "error": { + "type": ["string", "null"] + }, + "job_id": { + "type": "string" + }, + "last_heartbeat_at": { + "type": ["string", "null"], + "format": "date-time" + }, + "source_server_id": { + "type": "string" + }, + "stage": { + "$ref": "#/components/schemas/RecoveryJobStage" + }, + "started_at": { + "type": "string", + "format": "date-time" + }, + "status": { + "$ref": "#/components/schemas/RecoveryJobStatus" + }, + "target_server_id": { + "type": "string" + }, + "updated_at": { + "type": "string", + "format": "date-time" + } + } + }, + "RecoveryJobStage": { + "type": "string", + "enum": [ + "validating", + "rebinding", + "awaiting_target_online", + "freezing_writes", + "merging_history", + "finalizing", + "succeeded", + "failed", + "unknown" + ] + }, + "RecoveryJobStatus": { + "type": "string", + "enum": ["running", "failed", "succeeded", "unknown"] + }, "RegisterRequest": { "type": "object", "properties": { @@ -7929,6 +8315,15 @@ } } }, + "StartRecoveryRequest": { + "type": "object", + "required": ["source_server_id"], + "properties": { + "source_server_id": { + "type": 
"string" + } + } + }, "StatRequest": { "type": "object", "required": ["path"], diff --git a/apps/web/src/components/server/recovery-merge-dialog.test.tsx b/apps/web/src/components/server/recovery-merge-dialog.test.tsx new file mode 100644 index 00000000..189c43a3 --- /dev/null +++ b/apps/web/src/components/server/recovery-merge-dialog.test.tsx @@ -0,0 +1,101 @@ +import { QueryClient, QueryClientProvider } from '@tanstack/react-query' +import { fireEvent, render, screen } from '@testing-library/react' +import type { ReactNode } from 'react' +import { describe, expect, it, vi } from 'vitest' +import { RecoveryMergeDialog } from './recovery-merge-dialog' + +const mockUseRecoveryCandidates = vi.fn() +const mockStartRecoveryMerge = vi.fn() + +vi.mock('react-i18next', () => ({ + useTranslation: () => ({ + t: (key: string, options?: { defaultValue?: string }) => options?.defaultValue ?? key + }) +})) + +vi.mock('sonner', () => ({ + toast: { + error: vi.fn(), + success: vi.fn() + } +})) + +vi.mock('@/hooks/use-api', () => ({ + useRecoveryCandidates: (...args: unknown[]) => mockUseRecoveryCandidates(...args), + startRecoveryMerge: (...args: unknown[]) => mockStartRecoveryMerge(...args) +})) + +function Wrapper({ children }: { children: ReactNode }) { + const queryClient = new QueryClient({ + defaultOptions: { + queries: { retry: false }, + mutations: { retry: false } + } + }) + + return {children} +} + +describe('RecoveryMergeDialog', () => { + it('renders candidate list', () => { + mockUseRecoveryCandidates.mockReturnValue({ + data: [{ server_id: 'source-1', name: 'Source', score: 42, reasons: ['same remote address'] }], + isError: false, + isLoading: false + }) + + render(, { wrapper: Wrapper }) + + expect(screen.getByText('Source')).toBeInTheDocument() + expect(screen.getByText('same remote address')).toBeInTheDocument() + }) + + it('disables submit until a candidate is selected', () => { + mockUseRecoveryCandidates.mockReturnValue({ + data: [{ server_id: 'source-1', name: 'Source', score: 42, reasons: ['same remote address'] }], + isError: false, + isLoading: false + }) + + render(, { wrapper: Wrapper }) + + const button = screen.getByText('Start Recovery').closest('button') + expect(button).toBeDisabled() + + fireEvent.click(screen.getByText('Source')) + expect(button).toBeEnabled() + }) + + it('becomes read-only when a current job exists', () => { + mockUseRecoveryCandidates.mockReturnValue({ + data: [{ server_id: 'source-1', name: 'Source', score: 42, reasons: ['same remote address'] }], + isError: false, + isLoading: false + }) + + render( + , + { wrapper: Wrapper } + ) + + expect(screen.getByText('This dialog is read-only while a recovery job is active.')).toBeInTheDocument() + expect(screen.queryByText('Source')).toBeNull() + expect(screen.getByText('Start Recovery').closest('button')).toBeDisabled() + }) +}) diff --git a/apps/web/src/components/server/recovery-merge-dialog.tsx b/apps/web/src/components/server/recovery-merge-dialog.tsx new file mode 100644 index 00000000..df33483d --- /dev/null +++ b/apps/web/src/components/server/recovery-merge-dialog.tsx @@ -0,0 +1,161 @@ +import { useMutation, useQueryClient } from '@tanstack/react-query' +import { Loader2, RotateCcw } from 'lucide-react' +import { useState } from 'react' +import { useTranslation } from 'react-i18next' +import { toast } from 'sonner' +import { Badge } from '@/components/ui/badge' +import { Button } from '@/components/ui/button' +import { Dialog, DialogContent, DialogDescription, DialogHeader, DialogTitle } from 
'@/components/ui/dialog'
+import { ScrollArea } from '@/components/ui/scroll-area'
+import { startRecoveryMerge, useRecoveryCandidates } from '@/hooks/use-api'
+import type { RecoveryJobResponse } from '@/lib/api-schema'
+
+interface RecoveryMergeDialogProps {
+  currentJob?: RecoveryJobResponse
+  onOpenChange: (open: boolean) => void
+  open: boolean
+  targetServerId: string
+}
+
+export function RecoveryMergeDialog({ currentJob, onOpenChange, open, targetServerId }: RecoveryMergeDialogProps) {
+  const { t } = useTranslation('servers')
+  const queryClient = useQueryClient()
+  const [selectedSourceId, setSelectedSourceId] = useState('')
+  const readOnly = currentJob != null
+
+  const candidatesQuery = useRecoveryCandidates(targetServerId, open && !readOnly)
+
+  const startMutation = useMutation({
+    mutationFn: (sourceServerId: string) => startRecoveryMerge(targetServerId, { source_server_id: sourceServerId }),
+    onSuccess: () => {
+      queryClient.invalidateQueries({ queryKey: ['servers', targetServerId, 'recovery-candidates'] })
+      toast.success(t('recovery_merge_started', { defaultValue: 'Recovery started' }))
+      onOpenChange(false)
+    },
+    onError: (error) => {
+      toast.error(
+        error instanceof Error ? error.message : t('recovery_merge_failed', { defaultValue: 'Recovery failed' })
+      )
+    }
+  })
+
+  const candidates = candidatesQuery.data ?? []
+  const selectedCandidate = candidates.find((candidate) => candidate.server_id === selectedSourceId)
+  const canSubmit = !readOnly && selectedCandidate != null && !startMutation.isPending
+
+  return (
+    <Dialog open={open} onOpenChange={onOpenChange}>
+      <DialogContent>
+        <DialogHeader>
+          <DialogTitle>{t('recovery_merge_title', { defaultValue: 'Recover Offline Server' })}</DialogTitle>
+          <DialogDescription>
+            {t('recovery_merge_description', {
+              defaultValue: 'Pick the online replacement agent to rebind and merge back into this offline server.'
+            })}
+          </DialogDescription>
+        </DialogHeader>
+
+        {currentJob && (
+          <div>
+            <Badge variant="outline">{currentJob.stage}</Badge>
+            <span>{t('recovery_merge_existing_job', { defaultValue: 'A recovery job is already in progress.' })}</span>
+          </div>
+        )}
+
+        {candidatesQuery.isLoading && (
+          <div>
+            <Loader2 className="animate-spin" />
+            {t('recovery_merge_loading', { defaultValue: 'Loading recovery candidates…' })}
+          </div>
+        )}
+
+        {candidatesQuery.isError && (
+          <div>{t('recovery_merge_candidates_failed', { defaultValue: 'Failed to load recovery candidates.' })}</div>
+        )}
+
+        {!(candidatesQuery.isLoading || candidatesQuery.isError) && candidates.length === 0 && (
+          <div>{t('recovery_merge_empty', { defaultValue: 'No online recovery candidates are available right now.' })}</div>
+        )}
+
+        {readOnly ? (
+          <div>
+            {t('recovery_merge_read_only', {
+              defaultValue: 'This dialog is read-only while a recovery job is active.'
+            })}
+          </div>
+        ) : (
+          candidates.length > 0 && (
+            <ScrollArea>
+              <div>
+                {candidates.map((candidate) => {
+                  const selected = candidate.server_id === selectedSourceId
+                  return (
+                    <button
+                      key={candidate.server_id}
+                      type="button"
+                      aria-pressed={selected}
+                      onClick={() => setSelectedSourceId(candidate.server_id)}
+                    >
+                      <span>{candidate.name}</span>
+                      <Badge variant="secondary">{candidate.score}</Badge>
+                      {candidate.reasons.map((reason) => (
+                        <span key={reason}>{reason}</span>
+                      ))}
+                    </button>
+                  )
+                })}
+              </div>
+            </ScrollArea>
+          )
+        )}
+
+        <div>
+          {t('recovery_merge_warning', {
+            defaultValue:
+              'This keeps the original server record, asks the replacement agent to rebind, and continues the recovery flow from there.'
+          })}
+        </div>
+
+        <div>
+          <Button disabled={!canSubmit} onClick={() => startMutation.mutate(selectedSourceId)}>
+            {startMutation.isPending ? <Loader2 className="animate-spin" /> : <RotateCcw />}
+            {startMutation.isPending
+              ? t('recovery_merge_starting', { defaultValue: 'Starting…' })
+              : t('recovery_merge_start', { defaultValue: 'Start Recovery' })}
+          </Button>
+        </div>
+      </DialogContent>
+    </Dialog>
+ ) +} diff --git a/apps/web/src/hooks/use-api.test.tsx b/apps/web/src/hooks/use-api.test.tsx index ca755bc7..594ead83 100644 --- a/apps/web/src/hooks/use-api.test.tsx +++ b/apps/web/src/hooks/use-api.test.tsx @@ -2,7 +2,7 @@ import { QueryClient, QueryClientProvider } from '@tanstack/react-query' import { renderHook, waitFor } from '@testing-library/react' import type { ReactNode } from 'react' import { afterEach, beforeEach, describe, expect, it, vi } from 'vitest' -import { useServer, useServerRecords } from './use-api' +import { startRecoveryMerge, useRecoveryCandidates, useRecoveryJob, useServer, useServerRecords } from './use-api' function createWrapper() { const queryClient = new QueryClient({ @@ -181,3 +181,112 @@ describe('useServerRecords', () => { expect(globalThis.fetch).not.toHaveBeenCalled() }) }) + +describe('recovery hooks', () => { + it('fetches recovery candidates for a target server', async () => { + vi.mocked(globalThis.fetch).mockResolvedValueOnce( + new Response( + JSON.stringify({ + data: [{ server_id: 'source-1', name: 'Source', score: 42, reasons: ['same remote address'] }] + }), + { + status: 200, + headers: { 'Content-Type': 'application/json' } + } + ) + ) + + const { result } = renderHook(() => useRecoveryCandidates('target-1'), { + wrapper: createWrapper() + }) + + await waitFor(() => { + expect(result.current.isSuccess).toBe(true) + }) + + expect(result.current.data?.[0].server_id).toBe('source-1') + expect(result.current.data?.[0].reasons).toEqual(['same remote address']) + }) + + it('does not fetch recovery candidates when disabled', async () => { + const { result } = renderHook(() => useRecoveryCandidates('target-1', false), { + wrapper: createWrapper() + }) + + await waitFor(() => { + expect(result.current.fetchStatus).toBe('idle') + }) + + expect(globalThis.fetch).not.toHaveBeenCalled() + }) + + it('fetches a recovery job by id', async () => { + vi.mocked(globalThis.fetch).mockResolvedValueOnce( + new Response( + JSON.stringify({ + data: { + job_id: 'job-1', + target_server_id: 'target-1', + source_server_id: 'source-1', + status: 'running', + stage: 'rebinding', + error: null, + started_at: '2026-04-16T00:00:00Z', + created_at: '2026-04-16T00:00:00Z', + updated_at: '2026-04-16T00:00:00Z', + last_heartbeat_at: null + } + }), + { + status: 200, + headers: { 'Content-Type': 'application/json' } + } + ) + ) + + const { result } = renderHook(() => useRecoveryJob('job-1'), { + wrapper: createWrapper() + }) + + await waitFor(() => { + expect(result.current.isSuccess).toBe(true) + }) + + expect(result.current.data?.stage).toBe('rebinding') + }) + + it('starts a recovery merge', async () => { + vi.mocked(globalThis.fetch).mockResolvedValueOnce( + new Response( + JSON.stringify({ + data: { + job_id: 'job-1', + target_server_id: 'target-1', + source_server_id: 'source-1', + status: 'running', + stage: 'rebinding', + error: null, + started_at: '2026-04-16T00:00:00Z', + created_at: '2026-04-16T00:00:00Z', + updated_at: '2026-04-16T00:00:00Z', + last_heartbeat_at: null + } + }), + { + status: 200, + headers: { 'Content-Type': 'application/json' } + } + ) + ) + + const job = await startRecoveryMerge('target-1', { source_server_id: 'source-1' }) + + expect(job.job_id).toBe('job-1') + expect(globalThis.fetch).toHaveBeenCalledWith( + '/api/servers/target-1/recover-merge', + expect.objectContaining({ + method: 'POST' + }) + ) + }) +}) diff --git a/apps/web/src/hooks/use-api.ts b/apps/web/src/hooks/use-api.ts index 933487b2..b148596b 100644 --- 
a/apps/web/src/hooks/use-api.ts +++ b/apps/web/src/hooks/use-api.ts @@ -1,6 +1,12 @@ import { useQuery } from '@tanstack/react-query' import { api } from '@/lib/api-client' -import type { ServerResponse, UptimeDailyEntry } from '@/lib/api-schema' +import type { + RecoveryCandidateResponse, + RecoveryJobResponse, + ServerResponse, + StartRecoveryRequest, + UptimeDailyEntry +} from '@/lib/api-schema' type ServerRecord = import('@/lib/api-schema').ServerMetricRecord @@ -37,4 +43,26 @@ export function useUptimeDaily(serverId: string, days = 90) { }) } +export function useRecoveryCandidates(targetId: string, enabled = true) { + return useQuery({ + queryKey: ['servers', targetId, 'recovery-candidates'], + queryFn: () => api.get(`/api/servers/${targetId}/recovery-candidates`), + enabled: enabled && targetId.length > 0, + staleTime: 30_000 + }) +} + +export function useRecoveryJob(jobId: string, enabled = true) { + return useQuery({ + queryKey: ['recovery-jobs', jobId], + queryFn: () => api.get(`/api/servers/recovery-jobs/${jobId}`), + enabled: enabled && jobId.length > 0, + staleTime: 15_000 + }) +} + +export function startRecoveryMerge(targetId: string, payload: StartRecoveryRequest) { + return api.post(`/api/servers/${targetId}/recover-merge`, payload) +} + export type { ServerMetricRecord as ServerRecord } from '@/lib/api-schema' diff --git a/apps/web/src/hooks/use-servers-ws.test.ts b/apps/web/src/hooks/use-servers-ws.test.ts index f9c2087e..8a87335c 100644 --- a/apps/web/src/hooks/use-servers-ws.test.ts +++ b/apps/web/src/hooks/use-servers-ws.test.ts @@ -1,4 +1,5 @@ import { describe, expect, it } from 'vitest' +import { useRecoveryJobsStore } from '@/stores/recovery-jobs-store' import { useUpgradeJobsStore } from '@/stores/upgrade-jobs-store' import type { ServerMetrics } from './use-servers-ws' import { handleWsMessage, mergeServerUpdate, setServerCapabilities, setServerOnlineStatus } from './use-servers-ws' @@ -36,6 +37,18 @@ function makeServer(overrides: Partial = {}): ServerMetrics { } } +function makeQueryClient() { + const cache = new Map() + return { + setQueryData: (key: unknown[], value: unknown | ((prev: unknown) => unknown)) => { + const cacheKey = JSON.stringify(key) + const prev = cache.get(cacheKey) + const next = typeof value === 'function' ? (value as (prev: unknown) => unknown)(prev) : value + cache.set(cacheKey, next) + } + } +} + describe('mergeServerUpdate', () => { it('updates dynamic fields', () => { const prev = [makeServer({ cpu: 50 })] @@ -109,18 +122,6 @@ describe('setServerCapabilities', () => { }) describe('handleWsMessage upgrade messages', () => { - function makeQueryClient() { - const cache = new Map() - return { - setQueryData: (key: unknown[], value: unknown | ((prev: unknown) => unknown)) => { - const cacheKey = JSON.stringify(key) - const prev = cache.get(cacheKey) - const next = typeof value === 'function' ? 
(value as (prev: unknown) => unknown)(prev) : value - cache.set(cacheKey, next) - } - } - } - it('hydrates upgrade jobs from full_sync', () => { useUpgradeJobsStore.setState({ jobs: new Map() }) const queryClient = makeQueryClient() @@ -209,3 +210,86 @@ describe('handleWsMessage upgrade messages', () => { expect(job?.finished_at).not.toBeNull() }) }) + +describe('handleWsMessage recovery messages', () => { + it('hydrates recovery jobs from full_sync', () => { + useRecoveryJobsStore.setState({ jobs: new Map() }) + const queryClient = makeQueryClient() + + handleWsMessage( + { + type: 'full_sync', + servers: [], + recoveries: [ + { + job_id: 'job-1', + target_server_id: 'target-1', + source_server_id: 'source-1', + status: 'running', + stage: 'rebinding', + error: null, + started_at: '2026-04-16T00:00:00Z', + created_at: '2026-04-16T00:00:00Z', + updated_at: '2026-04-16T00:00:00Z', + last_heartbeat_at: null + } + ] + }, + queryClient as never + ) + + expect(useRecoveryJobsStore.getState().getJob('target-1')?.job_id).toBe('job-1') + }) + + it('updates recovery jobs only when update payload includes recoveries', () => { + useRecoveryJobsStore.setState({ jobs: new Map() }) + useRecoveryJobsStore.getState().setJob('target-1', { + job_id: 'job-1', + target_server_id: 'target-1', + source_server_id: 'source-1', + status: 'running', + stage: 'rebinding', + error: null, + started_at: '2026-04-16T00:00:00Z', + created_at: '2026-04-16T00:00:00Z', + updated_at: '2026-04-16T00:00:00Z', + last_heartbeat_at: null + }) + + const queryClient = makeQueryClient() + handleWsMessage( + { + type: 'update', + servers: [] + }, + queryClient as never + ) + + expect(useRecoveryJobsStore.getState().getJob('target-1')?.job_id).toBe('job-1') + + handleWsMessage( + { + type: 'update', + servers: [], + recoveries: [ + { + job_id: 'job-2', + target_server_id: 'target-2', + source_server_id: 'source-2', + status: 'failed', + stage: 'failed', + error: 'boom', + started_at: '2026-04-16T00:00:00Z', + created_at: '2026-04-16T00:00:00Z', + updated_at: '2026-04-16T00:00:00Z', + last_heartbeat_at: null + } + ] + }, + queryClient as never + ) + + expect(useRecoveryJobsStore.getState().getJob('target-1')).toBeUndefined() + expect(useRecoveryJobsStore.getState().getJob('target-2')?.job_id).toBe('job-2') + }) +}) diff --git a/apps/web/src/hooks/use-servers-ws.ts b/apps/web/src/hooks/use-servers-ws.ts index 65ec5382..722a3770 100644 --- a/apps/web/src/hooks/use-servers-ws.ts +++ b/apps/web/src/hooks/use-servers-ws.ts @@ -1,5 +1,6 @@ import { useQueryClient } from '@tanstack/react-query' import { useEffect, useRef } from 'react' +import type { RecoveryJobResponse } from '@/lib/api-schema' import type { NetworkProbeResultData } from '@/lib/network-types' import { WsClient } from '@/lib/ws-client' import type { @@ -7,6 +8,7 @@ import type { DockerContainerStats, DockerEventInfo } from '@/routes/_authed/servers/$serverId/docker/types' +import { useRecoveryJobsStore } from '@/stores/recovery-jobs-store' import { type UpgradeJob, useUpgradeJobsStore } from '@/stores/upgrade-jobs-store' const MAX_DOCKER_EVENTS = 100 @@ -49,8 +51,8 @@ interface ServerMetrics { } type WsMessage = - | { type: 'full_sync'; servers: ServerMetrics[]; upgrades?: UpgradeJob[] } - | { type: 'update'; servers: ServerMetrics[] } + | { type: 'full_sync'; servers: ServerMetrics[]; upgrades?: UpgradeJob[]; recoveries?: RecoveryJobResponse[] } + | { type: 'update'; servers: ServerMetrics[]; recoveries?: RecoveryJobResponse[] | null } | { type: 'server_online'; server_id: 
string } | { type: 'server_offline'; server_id: string } | { @@ -180,26 +182,49 @@ function setServerDetailDockerAvailability( } type QueryClient = ReturnType +type FullSyncMessage = Extract +type UpdateMessage = Extract function isWsMessageLike(raw: unknown): raw is { type: string } & Record { return typeof raw === 'object' && raw !== null && 'type' in raw && typeof (raw as { type: unknown }).type === 'string' } +function hydrateRecoveryJobs(raw: FullSyncMessage | UpdateMessage, replaceMissing: boolean): void { + if (Array.isArray(raw.recoveries)) { + useRecoveryJobsStore.getState().setJobs(raw.recoveries) + return + } + + if (replaceMissing) { + useRecoveryJobsStore.getState().setJobs([]) + } +} + +function handleFullSyncMessage(msg: FullSyncMessage, queryClient: QueryClient): void { + queryClient.setQueryData(['servers'], msg.servers) + if (Array.isArray(msg.upgrades)) { + useUpgradeJobsStore.getState().setJobs(msg.upgrades as UpgradeJob[]) + } + hydrateRecoveryJobs(msg, true) +} + +function handleUpdateMessage(msg: UpdateMessage, queryClient: QueryClient): void { + queryClient.setQueryData(['servers'], (prev) => + prev ? mergeServerUpdate(prev, msg.servers) : msg.servers + ) + hydrateRecoveryJobs(msg, false) +} + function handleServerMetricsMessage(raw: { type: string } & Record, queryClient: QueryClient): void { if (raw.type === 'full_sync' || raw.type === 'update') { if (!Array.isArray(raw.servers) || raw.servers.some((s: unknown) => s == null || typeof s !== 'object')) { return } - const msg = raw as WsMessage & { type: 'full_sync' | 'update' } + const msg = raw as FullSyncMessage | UpdateMessage if (raw.type === 'full_sync') { - queryClient.setQueryData(['servers'], msg.servers) - if (Array.isArray(raw.upgrades)) { - useUpgradeJobsStore.getState().setJobs(raw.upgrades as UpgradeJob[]) - } + handleFullSyncMessage(msg as FullSyncMessage, queryClient) } else { - queryClient.setQueryData(['servers'], (prev) => - prev ? 
mergeServerUpdate(prev, msg.servers) : msg.servers - ) + handleUpdateMessage(msg as UpdateMessage, queryClient) } return } diff --git a/apps/web/src/lib/api-schema.ts b/apps/web/src/lib/api-schema.ts index bee9b969..bd7476e2 100644 --- a/apps/web/src/lib/api-schema.ts +++ b/apps/web/src/lib/api-schema.ts @@ -25,6 +25,11 @@ export type ServerMetricRecord = S['ServerRecord'] export type UpdateServerInput = S['UpdateServerInput'] export type BatchDeleteRequest = S['BatchDeleteRequest'] export type BatchDeleteResponse = S['BatchDeleteResponse'] +export type RecoveryCandidateResponse = S['RecoveryCandidateResponse'] +export type RecoveryJobResponse = S['RecoveryJobResponse'] +export type RecoveryJobStage = S['RecoveryJobStage'] +export type RecoveryJobStatus = S['RecoveryJobStatus'] +export type StartRecoveryRequest = S['StartRecoveryRequest'] // Server groups export type ServerGroup = S['ServerGroup'] diff --git a/apps/web/src/lib/api-types.ts b/apps/web/src/lib/api-types.ts index a59529d1..d94b8f23 100644 --- a/apps/web/src/lib/api-types.ts +++ b/apps/web/src/lib/api-types.ts @@ -4,6 +4,22 @@ */ export interface paths { + '/api/agent/latest-version': { + parameters: { + query?: never + header?: never + path?: never + cookie?: never + } + get: operations['latest_version'] + put?: never + post?: never + delete?: never + options?: never + head?: never + patch?: never + trace?: never + } '/api/agent/register': { parameters: { query?: never @@ -1161,6 +1177,38 @@ export interface paths { patch?: never trace?: never } + '/api/servers/{target_id}/recover-merge': { + parameters: { + query?: never + header?: never + path?: never + cookie?: never + } + get?: never + put?: never + post: operations['start_recovery_merge'] + delete?: never + options?: never + head?: never + patch?: never + trace?: never + } + '/api/servers/{target_id}/recovery-candidates': { + parameters: { + query?: never + header?: never + path?: never + cookie?: never + } + get: operations['list_candidates'] + put?: never + post?: never + delete?: never + options?: never + head?: never + patch?: never + trace?: never + } '/api/servers/batch-capabilities': { parameters: { query?: never @@ -1209,6 +1257,22 @@ export interface paths { patch?: never trace?: never } + '/api/servers/recovery-jobs/{job_id}': { + parameters: { + query?: never + header?: never + path?: never + cookie?: never + } + get: operations['get_recovery_job'] + put?: never + post?: never + delete?: never + options?: never + head?: never + patch?: never + trace?: never + } '/api/service-monitors': { parameters: { query?: never @@ -2007,6 +2071,12 @@ export interface components { IncidentWithUpdates: components['schemas']['Incident'] & { updates: components['schemas']['IncidentUpdate'][] } + LatestAgentVersionResponse: { + error?: string | null + /** Format: date-time */ + released_at?: string | null + version?: string | null + } ListFilesRequest: { path: string } @@ -2187,6 +2257,42 @@ export interface components { ReadResponse: { content: string } + RecoveryCandidateResponse: { + name: string + reasons: string[] + /** Format: int32 */ + score: number + server_id: string + } + RecoveryJobResponse: { + /** Format: date-time */ + created_at: string + error?: string | null + job_id: string + /** Format: date-time */ + last_heartbeat_at?: string | null + source_server_id: string + stage: components['schemas']['RecoveryJobStage'] + /** Format: date-time */ + started_at: string + status: components['schemas']['RecoveryJobStatus'] + target_server_id: string + /** Format: 
date-time */ + updated_at: string + } + /** @enum {string} */ + RecoveryJobStage: + | 'validating' + | 'rebinding' + | 'awaiting_target_online' + | 'freezing_writes' + | 'merging_history' + | 'finalizing' + | 'succeeded' + | 'failed' + | 'unknown' + /** @enum {string} */ + RecoveryJobStatus: 'running' | 'failed' | 'succeeded' | 'unknown' RegisterRequest: { fingerprint?: string } @@ -2399,6 +2505,9 @@ export interface components { /** Format: date-time */ time: string } + StartRecoveryRequest: { + source_server_id: string + } StatRequest: { path: string } @@ -4219,6 +4328,56 @@ export interface operations { } } } + get_recovery_job: { + parameters: { + query?: never + header?: never + path: { + /** @description Recovery job id */ + job_id: string + } + cookie?: never + } + requestBody?: never + responses: { + /** @description Recovery job details */ + 200: { + headers: { + [name: string]: unknown + } + content: { + 'application/json': components['schemas']['RecoveryJobResponse'] + } + } + /** @description Authentication required */ + 401: { + headers: { + [name: string]: unknown + } + content: { + 'application/json': components['schemas']['ErrorBody'] + } + } + /** @description Admin required */ + 403: { + headers: { + [name: string]: unknown + } + content: { + 'application/json': components['schemas']['ErrorBody'] + } + } + /** @description Recovery job not found */ + 404: { + headers: { + [name: string]: unknown + } + content: { + 'application/json': components['schemas']['ErrorBody'] + } + } + } + } get_rule: { parameters: { query?: never @@ -4654,6 +4813,26 @@ export interface operations { } } } + latest_version: { + parameters: { + query?: never + header?: never + path?: never + cookie?: never + } + requestBody?: never + responses: { + /** @description Latest agent release metadata */ + 200: { + headers: { + [name: string]: unknown + } + content: { + 'application/json': components['schemas']['LatestAgentVersionResponse'] + } + } + } + } list_alert_events: { parameters: { query?: { @@ -4726,6 +4905,65 @@ export interface operations { } } } + list_candidates: { + parameters: { + query?: never + header?: never + path: { + /** @description Original offline server id */ + target_id: string + } + cookie?: never + } + requestBody?: never + responses: { + /** @description Recommended recovery candidates */ + 200: { + headers: { + [name: string]: unknown + } + content: { + 'application/json': components['schemas']['RecoveryCandidateResponse'][] + } + } + /** @description Authentication required */ + 401: { + headers: { + [name: string]: unknown + } + content: { + 'application/json': components['schemas']['ErrorBody'] + } + } + /** @description Admin required */ + 403: { + headers: { + [name: string]: unknown + } + content: { + 'application/json': components['schemas']['ErrorBody'] + } + } + /** @description Target server not found */ + 404: { + headers: { + [name: string]: unknown + } + content: { + 'application/json': components['schemas']['ErrorBody'] + } + } + /** @description Target must be offline and not already in a running recovery job */ + 409: { + headers: { + [name: string]: unknown + } + content: { + 'application/json': components['schemas']['ErrorBody'] + } + } + } + } list_dashboards: { parameters: { query?: never @@ -5925,6 +6163,78 @@ export interface operations { } } } + start_recovery_merge: { + parameters: { + query?: never + header?: never + path: { + /** @description Original offline server id */ + target_id: string + } + cookie?: never + } + requestBody: { + content: { + 
'application/json': components['schemas']['StartRecoveryRequest'] + } + } + responses: { + /** @description Recovery job created */ + 200: { + headers: { + [name: string]: unknown + } + content: { + 'application/json': components['schemas']['RecoveryJobResponse'] + } + } + /** @description Authentication required */ + 401: { + headers: { + [name: string]: unknown + } + content: { + 'application/json': components['schemas']['ErrorBody'] + } + } + /** @description Admin required */ + 403: { + headers: { + [name: string]: unknown + } + content: { + 'application/json': components['schemas']['ErrorBody'] + } + } + /** @description Server not found */ + 404: { + headers: { + [name: string]: unknown + } + content: { + 'application/json': components['schemas']['ErrorBody'] + } + } + /** @description Recovery cannot be started in the current state */ + 409: { + headers: { + [name: string]: unknown + } + content: { + 'application/json': components['schemas']['ErrorBody'] + } + } + /** @description Invalid request */ + 422: { + headers: { + [name: string]: unknown + } + content: { + 'application/json': components['schemas']['ErrorBody'] + } + } + } + } stat_file: { parameters: { query?: never diff --git a/apps/web/src/locales/en/servers.json b/apps/web/src/locales/en/servers.json index 9dfdea9a..9fa830f2 100644 --- a/apps/web/src/locales/en/servers.json +++ b/apps/web/src/locales/en/servers.json @@ -197,5 +197,28 @@ "upgrade_status_failed": "Upgrade failed", "upgrade_status_timeout": "Upgrade timed out", "upgrade_error_with_backup": "Backup saved at", - "upgrade_backup_path": "Backup location" + "upgrade_backup_path": "Backup location", + + "recovery_merge_open": "Recover Agent", + "recovery_merge_resume": "View Recovery", + "recovery_merge_title": "Recover Offline Server", + "recovery_merge_description": "Pick the online replacement agent to rebind and merge back into this offline server.", + "recovery_merge_loading": "Loading recovery candidates…", + "recovery_merge_empty": "No online recovery candidates are available right now.", + "recovery_merge_existing_job": "A recovery job is already in progress.", + "recovery_merge_warning": "This keeps the original server record, asks the replacement agent to rebind, and continues the recovery flow from there.", + "recovery_merge_start": "Start Recovery", + "recovery_merge_starting": "Starting…", + "recovery_merge_started": "Recovery started", + "recovery_merge_failed": "Recovery failed", + "recovery_merge_candidates_failed": "Failed to load recovery candidates.", + "recovery_merge_read_only": "This dialog is read-only while a recovery job is active.", + "recovery_stage_validating": "Validating", + "recovery_stage_rebinding": "Rebinding", + "recovery_stage_awaiting_target_online": "Waiting for Target", + "recovery_stage_freezing_writes": "Freezing Writes", + "recovery_stage_merging_history": "Merging History", + "recovery_stage_finalizing": "Finalizing", + "recovery_stage_failed": "Failed", + "recovery_stage_succeeded": "Succeeded" } diff --git a/apps/web/src/locales/zh/servers.json b/apps/web/src/locales/zh/servers.json index 023be7c1..85cb7b4b 100644 --- a/apps/web/src/locales/zh/servers.json +++ b/apps/web/src/locales/zh/servers.json @@ -197,5 +197,28 @@ "upgrade_status_failed": "升级失败", "upgrade_status_timeout": "升级超时", "upgrade_error_with_backup": "备份保存在", - "upgrade_backup_path": "备份位置" + "upgrade_backup_path": "备份位置", + + "recovery_merge_open": "恢复 Agent", + "recovery_merge_resume": "查看恢复任务", + "recovery_merge_title": "恢复离线服务器", + 
"recovery_merge_description": "选择在线的替代 Agent,将其重新绑定并合并回当前离线服务器。", + "recovery_merge_loading": "正在加载恢复候选项…", + "recovery_merge_empty": "当前没有可用的在线恢复候选项。", + "recovery_merge_existing_job": "当前已有一个恢复任务正在进行。", + "recovery_merge_warning": "此操作会保留原服务器记录,要求替代 Agent 重新绑定,并继续后续恢复流程。", + "recovery_merge_start": "开始恢复", + "recovery_merge_starting": "启动中…", + "recovery_merge_started": "恢复任务已启动", + "recovery_merge_failed": "恢复失败", + "recovery_merge_candidates_failed": "加载恢复候选项失败。", + "recovery_merge_read_only": "当前存在恢复任务时,此对话框为只读状态。", + "recovery_stage_validating": "校验中", + "recovery_stage_rebinding": "重新绑定中", + "recovery_stage_awaiting_target_online": "等待目标上线", + "recovery_stage_freezing_writes": "冻结写入", + "recovery_stage_merging_history": "合并历史", + "recovery_stage_finalizing": "收尾中", + "recovery_stage_failed": "失败", + "recovery_stage_succeeded": "完成" } diff --git a/apps/web/src/routes/_authed/servers/$id.test.tsx b/apps/web/src/routes/_authed/servers/$id.test.tsx index 72cb3bd8..4daccdab 100644 --- a/apps/web/src/routes/_authed/servers/$id.test.tsx +++ b/apps/web/src/routes/_authed/servers/$id.test.tsx @@ -28,7 +28,7 @@ vi.mock('@tanstack/react-query', () => ({ vi.mock('react-i18next', () => ({ useTranslation: () => ({ - t: (key: string) => key + t: (key: string, options?: { defaultValue?: string }) => options?.defaultValue ?? key }) })) @@ -38,6 +38,10 @@ vi.mock('@/components/server/agent-version-section', () => ({ ) })) +vi.mock('@/components/server/recovery-merge-dialog', () => ({ + RecoveryMergeDialog: () =>
<div data-testid="recovery-merge-dialog" />
+}))
+
+vi.mock('@/components/server/capabilities-dialog', () => ({
+  CapabilitiesDialog: () => <div>capabilities</div>
})) @@ -107,6 +111,12 @@ vi.mock('@/hooks/use-realtime-metrics', () => ({ useRealtimeMetrics: () => [] })) +vi.mock('@/hooks/use-auth', () => ({ + useAuth: () => ({ + user: { role: 'admin' } + }) +})) + vi.mock('@/lib/api-client', () => ({ api: { get: vi.fn() @@ -135,6 +145,18 @@ vi.mock('@/lib/widget-helpers', () => ({ computeAggregateUptime: () => null })) +vi.mock('@/stores/upgrade-jobs-store', () => ({ + useUpgradeJobsStore: () => undefined +})) + +vi.mock('@/stores/recovery-jobs-store', () => ({ + useRecoveryJobsStore: (selector: (state: { hydrated: boolean; jobs: Map }) => unknown) => + selector({ + hydrated: true, + jobs: new Map() + }) +})) + const { ServerDetailPage } = await import('./$id') describe('ServerDetailPage', () => { @@ -199,4 +221,10 @@ describe('ServerDetailPage', () => { expect(headerGrid?.children[1]).toContainElement(upgradeCard) expect(headerGrid?.children[2]).toContainElement(editButton) }) + + it('shows recovery action for offline server when admin', () => { + render() + + expect(screen.getByText('Recover Agent')).toBeInTheDocument() + }) }) diff --git a/apps/web/src/routes/_authed/servers/$id.tsx b/apps/web/src/routes/_authed/servers/$id.tsx index 56918d2b..4ccbf921 100644 --- a/apps/web/src/routes/_authed/servers/$id.tsx +++ b/apps/web/src/routes/_authed/servers/$id.tsx @@ -7,25 +7,29 @@ import { AgentVersionSection } from '@/components/server/agent-version-section' import { CapabilitiesDialog } from '@/components/server/capabilities-dialog' import { DiskIoChart } from '@/components/server/disk-io-chart' import { MetricsChart } from '@/components/server/metrics-chart' +import { RecoveryMergeDialog } from '@/components/server/recovery-merge-dialog' import { ServerEditDialog } from '@/components/server/server-edit-dialog' import { StatusBadge } from '@/components/server/status-badge' import { TrafficCard } from '@/components/server/traffic-card' import { TrafficProgress } from '@/components/server/traffic-progress' import { TrafficTab } from '@/components/server/traffic-tab' import { UpgradeJobBadge } from '@/components/server/upgrade-job-badge' +import { Badge } from '@/components/ui/badge' import { Button } from '@/components/ui/button' import { Skeleton } from '@/components/ui/skeleton' import { Tabs, TabsContent, TabsList, TabsTrigger } from '@/components/ui/tabs' import { UptimeTimeline } from '@/components/uptime/uptime-timeline' import { useServer, useServerRecords, useUptimeDaily } from '@/hooks/use-api' +import { useAuth } from '@/hooks/use-auth' import { useRealtimeMetrics } from '@/hooks/use-realtime-metrics' import type { ServerMetrics } from '@/hooks/use-servers-ws' import { api } from '@/lib/api-client' -import type { ServerResponse } from '@/lib/api-schema' +import type { RecoveryJobResponse, ServerResponse } from '@/lib/api-schema' import { CAP_DOCKER, CAP_FILE, CAP_TERMINAL, getEffectiveCapabilityEnabled } from '@/lib/capabilities' import { buildMergedDiskIoSeries, buildPerDiskIoSeries } from '@/lib/disk-io' import { cn, countryCodeToFlag, formatBytes } from '@/lib/utils' import { computeAggregateUptime } from '@/lib/widget-helpers' +import { useRecoveryJobsStore } from '@/stores/recovery-jobs-store' import { useUpgradeJobsStore } from '@/stores/upgrade-jobs-store' export const Route = createFileRoute('/_authed/servers/$id')({ @@ -75,6 +79,17 @@ function formatCurrency(price: number, currency: string): string { } } +function translateRecoveryStage( + t: (key: string, options?: { defaultValue?: string }) => string, + stage: string | undefined +): 
string | null { + if (!stage) { + return null + } + + return t(`recovery_stage_${stage}`, { defaultValue: stage }) +} + function ServerInfoMeta({ server }: { server: ServerResponse }) { const { t } = useTranslation('servers') return ( @@ -122,19 +137,27 @@ function ServerInfoMeta({ server }: { server: ServerResponse }) { } function ServerActionButtons({ + currentRecoveryJob, dockerEnabled, fileEnabled, id, + isAdmin, isOnline, + recoveryHydrated, onEditOpen, + onRecoveryOpen, serverWithCaps, terminalEnabled }: { + currentRecoveryJob?: RecoveryJobResponse dockerEnabled: boolean fileEnabled: boolean id: string + isAdmin: boolean isOnline: boolean + recoveryHydrated: boolean onEditOpen: () => void + onRecoveryOpen: () => void serverWithCaps: ServerResponse & ServerWithCaps terminalEnabled: boolean }) { @@ -146,6 +169,13 @@ function ServerActionButtons({ {t('detail_edit')} + {isAdmin && !isOnline && ( + + )} {isOnline && terminalEnabled && (
@@ -580,11 +619,15 @@ export function ServerDetailPage() {
<ServerActionButtons
+            currentRecoveryJob={currentRecoveryJob}
             dockerEnabled={dockerEnabled}
             fileEnabled={fileEnabled}
             id={id}
+            isAdmin={isAdmin}
             isOnline={isOnline}
             onEditOpen={() => setEditOpen(true)}
+            onRecoveryOpen={() => setRecoveryOpen(true)}
+            recoveryHydrated={recoveryHydrated}
             serverWithCaps={serverWithCaps}
             terminalEnabled={terminalEnabled}
           />
 @@ -647,6 +690,12 @@ export function ServerDetailPage() {
       <ServerEditDialog onOpenChange={() => setEditOpen(false)} open={editOpen} server={server} />
+      <RecoveryMergeDialog
+        currentJob={currentRecoveryJob}
+        onOpenChange={setRecoveryOpen}
+        open={recoveryOpen}
+        targetServerId={id}
+      />
) } diff --git a/apps/web/src/stores/recovery-jobs-store.test.ts b/apps/web/src/stores/recovery-jobs-store.test.ts new file mode 100644 index 00000000..60d51aec --- /dev/null +++ b/apps/web/src/stores/recovery-jobs-store.test.ts @@ -0,0 +1,64 @@ +import { describe, expect, it } from 'vitest' +import { useRecoveryJobsStore } from './recovery-jobs-store' + +function makeJob(overrides: Partial> = {}) { + return { + ...buildJob(), + ...overrides + } +} + +function buildJob() { + return { + job_id: 'job-1', + target_server_id: 'target-1', + source_server_id: 'source-1', + status: 'running' as const, + stage: 'rebinding' as const, + error: null, + started_at: '2026-04-16T00:00:00Z', + created_at: '2026-04-16T00:00:00Z', + updated_at: '2026-04-16T00:00:00Z', + last_heartbeat_at: null + } +} + +describe('useRecoveryJobsStore', () => { + it('stores jobs keyed by target server id', () => { + useRecoveryJobsStore.setState({ hydrated: false, jobs: new Map() }) + + useRecoveryJobsStore.getState().setJob('target-1', makeJob()) + + expect(useRecoveryJobsStore.getState().getJob('target-1')?.job_id).toBe('job-1') + }) + + it('replaces the whole map on setJobs', () => { + useRecoveryJobsStore.setState({ hydrated: false, jobs: new Map() }) + useRecoveryJobsStore.getState().setJob('old-target', makeJob({ target_server_id: 'old-target' })) + + useRecoveryJobsStore + .getState() + .setJobs([makeJob(), makeJob({ job_id: 'job-2', target_server_id: 'target-2', source_server_id: 'source-2' })]) + + expect(useRecoveryJobsStore.getState().getJob('old-target')).toBeUndefined() + expect(useRecoveryJobsStore.getState().getJob('target-2')?.job_id).toBe('job-2') + expect(useRecoveryJobsStore.getState().hydrated).toBe(true) + }) + + it('clears a job by target server id', () => { + useRecoveryJobsStore.setState({ hydrated: false, jobs: new Map() }) + useRecoveryJobsStore.getState().setJob('target-1', makeJob()) + + useRecoveryJobsStore.getState().clearJob('target-1') + + expect(useRecoveryJobsStore.getState().getJob('target-1')).toBeUndefined() + }) + + it('can mark hydration explicitly', () => { + useRecoveryJobsStore.setState({ hydrated: false, jobs: new Map() }) + + useRecoveryJobsStore.getState().setHydrated(true) + + expect(useRecoveryJobsStore.getState().hydrated).toBe(true) + }) +}) diff --git a/apps/web/src/stores/recovery-jobs-store.ts b/apps/web/src/stores/recovery-jobs-store.ts new file mode 100644 index 00000000..dd49e104 --- /dev/null +++ b/apps/web/src/stores/recovery-jobs-store.ts @@ -0,0 +1,45 @@ +import { create } from 'zustand' +import type { RecoveryJobResponse } from '@/lib/api-schema' + +interface RecoveryJobsState { + clearJob: (targetServerId: string) => void + getJob: (targetServerId: string) => RecoveryJobResponse | undefined + hydrated: boolean + jobs: Map + setHydrated: (hydrated: boolean) => void + setJob: (targetServerId: string, job: RecoveryJobResponse) => void + setJobs: (jobs: RecoveryJobResponse[]) => void +} + +export const useRecoveryJobsStore = create()((set, get) => ({ + hydrated: false, + jobs: new Map(), + + setJob: (targetServerId: string, job: RecoveryJobResponse) => { + set((state) => { + const next = new Map(state.jobs) + next.set(targetServerId, job) + return { jobs: next } + }) + }, + + clearJob: (targetServerId: string) => { + set((state) => { + const next = new Map(state.jobs) + next.delete(targetServerId) + return { jobs: next } + }) + }, + + setJobs: (jobs: RecoveryJobResponse[]) => { + const next = new Map() + for (const job of jobs) { + next.set(job.target_server_id, job) 
+ } + set({ jobs: next, hydrated: true }) + }, + + getJob: (targetServerId: string) => get().jobs.get(targetServerId), + + setHydrated: (hydrated: boolean) => set({ hydrated }) +})) diff --git a/crates/agent/Cargo.toml b/crates/agent/Cargo.toml index 844ce8a9..94bea628 100644 --- a/crates/agent/Cargo.toml +++ b/crates/agent/Cargo.toml @@ -37,3 +37,6 @@ bollard = "0.18" [dev-dependencies] tempfile = "3" + +[target.'cfg(windows)'.dependencies] +windows-sys = { version = "0.59", features = ["Win32_Storage_FileSystem"] } diff --git a/crates/agent/src/config.rs b/crates/agent/src/config.rs index 1b4817de..aab89466 100644 --- a/crates/agent/src/config.rs +++ b/crates/agent/src/config.rs @@ -140,13 +140,83 @@ impl AgentConfig { Ok(config) } - pub fn config_path() -> &'static str { - if std::path::Path::new("/etc/serverbee/agent.toml").exists() { + pub fn config_path_for_persistence() -> &'static str { + Self::select_config_path_for_persistence( + std::path::Path::new("agent.toml").exists(), + std::path::Path::new("/etc/serverbee/agent.toml").exists(), + ) + } + + pub(crate) fn select_config_path_for_persistence( + local_exists: bool, + system_exists: bool, + ) -> &'static str { + if local_exists { + "agent.toml" + } else if system_exists { "/etc/serverbee/agent.toml" } else { "agent.toml" } } + + pub(crate) fn token_env_override_present() -> bool { + std::env::var_os("SERVERBEE_TOKEN").is_some() + } +} + +#[cfg(test)] +pub(crate) fn with_serverbee_token_env(value: Option<&str>, test: impl FnOnce() -> T) -> T { + use std::sync::{Mutex, OnceLock}; + + struct ServerbeeTokenEnvGuard { + original: Option, + } + + impl Drop for ServerbeeTokenEnvGuard { + fn drop(&mut self) { + match self.original.take() { + Some(value) => unsafe { + std::env::set_var("SERVERBEE_TOKEN", value); + }, + None => unsafe { + std::env::remove_var("SERVERBEE_TOKEN"); + }, + } + } + } + + static ENV_LOCK: OnceLock> = OnceLock::new(); + let _lock = ENV_LOCK.get_or_init(|| Mutex::new(())).lock().expect("env lock"); + let original = std::env::var_os("SERVERBEE_TOKEN"); + + match value { + Some(value) => unsafe { + std::env::set_var("SERVERBEE_TOKEN", value); + }, + None => unsafe { + std::env::remove_var("SERVERBEE_TOKEN"); + }, + } + + let _guard = ServerbeeTokenEnvGuard { original }; + test() +} + +#[cfg(test)] +pub(crate) fn assert_config_path() { + assert_eq!( + AgentConfig::select_config_path_for_persistence(true, true), + "agent.toml" + ); + assert_eq!( + AgentConfig::select_config_path_for_persistence(false, true), + "/etc/serverbee/agent.toml" + ); + assert_eq!( + AgentConfig::select_config_path_for_persistence(false, false), + "agent.toml" + ); } #[cfg(test)] @@ -173,4 +243,11 @@ mod tests { "default external IP URL should be api.ipify.org" ); } + + #[test] + fn token_env_override_present_detects_serverbee_token() { + super::with_serverbee_token_env(Some("env-token"), || { + assert!(AgentConfig::token_env_override_present()); + }); + } } diff --git a/crates/agent/src/file_manager.rs b/crates/agent/src/file_manager.rs index 22fa8a35..a8edb987 100644 --- a/crates/agent/src/file_manager.rs +++ b/crates/agent/src/file_manager.rs @@ -186,7 +186,7 @@ impl FileManager { if entries.is_empty() { anyhow::bail!("Path '{}' is outside allowed root paths", path); } - entries.sort_by(|a, b| a.name.to_lowercase().cmp(&b.name.to_lowercase())); + entries.sort_by_key(|a| a.name.to_lowercase()); Ok(entries) } } diff --git a/crates/agent/src/main.rs b/crates/agent/src/main.rs index 9455d896..6d7aa43c 100644 --- a/crates/agent/src/main.rs +++ 
b/crates/agent/src/main.rs @@ -7,6 +7,7 @@ mod fingerprint; mod network_prober; mod pinger; mod probe_utils; +mod rebind; mod register; mod reporter; mod terminal; @@ -85,6 +86,18 @@ async fn main() -> anyhow::Result<()> { Ok(()) } +#[cfg(test)] +#[test] +fn persist_rebind_token() { + crate::rebind::assert_persist_rebind_token(); +} + +#[cfg(test)] +#[test] +fn config_path() { + crate::config::assert_config_path(); +} + #[cfg(test)] mod tests { use super::install_rustls_crypto_provider; diff --git a/crates/agent/src/rebind.rs b/crates/agent/src/rebind.rs new file mode 100644 index 00000000..97643579 --- /dev/null +++ b/crates/agent/src/rebind.rs @@ -0,0 +1,275 @@ +use std::ffi::OsStr; +use std::fs::{self, OpenOptions}; +use std::io::Write; +use std::path::Path; + +use anyhow::Context; + +fn render_token_content(existing: &str, token: &str) -> String { + let token_line = format!("token = \"{token}\""); + let had_trailing_newline = existing.ends_with('\n'); + let mut lines: Vec = existing.lines().map(ToOwned::to_owned).collect(); + let preamble_end = lines + .iter() + .position(|line| is_table_header(line)) + .unwrap_or(lines.len()); + let preamble = &mut lines[..preamble_end]; + + if let Some(pos) = preamble.iter().position(|line| is_token_line(line)) { + lines[pos] = token_line; + } else { + lines.insert(preamble_end, token_line); + } + + let mut rendered = lines.join("\n"); + if had_trailing_newline { + rendered.push('\n'); + } + rendered +} + +fn is_token_line(line: &str) -> bool { + let trimmed = line.trim_start(); + let Some(rest) = trimmed.strip_prefix("token") else { + return false; + }; + + rest.trim_start().starts_with('=') +} + +fn is_table_header(line: &str) -> bool { + let trimmed = line.trim_start(); + trimmed.starts_with('[') && trimmed.ends_with(']') +} + +pub(crate) fn persist_rebind_token_impl(path: impl AsRef, token: &str) -> anyhow::Result<()> { + let path = path.as_ref(); + let existing = if path.exists() { + fs::read_to_string(path).with_context(|| format!("failed to read {}", path.display()))? 
+ } else { + String::new() + }; + let rendered = render_token_content(&existing, token); + + let parent = path.parent().unwrap_or_else(|| Path::new(".")); + let file_name = path.file_name().unwrap_or_else(|| OsStr::new("agent.toml")); + let temp_path = parent.join(format!( + ".{}.rebind.{}.tmp", + file_name.to_string_lossy(), + uuid::Uuid::new_v4() + )); + + let write_result = (|| -> anyhow::Result<()> { + let mut temp_file = OpenOptions::new() + .create_new(true) + .write(true) + .open(&temp_path) + .with_context(|| format!("failed to create {}", temp_path.display()))?; + temp_file + .write_all(rendered.as_bytes()) + .with_context(|| format!("failed to write {}", temp_path.display()))?; + temp_file + .sync_all() + .with_context(|| format!("failed to sync {}", temp_path.display()))?; + if path.exists() && let Ok(metadata) = fs::metadata(path) { + let _ = fs::set_permissions(&temp_path, metadata.permissions()); + } + replace_file(&temp_path, path)?; + + #[cfg(unix)] + { + if let Some(dir) = path.parent() && let Ok(dir_file) = fs::File::open(dir) { + let _ = dir_file.sync_all(); + } + } + + Ok(()) + })(); + + if write_result.is_err() { + let _ = fs::remove_file(&temp_path); + } + + write_result +} + +#[cfg(not(test))] +pub(crate) use persist_rebind_token_impl as persist_rebind_token; + +#[cfg(unix)] +fn replace_file(temp_path: &Path, path: &Path) -> anyhow::Result<()> { + fs::rename(temp_path, path).with_context(|| { + format!( + "failed to atomically replace {} with {}", + temp_path.display(), + path.display() + ) + }) +} + +#[cfg(windows)] +fn replace_file(temp_path: &Path, path: &Path) -> anyhow::Result<()> { + use std::os::windows::ffi::OsStrExt; + + use windows_sys::Win32::Storage::FileSystem::{ + MOVEFILE_REPLACE_EXISTING, MOVEFILE_WRITE_THROUGH, MoveFileExW, + }; + + let mut temp_wide: Vec = temp_path.as_os_str().encode_wide().collect(); + temp_wide.push(0); + let mut path_wide: Vec = path.as_os_str().encode_wide().collect(); + path_wide.push(0); + + let ok = unsafe { + MoveFileExW( + temp_wide.as_ptr(), + path_wide.as_ptr(), + MOVEFILE_REPLACE_EXISTING | MOVEFILE_WRITE_THROUGH, + ) + }; + + if ok == 0 { + Err(std::io::Error::last_os_error()).with_context(|| { + format!( + "failed to atomically replace {} with {}", + temp_path.display(), + path.display() + ) + }) + } else { + Ok(()) + } +} + +#[cfg(test)] +pub(crate) fn assert_persist_rebind_token() { + let tempdir = tempfile::TempDir::new().expect("tempdir"); + let path = tempdir.path().join("agent.toml"); + fs::write(&path, "server_url = \"http://127.0.0.1:9527\"\n").expect("seed file"); + + persist_rebind_token_impl(&path, "focused-token").expect("persist"); + + let content = fs::read_to_string(&path).expect("read file"); + assert_eq!( + content, + r#"server_url = "http://127.0.0.1:9527" +token = "focused-token" +"# + ); +} + +#[cfg(test)] +mod tests { + use super::*; + use tempfile::TempDir; + + #[test] + fn persist_rebind_token_replaces_existing_token_line_without_touching_other_lines() { + let tempdir = TempDir::new().expect("tempdir"); + let path = tempdir.path().join("agent.toml"); + fs::write( + &path, + r#"server_url = "http://127.0.0.1:9527" +token = "old-token" +log.level = "debug""#, + ) + .expect("seed file"); + + super::persist_rebind_token_impl(&path, "new-token").expect("persist"); + + let content = fs::read_to_string(&path).expect("read file"); + assert_eq!( + content, + r#"server_url = "http://127.0.0.1:9527" +token = "new-token" +log.level = "debug""# + ); + } + + #[test] + fn 
persist_rebind_token_appends_token_line_when_missing() { + let tempdir = TempDir::new().expect("tempdir"); + let path = tempdir.path().join("agent.toml"); + fs::write(&path, "server_url = \"http://127.0.0.1:9527\"\n").expect("seed file"); + + super::persist_rebind_token_impl(&path, "fresh-token").expect("persist"); + + let content = fs::read_to_string(&path).expect("read file"); + assert_eq!( + content, + r#"server_url = "http://127.0.0.1:9527" +token = "fresh-token" +"# + ); + } + + #[test] + fn persist_rebind_token_inserts_before_first_table_header() { + let tempdir = TempDir::new().expect("tempdir"); + let path = tempdir.path().join("agent.toml"); + fs::write( + &path, + r#"server_url = "http://127.0.0.1:9527" +[collector] +interval = 3 +[log] +level = "info""#, + ) + .expect("seed file"); + + super::persist_rebind_token_impl(&path, "fresh-token").expect("persist"); + + let content = fs::read_to_string(&path).expect("read file"); + assert_eq!( + content, + r#"server_url = "http://127.0.0.1:9527" +token = "fresh-token" +[collector] +interval = 3 +[log] +level = "info""# + ); + } + + #[test] + fn persist_rebind_token_preserves_trailing_newline() { + let tempdir = TempDir::new().expect("tempdir"); + let path = tempdir.path().join("agent.toml"); + fs::write(&path, "server_url = \"http://127.0.0.1:9527\"\n").expect("seed file"); + + super::persist_rebind_token_impl(&path, "fresh-token").expect("persist"); + + let content = fs::read_to_string(&path).expect("read file"); + assert!(content.ends_with('\n')); + } + + #[test] + fn persist_rebind_token_preserves_nested_token_and_inserts_top_level_token() { + let tempdir = TempDir::new().expect("tempdir"); + let path = tempdir.path().join("agent.toml"); + fs::write( + &path, + r#"server_url = "http://127.0.0.1:9527" +[collector] +token = "nested" +interval = 3 +[log] +level = "info""#, + ) + .expect("seed file"); + + super::persist_rebind_token_impl(&path, "top-level").expect("persist"); + + let content = fs::read_to_string(&path).expect("read file"); + assert_eq!( + content, + r#"server_url = "http://127.0.0.1:9527" +token = "top-level" +[collector] +token = "nested" +interval = 3 +[log] +level = "info""# + ); + } +} diff --git a/crates/agent/src/register.rs b/crates/agent/src/register.rs index 56d3ed7d..f80aca97 100644 --- a/crates/agent/src/register.rs +++ b/crates/agent/src/register.rs @@ -3,6 +3,11 @@ use serde::{Deserialize, Serialize}; use crate::config::AgentConfig; +#[cfg(test)] +use crate::rebind::persist_rebind_token_impl as persist_rebind_token; +#[cfg(not(test))] +use crate::rebind::persist_rebind_token; + #[derive(Serialize)] struct RegisterRequest { #[serde(skip_serializing_if = "String::is_empty")] @@ -42,20 +47,72 @@ pub async fn register_agent(config: &AgentConfig, fingerprint: &str) -> Result<( } pub fn save_token(token: &str) -> Result<()> { - let path = AgentConfig::config_path(); - let content = if std::path::Path::new(path).exists() { - std::fs::read_to_string(path)? 
- } else { - String::new() - }; - - let mut lines: Vec = content.lines().map(String::from).collect(); - let token_line = format!("token = \"{token}\""); - if let Some(pos) = lines.iter().position(|l| l.starts_with("token")) { - lines[pos] = token_line; - } else { - lines.push(token_line); + if AgentConfig::token_env_override_present() { + anyhow::bail!( + "SERVERBEE_TOKEN is set; refusing to persist token to agent.toml" + ); + } + + persist_rebind_token(AgentConfig::config_path_for_persistence(), token) +} + +#[cfg(test)] +mod tests { + use std::fs; + use std::path::PathBuf; + + use tempfile::TempDir; + + struct CurrentDirGuard { + original: PathBuf, + } + + impl Drop for CurrentDirGuard { + fn drop(&mut self) { + let _ = std::env::set_current_dir(&self.original); + } + } + + fn set_current_dir(dir: &TempDir) -> CurrentDirGuard { + let original = std::env::current_dir().expect("cwd"); + std::env::set_current_dir(dir.path()).expect("set cwd"); + CurrentDirGuard { original } + } + + #[test] + fn save_token_rejects_persistence_when_serverbee_token_is_set() { + crate::config::with_serverbee_token_env(Some("env-token"), || { + let tempdir = TempDir::new().expect("tempdir"); + let _cwd_guard = set_current_dir(&tempdir); + + let result = super::save_token("persisted-token"); + + let err = result.expect_err("save_token should fail"); + assert!( + err.to_string().contains("SERVERBEE_TOKEN"), + "unexpected error: {err}" + ); + assert!( + !tempdir.path().join("agent.toml").exists(), + "token persistence should not write a config file" + ); + }); + } + + #[test] + fn save_token_allows_persistence_when_serverbee_token_is_unset() { + crate::config::with_serverbee_token_env(None, || { + let tempdir = TempDir::new().expect("tempdir"); + let _cwd_guard = set_current_dir(&tempdir); + + super::save_token("persisted-token").expect("save_token"); + + let content = fs::read_to_string(tempdir.path().join("agent.toml")) + .expect("read persisted config"); + assert!( + content.contains("token = \"persisted-token\""), + "expected persisted token, got: {content}" + ); + }); } - std::fs::write(path, lines.join("\n"))?; - Ok(()) } diff --git a/crates/agent/src/reporter.rs b/crates/agent/src/reporter.rs index 66dd6626..0f88f116 100644 --- a/crates/agent/src/reporter.rs +++ b/crates/agent/src/reporter.rs @@ -33,6 +33,11 @@ const UPGRADE_DOWNLOAD_TIMEOUT_SECS: u64 = 600; static UPGRADE_IN_PROGRESS: AtomicBool = AtomicBool::new(false); +enum ServerMessageOutcome { + Continue, + Reconnect, +} + pub struct Reporter { config: AgentConfig, fingerprint: String, @@ -108,7 +113,7 @@ impl Reporter { } } - async fn connect_and_report(&self) -> anyhow::Result<()> { + async fn connect_and_report(&mut self) -> anyhow::Result<()> { use serverbee_common::constants::*; tracing::info!("Connecting to {}...", build_ws_url(&self.config)?); @@ -393,7 +398,19 @@ impl Reporter { server_msg = read.next() => { match server_msg { Some(Ok(Message::Text(text))) => { - self.handle_server_message(&text, &mut write, &mut ping_manager, &mut terminal_manager, &mut network_prober, &cmd_result_tx, &capabilities, &server_capabilities, &file_manager, &file_tx, &mut docker_manager, &mut docker_available, &mut docker_stats_interval).await?; + match self.handle_server_message(&text, &mut write, &mut ping_manager, &mut terminal_manager, &mut network_prober, &cmd_result_tx, &capabilities, &server_capabilities, &file_manager, &file_tx, &mut docker_manager, &mut docker_available, &mut docker_stats_interval).await? 
{ + ServerMessageOutcome::Continue => {} + ServerMessageOutcome::Reconnect => { + ping_manager.stop_all(); + terminal_manager.close_all(); + network_prober.stop_all(); + file_manager.cancel_all_transfers(); + if let Some(dm) = docker_manager.as_mut() { + dm.cleanup(); + } + return Ok(()); + } + } } Some(Ok(Message::Close(_))) => { tracing::info!("Server closed connection"); @@ -440,7 +457,7 @@ impl Reporter { #[allow(clippy::too_many_arguments)] async fn handle_server_message( - &self, + &mut self, text: &str, write: &mut S, ping_manager: &mut PingManager, @@ -454,7 +471,7 @@ impl Reporter { docker_manager: &mut Option, docker_available: &mut bool, docker_stats_interval: &mut Option, - ) -> anyhow::Result<()> + ) -> anyhow::Result where S: SinkExt + Unpin, { @@ -464,7 +481,7 @@ impl Reporter { Ok(m) => m, Err(e) => { tracing::warn!("Failed to parse server message: {e}"); - return Ok(()); + return Ok(ServerMessageOutcome::Continue); } }; @@ -519,7 +536,7 @@ impl Reporter { tokio::spawn(async move { let _ = tx.send(denied).await; }); - return Ok(()); + return Ok(ServerMessageOutcome::Continue); } tracing::info!("Executing command (task_id={task_id}): {command}"); let tx = cmd_result_tx.clone(); @@ -538,6 +555,39 @@ impl Reporter { } }); } + ServerMessage::RebindIdentity { + job_id, + target_server_id, + token, + } => { + tracing::info!( + "Rebinding identity for job_id={job_id} to target_server_id={target_server_id}" + ); + + if let Err(write_err) = register::save_token(&token) { + tracing::warn!( + "Failed to persist rebind token for job_id={job_id}: {write_err}" + ); + let failed = AgentMessage::RebindIdentityFailed { + job_id: job_id.clone(), + error: write_err.to_string(), + }; + let json = serde_json::to_string(&failed)?; + if let Err(send_err) = write.send(Message::Text(json.into())).await { + tracing::warn!( + "Failed to send RebindIdentityFailed for job_id={job_id}: {send_err}" + ); + } + return Ok(ServerMessageOutcome::Continue); + } + + self.config.token = token; + let ack = AgentMessage::RebindIdentityAck { job_id }; + let json = serde_json::to_string(&ack)?; + write.send(Message::Text(json.into())).await?; + write.send(Message::Close(None)).await?; + return Ok(ServerMessageOutcome::Reconnect); + } ServerMessage::Ack { msg_id } => { tracing::debug!("Received Ack for msg_id={msg_id}"); } @@ -593,7 +643,7 @@ impl Reporter { }; let json = serde_json::to_string(&denied)?; write.send(Message::Text(json.into())).await?; - return Ok(()); + return Ok(ServerMessageOutcome::Continue); } if UPGRADE_IN_PROGRESS @@ -612,7 +662,7 @@ impl Reporter { ) .await; }); - return Ok(()); + return Ok(ServerMessageOutcome::Continue); } tracing::info!("Upgrade requested: v{version} from {download_url}"); @@ -662,7 +712,7 @@ impl Reporter { tokio::spawn(async move { let _ = tx.send(denied).await; }); - return Ok(()); + return Ok(ServerMessageOutcome::Continue); } // Input validation: target must be domain or IP only @@ -681,7 +731,7 @@ impl Reporter { }; let _ = tx.send(msg).await; }); - return Ok(()); + return Ok(ServerMessageOutcome::Continue); } tracing::info!( @@ -709,7 +759,7 @@ impl Reporter { }; let json = serde_json::to_string(&msg)?; write.send(Message::Text(json.into())).await?; - return Ok(()); + return Ok(ServerMessageOutcome::Continue); } let result = file_manager.list_dir(&path).await; let msg = match result { @@ -739,7 +789,7 @@ impl Reporter { }; let json = serde_json::to_string(&msg)?; write.send(Message::Text(json.into())).await?; - return Ok(()); + return 
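// Editor note: handle_server_message now reports one of two outcomes back to the read
// loop. Continue keeps the current WebSocket session alive; Reconnect (returned only
// by the RebindIdentity arm) makes the loop stop the ping/terminal/probe/file/docker
// managers and return from connect_and_report, so the agent redials with the token it
// just persisted. Accordingly, every pre-existing early `return Ok(())` in this
// handler becomes `return Ok(ServerMessageOutcome::Continue)`, as here.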
Ok(ServerMessageOutcome::Continue); } let result = file_manager.stat(&path).await; let msg = match result { @@ -771,7 +821,7 @@ impl Reporter { }; let json = serde_json::to_string(&msg)?; write.send(Message::Text(json.into())).await?; - return Ok(()); + return Ok(ServerMessageOutcome::Continue); } let result = file_manager.read_file(&path, max_size).await; let msg = match result { @@ -803,7 +853,7 @@ impl Reporter { }; let json = serde_json::to_string(&result)?; write.send(Message::Text(json.into())).await?; - return Ok(()); + return Ok(ServerMessageOutcome::Continue); } let result = file_manager.write_file(&path, &content).await; let msg = match result { @@ -835,7 +885,7 @@ impl Reporter { }; let json = serde_json::to_string(&result)?; write.send(Message::Text(json.into())).await?; - return Ok(()); + return Ok(ServerMessageOutcome::Continue); } let result = file_manager.delete(&path, recursive).await; let msg = match result { @@ -863,7 +913,7 @@ impl Reporter { }; let json = serde_json::to_string(&result)?; write.send(Message::Text(json.into())).await?; - return Ok(()); + return Ok(ServerMessageOutcome::Continue); } let result = file_manager.mkdir(&path).await; let msg = match result { @@ -891,7 +941,7 @@ impl Reporter { }; let json = serde_json::to_string(&result)?; write.send(Message::Text(json.into())).await?; - return Ok(()); + return Ok(ServerMessageOutcome::Continue); } let result = file_manager.rename_path(&from, &to).await; let msg = match result { @@ -918,7 +968,7 @@ impl Reporter { }; let json = serde_json::to_string(&msg)?; write.send(Message::Text(json.into())).await?; - return Ok(()); + return Ok(ServerMessageOutcome::Continue); } file_manager.start_download(transfer_id, path, file_tx.clone()); } @@ -938,7 +988,7 @@ impl Reporter { }; let json = serde_json::to_string(&msg)?; write.send(Message::Text(json.into())).await?; - return Ok(()); + return Ok(ServerMessageOutcome::Continue); } match file_manager .start_upload(transfer_id.clone(), path, size) @@ -1056,7 +1106,7 @@ impl Reporter { } } - Ok(()) + Ok(ServerMessageOutcome::Continue) } } diff --git a/crates/common/src/constants.rs b/crates/common/src/constants.rs index 8a2f7a7e..e5261ca9 100644 --- a/crates/common/src/constants.rs +++ b/crates/common/src/constants.rs @@ -1,7 +1,7 @@ pub const VERSION: &str = env!("CARGO_PKG_VERSION"); pub const DEFAULT_SERVER_PORT: u16 = 9527; pub const DEFAULT_REPORT_INTERVAL: u32 = 3; -pub const PROTOCOL_VERSION: u32 = 3; +pub const PROTOCOL_VERSION: u32 = 4; pub const SESSION_TTL_SECS: i64 = 86400; pub const HEARTBEAT_INTERVAL_SECS: u64 = 30; @@ -198,6 +198,12 @@ pub fn probe_type_to_cap(probe_type: &str) -> Option { } } +#[cfg(test)] +#[test] +fn protocol_version() { + assert_eq!(PROTOCOL_VERSION, 4); +} + #[cfg(test)] mod tests { use super::*; diff --git a/crates/common/src/protocol.rs b/crates/common/src/protocol.rs index 7b52c706..e9a7b5b1 100644 --- a/crates/common/src/protocol.rs +++ b/crates/common/src/protocol.rs @@ -29,6 +29,68 @@ pub enum UpgradeStatus { Timeout, } +#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize)] +#[serde(rename_all = "snake_case")] +#[cfg_attr(feature = "utoipa", derive(utoipa::ToSchema))] +pub enum RecoveryJobStatus { + Running, + Failed, + Succeeded, + Unknown, +} + +#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize)] +#[serde(rename_all = "snake_case")] +#[cfg_attr(feature = "utoipa", derive(utoipa::ToSchema))] +pub enum RecoveryJobStage { + Validating, + Rebinding, + AwaitingTargetOnline, + FreezingWrites, + MergingHistory, + Finalizing, + 
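// Editor note (ordering inferred from the migration's CHECK constraint and the stage
// constants; the enum itself does not enforce it): a recovery job is expected to move
// validating -> rebinding -> awaiting_target_online -> freezing_writes ->
// merging_history -> finalizing before reaching a terminal variant, with Unknown
// reserved for stage names this build predates.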
+    Succeeded,
+    Failed,
+    Unknown,
+}
+
+impl<'de> Deserialize<'de> for RecoveryJobStatus {
+    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
+    where
+        D: serde::Deserializer<'de>,
+    {
+        let value = String::deserialize(deserializer)?;
+
+        Ok(match value.as_str() {
+            "running" => Self::Running,
+            "failed" => Self::Failed,
+            "succeeded" => Self::Succeeded,
+            _ => Self::Unknown,
+        })
+    }
+}
+
+impl<'de> Deserialize<'de> for RecoveryJobStage {
+    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
+    where
+        D: serde::Deserializer<'de>,
+    {
+        let value = String::deserialize(deserializer)?;
+
+        Ok(match value.as_str() {
+            "validating" => Self::Validating,
+            "rebinding" => Self::Rebinding,
+            "awaiting_target_online" => Self::AwaitingTargetOnline,
+            "freezing_writes" => Self::FreezingWrites,
+            "merging_history" => Self::MergingHistory,
+            "finalizing" => Self::Finalizing,
+            "succeeded" => Self::Succeeded,
+            "failed" => Self::Failed,
+            _ => Self::Unknown,
+        })
+    }
+}
+
 #[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
 #[cfg_attr(feature = "utoipa", derive(utoipa::ToSchema))]
 pub struct UpgradeJobDto {
@@ -46,6 +108,23 @@ pub struct UpgradeJobDto {
     pub finished_at: Option<DateTime<Utc>>,
 }
 
+#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
+#[cfg_attr(feature = "utoipa", derive(utoipa::ToSchema))]
+pub struct RecoveryJobDto {
+    pub job_id: String,
+    pub target_server_id: String,
+    pub source_server_id: String,
+    pub status: RecoveryJobStatus,
+    pub stage: RecoveryJobStage,
+    #[serde(default)]
+    pub error: Option<String>,
+    pub started_at: DateTime<Utc>,
+    pub created_at: DateTime<Utc>,
+    pub updated_at: DateTime<Utc>,
+    #[serde(default)]
+    pub last_heartbeat_at: Option<DateTime<Utc>>,
+}
+
 /// Agent -> Server messages
 #[derive(Debug, Clone, Serialize, Deserialize)]
 #[serde(tag = "type", rename_all = "snake_case")]
@@ -81,6 +160,13 @@ pub enum AgentMessage {
         capability: String,
         reason: CapabilityDeniedReason,
     },
+    RebindIdentityAck {
+        job_id: String,
+    },
+    RebindIdentityFailed {
+        job_id: String,
+        error: String,
+    },
     NetworkProbeResults {
         results: Vec,
     },
@@ -347,6 +433,11 @@ pub enum ServerMessage {
         #[serde(default)]
         job_id: Option<String>,
     },
+    RebindIdentity {
+        job_id: String,
+        target_server_id: String,
+        token: String,
+    },
     CapabilitiesSync {
         capabilities: u32,
     },
@@ -360,9 +451,13 @@ pub enum BrowserMessage {
         servers: Vec,
         #[serde(default)]
         upgrades: Vec<UpgradeJobDto>,
+        #[serde(default)]
+        recoveries: Vec<RecoveryJobDto>,
     },
     Update {
         servers: Vec,
+        #[serde(default, skip_serializing_if = "Option::is_none")]
+        recoveries: Option<Vec<RecoveryJobDto>>,
     },
     ServerOnline {
         server_id: String,
@@ -477,6 +572,61 @@ mod tests {
         }
     }
 
+    #[test]
+    fn test_rebind_identity_round_trip() {
+        let msg = ServerMessage::RebindIdentity {
+            job_id: "job-1".to_string(),
+            target_server_id: "server-1".to_string(),
+            token: "token-123".to_string(),
+        };
+        let json = serde_json::to_string(&msg).unwrap();
+        let parsed: ServerMessage = serde_json::from_str(&json).unwrap();
+        match parsed {
+            ServerMessage::RebindIdentity {
+                job_id,
+                target_server_id,
+                token,
+            } => {
+                assert_eq!(job_id, "job-1");
+                assert_eq!(target_server_id, "server-1");
+                assert_eq!(token, "token-123");
+            }
+            _ => panic!("Expected RebindIdentity"),
+        }
+    }
+
+    #[test]
+    fn test_rebind_identity_ack_round_trip() {
+        let msg = AgentMessage::RebindIdentityAck {
+            job_id: "job-1".to_string(),
+        };
+        let json = serde_json::to_string(&msg).unwrap();
+        let parsed: AgentMessage = serde_json::from_str(&json).unwrap();
+        match parsed {
+            AgentMessage::RebindIdentityAck { job_id } => {
+                assert_eq!(job_id, "job-1");
+            }
+            _ => panic!("Expected RebindIdentityAck"),
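// Editor note (wire shape, derived from the `tag = "type", rename_all = "snake_case"`
// serde attributes on these enums): the ack above serializes to
//
//     {"type":"rebind_identity_ack","job_id":"job-1"}
//
// and the server->agent command to
//
//     {"type":"rebind_identity","job_id":"job-1",
//      "target_server_id":"server-1","token":"token-123"}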
} + } + + #[test] + fn test_rebind_identity_failed_round_trip() { + let msg = AgentMessage::RebindIdentityFailed { + job_id: "job-1".to_string(), + error: "permission denied".to_string(), + }; + let json = serde_json::to_string(&msg).unwrap(); + let parsed: AgentMessage = serde_json::from_str(&json).unwrap(); + match parsed { + AgentMessage::RebindIdentityFailed { job_id, error } => { + assert_eq!(job_id, "job-1"); + assert_eq!(error, "permission denied"); + } + _ => panic!("Expected RebindIdentityFailed"), + } + } + #[test] fn test_capability_denied_round_trip() { let msg = AgentMessage::CapabilityDenied { @@ -1243,6 +1393,51 @@ mod tests { } } + #[test] + fn test_recovery_job_dto_round_trip() { + let dto = RecoveryJobDto { + job_id: "recovery-1".to_string(), + target_server_id: "target-1".to_string(), + source_server_id: "source-1".to_string(), + status: RecoveryJobStatus::Running, + stage: RecoveryJobStage::FreezingWrites, + error: Some("write freeze in progress".to_string()), + started_at: chrono::DateTime::parse_from_rfc3339("2026-04-16T01:02:03Z") + .unwrap() + .with_timezone(&chrono::Utc), + created_at: chrono::DateTime::parse_from_rfc3339("2026-04-16T01:00:00Z") + .unwrap() + .with_timezone(&chrono::Utc), + updated_at: chrono::DateTime::parse_from_rfc3339("2026-04-16T01:05:00Z") + .unwrap() + .with_timezone(&chrono::Utc), + last_heartbeat_at: Some( + chrono::DateTime::parse_from_rfc3339("2026-04-16T01:04:30Z") + .unwrap() + .with_timezone(&chrono::Utc), + ), + }; + + let json = serde_json::to_string(&dto).unwrap(); + let parsed: RecoveryJobDto = serde_json::from_str(&json).unwrap(); + + assert_eq!(parsed, dto); + } + + #[test] + fn test_recovery_job_status_unknown_deserializes_to_unknown() { + let status: RecoveryJobStatus = serde_json::from_str(r#""paused""#).unwrap(); + + assert_eq!(status, RecoveryJobStatus::Unknown); + } + + #[test] + fn test_recovery_job_stage_unknown_deserializes_to_unknown() { + let stage: RecoveryJobStage = serde_json::from_str(r#""reconciling""#).unwrap(); + + assert_eq!(stage, RecoveryJobStage::Unknown); + } + #[test] fn test_browser_full_sync_with_upgrades_round_trip() { let msg = BrowserMessage::FullSync { @@ -1258,13 +1453,39 @@ mod tests { started_at: chrono::Utc::now(), finished_at: None, }], + recoveries: vec![RecoveryJobDto { + job_id: "recovery-1".to_string(), + target_server_id: "target-1".to_string(), + source_server_id: "source-1".to_string(), + status: RecoveryJobStatus::Running, + stage: RecoveryJobStage::Rebinding, + error: Some("waiting for agent reconnect".to_string()), + started_at: chrono::DateTime::parse_from_rfc3339("2026-04-16T01:02:03Z") + .unwrap() + .with_timezone(&chrono::Utc), + created_at: chrono::DateTime::parse_from_rfc3339("2026-04-16T01:00:00Z") + .unwrap() + .with_timezone(&chrono::Utc), + updated_at: chrono::DateTime::parse_from_rfc3339("2026-04-16T01:05:00Z") + .unwrap() + .with_timezone(&chrono::Utc), + last_heartbeat_at: Some( + chrono::DateTime::parse_from_rfc3339("2026-04-16T01:04:30Z") + .unwrap() + .with_timezone(&chrono::Utc), + ), + }], }; let json = serde_json::to_string(&msg).unwrap(); let parsed: BrowserMessage = serde_json::from_str(&json).unwrap(); match parsed { - BrowserMessage::FullSync { servers, upgrades } => { + BrowserMessage::FullSync { + servers, + upgrades, + recoveries, + } => { assert!(servers.is_empty()); assert_eq!(upgrades.len(), 1); assert_eq!(upgrades[0].server_id, "server-1"); @@ -1278,11 +1499,88 @@ mod tests { Some("/backups/server-1.tar.gz".to_string()) ); 
     assert!(upgrades[0].finished_at.is_none());
+                assert_eq!(recoveries.len(), 1);
+                assert_eq!(recoveries[0].job_id, "recovery-1");
+                assert_eq!(recoveries[0].target_server_id, "target-1");
+                assert_eq!(recoveries[0].source_server_id, "source-1");
+                assert_eq!(recoveries[0].status, RecoveryJobStatus::Running);
+                assert_eq!(recoveries[0].stage, RecoveryJobStage::Rebinding);
+                assert_eq!(
+                    recoveries[0].error,
+                    Some("waiting for agent reconnect".to_string())
+                );
+                assert_eq!(
+                    recoveries[0].started_at,
+                    chrono::DateTime::parse_from_rfc3339("2026-04-16T01:02:03Z")
+                        .unwrap()
+                        .with_timezone(&chrono::Utc)
+                );
+                assert_eq!(
+                    recoveries[0].created_at,
+                    chrono::DateTime::parse_from_rfc3339("2026-04-16T01:00:00Z")
+                        .unwrap()
+                        .with_timezone(&chrono::Utc)
+                );
+                assert_eq!(
+                    recoveries[0].updated_at,
+                    chrono::DateTime::parse_from_rfc3339("2026-04-16T01:05:00Z")
+                        .unwrap()
+                        .with_timezone(&chrono::Utc)
+                );
+                assert_eq!(
+                    recoveries[0].last_heartbeat_at,
+                    Some(
+                        chrono::DateTime::parse_from_rfc3339("2026-04-16T01:04:30Z")
+                            .unwrap()
+                            .with_timezone(&chrono::Utc)
+                    )
+                );
+            }
+            _ => panic!("Expected FullSync"),
+        }
+    }
+
+    #[test]
+    fn test_browser_full_sync_defaults_missing_recoveries_to_empty() {
+        let json = r#"{"type":"full_sync","servers":[],"upgrades":[]}"#;
+        let parsed: BrowserMessage = serde_json::from_str(json).unwrap();
+
+        match parsed {
+            BrowserMessage::FullSync {
+                servers,
+                upgrades,
+                recoveries,
+            } => {
+                assert!(servers.is_empty());
+                assert!(upgrades.is_empty());
+                assert!(recoveries.is_empty());
             }
             _ => panic!("Expected FullSync"),
         }
     }
 
+    #[test]
+    fn test_browser_update_omits_recoveries_when_none() {
+        let msg = BrowserMessage::Update {
+            servers: vec![],
+            recoveries: None,
+        };
+
+        let json = serde_json::to_string(&msg).unwrap();
+        let value: serde_json::Value = serde_json::from_str(&json).unwrap();
+        assert_eq!(value["type"], "update");
+        assert_eq!(value["servers"], serde_json::json!([]));
+        assert!(value.get("recoveries").is_none());
+
+        match serde_json::from_str::<BrowserMessage>(&json).unwrap() {
+            BrowserMessage::Update { servers, recoveries } => {
+                assert!(servers.is_empty());
+                assert!(recoveries.is_none());
+            }
+            _ => panic!("Expected Update"),
+        }
+    }
+
     #[test]
     fn test_agent_info_updated_accepts_optional_agent_version() {
         let json = r#"{"type":"agent_info_updated","server_id":"server-1","protocol_version":3,"agent_version":"1.2.3"}"#;
diff --git a/crates/server/src/entity/mod.rs b/crates/server/src/entity/mod.rs
index f7f84a82..35e35a80 100644
--- a/crates/server/src/entity/mod.rs
+++ b/crates/server/src/entity/mod.rs
@@ -21,6 +21,7 @@ pub mod notification_group;
 pub mod oauth_account;
 pub mod ping_record;
 pub mod ping_task;
+pub mod recovery_job;
 pub mod record;
 pub mod record_hourly;
 pub mod server;
diff --git a/crates/server/src/entity/recovery_job.rs b/crates/server/src/entity/recovery_job.rs
new file mode 100644
index 00000000..f8e25e63
--- /dev/null
+++ b/crates/server/src/entity/recovery_job.rs
@@ -0,0 +1,23 @@
+use sea_orm::entity::prelude::*;
+
+#[derive(Clone, Debug, PartialEq, DeriveEntityModel)]
+#[sea_orm(table_name = "recovery_job")]
+pub struct Model {
+    #[sea_orm(primary_key, auto_increment = false)]
+    pub job_id: String,
+    pub target_server_id: String,
+    pub source_server_id: String,
+    pub status: String,
+    pub stage: String,
+    pub checkpoint_json: Option<String>,
+    pub error: Option<String>,
+    pub started_at: DateTimeUtc,
+    pub created_at: DateTimeUtc,
+    pub updated_at: DateTimeUtc,
+    pub last_heartbeat_at: Option<DateTimeUtc>,
+}
+
+#[derive(Copy, Clone, Debug, EnumIter,
DeriveRelation)] +pub enum Relation {} + +impl ActiveModelBehavior for ActiveModel {} diff --git a/crates/server/src/migration/m20260416_000017_create_recovery_job.rs b/crates/server/src/migration/m20260416_000017_create_recovery_job.rs new file mode 100644 index 00000000..7b92deba --- /dev/null +++ b/crates/server/src/migration/m20260416_000017_create_recovery_job.rs @@ -0,0 +1,98 @@ +use sea_orm_migration::prelude::*; + +pub struct Migration; + +impl MigrationName for Migration { + fn name(&self) -> &str { + "m20260416_000017_create_recovery_job" + } +} + +#[async_trait::async_trait] +impl MigrationTrait for Migration { + async fn up(&self, manager: &SchemaManager) -> Result<(), DbErr> { + let db = manager.get_connection(); + db.execute_unprepared( + "CREATE TABLE IF NOT EXISTS recovery_job ( + job_id TEXT PRIMARY KEY NOT NULL, + target_server_id TEXT NOT NULL, + source_server_id TEXT NOT NULL, + status TEXT NOT NULL CHECK (status IN ('running', 'failed', 'succeeded')), + stage TEXT NOT NULL CHECK ( + stage IN ( + 'validating', + 'rebinding', + 'awaiting_target_online', + 'freezing_writes', + 'merging_history', + 'finalizing', + 'succeeded', + 'failed' + ) + ), + checkpoint_json TEXT NULL, + error TEXT NULL, + started_at DATETIME NOT NULL, + created_at DATETIME NOT NULL, + updated_at DATETIME NOT NULL, + last_heartbeat_at DATETIME NULL + )", + ) + .await?; + db.execute_unprepared( + "CREATE UNIQUE INDEX IF NOT EXISTS idx_recovery_job_target_running + ON recovery_job(target_server_id) + WHERE status = 'running'", + ) + .await?; + db.execute_unprepared( + "CREATE UNIQUE INDEX IF NOT EXISTS idx_recovery_job_source_running + ON recovery_job(source_server_id) + WHERE status = 'running'", + ) + .await?; + db.execute_unprepared( + "CREATE TRIGGER IF NOT EXISTS trg_recovery_job_running_insert + BEFORE INSERT ON recovery_job + WHEN NEW.status = 'running' + BEGIN + SELECT RAISE(ABORT, 'recovery_job_active_conflict') + WHERE EXISTS ( + SELECT 1 + FROM recovery_job + WHERE status = 'running' + AND job_id <> NEW.job_id + AND ( + target_server_id IN (NEW.target_server_id, NEW.source_server_id) + OR source_server_id IN (NEW.target_server_id, NEW.source_server_id) + ) + ); + END", + ) + .await?; + db.execute_unprepared( + "CREATE TRIGGER IF NOT EXISTS trg_recovery_job_running_update + BEFORE UPDATE OF target_server_id, source_server_id, status ON recovery_job + WHEN NEW.status = 'running' + BEGIN + SELECT RAISE(ABORT, 'recovery_job_active_conflict') + WHERE EXISTS ( + SELECT 1 + FROM recovery_job + WHERE status = 'running' + AND job_id <> NEW.job_id + AND ( + target_server_id IN (NEW.target_server_id, NEW.source_server_id) + OR source_server_id IN (NEW.target_server_id, NEW.source_server_id) + ) + ); + END", + ) + .await?; + Ok(()) + } + + async fn down(&self, _manager: &SchemaManager) -> Result<(), DbErr> { + Ok(()) + } +} diff --git a/crates/server/src/migration/mod.rs b/crates/server/src/migration/mod.rs index b61a2b38..3a1a267d 100644 --- a/crates/server/src/migration/mod.rs +++ b/crates/server/src/migration/mod.rs @@ -16,6 +16,7 @@ mod m20260329_000013_add_server_fingerprint; mod m20260329_000014_create_mobile_session; mod m20260329_000015_add_session_source; mod m20260329_000016_create_device_token; +mod m20260416_000017_create_recovery_job; pub struct Migrator; @@ -38,6 +39,7 @@ impl MigratorTrait for Migrator { Box::new(m20260329_000014_create_mobile_session::Migration), Box::new(m20260329_000015_add_session_source::Migration), Box::new(m20260329_000016_create_device_token::Migration), + 
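// Editor note on the migration registered on the next line: it creates recovery_job
// with CHECK constraints mirroring RecoveryJobStatus/RecoveryJobStage, two partial
// unique indexes (at most one running job per target and per source server), and
// BEFORE INSERT/UPDATE triggers that RAISE(ABORT, 'recovery_job_active_conflict')
// whenever a running job would share any server id with another running job;
// down() is a no-op.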
Box::new(m20260416_000017_create_recovery_job::Migration), ] } } diff --git a/crates/server/src/openapi.rs b/crates/server/src/openapi.rs index 061b2bab..45118f34 100644 --- a/crates/server/src/openapi.rs +++ b/crates/server/src/openapi.rs @@ -71,6 +71,9 @@ impl Modify for SecurityAddon { crate::router::api::server::trigger_upgrade, crate::router::api::server::batch_update_capabilities, crate::router::api::server::cleanup_orphaned_servers, + crate::router::api::server_recovery::list_candidates, + crate::router::api::server_recovery::get_recovery_job, + crate::router::api::server_recovery::start_recovery_merge, // server-groups crate::router::api::server_group::list_groups, crate::router::api::server_group::create_group, @@ -232,6 +235,11 @@ impl Modify for SecurityAddon { crate::router::api::server::BatchCapabilitiesResponse, crate::router::api::server::CleanupResponse, crate::service::server::UpdateServerInput, + crate::router::api::server_recovery::RecoveryJobStatus, + crate::router::api::server_recovery::RecoveryJobStage, + crate::router::api::server_recovery::RecoveryCandidateResponse, + crate::router::api::server_recovery::StartRecoveryRequest, + crate::router::api::server_recovery::RecoveryJobResponse, // server-groups crate::router::api::server_group::CreateGroupRequest, crate::router::api::server_group::UpdateGroupRequest, diff --git a/crates/server/src/router/api/agent.rs b/crates/server/src/router/api/agent.rs index 1682a241..7c12c6ea 100644 --- a/crates/server/src/router/api/agent.rs +++ b/crates/server/src/router/api/agent.rs @@ -21,6 +21,7 @@ use crate::service::config::ConfigService; use crate::service::network_probe::NetworkProbeService; use crate::service::upgrade_release::LatestAgentVersionResponse; use crate::state::AppState; +use serverbee_common::constants::CAP_DEFAULT; const CONFIG_KEY_AUTO_DISCOVERY: &str = "auto_discovery_key"; const DEFAULT_SERVER_NAME: &str = "New Server"; @@ -208,7 +209,7 @@ async fn register( traffic_limit: Set(None), traffic_limit_type: Set(None), billing_start_day: Set(None), - capabilities: Set(56), + capabilities: Set(CAP_DEFAULT as i32), protocol_version: Set(1), features: Set("[]".to_string()), last_remote_addr: Set(Some(ip.clone())), diff --git a/crates/server/src/router/api/mod.rs b/crates/server/src/router/api/mod.rs index 0b3f3edb..e7e67cdc 100644 --- a/crates/server/src/router/api/mod.rs +++ b/crates/server/src/router/api/mod.rs @@ -16,6 +16,7 @@ pub mod oauth; pub mod ping; pub mod server; pub mod server_group; +pub mod server_recovery; pub mod service_monitor; pub mod setting; pub mod status; @@ -66,6 +67,7 @@ pub fn router(state: Arc) -> Router> { .merge( Router::new() .merge(server::write_router()) + .merge(server_recovery::write_router()) .merge(server_group::write_router()) .merge(ping::write_router()) .merge(network_probe::write_router()) diff --git a/crates/server/src/router/api/server.rs b/crates/server/src/router/api/server.rs index c673d48f..54b189ac 100644 --- a/crates/server/src/router/api/server.rs +++ b/crates/server/src/router/api/server.rs @@ -1080,6 +1080,7 @@ mod cleanup_tests { use crate::entity::server; use chrono::Utc; use std::collections::HashSet; + use serverbee_common::constants::CAP_DEFAULT; fn make_server(id: &str, name: &str, os: Option<&str>) -> server::Model { let now = Utc::now(); @@ -1114,7 +1115,7 @@ mod cleanup_tests { traffic_limit: None, traffic_limit_type: None, billing_start_day: None, - capabilities: 56, + capabilities: CAP_DEFAULT as i32, protocol_version: 1, features: "[]".to_string(), 
    last_remote_addr: None,
diff --git a/crates/server/src/router/api/server_recovery.rs b/crates/server/src/router/api/server_recovery.rs
new file mode 100644
index 00000000..04409bca
--- /dev/null
+++ b/crates/server/src/router/api/server_recovery.rs
@@ -0,0 +1,730 @@
+use std::collections::HashSet;
+use std::net::SocketAddr;
+use std::str::FromStr;
+use std::sync::Arc;
+
+use axum::extract::{Path, State};
+use axum::routing::{get, post};
+use axum::{Json, Router};
+use sea_orm::{ColumnTrait, EntityTrait, QueryFilter, TransactionTrait};
+use serde::{Deserialize, Serialize};
+use serverbee_common::protocol::ServerMessage;
+use tokio::sync::mpsc;
+
+use crate::entity::{recovery_job, server};
+use crate::error::{ApiResponse, AppError, ok};
+use crate::router::ws::browser::broadcast_recovery_update;
+use crate::service::recovery_job::RecoveryJobService;
+use crate::service::recovery_merge::{RECOVERY_STAGE_REBINDING, RecoveryMergeService};
+use crate::state::AppState;
+
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, utoipa::ToSchema)]
+#[serde(rename_all = "snake_case")]
+pub enum RecoveryJobStatus {
+    Running,
+    Failed,
+    Succeeded,
+    Unknown,
+}
+
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, utoipa::ToSchema)]
+#[serde(rename_all = "snake_case")]
+pub enum RecoveryJobStage {
+    Validating,
+    Rebinding,
+    AwaitingTargetOnline,
+    FreezingWrites,
+    MergingHistory,
+    Finalizing,
+    Succeeded,
+    Failed,
+    Unknown,
+}
+
+#[derive(Debug, Serialize, utoipa::ToSchema)]
+pub struct RecoveryCandidateResponse {
+    pub server_id: String,
+    pub name: String,
+    pub score: i32,
+    pub reasons: Vec<String>,
+}
+
+#[derive(Debug, Deserialize, utoipa::ToSchema)]
+pub struct StartRecoveryRequest {
+    pub source_server_id: String,
+}
+
+#[derive(Debug, Serialize, utoipa::ToSchema)]
+pub struct RecoveryJobResponse {
+    pub job_id: String,
+    pub target_server_id: String,
+    pub source_server_id: String,
+    pub status: RecoveryJobStatus,
+    pub stage: RecoveryJobStage,
+    pub error: Option<String>,
+    pub started_at: chrono::DateTime<chrono::Utc>,
+    pub created_at: chrono::DateTime<chrono::Utc>,
+    pub updated_at: chrono::DateTime<chrono::Utc>,
+    pub last_heartbeat_at: Option<chrono::DateTime<chrono::Utc>>,
+}
+
+#[derive(Debug)]
+struct CandidateScoreInput {
+    same_remote_addr: bool,
+    same_cpu_arch: bool,
+    same_os: bool,
+    same_virtualization: bool,
+    created_within_minutes: i64,
+    same_country: bool,
+}
+
+pub fn read_router() -> Router<Arc<AppState>> {
+    Router::new()
+}
+
+pub fn write_router() -> Router<Arc<AppState>> {
+    Router::new()
+        .route(
+            "/servers/{target_id}/recovery-candidates",
+            get(list_candidates),
+        )
+        .route("/servers/recovery-jobs/{job_id}", get(get_recovery_job))
+        .route(
+            "/servers/{target_id}/recover-merge",
+            post(start_recovery_merge),
+        )
+}
+
+#[utoipa::path(
+    get,
+    path = "/api/servers/{target_id}/recovery-candidates",
+    params(
+        ("target_id" = String, Path, description = "Original offline server id")
+    ),
+    responses(
+        (status = 200, description = "Recommended recovery candidates", body = Vec<RecoveryCandidateResponse>),
+        (status = 401, description = "Authentication required", body = crate::error::ErrorBody),
+        (status = 403, description = "Admin required", body = crate::error::ErrorBody),
+        (status = 404, description = "Target server not found", body = crate::error::ErrorBody),
+        (status = 409, description = "Target must be offline and not already in a running recovery job", body = crate::error::ErrorBody),
+    ),
+    security(
+        ("session_cookie" = []),
+        ("api_key" = []),
+        ("bearer_token" = [])
+    ),
+    tag = "server-recovery"
+)]
+async fn list_candidates(
+    State(state): State<Arc<AppState>>,
+    Path(target_id): Path<String>,
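// Editor sketch (illustrative request/response; ids and values hypothetical):
//
//     GET /api/servers/srv-old/recovery-candidates
//     200 OK
//     [{"server_id":"srv-new","name":"Reinstalled box","score":110,
//       "reasons":["same remote address","same cpu architecture", "..."]}]
//
// The handler 409s when the target is still online or already part of a running
// recovery job, and only online, non-busy servers are scored.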
+) -> Result<Json<ApiResponse<Vec<RecoveryCandidateResponse>>>, AppError> {
+    let target = server::Entity::find_by_id(&target_id)
+        .one(&state.db)
+        .await?
+        .ok_or_else(|| AppError::NotFound("Server not found".to_string()))?;
+
+    if state.agent_manager.is_online(&target.id) {
+        return Err(AppError::Conflict(
+            "Target server must be offline before listing recovery candidates".to_string(),
+        ));
+    }
+
+    let running_jobs = recovery_job::Entity::find()
+        .filter(recovery_job::Column::Status.eq("running"))
+        .all(&state.db)
+        .await?;
+
+    if running_jobs
+        .iter()
+        .any(|job| job.target_server_id == target.id || job.source_server_id == target.id)
+    {
+        return Err(AppError::Conflict(
+            "Target server is already participating in a running recovery job".to_string(),
+        ));
+    }
+
+    let active_server_ids: HashSet<String> = running_jobs
+        .into_iter()
+        .flat_map(|job| [job.target_server_id, job.source_server_id])
+        .collect();
+
+    let mut candidates = server::Entity::find()
+        .filter(server::Column::Id.ne(target_id.as_str()))
+        .all(&state.db)
+        .await?
+        .into_iter()
+        .filter(|source| state.agent_manager.is_online(&source.id))
+        .filter(|source| !active_server_ids.contains(&source.id))
+        .map(|source| build_candidate_response(&target, &source))
+        .collect::<Vec<_>>();
+
+    candidates.sort_by(|left, right| {
+        right
+            .score
+            .cmp(&left.score)
+            .then_with(|| left.name.cmp(&right.name))
+            .then_with(|| left.server_id.cmp(&right.server_id))
+    });
+
+    ok(candidates)
+}
+
+#[utoipa::path(
+    get,
+    path = "/api/servers/recovery-jobs/{job_id}",
+    params(
+        ("job_id" = String, Path, description = "Recovery job id")
+    ),
+    responses(
+        (status = 200, description = "Recovery job details", body = RecoveryJobResponse),
+        (status = 401, description = "Authentication required", body = crate::error::ErrorBody),
+        (status = 403, description = "Admin required", body = crate::error::ErrorBody),
+        (status = 404, description = "Recovery job not found", body = crate::error::ErrorBody),
+    ),
+    security(
+        ("session_cookie" = []),
+        ("api_key" = []),
+        ("bearer_token" = [])
+    ),
+    tag = "server-recovery"
+)]
+async fn get_recovery_job(
+    State(state): State<Arc<AppState>>,
+    Path(job_id): Path<String>,
+) -> Result<Json<ApiResponse<RecoveryJobResponse>>, AppError> {
+    let job = RecoveryJobService::get_job(&state.db, &job_id)
+        .await?
+        .ok_or_else(|| AppError::NotFound("Recovery job not found".to_string()))?;
+
+    ok(job.into())
+}
+
+#[utoipa::path(
+    post,
+    path = "/api/servers/{target_id}/recover-merge",
+    request_body = StartRecoveryRequest,
+    params(
+        ("target_id" = String, Path, description = "Original offline server id")
+    ),
+    responses(
+        (status = 200, description = "Recovery job created", body = RecoveryJobResponse),
+        (status = 401, description = "Authentication required", body = crate::error::ErrorBody),
+        (status = 403, description = "Admin required", body = crate::error::ErrorBody),
+        (status = 404, description = "Server not found", body = crate::error::ErrorBody),
+        (status = 409, description = "Recovery cannot be started in the current state", body = crate::error::ErrorBody),
+        (status = 422, description = "Invalid request", body = crate::error::ErrorBody),
+    ),
+    security(
+        ("session_cookie" = []),
+        ("api_key" = []),
+        ("bearer_token" = [])
+    ),
+    tag = "server-recovery"
+)]
+async fn start_recovery_merge(
+    State(state): State<Arc<AppState>>,
+    Path(target_id): Path<String>,
+    Json(request): Json<StartRecoveryRequest>,
+) -> Result<Json<ApiResponse<RecoveryJobResponse>>, AppError> {
+    let sender = state.agent_manager.get_sender(&request.source_server_id);
+    let job =
+        start_recovery_merge_with_sender(&state, &target_id, &request.source_server_id, sender)
+            .await?;
+    broadcast_recovery_update(&state).await;
+    ok(job.into())
+}
+
+async fn start_recovery_merge_with_sender(
+    state: &Arc<AppState>,
+    target_id: &str,
+    source_server_id: &str,
+    sender: Option<mpsc::Sender<ServerMessage>>,
+) -> Result<recovery_job::Model, AppError> {
+    RecoveryMergeService::validate_start_request(state, target_id, source_server_id).await?;
+
+    let sender = sender.ok_or_else(|| {
+        AppError::Conflict("Source server must be online before starting recovery".to_string())
+    })?;
+
+    let txn = state.db.begin().await?;
+    let job = RecoveryMergeService::start_on_txn(&txn, target_id, source_server_id).await?;
+    if let Err(error) =
+        RecoveryMergeService::validate_dispatch_preconditions(state, target_id, source_server_id)
+            .await
+    {
+        txn.rollback().await?;
+        return Err(error);
+    }
+    let token = RecoveryMergeService::rotate_target_token_on_txn(&txn, target_id).await?;
+    txn.commit().await?;
+
+    if let Err(error) = sender
+        .send(ServerMessage::RebindIdentity {
+            job_id: job.job_id.clone(),
+            target_server_id: target_id.to_string(),
+            token,
+        })
+        .await
+    {
+        let message = format!("Failed to dispatch RebindIdentity to source agent: {error}");
+        RecoveryJobService::mark_failed(&state.db, &job.job_id, RECOVERY_STAGE_REBINDING, &message)
+            .await?;
+        return Err(AppError::Internal(format!(
+            "Failed to dispatch RebindIdentity to source agent: {error}"
+        )));
+    }
+    Ok(job)
+}
+
+fn build_candidate_response(
+    target: &server::Model,
+    source: &server::Model,
+) -> RecoveryCandidateResponse {
+    let same_remote_addr = remote_addr_key(target.last_remote_addr.as_deref())
+        .zip(remote_addr_key(source.last_remote_addr.as_deref()))
+        .is_some_and(|(left, right)| left == right);
+    let same_cpu_arch = option_eq(target.cpu_arch.as_deref(), source.cpu_arch.as_deref());
+    let same_os = option_eq(target.os.as_deref(), source.os.as_deref());
+    let same_virtualization = option_eq(
+        target.virtualization.as_deref(),
+        source.virtualization.as_deref(),
+    );
+    let same_country = option_eq(
+        target.country_code.as_deref(),
+        source.country_code.as_deref(),
+    ) || option_eq(target.region.as_deref(), source.region.as_deref());
+    let created_within_minutes = (source.created_at - target.created_at).num_minutes().abs();
+
+    let score = score_candidate(CandidateScoreInput {
+        same_remote_addr,
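// Editor note (worked example of the weights in score_candidate below): a candidate
// matching remote address (40), cpu arch (15), os (15), virtualization (10), and
// region/country (10) that was created within 15 minutes of the target (+20) scores
// the maximum 40 + 15 + 15 + 10 + 10 + 20 = 110; an online server matching nothing
// and created days apart scores 0 and is listed with only the fallback reason.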
+        same_cpu_arch,
+        same_os,
+        same_virtualization,
+        created_within_minutes,
+        same_country,
+    });
+
+    let mut reasons = Vec::new();
+    if same_remote_addr {
+        reasons.push("same remote address".to_string());
+    }
+    if same_cpu_arch {
+        reasons.push("same cpu architecture".to_string());
+    }
+    if same_os {
+        reasons.push("same operating system".to_string());
+    }
+    if same_virtualization {
+        reasons.push("same virtualization".to_string());
+    }
+    if same_country {
+        reasons.push("same region or country".to_string());
+    }
+    if created_within_minutes <= 60 {
+        reasons.push("created close in time".to_string());
+    }
+    if reasons.is_empty() {
+        reasons.push("online replacement candidate".to_string());
+    }
+
+    RecoveryCandidateResponse {
+        server_id: source.id.clone(),
+        name: source.name.clone(),
+        score,
+        reasons,
+    }
+}
+
+fn score_candidate(input: CandidateScoreInput) -> i32 {
+    let mut score = 0;
+
+    if input.same_remote_addr {
+        score += 40;
+    }
+    if input.same_cpu_arch {
+        score += 15;
+    }
+    if input.same_os {
+        score += 15;
+    }
+    if input.same_virtualization {
+        score += 10;
+    }
+    if input.same_country {
+        score += 10;
+    }
+
+    score
+        + match input.created_within_minutes {
+            0..=15 => 20,
+            16..=60 => 12,
+            61..=240 => 4,
+            _ => 0,
+        }
+}
+
+fn option_eq(left: Option<&str>, right: Option<&str>) -> bool {
+    match (left, right) {
+        (Some(left), Some(right)) => left == right,
+        _ => false,
+    }
+}
+
+fn remote_addr_key(value: Option<&str>) -> Option<String> {
+    let value = value?.trim();
+    if value.is_empty() {
+        return None;
+    }
+
+    if let Ok(addr) = SocketAddr::from_str(value) {
+        return Some(addr.ip().to_string());
+    }
+
+    Some(value.to_string())
+}
+
+impl From<recovery_job::Model> for RecoveryJobResponse {
+    fn from(value: recovery_job::Model) -> Self {
+        Self {
+            job_id: value.job_id,
+            target_server_id: value.target_server_id,
+            source_server_id: value.source_server_id,
+            status: RecoveryJobStatus::from(value.status.as_str()),
+            stage: RecoveryJobStage::from(value.stage.as_str()),
+            error: value.error,
+            started_at: value.started_at,
+            created_at: value.created_at,
+            updated_at: value.updated_at,
+            last_heartbeat_at: value.last_heartbeat_at,
+        }
+    }
+}
+
+impl From<&str> for RecoveryJobStatus {
+    fn from(value: &str) -> Self {
+        match value {
+            "running" => Self::Running,
+            "failed" => Self::Failed,
+            "succeeded" => Self::Succeeded,
+            _ => Self::Unknown,
+        }
+    }
+}
+
+impl From<&str> for RecoveryJobStage {
+    fn from(value: &str) -> Self {
+        match value {
+            "validating" => Self::Validating,
+            "rebinding" => Self::Rebinding,
+            "awaiting_target_online" => Self::AwaitingTargetOnline,
+            "freezing_writes" => Self::FreezingWrites,
+            "merging_history" => Self::MergingHistory,
+            "finalizing" => Self::Finalizing,
+            "succeeded" => Self::Succeeded,
+            "failed" => Self::Failed,
+            _ => Self::Unknown,
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::{
+        CandidateScoreInput, RecoveryJobStage, StartRecoveryRequest, score_candidate,
+        start_recovery_merge, start_recovery_merge_with_sender,
+    };
+    use crate::config::AppConfig;
+    use crate::entity::{recovery_job, server};
+    use crate::error::AppError;
+    use crate::service::auth::AuthService;
+    use crate::state::AppState;
+    use crate::test_utils::setup_test_db;
+    use axum::Json;
+    use axum::extract::{Path, State};
+    use chrono::Utc;
+    use sea_orm::{ActiveModelTrait, EntityTrait, Set};
+    use serverbee_common::constants::CAP_DEFAULT;
+    use serverbee_common::protocol::ServerMessage;
+    use std::net::{IpAddr, Ipv4Addr, SocketAddr};
+    use std::sync::Arc;
+    use tokio::sync::mpsc;
+    use
tokio::time::{Duration, timeout}; + + async fn insert_server_with_protocol( + db: &sea_orm::DatabaseConnection, + id: &str, + name: &str, + protocol_version: i32, + ) { + let now = Utc::now(); + let token_hash = AuthService::hash_password("test").unwrap(); + server::ActiveModel { + id: Set(id.to_string()), + token_hash: Set(token_hash), + token_prefix: Set("serverbee_test".to_string()), + name: Set(name.to_string()), + weight: Set(0), + hidden: Set(false), + capabilities: Set(CAP_DEFAULT as i32), + protocol_version: Set(protocol_version), + created_at: Set(now), + updated_at: Set(now), + ..Default::default() + } + .insert(db) + .await + .unwrap(); + } + + async fn insert_server(db: &sea_orm::DatabaseConnection, id: &str, name: &str) { + insert_server_with_protocol(db, id, name, 4).await; + } + + fn test_addr() -> SocketAddr { + SocketAddr::new(IpAddr::V4(Ipv4Addr::new(127, 0, 0, 1)), 9527) + } + + #[test] + fn higher_score_when_ip_arch_and_created_at_match() { + let strong = score_candidate(CandidateScoreInput { + same_remote_addr: true, + same_cpu_arch: true, + same_os: true, + same_virtualization: true, + created_within_minutes: 10, + same_country: true, + }); + let weak = score_candidate(CandidateScoreInput { + same_remote_addr: false, + same_cpu_arch: false, + same_os: true, + same_virtualization: false, + created_within_minutes: 240, + same_country: false, + }); + + assert!(strong > weak); + } + + #[tokio::test] + async fn start_recovery_merge_returns_rebinding_stage() { + let (db, _tmp) = setup_test_db().await; + insert_server(&db, "target-1", "Target").await; + insert_server(&db, "source-1", "Source").await; + let state = AppState::new(db, AppConfig::default()).await.unwrap(); + + let (tx, mut rx) = mpsc::channel(1); + state + .agent_manager + .add_connection("source-1".into(), "Source".into(), tx, test_addr()); + state.agent_manager.set_protocol_version("source-1", 4); + + let Json(response) = start_recovery_merge( + State(Arc::clone(&state)), + Path("target-1".to_string()), + Json(StartRecoveryRequest { + source_server_id: "source-1".to_string(), + }), + ) + .await + .unwrap(); + + assert_eq!(response.data.stage, RecoveryJobStage::Rebinding); + let _message = timeout(Duration::from_secs(1), rx.recv()) + .await + .expect("rebind command should be sent in time") + .expect("rebind command channel should stay open"); + } + + #[tokio::test] + async fn start_recovery_merge_sends_rebind_identity_command() { + let (db, _tmp) = setup_test_db().await; + insert_server(&db, "target-1", "Target").await; + insert_server(&db, "source-1", "Source").await; + let state = AppState::new(db.clone(), AppConfig::default()) + .await + .unwrap(); + + let (tx, mut rx) = mpsc::channel(1); + state + .agent_manager + .add_connection("source-1".into(), "Source".into(), tx, test_addr()); + state.agent_manager.set_protocol_version("source-1", 4); + + let Json(response) = start_recovery_merge( + State(Arc::clone(&state)), + Path("target-1".to_string()), + Json(StartRecoveryRequest { + source_server_id: "source-1".to_string(), + }), + ) + .await + .unwrap(); + + let message = timeout(Duration::from_secs(1), rx.recv()) + .await + .expect("rebind command should be sent in time") + .expect("rebind command channel should stay open"); + let token = match message { + ServerMessage::RebindIdentity { + job_id, + target_server_id, + token, + } => { + assert_eq!(job_id, response.data.job_id); + assert_eq!(target_server_id, "target-1"); + token + } + other => panic!("expected rebind command, got {other:?}"), + }; + + let 
target = server::Entity::find_by_id("target-1") + .one(&db) + .await + .unwrap() + .unwrap(); + assert_eq!(target.token_prefix, token[..8.min(token.len())]); + let validated = AuthService::validate_agent_token(&db, &token) + .await + .unwrap() + .expect("target token should validate"); + assert_eq!(validated.id, "target-1"); + } + + #[tokio::test] + async fn start_recovery_merge_fails_safely_when_sender_missing() { + let (db, _tmp) = setup_test_db().await; + insert_server(&db, "target-1", "Target").await; + insert_server(&db, "source-1", "Source").await; + let state = AppState::new(db.clone(), AppConfig::default()) + .await + .unwrap(); + + let before = server::Entity::find_by_id("target-1") + .one(&db) + .await + .unwrap() + .unwrap(); + + let error = start_recovery_merge_with_sender(&state, "target-1", "source-1", None) + .await + .expect_err("missing sender should fail safely"); + + assert!( + matches!(error, AppError::Conflict(message) if message.contains("Source server must be online")) + ); + + let after = server::Entity::find_by_id("target-1") + .one(&db) + .await + .unwrap() + .unwrap(); + assert_eq!(after.token_prefix, before.token_prefix); + assert_eq!(after.token_hash, before.token_hash); + + let jobs = recovery_job::Entity::find().all(&db).await.unwrap(); + assert!(jobs.is_empty(), "no recovery job should be persisted"); + } + + #[tokio::test] + async fn start_recovery_merge_rejects_unsupported_source_protocol() { + let (db, _tmp) = setup_test_db().await; + insert_server(&db, "target-1", "Target").await; + insert_server_with_protocol(&db, "source-1", "Source", 3).await; + let state = AppState::new(db.clone(), AppConfig::default()) + .await + .unwrap(); + + let (tx, mut rx) = mpsc::channel(1); + state + .agent_manager + .add_connection("source-1".into(), "Source".into(), tx, test_addr()); + state.agent_manager.set_protocol_version("source-1", 3); + + let before = server::Entity::find_by_id("target-1") + .one(&db) + .await + .unwrap() + .unwrap(); + + let error = start_recovery_merge( + State(Arc::clone(&state)), + Path("target-1".to_string()), + Json(StartRecoveryRequest { + source_server_id: "source-1".to_string(), + }), + ) + .await + .expect_err("protocol v3 source should be rejected"); + + assert!( + matches!(error, AppError::Conflict(message) if message.contains("protocol v4+")) + ); + + assert!( + timeout(Duration::from_millis(100), rx.recv()).await.is_err(), + "unsupported source should not receive a rebind dispatch" + ); + + let after = server::Entity::find_by_id("target-1") + .one(&db) + .await + .unwrap() + .unwrap(); + assert_eq!(after.token_prefix, before.token_prefix); + assert_eq!(after.token_hash, before.token_hash); + assert!(recovery_job::Entity::find().all(&db).await.unwrap().is_empty()); + } + + #[tokio::test] + async fn start_recovery_merge_persists_state_before_dispatch_failure() { + let (db, _tmp) = setup_test_db().await; + insert_server(&db, "target-1", "Target").await; + insert_server(&db, "source-1", "Source").await; + let state = AppState::new(db.clone(), AppConfig::default()) + .await + .unwrap(); + + let (tx, rx) = mpsc::channel(1); + drop(rx); + state + .agent_manager + .add_connection("source-1".into(), "Source".into(), tx, test_addr()); + state.agent_manager.set_protocol_version("source-1", 4); + + let before = server::Entity::find_by_id("target-1") + .one(&db) + .await + .unwrap() + .unwrap(); + + let error = start_recovery_merge( + State(Arc::clone(&state)), + Path("target-1".to_string()), + Json(StartRecoveryRequest { + source_server_id: 
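// Editor note on the design this test pins down: the job row and the rotated target
// token are committed *before* RebindIdentity is dispatched, so a failed send leaves
// the new token in place and the job marked failed (via mark_failed) rather than
// rolling back; the asserts below check that token_prefix/token_hash changed and
// that exactly one job row remains.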
"source-1".to_string(), + }), + ) + .await + .expect_err("dispatch failure should fail safely"); + + assert!( + matches!(error, AppError::Internal(ref message) if message.contains("Failed to dispatch RebindIdentity")), + "unexpected error: {error:?}" + ); + + let after = server::Entity::find_by_id("target-1") + .one(&db) + .await + .unwrap() + .unwrap(); + assert_ne!(after.token_prefix, before.token_prefix); + assert_ne!(after.token_hash, before.token_hash); + + let jobs = recovery_job::Entity::find().all(&db).await.unwrap(); + assert_eq!(jobs.len(), 1, "recovery job state should stay committed"); + assert_eq!(jobs[0].target_server_id, "target-1"); + assert_eq!(jobs[0].source_server_id, "source-1"); + } +} diff --git a/crates/server/src/router/ws/agent.rs b/crates/server/src/router/ws/agent.rs index 002c0b85..b9ccd09a 100644 --- a/crates/server/src/router/ws/agent.rs +++ b/crates/server/src/router/ws/agent.rs @@ -18,6 +18,7 @@ use crate::service::auth::AuthService; use crate::service::network_probe::NetworkProbeService; use crate::service::ping::PingService; use crate::service::record::RecordService; +use crate::service::recovery_merge::RecoveryMergeService; use crate::service::server::ServerService; use crate::service::upgrade_tracker::UpgradeLookup; use crate::state::AppState; @@ -399,18 +400,24 @@ async fn handle_agent_message(state: &Arc, server_id: &str, msg: Agent tracing::info!( "Server {server_id} remote address changed: {old_addr} -> {new_addr}" ); - if let Err(e) = AuditService::log( - &state.db, - "system", - "ip_changed", - Some(&format!( - "Remote address changed from {old_addr} to {new_addr} for server {server_id}" - )), - new_addr, - ) - .await - { - tracing::error!("Failed to write audit log for IP change: {e}"); + if state.recovery_lock.writes_allowed_for(server_id) { + if let Err(e) = AuditService::log( + &state.db, + "system", + "ip_changed", + Some(&format!( + "Remote address changed from {old_addr} to {new_addr} for server {server_id}" + )), + new_addr, + ) + .await + { + tracing::error!("Failed to write audit log for IP change: {e}"); + } + } else { + tracing::info!( + "Skipping recovery-frozen IP-change audit write for {server_id}" + ); } } @@ -420,15 +427,19 @@ async fn handle_agent_message(state: &Arc, server_id: &str, msg: Agent let remote_changed = old_remote_addr.as_ref() != current_remote_addr.as_ref(); if ipv4_changed || ipv6_changed || remote_changed { - if let Err(e) = AlertService::check_event_rules( - &state.db, - &state.alert_state_manager, - server_id, - "ip_changed", - ) - .await - { - tracing::error!("Failed to check event rules for IP change: {e}"); + if state.recovery_lock.writes_allowed_for(server_id) { + if let Err(e) = AlertService::check_event_rules( + &state.db, + &state.alert_state_manager, + server_id, + "ip_changed", + ) + .await + { + tracing::error!("Failed to check event rules for IP change: {e}"); + } + } else { + tracing::info!("Skipping recovery-frozen alert evaluation for {server_id}"); } state @@ -445,27 +456,47 @@ async fn handle_agent_message(state: &Arc, server_id: &str, msg: Agent } // Always update last_remote_addr - if let Some(ref addr) = current_remote_addr - && let Err(e) = update_last_remote_addr(&state.db, server_id, addr).await - { - tracing::error!("Failed to update last_remote_addr for {server_id}: {e}"); + if let Some(ref addr) = current_remote_addr { + if state.recovery_lock.writes_allowed_for(server_id) { + if let Err(e) = update_last_remote_addr(&state.db, server_id, addr).await { + tracing::error!( + "Failed to 
update last_remote_addr for {server_id}: {e}" + ); + } + } else { + tracing::info!( + "Skipping recovery-frozen system-info write for {server_id}" + ); + } } } - if let Err(e) = - ServerService::update_system_info(&state.db, server_id, &info, region, country_code) - .await - { - tracing::error!("Failed to update system info for {server_id}: {e}"); + if state.recovery_lock.writes_allowed_for(server_id) { + if let Err(e) = ServerService::update_system_info( + &state.db, + server_id, + &info, + region, + country_code, + ) + .await + { + tracing::error!("Failed to update system info for {server_id}: {e}"); + } + } else { + tracing::info!("Skipping recovery-frozen system-info write for {server_id}"); } - // Persist and cache features from SystemInfo - let _ = crate::service::server::ServerService::update_features( - &state.db, - server_id, - &info.features, - ) - .await; + if state.recovery_lock.writes_allowed_for(server_id) { + let _ = crate::service::server::ServerService::update_features( + &state.db, + server_id, + &info.features, + ) + .await; + } else { + tracing::info!("Skipping recovery-frozen system-info write for {server_id}"); + } state .agent_manager .update_features(server_id, info.features.clone()); @@ -534,10 +565,15 @@ async fn handle_agent_message(state: &Arc, server_id: &str, msg: Agent } AgentMessage::Report(report) => { // Save GPU records if present - if let Some(ref gpu) = report.gpu - && let Err(e) = RecordService::save_gpu_records(&state.db, server_id, gpu).await - { - tracing::error!("Failed to save GPU records for {server_id}: {e}"); + if let Some(ref gpu) = report.gpu { + if state.recovery_lock.writes_allowed_for(server_id) { + if let Err(e) = RecordService::save_gpu_records(&state.db, server_id, gpu).await + { + tracing::error!("Failed to save GPU records for {server_id}: {e}"); + } + } else { + tracing::info!("Skipping recovery-frozen report write for {server_id}"); + } } state.agent_manager.update_report(server_id, report); } @@ -552,12 +588,20 @@ async fn handle_agent_message(state: &Arc, server_id: &str, msg: Agent ); if !dispatched { // No waiter — one-shot task, save directly - if let Err(e) = save_task_result(&state.db, server_id, &result).await { - tracing::error!("Failed to save task result for {server_id}: {e}"); + if state.recovery_lock.writes_allowed_for(server_id) { + if let Err(e) = save_task_result(&state.db, server_id, &result).await { + tracing::error!("Failed to save task result for {server_id}: {e}"); + } + } else { + tracing::info!("Skipping recovery-frozen task-result write for {server_id}"); } } - if let Err(e) = audit_exec_finished(state, server_id, &result).await { - tracing::error!("Failed to write exec_finished audit log for {server_id}: {e}"); + if state.recovery_lock.writes_allowed_for(server_id) { + if let Err(e) = audit_exec_finished(state, server_id, &result).await { + tracing::error!("Failed to write exec_finished audit log for {server_id}: {e}"); + } + } else { + tracing::info!("Skipping recovery-frozen exec audit write for {server_id}"); } // Send Ack if let Some(tx) = state.agent_manager.get_sender(server_id) { @@ -598,8 +642,12 @@ async fn handle_agent_message(state: &Arc, server_id: &str, msg: Agent } } AgentMessage::PingResult(result) => { - if let Err(e) = save_ping_result(&state.db, server_id, &result).await { - tracing::error!("Failed to save ping result for {server_id}: {e}"); + if state.recovery_lock.writes_allowed_for(server_id) { + if let Err(e) = save_ping_result(&state.db, server_id, &result).await { + 
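// Editor note: this is the same write-freeze guard applied throughout this handler —
// while a recovery job is merging history, state.recovery_lock.writes_allowed_for()
// gates every DB write for the affected server (audit logs, alert evaluation, system
// info, task/ping/probe results); frozen servers still update in-memory state, and
// each skipped write logs a "recovery-frozen" info line.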
tracing::error!("Failed to save ping result for {server_id}: {e}"); + } + } else { + tracing::info!("Skipping recovery-frozen ping write for {server_id}"); } } AgentMessage::TerminalOutput { session_id, data } => { @@ -651,21 +699,27 @@ async fn handle_agent_message(state: &Arc, server_id: &str, msg: Agent }, ); if !dispatched { - use crate::entity::task_result; - use sea_orm::{ActiveModelTrait, NotSet, Set}; - let result = task_result::ActiveModel { - id: NotSet, - task_id: Set(task_id.clone()), - server_id: Set(server_id.to_string()), - output: Set(capability_denied_output(&capability, reason)), - exit_code: Set(-2), - run_id: Set(None), - attempt: Set(1), - started_at: Set(None), - finished_at: Set(chrono::Utc::now()), - }; - if let Err(e) = result.insert(&state.db).await { - tracing::error!("Failed to write CapabilityDenied task result: {e}"); + if state.recovery_lock.writes_allowed_for(server_id) { + use crate::entity::task_result; + use sea_orm::{ActiveModelTrait, NotSet, Set}; + let result = task_result::ActiveModel { + id: NotSet, + task_id: Set(task_id.clone()), + server_id: Set(server_id.to_string()), + output: Set(capability_denied_output(&capability, reason)), + exit_code: Set(-2), + run_id: Set(None), + attempt: Set(1), + started_at: Set(None), + finished_at: Set(chrono::Utc::now()), + }; + if let Err(e) = result.insert(&state.db).await { + tracing::error!("Failed to write CapabilityDenied task result: {e}"); + } + } else { + tracing::info!( + "Skipping recovery-frozen capability-denied write for {server_id}" + ); } } } @@ -687,8 +741,14 @@ async fn handle_agent_message(state: &Arc, server_id: &str, msg: Agent server_id: server_id.to_string(), results: results.clone(), }); - if let Err(e) = NetworkProbeService::save_results(&state.db, server_id, results).await { - tracing::error!("Failed to save network probe results for {server_id}: {e}"); + if state.recovery_lock.writes_allowed_for(server_id) { + if let Err(e) = + NetworkProbeService::save_results(&state.db, server_id, results).await + { + tracing::error!("Failed to save network probe results for {server_id}: {e}"); + } + } else { + tracing::info!("Skipping recovery-frozen network probe write for {server_id}"); } } // File management control responses — relay to pending HTTP requests @@ -844,6 +904,53 @@ async fn handle_agent_message(state: &Arc, server_id: &str, msg: Agent AgentMessage::Pong => { // Agent responded to our protocol-level Ping; already handled by WS Pong frames } + AgentMessage::RebindIdentityAck { job_id } => { + match RecoveryMergeService::handle_rebind_ack(state, &job_id, server_id).await { + Ok(change) => { + if change.transitioned { + tracing::info!( + "Applied RebindIdentityAck from agent {server_id} for job_id={job_id}, stage={}", + change.job.stage + ); + crate::router::ws::browser::broadcast_recovery_update(state).await; + } else { + tracing::info!( + "Ignoring stale RebindIdentityAck from agent {server_id} for job_id={job_id}, stage={}", + change.job.stage + ); + } + } + Err(error) => { + tracing::warn!( + "Failed to apply RebindIdentityAck from agent {server_id} for job_id={job_id}: {error}" + ); + } + } + } + AgentMessage::RebindIdentityFailed { job_id, error } => { + match RecoveryMergeService::handle_rebind_failure(state, &job_id, server_id, &error) + .await + { + Ok(change) => { + if change.transitioned { + tracing::warn!( + "Recorded RebindIdentityFailed from agent {server_id} for job_id={job_id}: {error}" + ); + crate::router::ws::browser::broadcast_recovery_update(state).await; + } else { + 
tracing::info!( + "Ignoring stale RebindIdentityFailed from agent {server_id} for job_id={job_id}, stage={}", + change.job.stage + ); + } + } + Err(mark_error) => { + tracing::warn!( + "Failed to record RebindIdentityFailed from agent {server_id} for job_id={job_id}: {mark_error}" + ); + } + } + } // Docker variants AgentMessage::DockerInfo { @@ -912,8 +1019,13 @@ async fn handle_agent_message(state: &Arc, server_id: &str, msg: Agent } } AgentMessage::DockerEvent { event } => { - let _ = crate::service::docker::DockerService::save_event(&state.db, server_id, &event) - .await; + if state.recovery_lock.writes_allowed_for(server_id) { + let _ = + crate::service::docker::DockerService::save_event(&state.db, server_id, &event) + .await; + } else { + tracing::info!("Skipping recovery-frozen docker event write for {server_id}"); + } state .agent_manager .broadcast_browser(BrowserMessage::DockerEvent { @@ -931,10 +1043,14 @@ async fn handle_agent_message(state: &Arc, server_id: &str, msg: Agent } } AgentMessage::FeaturesUpdate { ref features } => { - let _ = crate::service::server::ServerService::update_features( - &state.db, server_id, features, - ) - .await; + if state.recovery_lock.writes_allowed_for(server_id) { + let _ = crate::service::server::ServerService::update_features( + &state.db, server_id, features, + ) + .await; + } else { + tracing::info!("Skipping recovery-frozen features write for {server_id}"); + } state .agent_manager .update_features(server_id, features.clone()); @@ -966,10 +1082,17 @@ async fn handle_agent_message(state: &Arc, server_id: &str, msg: Agent let ipv6_changed = old_ipv6 != ipv6; if ipv4_changed || ipv6_changed { - // Update ipv4/ipv6 in DB - if let Err(e) = update_server_ips(&state.db, server_id, &ipv4, &ipv6).await - { - tracing::error!("Failed to update IPs for {server_id}: {e}"); + if state.recovery_lock.writes_allowed_for(server_id) { + // Update ipv4/ipv6 in DB + if let Err(e) = + update_server_ips(&state.db, server_id, &ipv4, &ipv6).await + { + tracing::error!("Failed to update IPs for {server_id}: {e}"); + } + } else { + tracing::info!( + "Skipping recovery-frozen IP update write for {server_id}" + ); } // Re-run GeoIP lookup based on the new IPs @@ -982,16 +1105,25 @@ async fn handle_agent_message(state: &Arc, server_id: &str, msg: Agent let guard = state.geoip.read().unwrap(); guard.as_ref().map(|g| g.lookup(ip)) }; - if let Some(geo) = geo - && let Err(e) = update_server_geo( - &state.db, - server_id, - geo.region, - geo.country_code, - ) - .await - { - tracing::error!("Failed to update GeoIP for {server_id}: {e}"); + if let Some(geo) = geo { + if state.recovery_lock.writes_allowed_for(server_id) { + if let Err(e) = update_server_geo( + &state.db, + server_id, + geo.region, + geo.country_code, + ) + .await + { + tracing::error!( + "Failed to update GeoIP for {server_id}: {e}" + ); + } + } else { + tracing::info!( + "Skipping recovery-frozen GeoIP write for {server_id}" + ); + } } } @@ -1006,27 +1138,39 @@ async fn handle_agent_message(state: &Arc, server_id: &str, msg: Agent .get_remote_addr(server_id) .map(|a| a.ip().to_string()) .unwrap_or_default(); - if let Err(e) = AuditService::log( - &state.db, - "system", - "ip_changed", - Some(&detail), - &remote_ip, - ) - .await - { - tracing::error!("Failed to write audit log for IP change: {e}"); + if state.recovery_lock.writes_allowed_for(server_id) { + if let Err(e) = AuditService::log( + &state.db, + "system", + "ip_changed", + Some(&detail), + &remote_ip, + ) + .await + { + tracing::error!("Failed to 
write audit log for IP change: {e}"); + } + } else { + tracing::info!( + "Skipping recovery-frozen IP-change audit write for {server_id}" + ); } - if let Err(e) = AlertService::check_event_rules( - &state.db, - &state.alert_state_manager, - server_id, - "ip_changed", - ) - .await - { - tracing::error!("Failed to check event rules for IP change: {e}"); + if state.recovery_lock.writes_allowed_for(server_id) { + if let Err(e) = AlertService::check_event_rules( + &state.db, + &state.alert_state_manager, + server_id, + "ip_changed", + ) + .await + { + tracing::error!("Failed to check event rules for IP change: {e}"); + } + } else { + tracing::info!( + "Skipping recovery-frozen alert evaluation for {server_id}" + ); } state @@ -1270,13 +1414,66 @@ async fn update_server_geo( mod tests { use super::*; use crate::config::AppConfig; + use crate::entity::{recovery_job, server}; + use crate::service::auth::AuthService; use crate::test_utils::setup_test_db; + use chrono::Utc; + use sea_orm::{ActiveModelTrait, EntityTrait, Set}; + use serverbee_common::constants::CAP_DEFAULT; + use serverbee_common::protocol::{BrowserMessage, RecoveryJobStage}; use std::net::{IpAddr, Ipv4Addr}; + use tokio::time::{Duration, timeout}; fn test_addr() -> SocketAddr { SocketAddr::new(IpAddr::V4(Ipv4Addr::new(127, 0, 0, 1)), 8080) } + async fn insert_server(db: &sea_orm::DatabaseConnection, id: &str, name: &str) { + let now = Utc::now(); + let token_hash = AuthService::hash_password("test").unwrap(); + server::ActiveModel { + id: Set(id.to_string()), + token_hash: Set(token_hash), + token_prefix: Set("serverbee_test".to_string()), + name: Set(name.to_string()), + weight: Set(0), + hidden: Set(false), + capabilities: Set(CAP_DEFAULT as i32), + protocol_version: Set(1), + created_at: Set(now), + updated_at: Set(now), + ..Default::default() + } + .insert(db) + .await + .unwrap(); + } + + async fn insert_recovery_job( + db: &sea_orm::DatabaseConnection, + job_id: &str, + target_server_id: &str, + source_server_id: &str, + ) { + let now = Utc::now(); + recovery_job::ActiveModel { + job_id: Set(job_id.to_string()), + target_server_id: Set(target_server_id.to_string()), + source_server_id: Set(source_server_id.to_string()), + status: Set("running".to_string()), + stage: Set("rebinding".to_string()), + checkpoint_json: Set(None), + error: Set(None), + started_at: Set(now), + created_at: Set(now), + updated_at: Set(now), + last_heartbeat_at: Set(Some(now)), + } + .insert(db) + .await + .unwrap(); + } + #[tokio::test] async fn current_connection_frame_handler_waits_for_server_lock() { let (db, _tmp) = setup_test_db().await; @@ -1340,4 +1537,199 @@ mod tests { .is_current_connection("s1", second_connection_id) ); } + + #[tokio::test] + async fn rebind_identity_ack_advances_job_and_broadcasts_recovery_update() { + let (db, _tmp) = setup_test_db().await; + insert_server(&db, "target-1", "Target").await; + insert_server(&db, "source-1", "Source").await; + insert_recovery_job(&db, "job-1", "target-1", "source-1").await; + let state = AppState::new(db.clone(), AppConfig::default()) + .await + .unwrap(); + let mut browser_rx = state.browser_tx.subscribe(); + + handle_agent_message( + &state, + "source-1", + AgentMessage::RebindIdentityAck { + job_id: "job-1".to_string(), + }, + ) + .await; + + let job = recovery_job::Entity::find_by_id("job-1") + .one(&db) + .await + .unwrap() + .unwrap(); + assert_eq!(job.stage, "awaiting_target_online"); + + let msg = browser_rx.recv().await.unwrap(); + match msg { + BrowserMessage::Update { + recoveries: 
Some(recoveries), + .. + } => { + assert_eq!(recoveries.len(), 1); + assert_eq!(recoveries[0].job_id, "job-1"); + assert_eq!(recoveries[0].stage, RecoveryJobStage::AwaitingTargetOnline); + } + other => panic!("expected recovery update, got {other:?}"), + } + } + + #[tokio::test] + async fn rebind_identity_failed_marks_job_failed_and_broadcasts_recovery_snapshot() { + let (db, _tmp) = setup_test_db().await; + insert_server(&db, "target-1", "Target").await; + insert_server(&db, "source-1", "Source").await; + insert_recovery_job(&db, "job-1", "target-1", "source-1").await; + let state = AppState::new(db.clone(), AppConfig::default()) + .await + .unwrap(); + let mut browser_rx = state.browser_tx.subscribe(); + + handle_agent_message( + &state, + "source-1", + AgentMessage::RebindIdentityFailed { + job_id: "job-1".to_string(), + error: "agent failed".to_string(), + }, + ) + .await; + + let job = recovery_job::Entity::find_by_id("job-1") + .one(&db) + .await + .unwrap() + .unwrap(); + assert_eq!(job.status, "failed"); + assert_eq!(job.stage, "rebinding"); + assert_eq!(job.error.as_deref(), Some("agent failed")); + + let msg = browser_rx.recv().await.unwrap(); + match msg { + BrowserMessage::Update { + recoveries: Some(recoveries), + .. + } => { + assert_eq!(recoveries.len(), 1); + assert_eq!(recoveries[0].job_id, "job-1"); + assert_eq!( + recoveries[0].status, + serverbee_common::protocol::RecoveryJobStatus::Failed + ); + assert_eq!(recoveries[0].stage, RecoveryJobStage::Rebinding); + assert_eq!(recoveries[0].error.as_deref(), Some("agent failed")); + } + other => panic!("expected recovery update, got {other:?}"), + } + } + + #[tokio::test] + async fn stale_rebind_identity_ack_does_not_broadcast_recovery_update() { + let (db, _tmp) = setup_test_db().await; + insert_server(&db, "target-1", "Target").await; + insert_server(&db, "source-1", "Source").await; + insert_recovery_job(&db, "job-1", "target-1", "source-1").await; + let state = AppState::new(db.clone(), AppConfig::default()) + .await + .unwrap(); + let mut browser_rx = state.browser_tx.subscribe(); + + RecoveryMergeService::handle_rebind_ack(&state, "job-1", "source-1") + .await + .unwrap(); + + handle_agent_message( + &state, + "source-1", + AgentMessage::RebindIdentityAck { + job_id: "job-1".to_string(), + }, + ) + .await; + + let job = recovery_job::Entity::find_by_id("job-1") + .one(&db) + .await + .unwrap() + .unwrap(); + assert_eq!(job.stage, "awaiting_target_online"); + + assert!(timeout(Duration::from_millis(50), browser_rx.recv()).await.is_err()); + } + + #[tokio::test] + async fn wrong_source_rebind_identity_failure_does_not_broadcast_recovery_update() { + let (db, _tmp) = setup_test_db().await; + insert_server(&db, "target-1", "Target").await; + insert_server(&db, "source-1", "Source").await; + insert_server(&db, "source-2", "Other Source").await; + insert_recovery_job(&db, "job-1", "target-1", "source-1").await; + let state = AppState::new(db.clone(), AppConfig::default()) + .await + .unwrap(); + let mut browser_rx = state.browser_tx.subscribe(); + + handle_agent_message( + &state, + "source-2", + AgentMessage::RebindIdentityFailed { + job_id: "job-1".to_string(), + error: "wrong source".to_string(), + }, + ) + .await; + + let job = recovery_job::Entity::find_by_id("job-1") + .one(&db) + .await + .unwrap() + .unwrap(); + assert_eq!(job.status, "running"); + assert_eq!(job.stage, "rebinding"); + assert_eq!(job.error, None); + + assert!(timeout(Duration::from_millis(50), browser_rx.recv()).await.is_err()); + } + + #[tokio::test] 
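+    // First failure wins: once handle_rebind_failure has recorded an error for
+    // the job, a later RebindIdentityFailed for the same job must leave the
+    // stored error untouched and must not re-broadcast to browsers.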
+ async fn stale_rebind_identity_failure_does_not_broadcast_recovery_update() { + let (db, _tmp) = setup_test_db().await; + insert_server(&db, "target-1", "Target").await; + insert_server(&db, "source-1", "Source").await; + insert_recovery_job(&db, "job-1", "target-1", "source-1").await; + let state = AppState::new(db.clone(), AppConfig::default()) + .await + .unwrap(); + let mut browser_rx = state.browser_tx.subscribe(); + + RecoveryMergeService::handle_rebind_failure(&state, "job-1", "source-1", "first failure") + .await + .unwrap(); + + handle_agent_message( + &state, + "source-1", + AgentMessage::RebindIdentityFailed { + job_id: "job-1".to_string(), + error: "stale failure".to_string(), + }, + ) + .await; + + let job = recovery_job::Entity::find_by_id("job-1") + .one(&db) + .await + .unwrap() + .unwrap(); + assert_eq!(job.status, "failed"); + assert_eq!(job.stage, "rebinding"); + assert_eq!(job.error.as_deref(), Some("first failure")); + + assert!(timeout(Duration::from_millis(50), browser_rx.recv()).await.is_err()); + } } diff --git a/crates/server/src/router/ws/browser.rs b/crates/server/src/router/ws/browser.rs index 270fbba7..0c086c1c 100644 --- a/crates/server/src/router/ws/browser.rs +++ b/crates/server/src/router/ws/browser.rs @@ -7,13 +7,18 @@ use axum::http::HeaderMap; use axum::response::{IntoResponse, Response}; use axum::routing::get; use futures_util::{SinkExt, StreamExt}; +use sea_orm::EntityTrait; +use crate::entity::recovery_job; use crate::service::agent_manager::aggregate_disk_io; use crate::service::auth::AuthService; use crate::service::server::ServerService; use crate::state::AppState; use serverbee_common::constants::MAX_WS_MESSAGE_SIZE; -use serverbee_common::protocol::{BrowserClientMessage, BrowserMessage, ServerMessage}; +use serverbee_common::protocol::{ + BrowserClientMessage, BrowserMessage, RecoveryJobDto, RecoveryJobStage, RecoveryJobStatus, + ServerMessage, +}; use serverbee_common::types::ServerStatus; pub fn router() -> Router> { @@ -28,9 +33,9 @@ async fn browser_ws_handler( // Validate auth: try session cookie first, then API key, then Bearer token let auth = validate_browser_auth(&state, &headers).await; match auth { - Some((_user_id, mobile_expires)) => ws + Some((_user_id, is_admin, mobile_expires)) => ws .max_message_size(MAX_WS_MESSAGE_SIZE) - .on_upgrade(move |socket| handle_browser_ws(socket, state, mobile_expires)), + .on_upgrade(move |socket| handle_browser_ws(socket, state, is_admin, mobile_expires)), None => axum::http::StatusCode::UNAUTHORIZED.into_response(), } } @@ -43,20 +48,20 @@ async fn browser_ws_handler( async fn validate_browser_auth( state: &Arc, headers: &HeaderMap, -) -> Option<(String, Option>)> { +) -> Option<(String, bool, Option>)> { // Try session cookie (always web source → no mobile expiry) if let Some(token) = extract_session_cookie(headers) && let Ok(Some((user, _session))) = AuthService::validate_session(&state.db, &token, state.config.auth.session_ttl).await { - return Some((user.id, None)); + return Some((user.id, user.role == "admin", None)); } // Try API key header (no expiry) if let Some(key) = extract_api_key(headers) && let Ok(Some(user)) = AuthService::validate_api_key(&state.db, &key).await { - return Some((user.id, None)); + return Some((user.id, user.role == "admin", None)); } // Try Bearer token (may be a mobile session with a fixed expiry) @@ -69,7 +74,7 @@ async fn validate_browser_auth( } else { None }; - return Some((user.id, mobile_expires)); + return Some((user.id, user.role == "admin", 
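+        // The role is collapsed to a bool at auth time so the WS send loop can
+        // filter recovery payloads per message without another user lookup; all
+        // three auth paths (cookie, API key, Bearer) derive it the same way.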
mobile_expires)); } None @@ -107,6 +112,7 @@ fn extract_bearer_token(headers: &HeaderMap) -> Option { async fn handle_browser_ws( socket: WebSocket, state: Arc, + is_admin: bool, mobile_expires: Option>, ) { let (mut ws_sink, mut ws_stream) = socket.split(); @@ -114,7 +120,7 @@ async fn handle_browser_ws( let connection_id = uuid::Uuid::new_v4().to_string(); // Build FullSync message from DB servers + agent_manager online/report data - let full_sync = build_full_sync(&state).await; + let full_sync = build_full_sync(&state, is_admin).await; if let Err(e) = send_browser_message(&mut ws_sink, &full_sync).await { tracing::error!("Failed to send FullSync to browser: {e}"); return; @@ -131,7 +137,10 @@ async fn handle_browser_ws( msg = browser_rx.recv() => { match msg { Ok(browser_msg) => { - if let Err(e) = send_browser_message(&mut ws_sink, &browser_msg).await { + let filtered = filter_browser_message(browser_msg, is_admin); + if let Some(filtered) = filtered + && let Err(e) = send_browser_message(&mut ws_sink, &filtered).await + { tracing::debug!("Failed to send to browser WS: {e}"); break; } @@ -139,7 +148,7 @@ async fn handle_browser_ws( Err(tokio::sync::broadcast::error::RecvError::Lagged(n)) => { tracing::warn!("Browser WS lagged by {n} messages, sending full resync"); // On lag, send a full resync - let resync = build_full_sync(&state).await; + let resync = build_full_sync(&state, is_admin).await; if let Err(e) = send_browser_message(&mut ws_sink, &resync).await { tracing::debug!("Failed to send resync to browser WS: {e}"); break; @@ -248,7 +257,12 @@ async fn handle_browser_client_message( } } -async fn build_full_sync(state: &Arc) -> BrowserMessage { +async fn build_full_sync(state: &Arc, is_admin: bool) -> BrowserMessage { + let recoveries = if is_admin { + recovery_snapshot(state).await.unwrap_or_default() + } else { + Vec::new() + }; let servers = match ServerService::list_servers(&state.db).await { Ok(servers) => servers, Err(e) => { @@ -256,6 +270,7 @@ async fn build_full_sync(state: &Arc) -> BrowserMessage { return BrowserMessage::FullSync { servers: Vec::new(), upgrades: state.upgrade_tracker.snapshot(), + recoveries, }; } }; @@ -355,6 +370,50 @@ async fn build_full_sync(state: &Arc) -> BrowserMessage { BrowserMessage::FullSync { servers: statuses, upgrades: state.upgrade_tracker.snapshot(), + recoveries, + } +} + +pub(crate) async fn recovery_snapshot(state: &Arc) -> Option> { + match recovery_job::Entity::find().all(&state.db).await { + Ok(jobs) => Some(jobs.into_iter().map(Into::into).collect()), + Err(e) => { + tracing::error!("Failed to list recovery jobs for browser sync: {e}"); + None + } + } +} + +pub(crate) async fn broadcast_recovery_update(state: &Arc) { + let Some(recoveries) = recovery_snapshot(state).await else { + return; + }; + let _ = state.browser_tx.send(BrowserMessage::Update { + servers: Vec::new(), + recoveries: Some(recoveries), + }); +} + +fn filter_browser_message(msg: BrowserMessage, is_admin: bool) -> Option { + if is_admin { + return Some(msg); + } + + match msg { + BrowserMessage::FullSync { + servers, + upgrades, + .. + } => Some(BrowserMessage::FullSync { + servers, + upgrades, + recoveries: Vec::new(), + }), + BrowserMessage::Update { servers, .. 
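+        // Non-admins share the same broadcast channel, so recovery data is
+        // stripped per connection at send time: FullSync keeps servers and
+        // upgrades but empties `recoveries`, while Update drops the snapshot
+        // entirely (None, indistinguishable from an update that simply carried
+        // no recovery data).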
} => Some(BrowserMessage::Update { + servers, + recoveries: None, + }), + other => Some(other), } } @@ -365,3 +424,294 @@ async fn send_browser_message( let text = serde_json::to_string(msg).map_err(axum::Error::new)?; sink.send(Message::Text(text.into())).await } + +impl From for RecoveryJobDto { + fn from(value: recovery_job::Model) -> Self { + Self { + job_id: value.job_id, + target_server_id: value.target_server_id, + source_server_id: value.source_server_id, + status: recovery_job_status_from_str(&value.status), + stage: recovery_job_stage_from_str(&value.stage), + error: value.error, + started_at: value.started_at, + created_at: value.created_at, + updated_at: value.updated_at, + last_heartbeat_at: value.last_heartbeat_at, + } + } +} + +fn recovery_job_status_from_str(value: &str) -> RecoveryJobStatus { + match value { + "running" => RecoveryJobStatus::Running, + "failed" => RecoveryJobStatus::Failed, + "succeeded" => RecoveryJobStatus::Succeeded, + _ => RecoveryJobStatus::Unknown, + } +} + +fn recovery_job_stage_from_str(value: &str) -> RecoveryJobStage { + match value { + "validating" => RecoveryJobStage::Validating, + "rebinding" => RecoveryJobStage::Rebinding, + "awaiting_target_online" => RecoveryJobStage::AwaitingTargetOnline, + "freezing_writes" => RecoveryJobStage::FreezingWrites, + "merging_history" => RecoveryJobStage::MergingHistory, + "finalizing" => RecoveryJobStage::Finalizing, + "succeeded" => RecoveryJobStage::Succeeded, + "failed" => RecoveryJobStage::Failed, + _ => RecoveryJobStage::Unknown, + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::config::AppConfig; + use crate::entity::server; + use crate::service::auth::AuthService; + use crate::test_utils::setup_test_db; + use chrono::Utc; + use sea_orm::{ActiveModelTrait, Set}; + use serverbee_common::constants::CAP_DEFAULT; + + async fn insert_server(db: &sea_orm::DatabaseConnection, id: &str, name: &str) { + let now = Utc::now(); + let token_hash = AuthService::hash_password("test").unwrap(); + server::ActiveModel { + id: Set(id.to_string()), + token_hash: Set(token_hash), + token_prefix: Set("serverbee_test".to_string()), + name: Set(name.to_string()), + weight: Set(0), + hidden: Set(false), + capabilities: Set(CAP_DEFAULT as i32), + protocol_version: Set(1), + created_at: Set(now), + updated_at: Set(now), + ..Default::default() + } + .insert(db) + .await + .unwrap(); + } + + #[tokio::test] + async fn full_sync_includes_running_recoveries() { + let (db, _tmp) = setup_test_db().await; + insert_server(&db, "target-1", "Target").await; + insert_server(&db, "source-1", "Source").await; + let state = AppState::new(db.clone(), AppConfig::default()) + .await + .unwrap(); + + let now = Utc::now(); + recovery_job::ActiveModel { + job_id: Set("job-1".to_string()), + target_server_id: Set("target-1".to_string()), + source_server_id: Set("source-1".to_string()), + status: Set("running".to_string()), + stage: Set("rebinding".to_string()), + checkpoint_json: Set(None), + error: Set(None), + started_at: Set(now), + created_at: Set(now), + updated_at: Set(now), + last_heartbeat_at: Set(Some(now)), + } + .insert(&db) + .await + .unwrap(); + + let message = build_full_sync(&state, true).await; + + match message { + BrowserMessage::FullSync { recoveries, .. 
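+            // The From<recovery_job::Model> mapping above decodes status/stage
+            // strings defensively: anything unrecognized maps to Unknown, so a
+            // hypothetical future stage string (say "verifying") renders as
+            // RecoveryJobStage::Unknown on an older dashboard instead of
+            // failing the whole sync.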
} => { + assert_eq!(recoveries.len(), 1); + assert_eq!(recoveries[0].job_id, "job-1"); + assert_eq!(recoveries[0].stage, RecoveryJobStage::Rebinding); + assert_eq!(recoveries[0].status, RecoveryJobStatus::Running); + } + other => panic!("expected full sync, got {other:?}"), + } + } + + #[tokio::test] + async fn full_sync_includes_terminal_recovery_states() { + let (db, _tmp) = setup_test_db().await; + insert_server(&db, "target-1", "Target").await; + insert_server(&db, "source-1", "Source").await; + let state = AppState::new(db.clone(), AppConfig::default()) + .await + .unwrap(); + + let now = Utc::now(); + recovery_job::ActiveModel { + job_id: Set("job-failed".to_string()), + target_server_id: Set("target-1".to_string()), + source_server_id: Set("source-1".to_string()), + status: Set("failed".to_string()), + stage: Set("failed".to_string()), + checkpoint_json: Set(None), + error: Set(Some("boom".to_string())), + started_at: Set(now), + created_at: Set(now), + updated_at: Set(now), + last_heartbeat_at: Set(Some(now)), + } + .insert(&db) + .await + .unwrap(); + recovery_job::ActiveModel { + job_id: Set("job-succeeded".to_string()), + target_server_id: Set("target-1".to_string()), + source_server_id: Set("source-1".to_string()), + status: Set("succeeded".to_string()), + stage: Set("succeeded".to_string()), + checkpoint_json: Set(None), + error: Set(None), + started_at: Set(now), + created_at: Set(now), + updated_at: Set(now), + last_heartbeat_at: Set(Some(now)), + } + .insert(&db) + .await + .unwrap(); + + let message = build_full_sync(&state, true).await; + + match message { + BrowserMessage::FullSync { recoveries, .. } => { + assert_eq!(recoveries.len(), 2); + assert!(recoveries.iter().any(|job| { + job.job_id == "job-failed" + && job.status == RecoveryJobStatus::Failed + && job.stage == RecoveryJobStage::Failed + })); + assert!(recoveries.iter().any(|job| { + job.job_id == "job-succeeded" + && job.status == RecoveryJobStatus::Succeeded + && job.stage == RecoveryJobStage::Succeeded + })); + } + other => panic!("expected full sync, got {other:?}"), + } + } + + #[tokio::test] + async fn full_sync_hides_recoveries_for_non_admin() { + let (db, _tmp) = setup_test_db().await; + insert_server(&db, "target-1", "Target").await; + insert_server(&db, "source-1", "Source").await; + let state = AppState::new(db.clone(), AppConfig::default()) + .await + .unwrap(); + + let now = Utc::now(); + recovery_job::ActiveModel { + job_id: Set("job-1".to_string()), + target_server_id: Set("target-1".to_string()), + source_server_id: Set("source-1".to_string()), + status: Set("running".to_string()), + stage: Set("rebinding".to_string()), + checkpoint_json: Set(None), + error: Set(None), + started_at: Set(now), + created_at: Set(now), + updated_at: Set(now), + last_heartbeat_at: Set(Some(now)), + } + .insert(&db) + .await + .unwrap(); + + let message = build_full_sync(&state, false).await; + + match message { + BrowserMessage::FullSync { recoveries, .. 
} => assert!(recoveries.is_empty()), + other => panic!("expected full sync, got {other:?}"), + } + } + + #[tokio::test] + async fn broadcast_recovery_update_includes_terminal_recovery_states() { + let (db, _tmp) = setup_test_db().await; + insert_server(&db, "target-1", "Target").await; + insert_server(&db, "source-1", "Source").await; + let state = AppState::new(db.clone(), AppConfig::default()) + .await + .unwrap(); + let mut browser_rx = state.browser_tx.subscribe(); + + let now = Utc::now(); + recovery_job::ActiveModel { + job_id: Set("job-failed".to_string()), + target_server_id: Set("target-1".to_string()), + source_server_id: Set("source-1".to_string()), + status: Set("failed".to_string()), + stage: Set("failed".to_string()), + checkpoint_json: Set(None), + error: Set(Some("boom".to_string())), + started_at: Set(now), + created_at: Set(now), + updated_at: Set(now), + last_heartbeat_at: Set(Some(now)), + } + .insert(&db) + .await + .unwrap(); + + broadcast_recovery_update(&state).await; + + let message = browser_rx.recv().await.unwrap(); + match message { + BrowserMessage::Update { + recoveries: Some(recoveries), + .. + } => { + assert_eq!(recoveries.len(), 1); + assert_eq!(recoveries[0].job_id, "job-failed"); + assert_eq!(recoveries[0].status, RecoveryJobStatus::Failed); + assert_eq!(recoveries[0].stage, RecoveryJobStage::Failed); + } + other => panic!("expected update with recoveries, got {other:?}"), + } + } + + #[tokio::test] + async fn full_sync_strips_recoveries_for_non_admin() { + let (db, _tmp) = setup_test_db().await; + insert_server(&db, "target-1", "Target").await; + insert_server(&db, "source-1", "Source").await; + let state = AppState::new(db.clone(), AppConfig::default()) + .await + .unwrap(); + + let now = Utc::now(); + recovery_job::ActiveModel { + job_id: Set("job-1".to_string()), + target_server_id: Set("target-1".to_string()), + source_server_id: Set("source-1".to_string()), + status: Set("running".to_string()), + stage: Set("rebinding".to_string()), + checkpoint_json: Set(None), + error: Set(None), + started_at: Set(now), + created_at: Set(now), + updated_at: Set(now), + last_heartbeat_at: Set(Some(now)), + } + .insert(&db) + .await + .unwrap(); + + let message = build_full_sync(&state, false).await; + + match message { + BrowserMessage::FullSync { recoveries, .. 
} => assert!(recoveries.is_empty()), + other => panic!("expected full sync, got {other:?}"), + } + } +} diff --git a/crates/server/src/service/agent_manager.rs b/crates/server/src/service/agent_manager.rs index aee15e74..fa3d7682 100644 --- a/crates/server/src/service/agent_manager.rs +++ b/crates/server/src/service/agent_manager.rs @@ -235,6 +235,7 @@ impl AgentManager { let _ = self.browser_tx.send(BrowserMessage::Update { servers: vec![status], + recoveries: None, }); // Cache the report @@ -659,10 +660,21 @@ pub async fn cleanup_disconnected_docker_state(state: &AppState, server_id: &str let mut features = state.agent_manager.get_features(server_id); features.retain(|feature| feature != "docker"); - let _ = crate::service::server::ServerService::update_features(&state.db, server_id, &features) - .await; - + let persisted_features = features.clone(); state.agent_manager.update_features(server_id, features); + + if state.recovery_lock.writes_allowed_for(server_id) { + let _ = + crate::service::server::ServerService::update_features( + &state.db, + server_id, + &persisted_features, + ) + .await; + } else { + tracing::info!("Skipping recovery-frozen docker feature write for {server_id}"); + } + state .agent_manager .broadcast_browser(BrowserMessage::DockerAvailabilityChanged { diff --git a/crates/server/src/service/db_error.rs b/crates/server/src/service/db_error.rs new file mode 100644 index 00000000..08e289a4 --- /dev/null +++ b/crates/server/src/service/db_error.rs @@ -0,0 +1,11 @@ +use sea_orm::DbErr; + +pub(crate) fn is_unique_violation(err: &DbErr) -> bool { + let message = err.to_string(); + message.contains("UNIQUE constraint failed") || message.contains("UNIQUE") +} + +pub(crate) fn is_active_recovery_conflict(err: &DbErr) -> bool { + let message = err.to_string(); + is_unique_violation(err) || message.contains("recovery_job_active_conflict") +} diff --git a/crates/server/src/service/mod.rs b/crates/server/src/service/mod.rs index 39ed9037..3818305d 100644 --- a/crates/server/src/service/mod.rs +++ b/crates/server/src/service/mod.rs @@ -6,6 +6,7 @@ pub mod auth; pub mod checker; pub mod config; pub mod dashboard; +pub mod db_error; pub mod docker; pub mod docker_viewer; pub mod file_transfer; @@ -19,12 +20,15 @@ pub mod notification; pub mod oauth; pub mod ping; pub mod record; +pub mod recovery_job; +pub mod recovery_lock; +pub mod recovery_merge; pub mod server; pub mod service_monitor; pub mod status_page; pub mod task_scheduler; pub mod traffic; -pub mod uptime; pub mod upgrade_release; pub mod upgrade_tracker; +pub mod uptime; pub mod user; diff --git a/crates/server/src/service/recovery_job.rs b/crates/server/src/service/recovery_job.rs new file mode 100644 index 00000000..2d920614 --- /dev/null +++ b/crates/server/src/service/recovery_job.rs @@ -0,0 +1,309 @@ +use chrono::Utc; +use sea_orm::*; +use uuid::Uuid; + +use crate::entity::recovery_job; +use crate::error::AppError; +use crate::service::db_error::is_active_recovery_conflict; + +pub struct RecoveryJobService; + +impl RecoveryJobService { + pub async fn create_job( + db: &DatabaseConnection, + target_server_id: &str, + source_server_id: &str, + ) -> Result { + let now = Utc::now(); + let active = recovery_job::ActiveModel { + job_id: Set(Uuid::new_v4().to_string()), + target_server_id: Set(target_server_id.to_string()), + source_server_id: Set(source_server_id.to_string()), + status: Set("running".to_string()), + stage: Set("validating".to_string()), + checkpoint_json: Set(None), + error: Set(None), + started_at: 
Set(now),
+            created_at: Set(now),
+            updated_at: Set(now),
+            last_heartbeat_at: Set(None),
+        };
+
+        match active.insert(db).await {
+            Ok(model) => Ok(model),
+            Err(err) if is_active_recovery_conflict(&err) => {
+                Err(AppError::Conflict(
+                    "A running recovery job already exists for this target or source".to_string(),
+                ))
+            }
+            Err(err) => Err(err.into()),
+        }
+    }
+
+    pub async fn get_job(
+        db: &DatabaseConnection,
+        job_id: &str,
+    ) -> Result<Option<recovery_job::Model>, AppError> {
+        Ok(recovery_job::Entity::find_by_id(job_id).one(db).await?)
+    }
+
+    pub async fn update_stage(
+        db: &DatabaseConnection,
+        job_id: &str,
+        stage: &str,
+        checkpoint_json: Option<&str>,
+        error: Option<&str>,
+    ) -> Result<recovery_job::Model, AppError> {
+        let model = Self::get_job(db, job_id)
+            .await?
+            .ok_or_else(|| AppError::NotFound("Recovery job not found".to_string()))?;
+        let mut active: recovery_job::ActiveModel = model.into();
+        let now = Utc::now();
+
+        active.stage = Set(stage.to_string());
+        active.checkpoint_json = Set(checkpoint_json.map(ToOwned::to_owned));
+        active.error = Set(error.map(ToOwned::to_owned));
+        active.updated_at = Set(now);
+        active.last_heartbeat_at = Set(Some(now));
+
+        Ok(active.update(db).await?)
+    }
+
+    pub async fn mark_failed(
+        db: &DatabaseConnection,
+        job_id: &str,
+        stage: &str,
+        error: &str,
+    ) -> Result<(), AppError> {
+        let model = Self::get_job(db, job_id)
+            .await?
+            .ok_or_else(|| AppError::NotFound("Recovery job not found".to_string()))?;
+        let mut active: recovery_job::ActiveModel = model.into();
+        let now = Utc::now();
+
+        active.status = Set("failed".to_string());
+        active.stage = Set(stage.to_string());
+        active.error = Set(Some(error.to_string()));
+        active.updated_at = Set(now);
+        active.last_heartbeat_at = Set(Some(now));
+
+        active.update(db).await?;
+        Ok(())
+    }
+
+    pub async fn running_for_target(
+        db: &DatabaseConnection,
+        target_server_id: &str,
+    ) -> Result<Option<recovery_job::Model>, AppError> {
+        Ok(recovery_job::Entity::find()
+            .filter(recovery_job::Column::TargetServerId.eq(target_server_id))
+            .filter(recovery_job::Column::Status.eq("running"))
+            .one(db)
+            .await?)
+    }
+
+    pub async fn running_for_source(
+        db: &DatabaseConnection,
+        source_server_id: &str,
+    ) -> Result<Option<recovery_job::Model>, AppError> {
+        Ok(recovery_job::Entity::find()
+            .filter(recovery_job::Column::SourceServerId.eq(source_server_id))
+            .filter(recovery_job::Column::Status.eq("running"))
+            .one(db)
+            .await?)
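+        // Both running_* lookups use one() rather than all(): the unique-index
+        // conflict surfaced in create_job via is_active_recovery_conflict keeps
+        // at most one running job per target or source. last_heartbeat_at also
+        // stays NULL until the first update_stage or mark_failed call, which
+        // separates "never progressed" from "stalled".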
+ } +} + +#[cfg(test)] +mod tests { + use super::RecoveryJobService; + use crate::entity::recovery_job; + use crate::test_utils::setup_test_db; + use crate::error::AppError; + use chrono::Utc; + use sea_orm::{ActiveModelTrait, Set}; + + async fn insert_job( + db: &sea_orm::DatabaseConnection, + job_id: &str, + target_server_id: &str, + source_server_id: &str, + status: &str, + ) -> recovery_job::Model { + let now = Utc::now(); + recovery_job::ActiveModel { + job_id: Set(job_id.to_string()), + target_server_id: Set(target_server_id.to_string()), + source_server_id: Set(source_server_id.to_string()), + status: Set(status.to_string()), + stage: Set("validating".to_string()), + checkpoint_json: Set(None), + error: Set(None), + started_at: Set(now), + created_at: Set(now), + updated_at: Set(now), + last_heartbeat_at: Set(None), + } + .insert(db) + .await + .unwrap() + } + + #[tokio::test] + async fn create_job_persists_running_row_for_target_and_source() { + let (db, _tmp) = setup_test_db().await; + + let job = RecoveryJobService::create_job(&db, "target-1", "source-1") + .await + .unwrap(); + + let loaded = RecoveryJobService::get_job(&db, &job.job_id) + .await + .unwrap() + .unwrap(); + assert_eq!(loaded.job_id, job.job_id); + assert_eq!(loaded.target_server_id, "target-1"); + assert_eq!(loaded.source_server_id, "source-1"); + assert_eq!(loaded.status, "running"); + assert_eq!(loaded.stage, "validating"); + assert_eq!(loaded.checkpoint_json, None); + assert_eq!(loaded.error, None); + assert!(loaded.last_heartbeat_at.is_none()); + } + + #[tokio::test] + async fn update_stage_round_trips_stage_and_checkpoint_json() { + let (db, _tmp) = setup_test_db().await; + let job = RecoveryJobService::create_job(&db, "target-1", "source-1") + .await + .unwrap(); + + RecoveryJobService::update_stage( + &db, + &job.job_id, + "merging_history", + Some("{\"group\":2}"), + None, + ) + .await + .unwrap(); + + let loaded = RecoveryJobService::get_job(&db, &job.job_id) + .await + .unwrap() + .unwrap(); + assert_eq!(loaded.stage, "merging_history"); + assert_eq!(loaded.checkpoint_json.as_deref(), Some("{\"group\":2}")); + assert_eq!(loaded.error, None); + assert_eq!(loaded.status, "running"); + assert!(loaded.last_heartbeat_at.is_some()); + } + + #[tokio::test] + async fn mark_failed_updates_status_stage_and_error() { + let (db, _tmp) = setup_test_db().await; + let job = RecoveryJobService::create_job(&db, "target-1", "source-1") + .await + .unwrap(); + + RecoveryJobService::mark_failed(&db, &job.job_id, "finalizing", "boom") + .await + .unwrap(); + + let loaded = RecoveryJobService::get_job(&db, &job.job_id) + .await + .unwrap() + .unwrap(); + assert_eq!(loaded.status, "failed"); + assert_eq!(loaded.stage, "finalizing"); + assert_eq!(loaded.error.as_deref(), Some("boom")); + assert!(loaded.last_heartbeat_at.is_some()); + } + + #[tokio::test] + async fn running_queries_match_by_target_and_source() { + let (db, _tmp) = setup_test_db().await; + let job = RecoveryJobService::create_job(&db, "target-1", "source-1") + .await + .unwrap(); + let _failed = insert_job(&db, "job-failed", "target-1", "source-1", "failed").await; + + let by_target = RecoveryJobService::running_for_target(&db, "target-1") + .await + .unwrap() + .unwrap(); + let by_source = RecoveryJobService::running_for_source(&db, "source-1") + .await + .unwrap() + .unwrap(); + + assert_eq!(by_target.job_id, job.job_id); + assert_eq!(by_source.job_id, job.job_id); + } + + #[tokio::test] + async fn running_queries_ignore_non_running_jobs() { + let (db, _tmp) = 
setup_test_db().await; + let job = RecoveryJobService::create_job(&db, "target-1", "source-1") + .await + .unwrap(); + RecoveryJobService::mark_failed(&db, &job.job_id, "finalizing", "boom") + .await + .unwrap(); + + assert!(RecoveryJobService::running_for_target(&db, "target-1") + .await + .unwrap() + .is_none()); + assert!(RecoveryJobService::running_for_source(&db, "source-1") + .await + .unwrap() + .is_none()); + } + + #[tokio::test] + async fn create_job_rejects_duplicate_active_jobs_for_target_or_source() { + let (db, _tmp) = setup_test_db().await; + + let _first = RecoveryJobService::create_job(&db, "target-1", "source-1") + .await + .unwrap(); + + match RecoveryJobService::create_job(&db, "target-1", "source-2").await { + Err(AppError::Conflict(message)) => { + assert!(message.contains("running recovery job")); + } + other => panic!("expected conflict for duplicate target, got {other:?}"), + } + + match RecoveryJobService::create_job(&db, "target-2", "source-1").await { + Err(AppError::Conflict(message)) => { + assert!(message.contains("running recovery job")); + } + other => panic!("expected conflict for duplicate source, got {other:?}"), + } + } + + #[tokio::test] + async fn create_job_rejects_cross_role_active_collisions() { + let (db, _tmp) = setup_test_db().await; + + let _first = RecoveryJobService::create_job(&db, "target-a", "source-b") + .await + .unwrap(); + + match RecoveryJobService::create_job(&db, "target-c", "target-a").await { + Err(AppError::Conflict(message)) => { + assert!(message.contains("running recovery job")); + } + other => panic!("expected conflict for target/source crossover, got {other:?}"), + } + + match RecoveryJobService::create_job(&db, "source-b", "target-c").await { + Err(AppError::Conflict(message)) => { + assert!(message.contains("running recovery job")); + } + other => panic!("expected conflict for source/target crossover, got {other:?}"), + } + } +} diff --git a/crates/server/src/service/recovery_lock.rs b/crates/server/src/service/recovery_lock.rs new file mode 100644 index 00000000..2cc3b396 --- /dev/null +++ b/crates/server/src/service/recovery_lock.rs @@ -0,0 +1,44 @@ +use dashmap::DashSet; + +#[derive(Default)] +pub struct RecoveryLockService { + frozen: DashSet, +} + +impl RecoveryLockService { + pub fn new() -> Self { + Self { + frozen: DashSet::new(), + } + } + + pub fn freeze(&self, server_id: &str) { + self.frozen.insert(server_id.to_string()); + } + + pub fn release(&self, server_id: &str) { + self.frozen.remove(server_id); + } + + pub fn writes_allowed_for(&self, server_id: &str) -> bool { + !self.frozen.contains(server_id) + } +} + +#[cfg(test)] +mod tests { + use super::RecoveryLockService; + + #[test] + fn locked_server_denies_writes_until_released() { + let locks = RecoveryLockService::new(); + + assert!(locks.writes_allowed_for("srv-1")); + + locks.freeze("srv-1"); + assert!(!locks.writes_allowed_for("srv-1")); + + locks.release("srv-1"); + assert!(locks.writes_allowed_for("srv-1")); + } +} diff --git a/crates/server/src/service/recovery_merge.rs b/crates/server/src/service/recovery_merge.rs new file mode 100644 index 00000000..1d2338c8 --- /dev/null +++ b/crates/server/src/service/recovery_merge.rs @@ -0,0 +1,1838 @@ +use std::sync::Arc; + +use chrono::Utc; +use sea_orm::DatabaseBackend; +use sea_orm::prelude::Expr; +use sea_orm::{ + ActiveModelTrait, ColumnTrait, ConnectionTrait, DatabaseConnection, DatabaseTransaction, + EntityTrait, QueryFilter, Statement, +}; + +use crate::entity::{network_probe_config, recovery_job, 
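+    // recovery_merge drives the pipeline implied by the stage strings:
+    // validating -> rebinding -> awaiting_target_online -> freezing_writes ->
+    // merging_history -> finalizing -> succeeded | failed. Note that
+    // RecoveryLockService above is purely in-process (a set of frozen server
+    // ids), so a freeze does not survive a server restart.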
server, server_tag}; +use crate::error::AppError; +use crate::service::auth::AuthService; +use crate::service::db_error::is_active_recovery_conflict; +use crate::service::recovery_job::RecoveryJobService; +use crate::service::traffic::TrafficService; +use crate::state::AppState; + +pub const RECOVERY_STAGE_VALIDATING: &str = "validating"; +pub const RECOVERY_STAGE_REBINDING: &str = "rebinding"; +pub const RECOVERY_STAGE_AWAITING_TARGET_ONLINE: &str = "awaiting_target_online"; +pub const REBIND_IDENTITY_MIN_PROTOCOL_VERSION: u32 = 4; + +pub struct RecoveryMergeService; + +pub struct RecoveryStateChange { + pub job: recovery_job::Model, + pub transitioned: bool, +} + +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum RecoveryFailurePhase { + PreRebind, + PostRebind, +} + +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum RecoveryRetryStrategy { + StartNewJob, + ResumeSameJob, +} + +impl RecoveryMergeService { + pub async fn start( + state: &Arc, + target_server_id: &str, + source_server_id: &str, + ) -> Result { + Self::validate_start_request(state, target_server_id, source_server_id).await?; + Self::start_on_db(&state.db, target_server_id, source_server_id).await + } + + pub async fn handle_rebind_ack( + state: &Arc, + job_id: &str, + acking_server_id: &str, + ) -> Result { + Self::handle_rebind_ack_on_db(&state.db, job_id, acking_server_id).await + } + + pub async fn handle_rebind_failure( + state: &Arc, + job_id: &str, + source_server_id: &str, + error: &str, + ) -> Result { + Self::handle_rebind_failure_on_db(&state.db, job_id, source_server_id, error).await + } + + pub async fn rotate_target_token( + state: &Arc, + target_server_id: &str, + ) -> Result { + Self::rotate_target_token_on_conn(&state.db, target_server_id).await + } + + pub async fn rotate_target_token_on_txn( + txn: &DatabaseTransaction, + target_server_id: &str, + ) -> Result { + Self::rotate_target_token_on_conn(txn, target_server_id).await + } + + async fn rotate_target_token_on_conn( + db: &C, + target_server_id: &str, + ) -> Result + where + C: ConnectionTrait, + { + let target = server::Entity::find_by_id(target_server_id) + .one(db) + .await? + .ok_or_else(|| AppError::NotFound("Server not found".to_string()))?; + + let plaintext_token = AuthService::generate_session_token(); + let token_hash = AuthService::hash_password(&plaintext_token)?; + let token_prefix = plaintext_token[..8.min(plaintext_token.len())].to_string(); + + let mut active: server::ActiveModel = target.into(); + active.token_hash = sea_orm::Set(token_hash); + active.token_prefix = sea_orm::Set(token_prefix); + active.updated_at = sea_orm::Set(Utc::now()); + active.update(db).await?; + + Ok(plaintext_token) + } + + pub async fn validate_start_request( + state: &Arc, + target_server_id: &str, + source_server_id: &str, + ) -> Result<(), AppError> { + if source_server_id == target_server_id { + return Err(AppError::Validation( + "source_server_id must be different from target_id".to_string(), + )); + } + + let target = server::Entity::find_by_id(target_server_id) + .one(&state.db) + .await? + .ok_or_else(|| AppError::NotFound("Server not found".to_string()))?; + let source = server::Entity::find_by_id(source_server_id) + .one(&state.db) + .await? 
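+            // Start preconditions, checked in order: distinct ids, both rows
+            // exist, target offline and source online, and the source agent
+            // speaking protocol v4+ (REBIND_IDENTITY_MIN_PROTOCOL_VERSION) so
+            // it can honor a RebindIdentity command.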
+ .ok_or_else(|| AppError::NotFound("Server not found".to_string()))?; + + Self::validate_connectivity_preconditions( + state, + &target.id, + &source.id, + "Target server must be offline before starting recovery", + "Source server must be online before starting recovery", + )?; + + Self::validate_rebind_identity_protocol(state, &source)?; + Ok(()) + } + + pub async fn validate_dispatch_preconditions( + state: &Arc, + target_server_id: &str, + source_server_id: &str, + ) -> Result<(), AppError> { + Self::validate_connectivity_preconditions( + state, + target_server_id, + source_server_id, + "Recovery start aborted because target server came back online before dispatch", + "Recovery start aborted because source server went offline before dispatch", + ) + } + + async fn start_on_db( + db: &DatabaseConnection, + target_server_id: &str, + source_server_id: &str, + ) -> Result { + Self::start_on_connection(db, target_server_id, source_server_id).await + } + + pub async fn start_on_txn( + db: &DatabaseTransaction, + target_server_id: &str, + source_server_id: &str, + ) -> Result { + Self::start_on_connection(db, target_server_id, source_server_id).await + } + + async fn start_on_connection( + db: &C, + target_server_id: &str, + source_server_id: &str, + ) -> Result + where + C: ConnectionTrait, + { + if let Some(existing) = + Self::find_reusable_start_job(db, target_server_id, source_server_id).await? + { + return Self::advance_job_to_rebinding(db, existing).await; + } + + match Self::create_job(db, target_server_id, source_server_id).await { + Ok(job) => Self::advance_job_to_rebinding(db, job).await, + Err(AppError::Conflict(_)) => { + Self::recover_duplicate_start(db, target_server_id, source_server_id).await + } + Err(err) => Err(err), + } + } + + async fn find_reusable_start_job( + db: &C, + target_server_id: &str, + source_server_id: &str, + ) -> Result, AppError> + where + C: ConnectionTrait, + { + let running_target = Self::running_for_target(db, target_server_id).await?; + let running_source = Self::running_for_source(db, source_server_id).await?; + + if let Some(job) = running_target { + if job.source_server_id != source_server_id { + return Err(AppError::Conflict( + "A running recovery job already exists for this target or source".to_string(), + )); + } + + if let Some(source_job) = &running_source + && source_job.job_id != job.job_id + { + return Err(AppError::Conflict( + "A running recovery job already exists for this target or source".to_string(), + )); + } + + return Ok(Some(job)); + } + + if running_source.is_some() { + return Err(AppError::Conflict( + "A running recovery job already exists for this target or source".to_string(), + )); + } + + Ok(None) + } + + async fn recover_duplicate_start( + db: &C, + target_server_id: &str, + source_server_id: &str, + ) -> Result + where + C: ConnectionTrait, + { + match Self::find_reusable_start_job(db, target_server_id, source_server_id).await? 
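+        // Lost-race recovery: if create_job hit the unique index because a
+        // concurrent request inserted first, re-resolving here makes repeated
+        // start requests for the same (target, source) pair idempotent; any
+        // other overlap is a genuine conflict.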
{ + Some(job) => Self::advance_job_to_rebinding(db, job).await, + None => Err(AppError::Conflict( + "A running recovery job already exists for this target or source".to_string(), + )), + } + } + + async fn advance_job_to_rebinding( + db: &C, + job: recovery_job::Model, + ) -> Result + where + C: ConnectionTrait, + { + let now = Utc::now(); + let result = recovery_job::Entity::update_many() + .col_expr( + recovery_job::Column::Stage, + Expr::value(RECOVERY_STAGE_REBINDING), + ) + .col_expr( + recovery_job::Column::CheckpointJson, + Expr::value(None::), + ) + .col_expr(recovery_job::Column::Error, Expr::value(None::)) + .col_expr(recovery_job::Column::UpdatedAt, Expr::value(now)) + .col_expr( + recovery_job::Column::LastHeartbeatAt, + Expr::value(Some(now)), + ) + .filter(recovery_job::Column::JobId.eq(&job.job_id)) + .filter(recovery_job::Column::Status.eq("running")) + .filter( + recovery_job::Column::Stage + .is_in([RECOVERY_STAGE_VALIDATING, RECOVERY_STAGE_REBINDING]), + ) + .exec(db) + .await?; + + if result.rows_affected == 0 { + return Self::get_job(db, &job.job_id) + .await? + .ok_or_else(|| AppError::NotFound("Recovery job not found".to_string())); + } + + Self::get_job(db, &job.job_id) + .await? + .ok_or_else(|| AppError::NotFound("Recovery job not found".to_string())) + } + + async fn create_job( + db: &C, + target_server_id: &str, + source_server_id: &str, + ) -> Result + where + C: ConnectionTrait, + { + let active = recovery_job::ActiveModel { + job_id: sea_orm::Set(uuid::Uuid::new_v4().to_string()), + target_server_id: sea_orm::Set(target_server_id.to_string()), + source_server_id: sea_orm::Set(source_server_id.to_string()), + status: sea_orm::Set("running".to_string()), + stage: sea_orm::Set(RECOVERY_STAGE_VALIDATING.to_string()), + checkpoint_json: sea_orm::Set(None), + error: sea_orm::Set(None), + started_at: sea_orm::Set(Utc::now()), + created_at: sea_orm::Set(Utc::now()), + updated_at: sea_orm::Set(Utc::now()), + last_heartbeat_at: sea_orm::Set(None), + }; + + active.insert(db).await.map_err(|err| { + if is_active_recovery_conflict(&err) { + AppError::Conflict( + "A running recovery job already exists for this target or source".to_string(), + ) + } else { + err.into() + } + }) + } + + async fn running_for_target( + db: &C, + target_server_id: &str, + ) -> Result, AppError> + where + C: ConnectionTrait, + { + Ok(recovery_job::Entity::find() + .filter(recovery_job::Column::TargetServerId.eq(target_server_id)) + .filter(recovery_job::Column::Status.eq("running")) + .one(db) + .await?) + } + + async fn running_for_source( + db: &C, + source_server_id: &str, + ) -> Result, AppError> + where + C: ConnectionTrait, + { + Ok(recovery_job::Entity::find() + .filter(recovery_job::Column::SourceServerId.eq(source_server_id)) + .filter(recovery_job::Column::Status.eq("running")) + .one(db) + .await?) + } + + async fn get_job(db: &C, job_id: &str) -> Result, AppError> + where + C: ConnectionTrait, + { + Ok(recovery_job::Entity::find_by_id(job_id).one(db).await?) 
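+        // Everything in this impl is generic over C: ConnectionTrait, so the
+        // same helpers run against the pooled DatabaseConnection or inside a
+        // DatabaseTransaction (see the *_on_txn wrappers), letting callers fold
+        // job creation into a larger atomic step.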
+ } + + fn validate_rebind_identity_protocol( + state: &Arc, + source: &server::Model, + ) -> Result<(), AppError> { + let protocol_version = state + .agent_manager + .get_protocol_version(&source.id) + .unwrap_or(source.protocol_version as u32); + + if protocol_version < REBIND_IDENTITY_MIN_PROTOCOL_VERSION { + return Err(AppError::Conflict(format!( + "Source server must support RebindIdentity (protocol v{}+ required)", + REBIND_IDENTITY_MIN_PROTOCOL_VERSION + ))); + } + + Ok(()) + } + + fn validate_connectivity_preconditions( + state: &Arc, + target_server_id: &str, + source_server_id: &str, + target_online_message: &str, + source_offline_message: &str, + ) -> Result<(), AppError> { + if state.agent_manager.is_online(target_server_id) { + return Err(AppError::Conflict(target_online_message.to_string())); + } + + if !state.agent_manager.is_online(source_server_id) { + return Err(AppError::Conflict(source_offline_message.to_string())); + } + + Ok(()) + } + + async fn handle_rebind_ack_on_db( + db: &DatabaseConnection, + job_id: &str, + acking_server_id: &str, + ) -> Result { + let now = Utc::now(); + let result = recovery_job::Entity::update_many() + .col_expr( + recovery_job::Column::Stage, + Expr::value(RECOVERY_STAGE_AWAITING_TARGET_ONLINE), + ) + .col_expr( + recovery_job::Column::CheckpointJson, + Expr::value(None::), + ) + .col_expr(recovery_job::Column::Error, Expr::value(None::)) + .col_expr(recovery_job::Column::UpdatedAt, Expr::value(now)) + .col_expr( + recovery_job::Column::LastHeartbeatAt, + Expr::value(Some(now)), + ) + .filter(recovery_job::Column::JobId.eq(job_id)) + .filter(recovery_job::Column::SourceServerId.eq(acking_server_id)) + .filter(recovery_job::Column::Status.eq("running")) + .filter(recovery_job::Column::Stage.eq(RECOVERY_STAGE_REBINDING)) + .exec(db) + .await?; + + if result.rows_affected == 0 { + let job = RecoveryJobService::get_job(db, job_id) + .await? + .ok_or_else(|| AppError::NotFound("Recovery job not found".to_string()))?; + return Ok(RecoveryStateChange { + job, + transitioned: false, + }); + } + + let job = RecoveryJobService::get_job(db, job_id) + .await? + .ok_or_else(|| AppError::NotFound("Recovery job not found".to_string()))?; + Ok(RecoveryStateChange { + job, + transitioned: true, + }) + } + + async fn handle_rebind_failure_on_db( + db: &DatabaseConnection, + job_id: &str, + source_server_id: &str, + error: &str, + ) -> Result { + let now = Utc::now(); + let result = recovery_job::Entity::update_many() + .col_expr(recovery_job::Column::Status, Expr::value("failed")) + .col_expr( + recovery_job::Column::Stage, + Expr::value(RECOVERY_STAGE_REBINDING), + ) + .col_expr( + recovery_job::Column::Error, + Expr::value(Some(error.to_string())), + ) + .col_expr(recovery_job::Column::UpdatedAt, Expr::value(now)) + .col_expr( + recovery_job::Column::LastHeartbeatAt, + Expr::value(Some(now)), + ) + .filter(recovery_job::Column::JobId.eq(job_id)) + .filter(recovery_job::Column::SourceServerId.eq(source_server_id)) + .filter(recovery_job::Column::Status.eq("running")) + .filter(recovery_job::Column::Stage.eq(RECOVERY_STAGE_REBINDING)) + .exec(db) + .await?; + + if result.rows_affected == 0 { + let job = RecoveryJobService::get_job(db, job_id) + .await? + .ok_or_else(|| AppError::NotFound("Recovery job not found".to_string()))?; + return Ok(RecoveryStateChange { + job, + transitioned: false, + }); + } + + let job = RecoveryJobService::get_job(db, job_id) + .await? 
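+            // Same compare-and-swap shape as handle_rebind_ack_on_db: the
+            // update_many filters (job_id, source, status = running, stage =
+            // rebinding) make the transition atomic, and rows_affected == 0
+            // downgrades duplicates, late retries, or a wrong-source message to
+            // a plain re-read with transitioned = false.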
+ .ok_or_else(|| AppError::NotFound("Recovery job not found".to_string()))?; + Ok(RecoveryStateChange { + job, + transitioned: true, + }) + } + + #[cfg_attr(not(test), allow(dead_code))] + pub(crate) async fn merge_server_history_on_db( + db: &DatabaseConnection, + target_server_id: &str, + source_server_id: &str, + ) -> Result<(), AppError> { + Self::merge_server_history_on_connection(db, target_server_id, source_server_id).await + } + + #[cfg_attr(not(test), allow(dead_code))] + pub(crate) async fn merge_server_history_on_txn( + txn: &DatabaseTransaction, + target_server_id: &str, + source_server_id: &str, + ) -> Result<(), AppError> { + Self::merge_server_history_on_connection(txn, target_server_id, source_server_id).await + } + + async fn merge_server_history_on_connection( + db: &C, + target_server_id: &str, + source_server_id: &str, + ) -> Result<(), AppError> + where + C: ConnectionTrait, + { + Self::merge_raw_table_on_connection(db, "records", "time", target_server_id, source_server_id) + .await?; + Self::merge_raw_table_on_connection( + db, + "gpu_records", + "time", + target_server_id, + source_server_id, + ) + .await?; + Self::merge_raw_table_on_connection( + db, + "ping_records", + "time", + target_server_id, + source_server_id, + ) + .await?; + Self::merge_raw_table_on_connection( + db, + "task_results", + "finished_at", + target_server_id, + source_server_id, + ) + .await?; + Self::merge_raw_table_on_connection( + db, + "network_probe_record", + "timestamp", + target_server_id, + source_server_id, + ) + .await?; + Self::merge_raw_table_on_connection( + db, + "docker_event", + "timestamp", + target_server_id, + source_server_id, + ) + .await?; + + Self::merge_unique_key_table_on_connection( + db, + "records_hourly", + &["time"], + target_server_id, + source_server_id, + ) + .await?; + Self::merge_unique_key_table_on_connection( + db, + "network_probe_record_hourly", + &["target_id", "hour"], + target_server_id, + source_server_id, + ) + .await?; + TrafficService::merge_recovered_server_history_on_connection( + db, + target_server_id, + source_server_id, + ) + .await?; + Self::merge_unique_key_table_on_connection( + db, + "uptime_daily", + &["date"], + target_server_id, + source_server_id, + ) + .await?; + Self::merge_alert_states_on_connection(db, target_server_id, source_server_id).await?; + Self::rewrite_server_ids_json_tables_on_connection(db, target_server_id, source_server_id) + .await?; + + Ok(()) + } + + #[cfg_attr(not(test), allow(dead_code))] + pub(crate) async fn merge_raw_table( + db: &DatabaseConnection, + table: &str, + time_column: &str, + target_server_id: &str, + source_server_id: &str, + ) -> Result<(), AppError> { + Self::merge_raw_table_on_connection( + db, + table, + time_column, + target_server_id, + source_server_id, + ) + .await + } + + #[cfg_attr(not(test), allow(dead_code))] + pub(crate) async fn merge_raw_table_on_txn( + txn: &DatabaseTransaction, + table: &str, + time_column: &str, + target_server_id: &str, + source_server_id: &str, + ) -> Result<(), AppError> { + Self::merge_raw_table_on_connection( + txn, + table, + time_column, + target_server_id, + source_server_id, + ) + .await + } + + async fn merge_raw_table_on_connection( + db: &C, + table: &str, + time_column: &str, + target_server_id: &str, + source_server_id: &str, + ) -> Result<(), AppError> + where + C: ConnectionTrait, + { + db.execute(Statement::from_sql_and_values( + db.get_database_backend(), + format!( + "DELETE FROM {table} \ + WHERE server_id = $1 \ + AND (SELECT 
MIN({time_column}) FROM {table} WHERE server_id = $2) IS NOT NULL \ + AND {time_column} >= (SELECT MIN({time_column}) FROM {table} WHERE server_id = $2) \ + AND {time_column} <= (SELECT MAX({time_column}) FROM {table} WHERE server_id = $2)" + ), + [target_server_id.into(), source_server_id.into()], + )) + .await?; + + db.execute(Statement::from_sql_and_values( + db.get_database_backend(), + format!("UPDATE {table} SET server_id = $1 WHERE server_id = $2"), + [target_server_id.into(), source_server_id.into()], + )) + .await?; + + Ok(()) + } + + #[cfg_attr(not(test), allow(dead_code))] + pub(crate) async fn merge_unique_key_table( + db: &DatabaseConnection, + table: &str, + key_columns: &[&str], + target_server_id: &str, + source_server_id: &str, + ) -> Result<(), AppError> { + Self::merge_unique_key_table_on_connection( + db, + table, + key_columns, + target_server_id, + source_server_id, + ) + .await + } + + #[cfg_attr(not(test), allow(dead_code))] + pub(crate) async fn merge_unique_key_table_on_txn( + txn: &DatabaseTransaction, + table: &str, + key_columns: &[&str], + target_server_id: &str, + source_server_id: &str, + ) -> Result<(), AppError> { + Self::merge_unique_key_table_on_connection( + txn, + table, + key_columns, + target_server_id, + source_server_id, + ) + .await + } + + async fn merge_unique_key_table_on_connection( + db: &C, + table: &str, + key_columns: &[&str], + target_server_id: &str, + source_server_id: &str, + ) -> Result<(), AppError> + where + C: ConnectionTrait, + { + TrafficService::replace_unique_key_table_server_id_on_connection( + db, + table, + key_columns, + target_server_id, + source_server_id, + ) + .await + } + + #[cfg_attr(not(test), allow(dead_code))] + pub(crate) async fn merge_alert_states( + db: &DatabaseConnection, + target_server_id: &str, + source_server_id: &str, + ) -> Result<(), AppError> { + Self::merge_alert_states_on_connection(db, target_server_id, source_server_id).await + } + + #[cfg_attr(not(test), allow(dead_code))] + pub(crate) async fn merge_alert_states_on_txn( + txn: &DatabaseTransaction, + target_server_id: &str, + source_server_id: &str, + ) -> Result<(), AppError> { + Self::merge_alert_states_on_connection(txn, target_server_id, source_server_id).await + } + + async fn merge_alert_states_on_connection( + db: &C, + target_server_id: &str, + source_server_id: &str, + ) -> Result<(), AppError> + where + C: ConnectionTrait, + { + db.execute(Statement::from_sql_and_values( + db.get_database_backend(), + "DELETE FROM alert_states AS source \ + WHERE source.server_id = $1 \ + AND EXISTS ( \ + SELECT 1 FROM alert_states AS target \ + WHERE target.server_id = $2 AND target.rule_id = source.rule_id \ + )", + [source_server_id.into(), target_server_id.into()], + )) + .await?; + + db.execute(Statement::from_sql_and_values( + db.get_database_backend(), + "UPDATE alert_states SET server_id = $1 WHERE server_id = $2", + [target_server_id.into(), source_server_id.into()], + )) + .await?; + + Ok(()) + } + + #[cfg_attr(not(test), allow(dead_code))] + pub(crate) async fn rewrite_server_ids_json_tables( + db: &DatabaseConnection, + target_server_id: &str, + source_server_id: &str, + ) -> Result<(), AppError> { + Self::rewrite_server_ids_json_tables_on_connection(db, target_server_id, source_server_id) + .await + } + + #[cfg_attr(not(test), allow(dead_code))] + pub(crate) async fn rewrite_server_ids_json_tables_on_txn( + txn: &DatabaseTransaction, + target_server_id: &str, + source_server_id: &str, + ) -> Result<(), AppError> { + 
Self::rewrite_server_ids_json_tables_on_connection(txn, target_server_id, source_server_id) + .await + } + + async fn rewrite_server_ids_json_tables_on_connection( + db: &C, + target_server_id: &str, + source_server_id: &str, + ) -> Result<(), AppError> + where + C: ConnectionTrait, + { + let tables = [ + ("alert_rules", "server_ids_json"), + ("ping_tasks", "server_ids_json"), + ("tasks", "server_ids_json"), + ("service_monitor", "server_ids_json"), + ("maintenance", "server_ids_json"), + ("incident", "server_ids_json"), + ("status_page", "server_ids_json"), + ]; + + for (table, column) in tables { + Self::rewrite_server_ids_json_table_on_connection( + db, + table, + column, + target_server_id, + source_server_id, + ) + .await?; + } + + Ok(()) + } + + #[cfg_attr(not(test), allow(dead_code))] + async fn rewrite_server_ids_json_table( + db: &DatabaseConnection, + table: &str, + column: &str, + target_server_id: &str, + source_server_id: &str, + ) -> Result<(), AppError> { + Self::rewrite_server_ids_json_table_on_connection( + db, + table, + column, + target_server_id, + source_server_id, + ) + .await + } + + async fn rewrite_server_ids_json_table_on_connection( + db: &C, + table: &str, + column: &str, + target_server_id: &str, + source_server_id: &str, + ) -> Result<(), AppError> + where + C: ConnectionTrait, + { + let rows = db + .query_all(Statement::from_sql_and_values( + DatabaseBackend::Sqlite, + format!( + "SELECT id, {column} FROM {table} WHERE {column} LIKE '%' || $1 || '%'" + ), + [source_server_id.into()], + )) + .await?; + + for row in rows { + let id: String = row.try_get_by_index(0)?; + let current: Option = row.try_get_by_index(1)?; + let Some(current) = current else { + continue; + }; + + let rewritten = + Self::rewrite_server_ids_json_value(¤t, target_server_id, source_server_id)?; + if rewritten.as_deref() == Some(current.as_str()) { + continue; + } + + let value = rewritten.unwrap_or_else(|| "[]".to_string()).into(); + + db.execute(Statement::from_sql_and_values( + DatabaseBackend::Sqlite, + format!("UPDATE {table} SET {column} = $1 WHERE id = $2"), + [value, id.into()], + )) + .await?; + } + + Ok(()) + } + + #[cfg_attr(not(test), allow(dead_code))] + pub(crate) fn rewrite_server_ids_json_value( + current: &str, + target_server_id: &str, + source_server_id: &str, + ) -> Result, AppError> { + let ids: Vec = serde_json::from_str(current).map_err(|error| { + AppError::Internal(format!("Failed to parse server_ids_json during recovery merge: {error}")) + })?; + + let mut rewritten = Vec::new(); + for id in ids { + let next = if id == source_server_id { + target_server_id.to_string() + } else { + id + }; + if !rewritten.iter().any(|existing| existing == &next) { + rewritten.push(next); + } + } + + serde_json::to_string(&rewritten) + .map(Some) + .map_err(|error| { + AppError::Internal(format!( + "Failed to serialize server_ids_json during recovery merge: {error}" + )) + }) + } + + #[cfg_attr(not(test), allow(dead_code))] + pub(crate) async fn finalize_target_server_row( + db: &DatabaseConnection, + target_server_id: &str, + source: &server::Model, + ) -> Result<(), AppError> { + Self::finalize_target_server_row_on_connection(db, target_server_id, source).await + } + + #[cfg_attr(not(test), allow(dead_code))] + pub(crate) async fn finalize_target_server_row_on_txn( + txn: &DatabaseTransaction, + target_server_id: &str, + source: &server::Model, + ) -> Result<(), AppError> { + Self::finalize_target_server_row_on_connection(txn, target_server_id, source).await + } + + async fn 
finalize_target_server_row_on_connection( + db: &C, + target_server_id: &str, + source: &server::Model, + ) -> Result<(), AppError> + where + C: ConnectionTrait, + { + if source.fingerprint.is_some() { + server::Entity::update_many() + .col_expr(server::Column::Fingerprint, Expr::value(None::)) + .col_expr(server::Column::UpdatedAt, Expr::value(Utc::now())) + .filter(server::Column::Id.eq(source.id.clone())) + .exec(db) + .await?; + } + + let target = server::Entity::find_by_id(target_server_id) + .one(db) + .await? + .ok_or_else(|| AppError::NotFound("Server not found".to_string()))?; + + let mut active: server::ActiveModel = target.into(); + active.cpu_name = sea_orm::Set(source.cpu_name.clone()); + active.cpu_cores = sea_orm::Set(source.cpu_cores); + active.cpu_arch = sea_orm::Set(source.cpu_arch.clone()); + active.os = sea_orm::Set(source.os.clone()); + active.kernel_version = sea_orm::Set(source.kernel_version.clone()); + active.mem_total = sea_orm::Set(source.mem_total); + active.swap_total = sea_orm::Set(source.swap_total); + active.disk_total = sea_orm::Set(source.disk_total); + active.ipv4 = sea_orm::Set(source.ipv4.clone()); + active.ipv6 = sea_orm::Set(source.ipv6.clone()); + active.region = sea_orm::Set(source.region.clone()); + active.country_code = sea_orm::Set(source.country_code.clone()); + active.virtualization = sea_orm::Set(source.virtualization.clone()); + active.agent_version = sea_orm::Set(source.agent_version.clone()); + active.protocol_version = sea_orm::Set(source.protocol_version); + active.features = sea_orm::Set(source.features.clone()); + active.last_remote_addr = sea_orm::Set(source.last_remote_addr.clone()); + active.fingerprint = sea_orm::Set(source.fingerprint.clone()); + active.updated_at = sea_orm::Set(Utc::now()); + active.update(db).await?; + + Ok(()) + } + + #[cfg_attr(not(test), allow(dead_code))] + pub(crate) async fn delete_intentionally_unmerged_source_rows( + db: &DatabaseConnection, + source_server_id: &str, + ) -> Result<(), AppError> { + Self::delete_intentionally_unmerged_source_rows_on_connection(db, source_server_id).await + } + + #[cfg_attr(not(test), allow(dead_code))] + pub(crate) async fn delete_intentionally_unmerged_source_rows_on_txn( + txn: &DatabaseTransaction, + source_server_id: &str, + ) -> Result<(), AppError> { + Self::delete_intentionally_unmerged_source_rows_on_connection(txn, source_server_id).await + } + + async fn delete_intentionally_unmerged_source_rows_on_connection( + db: &C, + source_server_id: &str, + ) -> Result<(), AppError> + where + C: ConnectionTrait, + { + server_tag::Entity::delete_many() + .filter(server_tag::Column::ServerId.eq(source_server_id)) + .exec(db) + .await?; + network_probe_config::Entity::delete_many() + .filter(network_probe_config::Column::ServerId.eq(source_server_id)) + .exec(db) + .await?; + + Ok(()) + } +} + +pub fn recovery_phase_for_stage(stage: &str) -> Option { + match stage { + RECOVERY_STAGE_VALIDATING | RECOVERY_STAGE_REBINDING => { + Some(RecoveryFailurePhase::PreRebind) + } + RECOVERY_STAGE_AWAITING_TARGET_ONLINE => Some(RecoveryFailurePhase::PostRebind), + _ => None, + } +} + +pub fn is_pre_rebind_stage(stage: &str) -> bool { + matches!( + recovery_phase_for_stage(stage), + Some(RecoveryFailurePhase::PreRebind) + ) +} + +pub fn retry_strategy_for_phase(phase: RecoveryFailurePhase) -> RecoveryRetryStrategy { + match phase { + RecoveryFailurePhase::PreRebind => RecoveryRetryStrategy::StartNewJob, + RecoveryFailurePhase::PostRebind => RecoveryRetryStrategy::ResumeSameJob, + } +} + 
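+/// Convenience mapping from a persisted stage name straight to the retry
+/// strategy: pre-rebind failures start a fresh job because nothing has been
+/// handed over yet, while post-rebind failures resume the same job since the
+/// replacement agent already carries the original server identity.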
+pub fn retry_strategy_for_stage(stage: &str) -> Option<RecoveryRetryStrategy> {
+    recovery_phase_for_stage(stage).map(retry_strategy_for_phase)
+}
+
+#[cfg(test)]
+mod tests {
+    use super::{
+        REBIND_IDENTITY_MIN_PROTOCOL_VERSION, RECOVERY_STAGE_AWAITING_TARGET_ONLINE,
+        RECOVERY_STAGE_REBINDING, RecoveryFailurePhase, RecoveryMergeService,
+        RecoveryRetryStrategy, is_pre_rebind_stage, recovery_phase_for_stage,
+        retry_strategy_for_phase, retry_strategy_for_stage,
+    };
+    use crate::config::AppConfig;
+    use crate::entity::{
+        alert_rule, alert_state, record, server, server_tag, service_monitor, traffic_daily,
+        traffic_hourly, traffic_state,
+    };
+    use crate::error::AppError;
+    use crate::service::auth::AuthService;
+    use crate::service::recovery_job::RecoveryJobService;
+    use crate::state::AppState;
+    use crate::test_utils::setup_test_db;
+    use chrono::{NaiveDate, Utc};
+    use sea_orm::{
+        ActiveModelTrait, ColumnTrait, DatabaseConnection, EntityTrait, QueryFilter, Set,
+        TransactionTrait,
+    };
+    use serverbee_common::constants::CAP_DEFAULT;
+    use std::net::{IpAddr, Ipv4Addr, SocketAddr};
+    use std::sync::Arc;
+    use tempfile::TempDir;
+    use tokio::sync::mpsc;
+
+    async fn insert_test_server(db: &DatabaseConnection, id: &str, name: &str) {
+        let token_hash = AuthService::hash_password("test").expect("hash_password should succeed");
+        let now = Utc::now();
+        server::ActiveModel {
+            id: Set(id.to_string()),
+            token_hash: Set(token_hash),
+            token_prefix: Set("serverbee_test".to_string()),
+            name: Set(name.to_string()),
+            weight: Set(0),
+            hidden: Set(false),
+            capabilities: Set(CAP_DEFAULT as i32),
+            protocol_version: Set(REBIND_IDENTITY_MIN_PROTOCOL_VERSION as i32),
+            created_at: Set(now),
+            updated_at: Set(now),
+            ..Default::default()
+        }
+        .insert(db)
+        .await
+        .expect("insert test server should succeed");
+    }
+
+    async fn test_state_with_servers() -> (Arc<AppState>, TempDir) {
+        let (db, tmp) = setup_test_db().await;
+        insert_test_server(&db, "target-1", "Target").await;
+        insert_test_server(&db, "source-1", "Source").await;
+        insert_test_server(&db, "source-2", "Source 2").await;
+        let state = AppState::new(db, AppConfig::default())
+            .await
+            .expect("app state should initialize");
+        (state, tmp)
+    }
+
+    fn test_addr() -> SocketAddr {
+        SocketAddr::new(IpAddr::V4(Ipv4Addr::new(127, 0, 0, 1)), 9527)
+    }
+
+    fn mark_online(state: &Arc<AppState>, server_id: &str) {
+        let (tx, _) = mpsc::channel(1);
+        state.agent_manager.add_connection(
+            server_id.to_string(),
+            server_id.to_string(),
+            tx,
+            test_addr(),
+        );
+        state
+            .agent_manager
+            .set_protocol_version(server_id, REBIND_IDENTITY_MIN_PROTOCOL_VERSION);
+    }
+
+    async fn insert_record(
+        db: &DatabaseConnection,
+        server_id: &str,
+        time: chrono::DateTime<Utc>,
+        cpu: f64,
+    ) {
+        record::ActiveModel {
+            server_id: Set(server_id.to_string()),
+            time: Set(time),
+            cpu: Set(cpu),
+            mem_used: Set(1),
+            swap_used: Set(1),
+            disk_used: Set(1),
+            net_in_speed: Set(1),
+            net_out_speed: Set(1),
+            net_in_transfer: Set(1),
+            net_out_transfer: Set(1),
+            load1: Set(1.0),
+            load5: Set(1.0),
+            load15: Set(1.0),
+            tcp_conn: Set(1),
+            udp_conn: Set(1),
+            process_count: Set(1),
+            temperature: Set(None),
+            gpu_usage: Set(None),
+            disk_io_json: Set(None),
+            ..Default::default()
+        }
+        .insert(db)
+        .await
+        .expect("insert record should succeed");
+    }
+
+    #[test]
+    fn pre_rebind_phase_requires_new_job() {
+        assert_eq!(
+            retry_strategy_for_phase(RecoveryFailurePhase::PreRebind),
+            RecoveryRetryStrategy::StartNewJob
+        );
+        assert_eq!(
+            retry_strategy_for_stage(RECOVERY_STAGE_REBINDING),
Some(RecoveryRetryStrategy::StartNewJob) + ); + assert_eq!( + recovery_phase_for_stage(RECOVERY_STAGE_REBINDING), + Some(RecoveryFailurePhase::PreRebind) + ); + assert_eq!(retry_strategy_for_stage("unknown"), None); + assert_eq!(recovery_phase_for_stage("unknown"), None); + } + + #[test] + fn post_rebind_phase_resumes_same_job() { + assert_eq!( + retry_strategy_for_phase(RecoveryFailurePhase::PostRebind), + RecoveryRetryStrategy::ResumeSameJob + ); + assert_eq!( + retry_strategy_for_stage(RECOVERY_STAGE_AWAITING_TARGET_ONLINE), + Some(RecoveryRetryStrategy::ResumeSameJob) + ); + assert_eq!( + recovery_phase_for_stage(RECOVERY_STAGE_AWAITING_TARGET_ONLINE), + Some(RecoveryFailurePhase::PostRebind) + ); + } + + #[tokio::test] + async fn start_persists_job_and_advances_to_rebinding() { + let (db, _tmp) = setup_test_db().await; + + let job = RecoveryMergeService::start_on_db(&db, "target-1", "source-1") + .await + .unwrap(); + + assert_eq!(job.target_server_id, "target-1"); + assert_eq!(job.source_server_id, "source-1"); + assert_eq!(job.status, "running"); + assert_eq!(job.stage, RECOVERY_STAGE_REBINDING); + assert!(job.last_heartbeat_at.is_some()); + + let loaded = RecoveryJobService::get_job(&db, &job.job_id) + .await + .unwrap() + .unwrap(); + assert_eq!(loaded.stage, RECOVERY_STAGE_REBINDING); + assert_eq!(loaded.status, "running"); + } + + #[tokio::test] + async fn start_reuses_existing_pre_rebind_job() { + let (db, _tmp) = setup_test_db().await; + + let first = RecoveryMergeService::start_on_db(&db, "target-1", "source-1") + .await + .unwrap(); + let second = RecoveryMergeService::start_on_db(&db, "target-1", "source-1") + .await + .unwrap(); + + assert_eq!(second.job_id, first.job_id); + assert_eq!(second.stage, RECOVERY_STAGE_REBINDING); + assert!(is_pre_rebind_stage(second.stage.as_str())); + } + + #[tokio::test] + async fn start_rejects_existing_target_job_for_different_source() { + let (db, _tmp) = setup_test_db().await; + + let first = RecoveryMergeService::start_on_db(&db, "target-1", "source-1") + .await + .unwrap(); + + let result = RecoveryMergeService::start_on_db(&db, "target-1", "source-2").await; + assert!(matches!(result, Err(AppError::Conflict(_)))); + + let loaded = RecoveryJobService::get_job(&db, &first.job_id) + .await + .unwrap() + .unwrap(); + assert_eq!(loaded.source_server_id, "source-1"); + assert_eq!(loaded.stage, RECOVERY_STAGE_REBINDING); + } + + #[tokio::test] + async fn rebind_ack_advances_to_waiting_for_target_online() { + let (db, _tmp) = setup_test_db().await; + + let job = RecoveryMergeService::start_on_db(&db, "target-1", "source-1") + .await + .unwrap(); + + let updated = RecoveryMergeService::handle_rebind_ack_on_db(&db, &job.job_id, "source-1") + .await + .unwrap(); + + assert!(updated.transitioned); + assert_eq!(updated.job.job_id, job.job_id); + assert_eq!(updated.job.stage, RECOVERY_STAGE_AWAITING_TARGET_ONLINE); + assert_eq!(updated.job.status, "running"); + + let loaded = RecoveryJobService::get_job(&db, &job.job_id) + .await + .unwrap() + .unwrap(); + assert_eq!(loaded.stage, RECOVERY_STAGE_AWAITING_TARGET_ONLINE); + } + + #[tokio::test] + async fn rebind_ack_is_idempotent_once_advanced() { + let (db, _tmp) = setup_test_db().await; + + let job = RecoveryMergeService::start_on_db(&db, "target-1", "source-1") + .await + .unwrap(); + let _ = RecoveryMergeService::handle_rebind_ack_on_db(&db, &job.job_id, "source-1") + .await + .unwrap(); + + let updated = RecoveryMergeService::handle_rebind_ack_on_db(&db, &job.job_id, "source-1") + .await + 
.unwrap(); + + assert!(!updated.transitioned); + assert_eq!(updated.job.stage, RECOVERY_STAGE_AWAITING_TARGET_ONLINE); + assert_eq!(updated.job.status, "running"); + } + + #[tokio::test] + async fn rebind_ack_ignores_wrong_stage() { + let (db, _tmp) = setup_test_db().await; + + let job = RecoveryMergeService::start_on_db(&db, "target-1", "source-1") + .await + .unwrap(); + RecoveryJobService::update_stage(&db, &job.job_id, "validating", None, None) + .await + .unwrap(); + + let updated = RecoveryMergeService::handle_rebind_ack_on_db(&db, &job.job_id, "source-1") + .await + .unwrap(); + + assert!(!updated.transitioned); + assert_eq!(updated.job.stage, "validating"); + + let loaded = RecoveryJobService::get_job(&db, &job.job_id) + .await + .unwrap() + .unwrap(); + assert_eq!(loaded.stage, "validating"); + } + + #[tokio::test] + async fn rebind_ack_from_wrong_source_is_ignored() { + let (db, _tmp) = setup_test_db().await; + + let job = RecoveryMergeService::start_on_db(&db, "target-1", "source-1") + .await + .unwrap(); + + let updated = RecoveryMergeService::handle_rebind_ack_on_db(&db, &job.job_id, "source-2") + .await + .unwrap(); + + assert!(!updated.transitioned); + assert_eq!(updated.job.job_id, job.job_id); + assert_eq!(updated.job.stage, RECOVERY_STAGE_REBINDING); + + let loaded = RecoveryJobService::get_job(&db, &job.job_id) + .await + .unwrap() + .unwrap(); + assert_eq!(loaded.stage, RECOVERY_STAGE_REBINDING); + } + + #[tokio::test] + async fn rebind_failure_marks_job_failed() { + let (db, _tmp) = setup_test_db().await; + + let job = RecoveryMergeService::start_on_db(&db, "target-1", "source-1") + .await + .unwrap(); + + let failed = RecoveryMergeService::handle_rebind_failure_on_db( + &db, + &job.job_id, + "source-1", + "agent failed", + ) + .await + .unwrap(); + + assert!(failed.transitioned); + assert_eq!(failed.job.job_id, job.job_id); + assert_eq!(failed.job.status, "failed"); + assert_eq!(failed.job.stage, RECOVERY_STAGE_REBINDING); + assert_eq!(failed.job.error.as_deref(), Some("agent failed")); + } + + #[tokio::test] + async fn start_rejects_self_merge_at_service_boundary() { + let (state, _tmp) = test_state_with_servers().await; + mark_online(&state, "source-1"); + + let result = RecoveryMergeService::start(&state, "target-1", "target-1").await; + + assert!(matches!(result, Err(AppError::Validation(_)))); + } + + #[tokio::test] + async fn start_rejects_online_target_at_service_boundary() { + let (state, _tmp) = test_state_with_servers().await; + mark_online(&state, "target-1"); + mark_online(&state, "source-1"); + + let result = RecoveryMergeService::start(&state, "target-1", "source-1").await; + + assert!( + matches!(result, Err(AppError::Conflict(message)) if message.contains("Target server must be offline")) + ); + } + + #[tokio::test] + async fn start_rejects_offline_source_at_service_boundary() { + let (state, _tmp) = test_state_with_servers().await; + + let result = RecoveryMergeService::start(&state, "target-1", "source-1").await; + + assert!( + matches!(result, Err(AppError::Conflict(message)) if message.contains("Source server must be online")) + ); + } + + #[tokio::test] + async fn duplicate_start_conflict_reuses_matching_pre_rebind_job() { + let (db, _tmp) = setup_test_db().await; + + let first = RecoveryJobService::create_job(&db, "target-1", "source-1") + .await + .unwrap(); + + let reused = RecoveryMergeService::recover_duplicate_start(&db, "target-1", "source-1") + .await + .unwrap(); + + assert_eq!(reused.job_id, first.job_id); + assert_eq!(reused.stage, 
RECOVERY_STAGE_REBINDING); + } + + #[tokio::test] + async fn reusable_start_keeps_latest_stage_when_rebind_ack_wins_race() { + let (db, _tmp) = setup_test_db().await; + + let stale_job = RecoveryMergeService::start_on_db(&db, "target-1", "source-1") + .await + .unwrap(); + let acknowledged = + RecoveryMergeService::handle_rebind_ack_on_db(&db, &stale_job.job_id, "source-1") + .await + .unwrap(); + assert!(acknowledged.transitioned); + assert_eq!( + acknowledged.job.stage, + RECOVERY_STAGE_AWAITING_TARGET_ONLINE + ); + + let advanced = RecoveryMergeService::advance_job_to_rebinding(&db, stale_job) + .await + .unwrap(); + + assert_eq!(advanced.job_id, acknowledged.job.job_id); + assert_eq!(advanced.stage, RECOVERY_STAGE_AWAITING_TARGET_ONLINE); + + let loaded = RecoveryJobService::get_job(&db, &advanced.job_id) + .await + .unwrap() + .unwrap(); + assert_eq!(loaded.stage, RECOVERY_STAGE_AWAITING_TARGET_ONLINE); + } + + #[tokio::test] + async fn advance_job_to_rebinding_does_not_overwrite_failed_job() { + let (db, _tmp) = setup_test_db().await; + + let stale_job = RecoveryJobService::create_job(&db, "target-1", "source-1") + .await + .unwrap(); + RecoveryJobService::mark_failed(&db, &stale_job.job_id, "validating", "boom") + .await + .unwrap(); + + let advanced = RecoveryMergeService::advance_job_to_rebinding(&db, stale_job) + .await + .unwrap(); + + assert_eq!(advanced.status, "failed"); + assert_eq!(advanced.stage, "validating"); + + let loaded = RecoveryJobService::get_job(&db, &advanced.job_id) + .await + .unwrap() + .unwrap(); + assert_eq!(loaded.status, "failed"); + assert_eq!(loaded.stage, "validating"); + } + + #[tokio::test] + async fn rebind_ack_does_not_overwrite_moved_job() { + let (db, _tmp) = setup_test_db().await; + + let job = RecoveryMergeService::start_on_db(&db, "target-1", "source-1") + .await + .unwrap(); + RecoveryJobService::update_stage( + &db, + &job.job_id, + RECOVERY_STAGE_AWAITING_TARGET_ONLINE, + None, + None, + ) + .await + .unwrap(); + + let updated = RecoveryMergeService::handle_rebind_ack_on_db(&db, &job.job_id, "source-1") + .await + .unwrap(); + + assert!(!updated.transitioned); + assert_eq!(updated.job.stage, RECOVERY_STAGE_AWAITING_TARGET_ONLINE); + assert_eq!(updated.job.status, "running"); + + let loaded = RecoveryJobService::get_job(&db, &job.job_id) + .await + .unwrap() + .unwrap(); + assert_eq!(loaded.stage, RECOVERY_STAGE_AWAITING_TARGET_ONLINE); + assert_eq!(loaded.status, "running"); + } + + #[tokio::test] + async fn dispatch_validation_rejects_stale_source_offline_state() { + let (state, _tmp) = test_state_with_servers().await; + mark_online(&state, "source-1"); + + RecoveryMergeService::validate_start_request(&state, "target-1", "source-1") + .await + .expect("initial start validation should succeed"); + + state.agent_manager.remove_connection("source-1"); + + let result = + RecoveryMergeService::validate_dispatch_preconditions(&state, "target-1", "source-1") + .await; + + assert!( + matches!(result, Err(AppError::Conflict(message)) if message.contains("went offline before dispatch")) + ); + } + + #[tokio::test] + async fn merge_raw_records_replaces_target_overlap_with_source() { + let (db, _tmp) = setup_test_db().await; + insert_test_server(&db, "target-1", "Target").await; + insert_test_server(&db, "source-1", "Source").await; + + let before_overlap = NaiveDate::from_ymd_opt(2026, 4, 16) + .unwrap() + .and_hms_opt(9, 0, 0) + .unwrap() + .and_utc(); + let overlap_start = NaiveDate::from_ymd_opt(2026, 4, 16) + .unwrap() + .and_hms_opt(10, 0, 0) + 
.unwrap() + .and_utc(); + let overlap_end = NaiveDate::from_ymd_opt(2026, 4, 16) + .unwrap() + .and_hms_opt(11, 0, 0) + .unwrap() + .and_utc(); + + insert_record(&db, "target-1", before_overlap, 10.0).await; + insert_record(&db, "target-1", overlap_start, 20.0).await; + insert_record(&db, "target-1", overlap_end, 30.0).await; + insert_record(&db, "source-1", overlap_start, 200.0).await; + insert_record(&db, "source-1", overlap_end, 300.0).await; + + RecoveryMergeService::merge_server_history_on_db(&db, "target-1", "source-1") + .await + .unwrap(); + + let target_rows = record::Entity::find() + .filter(record::Column::ServerId.eq("target-1")) + .all(&db) + .await + .unwrap(); + assert_eq!(target_rows.len(), 3); + assert!(target_rows.iter().any(|row| row.time == before_overlap && row.cpu == 10.0)); + assert!(target_rows.iter().any(|row| row.time == overlap_start && row.cpu == 200.0)); + assert!(target_rows.iter().any(|row| row.time == overlap_end && row.cpu == 300.0)); + + let source_rows = record::Entity::find() + .filter(record::Column::ServerId.eq("source-1")) + .all(&db) + .await + .unwrap(); + assert!(source_rows.is_empty()); + } + + #[tokio::test] + async fn merge_alert_state_keeps_target_when_rule_conflicts() { + let (db, _tmp) = setup_test_db().await; + insert_test_server(&db, "target-1", "Target").await; + insert_test_server(&db, "source-1", "Source").await; + + let now = Utc::now(); + alert_state::ActiveModel { + rule_id: Set("rule-1".to_string()), + server_id: Set("target-1".to_string()), + first_triggered_at: Set(now), + last_notified_at: Set(now), + count: Set(5), + resolved: Set(false), + resolved_at: Set(None), + updated_at: Set(now), + ..Default::default() + } + .insert(&db) + .await + .unwrap(); + alert_state::ActiveModel { + rule_id: Set("rule-1".to_string()), + server_id: Set("source-1".to_string()), + first_triggered_at: Set(now), + last_notified_at: Set(now), + count: Set(1), + resolved: Set(true), + resolved_at: Set(Some(now)), + updated_at: Set(now), + ..Default::default() + } + .insert(&db) + .await + .unwrap(); + + RecoveryMergeService::merge_server_history_on_db(&db, "target-1", "source-1") + .await + .unwrap(); + + let target_states = alert_state::Entity::find() + .filter(alert_state::Column::ServerId.eq("target-1")) + .all(&db) + .await + .unwrap(); + assert_eq!(target_states.len(), 1); + assert_eq!(target_states[0].rule_id, "rule-1"); + assert_eq!(target_states[0].count, 5); + assert!(!target_states[0].resolved); + + let source_states = alert_state::Entity::find() + .filter(alert_state::Column::ServerId.eq("source-1")) + .all(&db) + .await + .unwrap(); + assert!(source_states.is_empty()); + } + + #[tokio::test] + async fn merge_server_history_can_be_rolled_back_atomically() { + let (db, _tmp) = setup_test_db().await; + insert_test_server(&db, "target-1", "Target").await; + insert_test_server(&db, "source-1", "Source").await; + + let before_overlap = NaiveDate::from_ymd_opt(2026, 4, 16) + .unwrap() + .and_hms_opt(9, 0, 0) + .unwrap() + .and_utc(); + let overlap = NaiveDate::from_ymd_opt(2026, 4, 16) + .unwrap() + .and_hms_opt(10, 0, 0) + .unwrap() + .and_utc(); + + insert_record(&db, "target-1", before_overlap, 10.0).await; + insert_record(&db, "target-1", overlap, 20.0).await; + insert_record(&db, "source-1", overlap, 200.0).await; + + let txn = db.begin().await.unwrap(); + RecoveryMergeService::merge_server_history_on_txn(&txn, "target-1", "source-1") + .await + .unwrap(); + txn.rollback().await.unwrap(); + + let target_rows = record::Entity::find() + 
.filter(record::Column::ServerId.eq("target-1")) + .all(&db) + .await + .unwrap(); + assert_eq!(target_rows.len(), 2); + assert!(target_rows.iter().any(|row| row.time == before_overlap && row.cpu == 10.0)); + assert!(target_rows.iter().any(|row| row.time == overlap && row.cpu == 20.0)); + + let source_rows = record::Entity::find() + .filter(record::Column::ServerId.eq("source-1")) + .all(&db) + .await + .unwrap(); + assert_eq!(source_rows.len(), 1); + assert_eq!(source_rows[0].time, overlap); + assert_eq!(source_rows[0].cpu, 200.0); + } + + #[tokio::test] + async fn rewrite_server_ids_json_replaces_source_with_target_once() { + let (db, _tmp) = setup_test_db().await; + insert_test_server(&db, "target-1", "Target").await; + insert_test_server(&db, "source-1", "Source").await; + let now = Utc::now(); + + alert_rule::ActiveModel { + id: Set("rule-1".to_string()), + name: Set("rule".to_string()), + enabled: Set(true), + rules_json: Set("[]".to_string()), + trigger_mode: Set("any".to_string()), + notification_group_id: Set(None), + fail_trigger_tasks: Set(None), + recover_trigger_tasks: Set(None), + cover_type: Set("include".to_string()), + server_ids_json: Set(Some(r#"["target-1","source-1","source-1"]"#.to_string())), + created_at: Set(now), + updated_at: Set(now), + } + .insert(&db) + .await + .unwrap(); + + service_monitor::ActiveModel { + id: Set("monitor-1".to_string()), + name: Set("monitor".to_string()), + monitor_type: Set("http".to_string()), + target: Set("https://example.com".to_string()), + interval: Set(60), + config_json: Set("{}".to_string()), + notification_group_id: Set(None), + retry_count: Set(0), + server_ids_json: Set(Some(r#"["source-1","target-1","source-1"]"#.to_string())), + enabled: Set(true), + last_status: Set(None), + consecutive_failures: Set(0), + last_checked_at: Set(None), + created_at: Set(now), + updated_at: Set(now), + } + .insert(&db) + .await + .unwrap(); + + RecoveryMergeService::rewrite_server_ids_json_tables(&db, "target-1", "source-1") + .await + .unwrap(); + + let rule = alert_rule::Entity::find_by_id("rule-1") + .one(&db) + .await + .unwrap() + .unwrap(); + assert_eq!(rule.server_ids_json.as_deref(), Some(r#"["target-1"]"#)); + + let monitor = service_monitor::Entity::find_by_id("monitor-1") + .one(&db) + .await + .unwrap() + .unwrap(); + assert_eq!(monitor.server_ids_json.as_deref(), Some(r#"["target-1"]"#)); + } + + #[tokio::test] + async fn finalize_target_server_row_copies_runtime_fields_and_cleans_source_rows() { + let (db, _tmp) = setup_test_db().await; + insert_test_server(&db, "target-1", "Target").await; + insert_test_server(&db, "source-1", "Source").await; + let now = Utc::now(); + + let mut source: server::ActiveModel = server::Entity::find_by_id("source-1") + .one(&db) + .await + .unwrap() + .unwrap() + .into(); + source.cpu_name = Set(Some("Ryzen".to_string())); + source.cpu_cores = Set(Some(16)); + source.cpu_arch = Set(Some("x86_64".to_string())); + source.os = Set(Some("Linux".to_string())); + source.kernel_version = Set(Some("6.9.0".to_string())); + source.mem_total = Set(Some(64)); + source.swap_total = Set(Some(32)); + source.disk_total = Set(Some(1024)); + source.ipv4 = Set(Some("1.2.3.4".to_string())); + source.ipv6 = Set(Some("::1".to_string())); + source.region = Set(Some("Taipei".to_string())); + source.country_code = Set(Some("TW".to_string())); + source.virtualization = Set(Some("kvm".to_string())); + source.agent_version = Set(Some("1.2.3".to_string())); + source.protocol_version = Set(4); + source.features = 
Set(r#"["docker","process"]"#.to_string());
+        source.last_remote_addr = Set(Some("192.0.2.10:9527".to_string()));
+        source.fingerprint = Set(Some("fingerprint-123".to_string()));
+        let source_model = source.update(&db).await.unwrap();
+
+        server_tag::ActiveModel {
+            server_id: Set("source-1".to_string()),
+            tag: Set("temporary".to_string()),
+        }
+        .insert(&db)
+        .await
+        .unwrap();
+        traffic_hourly::ActiveModel {
+            server_id: Set("source-1".to_string()),
+            hour: Set(now),
+            bytes_in: Set(10),
+            bytes_out: Set(20),
+            ..Default::default()
+        }
+        .insert(&db)
+        .await
+        .unwrap();
+        traffic_daily::ActiveModel {
+            server_id: Set("source-1".to_string()),
+            date: Set(now.date_naive()),
+            bytes_in: Set(30),
+            bytes_out: Set(40),
+            ..Default::default()
+        }
+        .insert(&db)
+        .await
+        .unwrap();
+        traffic_state::ActiveModel {
+            server_id: Set("source-1".to_string()),
+            last_in: Set(100),
+            last_out: Set(200),
+            updated_at: Set(now),
+        }
+        .insert(&db)
+        .await
+        .unwrap();
+
+        RecoveryMergeService::finalize_target_server_row(&db, "target-1", &source_model)
+            .await
+            .unwrap();
+        RecoveryMergeService::delete_intentionally_unmerged_source_rows(&db, "source-1")
+            .await
+            .unwrap();
+
+        let target = server::Entity::find_by_id("target-1")
+            .one(&db)
+            .await
+            .unwrap()
+            .unwrap();
+        assert_eq!(target.cpu_name.as_deref(), Some("Ryzen"));
+        assert_eq!(target.protocol_version, 4);
+        assert_eq!(target.features, r#"["docker","process"]"#);
+        assert_eq!(target.last_remote_addr.as_deref(), Some("192.0.2.10:9527"));
+        assert_eq!(target.fingerprint.as_deref(), Some("fingerprint-123"));
+
+        let source_tags = server_tag::Entity::find()
+            .filter(server_tag::Column::ServerId.eq("source-1"))
+            .all(&db)
+            .await
+            .unwrap();
+        assert!(source_tags.is_empty());
+    }
+}
diff --git a/crates/server/src/service/traffic.rs b/crates/server/src/service/traffic.rs
index 1ee447a9..f0ccf0de 100644
--- a/crates/server/src/service/traffic.rs
+++ b/crates/server/src/service/traffic.rs
@@ -1,7 +1,7 @@
 use std::collections::HashMap;
 
 use chrono::{Datelike, Duration, NaiveDate, SecondsFormat, Utc};
-use sea_orm::{ConnectionTrait, DatabaseConnection, EntityTrait, Statement};
+use sea_orm::{ConnectionTrait, DatabaseConnection, DatabaseTransaction, EntityTrait, Statement};
 use serde::Serialize;
 
 use crate::entity::{server, traffic_state};
@@ -10,6 +10,105 @@ use crate::error::AppError;
 pub struct TrafficService;
 
 impl TrafficService {
+    pub async fn merge_recovered_server_history(
+        db: &DatabaseConnection,
+        target_server_id: &str,
+        source_server_id: &str,
+    ) -> Result<(), AppError> {
+        Self::merge_recovered_server_history_on_connection(db, target_server_id, source_server_id)
+            .await
+    }
+
+    pub async fn merge_recovered_server_history_on_txn(
+        txn: &DatabaseTransaction,
+        target_server_id: &str,
+        source_server_id: &str,
+    ) -> Result<(), AppError> {
+        Self::merge_recovered_server_history_on_connection(txn, target_server_id, source_server_id)
+            .await
+    }
+
+    pub(crate) async fn merge_recovered_server_history_on_connection<C>(
+        db: &C,
+        target_server_id: &str,
+        source_server_id: &str,
+    ) -> Result<(), AppError>
+    where
+        C: ConnectionTrait,
+    {
+        Self::replace_unique_key_table_server_id_on_connection(
+            db,
+            "traffic_hourly",
+            &["hour"],
+            target_server_id,
+            source_server_id,
+        )
+        .await?;
+        Self::replace_unique_key_table_server_id_on_connection(
+            db,
+            "traffic_daily",
+            &["date"],
+            target_server_id,
+            source_server_id,
+        )
+        .await?;
+        Self::replace_unique_key_table_server_id_on_connection(
+            db,
+            "traffic_state",
+            &[],
+            target_server_id,
+            source_server_id,
+        )
+        .await?;
+
+        Ok(())
+    }
+
+    pub(crate) async fn replace_unique_key_table_server_id_on_connection<C>(
+        db: &C,
+        table: &str,
+        key_columns: &[&str],
+        target_server_id: &str,
+        source_server_id: &str,
+    ) -> Result<(), AppError>
+    where
+        C: ConnectionTrait,
+    {
+        // An empty key set means the table holds one row per server (e.g.
+        // traffic_state), so every source row conflicts with its target counterpart.
+        let join_predicate = if key_columns.is_empty() {
+            "1 = 1".to_string()
+        } else {
+            key_columns
+                .iter()
+                .map(|column| format!("source.{column} = target.{column}"))
+                .collect::<Vec<_>>()
+                .join(" AND ")
+        };
+
+        db.execute(Statement::from_sql_and_values(
+            db.get_database_backend(),
+            format!(
+                "DELETE FROM {table} AS target \
+                 WHERE target.server_id = $1 \
+                 AND EXISTS ( \
+                     SELECT 1 FROM {table} AS source \
+                     WHERE source.server_id = $2 \
+                     AND {join_predicate} \
+                 )"
+            ),
+            [target_server_id.into(), source_server_id.into()],
+        ))
+        .await?;
+
+        db.execute(Statement::from_sql_and_values(
+            db.get_database_backend(),
+            format!("UPDATE {table} SET server_id = $1 WHERE server_id = $2"),
+            [target_server_id.into(), source_server_id.into()],
+        ))
+        .await?;
+
+        Ok(())
+    }
+
     /// Upsert a traffic_hourly row, accumulating bytes_in/bytes_out on conflict.
     pub async fn upsert_hourly(
         db: &DatabaseConnection,
diff --git a/crates/server/src/state.rs b/crates/server/src/state.rs
index 5734c299..ec36e382 100644
--- a/crates/server/src/state.rs
+++ b/crates/server/src/state.rs
@@ -16,6 +16,7 @@ use crate::service::geoip::GeoIpService;
 use crate::service::high_risk_audit::{
     DockerLogsAuditContext, ExecAuditContext, TerminalAuditContext,
 };
+use crate::service::recovery_lock::RecoveryLockService;
 use crate::service::task_scheduler::TaskScheduler;
 use crate::service::upgrade_release::UpgradeReleaseService;
 use crate::service::upgrade_tracker::UpgradeJobTracker;
@@ -63,6 +64,8 @@ pub struct AppState {
     pub task_scheduler: Arc<TaskScheduler>,
     /// Shared alert state manager for dedup across poll-based and event-driven evaluation.
     pub alert_state_manager: AlertStateManager,
+    /// In-memory freeze gate for agent-originated writes during recovery.
+    pub recovery_lock: RecoveryLockService,
     /// Pending mobile pairing codes for QR login, keyed by code.
     pub pending_pairs: DashMap,
     /// Terminal session audit contexts keyed by session_id.
@@ -180,6 +183,7 @@ impl AppState {
             docker_viewers: DockerViewerTracker::new(),
             task_scheduler,
             alert_state_manager,
+            recovery_lock: RecoveryLockService::new(),
             pending_pairs: DashMap::new(),
             terminal_audit_contexts: DashMap::new(),
             docker_logs_audit_contexts: DashMap::new(),
diff --git a/crates/server/src/task/record_writer.rs b/crates/server/src/task/record_writer.rs
index 77df3846..ce072369 100644
--- a/crates/server/src/task/record_writer.rs
+++ b/crates/server/src/task/record_writer.rs
@@ -43,11 +43,17 @@ pub async fn run(state: Arc<AppState>) {
         let mut count = 0;
 
         for (server_id, report) in &reports {
+            let writes_allowed = state.recovery_lock.writes_allowed_for(server_id);
+
             // Save metrics record
-            if let Err(e) = RecordService::save_report(&state.db, server_id, report).await {
-                tracing::error!("Failed to save record for {server_id}: {e}");
+            if writes_allowed {
+                if let Err(e) = RecordService::save_report(&state.db, server_id, report).await {
+                    tracing::error!("Failed to save record for {server_id}: {e}");
+                } else {
+                    count += 1;
+                }
             } else {
-                count += 1;
+                tracing::info!("Skipping recovery-frozen record write for {server_id}");
             }
 
             // Compute traffic delta
@@ -65,10 +71,19 @@ pub async fn run(state: Arc<AppState>) {
             } else {
                 // First observation: no previous state, skip delta (just record state)
                 transfer_cache.insert(server_id.clone(), (curr_in, curr_out));
-                if let Err(e) =
-                    TrafficService::upsert_state(&state.db, server_id, curr_in, curr_out).await
-                {
-                    tracing::error!("Failed to upsert traffic state for {server_id}: {e}");
+                if writes_allowed {
+                    if let Err(e) = TrafficService::upsert_state(
+                        &state.db,
+                        server_id,
+                        curr_in,
+                        curr_out,
+                    )
+                    .await
+                    {
+                        tracing::error!("Failed to upsert traffic state for {server_id}: {e}");
+                    }
+                } else {
+                    tracing::info!("Skipping recovery-frozen traffic state write for {server_id}");
                 }
                 continue;
             };
@@ -77,19 +92,33 @@ pub async fn run(state: Arc<AppState>) {
             transfer_cache.insert(server_id.clone(), (curr_in, curr_out));
 
             // Only write if there's actual traffic
-            if (delta_in > 0 || delta_out > 0)
-                && let Err(e) =
-                    TrafficService::upsert_hourly(&state.db, server_id, hour, delta_in, delta_out)
-                        .await
-            {
-                tracing::error!("Failed to upsert traffic hourly for {server_id}: {e}");
+            if delta_in > 0 || delta_out > 0 {
+                if writes_allowed {
+                    if let Err(e) = TrafficService::upsert_hourly(
+                        &state.db,
+                        server_id,
+                        hour,
+                        delta_in,
+                        delta_out,
+                    )
+                    .await
+                    {
+                        tracing::error!("Failed to upsert traffic hourly for {server_id}: {e}");
+                    }
+                } else {
+                    tracing::info!("Skipping recovery-frozen traffic hourly write for {server_id}");
+                }
             }
 
             // Always update state
-            if let Err(e) =
-                TrafficService::upsert_state(&state.db, server_id, curr_in, curr_out).await
-            {
-                tracing::error!("Failed to upsert traffic state for {server_id}: {e}");
+            if writes_allowed {
+                if let Err(e) =
+                    TrafficService::upsert_state(&state.db, server_id, curr_in, curr_out).await
+                {
+                    tracing::error!("Failed to upsert traffic state for {server_id}: {e}");
+                }
+            } else {
+                tracing::info!("Skipping recovery-frozen traffic state write for {server_id}");
             }
         }
diff --git a/crates/server/tests/integration.rs b/crates/server/tests/integration.rs
index 6cb711d7..8f3f2854 100644
--- a/crates/server/tests/integration.rs
+++ b/crates/server/tests/integration.rs
@@ -3779,3 +3779,294 @@ async fn test_security_headers_present() {
         Some("none"),
     );
 }
+
+#[tokio::test]
+async fn test_recovery_candidates_rejects_online_or_busy_target() {
+    let (base_url, _tmp) = start_test_server().await;
+    let admin_client =
http_client(); + login_admin(&admin_client, &base_url).await; + + let (online_target_id, online_target_token) = register_agent(&admin_client, &base_url).await; + let (busy_target_id, _busy_target_token) = register_agent(&admin_client, &base_url).await; + let (source_id, source_token) = register_agent(&admin_client, &base_url).await; + + let (_sink, mut reader) = connect_agent(&base_url, &online_target_token).await; + let _welcome = recv_agent_text(&mut reader).await; + + let online_resp = admin_client + .get(format!( + "{}/api/servers/{}/recovery-candidates", + base_url, online_target_id + )) + .send() + .await + .expect("online target recovery candidates request failed"); + assert_eq!(online_resp.status(), 409); + let online_body: serde_json::Value = online_resp.json().await.unwrap(); + assert!( + online_body["error"]["message"] + .as_str() + .expect("error message should be a string") + .contains("must be offline") + ); + + let (mut source_sink, mut source_reader) = connect_agent(&base_url, &source_token).await; + let _source_welcome = recv_agent_text(&mut source_reader).await; + send_system_info(&mut source_sink, &mut source_reader, "recovery-busy-source-info", None).await; + + let start_resp = admin_client + .post(format!("{}/api/servers/{}/recover-merge", base_url, busy_target_id)) + .json(&json!({ "source_server_id": source_id })) + .send() + .await + .expect("start recovery request failed"); + assert_eq!(start_resp.status(), 200); + + let busy_resp = admin_client + .get(format!( + "{}/api/servers/{}/recovery-candidates", + base_url, busy_target_id + )) + .send() + .await + .expect("busy target recovery candidates request failed"); + assert_eq!(busy_resp.status(), 409); + let busy_body: serde_json::Value = busy_resp.json().await.unwrap(); + assert!( + busy_body["error"]["message"] + .as_str() + .expect("error message should be a string") + .contains("running recovery job") + ); +} + +#[tokio::test] +async fn test_recovery_candidates_requires_admin_and_filters_online_sources() { + let (base_url, _tmp) = start_test_server().await; + let auth_client = http_client(); + + let (target_id, _target_token) = register_agent(&auth_client, &base_url).await; + let (online_source_id, online_source_token) = register_agent(&auth_client, &base_url).await; + let (offline_source_id, _offline_source_token) = register_agent(&auth_client, &base_url).await; + login_admin(&auth_client, &base_url).await; + + let create_resp = auth_client + .post(format!("{}/api/users", base_url)) + .json(&json!({ + "username": "recoverymember", + "password": "memberpass123", + "role": "member" + })) + .send() + .await + .expect("POST /api/users failed"); + assert_eq!(create_resp.status(), 200); + + let member_client = http_client(); + let member_login = member_client + .post(format!("{}/api/auth/login", base_url)) + .json(&json!({ + "username": "recoverymember", + "password": "memberpass123" + })) + .send() + .await + .expect("member login failed"); + assert_eq!(member_login.status(), 200); + + let plain_client = reqwest::Client::new(); + let unauth_resp = plain_client + .get(format!( + "{}/api/servers/{}/recovery-candidates", + base_url, target_id + )) + .send() + .await + .expect("unauthenticated recovery candidates request failed"); + assert_eq!(unauth_resp.status(), 401); + + let member_resp = member_client + .get(format!( + "{}/api/servers/{}/recovery-candidates", + base_url, target_id + )) + .send() + .await + .expect("member recovery candidates request failed"); + assert_eq!(member_resp.status(), 403); + + let (_sink, 
mut reader) = connect_agent(&base_url, &online_source_token).await; + let _welcome = recv_agent_text(&mut reader).await; + + let resp = auth_client + .get(format!( + "{}/api/servers/{}/recovery-candidates", + base_url, target_id + )) + .send() + .await + .expect("GET recovery candidates failed"); + + assert_eq!(resp.status(), 200); + let body: serde_json::Value = resp.json().await.unwrap(); + let candidates = body["data"].as_array().expect("data should be an array"); + + assert_eq!(candidates.len(), 1); + assert_eq!(candidates[0]["server_id"], online_source_id); + assert_ne!(candidates[0]["server_id"], target_id); + assert!( + !candidates + .iter() + .any(|candidate| candidate["server_id"] == offline_source_id) + ); +} + +#[tokio::test] +async fn test_recovery_merge_start_requires_admin_and_validates_source_state() { + let (base_url, _tmp) = start_test_server().await; + let admin_client = http_client(); + login_admin(&admin_client, &base_url).await; + + let create_resp = admin_client + .post(format!("{}/api/users", base_url)) + .json(&json!({ + "username": "recoverymember", + "password": "memberpass123", + "role": "member" + })) + .send() + .await + .expect("POST /api/users failed"); + assert_eq!(create_resp.status(), 200); + + let member_client = http_client(); + let member_login = member_client + .post(format!("{}/api/auth/login", base_url)) + .json(&json!({ + "username": "recoverymember", + "password": "memberpass123" + })) + .send() + .await + .expect("member login failed"); + assert_eq!(member_login.status(), 200); + + let (target_id, _target_token) = register_agent(&admin_client, &base_url).await; + let (offline_source_id, _offline_source_token) = register_agent(&admin_client, &base_url).await; + + let member_resp = member_client + .post(format!( + "{}/api/servers/{}/recover-merge", + base_url, target_id + )) + .json(&json!({ "source_server_id": offline_source_id })) + .send() + .await + .expect("member recover-merge request failed"); + assert_eq!(member_resp.status(), 403); + + let admin_resp = admin_client + .post(format!( + "{}/api/servers/{}/recover-merge", + base_url, target_id + )) + .json(&json!({ "source_server_id": offline_source_id })) + .send() + .await + .expect("admin recover-merge validation request failed"); + assert_eq!(admin_resp.status(), 409); + + let admin_body: serde_json::Value = admin_resp.json().await.unwrap(); + assert!( + admin_body["error"]["message"] + .as_str() + .expect("error message should be a string") + .contains("Source server must be online") + ); +} + +#[tokio::test] +async fn test_recovery_job_get_requires_admin_and_start_creates_job() { + let (base_url, _tmp) = start_test_server().await; + let auth_client = http_client(); + login_admin(&auth_client, &base_url).await; + + let create_resp = auth_client + .post(format!("{}/api/users", base_url)) + .json(&json!({ + "username": "recoverymember", + "password": "memberpass123", + "role": "member" + })) + .send() + .await + .expect("POST /api/users failed"); + assert_eq!(create_resp.status(), 200); + + let member_client = http_client(); + let member_login = member_client + .post(format!("{}/api/auth/login", base_url)) + .json(&json!({ + "username": "recoverymember", + "password": "memberpass123" + })) + .send() + .await + .expect("member login failed"); + assert_eq!(member_login.status(), 200); + + let (target_id, _target_token) = register_agent(&auth_client, &base_url).await; + let (source_id, source_token) = register_agent(&auth_client, &base_url).await; + let (mut sink, mut reader) = 
connect_agent(&base_url, &source_token).await; + let _welcome = recv_agent_text(&mut reader).await; + send_system_info(&mut sink, &mut reader, "recovery-source-info", None).await; + + let start_resp = auth_client + .post(format!( + "{}/api/servers/{}/recover-merge", + base_url, target_id + )) + .json(&json!({ "source_server_id": source_id })) + .send() + .await + .expect("start recovery request failed"); + assert_eq!(start_resp.status(), 200); + + let start_body: serde_json::Value = start_resp.json().await.unwrap(); + let job_id = start_body["data"]["job_id"] + .as_str() + .expect("job_id missing") + .to_string(); + assert_eq!(start_body["data"]["status"], "running"); + assert_eq!(start_body["data"]["stage"], "rebinding"); + + let plain_client = reqwest::Client::new(); + let unauth_resp = plain_client + .get(format!("{}/api/servers/recovery-jobs/{}", base_url, job_id)) + .send() + .await + .expect("unauthenticated recovery job request failed"); + assert_eq!(unauth_resp.status(), 401); + + let member_resp = member_client + .get(format!("{}/api/servers/recovery-jobs/{}", base_url, job_id)) + .send() + .await + .expect("member recovery job request failed"); + assert_eq!(member_resp.status(), 403); + + let get_resp = auth_client + .get(format!("{}/api/servers/recovery-jobs/{}", base_url, job_id)) + .send() + .await + .expect("authenticated recovery job request failed"); + assert_eq!(get_resp.status(), 200); + + let get_body: serde_json::Value = get_resp.json().await.unwrap(); + assert_eq!(get_body["data"]["job_id"], job_id); + assert_eq!(get_body["data"]["target_server_id"], target_id); + assert_eq!(get_body["data"]["source_server_id"], source_id); + assert!(get_body["data"].get("checkpoint_json").is_none()); + assert_eq!(get_body["data"]["status"], "running"); + assert_eq!(get_body["data"]["stage"], "rebinding"); +} diff --git a/docs/superpowers/plans/2026-04-16-agent-recovery-merge.md b/docs/superpowers/plans/2026-04-16-agent-recovery-merge.md new file mode 100644 index 00000000..6d616c60 --- /dev/null +++ b/docs/superpowers/plans/2026-04-16-agent-recovery-merge.md @@ -0,0 +1,925 @@ +# Agent Recovery Merge Implementation Plan + +> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking. + +**Goal:** Add an admin-driven recovery workflow that rebinds a newly registered replacement agent onto an existing offline server record, merges the replacement record's history into the original record, rewrites shared references, and deletes the temporary record. + +**Architecture:** The implementation is split into four vertical slices: protocol and atomic agent rebind support, persistent server-side recovery jobs with write freezing, table-aware history merge logic, and a server-detail UI for candidate selection and progress. The recovery flow keeps the original `server_id`, persists job state in SQLite for restart-safe retries, and uses bounded merge transactions plus checkpointed progress rather than one giant transaction. + +**Tech Stack:** Rust (`axum`, `sea-orm`, `tokio`, SQLite), React 19, TanStack Query, TanStack Router, Zustand, Vitest, OpenAPI-generated web types + +--- + +## File Map + +### Backend Rust + +- Create: `crates/server/src/entity/recovery_job.rs` + Stores the persistent recovery job row. +- Modify: `crates/server/src/entity/mod.rs` + Register the new entity module. 
+- Create: `crates/server/src/migration/m20260416_000017_create_recovery_job.rs` + Creates `recovery_job` table and indexes. +- Modify: `crates/server/src/migration/mod.rs` + Registers the new migration. +- Create: `crates/server/src/service/recovery_job.rs` + DB-backed repository/service for creating, updating, resuming, and checkpointing jobs. +- Create: `crates/server/src/service/recovery_lock.rs` + In-memory write-freeze guard keyed by `server_id`. +- Create: `crates/server/src/service/recovery_merge.rs` + Orchestrates rebind, merge groups, JSON rewrites, cleanup, and retry semantics. +- Modify: `crates/server/src/service/mod.rs` + Exposes the new services. +- Modify: `crates/server/src/state.rs` + Wires persistent recovery services and write-freeze guard into `AppState`. +- Create: `crates/server/src/router/api/server_recovery.rs` + Read/write endpoints for candidates, start job, and get job state. +- Modify: `crates/server/src/router/api/mod.rs` + Mounts the new router. +- Modify: `crates/server/src/openapi.rs` + Registers new endpoints and DTOs. +- Modify: `crates/server/src/router/ws/agent.rs` + Handles `RebindIdentityAck`/`Failed`, recovery-aware write gating, and rebind orchestration callbacks. +- Modify: `crates/server/src/router/ws/browser.rs` + Includes recovery jobs in browser `FullSync` and live updates. +- Modify: `crates/server/src/task/record_writer.rs` + Honors recovery write freezes. +- Modify: `crates/server/src/service/traffic.rs` + Adds helper(s) needed by merge/finalization and respects recovery lock where state is updated. +- Modify: `crates/server/tests/integration.rs` + Adds API + end-to-end recovery integration coverage. +- Modify: `crates/server/src/service/recovery_merge.rs` + Include focused DB tests using `setup_test_db`. + +### Shared Protocol + +- Modify: `crates/common/src/protocol.rs` + Adds recovery DTOs and WebSocket messages used by agent/browser/server. + +### Agent + +- Create: `crates/agent/src/rebind.rs` + Atomic token persistence helper and rebind message handling. +- Modify: `crates/agent/src/main.rs` + Registers the new module. +- Modify: `crates/agent/src/reporter.rs` + Handles `ServerMessage::RebindIdentity` and emits ack/failure messages. + +### Web + +- Modify: `apps/web/src/lib/api-schema.ts` + Re-export recovery candidate/job schemas after OpenAPI regeneration. +- Modify: `apps/web/src/hooks/use-api.ts` + Adds candidate, start-job, and job polling helpers. +- Modify: `apps/web/src/hooks/use-api.test.tsx` + Covers the new API helpers. +- Create: `apps/web/src/stores/recovery-jobs-store.ts` + Holds live recovery job state keyed by `target_server_id` and `job_id`. +- Create: `apps/web/src/stores/recovery-jobs-store.test.ts` + Covers store set/update/clear behavior. +- Modify: `apps/web/src/hooks/use-servers-ws.ts` + Hydrates recovery jobs from `full_sync` and incremental events. +- Modify: `apps/web/src/hooks/use-servers-ws.test.ts` + Covers WS hydration and updates for recovery jobs. +- Create: `apps/web/src/components/server/recovery-merge-dialog.tsx` + Candidate picker + confirmation flow on the server detail page. +- Create: `apps/web/src/components/server/recovery-merge-dialog.test.tsx` + Covers ranking display, confirmation copy, pending/error UI. +- Modify: `apps/web/src/routes/_authed/servers/$id.tsx` + Adds action button, dialog integration, and job status rendering. +- Modify: `apps/web/src/routes/_authed/servers/$id.test.tsx` + Covers button visibility and job state rendering. 
+- Modify: `apps/web/src/locales/en/servers.json` + New copy for recovery UI. +- Modify: `apps/web/src/locales/zh/servers.json` + New copy for recovery UI. + +### Docs + +- Modify: `apps/docs/content/docs/cn/server.mdx` + Document the admin recovery flow and its limits. +- Modify: `apps/docs/content/docs/en/server.mdx` + Same in English. +- Modify: `apps/docs/content/docs/cn/api-reference.mdx` + Add recovery endpoints. +- Modify: `apps/docs/content/docs/en/api-reference.mdx` + Add recovery endpoints. + +--- + +### Task 1: Add Recovery Protocol and Atomic Agent Token Rebind Support + +**Files:** +- Create: `crates/agent/src/rebind.rs` +- Modify: `crates/agent/src/main.rs` +- Modify: `crates/agent/src/reporter.rs` +- Modify: `crates/common/src/protocol.rs` + +- [ ] **Step 1: Write failing agent tests for atomic token replacement** + +```rust +// crates/agent/src/rebind.rs +#[cfg(test)] +mod tests { + use super::persist_rebind_token; + + #[test] + fn persist_rebind_token_replaces_existing_token_line_atomically() { + let dir = tempfile::tempdir().unwrap(); + let path = dir.path().join("agent.toml"); + std::fs::write(&path, "server_url = \"http://127.0.0.1:9527\"\ntoken = \"old\"\n").unwrap(); + + persist_rebind_token(&path, "new-token").unwrap(); + + let content = std::fs::read_to_string(&path).unwrap(); + assert!(content.contains("token = \"new-token\"")); + assert!(!content.contains("token = \"old\"")); + } + + #[test] + fn persist_rebind_token_preserves_non_token_lines() { + let dir = tempfile::tempdir().unwrap(); + let path = dir.path().join("agent.toml"); + std::fs::write(&path, "server_url = \"https://monitor.example.com\"\n[collector]\ninterval = 3\n").unwrap(); + + persist_rebind_token(&path, "fresh-token").unwrap(); + + let content = std::fs::read_to_string(&path).unwrap(); + assert!(content.contains("server_url = \"https://monitor.example.com\"")); + assert!(content.contains("[collector]")); + assert!(content.contains("interval = 3")); + assert!(content.contains("token = \"fresh-token\"")); + } +} +``` + +- [ ] **Step 2: Run the agent tests and verify they fail** + +Run: `cargo test -p serverbee-agent persist_rebind_token -- --exact` + +Expected: FAIL with unresolved import or missing `persist_rebind_token`. + +- [ ] **Step 3: Implement the atomic token writer and wire the module** + +```rust +// crates/agent/src/rebind.rs +pub fn persist_rebind_token(path: &std::path::Path, token: &str) -> anyhow::Result<()> { + let content = if path.exists() { std::fs::read_to_string(path)? 
+    } else {
+        String::new()
+    };
+    let token_line = format!("token = \"{token}\"");
+    let had_trailing_newline = content.ends_with('\n');
+    let mut lines: Vec<String> = content.lines().map(str::to_owned).collect();
+    let insert_pos = lines.iter().position(|line| line.trim_start().starts_with('[')).unwrap_or(lines.len());
+    if let Some(pos) = lines[..insert_pos].iter().position(|line| line.trim_start().starts_with("token")) {
+        lines[pos] = token_line;
+    } else {
+        lines.insert(insert_pos, token_line);
+    }
+
+    let mut rendered = lines.join("\n");
+    if had_trailing_newline {
+        rendered.push('\n');
+    }
+
+    let parent = path.parent().unwrap_or_else(|| std::path::Path::new("."));
+    let temp_path = parent.join(format!(".agent.toml.rebind.{}.tmp", uuid::Uuid::new_v4()));
+    let mut temp_file = std::fs::OpenOptions::new()
+        .create_new(true)
+        .write(true)
+        .open(&temp_path)?;
+    use std::io::Write;
+    temp_file.write_all(rendered.as_bytes())?;
+    temp_file.sync_all()?;
+    std::fs::rename(&temp_path, path)?;
+    let dir_file = std::fs::File::open(parent)?;
+    dir_file.sync_all()?;
+    Ok(())
+}
+
+// crates/agent/src/main.rs
+mod rebind;
+```
+
+- [ ] **Step 4: Extend the shared protocol and reporter rebind handling**
+
+```rust
+// crates/common/src/protocol.rs
+ServerMessage::RebindIdentity {
+    job_id: String,
+    target_server_id: String,
+    token: String,
+}
+
+AgentMessage::RebindIdentityAck {
+    job_id: String,
+}
+
+AgentMessage::RebindIdentityFailed {
+    job_id: String,
+    error: String,
+}
+
+// crates/agent/src/reporter.rs
+ServerMessage::RebindIdentity { job_id, token, .. } => {
+    match crate::rebind::persist_rebind_token(std::path::Path::new(crate::config::AgentConfig::config_path()), &token) {
+        Ok(()) => {
+            self.config.token = token;
+            let ack = AgentMessage::RebindIdentityAck { job_id };
+            let json = serde_json::to_string(&ack)?;
+            write.send(Message::Text(json.into())).await?;
+            write.send(Message::Close(None)).await?;
+            return Ok(());
+        }
+        Err(err) => {
+            let failed = AgentMessage::RebindIdentityFailed { job_id, error: err.to_string() };
+            let json = serde_json::to_string(&failed)?;
+            write.send(Message::Text(json.into())).await?;
+            return Ok(());
+        }
+    }
+}
+```
+
+- [ ] **Step 5: Run the focused agent tests and commit**
+
+Run: `cargo test -p serverbee-agent persist_rebind_token`
+
+Expected: PASS
+
+Commit:
+
+```bash
+git add crates/common/src/protocol.rs crates/agent/src/main.rs crates/agent/src/rebind.rs crates/agent/src/reporter.rs
+git commit -m "feat(agent): add atomic recovery token rebind support"
+```
+
+### Task 2: Add Persistent Recovery Job Schema and Repository
+
+**Files:**
+- Create: `crates/server/src/entity/recovery_job.rs`
+- Modify: `crates/server/src/entity/mod.rs`
+- Create: `crates/server/src/migration/m20260416_000017_create_recovery_job.rs`
+- Modify: `crates/server/src/migration/mod.rs`
+- Create: `crates/server/src/service/recovery_job.rs`
+- Modify: `crates/server/src/service/mod.rs`
+
+- [ ] **Step 1: Write failing DB-backed service tests**
+
+```rust
+// crates/server/src/service/recovery_job.rs
+#[cfg(test)]
+mod tests {
+    use super::RecoveryJobService;
+    use crate::test_utils::setup_test_db;
+
+    #[tokio::test]
+    async fn create_job_persists_running_row() {
+        let (db, _tmp) = setup_test_db().await;
+
+        let job = RecoveryJobService::create_job(&db, "target-1", "source-1").await.unwrap();
+
+        assert_eq!(job.target_server_id, "target-1");
+        assert_eq!(job.source_server_id, "source-1");
+        assert_eq!(job.status, "running");
+        assert_eq!(job.stage, "validating");
+    }
+
+    #[tokio::test]
+    async fn update_checkpoint_round_trips() {
+        let (db, _tmp) = setup_test_db().await;
+        let job = RecoveryJobService::create_job(&db, "target-1", "source-1").await.unwrap();
+
+        RecoveryJobService::update_stage(&db, &job.job_id, "merging_history", Some("{\"group\":2}"), None)
+            .await
+            .unwrap();
+
+        let loaded = RecoveryJobService::get_job(&db, &job.job_id).await.unwrap().unwrap();
+        assert_eq!(loaded.stage, "merging_history");
+        assert_eq!(loaded.checkpoint_json.as_deref(), Some("{\"group\":2}"));
+    }
+}
+```
+
+- [ ] **Step 2: Run the focused server tests and verify they fail**
+
+Run: `cargo test -p serverbee-server recovery_job -- --nocapture`
+
+Expected: FAIL with missing entity/service definitions.
+
+- [ ] **Step 3: Implement the entity, migration, and repository**
+
+```rust
+// crates/server/src/entity/recovery_job.rs
+use sea_orm::entity::prelude::*;
+
+#[derive(Clone, Debug, PartialEq, DeriveEntityModel)]
+#[sea_orm(table_name = "recovery_job")]
+pub struct Model {
+    #[sea_orm(primary_key, auto_increment = false)]
+    pub job_id: String,
+    pub target_server_id: String,
+    pub source_server_id: String,
+    pub status: String,
+    pub stage: String,
+    pub checkpoint_json: Option<String>,
+    pub error: Option<String>,
+    pub started_at: DateTimeUtc,
+    pub created_at: DateTimeUtc,
+    pub updated_at: DateTimeUtc,
+    pub last_heartbeat_at: Option<DateTimeUtc>,
+}
+
+#[derive(Copy, Clone, Debug, EnumIter, DeriveRelation)]
+pub enum Relation {}
+
+impl ActiveModelBehavior for ActiveModel {}
+
+// crates/server/src/migration/m20260416_000017_create_recovery_job.rs
+db.execute_unprepared(
+    "CREATE TABLE recovery_job (
+        job_id TEXT PRIMARY KEY NOT NULL,
+        target_server_id TEXT NOT NULL,
+        source_server_id TEXT NOT NULL,
+        status TEXT NOT NULL,
+        stage TEXT NOT NULL,
+        checkpoint_json TEXT NULL,
+        error TEXT NULL,
+        started_at TEXT NOT NULL,
+        created_at TEXT NOT NULL,
+        updated_at TEXT NOT NULL,
+        last_heartbeat_at TEXT NULL
+    )"
+).await?;
+db.execute_unprepared("CREATE INDEX idx_recovery_job_target_status ON recovery_job(target_server_id, status)").await?;
+db.execute_unprepared("CREATE INDEX idx_recovery_job_source_status ON recovery_job(source_server_id, status)").await?;
+
+// crates/server/src/service/recovery_job.rs
+pub struct RecoveryJobService;
+```
+
+- [ ] **Step 4: Add repository methods used by the orchestration layer**
+
+```rust
+impl RecoveryJobService {
+    pub async fn create_job(db: &DatabaseConnection, target: &str, source: &str) -> Result<recovery_job::Model, AppError> { /* insert row */ }
+    pub async fn get_job(db: &DatabaseConnection, job_id: &str) -> Result<Option<recovery_job::Model>, AppError> { /* find by id */ }
+    pub async fn update_stage(
+        db: &DatabaseConnection,
+        job_id: &str,
+        stage: &str,
+        checkpoint_json: Option<&str>,
+        error: Option<&str>,
+    ) -> Result<(), AppError> { /* update row */ }
+    pub async fn mark_failed(db: &DatabaseConnection, job_id: &str, stage: &str, error: &str) -> Result<(), AppError> { /* update status */ }
+    pub async fn running_for_target(db: &DatabaseConnection, target: &str) -> Result<Option<recovery_job::Model>, AppError> { /* query by index */ }
+    pub async fn running_for_source(db: &DatabaseConnection, source: &str) -> Result<Option<recovery_job::Model>, AppError> { /* query by index */ }
+}
+```
+
+- [ ] **Step 5: Run the tests and commit**
+
+Run: `cargo test -p serverbee-server recovery_job -- --nocapture`
+
+Expected: PASS
+
+Commit:
+
+```bash
+git add crates/server/src/entity/mod.rs crates/server/src/entity/recovery_job.rs crates/server/src/migration/mod.rs crates/server/src/migration/m20260416_000017_create_recovery_job.rs crates/server/src/service/mod.rs crates/server/src/service/recovery_job.rs
+git commit -m "feat(server): persist recovery jobs in sqlite"
+```
+
+### Task 3: Add Recovery Candidate Scoring and Admin API Endpoints
+
+**Files:**
+- Create: `crates/server/src/router/api/server_recovery.rs`
+- Modify: `crates/server/src/router/api/mod.rs`
+- Modify: `crates/server/src/openapi.rs`
+- Modify: `crates/server/src/service/recovery_job.rs`
+- Modify: `crates/server/tests/integration.rs`
+
+- [ ] **Step 1: Write failing tests for candidate ranking and API validation**
+
+```rust
+// crates/server/src/router/api/server_recovery.rs
+#[cfg(test)]
+mod tests {
+    use super::{score_candidate, CandidateScoreInput};
+
+    #[test]
+    fn higher_score_when_ip_arch_and_created_at_match() {
+        let strong = score_candidate(CandidateScoreInput {
+            same_remote_addr: true,
+            same_cpu_arch: true,
+            same_os: true,
+            same_virtualization: true,
+            created_within_minutes: 10,
+            same_country: true,
+        });
+        let weak = score_candidate(CandidateScoreInput {
+            same_remote_addr: false,
+            same_cpu_arch: false,
+            same_os: true,
+            same_virtualization: false,
+            created_within_minutes: 240,
+            same_country: false,
+        });
+        assert!(strong > weak);
+    }
+}
+```
+
+- [ ] **Step 2: Run the targeted tests and verify failure**
+
+Run: `cargo test -p serverbee-server higher_score_when_ip_arch_and_created_at_match`
+
+Expected: FAIL because `server_recovery.rs` and `score_candidate` do not exist.
+
+- [ ] **Step 3: Implement DTOs, scoring, and read/write routes**
+
+```rust
+// crates/server/src/router/api/server_recovery.rs
+#[derive(Debug, Serialize, utoipa::ToSchema)]
+pub struct RecoveryCandidateResponse {
+    pub server_id: String,
+    pub name: String,
+    pub score: i32,
+    pub reasons: Vec<String>,
+}
+
+#[derive(Debug, Deserialize, utoipa::ToSchema)]
+pub struct StartRecoveryRequest {
+    pub source_server_id: String,
+}
+
+pub fn read_router() -> Router<Arc<AppState>> {
+    Router::new()
+        .route("/servers/{target_id}/recovery-candidates", get(list_candidates))
+        .route("/servers/recovery-jobs/{job_id}", get(get_recovery_job))
+}
+
+pub fn write_router() -> Router<Arc<AppState>> {
+    Router::new().route("/servers/{target_id}/recover-merge", post(start_recovery_merge))
+}
+```
+
+- [ ] **Step 4: Add integration coverage for admin auth and validation rules**
+
+```rust
+// crates/server/tests/integration.rs
+#[tokio::test]
+async fn test_recovery_candidates_requires_auth_and_filters_target() {
+    let (base_url, _tmp) = start_test_server().await;
+    let client = http_client();
+    login_admin(&client, &base_url).await;
+
+    let resp = client
+        .get(format!("{}/api/servers/target-1/recovery-candidates", base_url))
+        .send()
+        .await
+        .unwrap();
+
+    assert_eq!(resp.status(), 200);
+    let body: serde_json::Value = resp.json().await.unwrap();
+    assert!(body["data"].is_array());
+}
+```
+
+- [ ] **Step 5: Run focused tests and commit**
+
+Run: `cargo test -p serverbee-server recovery_candidates -- --nocapture`
+
+Expected: PASS
+
+Commit:
+
+```bash
+git add crates/server/src/router/api/mod.rs crates/server/src/router/api/server_recovery.rs crates/server/src/openapi.rs crates/server/tests/integration.rs
+git commit -m "feat(server): add recovery candidate and job api"
+```
+
+### Task 4: Add Recovery Locks and Route All Agent-Originated Writes Through Them
+
+**Files:**
+- Create: `crates/server/src/service/recovery_lock.rs`
+- Modify: `crates/server/src/state.rs`
+- Modify: `crates/server/src/router/ws/agent.rs`
+- Modify: `crates/server/src/task/record_writer.rs`
+- Modify: `crates/server/src/service/mod.rs`
+
+- [ ] **Step 1: Write failing unit tests for the lock guard**
+
+```rust
+// crates/server/src/service/recovery_lock.rs
+#[cfg(test)]
+mod tests {
+    use super::RecoveryLockService;
+
+    #[test]
+    fn locked_server_denies_writes_until_released() {
+        let locks = RecoveryLockService::new();
+        assert!(locks.writes_allowed_for("srv-1"));
+        locks.freeze("srv-1");
+        assert!(!locks.writes_allowed_for("srv-1"));
+        locks.release("srv-1");
+        assert!(locks.writes_allowed_for("srv-1"));
+    }
+}
+```
+
+- [ ] **Step 2: Run the guard test and verify failure**
+
+Run: `cargo test -p serverbee-server locked_server_denies_writes_until_released`
+
+Expected: FAIL because `RecoveryLockService` does not exist.
+
+- [ ] **Step 3: Implement the lock service and wire it into `AppState`**
+
+```rust
+// crates/server/src/service/recovery_lock.rs
+#[derive(Default)]
+pub struct RecoveryLockService {
+    frozen: dashmap::DashSet<String>,
+}
+
+impl RecoveryLockService {
+    pub fn new() -> Self { Self { frozen: dashmap::DashSet::new() } }
+    pub fn freeze(&self, server_id: &str) { self.frozen.insert(server_id.to_string()); }
+    pub fn release(&self, server_id: &str) { self.frozen.remove(server_id); }
+    pub fn writes_allowed_for(&self, server_id: &str) -> bool { !self.frozen.contains(server_id) }
+}
+
+// crates/server/src/state.rs
+pub recovery_lock: RecoveryLockService,
+```
+
+- [ ] **Step 4: Gate all write paths that can race with recovery**
+
+```rust
+// crates/server/src/router/ws/agent.rs
+if !state.recovery_lock.writes_allowed_for(server_id) {
+    tracing::info!("Skipping recovery-frozen ping/task/probe write for {server_id}");
+    return;
+}
+
+// crates/server/src/task/record_writer.rs
+if !state.recovery_lock.writes_allowed_for(server_id) {
+    continue;
+}
+```
+
+- [ ] **Step 5: Run focused tests and commit**
+
+Run: `cargo test -p serverbee-server locked_server_denies_writes_until_released`
+
+Expected: PASS
+
+Commit:
+
+```bash
+git add crates/server/src/service/mod.rs crates/server/src/service/recovery_lock.rs crates/server/src/state.rs crates/server/src/router/ws/agent.rs crates/server/src/task/record_writer.rs
+git commit -m "feat(server): add recovery write freeze guards"
+```
+
+### Task 5: Implement the Rebind Orchestrator and Recovery Job Lifecycle
+
+**Files:**
+- Create: `crates/server/src/service/recovery_merge.rs`
+- Modify: `crates/server/src/service/mod.rs`
+- Modify: `crates/server/src/router/api/server_recovery.rs`
+- Modify: `crates/server/src/router/ws/agent.rs`
+- Modify: `crates/server/src/router/ws/browser.rs`
+- Modify: `crates/common/src/protocol.rs`
+- Modify: `apps/web/src/hooks/use-servers-ws.ts` (for later WS payload shape)
+
+- [ ] **Step 1: Write failing service tests for pre-rebind vs post-rebind retry semantics**
+
+```rust
+// crates/server/src/service/recovery_merge.rs
+#[cfg(test)]
+mod tests {
+    use super::{RecoveryFailureMode, retry_strategy_for};
+
+    #[test]
+    fn pre_rebind_failures_require_new_job() {
+        assert_eq!(retry_strategy_for(RecoveryFailureMode::AwaitingTargetOnlineTimeout), "new_job");
+    }
+
+    #[test]
+    fn post_rebind_failures_resume_same_job() {
+        assert_eq!(retry_strategy_for(RecoveryFailureMode::MergeGroupFailed), "resume_same_job");
+    }
+}
+```
+
+- [ ] **Step 2: Run the lifecycle tests and verify failure**
+
+Run: `cargo test -p serverbee-server pre_rebind_failures_require_new_job`
+
+Expected: FAIL because `recovery_merge.rs` does not exist.
+
+- [ ] **Step 3: Implement orchestration entry points and persisted stage transitions**
+
+```rust
+// crates/server/src/service/recovery_merge.rs
+pub struct RecoveryMergeService;
+
+impl RecoveryMergeService {
+    pub async fn start(
+        state: &Arc<AppState>,
+        target_server_id: &str,
+        source_server_id: &str,
+    ) -> Result<recovery_job::Model, AppError> {
+        let job = RecoveryJobService::create_job(&state.db, target_server_id, source_server_id).await?;
+        RecoveryJobService::update_stage(&state.db, &job.job_id, "rebinding", None, None).await?;
+        Ok(job)
+    }
+
+    pub async fn handle_rebind_ack(state: &Arc<AppState>, job_id: &str) -> Result<(), AppError> {
+        RecoveryJobService::update_stage(&state.db, job_id, "awaiting_target_online", None, None).await?;
+        Ok(())
+    }
+}
+
+pub enum RecoveryFailureMode {
+    AwaitingTargetOnlineTimeout,
+    MergeGroupFailed,
+}
+
+pub fn retry_strategy_for(mode: RecoveryFailureMode) -> &'static str {
+    match mode {
+        RecoveryFailureMode::AwaitingTargetOnlineTimeout => "new_job",
+        RecoveryFailureMode::MergeGroupFailed => "resume_same_job",
+    }
+}
+```
+
+- [ ] **Step 4: Wire WS acknowledgements and browser progress fan-out**
+
+```rust
+// crates/server/src/router/ws/agent.rs
+AgentMessage::RebindIdentityAck { job_id } => {
+    if let Err(err) = RecoveryMergeService::handle_rebind_ack(state, &job_id).await {
+        tracing::error!("Failed to advance recovery job {job_id}: {err}");
+    }
+}
+
+// crates/server/src/router/ws/browser.rs
+BrowserMessage::FullSync {
+    servers,
+    upgrades: state.upgrade_tracker.snapshot(),
+    recoveries: state.recovery_merge.snapshot(),
+}
+```
+
+- [ ] **Step 5: Run lifecycle tests and commit**
+
+Run: `cargo test -p serverbee-server pre_rebind_failures_require_new_job post_rebind_failures_resume_same_job -- --nocapture`
+
+Expected: PASS
+
+Commit:
+
+```bash
+git add crates/common/src/protocol.rs crates/server/src/service/mod.rs crates/server/src/service/recovery_merge.rs crates/server/src/router/api/server_recovery.rs crates/server/src/router/ws/agent.rs crates/server/src/router/ws/browser.rs
+git commit -m "feat(server): orchestrate recovery rebind lifecycle"
+```
+
+### Task 6: Implement History Merge Groups, JSON Rewrite, and Final Cleanup
+
+**Files:**
+- Modify: `crates/server/src/service/recovery_merge.rs`
+- Modify: `crates/server/src/service/traffic.rs`
+- Modify: `crates/server/tests/integration.rs`
+
+- [ ] **Step 1: Write failing merge-engine tests for raw, unique-key, JSON, and alert-state semantics**
+
+```rust
+// crates/server/src/service/recovery_merge.rs
+#[tokio::test]
+async fn merge_raw_records_replaces_target_overlap_with_source() { /* seed overlapping rows; expect target window delete + source move */ }
+
+#[tokio::test]
+async fn merge_alert_state_keeps_target_when_rule_conflicts() { /* same rule on both sides; expect target row kept */ }
+
+#[tokio::test]
+async fn rewrite_server_ids_json_replaces_source_with_target_once() { /* ["target","source","source"] -> ["target"] */ }
+```
+
+- [ ] **Step 2: Run the merge-engine tests and verify failure**
+
+Run: `cargo test -p serverbee-server merge_raw_records_replaces_target_overlap_with_source`
+
+Expected: FAIL because merge helpers do not exist.
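+
+Before filling in Step 3, it may help to see the Category B "window replacement" shape concretely. The sketch below is illustrative rather than the final helper: it assumes raw SQL through sea-orm's `Statement` API, a SQLite backend, and text-encoded timestamps, and the caller is expected to run it inside the per-group transaction.
+
+```rust
+use sea_orm::{ConnectionTrait, DatabaseConnection, DbBackend, Statement};
+
+// Hypothetical sketch of the "source wins" window replacement for one raw table.
+async fn replace_overlap_window(
+    db: &DatabaseConnection,
+    table: &str,       // e.g. "ping_record"
+    time_column: &str, // e.g. "time"
+    target: &str,
+    source: &str,
+) -> Result<(), sea_orm::DbErr> {
+    // 1. Read the source time range.
+    let row = db
+        .query_one(Statement::from_sql_and_values(
+            DbBackend::Sqlite,
+            format!("SELECT MIN({time_column}) AS min_ts, MAX({time_column}) AS max_ts FROM {table} WHERE server_id = ?"),
+            [source.into()],
+        ))
+        .await?;
+    let Some(row) = row else { return Ok(()) };
+    let min_ts: Option<String> = row.try_get("", "min_ts")?;
+    let max_ts: Option<String> = row.try_get("", "max_ts")?;
+    let (Some(min_ts), Some(max_ts)) = (min_ts, max_ts) else { return Ok(()) };
+
+    // 2. Delete target rows inside the overlapping window (source wins).
+    db.execute(Statement::from_sql_and_values(
+        DbBackend::Sqlite,
+        format!("DELETE FROM {table} WHERE server_id = ? AND {time_column} BETWEEN ? AND ?"),
+        [target.into(), min_ts.into(), max_ts.into()],
+    ))
+    .await?;
+
+    // 3. Move every remaining source row onto the target identity.
+    db.execute(Statement::from_sql_and_values(
+        DbBackend::Sqlite,
+        format!("UPDATE {table} SET server_id = ? WHERE server_id = ?"),
+        [target.into(), source.into()],
+    ))
+    .await?;
+    Ok(())
+}
+```
+
+Run twice, this shape is a no-op on the second pass: the source rows have already moved, so the range query returns NULL and the function returns early. That property is what keeps group-level retries safe.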
+ +- [ ] **Step 3: Implement merge group helpers** + +```rust +impl RecoveryMergeService { + async fn merge_raw_table( + db: &DatabaseConnection, + table: &str, + time_column: &str, + target: &str, + source: &str, + ) -> Result<(), AppError> { /* delete target overlap; update source rows to target */ } + + async fn merge_alert_states(db: &DatabaseConnection, target: &str, source: &str) -> Result<(), AppError> { /* target wins */ } + + async fn rewrite_server_ids_json_tables(db: &DatabaseConnection, target: &str, source: &str) -> Result<(), AppError> { /* alert_rule/ping_task/task/service_monitor/maintenance/incident/status_page */ } +} +``` + +- [ ] **Step 4: Implement finalization rules and explicit source cleanup** + +```rust +impl RecoveryMergeService { + async fn finalize_target_server_row(db: &DatabaseConnection, target: &str, source: &server::Model) -> Result<(), AppError> { /* copy runtime fields */ } + + async fn delete_intentionally_unmerged_source_rows(db: &DatabaseConnection, source: &str) -> Result<(), AppError> { + server_tag::Entity::delete_many().filter(server_tag::Column::ServerId.eq(source)).exec(db).await?; + network_probe_config::Entity::delete_many().filter(network_probe_config::Column::ServerId.eq(source)).exec(db).await?; + Ok(()) + } +} +``` + +- [ ] **Step 5: Run merge-focused tests and commit** + +Run: `cargo test -p serverbee-server recovery_merge -- --nocapture` + +Expected: PASS + +Commit: + +```bash +git add crates/server/src/service/recovery_merge.rs crates/server/src/service/traffic.rs crates/server/tests/integration.rs +git commit -m "feat(server): merge recovered server history into target identity" +``` + +### Task 7: Add Browser Recovery Job State, Dialog UI, and Server Detail Controls + +**Files:** +- Modify: `apps/web/src/lib/api-schema.ts` +- Modify: `apps/web/src/hooks/use-api.ts` +- Modify: `apps/web/src/hooks/use-api.test.tsx` +- Create: `apps/web/src/stores/recovery-jobs-store.ts` +- Create: `apps/web/src/stores/recovery-jobs-store.test.ts` +- Modify: `apps/web/src/hooks/use-servers-ws.ts` +- Modify: `apps/web/src/hooks/use-servers-ws.test.ts` +- Create: `apps/web/src/components/server/recovery-merge-dialog.tsx` +- Create: `apps/web/src/components/server/recovery-merge-dialog.test.tsx` +- Modify: `apps/web/src/routes/_authed/servers/$id.tsx` +- Modify: `apps/web/src/routes/_authed/servers/$id.test.tsx` +- Modify: `apps/web/src/locales/en/servers.json` +- Modify: `apps/web/src/locales/zh/servers.json` + +- [ ] **Step 1: Write failing store and hook tests** + +```ts +// apps/web/src/stores/recovery-jobs-store.test.ts +it('stores recovery jobs keyed by target server id', () => { + useRecoveryJobsStore.getState().setJob('target-1', { + job_id: 'job-1', + target_server_id: 'target-1', + source_server_id: 'source-1', + status: 'running', + stage: 'rebinding' + }) + expect(useRecoveryJobsStore.getState().getJob('target-1')?.job_id).toBe('job-1') +}) + +// apps/web/src/hooks/use-api.test.tsx +it('fetches recovery candidates for a target server', async () => { + fetchMock.mockResponseOnce(JSON.stringify({ data: [{ server_id: 'source-1', score: 42, reasons: ['same IP'] }] })) + const result = await api.get('/api/servers/target-1/recovery-candidates') + expect(result[0].server_id).toBe('source-1') +}) +``` + +- [ ] **Step 2: Run the focused web tests and verify failure** + +Run: `bun --cwd apps/web run test -- src/stores/recovery-jobs-store.test.ts src/hooks/use-api.test.tsx` + +Expected: FAIL because the store and API helpers do not exist. 
+
+- [ ] **Step 3: Implement API helpers, store, and WS hydration**
+
+```ts
+// apps/web/src/hooks/use-api.ts
+export function useRecoveryCandidates(targetId: string, enabled = true) {
+  return useQuery({
+    queryKey: ['servers', targetId, 'recovery-candidates'],
+    queryFn: () => api.get(`/api/servers/${targetId}/recovery-candidates`),
+    enabled: enabled && !!targetId
+  })
+}
+
+export async function startRecoveryMerge(targetId: string, sourceServerId: string) {
+  return api.post(`/api/servers/${targetId}/recover-merge`, { source_server_id: sourceServerId })
+}
+
+// apps/web/src/stores/recovery-jobs-store.ts
+export const useRecoveryJobsStore = create<RecoveryJobsState>()((set, get) => ({ /* same pattern as upgrade-jobs-store */ }))
+```
+
+- [ ] **Step 4: Implement dialog and server detail integration**
+
+```tsx
+// apps/web/src/components/server/recovery-merge-dialog.tsx
+export function RecoveryMergeDialog({ targetServerId, open, onOpenChange }: Props) {
+  const { data: candidates } = useRecoveryCandidates(targetServerId, open)
+  const [selectedSourceId, setSelectedSourceId] = useState('')
+  const mutation = useMutation({
+    mutationFn: () => startRecoveryMerge(targetServerId, selectedSourceId)
+  })
+
+  return (
+    <Dialog open={open} onOpenChange={onOpenChange}>
+      <DialogContent>
+        <DialogTitle>{t('recovery_merge_title')}</DialogTitle>
+        {/* candidate list + reasons + confirmation copy */}
+      </DialogContent>
+    </Dialog>
+  )
+}
+
+// apps/web/src/routes/_authed/servers/$id.tsx
+{/* recoveryOpen state lives in the server detail component */}
+{!server.online && isAdmin ? (
+  <RecoveryMergeDialog targetServerId={server.server_id} open={recoveryOpen} onOpenChange={setRecoveryOpen} />
+) : null}
+```
+
+- [ ] **Step 5: Regenerate OpenAPI web types, run web tests, and commit**
+
+Run: `bun --cwd apps/web run generate:api-types`
+
+Expected: `src/lib/api-types.ts` updated without errors
+
+Run: `bun --cwd apps/web run test -- src/hooks/use-api.test.tsx src/hooks/use-servers-ws.test.ts src/components/server/recovery-merge-dialog.test.tsx src/routes/_authed/servers/$id.test.tsx`
+
+Expected: PASS
+
+Commit:
+
+```bash
+git add apps/web/src/lib/api-schema.ts apps/web/src/hooks/use-api.ts apps/web/src/hooks/use-api.test.tsx apps/web/src/stores/recovery-jobs-store.ts apps/web/src/stores/recovery-jobs-store.test.ts apps/web/src/hooks/use-servers-ws.ts apps/web/src/hooks/use-servers-ws.test.ts apps/web/src/components/server/recovery-merge-dialog.tsx apps/web/src/components/server/recovery-merge-dialog.test.tsx apps/web/src/routes/_authed/servers/\$id.tsx apps/web/src/routes/_authed/servers/\$id.test.tsx apps/web/src/locales/en/servers.json apps/web/src/locales/zh/servers.json apps/web/src/lib/api-types.ts
+git commit -m "feat(web): add server recovery merge workflow"
+```
+
+### Task 8: Update Docs and Run End-to-End Verification
+
+**Files:**
+- Modify: `apps/docs/content/docs/cn/server.mdx`
+- Modify: `apps/docs/content/docs/en/server.mdx`
+- Modify: `apps/docs/content/docs/cn/api-reference.mdx`
+- Modify: `apps/docs/content/docs/en/api-reference.mdx`
+
+- [ ] **Step 1: Write the documentation changes**
+
+```mdx
+## Recovering a Reinstalled Agent
+
+If an existing server was reinstalled and re-registered as a new temporary node:
+
+1. Open the original offline server.
+2. Click **Claim and Merge New Agent**.
+3. Select the recommended online replacement.
+4. Confirm the merge.
+
+The original server record is kept. The replacement record's overlapping history wins, and the temporary record is deleted after recovery completes.
+``` + +- [ ] **Step 2: Run the backend verification suite** + +Run: `cargo test -p serverbee-server recovery -- --nocapture` + +Expected: PASS for the recovery-specific tests added in `integration.rs` and `service/recovery_merge.rs` + +- [ ] **Step 3: Run the agent verification suite** + +Run: `cargo test -p serverbee-agent rebind -- --nocapture` + +Expected: PASS for the new atomic token persistence and rebind tests + +- [ ] **Step 4: Run web typecheck and lint** + +Run: `bun --cwd apps/web run typecheck` + +Expected: PASS + +Run: `bun x ultracite check apps/web/src/hooks/use-api.ts apps/web/src/hooks/use-servers-ws.ts apps/web/src/components/server/recovery-merge-dialog.tsx apps/web/src/routes/_authed/servers/\$id.tsx` + +Expected: PASS + +- [ ] **Step 5: Commit the docs and final verification sweep** + +```bash +git add apps/docs/content/docs/cn/server.mdx apps/docs/content/docs/en/server.mdx apps/docs/content/docs/cn/api-reference.mdx apps/docs/content/docs/en/api-reference.mdx +git commit -m "docs: add agent recovery merge guidance" +``` + +## Self-Review + +- Spec coverage: + - Recovery job persistence: Task 2 + - Agent atomic token rebind + ack semantics: Task 1 + - Candidate scoring and recovery APIs: Task 3 + - Write freeze: Task 4 + - Rebind orchestration and retry semantics: Task 5 + - History merge groups, JSON rewrites, and cleanup: Task 6 + - Browser progress and admin UI: Task 7 + - Docs and verification: Task 8 +- Placeholder scan: + - No `TODO`, `TBD`, or "handle appropriately" placeholders remain. + - Each code-changing task includes concrete snippets and commands. +- Type consistency: + - `RebindIdentity`, `RebindIdentityAck`, `RecoveryJobResponse`, and `RecoveryCandidateResponse` names are reused consistently across tasks. + - `target_server_id` and `source_server_id` naming is consistent across backend, protocol, and web tasks. diff --git a/docs/superpowers/specs/2026-04-16-agent-recovery-merge-design.md b/docs/superpowers/specs/2026-04-16-agent-recovery-merge-design.md new file mode 100644 index 00000000..d2e12ffd --- /dev/null +++ b/docs/superpowers/specs/2026-04-16-agent-recovery-merge-design.md @@ -0,0 +1,700 @@ +# Agent Recovery Merge Design + +**Date:** 2026-04-16 +**Status:** Draft +**Scope:** Admin-driven recovery of a reinstalled agent by merging a new temporary server record back into the original offline server record + +## Problem + +The current registration model can reuse a server row only when the agent still has a valid token or when the machine fingerprint remains stable. After a true system reinstall, the old token is often gone and the machine fingerprint may change, so the server creates a new temporary server record instead of reconnecting to the original one. + +That creates two operational problems: + +1. The original server record keeps the historical charts, alerts, tasks, billing metadata, and dashboard references, but it stays offline. +2. The newly registered server record becomes the live agent identity, but it starts with little or no history and is not the record users want to keep. + +The desired recovery flow is: + +- The admin starts from the old offline server record. +- The admin picks a newly registered online temporary server record. +- The system rebinds the live agent to the old server identity. +- The system merges the temporary server's history into the old server record. +- Overlapping time ranges prefer the temporary server's data. +- The temporary server record is deleted after recovery completes. 
+ +This is a targeted recovery flow only. It is not a general-purpose "merge any two servers" feature. + +## Goals + +- Preserve the original `server_id` as the long-term identity. +- Restore the live agent onto the original server record without requiring manual input on the agent. +- Merge historical data from the temporary server into the original server. +- Treat overlapping time ranges as `source wins`. +- Keep user-managed server configuration on the original record. +- Replace runtime system fields on the original record with the recovered agent's latest values. +- Automatically remove the temporary server record after successful recovery. +- Make the workflow explicit, auditable, and retryable. + +## Non-Goals + +- No attempt to fill monitoring gaps during the reinstall window. +- No support for arbitrary record-to-record merge in v1. +- No attempt to reverse the full workflow after the recovered agent has successfully rebound. +- No new permanent "installation identity" entity in v1. +- No merge behavior for data that is not keyed by `server_id` and is not semantically tied to one server record, such as `service_monitor_record`. +- Not designed for machine migration to materially different hardware. Candidate ranking heuristics assume the same logical host was reinstalled and re-registered. + +## User Workflow + +### Entry Point + +The recovery action appears only on a server detail page for a server that is currently offline. + +Button label: + +- `claim and merge new agent` + +The action is admin-only. + +### Candidate Selection + +The action opens a dialog showing candidate temporary server records. Candidates must satisfy all of the following: + +- currently online +- not equal to the target server +- not already participating in another recovery job + +Candidate ranking is recommendation-only. The admin must still explicitly confirm the selected source. + +There is no code-level `auto_registered` or `is_temporary` marker on `servers` in v1. "Temporary" is only a product description for the common case where a newly registered online source is the replacement agent after reinstall. The implementation therefore uses heuristics for ranking, not a hard temporary flag. + +Recommended ranking signals: + +- same or similar `last_remote_addr` +- matching `cpu_arch` +- matching `os` +- matching `virtualization` +- close `agent_version` +- close `created_at` +- `target` went offline before `source` was created +- matching `mem_total` +- matching `disk_total` +- matching `cpu_cores` +- matching `country_code` and `region` +- still has default-like metadata such as recent `created_at` and unchanged default `name` +- is not referenced, or is only lightly referenced, by shared `server_ids_json` configuration tables + +The dialog should show a short explanation for why a candidate was recommended. + +### Confirmation + +Before execution, the dialog shows a summary: + +- keep the old server record +- move the live agent identity onto the old server +- merge history from the temporary record +- when timestamps overlap, the temporary record wins +- delete the temporary record after success + +### Result States + +- On success: the original server becomes online again and the temporary server disappears. +- On failure before rebind: the temporary server remains unchanged and the admin can retry. +- On failure after rebind but before cleanup: the original server remains the live identity, the temporary server remains present, and the admin can retry completion. 
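+
+To make the ranking concrete, the signals listed under Candidate Selection can be folded into a simple additive score. The following is a minimal sketch: the `CandidateScoreInput` shape matches the `score_candidate` helper used in the implementation plan, but the weights and time buckets are illustrative assumptions, not tuned values.
+
+```rust
+// Hypothetical additive scorer over a subset of the ranking signals.
+pub struct CandidateScoreInput {
+    pub same_remote_addr: bool,
+    pub same_cpu_arch: bool,
+    pub same_os: bool,
+    pub same_virtualization: bool,
+    pub created_within_minutes: i64,
+    pub same_country: bool,
+}
+
+pub fn score_candidate(input: CandidateScoreInput) -> i32 {
+    let mut score = 0;
+    if input.same_remote_addr { score += 40; } // strongest reinstall signal
+    if input.same_cpu_arch { score += 15; }
+    if input.same_os { score += 10; }
+    if input.same_virtualization { score += 10; }
+    if input.same_country { score += 5; }
+    // A source created shortly after the target went offline is the
+    // most likely reinstalled agent.
+    score += match input.created_within_minutes {
+        0..=30 => 20,
+        31..=120 => 10,
+        121..=720 => 5,
+        _ => 0,
+    };
+    score
+}
+```
+
+Whatever the exact weights, the score stays recommendation-only: it orders the candidate list and feeds the per-candidate explanation, while explicit admin confirmation remains the actual gate.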
+ +## Terminology + +- `target`: the original offline server record that will be kept +- `source`: the newly registered online temporary server record that will be absorbed and deleted + +## Architecture + +The recovery feature is implemented as a staged server-side recovery merge job. + +High-level flow: + +1. Validate `target` and `source`. +2. Rebind the live agent from `source` identity to `target` identity. +3. Wait for the agent to reconnect as `target`. +4. Freeze writes for both `target` and `source`. +5. Merge `source` history into `target`. +6. Update runtime fields on `target`. +7. Delete `source`. +8. Unfreeze writes and mark the job complete. + +The key design choice is to split "future writes go to the right identity" from "past writes are merged." The system must not start deleting or migrating `source` history until the agent has actually rebound onto `target`. + +## Components + +### 1. Recovery Merge Job Tracker + +Add a recovery job tracker with database persistence plus an in-memory lock/cache layer. + +Unlike the existing upgrade tracker, recovery cannot be memory-only because failure and retry windows can span multiple DB transactions, WebSocket disconnects, and process restarts. + +New persistent table: + +- `recovery_job` + +Persistent columns: + +- `job_id` +- `target_server_id` +- `source_server_id` +- `status` +- `stage` +- `checkpoint_json` +- `error` +- `started_at` +- `created_at` +- `updated_at` +- `last_heartbeat_at` + +Recommended indexes: + +- unique primary key on `job_id` +- index on `(target_server_id, status)` +- index on `(source_server_id, status)` + +`checkpoint_json` stores per-stage metadata needed for restart-safe continuation. + +Suggested stages: + +- `validating` +- `rebinding` +- `awaiting_target_online` +- `freezing_writes` +- `merging_history` +- `finalizing` +- `succeeded` +- `failed` + +The tracker provides: + +- protection against concurrent recovery jobs involving the same server +- visible progress for the frontend +- a retry boundary after partial completion +- restart-safe recovery state + +The in-memory layer is still useful for fast lock checks and live progress fan-out, but the database row is the source of truth. + +### 2. Agent Rebind Protocol + +Add a dedicated protocol message that instructs a connected agent currently identified as `source` to switch to `target`. + +New server-to-agent message: + +- `ServerMessage::RebindIdentity { job_id, target_server_id, token }` + +New agent-to-server messages: + +- `AgentMessage::RebindIdentityAck { job_id }` +- `AgentMessage::RebindIdentityFailed { job_id, error }` + +Agent behavior: + +1. Receive `RebindIdentity`. +2. Persist the new token locally using atomic file replacement semantics. +3. Only after the token is durably written, acknowledge success or failure. +4. Disconnect. +5. Reconnect using the new token, which now authenticates as `target`. + +The target server row receives a newly generated token. The source row keeps its existing token until final cleanup so that failure before the rebind is easy to reason about. + +The agent-side token write must be implemented as: + +- write to a temporary file +- flush and close +- atomic rename over the old config file + +The current non-atomic "rewrite file in place" helper is not sufficient for this workflow and must be replaced or wrapped. + +### 3. Write Freeze Guard + +The system needs an explicit in-memory recovery lock for `target` and `source` during merge. 
+ +Reason: + +- `records` are persisted asynchronously by `record_writer` +- `ping_records`, `task_results`, and `network_probe_record` are persisted directly from the WebSocket handler +- `traffic_hourly` and `traffic_state` are updated continuously + +Without a write freeze, merge results could be invalidated by concurrent writes after the merge has already decided which side wins. + +The guard should: + +- block or drop writes for both `target` and `source` during `freezing_writes`, `merging_history`, and `finalizing` +- make the skip explicit in logs +- be lifted immediately after the job completes or fails + +Implementation guidance: + +- the WebSocket handler should funnel agent-originated database writes through a unified `writes_allowed_for(server_id)` check +- this check must cover at least `ping_record`, `task_result`, `network_probe_record`, `docker_event`, and agent-triggered audit side effects such as IP-change audit records +- `record_writer` and traffic upsert paths must honor the same guard + +This intentionally allows a small monitoring gap during the merge window. That is acceptable because gap filling is out of scope and already accepted by the product requirements. + +## Data Model Semantics + +### Canonical Identity + +The final canonical identity is always `target.server_id`. + +After the recovery: + +- all future agent writes use `target.server_id` +- all kept history belongs to `target.server_id` +- `source.server_id` no longer exists + +### Server Row Field Policy + +On `servers(target)`, keep the original user-managed fields: + +- `name` +- `group_id` +- `weight` +- `hidden` +- `remark` +- `public_remark` +- `price` +- `billing_cycle` +- `currency` +- `expired_at` +- `traffic_limit` +- `traffic_limit_type` +- `billing_start_day` +- `capabilities` + +On `servers(target)`, replace runtime fields from `source`: + +- `cpu_name` +- `cpu_cores` +- `cpu_arch` +- `os` +- `kernel_version` +- `mem_total` +- `swap_total` +- `disk_total` +- `ipv4` +- `ipv6` +- `region` +- `country_code` +- `virtualization` +- `agent_version` +- `protocol_version` +- `features` +- `last_remote_addr` +- `fingerprint` + +`server_tags` remain those of `target`. + +## History Merge Rules + +The merge logic is table-specific. + +### Category A: Keep Target Configuration, Drop Source Configuration + +These tables or fields are treated as target-owned configuration and are not merged from source: + +- `servers` user-managed fields listed above +- `server_tag` +- `network_probe_config` + +Source-owned values in this category are discarded when `source` is deleted. + +### Category A2: Shared `server_ids_json` References + +The source server is allowed to appear in shared configuration JSON arrays. This is not a hard exclusion during candidate selection. 
+ +When `source` is deleted, all references to `source.server_id` must be rewritten to `target.server_id` and deduplicated in the following tables: + +- `alert_rule.server_ids_json` +- `ping_task.server_ids_json` +- `task.server_ids_json` +- `service_monitor.server_ids_json` +- `maintenance.server_ids_json` +- `incident.server_ids_json` +- `status_page.server_ids_json` + +Rules: + +- replace every occurrence of `source_server_id` with `target_server_id` +- deduplicate the final array while preserving order where practical +- never leave a dangling `source_server_id` reference behind +- apply the rewrite with a read-modify-write cycle scoped to rows that still contain `source_server_id` + +Because this is a replacement, not a removal, these updates do not create the empty-array semantics problems seen in orphan cleanup flows. + +### Category B: Raw Time-Series Tables + +For raw tables without a natural uniqueness key, merge by replacing the target's overlapping time window with source data. + +Algorithm per table: + +1. Read the source time range: `min_ts` and `max_ts`. +2. Delete target rows whose timestamps fall in `[min_ts, max_ts]`. +3. Rewrite all source rows to `target.server_id`. +4. Delete the original source rows if they were not already moved by update. + +This gives exact `source wins` behavior over the source's active time window. + +Apply this policy to: + +- `records` +- `gpu_record` +- `ping_record` +- `task_result` +- `network_probe_record` +- `docker_event` + +Field-specific time keys: + +- `records.time` +- `gpu_record.time` +- `ping_record.time` +- `task_result.finished_at` +- `network_probe_record.timestamp` +- `docker_event.timestamp` + +Notes: + +- `task_result` overlap uses `finished_at`; no attempt is made to semantically deduplicate by command. +- `docker_event` overlap uses event timestamp and still follows `source wins`. + +### Category C: Aggregated or Unique-Key Tables + +For tables with a uniqueness key or a natural aggregate bucket, merge by key with strict `source wins`. + +Algorithm per table: + +1. For each source row, compute the target conflict key. +2. Delete any target row with the same key. +3. Rewrite the source row to `target.server_id`. + +Apply this policy to: + +- `records_hourly` with key `(server_id, time)` +- `network_probe_record_hourly` with key `(server_id, target_id, hour)` +- `traffic_hourly` with key `(server_id, hour)` +- `traffic_daily` with key `(server_id, date)` +- `uptime_daily` with key `(server_id, date)` +- `traffic_state` with key `server_id` + +Special notes: + +- `traffic_state`: always take the source row because it is the live baseline for future traffic deltas. + +### Category C2: Stateful Logical-Server Rows + +These rows represent state that semantically belongs to the logical target server, not the temporary replacement identity. + +Apply this policy to: + +- `alert_state` with key `(rule_id, server_id)` + +Rules: + +- if target has no row for the rule, move the source row to target +- if both target and source have a row for the same rule, keep the target row and discard the source row + +This avoids resetting ongoing alert continuity on the original logical server. + +### Category D: Not Merged + +Do not merge: + +- `service_monitor_record` + +Reason: + +- It is keyed by `monitor_id`, not `server_id`. +- It does not represent per-server ownership in the way the recovery feature needs. 
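+
+The Category A2 rewrite is a pure transformation on each `server_ids_json` array, so it is worth pinning down in isolation. A minimal sketch, assuming the arrays deserialize to `Vec<String>`:
+
+```rust
+use std::collections::HashSet;
+
+// Replace source_id with target_id, then deduplicate while preserving
+// first-occurrence order, per the Category A2 rules.
+fn rewrite_server_ids(ids: Vec<String>, source_id: &str, target_id: &str) -> Vec<String> {
+    let mut seen = HashSet::new();
+    ids.into_iter()
+        .map(|id| if id == source_id { target_id.to_string() } else { id })
+        .filter(|id| seen.insert(id.clone()))
+        .collect()
+}
+```
+
+For example, `["target", "source", "source"]` rewrites to `["target", "target", "target"]` and deduplicates to `["target"]`, which is exactly the behavior the Task 6 test `rewrite_server_ids_json_replaces_source_with_target_once` pins down.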
+ +## Recovery Job Flow + +### Stage 1: Validating + +Checks: + +- `target` exists +- `source` exists +- `target` is offline +- `source` is online +- neither record is already in another recovery job +- candidate ranking metadata is captured for the confirmation UI, but there is no hard `is_temporary` gate in v1 + +If any check fails, the job fails without side effects. + +### Stage 2: Rebinding + +1. Generate a new token for `target`. +2. Persist the new token hash and prefix on `target`. +3. Send `RebindIdentity` to the currently connected `source` agent. +4. Wait for `RebindIdentityAck`. + +If the agent reports failure, the job fails here and no history is merged. + +The agent must not send `RebindIdentityAck` until the new token is durably written locally. + +### Stage 3: Awaiting Target Online + +Wait for the recovered agent to reconnect as `target`. + +Success condition: + +- `target` becomes the current online connection + +Failure condition: + +- timeout + +Timeout does not roll back the newly issued target token. + +Reason: + +- the agent may already have durably persisted the new token and may still reconnect late +- rolling back the target token would risk turning a late reconnect into a guaranteed `401` + +The job simply fails before merge and keeps `source` untouched. A retry issues a fresh target token and supersedes the prior unfinished attempt. + +Retry semantics after Stage 3 timeout or any other pre-rebind completion failure: + +- mark the existing recovery job as `failed` +- create a new recovery job with a new `job_id` +- issue a fresh target token for the new attempt + +### Stage 4: Freezing Writes + +Enable recovery locks for both `target` and `source`. + +This must happen only after `target` is already online under the recovered identity, because the freeze may cause some writes to be skipped. + +### Stage 5: Merging History + +Execute the table-group merge in bounded transactions. + +Recommended groups: + +- group 1: `records`, `gpu_record`, `docker_event` +- group 2: `records_hourly`, `uptime_daily`, `traffic_hourly`, `traffic_daily`, `traffic_state` +- group 3: `ping_record`, `task_result`, `network_probe_record`, `network_probe_record_hourly` +- group 4: `alert_state` +- group 5: shared `server_ids_json` reference rewrites + +Each group: + +- runs in its own DB transaction +- records a completed checkpoint before the next group starts + +### Stage 6: Finalizing + +1. Update `servers(target)` runtime fields from `source`. +2. Delete remaining source-owned rows that are intentionally not merged: + - `server_tag` + - `network_probe_config` +3. Assert that no other source-owned rows remain in tables that should already have been moved or rewritten. +4. Delete the `source` server row. +5. Clear job locks. +6. Write audit log entries. + +`source` deletion should not be used as the primary cleanup mechanism for historical rows. It is only the final removal of the now-obsolete server row after merge/rewrite work has already completed. Foreign-key cascade is acceptable as a safety net for tables that define it, but the merge engine must not depend on it for correctness. + +### Stage 7: Terminal State + +- `succeeded` +- `failed` + +## Failure Handling + +### Failure Before Target Rebind Succeeds + +If the job fails before `target` reconnects: + +- do not merge history +- do not delete source +- do not freeze writes +- mark the job failed + +This keeps retry semantics simple. 
+ +### Failure After Target Rebind Succeeds + +If the job fails after `target` is already online: + +- keep `target` as the live identity +- keep `source` present +- keep job checkpoints +- allow retry from the first incomplete merge stage + +The system does not attempt a full rollback after the live identity has already switched. That would be more fragile than completing the merge forward. + +Retry semantics after rebind has succeeded: + +- keep the same `job_id` +- resume from the first incomplete stage using persisted checkpoints +- do not issue another target token unless the retry is explicitly restarted from the beginning as a separate administrative action + +### Failure During Final Cleanup + +If all history has been merged but deleting `source` fails: + +- leave the source row present +- mark the job failed in `finalizing` +- allow a retry that only runs the remaining cleanup steps + +## Transaction Strategy + +Do not use one global transaction for the entire recovery flow. + +Reasons: + +- the workflow includes WebSocket disconnect and reconnect +- SQLite lock duration would be too large +- a late failure would waste all merge work + +Instead: + +- use short transactions for validation-side DB writes +- use no transaction during the async rebind wait +- use one transaction per merge table group +- use one short transaction for final cleanup + +This provides clear checkpoints and safe retries. + +## API and UI + +### API + +Suggested endpoints: + +- `GET /api/servers/{target_id}/recovery-candidates` +- `POST /api/servers/{target_id}/recover-merge` +- `GET /api/servers/recovery-jobs/{job_id}` + +`POST /recover-merge` request body: + +```json +{ + "source_server_id": "..." +} +``` + +Response: + +```json +{ + "data": { + "job_id": "...", + "status": "running", + "stage": "validating" + } +} +``` + +### UI + +On the target server detail page: + +- admin-only button: `claim and merge new agent` +- candidate list dialog with match explanations +- confirmation dialog with irreversible-effect summary + +During execution: + +- show recovery stage on the target page +- show source as `recovery in progress` + +On success: + +- refresh both list and detail views +- target remains +- source disappears + +On failure: + +- show stage-specific error +- offer retry + +## Audit Logging + +Write explicit audit entries for: + +- recovery started +- source selected +- rebind succeeded or failed +- merge succeeded or failed +- source deleted + +Recommended detail payload: + +- `job_id` +- `target_server_id` +- `source_server_id` +- `stage` +- `error` + +Recommended action names: + +- `recovery.started` +- `recovery.rebind_ok` +- `recovery.rebind_failed` +- `recovery.merge_group_done` +- `recovery.source_deleted` +- `recovery.failed` + +## Open Tradeoffs + +- The merge window intentionally drops some live writes due to the recovery lock. This is acceptable because monitoring-gap repair is out of scope. +- The design chooses forward completion over full rollback after live identity rebind. This reduces failure complexity and matches the operational priority of restoring the server under the original identity. +- The design does not try to infer recovery automatically. Admin confirmation remains mandatory to avoid silent mis-merges. +- Shared `server_ids_json` rewrites are read-modify-write operations. If an admin edits the same row concurrently in the UI, last-writer-wins behavior may still occur. This is acceptable in v1 because the race is rare and bounded to recovery-time configuration edits. 
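+
+The transaction strategy above reduces to a small driver loop: one transaction per merge group, with a persisted checkpoint between groups so a retry resumes at the first incomplete group. A hedged sketch, assuming sea-orm's `TransactionTrait` and the checkpoint helpers from the implementation plan (`merge_group` and `checkpoint_group_done` are placeholders):
+
+```rust
+use sea_orm::{DatabaseConnection, DatabaseTransaction, DbErr, TransactionTrait};
+
+async fn run_merge_groups(
+    db: &DatabaseConnection,
+    job_id: &str,
+    target: &str,
+    source: &str,
+    completed_groups: u32, // restored from checkpoint_json on retry
+) -> Result<(), DbErr> {
+    for group in (completed_groups + 1)..=5 {
+        // One bounded transaction per table group (Stage 5 grouping).
+        let txn = db.begin().await?;
+        merge_group(&txn, group, target, source).await?;
+        txn.commit().await?;
+        // Persist the checkpoint after the commit; if this write fails,
+        // a retry re-runs the group, so each group must be idempotent.
+        checkpoint_group_done(db, job_id, group).await?;
+    }
+    Ok(())
+}
+
+// Placeholder: dispatch to the Category A2/B/C/C2 helpers for this group.
+async fn merge_group(_txn: &DatabaseTransaction, _group: u32, _target: &str, _source: &str) -> Result<(), DbErr> {
+    Ok(())
+}
+
+// Placeholder: record the completed group number in recovery_job.checkpoint_json.
+async fn checkpoint_group_done(_db: &DatabaseConnection, _job_id: &str, _group: u32) -> Result<(), DbErr> {
+    Ok(())
+}
+```
+
+On retry, `completed_groups` comes from the persisted `checkpoint_json`, so a job that failed in group 3 re-enters the loop at group 3 without touching groups 1 and 2.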
+ +## Testing Strategy + +### Backend Integration Tests + +Must cover: + +1. successful end-to-end recovery +2. rebind failure before merge +3. timeout waiting for target online +4. failure during one merge group with retryable checkpoint state +5. successful retry after partial failure +6. `source wins` for each unique-key table +7. `target wins` conflict handling for `alert_state` +8. raw time-window replacement for each raw history table +9. shared `server_ids_json` rewrite and dedupe across all seven tables +10. write-freeze behavior during merge +11. process restart with a persisted recovery job +12. final cleanup deleting the source record + +### Agent Tests + +Must cover: + +1. receiving `RebindIdentity` +2. persisting the new token with atomic replace semantics +3. only acknowledging after durable local write +4. reporting ack and failure +5. reconnecting with the new identity + +### Frontend Tests + +Must cover: + +1. candidate ranking and rendering +2. confirmation summary +3. progress state rendering +4. error state and retry action + +## Rollout + +Recommended rollout order: + +1. backend job tracker and protocol +2. agent rebind support +3. write-freeze guards +4. merge engine and tests +5. UI workflow +6. documentation