Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
38 changes: 36 additions & 2 deletions test/extended/node/node_utils.go
Original file line number Diff line number Diff line change
Expand Up @@ -196,9 +196,43 @@ func loadConfigFromFile(path string) string {
}

// restartKubeletOnNode restarts the kubelet service on the specified node.
// Retries on transient network errors (see isTransientNetworkError), which
// are common on real clusters; non-transient errors fail immediately.
// Returns nil on success, or the last error after exhausting all attempts.
func restartKubeletOnNode(oc *exutil.CLI, nodeName string) error {
	const maxAttempts = 3
	var lastErr error
	for attempt := 1; attempt <= maxAttempts; attempt++ {
		_, err := ExecOnNodeWithChroot(oc, nodeName, "systemctl", "restart", "kubelet")
		if err == nil {
			return nil
		}
		lastErr = err
		if !isTransientNetworkError(err) {
			// Non-retryable failure: surface it to the caller immediately.
			return err
		}
		if attempt == maxAttempts {
			// Final attempt failed; do not sleep when no retry follows
			// (previously this paid a pointless 15s delay on every failure).
			break
		}
		framework.Logf("Attempt %d/%d to restart kubelet on %s failed: %v, retrying...", attempt, maxAttempts, nodeName, err)
		// Linear backoff between attempts: 5s, then 10s.
		time.Sleep(time.Duration(attempt*5) * time.Second)
	}
	return fmt.Errorf("failed to restart kubelet on %s after %d attempts: %w", nodeName, maxAttempts, lastErr)
Comment on lines 200 to +215
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major

Retry loop adds a no-op 15s delay on final failure and cannot be canceled.

Line [210] sleeps even on the last attempt (Line [202]), so failure path pays an extra 15s with no next retry. Also, the backoff is not context-aware, which can overrun cleanup windows (see test/extended/node/node_swap_cnv.go:242-262 where restart runs in defer with test ctx).

Proposed fix
-func restartKubeletOnNode(oc *exutil.CLI, nodeName string) error {
+func restartKubeletOnNode(ctx context.Context, oc *exutil.CLI, nodeName string) error {
+	const maxRetries = 3
 	var lastErr error
-	for attempt := 1; attempt <= 3; attempt++ {
+	for attempt := 0; attempt <= maxRetries; attempt++ {
 		_, err := ExecOnNodeWithChroot(oc, nodeName, "systemctl", "restart", "kubelet")
 		if err == nil {
 			return nil
 		}
 		lastErr = err
-		if isTransientNetworkError(err) {
-			framework.Logf("Attempt %d/3 to restart kubelet on %s failed: %v, retrying...", attempt, nodeName, err)
-			time.Sleep(time.Duration(attempt*5) * time.Second)
-			continue
+		if !isTransientNetworkError(err) {
+			return fmt.Errorf("failed to restart kubelet on %s: %w", nodeName, err)
 		}
-		return err
+		if attempt == maxRetries {
+			break
+		}
+		backoff := time.Duration((attempt+1)*5) * time.Second
+		framework.Logf("Attempt %d/%d to restart kubelet on %s failed: %v; retrying in %s",
+			attempt+1, maxRetries+1, nodeName, err, backoff)
+		timer := time.NewTimer(backoff)
+		select {
+		case <-ctx.Done():
+			timer.Stop()
+			return fmt.Errorf("context canceled while restarting kubelet on %s: %w", nodeName, ctx.Err())
+		case <-timer.C:
+		}
 	}
-	return fmt.Errorf("failed to restart kubelet on %s after 3 attempts: %w", nodeName, lastErr)
+	return fmt.Errorf("failed to restart kubelet on %s after %d attempts: %w", nodeName, maxRetries+1, lastErr)
 }
// call sites (examples):
err = restartKubeletOnNode(ctx, oc, cnvWorkerNode)
restartKubeletOnNode(ctx, oc, nodeName)
🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@test/extended/node/node_utils.go` around lines 200 - 215, The retry loop in
restartKubeletOnNode is blocking and sleeps even after the final attempt and
isn't cancelable; change the signature to accept a context.Context
(restartKubeletOnNode(ctx context.Context, oc *exutil.CLI, nodeName string)),
remove the unconditional time.Sleep on the final attempt by only sleeping when
attempt < maxAttempts, and replace time.Sleep with a context-aware wait (select
between time.After(backoff) and ctx.Done()) so the retry/backoff can be
canceled; keep using ExecOnNodeWithChroot and isTransientNetworkError to detect
retryable errors and propagate the lastErr when context is done or retries
exhausted.

}

// isTransientNetworkError checks if the error is a transient network error worth retrying
func isTransientNetworkError(err error) bool {
if err == nil {
return false
}
errStr := err.Error()
transientErrors := []string{
"connection refused",
"connection reset",
"connection timed out",
"i/o timeout",
}
for _, transientErr := range transientErrors {
if strings.Contains(errStr, transientErr) {
return true
}
}
return false
}

// waitForNodeToBeReady waits for a node to become Ready
Expand Down