-
Notifications
You must be signed in to change notification settings - Fork 192
Make transparent proxy health check parameters configurable #4108
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
7d879e7
f0ec550
2fd71b6
9e89dc3
a41f8a8
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -18,6 +18,7 @@ import ( | |
| "net/http/httputil" | ||
| "net/url" | ||
| "os" | ||
| "strconv" | ||
| "strings" | ||
| "sync" | ||
| "sync/atomic" | ||
|
|
@@ -120,6 +121,9 @@ type TransparentProxy struct { | |
| // Health check ping timeout (default: 5 seconds) | ||
| healthCheckPingTimeout time.Duration | ||
|
|
||
| // Health check failure threshold: consecutive failures before shutdown (default: 5) | ||
| healthCheckFailureThreshold int | ||
|
|
||
| // Shutdown timeout for graceful HTTP server shutdown (default: 30 seconds) | ||
| shutdownTimeout time.Duration | ||
| } | ||
|
|
@@ -140,8 +144,21 @@ const ( | |
| defaultIdleTimeout = 120 * time.Second | ||
|
|
||
| // HealthCheckIntervalEnvVar is the environment variable name for configuring health check interval. | ||
| // This is primarily useful for testing with shorter intervals. | ||
| HealthCheckIntervalEnvVar = "TOOLHIVE_HEALTH_CHECK_INTERVAL" | ||
|
|
||
| // HealthCheckPingTimeoutEnvVar is the environment variable name for configuring health check ping timeout. | ||
| HealthCheckPingTimeoutEnvVar = "TOOLHIVE_HEALTH_CHECK_PING_TIMEOUT" | ||
|
|
||
| // HealthCheckRetryDelayEnvVar is the environment variable name for configuring health check retry delay. | ||
| HealthCheckRetryDelayEnvVar = "TOOLHIVE_HEALTH_CHECK_RETRY_DELAY" | ||
|
|
||
| // HealthCheckFailureThresholdEnvVar is the environment variable name for configuring | ||
| // the number of consecutive health check failures before shutdown. | ||
| HealthCheckFailureThresholdEnvVar = "TOOLHIVE_HEALTH_CHECK_FAILURE_THRESHOLD" | ||
|
|
||
| // DefaultHealthCheckFailureThreshold is the default number of consecutive health check | ||
| // failures before the proxy initiates shutdown. | ||
| DefaultHealthCheckFailureThreshold = 5 | ||
| ) | ||
|
|
||
| // Option is a functional option for configuring TransparentProxy | ||
|
|
@@ -189,6 +206,17 @@ func withHealthCheckPingTimeout(timeout time.Duration) Option { | |
| } | ||
| } | ||
|
|
||
| // withHealthCheckFailureThreshold sets the consecutive failure count before shutdown. | ||
| // This is primarily useful for testing with lower thresholds. | ||
| // Ignores non-positive values; default will be used. | ||
| func withHealthCheckFailureThreshold(threshold int) Option { | ||
| return func(p *TransparentProxy) { | ||
| if threshold > 0 { | ||
| p.healthCheckFailureThreshold = threshold | ||
| } | ||
| } | ||
| } | ||
|
|
||
| // withShutdownTimeout sets the graceful shutdown timeout for the HTTP server. | ||
| // This is primarily useful for testing with shorter timeouts. | ||
| // Ignores non-positive timeouts; default will be used. | ||
|
|
@@ -244,12 +272,60 @@ func NewTransparentProxy( | |
| func getHealthCheckInterval() time.Duration { | ||
| if val := os.Getenv(HealthCheckIntervalEnvVar); val != "" { | ||
| if d, err := time.ParseDuration(val); err == nil && d > 0 { | ||
| slog.Debug("using custom health check interval", "interval", d) | ||
| return d | ||
| } | ||
| slog.Warn("invalid health check interval, using default", | ||
| "env_var", HealthCheckIntervalEnvVar, "value", val, "default", DefaultHealthCheckInterval) | ||
| } | ||
| return DefaultHealthCheckInterval | ||
| } | ||
|
|
||
| // getHealthCheckPingTimeout returns the health check ping timeout to use. | ||
| // Uses TOOLHIVE_HEALTH_CHECK_PING_TIMEOUT environment variable if set and valid, | ||
| // otherwise returns the default timeout. | ||
| func getHealthCheckPingTimeout() time.Duration { | ||
| if val := os.Getenv(HealthCheckPingTimeoutEnvVar); val != "" { | ||
| if d, err := time.ParseDuration(val); err == nil && d > 0 { | ||
| slog.Debug("using custom health check ping timeout", "timeout", d) | ||
| return d | ||
| } | ||
| slog.Warn("invalid health check ping timeout, using default", | ||
| "env_var", HealthCheckPingTimeoutEnvVar, "value", val, "default", DefaultPingerTimeout) | ||
| } | ||
| return DefaultPingerTimeout | ||
| } | ||
|
|
||
| // getHealthCheckRetryDelay returns the health check retry delay to use. | ||
| // Uses TOOLHIVE_HEALTH_CHECK_RETRY_DELAY environment variable if set and valid, | ||
| // otherwise returns the default delay. | ||
| func getHealthCheckRetryDelay() time.Duration { | ||
| if val := os.Getenv(HealthCheckRetryDelayEnvVar); val != "" { | ||
| if d, err := time.ParseDuration(val); err == nil && d > 0 { | ||
| slog.Debug("using custom health check retry delay", "delay", d) | ||
| return d | ||
| } | ||
| slog.Warn("invalid health check retry delay, using default", | ||
| "env_var", HealthCheckRetryDelayEnvVar, "value", val, "default", DefaultHealthCheckRetryDelay) | ||
| } | ||
| return DefaultHealthCheckRetryDelay | ||
| } | ||
|
|
||
| // getHealthCheckFailureThreshold returns the consecutive failure threshold. | ||
| // Uses TOOLHIVE_HEALTH_CHECK_FAILURE_THRESHOLD environment variable if set and valid, | ||
| // otherwise returns the default threshold. | ||
| func getHealthCheckFailureThreshold() int { | ||
| if val := os.Getenv(HealthCheckFailureThresholdEnvVar); val != "" { | ||
| if n, err := strconv.Atoi(val); err == nil && n > 0 { | ||
| slog.Debug("using custom health check failure threshold", "threshold", n) | ||
| return n | ||
| } | ||
| slog.Warn("invalid health check failure threshold, using default", | ||
| "env_var", HealthCheckFailureThresholdEnvVar, "value", val, "default", DefaultHealthCheckFailureThreshold) | ||
| } | ||
| return DefaultHealthCheckFailureThreshold | ||
| } | ||
|
|
||
| // NewTransparentProxyWithOptions creates a new transparent proxy with optional configuration. | ||
| func NewTransparentProxyWithOptions( | ||
| host string, | ||
|
|
@@ -269,25 +345,26 @@ func NewTransparentProxyWithOptions( | |
| options ...Option, | ||
| ) *TransparentProxy { | ||
| proxy := &TransparentProxy{ | ||
| host: host, | ||
| port: port, | ||
| targetURI: targetURI, | ||
| middlewares: middlewares, | ||
| shutdownCh: make(chan struct{}), | ||
| prometheusHandler: prometheusHandler, | ||
| authInfoHandler: authInfoHandler, | ||
| prefixHandlers: prefixHandlers, | ||
| sessionManager: session.NewManager(session.DefaultSessionTTL, session.NewProxySession), | ||
| isRemote: isRemote, | ||
| transportType: transportType, | ||
| onHealthCheckFailed: onHealthCheckFailed, | ||
| onUnauthorizedResponse: onUnauthorizedResponse, | ||
| endpointPrefix: endpointPrefix, | ||
| trustProxyHeaders: trustProxyHeaders, | ||
| healthCheckInterval: getHealthCheckInterval(), | ||
| healthCheckRetryDelay: DefaultHealthCheckRetryDelay, | ||
| healthCheckPingTimeout: DefaultPingerTimeout, | ||
| shutdownTimeout: defaultShutdownTimeout, | ||
| host: host, | ||
| port: port, | ||
| targetURI: targetURI, | ||
| middlewares: middlewares, | ||
| shutdownCh: make(chan struct{}), | ||
| prometheusHandler: prometheusHandler, | ||
| authInfoHandler: authInfoHandler, | ||
| prefixHandlers: prefixHandlers, | ||
| sessionManager: session.NewManager(session.DefaultSessionTTL, session.NewProxySession), | ||
| isRemote: isRemote, | ||
| transportType: transportType, | ||
| onHealthCheckFailed: onHealthCheckFailed, | ||
| onUnauthorizedResponse: onUnauthorizedResponse, | ||
| endpointPrefix: endpointPrefix, | ||
| trustProxyHeaders: trustProxyHeaders, | ||
| healthCheckInterval: getHealthCheckInterval(), | ||
| healthCheckRetryDelay: getHealthCheckRetryDelay(), | ||
| healthCheckPingTimeout: getHealthCheckPingTimeout(), | ||
| healthCheckFailureThreshold: getHealthCheckFailureThreshold(), | ||
| shutdownTimeout: defaultShutdownTimeout, | ||
| } | ||
|
|
||
| // Apply options | ||
|
|
@@ -598,24 +675,10 @@ func (p *TransparentProxy) CloseListener() error { | |
| return nil | ||
| } | ||
|
|
||
| // healthCheckRetryConfig holds retry configuration for health checks. | ||
| // These values are designed to handle transient network issues like | ||
| // VPN/firewall idle connection timeouts (commonly 5-10 minutes). | ||
| const ( | ||
| // healthCheckRetryCount is the number of consecutive failures before marking unhealthy. | ||
| // This prevents immediate shutdown on transient network issues. | ||
| healthCheckRetryCount = 3 | ||
| ) | ||
|
|
||
| // performHealthCheckRetry performs a retry health check after a delay | ||
| // Returns true if the retry was successful (health check recovered), false otherwise | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Redundant guard: the `retryDelay == 0` fallback in `performHealthCheckRetry` can no longer fire, because the constructor now always initializes `healthCheckRetryDelay` via `getHealthCheckRetryDelay()`. This was meaningful when the struct field could be left at the zero-value, but it's now dead code. Consider removing it to avoid confusion for future readers. |
||
| func (p *TransparentProxy) performHealthCheckRetry(ctx context.Context) bool { | ||
| retryDelay := p.healthCheckRetryDelay | ||
| if retryDelay == 0 { | ||
| retryDelay = DefaultHealthCheckRetryDelay | ||
| } | ||
|
|
||
| retryTimer := time.NewTimer(retryDelay) | ||
| retryTimer := time.NewTimer(p.healthCheckRetryDelay) | ||
| defer retryTimer.Stop() | ||
|
|
||
| select { | ||
|
|
@@ -646,10 +709,10 @@ func (p *TransparentProxy) handleHealthCheckFailure( | |
| slog.Warn("health check failed", | ||
| "target", p.targetURI, | ||
| "attempt", consecutiveFailures, | ||
| "max_attempts", healthCheckRetryCount, | ||
| "max_attempts", p.healthCheckFailureThreshold, | ||
| "status", status) | ||
|
|
||
| if consecutiveFailures < healthCheckRetryCount { | ||
| if consecutiveFailures < p.healthCheckFailureThreshold { | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Edge case: The getter already accepts 1 as valid (the "valid minimum" test case). Options:
At minimum this should be noted in the docs. |
||
| if p.performHealthCheckRetry(ctx) { | ||
| consecutiveFailures = 0 | ||
| } | ||
|
|
@@ -659,7 +722,7 @@ func (p *TransparentProxy) handleHealthCheckFailure( | |
| // All retries exhausted, initiate shutdown | ||
| //nolint:gosec // G706: logging target URI from config | ||
| slog.Error("health check failed after consecutive attempts; initiating proxy shutdown", | ||
| "target", p.targetURI, "attempts", healthCheckRetryCount) | ||
| "target", p.targetURI, "attempts", p.healthCheckFailureThreshold) | ||
| if p.onHealthCheckFailed != nil { | ||
| p.onHealthCheckFailed() | ||
| } | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Undocumented interaction worth calling out: when
`TOOLHIVE_HEALTH_CHECK_PING_TIMEOUT` > `TOOLHIVE_HEALTH_CHECK_INTERVAL`, each retry blocks the monitor goroutine for longer than one interval tick. The next tick fires into a buffered channel, so the effective cycle time becomes `pingTimeout + retryDelay + pingTimeout` rather than `interval + retryDelay`. This silently extends the failure window beyond what the formula predicts. Consider adding a note here documenting that the ping timeout is expected to be shorter than the health check interval.