diff --git a/docs/arch/03-transport-architecture.md b/docs/arch/03-transport-architecture.md index 4633ddf337..1d71c1b5e5 100644 --- a/docs/arch/03-transport-architecture.md +++ b/docs/arch/03-transport-architecture.md @@ -336,6 +336,33 @@ export TOOLHIVE_REMOTE_HEALTHCHECKS=true thv proxy --remote-url https://example.com/mcp my-remote-server ``` +### Health Check Tuning Parameters + +**Implementation**: `pkg/transport/proxy/transparent/transparent_proxy.go` + +The transparent proxy health check behavior can be tuned via environment variables. These control how the proxy detects and responds to unhealthy backends: + +| Environment Variable | Description | Default | Type | +|---|---|---|---| +| `TOOLHIVE_HEALTH_CHECK_INTERVAL` | How often to run health checks | `10s` | duration | +| `TOOLHIVE_HEALTH_CHECK_PING_TIMEOUT` | Timeout for each health check ping | `5s` | duration | +| `TOOLHIVE_HEALTH_CHECK_RETRY_DELAY` | Delay between retry attempts after a failure | `5s` | duration | +| `TOOLHIVE_HEALTH_CHECK_FAILURE_THRESHOLD` | Consecutive failures before proxy shutdown | `5` | integer | + +Duration values use Go's `time.ParseDuration` format (e.g., `10s`, `500ms`, `1m30s`). Invalid values are ignored with a warning log, and the default is used instead. + +**Threshold of 1**: Setting `TOOLHIVE_HEALTH_CHECK_FAILURE_THRESHOLD=1` means the proxy shuts down on the first health check failure with no retries. + +**Failure window**: With the defaults, the proxy tolerates roughly `(threshold-1) × (interval + retryDelay)` before shutting down — approximately 60 seconds with default values. This is designed to survive transient network disruptions without prematurely killing healthy backends. If `TOOLHIVE_HEALTH_CHECK_PING_TIMEOUT` exceeds `TOOLHIVE_HEALTH_CHECK_INTERVAL`, each health check cycle takes longer than one interval tick, extending the failure window beyond what the formula predicts. + +**Usage example** (increase tolerance for a flaky network): +```bash +export TOOLHIVE_HEALTH_CHECK_FAILURE_THRESHOLD=10 +export TOOLHIVE_HEALTH_CHECK_RETRY_DELAY=10s +``` + +> **Note**: These parameters only affect the transparent proxy (used by SSE and streamable HTTP transports). The stdio transport's streamable HTTP proxy uses separate timeout settings. The vMCP server uses its own circuit breaker pattern. + ### Kubernetes Support for Remote MCPs **Implementation**: [PR #2151](https://github.com/stacklok/toolhive/pull/2151) diff --git a/pkg/transport/proxy/transparent/transparent_proxy.go b/pkg/transport/proxy/transparent/transparent_proxy.go index 9c49dddf73..5c8eb8ee32 100644 --- a/pkg/transport/proxy/transparent/transparent_proxy.go +++ b/pkg/transport/proxy/transparent/transparent_proxy.go @@ -18,6 +18,7 @@ import ( "net/http/httputil" "net/url" "os" + "strconv" "strings" "sync" "sync/atomic" @@ -120,6 +121,9 @@ type TransparentProxy struct { // Health check ping timeout (default: 5 seconds) healthCheckPingTimeout time.Duration + // Health check failure threshold: consecutive failures before shutdown (default: 5) + healthCheckFailureThreshold int + // Shutdown timeout for graceful HTTP server shutdown (default: 30 seconds) shutdownTimeout time.Duration } @@ -140,8 +144,21 @@ const ( defaultIdleTimeout = 120 * time.Second // HealthCheckIntervalEnvVar is the environment variable name for configuring health check interval. - // This is primarily useful for testing with shorter intervals. HealthCheckIntervalEnvVar = "TOOLHIVE_HEALTH_CHECK_INTERVAL" + + // HealthCheckPingTimeoutEnvVar is the environment variable name for configuring health check ping timeout. + HealthCheckPingTimeoutEnvVar = "TOOLHIVE_HEALTH_CHECK_PING_TIMEOUT" + + // HealthCheckRetryDelayEnvVar is the environment variable name for configuring health check retry delay. + HealthCheckRetryDelayEnvVar = "TOOLHIVE_HEALTH_CHECK_RETRY_DELAY" + + // HealthCheckFailureThresholdEnvVar is the environment variable name for configuring + // the number of consecutive health check failures before shutdown. + HealthCheckFailureThresholdEnvVar = "TOOLHIVE_HEALTH_CHECK_FAILURE_THRESHOLD" + + // DefaultHealthCheckFailureThreshold is the default number of consecutive health check + // failures before the proxy initiates shutdown. + DefaultHealthCheckFailureThreshold = 5 ) // Option is a functional option for configuring TransparentProxy @@ -189,6 +206,17 @@ func withHealthCheckPingTimeout(timeout time.Duration) Option { } } +// withHealthCheckFailureThreshold sets the consecutive failure count before shutdown. +// This is primarily useful for testing with lower thresholds. +// Ignores non-positive values; default will be used. +func withHealthCheckFailureThreshold(threshold int) Option { + return func(p *TransparentProxy) { + if threshold > 0 { + p.healthCheckFailureThreshold = threshold + } + } +} + // withShutdownTimeout sets the graceful shutdown timeout for the HTTP server. // This is primarily useful for testing with shorter timeouts. // Ignores non-positive timeouts; default will be used. @@ -244,12 +272,60 @@ func NewTransparentProxy( func getHealthCheckInterval() time.Duration { if val := os.Getenv(HealthCheckIntervalEnvVar); val != "" { if d, err := time.ParseDuration(val); err == nil && d > 0 { + slog.Debug("using custom health check interval", "interval", d) return d } + slog.Warn("invalid health check interval, using default", + "env_var", HealthCheckIntervalEnvVar, "value", val, "default", DefaultHealthCheckInterval) } return DefaultHealthCheckInterval } +// getHealthCheckPingTimeout returns the health check ping timeout to use. +// Uses TOOLHIVE_HEALTH_CHECK_PING_TIMEOUT environment variable if set and valid, +// otherwise returns the default timeout. +func getHealthCheckPingTimeout() time.Duration { + if val := os.Getenv(HealthCheckPingTimeoutEnvVar); val != "" { + if d, err := time.ParseDuration(val); err == nil && d > 0 { + slog.Debug("using custom health check ping timeout", "timeout", d) + return d + } + slog.Warn("invalid health check ping timeout, using default", + "env_var", HealthCheckPingTimeoutEnvVar, "value", val, "default", DefaultPingerTimeout) + } + return DefaultPingerTimeout +} + +// getHealthCheckRetryDelay returns the health check retry delay to use. +// Uses TOOLHIVE_HEALTH_CHECK_RETRY_DELAY environment variable if set and valid, +// otherwise returns the default delay. +func getHealthCheckRetryDelay() time.Duration { + if val := os.Getenv(HealthCheckRetryDelayEnvVar); val != "" { + if d, err := time.ParseDuration(val); err == nil && d > 0 { + slog.Debug("using custom health check retry delay", "delay", d) + return d + } + slog.Warn("invalid health check retry delay, using default", + "env_var", HealthCheckRetryDelayEnvVar, "value", val, "default", DefaultHealthCheckRetryDelay) + } + return DefaultHealthCheckRetryDelay +} + +// getHealthCheckFailureThreshold returns the consecutive failure threshold. +// Uses TOOLHIVE_HEALTH_CHECK_FAILURE_THRESHOLD environment variable if set and valid, +// otherwise returns the default threshold. +func getHealthCheckFailureThreshold() int { + if val := os.Getenv(HealthCheckFailureThresholdEnvVar); val != "" { + if n, err := strconv.Atoi(val); err == nil && n > 0 { + slog.Debug("using custom health check failure threshold", "threshold", n) + return n + } + slog.Warn("invalid health check failure threshold, using default", + "env_var", HealthCheckFailureThresholdEnvVar, "value", val, "default", DefaultHealthCheckFailureThreshold) + } + return DefaultHealthCheckFailureThreshold +} + // NewTransparentProxyWithOptions creates a new transparent proxy with optional configuration. func NewTransparentProxyWithOptions( host string, @@ -269,25 +345,26 @@ func NewTransparentProxyWithOptions( options ...Option, ) *TransparentProxy { proxy := &TransparentProxy{ - host: host, - port: port, - targetURI: targetURI, - middlewares: middlewares, - shutdownCh: make(chan struct{}), - prometheusHandler: prometheusHandler, - authInfoHandler: authInfoHandler, - prefixHandlers: prefixHandlers, - sessionManager: session.NewManager(session.DefaultSessionTTL, session.NewProxySession), - isRemote: isRemote, - transportType: transportType, - onHealthCheckFailed: onHealthCheckFailed, - onUnauthorizedResponse: onUnauthorizedResponse, - endpointPrefix: endpointPrefix, - trustProxyHeaders: trustProxyHeaders, - healthCheckInterval: getHealthCheckInterval(), - healthCheckRetryDelay: DefaultHealthCheckRetryDelay, - healthCheckPingTimeout: DefaultPingerTimeout, - shutdownTimeout: defaultShutdownTimeout, + host: host, + port: port, + targetURI: targetURI, + middlewares: middlewares, + shutdownCh: make(chan struct{}), + prometheusHandler: prometheusHandler, + authInfoHandler: authInfoHandler, + prefixHandlers: prefixHandlers, + sessionManager: session.NewManager(session.DefaultSessionTTL, session.NewProxySession), + isRemote: isRemote, + transportType: transportType, + onHealthCheckFailed: onHealthCheckFailed, + onUnauthorizedResponse: onUnauthorizedResponse, + endpointPrefix: endpointPrefix, + trustProxyHeaders: trustProxyHeaders, + healthCheckInterval: getHealthCheckInterval(), + healthCheckRetryDelay: getHealthCheckRetryDelay(), + healthCheckPingTimeout: getHealthCheckPingTimeout(), + healthCheckFailureThreshold: getHealthCheckFailureThreshold(), + shutdownTimeout: defaultShutdownTimeout, } // Apply options @@ -598,24 +675,10 @@ func (p *TransparentProxy) CloseListener() error { return nil } -// healthCheckRetryConfig holds retry configuration for health checks. -// These values are designed to handle transient network issues like -// VPN/firewall idle connection timeouts (commonly 5-10 minutes). -const ( - // healthCheckRetryCount is the number of consecutive failures before marking unhealthy. - // This prevents immediate shutdown on transient network issues. - healthCheckRetryCount = 3 -) - // performHealthCheckRetry performs a retry health check after a delay // Returns true if the retry was successful (health check recovered), false otherwise func (p *TransparentProxy) performHealthCheckRetry(ctx context.Context) bool { - retryDelay := p.healthCheckRetryDelay - if retryDelay == 0 { - retryDelay = DefaultHealthCheckRetryDelay - } - - retryTimer := time.NewTimer(retryDelay) + retryTimer := time.NewTimer(p.healthCheckRetryDelay) defer retryTimer.Stop() select { @@ -646,10 +709,10 @@ func (p *TransparentProxy) handleHealthCheckFailure( slog.Warn("health check failed", "target", p.targetURI, "attempt", consecutiveFailures, - "max_attempts", healthCheckRetryCount, + "max_attempts", p.healthCheckFailureThreshold, "status", status) - if consecutiveFailures < healthCheckRetryCount { + if consecutiveFailures < p.healthCheckFailureThreshold { if p.performHealthCheckRetry(ctx) { consecutiveFailures = 0 } @@ -659,7 +722,7 @@ func (p *TransparentProxy) handleHealthCheckFailure( // All retries exhausted, initiate shutdown //nolint:gosec // G706: logging target URI from config slog.Error("health check failed after consecutive attempts; initiating proxy shutdown", - "target", p.targetURI, "attempts", healthCheckRetryCount) + "target", p.targetURI, "attempts", p.healthCheckFailureThreshold) if p.onHealthCheckFailed != nil { p.onHealthCheckFailed() } diff --git a/pkg/transport/proxy/transparent/transparent_test.go b/pkg/transport/proxy/transparent/transparent_test.go index deacb92c96..777b1d3b44 100644 --- a/pkg/transport/proxy/transparent/transparent_test.go +++ b/pkg/transport/proxy/transparent/transparent_test.go @@ -570,15 +570,148 @@ func TestTransparentProxy_NilUnauthorizedCallback(t *testing.T) { func TestHealthCheckRetryConstants(t *testing.T) { t.Parallel() - // Verify retry count is reasonable (not too low, not too high) - assert.GreaterOrEqual(t, healthCheckRetryCount, 2, "Should retry at least twice before giving up") - assert.LessOrEqual(t, healthCheckRetryCount, 10, "Should not retry too many times") + // Verify failure threshold is reasonable (not too low, not too high) + assert.GreaterOrEqual(t, DefaultHealthCheckFailureThreshold, 2, "Should retry at least twice before giving up") + assert.LessOrEqual(t, DefaultHealthCheckFailureThreshold, 10, "Should not retry too many times") // Verify retry delay is reasonable assert.GreaterOrEqual(t, DefaultHealthCheckRetryDelay, 1*time.Second, "Retry delay should be at least 1 second") assert.LessOrEqual(t, DefaultHealthCheckRetryDelay, 30*time.Second, "Retry delay should not be too long") } +func TestGetHealthCheckInterval(t *testing.T) { + tests := []struct { + name string + envValue string + expected time.Duration + }{ + {name: "default when unset", envValue: "", expected: DefaultHealthCheckInterval}, + {name: "valid duration", envValue: "30s", expected: 30 * time.Second}, + {name: "valid short duration", envValue: "500ms", expected: 500 * time.Millisecond}, + {name: "invalid string", envValue: "not-a-duration", expected: DefaultHealthCheckInterval}, + {name: "zero duration", envValue: "0s", expected: DefaultHealthCheckInterval}, + {name: "negative duration", envValue: "-5s", expected: DefaultHealthCheckInterval}, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + t.Setenv(HealthCheckIntervalEnvVar, tt.envValue) + assert.Equal(t, tt.expected, getHealthCheckInterval()) + }) + } +} + +func TestGetHealthCheckPingTimeout(t *testing.T) { + tests := []struct { + name string + envValue string + expected time.Duration + }{ + {name: "default when unset", envValue: "", expected: DefaultPingerTimeout}, + {name: "valid duration", envValue: "10s", expected: 10 * time.Second}, + {name: "valid short duration", envValue: "500ms", expected: 500 * time.Millisecond}, + {name: "invalid string", envValue: "not-a-duration", expected: DefaultPingerTimeout}, + {name: "zero duration", envValue: "0s", expected: DefaultPingerTimeout}, + {name: "negative duration", envValue: "-5s", expected: DefaultPingerTimeout}, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + t.Setenv(HealthCheckPingTimeoutEnvVar, tt.envValue) + assert.Equal(t, tt.expected, getHealthCheckPingTimeout()) + }) + } +} + +func TestGetHealthCheckRetryDelay(t *testing.T) { + tests := []struct { + name string + envValue string + expected time.Duration + }{ + {name: "default when unset", envValue: "", expected: DefaultHealthCheckRetryDelay}, + {name: "valid duration", envValue: "10s", expected: 10 * time.Second}, + {name: "valid short duration", envValue: "500ms", expected: 500 * time.Millisecond}, + {name: "invalid string", envValue: "not-a-duration", expected: DefaultHealthCheckRetryDelay}, + {name: "zero duration", envValue: "0s", expected: DefaultHealthCheckRetryDelay}, + {name: "negative duration", envValue: "-5s", expected: DefaultHealthCheckRetryDelay}, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + t.Setenv(HealthCheckRetryDelayEnvVar, tt.envValue) + assert.Equal(t, tt.expected, getHealthCheckRetryDelay()) + }) + } +} + +func TestGetHealthCheckFailureThreshold(t *testing.T) { + tests := []struct { + name string + envValue string + expected int + }{ + {name: "default when unset", envValue: "", expected: DefaultHealthCheckFailureThreshold}, + {name: "valid integer", envValue: "10", expected: 10}, + {name: "valid minimum", envValue: "1", expected: 1}, + {name: "invalid string", envValue: "not-a-number", expected: DefaultHealthCheckFailureThreshold}, + {name: "zero value", envValue: "0", expected: DefaultHealthCheckFailureThreshold}, + {name: "negative value", envValue: "-3", expected: DefaultHealthCheckFailureThreshold}, + {name: "float value", envValue: "2.5", expected: DefaultHealthCheckFailureThreshold}, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + t.Setenv(HealthCheckFailureThresholdEnvVar, tt.envValue) + assert.Equal(t, tt.expected, getHealthCheckFailureThreshold()) + }) + } +} + +func TestNewTransparentProxyUsesEnvVars(t *testing.T) { + t.Setenv(HealthCheckPingTimeoutEnvVar, "15s") + t.Setenv(HealthCheckRetryDelayEnvVar, "8s") + t.Setenv(HealthCheckFailureThresholdEnvVar, "7") + t.Setenv(HealthCheckIntervalEnvVar, "20s") + + proxy := newMinimalProxy() + + assert.Equal(t, 15*time.Second, proxy.healthCheckPingTimeout) + assert.Equal(t, 8*time.Second, proxy.healthCheckRetryDelay) + assert.Equal(t, 7, proxy.healthCheckFailureThreshold) + assert.Equal(t, 20*time.Second, proxy.healthCheckInterval) +} + +func TestNewTransparentProxyDefaultValues(t *testing.T) { + t.Setenv(HealthCheckIntervalEnvVar, "") + t.Setenv(HealthCheckPingTimeoutEnvVar, "") + t.Setenv(HealthCheckRetryDelayEnvVar, "") + t.Setenv(HealthCheckFailureThresholdEnvVar, "") + + proxy := newMinimalProxy() + + assert.Equal(t, DefaultPingerTimeout, proxy.healthCheckPingTimeout) + assert.Equal(t, DefaultHealthCheckRetryDelay, proxy.healthCheckRetryDelay) + assert.Equal(t, DefaultHealthCheckFailureThreshold, proxy.healthCheckFailureThreshold) + assert.Equal(t, DefaultHealthCheckInterval, proxy.healthCheckInterval) +} + +func TestWithHealthCheckFailureThresholdOption(t *testing.T) { + t.Parallel() + + proxy := newMinimalProxy(withHealthCheckFailureThreshold(8)) + + assert.Equal(t, 8, proxy.healthCheckFailureThreshold) +} + +func TestWithHealthCheckFailureThresholdOption_IgnoresNonPositive(t *testing.T) { + t.Setenv(HealthCheckFailureThresholdEnvVar, "") + + proxy := newMinimalProxy(withHealthCheckFailureThreshold(0)) + + assert.Equal(t, DefaultHealthCheckFailureThreshold, proxy.healthCheckFailureThreshold) +} + // TestRewriteEndpointURL tests the rewriteEndpointURL function func TestRewriteEndpointURL(t *testing.T) { t.Parallel() @@ -974,6 +1107,28 @@ type callbackTracker struct { mu sync.Mutex } +// newMinimalProxy creates a proxy with minimal configuration for unit tests +// that only need to inspect struct fields (no server started, no health checks). +func newMinimalProxy(options ...Option) *TransparentProxy { + return NewTransparentProxyWithOptions( + "127.0.0.1", + 0, // port (auto-assign) + "http://localhost:8080", + nil, // prometheusHandler + nil, // authInfoHandler + nil, // prefixHandlers + false, // enableHealthCheck + false, // isRemote + "sse", // transportType + nil, // onHealthCheckFailed + nil, // onUnauthorizedResponse + "", // endpointPrefix + false, // trustProxyHeaders + nil, // middlewares + options..., + ) +} + func newCallbackTracker() (*callbackTracker, func()) { tracker := &callbackTracker{ done: make(chan struct{}), @@ -1050,6 +1205,7 @@ func setupRemoteProxyTestWithTimeout(t *testing.T, serverURL string, callback ty withHealthCheckInterval(100*time.Millisecond), // Use 100ms for faster tests withHealthCheckRetryDelay(50*time.Millisecond), // Use 50ms retry delay for faster tests withHealthCheckPingTimeout(100*time.Millisecond), // Use 100ms ping timeout for faster tests + withHealthCheckFailureThreshold(3), // Use 3 for faster tests (production default is 5) ) ctx, cancel := context.WithTimeout(context.Background(), timeout)