diff --git a/.gitignore b/.gitignore
index d4d10e14fe..161db5f191 100644
--- a/.gitignore
+++ b/.gitignore
@@ -36,3 +36,5 @@
 storybook-static/
 test-results.xml
 docsite/
+
+.kilo-format-temp-*
diff --git a/pkg/remote/conncontroller/conncontroller.go b/pkg/remote/conncontroller/conncontroller.go
index 94229a070e..a24a789009 100644
--- a/pkg/remote/conncontroller/conncontroller.go
+++ b/pkg/remote/conncontroller/conncontroller.go
@@ -749,7 +749,8 @@ func (conn *SSHConn) Connect(ctx context.Context, connFlags *wconfig.ConnKeyword
 	conn.FireConnChangeEvent()
 	err := conn.connectInternal(ctx, connFlags)
 	if err != nil {
-		errorCode := remote.ClassifyConnError(err)
+		errorCode, subCode := remote.ClassifyConnError(err)
+		isContextError := errors.Is(err, context.Canceled) || errors.Is(err, context.DeadlineExceeded)
 		conn.Infof(ctx, "ERROR [%s] %v\n\n", errorCode, err)
 		conn.WithLock(func() {
 			conn.Status = Status_Error
@@ -762,8 +763,10 @@ func (conn *SSHConn) Connect(ctx context.Context, connFlags *wconfig.ConnKeyword
 		telemetry.GoRecordTEventWrap(&telemetrydata.TEvent{
 			Event: "conn:connecterror",
 			Props: telemetrydata.TEventProps{
-				ConnType:      "ssh",
-				ConnErrorCode: errorCode,
+				ConnType:         "ssh",
+				ConnErrorCode:    errorCode,
+				ConnSubErrorCode: subCode,
+				ConnContextError: isContextError,
 			},
 		})
 	} else {
diff --git a/pkg/remote/sshclient.go b/pkg/remote/sshclient.go
index d23186d569..63cc75f176 100644
--- a/pkg/remote/sshclient.go
+++ b/pkg/remote/sshclient.go
@@ -61,6 +61,27 @@ const (
 	ConnErrCode_Unknown = "unknown"
 )
 
+// Dial error subcodes: finer-grained causes reported alongside ConnErrCode_Dial.
+const (
+	DialSubCode_DNS             = "dns"
+	DialSubCode_Refused         = "refused"
+	DialSubCode_Timeout         = "timeout"
+	DialSubCode_ContextCanceled = "context-canceled" // also covers context deadline expiry
+	DialSubCode_NoRoute         = "no-route"
+	DialSubCode_HostUnreach     = "host-unreachable"
+	DialSubCode_NetUnreach      = "net-unreachable"
+	DialSubCode_ConnReset       = "conn-reset"
+	DialSubCode_PermDenied      = "perm-denied"
+	DialSubCode_ProxyJump       = "proxy-jump"
+	DialSubCode_Other           = "other"
+)
+
+// Auth error subcodes: finer-grained causes reported alongside ConnErrCode_AuthFailed.
+const (
+	AuthSubCode_UnableToAuth    = "unable-to-auth"
+	AuthSubCode_HandshakeFailed = "handshake-failed"
+)
+
 var waveSshConfigUserSettingsInternal *ssh_config.UserSettings
 var configUserSettingsOnce = &sync.Once{}
 
@@ -118,33 +139,106 @@ func SimpleMessageFromPossibleConnectionError(err error) string {
 	return err.Error()
 }
 
+// ClassifyConnError returns an error code and subcode for a connection error.
+// A code already attached to the error chain (utilds.GetErrorCode) is returned
+// as-is together with its subcode; otherwise the error's type and message are
+// matched against known dial and auth failures. Unrecognized errors yield
+// ConnErrCode_Unknown with an empty subcode, and a nil error yields "", "".
-func ClassifyConnError(err error) string {
+func ClassifyConnError(err error) (string, string) {
+	if err == nil {
+		return "", ""
+	}
 	code := utilds.GetErrorCode(err)
+	subCode := utilds.GetErrorSubCode(err)
 	if code != "" {
-		return code
+		return code, subCode
 	}
 	var dnsErr *net.DNSError
 	if errors.As(err, &dnsErr) {
-		return ConnErrCode_Dial
+		return ConnErrCode_Dial, ClassifyDialErrorSubCode(err)
 	}
 	var opErr *net.OpError
 	if errors.As(err, &opErr) {
-		return ConnErrCode_Dial
+		return ConnErrCode_Dial, ClassifyDialErrorSubCode(err)
 	}
 	errStr := err.Error()
 	if strings.Contains(errStr, "unable to authenticate") {
-		return ConnErrCode_AuthFailed
+		return ConnErrCode_AuthFailed, AuthSubCode_UnableToAuth
 	}
 	if strings.Contains(errStr, "handshake failed") {
-		return ConnErrCode_AuthFailed
+		return ConnErrCode_AuthFailed, AuthSubCode_HandshakeFailed
 	}
 	if strings.Contains(errStr, "connection refused") {
-		return ConnErrCode_Dial
+		return ConnErrCode_Dial, ClassifyDialErrorSubCode(err)
 	}
 	if strings.Contains(errStr, "timed out") || strings.Contains(errStr, "timeout") {
-		return ConnErrCode_Dial
+		return ConnErrCode_Dial, ClassifyDialErrorSubCode(err)
 	}
-	return ConnErrCode_Unknown
+	return ConnErrCode_Unknown, ""
+}
+
+// ClassifyDialErrorSubCode provides more granular classification of dial errors
+// to help identify root causes (DNS, VPN, timeouts, etc.). It returns "" for a
+// nil error and DialSubCode_Other when no more specific subcode applies.
+func ClassifyDialErrorSubCode(err error) string {
+	if err == nil {
+		return ""
+	}
+
+	// Context cancellation and deadline expiry share a single subcode.
+	if errors.Is(err, context.Canceled) || errors.Is(err, context.DeadlineExceeded) {
+		return DialSubCode_ContextCanceled
+	}
+
+	var dnsErr *net.DNSError
+	if errors.As(err, &dnsErr) {
+		return DialSubCode_DNS
+	}
+
+	// For a network operation error, a specific cause found in the underlying
+	// error takes precedence over the generic timeout flag.
+	var opErr *net.OpError
+	if errors.As(err, &opErr) {
+		if opErr.Err != nil {
+			if subCode := dialSubCodeFromMessage(opErr.Err.Error()); subCode != "" {
+				return subCode
+			}
+		}
+		if opErr.Timeout() {
+			return DialSubCode_Timeout
+		}
+	}
+
+	// Fall back to the full message, using the same precedence as above.
+	errStr := err.Error()
+	if subCode := dialSubCodeFromMessage(errStr); subCode != "" {
+		return subCode
+	}
+	// "i/o timeout" needs no separate check: it contains "timeout".
+	if strings.Contains(errStr, "timed out") || strings.Contains(errStr, "timeout") {
+		return DialSubCode_Timeout
+	}
+	return DialSubCode_Other
+}
+
+// dialSubCodeFromMessage maps known substrings of a dial error message to a
+// dial subcode, returning "" when none of them is present.
+func dialSubCodeFromMessage(msg string) string {
+	switch {
+	case strings.Contains(msg, "connection refused"):
+		return DialSubCode_Refused
+	case strings.Contains(msg, "no route to host"):
+		return DialSubCode_NoRoute
+	case strings.Contains(msg, "host is unreachable"), strings.Contains(msg, "host unreachable"):
+		return DialSubCode_HostUnreach
+	case strings.Contains(msg, "network is unreachable"), strings.Contains(msg, "network unreachable"):
+		return DialSubCode_NetUnreach
+	case strings.Contains(msg, "connection reset"):
+		return DialSubCode_ConnReset
+	case strings.Contains(msg, "permission denied"):
+		return DialSubCode_PermDenied
+	}
+	return ""
 }
 
 // This exists to trick the ssh library into continuing to try
@@ -747,15 +841,17 @@ func connectInternal(ctx context.Context, networkAddr string, clientConfig *ssh.
 		blocklogger.Infof(ctx, "[conndebug] ssh dial %s\n", networkAddr)
 		clientConn, err = d.DialContext(ctx, "tcp", networkAddr)
 		if err != nil {
-			blocklogger.Infof(ctx, "[conndebug] ERROR dial error: %v\n", err)
-			return nil, utilds.MakeCodedError(ConnErrCode_Dial, err)
+			subCode := ClassifyDialErrorSubCode(err)
+			blocklogger.Infof(ctx, "[conndebug] ERROR dial error [%s]: %v\n", subCode, err)
+			return nil, utilds.MakeSubCodedError(ConnErrCode_Dial, subCode, err)
 		}
 	} else {
 		blocklogger.Infof(ctx, "[conndebug] ssh dial (from client) %s\n", networkAddr)
 		clientConn, err = currentClient.DialContext(ctx, "tcp", networkAddr)
 		if err != nil {
-			blocklogger.Infof(ctx, "[conndebug] ERROR dial error: %v\n", err)
-			return nil, utilds.MakeCodedError(ConnErrCode_Dial, err)
+			subCode := DialSubCode_ProxyJump // This is a proxy jump connection error
+			blocklogger.Infof(ctx, "[conndebug] ERROR dial error [%s]: %v\n", subCode, err)
+			return nil, utilds.MakeSubCodedError(ConnErrCode_Dial, subCode, err)
 		}
 	}
 	c, chans, reqs, err := ssh.NewClientConn(clientConn, networkAddr, clientConfig)
diff --git a/pkg/telemetry/telemetrydata/telemetrydata.go b/pkg/telemetry/telemetrydata/telemetrydata.go
index b574f586be..463be152bf 100644
--- a/pkg/telemetry/telemetrydata/telemetrydata.go
+++ b/pkg/telemetry/telemetrydata/telemetrydata.go
@@ -126,9 +126,11 @@ type TEventProps struct {
 	WshCmd      string `json:"wsh:cmd,omitempty"`
 	WshHadError bool   `json:"wsh:haderror,omitempty"`
 
-	ConnType         string `json:"conn:conntype,omitempty"`
-	ConnWshErrorCode string `json:"conn:wsherrorcode,omitempty"`
-	ConnErrorCode    string `json:"conn:errorcode,omitempty"`
+	ConnType         string `json:"conn:conntype,omitempty"`
+	ConnWshErrorCode string `json:"conn:wsherrorcode,omitempty"`
+	ConnErrorCode    string `json:"conn:errorcode,omitempty"`
+	ConnSubErrorCode string `json:"conn:suberrorcode,omitempty"`
+	ConnContextError bool   `json:"conn:contexterror,omitempty"`
 
 	OnboardingFeature string `json:"onboarding:feature,omitempty" tstype:"\"waveai\" | \"durable\" | \"magnify\" | \"wsh\""`
 	OnboardingVersion string `json:"onboarding:version,omitempty"`
diff --git a/pkg/utilds/codederror.go b/pkg/utilds/codederror.go
index f0f4ed71e6..283c381f3a 100644
--- a/pkg/utilds/codederror.go
+++ b/pkg/utilds/codederror.go
@@ -10,9 +10,11 @@ import (
 
 // CodedError wraps an error with a string code for categorization.
 // The code can be extracted from anywhere in an error chain using GetErrorCode.
+// SubCode provides additional granularity for error classification.
 type CodedError struct {
-	Code string
-	Err  error
+	Code    string
+	SubCode string
+	Err     error
 }
 
 func (e CodedError) Error() string {
@@ -25,7 +27,12 @@ func (e CodedError) Unwrap() error {
 
 // MakeCodedError creates a new CodedError with the given code and error.
 func MakeCodedError(code string, err error) CodedError {
-	return CodedError{Code: code, Err: err}
+	return CodedError{Code: code, SubCode: "", Err: err}
+}
+
+// MakeSubCodedError creates a new CodedError with the given code, subcode, and error.
+func MakeSubCodedError(code string, subCode string, err error) CodedError {
+	return CodedError{Code: code, SubCode: subCode, Err: err}
 }
 
 // GetErrorCode extracts the error code from anywhere in the error chain.
@@ -41,6 +48,19 @@ func GetErrorCode(err error) string {
 	return ""
 }
 
+// GetErrorSubCode extracts the error subcode from anywhere in the error chain.
+// Returns empty string if no CodedError is found or if SubCode is not set.
+func GetErrorSubCode(err error) string {
+	if err == nil {
+		return ""
+	}
+	var coded CodedError
+	if errors.As(err, &coded) {
+		return coded.SubCode
+	}
+	return ""
+}
+
 // Errorf creates a formatted error wrapped in a CodedError.
 // This is a convenience function that combines fmt.Errorf with MakeCodedError.
 func Errorf(code string, format string, args ...interface{}) error {