diff --git a/Makefile b/Makefile index 4cfccff752..fb58bbb637 100644 --- a/Makefile +++ b/Makefile @@ -640,6 +640,9 @@ run-perf-test: run-e2e-test: go test -v ./test/e2e/ -timeout 1h -tags=e2e -count=1 -args -image-tag=${TAG} -image-registry=${IMAGE_REGISTRY} -image-namespace=${IMAGE_NAMESPACE} +run-e2ev3-test: + go test -v ./test/e2ev3/ -timeout 1h -tags=e2e -count=1 -args -image-tag=${TAG} -image-registry=${IMAGE_REGISTRY} -image-namespace=${IMAGE_NAMESPACE} + .PHONY: update-hubble update-hubble: @echo "Checking for Hubble updates..." diff --git a/controller/Dockerfile b/controller/Dockerfile index d6b958140c..ff70dafe44 100644 --- a/controller/Dockerfile +++ b/controller/Dockerfile @@ -18,7 +18,8 @@ ARG GOARCH=amd64 # default to amd64 ARG GOOS=linux # default to linux ENV GOARCH=${GOARCH} ENV GOOS=${GOOS} -RUN if [ "$GOOS" = "linux" ] ; then \ +RUN --mount=type=cache,target=/var/cache/tdnf \ + if [ "$GOOS" = "linux" ] ; then \ tdnf install -y clang lld bpftool libbpf-devel; \ fi COPY ./pkg/plugin /go/src/github.com/microsoft/retina/pkg/plugin @@ -68,7 +69,7 @@ RUN --mount=type=cache,target="/root/.cache/go-build" go build -v -o /go/bin/ret # tools image FROM azurelinux-core AS tools -RUN tdnf install -y \ +RUN --mount=type=cache,target=/var/cache/tdnf tdnf install -y \ clang \ iproute \ iptables \ diff --git a/deploy/standard/manifests/controller/helm/retina/crds/monitoring.coreos.com_servicemonitors.yaml b/deploy/standard/manifests/controller/helm/retina/crds/monitoring.coreos.com_servicemonitors.yaml new file mode 100644 index 0000000000..39532119bd --- /dev/null +++ b/deploy/standard/manifests/controller/helm/retina/crds/monitoring.coreos.com_servicemonitors.yaml @@ -0,0 +1,1412 @@ +--- +apiVersion: apiextensions.k8s.io/v1 +kind: CustomResourceDefinition +metadata: + annotations: + controller-gen.kubebuilder.io/version: v0.19.0 + operator.prometheus.io/version: 0.90.0 + name: servicemonitors.monitoring.coreos.com +spec: + group: monitoring.coreos.com + names: 
+ categories: + - prometheus-operator + kind: ServiceMonitor + listKind: ServiceMonitorList + plural: servicemonitors + shortNames: + - smon + singular: servicemonitor + scope: Namespaced + versions: + - name: v1 + schema: + openAPIV3Schema: + description: |- + The `ServiceMonitor` custom resource definition (CRD) defines how `Prometheus` and `PrometheusAgent` can scrape metrics from a group of services. + Among other things, it allows to specify: + * The services to scrape via label selectors. + * The container ports to scrape. + * Authentication credentials to use. + * Target and metric relabeling. + + `Prometheus` and `PrometheusAgent` objects select `ServiceMonitor` objects using label and namespace selectors. + properties: + apiVersion: + description: |- + APIVersion defines the versioned schema of this representation of an object. + Servers should convert recognized schemas to the latest internal value, and + may reject unrecognized values. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources + type: string + kind: + description: |- + Kind is a string value representing the REST resource this object represents. + Servers may infer this from the endpoint the client submits requests to. + Cannot be updated. + In CamelCase. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds + type: string + metadata: + type: object + spec: + description: |- + spec defines the specification of desired Service selection for target discovery by + Prometheus. + properties: + attachMetadata: + description: |- + attachMetadata defines additional metadata which is added to the + discovered targets. + + It requires Prometheus >= v2.37.0. + properties: + node: + description: |- + node when set to true, Prometheus attaches node metadata to the discovered + targets. + + The Prometheus service account must have the `list` and `watch` + permissions on the `Nodes` objects. 
+ type: boolean + type: object + bodySizeLimit: + description: |- + bodySizeLimit when defined, bodySizeLimit specifies a job level limit on the size + of uncompressed response body that will be accepted by Prometheus. + + It requires Prometheus >= v2.28.0. + pattern: (^0|([0-9]*[.])?[0-9]+((K|M|G|T|E|P)i?)?B)$ + type: string + convertClassicHistogramsToNHCB: + description: |- + convertClassicHistogramsToNHCB defines whether to convert all scraped classic histograms into a native histogram with custom buckets. + It requires Prometheus >= v3.0.0. + type: boolean + endpoints: + description: |- + endpoints defines the list of endpoints part of this ServiceMonitor. + Defines how to scrape metrics from Kubernetes [Endpoints](https://kubernetes.io/docs/concepts/services-networking/service/#endpoints) objects. + In most cases, an Endpoints object is backed by a Kubernetes [Service](https://kubernetes.io/docs/concepts/services-networking/service/) object with the same name and labels. + items: + description: |- + Endpoint defines an endpoint serving Prometheus metrics to be scraped by + Prometheus. + properties: + authorization: + description: |- + authorization configures the Authorization header credentials used by + the client. + + Cannot be set at the same time as `basicAuth`, `bearerTokenSecret` or `oauth2`. + properties: + credentials: + description: credentials defines a key of a Secret in the + namespace that contains the credentials for authentication. + properties: + key: + description: The key of the secret to select from. Must + be a valid secret key. + type: string + name: + default: "" + description: |- + Name of the referent. + This field is effectively required, but due to backwards compatibility is + allowed to be empty. Instances of this type with an empty value here are + almost certainly wrong. 
+ More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names + type: string + optional: + description: Specify whether the Secret or its key must + be defined + type: boolean + required: + - key + type: object + x-kubernetes-map-type: atomic + type: + description: |- + type defines the authentication type. The value is case-insensitive. + + "Basic" is not a supported value. + + Default: "Bearer" + type: string + type: object + basicAuth: + description: |- + basicAuth defines the Basic Authentication credentials used by the + client. + + Cannot be set at the same time as `authorization`, `bearerTokenSecret` or `oauth2`. + properties: + password: + description: |- + password defines a key of a Secret containing the password for + authentication. + properties: + key: + description: The key of the secret to select from. Must + be a valid secret key. + type: string + name: + default: "" + description: |- + Name of the referent. + This field is effectively required, but due to backwards compatibility is + allowed to be empty. Instances of this type with an empty value here are + almost certainly wrong. + More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names + type: string + optional: + description: Specify whether the Secret or its key must + be defined + type: boolean + required: + - key + type: object + x-kubernetes-map-type: atomic + username: + description: |- + username defines a key of a Secret containing the username for + authentication. + properties: + key: + description: The key of the secret to select from. Must + be a valid secret key. + type: string + name: + default: "" + description: |- + Name of the referent. + This field is effectively required, but due to backwards compatibility is + allowed to be empty. Instances of this type with an empty value here are + almost certainly wrong. 
+ More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names + type: string + optional: + description: Specify whether the Secret or its key must + be defined + type: boolean + required: + - key + type: object + x-kubernetes-map-type: atomic + type: object + bearerTokenFile: + description: |- + bearerTokenFile defines the file to read bearer token for scraping the target. + + Deprecated: use `authorization` instead. + type: string + bearerTokenSecret: + description: |- + bearerTokenSecret defines a key of a Secret containing the bearer token + used by the client for authentication. The secret needs to be in the + same namespace as the custom resource and readable by the Prometheus + Operator. + + Cannot be set at the same time as `authorization`, `basicAuth` or `oauth2`. + + Deprecated: use `authorization` instead. + properties: + key: + description: The key of the secret to select from. Must + be a valid secret key. + type: string + name: + default: "" + description: |- + Name of the referent. + This field is effectively required, but due to backwards compatibility is + allowed to be empty. Instances of this type with an empty value here are + almost certainly wrong. + More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names + type: string + optional: + description: Specify whether the Secret or its key must + be defined + type: boolean + required: + - key + type: object + x-kubernetes-map-type: atomic + enableHttp2: + description: enableHttp2 can be used to disable HTTP2. + type: boolean + filterRunning: + description: |- + filterRunning when true, the pods which are not running (e.g. either in Failed or + Succeeded state) are dropped during the target discovery. + + If unset, the filtering is enabled. 
+ + More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle/#pod-phase + type: boolean + followRedirects: + description: |- + followRedirects defines whether the client should follow HTTP 3xx + redirects. + type: boolean + honorLabels: + description: |- + honorLabels defines when true the metric's labels when they collide + with the target's labels. + type: boolean + honorTimestamps: + description: |- + honorTimestamps defines whether Prometheus preserves the timestamps + when exposed by the target. + type: boolean + interval: + description: |- + interval at which Prometheus scrapes the metrics from the target. + + If empty, Prometheus uses the global scrape interval. + pattern: ^(0|(([0-9]+)y)?(([0-9]+)w)?(([0-9]+)d)?(([0-9]+)h)?(([0-9]+)m)?(([0-9]+)s)?(([0-9]+)ms)?)$ + type: string + metricRelabelings: + description: |- + metricRelabelings defines the relabeling rules to apply to the + samples before ingestion. + items: + description: |- + RelabelConfig allows dynamic rewriting of the label set for targets, alerts, + scraped samples and remote write samples. + + More info: https://prometheus.io/docs/prometheus/latest/configuration/configuration/#relabel_config + properties: + action: + default: replace + description: |- + action to perform based on the regex matching. + + `Uppercase` and `Lowercase` actions require Prometheus >= v2.36.0. + `DropEqual` and `KeepEqual` actions require Prometheus >= v2.41.0. + + Default: "Replace" + enum: + - replace + - Replace + - keep + - Keep + - drop + - Drop + - hashmod + - HashMod + - labelmap + - LabelMap + - labeldrop + - LabelDrop + - labelkeep + - LabelKeep + - lowercase + - Lowercase + - uppercase + - Uppercase + - keepequal + - KeepEqual + - dropequal + - DropEqual + type: string + modulus: + description: |- + modulus to take of the hash of the source label values. + + Only applicable when the action is `HashMod`. 
+ format: int64 + type: integer + regex: + description: regex defines the regular expression against + which the extracted value is matched. + type: string + replacement: + description: |- + replacement value against which a Replace action is performed if the + regular expression matches. + + Regex capture groups are available. + type: string + separator: + description: separator defines the string between concatenated + SourceLabels. + type: string + sourceLabels: + description: |- + sourceLabels defines the source labels select values from existing labels. Their content is + concatenated using the configured Separator and matched against the + configured regular expression. + items: + description: |- + LabelName is a valid Prometheus label name. + For Prometheus 3.x, a label name is valid if it contains UTF-8 characters. + For Prometheus 2.x, a label name is only valid if it contains ASCII characters, letters, numbers, as well as underscores. + type: string + type: array + targetLabel: + description: |- + targetLabel defines the label to which the resulting string is written in a replacement. + + It is mandatory for `Replace`, `HashMod`, `Lowercase`, `Uppercase`, + `KeepEqual` and `DropEqual` actions. + + Regex capture groups are available. + type: string + type: object + type: array + noProxy: + description: |- + noProxy defines a comma-separated string that can contain IPs, CIDR notation, domain names + that should be excluded from proxying. IP and domain names can + contain port numbers. + + It requires Prometheus >= v2.43.0, Alertmanager >= v0.25.0 or Thanos >= v0.32.0. + type: string + oauth2: + description: |- + oauth2 defines the OAuth2 settings used by the client. + + It requires Prometheus >= 2.27.0. + + Cannot be set at the same time as `authorization`, `basicAuth` or `bearerTokenSecret`. + properties: + clientId: + description: |- + clientId defines a key of a Secret or ConfigMap containing the + OAuth2 client's ID. 
+ properties: + configMap: + description: configMap defines the ConfigMap containing + data to use for the targets. + properties: + key: + description: The key to select. + type: string + name: + default: "" + description: |- + Name of the referent. + This field is effectively required, but due to backwards compatibility is + allowed to be empty. Instances of this type with an empty value here are + almost certainly wrong. + More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names + type: string + optional: + description: Specify whether the ConfigMap or its + key must be defined + type: boolean + required: + - key + type: object + x-kubernetes-map-type: atomic + secret: + description: secret defines the Secret containing data + to use for the targets. + properties: + key: + description: The key of the secret to select from. Must + be a valid secret key. + type: string + name: + default: "" + description: |- + Name of the referent. + This field is effectively required, but due to backwards compatibility is + allowed to be empty. Instances of this type with an empty value here are + almost certainly wrong. + More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names + type: string + optional: + description: Specify whether the Secret or its key + must be defined + type: boolean + required: + - key + type: object + x-kubernetes-map-type: atomic + type: object + clientSecret: + description: |- + clientSecret defines a key of a Secret containing the OAuth2 + client's secret. + properties: + key: + description: The key of the secret to select from. Must + be a valid secret key. + type: string + name: + default: "" + description: |- + Name of the referent. + This field is effectively required, but due to backwards compatibility is + allowed to be empty. Instances of this type with an empty value here are + almost certainly wrong. 
+ More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names + type: string + optional: + description: Specify whether the Secret or its key must + be defined + type: boolean + required: + - key + type: object + x-kubernetes-map-type: atomic + endpointParams: + additionalProperties: + type: string + description: |- + endpointParams configures the HTTP parameters to append to the token + URL. + type: object + noProxy: + description: |- + noProxy defines a comma-separated string that can contain IPs, CIDR notation, domain names + that should be excluded from proxying. IP and domain names can + contain port numbers. + + It requires Prometheus >= v2.43.0, Alertmanager >= v0.25.0 or Thanos >= v0.32.0. + type: string + proxyConnectHeader: + additionalProperties: + items: + description: SecretKeySelector selects a key of a Secret. + properties: + key: + description: The key of the secret to select from. Must + be a valid secret key. + type: string + name: + default: "" + description: |- + Name of the referent. + This field is effectively required, but due to backwards compatibility is + allowed to be empty. Instances of this type with an empty value here are + almost certainly wrong. + More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names + type: string + optional: + description: Specify whether the Secret or its key + must be defined + type: boolean + required: + - key + type: object + x-kubernetes-map-type: atomic + type: array + description: |- + proxyConnectHeader optionally specifies headers to send to + proxies during CONNECT requests. + + It requires Prometheus >= v2.43.0, Alertmanager >= v0.25.0 or Thanos >= v0.32.0. + type: object + x-kubernetes-map-type: atomic + proxyFromEnvironment: + description: |- + proxyFromEnvironment defines whether to use the proxy configuration defined by environment variables (HTTP_PROXY, HTTPS_PROXY, and NO_PROXY). 
+ + It requires Prometheus >= v2.43.0, Alertmanager >= v0.25.0 or Thanos >= v0.32.0. + type: boolean + proxyUrl: + description: proxyUrl defines the HTTP proxy server to use. + pattern: ^(http|https|socks5)://.+$ + type: string + scopes: + description: scopes defines the OAuth2 scopes used for the + token request. + items: + type: string + type: array + tlsConfig: + description: |- + tlsConfig defines the TLS configuration to use when connecting to the OAuth2 server. + It requires Prometheus >= v2.43.0. + properties: + ca: + description: ca defines the Certificate authority used + when verifying server certificates. + properties: + configMap: + description: configMap defines the ConfigMap containing + data to use for the targets. + properties: + key: + description: The key to select. + type: string + name: + default: "" + description: |- + Name of the referent. + This field is effectively required, but due to backwards compatibility is + allowed to be empty. Instances of this type with an empty value here are + almost certainly wrong. + More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names + type: string + optional: + description: Specify whether the ConfigMap or + its key must be defined + type: boolean + required: + - key + type: object + x-kubernetes-map-type: atomic + secret: + description: secret defines the Secret containing + data to use for the targets. + properties: + key: + description: The key of the secret to select + from. Must be a valid secret key. + type: string + name: + default: "" + description: |- + Name of the referent. + This field is effectively required, but due to backwards compatibility is + allowed to be empty. Instances of this type with an empty value here are + almost certainly wrong. 
+ More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names + type: string + optional: + description: Specify whether the Secret or its + key must be defined + type: boolean + required: + - key + type: object + x-kubernetes-map-type: atomic + type: object + cert: + description: cert defines the Client certificate to + present when doing client-authentication. + properties: + configMap: + description: configMap defines the ConfigMap containing + data to use for the targets. + properties: + key: + description: The key to select. + type: string + name: + default: "" + description: |- + Name of the referent. + This field is effectively required, but due to backwards compatibility is + allowed to be empty. Instances of this type with an empty value here are + almost certainly wrong. + More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names + type: string + optional: + description: Specify whether the ConfigMap or + its key must be defined + type: boolean + required: + - key + type: object + x-kubernetes-map-type: atomic + secret: + description: secret defines the Secret containing + data to use for the targets. + properties: + key: + description: The key of the secret to select + from. Must be a valid secret key. + type: string + name: + default: "" + description: |- + Name of the referent. + This field is effectively required, but due to backwards compatibility is + allowed to be empty. Instances of this type with an empty value here are + almost certainly wrong. + More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names + type: string + optional: + description: Specify whether the Secret or its + key must be defined + type: boolean + required: + - key + type: object + x-kubernetes-map-type: atomic + type: object + insecureSkipVerify: + description: insecureSkipVerify defines how to disable + target certificate validation. 
+ type: boolean + keySecret: + description: keySecret defines the Secret containing + the client key file for the targets. + properties: + key: + description: The key of the secret to select from. Must + be a valid secret key. + type: string + name: + default: "" + description: |- + Name of the referent. + This field is effectively required, but due to backwards compatibility is + allowed to be empty. Instances of this type with an empty value here are + almost certainly wrong. + More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names + type: string + optional: + description: Specify whether the Secret or its key + must be defined + type: boolean + required: + - key + type: object + x-kubernetes-map-type: atomic + maxVersion: + description: |- + maxVersion defines the maximum acceptable TLS version. + + It requires Prometheus >= v2.41.0 or Thanos >= v0.31.0. + enum: + - TLS10 + - TLS11 + - TLS12 + - TLS13 + type: string + minVersion: + description: |- + minVersion defines the minimum acceptable TLS version. + + It requires Prometheus >= v2.35.0 or Thanos >= v0.28.0. + enum: + - TLS10 + - TLS11 + - TLS12 + - TLS13 + type: string + serverName: + description: serverName is used to verify the hostname + for the targets. + type: string + type: object + tokenUrl: + description: tokenUrl defines the URL to fetch the token + from. + minLength: 1 + type: string + required: + - clientId + - clientSecret + - tokenUrl + type: object + params: + additionalProperties: + items: + type: string + type: array + description: params define optional HTTP URL parameters. + type: object + path: + description: |- + path defines the HTTP path from which to scrape for metrics. + + If empty, Prometheus uses the default value (e.g. `/metrics`). + type: string + port: + description: |- + port defines the name of the Service port which this endpoint refers to. + + It takes precedence over `targetPort`. 
+ type: string + proxyConnectHeader: + additionalProperties: + items: + description: SecretKeySelector selects a key of a Secret. + properties: + key: + description: The key of the secret to select from. Must + be a valid secret key. + type: string + name: + default: "" + description: |- + Name of the referent. + This field is effectively required, but due to backwards compatibility is + allowed to be empty. Instances of this type with an empty value here are + almost certainly wrong. + More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names + type: string + optional: + description: Specify whether the Secret or its key must + be defined + type: boolean + required: + - key + type: object + x-kubernetes-map-type: atomic + type: array + description: |- + proxyConnectHeader optionally specifies headers to send to + proxies during CONNECT requests. + + It requires Prometheus >= v2.43.0, Alertmanager >= v0.25.0 or Thanos >= v0.32.0. + type: object + x-kubernetes-map-type: atomic + proxyFromEnvironment: + description: |- + proxyFromEnvironment defines whether to use the proxy configuration defined by environment variables (HTTP_PROXY, HTTPS_PROXY, and NO_PROXY). + + It requires Prometheus >= v2.43.0, Alertmanager >= v0.25.0 or Thanos >= v0.32.0. + type: boolean + proxyUrl: + description: proxyUrl defines the HTTP proxy server to use. + pattern: ^(http|https|socks5)://.+$ + type: string + relabelings: + description: |- + relabelings defines the relabeling rules to apply the target's + metadata labels. + + The Operator automatically adds relabelings for a few standard Kubernetes fields. + + The original scrape job's name is available via the `__tmp_prometheus_job_name` label. + + More info: https://prometheus.io/docs/prometheus/latest/configuration/configuration/#relabel_config + items: + description: |- + RelabelConfig allows dynamic rewriting of the label set for targets, alerts, + scraped samples and remote write samples. 
+ + More info: https://prometheus.io/docs/prometheus/latest/configuration/configuration/#relabel_config + properties: + action: + default: replace + description: |- + action to perform based on the regex matching. + + `Uppercase` and `Lowercase` actions require Prometheus >= v2.36.0. + `DropEqual` and `KeepEqual` actions require Prometheus >= v2.41.0. + + Default: "Replace" + enum: + - replace + - Replace + - keep + - Keep + - drop + - Drop + - hashmod + - HashMod + - labelmap + - LabelMap + - labeldrop + - LabelDrop + - labelkeep + - LabelKeep + - lowercase + - Lowercase + - uppercase + - Uppercase + - keepequal + - KeepEqual + - dropequal + - DropEqual + type: string + modulus: + description: |- + modulus to take of the hash of the source label values. + + Only applicable when the action is `HashMod`. + format: int64 + type: integer + regex: + description: regex defines the regular expression against + which the extracted value is matched. + type: string + replacement: + description: |- + replacement value against which a Replace action is performed if the + regular expression matches. + + Regex capture groups are available. + type: string + separator: + description: separator defines the string between concatenated + SourceLabels. + type: string + sourceLabels: + description: |- + sourceLabels defines the source labels select values from existing labels. Their content is + concatenated using the configured Separator and matched against the + configured regular expression. + items: + description: |- + LabelName is a valid Prometheus label name. + For Prometheus 3.x, a label name is valid if it contains UTF-8 characters. + For Prometheus 2.x, a label name is only valid if it contains ASCII characters, letters, numbers, as well as underscores. + type: string + type: array + targetLabel: + description: |- + targetLabel defines the label to which the resulting string is written in a replacement. 
+ + It is mandatory for `Replace`, `HashMod`, `Lowercase`, `Uppercase`, + `KeepEqual` and `DropEqual` actions. + + Regex capture groups are available. + type: string + type: object + type: array + scheme: + description: scheme defines the HTTP scheme to use when scraping + the metrics. + enum: + - http + - https + - HTTP + - HTTPS + type: string + scrapeTimeout: + description: |- + scrapeTimeout defines the timeout after which Prometheus considers the scrape to be failed. + + If empty, Prometheus uses the global scrape timeout unless it is less + than the target's scrape interval value in which the latter is used. + The value cannot be greater than the scrape interval otherwise the operator will reject the resource. + pattern: ^(0|(([0-9]+)y)?(([0-9]+)w)?(([0-9]+)d)?(([0-9]+)h)?(([0-9]+)m)?(([0-9]+)s)?(([0-9]+)ms)?)$ + type: string + targetPort: + anyOf: + - type: integer + - type: string + description: |- + targetPort defines the name or number of the target port of the `Pod` object behind the + Service. The port must be specified with the container's port property. + x-kubernetes-int-or-string: true + tlsConfig: + description: tlsConfig defines TLS configuration used by the + client. + properties: + ca: + description: ca defines the Certificate authority used when + verifying server certificates. + properties: + configMap: + description: configMap defines the ConfigMap containing + data to use for the targets. + properties: + key: + description: The key to select. + type: string + name: + default: "" + description: |- + Name of the referent. + This field is effectively required, but due to backwards compatibility is + allowed to be empty. Instances of this type with an empty value here are + almost certainly wrong. 
+ More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names + type: string + optional: + description: Specify whether the ConfigMap or its + key must be defined + type: boolean + required: + - key + type: object + x-kubernetes-map-type: atomic + secret: + description: secret defines the Secret containing data + to use for the targets. + properties: + key: + description: The key of the secret to select from. Must + be a valid secret key. + type: string + name: + default: "" + description: |- + Name of the referent. + This field is effectively required, but due to backwards compatibility is + allowed to be empty. Instances of this type with an empty value here are + almost certainly wrong. + More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names + type: string + optional: + description: Specify whether the Secret or its key + must be defined + type: boolean + required: + - key + type: object + x-kubernetes-map-type: atomic + type: object + caFile: + description: caFile defines the path to the CA cert in the + Prometheus container to use for the targets. + type: string + cert: + description: cert defines the Client certificate to present + when doing client-authentication. + properties: + configMap: + description: configMap defines the ConfigMap containing + data to use for the targets. + properties: + key: + description: The key to select. + type: string + name: + default: "" + description: |- + Name of the referent. + This field is effectively required, but due to backwards compatibility is + allowed to be empty. Instances of this type with an empty value here are + almost certainly wrong. 
+ More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names + type: string + optional: + description: Specify whether the ConfigMap or its + key must be defined + type: boolean + required: + - key + type: object + x-kubernetes-map-type: atomic + secret: + description: secret defines the Secret containing data + to use for the targets. + properties: + key: + description: The key of the secret to select from. Must + be a valid secret key. + type: string + name: + default: "" + description: |- + Name of the referent. + This field is effectively required, but due to backwards compatibility is + allowed to be empty. Instances of this type with an empty value here are + almost certainly wrong. + More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names + type: string + optional: + description: Specify whether the Secret or its key + must be defined + type: boolean + required: + - key + type: object + x-kubernetes-map-type: atomic + type: object + certFile: + description: certFile defines the path to the client cert + file in the Prometheus container for the targets. + type: string + insecureSkipVerify: + description: insecureSkipVerify defines how to disable target + certificate validation. + type: boolean + keyFile: + description: keyFile defines the path to the client key + file in the Prometheus container for the targets. + type: string + keySecret: + description: keySecret defines the Secret containing the + client key file for the targets. + properties: + key: + description: The key of the secret to select from. Must + be a valid secret key. + type: string + name: + default: "" + description: |- + Name of the referent. + This field is effectively required, but due to backwards compatibility is + allowed to be empty. Instances of this type with an empty value here are + almost certainly wrong. 
+ More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names + type: string + optional: + description: Specify whether the Secret or its key must + be defined + type: boolean + required: + - key + type: object + x-kubernetes-map-type: atomic + maxVersion: + description: |- + maxVersion defines the maximum acceptable TLS version. + + It requires Prometheus >= v2.41.0 or Thanos >= v0.31.0. + enum: + - TLS10 + - TLS11 + - TLS12 + - TLS13 + type: string + minVersion: + description: |- + minVersion defines the minimum acceptable TLS version. + + It requires Prometheus >= v2.35.0 or Thanos >= v0.28.0. + enum: + - TLS10 + - TLS11 + - TLS12 + - TLS13 + type: string + serverName: + description: serverName is used to verify the hostname for + the targets. + type: string + type: object + trackTimestampsStaleness: + description: |- + trackTimestampsStaleness defines whether Prometheus tracks staleness of + the metrics that have an explicit timestamp present in scraped data. + Has no effect if `honorTimestamps` is false. + + It requires Prometheus >= v2.48.0. + type: boolean + type: object + type: array + fallbackScrapeProtocol: + description: |- + fallbackScrapeProtocol defines the protocol to use if a scrape returns blank, unparseable, or otherwise invalid Content-Type. + + It requires Prometheus >= v3.0.0. + enum: + - PrometheusProto + - OpenMetricsText0.0.1 + - OpenMetricsText1.0.0 + - PrometheusText0.0.4 + - PrometheusText1.0.0 + type: string + jobLabel: + description: |- + jobLabel selects the label from the associated Kubernetes `Service` + object which will be used as the `job` label for all metrics. + + For example if `jobLabel` is set to `foo` and the Kubernetes `Service` + object is labeled with `foo: bar`, then Prometheus adds the `job="bar"` + label to all ingested metrics. 
+ + If the value of this field is empty or if the label doesn't exist for + the given Service, the `job` label of the metrics defaults to the name + of the associated Kubernetes `Service`. + type: string + keepDroppedTargets: + description: |- + keepDroppedTargets defines the per-scrape limit on the number of targets dropped by relabeling + that will be kept in memory. 0 means no limit. + + It requires Prometheus >= v2.47.0. + format: int64 + type: integer + labelLimit: + description: |- + labelLimit defines the per-scrape limit on number of labels that will be accepted for a sample. + + It requires Prometheus >= v2.27.0. + format: int64 + type: integer + labelNameLengthLimit: + description: |- + labelNameLengthLimit defines the per-scrape limit on length of labels name that will be accepted for a sample. + + It requires Prometheus >= v2.27.0. + format: int64 + type: integer + labelValueLengthLimit: + description: |- + labelValueLengthLimit defines the per-scrape limit on length of labels value that will be accepted for a sample. + + It requires Prometheus >= v2.27.0. + format: int64 + type: integer + namespaceSelector: + description: |- + namespaceSelector defines in which namespace(s) Prometheus should discover the services. + By default, the services are discovered in the same namespace as the `ServiceMonitor` object but it is possible to select pods across different/all namespaces. + properties: + any: + description: |- + any defines the boolean describing whether all namespaces are selected in contrast to a + list restricting them. + type: boolean + matchNames: + description: matchNames defines the list of namespace names to + select from. + items: + type: string + type: array + type: object + nativeHistogramBucketLimit: + description: |- + nativeHistogramBucketLimit defines if there are more than this many buckets in a native histogram, + buckets will be merged to stay within the limit. + It requires Prometheus >= v2.45.0. 
+ format: int64 + type: integer + nativeHistogramMinBucketFactor: + anyOf: + - type: integer + - type: string + description: |- + nativeHistogramMinBucketFactor defines if the growth factor of one bucket to the next is smaller than this, + buckets will be merged to increase the factor sufficiently. + It requires Prometheus >= v2.50.0. + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + podTargetLabels: + description: |- + podTargetLabels defines the labels which are transferred from the + associated Kubernetes `Pod` object onto the ingested metrics. + items: + type: string + type: array + sampleLimit: + description: |- + sampleLimit defines a per-scrape limit on the number of scraped samples + that will be accepted. + format: int64 + type: integer + scrapeClass: + description: scrapeClass defines the scrape class to apply. + minLength: 1 + type: string + scrapeClassicHistograms: + description: |- + scrapeClassicHistograms defines whether to scrape a classic histogram that is also exposed as a native histogram. + It requires Prometheus >= v2.45.0. + + Notice: `scrapeClassicHistograms` corresponds to the `always_scrape_classic_histograms` field in the Prometheus configuration. + type: boolean + scrapeNativeHistograms: + description: |- + scrapeNativeHistograms defines whether to enable scraping of native histograms. + It requires Prometheus >= v3.8.0. + type: boolean + scrapeProtocols: + description: |- + scrapeProtocols defines the protocols to negotiate during a scrape. It tells clients the + protocols supported by Prometheus in order of preference (from most to least preferred). + + If unset, Prometheus uses its default value. + + It requires Prometheus >= v2.49.0. + items: + description: |- + ScrapeProtocol represents a protocol used by Prometheus for scraping metrics. 
+ Supported values are: + * `OpenMetricsText0.0.1` + * `OpenMetricsText1.0.0` + * `PrometheusProto` + * `PrometheusText0.0.4` + * `PrometheusText1.0.0` + enum: + - PrometheusProto + - OpenMetricsText0.0.1 + - OpenMetricsText1.0.0 + - PrometheusText0.0.4 + - PrometheusText1.0.0 + type: string + type: array + x-kubernetes-list-type: set + selector: + description: selector defines the label selector to select the Kubernetes + `Endpoints` objects to scrape metrics from. + properties: + matchExpressions: + description: matchExpressions is a list of label selector requirements. + The requirements are ANDed. + items: + description: |- + A label selector requirement is a selector that contains values, a key, and an operator that + relates the key and values. + properties: + key: + description: key is the label key that the selector applies + to. + type: string + operator: + description: |- + operator represents a key's relationship to a set of values. + Valid operators are In, NotIn, Exists and DoesNotExist. + type: string + values: + description: |- + values is an array of string values. If the operator is In or NotIn, + the values array must be non-empty. If the operator is Exists or DoesNotExist, + the values array must be empty. This array is replaced during a strategic + merge patch. + items: + type: string + type: array + x-kubernetes-list-type: atomic + required: + - key + - operator + type: object + type: array + x-kubernetes-list-type: atomic + matchLabels: + additionalProperties: + type: string + description: |- + matchLabels is a map of {key,value} pairs. A single {key,value} in the matchLabels + map is equivalent to an element of matchExpressions, whose key field is "key", the + operator is "In", and the values array contains only "value". The requirements are ANDed. + type: object + type: object + x-kubernetes-map-type: atomic + selectorMechanism: + description: |- + selectorMechanism defines the mechanism used to select the endpoints to scrape. 
+ By default, the selection process relies on relabel configurations to filter the discovered targets. + Alternatively, you can opt in for role selectors, which may offer better efficiency in large clusters. + Which strategy is best for your use case needs to be carefully evaluated. + + It requires Prometheus >= v2.17.0. + enum: + - RelabelConfig + - RoleSelector + type: string + serviceDiscoveryRole: + description: |- + serviceDiscoveryRole defines the service discovery role used to discover targets. + + If set, the value should be either "Endpoints" or "EndpointSlice". + Otherwise it defaults to the value defined in the + Prometheus/PrometheusAgent resource. + enum: + - Endpoints + - EndpointSlice + type: string + targetLabels: + description: |- + targetLabels defines the labels which are transferred from the + associated Kubernetes `Service` object onto the ingested metrics. + items: + type: string + type: array + targetLimit: + description: |- + targetLimit defines a limit on the number of scraped targets that will + be accepted. + format: int64 + type: integer + required: + - endpoints + - selector + type: object + status: + description: |- + status defines the status subresource. It is under active development and is updated only when the + "StatusForConfigurationResources" feature gate is enabled. + + Most recent observed status of the ServiceMonitor. Read-only. + More info: + https://github.com/kubernetes/community/blob/master/contributors/devel/sig-architecture/api-conventions.md#spec-and-status + properties: + bindings: + description: bindings defines the list of workload resources (Prometheus, + PrometheusAgent, ThanosRuler or Alertmanager) which select the configuration + resource. + items: + description: WorkloadBinding is a link between a configuration resource + and a workload resource. + properties: + conditions: + description: conditions defines the current state of the configuration + resource when bound to the referenced Workload object. 
+ items: + description: ConfigResourceCondition describes the status + of configuration resources linked to Prometheus, PrometheusAgent, + Alertmanager or ThanosRuler. + properties: + lastTransitionTime: + description: lastTransitionTime defines the time of the + last update to the current status property. + format: date-time + type: string + message: + description: message defines the human-readable message + indicating details for the condition's last transition. + type: string + observedGeneration: + description: |- + observedGeneration defines the .metadata.generation that the + condition was set based upon. For instance, if `.metadata.generation` is + currently 12, but the `.status.conditions[].observedGeneration` is 9, the + condition is out of date with respect to the current state of the object. + format: int64 + type: integer + reason: + description: reason for the condition's last transition. + type: string + status: + description: status of the condition. + minLength: 1 + type: string + type: + description: |- + type of the condition being reported. + Currently, only "Accepted" is supported. + enum: + - Accepted + minLength: 1 + type: string + required: + - lastTransitionTime + - status + - type + type: object + type: array + x-kubernetes-list-map-keys: + - type + x-kubernetes-list-type: map + group: + description: group defines the group of the referenced resource. + enum: + - monitoring.coreos.com + type: string + name: + description: name defines the name of the referenced object. + minLength: 1 + type: string + namespace: + description: namespace defines the namespace of the referenced + object. + minLength: 1 + type: string + resource: + description: resource defines the type of resource being referenced + (e.g. Prometheus, PrometheusAgent, ThanosRuler or Alertmanager). 
+ enum: + - prometheuses + - prometheusagents + - thanosrulers + - alertmanagers + type: string + required: + - group + - name + - namespace + - resource + type: object + type: array + x-kubernetes-list-map-keys: + - group + - resource + - name + - namespace + x-kubernetes-list-type: map + type: object + required: + - spec + type: object + served: true + storage: true + subresources: + status: {} diff --git a/go.mod b/go.mod index 2552fc6360..55d2492385 100644 --- a/go.mod +++ b/go.mod @@ -361,6 +361,7 @@ require ( github.com/Azure/go-autorest/autorest/azure/auth v0.5.12 // indirect github.com/Azure/go-autorest/autorest/azure/cli v0.4.6 // indirect github.com/Azure/go-autorest/autorest/to v0.4.0 // indirect + github.com/Azure/go-workflow v0.1.13 // indirect github.com/Azure/msi-dataplane v0.4.3 // indirect github.com/Crocmagnon/fatcontext v0.7.1 // indirect github.com/Djarvur/go-err113 v0.0.0-20210108212216-aea10b59be24 // indirect @@ -391,6 +392,7 @@ require ( github.com/awslabs/amazon-ecr-credential-helper/ecr-login v0.0.0-20240514230400-03fa26f5508f // indirect github.com/aymanbagabas/go-osc52/v2 v2.0.1 // indirect github.com/bahlo/generic-list-go v0.2.0 // indirect + github.com/benbjohnson/clock v1.3.5 // indirect github.com/bkielbasa/cyclop v1.2.3 // indirect github.com/blacktop/go-dwarf v1.0.9 // indirect github.com/blacktop/go-macho v1.1.162 // indirect diff --git a/go.sum b/go.sum index 50dcee13ba..28502742e9 100644 --- a/go.sum +++ b/go.sum @@ -142,6 +142,8 @@ github.com/Azure/go-autorest/logger v0.2.1 h1:IG7i4p/mDa2Ce4TRyAO8IHnVhAVF3RFU+Z github.com/Azure/go-autorest/logger v0.2.1/go.mod h1:T9E3cAhj2VqvPOtCYAvby9aBXkZmbF5NWuPV8+WeEW8= github.com/Azure/go-autorest/tracing v0.6.0 h1:TYi4+3m5t6K48TGI9AUdb+IzbnSxvnvUMfuitfgcfuo= github.com/Azure/go-autorest/tracing v0.6.0/go.mod h1:+vhtPC754Xsa23ID7GlGsrdKBpUA79WCAKPPZVC2DeU= +github.com/Azure/go-workflow v0.1.13 h1:tqYmmKsw068Uu/OKRIctQEe1H6BZyXs9mJtzocc7jtE= +github.com/Azure/go-workflow v0.1.13/go.mod 
h1:gOt4hadDnP+SzV+ywWJRcM1BSopJ+1rfGlhrTIR040I= github.com/Azure/msi-dataplane v0.4.3 h1:dWPWzY4b54tLIR9T1Q014Xxd/1DxOsMIp6EjRFAJlQY= github.com/Azure/msi-dataplane v0.4.3/go.mod h1:yAfxdJyvcnvSDfSyOFV9qm4fReEQDl+nZLGeH2ZWSmw= github.com/Azure/perf-tests/network/benchmarks/netperf v0.0.0-20241008140716-395a79947d2c h1:TMXh4Z1Z98o4Ob7JYhiNHocITQXHqJSsrB5ts4uwKl8= @@ -295,6 +297,8 @@ github.com/aymanbagabas/go-osc52/v2 v2.0.1/go.mod h1:uYgXzlJ7ZpABp8OJ+exZzJJhRNQ github.com/bahlo/generic-list-go v0.2.0 h1:5sz/EEAK+ls5wF+NeqDpk5+iNdMDXrh3z3nPnH1Wvgk= github.com/bahlo/generic-list-go v0.2.0/go.mod h1:2KvAjgMlE5NNynlg/5iLrrCCZ2+5xWbdbCW3pNTGyYg= github.com/benbjohnson/clock v1.1.0/go.mod h1:J11/hYXuz8f4ySSvYwY0FKfm+ezbsZBKZxNJlLklBHA= +github.com/benbjohnson/clock v1.3.5 h1:VvXlSJBzZpA/zum6Sj74hxwYI2DIxRWuNIoXAzHZz5o= +github.com/benbjohnson/clock v1.3.5/go.mod h1:J11/hYXuz8f4ySSvYwY0FKfm+ezbsZBKZxNJlLklBHA= github.com/beorn7/perks v0.0.0-20180321164747-3a771d992973/go.mod h1:Dwedo/Wpr24TaqPxmxbtue+5NUziq4I4S80YR8gNf3Q= github.com/beorn7/perks v1.0.0/go.mod h1:KWe93zE9D1o94FZ5RNwFwVgaQK1VOXiVxmqh+CedLV8= github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM= diff --git a/operator/Dockerfile b/operator/Dockerfile index a6e34f5037..5d8ccdbd05 100644 --- a/operator/Dockerfile +++ b/operator/Dockerfile @@ -5,9 +5,14 @@ ARG VERSION ARG APP_INSIGHTS_ID WORKDIR /workspace + +# Cache module download separately from source changes. +COPY go.mod go.sum ./ +RUN go mod download + COPY . . -RUN tdnf install -y jq +RUN --mount=type=cache,target=/var/cache/tdnf tdnf install -y jq # Default linux/architecture. 
ARG GOOS=linux diff --git a/test/e2e/framework/prometheus/prometheus.go b/test/e2e/framework/prometheus/prometheus.go index ebd0cb185c..b65764fe0a 100644 --- a/test/e2e/framework/prometheus/prometheus.go +++ b/test/e2e/framework/prometheus/prometheus.go @@ -81,23 +81,55 @@ func CheckMetricFromBuffer(prometheusMetricData []byte, metricName string, valid return nil } +func formatMetricDetail(name string, mf *promclient.MetricFamily, m *promclient.Metric) string { + var sb strings.Builder + sb.WriteString(name) + sb.WriteString("{") + for i, label := range m.GetLabel() { + if i > 0 { + sb.WriteString(", ") + } + fmt.Fprintf(&sb, "%s=%q", label.GetName(), label.GetValue()) + } + sb.WriteString("}") + + switch mf.GetType() { + case promclient.MetricType_COUNTER: + fmt.Fprintf(&sb, " counter:%v", m.GetCounter().GetValue()) + case promclient.MetricType_GAUGE: + fmt.Fprintf(&sb, " gauge:%v", m.GetGauge().GetValue()) + case promclient.MetricType_HISTOGRAM: + h := m.GetHistogram() + fmt.Fprintf(&sb, " histogram:count=%v sum=%v", h.GetSampleCount(), h.GetSampleSum()) + case promclient.MetricType_SUMMARY: + s := m.GetSummary() + fmt.Fprintf(&sb, " summary:count=%v sum=%v", s.GetSampleCount(), s.GetSampleSum()) + case promclient.MetricType_UNTYPED: + fmt.Fprintf(&sb, " untyped:%v", m.GetUntyped().GetValue()) + } + + return sb.String() +} + func verifyValidMetricPresent(metricName string, data map[string]*promclient.MetricFamily, validMetric map[string]string) error { - for _, metric := range data { - if metric.GetName() == metricName { - for _, metric := range metric.GetMetric() { + for _, mf := range data { + if mf.GetName() == metricName { + for _, m := range mf.GetMetric() { // get all labels and values on the metric metricLabels := map[string]string{} - for _, label := range metric.GetLabel() { + for _, label := range m.GetLabel() { metricLabels[label.GetName()] = label.GetValue() } // if valid metric is empty, then we just need to make sure the metric and value is present 
if len(validMetric) == 0 && len(metricLabels) > 0 { + log.Printf("found matching metric: %s", formatMetricDetail(metricName, mf, m)) return nil } if reflect.DeepEqual(metricLabels, validMetric) { + log.Printf("found matching metric: %s", formatMetricDetail(metricName, mf, m)) return nil } } @@ -130,18 +162,19 @@ func getAllPrometheusMetricsFromURL(url string) (map[string]*promclient.MetricFa // verifyValidMetricPresentPartial checks if a metric exists with labels that contain // all the key-value pairs in validMetric (partial matching - the metric can have additional labels) func verifyValidMetricPresentPartial(metricName string, data map[string]*promclient.MetricFamily, validMetric map[string]string) error { - for _, metric := range data { - if metric.GetName() == metricName { - for _, metric := range metric.GetMetric() { + for _, mf := range data { + if mf.GetName() == metricName { + for _, m := range mf.GetMetric() { // get all labels and values on the metric metricLabels := map[string]string{} - for _, label := range metric.GetLabel() { + for _, label := range m.GetLabel() { metricLabels[label.GetName()] = label.GetValue() } // if valid metric is empty, then we just need to make sure the metric and value is present if len(validMetric) == 0 && len(metricLabels) > 0 { + log.Printf("found matching metric: %s", formatMetricDetail(metricName, mf, m)) return nil } @@ -155,6 +188,7 @@ func verifyValidMetricPresentPartial(metricName string, data map[string]*promcli } if allMatch { + log.Printf("found matching metric: %s", formatMetricDetail(metricName, mf, m)) return nil } } diff --git a/test/e2e/scenarios/dns/validate-advanced-dns-metric.go b/test/e2e/scenarios/dns/validate-advanced-dns-metric.go index 00badbe8bb..de92a3295b 100644 --- a/test/e2e/scenarios/dns/validate-advanced-dns-metric.go +++ b/test/e2e/scenarios/dns/validate-advanced-dns-metric.go @@ -4,7 +4,6 @@ package dns import ( "fmt" - "log" "github.com/microsoft/retina/test/e2e/framework/constants" 
"github.com/microsoft/retina/test/e2e/framework/kubernetes" @@ -50,8 +49,6 @@ func (v *ValidateAdvancedDNSRequestMetrics) Run() error { if err != nil { return errors.Wrapf(err, "failed to verify advance dns request metrics %s", dnsAdvRequestCountMetricName) } - log.Printf("found metrics matching %+v\n", dnsAdvRequestCountMetricName) - return nil } @@ -106,8 +103,6 @@ func (v *ValidateAdvanceDNSResponseMetrics) Run() error { if err != nil { return errors.Wrapf(err, "failed to verify advance dns response metrics %s", dnsAdvRequestCountMetricName) } - log.Printf("found metrics matching %+v\n", dnsAdvResponseCountMetricName) - return nil } diff --git a/test/e2e/scenarios/dns/validate-basic-dns-metric.go b/test/e2e/scenarios/dns/validate-basic-dns-metric.go index 91662e1bb5..2c047dde3c 100644 --- a/test/e2e/scenarios/dns/validate-basic-dns-metric.go +++ b/test/e2e/scenarios/dns/validate-basic-dns-metric.go @@ -4,7 +4,6 @@ package dns import ( "fmt" - "log" "github.com/microsoft/retina/test/e2e/framework/constants" prom "github.com/microsoft/retina/test/e2e/framework/prometheus" @@ -30,8 +29,6 @@ func (v *validateBasicDNSRequestMetrics) Run() error { if err != nil { return errors.Wrapf(err, "failed to verify basic dns request metrics %s", dnsBasicRequestCountMetricName) } - log.Printf("found metrics matching %+v\n", dnsBasicRequestCountMetricName) - return nil } @@ -64,8 +61,6 @@ func (v *validateBasicDNSResponseMetrics) Run() error { if err != nil { return errors.Wrapf(err, "failed to verify basic dns response metrics %s", dnsBasicResponseCountMetricName) } - log.Printf("found metrics matching %+v\n", dnsBasicResponseCountMetricName) - return nil } diff --git a/test/e2e/scenarios/drop/validate-drop-metric.go b/test/e2e/scenarios/drop/validate-drop-metric.go index 7b647e65f8..e2a759612a 100644 --- a/test/e2e/scenarios/drop/validate-drop-metric.go +++ b/test/e2e/scenarios/drop/validate-drop-metric.go @@ -2,7 +2,6 @@ package drop import ( "fmt" - "log" prom 
"github.com/microsoft/retina/test/e2e/framework/prometheus" ) @@ -45,7 +44,6 @@ func (v *ValidateRetinaDropMetric) Run() error { return fmt.Errorf("failed to verify prometheus metrics %s: %w", dropBytesMetricName, err) } - log.Printf("found metrics matching %+v\n", metric) return nil } diff --git a/test/e2e/scenarios/latency/validate-latency-metric.go b/test/e2e/scenarios/latency/validate-latency-metric.go index 270ae95969..0da7d3341b 100644 --- a/test/e2e/scenarios/latency/validate-latency-metric.go +++ b/test/e2e/scenarios/latency/validate-latency-metric.go @@ -2,7 +2,6 @@ package latency import ( "fmt" - "log" "github.com/microsoft/retina/test/e2e/framework/constants" prom "github.com/microsoft/retina/test/e2e/framework/prometheus" @@ -25,8 +24,6 @@ func (v *ValidateAPIServerLatencyMetric) Run() error { if err != nil { return errors.Wrapf(err, "failed to verify latency metrics %s", latencyBucketMetricName) } - - log.Printf("found metrics matching %s\n", latencyBucketMetricName) return nil } diff --git a/test/e2e/scenarios/tcp/validate-flow-metric.go b/test/e2e/scenarios/tcp/validate-flow-metric.go index 9ade9947b9..8463f7b04f 100644 --- a/test/e2e/scenarios/tcp/validate-flow-metric.go +++ b/test/e2e/scenarios/tcp/validate-flow-metric.go @@ -2,7 +2,6 @@ package flow import ( "fmt" - "log" prom "github.com/microsoft/retina/test/e2e/framework/prometheus" ) @@ -37,7 +36,6 @@ func (v *ValidateRetinaTCPStateMetric) Run() error { } } - log.Printf("found metrics matching %+v\n", validMetrics) return nil } diff --git a/test/e2e/scenarios/tcp/validate-tcp-connection-remote.go b/test/e2e/scenarios/tcp/validate-tcp-connection-remote.go index 70c1c7fb97..9fe94e156e 100644 --- a/test/e2e/scenarios/tcp/validate-tcp-connection-remote.go +++ b/test/e2e/scenarios/tcp/validate-tcp-connection-remote.go @@ -2,7 +2,6 @@ package flow import ( "fmt" - "log" prom "github.com/microsoft/retina/test/e2e/framework/prometheus" ) @@ -30,7 +29,6 @@ func (v 
*ValidateRetinaTCPConnectionRemoteMetric) Run() error { } } - log.Printf("found metrics matching %+v\n", tcpConnectionRemoteMetricName) return nil } diff --git a/test/e2ev3/Makefile b/test/e2ev3/Makefile new file mode 100644 index 0000000000..21887fb218 --- /dev/null +++ b/test/e2ev3/Makefile @@ -0,0 +1,50 @@ +TIMEOUT_KIND ?= 60m +TIMEOUT_AZURE ?= 120m +PROVIDER ?= kind +KUBECONFIG ?= +CREATE_INFRA ?= true +DELETE_INFRA ?= true + +REPO_ROOT := $(shell git rev-parse --show-toplevel) + +GO_TEST = cd $(REPO_ROOT) && go test -v -tags e2e ./test/e2ev3/ -timeout + +# base flags, computed from variables above +FLAGS = -provider=$(PROVIDER) -create-infra=$(CREATE_INFRA) -delete-infra=$(DELETE_INFRA) +ifdef KUBECONFIG +FLAGS += -kubeconfig=$(KUBECONFIG) +endif + +# timeout picked by provider +ifeq ($(PROVIDER),kind) +TIMEOUT = $(TIMEOUT_KIND) +else +TIMEOUT = $(TIMEOUT_AZURE) +endif + +.PHONY: test-e2e test-basic-metrics test-advanced-metrics test-hubble-metrics test-capture \ + test-basic-metrics-exp test-advanced-metrics-exp help + +test-e2e: ## run all e2e scenarios, defaults to kind + $(GO_TEST) $(TIMEOUT) $(FLAGS) + +test-basic-metrics: ## run basic metrics scenarios (drop, tcp, dns) + $(GO_TEST) $(TIMEOUT) -run TestE2ERetina/BasicMetrics$$ $(FLAGS) + +test-advanced-metrics: ## run advanced metrics scenarios (dns, latency) + $(GO_TEST) $(TIMEOUT) -run TestE2ERetina/AdvancedMetrics$$ $(FLAGS) + +test-hubble-metrics: ## run hubble metrics scenarios (drop, tcp, dns, flows) + $(GO_TEST) $(TIMEOUT) -run TestE2ERetina/HubbleMetrics$$ $(FLAGS) + +test-capture: ## run packet capture scenarios + $(GO_TEST) $(TIMEOUT) -run TestE2ERetina/Capture$$ $(FLAGS) + +test-basic-metrics-exp: ## run experimental basic metrics scenarios + $(GO_TEST) $(TIMEOUT) -run TestE2ERetina/BasicMetricsExperimental$$ $(FLAGS) + +test-advanced-metrics-exp: ## run experimental advanced metrics scenarios + $(GO_TEST) $(TIMEOUT) -run TestE2ERetina/AdvancedMetricsExperimental$$ $(FLAGS) + +help: ## show 
this help + @grep -E '^[a-z-]+:.*## ' $(MAKEFILE_LIST) | awk -F ':.*## ' '{printf " %-26s %s\n", $$1, $$2}' diff --git a/test/e2ev3/README.md b/test/e2ev3/README.md new file mode 100644 index 0000000000..8feb69edef --- /dev/null +++ b/test/e2ev3/README.md @@ -0,0 +1,171 @@ +# Retina E2E Tests (v3) + +End-to-end tests built on [go-workflow](https://github.com/Azure/go-workflow), a DAG-based test orchestration framework. + +## Prerequisites + +- Go 1.24+ +- Docker (required for the Kind provider) + +## Environment Variables + +| Variable | Required | Default | Description | +|---|---|---|---| +| `TAG` | No | `git describe` | Image tag. If unset, images are built from source. | +| `IMAGE_NAMESPACE` | No | `microsoft/retina` | Image namespace | +| `IMAGE_REGISTRY` | No | `ghcr.io` | Container registry | +| `AZURE_SUBSCRIPTION_ID` | Azure only | — | Azure subscription ID | +| `AZURE_LOCATION` | Azure only | — | Azure region (fallback: `LOCATION`) | +| `AZURE_RESOURCE_GROUP` | Azure only | — | Resource group name | +| `CLUSTER_NAME` | Azure only | — | AKS cluster name | +| `HELM_DRIVER` | No | `secrets` | Helm storage driver | + +## Test Flags + +| Flag | Default | Description | +|---|---|---| +| `-provider` | `azure` | Infrastructure provider: `azure` or `kind` | +| `-kubeconfig` | `""` | Path to an existing kubeconfig (skips infra creation) | +| `-create-infra` | `true` | Create infrastructure before tests | +| `-delete-infra` | `true` | Delete infrastructure after tests | + +## Running Tests + +All commands are run from `test/e2ev3/`. + +### Make Targets + +```bash +make test-e2e # Run all scenarios +make test-basic-metrics # Drop, TCP, DNS +make test-advanced-metrics # DNS, latency +make test-hubble-metrics # Hubble drop, TCP, DNS, flows +make test-capture # Packet capture +make test-basic-metrics-exp # Experimental basic metrics +make test-advanced-metrics-exp # Experimental advanced metrics +``` + +The default provider is `kind`. 
When no `TAG` is set, images are built from source automatically using `git describe` as the tag (agent, init, and operator for linux/amd64). For Kind, images are built locally; for Azure, they are built and pushed to the registry. + +Override with Make variables: + +```bash +# Use an existing Kind cluster +make test-basic-metrics KUBECONFIG=$HOME/.kube/config CREATE_INFRA=false DELETE_INFRA=false + +# Run against Azure +make test-e2e PROVIDER=azure +``` + +### Kind (Local) + +With no environment variables, images are built from source and loaded onto a new Kind cluster: + +```bash +make test-e2e +``` + +Or with an explicit tag pointing at pre-built images: + +```bash +TAG=v0.0.1 \ +IMAGE_NAMESPACE=retina \ +IMAGE_REGISTRY=ghcr.io/microsoft \ + go test -v -tags e2e ./test/e2ev3/ \ + -provider=kind \ + -timeout 60m +``` + +Use an existing Kind cluster: + +```bash +TAG=v0.0.1 \ +IMAGE_NAMESPACE=retina \ +IMAGE_REGISTRY=ghcr.io/microsoft \ + go test -v -tags e2e ./test/e2ev3/ \ + -provider=kind \ + -kubeconfig=$HOME/.kube/config \ + -create-infra=false \ + -delete-infra=false \ + -timeout 60m +``` + +### Azure (AKS) + +Create an AKS cluster, run all scenarios, and tear down: + +```bash +TAG=v0.0.1 \ +IMAGE_NAMESPACE=retina \ +IMAGE_REGISTRY=ghcr.io/microsoft \ +AZURE_SUBSCRIPTION_ID= \ +AZURE_LOCATION=eastus2 \ +AZURE_RESOURCE_GROUP=retina-e2e-rg \ +CLUSTER_NAME=retina-e2e \ + go test -v -tags e2e ./test/e2ev3/ \ + -provider=azure \ + -timeout 120m +``` + +Use an existing AKS cluster: + +```bash +TAG=v0.0.1 \ +IMAGE_NAMESPACE=retina \ +IMAGE_REGISTRY=ghcr.io/microsoft \ + go test -v -tags e2e ./test/e2ev3/ \ + -kubeconfig=$HOME/.kube/config \ + -create-infra=false \ + -delete-infra=false \ + -timeout 120m +``` + +### Running a Specific Sub-Test + +> **Note:** The test pipeline runs as a single `flow.Pipe` — there are no Go +> sub-tests. The individual Makefile targets (`test-basic-metrics`, etc.) +> currently run the full pipeline. 
To run a subset, use `-kubeconfig` to point +> at an existing cluster and comment out unwanted steps in +> `retina_e2e_test.go`. + +## Workflow Structure + +Each scenario follows the same DAG pattern: + +``` +create → exec → validate (retry with backoff) → cleanup (always) +``` + +- **Create** — Provision resources (pods, network policies). +- **Exec** — Generate traffic (curl, nslookup). +- **Validate** — Port-forward to Retina or Hubble and assert Prometheus metrics. Retried with exponential backoff. +- **Cleanup** — Delete resources. Runs even if validation fails via `When(flow.Always)`. + +## Directory Layout + +``` +test/e2ev3/ +├── retina_e2e_test.go # Test entry point (declarative pipeline) +├── Makefile # Make targets +├── config/ # E2E config, flags, paths, shared params +│ ├── e2e.go # Config types, env loading, E2EParams +│ └── load_step.go # config.Step — resolves config + image tag +├── pkg/ +│ ├── images/ # Image loading interface + images.Step +│ │ ├── build/ # Build images from source + build.Step +│ │ └── load/ # Load images onto clusters (Kind sideload vs registry pull) +│ ├── infra/ # Infrastructure orchestration + infra.Workflow +│ │ └── providers/ +│ │ ├── azure/ # AKS cluster provisioning (ARM templates) +│ │ └── kind/ # Kind cluster lifecycle (native SDK) +│ ├── kubernetes/ # Reusable K8s steps (Helm, pods, port-forward, exec) +│ ├── prometheus/ # Prometheus metric scraping and validation +│ └── utils/ # Shared utilities +└── workflows/ + ├── basicmetrics/ # Drop, TCP, DNS scenarios + │ └── experimental/ # Experimental basic metrics (conntrack, forward, etc.) + ├── advancedmetrics/ # DNS, latency scenarios (upgraded Helm profile) + │ └── experimental/ # Experimental advanced metrics (drop, forward, etc.) 
+ ├── hubblemetrics/ # Hubble drop, TCP, DNS, flow scenarios + └── capture/ # Packet capture validation +``` diff --git a/test/e2ev3/config/e2e.go b/test/e2ev3/config/e2e.go new file mode 100644 index 0000000000..5d34570092 --- /dev/null +++ b/test/e2ev3/config/e2e.go @@ -0,0 +1,215 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. + +package config + +import ( + "context" + "flag" + "fmt" + "log/slog" + "os/exec" + "os/signal" + "path/filepath" + "strings" + "syscall" + "testing" + "time" + + "github.com/microsoft/retina/test/e2ev3/pkg/infra/providers/azure" + "github.com/microsoft/retina/test/e2ev3/pkg/infra/providers/kind" + "github.com/spf13/viper" + "k8s.io/client-go/rest" +) + +// E2EConfig holds all configuration and runtime state for e2e tests. +// Fields are populated incrementally by pipeline steps. +type E2EConfig struct { + Cluster ClusterProvider + Image ImageConfig + Scale ScaleConfig + Helm HelmConfig + Paths Paths +} + +// ClusterProvider abstracts cluster-specific behaviors. +// Implementations live in pkg/infra/providers/{kind,azure}. +type ClusterProvider interface { + ClusterName() string + KubeConfigPath() string + RestConfig() *rest.Config + LoadImages(ctx context.Context, images []string) error + ImagePullPolicy() string + ImagePullSecrets() []map[string]interface{} +} + +// ImageConfig holds container image coordinates. +type ImageConfig struct { + Tag string + Namespace string + Registry string +} + +// ScaleConfig holds scale-test parameters. +type ScaleConfig struct { + Nodes string + NumDeployments string + NumReplicas string + NumNetworkPolicies string + CleanUp string +} + +// HelmConfig holds Helm-specific settings. +type HelmConfig struct { + Driver string +} + +// Flags parsed from test command line. 
+var ( + CreateInfra = flag.Bool("create-infra", true, "create infrastructure for testing") + DeleteInfra = flag.Bool("delete-infra", true, "delete infrastructure after testing") + KubeConfig = flag.String("kubeconfig", "", "path to kubeconfig file") + Provider = flag.String("provider", "azure", "infrastructure provider: azure or kind") + ForceBuild = flag.Bool("force-build", false, "rebuild images even if they already exist locally") +) + +const ( + KubeSystemNamespace = "kube-system" + TestPodNamespace = "kube-system-test" + safetyTimeout = 24 * time.Hour +) + +// Architectures lists the CPU architectures to test across. +// Kind clusters are single-arch (amd64), so arm64 is only tested on Azure. +var Architectures []string + +// Paths holds resolved filesystem paths relative to the repository root. +type Paths struct { + RootDir string + RetinaChart string + HubbleChart string + AdvancedProfile string +} + +// ResolvePaths computes all standard paths from the repository root directory. +func ResolvePaths(rootDir string) *Paths { + return &Paths{ + RootDir: rootDir, + RetinaChart: filepath.Join(rootDir, "deploy", "standard", "manifests", "controller", "helm", "retina"), + HubbleChart: filepath.Join(rootDir, "deploy", "hubble", "manifests", "controller", "helm", "retina"), + AdvancedProfile: filepath.Join(rootDir, "test", "profiles", "advanced", "values.yaml"), + } +} + +// TestContext returns a context with a deadline set to the test deadline minus 1 min to ensure cleanup. +// If the test deadline is not set, a deadline is set to Now + 24h to prevent the test from running indefinitely. 
+func TestContext(t *testing.T) (context.Context, context.CancelFunc) {
+	t.Helper()
+
+	deadline, ok := t.Deadline()
+	if !ok {
+		t.Log("Test deadline disabled, deadline set to Now + 24h to prevent test from running indefinitely")
+		deadline = time.Now().Add(safetyTimeout)
+	}
+	// Reserve one minute before the deadline so cleanup steps can still run.
+	deadline = deadline.Add(-time.Minute)
+
+	// Chain both CancelFuncs. Reassigning cancel (as the previous //nolint'd
+	// version did) dropped the deadline context's CancelFunc, keeping its
+	// timer alive until the deadline fired even after the caller cancelled.
+	deadlineCtx, cancelDeadline := context.WithDeadline(context.Background(), deadline)
+	sigCtx, cancelSignal := signal.NotifyContext(deadlineCtx, syscall.SIGINT, syscall.SIGTERM)
+
+	return sigCtx, func() {
+		cancelSignal()
+		cancelDeadline()
+	}
+}
+
+// DevTag returns a tag derived from git describe, suitable for local dev builds.
+func DevTag(rootDir string) (string, error) {
+	cmd := exec.Command("git", "describe", "--tags", "--always")
+	cmd.Dir = rootDir
+
+	out, err := cmd.Output()
+	if err != nil {
+		return "", fmt.Errorf("git describe: %w", err)
+	}
+	return strings.TrimSpace(string(out)), nil
+}
+
+// LoadE2EConfig reads environment variables via viper and returns a populated E2EConfig.
+func LoadE2EConfig() (*E2EConfig, error) {
+	v := viper.New()
+
+	// Bind each env var explicitly — env var names don't match struct field paths.
+	bindings := map[string]string{
+		"azure.subscriptionid":     "AZURE_SUBSCRIPTION_ID",
+		"azure.location":           "AZURE_LOCATION",
+		"azure.resourcegroup":      "AZURE_RESOURCE_GROUP",
+		"azure.clustername":        "CLUSTER_NAME",
+		"image.tag":                "TAG",
+		"image.namespace":          "IMAGE_NAMESPACE",
+		"image.registry":           "IMAGE_REGISTRY",
+		"scale.nodes":              "NODES",
+		"scale.numdeployments":     "NUM_DEPLOYMENTS",
+		"scale.numreplicas":        "NUM_REPLICAS",
+		"scale.numnetworkpolicies": "NUM_NET_POL",
+		"scale.cleanup":            "CLEANUP",
+		"helm.driver":              "HELM_DRIVER",
+	}
+
+	for key, env := range bindings {
+		if err := v.BindEnv(key, env); err != nil {
+			return nil, fmt.Errorf("binding env %s to %s: %w", env, key, err)
+		}
+	}
+
+	// Also accept LOCATION as a fallback for AZURE_LOCATION.
+ if v.GetString("azure.location") == "" { + if err := v.BindEnv("azure.location", "LOCATION"); err != nil { + return nil, fmt.Errorf("binding env LOCATION: %w", err) + } + } + + // Build the provider-specific cluster config. + var cluster ClusterProvider + switch *Provider { + case "kind": + Architectures = []string{"amd64"} + cluster = &kind.Cluster{ + Name: v.GetString("azure.clustername"), + } + default: + Architectures = []string{"amd64", "arm64"} + cluster = &azure.Cluster{ + SubscriptionID: v.GetString("azure.subscriptionid"), + Location: v.GetString("azure.location"), + ResourceGroup: v.GetString("azure.resourcegroup"), + Name: v.GetString("azure.clustername"), + } + } + + cfg := &E2EConfig{ + Cluster: cluster, + Image: ImageConfig{ + Tag: v.GetString("image.tag"), + Namespace: v.GetString("image.namespace"), + Registry: v.GetString("image.registry"), + }, + Scale: ScaleConfig{ + Nodes: v.GetString("scale.nodes"), + NumDeployments: v.GetString("scale.numdeployments"), + NumReplicas: v.GetString("scale.numreplicas"), + NumNetworkPolicies: v.GetString("scale.numnetworkpolicies"), + CleanUp: v.GetString("scale.cleanup"), + }, + Helm: HelmConfig{ + Driver: v.GetString("helm.driver"), + }, + } + + if cfg.Image.Registry == "" { + cfg.Image.Registry = "ghcr.io" + } + if cfg.Image.Namespace == "" { + cfg.Image.Namespace = "microsoft/retina" + } + + slog.Info("using image", "registry", cfg.Image.Registry, "namespace", cfg.Image.Namespace, "tag", cfg.Image.Tag) + + return cfg, nil +} diff --git a/test/e2ev3/config/load_step.go b/test/e2ev3/config/load_step.go new file mode 100644 index 0000000000..4571ded1e9 --- /dev/null +++ b/test/e2ev3/config/load_step.go @@ -0,0 +1,54 @@ +//go:build e2e + +package config + +import ( + "context" + "fmt" + "log/slog" + "os" + "path/filepath" + + "github.com/microsoft/retina/test/e2ev3/pkg/infra/providers/azure" + "github.com/microsoft/retina/test/e2ev3/pkg/infra/providers/kind" +) + +// Step resolves e2e config, paths, and image 
tag. +type Step struct { + Cfg *E2EConfig +} + +func (l *Step) String() string { return "load-config" } + +func (l *Step) Do(ctx context.Context) error { + log := slog.With("step", l.String()) + cfg, err := LoadE2EConfig() + if err != nil { + return fmt.Errorf("load e2e config: %w", err) + } + + cwd, err := os.Getwd() + if err != nil { + return fmt.Errorf("get cwd: %w", err) + } + *l.Cfg = *cfg + l.Cfg.Paths = *ResolvePaths(filepath.Dir(filepath.Dir(cwd))) + + kubeCfgPath := filepath.Join(l.Cfg.Paths.RootDir, "test", "e2e", "test.pem") + switch c := l.Cfg.Cluster.(type) { + case *kind.Cluster: + c.KubeCfgPath = kubeCfgPath + case *azure.Cluster: + c.KubeCfgPath = kubeCfgPath + } + + if l.Cfg.Image.Tag == "" { + tag, err := DevTag(l.Cfg.Paths.RootDir) + if err != nil { + return fmt.Errorf("generate dev tag: %w", err) + } + l.Cfg.Image.Tag = tag + log.Info("no TAG provided, will build images", "tag", tag) + } + return nil +} diff --git a/test/e2ev3/config/metrics.go b/test/e2ev3/config/metrics.go new file mode 100644 index 0000000000..b4526eb5c2 --- /dev/null +++ b/test/e2ev3/config/metrics.go @@ -0,0 +1,48 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. 
+
+package config
+
+const (
+	// Retina Metrics Port
+	RetinaMetricsPort = "10093"
+
+	// Retina MetricsName
+	RetinaDropMetricName    = "networkobservability_drop_count"
+	RetinaForwardMetricName = "networkobservability_forward_count"
+
+	// Retina Labels
+	RetinaSourceLabel      = "source"
+	RetinaDestinationLabel = "destination"
+	RetinaProtocolLabel    = "protocol"
+	RetinaReasonLabel      = "reason"
+	RetinaDirectionLabel   = "direction"
+
+	// Hubble Metrics Port
+	HubbleMetricsPort = "9965"
+
+	// Hubble MetricsName
+	HubbleDNSQueryMetricName    = "hubble_dns_queries_total"
+	HubbleDNSResponseMetricName = "hubble_dns_responses_total"
+	HubbleFlowMetricName        = "hubble_flows_processed_total"
+	HubbleDropMetricName        = "hubble_drop_total"
+	HubbleTCPFlagsMetricName    = "hubble_tcp_flags_total"
+
+	// Hubble Labels
+	HubbleDestinationLabel = "destination"
+	HubbleSourceLabel      = "source"
+	// NOTE(review): "Retuned" is a typo for "Returned". The label value itself
+	// is correct; renaming the constant would break callers, so fix it in a
+	// coordinated follow-up change.
+	HubbleIPsRetunedLabel = "ips_returned"
+	HubbleQTypesLabel     = "qtypes"
+	HubbleRCodeLabel      = "rcode"
+	HubbleQueryLabel      = "query"
+
+	HubbleProtocolLabel = "protocol"
+	HubbleReasonLabel   = "reason"
+
+	HubbleSubtypeLabel = "subtype"
+	HubbleTypeLabel    = "type"
+	HubbleVerdictLabel = "verdict"
+
+	HubbleFamilyLabel = "family"
+	HubbleFlagLabel   = "flag"
+)
diff --git a/test/e2ev3/config/network.go b/test/e2ev3/config/network.go
new file mode 100644
index 0000000000..7f3904baea
--- /dev/null
+++ b/test/e2ev3/config/network.go
@@ -0,0 +1,18 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT license.
+
+package config
+
+const (
+	MetricsEndpoint = "metrics"
+
+	TCP             = "TCP"
+	UDP             = "UDP"
+	IPV4            = "IPv4"
+	IPTableRuleDrop = "IPTABLE_RULE_DROP"
+	SYN             = "SYN"
+	SYNACK          = "SYN-ACK"
+	ACK             = "ACK"
+	FIN             = "FIN"
+	RST             = "RST"
+)
diff --git a/test/e2ev3/pkg/images/build/build.go b/test/e2ev3/pkg/images/build/build.go
new file mode 100644
index 0000000000..546fc8eaa5
--- /dev/null
+++ b/test/e2ev3/pkg/images/build/build.go
@@ -0,0 +1,103 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT license. + +//go:build e2e + +package build + +import ( + "context" + "fmt" + "log/slog" + "os/exec" + "strings" + + "github.com/microsoft/retina/test/e2ev3/config" + "github.com/microsoft/retina/test/e2ev3/pkg/images" + "github.com/microsoft/retina/test/e2ev3/pkg/utils" +) + +// Step builds Retina container images by invoking the top-level Makefile. +// It builds the agent, init, and operator images for linux/amd64. +// If all images already exist locally and ForceBuild is false, the build is skipped. +type Step struct { + Cfg *config.E2EConfig +} + +func (b *Step) String() string { return "build-images" } + +func (b *Step) Do(ctx context.Context) error { + ctx, log := utils.StepLogger(ctx, b) + img := &b.Cfg.Image + if !*config.ForceBuild && allImagesExist(img.Registry, img.Namespace, img.Tag) { + log.Info("all images already present locally, skipping build") + return nil + } + + push := *config.Provider != "kind" + return b.build(ctx, b.Cfg.Paths.RootDir, img.Registry, img.Namespace, img.Tag, push) +} + +func (b *Step) build(ctx context.Context, rootDir, registry, namespace, tag string, push bool) error { + targets := []string{"retina-image", "retina-operator-image"} + + errs := make(chan error, len(targets)) + for _, target := range targets { + go func(t string) { + errs <- runMake(ctx, rootDir, registry, namespace, tag, push, t) + }(target) + } + + var firstErr error + for range targets { + if err := <-errs; err != nil && firstErr == nil { + firstErr = err + } + } + return firstErr +} + +func runMake(ctx context.Context, rootDir, registry, namespace, tag string, push bool, target string) error { + args := []string{ + target, + "PLATFORM=linux/amd64", + "TAG=" + tag, + "RETINA_PLATFORM_TAG=" + tag, + "IMAGE_REGISTRY=" + registry, + "IMAGE_NAMESPACE=" + namespace, + } + if push { + args = append(args, "BUILDX_ACTION=--push", "OUTPUT_LOCAL=") + } else { + // Load into local docker daemon for Kind sideloading. 
+ // Disable provenance/sbom attestations — Kind's ctr import can't handle them. + args = append(args, "BUILDX_ACTION=--load --provenance=false --sbom=false", "OUTPUT_LOCAL=") + } + + slog.Info("building image", "command", "make "+strings.Join(args, " ")) + + cmd := exec.CommandContext(ctx, "make", args...) + cmd.Dir = rootDir + cmdOut := &utils.SlogWriter{Level: slog.LevelInfo, Source: "make-" + target} + cmd.Stdout = cmdOut + cmd.Stderr = cmdOut + + if err := cmd.Run(); err != nil { + cmdOut.Flush() + return fmt.Errorf("make %s failed: %w", target, err) + } + cmdOut.Flush() + return nil +} + +// allImagesExist returns true if every Retina image is already in the local Docker daemon. +func allImagesExist(registry, namespace, tag string) bool { + for _, ref := range images.RetinaImages(registry, namespace, tag) { + cmd := exec.Command("docker", "image", "inspect", ref) + if err := cmd.Run(); err != nil { + return false + } + } + return true +} + diff --git a/test/e2ev3/pkg/images/step.go b/test/e2ev3/pkg/images/step.go new file mode 100644 index 0000000000..3f7ddbccc5 --- /dev/null +++ b/test/e2ev3/pkg/images/step.go @@ -0,0 +1,33 @@ +package images + +import ( + "context" + "log/slog" + + "github.com/microsoft/retina/test/e2ev3/config" +) + +// Step loads container images into the cluster. +type Step struct { + Cfg *config.E2EConfig +} + +func (l *Step) String() string { return "load-images" } + +func (l *Step) Do(ctx context.Context) error { + log := slog.With("step", l.String()) + p := l.Cfg + imgs := RetinaImages(p.Image.Registry, p.Image.Namespace, p.Image.Tag) + log.Info("loading images into cluster", "count", len(imgs), "cluster", p.Cluster.ClusterName()) + return p.Cluster.LoadImages(ctx, imgs) +} + +// RetinaImages returns the standard Retina image references for the given coordinates. 
+func RetinaImages(registry, namespace, tag string) []string { + base := registry + "/" + namespace + return []string{ + base + "/retina-agent:" + tag, + base + "/retina-init:" + tag, + base + "/retina-operator:" + tag, + } +} diff --git a/test/e2ev3/pkg/infra/azure.go b/test/e2ev3/pkg/infra/azure.go new file mode 100644 index 0000000000..599b048518 --- /dev/null +++ b/test/e2ev3/pkg/infra/azure.go @@ -0,0 +1,64 @@ +package infra + +import ( + "context" + "testing" + + flow "github.com/Azure/go-workflow" + "github.com/microsoft/retina/test/e2ev3/pkg/infra/providers/azure" + "github.com/microsoft/retina/test/e2ev3/pkg/infra/providers/azure/arm" +) + +// ResolveInfraConfig builds the Azure infrastructure config from viper-loaded values, +// falling back to a random location and generated cluster name when not set. +func ResolveInfraConfig(t *testing.T, ac *azure.Cluster) *azure.InfraConfig { + t.Helper() + + subID := ac.SubscriptionID + if subID == "" { + t.Fatal("AZURE_SUBSCRIPTION_ID must be set") + } + + location := ac.Location + if location == "" { + location = azure.RandomLocation(t) + } + + clusterName := azure.ClusterNameForE2ETest(t, ac.Name) + + rg := ac.ResourceGroup + if rg == "" { + rg = clusterName + } + + return azure.DefaultE2EInfraConfig(subID, rg, location, clusterName) +} + +// AzureSteps returns the workflow steps to deploy Azure infrastructure and +// retrieve the cluster kubeconfig, plus registers teardown via t.Cleanup. 
+func AzureSteps(t *testing.T, cfg *azure.InfraConfig, kubeConfigFilePath string, createInfra, deleteInfra bool) []flow.Steper { + var steps []flow.Steper + + if createInfra { + steps = append(steps, &arm.DeployInfra{Config: cfg}) + } + + steps = append(steps, &azure.GetAKSKubeConfig{ + SubscriptionID: cfg.SubscriptionID, + ResourceGroupName: cfg.ResourceGroupName, + ClusterName: cfg.ClusterName, + Location: cfg.Location, + KubeConfigFilePath: kubeConfigFilePath, + }) + + if deleteInfra { + t.Cleanup(func() { + del := &arm.DeleteInfra{Config: cfg} + if err := del.Do(context.Background()); err != nil { + t.Logf("Failed to delete test infrastructure: %v", err) + } + }) + } + + return steps +} diff --git a/test/e2ev3/pkg/infra/kind.go b/test/e2ev3/pkg/infra/kind.go new file mode 100644 index 0000000000..6020c737ba --- /dev/null +++ b/test/e2ev3/pkg/infra/kind.go @@ -0,0 +1,44 @@ +package infra + +import ( + "context" + "testing" + + flow "github.com/Azure/go-workflow" + "github.com/microsoft/retina/test/e2ev3/pkg/infra/providers/kind" +) + +// KindSteps returns the workflow steps to provision a Kind cluster and +// export its kubeconfig, plus registers teardown via t.Cleanup. 
+func KindSteps(t *testing.T, cfg *kind.Config, kubeConfigFilePath string, createInfra, deleteInfra bool) []flow.Steper { + var steps []flow.Steper + + if createInfra { + steps = append(steps, &kind.CreateCluster{Config: cfg}) + } + + steps = append(steps, &kind.ExportKubeConfig{ + ClusterName: cfg.ClusterName, + KubeConfigFilePath: kubeConfigFilePath, + }) + + if createInfra { + steps = append(steps, &kind.InstallNPM{ + KubeConfigFilePath: kubeConfigFilePath, + }) + } + + if deleteInfra { + t.Cleanup(func() { + del := &kind.DeleteCluster{ + ClusterName: cfg.ClusterName, + KubeConfigFilePath: kubeConfigFilePath, + } + if err := del.Do(context.Background()); err != nil { + t.Logf("Failed to delete Kind cluster: %v", err) + } + }) + } + + return steps +} diff --git a/test/e2ev3/pkg/infra/providers/azure/arm/deploy.go b/test/e2ev3/pkg/infra/providers/azure/arm/deploy.go new file mode 100644 index 0000000000..11d5e67638 --- /dev/null +++ b/test/e2ev3/pkg/infra/providers/azure/arm/deploy.go @@ -0,0 +1,92 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. + +package arm + +import ( + "context" + "encoding/json" + "fmt" + "log/slog" + "time" + + "github.com/Azure/azure-sdk-for-go/sdk/azcore/runtime" + "github.com/Azure/azure-sdk-for-go/sdk/azcore/to" + "github.com/Azure/azure-sdk-for-go/sdk/azidentity" + "github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/resources/armresources" + "github.com/microsoft/retina/test/e2ev3/pkg/infra/providers/azure" +) + +const ( + deploymentPollFrequency = 30 * time.Second + deploymentStatusTicker = 60 * time.Second +) + +// DeployInfra is a go-workflow step that generates an ARM template from InfraConfig +// and deploys all e2e infrastructure (resource group, VNet, public IPs, AKS cluster) +// in a single subscription-level ARM deployment. 
+type DeployInfra struct { + Config *azure.InfraConfig +} + +func (d *DeployInfra) String() string { return "deploy-azure-infra" } + +func (d *DeployInfra) Do(ctx context.Context) error { + log := slog.With("step", d.String()) + template := GenerateTemplate(d.Config) + + templateJSON, err := json.MarshalIndent(template, "", " ") + if err != nil { + return fmt.Errorf("failed to marshal ARM template: %w", err) + } + log.Info("generated ARM template", "bytes", len(templateJSON), "cluster", d.Config.ClusterName, "location", d.Config.Location) + + cred, err := azidentity.NewAzureCLICredential(nil) + if err != nil { + return fmt.Errorf("failed to obtain Azure CLI credential: %w", err) + } + + client, err := armresources.NewDeploymentsClient(d.Config.SubscriptionID, cred, nil) + if err != nil { + return fmt.Errorf("failed to create deployments client: %w", err) + } + + deploymentName := fmt.Sprintf("e2e-%s", d.Config.ClusterName) + log.Info("starting ARM deployment at subscription scope", "deployment", deploymentName) + + poller, err := client.BeginCreateOrUpdateAtSubscriptionScope(ctx, deploymentName, armresources.Deployment{ + Location: to.Ptr(d.Config.Location), + Properties: &armresources.DeploymentProperties{ + Mode: to.Ptr(armresources.DeploymentModeIncremental), + Template: template, + }, + }, nil) + if err != nil { + return fmt.Errorf("failed to begin ARM deployment: %w", err) + } + + notifychan := make(chan struct{}) + go func() { + _, err = poller.PollUntilDone(ctx, &runtime.PollUntilDoneOptions{ + Frequency: deploymentPollFrequency, + }) + close(notifychan) + }() + + ticker := time.NewTicker(deploymentStatusTicker) + defer ticker.Stop() + for { + select { + case <-ctx.Done(): + return fmt.Errorf("ARM deployment timed out: %w", ctx.Err()) + case <-ticker.C: + log.Info("waiting for ARM deployment to complete", "deployment", deploymentName) + case <-notifychan: + if err != nil { + return fmt.Errorf("ARM deployment %q failed: %w", deploymentName, err) + } + 
log.Info("ARM deployment completed successfully", "deployment", deploymentName) + return nil + } + } +} diff --git a/test/e2ev3/pkg/infra/providers/azure/arm/lifecycle.go b/test/e2ev3/pkg/infra/providers/azure/arm/lifecycle.go new file mode 100644 index 0000000000..fc66192357 --- /dev/null +++ b/test/e2ev3/pkg/infra/providers/azure/arm/lifecycle.go @@ -0,0 +1,93 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. + +package arm + +import ( + "context" + "fmt" + "log/slog" + "time" + + "github.com/Azure/azure-sdk-for-go/sdk/azcore/runtime" + "github.com/Azure/azure-sdk-for-go/sdk/azidentity" + "github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/resources/armresources" + "github.com/microsoft/retina/test/e2ev3/pkg/infra/providers/azure" +) + +// DeleteInfra is a go-workflow step that deletes the resource group created +// by DeployInfra, cascading deletion of all resources within it. +type DeleteInfra struct { + Config *azure.InfraConfig +} + +func (d *DeleteInfra) String() string { return "delete-azure-infra" } + +func (d *DeleteInfra) Do(ctx context.Context) error { + log := slog.With("step", d.String()) + log.Info("deleting resource group and all resources within", "resourceGroup", d.Config.ResourceGroupName) + + cred, err := azidentity.NewAzureCLICredential(nil) + if err != nil { + return fmt.Errorf("failed to obtain Azure CLI credential: %w", err) + } + + clientFactory, err := armresources.NewClientFactory(d.Config.SubscriptionID, cred, nil) + if err != nil { + return fmt.Errorf("failed to create resource group client: %w", err) + } + + forceDeleteType := "Microsoft.Compute/virtualMachines,Microsoft.Compute/virtualMachineScaleSets" + poller, err := clientFactory.NewResourceGroupsClient().BeginDelete(ctx, d.Config.ResourceGroupName, + &armresources.ResourceGroupsClientBeginDeleteOptions{ + ForceDeletionTypes: &forceDeleteType, + }) + if err != nil { + return fmt.Errorf("failed to begin resource group deletion: %w", err) + } + + 
notifychan := make(chan struct{}) + go func() { + _, err = poller.PollUntilDone(ctx, &runtime.PollUntilDoneOptions{ + Frequency: deploymentPollFrequency, + }) + close(notifychan) + }() + + ticker := time.NewTicker(deploymentStatusTicker) + defer ticker.Stop() + for { + select { + case <-ctx.Done(): + return fmt.Errorf("resource group deletion timed out: %w", ctx.Err()) + case <-ticker.C: + log.Info("waiting for resource group deletion", "resourceGroup", d.Config.ResourceGroupName) + case <-notifychan: + if err != nil { + return fmt.Errorf("resource group %q deletion failed: %w", d.Config.ResourceGroupName, err) + } + log.Info("resource group deleted successfully", "resourceGroup", d.Config.ResourceGroupName) + return nil + } + } +} + +// GetKubeConfig is a go-workflow step that retrieves kubeconfig for a cluster +// deployed via ARM template. +type GetKubeConfig struct { + Config *azure.InfraConfig + KubeConfigFilePath string +} + +func (g *GetKubeConfig) String() string { return "get-arm-kubeconfig" } + +func (g *GetKubeConfig) Do(ctx context.Context) error { + step := &azure.GetAKSKubeConfig{ + ClusterName: g.Config.ClusterName, + SubscriptionID: g.Config.SubscriptionID, + ResourceGroupName: g.Config.ResourceGroupName, + Location: g.Config.Location, + KubeConfigFilePath: g.KubeConfigFilePath, + } + return step.Do(ctx) +} diff --git a/test/e2ev3/pkg/infra/providers/azure/arm/template.go b/test/e2ev3/pkg/infra/providers/azure/arm/template.go new file mode 100644 index 0000000000..bf0020aaca --- /dev/null +++ b/test/e2ev3/pkg/infra/providers/azure/arm/template.go @@ -0,0 +1,236 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. 
+ +package arm + +import ( + "encoding/json" + "fmt" + + "github.com/microsoft/retina/test/e2ev3/pkg/infra/providers/azure" +) + +// GenerateTemplate builds a subscription-level ARM template that creates +// all e2e infrastructure in a single deployment: resource group, VNet with +// subnet, public IPs, and AKS cluster. +func GenerateTemplate(cfg *azure.InfraConfig) map[string]any { + nestedResources := []any{buildVNet(cfg)} + + for _, ip := range cfg.PublicIPs { + nestedResources = append(nestedResources, buildPublicIP(cfg, ip)) + } + + nestedResources = append(nestedResources, buildAKSCluster(cfg)) + + return map[string]any{ + "$schema": "https://schema.management.azure.com/schemas/2018-05-01/subscriptionDeploymentTemplate.json#", + "contentVersion": "1.0.0.0", + "resources": []any{ + buildResourceGroup(cfg), + buildNestedDeployment(cfg, nestedResources), + }, + } +} + +// GenerateTemplateJSON returns the ARM template as pretty-printed JSON bytes. +func GenerateTemplateJSON(cfg *azure.InfraConfig) ([]byte, error) { + template := GenerateTemplate(cfg) + return json.MarshalIndent(template, "", " ") +} + +func buildResourceGroup(cfg *azure.InfraConfig) map[string]any { + return map[string]any{ + "type": "Microsoft.Resources/resourceGroups", + "apiVersion": "2022-09-01", + "name": cfg.ResourceGroupName, + "location": cfg.Location, + } +} + +func buildNestedDeployment(cfg *azure.InfraConfig, resources []any) map[string]any { + return map[string]any{ + "type": "Microsoft.Resources/deployments", + "apiVersion": "2022-09-01", + "name": "e2e-infra-deployment", + "resourceGroup": cfg.ResourceGroupName, + "dependsOn": []string{ + fmt.Sprintf("[resourceId('Microsoft.Resources/resourceGroups', '%s')]", cfg.ResourceGroupName), + }, + "properties": map[string]any{ + "mode": "Incremental", + "template": map[string]any{ + "$schema": "https://schema.management.azure.com/schemas/2019-04-01/deploymentTemplate.json#", + "contentVersion": "1.0.0.0", + "resources": resources, + }, + }, 
+ } +} + +func buildVNet(cfg *azure.InfraConfig) map[string]any { + return map[string]any{ + "type": "Microsoft.Network/virtualNetworks", + "apiVersion": "2023-04-01", + "name": cfg.VnetName, + "location": cfg.Location, + "properties": map[string]any{ + "addressSpace": map[string]any{ + "addressPrefixes": []string{cfg.VnetAddressSpace}, + }, + "flowTimeoutInMinutes": 10, + "subnets": []map[string]any{ + { + "name": cfg.SubnetName, + "properties": map[string]any{ + "addressPrefix": cfg.SubnetAddressSpace, + }, + }, + }, + }, + } +} + +func buildPublicIP(cfg *azure.InfraConfig, ip azure.PublicIPConfig) map[string]any { + return map[string]any{ + "type": "Microsoft.Network/publicIPAddresses", + "apiVersion": "2023-04-01", + "name": ip.FullName(cfg.ClusterName), + "location": cfg.Location, + "sku": map[string]any{ + "name": "Standard", + "tier": "Regional", + }, + "properties": map[string]any{ + "publicIPAllocationMethod": "Static", + "publicIPAddressVersion": ip.IPVersion, + "ipTags": []map[string]any{ + { + "ipTagType": "FirstPartyUsage", + "tag": "/NonProd", + }, + }, + }, + } +} + +func buildAKSCluster(cfg *azure.InfraConfig) map[string]any { + subnetRef := fmt.Sprintf("[resourceId('Microsoft.Network/virtualNetworks/subnets', '%s', '%s')]", + cfg.VnetName, cfg.SubnetName) + + // Agent pool profiles + pools := make([]map[string]any, 0, len(cfg.AgentPools)) + for _, pool := range cfg.AgentPools { + p := map[string]any{ + "name": pool.Name, + "count": pool.Count, + "vmSize": pool.VMSize, + "osType": pool.OSType, + "mode": pool.Mode, + "maxPods": pool.MaxPods, + "type": "VirtualMachineScaleSets", + "enableNodePublicIP": false, + "scaleDownMode": "Delete", + "vnetSubnetID": subnetRef, + } + if pool.OSSku != "" { + p["osSku"] = pool.OSSku + } + if pool.EnableFIPS { + p["enableFIPS"] = true + } + pools = append(pools, p) + } + + // Outbound public IP references for load balancer + outboundIPs := make([]map[string]any, 0, len(cfg.PublicIPs)) + for _, ip := range 
cfg.PublicIPs { + outboundIPs = append(outboundIPs, map[string]any{ + "id": fmt.Sprintf("[resourceId('Microsoft.Network/publicIPAddresses', '%s')]", + ip.FullName(cfg.ClusterName)), + }) + } + + // Dependencies + deps := []string{ + fmt.Sprintf("[resourceId('Microsoft.Network/virtualNetworks', '%s')]", cfg.VnetName), + } + for _, ip := range cfg.PublicIPs { + deps = append(deps, fmt.Sprintf("[resourceId('Microsoft.Network/publicIPAddresses', '%s')]", + ip.FullName(cfg.ClusterName))) + } + + // Network profile + networkProfile := map[string]any{ + "networkPlugin": cfg.NetworkPlugin, + "loadBalancerSku": "standard", + "outboundType": "loadBalancer", + } + if cfg.NetworkPolicy != "" { + networkProfile["networkPolicy"] = cfg.NetworkPolicy + } + if cfg.PodCidr != "" { + networkProfile["podCidr"] = cfg.PodCidr + } + if cfg.ServiceCidr != "" { + networkProfile["serviceCidr"] = cfg.ServiceCidr + } + if cfg.DNSServiceIP != "" { + networkProfile["dnsServiceIP"] = cfg.DNSServiceIP + } + if cfg.NetworkPluginMode != "" { + networkProfile["networkPluginMode"] = cfg.NetworkPluginMode + } + if len(outboundIPs) > 0 { + networkProfile["loadBalancerProfile"] = map[string]any{ + "outboundIPs": map[string]any{ + "publicIPs": outboundIPs, + }, + } + } + + // Cluster properties + properties := map[string]any{ + "dnsPrefix": cfg.ClusterName, + "enableRBAC": cfg.EnableRBAC, + "enablePodSecurityPolicy": false, + "agentPoolProfiles": pools, + "networkProfile": networkProfile, + } + + if cfg.AutoUpgradeChannel != "" { + properties["autoUpgradeProfile"] = map[string]any{ + "nodeOSUpgradeChannel": cfg.AutoUpgradeChannel, + } + } + + // Add Windows profile if any pool is Windows + for _, pool := range cfg.AgentPools { + if pool.OSType == "Windows" { + properties["windowsProfile"] = map[string]any{ + "adminUsername": cfg.WindowsAdminUsername, + "adminPassword": cfg.WindowsAdminPassword, + } + break + } + } + + return map[string]any{ + "type": "Microsoft.ContainerService/managedClusters", + 
"apiVersion": "2024-01-01", + "name": cfg.ClusterName, + "location": cfg.Location, + "tags": map[string]string{ + "archv2": "", + "tier": "production", + }, + "identity": map[string]any{ + "type": "SystemAssigned", + }, + "sku": map[string]any{ + "name": "Base", + "tier": "Standard", + }, + "properties": properties, + "dependsOn": deps, + } +} diff --git a/test/e2ev3/pkg/infra/providers/azure/cluster.go b/test/e2ev3/pkg/infra/providers/azure/cluster.go new file mode 100644 index 0000000000..b0fb68213b --- /dev/null +++ b/test/e2ev3/pkg/infra/providers/azure/cluster.go @@ -0,0 +1,34 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. + +package azure + +import ( + "context" + + "k8s.io/client-go/rest" +) + +// Cluster is a ClusterProvider for Azure Kubernetes Service clusters. +// Images are pulled from a container registry, so LoadImages is a no-op. +type Cluster struct { + SubscriptionID string + Location string + ResourceGroup string + Name string + KubeCfgPath string + RC *rest.Config +} + +func (a *Cluster) ClusterName() string { return a.Name } +func (a *Cluster) KubeConfigPath() string { return a.KubeCfgPath } +func (a *Cluster) RestConfig() *rest.Config { return a.RC } + +func (a *Cluster) LoadImages(_ context.Context, _ []string) error { return nil } +func (a *Cluster) ImagePullPolicy() string { return "Always" } + +func (a *Cluster) ImagePullSecrets() []map[string]interface{} { + return []map[string]interface{}{ + {"name": "acr-credentials"}, + } +} diff --git a/test/e2ev3/pkg/infra/providers/azure/config.go b/test/e2ev3/pkg/infra/providers/azure/config.go new file mode 100644 index 0000000000..1b8fe19722 --- /dev/null +++ b/test/e2ev3/pkg/infra/providers/azure/config.go @@ -0,0 +1,159 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. 
+ +package azure + +import ( + "crypto/rand" + "encoding/base64" + "fmt" + "math/big" + "os/user" + "strconv" + "strings" + "testing" + "time" + + "github.com/stretchr/testify/require" +) + +const netObsRGtag = "-e2e-netobs-" + +// AzureLocations is the set of locations randomly chosen when AZURE_LOCATION is unset. +var AzureLocations = []string{"eastus2", "northeurope", "uksouth", "centralindia", "westus2"} + +// RandomLocation picks a random Azure location from AzureLocations. +func RandomLocation(t *testing.T) string { + t.Helper() + nBig, err := rand.Int(rand.Reader, big.NewInt(int64(len(AzureLocations)))) + require.NoError(t, err) + return AzureLocations[nBig.Int64()] +} + +// ClusterNameForE2ETest returns clusterName if set, or generates one from +// the current username and timestamp. +func ClusterNameForE2ETest(t *testing.T, clusterName string) string { + if clusterName == "" { + curuser, err := user.Current() + require.NoError(t, err) + username := curuser.Username + + if len(username) > 8 { + username = username[:8] + t.Logf("Username is too long, truncating to 8 characters: %s", username) + } + clusterName = username + netObsRGtag + strconv.FormatInt(time.Now().Unix(), 10) + t.Logf("CLUSTER_NAME is not set, generating a random cluster name: %s", clusterName) + } + return clusterName +} + +// InfraConfig defines the complete infrastructure configuration for deploying +// all e2e test resources in a single ARM template deployment. 
+type InfraConfig struct { + SubscriptionID string + ResourceGroupName string + Location string + ClusterName string + + // VNet configuration + VnetName string + VnetAddressSpace string + SubnetName string + SubnetAddressSpace string + + // Cluster network configuration + PodCidr string + ServiceCidr string + DNSServiceIP string + + // Public IP configuration + PublicIPs []PublicIPConfig + + // Agent pool configuration + AgentPools []AgentPoolConfig + + // Cluster configuration + NetworkPlugin string + NetworkPolicy string + NetworkPluginMode string + EnableRBAC bool + AutoUpgradeChannel string + + // Windows node configuration + WindowsAdminUsername string + WindowsAdminPassword string +} + +// PublicIPConfig defines a public IP address to create. +type PublicIPConfig struct { + NamePrefix string + IPVersion string // "IPv4" or "IPv6" +} + +// AgentPoolConfig defines an AKS agent pool. +type AgentPoolConfig struct { + Name string + Count int32 + VMSize string + OSType string // "Linux" or "Windows" + OSSku string // "Windows2022", "AzureLinux", etc. Empty for default. + Mode string // "System" or "User" + MaxPods int32 + EnableFIPS bool +} + +// FullName returns the public IP resource name, e.g. "serviceTaggedIp-mycluster-v4". +func (ip PublicIPConfig) FullName(clusterName string) string { + suffix := "v4" + if strings.Contains(ip.IPVersion, "6") { + suffix = "v6" + } + return fmt.Sprintf("%s-%s-%s", ip.NamePrefix, clusterName, suffix) +} + +// DefaultE2EInfraConfig returns the standard infrastructure configuration +// matching the existing e2e test setup (NPM cluster with 4 agent pools). 
+func DefaultE2EInfraConfig(subscriptionID, resourceGroupName, location, clusterName string) *InfraConfig { + return &InfraConfig{ + SubscriptionID: subscriptionID, + ResourceGroupName: resourceGroupName, + Location: location, + ClusterName: clusterName, + + VnetName: "testvnet", + VnetAddressSpace: "10.0.0.0/9", + SubnetName: "testsubnet", + SubnetAddressSpace: "10.0.0.0/12", + + PodCidr: "10.128.0.0/9", + ServiceCidr: "192.168.0.0/28", + DNSServiceIP: "192.168.0.10", + + PublicIPs: []PublicIPConfig{ + {NamePrefix: "serviceTaggedIp", IPVersion: "IPv4"}, + {NamePrefix: "serviceTaggedIp", IPVersion: "IPv6"}, + }, + + AgentPools: []AgentPoolConfig{ + {Name: "nodepool1", Count: 3, VMSize: "Standard_DS4_v2", OSType: "Linux", Mode: "System", MaxPods: 250}, + {Name: "ws22", Count: 1, VMSize: "Standard_DS4_v2", OSType: "Windows", OSSku: "Windows2022", Mode: "User", MaxPods: 250}, + {Name: "azlinux", Count: 1, VMSize: "Standard_D4pls_v5", OSType: "Linux", OSSku: "AzureLinux", Mode: "User", MaxPods: 250, EnableFIPS: true}, + {Name: "arm64", Count: 2, VMSize: "Standard_D4pls_v5", OSType: "Linux", Mode: "User", MaxPods: 250}, + }, + + NetworkPlugin: "azure", + NetworkPolicy: "azure", + EnableRBAC: true, + AutoUpgradeChannel: "node-image", + WindowsAdminUsername: "azureuser", + WindowsAdminPassword: generatePassword(), + } +} + +func generatePassword() string { + b := make([]byte, 16) + _, _ = rand.Read(b) + // Guarantee complexity: uppercase (P), lowercase (w), digit (1), special (!) + return "Pw" + base64.RawStdEncoding.EncodeToString(b)[:12] + "!1" +} diff --git a/test/e2ev3/pkg/infra/providers/azure/delete.go b/test/e2ev3/pkg/infra/providers/azure/delete.go new file mode 100644 index 0000000000..805c592e13 --- /dev/null +++ b/test/e2ev3/pkg/infra/providers/azure/delete.go @@ -0,0 +1,89 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. 
+ +package azure + +import ( + "context" + "fmt" + "log/slog" + + "github.com/Azure/azure-sdk-for-go/sdk/azcore/to" + "github.com/Azure/azure-sdk-for-go/sdk/azidentity" + armcontainerservice "github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/containerservice/armcontainerservice/v4" + "github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/resources/armresources" +) + +// DeleteResourceGroup is a go-workflow step that deletes a resource group +// and all resources within it. +type DeleteResourceGroup struct { + SubscriptionID string + ResourceGroupName string + Location string +} + +func (d *DeleteResourceGroup) String() string { return "delete-resource-group" } + +func (d *DeleteResourceGroup) Do(ctx context.Context) error { + log := slog.With("step", d.String()) + log.Info("deleting resource group", "resourceGroup", d.ResourceGroupName) + + cred, err := azidentity.NewAzureCLICredential(nil) + if err != nil { + return fmt.Errorf("failed to obtain a credential: %w", err) + } + + clientFactory, err := armresources.NewClientFactory(d.SubscriptionID, cred, nil) + if err != nil { + return fmt.Errorf("failed to create resource group client: %w", err) + } + + forceDeleteType := "Microsoft.Compute/virtualMachines,Microsoft.Compute/virtualMachineScaleSets" + _, err = clientFactory.NewResourceGroupsClient().BeginDelete(ctx, d.ResourceGroupName, + &armresources.ResourceGroupsClientBeginDeleteOptions{ + ForceDeletionTypes: to.Ptr(forceDeleteType), + }) + if err != nil { + return fmt.Errorf("failed to delete resource group %q: %w", d.ResourceGroupName, err) + } + + log.Info("resource group deleted successfully", "resourceGroup", d.ResourceGroupName) + return nil +} + +// DeleteCluster is a go-workflow step that deletes an AKS cluster. 
type DeleteCluster struct {
	ClusterName       string
	SubscriptionID    string
	ResourceGroupName string
	Location          string
}

// String identifies this step in workflow logs.
func (d *DeleteCluster) String() string { return "delete-aks-cluster" }

// Do deletes the AKS cluster and — unlike DeleteResourceGroup.Do — blocks
// until Azure reports the deletion finished.
func (d *DeleteCluster) Do(ctx context.Context) error {
	log := slog.With("step", d.String())
	log.Info("deleting cluster", "cluster", d.ClusterName, "resourceGroup", d.ResourceGroupName)

	// Azure CLI credential: assumes `az login` has been run in the test environment.
	cred, err := azidentity.NewAzureCLICredential(nil)
	if err != nil {
		return fmt.Errorf("failed to obtain a credential: %w", err)
	}

	clientFactory, err := armcontainerservice.NewClientFactory(d.SubscriptionID, cred, nil)
	if err != nil {
		return fmt.Errorf("failed to create client: %w", err)
	}

	poller, err := clientFactory.NewManagedClustersClient().BeginDelete(ctx, d.ResourceGroupName, d.ClusterName, nil)
	if err != nil {
		return fmt.Errorf("failed to begin cluster deletion: %w", err)
	}

	// Wait for the long-running operation so later steps see a clean state.
	if _, err = poller.PollUntilDone(ctx, nil); err != nil {
		return fmt.Errorf("failed to delete cluster %q: %w", d.ClusterName, err)
	}

	log.Info("cluster deleted successfully", "cluster", d.ClusterName)
	return nil
}
diff --git a/test/e2ev3/pkg/infra/providers/azure/get.go b/test/e2ev3/pkg/infra/providers/azure/get.go
new file mode 100644
index 0000000000..1b7476a4a6
--- /dev/null
+++ b/test/e2ev3/pkg/infra/providers/azure/get.go
@@ -0,0 +1,73 @@
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT license.

package azure

import (
	"context"
	"fmt"
	"log/slog"
	"os"

	"github.com/Azure/azure-sdk-for-go/sdk/azidentity"
	armcontainerservice "github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/containerservice/armcontainerservice/v4"
)

// kubeConfigPerms keeps the kubeconfig owner-only, since it embeds credentials.
const kubeConfigPerms = 0o600

// GetAKSKubeConfig is a go-workflow step that retrieves cluster credentials
// and writes the kubeconfig to a file.
+type GetAKSKubeConfig struct { + ClusterName string + SubscriptionID string + ResourceGroupName string + Location string + KubeConfigFilePath string +} + +func (c *GetAKSKubeConfig) String() string { return "get-aks-kubeconfig" } + +func (c *GetAKSKubeConfig) Do(ctx context.Context) error { + log := slog.With("step", c.String()) + cred, err := azidentity.NewAzureCLICredential(nil) + if err != nil { + return fmt.Errorf("failed to obtain a credential: %w", err) + } + + clientFactory, err := armcontainerservice.NewClientFactory(c.SubscriptionID, cred, nil) + if err != nil { + return fmt.Errorf("failed to create client: %w", err) + } + + res, err := clientFactory.NewManagedClustersClient().ListClusterUserCredentials(ctx, c.ResourceGroupName, c.ClusterName, nil) + if err != nil { + return fmt.Errorf("failed to get cluster credentials: %w", err) + } + + if err := os.WriteFile(c.KubeConfigFilePath, res.Kubeconfigs[0].Value, kubeConfigPerms); err != nil { + return fmt.Errorf("failed to write kubeconfig to %q: %w", c.KubeConfigFilePath, err) + } + + log.Info("kubeconfig for cluster written", "cluster", c.ClusterName, "path", c.KubeConfigFilePath) + return nil +} + +// GetFQDN returns the FQDN of the given AKS cluster. 
+func GetFQDN(ctx context.Context, subscriptionID, resourceGroupName, clusterName string) (string, error) { + cred, err := azidentity.NewAzureCLICredential(nil) + if err != nil { + return "", fmt.Errorf("failed to obtain a credential: %w", err) + } + + clientFactory, err := armcontainerservice.NewClientFactory(subscriptionID, cred, nil) + if err != nil { + return "", fmt.Errorf("failed to create client: %w", err) + } + + res, err := clientFactory.NewManagedClustersClient().Get(ctx, resourceGroupName, clusterName, nil) + if err != nil { + return "", fmt.Errorf("failed to get cluster: %w", err) + } + + return *res.Properties.Fqdn, nil +} diff --git a/test/e2ev3/pkg/infra/providers/azure/legacy/create-cluster-with-npm.go b/test/e2ev3/pkg/infra/providers/azure/legacy/create-cluster-with-npm.go new file mode 100644 index 0000000000..edff3c914d --- /dev/null +++ b/test/e2ev3/pkg/infra/providers/azure/legacy/create-cluster-with-npm.go @@ -0,0 +1,170 @@ +package legacy + +import ( + "context" + "fmt" + "log/slog" + "time" + + "github.com/Azure/azure-sdk-for-go/sdk/azcore/runtime" + "github.com/Azure/azure-sdk-for-go/sdk/azcore/to" + "github.com/Azure/azure-sdk-for-go/sdk/azidentity" + armcontainerservice "github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/containerservice/armcontainerservice/v4" +) + +var ( + ErrResourceNameTooLong = fmt.Errorf("resource name too long") + ErrEmptyFile = fmt.Errorf("empty file") +) + +const ( + clusterTimeout = 15 * time.Minute + clusterCreateTicker = 30 * time.Second + pollFrequency = 5 * time.Second + AgentARMSKU = "Standard_D4pls_v5" + AuxilaryNodeCount = 1 + AuxilaryARMNodeCount = 2 +) + +type CreateNPMCluster struct { + SubscriptionID string + ResourceGroupName string + Location string + ClusterName string + VnetName string + SubnetName string + PodCidr string + DNSServiceIP string + ServiceCidr string + PublicIPs []string +} + +func (c *CreateNPMCluster) Do(_ context.Context) error { + // Start with default cluster template + 
npmCluster := GetStarterClusterTemplate(c.Location) + + npmCluster.Properties.NetworkProfile.NetworkPolicy = to.Ptr(armcontainerservice.NetworkPolicyAzure) + + //nolint:appendCombine // separate for verbosity + npmCluster.Properties.AgentPoolProfiles = append(npmCluster.Properties.AgentPoolProfiles, &armcontainerservice.ManagedClusterAgentPoolProfile{ //nolint:all + Type: to.Ptr(armcontainerservice.AgentPoolTypeVirtualMachineScaleSets), + // AvailabilityZones: []*string{to.Ptr("1")}, + Count: to.Ptr[int32](AuxilaryNodeCount), + EnableNodePublicIP: to.Ptr(false), + Mode: to.Ptr(armcontainerservice.AgentPoolModeUser), + OSType: to.Ptr(armcontainerservice.OSTypeWindows), + OSSKU: to.Ptr(armcontainerservice.OSSKUWindows2022), + ScaleDownMode: to.Ptr(armcontainerservice.ScaleDownModeDelete), + VMSize: to.Ptr(AgentSKU), + Name: to.Ptr("ws22"), + MaxPods: to.Ptr(int32(MaxPodsPerNode)), + }) + + //nolint:appendCombine // separate for verbosity + npmCluster.Properties.AgentPoolProfiles = append(npmCluster.Properties.AgentPoolProfiles, &armcontainerservice.ManagedClusterAgentPoolProfile{ + Type: to.Ptr(armcontainerservice.AgentPoolTypeVirtualMachineScaleSets), + AvailabilityZones: []*string{to.Ptr("1")}, + Count: to.Ptr[int32](AuxilaryNodeCount), + EnableNodePublicIP: to.Ptr(false), + EnableFIPS: to.Ptr(true), + Mode: to.Ptr(armcontainerservice.AgentPoolModeUser), + OSType: to.Ptr(armcontainerservice.OSTypeLinux), + OSSKU: to.Ptr(armcontainerservice.OSSKUAzureLinux), + ScaleDownMode: to.Ptr(armcontainerservice.ScaleDownModeDelete), + VMSize: to.Ptr(AgentSKU), + Name: to.Ptr("azlinux"), + MaxPods: to.Ptr(int32(MaxPodsPerNode)), + }) + + //nolint:appendCombine // separate for verbosity + npmCluster.Properties.AgentPoolProfiles = append(npmCluster.Properties.AgentPoolProfiles, &armcontainerservice.ManagedClusterAgentPoolProfile{ //nolint:all + Type: to.Ptr(armcontainerservice.AgentPoolTypeVirtualMachineScaleSets), + // AvailabilityZones: []*string{to.Ptr("1")}, + Count: 
to.Ptr[int32](AuxilaryARMNodeCount), + EnableNodePublicIP: to.Ptr(false), + Mode: to.Ptr(armcontainerservice.AgentPoolModeUser), + OSType: to.Ptr(armcontainerservice.OSTypeLinux), + ScaleDownMode: to.Ptr(armcontainerservice.ScaleDownModeDelete), + VMSize: to.Ptr(AgentARMSKU), + Name: to.Ptr("arm64"), + MaxPods: to.Ptr(int32(MaxPodsPerNode)), + }) + + npmCluster.Properties.AutoUpgradeProfile = &armcontainerservice.ManagedClusterAutoUpgradeProfile{ + NodeOSUpgradeChannel: to.Ptr(armcontainerservice.NodeOSUpgradeChannelNodeImage), + } + + if len(c.PublicIPs) > 0 { + publicIPIDs := make([]*armcontainerservice.ResourceReference, 0, len(c.PublicIPs)) + + for _, ipID := range c.PublicIPs { + slog.Info("adding public IP", "id", ipID) + publicIPIDs = append(publicIPIDs, &armcontainerservice.ResourceReference{ + ID: to.Ptr(ipID), + }) + } + + for _, ip := range c.PublicIPs { + slog.Info("public IP", "id", ip) + } + + if npmCluster.Properties.NetworkProfile.LoadBalancerProfile == nil { + npmCluster.Properties.NetworkProfile.LoadBalancerProfile = &armcontainerservice.ManagedClusterLoadBalancerProfile{ + OutboundIPs: &armcontainerservice.ManagedClusterLoadBalancerProfileOutboundIPs{ + PublicIPs: publicIPIDs, + }, + } + } + } + + // Deploy cluster + cred, err := azidentity.NewAzureCLICredential(nil) + if err != nil { + return fmt.Errorf("failed to obtain a credential: %w", err) + } + ctx, cancel := context.WithTimeout(context.Background(), clusterTimeout) + defer cancel() + + clientFactory, err := armcontainerservice.NewClientFactory(c.SubscriptionID, cred, nil) + if err != nil { + return fmt.Errorf("failed to create az client: %w", err) + } + + slog.Info("when the cluster is ready, use the below command to access and debug") + slog.Info("az aks get-credentials", "resourceGroup", c.ResourceGroupName, "cluster", c.ClusterName, "subscription", c.SubscriptionID) + slog.Info("creating cluster", "cluster", c.ClusterName, "resourceGroup", c.ResourceGroupName) + + poller, err := 
clientFactory.NewManagedClustersClient().BeginCreateOrUpdate(ctx, c.ResourceGroupName, c.ClusterName, npmCluster, nil) + if err != nil { + return fmt.Errorf("failed to finish the create cluster request: %w", err) + } + + notifychan := make(chan struct{}) + go func() { + _, err = poller.PollUntilDone(ctx, &runtime.PollUntilDoneOptions{ + Frequency: pollFrequency, + }) + if err != nil { + slog.Error("failed to create cluster", "error", err) + } else { + slog.Info("cluster is ready", "cluster", c.ClusterName) + } + close(notifychan) + }() + + ticker := time.NewTicker(clusterCreateTicker) + defer ticker.Stop() + for { + select { + case <-ctx.Done(): + return fmt.Errorf("failed to create cluster: %w", ctx.Err()) + case <-ticker.C: + slog.Info("waiting for cluster to be ready", "cluster", c.ClusterName) + case <-notifychan: + if err != nil { + return fmt.Errorf("received notification, failed to create cluster: %w", err) + } + return nil + } + } +} diff --git a/test/e2ev3/pkg/infra/providers/azure/legacy/create-cluster.go b/test/e2ev3/pkg/infra/providers/azure/legacy/create-cluster.go new file mode 100644 index 0000000000..8c0a825690 --- /dev/null +++ b/test/e2ev3/pkg/infra/providers/azure/legacy/create-cluster.go @@ -0,0 +1,149 @@ +package legacy + +import ( + "context" + "fmt" + "log/slog" + "time" + + "github.com/Azure/azure-sdk-for-go/sdk/azcore/to" + "github.com/Azure/azure-sdk-for-go/sdk/azidentity" + armcontainerservice "github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/containerservice/armcontainerservice/v4" +) + +const ( + MaxNumberOfNodes = 3 + MaxPodsPerNode = 250 + AgentSKU = "Standard_DS4_v2" +) + +var defaultClusterCreateTimeout = 30 * time.Minute + +type CreateCluster struct { + SubscriptionID string + ResourceGroupName string + Location string + ClusterName string + podCidr string + vmSize string + networkPluginMode string + Nodes int32 +} + +func (c *CreateCluster) SetPodCidr(podCidr string) *CreateCluster { + c.podCidr = podCidr + return c +} + 
+func (c *CreateCluster) SetVMSize(vmSize string) *CreateCluster { + c.vmSize = vmSize + return c +} + +func (c *CreateCluster) SetNetworkPluginMode(networkPluginMode string) *CreateCluster { + c.networkPluginMode = networkPluginMode + return c +} + +func (c *CreateCluster) Do(_ context.Context) error { + cred, err := azidentity.NewAzureCLICredential(nil) + if err != nil { + return fmt.Errorf("failed to obtain a credential: %w", err) + } + ctx := context.TODO() + clientFactory, err := armcontainerservice.NewClientFactory(c.SubscriptionID, cred, nil) + if err != nil { + return fmt.Errorf("failed to create client: %w", err) + } + if c.Nodes == 0 { + c.Nodes = MaxNumberOfNodes + } + + template := GetStarterClusterTemplate(c.Location) + + if c.Nodes > 0 { + template.Properties.AgentPoolProfiles[0].Count = to.Ptr(c.Nodes) + } + + if c.podCidr != "" { + template.Properties.NetworkProfile.PodCidr = to.Ptr(c.podCidr) + } + + if c.vmSize != "" { + template.Properties.AgentPoolProfiles[0].VMSize = to.Ptr(c.vmSize) + } + + if c.networkPluginMode != "" { + template.Properties.NetworkProfile.NetworkPluginMode = to.Ptr(armcontainerservice.NetworkPluginMode(c.networkPluginMode)) + } + + slog.Info("creating cluster", "cluster", c.ClusterName, "location", c.Location) + poller, err := clientFactory.NewManagedClustersClient().BeginCreateOrUpdate(ctx, c.ResourceGroupName, c.ClusterName, template, nil) + if err != nil { + return fmt.Errorf("failed to finish the create cluster request: %w", err) + } + _, err = poller.PollUntilDone(ctx, nil) + if err != nil { + return fmt.Errorf("failed to pull the create cluster result: %w", err) + } + slog.Info("cluster created", "cluster", c.ClusterName, "location", c.Location) + + return nil +} + +func GetStarterClusterTemplate(location string) armcontainerservice.ManagedCluster { + id := armcontainerservice.ResourceIdentityTypeSystemAssigned + return armcontainerservice.ManagedCluster{ + Location: to.Ptr(location), + Tags: map[string]*string{ + 
"archv2": to.Ptr(""), + "tier": to.Ptr("production"), + }, + Properties: &armcontainerservice.ManagedClusterProperties{ + AddonProfiles: map[string]*armcontainerservice.ManagedClusterAddonProfile{}, + /* Moving this to a separate stage to enable AMA since it takes some time to provision + AzureMonitorProfile: &armcontainerservice.ManagedClusterAzureMonitorProfile{ + Metrics: &armcontainerservice.ManagedClusterAzureMonitorProfileMetrics{ + Enabled: to.Ptr(true), + }, + }, + */ + AgentPoolProfiles: []*armcontainerservice.ManagedClusterAgentPoolProfile{ + { + Type: to.Ptr(armcontainerservice.AgentPoolTypeVirtualMachineScaleSets), + // AvailabilityZones: []*string{to.Ptr("1")}, + Count: to.Ptr[int32](MaxNumberOfNodes), + EnableNodePublicIP: to.Ptr(false), + Mode: to.Ptr(armcontainerservice.AgentPoolModeSystem), + OSType: to.Ptr(armcontainerservice.OSTypeLinux), + ScaleDownMode: to.Ptr(armcontainerservice.ScaleDownModeDelete), + VMSize: to.Ptr(AgentSKU), + Name: to.Ptr("nodepool1"), + MaxPods: to.Ptr(int32(MaxPodsPerNode)), + }, + }, + KubernetesVersion: to.Ptr(""), + DNSPrefix: to.Ptr("dnsprefix1"), + EnablePodSecurityPolicy: to.Ptr(false), + EnableRBAC: to.Ptr(true), + LinuxProfile: nil, + NetworkProfile: &armcontainerservice.NetworkProfile{ + LoadBalancerSKU: to.Ptr(armcontainerservice.LoadBalancerSKUStandard), + OutboundType: to.Ptr(armcontainerservice.OutboundTypeLoadBalancer), + NetworkPlugin: to.Ptr(armcontainerservice.NetworkPluginAzure), + }, + WindowsProfile: &armcontainerservice.ManagedClusterWindowsProfile{ + AdminPassword: to.Ptr("replacePassword1234$"), + AdminUsername: to.Ptr("azureuser"), + }, + }, + Identity: &armcontainerservice.ManagedClusterIdentity{ + Type: &id, + }, + + SKU: &armcontainerservice.ManagedClusterSKU{ + Name: to.Ptr(armcontainerservice.ManagedClusterSKUName("Base")), + Tier: to.Ptr(armcontainerservice.ManagedClusterSKUTierStandard), + }, + } +} diff --git a/test/e2ev3/pkg/infra/providers/azure/legacy/create-public-ip.go 
b/test/e2ev3/pkg/infra/providers/azure/legacy/create-public-ip.go new file mode 100644 index 0000000000..7cb639d70a --- /dev/null +++ b/test/e2ev3/pkg/infra/providers/azure/legacy/create-public-ip.go @@ -0,0 +1,101 @@ +package legacy + +import ( + "context" + "fmt" + "log/slog" + "time" + + "github.com/Azure/azure-sdk-for-go/sdk/azcore/runtime" + "github.com/Azure/azure-sdk-for-go/sdk/azcore/to" + "github.com/Azure/azure-sdk-for-go/sdk/azidentity" + "github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/network/armnetwork" +) + +type CreatePublicIP struct { + SubscriptionID string + ResourceGroupName string + Location string + ClusterName string + IPVersion string + IPPrefix string +} + +func (c *CreatePublicIP) Do(_ context.Context) error { + cred, err := azidentity.NewDefaultAzureCredential(nil) + if err != nil { + return fmt.Errorf("failed to obtain a credential: %w", err) + } + + ctx, cancel := context.WithTimeout(context.Background(), clusterTimeout) + defer cancel() + + publicIPClient, err := armnetwork.NewPublicIPAddressesClient(c.SubscriptionID, cred, nil) + if err != nil { + return fmt.Errorf("%w: failed to create public IP client", err) + } + + publicIPParams := armnetwork.PublicIPAddress{ + Location: to.Ptr(c.Location), + SKU: &armnetwork.PublicIPAddressSKU{ + Name: to.Ptr(armnetwork.PublicIPAddressSKUNameStandard), + Tier: to.Ptr(armnetwork.PublicIPAddressSKUTierRegional), + }, + Properties: &armnetwork.PublicIPAddressPropertiesFormat{ + PublicIPAllocationMethod: to.Ptr(armnetwork.IPAllocationMethodStatic), + PublicIPAddressVersion: to.Ptr(armnetwork.IPVersion(c.IPVersion)), + IPTags: []*armnetwork.IPTag{ + { + IPTagType: to.Ptr("FirstPartyUsage"), + Tag: to.Ptr("/NonProd"), + }, + }, + }, + } + + var version string + switch c.IPVersion { + case string(armnetwork.IPVersionIPv4): + version = "v4" + case string(armnetwork.IPVersionIPv6): + version = "v6" + default: + return fmt.Errorf("%w: invalid IP version: %s", err, c.IPVersion) + } + + ipName := 
fmt.Sprintf("%s-%s-%s", c.IPPrefix, c.ClusterName, version) + + poller, err := publicIPClient.BeginCreateOrUpdate(ctx, c.ResourceGroupName, ipName, publicIPParams, nil) + if err != nil { + return fmt.Errorf("%w: failed to create public IP address", err) + } + + notifychan := make(chan struct{}) + go func() { + _, err = poller.PollUntilDone(ctx, &runtime.PollUntilDoneOptions{ + Frequency: 5 * time.Second, + }) + if err != nil { + slog.Error("failed to create public IP", "name", ipName, "error", err) + } else { + slog.Info("public IP created", "name", ipName) + } + close(notifychan) + }() + + ticker := time.NewTicker(30 * time.Second) + defer ticker.Stop() + for { + select { + case <-ctx.Done(): + return fmt.Errorf("failed to create Public IP: %w", ctx.Err()) + case <-ticker.C: + slog.Info("waiting for public IP to be ready", "name", ipName) + case <-notifychan: + if err != nil { + return fmt.Errorf("received notification, failed to create public IP address: %w", err) + } + return nil + } + } +} diff --git a/test/e2ev3/pkg/infra/providers/azure/legacy/create-rg.go b/test/e2ev3/pkg/infra/providers/azure/legacy/create-rg.go new file mode 100644 index 0000000000..9171a8e37d --- /dev/null +++ b/test/e2ev3/pkg/infra/providers/azure/legacy/create-rg.go @@ -0,0 +1,40 @@ +package legacy + +import ( + "context" + "fmt" + "log/slog" + + "github.com/Azure/azure-sdk-for-go/sdk/azcore/to" + "github.com/Azure/azure-sdk-for-go/sdk/azidentity" + "github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/resources/armresources" +) + +type CreateResourceGroup struct { + SubscriptionID string + ResourceGroupName string + Location string +} + +func (c *CreateResourceGroup) Do(_ context.Context) error { + cred, err := azidentity.NewAzureCLICredential(nil) + if err != nil { + return fmt.Errorf("failed to obtain a credential: %w", err) + } + ctx := context.Background() + clientFactory, err := armresources.NewClientFactory(c.SubscriptionID, cred, nil) + if err != nil { + return 
fmt.Errorf("failed to create resource group client: %w", err) + } + slog.Info("creating resource group", "resourceGroup", c.ResourceGroupName, "location", c.Location) + + _, err = clientFactory.NewResourceGroupsClient().CreateOrUpdate(ctx, c.ResourceGroupName, armresources.ResourceGroup{ + Location: to.Ptr(c.Location), + }, nil) + if err != nil { + return fmt.Errorf("failed to finish the request: %w", err) + } + + slog.Info("resource group created", "resourceGroup", c.ResourceGroupName, "location", c.Location) + return nil +} diff --git a/test/e2ev3/pkg/infra/providers/azure/legacy/create-vnet.go b/test/e2ev3/pkg/infra/providers/azure/legacy/create-vnet.go new file mode 100644 index 0000000000..6c743e146a --- /dev/null +++ b/test/e2ev3/pkg/infra/providers/azure/legacy/create-vnet.go @@ -0,0 +1,94 @@ +package legacy + +import ( + "context" + "fmt" + "log/slog" + + "github.com/Azure/azure-sdk-for-go/sdk/azcore/to" + "github.com/Azure/azure-sdk-for-go/sdk/azidentity" + armnetwork "github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/network/armnetwork/v5" +) + +const FlowTimeoutInMinutes = 10 + +type CreateVNet struct { + SubscriptionID string + ResourceGroupName string + Location string + VnetName string + VnetAddressSpace string +} + +func (c *CreateVNet) Do(_ context.Context) error { + cred, err := azidentity.NewAzureCLICredential(nil) + if err != nil { + return fmt.Errorf("failed to obtain a credential: %w", err) + } + ctx := context.Background() + clientFactory, err := armnetwork.NewClientFactory(c.SubscriptionID, cred, nil) + if err != nil { + return fmt.Errorf("failed to create client: %w", err) + } + + slog.Info("creating vnet", "vnet", c.VnetName, "resourceGroup", c.ResourceGroupName) + + poller, err := clientFactory.NewVirtualNetworksClient().BeginCreateOrUpdate(ctx, c.ResourceGroupName, c.VnetName, armnetwork.VirtualNetwork{ + Location: to.Ptr(c.Location), + Properties: &armnetwork.VirtualNetworkPropertiesFormat{ + AddressSpace: &armnetwork.AddressSpace{ + 
AddressPrefixes: []*string{ + to.Ptr(c.VnetAddressSpace), + }, + }, + FlowTimeoutInMinutes: to.Ptr[int32](FlowTimeoutInMinutes), + }, + }, nil) + if err != nil { + return fmt.Errorf("failed to finish the request for create vnet: %w", err) + } + + _, err = poller.PollUntilDone(ctx, nil) + if err != nil { + return fmt.Errorf("failed to pull the result for create vnet: %w", err) + } + return nil +} + +type CreateSubnet struct { + SubscriptionID string + ResourceGroupName string + Location string + VnetName string + SubnetName string + SubnetAddressSpace string +} + +func (c *CreateSubnet) Do(_ context.Context) error { + cred, err := azidentity.NewAzureCLICredential(nil) + if err != nil { + return fmt.Errorf("failed to obtain a credential: %w", err) + } + ctx := context.Background() + clientFactory, err := armnetwork.NewClientFactory(c.SubscriptionID, cred, nil) + if err != nil { + return fmt.Errorf("failed to create client: %w", err) + } + + slog.Info("creating subnet", "subnet", c.SubnetName, "vnet", c.VnetName, "resourceGroup", c.ResourceGroupName) + + poller, err := clientFactory.NewSubnetsClient().BeginCreateOrUpdate(ctx, c.ResourceGroupName, c.VnetName, c.SubnetName, armnetwork.Subnet{ + Properties: &armnetwork.SubnetPropertiesFormat{ + AddressPrefix: to.Ptr(c.SubnetAddressSpace), + }, + }, nil) + if err != nil { + return fmt.Errorf("failed to finish the request for create subnet: %w", err) + } + + _, err = poller.PollUntilDone(ctx, nil) + if err != nil { + return fmt.Errorf("failed to pull the result for create subnet: %w", err) + } + return nil +} diff --git a/test/e2ev3/pkg/infra/providers/azure/legacy/delete-cluster.go b/test/e2ev3/pkg/infra/providers/azure/legacy/delete-cluster.go new file mode 100644 index 0000000000..9be15315d6 --- /dev/null +++ b/test/e2ev3/pkg/infra/providers/azure/legacy/delete-cluster.go @@ -0,0 +1,40 @@ +package legacy + +import ( + "context" + "fmt" + "log/slog" + + "github.com/Azure/azure-sdk-for-go/sdk/azidentity" + 
armcontainerservice "github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/containerservice/armcontainerservice/v4" +) + +type DeleteCluster struct { + ClusterName string + SubscriptionID string + ResourceGroupName string + Location string +} + +func (d *DeleteCluster) Do(_ context.Context) error { + cred, err := azidentity.NewAzureCLICredential(nil) + if err != nil { + return fmt.Errorf("failed to obtain a credential: %w", err) + } + ctx := context.Background() + clientFactory, err := armcontainerservice.NewClientFactory(d.SubscriptionID, cred, nil) + if err != nil { + return fmt.Errorf("failed to create client: %w", err) + } + + slog.Info("deleting cluster", "cluster", d.ClusterName, "resourceGroup", d.ResourceGroupName) + poller, err := clientFactory.NewManagedClustersClient().BeginDelete(ctx, d.ResourceGroupName, d.ClusterName, nil) + if err != nil { + return fmt.Errorf("failed to finish the request: %w", err) + } + _, err = poller.PollUntilDone(ctx, nil) + if err != nil { + return fmt.Errorf("failed to pull the result: %w", err) + } + return nil +} diff --git a/test/e2ev3/pkg/infra/providers/azure/legacy/delete-rg.go b/test/e2ev3/pkg/infra/providers/azure/legacy/delete-rg.go new file mode 100644 index 0000000000..2921fbb5ff --- /dev/null +++ b/test/e2ev3/pkg/infra/providers/azure/legacy/delete-rg.go @@ -0,0 +1,38 @@ +package legacy + +import ( + "context" + "fmt" + "log/slog" + + "github.com/Azure/azure-sdk-for-go/sdk/azcore/to" + "github.com/Azure/azure-sdk-for-go/sdk/azidentity" + "github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/resources/armresources" +) + +type DeleteResourceGroup struct { + SubscriptionID string + ResourceGroupName string + Location string +} + +func (d *DeleteResourceGroup) Do(_ context.Context) error { + slog.Info("deleting resource group", "resourceGroup", d.ResourceGroupName) + cred, err := azidentity.NewAzureCLICredential(nil) + if err != nil { + return fmt.Errorf("failed to obtain a credential: %w", err) + } + ctx := 
context.Background() + clientFactory, err := armresources.NewClientFactory(d.SubscriptionID, cred, nil) + if err != nil { + return fmt.Errorf("failed to create resource group client: %w", err) + } + forceDeleteType := "Microsoft.Compute/virtualMachines,Microsoft.Compute/virtualMachineScaleSets" + _, err = clientFactory.NewResourceGroupsClient().BeginDelete(ctx, d.ResourceGroupName, &armresources.ResourceGroupsClientBeginDeleteOptions{ForceDeletionTypes: to.Ptr(forceDeleteType)}) + if err != nil { + return fmt.Errorf("failed to finish the delete resource group request: %w", err) + } + + slog.Info("resource group deleted successfully", "resourceGroup", d.ResourceGroupName) + return nil +} diff --git a/test/e2ev3/pkg/infra/providers/azure/legacy/enable-ama.go b/test/e2ev3/pkg/infra/providers/azure/legacy/enable-ama.go new file mode 100644 index 0000000000..44cf44ada1 --- /dev/null +++ b/test/e2ev3/pkg/infra/providers/azure/legacy/enable-ama.go @@ -0,0 +1,109 @@ +package legacy + +import ( + "context" + "encoding/json" + "fmt" + "log/slog" + "os" + + "github.com/Azure/azure-sdk-for-go/sdk/azcore/to" + "github.com/Azure/azure-sdk-for-go/sdk/azidentity" + armcontainerservice "github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/containerservice/armcontainerservice/v4" + "github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/dashboard/armdashboard" + "github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/monitor/armmonitor" +) + +const fileperms = 0o600 + +type CreateAzureMonitor struct { + SubscriptionID string + ResourceGroupName string + Location string + ClusterName string +} + +func (c *CreateAzureMonitor) Do(_ context.Context) error { + slog.Info(`this will deploy azure monitor workspace and grafana, but as of 1/9/2024, the api docs don't show how to do +az aks update --enable-azure-monitor-metrics \ +-n $NAME \ +-g $CLUSTER_RESOURCE_GROUP \ +--azure-monitor-workspace-resource-id $AZMON_RESOURCE_ID \ +--grafana-resource-id $GRAFANA_RESOURCE_ID +`) + + cred, err := 
azidentity.NewAzureCLICredential(nil) + if err != nil { + return fmt.Errorf("failed to obtain a credential: %w", err) + } + + ctx := context.Background() + amaClientFactory, err := armmonitor.NewClientFactory(c.SubscriptionID, cred, nil) + if err != nil { + return fmt.Errorf("failed to create azure monitor workspace client: %w", err) + } + slog.Info("creating resource group", "resourceGroup", c.ResourceGroupName, "location", c.Location) + + // create azure monitor + _, err = amaClientFactory.NewAzureMonitorWorkspacesClient().Create(ctx, c.ResourceGroupName, "test", armmonitor.AzureMonitorWorkspaceResource{ + Location: &c.Location, + }, &armmonitor.AzureMonitorWorkspacesClientCreateOptions{}) + if err != nil { + return fmt.Errorf("failed to azure monitor workspace: %w", err) + } + + // Create grafana + + granafaClientFactory, err := armdashboard.NewClientFactory(c.SubscriptionID, cred, nil) + if err != nil { + return fmt.Errorf("failed to create grafana client: %w", err) + } + + _, err = granafaClientFactory.NewGrafanaClient().BeginCreate(ctx, c.ResourceGroupName, "test", armdashboard.ManagedGrafana{}, &armdashboard.GrafanaClientBeginCreateOptions{}) + if err != nil { + return fmt.Errorf("failed to create grafana: %w", err) + } + + slog.Info("azure monitor workspace created", "resourceGroup", c.ResourceGroupName, "location", c.Location) + + // update aks cluster + + ctx, cancel := context.WithTimeout(context.Background(), defaultClusterCreateTimeout) + defer cancel() + aksClientFactory, err := armcontainerservice.NewClientFactory(c.SubscriptionID, cred, nil) + if err != nil { + return fmt.Errorf("failed to create client: %w", err) + } + + cluster, err := aksClientFactory.NewManagedClustersClient().Get(ctx, c.ResourceGroupName, c.ClusterName, nil) + if err != nil { + return fmt.Errorf("failed to get cluster to enable AMA: %w", err) + } + + // enable Azure Monitor Metrics + cluster.Properties.AzureMonitorProfile.Metrics.Enabled = to.Ptr(true) + + // Marshal the struct 
into a JSON byte array with indentation + jsonData, err := json.MarshalIndent(cluster, "", " ") + if err != nil { + return fmt.Errorf("failed to marshal cluster to JSON for AMA: %w", err) + } + + // Write the JSON data to a file + err = os.WriteFile("cluster.json", jsonData, fileperms) + if err != nil { + return fmt.Errorf("failed to write cluster JSON to file for AMA: %w", err) + } + + poller, err := aksClientFactory.NewManagedClustersClient().BeginCreateOrUpdate(ctx, c.ResourceGroupName, c.ClusterName, GetStarterClusterTemplate(c.Location), nil) + if err != nil { + return fmt.Errorf("failed to finish the update cluster request for AMA: %w", err) + } + + _, err = poller.PollUntilDone(ctx, nil) + if err != nil { + return fmt.Errorf("failed to enable AMA on cluster %s: %w", *cluster.Name, err) + } + + return nil +} diff --git a/test/e2ev3/pkg/infra/providers/azure/legacy/get-fqdn.go b/test/e2ev3/pkg/infra/providers/azure/legacy/get-fqdn.go new file mode 100644 index 0000000000..4cfc8613a4 --- /dev/null +++ b/test/e2ev3/pkg/infra/providers/azure/legacy/get-fqdn.go @@ -0,0 +1,27 @@ +package legacy + +import ( + "context" + "fmt" + + "github.com/Azure/azure-sdk-for-go/sdk/azidentity" + armcontainerservice "github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/containerservice/armcontainerservice/v4" +) + +func GetFqdnFn(subscriptionId, resourceGroupName, clusterName string) (string, error) { + cred, err := azidentity.NewAzureCLICredential(nil) + if err != nil { + return "", fmt.Errorf("failed to obtain a credential: %w", err) + } + ctx := context.Background() + clientFactory, err := armcontainerservice.NewClientFactory(subscriptionId, cred, nil) + if err != nil { + return "", fmt.Errorf("failed to create client: %w", err) + } + res, err := clientFactory.NewManagedClustersClient().Get(ctx, resourceGroupName, clusterName, nil) + if err != nil { + return "", fmt.Errorf("failed to finish the get managed cluster client request: %w", err) + } + + return *res.Properties.Fqdn, 
nil +} diff --git a/test/e2ev3/pkg/infra/providers/azure/legacy/get-kubeconfig.go b/test/e2ev3/pkg/infra/providers/azure/legacy/get-kubeconfig.go new file mode 100644 index 0000000000..8dc80f0188 --- /dev/null +++ b/test/e2ev3/pkg/infra/providers/azure/legacy/get-kubeconfig.go @@ -0,0 +1,45 @@ +package legacy + +import ( + "context" + "fmt" + "log/slog" + "os" + + "github.com/Azure/azure-sdk-for-go/sdk/azidentity" + armcontainerservice "github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/containerservice/armcontainerservice/v4" +) + +const KubeConfigPerms = 0o600 + +type GetAKSKubeConfig struct { + ClusterName string + SubscriptionID string + ResourceGroupName string + Location string + KubeConfigFilePath string +} + +func (c *GetAKSKubeConfig) Do(_ context.Context) error { + cred, err := azidentity.NewAzureCLICredential(nil) + if err != nil { + return fmt.Errorf("failed to obtain a credential: %w", err) + } + ctx := context.Background() + clientFactory, err := armcontainerservice.NewClientFactory(c.SubscriptionID, cred, nil) + if err != nil { + return fmt.Errorf("failed to create client: %w", err) + } + res, err := clientFactory.NewManagedClustersClient().ListClusterUserCredentials(ctx, c.ResourceGroupName, c.ClusterName, nil) + if err != nil { + return fmt.Errorf("failed to finish the get managed cluster client request: %w", err) + } + + err = os.WriteFile(c.KubeConfigFilePath, []byte(res.Kubeconfigs[0].Value), KubeConfigPerms) + if err != nil { + return fmt.Errorf("failed to write kubeconfig to file \"%s\": %w", c.KubeConfigFilePath, err) + } + + slog.Info("kubeconfig written", "cluster", c.ClusterName, "resourceGroup", c.ResourceGroupName, "path", c.KubeConfigFilePath) + return nil +} diff --git a/test/e2ev3/pkg/infra/providers/kind/cluster.go b/test/e2ev3/pkg/infra/providers/kind/cluster.go new file mode 100644 index 0000000000..7304ff56dc --- /dev/null +++ b/test/e2ev3/pkg/infra/providers/kind/cluster.go @@ -0,0 +1,44 @@ +// Copyright (c) Microsoft 
Corporation. +// Licensed under the MIT license. + +package kind + +import ( + "context" + "fmt" + "log/slog" + "os/exec" + + "k8s.io/client-go/rest" +) + +// Cluster is a ClusterProvider for Kind (Kubernetes in Docker) clusters. +// Images are loaded directly onto cluster nodes via `kind load docker-image`. +type Cluster struct { + Name string + KubeCfgPath string + RC *rest.Config +} + +func (k *Cluster) ClusterName() string { return k.Name } +func (k *Cluster) KubeConfigPath() string { return k.KubeCfgPath } +func (k *Cluster) RestConfig() *rest.Config { return k.RC } + +func (k *Cluster) LoadImages(ctx context.Context, images []string) error { + for _, image := range images { + slog.Info("loading image onto kind cluster", "image", image, "cluster", k.Name) + args := []string{"load", "docker-image", "--name", k.Name, image} + cmd := exec.CommandContext(ctx, "kind", args...) + cmdOut := &slogWriter{level: slog.LevelInfo, source: "kind-load"} + cmd.Stdout = cmdOut + cmd.Stderr = cmdOut + if err := cmd.Run(); err != nil { + return fmt.Errorf("kind load docker-image %s: %w", image, err) + } + cmdOut.Flush() + } + return nil +} + +func (k *Cluster) ImagePullPolicy() string { return "IfNotPresent" } +func (k *Cluster) ImagePullSecrets() []map[string]interface{} { return nil } diff --git a/test/e2ev3/pkg/infra/providers/kind/config.go b/test/e2ev3/pkg/infra/providers/kind/config.go new file mode 100644 index 0000000000..83e599de29 --- /dev/null +++ b/test/e2ev3/pkg/infra/providers/kind/config.go @@ -0,0 +1,56 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. + +package kind + +import ( + "fmt" + "os/user" + "time" + + "sigs.k8s.io/kind/pkg/apis/config/v1alpha4" +) + +// Config defines the configuration for a Kind cluster used in e2e tests. +type Config struct { + ClusterName string + NodeImage string + WaitForReady time.Duration + + // V1Alpha4Config is the native Kind cluster configuration. 
+ // If nil, a default single-node cluster is used. + V1Alpha4Config *v1alpha4.Cluster +} + +// DefaultE2EKindConfig returns the standard Kind cluster configuration for e2e testing. +func DefaultE2EKindConfig(clusterName string) *Config { + if clusterName == "" { + clusterName = defaultClusterName() + } + + return &Config{ + ClusterName: clusterName, + WaitForReady: defaultWaitForReady, + V1Alpha4Config: &v1alpha4.Cluster{ + Nodes: []v1alpha4.Node{ + {Role: v1alpha4.ControlPlaneRole}, + {Role: v1alpha4.WorkerRole}, + }, + }, + } +} + +const defaultWaitForReady = 5 * time.Minute + +func defaultClusterName() string { + name := "retina-e2e" + u, err := user.Current() + if err == nil && u.Username != "" { + username := u.Username + if len(username) > 8 { + username = username[:8] + } + name = fmt.Sprintf("retina-e2e-%s", username) + } + return name +} diff --git a/test/e2ev3/pkg/infra/providers/kind/create.go b/test/e2ev3/pkg/infra/providers/kind/create.go new file mode 100644 index 0000000000..d697870dec --- /dev/null +++ b/test/e2ev3/pkg/infra/providers/kind/create.go @@ -0,0 +1,59 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. + +package kind + +import ( + "context" + "fmt" + + "github.com/microsoft/retina/test/e2ev3/pkg/utils" + "sigs.k8s.io/kind/pkg/cluster" +) + +// CreateCluster is a go-workflow step that creates a Kind cluster +// using the native Kind Go SDK. 
+type CreateCluster struct { + Config *Config +} + +func (c *CreateCluster) String() string { return "create-kind-cluster" } + +func (c *CreateCluster) Do(ctx context.Context) error { + ctx, log := utils.StepLogger(ctx, c) + provider := cluster.NewProvider() + + clusters, err := provider.List() + if err != nil { + return fmt.Errorf("listing Kind clusters: %w", err) + } + for _, name := range clusters { + if name == c.Config.ClusterName { + log.Info("Kind cluster already exists, skipping creation", "cluster", c.Config.ClusterName) + return nil + } + } + + log.Info("creating Kind cluster", "cluster", c.Config.ClusterName) + + opts := []cluster.CreateOption{ + cluster.CreateWithWaitForReady(c.Config.WaitForReady), + cluster.CreateWithDisplayUsage(false), + cluster.CreateWithDisplaySalutation(false), + } + + if c.Config.NodeImage != "" { + opts = append(opts, cluster.CreateWithNodeImage(c.Config.NodeImage)) + } + + if c.Config.V1Alpha4Config != nil { + opts = append(opts, cluster.CreateWithV1Alpha4Config(c.Config.V1Alpha4Config)) + } + + if err := provider.Create(c.Config.ClusterName, opts...); err != nil { + return fmt.Errorf("failed to create Kind cluster %q: %w", c.Config.ClusterName, err) + } + + log.Info("Kind cluster created successfully", "cluster", c.Config.ClusterName) + return nil +} diff --git a/test/e2ev3/pkg/infra/providers/kind/delete.go b/test/e2ev3/pkg/infra/providers/kind/delete.go new file mode 100644 index 0000000000..535fb0ed1d --- /dev/null +++ b/test/e2ev3/pkg/infra/providers/kind/delete.go @@ -0,0 +1,35 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. + +package kind + +import ( + "context" + "fmt" + + "github.com/microsoft/retina/test/e2ev3/pkg/utils" + "sigs.k8s.io/kind/pkg/cluster" +) + +// DeleteCluster is a go-workflow step that deletes a Kind cluster +// using the native Kind Go SDK. 
+type DeleteCluster struct { + ClusterName string + KubeConfigFilePath string +} + +func (d *DeleteCluster) String() string { return "delete-kind-cluster" } + +func (d *DeleteCluster) Do(ctx context.Context) error { + ctx, log := utils.StepLogger(ctx, d) + log.Info("deleting Kind cluster", "cluster", d.ClusterName) + + provider := cluster.NewProvider() + + if err := provider.Delete(d.ClusterName, d.KubeConfigFilePath); err != nil { + return fmt.Errorf("failed to delete Kind cluster %q: %w", d.ClusterName, err) + } + + log.Info("Kind cluster deleted successfully", "cluster", d.ClusterName) + return nil +} diff --git a/test/e2ev3/pkg/infra/providers/kind/install_npm.go b/test/e2ev3/pkg/infra/providers/kind/install_npm.go new file mode 100644 index 0000000000..d6fab8ebfb --- /dev/null +++ b/test/e2ev3/pkg/infra/providers/kind/install_npm.go @@ -0,0 +1,58 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. + +package kind + +import ( + "context" + "fmt" + "log/slog" + "os" + "os/exec" + + "github.com/microsoft/retina/test/e2ev3/pkg/utils" +) + +const npmManifestURL = "https://raw.githubusercontent.com/Azure/azure-container-networking/master/npm/azure-npm.yaml" + +// InstallNPM applies Azure Network Policy Manager to enable NetworkPolicy +// enforcement on Kind clusters. 
+type InstallNPM struct { + KubeConfigFilePath string +} + +func (n *InstallNPM) String() string { return "install-azure-npm" } + +func (n *InstallNPM) Do(ctx context.Context) error { + ctx, log := utils.StepLogger(ctx, n) + log.Info("installing Azure NPM for NetworkPolicy enforcement") + cmd := exec.CommandContext(ctx, "kubectl", "apply", "-f", npmManifestURL) + if n.KubeConfigFilePath != "" { + cmd.Env = append(os.Environ(), "KUBECONFIG="+n.KubeConfigFilePath) + } + cmdOut := &slogWriter{level: slog.LevelInfo, source: "kubectl-apply"} + cmd.Stdout = cmdOut + cmd.Stderr = cmdOut + if err := cmd.Run(); err != nil { + return fmt.Errorf("failed to install Azure NPM: %w", err) + } + cmdOut.Flush() + + // Wait for the DaemonSet to be ready. + log.Info("waiting for Azure NPM DaemonSet to be ready") + waitCmd := exec.CommandContext(ctx, "kubectl", "rollout", "status", "daemonset/azure-npm", + "-n", "kube-system", "--timeout=120s") + if n.KubeConfigFilePath != "" { + waitCmd.Env = append(os.Environ(), "KUBECONFIG="+n.KubeConfigFilePath) + } + waitOut := &slogWriter{level: slog.LevelInfo, source: "kubectl-rollout"} + waitCmd.Stdout = waitOut + waitCmd.Stderr = waitOut + if err := waitCmd.Run(); err != nil { + return fmt.Errorf("Azure NPM DaemonSet not ready: %w", err) + } + waitOut.Flush() + + log.Info("Azure NPM installed successfully") + return nil +} diff --git a/test/e2ev3/pkg/infra/providers/kind/kubeconfig.go b/test/e2ev3/pkg/infra/providers/kind/kubeconfig.go new file mode 100644 index 0000000000..1c52b58455 --- /dev/null +++ b/test/e2ev3/pkg/infra/providers/kind/kubeconfig.go @@ -0,0 +1,43 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. 
+ +package kind + +import ( + "context" + "fmt" + "os" + + "github.com/microsoft/retina/test/e2ev3/pkg/utils" + "sigs.k8s.io/kind/pkg/cluster" +) + +const kubeConfigPerms = 0o600 + +// ExportKubeConfig is a go-workflow step that exports the kubeconfig +// for a Kind cluster to a file using the native Kind Go SDK. +type ExportKubeConfig struct { + ClusterName string + KubeConfigFilePath string +} + +func (e *ExportKubeConfig) String() string { return "export-kind-kubeconfig" } + +func (e *ExportKubeConfig) Do(ctx context.Context) error { + ctx, log := utils.StepLogger(ctx, e) + log.Info("exporting kubeconfig for Kind cluster", "cluster", e.ClusterName, "path", e.KubeConfigFilePath) + + provider := cluster.NewProvider() + + kubeConfig, err := provider.KubeConfig(e.ClusterName, false) + if err != nil { + return fmt.Errorf("failed to get kubeconfig for Kind cluster %q: %w", e.ClusterName, err) + } + + if err := os.WriteFile(e.KubeConfigFilePath, []byte(kubeConfig), kubeConfigPerms); err != nil { + return fmt.Errorf("failed to write kubeconfig to %q: %w", e.KubeConfigFilePath, err) + } + + log.Info("kubeconfig for Kind cluster written", "cluster", e.ClusterName, "path", e.KubeConfigFilePath) + return nil +} diff --git a/test/e2ev3/pkg/infra/providers/kind/slog_writer.go b/test/e2ev3/pkg/infra/providers/kind/slog_writer.go new file mode 100644 index 0000000000..3964dd7b43 --- /dev/null +++ b/test/e2ev3/pkg/infra/providers/kind/slog_writer.go @@ -0,0 +1,45 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. + +package kind + +import ( + "bytes" + "context" + "log/slog" +) + +// slogWriter is an io.Writer that logs each complete line through slog at the given level. +// Partial lines are buffered until a newline is received. +type slogWriter struct { + level slog.Level + source string + buf []byte +} + +func (w *slogWriter) Write(p []byte) (int, error) { + w.buf = append(w.buf, p...) 
+ for { + idx := bytes.IndexByte(w.buf, '\n') + if idx < 0 { + break + } + line := string(bytes.TrimRight(w.buf[:idx], "\r")) + w.buf = w.buf[idx+1:] + if line != "" { + slog.Log(context.Background(), w.level, line, "source", w.source) + } + } + return len(p), nil +} + +// Flush logs any remaining buffered content not terminated by a newline. +func (w *slogWriter) Flush() { + if len(w.buf) > 0 { + line := string(bytes.TrimRight(w.buf, "\r\n")) + if line != "" { + slog.Log(context.Background(), w.level, line, "source", w.source) + } + w.buf = nil + } +} diff --git a/test/e2ev3/pkg/infra/step.go b/test/e2ev3/pkg/infra/step.go new file mode 100644 index 0000000000..71c2bc878a --- /dev/null +++ b/test/e2ev3/pkg/infra/step.go @@ -0,0 +1,71 @@ +package infra + +import ( + "context" + "fmt" + "testing" + + flow "github.com/Azure/go-workflow" + "github.com/microsoft/retina/test/e2ev3/config" + "github.com/microsoft/retina/test/e2ev3/pkg/infra/providers/azure" + "github.com/microsoft/retina/test/e2ev3/pkg/infra/providers/kind" + "github.com/microsoft/retina/test/e2ev3/pkg/utils" + "k8s.io/client-go/rest" + "k8s.io/client-go/tools/clientcmd" +) + +// Workflow provisions a cluster via the configured provider. 
+type Workflow struct { + Cfg *config.E2EConfig + T *testing.T +} + +func (s *Workflow) Do(ctx context.Context) error { + p := s.Cfg + kubeCfgPath := p.Cluster.KubeConfigPath() + ctx, _ = utils.StepLogger(ctx, s) + + if *config.KubeConfig != "" { + rc, err := clientcmd.BuildConfigFromFlags("", kubeCfgPath) + if err != nil { + return fmt.Errorf("build rest config: %w", err) + } + setRestConfig(p.Cluster, rc) + return nil + } + + var steps []flow.Steper + switch *config.Provider { + case "kind": + kc := p.Cluster.(*kind.Cluster) + kindCfg := kind.DefaultE2EKindConfig(kc.Name) + kc.Name = kindCfg.ClusterName + steps = KindSteps(s.T, kindCfg, kubeCfgPath, *config.CreateInfra, *config.DeleteInfra) + default: + ac := p.Cluster.(*azure.Cluster) + infraCfg := ResolveInfraConfig(s.T, ac) + steps = AzureSteps(s.T, infraCfg, kubeCfgPath, *config.CreateInfra, *config.DeleteInfra) + } + + inner := new(flow.Workflow) + inner.Add(flow.Pipe(steps...)) + if err := inner.Do(ctx); err != nil { + return err + } + + rc, err := clientcmd.BuildConfigFromFlags("", kubeCfgPath) + if err != nil { + return fmt.Errorf("build rest config: %w", err) + } + setRestConfig(p.Cluster, rc) + return nil +} + +func setRestConfig(c config.ClusterProvider, rc *rest.Config) { + switch t := c.(type) { + case *kind.Cluster: + t.RC = rc + case *azure.Cluster: + t.RC = rc + } +} diff --git a/test/e2ev3/pkg/kubernetes/check-pod-status.go b/test/e2ev3/pkg/kubernetes/check-pod-status.go new file mode 100644 index 0000000000..8d64fc3c41 --- /dev/null +++ b/test/e2ev3/pkg/kubernetes/check-pod-status.go @@ -0,0 +1,105 @@ +package kubernetes + +import ( + "context" + "fmt" + "log/slog" + "time" + + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/util/wait" + "k8s.io/client-go/kubernetes" + "k8s.io/client-go/rest" +) + +const ( + RetryTimeoutPodsReady = 5 * time.Minute + RetryIntervalPodsReady = 5 * time.Second + + printInterval = 5 // print to stdout every 5 
iterations +) + +type WaitPodsReady struct { + RestConfig *rest.Config + Namespace string + LabelSelector string +} + +func (w *WaitPodsReady) Do(ctx context.Context) error { + clientset, err := kubernetes.NewForConfig(w.RestConfig) + if err != nil { + return fmt.Errorf("error creating Kubernetes client: %w", err) + } + + return WaitForPodReady(ctx, clientset, w.Namespace, w.LabelSelector) +} + +func WaitForPodReady(ctx context.Context, clientset *kubernetes.Clientset, namespace, labelSelector string) error { + log := slog.Default() + + printIterator := 0 + conditionFunc := wait.ConditionWithContextFunc(func(context.Context) (bool, error) { + defer func() { + printIterator++ + }() + var podList *corev1.PodList + podList, err := clientset.CoreV1().Pods(namespace).List(ctx, metav1.ListOptions{LabelSelector: labelSelector}) + if err != nil { + return false, fmt.Errorf("error listing Pods: %w", err) + } + + if len(podList.Items) == 0 { + log.Info("no pods found", "namespace", namespace, "label", labelSelector) + return false, nil + } + + // check each individual pod to see if it's in Running state + for i := range podList.Items { + + // Check the Pod phase + if podList.Items[i].Status.Phase != corev1.PodRunning { + if printIterator%printInterval == 0 { + log.Info("pod not ready, waiting", "pod", podList.Items[i].Name) + } + return false, nil + } + + // Check all container status. 
+ for j := range podList.Items[i].Status.ContainerStatuses { + if !podList.Items[i].Status.ContainerStatuses[j].Ready { + log.Info("container not ready, waiting", "container", podList.Items[i].Status.ContainerStatuses[j].Name, "pod", podList.Items[i].Name) + return false, nil + } + } + + } + log.Info("all pods running", "namespace", namespace, "label", labelSelector) + return true, nil + }) + + err := wait.PollUntilContextCancel(ctx, RetryIntervalPodsReady, true, conditionFunc) + if err != nil { + PrintPodLogs(ctx, clientset, namespace, labelSelector) + return fmt.Errorf("error waiting for pods in namespace \"%s\" with label \"%s\" to be in Running state: %w", namespace, labelSelector, err) + } + return nil +} + +func CheckContainerRestart(ctx context.Context, clientset *kubernetes.Clientset, namespace, labelSelector string) error { + var podList *corev1.PodList + podList, err := clientset.CoreV1().Pods(namespace).List(ctx, metav1.ListOptions{LabelSelector: labelSelector}) + if err != nil { + return fmt.Errorf("error listing Pods: %w", err) + } + + for _, pod := range podList.Items { + for istatus := range pod.Status.ContainerStatuses { + status := &pod.Status.ContainerStatuses[istatus] + if status.RestartCount > 0 { + return fmt.Errorf("pod %s has %d container restarts: status: %+v: %w", pod.Name, status.RestartCount, status, ErrPodCrashed) + } + } + } + return nil +} diff --git a/test/e2ev3/pkg/kubernetes/create-agnhost-statefulset.go b/test/e2ev3/pkg/kubernetes/create-agnhost-statefulset.go new file mode 100644 index 0000000000..c32d035d87 --- /dev/null +++ b/test/e2ev3/pkg/kubernetes/create-agnhost-statefulset.go @@ -0,0 +1,184 @@ +package kubernetes + +import ( + "context" + "fmt" + "strconv" + "time" + + appsv1 "k8s.io/api/apps/v1" + v1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/api/resource" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/client-go/kubernetes" + "k8s.io/client-go/rest" +) + +var ErrLabelMissingFromPod = fmt.Errorf("label 
missing from pod") + +const ( + AgnhostHTTPPort = 80 + AgnhostArchAmd64 = "amd64" + AgnhostArchArm64 = "arm64" +) + +type CreateAgnhostStatefulSet struct { + AgnhostName string + AgnhostNamespace string + ScheduleOnSameNode bool + RestConfig *rest.Config + AgnhostArch string + AgnhostReplicas *int +} + +func (c *CreateAgnhostStatefulSet) Do(ctx context.Context) error { + clientset, err := kubernetes.NewForConfig(c.RestConfig) + if err != nil { + return fmt.Errorf("error creating Kubernetes client: %w", err) + } + + ctx, cancel := context.WithTimeout(ctx, defaultTimeoutSeconds*time.Second) + defer cancel() + + // set default arch to amd64 + if c.AgnhostArch == "" { + c.AgnhostArch = AgnhostArchAmd64 + } + + // set default replicas to 1 + replicas := 1 + if c.AgnhostReplicas != nil { + replicas = *c.AgnhostReplicas + } + + agnhostStatefulSet := c.getAgnhostDeployment(c.AgnhostArch, replicas) + + err = CreateResource(ctx, agnhostStatefulSet, clientset) + if err != nil { + return fmt.Errorf("error agnhost component: %w", err) + } + + selector, exists := agnhostStatefulSet.Spec.Selector.MatchLabels["app"] + if !exists { + return fmt.Errorf("missing label \"app=%s\" from agnhost statefulset: %w", c.AgnhostName, ErrLabelMissingFromPod) + } + + labelSelector := fmt.Sprintf("app=%s", selector) + err = WaitForPodReady(ctx, clientset, c.AgnhostNamespace, labelSelector) + if err != nil { + return fmt.Errorf("error waiting for agnhost pod to be ready: %w", err) + } + + return nil +} + +func (c *CreateAgnhostStatefulSet) getAgnhostDeployment(arch string, replicas int) *appsv1.StatefulSet { + if replicas < 1 { + replicas = 1 + } + reps := int32(replicas) //nolint:gosec // replicas controlled by test code + + var affinity *v1.Affinity + if c.ScheduleOnSameNode { + affinity = &v1.Affinity{ + PodAffinity: &v1.PodAffinity{ + RequiredDuringSchedulingIgnoredDuringExecution: []v1.PodAffinityTerm{ + { + TopologyKey: "kubernetes.io/hostname", + LabelSelector: &metav1.LabelSelector{ + 
MatchLabels: map[string]string{ + "k8s-app": "agnhost", + }, + }, + }, + }, + }, + } + } else { + affinity = &v1.Affinity{ + PodAntiAffinity: &v1.PodAntiAffinity{ + // prefer an even spread across the cluster to avoid scheduling on the same node + PreferredDuringSchedulingIgnoredDuringExecution: []v1.WeightedPodAffinityTerm{ + { + Weight: MaxAffinityWeight, + PodAffinityTerm: v1.PodAffinityTerm{ + TopologyKey: "kubernetes.io/hostname", + LabelSelector: &metav1.LabelSelector{ + MatchLabels: map[string]string{ + "k8s-app": "agnhost", + }, + }, + }, + }, + }, + }, + } + } + + return &appsv1.StatefulSet{ + TypeMeta: metav1.TypeMeta{ + Kind: "StatefulSet", + APIVersion: "apps/v1", + }, + ObjectMeta: metav1.ObjectMeta{ + Name: c.AgnhostName, + Namespace: c.AgnhostNamespace, + }, + Spec: appsv1.StatefulSetSpec{ + ServiceName: c.AgnhostName, + Replicas: &reps, + Selector: &metav1.LabelSelector{ + MatchLabels: map[string]string{ + "app": c.AgnhostName, + "k8s-app": "agnhost", + }, + }, + Template: v1.PodTemplateSpec{ + ObjectMeta: metav1.ObjectMeta{ + Labels: map[string]string{ + "app": c.AgnhostName, + "k8s-app": "agnhost", + }, + }, + + Spec: v1.PodSpec{ + Affinity: affinity, + NodeSelector: map[string]string{ + "kubernetes.io/os": "linux", + "kubernetes.io/arch": arch, + }, + Containers: []v1.Container{ + { + Name: c.AgnhostName, + Image: "registry.k8s.io/e2e-test-images/agnhost:2.40", + Resources: v1.ResourceRequirements{ + Requests: v1.ResourceList{ + "memory": resource.MustParse("20Mi"), + }, + Limits: v1.ResourceList{ + "memory": resource.MustParse("20Mi"), + }, + }, + Command: []string{ + "/agnhost", + }, + Args: []string{ + "serve-hostname", + "--http", + "--port", + strconv.Itoa(AgnhostHTTPPort), + }, + + Ports: []v1.ContainerPort{ + { + ContainerPort: AgnhostHTTPPort, + }, + }, + Env: []v1.EnvVar{}, + }, + }, + }, + }, + }, + } +} diff --git a/test/e2ev3/pkg/kubernetes/create-kapinger-deployment.go b/test/e2ev3/pkg/kubernetes/create-kapinger-deployment.go new 
file mode 100644 index 0000000000..dba310d973 --- /dev/null +++ b/test/e2ev3/pkg/kubernetes/create-kapinger-deployment.go @@ -0,0 +1,247 @@ +package kubernetes + +import ( + "context" + "fmt" + "strconv" + + appsv1 "k8s.io/api/apps/v1" + v1 "k8s.io/api/core/v1" + rbacv1 "k8s.io/api/rbac/v1" + "k8s.io/apimachinery/pkg/api/resource" + metaV1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/apimachinery/pkg/util/intstr" + "k8s.io/client-go/kubernetes" + "k8s.io/client-go/rest" +) + +const ( + KapingerHTTPPort = 8080 + KapingerTCPPort = 8085 + KapingerUDPPort = 8086 + MaxAffinityWeight = 100 +) + +type CreateKapingerDeployment struct { + KapingerNamespace string + KapingerReplicas string + RestConfig *rest.Config +} + +func (c *CreateKapingerDeployment) Do(ctx context.Context) error { + _, err := strconv.Atoi(c.KapingerReplicas) + if err != nil { + return fmt.Errorf("error converting replicas to int for Kapinger replicas: %w", err) + } + + clientset, err := kubernetes.NewForConfig(c.RestConfig) + if err != nil { + return fmt.Errorf("error creating Kubernetes client: %w", err) + } + + resources := []runtime.Object{ + c.GetKapingerService(), + c.GetKapingerServiceAccount(), + c.GetKapingerClusterRole(), + c.GetKapingerClusterRoleBinding(), + c.GetKapingerDeployment(), + } + + for i := range resources { + err = CreateResource(ctx, resources[i], clientset) + if err != nil { + return fmt.Errorf("error kapinger component: %w", err) + } + } + + return nil +} + +func (c *CreateKapingerDeployment) GetKapingerDeployment() *appsv1.Deployment { + replicas, err := strconv.ParseInt(c.KapingerReplicas, 10, 32) + if err != nil { + fmt.Println("Error converting replicas to int for Kapinger replicas: ", err) + return nil + } + reps := int32(replicas) + + return &appsv1.Deployment{ + TypeMeta: metaV1.TypeMeta{ + Kind: "Deployment", + APIVersion: "apps/v1", + }, + ObjectMeta: metaV1.ObjectMeta{ + Name: "kapinger", + Namespace: c.KapingerNamespace, + }, 
+ Spec: appsv1.DeploymentSpec{ + Replicas: &reps, + Selector: &metaV1.LabelSelector{ + MatchLabels: map[string]string{ + "app": "kapinger", + }, + }, + Template: v1.PodTemplateSpec{ + ObjectMeta: metaV1.ObjectMeta{ + Labels: map[string]string{ + "app": "kapinger", + "server": "good", + }, + }, + + Spec: v1.PodSpec{ + NodeSelector: map[string]string{ + "kubernetes.io/os": "linux", + }, + Affinity: &v1.Affinity{ + PodAntiAffinity: &v1.PodAntiAffinity{ + // prefer an even spread across the cluster to avoid scheduling on the same node + PreferredDuringSchedulingIgnoredDuringExecution: []v1.WeightedPodAffinityTerm{ + { + Weight: MaxAffinityWeight, + PodAffinityTerm: v1.PodAffinityTerm{ + TopologyKey: "kubernetes.io/hostname", + LabelSelector: &metaV1.LabelSelector{ + MatchLabels: map[string]string{ + "app": "kapinger", + }, + }, + }, + }, + }, + }, + }, + ServiceAccountName: "kapinger-sa", + Containers: []v1.Container{ + { + Name: "kapinger", + Image: "acnpublic.azurecr.io/kapinger:v0.0.23-9-g23ef222", + Resources: v1.ResourceRequirements{ + Requests: v1.ResourceList{ + "memory": resource.MustParse("20Mi"), + }, + Limits: v1.ResourceList{ + "memory": resource.MustParse("40Mi"), + }, + }, + Ports: []v1.ContainerPort{ + { + ContainerPort: KapingerHTTPPort, + }, + }, + Env: []v1.EnvVar{ + { + Name: "GODEBUG", + Value: "netdns=go", + }, + { + Name: "TARGET_TYPE", + Value: "service", + }, + { + Name: "HTTP_PORT", + Value: strconv.Itoa(KapingerHTTPPort), + }, + { + Name: "TCP_PORT", + Value: strconv.Itoa(KapingerTCPPort), + }, + { + Name: "UDP_PORT", + Value: strconv.Itoa(KapingerUDPPort), + }, + }, + }, + }, + }, + }, + }, + } +} + +func (c *CreateKapingerDeployment) GetKapingerService() *v1.Service { + return &v1.Service{ + TypeMeta: metaV1.TypeMeta{ + Kind: "Service", + APIVersion: "v1", + }, + ObjectMeta: metaV1.ObjectMeta{ + Name: "kapinger-service", + Namespace: c.KapingerNamespace, + Labels: map[string]string{ + "app": "kapinger", + }, + }, + Spec: v1.ServiceSpec{ + 
Selector: map[string]string{ + "app": "kapinger", + }, + Ports: []v1.ServicePort{ + { + Port: KapingerHTTPPort, + Protocol: v1.ProtocolTCP, + TargetPort: intstr.FromInt(KapingerHTTPPort), + }, + }, + }, + } +} + +func (c *CreateKapingerDeployment) GetKapingerServiceAccount() *v1.ServiceAccount { + return &v1.ServiceAccount{ + TypeMeta: metaV1.TypeMeta{ + Kind: "ServiceAccount", + APIVersion: "v1", + }, + ObjectMeta: metaV1.ObjectMeta{ + Name: "kapinger-sa", + Namespace: c.KapingerNamespace, + }, + } +} + +func (c *CreateKapingerDeployment) GetKapingerClusterRole() *rbacv1.ClusterRole { + return &rbacv1.ClusterRole{ + TypeMeta: metaV1.TypeMeta{ + Kind: "ClusterRole", + APIVersion: "rbac.authorization.k8s.io/v1", + }, + ObjectMeta: metaV1.ObjectMeta{ + Name: "kapinger-role", + Namespace: c.KapingerNamespace, + }, + Rules: []rbacv1.PolicyRule{ + { + APIGroups: []string{""}, + Resources: []string{"services", "pods"}, + Verbs: []string{"get", "list"}, + }, + }, + } +} + +func (c *CreateKapingerDeployment) GetKapingerClusterRoleBinding() *rbacv1.ClusterRoleBinding { + return &rbacv1.ClusterRoleBinding{ + TypeMeta: metaV1.TypeMeta{ + Kind: "ClusterRoleBinding", + APIVersion: "rbac.authorization.k8s.io/v1", + }, + ObjectMeta: metaV1.ObjectMeta{ + Name: "kapinger-rolebinding", + Namespace: c.KapingerNamespace, + }, + Subjects: []rbacv1.Subject{ + { + Kind: "ServiceAccount", + Name: "kapinger-sa", + Namespace: c.KapingerNamespace, + }, + }, + RoleRef: rbacv1.RoleRef{ + APIGroup: "rbac.authorization.k8s.io", + Kind: "ClusterRole", + Name: "kapinger-role", + }, + } +} diff --git a/test/e2ev3/pkg/kubernetes/create-namespace.go b/test/e2ev3/pkg/kubernetes/create-namespace.go new file mode 100644 index 0000000000..8d068602c6 --- /dev/null +++ b/test/e2ev3/pkg/kubernetes/create-namespace.go @@ -0,0 +1,55 @@ +package kubernetes + +import ( + "context" + "fmt" + "log/slog" + + v1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/api/errors" + metav1 
"k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/client-go/kubernetes" + "k8s.io/client-go/rest" +) + +type CreateNamespace struct { + Namespace string + RestConfig *rest.Config +} + +func (c *CreateNamespace) Do(ctx context.Context) error { + return CreateNamespaceFn(ctx, c.RestConfig, c.Namespace) +} + +func (c *CreateNamespace) getNamespace() *v1.Namespace { + return &v1.Namespace{ + TypeMeta: metav1.TypeMeta{ + Kind: "Namespace", + APIVersion: "v1", + }, + ObjectMeta: metav1.ObjectMeta{ + Name: c.Namespace, + }, + } +} + +func CreateNamespaceFn(ctx context.Context, restConfig *rest.Config, namespace string) error { + log := slog.Default() + clientset, err := kubernetes.NewForConfig(restConfig) + if err != nil { + return fmt.Errorf("error creating Kubernetes client: %w", err) + } + + _, err = clientset.CoreV1().Namespaces().Create(ctx, &v1.Namespace{ + ObjectMeta: metav1.ObjectMeta{ + Name: namespace, + }, + }, metav1.CreateOptions{}) + if err != nil && !errors.IsAlreadyExists(err) { + return fmt.Errorf("failed to create namespace \"%s\": %w", namespace, err) + } + + log.Info("namespace created", "namespace", namespace) + + return nil +} diff --git a/test/e2ev3/pkg/kubernetes/create-network-policy.go b/test/e2ev3/pkg/kubernetes/create-network-policy.go new file mode 100644 index 0000000000..9ad2db06fb --- /dev/null +++ b/test/e2ev3/pkg/kubernetes/create-network-policy.go @@ -0,0 +1,82 @@ +package kubernetes + +import ( + "context" + "fmt" + "strings" + + networkingv1 "k8s.io/api/networking/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/client-go/kubernetes" + "k8s.io/client-go/rest" +) + +const ( + Egress = "egress" + Ingress = "ingress" +) + +type CreateDenyAllNetworkPolicy struct { + NetworkPolicyNamespace string + RestConfig *rest.Config + DenyAllLabelSelector string +} + +func (c *CreateDenyAllNetworkPolicy) Do(ctx context.Context) error { + clientset, err := kubernetes.NewForConfig(c.RestConfig) + if err != nil { + return fmt.Errorf("error 
creating Kubernetes client: %w", err) + } + + networkPolicy := getNetworkPolicy(c.NetworkPolicyNamespace, c.DenyAllLabelSelector) + err = CreateResource(ctx, networkPolicy, clientset) + if err != nil { + return fmt.Errorf("error creating simple deny-all network policy: %w", err) + } + + return nil +} + +func getNetworkPolicy(namespace, labelSelector string) *networkingv1.NetworkPolicy { + labelSelectorSlice := strings.Split(labelSelector, "=") + return &networkingv1.NetworkPolicy{ + ObjectMeta: metav1.ObjectMeta{ + Name: "deny-all", + Namespace: namespace, + }, + Spec: networkingv1.NetworkPolicySpec{ + PodSelector: metav1.LabelSelector{ + MatchLabels: map[string]string{ + labelSelectorSlice[0]: labelSelectorSlice[1], + }, + }, + PolicyTypes: []networkingv1.PolicyType{ + networkingv1.PolicyTypeIngress, + networkingv1.PolicyTypeEgress, + }, + Egress: []networkingv1.NetworkPolicyEgressRule{}, + Ingress: []networkingv1.NetworkPolicyIngressRule{}, + }, + } +} + +type DeleteDenyAllNetworkPolicy struct { + NetworkPolicyNamespace string + RestConfig *rest.Config + DenyAllLabelSelector string +} + +func (d *DeleteDenyAllNetworkPolicy) Do(ctx context.Context) error { + clientset, err := kubernetes.NewForConfig(d.RestConfig) + if err != nil { + return fmt.Errorf("error creating Kubernetes client: %w", err) + } + + networkPolicy := getNetworkPolicy(d.NetworkPolicyNamespace, d.DenyAllLabelSelector) + err = DeleteResource(ctx, networkPolicy, clientset) + if err != nil { + return fmt.Errorf("error creating simple deny-all network policy: %w", err) + } + + return nil +} diff --git a/test/e2ev3/pkg/kubernetes/create-resource.go b/test/e2ev3/pkg/kubernetes/create-resource.go new file mode 100644 index 0000000000..d1f9c03e2e --- /dev/null +++ b/test/e2ev3/pkg/kubernetes/create-resource.go @@ -0,0 +1,225 @@ +package kubernetes + +import ( + "context" + "fmt" + "log/slog" + + appsv1 "k8s.io/api/apps/v1" + v1 "k8s.io/api/core/v1" + networkingv1 "k8s.io/api/networking/v1" + rbacv1 
"k8s.io/api/rbac/v1" + "k8s.io/apimachinery/pkg/api/errors" + metaV1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/client-go/kubernetes" +) + +var ( + ErrUnknownResourceType = fmt.Errorf("unknown resource type") + ErrCreateNilResource = fmt.Errorf("cannot create nil resource") +) + +func CreateResource(ctx context.Context, obj runtime.Object, clientset *kubernetes.Clientset) error { //nolint:gocyclo //this is just boilerplate code + if obj == nil { + return ErrCreateNilResource + } + + switch o := obj.(type) { + case *appsv1.DaemonSet: + slog.Info("creating DaemonSet", "name", o.Name, "namespace", o.Namespace) + client := clientset.AppsV1().DaemonSets(o.Namespace) + _, err := client.Get(ctx, o.Name, metaV1.GetOptions{}) + if errors.IsNotFound(err) { + _, err = client.Create(ctx, o, metaV1.CreateOptions{}) + if err != nil { + return fmt.Errorf("failed to create DaemonSet \"%s\" in namespace \"%s\": %w", o.Name, o.Namespace, err) + } + return nil + } + _, err = client.Update(ctx, o, metaV1.UpdateOptions{}) + if err != nil { + return fmt.Errorf("failed to create/update DaemonSet \"%s\" in namespace \"%s\": %w", o.Name, o.Namespace, err) + } + + case *appsv1.Deployment: + slog.Info("creating Deployment", "name", o.Name, "namespace", o.Namespace) + client := clientset.AppsV1().Deployments(o.Namespace) + _, err := client.Get(ctx, o.Name, metaV1.GetOptions{}) + if errors.IsNotFound(err) { + _, err = client.Create(ctx, o, metaV1.CreateOptions{}) + if err != nil { + return fmt.Errorf("failed to create Deployment \"%s\" in namespace \"%s\": %w", o.Name, o.Namespace, err) + } + return nil + } + _, err = client.Update(ctx, o, metaV1.UpdateOptions{}) + if err != nil { + return fmt.Errorf("failed to create/update Deployment \"%s\" in namespace \"%s\": %w", o.Name, o.Namespace, err) + } + + case *appsv1.StatefulSet: + slog.Info("creating StatefulSet", "name", o.Name, "namespace", o.Namespace) + client := 
clientset.AppsV1().StatefulSets(o.Namespace) + _, err := client.Get(ctx, o.Name, metaV1.GetOptions{}) + if errors.IsNotFound(err) { + _, err = client.Create(ctx, o, metaV1.CreateOptions{}) + if err != nil { + return fmt.Errorf("failed to create StatefulSet \"%s\" in namespace \"%s\": %w", o.Name, o.Namespace, err) + } + return nil + } + _, err = client.Update(ctx, o, metaV1.UpdateOptions{}) + if err != nil { + return fmt.Errorf("failed to create/update StatefulSet \"%s\" in namespace \"%s\": %w", o.Name, o.Namespace, err) + } + + case *v1.Service: + slog.Info("creating Service", "name", o.Name, "namespace", o.Namespace) + client := clientset.CoreV1().Services(o.Namespace) + _, err := client.Get(ctx, o.Name, metaV1.GetOptions{}) + if errors.IsNotFound(err) { + _, err = client.Create(ctx, o, metaV1.CreateOptions{}) + if err != nil { + return fmt.Errorf("failed to create Service \"%s\" in namespace \"%s\": %w", o.Name, o.Namespace, err) + } + return nil + } + _, err = client.Update(ctx, o, metaV1.UpdateOptions{}) + if err != nil { + return fmt.Errorf("failed to create/update Service \"%s\" in namespace \"%s\": %w", o.Name, o.Namespace, err) + } + + case *v1.ServiceAccount: + slog.Info("creating ServiceAccount", "name", o.Name, "namespace", o.Namespace) + client := clientset.CoreV1().ServiceAccounts(o.Namespace) + _, err := client.Get(ctx, o.Name, metaV1.GetOptions{}) + if errors.IsNotFound(err) { + _, err = client.Create(ctx, o, metaV1.CreateOptions{}) + if err != nil { + return fmt.Errorf("failed to create ServiceAccount \"%s\" in namespace \"%s\": %w", o.Name, o.Namespace, err) + } + return nil + } + _, err = client.Update(ctx, o, metaV1.UpdateOptions{}) + if err != nil { + return fmt.Errorf("failed to create/update ServiceAccount \"%s\" in namespace \"%s\": %w", o.Name, o.Namespace, err) + } + + case *rbacv1.Role: + slog.Info("creating Role", "name", o.Name, "namespace", o.Namespace) + client := clientset.RbacV1().Roles(o.Namespace) + _, err := client.Get(ctx, 
o.Name, metaV1.GetOptions{}) + if errors.IsNotFound(err) { + _, err = client.Create(ctx, o, metaV1.CreateOptions{}) + if err != nil { + return fmt.Errorf("failed to create Role \"%s\" in namespace \"%s\": %w", o.Name, o.Namespace, err) + } + return nil + } + _, err = client.Update(ctx, o, metaV1.UpdateOptions{}) + if err != nil { + return fmt.Errorf("failed to create/update Role \"%s\" in namespace \"%s\": %w", o.Name, o.Namespace, err) + } + + case *rbacv1.RoleBinding: + slog.Info("creating RoleBinding", "name", o.Name, "namespace", o.Namespace) + client := clientset.RbacV1().RoleBindings(o.Namespace) + _, err := client.Get(ctx, o.Name, metaV1.GetOptions{}) + if errors.IsNotFound(err) { + _, err = client.Create(ctx, o, metaV1.CreateOptions{}) + if err != nil { + return fmt.Errorf("failed to create RoleBinding \"%s\" in namespace \"%s\": %w", o.Name, o.Namespace, err) + } + return nil + } + _, err = client.Update(ctx, o, metaV1.UpdateOptions{}) + if err != nil { + return fmt.Errorf("failed to create/update RoleBinding \"%s\" in namespace \"%s\": %w", o.Name, o.Namespace, err) + } + + case *rbacv1.ClusterRole: + slog.Info("creating ClusterRole", "name", o.Name) + client := clientset.RbacV1().ClusterRoles() + _, err := client.Get(ctx, o.Name, metaV1.GetOptions{}) + if errors.IsNotFound(err) { + _, err = client.Create(ctx, o, metaV1.CreateOptions{}) + if err != nil { + return fmt.Errorf("failed to create ClusterRole \"%s\": %w", o.Name, err) + } + return nil + } + _, err = client.Update(ctx, o, metaV1.UpdateOptions{}) + if err != nil { + return fmt.Errorf("failed to create/update ClusterRole \"%s\": %w", o.Name, err) + } + + case *rbacv1.ClusterRoleBinding: + slog.Info("creating ClusterRoleBinding", "name", o.Name) + client := clientset.RbacV1().ClusterRoleBindings() + _, err := client.Get(ctx, o.Name, metaV1.GetOptions{}) + if errors.IsNotFound(err) { + _, err = client.Create(ctx, o, metaV1.CreateOptions{}) + if err != nil { + return fmt.Errorf("failed to create 
ClusterRoleBinding \"%s\": %w", o.Name, err) + } + return nil + } + _, err = client.Update(ctx, o, metaV1.UpdateOptions{}) + if err != nil { + return fmt.Errorf("failed to create/update ClusterRoleBinding \"%s\": %w", o.Name, err) + } + + case *v1.ConfigMap: + slog.Info("creating ConfigMap", "name", o.Name, "namespace", o.Namespace) + client := clientset.CoreV1().ConfigMaps(o.Namespace) + _, err := client.Get(ctx, o.Name, metaV1.GetOptions{}) + if errors.IsNotFound(err) { + _, err = client.Create(ctx, o, metaV1.CreateOptions{}) + if err != nil { + return fmt.Errorf("failed to create ConfigMap \"%s\" in namespace \"%s\": %w", o.Name, o.Namespace, err) + } + return nil + } + _, err = client.Update(ctx, o, metaV1.UpdateOptions{}) + if err != nil { + return fmt.Errorf("failed to create/update ConfigMap \"%s\" in namespace \"%s\": %w", o.Name, o.Namespace, err) + } + + case *networkingv1.NetworkPolicy: + slog.Info("creating NetworkPolicy", "name", o.Name, "namespace", o.Namespace) + client := clientset.NetworkingV1().NetworkPolicies(o.Namespace) + _, err := client.Get(ctx, o.Name, metaV1.GetOptions{}) + if errors.IsNotFound(err) { + _, err = client.Create(ctx, o, metaV1.CreateOptions{}) + if err != nil { + return fmt.Errorf("failed to create NetworkPolicy \"%s\" in namespace \"%s\": %w", o.Name, o.Namespace, err) + } + return nil + } + _, err = client.Update(ctx, o, metaV1.UpdateOptions{}) + if err != nil { + return fmt.Errorf("failed to create/update NetworkPolicy \"%s\" in namespace \"%s\": %w", o.Name, o.Namespace, err) + } + + case *v1.Secret: + slog.Info("creating Secret", "name", o.Name, "namespace", o.Namespace) + client := clientset.CoreV1().Secrets(o.Namespace) + _, err := client.Get(ctx, o.Name, metaV1.GetOptions{}) + if errors.IsNotFound(err) { + _, err = client.Create(ctx, o, metaV1.CreateOptions{}) + if err != nil { + return fmt.Errorf("failed to create Secret \"%s\" in namespace \"%s\": %w", o.Name, o.Namespace, err) + } + return nil + } + _, err = 
client.Update(ctx, o, metaV1.UpdateOptions{}) + if err != nil { + return fmt.Errorf("failed to create/update Secret \"%s\" in namespace \"%s\": %w", o.Name, o.Namespace, err) + } + + default: + return fmt.Errorf("unknown object type: %T, err: %w", obj, ErrUnknownResourceType) + } + return nil +} diff --git a/test/e2ev3/pkg/kubernetes/debug.go b/test/e2ev3/pkg/kubernetes/debug.go new file mode 100644 index 0000000000..11dbbbd621 --- /dev/null +++ b/test/e2ev3/pkg/kubernetes/debug.go @@ -0,0 +1,37 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. + +//go:build e2e + +package kubernetes + +import ( + "context" + + "github.com/microsoft/retina/test/e2ev3/pkg/utils" + "k8s.io/client-go/rest" +) + +// DebugOnFailure captures diagnostic info when upstream steps fail. +// Add it to a workflow with When(flow.AnyFailed) so it only runs on failure. +type DebugOnFailure struct { + RestConfig *rest.Config + Namespace string + LabelSelector string +} + +func (d *DebugOnFailure) String() string { return "debug-on-failure" } + +func (d *DebugOnFailure) Do(ctx context.Context) error { + ctx, log := utils.StepLogger(ctx, d) + log.Info("capturing logs for pods", "namespace", d.Namespace, "label", d.LabelSelector) + getLogs := &GetPodLogs{ + RestConfig: d.RestConfig, + Namespace: d.Namespace, + LabelSelector: d.LabelSelector, + } + if err := getLogs.Do(context.Background()); err != nil { + log.Error("failed to capture logs", "error", err) + } + return nil // never fail the debug step itself +} diff --git a/test/e2ev3/pkg/kubernetes/delete-namespace.go b/test/e2ev3/pkg/kubernetes/delete-namespace.go new file mode 100644 index 0000000000..7bc14fb68e --- /dev/null +++ b/test/e2ev3/pkg/kubernetes/delete-namespace.go @@ -0,0 +1,65 @@ +package kubernetes + +import ( + "context" + "fmt" + "github.com/microsoft/retina/test/e2ev3/pkg/utils" + "time" + + "k8s.io/apimachinery/pkg/api/errors" + metaV1 "k8s.io/apimachinery/pkg/apis/meta/v1" + 
"k8s.io/apimachinery/pkg/util/wait" + "k8s.io/client-go/kubernetes" + "k8s.io/client-go/rest" + "k8s.io/client-go/util/retry" +) + +type DeleteNamespace struct { + Namespace string + RestConfig *rest.Config +} + +func (d *DeleteNamespace) String() string { return "delete-namespace" } + +func (d *DeleteNamespace) Do(ctx context.Context) error { + ctx, log := utils.StepLogger(ctx, d) + clientset, err := kubernetes.NewForConfig(d.RestConfig) + if err != nil { + return fmt.Errorf("error creating Kubernetes client: %w", err) + } + + err = clientset.CoreV1().Namespaces().Delete(ctx, d.Namespace, metaV1.DeleteOptions{}) + if err != nil { + if !errors.IsNotFound(err) { + return fmt.Errorf("failed to delete namespace \"%s\": %w", d.Namespace, err) + } + } + + backoff := wait.Backoff{ + Steps: 9, + Duration: 10 * time.Second, + Factor: 2.0, + // Jitter: 0.1, + } + + // Check if namespace was deleted + return retry.OnError(backoff, + func(err error) bool { + log.Info("namespace deletion pending", "error", err) + + return true + }, + func() error { + _, err = clientset.CoreV1().Namespaces().Get(ctx, d.Namespace, metaV1.GetOptions{}) + if errors.IsNotFound(err) { + return nil + } + + if err == nil { + return fmt.Errorf("namespace \"%s\" still exists", d.Namespace) + } + + return err + }, + ) +} diff --git a/test/e2ev3/pkg/kubernetes/delete-resource.go b/test/e2ev3/pkg/kubernetes/delete-resource.go new file mode 100644 index 0000000000..502bacb77e --- /dev/null +++ b/test/e2ev3/pkg/kubernetes/delete-resource.go @@ -0,0 +1,346 @@ +package kubernetes + +import ( + "context" + "fmt" + "log/slog" + "time" + + "github.com/microsoft/retina/test/e2ev3/pkg/utils" + + appsv1 "k8s.io/api/apps/v1" + v1 "k8s.io/api/core/v1" + networkingv1 "k8s.io/api/networking/v1" + rbacv1 "k8s.io/api/rbac/v1" + "k8s.io/apimachinery/pkg/api/errors" + metaV1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/client-go/kubernetes" + "k8s.io/client-go/rest" +) + +var 
ErrDeleteNilResource = fmt.Errorf("cannot create nil resource") + +type ResourceType string + +const ( + DaemonSet ResourceType = "DaemonSet" + Deployment ResourceType = "Deployment" + StatefulSet ResourceType = "StatefulSet" + Service ResourceType = "Service" + ServiceAccount ResourceType = "ServiceAccount" + Role ResourceType = "Role" + RoleBinding ResourceType = "RoleBinding" + ClusterRole ResourceType = "ClusterRole" + ClusterRoleBinding ResourceType = "ClusterRoleBinding" + ConfigMap ResourceType = "ConfigMap" + NetworkPolicy ResourceType = "NetworkPolicy" + Secret ResourceType = "Secret" + Unknown ResourceType = "Unknown" +) + +// Parameters can only be strings, heres to help add guardrails +func TypeString(resourceType ResourceType) string { + ResourceTypes := map[ResourceType]string{ + DaemonSet: "DaemonSet", + Deployment: "Deployment", + StatefulSet: "StatefulSet", + Service: "Service", + ServiceAccount: "ServiceAccount", + Role: "Role", + RoleBinding: "RoleBinding", + ClusterRole: "ClusterRole", + ClusterRoleBinding: "ClusterRoleBinding", + ConfigMap: "ConfigMap", + NetworkPolicy: "NetworkPolicy", + Secret: "Secret", + Unknown: "Unknown", + } + str, ok := ResourceTypes[resourceType] + if !ok { + return ResourceTypes[Unknown] + } + return str +} + +type DeleteKubernetesResource struct { + ResourceType string // can't use enum, breaks parameter parsing, all must be strings + ResourceName string + ResourceNamespace string + RestConfig *rest.Config +} + +func (d *DeleteKubernetesResource) String() string { return "delete-kubernetes-resource" } + +func (d *DeleteKubernetesResource) Do(ctx context.Context) error { + ctx, _ = utils.StepLogger(ctx, d) + // Prevalidation: check resource type before proceeding + restype := ResourceType(d.ResourceType) + if restype == Unknown { + return ErrUnknownResourceType + } + + clientset, err := kubernetes.NewForConfig(d.RestConfig) + if err != nil { + return fmt.Errorf("error creating Kubernetes client: %w", err) + } + + ctx, 
cancel := context.WithTimeout(ctx, defaultTimeoutSeconds*time.Second) + defer cancel() + + res := ResourceType(d.ResourceType) + + var resource runtime.Object + + switch res { + case DaemonSet: + resource = &appsv1.DaemonSet{ + ObjectMeta: metaV1.ObjectMeta{ + Name: d.ResourceName, + Namespace: d.ResourceNamespace, + }, + } + case Deployment: + resource = &appsv1.Deployment{ + ObjectMeta: metaV1.ObjectMeta{ + Name: d.ResourceName, + Namespace: d.ResourceNamespace, + }, + } + case StatefulSet: + resource = &appsv1.StatefulSet{ + ObjectMeta: metaV1.ObjectMeta{ + Name: d.ResourceName, + Namespace: d.ResourceNamespace, + }, + } + case Service: + resource = &v1.Service{ + ObjectMeta: metaV1.ObjectMeta{ + Name: d.ResourceName, + Namespace: d.ResourceNamespace, + }, + } + case ServiceAccount: + resource = &v1.ServiceAccount{ + ObjectMeta: metaV1.ObjectMeta{ + Name: d.ResourceName, + Namespace: d.ResourceNamespace, + }, + } + case Role: + resource = &rbacv1.Role{ + ObjectMeta: metaV1.ObjectMeta{ + Name: d.ResourceName, + Namespace: d.ResourceNamespace, + }, + } + case RoleBinding: + resource = &rbacv1.RoleBinding{ + ObjectMeta: metaV1.ObjectMeta{ + Name: d.ResourceName, + Namespace: d.ResourceNamespace, + }, + } + case ClusterRole: + resource = &rbacv1.ClusterRole{ + ObjectMeta: metaV1.ObjectMeta{ + Name: d.ResourceName, + }, + } + case ClusterRoleBinding: + resource = &rbacv1.ClusterRoleBinding{ + ObjectMeta: metaV1.ObjectMeta{ + Name: d.ResourceName, + }, + } + case ConfigMap: + resource = &v1.ConfigMap{ + ObjectMeta: metaV1.ObjectMeta{ + Name: d.ResourceName, + Namespace: d.ResourceNamespace, + }, + } + case NetworkPolicy: + resource = &networkingv1.NetworkPolicy{ + ObjectMeta: metaV1.ObjectMeta{ + Name: d.ResourceName, + Namespace: d.ResourceNamespace, + }, + } + case Secret: + resource = &v1.Secret{ + ObjectMeta: metaV1.ObjectMeta{ + Name: d.ResourceName, + Namespace: d.ResourceNamespace, + }, + } + case Unknown: + return fmt.Errorf("unknown resource type: %s: %w", 
d.ResourceType, ErrUnknownResourceType) + default: + return ErrUnknownResourceType + } + + err = DeleteResource(ctx, resource, clientset) + if err != nil { + return fmt.Errorf("error deleting resource: %w", err) + } + + return nil +} + +func DeleteResource(ctx context.Context, obj runtime.Object, clientset *kubernetes.Clientset) error { //nolint:gocyclo //this is just boilerplate code + if obj == nil { + return ErrCreateNilResource + } + + switch o := obj.(type) { + case *appsv1.DaemonSet: + slog.Info("deleting DaemonSet", "name", o.Name, "namespace", o.Namespace) + client := clientset.AppsV1().DaemonSets(o.Namespace) + err := client.Delete(ctx, o.Name, metaV1.DeleteOptions{}) + if err != nil { + if errors.IsNotFound(err) { + slog.Info("resource does not exist", "kind", "DaemonSet", "name", o.Name, "namespace", o.Namespace) + return nil + } + return fmt.Errorf("failed to delete DaemonSet \"%s\" in namespace \"%s\": %w", o.Name, o.Namespace, err) + } + + case *appsv1.Deployment: + slog.Info("deleting Deployment", "name", o.Name, "namespace", o.Namespace) + client := clientset.AppsV1().Deployments(o.Namespace) + err := client.Delete(ctx, o.Name, metaV1.DeleteOptions{}) + if err != nil { + if errors.IsNotFound(err) { + slog.Info("resource does not exist", "kind", "Deployment", "name", o.Name, "namespace", o.Namespace) + return nil + } + return fmt.Errorf("failed to delete Deployment \"%s\" in namespace \"%s\": %w", o.Name, o.Namespace, err) + } + + case *appsv1.StatefulSet: + slog.Info("deleting StatefulSet", "name", o.Name, "namespace", o.Namespace) + client := clientset.AppsV1().StatefulSets(o.Namespace) + err := client.Delete(ctx, o.Name, metaV1.DeleteOptions{}) + if err != nil { + if errors.IsNotFound(err) { + slog.Info("resource does not exist", "kind", "StatefulSet", "name", o.Name, "namespace", o.Namespace) + return nil + } + return fmt.Errorf("failed to delete StatefulSet \"%s\" in namespace \"%s\": %w", o.Name, o.Namespace, err) + } + + case *v1.Service: + 
slog.Info("deleting Service", "name", o.Name, "namespace", o.Namespace) + client := clientset.CoreV1().Services(o.Namespace) + err := client.Delete(ctx, o.Name, metaV1.DeleteOptions{}) + if err != nil { + if errors.IsNotFound(err) { + slog.Info("resource does not exist", "kind", "Service", "name", o.Name, "namespace", o.Namespace) + return nil + } + return fmt.Errorf("failed to delete Service \"%s\" in namespace \"%s\": %w", o.Name, o.Namespace, err) + } + + case *v1.ServiceAccount: + slog.Info("deleting ServiceAccount", "name", o.Name, "namespace", o.Namespace) + client := clientset.CoreV1().ServiceAccounts(o.Namespace) + err := client.Delete(ctx, o.Name, metaV1.DeleteOptions{}) + if err != nil { + if errors.IsNotFound(err) { + slog.Info("resource does not exist", "kind", "ServiceAccount", "name", o.Name, "namespace", o.Namespace) + return nil + } + return fmt.Errorf("failed to delete ServiceAccount \"%s\" in namespace \"%s\": %w", o.Name, o.Namespace, err) + } + + case *rbacv1.Role: + slog.Info("deleting Role", "name", o.Name, "namespace", o.Namespace) + client := clientset.RbacV1().Roles(o.Namespace) + err := client.Delete(ctx, o.Name, metaV1.DeleteOptions{}) + if err != nil { + if errors.IsNotFound(err) { + slog.Info("resource does not exist", "kind", "Role", "name", o.Name, "namespace", o.Namespace) + return nil + } + return fmt.Errorf("failed to delete Role \"%s\" in namespace \"%s\": %w", o.Name, o.Namespace, err) + } + + case *rbacv1.RoleBinding: + slog.Info("deleting RoleBinding", "name", o.Name, "namespace", o.Namespace) + client := clientset.RbacV1().RoleBindings(o.Namespace) + err := client.Delete(ctx, o.Name, metaV1.DeleteOptions{}) + if err != nil { + if errors.IsNotFound(err) { + slog.Info("resource does not exist", "kind", "RoleBinding", "name", o.Name, "namespace", o.Namespace) + return nil + } + return fmt.Errorf("failed to delete RoleBinding \"%s\" in namespace \"%s\": %w", o.Name, o.Namespace, err) + } + + case *rbacv1.ClusterRole: + 
slog.Info("deleting ClusterRole", "name", o.Name) + client := clientset.RbacV1().ClusterRoles() + err := client.Delete(ctx, o.Name, metaV1.DeleteOptions{}) + if err != nil { + if errors.IsNotFound(err) { + slog.Info("resource does not exist", "kind", "ClusterRole", "name", o.Name, "namespace", o.Namespace) + return nil + } + return fmt.Errorf("failed to delete ClusterRole \"%s\" in namespace \"%s\": %w", o.Name, o.Namespace, err) + } + + case *rbacv1.ClusterRoleBinding: + slog.Info("deleting ClusterRoleBinding", "name", o.Name) + client := clientset.RbacV1().ClusterRoleBindings() + err := client.Delete(ctx, o.Name, metaV1.DeleteOptions{}) + if err != nil { + if errors.IsNotFound(err) { + slog.Info("resource does not exist", "kind", "ClusterRoleBinding", "name", o.Name, "namespace", o.Namespace) + return nil + } + return fmt.Errorf("failed to delete ClusterRoleBinding \"%s\" in namespace \"%s\": %w", o.Name, o.Namespace, err) + } + + case *v1.ConfigMap: + slog.Info("deleting ConfigMap", "name", o.Name, "namespace", o.Namespace) + client := clientset.CoreV1().ConfigMaps(o.Namespace) + err := client.Delete(ctx, o.Name, metaV1.DeleteOptions{}) + if err != nil { + if errors.IsNotFound(err) { + slog.Info("resource does not exist", "kind", "ConfigMap", "name", o.Name, "namespace", o.Namespace) + return nil + } + return fmt.Errorf("failed to delete ConfigMap \"%s\" in namespace \"%s\": %w", o.Name, o.Namespace, err) + } + + case *networkingv1.NetworkPolicy: + slog.Info("deleting NetworkPolicy", "name", o.Name, "namespace", o.Namespace) + client := clientset.NetworkingV1().NetworkPolicies(o.Namespace) + err := client.Delete(ctx, o.Name, metaV1.DeleteOptions{}) + if err != nil { + if errors.IsNotFound(err) { + slog.Info("resource does not exist", "kind", "NetworkPolicy", "name", o.Name, "namespace", o.Namespace) + return nil + } + return fmt.Errorf("failed to delete NetworkPolicy \"%s\" in namespace \"%s\": %w", o.Name, o.Namespace, err) + } + + case *v1.Secret: + 
slog.Info("deleting Secret", "name", o.Name, "namespace", o.Namespace) + client := clientset.CoreV1().Secrets(o.Namespace) + err := client.Delete(ctx, o.Name, metaV1.DeleteOptions{}) + if err != nil { + if errors.IsNotFound(err) { + slog.Info("resource does not exist", "kind", "Secret", "name", o.Name, "namespace", o.Namespace) + return nil + } + return fmt.Errorf("failed to delete Secret \"%s\" in namespace \"%s\": %w", o.Name, o.Namespace, err) + } + + default: + return fmt.Errorf("unknown object type: %T, err: %w", obj, ErrUnknownResourceType) + } + return nil +} diff --git a/test/e2ev3/pkg/kubernetes/exec-pod.go b/test/e2ev3/pkg/kubernetes/exec-pod.go new file mode 100644 index 0000000000..4b41450929 --- /dev/null +++ b/test/e2ev3/pkg/kubernetes/exec-pod.go @@ -0,0 +1,94 @@ +package kubernetes + +import ( + "bytes" + "context" + "fmt" + "log/slog" + "os" + + "github.com/microsoft/retina/test/e2ev3/pkg/utils" + "strings" + "time" + + v1 "k8s.io/api/core/v1" + "k8s.io/client-go/kubernetes" + "k8s.io/client-go/rest" + "k8s.io/client-go/tools/remotecommand" + "k8s.io/client-go/util/retry" + "k8s.io/kubectl/pkg/scheme" +) + +const ExecSubResources = "exec" + +type ExecInPod struct { + PodNamespace string + RestConfig *rest.Config + PodName string + Command string +} + +func (e *ExecInPod) String() string { return "exec-in-pod" } + +func (e *ExecInPod) Do(ctx context.Context) error { + ctx, log := utils.StepLogger(ctx, e) + ctx, cancel := context.WithTimeout(ctx, 5*time.Minute) + defer cancel() + + clientset, err := kubernetes.NewForConfig(e.RestConfig) + if err != nil { + return fmt.Errorf("error creating Kubernetes client: %w", err) + } + + err = retry.OnError(retry.DefaultRetry, func(err error) bool { + // Retry on every error + return true + }, func() error { + _, execErr := ExecPod(ctx, clientset, e.RestConfig, e.PodNamespace, e.PodName, e.Command) + if execErr != nil { + log.Error("executing command, retrying", "command", e.Command, "error", execErr) + } + 
return execErr + }) + if err != nil { + return fmt.Errorf("error executing command, all retries exhausted [%s]: %w", e.Command, err) + } + + return nil +} + +func ExecPod(ctx context.Context, clientset *kubernetes.Clientset, config *rest.Config, namespace, podName, command string) ([]byte, error) { + slog.Info("executing command", "command", command, "pod", podName, "namespace", namespace) + req := clientset.CoreV1().RESTClient().Post().Resource("pods").Name(podName). + Namespace(namespace).SubResource(ExecSubResources) + option := &v1.PodExecOptions{ + Command: strings.Fields(command), + Stdin: true, + Stdout: true, + Stderr: true, + TTY: false, + } + + req.VersionedParams( + option, + scheme.ParameterCodec, + ) + + var buf bytes.Buffer + exec, err := remotecommand.NewSPDYExecutor(config, "POST", req.URL()) + if err != nil { + return buf.Bytes(), fmt.Errorf("error creating executor: %w", err) + } + + err = exec.StreamWithContext(ctx, remotecommand.StreamOptions{ + Stdin: os.Stdin, + Stdout: &buf, + Stderr: &buf, + }) + if err != nil { + return buf.Bytes(), fmt.Errorf("error executing command: %w", err) + } + + res := buf.Bytes() + return res, nil +} diff --git a/test/e2ev3/pkg/kubernetes/get-external-crd.go b/test/e2ev3/pkg/kubernetes/get-external-crd.go new file mode 100644 index 0000000000..0b02e7bddd --- /dev/null +++ b/test/e2ev3/pkg/kubernetes/get-external-crd.go @@ -0,0 +1,63 @@ +package kubernetes + +import ( + "fmt" + "io" + "log/slog" + "net/http" + "net/url" + "os" + "path" + "path/filepath" +) + +func downloadExternalCRDs(chartPath string) error { + crdUrls := []string{ + "https://raw.githubusercontent.com/prometheus-operator/prometheus-operator/main/example/prometheus-operator-crd/monitoring.coreos.com_servicemonitors.yaml", + } + + for _, crdUrl := range crdUrls { + crd, err := fetchYAML(crdUrl) + if err != nil { + return err + } + + crdName, err := extractFileName(crdUrl) + if err != nil { + return err + } + + slog.Info("CRD exists", "name", crdName) 
+ slog.Info("writing CRD file", "path", filepath.Join(chartPath, "/crds/"+crdName)) + err = saveToFile(filepath.Join(chartPath, "/crds/"+crdName), crd) + if err != nil { + return err + } + } + return nil +} + +func fetchYAML(url string) ([]byte, error) { + resp, err := http.Get(url) + if err != nil { + return nil, fmt.Errorf("failed to get crd source code from %s: %w", url, err) + } + defer resp.Body.Close() + return io.ReadAll(resp.Body) +} + +func extractFileName(rawURL string) (string, error) { + parsedURL, err := url.Parse(rawURL) + if err != nil { + return "", fmt.Errorf("failed to parse url: %w", err) + } + return path.Base(parsedURL.Path), nil +} + +func saveToFile(filename string, data []byte) error { + err := os.WriteFile(filename, data, 0644) + if err != nil { + return fmt.Errorf("failed to write crd.yaml to /crds dir : %w", err) + } + return nil +} diff --git a/test/e2ev3/pkg/kubernetes/get-logs.go b/test/e2ev3/pkg/kubernetes/get-logs.go new file mode 100644 index 0000000000..dba0fb4eed --- /dev/null +++ b/test/e2ev3/pkg/kubernetes/get-logs.go @@ -0,0 +1,70 @@ +package kubernetes + +import ( + "context" + "io" + "log/slog" + + "github.com/microsoft/retina/test/e2ev3/pkg/utils" + + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/client-go/kubernetes" + "k8s.io/client-go/rest" +) + +type GetPodLogs struct { + RestConfig *rest.Config + Namespace string + LabelSelector string +} + +func (p *GetPodLogs) Do(ctx context.Context) error { + ctx, log := utils.StepLogger(ctx, p) + + log.Info("printing pod logs", "namespace", p.Namespace, "labelselector", p.LabelSelector) + + clientset, err := kubernetes.NewForConfig(p.RestConfig) + if err != nil { + log.Error("error creating clientset", "error", err) + } + + PrintPodLogs(ctx, clientset, p.Namespace, p.LabelSelector) + + return nil +} + +func PrintPodLogs(ctx context.Context, clientset *kubernetes.Clientset, namespace, labelSelector string) { + log := slog.Default() + // List 
all the pods in the namespace + pods, err := clientset.CoreV1().Pods(namespace).List(ctx, metav1.ListOptions{ + LabelSelector: labelSelector, + }) + if err != nil { + log.Error("error listing pods", "error", err) + } + + // Iterate over the pods and get the logs for each pod + for i := range pods.Items { + pod := pods.Items[i] + log.Info("pod logs", "pod", pod.Name) + + // Get the logs for the pod + req := clientset.CoreV1().Pods(namespace).GetLogs(pod.Name, &corev1.PodLogOptions{}) + podLogs, err := req.Stream(ctx) + if err != nil { + log.Error("error getting logs for pod", "pod", pod.Name, "error", err) + } + + // Read the logs + buf, err := io.ReadAll(podLogs) + if err != nil { + log.Error("error reading logs for pod", "pod", pod.Name, "error", err) + } + + podLogs.Close() + + // Print the logs + log.Info("pod log output", "pod", pod.Name, "output", string(buf)) + } +} diff --git a/test/e2ev3/pkg/kubernetes/get-pod-ip.go b/test/e2ev3/pkg/kubernetes/get-pod-ip.go new file mode 100644 index 0000000000..905734b59a --- /dev/null +++ b/test/e2ev3/pkg/kubernetes/get-pod-ip.go @@ -0,0 +1,26 @@ +package kubernetes + +import ( + "context" + + "github.com/pkg/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/client-go/kubernetes" + "k8s.io/client-go/rest" +) + +func GetPodIP(ctx context.Context, restConfig *rest.Config, namespace, podName string) (string, error) { + clientset, err := kubernetes.NewForConfig(restConfig) + if err != nil { + return "", errors.Wrapf(err, "error creating Kubernetes clientset") + } + + pod, err := clientset.CoreV1().Pods(namespace).Get(ctx, podName, metav1.GetOptions{}) + if err != nil { + return "", errors.Wrapf(err, "error getting pod %s in namespace %s", podName, namespace) + } + if pod.Status.PodIP == "" { + return "", errors.Errorf("pod %s in namespace %s has no IP", podName, namespace) + } + return pod.Status.PodIP, nil +} diff --git a/test/e2ev3/pkg/kubernetes/install-hubble-helm.go 
b/test/e2ev3/pkg/kubernetes/install-hubble-helm.go new file mode 100644 index 0000000000..3d2fa0e16c --- /dev/null +++ b/test/e2ev3/pkg/kubernetes/install-hubble-helm.go @@ -0,0 +1,169 @@ +package kubernetes + +import ( + "context" + "fmt" + "strings" + "sync" + "time" + + e2ecfg "github.com/microsoft/retina/test/e2ev3/config" + "github.com/microsoft/retina/test/e2ev3/pkg/utils" + "helm.sh/helm/v3/pkg/action" + "helm.sh/helm/v3/pkg/chart/loader" + "helm.sh/helm/v3/pkg/cli" + "k8s.io/client-go/kubernetes" + "k8s.io/client-go/tools/clientcmd" +) + +const ( + HubbleNamespace = "kube-system" + HubbleUIApp = "hubble-ui" + HubbleRelayApp = "hubble-relay" +) + +type InstallHubbleHelmChart struct { + Namespace string + ReleaseName string + KubeConfigFilePath string + ChartPath string + ImageTag string + ImageRegistry string + ImageNamespace string + HelmDriver string + ImageLoader e2ecfg.ClusterProvider +} + +func (v *InstallHubbleHelmChart) Do(ctx context.Context) error { + ctx, log := utils.StepLogger(ctx, v) + ctx, cancel := context.WithTimeout(ctx, defaultTimeoutSeconds*time.Second) + defer cancel() + + settings := cli.New() + settings.KubeConfig = v.KubeConfigFilePath + actionConfig := new(action.Configuration) + + err := actionConfig.Init(settings.RESTClientGetter(), v.Namespace, v.HelmDriver, func(format string, a ...any) { log.Info(fmt.Sprintf(format, a...)) }) + if err != nil { + return fmt.Errorf("failed to initialize helm action config: %w", err) + } + + // Creating extra namespace to deploy test pods + rc, err := clientcmd.BuildConfigFromFlags("", v.KubeConfigFilePath) + if err != nil { + return fmt.Errorf("failed to build rest config: %w", err) + } + err = CreateNamespaceFn(ctx, rc, e2ecfg.TestPodNamespace) + if err != nil { + return fmt.Errorf("failed to create namespace %s: %w", v.Namespace, err) + } + + tag := v.ImageTag + if tag == "" { + return fmt.Errorf("tag is not set: %w", errEmpty) + } + imageRegistry := v.ImageRegistry + if imageRegistry == "" { + 
return fmt.Errorf("image registry is not set: %w", errEmpty) + } + + imageNamespace := v.ImageNamespace + if imageNamespace == "" { + return fmt.Errorf("image namespace is not set: %w", errEmpty) + } + + // load chart from the path + chart, err := loader.Load(v.ChartPath) + if err != nil { + return fmt.Errorf("failed to load chart from path %s: %w", v.ChartPath, err) + } + + if secrets := v.ImageLoader.ImagePullSecrets(); len(secrets) > 0 { + chart.Values["imagePullSecrets"] = secrets + } + pullPolicy := v.ImageLoader.ImagePullPolicy() + + chart.Values["operator"].(map[string]interface{})["enabled"] = true + chart.Values["operator"].(map[string]interface{})["repository"] = imageRegistry + "/" + imageNamespace + "/retina-operator" + chart.Values["operator"].(map[string]interface{})["tag"] = tag + chart.Values["operator"].(map[string]interface{})["pullPolicy"] = pullPolicy + chart.Values["agent"].(map[string]interface{})["enabled"] = true + chart.Values["agent"].(map[string]interface{})["repository"] = imageRegistry + "/" + imageNamespace + "/retina-agent" + chart.Values["agent"].(map[string]interface{})["tag"] = tag + chart.Values["agent"].(map[string]interface{})["pullPolicy"] = pullPolicy + chart.Values["agent"].(map[string]interface{})["init"].(map[string]interface{})["enabled"] = true + chart.Values["agent"].(map[string]interface{})["init"].(map[string]interface{})["repository"] = imageRegistry + "/" + imageNamespace + "/retina-init" + chart.Values["agent"].(map[string]interface{})["init"].(map[string]interface{})["tag"] = tag + chart.Values["hubble"].(map[string]interface{})["tls"].(map[string]interface{})["enabled"] = false + chart.Values["hubble"].(map[string]interface{})["relay"].(map[string]interface{})["tls"].(map[string]interface{})["server"].(map[string]interface{})["enabled"] = false + chart.Values["hubble"].(map[string]interface{})["tls"].(map[string]interface{})["auto"].(map[string]interface{})["enabled"] = false + + getclient := 
action.NewGet(actionConfig) + release, err := getclient.Run(v.ReleaseName) + if err == nil && release != nil { + log.Info("found existing release, removing before installing", "release", release.Name) + delclient := action.NewUninstall(actionConfig) + delclient.Wait = true + delclient.Timeout = deleteTimeout + _, err = delclient.Run(v.ReleaseName) + if err != nil { + return fmt.Errorf("failed to delete existing release %s: %w", v.ReleaseName, err) + } + } else if err != nil && !strings.Contains(err.Error(), "not found") { + return fmt.Errorf("failed to get release %s: %w", v.ReleaseName, err) + } + + client := action.NewInstall(actionConfig) + client.Namespace = v.Namespace + client.ReleaseName = v.ReleaseName + client.Timeout = createTimeout + client.Wait = true + client.WaitForJobs = true + + // install the chart here + rel, err := client.RunWithContext(ctx, chart, chart.Values) + if err != nil { + return fmt.Errorf("failed to install chart: %w", err) + } + + log.Info("installed chart", "release", rel.Name, "namespace", rel.Namespace) + // this will confirm the values set during installation + log.Info("chart values", "config", rel.Config) + + // ensure all pods are running, since helm doesn't care about windows + config, err := clientcmd.BuildConfigFromFlags("", v.KubeConfigFilePath) + if err != nil { + return fmt.Errorf("error building kubeconfig: %w", err) + } + + clientset, err := kubernetes.NewForConfig(config) + if err != nil { + return fmt.Errorf("error creating Kubernetes client: %w", err) + } + + // Validate Hubble Relay and UI pods in parallel. 
+ var relayErr, uiErr error + var wg sync.WaitGroup + wg.Add(2) + go func() { + defer wg.Done() + relayErr = WaitForPodReady(ctx, clientset, HubbleNamespace, "k8s-app="+HubbleRelayApp) + }() + go func() { + defer wg.Done() + uiErr = WaitForPodReady(ctx, clientset, HubbleNamespace, "k8s-app="+HubbleUIApp) + }() + wg.Wait() + + if relayErr != nil { + return fmt.Errorf("error waiting for Hubble Relay pods to be ready: %w", relayErr) + } + log.Info("Hubble Relay Pod is ready") + + if uiErr != nil { + return fmt.Errorf("error waiting for Hubble UI pods to be ready: %w", uiErr) + } + log.Info("Hubble UI Pod is ready") + + return nil +} diff --git a/test/e2ev3/pkg/kubernetes/install-retina-helm.go b/test/e2ev3/pkg/kubernetes/install-retina-helm.go new file mode 100644 index 0000000000..7849d44904 --- /dev/null +++ b/test/e2ev3/pkg/kubernetes/install-retina-helm.go @@ -0,0 +1,170 @@ +package kubernetes + +import ( + "context" + "fmt" + "os" + "strings" + "time" + + e2ecfg "github.com/microsoft/retina/test/e2ev3/config" + "github.com/microsoft/retina/test/e2ev3/pkg/utils" + "helm.sh/helm/v3/pkg/action" + "helm.sh/helm/v3/pkg/chart/loader" + "helm.sh/helm/v3/pkg/cli" + "k8s.io/client-go/kubernetes" + "k8s.io/client-go/tools/clientcmd" +) + +const ( + createTimeout = 20 * time.Minute // windows is slow + deleteTimeout = 5 * time.Minute +) + +var ( + errEmpty = fmt.Errorf("is empty") + errDirectoryNotFound = fmt.Errorf("directory not found") +) + +type InstallHelmChart struct { + Namespace string + ReleaseName string + KubeConfigFilePath string + ChartPath string + ImageTag string + ImageRegistry string + ImageNamespace string + HelmDriver string + ImageLoader e2ecfg.ClusterProvider + EnableHeartbeat bool +} + +func (i *InstallHelmChart) Do(ctx context.Context) error { + ctx, log := utils.StepLogger(ctx, i) + // Prevalidation: check chart path and tag env + _, err := os.Stat(i.ChartPath) + if os.IsNotExist(err) { + cwd, err := os.Getwd() + if err != nil { + return 
fmt.Errorf("failed to get current working directory %s: %w", cwd, err) + } + log.Info("current working directory", "cwd", cwd) + return fmt.Errorf("directory not found at %s: working directory: %s: %w", i.ChartPath, cwd, errDirectoryNotFound) + } + log.Info("found chart", "path", i.ChartPath) + + if i.ImageTag == "" { + return fmt.Errorf("image tag is not set: %w", errEmpty) + } + if i.ImageRegistry == "" { + return fmt.Errorf("image registry is not set: %w", errEmpty) + } + if i.ImageNamespace == "" { + return fmt.Errorf("image namespace is not set: %w", errEmpty) + } + + tag := i.ImageTag + imageRegistry := i.ImageRegistry + imageNamespace := i.ImageNamespace + + ctx, cancel := context.WithTimeout(ctx, createTimeout) + defer cancel() + settings := cli.New() + settings.KubeConfig = i.KubeConfigFilePath + actionConfig := new(action.Configuration) + + err = actionConfig.Init(settings.RESTClientGetter(), i.Namespace, i.HelmDriver, func(format string, v ...any) { log.Info(fmt.Sprintf(format, v...)) }) + if err != nil { + return fmt.Errorf("failed to initialize helm action config: %w", err) + } + + // Creating extra namespace to deploy test pods + rc, err := clientcmd.BuildConfigFromFlags("", i.KubeConfigFilePath) + if err != nil { + return fmt.Errorf("failed to build rest config: %w", err) + } + err = CreateNamespaceFn(ctx, rc, e2ecfg.TestPodNamespace) + if err != nil { + return fmt.Errorf("failed to create namespace %s: %w", i.Namespace, err) + } + + //Download necessary CRD's + err = downloadExternalCRDs(i.ChartPath) + if err != nil { + return fmt.Errorf("failed to load external crd's: %w", err) + } + + // load chart from the path + chart, err := loader.Load(i.ChartPath) + if err != nil { + return fmt.Errorf("failed to load chart from path %s: %w", i.ChartPath, err) + } + + if secrets := i.ImageLoader.ImagePullSecrets(); len(secrets) > 0 { + chart.Values["imagePullSecrets"] = secrets + } + + if i.EnableHeartbeat { + chart.Values["enableTelemetry"] = 
i.EnableHeartbeat + chart.Values["logLevel"] = "error" + } + + chart.Values["image"].(map[string]interface{})["tag"] = tag + chart.Values["image"].(map[string]interface{})["pullPolicy"] = i.ImageLoader.ImagePullPolicy() + chart.Values["operator"].(map[string]interface{})["tag"] = tag + chart.Values["image"].(map[string]interface{})["repository"] = imageRegistry + "/" + imageNamespace + "/retina-agent" + chart.Values["image"].(map[string]interface{})["initRepository"] = imageRegistry + "/" + imageNamespace + "/retina-init" + chart.Values["operator"].(map[string]interface{})["repository"] = imageRegistry + "/" + imageNamespace + "/retina-operator" + chart.Values["operator"].(map[string]interface{})["enabled"] = true + + getclient := action.NewGet(actionConfig) + release, err := getclient.Run(i.ReleaseName) + if err == nil && release != nil { + log.Info("found existing release, removing before installing", "release", release.Name) + delclient := action.NewUninstall(actionConfig) + delclient.Wait = true + delclient.Timeout = deleteTimeout + _, err = delclient.Run(i.ReleaseName) + if err != nil { + return fmt.Errorf("failed to delete existing release %s: %w", i.ReleaseName, err) + } + } else if err != nil && !strings.Contains(err.Error(), "not found") { + return fmt.Errorf("failed to get release %s: %w", i.ReleaseName, err) + } + + client := action.NewInstall(actionConfig) + client.Namespace = i.Namespace + client.ReleaseName = i.ReleaseName + client.Timeout = createTimeout + client.Wait = true + client.WaitForJobs = true + + // install the chart here + rel, err := client.RunWithContext(ctx, chart, chart.Values) + if err != nil { + return fmt.Errorf("failed to install chart: %w", err) + } + + log.Info("installed chart", "release", rel.Name, "namespace", rel.Namespace) + // this will confirm the values set during installation + log.Info("chart values", "config", rel.Config) + + // ensure all pods are running, since helm doesn't care about windows + config, err := 
clientcmd.BuildConfigFromFlags("", i.KubeConfigFilePath) + if err != nil { + return fmt.Errorf("error building kubeconfig: %w", err) + } + + clientset, err := kubernetes.NewForConfig(config) + if err != nil { + return fmt.Errorf("error creating Kubernetes client: %w", err) + } + + labelSelector := "k8s-app=retina" + err = WaitForPodReady(ctx, clientset, "kube-system", labelSelector) + if err != nil { + return fmt.Errorf("error waiting for retina pods to be ready: %w", err) + } + + return nil +} diff --git a/test/e2ev3/pkg/kubernetes/label-nodes.go b/test/e2ev3/pkg/kubernetes/label-nodes.go new file mode 100644 index 0000000000..c96b71371f --- /dev/null +++ b/test/e2ev3/pkg/kubernetes/label-nodes.go @@ -0,0 +1,79 @@ +package kubernetes + +import ( + "context" + "encoding/json" + "fmt" + + "github.com/microsoft/retina/test/e2ev3/pkg/utils" + retry "github.com/microsoft/retina/test/retry" + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/types" + "k8s.io/client-go/kubernetes" + "k8s.io/client-go/rest" +) + +type patchStringValue struct { + Op string `json:"op"` + Path string `json:"path"` + Value string `json:"value"` +} + +type LabelNodes struct { + RestConfig *rest.Config + Labels map[string]string +} + +func (l *LabelNodes) String() string { return "label-nodes" } + +func (l *LabelNodes) Do(ctx context.Context) error { + ctx, log := utils.StepLogger(ctx, l) + clientset, err := kubernetes.NewForConfig(l.RestConfig) + if err != nil { + return fmt.Errorf("error creating Kubernetes client: %w", err) + } + + var nodes *corev1.NodeList + + retrier := retry.Retrier{Attempts: defaultRetryAttempts, Delay: defaultRetryDelay} + err = retrier.Do(ctx, func() error { + nodes, err = clientset.CoreV1().Nodes().List(ctx, metav1.ListOptions{}) + if err != nil { + return fmt.Errorf("failed to get nodes: %w", err) + } + return nil + }) + if err != nil { + return fmt.Errorf("retrier failed: %w", err) + } + + patch := 
[]patchStringValue{} + for k, v := range l.Labels { + patch = append(patch, patchStringValue{ + Op: "add", + Path: "/metadata/labels/" + k, + Value: v, + }) + } + b, err := json.Marshal(patch) + if err != nil { + return fmt.Errorf("failed to marshal patch: %w", err) + } + + for i := range nodes.Items { + log.Info("labeling node", "node", nodes.Items[i].Name) + err = retrier.Do(ctx, func() error { + _, err = clientset.CoreV1().Nodes().Patch(ctx, nodes.Items[i].Name, types.JSONPatchType, b, metav1.PatchOptions{}) + if err != nil { + return fmt.Errorf("failed to patch pod: %w", err) + } + return nil + }) + if err != nil { + return fmt.Errorf("retrier failed: %w", err) + } + } + + return nil +} diff --git a/test/e2ev3/pkg/kubernetes/no-crashes.go b/test/e2ev3/pkg/kubernetes/no-crashes.go new file mode 100644 index 0000000000..5cb9847639 --- /dev/null +++ b/test/e2ev3/pkg/kubernetes/no-crashes.go @@ -0,0 +1,42 @@ +package kubernetes + +import ( + "context" + "fmt" + + "k8s.io/client-go/kubernetes" + "k8s.io/client-go/rest" +) + +var ErrPodCrashed = fmt.Errorf("pod has crashes") + +type EnsureStableComponent struct { + LabelSelector string + PodNamespace string + RestConfig *rest.Config + + // Container restarts can occur for various reason, they do not necessarily mean the entire cluster + // is unstable or needs to be recreated. In some cases, container restarts are expected and acceptable. + // This flag should be set to true only in those cases and provide additional why restart restarts are acceptable. 
+ IgnoreContainerRestart bool +} + +func (n *EnsureStableComponent) Do(ctx context.Context) error { + clientset, err := kubernetes.NewForConfig(n.RestConfig) + if err != nil { + return fmt.Errorf("error creating Kubernetes client: %w", err) + } + + err = WaitForPodReady(ctx, clientset, n.PodNamespace, n.LabelSelector) + if err != nil { + return fmt.Errorf("error waiting for retina pods to be ready: %w", err) + } + + if !n.IgnoreContainerRestart { + err = CheckContainerRestart(ctx, clientset, n.PodNamespace, n.LabelSelector) + if err != nil { + return fmt.Errorf("error checking pod restarts: %w", err) + } + } + return nil +} diff --git a/test/e2ev3/pkg/kubernetes/port-forward.go b/test/e2ev3/pkg/kubernetes/port-forward.go new file mode 100644 index 0000000000..ffd83d7377 --- /dev/null +++ b/test/e2ev3/pkg/kubernetes/port-forward.go @@ -0,0 +1,174 @@ +// todo: matmerr, this is just going to remain broken until it can be validated with scenarios pr + +package kubernetes + +import ( + "context" + "fmt" + "log/slog" + "net/http" + "strconv" + "time" + + "github.com/microsoft/retina/test/e2ev3/pkg/utils" + + retry "github.com/microsoft/retina/test/retry" + v1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/client-go/kubernetes" + "k8s.io/client-go/rest" +) + +const ( + defaultTimeoutSeconds = 300 + defaultRetryDelay = 500 * time.Millisecond + defaultRetryAttempts = 60 + defaultHTTPClientTimeout = 2 * time.Second +) + +var ( + ErrNoPodWithLabelFound = fmt.Errorf("no pod with label found with matching pod affinity") + + defaultRetrier = retry.Retrier{Attempts: defaultRetryAttempts, Delay: defaultRetryDelay, ExpBackoff: true} +) + +type PortForward struct { + Namespace string + LabelSelector string + LocalPort string + RemotePort string + Endpoint string + RestConfig *rest.Config + OptionalLabelAffinity string + + // local properties + pf *PortForwarder +} + +func (p *PortForward) String() string { return "port-forward" } + +func (p 
*PortForward) Do(ctx context.Context) error { + ctx, log := utils.StepLogger(ctx, p) + lport, _ := strconv.Atoi(p.LocalPort) + rport, _ := strconv.Atoi(p.RemotePort) + + portForwardCtx, cancel := context.WithTimeout(ctx, defaultTimeoutSeconds*time.Second) + defer cancel() + + clientset, err := kubernetes.NewForConfig(p.RestConfig) + if err != nil { + return fmt.Errorf("could not create clientset: %w", err) + } + + // if we have an optional label affinity, find a pod with that label, on the same node as a pod with the label selector + targetPodName := "" + if p.OptionalLabelAffinity != "" { + // get all pods with label + log.Info("finding pod with affinity", "label", p.LabelSelector, "affinityLabel", p.OptionalLabelAffinity) + targetPodName, err = p.findPodsWithAffinity(ctx, clientset) + if err != nil { + return fmt.Errorf("could not find pod with affinity: %w", err) + } + } + + portForwardFn := func() error { + // if we have a pod name (likely from affinity above), use it, otherwise use label selector + opts := PortForwardingOpts{ + Namespace: p.Namespace, + PodName: targetPodName, + LocalPort: lport, + DestPort: rport, + } + + if targetPodName != "" { + opts.PodName = targetPodName + } + + log.Info("attempting port forward", "pod", targetPodName, "label", p.LabelSelector, "namespace", p.Namespace) + + p.pf, err = NewPortForwarder(p.RestConfig, logger{}, opts) + if err != nil { + return fmt.Errorf("could not create port forwarder: %w", err) + } + err = p.pf.Forward(ctx) + if err != nil { + return fmt.Errorf("could not start port forward: %w", err) + } + + // verify port forward succeeded + client := http.Client{ + Timeout: defaultHTTPClientTimeout, + } + resp, err := client.Get(p.pf.Address() + "/" + p.Endpoint) //nolint + if err != nil { + log.Error("port forward validation failed", "address", p.pf.Address(), "error", err) + p.pf.Stop() + return fmt.Errorf("port forward validation HTTP request to %s failed: %w", p.pf.Address(), err) + } + defer resp.Body.Close() + 
+ log.Info("port forward validation succeeded", "address", p.pf.Address(), "status", resp.Status) + + return nil + } + + if err = defaultRetrier.Do(portForwardCtx, portForwardFn); err != nil { + return fmt.Errorf("could not start port forward within %ds: %w", defaultTimeoutSeconds, err) + } + log.Info("successfully port forwarded", "address", p.pf.Address()) + return nil +} + +func (p *PortForward) findPodsWithAffinity(ctx context.Context, clientset *kubernetes.Clientset) (string, error) { + targetPodsAll, errAffinity := clientset.CoreV1().Pods(p.Namespace).List(ctx, metav1.ListOptions{ + LabelSelector: p.LabelSelector, + FieldSelector: "status.phase=Running", + }) + if errAffinity != nil { + return "", fmt.Errorf("could not list pods in %q with label %q: %w", p.Namespace, p.LabelSelector, errAffinity) + } + + // omit windows pods because we can't port-forward to them + targetPodsLinux := make([]v1.Pod, 0) + for i := range targetPodsAll.Items { + if targetPodsAll.Items[i].Spec.NodeSelector["kubernetes.io/os"] != "windows" { + targetPodsLinux = append(targetPodsLinux, targetPodsAll.Items[i]) + } + } + + // get all pods with optional label affinity + affinityPods, errAffinity := clientset.CoreV1().Pods(metav1.NamespaceAll).List(ctx, metav1.ListOptions{ + LabelSelector: p.OptionalLabelAffinity, + FieldSelector: "status.phase=Running", + }) + if errAffinity != nil { + return "", fmt.Errorf("could not list affinity pods across all namespaces with label %q: %w", p.OptionalLabelAffinity, errAffinity) + } + + // keep track of where the affinity pods are scheduled + affinityNodes := make(map[string]bool) + for i := range affinityPods.Items { + affinityNodes[affinityPods.Items[i].Spec.NodeName] = true + } + + // if a pod is found on the same node as an affinity pod, use it + for i := range targetPodsLinux { + if affinityNodes[targetPodsLinux[i].Spec.NodeName] { + // found a pod with the specified label, on a node with the optional label affinity + return 
targetPodsLinux[i].Name, nil + } + } + + return "", fmt.Errorf("could not find a pod with label \"%s\", on a node that also has a pod with label \"%s\": %w", p.LabelSelector, p.OptionalLabelAffinity, ErrNoPodWithLabelFound) +} + +func (p *PortForward) Stop() error { + p.pf.Stop() + return nil +} + +type logger struct{} + +func (l *logger) Logf(format string, args ...interface{}) { + slog.Info(fmt.Sprintf(format, args...)) +} diff --git a/test/e2ev3/pkg/kubernetes/portforward.go b/test/e2ev3/pkg/kubernetes/portforward.go new file mode 100644 index 0000000000..3a8b3f1a2a --- /dev/null +++ b/test/e2ev3/pkg/kubernetes/portforward.go @@ -0,0 +1,199 @@ +package kubernetes + +import ( + "context" + "fmt" + "io" + "net/http" + "sort" + "sync" + "time" + + "github.com/pkg/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/client-go/kubernetes" + "k8s.io/client-go/rest" + "k8s.io/client-go/tools/portforward" + "k8s.io/client-go/transport/spdy" +) + +// PortForwarder can manage a port forwarding session. +type PortForwarder struct { + clientset *kubernetes.Clientset + transport http.RoundTripper + upgrader spdy.Upgrader + logger logger + + opts PortForwardingOpts + + stopChan chan struct{} + errChan chan error + address string + lazyAddress sync.Once +} + +type PortForwardingOpts struct { + Namespace string + LabelSelector string + PodName string + LocalPort int + DestPort int +} + +// NewPortForwarder creates a PortForwarder. 
+func NewPortForwarder(restConfig *rest.Config, logger logger, opts PortForwardingOpts) (*PortForwarder, error) { + clientset, err := kubernetes.NewForConfig(restConfig) + if err != nil { + return nil, fmt.Errorf("could not create clientset: %w", err) + } + + transport, upgrader, err := spdy.RoundTripperFor(restConfig) + if err != nil { + return nil, fmt.Errorf("could not create spdy roundtripper: %w", err) + } + + return &PortForwarder{ + clientset: clientset, + transport: transport, + upgrader: upgrader, + logger: logger, + opts: opts, + stopChan: make(chan struct{}, 1), + }, nil +} + +// todo: can be made more flexible to allow a service to be specified + +// Forward attempts to initiate port forwarding a pod and port using the configured namespace and labels. +// An error is returned if a port forwarding session could not be started. If no error is returned, the +// Address method can be used to communicate with the pod, and the Stop and KeepAlive methods can be used +// to manage the lifetime of the port forwarding session. + +func (p *PortForwarder) Forward(ctx context.Context) error { + var podName string + if p.opts.PodName == "" { + pods, err := p.clientset.CoreV1().Pods(p.opts.Namespace).List(ctx, metav1.ListOptions{LabelSelector: p.opts.LabelSelector, FieldSelector: "status.phase=Running"}) + if err != nil { + return fmt.Errorf("could not list pods in %q with label %q: %w", p.opts.Namespace, p.opts.LabelSelector, err) + } + + if len(pods.Items) < 1 { + return fmt.Errorf("no pods found in %q with label %q", p.opts.Namespace, p.opts.LabelSelector) //nolint:goerr113 //no specific handling expected + } + + // Deterministic selection: sort by name and pick the first pod. 
+ sort.Slice(pods.Items, func(i, j int) bool { + return pods.Items[i].Name < pods.Items[j].Name + }) + podName = pods.Items[0].Name + } else { + podName = p.opts.PodName + } + + pods, err := p.clientset.CoreV1().Pods(p.opts.Namespace).List(ctx, metav1.ListOptions{LabelSelector: p.opts.LabelSelector, FieldSelector: "status.phase=Running"}) + if err != nil { + return fmt.Errorf("could not list pods in %q with label %q: %w", p.opts.Namespace, p.opts.LabelSelector, err) + } + + if len(pods.Items) < 1 { + return fmt.Errorf("no pods found in %q with label %q", p.opts.Namespace, p.opts.LabelSelector) //nolint:goerr113 //no specific handling expected + } + + portForwardURL := p.clientset.CoreV1().RESTClient().Post(). + Resource("pods"). + Namespace(p.opts.Namespace). + Name(podName). + SubResource("portforward").URL() + + readyChan := make(chan struct{}, 1) + dialer := spdy.NewDialer(p.upgrader, &http.Client{Transport: p.transport}, http.MethodPost, portForwardURL) + ports := []string{fmt.Sprintf("%d:%d", p.opts.LocalPort, p.opts.DestPort)} + pf, err := portforward.New(dialer, ports, p.stopChan, readyChan, io.Discard, io.Discard) + if err != nil { + return fmt.Errorf("could not create portforwarder: %w", err) + } + + errChan := make(chan error, 1) + go func() { + // ForwardPorts is a blocking function thus it has to be invoked in a goroutine to allow callers to do + // other things, but it can return 2 kinds of errors: initial dial errors that will be caught in the select + // block below (Ready should not fire in these cases) and later errors if the connection is dropped. + // this is why we propagate the error channel to PortForwardStreamHandle: to allow callers to handle + // cases of eventual errors. 
+ errChan <- pf.ForwardPorts() + }() + + var portForwardPort int + select { + case <-ctx.Done(): + return fmt.Errorf("portforward cancelled: %w", ctx.Err()) + case err := <-errChan: + return fmt.Errorf("portforward failed: %w", err) + case <-pf.Ready: + prts, err := pf.GetPorts() + if err != nil { + return fmt.Errorf("get portforward port: %w", err) + } + + if len(prts) < 1 { + return errors.New("no ports forwarded") + } + + portForwardPort = int(prts[0].Local) + } + + // once successful, any subsequent port forwarding sessions from keep alive would yield the same address. + // since the address could be read at the same time as the session is renewed, it's appropriate to initialize + // lazily. + p.lazyAddress.Do(func() { + p.address = fmt.Sprintf("http://localhost:%d", portForwardPort) + }) + + p.errChan = errChan + + return nil +} + +// Address returns an address for communicating with a port-forwarded pod. +func (p *PortForwarder) Address() string { + return p.address +} + +// Stop terminates a port forwarding session. +func (p *PortForwarder) Stop() { + select { + case p.stopChan <- struct{}{}: + default: + } +} + +// KeepAlive can be used to restart the port forwarding session in the background. +func (p *PortForwarder) KeepAlive(ctx context.Context) { + for { + select { + case <-ctx.Done(): + p.logger.Logf("port forwarder: keep alive cancelled: %v", ctx.Err()) + return + case pfErr := <-p.errChan: + // as of client-go v0.26.1, if the connection is successful at first but then fails, + // an error is logged but only a nil error is sent to this channel. this will be fixed + // in v0.27.x, which at the time of writing has not been released. + // + // see https://github.com/kubernetes/client-go/commit/d0842249d3b92ea67c446fe273f84fe74ebaed9f + // for the relevant change. + p.logger.Logf("port forwarder: received error signal: %v. 
restarting session", pfErr) + p.Stop() + if err := p.Forward(ctx); err != nil { + p.logger.Logf("port forwarder: could not restart session: %v. retrying", err) + + select { + case <-ctx.Done(): + p.logger.Logf("port forwarder: keep alive cancelled: %v", ctx.Err()) + return + case <-time.After(time.Second): // todo: make configurable? + continue + } + } + } + } +} diff --git a/test/e2ev3/pkg/kubernetes/uninstall-helm.go b/test/e2ev3/pkg/kubernetes/uninstall-helm.go new file mode 100644 index 0000000000..6386182956 --- /dev/null +++ b/test/e2ev3/pkg/kubernetes/uninstall-helm.go @@ -0,0 +1,41 @@ +package kubernetes + +import ( + "context" + "fmt" + "github.com/microsoft/retina/test/e2ev3/pkg/utils" + + "helm.sh/helm/v3/pkg/action" + "helm.sh/helm/v3/pkg/cli" +) + +type UninstallHelmChart struct { + Namespace string + ReleaseName string + KubeConfigFilePath string + HelmDriver string +} + +func (i *UninstallHelmChart) String() string { return "uninstall-helm" } + +func (i *UninstallHelmChart) Do(ctx context.Context) error { + ctx, log := utils.StepLogger(ctx, i) + settings := cli.New() + settings.KubeConfig = i.KubeConfigFilePath + actionConfig := new(action.Configuration) + + err := actionConfig.Init(settings.RESTClientGetter(), i.Namespace, i.HelmDriver, func(format string, v ...any) { log.Info(fmt.Sprintf(format, v...)) }) + if err != nil { + return fmt.Errorf("failed to initialize helm action config: %w", err) + } + + delclient := action.NewUninstall(actionConfig) + delclient.Wait = true + delclient.Timeout = deleteTimeout + _, err = delclient.Run(i.ReleaseName) + if err != nil { + return fmt.Errorf("failed to delete existing release %s: %w", i.ReleaseName, err) + } + + return nil +} diff --git a/test/e2ev3/pkg/kubernetes/upgrade-retina-helm.go b/test/e2ev3/pkg/kubernetes/upgrade-retina-helm.go new file mode 100644 index 0000000000..3a5a8f2dfa --- /dev/null +++ b/test/e2ev3/pkg/kubernetes/upgrade-retina-helm.go @@ -0,0 +1,79 @@ +// Copyright (c) Microsoft 
Corporation. +// Licensed under the MIT license. +package kubernetes + +import ( + "context" + "fmt" + "time" + + "github.com/microsoft/retina/test/e2ev3/pkg/utils" + "helm.sh/helm/v3/pkg/action" + "helm.sh/helm/v3/pkg/cli" + helmValues "helm.sh/helm/v3/pkg/cli/values" + "helm.sh/helm/v3/pkg/getter" +) + +const upgradeTimeout = 300 * time.Second // longer timeout to accommodate slow windows node terminating and restarting. + +type UpgradeRetinaHelmChart struct { + Namespace string + ReleaseName string + KubeConfigFilePath string + ChartPath string + HelmDriver string + ValuesFile string +} + +func (u *UpgradeRetinaHelmChart) String() string { return "upgrade-retina-helm" } + +func (u *UpgradeRetinaHelmChart) Do(ctx context.Context) error { + ctx, log := utils.StepLogger(ctx, u) + settings := cli.New() + settings.KubeConfig = u.KubeConfigFilePath + actionConfig := new(action.Configuration) + + err := actionConfig.Init(settings.RESTClientGetter(), u.Namespace, u.HelmDriver, func(format string, v ...any) { log.Info(fmt.Sprintf(format, v...)) }) + if err != nil { + return fmt.Errorf("failed to initialize helm action config: %w", err) + } + + client := action.NewUpgrade(actionConfig) + client.Wait = true + client.WaitForJobs = true + client.Timeout = upgradeTimeout + + // Create a new Get action + get := action.NewGet(actionConfig) + + // Get the current release + rel, err := get.Run(u.ReleaseName) + if err != nil { + return fmt.Errorf("failed to get release: %w", err) + } + + // Get the chart from the current release + chart := rel.Chart + + // enable advanced metrics profile + options := helmValues.Options{ + ValueFiles: []string{u.ValuesFile}, + } + provider := getter.All(settings) + values, err := options.MergeValues(provider) + if err != nil { + return fmt.Errorf("failed to merge values: %w", err) + } + // logs values to be set during upgrade + log.Info("values to be set during upgrade", "values", values) + + rel, err = client.Run(u.ReleaseName, chart, values) + if 
err != nil { + return fmt.Errorf("failed to upgrade chart: %w", err) + } + + log.Info("upgraded chart", "release", rel.Name, "namespace", rel.Namespace) + log.Info("chart values", "config", rel.Config) + + return nil +} diff --git a/test/e2ev3/pkg/kubernetes/validate-service.go b/test/e2ev3/pkg/kubernetes/validate-service.go new file mode 100644 index 0000000000..ed4c014391 --- /dev/null +++ b/test/e2ev3/pkg/kubernetes/validate-service.go @@ -0,0 +1,70 @@ +package kubernetes + +import ( + "context" + "fmt" + "time" + + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/client-go/kubernetes" + "k8s.io/client-go/rest" +) + +type ResourceTypes string + +const ( + ResourceTypePod = "pod" + ResourceTypeService = "service" +) + +type ValidateResource struct { + ResourceName string + ResourceNamespace string + ResourceType string + Labels string + RestConfig *rest.Config +} + +func (v *ValidateResource) Do(ctx context.Context) error { + clientset, err := kubernetes.NewForConfig(v.RestConfig) + if err != nil { + return fmt.Errorf("error creating Kubernetes client: %w", err) + } + + ctx, cancel := context.WithTimeout(ctx, defaultTimeoutSeconds*time.Second) + defer cancel() + + switch v.ResourceType { + case ResourceTypePod: + err = WaitForPodReady(ctx, clientset, v.ResourceNamespace, v.Labels) + if err != nil { + return fmt.Errorf("pod not found: %w", err) + } + case ResourceTypeService: + exists, err := serviceExists(ctx, clientset, v.ResourceNamespace, v.ResourceName, v.Labels) + if err != nil || !exists { + return fmt.Errorf("service not found: %w", err) + } + + default: + return fmt.Errorf("resource type %s not supported", v.ResourceType) + } + + if err != nil { + return fmt.Errorf("error waiting for pod to be ready: %w", err) + } + return nil +} + +func serviceExists(ctx context.Context, clientset *kubernetes.Clientset, namespace, _, labels string) (bool, error) { + var serviceList *corev1.ServiceList + serviceList, err := 
clientset.CoreV1().Services(namespace).List(ctx, metav1.ListOptions{LabelSelector: labels}) + if err != nil { + return false, fmt.Errorf("error listing Services: %w", err) + } + if len(serviceList.Items) == 0 { + return false, nil + } + return true, nil +} diff --git a/test/e2ev3/pkg/kubernetes/validateHttp.go b/test/e2ev3/pkg/kubernetes/validateHttp.go new file mode 100644 index 0000000000..585be4456f --- /dev/null +++ b/test/e2ev3/pkg/kubernetes/validateHttp.go @@ -0,0 +1,46 @@ +package kubernetes + +import ( + "context" + "fmt" + "net/http" + "time" + + "github.com/microsoft/retina/test/e2ev3/pkg/utils" +) + +const ( + RequestTimeout = 30 * time.Second +) + +type ValidateHTTPResponse struct { + URL string + ExpectedStatus int +} + +func (v *ValidateHTTPResponse) String() string { return "validate-http-response" } + +func (v *ValidateHTTPResponse) Do(ctx context.Context) error { + ctx, log := utils.StepLogger(ctx, v) + ctx, cancel := context.WithTimeout(ctx, RequestTimeout) + defer cancel() + + req, err := http.NewRequestWithContext(ctx, http.MethodGet, v.URL, nil) + if err != nil { + return fmt.Errorf("error creating HTTP request: %w", err) + } + + client := &http.Client{} + resp, err := client.Do(req) + if err != nil { + return fmt.Errorf("error making HTTP request: %w", err) + } + defer resp.Body.Close() + + if resp.StatusCode != v.ExpectedStatus { + return fmt.Errorf("unexpected status code: got %d, want %d", resp.StatusCode, v.ExpectedStatus) + } + log.Info("HTTP validation succeeded", "url", v.URL, "statusCode", resp.StatusCode) + + return nil +} diff --git a/test/e2ev3/pkg/kubernetes/with-port-forward.go b/test/e2ev3/pkg/kubernetes/with-port-forward.go new file mode 100644 index 0000000000..4efcddfcfa --- /dev/null +++ b/test/e2ev3/pkg/kubernetes/with-port-forward.go @@ -0,0 +1,84 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. 
+ +//go:build e2e + +package kubernetes + +import ( + "context" + "fmt" + "log/slog" + "time" + + flow "github.com/Azure/go-workflow" + "github.com/cenkalti/backoff/v4" + "github.com/microsoft/retina/test/e2ev3/pkg/utils" +) + +const ( + // DefaultValidationTimeout bounds total time for validation within a port-forward. + DefaultValidationTimeout = 5 * time.Minute + + // DefaultRetryAttempts for metric validation (metrics may need time to appear). + DefaultRetryAttempts = 10 + + // DefaultScenarioTimeout bounds the total setup phase of a scenario. + DefaultScenarioTimeout = 10 * time.Minute +) + +// WithPortForward is a composite step that: +// 1. Starts a Kubernetes port-forward +// 2. Runs all inner steps sequentially (as a Pipe) +// 3. Guarantees the port-forward is stopped via defer, even on error +type WithPortForward struct { + PF *PortForward + Steps []flow.Steper +} + +func (w *WithPortForward) String() string { return "with-port-forward" } + +func (w *WithPortForward) Do(ctx context.Context) error { + ctx, log := utils.StepLogger(ctx, w) + if err := w.PF.Do(ctx); err != nil { + return fmt.Errorf("port-forward failed: %w", err) + } + defer func() { + log.Info("stopping port-forward", "local", w.PF.LocalPort, "remote", w.PF.RemotePort) + w.PF.Stop() //nolint:errcheck // best-effort cleanup + }() + + inner := new(flow.Workflow) + inner.Add(flow.Pipe(w.Steps...)) + if err := inner.Do(ctx); err != nil { + return fmt.Errorf("validation within port-forward failed: %w", err) + } + return nil +} + +// Unwrap exposes inner steps to go-workflow for visibility/debugging. +func (w *WithPortForward) Unwrap() []flow.Steper { + return w.Steps +} + +// CurlExpectFail creates a named step that runs a command expected to fail +// (e.g., curl behind a deny-all network policy). The error is intentionally swallowed. 
+func CurlExpectFail(name string, exec *ExecInPod) flow.Steper { + return flow.Func(name, func(ctx context.Context) error { + if err := exec.Do(ctx); err != nil { + slog.Info("curl failed as expected", "step", name, "error", err) + } + return nil + }) +} + +// RetryWithBackoff configures exponential backoff for metric validation. +func RetryWithBackoff(ro *flow.RetryOption) { + bo := backoff.NewExponentialBackOff() + bo.InitialInterval = 5 * time.Second + bo.MaxInterval = 30 * time.Second + bo.MaxElapsedTime = 5 * time.Minute + ro.Backoff = bo + ro.Attempts = DefaultRetryAttempts + ro.TimeoutPerTry = 30 * time.Second +} diff --git a/test/e2ev3/pkg/prometheus/prometheus.go b/test/e2ev3/pkg/prometheus/prometheus.go new file mode 100644 index 0000000000..6513eb56ef --- /dev/null +++ b/test/e2ev3/pkg/prometheus/prometheus.go @@ -0,0 +1,222 @@ +package prom + +import ( + "context" + "errors" + "fmt" + "io" + "log" + "net/http" + "reflect" + "strings" + "time" + + "github.com/microsoft/retina/test/e2ev3/pkg/utils" + "github.com/microsoft/retina/test/retry" + promclient "github.com/prometheus/client_model/go" + "github.com/prometheus/common/expfmt" + "github.com/prometheus/common/model" +) + +var ( + ErrNoMetricFound = fmt.Errorf("no metric found") + defaultTimeout = 300 * time.Second + defaultRetryDelay = 5 * time.Second + defaultRetryAttempts = 60 +) + +func CheckMetric(ctx context.Context, promAddress, metricName string, validMetric map[string]string, partial ...bool) error { + defaultRetrier := retry.Retrier{Attempts: defaultRetryAttempts, Delay: defaultRetryDelay} + + // Default partial to false if not provided + usePartial := len(partial) > 0 && partial[0] + + metrics := map[string]*promclient.MetricFamily{} + scrapeMetricsFn := func() error { + log.Printf("checking for metrics on %s", promAddress) + var err error + + // obtain a full dump of all metrics on the endpoint + metrics, err = getAllPrometheusMetricsFromURL(promAddress) + if err != nil { + return 
fmt.Errorf("could not scrape metrics from %s within %s: %w", promAddress, defaultTimeout, err)
		}

		// Loop through the scraped metrics looking for a match; if none is
		// found, log and return ErrNoMetricFound, which triggers a retry
		// (metrics may take a few scrape intervals to appear).
		if usePartial {
			err = verifyValidMetricPresentPartial(metricName, metrics, validMetric)
		} else {
			err = verifyValidMetricPresent(metricName, metrics, validMetric)
		}
		if err != nil {
			log.Printf("failed to find metric matching %s: %+v\n", metricName, validMetric)
			return ErrNoMetricFound
		}

		return nil
	}

	if err := defaultRetrier.Do(ctx, scrapeMetricsFn); err != nil {
		return fmt.Errorf("failed to get prometheus metrics: %w", err)
	}
	return nil
}

// CheckMetricFromBuffer parses an already-captured Prometheus text dump and
// verifies that metricName exists with exactly the labels in validMetric.
// Returns ErrNoMetricFound when no matching sample is present.
func CheckMetricFromBuffer(prometheusMetricData []byte, metricName string, validMetric map[string]string) error {
	metrics, err := getAllPrometheusMetricsFromBuffer(prometheusMetricData)
	if err != nil {
		return fmt.Errorf("failed to parse prometheus metrics: %w", err)
	}

	if err := verifyValidMetricPresent(metricName, metrics, validMetric); err != nil {
		log.Printf("failed to find metric matching %s: %+v\n", metricName, validMetric)
		return ErrNoMetricFound
	}

	return nil
}

// verifyValidMetricPresent reports nil when some sample of metricName
// carries exactly the label set validMetric (full map equality). An empty
// validMetric matches any sample that has at least one label.
func verifyValidMetricPresent(metricName string, data map[string]*promclient.MetricFamily, validMetric map[string]string) error {
	for _, family := range data {
		if family.GetName() != metricName {
			continue
		}
		// Renamed inner loop variable: the original shadowed the outer
		// `metric` range variable.
		for _, sample := range family.GetMetric() {
			// Collect all label name/value pairs on this sample.
			metricLabels := map[string]string{}
			for _, label := range sample.GetLabel() {
				metricLabels[label.GetName()] = label.GetValue()
			}

			// An empty validMetric only requires the metric to exist with labels.
			if len(validMetric) == 0 && len(metricLabels) > 0 {
				return nil
			}

			if reflect.DeepEqual(metricLabels, validMetric) {
				return nil
			}
		}
	}

	return fmt.Errorf("failed to find metric matching: %+v: %w", validMetric, ErrNoMetricFound)
}

func
getAllPrometheusMetricsFromURL(url string) (map[string]*promclient.MetricFamily, error) { + client := http.Client{} + resp, err := client.Get(url) //nolint + if err != nil { + return nil, fmt.Errorf("HTTP request failed: %w", err) + } + defer resp.Body.Close() + + if resp.StatusCode != http.StatusOK { + return nil, fmt.Errorf("HTTP request failed with status: %v", resp.Status) //nolint:goerr113,gocritic + } + + metrics, err := ParseReaderPrometheusMetrics(resp.Body) + if err != nil { + return nil, err + } + + return metrics, nil +} + +// verifyValidMetricPresentPartial checks if a metric exists with labels that contain +// all the key-value pairs in validMetric (partial matching - the metric can have additional labels) +func verifyValidMetricPresentPartial(metricName string, data map[string]*promclient.MetricFamily, validMetric map[string]string) error { + for _, metric := range data { + if metric.GetName() == metricName { + for _, metric := range metric.GetMetric() { + + // get all labels and values on the metric + metricLabels := map[string]string{} + for _, label := range metric.GetLabel() { + metricLabels[label.GetName()] = label.GetValue() + } + + // if valid metric is empty, then we just need to make sure the metric and value is present + if len(validMetric) == 0 && len(metricLabels) > 0 { + return nil + } + + // Check if all key-value pairs in validMetric exist in metricLabels + allMatch := true + for key, value := range validMetric { + if metricLabels[key] != value { + allMatch = false + break + } + } + + if allMatch { + return nil + } + } + } + } + + return fmt.Errorf("failed to find metric matching: %+v: %w", validMetric, ErrNoMetricFound) +} + +func getAllPrometheusMetricsFromBuffer(buf []byte) (map[string]*promclient.MetricFamily, error) { + parser := expfmt.NewTextParser(model.LegacyValidation) + reader := strings.NewReader(string(buf)) + return parser.TextToMetricFamilies(reader) //nolint +} + +func ParseReaderPrometheusMetrics(input io.Reader) 
(map[string]*promclient.MetricFamily, error) { + parser := expfmt.NewTextParser(model.LegacyValidation) + return parser.TextToMetricFamilies(input) //nolint +} + +// When capturing promethus output via curl and exect, there's a lot +// of garbage at the front +func stripExecGarbage(s string) string { + index := strings.Index(s, "#") + if index == -1 { + // If there's no `#`, return the original string + return s + } + // Slice the string up to the character before the first `#` + return s[:index] +} + +var ErrMetricFound = errors.New("unexpected metric found") + +// ValidateMetricStep validates Prometheus metrics at a given port. +// Implements flow.Steper via Do(context.Context) error. +type ValidateMetricStep struct { + ForwardedPort string + MetricName string + ValidMetrics []map[string]string + ExpectMetric bool + PartialMatch bool +} + +func (v *ValidateMetricStep) Do(ctx context.Context) error { + _, slogger := utils.StepLogger(ctx, v) + + promAddress := fmt.Sprintf("http://localhost:%s/metrics", v.ForwardedPort) + + for _, validMetric := range v.ValidMetrics { + err := CheckMetric(ctx, promAddress, v.MetricName, validMetric, v.PartialMatch) + if err != nil { + if !v.ExpectMetric && errors.Is(err, ErrNoMetricFound) { + slogger.Info("metric not found, as expected", "metric", v.MetricName) + return nil + } + return fmt.Errorf("failed to verify prometheus metrics: %w", err) + } + + if !v.ExpectMetric { + return fmt.Errorf("did not expect to find metric %s matching %+v: %w", v.MetricName, validMetric, ErrMetricFound) + } + + slogger.Info("found metric", "metric", v.MetricName, "labels", validMetric) + } + return nil +} diff --git a/test/e2ev3/pkg/utils/context_logger.go b/test/e2ev3/pkg/utils/context_logger.go new file mode 100644 index 0000000000..b5209687c6 --- /dev/null +++ b/test/e2ev3/pkg/utils/context_logger.go @@ -0,0 +1,63 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. 
+ +package utils + +import ( + "context" + "log/slog" + + flow "github.com/Azure/go-workflow" +) + +type prefixKey struct{} + +// StepLogger appends the step name of s to the accumulated context prefix +// and returns the enriched context + a logger tagged with the full prefix. +// +// Call this at the top of every Do(ctx) at any level: +// +// func (w *Workflow) Do(ctx context.Context) error { +// ctx, log := utils.StepLogger(ctx, w) // prefix = "basic-metrics" +// ... +// } +// func (p *PortForward) Do(ctx context.Context) error { +// _, log := utils.StepLogger(ctx, p) // prefix = "basic-metrics/drop/port-forward" +// ... +// } +func StepLogger(ctx context.Context, s any) (context.Context, *slog.Logger) { + name := StepName(s) + existing := Prefix(ctx) + var prefix string + if existing != "" { + prefix = existing + "/" + name + } else { + prefix = name + } + ctx = context.WithValue(ctx, prefixKey{}, prefix) + return ctx, slog.Default().With("prefix", prefix) +} + +// Prefix returns the accumulated log prefix stored in ctx. +func Prefix(ctx context.Context) string { + if v, ok := ctx.Value(prefixKey{}).(string); ok { + return v + } + return "" +} + +// Scenario wraps a flow.Workflow with a name that gets added to the +// context prefix when executed. Use this for test/scenario grouping: +// +// &utils.Scenario{Name: "drop", Inner: buildDropWorkflow(...)} +type Scenario struct { + Name string + Inner *flow.Workflow +} + +func (s *Scenario) String() string { return s.Name } + +func (s *Scenario) Do(ctx context.Context) error { + ctx, _ = StepLogger(ctx, s) + return s.Inner.Do(ctx) +} diff --git a/test/e2ev3/pkg/utils/slog_handler.go b/test/e2ev3/pkg/utils/slog_handler.go new file mode 100644 index 0000000000..713f3cb8f2 --- /dev/null +++ b/test/e2ev3/pkg/utils/slog_handler.go @@ -0,0 +1,291 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. 
+ +package utils + +import ( + "bytes" + "context" + "fmt" + "io" + "log/slog" + "path" + "runtime" + "slices" + "strings" + "sync" + "unicode" + + "golang.org/x/term" +) + +// StepHandler is an slog.Handler that produces structured log lines with +// workflow/test/step context rendered as a bracketed prefix. +// +// Output format: +// +// 15:04:05 INFO [workflow/test/step] message key=value ... +// +// The "workflow", "test", and "step" attributes are absorbed into the prefix +// and not printed as key=value pairs. When no prefix parts are set the +// brackets are omitted entirely. +type StepHandler struct { + w io.Writer + level slog.Level + workflow string + test string + step string + prefix string + color bool + attrs []slog.Attr + mu *sync.Mutex +} + +func NewStepHandler(w io.Writer, level slog.Level) *StepHandler { + c := false + if f, ok := w.(interface{ Fd() uintptr }); ok { + c = isTerminal(f.Fd()) + } + return &StepHandler{w: w, level: level, color: c, mu: &sync.Mutex{}} +} + +// NewStepHandlerWithColor creates a handler with explicit color control (for tests). +func NewStepHandlerWithColor(w io.Writer, level slog.Level, color bool) *StepHandler { + return &StepHandler{w: w, level: level, color: color, mu: &sync.Mutex{}} +} + +func (h *StepHandler) Enabled(_ context.Context, level slog.Level) bool { + return level >= h.level +} + +func (h *StepHandler) Handle(ctx context.Context, r slog.Record) error { + var buf bytes.Buffer + + // Start with any prefix from context (set by StepLogger), + // then check handler-level prefix (from WithAttrs). + prefix := Prefix(ctx) + if prefix == "" { + prefix = h.prefix + } + + // Also check handler-level and record-level attrs for prefix/workflow/test/step. + // "prefix" overrides everything; legacy workflow/test/step build a prefix if no "prefix" attr. 
+ workflow, test, step := h.workflow, h.test, h.step + + var extra []slog.Attr + r.Attrs(func(a slog.Attr) bool { + switch a.Key { + case "prefix": + prefix = a.Value.String() + case "workflow": + workflow = a.Value.String() + case "test": + test = a.Value.String() + case "step": + step = a.Value.String() + default: + extra = append(extra, a) + } + return true + }) + + // If no explicit prefix, build from workflow/test/step parts. + if prefix == "" { + prefix = buildPrefix(workflow, test, step) + } + + // If still empty, try caller detection from stack. + if prefix == "" { + cw, _, cs := callerPrefix() + prefix = buildPrefix(cw, "", cs) + } + + // Timestamp and level always come first. + fmt.Fprintf(&buf, "%s %s ", + r.Time.Format("15:04:05"), + r.Level.String()) + + // Render the [prefix] bracket. + if prefix != "" { + if h.color { + code := colorForPrefix(prefix) + fmt.Fprintf(&buf, "\033[%dm[%s]\033[0m ", code, prefix) + } else { + fmt.Fprintf(&buf, "[%s] ", prefix) + } + } + + buf.WriteString(r.Message) + + // Pre-attached attrs (from WithAttrs), skipping prefix keys. + for _, a := range h.attrs { + fmt.Fprintf(&buf, " %s=%s", a.Key, a.Value) + } + // Record-level attrs (prefix keys already absorbed above). 
+ for _, a := range extra { + fmt.Fprintf(&buf, " %s=%s", a.Key, a.Value) + } + + buf.WriteByte('\n') + + h.mu.Lock() + defer h.mu.Unlock() + _, err := h.w.Write(buf.Bytes()) + return err +} + +func (h *StepHandler) WithAttrs(attrs []slog.Attr) slog.Handler { + workflow, test, step, prefix := h.workflow, h.test, h.step, h.prefix + var remaining []slog.Attr + for _, a := range attrs { + switch a.Key { + case "prefix": + prefix = a.Value.String() + case "workflow": + workflow = a.Value.String() + case "test": + test = a.Value.String() + case "step": + step = a.Value.String() + default: + remaining = append(remaining, a) + } + } + return &StepHandler{ + w: h.w, + level: h.level, + workflow: workflow, + test: test, + step: step, + prefix: prefix, + color: h.color, + attrs: append(slices.Clone(h.attrs), remaining...), + mu: h.mu, + } +} + +func (h *StepHandler) WithGroup(name string) slog.Handler { + return h +} + +// buildPrefix joins non-empty parts with "/". +func buildPrefix(parts ...string) string { + var buf bytes.Buffer + for _, p := range parts { + if p == "" { + continue + } + if buf.Len() > 0 { + buf.WriteByte('/') + } + buf.WriteString(p) + } + return buf.String() +} + +// colorForPrefix returns a deterministic ANSI color code for the given prefix string. +func colorForPrefix(prefix string) int { + codes := []int{31, 32, 33, 34, 35, 36, 91, 92, 93, 94, 95, 96} + h := fnv32a(prefix) + return codes[h%uint32(len(codes))] +} + +func fnv32a(s string) uint32 { + h := uint32(2166136261) + for i := 0; i < len(s); i++ { + h ^= uint32(s[i]) + h *= 16777619 + } + return h +} + +// isTerminal checks if the given file descriptor is a terminal. +func isTerminal(fd uintptr) bool { + return term.IsTerminal(int(fd)) +} + +const e2ev3Prefix = "retina/test/e2ev3/" + +// callerPrefix scans the call stack for e2ev3 types and returns +// (workflow, test, step). 
It identifies the outermost Workflow receiver +// as the workflow name and the innermost non-Workflow receiver as the step. +func callerPrefix() (workflow, test, step string) { + var pcs [32]uintptr + n := runtime.Callers(3, pcs[:]) + frames := runtime.CallersFrames(pcs[:n]) + for { + frame, more := frames.Next() + if !strings.Contains(frame.Function, e2ev3Prefix) { + if !more { + break + } + continue + } + typeName, pkgName := extractCallerInfo(frame.Function) + if typeName == "" { + if !more { + break + } + continue + } + kebab := toKebabCase(typeName) + if kebab == "workflow" || kebab == "step" { + // Generic type — use package name as the workflow identifier. + workflow = toKebabCase(pkgName) + } else if kebab == "slog-writer" { + // io.Writer adapter — not a real step, skip it. + if !more { + break + } + continue + } else if step == "" { + step = kebab + } + if !more { + break + } + } + return workflow, test, step +} + +// extractCallerInfo extracts the type name and package name from a fully +// qualified function name like "github.com/.../pkg/kubernetes.(*PortForward).Do". +func extractCallerInfo(funcName string) (typeName, pkgName string) { + // Get last path component: "kubernetes.(*PortForward).Do" + base := path.Base(funcName) + // Split on ".": ["kubernetes", "(*PortForward)", "Do"] + parts := strings.SplitN(base, ".", 3) + if len(parts) < 2 { + return "", "" + } + pkgName = parts[0] + receiver := parts[1] + // Strip pointer/paren: "(*PortForward)" → "PortForward" + receiver = strings.TrimPrefix(receiver, "(*") + receiver = strings.TrimSuffix(receiver, ")") + receiver = strings.TrimPrefix(receiver, "*") + return receiver, pkgName +} + +// toKebabCase converts PascalCase to kebab-case, keeping consecutive +// uppercase letters together (e.g. "InstallNPM" → "install-npm"). 
+func toKebabCase(s string) string { + var buf bytes.Buffer + runes := []rune(s) + for i, r := range runes { + if unicode.IsUpper(r) { + if i > 0 { + prev := runes[i-1] + if unicode.IsLower(prev) { + buf.WriteByte('-') + } else if unicode.IsUpper(prev) && i+1 < len(runes) && unicode.IsLower(runes[i+1]) { + buf.WriteByte('-') + } + } + buf.WriteRune(unicode.ToLower(r)) + } else { + buf.WriteRune(r) + } + } + return buf.String() +} diff --git a/test/e2ev3/pkg/utils/slog_handler_test.go b/test/e2ev3/pkg/utils/slog_handler_test.go new file mode 100644 index 0000000000..348357447e --- /dev/null +++ b/test/e2ev3/pkg/utils/slog_handler_test.go @@ -0,0 +1,366 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. + +//go:build e2e + +package utils + +import ( + "bytes" + "log/slog" + "regexp" + "strings" + "testing" + +) + +// These mock types simulate the real e2e call stack: +// +// Workflow.Do() → sets "workflow" attr, passes logger down +// └─ addScenario(log) → sets "test" attr, passes logger down +// └─ WithPortForward.Do() → calls PortForward.Do() +// └─ PortForward.Do() → sets "step" attr, logs messages + +// Workflow mirrors *basicmetrics.Workflow. +// StepName should resolve to the package name ("utils" here) since the +// type name "Workflow" is generic. +type Workflow struct { + // bareStep, if set, is called instead of the normal scenario chain. + // Used by TestHandlerFormat_WorkflowPrefixFromStack to test stack-based + // workflow detection when steps don't receive an explicit logger. + bareStep func() +} + +func (w *Workflow) Do() { + if w.bareStep != nil { + w.bareStep() + return + } + // Real workflows create a logger and pass it to scenarios — they + // don't log directly. This matches basicmetrics.Workflow.Do(). + log := slog.Default().With("workflow", StepName(w)) + + // Simulate passing logger to scenario. + s := &MockScenario{log: log} + s.Do() +} + +// MockScenario mirrors addDropScenario / addTCPScenario. 
+type MockScenario struct { + log *slog.Logger +} + +func (s *MockScenario) Do() { + // Real scenarios add "test" attr and pass the logger to steps. + log := s.log.With("test", "drop") + + // Simulate passing logger to PortForward via WithPortForward. + pf := &MockPortForward{Log: log} + pf.Do() +} + +// MockPortForward mirrors *k8s.PortForward. +type MockPortForward struct { + Log *slog.Logger +} + +func (pf *MockPortForward) Do() { + log := pf.Log + if log == nil { + log = slog.Default() + } + log = log.With("step", StepName(pf)) + log.Info("finding pod with affinity", "label", "k8s-app=retina") + log.Info("attempting port forward", "pod", "retina-agent-abc", "namespace", "kube-system") + log.Info("port forward validation succeeded", "address", "http://localhost:10093") +} + +// MockBareStep simulates a step that does NOT receive a logger from the +// workflow (e.g., CreateAgnhostStatefulSet, CreateDenyAllNetworkPolicy). +// It uses slog.Default() — the handler must detect the workflow from the stack. +type MockBareStep struct{} + +func (s *MockBareStep) Do() { + slog.Info("creating resource", "name", "agnhost") +} + +// WorkflowWithBareStep simulates a Workflow that calls a step without passing +// a logger. Note: type name is NOT "Workflow", so the handler won't detect it +// as a workflow — only as a step. +type WorkflowWithBareStep struct{} + +func (w *WorkflowWithBareStep) Do() { + step := &MockBareStep{} + step.Do() +} + +// MockCallerDetected is used when NO explicit "step" attribute is set. +// The handler should auto-detect the type name via runtime stack inspection. +type MockCallerDetected struct{} + +func (m *MockCallerDetected) Do() { + // No log.With("step", ...) — handler must detect "mock-caller-detected" from the stack. + slog.Info("this should auto-detect step name") +} + +// stripANSI removes ANSI escape codes for easier assertion. 
+func stripANSI(s string) string { + re := regexp.MustCompile(`\x1b\[[0-9;]*m`) + return re.ReplaceAllString(s, "") +} + +// hasANSI checks that the bracketed prefix contains ANSI color codes. +func hasANSI(s string) bool { + re := regexp.MustCompile(`\x1b\[\d+m\[`) + return re.MatchString(s) +} + +func TestHandlerFormat_ExplicitAttributes(t *testing.T) { + var buf bytes.Buffer + handler := NewStepHandler(&buf, slog.LevelInfo) + slog.SetDefault(slog.New(handler)) + + // Replicate: Workflow.Do() → addScenario(log) → PortForward.Do() + w := &Workflow{} + w.Do() + + output := buf.String() + lines := strings.Split(strings.TrimSpace(output), "\n") + if len(lines) < 3 { + t.Fatalf("expected at least 3 log lines, got %d:\n%s", len(lines), output) + } + + // Verify each line format: "HH:MM:SS LEVEL [prefix] message key=value" + timeLevel := regexp.MustCompile(`^\d{2}:\d{2}:\d{2} (INFO|ERROR|WARN|DEBUG) `) + for i, line := range lines { + if !timeLevel.MatchString(line) { + t.Errorf("line %d: expected timestamp+level first, got: %s", i, line) + } + } + + // All 3 lines come from MockPortForward.Do() which sets step explicitly. + // Prefix should be [utils/drop/mock-port-forward]: + // workflow = "utils" (StepName resolves generic Workflow → package name) + // test = "drop" (set in MockScenario.Do) + // step = "mock-port-forward" (set explicitly via log.With) + expectedPrefix := "[utils/drop/mock-port-forward]" + for i, line := range lines { + if !strings.Contains(line, expectedPrefix) { + t.Errorf("line %d: expected %s prefix, got: %s", i, expectedPrefix, line) + } + } + + // Buffer is not a TTY → no ANSI codes should be present. + for i, line := range lines { + if hasANSI(line) { + t.Errorf("line %d: unexpected ANSI codes in non-TTY output", i) + } + } + + // Verify key=value pairs propagate. 
+ if !strings.Contains(lines[0], "label=k8s-app=retina") { + t.Errorf("line 0: expected label=k8s-app=retina, got: %s", lines[0]) + } +} + +func TestHandlerFormat_ColorOnTTY(t *testing.T) { + var buf bytes.Buffer + handler := NewStepHandlerWithColor(&buf, slog.LevelInfo, true) + slog.SetDefault(slog.New(handler)) + + w := &Workflow{} + w.Do() + + lines := strings.Split(strings.TrimSpace(buf.String()), "\n") + if len(lines) < 3 { + t.Fatalf("expected at least 3 log lines, got %d", len(lines)) + } + + // With color forced on, ANSI codes should wrap the prefix. + for i, line := range lines { + if !hasANSI(line) { + t.Errorf("line %d: expected ANSI color on bracketed prefix", i) + } + } + + // Stripping ANSI should still show the correct prefix. + expectedPrefix := "[utils/drop/mock-port-forward]" + for i, line := range lines { + plain := stripANSI(line) + if !strings.Contains(plain, expectedPrefix) { + t.Errorf("line %d: expected %s prefix, got: %s", i, expectedPrefix, plain) + } + } +} + +func TestHandlerFormat_ColorDeterminism(t *testing.T) { + var buf bytes.Buffer + handler := NewStepHandlerWithColor(&buf, slog.LevelInfo, true) + slog.SetDefault(slog.New(handler)) + + w := &Workflow{} + w.Do() + + lines := strings.Split(strings.TrimSpace(buf.String()), "\n") + if len(lines) < 2 { + t.Fatalf("expected at least 2 lines, got %d", len(lines)) + } + + // All lines share the same prefix → same color. + ansiRe := regexp.MustCompile(`(\x1b\[\d+m)\[`) + first := ansiRe.FindStringSubmatch(lines[0]) + if first == nil { + t.Fatal("no ANSI color code found in first line") + } + for i, line := range lines[1:] { + match := ansiRe.FindStringSubmatch(line) + if match == nil { + t.Errorf("line %d: no ANSI color code found", i+1) + continue + } + if match[1] != first[1] { + t.Errorf("line %d: color %q differs from first line %q", i+1, match[1], first[1]) + } + } + + // Log with a DIFFERENT prefix and verify it also gets a valid color. 
+ buf.Reset() + slog.SetDefault(slog.New(NewStepHandlerWithColor(&buf, slog.LevelInfo, true))) + m := &MockCallerDetected{} + m.Do() + diffLine := buf.String() + diffMatch := ansiRe.FindStringSubmatch(diffLine) + if diffMatch == nil { + t.Fatal("no ANSI color code found in caller-detected line") + } + validAnsi := regexp.MustCompile(`^\x1b\[\d+m$`) + if !validAnsi.MatchString(diffMatch[1]) { + t.Errorf("invalid ANSI escape for different prefix: %q", diffMatch[1]) + } +} + +func TestHandlerFormat_CallerAutoDetection(t *testing.T) { + var buf bytes.Buffer + handler := NewStepHandler(&buf, slog.LevelInfo) + slog.SetDefault(slog.New(handler)) + + // No explicit "step" attribute — handler should detect from call stack. + m := &MockCallerDetected{} + m.Do() + + output := stripANSI(buf.String()) + // The handler should detect "mock-caller-detected" from the receiver type. + if !strings.Contains(output, "[mock-caller-detected]") { + t.Errorf("expected auto-detected [mock-caller-detected] prefix, got: %s", output) + } +} + +func TestHandlerFormat_WorkflowAutoDetection(t *testing.T) { + var buf bytes.Buffer + handler := NewStepHandler(&buf, slog.LevelInfo) + slog.SetDefault(slog.New(handler)) + + // Simulate a step called from inside a Workflow.Do() that does NOT + // receive a logger. The handler should detect both the workflow + // ("utils" — package name of WorkflowWithBareStep) and the step + // ("mock-bare-step") from the call stack. + w := &WorkflowWithBareStep{} + w.Do() + + output := buf.String() + t.Logf("output: %s", output) + + // Should detect workflow from (*WorkflowWithBareStep).Do on the stack. + // WorkflowWithBareStep → type name ends in "...BareStep" — not "Workflow", + // so it won't be detected as a workflow. Let me use the real Workflow type. + // Actually, the type is WorkflowWithBareStep, not Workflow — the handler + // only recognizes types named exactly "Workflow". This test verifies the + // step is detected. 
+ if !strings.Contains(output, "[mock-bare-step]") { + t.Errorf("expected [mock-bare-step] in output, got: %s", output) + } +} + +func TestHandlerFormat_WorkflowPrefixFromStack(t *testing.T) { + var buf bytes.Buffer + handler := NewStepHandler(&buf, slog.LevelInfo) + slog.SetDefault(slog.New(handler)) + + // Simulate real e2e: Workflow.Do() → step.Do() → slog.Info(). + // The step uses slog.Default() (no explicit logger/attributes). + // The handler should detect: + // step = "mock-bare-step" (from (*MockBareStep).Do) + // workflow = "utils" (from (*Workflow).Do higher on the stack) + bare := &MockBareStep{} + w := &Workflow{bareStep: bare.Do} + w.Do() + + output := buf.String() + t.Logf("output: %s", output) + + // Verify callerPrefix detects workflow from stack. + // Note: in the real e2e, go-workflow sits between Workflow.Do and Step.Do. + // Our stack walker skips non-e2ev3 frames, so it should still find both. + if !strings.Contains(output, "[utils/mock-bare-step]") { + t.Errorf("expected [utils/mock-bare-step] prefix, got: %s", output) + } +} + +func TestHandlerFormat_NoPrefix(t *testing.T) { + var buf bytes.Buffer + handler := NewStepHandler(&buf, slog.LevelInfo) + slog.SetDefault(slog.New(handler)) + + // Plain slog.Info from a non-method (no receiver to detect). + slog.Info("bare log line") + + output := buf.String() + plain := stripANSI(output) + // Should still have timestamp+level, but no bracketed prefix (or auto-detected). + if !regexp.MustCompile(`^\d{2}:\d{2}:\d{2} INFO `).MatchString(plain) { + t.Errorf("expected timestamp+level first, got: %s", plain) + } +} + +func TestStepName_GenericTypes(t *testing.T) { + // Verify that generic type "Workflow" resolves to package name, not "workflow". + w := &Workflow{} + name := StepName(w) + // In this test file (package utils), it should be "utils". + if name != "utils" { + t.Errorf("StepName(*Workflow) = %q, want %q", name, "utils") + } + + // Non-generic types keep their own name. 
+ pf := &MockPortForward{} + name = StepName(pf) + if name != "mock-port-forward" { + t.Errorf("StepName(*MockPortForward) = %q, want %q", name, "mock-port-forward") + } + + mcd := &MockCallerDetected{} + name = StepName(mcd) + if name != "mock-caller-detected" { + t.Errorf("StepName(*MockCallerDetected) = %q, want %q", name, "mock-caller-detected") + } +} + +func TestColorForPrefix_Deterministic(t *testing.T) { + // Same input always produces the same color code. + for _, prefix := range []string{ + "basic-metrics/drop/port-forward", + "hubble-metrics/flow-intra/curl-pod", + "advanced-metrics/dns/validate", + "slog-writer", + } { + c1 := colorForPrefix(prefix) + c2 := colorForPrefix(prefix) + if c1 != c2 { + t.Errorf("colorForPrefix(%q) not deterministic: %d vs %d", prefix, c1, c2) + } + // Verify it's a valid ANSI color code (31-36 or 91-96). + if !((c1 >= 31 && c1 <= 36) || (c1 >= 91 && c1 <= 96)) { + t.Errorf("colorForPrefix(%q) = %d, not a valid ANSI color code", prefix, c1) + } + } +} diff --git a/test/e2ev3/pkg/utils/slog_writer.go b/test/e2ev3/pkg/utils/slog_writer.go new file mode 100644 index 0000000000..08ab5661d3 --- /dev/null +++ b/test/e2ev3/pkg/utils/slog_writer.go @@ -0,0 +1,45 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. + +package utils + +import ( + "bytes" + "context" + "log/slog" +) + +// SlogWriter is an io.Writer that logs each complete line through slog at the given level. +// Partial lines are buffered until a newline is received. +type SlogWriter struct { + Level slog.Level + Source string + buf []byte +} + +func (w *SlogWriter) Write(p []byte) (int, error) { + w.buf = append(w.buf, p...) 
+ for { + idx := bytes.IndexByte(w.buf, '\n') + if idx < 0 { + break + } + line := string(bytes.TrimRight(w.buf[:idx], "\r")) + w.buf = w.buf[idx+1:] + if line != "" { + slog.Log(context.Background(), w.Level, line, "source", w.Source) + } + } + return len(p), nil +} + +// Flush logs any remaining buffered content not terminated by a newline. +func (w *SlogWriter) Flush() { + if len(w.buf) > 0 { + line := string(bytes.TrimRight(w.buf, "\r\n")) + if line != "" { + slog.Log(context.Background(), w.Level, line, "source", w.Source) + } + w.buf = nil + } +} diff --git a/test/e2ev3/pkg/utils/stepname.go b/test/e2ev3/pkg/utils/stepname.go new file mode 100644 index 0000000000..828e6c20e4 --- /dev/null +++ b/test/e2ev3/pkg/utils/stepname.go @@ -0,0 +1,30 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. + +//go:build e2e + +package utils + +import ( + "reflect" + "strings" +) + +// StepName derives a kebab-case step name from the concrete type of s. +// For example, *k8s.CreateNamespace → "create-namespace". +// Generic names like "Workflow" or "Step" are replaced by the package name: +// *basicmetrics.Workflow → "basic-metrics", *config.Step → "config". +func StepName(s any) string { + t := reflect.TypeOf(s) + if t.Kind() == reflect.Ptr { + t = t.Elem() + } + name := toKebabCase(t.Name()) + if name == "workflow" || name == "step" { + pkg := t.PkgPath() + if idx := strings.LastIndex(pkg, "/"); idx != -1 { + return toKebabCase(pkg[idx+1:]) + } + } + return name +} diff --git a/test/e2ev3/retina_e2e_test.go b/test/e2ev3/retina_e2e_test.go new file mode 100644 index 0000000000..ec4b061bfb --- /dev/null +++ b/test/e2ev3/retina_e2e_test.go @@ -0,0 +1,62 @@ +//go:build e2e + +// Package retina contains the e2e test entry point. +// +// A single test function drives three phases — image build, infrastructure +// provisioning, and workflow tests — so that `go test -tags=e2e -provider=kind` +// is all you need for a full local run. 
+package retina
+
+import (
+	"log/slog"
+	"os"
+	"testing"
+	"time"
+
+	flow "github.com/Azure/go-workflow"
+	"github.com/microsoft/retina/test/e2ev3/config"
+	"github.com/microsoft/retina/test/e2ev3/pkg/images"
+	"github.com/microsoft/retina/test/e2ev3/pkg/images/build"
+	"github.com/microsoft/retina/test/e2ev3/pkg/infra"
+	"github.com/microsoft/retina/test/e2ev3/pkg/utils"
+	"github.com/microsoft/retina/test/e2ev3/workflows/advancedmetrics"
+	advexp "github.com/microsoft/retina/test/e2ev3/workflows/advancedmetrics/experimental"
+	"github.com/microsoft/retina/test/e2ev3/workflows/basicmetrics"
+	basicexp "github.com/microsoft/retina/test/e2ev3/workflows/basicmetrics/experimental"
+	"github.com/microsoft/retina/test/e2ev3/workflows/capture"
+	"github.com/microsoft/retina/test/e2ev3/workflows/hubblemetrics"
+	"github.com/stretchr/testify/require"
+)
+
+// TestE2ERetina drives image build, cluster provisioning, and all Retina
+// workflow tests in sequence against a single cluster.
+func TestE2ERetina(t *testing.T) {
+	slog.SetDefault(slog.New(utils.NewStepHandler(os.Stderr, slog.LevelInfo)))
+
+	ctx, cancel := config.TestContext(t)
+	defer cancel()
+
+	c := &config.E2EConfig{}
+
+	loadConfig := &config.Step{Cfg: c}
+	buildImages := &build.Step{Cfg: c}
+	setupInfra := &infra.Workflow{Cfg: c, T: t}
+	loadImages := &images.Step{Cfg: c}
+
+	basic := &basicmetrics.Workflow{Cfg: c}
+	advanced := &advancedmetrics.Workflow{Cfg: c}
+	hubble := &hubblemetrics.Workflow{Cfg: c}
+	basicExp := &basicexp.Workflow{Cfg: c}
+	advExp := &advexp.Workflow{Cfg: c}
+	// Renamed from `cap`: that name shadows the builtin cap() and trips
+	// the predeclared/gocritic linters.
+	captureWf := &capture.Workflow{Cfg: c}
+
+	wf := &flow.Workflow{DontPanic: true}
+	wf.Add(flow.BatchPipe(
+		flow.Steps(loadConfig).Timeout(1*time.Minute),
+		flow.Steps(buildImages, setupInfra).Timeout(30*time.Minute),
+		flow.Steps(loadImages).Timeout(10*time.Minute),
+		flow.Pipe(basic, advanced, hubble, basicExp, advExp, captureWf),
+	))
+
+	require.NoError(t, wf.Do(ctx), "e2e workflow failed")
+}
diff --git 
a/test/e2ev3/workflows/advancedmetrics/dns.go b/test/e2ev3/workflows/advancedmetrics/dns.go new file mode 100644 index 0000000000..7e6879b016 --- /dev/null +++ b/test/e2ev3/workflows/advancedmetrics/dns.go @@ -0,0 +1,185 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. + +//go:build e2e + +package advancedmetrics + +import ( + "context" + "fmt" + + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/client-go/kubernetes" + "k8s.io/client-go/rest" + + flow "github.com/Azure/go-workflow" + "github.com/microsoft/retina/test/e2ev3/config" + k8s "github.com/microsoft/retina/test/e2ev3/pkg/kubernetes" + prom "github.com/microsoft/retina/test/e2ev3/pkg/prometheus" +) + +func addAdvancedDNSScenario(restConfig *rest.Config, namespace, arch, variant string, + command string, expectError bool, + reqQuery, reqQueryType, workloadKind string, + respNumResponse, respQuery, respQueryType, respReturnCode, respResponse string, +) *flow.Workflow { + wf := &flow.Workflow{DontPanic: true} + agnhostName := "agnhost-adv-dns-" + variant + "-" + arch + podName := agnhostName + "-0" + + createAgnhost := &k8s.CreateAgnhostStatefulSet{ + AgnhostName: agnhostName, AgnhostNamespace: namespace, AgnhostArch: arch, RestConfig: restConfig, + } + // Generate traffic inside the validation loop so packetparser captures it. 
+ execTraffic := flow.Func("adv-dns-"+variant+"-traffic-"+arch, func(ctx context.Context) error { + exec := &k8s.ExecInPod{PodName: podName, PodNamespace: namespace, Command: command, RestConfig: restConfig} + for i := 0; i < 2; i++ { + if err := exec.Do(ctx); err != nil && !expectError { + return err + } + } + return nil + }) + validateReq := &ValidateAdvancedDNSRequestStep{ + PodNamespace: namespace, PodName: podName, Query: reqQuery, QueryType: reqQueryType, + WorkloadKind: workloadKind, WorkloadName: agnhostName, RestConfig: restConfig, + } + validateResp := &ValidateAdvancedDNSResponseStep{ + PodNamespace: namespace, NumResponse: respNumResponse, PodName: podName, + Query: respQuery, QueryType: respQueryType, Response: respResponse, ReturnCode: respReturnCode, + WorkloadKind: workloadKind, WorkloadName: agnhostName, RestConfig: restConfig, + } + validateWithPF := &k8s.WithPortForward{ + PF: &k8s.PortForward{ + Namespace: config.KubeSystemNamespace, LabelSelector: "k8s-app=retina", + LocalPort: config.RetinaMetricsPort, RemotePort: config.RetinaMetricsPort, + Endpoint: "metrics", RestConfig: restConfig, OptionalLabelAffinity: "app=" + agnhostName, + }, + Steps: []flow.Steper{execTraffic, validateReq, validateResp}, + } + deleteAgnhost := &k8s.DeleteKubernetesResource{ + ResourceType: k8s.TypeString(k8s.StatefulSet), ResourceName: agnhostName, ResourceNamespace: namespace, RestConfig: restConfig, + } + + wf.Add( + flow.BatchPipe( + // Setup: provision the agnhost pod. + flow.Steps(createAgnhost).Timeout(k8s.DefaultScenarioTimeout), + // Validate: generate traffic + check metrics, retrying with backoff. + flow.Steps(validateWithPF).Retry(k8s.RetryWithBackoff), + // Cleanup: always runs, even if validation fails. + flow.Pipe(deleteAgnhost).When(flow.Always), + ), + ) + return wf +} + +// EmptyResponse is a sentinel value that gets converted to an empty string +// for metric label matching. 
+const EmptyResponse = "emptyResponse" + +// KubeServiceIP is a sentinel value that gets resolved at runtime to the +// ClusterIP of the kubernetes.default service. +const KubeServiceIP = "kubeServiceIP" + +var ( + dnsAdvRequestCountMetricName = "networkobservability_adv_dns_request_count" + dnsAdvResponseCountMetricName = "networkobservability_adv_dns_response_count" +) + +// ValidateAdvancedDNSRequestStep checks the advanced DNS request count metric +// with labels including pod IP, namespace, pod name, query info, and workload info. +type ValidateAdvancedDNSRequestStep struct { + PodNamespace string + PodName string + Query string + QueryType string + WorkloadKind string + WorkloadName string + RestConfig *rest.Config +} + +func (v *ValidateAdvancedDNSRequestStep) Do(ctx context.Context) error { + metricsEndpoint := fmt.Sprintf("http://localhost:%s/metrics", config.RetinaMetricsPort) + + podIP, err := k8s.GetPodIP(ctx, v.RestConfig, v.PodNamespace, v.PodName) + if err != nil { + return fmt.Errorf("failed to get pod IP address: %w", err) + } + + validateAdvancedDNSRequestMetrics := map[string]string{ + "ip": podIP, + "namespace": v.PodNamespace, + "podname": v.PodName, + "query": v.Query, + "query_type": v.QueryType, + "workload_kind": v.WorkloadKind, + "workload_name": v.WorkloadName, + } + + err = prom.CheckMetric(ctx, metricsEndpoint, dnsAdvRequestCountMetricName, validateAdvancedDNSRequestMetrics) + if err != nil { + return fmt.Errorf("failed to verify advance dns request metrics %s: %w", dnsAdvRequestCountMetricName, err) + } + return nil +} + +// ValidateAdvancedDNSResponseStep checks the advanced DNS response count metric +// with labels including pod IP, namespace, pod name, num_response, query info, +// response, return_code, and workload info. 
+type ValidateAdvancedDNSResponseStep struct { + PodNamespace string + NumResponse string + PodName string + Query string + QueryType string + Response string + ReturnCode string + WorkloadKind string + WorkloadName string + RestConfig *rest.Config +} + +func (v *ValidateAdvancedDNSResponseStep) Do(ctx context.Context) error { + metricsEndpoint := fmt.Sprintf("http://localhost:%s/metrics", config.RetinaMetricsPort) + + podIP, err := k8s.GetPodIP(ctx, v.RestConfig, v.PodNamespace, v.PodName) + if err != nil { + return fmt.Errorf("failed to get pod IP address: %w", err) + } + + if v.Response == EmptyResponse { + v.Response = "" + } + if v.Response == KubeServiceIP { + clientset, err := kubernetes.NewForConfig(v.RestConfig) + if err != nil { + return fmt.Errorf("failed to create kubernetes clientset: %w", err) + } + svc, err := clientset.CoreV1().Services("default").Get(ctx, "kubernetes", metav1.GetOptions{}) + if err != nil { + return fmt.Errorf("failed to get kubernetes service ClusterIP: %w", err) + } + v.Response = svc.Spec.ClusterIP + } + + validateAdvanceDNSResponseMetrics := map[string]string{ + "ip": podIP, + "namespace": v.PodNamespace, + "num_response": v.NumResponse, + "podname": v.PodName, + "query": v.Query, + "query_type": v.QueryType, + "response": v.Response, + "return_code": v.ReturnCode, + "workload_kind": v.WorkloadKind, + "workload_name": v.WorkloadName, + } + + err = prom.CheckMetric(ctx, metricsEndpoint, dnsAdvResponseCountMetricName, validateAdvanceDNSResponseMetrics) + if err != nil { + return fmt.Errorf("failed to verify advance dns response metrics %s: %w", dnsAdvResponseCountMetricName, err) + } + return nil +} diff --git a/test/e2ev3/workflows/advancedmetrics/experimental/drop.go b/test/e2ev3/workflows/advancedmetrics/experimental/drop.go new file mode 100644 index 0000000000..e02e83ce88 --- /dev/null +++ b/test/e2ev3/workflows/advancedmetrics/experimental/drop.go @@ -0,0 +1,67 @@ +// Copyright (c) Microsoft Corporation. 
+// Licensed under the MIT license. + +//go:build e2e + +package experimental + +import ( + flow "github.com/Azure/go-workflow" + "github.com/microsoft/retina/test/e2ev3/config" + k8s "github.com/microsoft/retina/test/e2ev3/pkg/kubernetes" + prom "github.com/microsoft/retina/test/e2ev3/pkg/prometheus" + "k8s.io/client-go/rest" +) + +func addAdvancedDropScenario(restConfig *rest.Config, namespace, arch string) *flow.Workflow { + wf := &flow.Workflow{DontPanic: true} + agnhostName := "agnhost-adv-drop-" + arch + podName := agnhostName + "-0" + + createNetPol := &k8s.CreateDenyAllNetworkPolicy{ + NetworkPolicyNamespace: namespace, RestConfig: restConfig, DenyAllLabelSelector: "app=" + agnhostName, + } + createAgnhost := &k8s.CreateAgnhostStatefulSet{ + AgnhostName: agnhostName, AgnhostNamespace: namespace, AgnhostArch: arch, RestConfig: restConfig, + } + execCurl := k8s.CurlExpectFail("adv-drop-curl-"+arch, &k8s.ExecInPod{ + PodName: podName, PodNamespace: namespace, + Command: "curl -s -m 5 bing.com", RestConfig: restConfig, + }) + validateDropCount := &prom.ValidateMetricStep{ + ForwardedPort: config.RetinaMetricsPort, MetricName: "networkobservability_adv_drop_count", + ValidMetrics: []map[string]string{{}}, ExpectMetric: true, PartialMatch: true, + } + validateDropBytes := &prom.ValidateMetricStep{ + ForwardedPort: config.RetinaMetricsPort, MetricName: "networkobservability_adv_drop_bytes", + ValidMetrics: []map[string]string{{}}, ExpectMetric: true, PartialMatch: true, + } + validateWithPF := &k8s.WithPortForward{ + PF: &k8s.PortForward{ + Namespace: config.KubeSystemNamespace, LabelSelector: "k8s-app=retina", + LocalPort: config.RetinaMetricsPort, RemotePort: config.RetinaMetricsPort, + Endpoint: config.MetricsEndpoint, RestConfig: restConfig, OptionalLabelAffinity: "app=" + agnhostName, + }, + Steps: []flow.Steper{validateDropCount, validateDropBytes}, + } + deleteNetPol := &k8s.DeleteKubernetesResource{ + ResourceType: k8s.TypeString(k8s.NetworkPolicy), 
ResourceName: "deny-all", + ResourceNamespace: namespace, RestConfig: restConfig, + } + deleteAgnhost := &k8s.DeleteKubernetesResource{ + ResourceType: k8s.TypeString(k8s.StatefulSet), ResourceName: agnhostName, + ResourceNamespace: namespace, RestConfig: restConfig, + } + + wf.Add( + flow.BatchPipe( + // Setup: provision resources and generate traffic. + flow.Pipe(createNetPol, createAgnhost, execCurl).Timeout(k8s.DefaultScenarioTimeout), + // Validate: retry with exponential backoff until metrics appear. + flow.Steps(validateWithPF).Retry(k8s.RetryWithBackoff), + // Cleanup: always runs, even if validation fails. + flow.Pipe(deleteNetPol, deleteAgnhost).When(flow.Always), + ), + ) + return wf +} diff --git a/test/e2ev3/workflows/advancedmetrics/experimental/forward.go b/test/e2ev3/workflows/advancedmetrics/experimental/forward.go new file mode 100644 index 0000000000..763f08ad68 --- /dev/null +++ b/test/e2ev3/workflows/advancedmetrics/experimental/forward.go @@ -0,0 +1,60 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. 
+ +//go:build e2e + +package experimental + +import ( + flow "github.com/Azure/go-workflow" + "github.com/microsoft/retina/test/e2ev3/config" + k8s "github.com/microsoft/retina/test/e2ev3/pkg/kubernetes" + prom "github.com/microsoft/retina/test/e2ev3/pkg/prometheus" + "k8s.io/client-go/rest" +) + +func addAdvancedForwardScenario(restConfig *rest.Config, namespace, arch string) *flow.Workflow { + wf := &flow.Workflow{DontPanic: true} + agnhostName := "agnhost-adv-fwd-" + arch + podName := agnhostName + "-0" + + createAgnhost := &k8s.CreateAgnhostStatefulSet{ + AgnhostName: agnhostName, AgnhostNamespace: namespace, AgnhostArch: arch, RestConfig: restConfig, + } + execCurl := &k8s.ExecInPod{ + PodName: podName, PodNamespace: namespace, + Command: "curl -s -m 5 bing.com", RestConfig: restConfig, + } + validateForwardCount := &prom.ValidateMetricStep{ + ForwardedPort: config.RetinaMetricsPort, MetricName: "networkobservability_adv_forward_count", + ValidMetrics: []map[string]string{{}}, ExpectMetric: true, PartialMatch: true, + } + validateForwardBytes := &prom.ValidateMetricStep{ + ForwardedPort: config.RetinaMetricsPort, MetricName: "networkobservability_adv_forward_bytes", + ValidMetrics: []map[string]string{{}}, ExpectMetric: true, PartialMatch: true, + } + validateWithPF := &k8s.WithPortForward{ + PF: &k8s.PortForward{ + Namespace: config.KubeSystemNamespace, LabelSelector: "k8s-app=retina", + LocalPort: config.RetinaMetricsPort, RemotePort: config.RetinaMetricsPort, + Endpoint: config.MetricsEndpoint, RestConfig: restConfig, OptionalLabelAffinity: "app=" + agnhostName, + }, + Steps: []flow.Steper{validateForwardCount, validateForwardBytes}, + } + deleteAgnhost := &k8s.DeleteKubernetesResource{ + ResourceType: k8s.TypeString(k8s.StatefulSet), ResourceName: agnhostName, + ResourceNamespace: namespace, RestConfig: restConfig, + } + + wf.Add( + flow.BatchPipe( + // Setup: provision resources and generate traffic. 
+ flow.Pipe(createAgnhost, execCurl).Timeout(k8s.DefaultScenarioTimeout), + // Validate: retry with exponential backoff until metrics appear. + flow.Steps(validateWithPF).Retry(k8s.RetryWithBackoff), + // Cleanup: always runs, even if validation fails. + flow.Pipe(deleteAgnhost).When(flow.Always), + ), + ) + return wf +} diff --git a/test/e2ev3/workflows/advancedmetrics/experimental/latency.go b/test/e2ev3/workflows/advancedmetrics/experimental/latency.go new file mode 100644 index 0000000000..2b7c8466d8 --- /dev/null +++ b/test/e2ev3/workflows/advancedmetrics/experimental/latency.go @@ -0,0 +1,41 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. + +//go:build e2e + +package experimental + +import ( + "k8s.io/client-go/rest" + flow "github.com/Azure/go-workflow" + prom "github.com/microsoft/retina/test/e2ev3/pkg/prometheus" + "github.com/microsoft/retina/test/e2ev3/config" + k8s "github.com/microsoft/retina/test/e2ev3/pkg/kubernetes" +) + +func addAPIServerLatencyScenario(restConfig *rest.Config) *flow.Workflow { + wf := &flow.Workflow{DontPanic: true} + validateLatency := &prom.ValidateMetricStep{ + ForwardedPort: config.RetinaMetricsPort, MetricName: "networkobservability_adv_node_apiserver_latency", + ValidMetrics: []map[string]string{{}}, ExpectMetric: true, PartialMatch: true, + } + validateNoResponse := &prom.ValidateMetricStep{ + ForwardedPort: config.RetinaMetricsPort, MetricName: "networkobservability_adv_node_apiserver_no_response", + ValidMetrics: []map[string]string{{}}, ExpectMetric: true, PartialMatch: true, + } + validateWithPF := &k8s.WithPortForward{ + PF: &k8s.PortForward{ + Namespace: config.KubeSystemNamespace, LabelSelector: "k8s-app=retina", + LocalPort: config.RetinaMetricsPort, RemotePort: config.RetinaMetricsPort, + Endpoint: config.MetricsEndpoint, RestConfig: restConfig, OptionalLabelAffinity: "k8s-app=retina", + }, + Steps: []flow.Steper{validateLatency, validateNoResponse}, + } + + // Validate: retry with 
exponential backoff until metrics appear. + wf.Add( + flow.Step(validateWithPF). + Retry(k8s.RetryWithBackoff), + ) + return wf +} diff --git a/test/e2ev3/workflows/advancedmetrics/experimental/tcp.go b/test/e2ev3/workflows/advancedmetrics/experimental/tcp.go new file mode 100644 index 0000000000..a5558d4e42 --- /dev/null +++ b/test/e2ev3/workflows/advancedmetrics/experimental/tcp.go @@ -0,0 +1,60 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. + +//go:build e2e + +package experimental + +import ( + flow "github.com/Azure/go-workflow" + "github.com/microsoft/retina/test/e2ev3/config" + k8s "github.com/microsoft/retina/test/e2ev3/pkg/kubernetes" + prom "github.com/microsoft/retina/test/e2ev3/pkg/prometheus" + "k8s.io/client-go/rest" +) + +func addAdvancedTCPScenario(restConfig *rest.Config, namespace, arch string) *flow.Workflow { + wf := &flow.Workflow{DontPanic: true} + agnhostName := "agnhost-adv-tcp-" + arch + podName := agnhostName + "-0" + + createAgnhost := &k8s.CreateAgnhostStatefulSet{ + AgnhostName: agnhostName, AgnhostNamespace: namespace, AgnhostArch: arch, RestConfig: restConfig, + } + execCurl := &k8s.ExecInPod{ + PodName: podName, PodNamespace: namespace, + Command: "curl -s -m 5 bing.com", RestConfig: restConfig, + } + validateTCPFlags := &prom.ValidateMetricStep{ + ForwardedPort: config.RetinaMetricsPort, MetricName: "networkobservability_adv_tcpflags_count", + ValidMetrics: []map[string]string{{}}, ExpectMetric: true, PartialMatch: true, + } + validateTCPRetrans := &prom.ValidateMetricStep{ + ForwardedPort: config.RetinaMetricsPort, MetricName: "networkobservability_adv_tcpretrans_count", + ValidMetrics: []map[string]string{{}}, ExpectMetric: true, PartialMatch: true, + } + validateWithPF := &k8s.WithPortForward{ + PF: &k8s.PortForward{ + Namespace: config.KubeSystemNamespace, LabelSelector: "k8s-app=retina", + LocalPort: config.RetinaMetricsPort, RemotePort: config.RetinaMetricsPort, + Endpoint: 
config.MetricsEndpoint, RestConfig: restConfig, OptionalLabelAffinity: "app=" + agnhostName, + }, + Steps: []flow.Steper{validateTCPFlags, validateTCPRetrans}, + } + deleteAgnhost := &k8s.DeleteKubernetesResource{ + ResourceType: k8s.TypeString(k8s.StatefulSet), ResourceName: agnhostName, + ResourceNamespace: namespace, RestConfig: restConfig, + } + + wf.Add( + flow.BatchPipe( + // Setup: provision resources and generate traffic. + flow.Pipe(createAgnhost, execCurl).Timeout(k8s.DefaultScenarioTimeout), + // Validate: retry with exponential backoff until metrics appear. + flow.Steps(validateWithPF).Retry(k8s.RetryWithBackoff), + // Cleanup: always runs, even if validation fails. + flow.Pipe(deleteAgnhost).When(flow.Always), + ), + ) + return wf +} diff --git a/test/e2ev3/workflows/advancedmetrics/experimental/workflow.go b/test/e2ev3/workflows/advancedmetrics/experimental/workflow.go new file mode 100644 index 0000000000..ad095ccda4 --- /dev/null +++ b/test/e2ev3/workflows/advancedmetrics/experimental/workflow.go @@ -0,0 +1,80 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. + +//go:build e2e + +package experimental + +import ( + "context" + + flow "github.com/Azure/go-workflow" + "github.com/microsoft/retina/test/e2ev3/config" + k8s "github.com/microsoft/retina/test/e2ev3/pkg/kubernetes" + "github.com/microsoft/retina/test/e2ev3/pkg/utils" +) + +// Workflow runs the experimental advanced metrics workflow. +type Workflow struct { + Cfg *config.E2EConfig +} + +func (w *Workflow) String() string { return "advanced-metrics-experimental" } + +func (w *Workflow) Do(ctx context.Context) error { + ctx, _ = utils.StepLogger(ctx, w) + p := w.Cfg + restConfig := p.Cluster.RestConfig() + chartPath := p.Paths.RetinaChart + valuesFilePath := p.Paths.AdvancedProfile + testPodNamespace := config.TestPodNamespace + helmCfg := &p.Helm + + // Construct steps. 
+ upgradeRetina := &k8s.UpgradeRetinaHelmChart{ + Namespace: config.KubeSystemNamespace, + ReleaseName: "retina", + KubeConfigFilePath: p.Cluster.KubeConfigPath(), + ChartPath: chartPath, + HelmDriver: helmCfg.Driver, + ValuesFile: valuesFilePath, + } + + + var scenarios []flow.Steper + for _, arch := range config.Architectures { + scenarios = append(scenarios, + addAdvancedDropScenario(restConfig, testPodNamespace, arch), + addAdvancedForwardScenario(restConfig, testPodNamespace, arch), + addAdvancedTCPScenario(restConfig, testPodNamespace, arch), + ) + } + scenarios = append(scenarios, addAPIServerLatencyScenario(restConfig)) + + ensureStable := &k8s.EnsureStableComponent{ + PodNamespace: config.KubeSystemNamespace, + LabelSelector: "k8s-app=retina", + RestConfig: restConfig, + IgnoreContainerRestart: false, + } + + debug := &k8s.DebugOnFailure{ + RestConfig: restConfig, + Namespace: config.KubeSystemNamespace, + LabelSelector: "k8s-app=retina", + } + + // Wire dependencies and register. + // Scenarios run sequentially because they share the same port-forward port. + wf := &flow.Workflow{DontPanic: true} + wf.Add(flow.Step(upgradeRetina)) + prev := flow.Steper(upgradeRetina) + for _, s := range scenarios { + wf.Add(flow.Step(s).DependsOn(prev)) + prev = s + } + wf.Add(flow.Step(ensureStable).DependsOn(prev)) + wf.Add(flow.Step(debug).DependsOn(ensureStable).When(flow.AnyFailed)) + + return wf.Do(ctx) +} diff --git a/test/e2ev3/workflows/advancedmetrics/latency.go b/test/e2ev3/workflows/advancedmetrics/latency.go new file mode 100644 index 0000000000..e705cef91e --- /dev/null +++ b/test/e2ev3/workflows/advancedmetrics/latency.go @@ -0,0 +1,57 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. 
+
+//go:build e2e
+
+package advancedmetrics
+
+import (
+	"context"
+	"fmt"
+
+	"k8s.io/client-go/rest"
+
+	flow "github.com/Azure/go-workflow"
+	"github.com/microsoft/retina/test/e2ev3/config"
+	k8s "github.com/microsoft/retina/test/e2ev3/pkg/kubernetes"
+	prom "github.com/microsoft/retina/test/e2ev3/pkg/prometheus"
+)
+
+func addLatencyScenario(restConfig *rest.Config) *flow.Workflow {
+	wf := &flow.Workflow{DontPanic: true}
+	validateLatency := &ValidateAPIServerLatencyStep{}
+	validateWithPF := &k8s.WithPortForward{
+		PF: &k8s.PortForward{
+			Namespace: config.KubeSystemNamespace, LabelSelector: "k8s-app=retina",
+			// Use the shared constants so the forwarded port always matches the
+			// port ValidateAPIServerLatencyStep scrapes (previously hard-coded "10093").
+			LocalPort: config.RetinaMetricsPort, RemotePort: config.RetinaMetricsPort, Endpoint: config.MetricsEndpoint,
+			RestConfig: restConfig, OptionalLabelAffinity: "k8s-app=retina",
+		},
+		Steps: []flow.Steper{validateLatency},
+	}
+
+	// Validate: retry with exponential backoff until metrics appear.
+	wf.Add(
+		flow.Step(validateWithPF).
+			Retry(k8s.RetryWithBackoff),
+	)
+	return wf
+}
+
+var latencyBucketMetricName = "networkobservability_adv_node_apiserver_tcp_handshake_latency"
+
+// ValidateAPIServerLatencyStep checks that the API server TCP handshake
+// latency metric is present.
+type ValidateAPIServerLatencyStep struct{}
+
+func (v *ValidateAPIServerLatencyStep) Do(ctx context.Context) error {
+	promAddress := fmt.Sprintf("http://localhost:%s/metrics", config.RetinaMetricsPort)
+
+	metric := map[string]string{}
+	err := prom.CheckMetric(ctx, promAddress, latencyBucketMetricName, metric)
+	if err != nil {
+		return fmt.Errorf("failed to verify latency metrics %s: %w", latencyBucketMetricName, err)
+	}
+	return nil
+}
diff --git a/test/e2ev3/workflows/advancedmetrics/workflow.go b/test/e2ev3/workflows/advancedmetrics/workflow.go
new file mode 100644
index 0000000000..652219b10b
--- /dev/null
+++ b/test/e2ev3/workflows/advancedmetrics/workflow.go
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT license.
+ +//go:build e2e + +package advancedmetrics + +import ( + "context" + + flow "github.com/Azure/go-workflow" + "github.com/microsoft/retina/test/e2ev3/config" + k8s "github.com/microsoft/retina/test/e2ev3/pkg/kubernetes" + "github.com/microsoft/retina/test/e2ev3/pkg/utils" +) + +// Workflow runs the advanced metrics workflow. +type Workflow struct { + Cfg *config.E2EConfig +} + +func (w *Workflow) String() string { return "advanced-metrics" } + +func (w *Workflow) Do(ctx context.Context) error { + ctx, _ = utils.StepLogger(ctx, w) + p := w.Cfg + restConfig := p.Cluster.RestConfig() + chartPath := p.Paths.RetinaChart + valuesFilePath := p.Paths.AdvancedProfile + testPodNamespace := config.TestPodNamespace + helmCfg := &p.Helm + + // Construct steps. + upgradeRetina := &k8s.UpgradeRetinaHelmChart{ + Namespace: config.KubeSystemNamespace, + ReleaseName: "retina", + KubeConfigFilePath: p.Cluster.KubeConfigPath(), + ChartPath: chartPath, + HelmDriver: helmCfg.Driver, + ValuesFile: valuesFilePath, + } + + var scenarios []flow.Steper + for _, arch := range config.Architectures { + scenarios = append(scenarios, + addAdvancedDNSScenario(restConfig, testPodNamespace, arch, + "valid", "nslookup kubernetes.default", false, + "kubernetes.default.svc.cluster.local.", "A", "StatefulSet", + "1", "kubernetes.default.svc.cluster.local.", "A", "NOERROR", KubeServiceIP, + ), + addAdvancedDNSScenario(restConfig, testPodNamespace, arch, + "nxdomain", "nslookup some.non.existent.domain.", true, + "some.non.existent.domain.", "A", "StatefulSet", + "0", "some.non.existent.domain.", "A", "NXDOMAIN", EmptyResponse, + ), + ) + } + scenarios = append(scenarios, addLatencyScenario(restConfig)) + + ensureStable := &k8s.EnsureStableComponent{ + PodNamespace: config.KubeSystemNamespace, + LabelSelector: "k8s-app=retina", + RestConfig: restConfig, + IgnoreContainerRestart: false, + } + + debug := &k8s.DebugOnFailure{ + RestConfig: restConfig, + Namespace: config.KubeSystemNamespace, + 
LabelSelector: "k8s-app=retina", + } + + // Wire dependencies and register. + // Scenarios run sequentially because they share the same port-forward port. + wf := &flow.Workflow{DontPanic: true} + wf.Add(flow.Step(upgradeRetina)) + prev := flow.Steper(upgradeRetina) + for _, s := range scenarios { + wf.Add(flow.Step(s).DependsOn(prev)) + prev = s + } + wf.Add(flow.Step(ensureStable).DependsOn(prev)) + wf.Add(flow.Step(debug).DependsOn(ensureStable).When(flow.AnyFailed)) + + return wf.Do(ctx) +} diff --git a/test/e2ev3/workflows/basicmetrics/dns.go b/test/e2ev3/workflows/basicmetrics/dns.go new file mode 100644 index 0000000000..d2d5589b83 --- /dev/null +++ b/test/e2ev3/workflows/basicmetrics/dns.go @@ -0,0 +1,118 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. + +//go:build e2e + +package basicmetrics + +import ( + "context" + "fmt" + + flow "github.com/Azure/go-workflow" + "github.com/microsoft/retina/test/e2ev3/config" + k8s "github.com/microsoft/retina/test/e2ev3/pkg/kubernetes" + prom "github.com/microsoft/retina/test/e2ev3/pkg/prometheus" + "k8s.io/client-go/rest" +) + +func addBasicDNSScenario(restConfig *rest.Config, namespace, arch, variant, command string, expectError bool) *flow.Workflow { + wf := &flow.Workflow{DontPanic: true} + agnhostName := "agnhost-dns-basic-" + variant + "-" + arch + podName := agnhostName + "-0" + + createAgnhost := &k8s.CreateAgnhostStatefulSet{ + AgnhostName: agnhostName, AgnhostNamespace: namespace, AgnhostArch: arch, RestConfig: restConfig, + } + execCmd1 := flow.Func("basic-dns-"+variant+"-1-"+arch, func(ctx context.Context) error { + err := (&k8s.ExecInPod{PodName: podName, PodNamespace: namespace, Command: command, RestConfig: restConfig}).Do(ctx) + if expectError { + return nil + } + return err + }) + execCmd2 := flow.Func("basic-dns-"+variant+"-2-"+arch, func(ctx context.Context) error { + err := (&k8s.ExecInPod{PodName: podName, PodNamespace: namespace, Command: command, RestConfig: 
restConfig}).Do(ctx) + if expectError { + return nil + } + return err + }) + validateReq := &ValidateBasicDNSRequestStep{Variant: variant + "-" + arch} + validateResp := &ValidateBasicDNSResponseStep{Variant: variant + "-" + arch} + validateWithPF := &k8s.WithPortForward{ + PF: &k8s.PortForward{ + Namespace: config.KubeSystemNamespace, LabelSelector: "k8s-app=retina", + LocalPort: config.RetinaMetricsPort, RemotePort: config.RetinaMetricsPort, + Endpoint: "metrics", RestConfig: restConfig, OptionalLabelAffinity: "app=" + agnhostName, + }, + Steps: []flow.Steper{validateReq, validateResp}, + } + deleteAgnhost := &k8s.DeleteKubernetesResource{ + ResourceType: k8s.TypeString(k8s.StatefulSet), ResourceName: agnhostName, ResourceNamespace: namespace, RestConfig: restConfig, + } + + wf.Add( + flow.BatchPipe( + flow.Pipe(createAgnhost, execCmd1, execCmd2). + Timeout(k8s.DefaultScenarioTimeout), + flow.Steps(validateWithPF). + Retry(k8s.RetryWithBackoff), + flow.Pipe(deleteAgnhost). + When(flow.Always), + ), + ) + return wf +} + +var ( + dnsBasicRequestCountMetricName = "networkobservability_dns_request_count" + dnsBasicResponseCountMetricName = "networkobservability_dns_response_count" +) + +// ValidateBasicDNSRequestStep checks that the basic DNS request count metric exists. +type ValidateBasicDNSRequestStep struct { + Variant string // distinguishes instances in the DAG (e.g. "valid-domain-amd64") +} + +func (v *ValidateBasicDNSRequestStep) Do(ctx context.Context) error { + metricsEndpoint := fmt.Sprintf("http://localhost:%s/metrics", config.RetinaMetricsPort) + + validBasicDNSRequestMetricLabels := map[string]string{} + + err := prom.CheckMetric(ctx, metricsEndpoint, dnsBasicRequestCountMetricName, validBasicDNSRequestMetricLabels) + if err != nil { + return fmt.Errorf("failed to verify basic dns request metrics %s: %w", dnsBasicRequestCountMetricName, err) + } + return nil +} + +// ValidateBasicDNSResponseStep checks that the basic DNS response count metric exists. 
+type ValidateBasicDNSResponseStep struct { + Variant string // distinguishes instances in the DAG + NumResponse string + Query string + QueryType string + ReturnCode string + Response string +} + +func (v *ValidateBasicDNSResponseStep) Do(ctx context.Context) error { + metricsEndpoint := fmt.Sprintf("http://localhost:%s/metrics", config.RetinaMetricsPort) + + if v.Response == emptyResponse { + v.Response = "" + } + + validBasicDNSResponseMetricLabels := map[string]string{} + + err := prom.CheckMetric(ctx, metricsEndpoint, dnsBasicResponseCountMetricName, validBasicDNSResponseMetricLabels) + if err != nil { + return fmt.Errorf("failed to verify basic dns response metrics %s: %w", dnsBasicResponseCountMetricName, err) + } + return nil +} + +// emptyResponse is a sentinel value that gets converted to an empty string +// for metric label matching. +const emptyResponse = "emptyResponse" diff --git a/test/e2ev3/workflows/basicmetrics/drop.go b/test/e2ev3/workflows/basicmetrics/drop.go new file mode 100644 index 0000000000..e7fc83adb5 --- /dev/null +++ b/test/e2ev3/workflows/basicmetrics/drop.go @@ -0,0 +1,103 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. 
+ +//go:build e2e + +package basicmetrics + +import ( + "context" + "fmt" + + flow "github.com/Azure/go-workflow" + "github.com/microsoft/retina/test/e2ev3/config" + k8s "github.com/microsoft/retina/test/e2ev3/pkg/kubernetes" + prom "github.com/microsoft/retina/test/e2ev3/pkg/prometheus" + "k8s.io/client-go/rest" +) + +func addDropScenario(restConfig *rest.Config, namespace, arch string) *flow.Workflow { + wf := &flow.Workflow{DontPanic: true} + agnhostName := "agnhost-drop-" + arch + podName := agnhostName + "-0" + + createNetPol := &k8s.CreateDenyAllNetworkPolicy{ + NetworkPolicyNamespace: namespace, RestConfig: restConfig, DenyAllLabelSelector: "app=" + agnhostName, + } + createAgnhost := &k8s.CreateAgnhostStatefulSet{ + AgnhostNamespace: namespace, AgnhostName: agnhostName, AgnhostArch: arch, RestConfig: restConfig, + } + execCurl1 := k8s.CurlExpectFail("drop-curl-1-"+arch, &k8s.ExecInPod{ + PodNamespace: namespace, PodName: podName, Command: "curl -s -m 5 bing.com", RestConfig: restConfig, + }) + execCurl2 := k8s.CurlExpectFail("drop-curl-2-"+arch, &k8s.ExecInPod{ + PodNamespace: namespace, PodName: podName, Command: "curl -s -m 5 bing.com", RestConfig: restConfig, + }) + validateDrop := &ValidateRetinaDropMetricStep{PortForwardedRetinaPort: "10093", Direction: "unknown", Reason: IPTableRuleDrop} + validateWithPF := &k8s.WithPortForward{ + PF: &k8s.PortForward{ + Namespace: config.KubeSystemNamespace, LabelSelector: "k8s-app=retina", + LocalPort: "10093", RemotePort: "10093", Endpoint: "metrics", + RestConfig: restConfig, OptionalLabelAffinity: "app=" + agnhostName, + }, + Steps: []flow.Steper{validateDrop}, + } + deleteNetPol := &k8s.DeleteKubernetesResource{ + ResourceType: k8s.TypeString(k8s.NetworkPolicy), ResourceName: "deny-all", ResourceNamespace: namespace, RestConfig: restConfig, + } + deleteAgnhost := &k8s.DeleteKubernetesResource{ + ResourceType: k8s.TypeString(k8s.StatefulSet), ResourceName: agnhostName, ResourceNamespace: namespace, RestConfig: 
restConfig,
+	}
+
+	wf.Add(
+		flow.BatchPipe(
+			flow.Pipe(createNetPol, createAgnhost, execCurl1, execCurl2).
+				Timeout(k8s.DefaultScenarioTimeout),
+			flow.Steps(validateWithPF).
+				Retry(k8s.RetryWithBackoff),
+			flow.Pipe(deleteNetPol, deleteAgnhost).
+				When(flow.Always),
+		),
+	)
+	return wf
+}
+
+var (
+	dropCountMetricName = "networkobservability_drop_count"
+	dropBytesMetricName = "networkobservability_drop_bytes"
+)
+
+const (
+	IPTableRuleDrop = "IPTABLE_RULE_DROP"
+
+	directionKey = "direction"
+	reasonKey = "reason"
+)
+
+// ValidateRetinaDropMetricStep checks that drop count and drop bytes metrics
+// are present with the expected direction and reason labels.
+type ValidateRetinaDropMetricStep struct {
+	PortForwardedRetinaPort string
+	Direction string
+	Reason string
+}
+
+func (v *ValidateRetinaDropMetricStep) Do(ctx context.Context) error {
+	promAddress := fmt.Sprintf("http://localhost:%s/metrics", v.PortForwardedRetinaPort)
+
+	metric := map[string]string{
+		directionKey: v.Direction,
+		reasonKey: v.Reason,
+	}
+
+	err := prom.CheckMetric(ctx, promAddress, dropCountMetricName, metric)
+	if err != nil {
+		return fmt.Errorf("failed to verify prometheus metrics %s: %w", dropCountMetricName, err)
+	}
+
+	err = prom.CheckMetric(ctx, promAddress, dropBytesMetricName, metric)
+	if err != nil {
+		return fmt.Errorf("failed to verify prometheus metrics %s: %w", dropBytesMetricName, err)
+	}
+	return nil
+}
diff --git a/test/e2ev3/workflows/basicmetrics/experimental/conntrack.go b/test/e2ev3/workflows/basicmetrics/experimental/conntrack.go
new file mode 100644
index 0000000000..7449c18f22
--- /dev/null
+++ b/test/e2ev3/workflows/basicmetrics/experimental/conntrack.go
@@ -0,0 +1,78 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT license.
+ +//go:build e2e + +package experimental + +import ( + "context" + "k8s.io/client-go/rest" + + flow "github.com/Azure/go-workflow" + "github.com/microsoft/retina/test/e2ev3/config" + k8s "github.com/microsoft/retina/test/e2ev3/pkg/kubernetes" + prom "github.com/microsoft/retina/test/e2ev3/pkg/prometheus" +) + +func addConntrackScenario(restConfig *rest.Config, namespace, arch string) *flow.Workflow { + wf := &flow.Workflow{DontPanic: true} + agnhostName := "agnhost-ct-" + arch + podName := agnhostName + "-0" + + createAgnhost := &k8s.CreateAgnhostStatefulSet{ + AgnhostName: agnhostName, AgnhostNamespace: namespace, AgnhostArch: arch, RestConfig: restConfig, + } + execCurl1 := flow.Func("ct-curl-1-"+arch, func(ctx context.Context) error { + return (&k8s.ExecInPod{PodNamespace: namespace, PodName: podName, Command: "curl -s -m 5 bing.com", RestConfig: restConfig}).Do(ctx) + }) + execCurl2 := flow.Func("ct-curl-2-"+arch, func(ctx context.Context) error { + return (&k8s.ExecInPod{PodNamespace: namespace, PodName: podName, Command: "curl -s -m 5 bing.com", RestConfig: restConfig}).Do(ctx) + }) + conntrackMetrics := []string{ + "networkobservability_conntrack_packets_tx", + "networkobservability_conntrack_packets_rx", + "networkobservability_conntrack_bytes_tx", + "networkobservability_conntrack_bytes_rx", + "networkobservability_conntrack_total_connections", + } + + validateSteps := make([]flow.Steper, 0, len(conntrackMetrics)) + for _, metric := range conntrackMetrics { + validateSteps = append(validateSteps, &prom.ValidateMetricStep{ + ForwardedPort: config.RetinaMetricsPort, + MetricName: metric, + ValidMetrics: []map[string]string{{}}, + ExpectMetric: true, + PartialMatch: true, + }) + } + + validateWithPF := &k8s.WithPortForward{ + PF: &k8s.PortForward{ + Namespace: config.KubeSystemNamespace, LabelSelector: "k8s-app=retina", + LocalPort: config.RetinaMetricsPort, RemotePort: config.RetinaMetricsPort, + Endpoint: config.MetricsEndpoint, RestConfig: restConfig, + 
OptionalLabelAffinity: "app=" + agnhostName, + }, + Steps: validateSteps, + } + deleteAgnhost := &k8s.DeleteKubernetesResource{ + ResourceType: k8s.TypeString(k8s.StatefulSet), ResourceName: agnhostName, ResourceNamespace: namespace, RestConfig: restConfig, + } + + wf.Add( + flow.BatchPipe( + // Setup: provision resources and generate traffic. + flow.Pipe(createAgnhost, execCurl1, execCurl2). + Timeout(k8s.DefaultScenarioTimeout), + // Validate: retry with exponential backoff until metrics appear. + flow.Steps(validateWithPF). + Retry(k8s.RetryWithBackoff), + // Cleanup: always runs, even if validation fails. + flow.Pipe(deleteAgnhost). + When(flow.Always), + ), + ) + return wf +} diff --git a/test/e2ev3/workflows/basicmetrics/experimental/forward.go b/test/e2ev3/workflows/basicmetrics/experimental/forward.go new file mode 100644 index 0000000000..b37e81898e --- /dev/null +++ b/test/e2ev3/workflows/basicmetrics/experimental/forward.go @@ -0,0 +1,73 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. 
+ +//go:build e2e + +package experimental + +import ( + "context" + "k8s.io/client-go/rest" + + flow "github.com/Azure/go-workflow" + "github.com/microsoft/retina/test/e2ev3/config" + k8s "github.com/microsoft/retina/test/e2ev3/pkg/kubernetes" + prom "github.com/microsoft/retina/test/e2ev3/pkg/prometheus" +) + +func addForwardScenario(restConfig *rest.Config, namespace, arch string) *flow.Workflow { + wf := &flow.Workflow{DontPanic: true} + agnhostName := "agnhost-fwd-" + arch + podName := agnhostName + "-0" + + createAgnhost := &k8s.CreateAgnhostStatefulSet{ + AgnhostName: agnhostName, AgnhostNamespace: namespace, AgnhostArch: arch, RestConfig: restConfig, + } + execCurl1 := flow.Func("fwd-curl-1-"+arch, func(ctx context.Context) error { + return (&k8s.ExecInPod{PodNamespace: namespace, PodName: podName, Command: "curl -s -m 5 bing.com", RestConfig: restConfig}).Do(ctx) + }) + execCurl2 := flow.Func("fwd-curl-2-"+arch, func(ctx context.Context) error { + return (&k8s.ExecInPod{PodNamespace: namespace, PodName: podName, Command: "curl -s -m 5 bing.com", RestConfig: restConfig}).Do(ctx) + }) + validateFwdCount := &prom.ValidateMetricStep{ + ForwardedPort: config.RetinaMetricsPort, + MetricName: "networkobservability_forward_count", + ValidMetrics: []map[string]string{{"direction": "egress"}}, + ExpectMetric: true, + PartialMatch: true, + } + validateFwdBytes := &prom.ValidateMetricStep{ + ForwardedPort: config.RetinaMetricsPort, + MetricName: "networkobservability_forward_bytes", + ValidMetrics: []map[string]string{{"direction": "egress"}}, + ExpectMetric: true, + PartialMatch: true, + } + validateWithPF := &k8s.WithPortForward{ + PF: &k8s.PortForward{ + Namespace: config.KubeSystemNamespace, LabelSelector: "k8s-app=retina", + LocalPort: config.RetinaMetricsPort, RemotePort: config.RetinaMetricsPort, + Endpoint: config.MetricsEndpoint, RestConfig: restConfig, + OptionalLabelAffinity: "app=" + agnhostName, + }, + Steps: []flow.Steper{validateFwdCount, 
validateFwdBytes}, + } + deleteAgnhost := &k8s.DeleteKubernetesResource{ + ResourceType: k8s.TypeString(k8s.StatefulSet), ResourceName: agnhostName, ResourceNamespace: namespace, RestConfig: restConfig, + } + + wf.Add( + flow.BatchPipe( + // Setup: provision resources and generate traffic. + flow.Pipe(createAgnhost, execCurl1, execCurl2). + Timeout(k8s.DefaultScenarioTimeout), + // Validate: retry with exponential backoff until metrics appear. + flow.Steps(validateWithPF). + Retry(k8s.RetryWithBackoff), + // Cleanup: always runs, even if validation fails. + flow.Pipe(deleteAgnhost). + When(flow.Always), + ), + ) + return wf +} diff --git a/test/e2ev3/workflows/basicmetrics/experimental/network_stats.go b/test/e2ev3/workflows/basicmetrics/experimental/network_stats.go new file mode 100644 index 0000000000..31ce94b2dd --- /dev/null +++ b/test/e2ev3/workflows/basicmetrics/experimental/network_stats.go @@ -0,0 +1,54 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. 
+ +//go:build e2e + +package experimental + +import ( + "k8s.io/client-go/rest" + flow "github.com/Azure/go-workflow" + prom "github.com/microsoft/retina/test/e2ev3/pkg/prometheus" + "github.com/microsoft/retina/test/e2ev3/config" + k8s "github.com/microsoft/retina/test/e2ev3/pkg/kubernetes" +) + +func addNetworkStatsScenario(restConfig *rest.Config) *flow.Workflow { + wf := &flow.Workflow{DontPanic: true} + validateIPStats := &prom.ValidateMetricStep{ + ForwardedPort: config.RetinaMetricsPort, + MetricName: "networkobservability_ip_connection_stats", + ValidMetrics: []map[string]string{{}}, + ExpectMetric: true, + PartialMatch: true, + } + validateUDPStats := &prom.ValidateMetricStep{ + ForwardedPort: config.RetinaMetricsPort, + MetricName: "networkobservability_udp_connection_stats", + ValidMetrics: []map[string]string{{}}, + ExpectMetric: true, + PartialMatch: true, + } + validateIfaceStats := &prom.ValidateMetricStep{ + ForwardedPort: config.RetinaMetricsPort, + MetricName: "networkobservability_interface_stats", + ValidMetrics: []map[string]string{{}}, + ExpectMetric: true, + PartialMatch: true, + } + validateWithPF := &k8s.WithPortForward{ + PF: &k8s.PortForward{ + Namespace: config.KubeSystemNamespace, LabelSelector: "k8s-app=retina", + LocalPort: config.RetinaMetricsPort, RemotePort: config.RetinaMetricsPort, + Endpoint: config.MetricsEndpoint, RestConfig: restConfig, + }, + Steps: []flow.Steper{validateIPStats, validateUDPStats, validateIfaceStats}, + } + + // Validate: retry with exponential backoff until metrics appear. + wf.Add( + flow.Step(validateWithPF). + Retry(k8s.RetryWithBackoff), + ) + return wf +} diff --git a/test/e2ev3/workflows/basicmetrics/experimental/node_connectivity.go b/test/e2ev3/workflows/basicmetrics/experimental/node_connectivity.go new file mode 100644 index 0000000000..2d59196186 --- /dev/null +++ b/test/e2ev3/workflows/basicmetrics/experimental/node_connectivity.go @@ -0,0 +1,47 @@ +// Copyright (c) Microsoft Corporation. 
+// Licensed under the MIT license. + +//go:build e2e + +package experimental + +import ( + "k8s.io/client-go/rest" + flow "github.com/Azure/go-workflow" + prom "github.com/microsoft/retina/test/e2ev3/pkg/prometheus" + "github.com/microsoft/retina/test/e2ev3/config" + k8s "github.com/microsoft/retina/test/e2ev3/pkg/kubernetes" +) + +func addNodeConnectivityScenario(restConfig *rest.Config) *flow.Workflow { + wf := &flow.Workflow{DontPanic: true} + validateStatus := &prom.ValidateMetricStep{ + ForwardedPort: config.RetinaMetricsPort, + MetricName: "networkobservability_node_connectivity_status", + ValidMetrics: []map[string]string{{}}, + ExpectMetric: true, + PartialMatch: true, + } + validateLatency := &prom.ValidateMetricStep{ + ForwardedPort: config.RetinaMetricsPort, + MetricName: "networkobservability_node_connectivity_latency_seconds", + ValidMetrics: []map[string]string{{}}, + ExpectMetric: true, + PartialMatch: true, + } + validateWithPF := &k8s.WithPortForward{ + PF: &k8s.PortForward{ + Namespace: config.KubeSystemNamespace, LabelSelector: "k8s-app=retina", + LocalPort: config.RetinaMetricsPort, RemotePort: config.RetinaMetricsPort, + Endpoint: config.MetricsEndpoint, RestConfig: restConfig, + }, + Steps: []flow.Steper{validateStatus, validateLatency}, + } + + // Validate: retry with exponential backoff until metrics appear. + wf.Add( + flow.Step(validateWithPF). + Retry(k8s.RetryWithBackoff), + ) + return wf +} diff --git a/test/e2ev3/workflows/basicmetrics/experimental/tcp_stats.go b/test/e2ev3/workflows/basicmetrics/experimental/tcp_stats.go new file mode 100644 index 0000000000..f39b2df43c --- /dev/null +++ b/test/e2ev3/workflows/basicmetrics/experimental/tcp_stats.go @@ -0,0 +1,82 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. 
+ +//go:build e2e + +package experimental + +import ( + flow "github.com/Azure/go-workflow" + "github.com/microsoft/retina/test/e2ev3/config" + k8s "github.com/microsoft/retina/test/e2ev3/pkg/kubernetes" + prom "github.com/microsoft/retina/test/e2ev3/pkg/prometheus" + "k8s.io/client-go/rest" +) + +func addTCPStatsScenario(restConfig *rest.Config, namespace, arch string) *flow.Workflow { + wf := &flow.Workflow{DontPanic: true} + agnhostName := "agnhost-tcpstats-" + arch + podName := agnhostName + "-0" + + createKapinger := &k8s.CreateKapingerDeployment{ + KapingerNamespace: namespace, KapingerReplicas: "1", RestConfig: restConfig, + } + createAgnhost := &k8s.CreateAgnhostStatefulSet{ + AgnhostName: agnhostName, AgnhostNamespace: namespace, AgnhostArch: arch, RestConfig: restConfig, + } + waitKapinger := &k8s.WaitPodsReady{ + RestConfig: restConfig, + Namespace: namespace, + LabelSelector: "app=kapinger", + } + execCurl1 := &k8s.ExecInPod{ + PodName: podName, PodNamespace: namespace, Command: "curl -s -m 5 kapinger:80", RestConfig: restConfig, + } + execCurl2 := &k8s.ExecInPod{ + PodName: podName, PodNamespace: namespace, Command: "curl -s -m 5 kapinger:80", RestConfig: restConfig, + } + validateConnStats := &prom.ValidateMetricStep{ + ForwardedPort: config.RetinaMetricsPort, + MetricName: "networkobservability_tcp_connection_stats", + ValidMetrics: []map[string]string{{}}, + ExpectMetric: true, + PartialMatch: true, + } + validateFlagGauges := &prom.ValidateMetricStep{ + ForwardedPort: config.RetinaMetricsPort, + MetricName: "networkobservability_tcp_flag_gauges", + ValidMetrics: []map[string]string{{"flag": config.SYN}}, + ExpectMetric: true, + PartialMatch: true, + } + validateWithPF := &k8s.WithPortForward{ + PF: &k8s.PortForward{ + Namespace: config.KubeSystemNamespace, LabelSelector: "k8s-app=retina", + LocalPort: config.RetinaMetricsPort, RemotePort: config.RetinaMetricsPort, + Endpoint: config.MetricsEndpoint, RestConfig: restConfig, + OptionalLabelAffinity: 
"app=" + agnhostName, + }, + Steps: []flow.Steper{validateConnStats, validateFlagGauges}, + } + deleteAgnhost := &k8s.DeleteKubernetesResource{ + ResourceType: k8s.TypeString(k8s.StatefulSet), ResourceName: agnhostName, ResourceNamespace: namespace, RestConfig: restConfig, + } + deleteKapinger := &k8s.DeleteKubernetesResource{ + ResourceType: k8s.TypeString(k8s.Deployment), ResourceName: "kapinger", ResourceNamespace: namespace, RestConfig: restConfig, + } + + wf.Add( + flow.BatchPipe( + // Setup: provision resources and generate traffic. + flow.Pipe(createKapinger, createAgnhost, waitKapinger, execCurl1, execCurl2). + Timeout(k8s.DefaultScenarioTimeout), + // Validate: retry with exponential backoff until metrics appear. + flow.Steps(validateWithPF). + Retry(k8s.RetryWithBackoff), + // Cleanup: always runs, even if validation fails. + flow.Pipe(deleteAgnhost, deleteKapinger). + When(flow.Always), + ), + ) + return wf +} diff --git a/test/e2ev3/workflows/basicmetrics/experimental/workflow.go b/test/e2ev3/workflows/basicmetrics/experimental/workflow.go new file mode 100644 index 0000000000..61127aaed7 --- /dev/null +++ b/test/e2ev3/workflows/basicmetrics/experimental/workflow.go @@ -0,0 +1,86 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. + +//go:build e2e + +package experimental + +import ( + "context" + + flow "github.com/Azure/go-workflow" + "github.com/microsoft/retina/test/e2ev3/config" + k8s "github.com/microsoft/retina/test/e2ev3/pkg/kubernetes" + "github.com/microsoft/retina/test/e2ev3/pkg/utils" +) + +// Workflow runs the experimental basic metrics workflow. 
+type Workflow struct { + Cfg *config.E2EConfig +} + +func (w *Workflow) String() string { return "basic-metrics-experimental" } + +func (w *Workflow) Do(ctx context.Context) error { + ctx, _ = utils.StepLogger(ctx, w) + p := w.Cfg + restConfig := p.Cluster.RestConfig() + chartPath := p.Paths.RetinaChart + testPodNamespace := config.TestPodNamespace + imgCfg := &p.Image + helmCfg := &p.Helm + + // Construct steps. + installRetina := &k8s.InstallHelmChart{ + Namespace: config.KubeSystemNamespace, + ReleaseName: "retina", + KubeConfigFilePath: p.Cluster.KubeConfigPath(), + ChartPath: chartPath, + ImageTag: imgCfg.Tag, + ImageRegistry: imgCfg.Registry, + ImageNamespace: imgCfg.Namespace, + HelmDriver: helmCfg.Driver, + ImageLoader: p.Cluster, + } + + + var scenarios []flow.Steper + for _, arch := range config.Architectures { + scenarios = append(scenarios, + addForwardScenario(restConfig, testPodNamespace, arch), + addConntrackScenario(restConfig, testPodNamespace, arch), + addTCPStatsScenario(restConfig, testPodNamespace, arch), + ) + } + scenarios = append(scenarios, + addNetworkStatsScenario(restConfig), + addNodeConnectivityScenario(restConfig), + ) + + ensureStable := &k8s.EnsureStableComponent{ + PodNamespace: config.KubeSystemNamespace, + LabelSelector: "k8s-app=retina", + RestConfig: restConfig, + IgnoreContainerRestart: false, + } + + debug := &k8s.DebugOnFailure{ + RestConfig: restConfig, + Namespace: config.KubeSystemNamespace, + LabelSelector: "k8s-app=retina", + } + + // Wire dependencies and register. + // Scenarios run sequentially because they share the same port-forward port. 
+ wf := &flow.Workflow{DontPanic: true} + wf.Add(flow.Step(installRetina)) + prev := flow.Steper(installRetina) + for _, s := range scenarios { + wf.Add(flow.Step(s).DependsOn(prev)) + prev = s + } + wf.Add(flow.Step(ensureStable).DependsOn(prev)) + wf.Add(flow.Step(debug).DependsOn(ensureStable).When(flow.AnyFailed)) + + return wf.Do(ctx) +} diff --git a/test/e2ev3/workflows/basicmetrics/hns.go b/test/e2ev3/workflows/basicmetrics/hns.go new file mode 100644 index 0000000000..4700d34641 --- /dev/null +++ b/test/e2ev3/workflows/basicmetrics/hns.go @@ -0,0 +1,100 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. + +//go:build e2e + +package basicmetrics + +import ( + "context" + "errors" + "fmt" + "time" + + "github.com/microsoft/retina/test/e2ev3/config" + k8s "github.com/microsoft/retina/test/e2ev3/pkg/kubernetes" + prom "github.com/microsoft/retina/test/e2ev3/pkg/prometheus" + "github.com/microsoft/retina/test/e2ev3/pkg/utils" + "github.com/microsoft/retina/test/retry" + v1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/client-go/kubernetes" + "k8s.io/client-go/rest" +) + +const ( + defaultRetryDelay = 5 * time.Second + defaultRetryAttempts = 5 +) + +var ( + ErrorNoWindowsPod = errors.New("no windows retina pod found") + ErrNoMetricFound = fmt.Errorf("no metric found") + + hnsMetricName = "networkobservability_windows_hns_stats" + defaultRetrier = retry.Retrier{Attempts: defaultRetryAttempts, Delay: defaultRetryDelay, ExpBackoff: true} +) + +// ValidateHNSMetricStep finds a Windows retina pod, curls the metrics endpoint +// inside it, and checks for the HNS stats metric with retry logic. 
+type ValidateHNSMetricStep struct { + RestConfig *rest.Config + RetinaDaemonSetNamespace string + RetinaDaemonSetName string +} + +func (v *ValidateHNSMetricStep) String() string { return "validate-hns-metrics" } + +func (v *ValidateHNSMetricStep) Do(ctx context.Context) error { + ctx, log := utils.StepLogger(ctx, v) + clientset, err := kubernetes.NewForConfig(v.RestConfig) + if err != nil { + return fmt.Errorf("error creating Kubernetes client: %w", err) + } + + pods, err := clientset.CoreV1().Pods(v.RetinaDaemonSetNamespace).List(ctx, metav1.ListOptions{ + LabelSelector: "k8s-app=retina", + }) + if err != nil { + return fmt.Errorf("error listing pods: %w", err) + } + + var windowsRetinaPod *v1.Pod + for i := range pods.Items { + if pods.Items[i].Spec.NodeSelector["kubernetes.io/os"] == "windows" { + windowsRetinaPod = &pods.Items[i] + } + } + if windowsRetinaPod == nil { + return ErrorNoWindowsPod + } + + labels := map[string]string{ + "direction": "win_packets_sent_count", + } + + log.Info("checking for metric", "metric", hnsMetricName, "labels", labels) + + err = defaultRetrier.Do(ctx, func() error { + output, execErr := k8s.ExecPod(ctx, clientset, v.RestConfig, windowsRetinaPod.Namespace, windowsRetinaPod.Name, fmt.Sprintf("curl -s http://localhost:%s/metrics", config.RetinaMetricsPort)) + if execErr != nil { + return fmt.Errorf("error executing command in windows retina pod: %w", execErr) + } + if len(output) == 0 { + return ErrNoMetricFound + } + + checkErr := prom.CheckMetricFromBuffer(output, hnsMetricName, labels) + if checkErr != nil { + return fmt.Errorf("failed to verify prometheus metrics: %w", checkErr) + } + + return nil + }) + if err != nil { + return err + } + + log.Info("found matching metric", "metric", hnsMetricName, "labels", labels) + return nil +} diff --git a/test/e2ev3/workflows/basicmetrics/tcp.go b/test/e2ev3/workflows/basicmetrics/tcp.go new file mode 100644 index 0000000000..114c64cb1e --- /dev/null +++ 
b/test/e2ev3/workflows/basicmetrics/tcp.go @@ -0,0 +1,126 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. + +//go:build e2e + +package basicmetrics + +import ( + "context" + "fmt" + + flow "github.com/Azure/go-workflow" + "github.com/microsoft/retina/test/e2ev3/config" + k8s "github.com/microsoft/retina/test/e2ev3/pkg/kubernetes" + prom "github.com/microsoft/retina/test/e2ev3/pkg/prometheus" + "k8s.io/client-go/rest" +) + +func addTCPScenario(restConfig *rest.Config, namespace, arch string) *flow.Workflow { + wf := &flow.Workflow{DontPanic: true} + agnhostName := "agnhost-tcp-" + arch + podName := agnhostName + "-0" + + createKapinger := &k8s.CreateKapingerDeployment{ + KapingerNamespace: namespace, KapingerReplicas: "1", RestConfig: restConfig, + } + createAgnhost := &k8s.CreateAgnhostStatefulSet{ + AgnhostName: agnhostName, AgnhostNamespace: namespace, AgnhostArch: arch, RestConfig: restConfig, + } + waitKapinger := &k8s.WaitPodsReady{ + RestConfig: restConfig, + Namespace: namespace, + LabelSelector: "app=kapinger", + } + execCurl1 := &k8s.ExecInPod{ + PodName: podName, PodNamespace: namespace, Command: "curl -s -m 5 bing.com", RestConfig: restConfig, + } + execCurl2 := &k8s.ExecInPod{ + PodName: podName, PodNamespace: namespace, Command: "curl -s -m 5 bing.com", RestConfig: restConfig, + } + validateState := &ValidateRetinaTCPStateStep{PortForwardedRetinaPort: "10093"} + validateRemote := &ValidateRetinaTCPConnectionRemoteStep{PortForwardedRetinaPort: "10093"} + validateWithPF := &k8s.WithPortForward{ + PF: &k8s.PortForward{ + Namespace: config.KubeSystemNamespace, LabelSelector: "k8s-app=retina", + LocalPort: "10093", RemotePort: "10093", Endpoint: "metrics", + RestConfig: restConfig, OptionalLabelAffinity: "app=" + agnhostName, + }, + Steps: []flow.Steper{validateState, validateRemote}, + } + deleteAgnhost := &k8s.DeleteKubernetesResource{ + ResourceType: k8s.TypeString(k8s.StatefulSet), ResourceName: agnhostName, 
ResourceNamespace: namespace, RestConfig: restConfig, + } + deleteKapinger := &k8s.DeleteKubernetesResource{ + ResourceType: k8s.TypeString(k8s.Deployment), ResourceName: "kapinger", ResourceNamespace: namespace, RestConfig: restConfig, + } + + wf.Add( + flow.BatchPipe( + flow.Pipe(createKapinger, createAgnhost, waitKapinger, execCurl1, execCurl2). + Timeout(k8s.DefaultScenarioTimeout), + flow.Steps(validateWithPF). + Retry(k8s.RetryWithBackoff), + flow.Pipe(deleteAgnhost, deleteKapinger). + When(flow.Always), + ), + ) + return wf +} + +var ( + tcpStateMetricName = "networkobservability_tcp_state" + tcpConnectionRemoteMetricName = "networkobservability_tcp_connection_remote" +) + +const ( + stateKey = "state" + + established = "ESTABLISHED" + listen = "LISTEN" + timewait = "TIME_WAIT" +) + +// ValidateRetinaTCPStateStep checks that the TCP state metric exists +// for ESTABLISHED, LISTEN, and TIME_WAIT states. +type ValidateRetinaTCPStateStep struct { + PortForwardedRetinaPort string +} + +func (v *ValidateRetinaTCPStateStep) Do(ctx context.Context) error { + promAddress := fmt.Sprintf("http://localhost:%s/metrics", v.PortForwardedRetinaPort) + + validMetrics := []map[string]string{ + {stateKey: established}, + {stateKey: listen}, + {stateKey: timewait}, + } + + for _, metric := range validMetrics { + err := prom.CheckMetric(ctx, promAddress, tcpStateMetricName, metric) + if err != nil { + return fmt.Errorf("failed to verify prometheus metrics: %w", err) + } + } + return nil +} + +// ValidateRetinaTCPConnectionRemoteStep checks the TCP connection remote metric. +// Currently performs empty validation (no specific labels checked). 
+type ValidateRetinaTCPConnectionRemoteStep struct {
+	PortForwardedRetinaPort string
+}
+
+func (v *ValidateRetinaTCPConnectionRemoteStep) Do(ctx context.Context) error {
+	promAddress := fmt.Sprintf("http://localhost:%s/metrics", v.PortForwardedRetinaPort)
+
+	validMetrics := []map[string]string{{}}
+
+	for _, metric := range validMetrics {
+		err := prom.CheckMetric(ctx, promAddress, tcpConnectionRemoteMetricName, metric)
+		if err != nil {
+			return fmt.Errorf("failed to verify prometheus metrics: %w", err)
+		}
+	}
+	return nil
+}
diff --git a/test/e2ev3/workflows/basicmetrics/workflow.go b/test/e2ev3/workflows/basicmetrics/workflow.go
new file mode 100644
index 0000000000..2a35972ff1
--- /dev/null
+++ b/test/e2ev3/workflows/basicmetrics/workflow.go
@@ -0,0 +1,93 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT license.
+
+//go:build e2e
+
+package basicmetrics
+
+import (
+	"context"
+
+	flow "github.com/Azure/go-workflow"
+	"github.com/microsoft/retina/test/e2ev3/config"
+	k8s "github.com/microsoft/retina/test/e2ev3/pkg/kubernetes"
+	"github.com/microsoft/retina/test/e2ev3/pkg/utils"
+)
+
+// Workflow runs the basic metrics workflow.
+type Workflow struct {
+	Cfg *config.E2EConfig
+}
+
+func (w *Workflow) String() string { return "basic-metrics" }
+
+func (w *Workflow) Do(ctx context.Context) error {
+	ctx, _ = utils.StepLogger(ctx, w)
+	p := w.Cfg
+	kubeConfigFilePath := p.Cluster.KubeConfigPath()
+	restConfig := p.Cluster.RestConfig()
+	chartPath := p.Paths.RetinaChart
+	testPodNamespace := config.TestPodNamespace
+	imgCfg := &p.Image
+	helmCfg := &p.Helm
+
+	// Construct steps.
+ installRetina := &k8s.InstallHelmChart{ + Namespace: config.KubeSystemNamespace, + ReleaseName: "retina", + KubeConfigFilePath: kubeConfigFilePath, + ChartPath: chartPath, + ImageTag: imgCfg.Tag, + ImageRegistry: imgCfg.Registry, + ImageNamespace: imgCfg.Namespace, + HelmDriver: helmCfg.Driver, + ImageLoader: p.Cluster, + } + + var scenarios []flow.Steper + for _, arch := range config.Architectures { + scenarios = append(scenarios, + addDropScenario(restConfig, testPodNamespace, arch), + addTCPScenario(restConfig, testPodNamespace, arch), + addBasicDNSScenario(restConfig, testPodNamespace, arch, + "valid-domain", "nslookup kubernetes.default", false), + addBasicDNSScenario(restConfig, testPodNamespace, arch, + "nxdomain", "nslookup some.non.existent.domain", true), + ) + } + + if *config.Provider != "kind" { + scenarios = append(scenarios, &ValidateHNSMetricStep{ + RestConfig: restConfig, + RetinaDaemonSetNamespace: config.KubeSystemNamespace, + RetinaDaemonSetName: "retina-agent-win", + }) + } + + ensureStable := &k8s.EnsureStableComponent{ + PodNamespace: config.KubeSystemNamespace, + LabelSelector: "k8s-app=retina", + RestConfig: restConfig, + IgnoreContainerRestart: false, + } + + debug := &k8s.DebugOnFailure{ + RestConfig: restConfig, + Namespace: config.KubeSystemNamespace, + LabelSelector: "k8s-app=retina", + } + + // Wire dependencies and register. + // Scenarios run sequentially because they share the same port-forward port. 
+ wf := &flow.Workflow{DontPanic: true} + wf.Add(flow.Step(installRetina)) + prev := flow.Steper(installRetina) + for _, s := range scenarios { + wf.Add(flow.Step(s).DependsOn(prev)) + prev = s + } + wf.Add(flow.Step(ensureStable).DependsOn(prev)) + wf.Add(flow.Step(debug).DependsOn(ensureStable).When(flow.AnyFailed)) + + return wf.Do(ctx) +} diff --git a/test/e2ev3/workflows/capture/install_plugin.go b/test/e2ev3/workflows/capture/install_plugin.go new file mode 100644 index 0000000000..9cc7b1f961 --- /dev/null +++ b/test/e2ev3/workflows/capture/install_plugin.go @@ -0,0 +1,87 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. + +//go:build e2e + +package capture + +import ( + "context" + "fmt" + "os" + "os/exec" + "path/filepath" + "strings" + + "github.com/microsoft/retina/test/e2ev3/pkg/utils" +) + +const ( + // InstallRetinaBinaryDir is the directory where the kubectl-retina binary will be installed. + InstallRetinaBinaryDir = "/tmp/retina-bin" +) + +// InstallRetinaPluginStep builds and installs the kubectl-retina plugin +// to allow e2e tests to run kubectl retina commands. 
+type InstallRetinaPluginStep struct{} + +func (i *InstallRetinaPluginStep) String() string { return "install-retina-plugin" } + +func (i *InstallRetinaPluginStep) Do(ctx context.Context) error { + ctx, log := utils.StepLogger(ctx, i) + log.Info("building kubectl-retina plugin") + + if err := os.MkdirAll(InstallRetinaBinaryDir, 0o755); err != nil { + return fmt.Errorf("failed to create binary directory: %w", err) + } + + binaryName := "kubectl-retina" + + cmd := exec.Command("git", "rev-parse", "--show-toplevel") // #nosec + output, err := cmd.Output() + if err != nil { + return fmt.Errorf("failed to detect git repository root: %w", err) + } + retinaRepoRoot := strings.TrimSpace(string(output)) + log.Info("auto-detected repository root", "path", retinaRepoRoot) + + if _, err := os.Stat(retinaRepoRoot); err != nil { + return fmt.Errorf("invalid RetinaRepoRoot path: %w", err) + } + + if _, err := os.Stat(filepath.Join(retinaRepoRoot, "cli", "main.go")); err != nil { + return fmt.Errorf("cli/main.go not found in repository root: %w", err) + } + + buildCmd := exec.Command("go", "build", "-o", + filepath.Join(InstallRetinaBinaryDir, binaryName), + filepath.Join(retinaRepoRoot, "cli", "main.go")) // #nosec + buildCmd.Dir = retinaRepoRoot + buildOutput, err := buildCmd.CombinedOutput() + if err != nil { + return fmt.Errorf("failed to build kubectl-retina: %s: %w", buildOutput, err) + } + log.Info("successfully built kubectl-retina", "output", string(buildOutput)) + + currentPath := os.Getenv("PATH") + if !strings.Contains(currentPath, InstallRetinaBinaryDir) { + newPath := fmt.Sprintf("%s:%s", InstallRetinaBinaryDir, currentPath) + if err := os.Setenv("PATH", newPath); err != nil { + return fmt.Errorf("failed to update PATH environment variable: %w", err) + } + log.Info("added directory to PATH", "dir", InstallRetinaBinaryDir) + } + + verifyCmd := exec.Command("kubectl", "plugin", "list") // #nosec + verifyOutput, err := verifyCmd.CombinedOutput() + if err != nil { + 
log.Warn("kubectl plugin list command failed", "error", err, "output", string(verifyOutput)) + } else { + log.Info("kubectl plugin list", "output", string(verifyOutput)) + if !strings.Contains(string(verifyOutput), "retina") { + log.Warn("retina plugin not found in kubectl plugin list output") + } + } + + return nil +} diff --git a/test/e2ev3/workflows/capture/validate_capture.go b/test/e2ev3/workflows/capture/validate_capture.go new file mode 100644 index 0000000000..6bbec46289 --- /dev/null +++ b/test/e2ev3/workflows/capture/validate_capture.go @@ -0,0 +1,261 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. + +//go:build e2e + +package capture + +import ( + "context" + "fmt" + "log/slog" + "os" + "os/exec" + "path/filepath" + "strings" + "time" + + captureConstants "github.com/microsoft/retina/pkg/capture/constants" + "github.com/microsoft/retina/pkg/label" + "github.com/microsoft/retina/test/e2ev3/pkg/utils" + "github.com/microsoft/retina/test/retry" + v1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/labels" + "k8s.io/client-go/kubernetes" + "k8s.io/client-go/rest" +) + +var ( + ErrNoCaptureJobsFound = fmt.Errorf("no capture jobs found") + ErrFoundNonZeroCaptureJobs = fmt.Errorf("found non-zero amount of capture jobs when expecting zero after deletion") + ErrMissingEventOnCaptureJob = fmt.Errorf("missing SuccessfulCreate or Completed event on capture job") + ErrCaptureJobFailed = fmt.Errorf("capture job failed") +) + +// ValidateCaptureStep runs the full kubectl retina capture lifecycle: +// create, verify jobs, download, validate files, and delete. 
+type ValidateCaptureStep struct { + CaptureName string + CaptureNamespace string + Duration string + KubeConfigPath string + RestConfig *rest.Config + ImageTag string + ImageRegistry string + ImageNamespace string +} + +func (v *ValidateCaptureStep) String() string { return "validate-capture" } + +func (v *ValidateCaptureStep) Do(ctx context.Context) error { + ctx, log := utils.StepLogger(ctx, v) + log.Info("running retina capture create") + + imageRegistry := v.ImageRegistry + imageNamespace := v.ImageNamespace + imageTag := v.ImageTag + + os.Setenv("KUBECONFIG", v.KubeConfigPath) //nolint:errcheck // best effort + log.Info("KUBECONFIG set", "path", os.Getenv("KUBECONFIG")) + + cmd := exec.CommandContext(ctx, "kubectl", "retina", "capture", "create", "--namespace", v.CaptureNamespace, "--name", v.CaptureName, "--duration", v.Duration, "--debug") //#nosec + cmd.Env = append(os.Environ(), "RETINA_AGENT_IMAGE="+filepath.Join(imageRegistry, imageNamespace, "retina-agent:"+imageTag)) + + output, err := cmd.CombinedOutput() + if err != nil { + return fmt.Errorf("failed to execute create capture command: %s: %w", string(output), err) + } + log.Info("create capture command completed", "output", string(output)) + + clientset, err := kubernetes.NewForConfig(v.RestConfig) + if err != nil { + return fmt.Errorf("failed to create kubernetes clientset: %w", err) + } + + retrier := retry.Retrier{Attempts: 5, Delay: 10 * time.Second, ExpBackoff: true} + err = retrier.Do(ctx, func() error { + e := v.verifyJobs(ctx, log, clientset) + if e != nil { + log.Warn("failed to verify capture jobs, retrying", "error", e) + return e + } + return nil + }) + if err != nil { + return fmt.Errorf("failed to verify capture jobs were created: %w", err) + } + + if err := v.downloadCapture(ctx, log); err != nil { + return fmt.Errorf("failed to download and validate capture files: %w", err) + } + defer func() { + outputDir := filepath.Join(".", v.CaptureName) + if err := os.RemoveAll(outputDir); err 
!= nil { + log.Warn("failed to clean up capture files", "dir", outputDir, "error", err) + } + }() + + if err := v.deleteJobs(ctx, log, clientset); err != nil { + return fmt.Errorf("failed to delete capture jobs: %w", err) + } + + return nil +} + +func (v *ValidateCaptureStep) verifyJobs(ctx context.Context, log *slog.Logger, clientset *kubernetes.Clientset) error { + captureJobSelector := &metav1.LabelSelector{ + MatchLabels: map[string]string{ + label.CaptureNameLabel: v.CaptureName, + label.AppLabel: captureConstants.CaptureAppname, + }, + } + labelSelector, err := labels.Parse(metav1.FormatLabelSelector(captureJobSelector)) + if err != nil { + return fmt.Errorf("failed to parse label selector: %w", err) + } + + jobList, err := clientset.BatchV1().Jobs(v.CaptureNamespace).List(ctx, metav1.ListOptions{ + LabelSelector: labelSelector.String(), + }) + if err != nil { + return fmt.Errorf("failed to list capture jobs: %w", err) + } + + if len(jobList.Items) == 0 { + return fmt.Errorf("with labels %s=%s and %s=%s: %w", + label.CaptureNameLabel, v.CaptureName, + label.AppLabel, captureConstants.CaptureAppname, ErrNoCaptureJobsFound) + } + + log.Info("found capture jobs", "count", len(jobList.Items)) + + for i := range jobList.Items { + for _, condition := range jobList.Items[i].Status.Conditions { + if condition.Type == "Complete" && condition.Status == "True" { + log.Info("job completed", "job", jobList.Items[i].Name) + } + if condition.Type == "Failed" && condition.Status == "True" { + return fmt.Errorf("%s: %w", jobList.Items[i].Name, ErrCaptureJobFailed) + } + } + } + + events, err := clientset.CoreV1().Events(v.CaptureNamespace).List(ctx, metav1.ListOptions{}) + if err != nil { + return fmt.Errorf("failed to list events: %w", err) + } + for i := range jobList.Items { + if err := v.checkJobEvents(jobList.Items[i].Name, events); err != nil { + return fmt.Errorf("failed to verify events for job %s: %w", jobList.Items[i].Name, err) + } + log.Info("job has required 
events", "job", jobList.Items[i].Name) + } + + return nil +} + +func (v *ValidateCaptureStep) checkJobEvents(jobName string, events *v1.EventList) error { + var created, completed bool + for i := range events.Items { + if events.Items[i].InvolvedObject.Kind == "Job" && events.Items[i].InvolvedObject.Name == jobName { + switch events.Items[i].Reason { + case "SuccessfulCreate": + created = true + case "Completed": + completed = true + } + } + } + + if !created || !completed { + return fmt.Errorf("%s: %w", jobName, ErrMissingEventOnCaptureJob) + } + + return nil +} + +func (v *ValidateCaptureStep) deleteJobs(ctx context.Context, log *slog.Logger, clientset *kubernetes.Clientset) error { + log.Info("running retina capture delete") + cmd := exec.CommandContext(ctx, "kubectl", "retina", "capture", "delete", "--namespace", v.CaptureNamespace, "--name", v.CaptureName) //#nosec + output, err := cmd.CombinedOutput() + if err != nil { + return fmt.Errorf("failed to execute delete command: %w", err) + } + log.Info("delete command completed", "output", string(output)) + + captureJobSelector := &metav1.LabelSelector{ + MatchLabels: map[string]string{ + label.CaptureNameLabel: v.CaptureName, + label.AppLabel: captureConstants.CaptureAppname, + }, + } + labelSelector, err := labels.Parse(metav1.FormatLabelSelector(captureJobSelector)) + if err != nil { + return fmt.Errorf("failed to parse label selector: %w", err) + } + + pollRetrier := retry.Retrier{Attempts: 10, Delay: 1 * time.Second, ExpBackoff: true} + err = pollRetrier.Do(ctx, func() error { + jobList, listErr := clientset.BatchV1().Jobs(v.CaptureNamespace).List(ctx, metav1.ListOptions{ + LabelSelector: labelSelector.String(), + }) + if listErr != nil { + return fmt.Errorf("failed to list jobs during delete verification: %w", listErr) + } + if len(jobList.Items) > 0 { + return ErrFoundNonZeroCaptureJobs + } + return nil + }) + if err != nil { + return err + } + + log.Info("all relevant capture jobs deleted") + return nil +} 
+ +func (v *ValidateCaptureStep) downloadCapture(ctx context.Context, log *slog.Logger) error { + log.Info("downloading capture files") + + outputDir := filepath.Join(".", v.CaptureName) + + cmd := exec.CommandContext(ctx, "kubectl", "retina", "capture", "download", "--namespace", v.CaptureNamespace, "--name", v.CaptureName) // #nosec + output, err := cmd.CombinedOutput() + if err != nil { + return fmt.Errorf("failed to execute download capture command: %s: %w", string(output), err) + } + log.Info("download capture command completed", "output", string(output)) + + files, err := os.ReadDir(outputDir) + if err != nil { + return fmt.Errorf("failed to list files in output directory %s: %w", outputDir, err) + } + + if len(files) == 0 { + return fmt.Errorf("no capture files were downloaded") + } + log.Info("downloaded capture files", "count", len(files)) + + for _, file := range files { + filePath := filepath.Join(outputDir, file.Name()) + + if !strings.HasSuffix(file.Name(), ".tar.gz") { + return fmt.Errorf("downloaded file %s does not have the expected .tar.gz extension", file.Name()) + } + + fileInfo, err := os.Stat(filePath) + if err != nil { + return fmt.Errorf("failed to get file info for %s: %w", filePath, err) + } + + if fileInfo.Size() == 0 { + return fmt.Errorf("downloaded file %s is empty", filePath) + } + + log.Info("validated file", "file", file.Name(), "size", fileInfo.Size()) + } + + return nil +} diff --git a/test/e2ev3/workflows/capture/workflow.go b/test/e2ev3/workflows/capture/workflow.go new file mode 100644 index 0000000000..a367710142 --- /dev/null +++ b/test/e2ev3/workflows/capture/workflow.go @@ -0,0 +1,50 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. 
+ +//go:build e2e + +package capture + +import ( + "context" + + flow "github.com/Azure/go-workflow" + "github.com/microsoft/retina/test/e2ev3/config" + "github.com/microsoft/retina/test/e2ev3/pkg/utils" + "k8s.io/apimachinery/pkg/util/rand" +) + +// Workflow runs the capture validation workflow. +type Workflow struct { + Cfg *config.E2EConfig +} + +func (w *Workflow) String() string { return "capture" } + +func (w *Workflow) Do(ctx context.Context) error { + ctx, _ = utils.StepLogger(ctx, w) + p := w.Cfg + kubeConfigFilePath := p.Cluster.KubeConfigPath() + testPodNamespace := "default" + imgCfg := &p.Image + + wf := new(flow.Workflow) + + captureName := "retina-capture-e2e-" + rand.String(5) + + installPlugin := &InstallRetinaPluginStep{} + validateCap := &ValidateCaptureStep{ + CaptureName: captureName, + CaptureNamespace: testPodNamespace, + Duration: "5s", + KubeConfigPath: kubeConfigFilePath, + RestConfig: p.Cluster.RestConfig(), + ImageTag: imgCfg.Tag, + ImageRegistry: imgCfg.Registry, + ImageNamespace: imgCfg.Namespace, + } + + wf.Add(flow.Pipe(installPlugin, validateCap)) + + return wf.Do(ctx) +} diff --git a/test/e2ev3/workflows/hubblemetrics/curl_pod.go b/test/e2ev3/workflows/hubblemetrics/curl_pod.go new file mode 100644 index 0000000000..9e1b5f34f6 --- /dev/null +++ b/test/e2ev3/workflows/hubblemetrics/curl_pod.go @@ -0,0 +1,44 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. + +//go:build e2e + +package hubblemetrics + +import ( + "context" + "fmt" + + k8s "github.com/microsoft/retina/test/e2ev3/pkg/kubernetes" + "k8s.io/client-go/kubernetes" + "k8s.io/client-go/rest" +) + +// CurlPodStep executes a curl command from a source pod to a destination pod +// for flow testing. It resolves the destination pod's IP and runs the command. 
+type CurlPodStep struct { + SrcPodName string + SrcPodNamespace string + DstPodName string + DstPodNamespace string + RestConfig *rest.Config +} + +func (c *CurlPodStep) Do(ctx context.Context) error { + clientset, err := kubernetes.NewForConfig(c.RestConfig) + if err != nil { + return fmt.Errorf("error creating Kubernetes client: %w", err) + } + + dstPodIP, err := k8s.GetPodIP(ctx, c.RestConfig, c.DstPodNamespace, c.DstPodName) + if err != nil { + return fmt.Errorf("error getting pod IP: %w", err) + } + + cmd := fmt.Sprintf("curl -s -m 5 %s:80", dstPodIP) + _, err = k8s.ExecPod(ctx, clientset, c.RestConfig, c.SrcPodNamespace, c.SrcPodName, cmd) + if err != nil { + return fmt.Errorf("error executing command: %w", err) + } + return nil +} diff --git a/test/e2ev3/workflows/hubblemetrics/dns.go b/test/e2ev3/workflows/hubblemetrics/dns.go new file mode 100644 index 0000000000..47c800eabf --- /dev/null +++ b/test/e2ev3/workflows/hubblemetrics/dns.go @@ -0,0 +1,77 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. 
+ +//go:build e2e + +package hubblemetrics + +import ( + flow "github.com/Azure/go-workflow" + "github.com/microsoft/retina/test/e2ev3/config" + k8s "github.com/microsoft/retina/test/e2ev3/pkg/kubernetes" + prom "github.com/microsoft/retina/test/e2ev3/pkg/prometheus" + "k8s.io/client-go/rest" +) + +func addHubbleDNSScenario(restConfig *rest.Config, arch string) *flow.Workflow { + wf := &flow.Workflow{DontPanic: true} + agnhostName := "agnhost-dns" + + createAgnhost := &k8s.CreateAgnhostStatefulSet{ + AgnhostName: agnhostName, + AgnhostNamespace: config.TestPodNamespace, + AgnhostArch: arch, + RestConfig: restConfig, + } + execNslookup := &k8s.ExecInPod{ + PodName: agnhostName + "-0", + PodNamespace: config.TestPodNamespace, + Command: "nslookup -type=a one.one.one.one", + RestConfig: restConfig, + } + validateQuery := &prom.ValidateMetricStep{ + ForwardedPort: config.HubbleMetricsPort, + MetricName: config.HubbleDNSQueryMetricName, + ValidMetrics: []map[string]string{ValidHubbleDNSQueryMetricLabels}, + ExpectMetric: true, + } + validateResponse := &prom.ValidateMetricStep{ + ForwardedPort: config.HubbleMetricsPort, + MetricName: config.HubbleDNSResponseMetricName, + ValidMetrics: []map[string]string{ValidHubbleDNSResponseMetricLabels}, + ExpectMetric: true, + } + validateWithPF := &k8s.WithPortForward{ + PF: &k8s.PortForward{ + LabelSelector: "k8s-app=retina", + LocalPort: config.HubbleMetricsPort, + RemotePort: config.HubbleMetricsPort, + Namespace: config.KubeSystemNamespace, + Endpoint: "metrics", + RestConfig: restConfig, + OptionalLabelAffinity: "app=" + agnhostName, + }, + Steps: []flow.Steper{execNslookup, validateQuery, validateResponse}, + } + deleteAgnhost := &k8s.DeleteKubernetesResource{ + ResourceType: k8s.TypeString(k8s.StatefulSet), + ResourceName: agnhostName, + ResourceNamespace: config.TestPodNamespace, + RestConfig: restConfig, + } + + wf.Add( + flow.BatchPipe( + // Setup: provision resources. + flow.Pipe(createAgnhost). 
+ Timeout(k8s.DefaultScenarioTimeout), + // Validate: generate traffic and check metrics, retry with backoff. + flow.Steps(validateWithPF). + Retry(k8s.RetryWithBackoff), + // Cleanup: always runs, even if validation fails. + flow.Pipe(deleteAgnhost). + When(flow.Always), + ), + ) + return wf +} diff --git a/test/e2ev3/workflows/hubblemetrics/dns_labels.go b/test/e2ev3/workflows/hubblemetrics/dns_labels.go new file mode 100644 index 0000000000..a2941d266b --- /dev/null +++ b/test/e2ev3/workflows/hubblemetrics/dns_labels.go @@ -0,0 +1,33 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. + +//go:build e2e + +package hubblemetrics + +import ( + "github.com/microsoft/retina/test/e2ev3/config" +) + +// Hubble DNS test fixtures: pod name and expected metric labels. +var ( + HubbleDNSPodName = "agnhost-dns-0" + + ValidHubbleDNSQueryMetricLabels = map[string]string{ + config.HubbleDestinationLabel: "", + config.HubbleSourceLabel: config.TestPodNamespace + "/" + HubbleDNSPodName, + config.HubbleIPsRetunedLabel: "0", + config.HubbleQTypesLabel: "A", + config.HubbleRCodeLabel: "", + config.HubbleQueryLabel: "one.one.one.one.", + } + + ValidHubbleDNSResponseMetricLabels = map[string]string{ + config.HubbleDestinationLabel: config.TestPodNamespace + "/" + HubbleDNSPodName, + config.HubbleSourceLabel: "", + config.HubbleIPsRetunedLabel: "2", + config.HubbleQTypesLabel: "A", + config.HubbleRCodeLabel: "No Error", + config.HubbleQueryLabel: "one.one.one.one.", + } +) diff --git a/test/e2ev3/workflows/hubblemetrics/drop.go b/test/e2ev3/workflows/hubblemetrics/drop.go new file mode 100644 index 0000000000..a62c47a14f --- /dev/null +++ b/test/e2ev3/workflows/hubblemetrics/drop.go @@ -0,0 +1,82 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. 
+ +//go:build e2e + +package hubblemetrics + +import ( + flow "github.com/Azure/go-workflow" + "github.com/microsoft/retina/test/e2ev3/config" + k8s "github.com/microsoft/retina/test/e2ev3/pkg/kubernetes" + prom "github.com/microsoft/retina/test/e2ev3/pkg/prometheus" + "k8s.io/client-go/rest" +) + +func addHubbleDropScenario(restConfig *rest.Config, arch string) *flow.Workflow { + wf := &flow.Workflow{DontPanic: true} + agnhostName := HubbleDropAgnhostName + podName := HubbleDropPodName + + createNetPol := &k8s.CreateDenyAllNetworkPolicy{ + NetworkPolicyNamespace: config.TestPodNamespace, + RestConfig: restConfig, + DenyAllLabelSelector: "app=" + agnhostName, + } + createAgnhost := &k8s.CreateAgnhostStatefulSet{ + AgnhostName: agnhostName, AgnhostNamespace: config.TestPodNamespace, + AgnhostArch: arch, RestConfig: restConfig, + } + execCurl := k8s.CurlExpectFail("hubble-drop-curl-"+arch, &k8s.ExecInPod{ + PodName: podName, PodNamespace: config.TestPodNamespace, + Command: "curl -s -m 5 bing.com", RestConfig: restConfig, + }) + validateRetinaDrop := &prom.ValidateMetricStep{ + ForwardedPort: config.RetinaMetricsPort, MetricName: config.RetinaDropMetricName, + ValidMetrics: []map[string]string{ValidRetinaDropMetricLabels}, ExpectMetric: true, + } + validateHubbleDrop := &prom.ValidateMetricStep{ + ForwardedPort: config.HubbleMetricsPort, MetricName: config.HubbleDropMetricName, + ValidMetrics: []map[string]string{ValidHubbleDropMetricLabels}, ExpectMetric: true, PartialMatch: true, + } + validateWithPF := &k8s.WithPortForward{ + PF: &k8s.PortForward{ + LabelSelector: "k8s-app=retina", LocalPort: config.RetinaMetricsPort, RemotePort: config.RetinaMetricsPort, + Namespace: config.KubeSystemNamespace, Endpoint: config.MetricsEndpoint, RestConfig: restConfig, OptionalLabelAffinity: "app=" + agnhostName, + }, + Steps: []flow.Steper{ + execCurl, + validateRetinaDrop, + &k8s.WithPortForward{ + PF: &k8s.PortForward{ + LabelSelector: "k8s-app=retina", LocalPort: 
config.HubbleMetricsPort, RemotePort: config.HubbleMetricsPort, + Namespace: config.KubeSystemNamespace, Endpoint: config.MetricsEndpoint, RestConfig: restConfig, OptionalLabelAffinity: "app=" + agnhostName, + }, + Steps: []flow.Steper{validateHubbleDrop}, + }, + }, + } + deleteNetPol := &k8s.DeleteKubernetesResource{ + ResourceType: k8s.TypeString(k8s.NetworkPolicy), ResourceName: "deny-all", + ResourceNamespace: config.TestPodNamespace, RestConfig: restConfig, + } + deleteAgnhost := &k8s.DeleteKubernetesResource{ + ResourceType: k8s.TypeString(k8s.StatefulSet), ResourceName: agnhostName, + ResourceNamespace: config.TestPodNamespace, RestConfig: restConfig, + } + + wf.Add( + flow.BatchPipe( + // Setup: provision resources. + flow.Pipe(createNetPol, createAgnhost). + Timeout(k8s.DefaultScenarioTimeout), + // Validate: generate traffic and check metrics, retry with backoff. + flow.Steps(validateWithPF). + Retry(k8s.RetryWithBackoff), + // Cleanup: always runs, even if validation fails. + flow.Pipe(deleteNetPol, deleteAgnhost). + When(flow.Always), + ), + ) + return wf +} diff --git a/test/e2ev3/workflows/hubblemetrics/drop_labels.go b/test/e2ev3/workflows/hubblemetrics/drop_labels.go new file mode 100644 index 0000000000..37449c8493 --- /dev/null +++ b/test/e2ev3/workflows/hubblemetrics/drop_labels.go @@ -0,0 +1,32 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. + +//go:build e2e + +package hubblemetrics + +import ( + "github.com/microsoft/retina/test/e2ev3/config" +) + +// Hubble drop test fixtures: pod names and expected metric labels. +var ( + HubbleDropPodName = "agnhost-drop-0" + HubbleDropAgnhostName = "agnhost-drop" + + ValidRetinaDropMetricLabels = map[string]string{ + config.RetinaReasonLabel: config.IPTableRuleDrop, + config.RetinaDirectionLabel: "unknown", + } + + // Note: When the agnhost pod (with deny-all network policy) tries to curl bing.com, + // it triggers a DNS lookup to CoreDNS. 
The network policy blocks this egress traffic, + // but Cilium/Hubble records the drop at the destination (CoreDNS) ingress side rather + // than the source (agnhost) egress side. + // We partially validate this metric. + ValidHubbleDropMetricLabels = map[string]string{ + config.HubbleSourceLabel: "", + config.HubbleProtocolLabel: config.UDP, + config.HubbleReasonLabel: "POLICY_DENIED", + } +) diff --git a/test/e2ev3/workflows/hubblemetrics/flow_inter.go b/test/e2ev3/workflows/hubblemetrics/flow_inter.go new file mode 100644 index 0000000000..b3b80afc1e --- /dev/null +++ b/test/e2ev3/workflows/hubblemetrics/flow_inter.go @@ -0,0 +1,93 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. + +//go:build e2e + +package hubblemetrics + +import ( + flow "github.com/Azure/go-workflow" + "github.com/microsoft/retina/test/e2ev3/config" + k8s "github.com/microsoft/retina/test/e2ev3/pkg/kubernetes" + prom "github.com/microsoft/retina/test/e2ev3/pkg/prometheus" + "k8s.io/client-go/rest" +) + +func addHubbleFlowInterNodeScenario(restConfig *rest.Config, arch string) *flow.Workflow { + wf := &flow.Workflow{DontPanic: true} + podnameSrc := "agnhost-flow-inter-src" + podnameDst := "agnhost-flow-inter-dst" + validSrcLabels := []map[string]string{ + {"source": config.TestPodNamespace + "/" + podnameSrc + "-0", "destination": "", "protocol": config.TCP, "subtype": "to-stack", "type": "Trace", "verdict": "FORWARDED"}, + {"source": config.TestPodNamespace + "/" + podnameDst + "-0", "destination": "", "protocol": config.TCP, "subtype": "to-endpoint", "type": "Trace", "verdict": "FORWARDED"}, + } + // Validate from dst pod's perspective using source-based labels. + // With sourceEgressContext=pod, flow metrics always populate 'source' with the local pod + // and leave 'destination' empty — so we check dst-0 appears as source for both directions. 
+ validDstLabels := []map[string]string{ + {"source": config.TestPodNamespace + "/" + podnameDst + "-0", "destination": "", "protocol": config.TCP, "subtype": "to-stack", "type": "Trace", "verdict": "FORWARDED"}, + {"source": config.TestPodNamespace + "/" + podnameDst + "-0", "destination": "", "protocol": config.TCP, "subtype": "to-endpoint", "type": "Trace", "verdict": "FORWARDED"}, + } + + createSrc := &k8s.CreateAgnhostStatefulSet{ + AgnhostName: podnameSrc, AgnhostNamespace: config.TestPodNamespace, + AgnhostArch: arch, RestConfig: restConfig, + } + createDst := &k8s.CreateAgnhostStatefulSet{ + AgnhostName: podnameDst, AgnhostNamespace: config.TestPodNamespace, + AgnhostArch: arch, RestConfig: restConfig, + } + curlPod := &CurlPodStep{ + SrcPodName: podnameSrc + "-0", SrcPodNamespace: config.TestPodNamespace, + DstPodName: podnameDst + "-0", DstPodNamespace: config.TestPodNamespace, + RestConfig: restConfig, + } + validateSrc := &prom.ValidateMetricStep{ + ForwardedPort: config.HubbleMetricsPort, MetricName: config.HubbleFlowMetricName, + ValidMetrics: validSrcLabels, ExpectMetric: true, + } + validateDst := &prom.ValidateMetricStep{ + ForwardedPort: "9966", MetricName: config.HubbleFlowMetricName, + ValidMetrics: validDstLabels, ExpectMetric: true, + } + validateWithPF := &k8s.WithPortForward{ + PF: &k8s.PortForward{ + LabelSelector: "k8s-app=retina", LocalPort: config.HubbleMetricsPort, RemotePort: config.HubbleMetricsPort, + Namespace: config.KubeSystemNamespace, Endpoint: config.MetricsEndpoint, RestConfig: restConfig, OptionalLabelAffinity: "app=" + podnameSrc, + }, + Steps: []flow.Steper{ + curlPod, + validateSrc, + &k8s.WithPortForward{ + PF: &k8s.PortForward{ + LabelSelector: "k8s-app=retina", LocalPort: "9966", RemotePort: config.HubbleMetricsPort, + Namespace: config.KubeSystemNamespace, Endpoint: config.MetricsEndpoint, RestConfig: restConfig, OptionalLabelAffinity: "app=" + podnameDst, + }, + Steps: []flow.Steper{validateDst}, + }, + }, + } + 
deleteSrc := &k8s.DeleteKubernetesResource{ + ResourceType: k8s.TypeString(k8s.StatefulSet), ResourceName: podnameSrc, + ResourceNamespace: config.TestPodNamespace, RestConfig: restConfig, + } + deleteDst := &k8s.DeleteKubernetesResource{ + ResourceType: k8s.TypeString(k8s.StatefulSet), ResourceName: podnameDst, + ResourceNamespace: config.TestPodNamespace, RestConfig: restConfig, + } + + wf.Add( + flow.BatchPipe( + // Setup: provision resources. + flow.Pipe(createSrc, createDst). + Timeout(k8s.DefaultScenarioTimeout), + // Validate: generate traffic and check metrics, retry with backoff. + flow.Steps(validateWithPF). + Retry(k8s.RetryWithBackoff), + // Cleanup: always runs, even if validation fails. + flow.Pipe(deleteSrc, deleteDst). + When(flow.Always), + ), + ) + return wf +} diff --git a/test/e2ev3/workflows/hubblemetrics/flow_intra.go b/test/e2ev3/workflows/hubblemetrics/flow_intra.go new file mode 100644 index 0000000000..328591cf78 --- /dev/null +++ b/test/e2ev3/workflows/hubblemetrics/flow_intra.go @@ -0,0 +1,68 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. 
+ +//go:build e2e + +package hubblemetrics + +import ( + "k8s.io/client-go/rest" + + flow "github.com/Azure/go-workflow" + "github.com/microsoft/retina/test/e2ev3/config" + k8s "github.com/microsoft/retina/test/e2ev3/pkg/kubernetes" + prom "github.com/microsoft/retina/test/e2ev3/pkg/prometheus" +) + +func addHubbleFlowIntraNodeScenario(restConfig *rest.Config, arch string) *flow.Workflow { + wf := &flow.Workflow{DontPanic: true} + podname := "agnhost-flow-intra" + replicas := 2 + validLabels := []map[string]string{ + {"source": config.TestPodNamespace + "/" + podname + "-0", "destination": "", "protocol": config.TCP, "subtype": "to-stack", "type": "Trace", "verdict": "FORWARDED"}, + {"source": config.TestPodNamespace + "/" + podname + "-0", "destination": "", "protocol": config.TCP, "subtype": "to-endpoint", "type": "Trace", "verdict": "FORWARDED"}, + {"source": config.TestPodNamespace + "/" + podname + "-1", "destination": "", "protocol": config.TCP, "subtype": "to-stack", "type": "Trace", "verdict": "FORWARDED"}, + {"source": config.TestPodNamespace + "/" + podname + "-1", "destination": "", "protocol": config.TCP, "subtype": "to-endpoint", "type": "Trace", "verdict": "FORWARDED"}, + } + + createAgnhost := &k8s.CreateAgnhostStatefulSet{ + AgnhostName: podname, AgnhostNamespace: config.TestPodNamespace, + ScheduleOnSameNode: true, AgnhostReplicas: &replicas, + AgnhostArch: arch, RestConfig: restConfig, + } + curlPod := &CurlPodStep{ + SrcPodName: podname + "-0", SrcPodNamespace: config.TestPodNamespace, + DstPodName: podname + "-1", DstPodNamespace: config.TestPodNamespace, + RestConfig: restConfig, + } + validateFlow := &prom.ValidateMetricStep{ + ForwardedPort: config.HubbleMetricsPort, MetricName: config.HubbleFlowMetricName, + ValidMetrics: validLabels, ExpectMetric: true, + } + validateWithPF := &k8s.WithPortForward{ + PF: &k8s.PortForward{ + LabelSelector: "k8s-app=retina", LocalPort: config.HubbleMetricsPort, RemotePort: config.HubbleMetricsPort, + 
Namespace: config.KubeSystemNamespace, Endpoint: config.MetricsEndpoint, RestConfig: restConfig, OptionalLabelAffinity: "app=" + podname, + }, + Steps: []flow.Steper{curlPod, validateFlow}, + } + deleteAgnhost := &k8s.DeleteKubernetesResource{ + ResourceType: k8s.TypeString(k8s.StatefulSet), ResourceName: podname, + ResourceNamespace: config.TestPodNamespace, RestConfig: restConfig, + } + + wf.Add( + flow.BatchPipe( + // Setup: provision resources. + flow.Pipe(createAgnhost). + Timeout(k8s.DefaultScenarioTimeout), + // Validate: generate traffic and check metrics, retry with backoff. + flow.Steps(validateWithPF). + Retry(k8s.RetryWithBackoff), + // Cleanup: always runs, even if validation fails. + flow.Pipe(deleteAgnhost). + When(flow.Always), + ), + ) + return wf +} diff --git a/test/e2ev3/workflows/hubblemetrics/flow_world.go b/test/e2ev3/workflows/hubblemetrics/flow_world.go new file mode 100644 index 0000000000..cd6a8ded7b --- /dev/null +++ b/test/e2ev3/workflows/hubblemetrics/flow_world.go @@ -0,0 +1,62 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. 
+ +//go:build e2e + +package hubblemetrics + +import ( + flow "github.com/Azure/go-workflow" + "github.com/microsoft/retina/test/e2ev3/config" + k8s "github.com/microsoft/retina/test/e2ev3/pkg/kubernetes" + prom "github.com/microsoft/retina/test/e2ev3/pkg/prometheus" + "k8s.io/client-go/rest" +) + +func addHubbleFlowToWorldScenario(restConfig *rest.Config, arch string) *flow.Workflow { + wf := &flow.Workflow{DontPanic: true} + podname := "agnhost-flow-world" + validLabels := []map[string]string{ + {"source": config.TestPodNamespace + "/" + podname + "-0", "destination": "", "protocol": config.TCP, "subtype": "to-stack", "type": "Trace", "verdict": "FORWARDED"}, + {"source": config.TestPodNamespace + "/" + podname + "-0", "destination": "", "protocol": config.UDP, "subtype": "to-stack", "type": "Trace", "verdict": "FORWARDED"}, + } + + createAgnhost := &k8s.CreateAgnhostStatefulSet{ + AgnhostName: podname, AgnhostNamespace: config.TestPodNamespace, + AgnhostArch: arch, RestConfig: restConfig, + } + execCurl := &k8s.ExecInPod{ + PodName: podname + "-0", PodNamespace: config.TestPodNamespace, + Command: "curl -s -m 5 bing.com", RestConfig: restConfig, + } + validateFlow := &prom.ValidateMetricStep{ + ForwardedPort: config.HubbleMetricsPort, MetricName: config.HubbleFlowMetricName, + ValidMetrics: validLabels, ExpectMetric: true, + } + validateWithPF := &k8s.WithPortForward{ + PF: &k8s.PortForward{ + LabelSelector: "k8s-app=retina", LocalPort: config.HubbleMetricsPort, RemotePort: config.HubbleMetricsPort, + Namespace: config.KubeSystemNamespace, Endpoint: config.MetricsEndpoint, RestConfig: restConfig, OptionalLabelAffinity: "app=" + podname, + }, + Steps: []flow.Steper{execCurl, validateFlow}, + } + deleteAgnhost := &k8s.DeleteKubernetesResource{ + ResourceType: k8s.TypeString(k8s.StatefulSet), ResourceName: podname, + ResourceNamespace: config.TestPodNamespace, RestConfig: restConfig, + } + + wf.Add( + flow.BatchPipe( + // Setup: provision resources. 
+ flow.Pipe(createAgnhost). + Timeout(k8s.DefaultScenarioTimeout), + // Validate: generate traffic and check metrics, retry with backoff. + flow.Steps(validateWithPF). + Retry(k8s.RetryWithBackoff), + // Cleanup: always runs, even if validation fails. + flow.Pipe(deleteAgnhost). + When(flow.Always), + ), + ) + return wf +} diff --git a/test/e2ev3/workflows/hubblemetrics/service.go b/test/e2ev3/workflows/hubblemetrics/service.go new file mode 100644 index 0000000000..a0ce348431 --- /dev/null +++ b/test/e2ev3/workflows/hubblemetrics/service.go @@ -0,0 +1,119 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. + +//go:build e2e + +package hubblemetrics + +import ( + "context" + "fmt" + "net/http" + "time" + + "k8s.io/client-go/rest" + + flow "github.com/Azure/go-workflow" + k8s "github.com/microsoft/retina/test/e2ev3/pkg/kubernetes" + "github.com/microsoft/retina/test/e2ev3/pkg/utils" +) + +func addHubbleRelayValidation(restConfig *rest.Config) *flow.Workflow { + wf := &flow.Workflow{DontPanic: true} + validateRelay := &ValidateHubbleRelayServiceStep{RestConfig: restConfig} + wf.Add(flow.Step(validateRelay)) + return wf +} + +func addHubbleUIValidation(restConfig *rest.Config) *flow.Workflow { + wf := &flow.Workflow{DontPanic: true} + validateUI := &ValidateHubbleUIServiceStep{RestConfig: restConfig} + wf.Add(flow.Step(validateUI)) + return wf +} + + + +// ValidateHubbleRelayServiceStep validates that the hubble-relay-service +// exists in the cluster. 
+type ValidateHubbleRelayServiceStep struct { + RestConfig *rest.Config +} + +func (v *ValidateHubbleRelayServiceStep) String() string { return "validate-hubble-relay-service" } + +func (v *ValidateHubbleRelayServiceStep) Do(ctx context.Context) error { + step := &k8s.ValidateResource{ + ResourceName: "hubble-relay-service", + ResourceNamespace: k8s.HubbleNamespace, + ResourceType: k8s.ResourceTypeService, + Labels: "k8s-app=" + k8s.HubbleRelayApp, + RestConfig: v.RestConfig, + } + return step.Do(ctx) +} + +// ValidateHubbleUIServiceStep validates that the hubble-ui service exists +// and that it responds with HTTP 200. +type ValidateHubbleUIServiceStep struct { + RestConfig *rest.Config +} + +func (v *ValidateHubbleUIServiceStep) String() string { return "validate-hubble-ui-service" } + +func (v *ValidateHubbleUIServiceStep) Do(ctx context.Context) error { + ctx, log := utils.StepLogger(ctx, v) + validateStep := &k8s.ValidateResource{ + ResourceName: k8s.HubbleUIApp, + ResourceNamespace: k8s.HubbleNamespace, + ResourceType: k8s.ResourceTypeService, + Labels: "k8s-app=" + k8s.HubbleUIApp, + RestConfig: v.RestConfig, + } + if err := validateStep.Do(ctx); err != nil { + return fmt.Errorf("failed to validate hubble-ui service: %w", err) + } + + // Port forward and validate HTTP response + pf := &k8s.PortForward{ + LabelSelector: "k8s-app=hubble-ui", + LocalPort: "8080", + RemotePort: "8081", + OptionalLabelAffinity: "k8s-app=hubble-ui", + Endpoint: "?namespace=default", + RestConfig: v.RestConfig, + } + if err := pf.Do(ctx); err != nil { + return fmt.Errorf("failed to port forward to hubble-ui: %w", err) + } + defer pf.Stop() //nolint:errcheck // best effort cleanup + + httpStep := &k8s.ValidateHTTPResponse{ + URL: "http://localhost:8080", + ExpectedStatus: http.StatusOK, + } + if err := httpStep.Do(ctx); err != nil { + return fmt.Errorf("failed to validate hubble-ui HTTP response: %w", err) + } + + log.Info("Hubble UI service validation succeeded") + return nil +} + 
+const hubbleUIRequestTimeout = 30 * time.Second
+
+// ValidateHTTPResponseStep wraps the old ValidateHTTPResponse step.
+type ValidateHTTPResponseStep struct {
+	URL            string
+	ExpectedStatus int
+}
+
+func (v *ValidateHTTPResponseStep) String() string { return "validate-http-response" }
+
+func (v *ValidateHTTPResponseStep) Do(ctx context.Context) error {
+	step := &k8s.ValidateHTTPResponse{
+		URL:            v.URL,
+		ExpectedStatus: v.ExpectedStatus,
+	}
+	return step.Do(ctx)
+}
diff --git a/test/e2ev3/workflows/hubblemetrics/tcp.go b/test/e2ev3/workflows/hubblemetrics/tcp.go
new file mode 100644
index 0000000000..01e81d1c49
--- /dev/null
+++ b/test/e2ev3/workflows/hubblemetrics/tcp.go
@@ -0,0 +1,67 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT license.
+
+//go:build e2e
+
+package hubblemetrics
+
+import (
+	flow "github.com/Azure/go-workflow"
+	"github.com/microsoft/retina/test/e2ev3/config"
+	k8s "github.com/microsoft/retina/test/e2ev3/pkg/kubernetes"
+	prom "github.com/microsoft/retina/test/e2ev3/pkg/prometheus"
+	"k8s.io/client-go/rest"
+)
+
+// addHubbleTCPScenario builds a per-architecture workflow that deploys an
+// agnhost pod, generates TCP traffic with curl, and validates the Hubble
+// TCP-flags metric before deleting the pod.
+func addHubbleTCPScenario(restConfig *rest.Config, arch string) *flow.Workflow {
+	wf := &flow.Workflow{DontPanic: true}
+	agnhostName := "agnhost-tcp"
+	podName := agnhostName + "-0"
+
+	// Traffic source: a StatefulSet whose first pod is <agnhostName>-0.
+	createAgnhost := &k8s.CreateAgnhostStatefulSet{
+		AgnhostName: agnhostName, AgnhostNamespace: config.TestPodNamespace,
+		AgnhostArch: arch, RestConfig: restConfig,
+	}
+	// Outbound curl from the pod generates the TCP traffic being measured.
+	execCurl := &k8s.ExecInPod{
+		PodName: podName, PodNamespace: config.TestPodNamespace,
+		Command: "curl -s -m 5 bing.com", RestConfig: restConfig,
+	}
+	validateTCP := &prom.ValidateMetricStep{
+		ForwardedPort: config.HubbleMetricsPort, MetricName: config.HubbleTCPFlagsMetricName,
+		ValidMetrics: ValidHubbleTCPMetricsLabels, ExpectMetric: true,
+	}
+	// Forward to a retina agent; OptionalLabelAffinity prefers the agent
+	// co-located with the agnhost pod — presumably so its traffic is scraped.
+	validateWithPF := &k8s.WithPortForward{
+		PF: &k8s.PortForward{
+			LabelSelector: "k8s-app=retina", LocalPort: config.HubbleMetricsPort, RemotePort: config.HubbleMetricsPort,
+			Namespace: config.KubeSystemNamespace, Endpoint: config.MetricsEndpoint,
+			RestConfig: restConfig, OptionalLabelAffinity: "app=" + agnhostName,
+		},
+		Steps: []flow.Steper{execCurl, validateTCP},
+	}
+	deleteAgnhost := &k8s.DeleteKubernetesResource{
+		ResourceType: k8s.TypeString(k8s.StatefulSet), ResourceName: agnhostName,
+		ResourceNamespace: config.TestPodNamespace, RestConfig: restConfig,
+	}
+
+	wf.Add(
+		flow.BatchPipe(
+			// Setup: provision resources.
+			flow.Pipe(createAgnhost).
+				Timeout(k8s.DefaultScenarioTimeout),
+			// Validate: generate traffic and check metrics, retry with backoff.
+			flow.Steps(validateWithPF).
+				Retry(k8s.RetryWithBackoff),
+			// Cleanup: always runs, even if validation fails.
+			flow.Pipe(deleteAgnhost).
+				When(flow.Always),
+		),
+	)
+	return wf
+}
diff --git a/test/e2ev3/workflows/hubblemetrics/tcp_labels.go b/test/e2ev3/workflows/hubblemetrics/tcp_labels.go
new file mode 100644
index 0000000000..be15bbeb9b
--- /dev/null
+++ b/test/e2ev3/workflows/hubblemetrics/tcp_labels.go
@@ -0,0 +1,34 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT license.
+
+//go:build e2e
+
+package hubblemetrics
+
+import (
+	"github.com/microsoft/retina/test/e2ev3/config"
+)
+
+// Hubble TCP test fixtures: pod name and expected metric labels.
+var (
+	HubbleTCPPodName = "agnhost-tcp-0"
+
+	ValidHubbleTCPSYNFlag = map[string]string{
+		config.HubbleSourceLabel:      config.TestPodNamespace + "/" + HubbleTCPPodName,
+		config.HubbleDestinationLabel: "",
+		config.HubbleFamilyLabel:      config.IPV4,
+		config.HubbleFlagLabel:        config.SYN,
+	}
+
+	ValidHubbleTCPFINFlag = map[string]string{
+		config.HubbleSourceLabel:      config.TestPodNamespace + "/" + HubbleTCPPodName,
+		config.HubbleDestinationLabel: "",
+		config.HubbleFamilyLabel:      config.IPV4,
+		config.HubbleFlagLabel:        config.FIN,
+	}
+
+	ValidHubbleTCPMetricsLabels = []map[string]string{
+		ValidHubbleTCPSYNFlag,
+		ValidHubbleTCPFINFlag,
+	}
+)
diff --git a/test/e2ev3/workflows/hubblemetrics/workflow.go b/test/e2ev3/workflows/hubblemetrics/workflow.go
new file mode 100644
index 0000000000..029c1c7490
--- /dev/null
+++ b/test/e2ev3/workflows/hubblemetrics/workflow.go
@@ -0,0 +1,90 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT license.
+
+//go:build e2e
+
+package hubblemetrics
+
+import (
+	"context"
+
+	flow "github.com/Azure/go-workflow"
+	"github.com/microsoft/retina/test/e2ev3/config"
+	k8s "github.com/microsoft/retina/test/e2ev3/pkg/kubernetes"
+	"github.com/microsoft/retina/test/e2ev3/pkg/utils"
+)
+
+// Workflow runs the hubble metrics workflow.
+type Workflow struct {
+	Cfg *config.E2EConfig
+}
+
+// String returns the workflow name used in step logs.
+func (w *Workflow) String() string { return "hubble-metrics" }
+
+// Do installs the Hubble-enabled Retina Helm chart, runs each metrics
+// scenario in sequence, verifies the retina agents stayed stable, and
+// collects debug information if any step failed.
+func (w *Workflow) Do(ctx context.Context) error {
+	ctx, _ = utils.StepLogger(ctx, w)
+	p := w.Cfg
+	restConfig := p.Cluster.RestConfig()
+	chartPath := p.Paths.HubbleChart
+	imgCfg := &p.Image
+	helmCfg := &p.Helm
+
+	// Construct steps.
+	installHubble := &k8s.InstallHubbleHelmChart{
+		Namespace:          config.KubeSystemNamespace,
+		ReleaseName:        "retina",
+		KubeConfigFilePath: p.Cluster.KubeConfigPath(),
+		ChartPath:          chartPath,
+		ImageTag:           imgCfg.Tag,
+		ImageRegistry:      imgCfg.Registry,
+		ImageNamespace:     imgCfg.Namespace,
+		HelmDriver:         helmCfg.Driver,
+		ImageLoader:        p.Cluster,
+	}
+
+	scenarios := []flow.Steper{
+		addHubbleRelayValidation(restConfig),
+		addHubbleUIValidation(restConfig),
+	}
+	for _, arch := range config.Architectures {
+		scenarios = append(scenarios,
+			addHubbleDNSScenario(restConfig, arch),
+			addHubbleFlowIntraNodeScenario(restConfig, arch),
+			addHubbleFlowInterNodeScenario(restConfig, arch),
+			addHubbleFlowToWorldScenario(restConfig, arch),
+			addHubbleDropScenario(restConfig, arch),
+			addHubbleTCPScenario(restConfig, arch),
+		)
+	}
+
+	ensureStable := &k8s.EnsureStableComponent{
+		PodNamespace:           config.KubeSystemNamespace,
+		LabelSelector:          "k8s-app=retina",
+		RestConfig:             restConfig,
+		IgnoreContainerRestart: false,
+	}
+
+	debug := &k8s.DebugOnFailure{
+		RestConfig:    restConfig,
+		Namespace:     config.KubeSystemNamespace,
+		LabelSelector: "k8s-app=retina",
+	}
+
+	// Wire dependencies and register.
+	// Scenarios run sequentially because they share the same port-forward port.
+	wf := &flow.Workflow{DontPanic: true}
+	wf.Add(flow.Step(installHubble))
+	prev := flow.Steper(installHubble)
+	for _, s := range scenarios {
+		wf.Add(flow.Step(s).DependsOn(prev))
+		prev = s
+	}
+	wf.Add(flow.Step(ensureStable).DependsOn(prev))
+	wf.Add(flow.Step(debug).DependsOn(ensureStable).When(flow.AnyFailed))
+
+	return wf.Do(ctx)
+}
diff --git a/test/profiles/advanced/values.yaml b/test/profiles/advanced/values.yaml
index 5c07e6eaaf..bbc361c241 100644
--- a/test/profiles/advanced/values.yaml
+++ b/test/profiles/advanced/values.yaml
@@ -1,6 +1,7 @@
 enablePodLevel: true
 enableAnnotations: true
 packetParserRingBuffer: "enabled"
+enabledPlugin_linux: '["dropreason","packetforward","linuxutil","dns","packetparser"]'
 operator:
   enabled: true
   enableRetinaEndpoint: true