diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index a2cfb4f2..4d8caf98 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -46,6 +46,7 @@ jobs: helm repo add prometheus-community https://prometheus-community.github.io/helm-charts helm repo add strimzi https://strimzi.io/charts/ helm repo add seaweedfs https://seaweedfs.github.io/seaweedfs/helm + helm repo add open-telemetry https://open-telemetry.github.io/opentelemetry-helm-charts - name: Run chart-releaser uses: helm/chart-releaser-action@cae68fefc6b5f367a0275617c9f83181ba54714f diff --git a/.gitignore b/.gitignore index cc8f86f3..4cda4743 100644 --- a/.gitignore +++ b/.gitignore @@ -4,3 +4,7 @@ charts/mlrun-ce/charts/* **/.DS_Store *.DS_Store **/__pycache__ +# Packaged chart tarballs (generated by make package) +charts/mlrun-ce/mlrun-ce-*.tgz +# MLRun project directories created by test scripts +otlp-pro/ diff --git a/charts/mlrun-ce/Chart.yaml b/charts/mlrun-ce/Chart.yaml index b91431ec..0edfd0d5 100644 --- a/charts/mlrun-ce/Chart.yaml +++ b/charts/mlrun-ce/Chart.yaml @@ -1,6 +1,6 @@ apiVersion: v1 name: mlrun-ce -version: 0.11.0-rc.31 +version: 0.11.0-rc.32 description: MLRun Open Source Stack home: https://iguazio.com icon: https://www.iguazio.com/wp-content/uploads/2019/10/Iguazio-Logo.png diff --git a/charts/mlrun-ce/README.md b/charts/mlrun-ce/README.md index e3707b48..99bbf31b 100644 --- a/charts/mlrun-ce/README.md +++ b/charts/mlrun-ce/README.md @@ -14,6 +14,7 @@ The Open source MLRun ce chart includes the following stack: * Spark Operator - https://github.com/GoogleCloudPlatform/spark-on-k8s-operator * Pipelines - https://github.com/kubeflow/pipelines * Prometheus stack - https://github.com/prometheus-community/helm-charts +* OpenTelemetry Operator - https://github.com/open-telemetry/opentelemetry-operator (observability) ## Prerequisites @@ -64,6 +65,33 @@ helm --namespace mlrun \ mlrun/mlrun-ce ``` +### Installing with OpenTelemetry Enabled + 
+> **Note:** OpenTelemetry is **disabled by default**. Follow the standard [Installing the Chart](#installing-the-chart) steps, adding the OTel flags below. + +To install with OpenTelemetry enabled, append the following flags to the helm install command: + +```bash +helm --namespace mlrun \ + install my-mlrun \ + --wait \ + --set global.registry.url=<registry-url> \ + --set global.registry.secretName=registry-credentials \ + --set opentelemetry-operator.enabled=true \ + --set opentelemetry.namespaceLabel.enabled=true \ + --set opentelemetry.collector.enabled=true \ + --set opentelemetry.instrumentation.enabled=true \ + mlrun/mlrun-ce +``` + +To verify the OpenTelemetry resources were created: + +```bash +kubectl -n mlrun get opentelemetrycollectors +kubectl -n mlrun get instrumentations +kubectl -n mlrun get pods | grep opentelemetry +``` + ### Installing MLRun-ce on minikube The Open source MLRun ce uses node ports for simplicity. If your kubernetes cluster is running inside a VM, @@ -89,6 +117,25 @@ following values: Additional configurable values are documented in the `values.yaml`, and the `values.yaml` of all sub charts. Override those [in the normal methods](https://helm.sh/docs/chart_template_guide/values_files/). +### Configuring OpenTelemetry (Observability) + +MLRun CE includes the OpenTelemetry Operator for collecting metrics and traces. When enabled, it deploys a single collector per namespace (deployment mode) — instrumented pods push OTLP data to the collector, which forwards metrics to Prometheus via the OTLP endpoint. Python auto-instrumentation is applied namespace-wide via a webhook, and the `mlrun.io/otel: "true"` label is applied to Jupyter and Nuclio function pods to mark them for metric enrichment and trigger OTel injection on restart. + +For a fresh install with OTel, see [Installing with OpenTelemetry Enabled](#installing-with-opentelemetry-enabled). 
+ +To enable OTel on an existing installation: + +```bash +helm --namespace mlrun upgrade my-mlrun \ + --set opentelemetry-operator.enabled=true \ + --set opentelemetry.namespaceLabel.enabled=true \ + --set opentelemetry.collector.enabled=true \ + --set opentelemetry.instrumentation.enabled=true \ + mlrun/mlrun-ce +``` + +> **Note:** The above assumes a single-namespace installation. For multi-namespace (admin/non-admin) deployments, refer to the MLRun documentation. + ### Working with ECR To work with ECR, you must create a secret with your AWS credentials and a secret with ECR Token while providing both secret names to the helm install command. @@ -282,6 +329,6 @@ Refer to the [**Kubeflow documentation**](https://www.kubeflow.org/docs/started/ This table shows the versions of the main components in the MLRun CE chart: -| MLRun CE | MLRun | Nuclio | Jupyter | MPI Operator | SeaweedFS | Spark Operator | Pipelines | Kube-Prometheus-Stack | -|------------|--------|--------|---------|--------------|-----------|----------------|-----------|-----------------------| -| **0.11.0** | 1.11.0 | 1.15.9 | 4.5.0 | 0.2.3 | 4.17.0 | 2.1.0 | 2.15.0 | 72.1.1 | +| MLRun CE | MLRun | Nuclio | Jupyter | MPI Operator | SeaweedFS | Spark Operator | Pipelines | Kube-Prometheus-Stack | OpenTelemetry Operator | +|------------|--------|--------|---------|--------------|-----------|----------------|-----------|-----------------------|------------------------| +| **0.11.0** | 1.11.0 | 1.15.9 | 4.5.0 | 0.2.3 | 4.17.0 | 2.1.0 | 2.15.0 | 72.1.1 | 0.78.1 | diff --git a/charts/mlrun-ce/admin_installation_values.yaml b/charts/mlrun-ce/admin_installation_values.yaml index 56bca94f..66e87b09 100644 --- a/charts/mlrun-ce/admin_installation_values.yaml +++ b/charts/mlrun-ce/admin_installation_values.yaml @@ -77,3 +77,28 @@ strimzi-kafka-operator: kafka: enabled: false + +# OpenTelemetry Operator - enabled for CRD installation at cluster level +opentelemetry-operator: + enabled: true + 
+ admissionWebhooks: + certManager: + enabled: false + autoGenerateCert: + enabled: true + # Only apply webhooks to namespaces with the opentelemetry label + namespaceSelector: + matchLabels: + opentelemetry.io/inject: "enabled" + +# OpenTelemetry CRs - disabled at admin level, enabled in user namespaces +# Note: Controller namespace does NOT need the opentelemetry label since +# no workloads are instrumented here - only the operator runs here +opentelemetry: + namespaceLabel: + enabled: false + collector: + enabled: false + instrumentation: + enabled: false + diff --git a/charts/mlrun-ce/non_admin_cluster_ip_installation_values.yaml b/charts/mlrun-ce/non_admin_cluster_ip_installation_values.yaml index 6def2bf5..110cec6a 100644 --- a/charts/mlrun-ce/non_admin_cluster_ip_installation_values.yaml +++ b/charts/mlrun-ce/non_admin_cluster_ip_installation_values.yaml @@ -96,3 +96,19 @@ kafka: kube-prometheus-stack: enabled: false + +# OpenTelemetry Operator - disabled, CRDs installed at controller level +opentelemetry-operator: + enabled: false + +# OpenTelemetry CRs - enabled for user namespace +# The namespace will be labeled with opentelemetry.io/inject=enabled +# so the operator webhook can inject Python auto-instrumentation into pods +opentelemetry: + namespaceLabel: + enabled: true + collector: + enabled: true + instrumentation: + enabled: true + diff --git a/charts/mlrun-ce/non_admin_installation_values.yaml b/charts/mlrun-ce/non_admin_installation_values.yaml index afdf4647..2d32c68c 100644 --- a/charts/mlrun-ce/non_admin_installation_values.yaml +++ b/charts/mlrun-ce/non_admin_installation_values.yaml @@ -82,3 +82,19 @@ kafka: kube-prometheus-stack: enabled: false + +# OpenTelemetry Operator - disabled, CRDs installed at controller level +opentelemetry-operator: + enabled: false + +# OpenTelemetry CRs - enabled for user namespace +# The namespace will be labeled and annotated for OTel deployment-mode collection +# and namespace-wide Python auto-instrumentation. 
+opentelemetry: + namespaceLabel: + enabled: true + collector: + enabled: true + instrumentation: + enabled: true + diff --git a/charts/mlrun-ce/requirements.lock b/charts/mlrun-ce/requirements.lock index 5da41a95..adda8fc5 100644 --- a/charts/mlrun-ce/requirements.lock +++ b/charts/mlrun-ce/requirements.lock @@ -20,5 +20,8 @@ dependencies: - name: strimzi-kafka-operator repository: https://strimzi.io/charts/ version: 0.48.0 -digest: sha256:e2b2d1b7531c4829aa25c8ce8d95506642ab59d0cb692a343d2e508a71525374 -generated: "2026-03-31T17:13:31.403112322Z" +- name: opentelemetry-operator + repository: https://open-telemetry.github.io/opentelemetry-helm-charts + version: 0.78.1 +digest: sha256:50ed77fd11e450e243c05eadac99857b4b0aae92ae73ca9a6c00fc1cdc726f70 +generated: "2026-04-15T11:23:19.249332+03:00" diff --git a/charts/mlrun-ce/requirements.yaml b/charts/mlrun-ce/requirements.yaml index 5af2481b..469ab547 100644 --- a/charts/mlrun-ce/requirements.yaml +++ b/charts/mlrun-ce/requirements.yaml @@ -25,3 +25,7 @@ dependencies: repository: "https://strimzi.io/charts/" version: "0.48.0" condition: strimzi-kafka-operator.enabled + - name: opentelemetry-operator + repository: "https://open-telemetry.github.io/opentelemetry-helm-charts" + version: "0.78.1" + condition: opentelemetry-operator.enabled diff --git a/charts/mlrun-ce/templates/NOTES.txt b/charts/mlrun-ce/templates/NOTES.txt index c23894a2..6ce2f2bd 100644 --- a/charts/mlrun-ce/templates/NOTES.txt +++ b/charts/mlrun-ce/templates/NOTES.txt @@ -127,5 +127,44 @@ TimescaleDB is available at: {{- end }} {{- end }} +{{- if index .Values "opentelemetry-operator" "enabled" }} +{{- "\n" }} +OpenTelemetry Operator is enabled! +- Operator manages OpenTelemetryCollector and Instrumentation CRs +- Namespace selector: opentelemetry.io/inject=enabled +{{- if .Values.opentelemetry.collector.enabled }} +{{- "\n" }} +OpenTelemetry Collector (deployment mode): +- Collector CR: {{ include "mlrun-ce.otel.collector.fullname" . 
}} +- OTLP gRPC endpoint: {{ include "mlrun-ce.otel.collector.fullname" . }}-collector:{{ .Values.opentelemetry.collector.otlp.grpcPort }} +- OTLP HTTP endpoint: {{ include "mlrun-ce.otel.collector.fullname" . }}-collector:{{ .Values.opentelemetry.collector.otlp.httpPort }} +- Metrics export: collector pushes via OTLP to Prometheus at /api/v1/otlp/v1/metrics +{{- end }} +{{- if .Values.opentelemetry.instrumentation.enabled }} +{{- "\n" }} +OpenTelemetry Auto-Instrumentation: +- Instrumentation CR: {{ include "mlrun-ce.otel.instrumentation.fullname" . }} +{{- if .Values.opentelemetry.instrumentation.python.enabled }} +- Python auto-instrumentation: enabled (namespace-wide via namespace annotation) +{{- end }} +{{- if .Values.opentelemetry.instrumentation.java.enabled }} +- Java auto-instrumentation: enabled +{{- end }} +{{- end }} +{{- if .Values.opentelemetry.namespaceLabel.enabled }} +{{- "\n" }} +Namespace OTel configuration: +- Label: {{ .Values.opentelemetry.namespaceLabel.key }}={{ .Values.opentelemetry.namespaceLabel.value }} +{{- if .Values.opentelemetry.instrumentation.enabled }} +- Python instrumentation annotation applied to all pods in namespace {{ .Release.Namespace }} +{{- end }} +{{- end }} +{{- if or .Values.opentelemetry.collector.enabled .Values.opentelemetry.instrumentation.enabled }} +{{- "\n" }} +Pods labeled with mlrun.io/otel=true: Jupyter and Nuclio function pods (via functionDefaults). +These Python-based pods receive OTel auto-instrumentation (runtime metrics, traces, HTTP metrics for Nuclio functions). +{{- end }} +{{- end }} + Happy MLOPSing!!! 
:] {{- end }} diff --git a/charts/mlrun-ce/templates/_helpers.tpl b/charts/mlrun-ce/templates/_helpers.tpl index 7bb154bb..d207c879 100644 --- a/charts/mlrun-ce/templates/_helpers.tpl +++ b/charts/mlrun-ce/templates/_helpers.tpl @@ -413,3 +413,214 @@ TimescaleDB connection string for MLRun model monitoring postgresql://{{ .Values.timescaledb.auth.username | urlquery }}:{{ .Values.timescaledb.auth.password | urlquery }}@{{ include "mlrun-ce.timescaledb.fullname" . }}:{{ .Values.timescaledb.service.port }}/{{ .Values.timescaledb.auth.database }} {{- end }} +{{/* +============================================================================= +OpenTelemetry helpers +============================================================================= +*/}} + +{{/* +OpenTelemetry Collector name +*/}} +{{- define "mlrun-ce.otel.collector.name" -}} +{{- default "otel" .Values.opentelemetry.collector.nameOverride | trunc 63 | trimSuffix "-" }} +{{- end }} + +{{/* +OpenTelemetry Collector fullname +*/}} +{{- define "mlrun-ce.otel.collector.fullname" -}} +{{- if .Values.opentelemetry.collector.fullnameOverride }} +{{- .Values.opentelemetry.collector.fullnameOverride | trunc 63 | trimSuffix "-" }} +{{- else }} +{{- $name := default "otel" .Values.opentelemetry.collector.nameOverride }} +{{- if contains $name .Release.Name }} +{{- .Release.Name | trunc 63 | trimSuffix "-" }} +{{- else }} +{{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" }} +{{- end }} +{{- end }} +{{- end }} + +{{/* +OpenTelemetry Instrumentation name +*/}} +{{- define "mlrun-ce.otel.instrumentation.name" -}} +{{- default "otel-instrumentation" .Values.opentelemetry.instrumentation.nameOverride | trunc 63 | trimSuffix "-" }} +{{- end }} + +{{/* +OpenTelemetry Instrumentation fullname +*/}} +{{- define "mlrun-ce.otel.instrumentation.fullname" -}} +{{- if .Values.opentelemetry.instrumentation.fullnameOverride }} +{{- .Values.opentelemetry.instrumentation.fullnameOverride | trunc 63 | trimSuffix "-" }} 
+{{- else }} +{{- $name := default "otel-instrumentation" .Values.opentelemetry.instrumentation.nameOverride }} +{{- if contains $name .Release.Name }} +{{- .Release.Name | trunc 63 | trimSuffix "-" }} +{{- else }} +{{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" }} +{{- end }} +{{- end }} +{{- end }} + +{{/* +OpenTelemetry common labels +*/}} +{{- define "mlrun-ce.otel.labels" -}} +{{ include "mlrun-ce.common.labels" . }} +{{ include "mlrun-ce.otel.selectorLabels" . }} +{{- end }} + +{{/* +OpenTelemetry selector labels +*/}} +{{- define "mlrun-ce.otel.selectorLabels" -}} +{{ include "mlrun-ce.common.selectorLabels" . }} +app.kubernetes.io/component: opentelemetry +{{- end }} + +{{/* +OpenTelemetryCollector CR manifest for use in the CRD readiness job +*/}} +{{- define "mlrun-ce.otel.collector.manifest" -}} +apiVersion: opentelemetry.io/v1beta1 +kind: OpenTelemetryCollector +metadata: + name: {{ include "mlrun-ce.otel.collector.fullname" . }} + namespace: {{ .Release.Namespace }} + labels: + {{- include "mlrun-ce.otel.labels" . 
| nindent 4 }} +spec: + mode: {{ .Values.opentelemetry.collector.mode }} + upgradeStrategy: {{ .Values.opentelemetry.collector.upgradeStrategy }} + managementState: managed + image: {{ (index .Values "opentelemetry-operator").manager.collectorImage.repository }}:{{ (index .Values "opentelemetry-operator").manager.collectorImage.tag }} + resources: + {{- toYaml .Values.opentelemetry.collector.resources | nindent 4 }} + config: + receivers: + otlp: + protocols: + grpc: + endpoint: 0.0.0.0:{{ .Values.opentelemetry.collector.otlp.grpcPort }} + http: + endpoint: 0.0.0.0:{{ .Values.opentelemetry.collector.otlp.httpPort }} + processors: + batch: + send_batch_size: 10000 + timeout: 10s + memory_limiter: + check_interval: 1s + limit_percentage: 80 + spike_limit_percentage: 25 + resourcedetection: + detectors: + - env + - system + timeout: 5s + override: false + exporters: + otlphttp/prometheus: + endpoint: http://prometheus-operated.{{ .Release.Namespace }}.svc:9090/api/v1/otlp + tls: + insecure: true + debug: + verbosity: basic + sampling_initial: 5 + sampling_thereafter: 200 + extensions: + health_check: + endpoint: 0.0.0.0:13133 + service: + extensions: + - health_check + pipelines: + metrics: + receivers: + - otlp + processors: + - memory_limiter + - resourcedetection + - batch + exporters: + - otlphttp/prometheus + - debug + traces: + receivers: + - otlp + processors: + - memory_limiter + - resourcedetection + - batch + exporters: + - debug + telemetry: + logs: + level: info + metrics: + address: 0.0.0.0:8888 +{{- end }} + +{{/* +Instrumentation CR manifest for use in the CRD readiness job +*/}} +{{- define "mlrun-ce.otel.instrumentation.manifest" -}} +apiVersion: opentelemetry.io/v1alpha1 +kind: Instrumentation +metadata: + name: {{ include "mlrun-ce.otel.instrumentation.fullname" . }} + namespace: {{ .Release.Namespace }} + labels: + {{- include "mlrun-ce.otel.labels" . 
| nindent 4 }} +spec: + exporter: + endpoint: http://{{ include "mlrun-ce.otel.collector.fullname" . }}-collector:{{ .Values.opentelemetry.collector.otlp.httpPort }} + propagators: + {{- toYaml .Values.opentelemetry.instrumentation.propagators | nindent 4 }} + sampler: + type: {{ .Values.opentelemetry.instrumentation.sampler.type }} + argument: {{ .Values.opentelemetry.instrumentation.sampler.argument | quote }} + env: + - name: OTEL_SERVICE_NAME + valueFrom: + fieldRef: + fieldPath: metadata.name + - name: OTEL_METRICS_EXPORTER + value: otlp + - name: OTEL_TRACES_EXPORTER + value: otlp + - name: OTEL_LOGS_EXPORTER + value: none + {{- if .Values.opentelemetry.instrumentation.python.enabled }} + python: + image: {{ .Values.opentelemetry.instrumentation.python.image.repository }}:{{ .Values.opentelemetry.instrumentation.python.image.tag }} + resourceRequirements: + {{- toYaml .Values.opentelemetry.instrumentation.python.resources | nindent 6 }} + env: + - name: OTEL_PYTHON_LOG_CORRELATION + value: "true" + - name: OTEL_PYTHON_LOGGING_AUTO_INSTRUMENTATION_ENABLED + value: "false" + - name: OTEL_PYTHON_DISABLED_INSTRUMENTATIONS + value: "aws_lambda" + {{- end }} + {{- if .Values.opentelemetry.instrumentation.java.enabled }} + java: + image: {{ .Values.opentelemetry.instrumentation.java.image.repository }}:{{ .Values.opentelemetry.instrumentation.java.image.tag }} + resourceRequirements: + {{- toYaml .Values.opentelemetry.instrumentation.java.resources | nindent 6 }} + env: + - name: OTEL_INSTRUMENTATION_COMMON_DEFAULT_ENABLED + value: "true" + {{- end }} +{{- end }} +{{/* +OTel pod label — marks a pod as OTel-monitored for metric enrichment and discovery. +Namespace-level instrumentation annotation (set by namespace-label job) handles Python auto-instrumentation. 
+Wrap usage with: {{- if and .Values.opentelemetry.collector.enabled .Values.opentelemetry.instrumentation.enabled }} +*/}} +{{- define "mlrun-ce.otel.podLabels" -}} +mlrun.io/otel: "true" +{{- end }} diff --git a/charts/mlrun-ce/templates/jupyter-notebook/deployment.yaml b/charts/mlrun-ce/templates/jupyter-notebook/deployment.yaml index 6e5374f2..aa1e43dd 100644 --- a/charts/mlrun-ce/templates/jupyter-notebook/deployment.yaml +++ b/charts/mlrun-ce/templates/jupyter-notebook/deployment.yaml @@ -5,6 +5,9 @@ metadata: name: {{ include "mlrun-ce.jupyter.fullname" . }} labels: {{- include "mlrun-ce.jupyter.labels" . | nindent 4 }} + {{- if and .Values.opentelemetry.collector.enabled .Values.opentelemetry.instrumentation.enabled }} + {{- include "mlrun-ce.otel.podLabels" . | nindent 4 }} + {{- end }} spec: replicas: 1 selector: @@ -14,6 +17,9 @@ spec: metadata: labels: {{- include "mlrun-ce.jupyter.selectorLabels" . | nindent 8 }} + {{- if and .Values.opentelemetry.collector.enabled .Values.opentelemetry.instrumentation.enabled }} + {{- include "mlrun-ce.otel.podLabels" . | nindent 8 }} + {{- end }} spec: {{- with .Values.jupyterNotebook.image.pullSecrets }} imagePullSecrets: @@ -71,7 +77,36 @@ spec: ports: - containerPort: 8888 name: http + {{- if and .Values.opentelemetry.collector.enabled .Values.opentelemetry.instrumentation.enabled }} + command: + - /bin/bash + - -c + - | + # Extract tar if needed (mirrors mlce-start.sh setup) + file_path="${HOME}/.intdata" + if [ ! -f "$file_path" ]; then + echo "Base data does not exist, extracting home backup..." + cd / && tar -xvf /tmp/basehome.tar + echo "1" > "$file_path" + fi + cd "${HOME}" + # Add OTel sitecustomize.py to PYTHONPATH so Jupyter's own Python + # process bootstraps OTel directly (bypassing start-notebook.py which + # strips the path to prevent re-instrumentation of subprocesses). 
+ OTEL_PATH=/otel-auto-instrumentation-python/opentelemetry/instrumentation/auto_instrumentation + if [ -d "$OTEL_PATH" ]; then + export PYTHONPATH="${OTEL_PATH}:${PYTHONPATH:-/otel-auto-instrumentation-python}" + fi + exec /opt/conda/bin/python3 /opt/conda/bin/jupyter-lab \ + --ip=0.0.0.0 \ + --port=8888 \ + --NotebookApp.token="" \ + --NotebookApp.password="" \ + --NotebookApp.allow_origin="*" \ + --NotebookApp.default_url=/lab + {{- else }} command: ["/bin/bash", "/usr/local/bin/mlce-start.sh"] + {{- end }} {{- with .Values.jupyterNotebook.nodeSelector }} nodeSelector: {{ toYaml . | nindent 8 }} {{- end }} diff --git a/charts/mlrun-ce/templates/opentelemetry/collector.yaml b/charts/mlrun-ce/templates/opentelemetry/collector.yaml new file mode 100644 index 00000000..e1dd53ee --- /dev/null +++ b/charts/mlrun-ce/templates/opentelemetry/collector.yaml @@ -0,0 +1,14 @@ +{{/* +OpenTelemetryCollector CR is created by the crd-readiness-job.yaml via a Helm post-install/post-upgrade hook. +This solves the race condition between the OpenTelemetry Operator starting up and registering its CRDs, +and Helm trying to create this CR. + +The actual CR manifest is defined in _helpers.tpl as "mlrun-ce.otel.collector.manifest" and is applied +by the job after it confirms the CRD is available. + +To see the CR configuration, check: +- templates/_helpers.tpl: defines the manifest +- templates/opentelemetry/crd-readiness-job.yaml: creates the CR +- values.yaml: opentelemetry.collector.* settings +*/}} + diff --git a/charts/mlrun-ce/templates/opentelemetry/crd-readiness-job.yaml b/charts/mlrun-ce/templates/opentelemetry/crd-readiness-job.yaml new file mode 100644 index 00000000..35c6b566 --- /dev/null +++ b/charts/mlrun-ce/templates/opentelemetry/crd-readiness-job.yaml @@ -0,0 +1,89 @@ +{{- if or .Values.opentelemetry.collector.enabled .Values.opentelemetry.instrumentation.enabled }} +--- +# Job to create OpenTelemetry CRs after the operator webhook is ready to accept them. 
+# Retries kubectl apply until the webhook accepts the CR (fresh install timing fix). +apiVersion: batch/v1 +kind: Job +metadata: + name: {{ .Release.Name }}-otel-cr-creator + namespace: {{ .Release.Namespace }} + labels: + {{- include "mlrun-ce.otel.labels" . | nindent 4 }} + annotations: + # Run as a post-install and post-upgrade hook + "helm.sh/hook": post-install,post-upgrade + "helm.sh/hook-weight": "10" + "helm.sh/hook-delete-policy": before-hook-creation,hook-succeeded + "helm.sh/hook-timeout": "300s" +spec: + ttlSecondsAfterFinished: 300 + backoffLimit: 6 + template: + metadata: + labels: + {{- include "mlrun-ce.otel.selectorLabels" . | nindent 8 }} + spec: + restartPolicy: OnFailure + serviceAccountName: {{ .Release.Name }}-otel-cr-creator + containers: + - name: cr-creator + image: bitnami/kubectl:1.32 + resources: + requests: + cpu: {{ .Values.opentelemetry.crdReadinessJob.resources.requests.cpu }} + memory: {{ .Values.opentelemetry.crdReadinessJob.resources.requests.memory }} + limits: + cpu: {{ .Values.opentelemetry.crdReadinessJob.resources.limits.cpu }} + memory: {{ .Values.opentelemetry.crdReadinessJob.resources.limits.memory }} + command: + - /bin/bash + - -c + - | + set -e + + # CRDs are guaranteed to exist — Helm applies charts/mlrun-ce/crds/ before + # any hooks run, so no polling needed here. + + {{- if .Values.opentelemetry.collector.enabled }} + # Write the OpenTelemetryCollector CR to a temp file and retry applying it + # until the operator webhook is ready to accept it (fresh install timing fix). + cat > /tmp/collector-cr.yaml <<'EOF' + {{- include "mlrun-ce.otel.collector.manifest" . | nindent 14 }} + EOF + echo "Creating/updating OpenTelemetryCollector CR (with webhook readiness retry)..." + until kubectl apply -f /tmp/collector-cr.yaml 2>&1; do + echo "Webhook not ready yet, retrying in 5s..." + sleep 5 + done + echo "OpenTelemetryCollector CR created/updated!" 
+ {{- end }} + + {{- if .Values.opentelemetry.instrumentation.enabled }} + # Create or update the Instrumentation CR + echo "Creating/updating Instrumentation CR..." + cat <<'EOF' | kubectl apply -f - + {{- include "mlrun-ce.otel.instrumentation.manifest" . | nindent 14 }} + EOF + echo "Instrumentation CR created/updated!" + {{- end }} + + echo "All OpenTelemetry CRs have been created successfully!" + + # Wait for the operator webhook to be fully ready before restarting pods + echo "Waiting for OpenTelemetry operator webhook to be ready..." + until kubectl -n {{ .Release.Namespace }} rollout status deployment \ + -l app.kubernetes.io/name=opentelemetry-operator --timeout=60s &>/dev/null; do + sleep 5 + done + + # Restart pods labeled mlrun.io/otel=true so they go through the webhook + # and receive OTel auto-instrumentation injection. + # This solves the race condition where pods start before the webhook is ready. + echo "Restarting instrumented deployments and statefulsets..." + kubectl -n {{ .Release.Namespace }} rollout restart deployment \ + -l mlrun.io/otel=true 2>/dev/null || true + kubectl -n {{ .Release.Namespace }} rollout restart statefulset \ + -l mlrun.io/otel=true 2>/dev/null || true + echo "Rollout restarts triggered — pods will be re-created with OTel injection." +{{- end }} + diff --git a/charts/mlrun-ce/templates/opentelemetry/instrumentation.yaml b/charts/mlrun-ce/templates/opentelemetry/instrumentation.yaml new file mode 100644 index 00000000..b79b9198 --- /dev/null +++ b/charts/mlrun-ce/templates/opentelemetry/instrumentation.yaml @@ -0,0 +1,14 @@ +{{/* +Instrumentation CR is created by the crd-readiness-job.yaml via a Helm post-install/post-upgrade hook. +This solves the race condition between the OpenTelemetry Operator starting up and registering its CRDs, +and Helm trying to create this CR. 
+ +The actual CR manifest is defined in _helpers.tpl as "mlrun-ce.otel.instrumentation.manifest" and is applied +by the job after it confirms the CRD is available. + +To see the CR configuration, check: +- templates/_helpers.tpl: defines the manifest +- templates/opentelemetry/crd-readiness-job.yaml: creates the CR +- values.yaml: opentelemetry.instrumentation.* settings +*/}} + diff --git a/charts/mlrun-ce/templates/opentelemetry/namespace-label.yaml b/charts/mlrun-ce/templates/opentelemetry/namespace-label.yaml new file mode 100644 index 00000000..ee8bb5c7 --- /dev/null +++ b/charts/mlrun-ce/templates/opentelemetry/namespace-label.yaml @@ -0,0 +1,47 @@ +{{- if and (or .Values.opentelemetry.collector.enabled .Values.opentelemetry.instrumentation.enabled) .Values.opentelemetry.namespaceLabel.enabled -}} +# This template uses a pre-install/pre-upgrade hook to label the namespace +# for OpenTelemetry operator webhook injection, avoiding Helm ownership conflicts +apiVersion: batch/v1 +kind: Job +metadata: + name: {{ .Release.Name }}-namespace-label + namespace: {{ .Release.Namespace }} + labels: +{{ include "mlrun-ce.otel.labels" . 
| indent 4 }} + annotations: + "helm.sh/hook": pre-install,pre-upgrade + "helm.sh/hook-weight": "-10" + "helm.sh/hook-delete-policy": before-hook-creation,hook-succeeded + "helm.sh/hook-timeout": "120s" +spec: + ttlSecondsAfterFinished: 60 + template: + metadata: + name: {{ .Release.Name }}-namespace-label + spec: + serviceAccountName: {{ .Release.Name }}-otel-cr-creator + restartPolicy: Never + containers: + - name: label-namespace + image: bitnami/kubectl:1.32 + resources: + requests: + cpu: {{ .Values.opentelemetry.namespaceLabelJob.resources.requests.cpu }} + memory: {{ .Values.opentelemetry.namespaceLabelJob.resources.requests.memory }} + limits: + cpu: {{ .Values.opentelemetry.namespaceLabelJob.resources.limits.cpu }} + memory: {{ .Values.opentelemetry.namespaceLabelJob.resources.limits.memory }} + command: + - /bin/sh + - -c + - | + echo "Labeling namespace {{ .Release.Namespace }} for OpenTelemetry..." + kubectl label namespace {{ .Release.Namespace }} {{ .Values.opentelemetry.namespaceLabel.key }}={{ .Values.opentelemetry.namespaceLabel.value }} --overwrite + {{- if .Values.opentelemetry.instrumentation.enabled }} + echo "Annotating namespace for namespace-wide Python auto-instrumentation..." + kubectl annotate namespace {{ .Release.Namespace }} \ + instrumentation.opentelemetry.io/inject-python={{ include "mlrun-ce.otel.instrumentation.fullname" . }} \ + --overwrite + {{- end }} + echo "Namespace configured for OpenTelemetry successfully!" 
+{{- end -}} diff --git a/charts/mlrun-ce/templates/opentelemetry/rbac.yaml b/charts/mlrun-ce/templates/opentelemetry/rbac.yaml new file mode 100644 index 00000000..fd978318 --- /dev/null +++ b/charts/mlrun-ce/templates/opentelemetry/rbac.yaml @@ -0,0 +1,178 @@ +{{- if or .Values.opentelemetry.collector.enabled .Values.opentelemetry.instrumentation.enabled }} +--- +# ServiceAccount for OpenTelemetry collector sidecar +apiVersion: v1 +kind: ServiceAccount +metadata: + name: otel-collector + namespace: {{ .Release.Namespace }} + labels: + {{- include "mlrun-ce.otel.labels" . | nindent 4 }} +--- +# ServiceAccount for the CR creator job +apiVersion: v1 +kind: ServiceAccount +metadata: + name: {{ .Release.Name }}-otel-cr-creator + namespace: {{ .Release.Namespace }} + labels: + {{- include "mlrun-ce.otel.labels" . | nindent 4 }} + annotations: + "helm.sh/hook": pre-install,pre-upgrade,post-install,post-upgrade + "helm.sh/hook-weight": "-20" + "helm.sh/hook-delete-policy": before-hook-creation +--- +# Role for OpenTelemetry collector to access Kubernetes resources +apiVersion: rbac.authorization.k8s.io/v1 +kind: Role +metadata: + name: otel-collector + namespace: {{ .Release.Namespace }} + labels: + {{- include "mlrun-ce.otel.labels" . | nindent 4 }} +rules: + # Allow reading pod metadata for resource detection + - apiGroups: + - "" + resources: + - pods + - namespaces + verbs: + - get + - list + - watch + # Allow reading configmaps for collector configuration + - apiGroups: + - "" + resources: + - configmaps + verbs: + - get + - list + - watch +--- +# RoleBinding for OpenTelemetry collector +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: otel-collector + namespace: {{ .Release.Namespace }} + labels: + {{- include "mlrun-ce.otel.labels" . 
| nindent 4 }} +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: Role + name: otel-collector +subjects: + - kind: ServiceAccount + name: otel-collector + namespace: {{ .Release.Namespace }} +--- +# ClusterRole for the CR creator job to read CRDs and label namespaces (cluster-scoped) +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: {{ .Release.Name }}-otel-crd-reader + labels: + {{- include "mlrun-ce.otel.labels" . | nindent 4 }} + annotations: + "helm.sh/hook": pre-install,pre-upgrade,post-install,post-upgrade + "helm.sh/hook-weight": "-20" + "helm.sh/hook-delete-policy": before-hook-creation +rules: + # Allow reading CRDs to check availability (CRDs are cluster-scoped) + - apiGroups: + - apiextensions.k8s.io + resources: + - customresourcedefinitions + verbs: + - get + - list + # Allow labeling namespaces for OTEL injection + - apiGroups: + - "" + resources: + - namespaces + verbs: + - get + - patch + - update +--- +# ClusterRoleBinding for the CR creator job +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: {{ .Release.Name }}-otel-crd-reader + labels: + {{- include "mlrun-ce.otel.labels" . | nindent 4 }} + annotations: + "helm.sh/hook": pre-install,pre-upgrade,post-install,post-upgrade + "helm.sh/hook-weight": "-20" + "helm.sh/hook-delete-policy": before-hook-creation +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: {{ .Release.Name }}-otel-crd-reader +subjects: + - kind: ServiceAccount + name: {{ .Release.Name }}-otel-cr-creator + namespace: {{ .Release.Namespace }} +--- +# Role for the CR creator job to create OpenTelemetry CRs (namespace-scoped) +apiVersion: rbac.authorization.k8s.io/v1 +kind: Role +metadata: + name: {{ .Release.Name }}-otel-cr-creator + namespace: {{ .Release.Namespace }} + labels: + {{- include "mlrun-ce.otel.labels" . 
| nindent 4 }} + annotations: + "helm.sh/hook": pre-install,pre-upgrade,post-install,post-upgrade + "helm.sh/hook-weight": "-20" + "helm.sh/hook-delete-policy": before-hook-creation +rules: + # Allow creating/updating OpenTelemetry CRs + - apiGroups: + - opentelemetry.io + resources: + - opentelemetrycollectors + - instrumentations + verbs: + - create + - get + - patch + - update + - list + # Allow rollout restart of instrumented deployments/statefulsets after webhook is ready + - apiGroups: + - apps + resources: + - deployments + - statefulsets + verbs: + - get + - list + - patch +--- +# RoleBinding for the CR creator job +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: {{ .Release.Name }}-otel-cr-creator + namespace: {{ .Release.Namespace }} + labels: + {{- include "mlrun-ce.otel.labels" . | nindent 4 }} + annotations: + "helm.sh/hook": pre-install,pre-upgrade,post-install,post-upgrade + "helm.sh/hook-weight": "-20" + "helm.sh/hook-delete-policy": before-hook-creation +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: Role + name: {{ .Release.Name }}-otel-cr-creator +subjects: + - kind: ServiceAccount + name: {{ .Release.Name }}-otel-cr-creator + namespace: {{ .Release.Namespace }} +{{- end }} + diff --git a/charts/mlrun-ce/values.yaml b/charts/mlrun-ce/values.yaml index a031dc4b..66391428 100644 --- a/charts/mlrun-ce/values.yaml +++ b/charts/mlrun-ce/values.yaml @@ -94,6 +94,13 @@ nuclio: kind: mlrun synchronizationInterval: 10m apiAddress: http://mlrun-api-chief:8080/api + # Default labels applied to all Nuclio function pods. + # mlrun.io/otel marks function pods for OTel metric enrichment; the namespace-level + # instrumentation annotation handles Python auto-instrumentation automatically. 
+ functionDefaults: + metadata: + labels: + mlrun.io/otel: "true" mlrun: # set the type of filesystem to use: filesystem, s3 @@ -173,6 +180,13 @@ mlrun: name: mlrun-override-env optional: true extraPersistentVolumeMounts: ~ + # Explicitly expose the Docker image's PYTHONPATH as a K8s env var so that the + # OpenTelemetry operator's PYTHONPATH injection (PYTHONPATH=/otel-auto-instrumentation-python:$(PYTHONPATH)) + # can expand $(PYTHONPATH) correctly. Without this, K8s env var substitution resolves + # $(PYTHONPATH) to an empty string (it cannot see Docker image ENV vars), and the + # mlrun `services` package path is lost, crashing the API on startup. + extraEnvKeyValue: + PYTHONPATH: "/mlrun/server/py:/mlrun/server/py/schemas/proto" # Set mlrun api workers count by setting the minReplicas value. # This is recommended for production environments running at high scale. @@ -571,6 +585,15 @@ kube-prometheus-stack: service: type: NodePort nodePort: 30020 + prometheusSpec: + # Enable OTLP write receiver so the OTel collector can push metrics directly + # to Prometheus at /api/v1/otlp/v1/metrics instead of Prometheus scraping the collector. + # Prometheus v3 requires both the feature flag AND --web.enable-otlp-receiver. 
+ enableFeatures: + - otlp-write-receiver + additionalArgs: + - name: web.enable-otlp-receiver + value: "" kube-state-metrics: fullnameOverride: state-metrics prometheus-node-exporter: @@ -699,3 +722,160 @@ kafka: # In multi-NS user mode, spark.enabled is true (ConfigMap needed for MLRun) spark: enabled: true + +# ============================================================================= +# OpenTelemetry Operator configuration +# Installs the OpenTelemetry Operator for managing collectors and instrumentation +# ============================================================================= +opentelemetry-operator: + enabled: false + # CRDs are bootstrapped by the parent chart's crds/ directory (tiny stubs applied + # before any templates so the type is established immediately on fresh install). + # Disable sub-chart CRD rendering to avoid ownership conflicts with the crds/ stubs. + # The stubs use x-kubernetes-preserve-unknown-fields so the operator can still + # manage CRs; the operator's admission webhook handles CR validation. + crds: + create: false + # Admission webhooks configuration + admissionWebhooks: + certManager: + enabled: false + autoGenerateCert: + enabled: true + # Only apply webhooks to namespaces with the opentelemetry label + # This ensures the operator only monitors labeled namespaces + namespaceSelector: + matchLabels: + opentelemetry.io/inject: "enabled" + manager: + # Collector image used by the operator when creating collectors + collectorImage: + # Using contrib distribution pinned to 0.108.0 — versions 0.109+ use a dynamically linked + # binary in a distroless image that lacks /lib64/ld-linux-x86-64.so.2 and fails to exec. 
+ repository: otel/opentelemetry-collector-contrib + tag: 0.108.0 + # Auto-instrumentation images (all fields required by the sub-chart schema) + autoInstrumentationImage: + python: + repository: ghcr.io/open-telemetry/opentelemetry-operator/autoinstrumentation-python + tag: 0.50b0 + java: + repository: ghcr.io/open-telemetry/opentelemetry-operator/autoinstrumentation-java + tag: 2.10.0 + nodejs: + repository: "" + tag: "" + dotnet: + repository: "" + tag: "" + go: + repository: "" + tag: "" + apacheHttpd: + repository: "" + tag: "" + # Feature gates as string (comma-separated) + featureGates: "" + resources: + requests: + cpu: 50m + memory: 64Mi + limits: + cpu: 200m + memory: 256Mi + +# ============================================================================= +# OpenTelemetry Collector and Instrumentation CRs +# These are managed separately from the operator for admin/non-admin split +# ============================================================================= +opentelemetry: + # Namespace label for enabling OpenTelemetry monitoring. + # When enabled, the post-install job labels the namespace and also annotates it with + # instrumentation.opentelemetry.io/inject-python so all Python pods are auto-instrumented. + namespaceLabel: + enabled: false + key: "opentelemetry.io/inject" + value: "enabled" + + # OpenTelemetry Collector configuration (DEPLOYMENT mode) + # A single collector Deployment runs per namespace, receiving OTLP from instrumented pods + # and exporting metrics to Prometheus. 
+ collector: + enabled: false + nameOverride: "" + fullnameOverride: "" + # DEPLOYMENT mode - one collector pod per namespace, not injected as a sidecar + mode: deployment + # Set to "none" to prevent automatic collector upgrades when the operator is upgraded + upgradeStrategy: automatic + # Collector container resources + resources: + requests: + cpu: 50m + memory: 64Mi + limits: + cpu: 200m + memory: 256Mi + # OTLP receiver configuration + otlp: + grpcPort: 4317 + httpPort: 4318 + + # Instrumentation configuration for auto-instrumentation + instrumentation: + enabled: false + nameOverride: "" + fullnameOverride: "" + # Propagators for distributed tracing context + propagators: + - tracecontext + - baggage + # Sampler configuration + sampler: + type: parentbased_traceidratio + argument: "1" + # Python auto-instrumentation + python: + enabled: true + image: + repository: ghcr.io/open-telemetry/opentelemetry-operator/autoinstrumentation-python + tag: 0.50b0 + resources: + requests: + cpu: 50m + memory: 64Mi + limits: + cpu: 200m + memory: 256Mi + # Java auto-instrumentation (disabled by default, enable if needed) + java: + enabled: false + image: + repository: ghcr.io/open-telemetry/opentelemetry-operator/autoinstrumentation-java + tag: 2.10.0 + resources: + requests: + cpu: 50m + memory: 64Mi + limits: + cpu: 500m + memory: 512Mi + + # CRD readiness job resources (kubectl-only container) + crdReadinessJob: + resources: + requests: + cpu: 50m + memory: 64Mi + limits: + cpu: 100m + memory: 128Mi + # Namespace label job resources (kubectl-only container) + namespaceLabelJob: + resources: + requests: + cpu: 50m + memory: 64Mi + limits: + cpu: 100m + memory: 128Mi diff --git a/tests/helm-template-test.sh b/tests/helm-template-test.sh new file mode 100755 index 00000000..0617e192 --- /dev/null +++ b/tests/helm-template-test.sh @@ -0,0 +1,477 @@ +#!/usr/bin/env bash +# Copyright 2025 Iguazio +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# 
you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# Helm template tests for MLRun CE chart
+# Validates that templates render correctly with various configurations
+
+set -o nounset
+set -o pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+CHART_DIR="${SCRIPT_DIR}/../charts/mlrun-ce"
+
+# Colors for output
+RED='\033[0;31m'
+GREEN='\033[0;32m'
+YELLOW='\033[1;33m'
+NC='\033[0m' # No Color
+
+# Running totals, incremented by log_pass / log_fail and reported by main
+TESTS_PASSED=0
+TESTS_FAILED=0
+
+log_info() { echo -e "${GREEN}[INFO]${NC} $1"; }
+log_warn() { echo -e "${YELLOW}[WARN]${NC} $1"; }
+log_error() { echo -e "${RED}[ERROR]${NC} $1"; }
+log_test() { echo -e "${GREEN}[TEST]${NC} $1"; }
+# `|| true` keeps the arithmetic command's non-zero status (returned when the
+# pre-increment value is 0) from becoming the function's return status.
+log_pass() { echo -e "${GREEN}[PASS]${NC} $1"; ((TESTS_PASSED++)) || true; }
+log_fail() { echo -e "${RED}[FAIL]${NC} $1"; ((TESTS_FAILED++)) || true; }
+
+# Render a specific template and return the output
+#   $1   - template path relative to the chart (passed to --show-only)
+#   $2.. - extra arguments forwarded to `helm template`
+# stderr is discarded, so a failed render shows up as empty output.
+render_template() {
+    local template="$1"
+    shift
+    helm template test "${CHART_DIR}" \
+        --skip-schema-validation \
+        --show-only "${template}" \
+        "$@" 2>/dev/null
+}
+
+# Render all templates and return the output
+#   $@ - extra arguments forwarded to `helm template`
+render_all() {
+    helm template test "${CHART_DIR}" \
+        --skip-schema-validation \
+        "$@" 2>/dev/null
+}
+
+# Check if output contains a string
+#   $1 - rendered output to search
+#   $2 - literal substring that must be present
+#   $3 - test name used in the PASS/FAIL line
+# The needle is matched as a fixed string (grep -F) and the haystack is supplied
+# through a here-string instead of `echo ... | grep -q`: with `set -o pipefail`,
+# grep -q exiting on the first match can SIGPIPE the writer on large output and
+# make the whole pipeline report failure even though the string was found.
+assert_contains() {
+    local output="$1"
+    local expected="$2"
+    local test_name="$3"
+
+    if grep -qF -- "$expected" <<<"$output"; then
+        log_pass "$test_name"
+        return 0
+    else
+        log_fail "$test_name - expected to find: $expected"
+        return 1
+    fi
+}
+
+# Check if output does NOT contain a string
+#   $1 - rendered output to search
+#   $2 - literal substring that must be absent
+#   $3 - test name used in the PASS/FAIL line
+# Uses the same fixed-string / here-string matching as assert_contains; with the
+# old pipeline form a SIGPIPE would have been reported as a (false) pass here.
+assert_not_contains() {
+    local output="$1"
+    local not_expected="$2"
+    local test_name="$3"
+
+    if grep -qF -- "$not_expected" <<<"$output"; then
+        log_fail "$test_name - should not contain: $not_expected"
+        return 1
+    else
+        log_pass "$test_name"
+        return 0
+    fi
+}
+
+# Check if template renders (non-empty output)
+#   $1 - rendered output
+#   $2 - test name used in the PASS/FAIL line
+assert_renders() {
+    local output="$1"
+    local test_name="$2"
+
+    if [[ -n "$output" ]]; then
+        log_pass "$test_name"
+        return 0
+    else
+        log_fail "$test_name - template produced no output"
+        return 1
+    fi
+}
+
+# Check if template does NOT render (empty output or error)
+#   $1   - template path relative to the chart
+#   $2   - test name used in the PASS/FAIL line
+#   $3.. - extra arguments forwarded to render_template
+assert_not_renders() {
+    local template="$1"
+    local test_name="$2"
+    shift 2
+
+    local output
+    # `|| true`: a non-zero helm exit is an expected outcome for this assertion
+    output=$(render_template "$template" "$@" 2>&1) || true
+
+    if [[ -z "$output" ]] || grep -qF -- "could not find template" <<<"$output"; then
+        log_pass "$test_name"
+        return 0
+    else
+        log_fail "$test_name - template should not render"
+        return 1
+    fi
+}
+
+# ============================================================================
+# OpenTelemetry Tests
+# ============================================================================
+
+test_otel_collector_default() {
+    log_test "OpenTelemetry Collector - Enabled (via CRD Readiness Job)"
+
+    local output
+    # The collector CR is now created by the crd-readiness-job, not directly
+    output=$(render_template "templates/opentelemetry/crd-readiness-job.yaml" \
+        --set global.registry.url=test.io \
+        --set opentelemetry.collector.enabled=true)
+
+    assert_renders "$output" "CRD Readiness Job renders"
+    assert_contains "$output" "kind: Job" "Has correct kind"
+    assert_contains "$output" "kind: OpenTelemetryCollector" "Job contains OpenTelemetryCollector CR"
+    assert_contains "$output" "mode: deployment" "Uses deployment mode"
+    assert_contains "$output" "otlphttp/prometheus:" "Has OTLP HTTP Prometheus exporter"
+    assert_contains "$output" "/api/v1/otlp" "Pushes to Prometheus OTLP endpoint"
+    assert_contains "$output" "otlp:" "Has OTLP receiver"
+    assert_contains "$output" "helm.sh/hook" 
"Has Helm hooks" + assert_contains "$output" "post-install,post-upgrade" "Has correct hook triggers" + assert_contains "$output" "upgradeStrategy: automatic" "Has upgradeStrategy" + assert_contains "$output" "managementState: managed" "Has managementState" +} + +test_otel_collector_disabled() { + log_test "OpenTelemetry Collector - Disabled (default)" + + # When disabled, the crd-readiness-job should not render + assert_not_renders "templates/opentelemetry/crd-readiness-job.yaml" \ + "CRD Readiness Job does not render when collector disabled (default)" +} + +test_otel_collector_upgrade_strategy() { + log_test "OpenTelemetry Collector - upgradeStrategy override" + + local output + output=$(render_template "templates/opentelemetry/crd-readiness-job.yaml" \ + --set global.registry.url=test.io \ + --set opentelemetry.collector.enabled=true \ + --set opentelemetry.collector.upgradeStrategy=none) + + assert_contains "$output" "upgradeStrategy: none" "upgradeStrategy can be overridden to none" +} + +test_otel_collector_resources() { + log_test "OpenTelemetry Collector - Custom resources" + + local output + output=$(render_template "templates/opentelemetry/crd-readiness-job.yaml" \ + --set global.registry.url=test.io \ + --set opentelemetry.collector.enabled=true \ + --set opentelemetry.collector.resources.requests.cpu=100m \ + --set opentelemetry.collector.resources.requests.memory=128Mi \ + --set opentelemetry.collector.resources.limits.cpu=500m \ + --set opentelemetry.collector.resources.limits.memory=512Mi) + + assert_contains "$output" "cpu: 100m" "Custom CPU request" + assert_contains "$output" "memory: 128Mi" "Custom memory request" + assert_contains "$output" "cpu: 500m" "Custom CPU limit" + assert_contains "$output" "memory: 512Mi" "Custom memory limit" +} + +test_otel_instrumentation_default() { + log_test "OpenTelemetry Instrumentation - Enabled (via CRD Readiness Job)" + + local output + output=$(render_template "templates/opentelemetry/crd-readiness-job.yaml" 
\ + --set global.registry.url=test.io \ + --set opentelemetry.instrumentation.enabled=true) + + assert_renders "$output" "CRD Readiness Job renders for Instrumentation" + assert_contains "$output" "kind: Instrumentation" "Job contains Instrumentation CR" + assert_contains "$output" "tracecontext" "Has tracecontext propagator" + assert_contains "$output" "baggage" "Has baggage propagator" + assert_contains "$output" "parentbased_traceidratio" "Has sampler type" + assert_contains "$output" "python:" "Has Python instrumentation" + assert_contains "$output" "autoinstrumentation-python" "Uses Python auto-instrumentation image" +} + +test_otel_instrumentation_disabled() { + log_test "OpenTelemetry Instrumentation - Disabled (default)" + + # When both collector and instrumentation are disabled, the job should not render + assert_not_renders "templates/opentelemetry/crd-readiness-job.yaml" \ + "CRD Readiness Job does not render when instrumentation disabled (default)" +} + +test_otel_instrumentation_java_enabled() { + log_test "OpenTelemetry Instrumentation - Java enabled" + + local output + output=$(render_template "templates/opentelemetry/crd-readiness-job.yaml" \ + --set global.registry.url=test.io \ + --set opentelemetry.instrumentation.enabled=true \ + --set opentelemetry.instrumentation.java.enabled=true) + + assert_contains "$output" "java:" "Has Java instrumentation section" + assert_contains "$output" "autoinstrumentation-java" "Uses Java auto-instrumentation image" +} + +test_otel_rbac_default() { + log_test "OpenTelemetry RBAC - Enabled" + + local output + output=$(render_template "templates/opentelemetry/rbac.yaml" \ + --set global.registry.url=test.io \ + --set opentelemetry.collector.enabled=true) + + assert_renders "$output" "RBAC renders" + assert_contains "$output" "kind: ServiceAccount" "Has ServiceAccount" + assert_contains "$output" "kind: Role" "Has Role" + assert_contains "$output" "kind: RoleBinding" "Has RoleBinding" + assert_contains "$output" 
"name: otel-collector" "Has correct name" + assert_contains "$output" "kind: ClusterRole" "Has ClusterRole for CRD access" + assert_contains "$output" "otel-cr-creator" "Has CR creator ServiceAccount" +} + +test_otel_rbac_disabled() { + log_test "OpenTelemetry RBAC - Disabled (default)" + + assert_not_renders "templates/opentelemetry/rbac.yaml" \ + "RBAC does not render when OTEL disabled (default)" +} + +test_jupyter_otel_labels() { + log_test "Jupyter Deployment - OTel label applied when enabled" + + local output + output=$(render_template "templates/jupyter-notebook/deployment.yaml" \ + --set global.registry.url=test.io \ + --set opentelemetry.collector.enabled=true \ + --set opentelemetry.instrumentation.enabled=true) + + assert_contains "$output" 'mlrun.io/otel: "true"' "Has OTel pod label" + assert_not_contains "$output" "sidecar.opentelemetry.io/inject:" "No sidecar annotation (deployment mode)" + assert_not_contains "$output" "prometheus.io/scrape:" "No per-pod Prometheus annotation (collector scrapes)" +} + +test_jupyter_no_otel_label_when_disabled() { + log_test "Jupyter Deployment - No OTel label when disabled (default)" + + local output + output=$(render_template "templates/jupyter-notebook/deployment.yaml" \ + --set global.registry.url=test.io) + + assert_not_contains "$output" 'mlrun.io/otel' "No OTel label when disabled (default)" +} + +# ============================================================================ +# Admin/Non-Admin Installation Tests +# ============================================================================ + +test_admin_values_otel() { + log_test "Admin installation - OTEL operator enabled, CRs disabled" + + # CRD readiness job should not render when CRs are disabled + assert_not_renders "templates/opentelemetry/crd-readiness-job.yaml" \ + "CRD Readiness Job not rendered with admin values" \ + -f "${CHART_DIR}/admin_installation_values.yaml" +} + +test_non_admin_values_otel() { + log_test "Non-admin installation - OTEL CRs 
enabled" + + local output + output=$(render_template "templates/opentelemetry/crd-readiness-job.yaml" \ + --set global.registry.url=test.io \ + -f "${CHART_DIR}/non_admin_installation_values.yaml") + + assert_renders "$output" "CRD Readiness Job renders with non-admin values" + assert_contains "$output" "kind: OpenTelemetryCollector" "Has Collector CR" + assert_contains "$output" "kind: Instrumentation" "Has Instrumentation CR" +} + +test_namespace_label_enabled() { + log_test "Namespace Label - Enabled" + + local output + output=$(render_template "templates/opentelemetry/namespace-label.yaml" \ + --set global.registry.url=test.io \ + --set opentelemetry.namespaceLabel.enabled=true \ + --set opentelemetry.collector.enabled=true) + + assert_renders "$output" "Namespace label job renders" + assert_contains "$output" "kind: Job" "Has correct kind (Job)" + assert_contains "$output" "helm.sh/hook" "Has post-install hook annotation" + assert_contains "$output" "kubectl label namespace" "Has kubectl label command" + assert_contains "$output" "opentelemetry.io/inject" "Has OTEL inject label key" +} + +test_namespace_label_with_instrumentation_annotation() { + log_test "Namespace Label - Instrumentation annotation added when instrumentation enabled" + + local output + output=$(render_template "templates/opentelemetry/namespace-label.yaml" \ + --set global.registry.url=test.io \ + --set opentelemetry.namespaceLabel.enabled=true \ + --set opentelemetry.collector.enabled=true \ + --set opentelemetry.instrumentation.enabled=true) + + assert_contains "$output" "kubectl annotate namespace" "Has kubectl annotate command" + assert_contains "$output" "instrumentation.opentelemetry.io/inject-python" "Has Python instrumentation namespace annotation" +} + +test_namespace_label_disabled() { + log_test "Namespace Label - Disabled (default)" + + assert_not_renders "templates/opentelemetry/namespace-label.yaml" \ + "Namespace label not rendered when disabled (default)" +} + 
+test_admin_namespace_label_disabled() {
+    log_test "Admin installation - Namespace label disabled"
+
+    assert_not_renders "templates/opentelemetry/namespace-label.yaml" \
+        "Namespace label not rendered with admin values" \
+        -f "${CHART_DIR}/admin_installation_values.yaml"
+}
+
+test_non_admin_namespace_label_enabled() {
+    log_test "Non-admin installation - Namespace label enabled"
+
+    local output
+    output=$(render_template "templates/opentelemetry/namespace-label.yaml" \
+        --set global.registry.url=test.io \
+        -f "${CHART_DIR}/non_admin_installation_values.yaml")
+
+    assert_renders "$output" "Namespace label job renders with non-admin values"
+    assert_contains "$output" "opentelemetry.io/inject" "Has OTEL inject label"
+}
+
+# Verify that, with the operator sub-chart enabled, some `namespaceSelector:` in the
+# fully rendered chart is followed within five lines by the opentelemetry.io/inject key.
+test_otel_operator_namespace_selector() {
+    log_test "OTEL Operator - Namespace selector configured"
+
+    local output
+    output=$(render_all \
+        --set global.registry.url=test.io \
+        --set opentelemetry-operator.enabled=true)
+
+    # Check if the operator webhook has namespace selector configured
+    # The selector should be in the MutatingWebhookConfiguration
+    #
+    # The match is done in two steps over here-strings rather than as
+    # `echo | grep -A5 | grep -q`: the full render is large, and under
+    # `set -o pipefail` the final grep -q exiting on its first hit can SIGPIPE the
+    # upstream stages, making the pipeline (and this test) fail despite a match.
+    local selector_context
+    selector_context=$(grep -F -A5 -- "namespaceSelector:" <<<"$output" || true)
+
+    if grep -qF -- "opentelemetry.io/inject" <<<"$selector_context"; then
+        log_pass "Has namespace selector in webhook configuration"
+    else
+        log_fail "Namespace selector not found in webhook configuration"
+    fi
+}
+
+
+# ============================================================================
+# Full Chart Render Test
+# ============================================================================
+
+test_full_chart_renders() {
+    log_test "Full chart renders without errors"
+
+    local output
+    output=$(render_all --set global.registry.url=test.io 2>&1)
+
+    if [[ $? 
-eq 0 ]] && [[ -n "$output" ]]; then
+        log_pass "Full chart renders successfully"
+    else
+        log_fail "Full chart failed to render"
+    fi
+}
+
+# ============================================================================
+# Main
+# ============================================================================
+
+# Entry point: refresh chart dependencies, run every test group in order, print
+# the pass/fail totals and exit 1 if any assertion failed (0 otherwise).
+main() {
+    log_info "Running Helm template tests for MLRun CE"
+    log_info "Chart directory: ${CHART_DIR}"
+    echo ""
+
+    # Ensure dependencies are up to date
+    log_info "Updating Helm dependencies..."
+    # Output stays suppressed, but a failure is no longer silent: without the
+    # sub-charts the render tests below fail for a reason that is otherwise invisible.
+    # This is a warning rather than an abort so runs with already-vendored
+    # dependencies keep working.
+    if ! helm dependency update "${CHART_DIR}" > /dev/null 2>&1; then
+        log_warn "helm dependency update failed; continuing with whatever dependencies are already present"
+    fi
+
+    echo ""
+    echo "========================================"
+    echo "OpenTelemetry Collector Tests"
+    echo "========================================"
+    test_otel_collector_default
+    test_otel_collector_disabled
+    test_otel_collector_upgrade_strategy
+    test_otel_collector_resources
+
+    echo ""
+    echo "========================================"
+    echo "OpenTelemetry Instrumentation Tests"
+    echo "========================================"
+    test_otel_instrumentation_default
+    test_otel_instrumentation_disabled
+    test_otel_instrumentation_java_enabled
+
+    echo ""
+    echo "========================================"
+    echo "OpenTelemetry RBAC Tests"
+    echo "========================================"
+    test_otel_rbac_default
+    test_otel_rbac_disabled
+
+    echo ""
+    echo "========================================"
+    echo "Jupyter OTEL Integration Tests"
+    echo "========================================"
+    test_jupyter_otel_labels
+    test_jupyter_no_otel_label_when_disabled
+
+    echo ""
+    echo "========================================"
+    echo "Admin/Non-Admin Installation Tests"
+    echo "========================================"
+    test_admin_values_otel
+    test_non_admin_values_otel
+
+    echo ""
+    echo "========================================"
+    echo "Namespace Label Tests"
+    echo "========================================"
+    test_namespace_label_enabled
+    test_namespace_label_disabled
+    test_admin_namespace_label_disabled
+    test_non_admin_namespace_label_enabled
+    test_namespace_label_with_instrumentation_annotation
+    test_otel_operator_namespace_selector
+
+
+    echo ""
+    echo "========================================"
+    echo "Full Chart Tests"
+    echo "========================================"
+    test_full_chart_renders
+
+    echo ""
+    echo "========================================"
+    echo "Test Summary"
+    echo "========================================"
+    echo -e "Passed: ${GREEN}${TESTS_PASSED}${NC}"
+    echo -e "Failed: ${RED}${TESTS_FAILED}${NC}"
+
+    if [[ ${TESTS_FAILED} -gt 0 ]]; then
+        log_error "Some tests failed!"
+        exit 1
+    else
+        log_info "All tests passed!"
+        exit 0
+    fi
+}
+
+main "$@"
+
+
+
+
diff --git a/tests/kind-test.sh b/tests/kind-test.sh
index 756f2300..f5ee4333 100755
--- a/tests/kind-test.sh
+++ b/tests/kind-test.sh
@@ -164,6 +164,7 @@ setup_helm_repos() {
     helm repo add spark-operator https://kubeflow.github.io/spark-operator 2>/dev/null || true
     helm repo add kube-prometheus-stack https://prometheus-community.github.io/helm-charts 2>/dev/null || true
     helm repo add kafka https://charts.bitnami.com/bitnami 2>/dev/null || true
+    helm repo add open-telemetry https://open-telemetry.github.io/opentelemetry-helm-charts 2>/dev/null || true
     helm repo update
 }
 
@@ -794,6 +795,53 @@ verify_multi_ns() {
         log_error "${total_errors} check(s) failed"
         exit 1
     fi
+
+    # Verify OpenTelemetry CRDs and resources
+    echo ""
+    log_info "Verifying OpenTelemetry..." 
+ + # Check if OpenTelemetry Operator is installed (CRDs exist) + if kubectl get crd opentelemetrycollectors.opentelemetry.io &>/dev/null; then + log_info "OpenTelemetryCollector CRD exists" + + # Check for collector CR + local collector + collector=$(kubectl get opentelemetrycollectors -n "${NAMESPACE}" -o jsonpath='{.items[0].metadata.name}' 2>/dev/null || echo "") + if [[ -n "${collector}" ]]; then + log_info "OpenTelemetryCollector CR found: ${collector}" + kubectl get opentelemetrycollectors -n "${NAMESPACE}" "${collector}" -o yaml 2>/dev/null | grep -E "mode:|status:" | head -5 || true + else + log_warn "No OpenTelemetryCollector CR found in namespace ${NAMESPACE}" + fi + else + log_warn "OpenTelemetryCollector CRD not found - operator may not be installed" + fi + + if kubectl get crd instrumentations.opentelemetry.io &>/dev/null; then + log_info "Instrumentation CRD exists" + + # Check for instrumentation CR + local instrumentation + instrumentation=$(kubectl get instrumentations -n "${NAMESPACE}" -o jsonpath='{.items[0].metadata.name}' 2>/dev/null || echo "") + if [[ -n "${instrumentation}" ]]; then + log_info "Instrumentation CR found: ${instrumentation}" + else + log_warn "No Instrumentation CR found in namespace ${NAMESPACE}" + fi + else + log_warn "Instrumentation CRD not found - operator may not be installed" + fi + + # Check if Jupyter pod has mlrun.io/otel label (deployment mode - no sidecar injection) + echo "" + log_info "Checking Jupyter deployment for OTEL pod label..." 
+ local jupyter_labels + jupyter_labels=$(kubectl get deployment -n "${NAMESPACE}" -l app.kubernetes.io/component=jupyter-notebook -o jsonpath='{.items[0].spec.template.metadata.labels}' 2>/dev/null || echo "") + if echo "${jupyter_labels}" | grep -q "mlrun.io/otel"; then + log_info "Jupyter has mlrun.io/otel=true pod label (deployment mode)" + else + log_warn "Jupyter does not have mlrun.io/otel label (OTel may be disabled)" + fi } delete_cluster() { diff --git a/tests/package.sh b/tests/package.sh index d8f847bc..575c3546 100755 --- a/tests/package.sh +++ b/tests/package.sh @@ -25,6 +25,88 @@ echo "Installing chart dependencies" cd "$dirname"/../charts/mlrun-ce helm dependency update +# Patch opentelemetry-operator sub-chart schema: the upstream chart has +# "examples": "" (string) for featureGates, but JSON Schema requires an array. +# Helm v4 enforces metaschema validation strictly and rejects the install otherwise. +echo "Patching opentelemetry-operator schema (featureGates.examples string -> array)..." 
+python3 - <<'PYEOF'
+"""Patch featureGates.examples in the opentelemetry-operator values schema (str -> list)."""
+import json
+import os
+import subprocess
+import sys
+import tarfile
+import tempfile
+
+tgz = "charts/opentelemetry-operator-0.78.1.tgz"
+if not os.path.exists(tgz):
+    print(f" {tgz} not found, skipping patch")
+    # sys.exit rather than the interactive exit() helper, which only exists
+    # when the site module has been loaded.
+    sys.exit(0)
+
+with tempfile.TemporaryDirectory() as tmp:
+    with tarfile.open(tgz, "r:gz") as t:
+        # Use the "data" extraction filter where this Python provides it: calling
+        # extractall() without a filter is deprecated since 3.12, and the filter
+        # rejects members that would land outside the target directory.
+        if hasattr(tarfile, "data_filter"):
+            t.extractall(tmp, filter="data")
+        else:
+            t.extractall(tmp)
+    schema_path = os.path.join(tmp, "opentelemetry-operator", "values.schema.json")
+    with open(schema_path) as f:
+        schema = json.load(f)
+    fg = schema["properties"]["manager"]["properties"]["featureGates"]
+    if isinstance(fg.get("examples"), str):
+        # JSON Schema requires "examples" to be an array; wrap the lone string.
+        fg["examples"] = [fg["examples"]]
+        with open(schema_path, "w") as f:
+            json.dump(schema, f, indent=2)
+        print(" Patched featureGates.examples")
+    else:
+        print(" Already correct, no patch needed")
+    # Repack without macOS metadata
+    env = os.environ.copy()
+    env["COPYFILE_DISABLE"] = "1"
+    subprocess.run(
+        ["tar", "czf", os.path.abspath(tgz), "opentelemetry-operator"],
+        cwd=tmp, env=env, check=True
+    )
+PYEOF
+
+# Slim down the opentelemetry-operator sub-chart by replacing large conf/crds/ files
+# with empty stubs. The CRDs are managed by the parent chart's crds/ directory instead
+# (crds.create: false in values.yaml). Keeping the full 542 KB CRD files would push
+# the Helm release Secret over the Kubernetes 3 MB API request limit.
+echo "Slimming opentelemetry-operator conf/crds/ (replacing with empty stubs)..."
+python3 - <<'PYEOF'
+"""Overwrite the sub-chart's conf/crds/ templates with tiny guarded stubs and repack."""
+import os
+import subprocess
+import sys
+import tarfile
+import tempfile
+
+tgz = "charts/opentelemetry-operator-0.78.1.tgz"
+if not os.path.exists(tgz):
+    print(f" {tgz} not found, skipping")
+    # sys.exit rather than the interactive exit() helper, which only exists
+    # when the site module has been loaded.
+    sys.exit(0)
+
+# Stub content: preserves the {{- if .Values.crds.create }} guard so the template
+# renders correctly (empty output) whether crds.create is true or false.
+STUB = b"{{- if .Values.crds.create }}\n{{- end }}\n"
+
+crd_files = {
+    "opentelemetry-operator/conf/crds/crd-opentelemetrycollector.yaml",
+    "opentelemetry-operator/conf/crds/crd-opentelemetryinstrumentation.yaml",
+    "opentelemetry-operator/conf/crds/crd-opentelemetry.io_opampbridges.yaml",
+}
+
+with tempfile.TemporaryDirectory() as tmp:
+    with tarfile.open(tgz, "r:gz") as t:
+        # Same guarded "data" filter as the schema patch step above in this script.
+        if hasattr(tarfile, "data_filter"):
+            t.extractall(tmp, filter="data")
+        else:
+            t.extractall(tmp)
+
+    # sorted() only makes the log order deterministic; the set of files is unchanged.
+    for rel in sorted(crd_files):
+        path = os.path.join(tmp, rel)
+        if os.path.exists(path):
+            orig = os.path.getsize(path)
+            with open(path, "wb") as f:
+                f.write(STUB)
+            print(f" {os.path.basename(rel)}: {orig} -> {len(STUB)} bytes")
+        else:
+            print(f" {rel} not found, skipping")
+
+    # Repack with the system tar; COPYFILE_DISABLE keeps macOS tar from adding
+    # AppleDouble (._*) metadata entries to the archive.
+    env = os.environ.copy()
+    env["COPYFILE_DISABLE"] = "1"
+    subprocess.run(
+        ["tar", "czf", os.path.abspath(tgz), "opentelemetry-operator"],
+        cwd=tmp, env=env, check=True
+    )
+PYEOF
+
 # Create MLRun CE tarball
 helm package .
 exit 0