From a4ee7529d94bf4186bd86fe63086003af7f083e5 Mon Sep 17 00:00:00 2001 From: royischoss Date: Tue, 24 Feb 2026 13:40:07 +0200 Subject: [PATCH 01/23] Remove v3io from MLRUN_MODEL_ENDPOINT_MONITORING__STORE_PREFIXES__USER_SPACE and MLRUN_MODEL_ENDPOINT_MONITORING__STORE_PREFIXES__MONITORING_APPLICATION plus removes MLRUN_MODEL_ENDPOINT_MONITORING__ENDPOINT_STORE_CONNECTION --- charts/mlrun-ce/templates/config/mlrun-env-configmap.yaml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/charts/mlrun-ce/templates/config/mlrun-env-configmap.yaml b/charts/mlrun-ce/templates/config/mlrun-env-configmap.yaml index a284aa5c..e6221ad3 100644 --- a/charts/mlrun-ce/templates/config/mlrun-env-configmap.yaml +++ b/charts/mlrun-ce/templates/config/mlrun-env-configmap.yaml @@ -17,13 +17,14 @@ data: MLRUN_HTTPDB__REAL_PATH: s3:// MLRUN_ARTIFACT_PATH: s3://{{ $bucket_name }}/projects/{{ `{{run.project}}` }}/artifacts MLRUN_FEATURE_STORE__DATA_PREFIXES__DEFAULT: s3://{{ $bucket_name }}/projects/{project}/FeatureStore/{name}/{kind} + MLRUN_MODEL_ENDPOINT_MONITORING__STORE_PREFIXES__USER_SPACE: s3://{{ $bucket_name }}/projects/{project}/model-endpoints/{kind} + MLRUN_MODEL_ENDPOINT_MONITORING__STORE_PREFIXES__MONITORING_APPLICATION: s3://{{ $bucket_name }}/users/pipelines/{project}/monitoring-apps/ MLRUN_FEATURE_STORE__DATA_PREFIXES__NOSQL: "" MLRUN_CE__MODE: {{ .Values.mlrun.ce.mode }} MLRUN_CE__VERSION: {{ .Chart.Version }} MLRUN_DEFAULT_TENSORBOARD_LOGS_PATH: /home/jovyan/data/tensorboard/{{ `{{project}} `}} MLRUN_FEATURE_STORE__DEFAULT_TARGETS: parquet MLRUN_MODEL_ENDPOINT_MONITORING__STORE_PREFIXES__DEFAULT: s3://{{ $bucket_name }}/projects/{{ `{{project}}` }}/model-endpoints/{{ `{{kind}}` }} - MLRUN_MODEL_ENDPOINT_MONITORING__ENDPOINT_STORE_CONNECTION: "{{ template "mlrun-ce.mlrun.modelMonitoring.DSN" . 
}}" MLRUN_GRAFANA_URL: http://{{ .Values.global.externalHostAddress }}:{{ index .Values "kube-prometheus-stack" "grafana" "service" "nodePort" }} MLRUN_DEFAULT_FUNCTION_POD_RESOURCES__LIMITS__CPU: "{{ .Values.mlrun.defaultFunctionPodResources.limits.cpu | default "" }}" MLRUN_DEFAULT_FUNCTION_POD_RESOURCES__LIMITS__MEMORY: "{{ .Values.mlrun.defaultFunctionPodResources.limits.memory | default "" }}" From ce7601ea112a83aedee03d537063ee546ca06ee9 Mon Sep 17 00:00:00 2001 From: royischoss Date: Wed, 25 Feb 2026 11:59:29 +0200 Subject: [PATCH 02/23] fix formating --- charts/mlrun-ce/templates/config/mlrun-env-configmap.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/charts/mlrun-ce/templates/config/mlrun-env-configmap.yaml b/charts/mlrun-ce/templates/config/mlrun-env-configmap.yaml index e6221ad3..f37779e5 100644 --- a/charts/mlrun-ce/templates/config/mlrun-env-configmap.yaml +++ b/charts/mlrun-ce/templates/config/mlrun-env-configmap.yaml @@ -17,8 +17,8 @@ data: MLRUN_HTTPDB__REAL_PATH: s3:// MLRUN_ARTIFACT_PATH: s3://{{ $bucket_name }}/projects/{{ `{{run.project}}` }}/artifacts MLRUN_FEATURE_STORE__DATA_PREFIXES__DEFAULT: s3://{{ $bucket_name }}/projects/{project}/FeatureStore/{name}/{kind} - MLRUN_MODEL_ENDPOINT_MONITORING__STORE_PREFIXES__USER_SPACE: s3://{{ $bucket_name }}/projects/{project}/model-endpoints/{kind} - MLRUN_MODEL_ENDPOINT_MONITORING__STORE_PREFIXES__MONITORING_APPLICATION: s3://{{ $bucket_name }}/users/pipelines/{project}/monitoring-apps/ + MLRUN_MODEL_ENDPOINT_MONITORING__STORE_PREFIXES__USER_SPACE: s3://{{ $bucket_name }}/projects/{{ `{{project}}` }}/model-endpoints/{{ `{{kind}}` }} + MLRUN_MODEL_ENDPOINT_MONITORING__STORE_PREFIXES__MONITORING_APPLICATION: s3://{{ $bucket_name }}/users/pipelines/{{ `{{project}}` }}/monitoring-apps/ MLRUN_FEATURE_STORE__DATA_PREFIXES__NOSQL: "" MLRUN_CE__MODE: {{ .Values.mlrun.ce.mode }} MLRUN_CE__VERSION: {{ .Chart.Version }} From 061e2a482dccb8fff2577cd9872f8990225eef56 Mon Sep 17 
00:00:00 2001 From: royischoss Date: Wed, 25 Feb 2026 14:44:40 +0200 Subject: [PATCH 03/23] fix version and remove deprecated dsn --- charts/mlrun-ce/Chart.yaml | 2 +- charts/mlrun-ce/templates/_helpers.tpl | 13 ------------- 2 files changed, 1 insertion(+), 14 deletions(-) diff --git a/charts/mlrun-ce/Chart.yaml b/charts/mlrun-ce/Chart.yaml index 92bc7cb0..eb248c10 100644 --- a/charts/mlrun-ce/Chart.yaml +++ b/charts/mlrun-ce/Chart.yaml @@ -1,6 +1,6 @@ apiVersion: v1 name: mlrun-ce -version: 0.11.0-rc9 +version: 0.11.0-rc.12 description: MLRun Open Source Stack home: https://iguazio.com icon: https://www.iguazio.com/wp-content/uploads/2019/10/Iguazio-Logo.png diff --git a/charts/mlrun-ce/templates/_helpers.tpl b/charts/mlrun-ce/templates/_helpers.tpl index 27f76052..069e4e10 100644 --- a/charts/mlrun-ce/templates/_helpers.tpl +++ b/charts/mlrun-ce/templates/_helpers.tpl @@ -286,19 +286,6 @@ Pipelines labels {{ include "mlrun-ce.pipelines.selectorLabels" . }} {{- end -}} -{{/* -Model monitoring DSN -*/}} -{{- define "mlrun-ce.mlrun.modelMonitoring.DSN" -}} -{{- if .Values.mlrun.modelMonitoring.dsn -}} -{{ .Values.mlrun.modelMonitoring.dsn }} -{{- else -}} -{{- if eq "mysql" .Values.mlrun.httpDB.dbType -}} -{{ .Values.mlrun.httpDB.dsn }}_model_monitoring -{{- end -}} -{{- end -}} -{{- end -}} - {{/* TimescaleDB helpers */}} From 0b2243ff11bbb65014cb41880a42e4ba28843f88 Mon Sep 17 00:00:00 2001 From: royischoss Date: Wed, 25 Mar 2026 11:40:11 +0200 Subject: [PATCH 04/23] adding otel to ce --- .github/workflows/release.yml | 1 + charts/mlrun-ce/README.md | 278 +++++++++- .../mlrun-ce/admin_installation_values.yaml | 25 + ..._admin_cluster_ip_installation_values.yaml | 16 + .../non_admin_installation_values.yaml | 16 + charts/mlrun-ce/requirements.lock | 7 +- charts/mlrun-ce/requirements.yaml | 4 + charts/mlrun-ce/templates/NOTES.txt | 50 ++ charts/mlrun-ce/templates/_helpers.tpl | 68 +++ .../jupyter-notebook/deployment.yaml | 16 + 
.../templates/opentelemetry/collector.yaml | 102 ++++ .../opentelemetry/instrumentation.yaml | 86 +++ .../opentelemetry/namespace-label.yaml | 15 + .../templates/opentelemetry/rbac.yaml | 58 +++ charts/mlrun-ce/values.yaml | 221 ++++++++ tests/helm-template-test.sh | 489 ++++++++++++++++++ tests/kind-test.sh | 48 ++ 17 files changed, 1495 insertions(+), 5 deletions(-) create mode 100644 charts/mlrun-ce/templates/opentelemetry/collector.yaml create mode 100644 charts/mlrun-ce/templates/opentelemetry/instrumentation.yaml create mode 100644 charts/mlrun-ce/templates/opentelemetry/namespace-label.yaml create mode 100644 charts/mlrun-ce/templates/opentelemetry/rbac.yaml create mode 100755 tests/helm-template-test.sh diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index a2cfb4f2..4d8caf98 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -46,6 +46,7 @@ jobs: helm repo add prometheus-community https://prometheus-community.github.io/helm-charts helm repo add strimzi https://strimzi.io/charts/ helm repo add seaweedfs https://seaweedfs.github.io/seaweedfs/helm + helm repo add open-telemetry https://open-telemetry.github.io/opentelemetry-helm-charts - name: Run chart-releaser uses: helm/chart-releaser-action@cae68fefc6b5f367a0275617c9f83181ba54714f diff --git a/charts/mlrun-ce/README.md b/charts/mlrun-ce/README.md index 00fb5425..0da23123 100644 --- a/charts/mlrun-ce/README.md +++ b/charts/mlrun-ce/README.md @@ -14,6 +14,7 @@ The Open source MLRun ce chart includes the following stack: * Spark Operator - https://github.com/GoogleCloudPlatform/spark-on-k8s-operator * Pipelines - https://github.com/kubeflow/pipelines * Prometheus stack - https://github.com/prometheus-community/helm-charts +* OpenTelemetry Operator - https://github.com/open-telemetry/opentelemetry-operator (observability) ## Prerequisites @@ -36,6 +37,7 @@ kubectl create namespace mlrun Add the mlrun ce helm chart repo ```bash helm repo add mlrun 
https://mlrun.github.io/ce +helm repo update ``` To work with the open source MLRun stack, you must an accessible docker-registry. The registry's URL and credentials @@ -64,6 +66,97 @@ helm --namespace mlrun \ mlrun/mlrun-ce ``` +### Complete Installation with OpenTelemetry (From Scratch) + +This section provides a complete step-by-step guide to install MLRun CE with full OpenTelemetry observability enabled. + +> **Note:** OpenTelemetry is **disabled by default**. Follow these steps to enable it. + +#### Step 1: Create the namespace + +```bash +kubectl create namespace mlrun +``` + +#### Step 2: Add the Helm repository + +```bash +helm repo add mlrun https://mlrun.github.io/ce +helm repo update +``` + +#### Step 3: Create the docker registry secret + +```bash +kubectl --namespace mlrun create secret docker-registry registry-credentials \ + --docker-username <registry-username> \ + --docker-password <registry-password> \ + --docker-server <registry-server-url> \ + --docker-email <user-email> +``` + +#### Step 4: Install MLRun CE with OpenTelemetry Enabled + +```bash +helm --namespace mlrun \ + install my-mlrun \ + --wait \ + --timeout 15m \ + --set global.registry.url=<registry-url> \ + --set global.registry.secretName=registry-credentials \ + --set opentelemetry-operator.enabled=true \ + --set opentelemetry.namespaceLabel.enabled=true \ + --set opentelemetry.collector.enabled=true \ + --set opentelemetry.collector.scrapeMode=otel \ + --set opentelemetry.instrumentation.enabled=true \ + mlrun/mlrun-ce +``` + +> **Important:** When enabling OpenTelemetry, set `opentelemetry.collector.scrapeMode=otel` to collect metrics +> via the OTEL sidecar and prevent duplicate metrics. The default is `direct` (for when OTEL is disabled). 
+ +The installation will: +- Deploy the OpenTelemetry Operator +- Create an OpenTelemetryCollector CR (sidecar mode) +- Create an Instrumentation CR for Python auto-instrumentation +- Label the namespace with `opentelemetry.io/inject=enabled` +- Configure Prometheus to scrape OTEL sidecar metrics (port 8889) + +#### Step 5: Verify OpenTelemetry Installation + +Check that the OpenTelemetry resources are created: + +```bash +# Check the namespace label +kubectl get namespace mlrun --show-labels | grep opentelemetry + +# Check the OpenTelemetry Collector CR +kubectl -n mlrun get opentelemetrycollectors + +# Check the Instrumentation CR +kubectl -n mlrun get instrumentations + +# Check that the OTEL operator is running +kubectl -n mlrun get pods | grep opentelemetry +``` + +#### Step 6: Verify Jupyter has OTEL Sidecar Annotations + +```bash +kubectl -n mlrun get deployment -l app.kubernetes.io/component=jupyter-notebook \ + -o jsonpath='{.items[0].spec.template.metadata.annotations}' | jq . +``` + +You should see annotations like: +```json +{ + "instrumentation.opentelemetry.io/inject-python": "my-mlrun-otel-instrumentation", + "prometheus.io/port": "8889", + "prometheus.io/scrape": "true", + "sidecar.opentelemetry.io/inject": "my-mlrun-otel-collector" +} +``` + ### Installing MLRun-ce on minikube The Open source MLRun ce uses node ports for simplicity. If your kubernetes cluster is running inside a VM, @@ -89,6 +182,185 @@ following values: Additional configurable values are documented in the `values.yaml`, and the `values.yaml` of all sub charts. Override those [in the normal methods](https://helm.sh/docs/chart_template_guide/values_files/). +### Configuring OpenTelemetry (Observability) + +MLRun CE includes the OpenTelemetry Operator for collecting metrics and traces from your ML workloads. +The operator runs in **sidecar mode**, automatically injecting collector containers into annotated pods. + +> **Note:** OpenTelemetry is **disabled by default**. 
See below for how to enable it. + +#### Namespace Labeling + +The OpenTelemetry Operator **only monitors namespaces** with the label `opentelemetry.io/inject=enabled`. +This is automatically applied to the MLRun namespace when OpenTelemetry is enabled. + +When enabling OpenTelemetry, the namespace is labeled automatically: +```yaml +# Automatically added to your namespace when opentelemetry.namespaceLabel.enabled=true +labels: + opentelemetry.io/inject: "enabled" +``` + +For custom namespaces that need OpenTelemetry instrumentation, add the label manually: +```bash +kubectl label namespace <namespace> opentelemetry.io/inject=enabled +``` + +> **Note:** The controller namespace (where the operator runs) does **NOT** need this label, +> as only the operator itself runs there - no workloads require instrumentation. + +#### Default Configuration + +By default, OpenTelemetry is **disabled**. When enabled, it provides: +- Namespace labeling for OTEL operator webhook targeting +- Sidecar collector injection for instrumented pods +- Python auto-instrumentation for Jupyter notebooks +- Prometheus metrics export on port 8889 + +#### Enabling OpenTelemetry + +To install **with** OpenTelemetry enabled: + +```bash +helm --namespace mlrun install my-mlrun \ + --set global.registry.url=<registry-url> \ + --set global.registry.secretName=registry-credentials \ + --set opentelemetry-operator.enabled=true \ + --set opentelemetry.namespaceLabel.enabled=true \ + --set opentelemetry.collector.enabled=true \ + --set opentelemetry.collector.scrapeMode=otel \ + --set opentelemetry.instrumentation.enabled=true \ + mlrun/mlrun-ce +``` + +To **enable** OpenTelemetry on an existing installation: + +```bash +helm --namespace mlrun upgrade my-mlrun \ + --set opentelemetry-operator.enabled=true \ + --set opentelemetry.namespaceLabel.enabled=true \ + --set opentelemetry.collector.enabled=true \ + --set opentelemetry.collector.scrapeMode=otel \ + --set opentelemetry.instrumentation.enabled=true \ + mlrun/mlrun-ce +``` + 
+To **disable** OpenTelemetry (default): + +```bash +helm --namespace mlrun upgrade my-mlrun \ + --set opentelemetry-operator.enabled=false \ + --set opentelemetry.collector.enabled=false \ + --set opentelemetry.instrumentation.enabled=false \ + --set opentelemetry.namespaceLabel.enabled=false \ + --set opentelemetry.collector.scrapeMode=direct \ + mlrun/mlrun-ce +``` + +#### Custom Resource Limits + +Configure collector sidecar resources: + +```bash +helm --namespace mlrun install my-mlrun \ + --set opentelemetry.collector.resources.requests.cpu=100m \ + --set opentelemetry.collector.resources.requests.memory=128Mi \ + --set opentelemetry.collector.resources.limits.cpu=500m \ + --set opentelemetry.collector.resources.limits.memory=512Mi \ + mlrun/mlrun-ce +``` + +#### Enabling Java Auto-Instrumentation + +To enable Java auto-instrumentation (disabled by default): + +```bash +helm --namespace mlrun install my-mlrun \ + --set opentelemetry.instrumentation.java.enabled=true \ + mlrun/mlrun-ce +``` + +#### Adding OpenTelemetry to Custom Workloads + +To instrument your own deployments with the OTEL sidecar and Python auto-instrumentation: + +1. Ensure your namespace has the OpenTelemetry label: + ```bash + kubectl label namespace <namespace> opentelemetry.io/inject=enabled + ``` + +2. 
Add these annotations to your pod template metadata: + ```yaml + metadata: + annotations: + sidecar.opentelemetry.io/inject: "<release-name>-otel-collector" + instrumentation.opentelemetry.io/inject-python: "<release-name>-otel-instrumentation" + prometheus.io/scrape: "true" + prometheus.io/scrape-mode: "otel" + prometheus.io/port: "8889" + ``` + +#### Preventing Prometheus/OTEL Metric Overlap + +To prevent duplicate metrics when using both Prometheus direct scraping and OpenTelemetry, +MLRun CE uses a **scrape-mode** annotation system: + +| Scrape Mode | Description | Use Case | +|-------------|-------------|----------| +| `direct` | Direct Prometheus scraping only | **Default** - When OTEL is disabled | +| `otel` | Metrics collected via OTEL sidecar only | **Recommended when OTEL enabled** | +| `both` | Both OTEL and direct scraping | Debugging/transition only | + +> **Note:** The default scrape mode is `direct`. When enabling OpenTelemetry, you must set +> `--set opentelemetry.collector.scrapeMode=otel` to collect metrics via the OTEL sidecar. 
+ +**How it works:** +- OTEL-collected metrics have the `mlrun_otel_` prefix and `metrics_source=otel_collector` label +- Direct-scraped metrics have `metrics_source=direct_scrape` label +- Prometheus scrape configs filter based on `prometheus.io/scrape-mode` annotation + +**Configure scrape mode when enabling OTEL:** +```bash +helm --namespace mlrun install my-mlrun \ + --set opentelemetry-operator.enabled=true \ + --set opentelemetry.collector.enabled=true \ + --set opentelemetry.collector.scrapeMode=otel \ + --set opentelemetry.instrumentation.enabled=true \ + mlrun/mlrun-ce +``` + +**Query metrics by source in Prometheus:** +```promql +# OTEL-collected metrics only +{metrics_source="otel_collector"} + +# Direct-scraped metrics only +{metrics_source="direct_scrape"} + +# OTEL metrics use prefix +mlrun_otel_http_server_duration_seconds_bucket{...} +``` + +#### Split Installation (Admin/Non-Admin) + +For multi-tenant clusters, install the operator CRDs at the cluster level and collectors in user namespaces: + +**Controller namespace (admin):** +```bash +# Operator only - no namespace label needed (no instrumented workloads here) +helm --namespace controller install mlrun-controller \ + -f admin_installation_values.yaml \ + mlrun/mlrun-ce +``` + +**User namespace (non-admin):** +```bash +# Collector CRs + namespace label applied automatically +helm --namespace mlrun install my-mlrun \ + -f non_admin_installation_values.yaml \ + mlrun/mlrun-ce +``` + ### Working with ECR To work with ECR, you must create a secret with your AWS credentials and a secret with ECR Token while providing both secret names to the helm install command. 
@@ -282,6 +554,6 @@ Refer to the [**Kubeflow documentation**](https://www.kubeflow.org/docs/started/ This table shows the versions of the main components in the MLRun CE chart: -| MLRun CE | MLRun | Nuclio | Jupyter | MPI Operator | SeaweedFS | Spark Operator | Pipelines | Kube-Prometheus-Stack | -|------------|--------|--------|---------|--------------|-----------|----------------|-----------|-----------------------| -| **0.11.0** | 1.11.0 | 1.15.9 | 4.5.0 | 0.2.3 | 4.0.407 | 2.1.0 | 2.14.3 | 72.1.1 | +| MLRun CE | MLRun | Nuclio | Jupyter | MPI Operator | SeaweedFS | Spark Operator | Pipelines | Kube-Prometheus-Stack | OpenTelemetry Operator | +|------------|--------|--------|---------|--------------|-----------|----------------|-----------|-----------------------|------------------------| +| **0.11.0** | 1.11.0 | 1.15.9 | 4.5.0 | 0.2.3 | 4.0.407 | 2.1.0 | 2.14.3 | 72.1.1 | 0.78.1 | diff --git a/charts/mlrun-ce/admin_installation_values.yaml b/charts/mlrun-ce/admin_installation_values.yaml index c9b2bf23..9de962a9 100644 --- a/charts/mlrun-ce/admin_installation_values.yaml +++ b/charts/mlrun-ce/admin_installation_values.yaml @@ -57,3 +57,28 @@ strimzi-kafka-operator: kafka: enabled: false + +# OpenTelemetry Operator - enabled for CRD installation at cluster level +opentelemetry-operator: + enabled: true + admissionWebhooks: + certManager: + enabled: false + autoGenerateCert: + enabled: true + # Only apply webhooks to namespaces with the opentelemetry label + namespaceSelector: + matchLabels: + opentelemetry.io/inject: "enabled" + +# OpenTelemetry CRs - disabled at admin level, enabled in user namespaces +# Note: Controller namespace does NOT need the opentelemetry label since +# no workloads are instrumented here - only the operator runs here +opentelemetry: + namespaceLabel: + enabled: false + collector: + enabled: false + instrumentation: + enabled: false + diff --git a/charts/mlrun-ce/non_admin_cluster_ip_installation_values.yaml 
b/charts/mlrun-ce/non_admin_cluster_ip_installation_values.yaml index a98463ad..87c5e58f 100644 --- a/charts/mlrun-ce/non_admin_cluster_ip_installation_values.yaml +++ b/charts/mlrun-ce/non_admin_cluster_ip_installation_values.yaml @@ -64,3 +64,19 @@ kafka: kube-prometheus-stack: enabled: false + +# OpenTelemetry Operator - disabled, CRDs installed at controller level +opentelemetry-operator: + enabled: false + +# OpenTelemetry CRs - enabled for user namespace +# The namespace will be labeled with opentelemetry.io/inject=enabled +# so the operator can inject sidecars into pods +opentelemetry: + namespaceLabel: + enabled: true + collector: + enabled: true + instrumentation: + enabled: true + diff --git a/charts/mlrun-ce/non_admin_installation_values.yaml b/charts/mlrun-ce/non_admin_installation_values.yaml index d84f02ee..793c38a4 100644 --- a/charts/mlrun-ce/non_admin_installation_values.yaml +++ b/charts/mlrun-ce/non_admin_installation_values.yaml @@ -58,3 +58,19 @@ kafka: kube-prometheus-stack: enabled: false + +# OpenTelemetry Operator - disabled, CRDs installed at controller level +opentelemetry-operator: + enabled: false + +# OpenTelemetry CRs - enabled for user namespace +# The namespace will be labeled with opentelemetry.io/inject=enabled +# so the operator can inject sidecars into pods +opentelemetry: + namespaceLabel: + enabled: true + collector: + enabled: true + instrumentation: + enabled: true + diff --git a/charts/mlrun-ce/requirements.lock b/charts/mlrun-ce/requirements.lock index fd36b055..9b938f12 100644 --- a/charts/mlrun-ce/requirements.lock +++ b/charts/mlrun-ce/requirements.lock @@ -20,5 +20,8 @@ dependencies: - name: strimzi-kafka-operator repository: https://strimzi.io/charts/ version: 0.48.0 -digest: sha256:f87ec580f73178cfc897d57e26f5d7b049900f1b7ef75bfe198ca327eb2ed06d -generated: "2026-02-12T23:52:46.490844+02:00" +- name: opentelemetry-operator + repository: https://open-telemetry.github.io/opentelemetry-helm-charts + version: 0.78.1 
+digest: sha256:4a47a90d97b21b41cd3bb7f7e9b70b56b42b95fe067bb012e4d490fa1912e18f +generated: "2026-03-24T16:04:27.962041+02:00" diff --git a/charts/mlrun-ce/requirements.yaml b/charts/mlrun-ce/requirements.yaml index 900754a0..918f2195 100644 --- a/charts/mlrun-ce/requirements.yaml +++ b/charts/mlrun-ce/requirements.yaml @@ -25,3 +25,7 @@ dependencies: repository: "https://strimzi.io/charts/" version: "0.48.0" condition: strimzi-kafka-operator.enabled + - name: opentelemetry-operator + repository: "https://open-telemetry.github.io/opentelemetry-helm-charts" + version: "0.78.1" + condition: opentelemetry-operator.enabled diff --git a/charts/mlrun-ce/templates/NOTES.txt b/charts/mlrun-ce/templates/NOTES.txt index 980d3b54..a84d79e5 100644 --- a/charts/mlrun-ce/templates/NOTES.txt +++ b/charts/mlrun-ce/templates/NOTES.txt @@ -120,5 +120,55 @@ TimescaleDB is available at: {{- end }} {{- end }} +{{- if index .Values "opentelemetry-operator" "enabled" }} +{{- "\n" }} +OpenTelemetry Operator is enabled! +- Operator manages OpenTelemetryCollector and Instrumentation CRs +- Namespace selector: opentelemetry.io/inject=enabled +{{- if .Values.opentelemetry.collector.enabled }} +{{- "\n" }} +OpenTelemetry Collector (sidecar mode): +- Collector CR: {{ .Release.Name }}-otel-collector +- Mode: {{ .Values.opentelemetry.collector.mode }} +- OTLP gRPC endpoint: localhost:{{ .Values.opentelemetry.collector.otlp.grpcPort }} (inside pod) +- OTLP HTTP endpoint: localhost:{{ .Values.opentelemetry.collector.otlp.httpPort }} (inside pod) +- Prometheus metrics port: {{ .Values.opentelemetry.collector.prometheus.port }} +- Prometheus scrape mode: {{ .Values.opentelemetry.collector.scrapeMode }} +{{- if eq .Values.opentelemetry.collector.scrapeMode "direct" }} + +⚠️ WARNING: Scrape mode is "direct" - OTEL sidecar metrics will NOT be collected! 
+ To collect metrics via OTEL, reinstall with: --set opentelemetry.collector.scrapeMode=otel +{{- end }} +{{- end }} +{{- if .Values.opentelemetry.instrumentation.enabled }} +{{- "\n" }} +OpenTelemetry Auto-Instrumentation: +- Instrumentation CR: {{ .Release.Name }}-otel-instrumentation +{{- if .Values.opentelemetry.instrumentation.python.enabled }} +- Python auto-instrumentation: enabled +{{- end }} +{{- if .Values.opentelemetry.instrumentation.java.enabled }} +- Java auto-instrumentation: enabled +{{- end }} +{{- end }} +{{- if .Values.opentelemetry.namespaceLabel.enabled }} +{{- "\n" }} +Namespace Label: +- Namespace {{ .Release.Namespace }} is labeled with: {{ .Values.opentelemetry.namespaceLabel.key }}={{ .Values.opentelemetry.namespaceLabel.value }} +{{- end }} +{{- "\n" }} +Prometheus Scrape Modes: +- "otel" : Metrics collected via OTEL sidecar only (recommended) +- "direct" : Direct Prometheus scraping only (current: {{ .Values.opentelemetry.collector.scrapeMode }}) +- "both" : Both methods active (for debugging) +{{- "\n" }} +To add OTEL instrumentation to your pods, add these annotations: + sidecar.opentelemetry.io/inject: "{{ .Release.Name }}-otel-collector" + instrumentation.opentelemetry.io/inject-python: "{{ .Release.Name }}-otel-instrumentation" + prometheus.io/scrape: "true" + prometheus.io/scrape-mode: "otel" + prometheus.io/port: "{{ .Values.opentelemetry.collector.prometheus.port }}" +{{- end }} + Happy MLOPSing!!! :] {{- end }} diff --git a/charts/mlrun-ce/templates/_helpers.tpl b/charts/mlrun-ce/templates/_helpers.tpl index 069e4e10..1a0bfe2a 100644 --- a/charts/mlrun-ce/templates/_helpers.tpl +++ b/charts/mlrun-ce/templates/_helpers.tpl @@ -348,3 +348,71 @@ TimescaleDB connection string for MLRun model monitoring postgresql://{{ .Values.timescaledb.auth.username | urlquery }}:{{ .Values.timescaledb.auth.password | urlquery }}@{{ include "mlrun-ce.timescaledb.fullname" . 
}}:{{ .Values.timescaledb.service.port }}/{{ .Values.timescaledb.auth.database }} {{- end }} +{{/* +============================================================================= +OpenTelemetry helpers +============================================================================= +*/}} + +{{/* +OpenTelemetry Collector name +*/}} +{{- define "mlrun-ce.otel.collector.name" -}} +{{- default "otel-collector" .Values.opentelemetry.collector.nameOverride | trunc 63 | trimSuffix "-" }} +{{- end }} + +{{/* +OpenTelemetry Collector fullname +*/}} +{{- define "mlrun-ce.otel.collector.fullname" -}} +{{- if .Values.opentelemetry.collector.fullnameOverride }} +{{- .Values.opentelemetry.collector.fullnameOverride | trunc 63 | trimSuffix "-" }} +{{- else }} +{{- $name := default "otel-collector" .Values.opentelemetry.collector.nameOverride }} +{{- if contains $name .Release.Name }} +{{- .Release.Name | trunc 63 | trimSuffix "-" }} +{{- else }} +{{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" }} +{{- end }} +{{- end }} +{{- end }} + +{{/* +OpenTelemetry Instrumentation name +*/}} +{{- define "mlrun-ce.otel.instrumentation.name" -}} +{{- default "otel-instrumentation" .Values.opentelemetry.instrumentation.nameOverride | trunc 63 | trimSuffix "-" }} +{{- end }} + +{{/* +OpenTelemetry Instrumentation fullname +*/}} +{{- define "mlrun-ce.otel.instrumentation.fullname" -}} +{{- if .Values.opentelemetry.instrumentation.fullnameOverride }} +{{- .Values.opentelemetry.instrumentation.fullnameOverride | trunc 63 | trimSuffix "-" }} +{{- else }} +{{- $name := default "otel-instrumentation" .Values.opentelemetry.instrumentation.nameOverride }} +{{- if contains $name .Release.Name }} +{{- .Release.Name | trunc 63 | trimSuffix "-" }} +{{- else }} +{{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" }} +{{- end }} +{{- end }} +{{- end }} + +{{/* +OpenTelemetry common labels +*/}} +{{- define "mlrun-ce.otel.labels" -}} +{{ include "mlrun-ce.common.labels" . 
}} +{{ include "mlrun-ce.otel.selectorLabels" . }} +{{- end }} + +{{/* +OpenTelemetry selector labels +*/}} +{{- define "mlrun-ce.otel.selectorLabels" -}} +{{ include "mlrun-ce.common.selectorLabels" . }} +app.kubernetes.io/component: opentelemetry +{{- end }} + diff --git a/charts/mlrun-ce/templates/jupyter-notebook/deployment.yaml b/charts/mlrun-ce/templates/jupyter-notebook/deployment.yaml index 6e5374f2..83135a1e 100644 --- a/charts/mlrun-ce/templates/jupyter-notebook/deployment.yaml +++ b/charts/mlrun-ce/templates/jupyter-notebook/deployment.yaml @@ -14,6 +14,22 @@ spec: metadata: labels: {{- include "mlrun-ce.jupyter.selectorLabels" . | nindent 8 }} + {{- if and .Values.opentelemetry.collector.enabled .Values.opentelemetry.instrumentation.enabled }} + annotations: + # OpenTelemetry sidecar injection + sidecar.opentelemetry.io/inject: "{{ include "mlrun-ce.otel.collector.fullname" . }}" + # Python auto-instrumentation injection + instrumentation.opentelemetry.io/inject-python: "{{ include "mlrun-ce.otel.instrumentation.fullname" . 
}}" + # Prometheus scraping configuration + # scrape-mode controls how metrics are collected to prevent duplicates: + # "otel" - Only OTEL sidecar metrics (recommended) + # "both" - Both OTEL and direct scraping (debugging) + # "direct" - Only direct scraping (OTEL metrics ignored) + prometheus.io/scrape: "true" + prometheus.io/scrape-mode: {{ .Values.opentelemetry.collector.scrapeMode | quote }} + prometheus.io/port: "{{ .Values.opentelemetry.collector.prometheus.port }}" + prometheus.io/path: "/metrics" + {{- end }} spec: {{- with .Values.jupyterNotebook.image.pullSecrets }} imagePullSecrets: diff --git a/charts/mlrun-ce/templates/opentelemetry/collector.yaml b/charts/mlrun-ce/templates/opentelemetry/collector.yaml new file mode 100644 index 00000000..e32f3d4a --- /dev/null +++ b/charts/mlrun-ce/templates/opentelemetry/collector.yaml @@ -0,0 +1,102 @@ +{{- if .Values.opentelemetry.collector.enabled }} +apiVersion: opentelemetry.io/v1beta1 +kind: OpenTelemetryCollector +metadata: + name: {{ include "mlrun-ce.otel.collector.fullname" . }} + namespace: {{ .Release.Namespace }} + labels: + {{- include "mlrun-ce.otel.labels" . 
| nindent 4 }} + annotations: + # Delay collector CR creation until after CRDs are installed by the operator + helm.sh/hook: post-install,post-upgrade + helm.sh/hook-weight: "10" +spec: + mode: {{ .Values.opentelemetry.collector.mode }} + resources: + {{- toYaml .Values.opentelemetry.collector.resources | nindent 4 }} + # Pod annotations for Prometheus scraping + podAnnotations: + prometheus.io/scrape: "true" + prometheus.io/port: "{{ .Values.opentelemetry.collector.prometheus.port }}" + prometheus.io/path: "/metrics" + config: + receivers: + otlp: + protocols: + grpc: + endpoint: 0.0.0.0:{{ .Values.opentelemetry.collector.otlp.grpcPort }} + http: + endpoint: 0.0.0.0:{{ .Values.opentelemetry.collector.otlp.httpPort }} + + processors: + # Batch processor for efficient metric export + batch: + send_batch_size: 10000 + timeout: 10s + # Memory limiter to prevent OOM + memory_limiter: + check_interval: 1s + limit_percentage: 80 + spike_limit_percentage: 25 + # Resource detection for Kubernetes metadata + resourcedetection: + detectors: + - env + - system + timeout: 5s + override: false + + exporters: + # Prometheus exporter for metrics - scraped by Prometheus + prometheus: + endpoint: 0.0.0.0:{{ .Values.opentelemetry.collector.prometheus.port }} + # Metric namespace prefix helps distinguish OTEL-collected metrics + # from directly-scraped metrics in Prometheus queries + namespace: {{ .Values.opentelemetry.collector.prometheus.namespace }} + const_labels: + collector_mode: sidecar + metrics_source: otel_collector + resource_to_telemetry_conversion: + enabled: true + # Debug exporter for troubleshooting (logs to stdout) + debug: + verbosity: basic + sampling_initial: 5 + sampling_thereafter: 200 + + extensions: + health_check: + endpoint: 0.0.0.0:13133 + + service: + extensions: + - health_check + pipelines: + # Metrics pipeline: OTLP -> processing -> Prometheus export + metrics: + receivers: + - otlp + processors: + - memory_limiter + - resourcedetection + - batch + 
exporters: + - prometheus + - debug + # Traces pipeline: OTLP -> processing -> debug (no trace backend configured yet) + traces: + receivers: + - otlp + processors: + - memory_limiter + - resourcedetection + - batch + exporters: + - debug + telemetry: + logs: + level: info + metrics: + address: 0.0.0.0:8888 +{{- end }} + diff --git a/charts/mlrun-ce/templates/opentelemetry/instrumentation.yaml b/charts/mlrun-ce/templates/opentelemetry/instrumentation.yaml new file mode 100644 index 00000000..7b9f0767 --- /dev/null +++ b/charts/mlrun-ce/templates/opentelemetry/instrumentation.yaml @@ -0,0 +1,83 @@ +{{- if .Values.opentelemetry.instrumentation.enabled }} +apiVersion: opentelemetry.io/v1alpha1 +kind: Instrumentation +metadata: + name: {{ include "mlrun-ce.otel.instrumentation.fullname" . }} + namespace: {{ .Release.Namespace }} + labels: + {{- include "mlrun-ce.otel.labels" . | nindent 4 }} + annotations: + # Delay instrumentation CR creation until after CRDs are installed by the operator + helm.sh/hook: post-install,post-upgrade + helm.sh/hook-weight: "10" +spec: + # Propagators for distributed tracing context + propagators: + {{- toYaml .Values.opentelemetry.instrumentation.propagators | nindent 4 }} + + # Sampler configuration + sampler: + type: {{ .Values.opentelemetry.instrumentation.sampler.type }} + argument: {{ .Values.opentelemetry.instrumentation.sampler.argument | quote }} + + # Environment variables injected into instrumented pods + env: + # Service name will be auto-detected from pod metadata + - name: OTEL_SERVICE_NAME + valueFrom: + fieldRef: + fieldPath: metadata.labels['app.kubernetes.io/name'] + # Resource attributes for better observability. + # k8s.container.name is intentionally not set here: the Downward API exposes no + # container-name field; the operator is expected to add it itself on injection. + - name: OTEL_RESOURCE_ATTRIBUTES + value: >- + k8s.namespace.name=$(OTEL_RESOURCE_ATTRIBUTES_NAMESPACE), + k8s.pod.name=$(OTEL_RESOURCE_ATTRIBUTES_POD_NAME), + service.namespace=$(OTEL_RESOURCE_ATTRIBUTES_NAMESPACE) + - name: 
OTEL_RESOURCE_ATTRIBUTES_NAMESPACE + valueFrom: + fieldRef: + fieldPath: metadata.namespace + - name: OTEL_RESOURCE_ATTRIBUTES_POD_NAME + valueFrom: + fieldRef: + fieldPath: metadata.name + # Export metrics via OTLP to the sidecar collector + - name: OTEL_METRICS_EXPORTER + value: otlp + - name: OTEL_TRACES_EXPORTER + value: otlp + - name: OTEL_LOGS_EXPORTER + value: none + + # Python auto-instrumentation configuration + {{- if .Values.opentelemetry.instrumentation.python.enabled }} + python: + image: {{ .Values.opentelemetry.instrumentation.python.image.repository }}:{{ .Values.opentelemetry.instrumentation.python.image.tag }} + resourceRequirements: + {{- toYaml .Values.opentelemetry.instrumentation.python.resources | nindent 6 }} + env: + # Python-specific OTEL configuration + - name: OTEL_PYTHON_LOG_CORRELATION + value: "true" + - name: OTEL_PYTHON_LOGGING_AUTO_INSTRUMENTATION_ENABLED + value: "false" + # Disable specific instrumentations that might cause issues + - name: OTEL_PYTHON_DISABLED_INSTRUMENTATIONS + value: "" + {{- end }} + + # Java auto-instrumentation configuration + {{- if .Values.opentelemetry.instrumentation.java.enabled }} + java: + image: {{ .Values.opentelemetry.instrumentation.java.image.repository }}:{{ .Values.opentelemetry.instrumentation.java.image.tag }} + resourceRequirements: + {{- toYaml .Values.opentelemetry.instrumentation.java.resources | nindent 6 }} + env: + # Java-specific OTEL configuration + - name: OTEL_INSTRUMENTATION_COMMON_DEFAULT_ENABLED + value: "true" + {{- end }} +{{- end }} + diff --git a/charts/mlrun-ce/templates/opentelemetry/namespace-label.yaml b/charts/mlrun-ce/templates/opentelemetry/namespace-label.yaml new file mode 100644 index 00000000..985bd4fb --- /dev/null +++ b/charts/mlrun-ce/templates/opentelemetry/namespace-label.yaml @@ -0,0 +1,15 @@ +{{- if and (or .Values.opentelemetry.collector.enabled 
.Values.opentelemetry.instrumentation.enabled) .Values.opentelemetry.namespaceLabel.enabled -}} +# Label the namespace for OpenTelemetry operator webhook injection +# The operator will only inject sidecars into namespaces with this label +apiVersion: v1 +kind: Namespace +metadata: + name: {{ .Release.Namespace }} + labels: +{{ include "mlrun-ce.otel.labels" . | indent 4 }} + {{ .Values.opentelemetry.namespaceLabel.key }}: {{ .Values.opentelemetry.namespaceLabel.value | quote }} + annotations: + # This resource only patches the existing namespace with the required label + # It does not create the namespace (namespace should already exist) + helm.sh/resource-policy: keep +{{- end -}} diff --git a/charts/mlrun-ce/templates/opentelemetry/rbac.yaml b/charts/mlrun-ce/templates/opentelemetry/rbac.yaml new file mode 100644 index 00000000..0ffe62b4 --- /dev/null +++ b/charts/mlrun-ce/templates/opentelemetry/rbac.yaml @@ -0,0 +1,58 @@ +{{- if or .Values.opentelemetry.collector.enabled .Values.opentelemetry.instrumentation.enabled }} +--- +# ServiceAccount for OpenTelemetry collector sidecar +apiVersion: v1 +kind: ServiceAccount +metadata: + name: otel-collector + namespace: {{ .Release.Namespace }} + labels: + {{- include "mlrun-ce.otel.labels" . | nindent 4 }} +--- +# Role for OpenTelemetry collector to access Kubernetes resources +apiVersion: rbac.authorization.k8s.io/v1 +kind: Role +metadata: + name: otel-collector + namespace: {{ .Release.Namespace }} + labels: + {{- include "mlrun-ce.otel.labels" . 
| nindent 4 }} +rules: + # Allow reading pod metadata for resource detection + - apiGroups: + - "" + resources: + - pods + - namespaces + verbs: + - get + - list + - watch + # Allow reading configmaps for collector configuration + - apiGroups: + - "" + resources: + - configmaps + verbs: + - get + - list + - watch +--- +# RoleBinding for OpenTelemetry collector +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: otel-collector + namespace: {{ .Release.Namespace }} + labels: + {{- include "mlrun-ce.otel.labels" . | nindent 4 }} +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: Role + name: otel-collector +subjects: + - kind: ServiceAccount + name: otel-collector + namespace: {{ .Release.Namespace }} +{{- end }} + diff --git a/charts/mlrun-ce/values.yaml b/charts/mlrun-ce/values.yaml index bed3e5b0..2999f6d5 100644 --- a/charts/mlrun-ce/values.yaml +++ b/charts/mlrun-ce/values.yaml @@ -533,6 +533,103 @@ kube-prometheus-stack: service: type: NodePort nodePort: 30020 + prometheusSpec: + # Additional scrape configs for OpenTelemetry collector sidecars + # This creates clear separation between direct scraping and OTEL-collected metrics + additionalScrapeConfigs: + # Job for scraping OTEL collector sidecars (metrics on port 8889) + # Only scrapes pods with prometheus.io/scrape-mode: "otel" or "both" + - job_name: 'otel-collector-sidecars' + kubernetes_sd_configs: + - role: pod + relabel_configs: + # Only scrape pods with OTEL sidecar (port 8889) + - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_port] + action: keep + regex: "8889" + # Only scrape if scrape-mode is "otel" or "both" (or legacy scrape=true with port 8889) + - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scrape_mode] + action: keep + regex: (otel|both|) + - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scrape] + action: keep + regex: true + # Set metrics path + - source_labels: 
[__meta_kubernetes_pod_annotation_prometheus_io_path] + action: replace + target_label: __metrics_path__ + regex: (.+) + # Set target address with port + - source_labels: [__address__, __meta_kubernetes_pod_annotation_prometheus_io_port] + action: replace + regex: ([^:]+)(?::\d+)?;(\d+) + replacement: $1:$2 + target_label: __address__ + # Add kubernetes labels + - action: labelmap + regex: __meta_kubernetes_pod_label_(.+) + - source_labels: [__meta_kubernetes_namespace] + action: replace + target_label: kubernetes_namespace + - source_labels: [__meta_kubernetes_pod_name] + action: replace + target_label: kubernetes_pod_name + # Add metric relabeling to identify OTEL-sourced metrics + metric_relabel_configs: + - action: replace + target_label: metrics_source + replacement: otel_collector + + # Job for direct application scraping (non-OTEL) + # Only scrapes pods with prometheus.io/scrape-mode: "direct" or "both" + # Excludes OTEL sidecar port (8889) + - job_name: 'kubernetes-pods-direct' + kubernetes_sd_configs: + - role: pod + relabel_configs: + # Only scrape pods with prometheus.io/scrape=true + - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scrape] + action: keep + regex: true + # Exclude OTEL sidecar port (8889) - those are handled by otel-collector-sidecars job + - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_port] + action: drop + regex: "8889" + # Only scrape if scrape-mode is "direct" or "both" or not set (default to direct) + - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scrape_mode] + action: keep + regex: (direct|both|) + # Set metrics path + - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_path] + action: replace + target_label: __metrics_path__ + regex: (.+) + replacement: $1 + - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_path] + action: replace + target_label: __metrics_path__ + regex: () + replacement: /metrics + # Set target address with port + - source_labels: 
[__address__, __meta_kubernetes_pod_annotation_prometheus_io_port] + action: replace + regex: ([^:]+)(?::\d+)?;(\d+) + replacement: $1:$2 + target_label: __address__ + # Add kubernetes labels + - action: labelmap + regex: __meta_kubernetes_pod_label_(.+) + - source_labels: [__meta_kubernetes_namespace] + action: replace + target_label: kubernetes_namespace + - source_labels: [__meta_kubernetes_pod_name] + action: replace + target_label: kubernetes_pod_name + # Add metric relabeling to identify direct-scraped metrics + metric_relabel_configs: + - action: replace + target_label: metrics_source + replacement: direct_scrape kube-state-metrics: fullnameOverride: state-metrics prometheus-node-exporter: @@ -645,3 +742,127 @@ kafka: # Empty means "use the release namespace" # Example: "controller" if that's where you installed the operator operatorNamespace: "" + +# ============================================================================= +# OpenTelemetry Operator configuration +# Installs the OpenTelemetry Operator for managing collectors and instrumentation +# ============================================================================= +opentelemetry-operator: + enabled: false + # Admission webhooks configuration + admissionWebhooks: + certManager: + enabled: false + autoGenerateCert: + enabled: true + # Only apply webhooks to namespaces with the opentelemetry label + # This ensures the operator only monitors labeled namespaces + namespaceSelector: + matchLabels: + opentelemetry.io/inject: "enabled" + manager: + # Collector image used by the operator when creating collectors + collectorImage: + repository: otel/opentelemetry-collector-contrib + tag: 0.115.0 + # Auto-instrumentation images + autoInstrumentationImage: + python: + repository: ghcr.io/open-telemetry/opentelemetry-operator/autoinstrumentation-python + tag: 0.50b0 + java: + repository: ghcr.io/open-telemetry/opentelemetry-operator/autoinstrumentation-java + tag: 2.10.0 + resources: + requests: + cpu: 50m 
+ memory: 64Mi + limits: + cpu: 200m + memory: 256Mi + +# ============================================================================= +# OpenTelemetry Collector and Instrumentation CRs +# These are managed separately from the operator for admin/non-admin split +# ============================================================================= +opentelemetry: + # Namespace label for enabling OpenTelemetry monitoring + # The namespace must have this label for the operator to inject sidecars + namespaceLabel: + enabled: false + key: "opentelemetry.io/inject" + value: "enabled" + + # OpenTelemetry Collector configuration (sidecar mode) + collector: + enabled: false + nameOverride: "" + fullnameOverride: "" + # Sidecar mode - collector runs as a sidecar in instrumented pods + mode: sidecar + # Collector sidecar container resources + resources: + requests: + cpu: 50m + memory: 64Mi + limits: + cpu: 200m + memory: 256Mi + # Prometheus exporter port for metrics + prometheus: + port: 8889 + # Metric prefix added to all OTEL-collected metrics + # Helps distinguish OTEL metrics from directly-scraped metrics + namespace: mlrun_otel + # OTLP receiver configuration + otlp: + grpcPort: 4317 + httpPort: 4318 + # Prometheus scrape mode for OTEL-instrumented pods + # Options: + # - "direct": Direct Prometheus scraping only (default when OTEL disabled) + # - "otel": Metrics collected via OTEL sidecar only (recommended when OTEL enabled) + # - "both": Both OTEL sidecar and direct scraping (for debugging/transition) + # When enabling OTEL, set this to "otel" to prevent duplicate metrics + scrapeMode: "direct" + + # Instrumentation configuration for auto-instrumentation + instrumentation: + enabled: false + nameOverride: "" + fullnameOverride: "" + # Propagators for distributed tracing context + propagators: + - tracecontext + - baggage + # Sampler configuration + sampler: + type: parentbased_traceidratio + argument: "1" + # Python auto-instrumentation + python: + enabled: true + image: 
+ repository: ghcr.io/open-telemetry/opentelemetry-operator/autoinstrumentation-python + tag: 0.50b0 + resources: + requests: + cpu: 50m + memory: 64Mi + limits: + cpu: 200m + memory: 256Mi + # Java auto-instrumentation (disabled by default, enable if needed) + java: + enabled: false + image: + repository: ghcr.io/open-telemetry/opentelemetry-operator/autoinstrumentation-java + tag: 2.10.0 + resources: + requests: + cpu: 50m + memory: 64Mi + limits: + cpu: 500m + memory: 512Mi + diff --git a/tests/helm-template-test.sh b/tests/helm-template-test.sh new file mode 100755 index 00000000..8b9d79c2 --- /dev/null +++ b/tests/helm-template-test.sh @@ -0,0 +1,489 @@ +#!/usr/bin/env bash +# Copyright 2025 Iguazio +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+#
+# Helm template tests for MLRun CE chart
+# Validates that templates render correctly with various configurations
+
+set -o nounset
+set -o pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+CHART_DIR="${SCRIPT_DIR}/../charts/mlrun-ce"
+
+# Colors for output
+RED='\033[0;31m'
+GREEN='\033[0;32m'
+YELLOW='\033[1;33m'
+NC='\033[0m' # No Color
+
+TESTS_PASSED=0
+TESTS_FAILED=0
+
+log_info() { echo -e "${GREEN}[INFO]${NC} $1"; }
+log_warn() { echo -e "${YELLOW}[WARN]${NC} $1"; }
+log_error() { echo -e "${RED}[ERROR]${NC} $1"; }
+log_test() { echo -e "${GREEN}[TEST]${NC} $1"; }
+log_pass() { echo -e "${GREEN}[PASS]${NC} $1"; ((TESTS_PASSED++)) || true; }
+log_fail() { echo -e "${RED}[FAIL]${NC} $1"; ((TESTS_FAILED++)) || true; }
+
+# Render one template ($1) with any extra helm args; helm's stderr is discarded
+render_template() {
+    local template="$1"
+    shift
+    helm template test "${CHART_DIR}" \
+        --skip-schema-validation \
+        --show-only "${template}" \
+        "$@" 2>/dev/null
+}
+
+# Render the whole chart with any extra helm args; helm's stderr is discarded
+render_all() {
+    helm template test "${CHART_DIR}" \
+        --skip-schema-validation \
+        "$@" 2>/dev/null
+}
+
+# Pass when output ($1) matches grep pattern ($2); here-string avoids a pipefail/SIGPIPE false result
+assert_contains() {
+    local output="$1"
+    local expected="$2"
+    local test_name="$3"
+
+    if grep -q -- "$expected" <<< "$output"; then
+        log_pass "$test_name"
+        return 0
+    else
+        log_fail "$test_name - expected to find: $expected"
+        return 1
+    fi
+}
+
+# Pass when output ($1) does NOT match grep pattern ($2); here-string avoids a pipefail/SIGPIPE false result
+assert_not_contains() {
+    local output="$1"
+    local not_expected="$2"
+    local test_name="$3"
+
+    if grep -q -- "$not_expected" <<< "$output"; then
+        log_fail "$test_name - should not contain: $not_expected"
+        return 1
+    else
+        log_pass "$test_name"
+        return 0
+    fi
+}
+
+# Check if template renders (non-empty output)
+assert_renders() {
+    local output="$1"
+    local test_name="$2"
+
+    if [[ -n "$output" ]]; then
+        log_pass "$test_name"
+        return 0
+    else
+        log_fail "$test_name - template produced no output"
+        return 1
+    fi
+} + +# Check if template does NOT render (empty output or error) +assert_not_renders() { + local template="$1" + local test_name="$2" + shift 2 + + local output + output=$(render_template "$template" "$@" 2>&1) || true + + if [[ -z "$output" ]] || echo "$output" | grep -q "could not find template"; then + log_pass "$test_name" + return 0 + else + log_fail "$test_name - template should not render" + return 1 + fi +} + +# ============================================================================ +# OpenTelemetry Tests +# ============================================================================ + +test_otel_collector_default() { + log_test "OpenTelemetry Collector - Enabled" + + local output + output=$(render_template "templates/opentelemetry/collector.yaml" \ + --set global.registry.url=test.io \ + --set opentelemetry.collector.enabled=true) + + assert_renders "$output" "Collector CR renders" + assert_contains "$output" "kind: OpenTelemetryCollector" "Has correct kind" + assert_contains "$output" "mode: sidecar" "Uses sidecar mode" + assert_contains "$output" "prometheus:" "Has Prometheus exporter" + assert_contains "$output" "endpoint: 0.0.0.0:8889" "Prometheus on port 8889" + assert_contains "$output" "otlp:" "Has OTLP receiver" + assert_contains "$output" "helm.sh/hook: post-install,post-upgrade" "Has Helm hooks" +} + +test_otel_collector_disabled() { + log_test "OpenTelemetry Collector - Disabled (default)" + + assert_not_renders "templates/opentelemetry/collector.yaml" \ + "Collector CR does not render when disabled (default)" +} + +test_otel_collector_resources() { + log_test "OpenTelemetry Collector - Custom resources" + + local output + output=$(render_template "templates/opentelemetry/collector.yaml" \ + --set global.registry.url=test.io \ + --set opentelemetry.collector.enabled=true \ + --set opentelemetry.collector.resources.requests.cpu=100m \ + --set opentelemetry.collector.resources.requests.memory=128Mi \ + --set 
opentelemetry.collector.resources.limits.cpu=500m \ + --set opentelemetry.collector.resources.limits.memory=512Mi) + + assert_contains "$output" "cpu: 100m" "Custom CPU request" + assert_contains "$output" "memory: 128Mi" "Custom memory request" + assert_contains "$output" "cpu: 500m" "Custom CPU limit" + assert_contains "$output" "memory: 512Mi" "Custom memory limit" +} + +test_otel_instrumentation_default() { + log_test "OpenTelemetry Instrumentation - Enabled" + + local output + output=$(render_template "templates/opentelemetry/instrumentation.yaml" \ + --set global.registry.url=test.io \ + --set opentelemetry.instrumentation.enabled=true) + + assert_renders "$output" "Instrumentation CR renders" + assert_contains "$output" "kind: Instrumentation" "Has correct kind" + assert_contains "$output" "tracecontext" "Has tracecontext propagator" + assert_contains "$output" "baggage" "Has baggage propagator" + assert_contains "$output" "parentbased_traceidratio" "Has sampler type" + assert_contains "$output" "python:" "Has Python instrumentation" + assert_contains "$output" "autoinstrumentation-python" "Uses Python auto-instrumentation image" +} + +test_otel_instrumentation_disabled() { + log_test "OpenTelemetry Instrumentation - Disabled (default)" + + assert_not_renders "templates/opentelemetry/instrumentation.yaml" \ + "Instrumentation CR does not render when disabled (default)" +} + +test_otel_instrumentation_java_enabled() { + log_test "OpenTelemetry Instrumentation - Java enabled" + + local output + output=$(render_template "templates/opentelemetry/instrumentation.yaml" \ + --set global.registry.url=test.io \ + --set opentelemetry.instrumentation.enabled=true \ + --set opentelemetry.instrumentation.java.enabled=true) + + assert_contains "$output" "java:" "Has Java instrumentation section" + assert_contains "$output" "autoinstrumentation-java" "Uses Java auto-instrumentation image" +} + +test_otel_rbac_default() { + log_test "OpenTelemetry RBAC - Enabled" + + local 
output + output=$(render_template "templates/opentelemetry/rbac.yaml" \ + --set global.registry.url=test.io \ + --set opentelemetry.collector.enabled=true) + + assert_renders "$output" "RBAC renders" + assert_contains "$output" "kind: ServiceAccount" "Has ServiceAccount" + assert_contains "$output" "kind: Role" "Has Role" + assert_contains "$output" "kind: RoleBinding" "Has RoleBinding" + assert_contains "$output" "name: otel-collector" "Has correct name" +} + +test_otel_rbac_disabled() { + log_test "OpenTelemetry RBAC - Disabled (default)" + + assert_not_renders "templates/opentelemetry/rbac.yaml" \ + "RBAC does not render when OTEL disabled (default)" +} + +test_jupyter_otel_annotations() { + log_test "Jupyter Deployment - OTEL annotations when enabled" + + local output + output=$(render_template "templates/jupyter-notebook/deployment.yaml" \ + --set global.registry.url=test.io \ + --set opentelemetry.collector.enabled=true \ + --set opentelemetry.instrumentation.enabled=true) + + assert_contains "$output" "sidecar.opentelemetry.io/inject:" "Has sidecar injection annotation" + assert_contains "$output" "instrumentation.opentelemetry.io/inject-python:" "Has Python instrumentation annotation" + assert_contains "$output" 'prometheus.io/scrape: "true"' "Has Prometheus scrape annotation" + assert_contains "$output" 'prometheus.io/scrape-mode:' "Has Prometheus scrape-mode annotation" + assert_contains "$output" 'prometheus.io/port: "8889"' "Has Prometheus port annotation" +} + +test_jupyter_no_otel_annotations_when_disabled() { + log_test "Jupyter Deployment - No OTEL annotations when disabled (default)" + + local output + output=$(render_template "templates/jupyter-notebook/deployment.yaml" \ + --set global.registry.url=test.io) + + assert_not_contains "$output" "sidecar.opentelemetry.io/inject:" "No sidecar injection when disabled (default)" + assert_not_contains "$output" "instrumentation.opentelemetry.io/inject-python:" "No instrumentation when disabled (default)" 
+} + +# ============================================================================ +# Admin/Non-Admin Installation Tests +# ============================================================================ + +test_admin_values_otel() { + log_test "Admin installation - OTEL operator enabled, CRs disabled" + + # Collector should not render + assert_not_renders "templates/opentelemetry/collector.yaml" \ + "Collector CR not rendered with admin values" \ + -f "${CHART_DIR}/admin_installation_values.yaml" + + # Instrumentation should not render + assert_not_renders "templates/opentelemetry/instrumentation.yaml" \ + "Instrumentation CR not rendered with admin values" \ + -f "${CHART_DIR}/admin_installation_values.yaml" +} + +test_non_admin_values_otel() { + log_test "Non-admin installation - OTEL CRs enabled" + + local output + output=$(render_template "templates/opentelemetry/collector.yaml" \ + --set global.registry.url=test.io \ + -f "${CHART_DIR}/non_admin_installation_values.yaml") + + assert_renders "$output" "Collector CR renders with non-admin values" + + output=$(render_template "templates/opentelemetry/instrumentation.yaml" \ + --set global.registry.url=test.io \ + -f "${CHART_DIR}/non_admin_installation_values.yaml") + + assert_renders "$output" "Instrumentation CR renders with non-admin values" +} + +test_namespace_label_enabled() { + log_test "Namespace Label - Enabled" + + local output + output=$(render_template "templates/opentelemetry/namespace-label.yaml" \ + --set global.registry.url=test.io \ + --set opentelemetry.namespaceLabel.enabled=true \ + --set opentelemetry.collector.enabled=true) + + assert_renders "$output" "Namespace label renders" + assert_contains "$output" "kind: Namespace" "Has correct kind" + assert_contains "$output" "opentelemetry.io/inject" "Has OTEL inject label key" + assert_contains "$output" '"enabled"' "Has OTEL inject label value" +} + +test_namespace_label_disabled() { + log_test "Namespace Label - Disabled (default)" + + 
assert_not_renders "templates/opentelemetry/namespace-label.yaml" \
+        "Namespace label not rendered when disabled (default)"
+}
+
+test_admin_namespace_label_disabled() {
+    log_test "Admin installation - Namespace label disabled"
+
+    assert_not_renders "templates/opentelemetry/namespace-label.yaml" \
+        "Namespace label not rendered with admin values" \
+        -f "${CHART_DIR}/admin_installation_values.yaml"
+}
+
+test_non_admin_namespace_label_enabled() {
+    log_test "Non-admin installation - Namespace label enabled"
+
+    local output
+    output=$(render_template "templates/opentelemetry/namespace-label.yaml" \
+        --set global.registry.url=test.io \
+        -f "${CHART_DIR}/non_admin_installation_values.yaml")
+
+    assert_renders "$output" "Namespace label renders with non-admin values"
+    assert_contains "$output" "opentelemetry.io/inject" "Has OTEL inject label"
+}
+
+test_otel_operator_namespace_selector() {
+    log_test "OTEL Operator - Namespace selector configured"
+
+    local output
+    output=$(render_all \
+        --set global.registry.url=test.io \
+        --set opentelemetry-operator.enabled=true)
+
+    # Check that a namespaceSelector block in the rendered chart carries the OTEL inject label
+    # Neither grep uses -q, so both read all input and no stage exits early under pipefail
+    if grep -A5 "namespaceSelector:" <<< "$output" | grep "opentelemetry.io/inject" > /dev/null; then
+        log_pass "Has namespace selector in webhook configuration"
+    else
+        log_fail "Namespace selector not found in webhook configuration"
+    fi
+}
+
+# ============================================================================
+# Prometheus Integration Tests
+# ============================================================================
+
+test_prometheus_otel_scrape_config() {
+    log_test "Prometheus - OTEL scrape configuration"
+
+    local output
+    output=$(render_all --set global.registry.url=test.io)
+
+    # The scrape config is in a Secret as base64, extract and decode it
+    local secret_data
+    secret_data=$(echo "$output" | grep "additional-scrape-configs.yaml:" | head -1 |
sed 's/.*: "//' | sed 's/"$//' || true) + + if [[ -n "$secret_data" ]]; then + local decoded + decoded=$(echo "$secret_data" | base64 -d 2>/dev/null || true) + + if echo "$decoded" | grep -q "otel-collector-sidecars"; then + log_pass "Has OTEL collector scrape job" + else + log_fail "Has OTEL collector scrape job - not found in decoded config" + fi + + if echo "$decoded" | grep -q "prometheus_io_port"; then + log_pass "Has pod annotation relabeling" + else + log_fail "Has pod annotation relabeling - not found in decoded config" + fi + else + log_fail "Prometheus scrape config secret not found" + fi +} + +# ============================================================================ +# Full Chart Render Test +# ============================================================================ + +test_full_chart_renders() { + log_test "Full chart renders without errors" + + local output + output=$(render_all --set global.registry.url=test.io 2>&1) + + if [[ $? -eq 0 ]] && [[ -n "$output" ]]; then + log_pass "Full chart renders successfully" + else + log_fail "Full chart failed to render" + fi +} + +# ============================================================================ +# Main +# ============================================================================ + +main() { + log_info "Running Helm template tests for MLRun CE" + log_info "Chart directory: ${CHART_DIR}" + echo "" + + # Ensure dependencies are up to date + log_info "Updating Helm dependencies..." 
+ helm dependency update "${CHART_DIR}" > /dev/null 2>&1 + + echo "" + echo "========================================" + echo "OpenTelemetry Collector Tests" + echo "========================================" + test_otel_collector_default + test_otel_collector_disabled + test_otel_collector_resources + + echo "" + echo "========================================" + echo "OpenTelemetry Instrumentation Tests" + echo "========================================" + test_otel_instrumentation_default + test_otel_instrumentation_disabled + test_otel_instrumentation_java_enabled + + echo "" + echo "========================================" + echo "OpenTelemetry RBAC Tests" + echo "========================================" + test_otel_rbac_default + test_otel_rbac_disabled + + echo "" + echo "========================================" + echo "Jupyter OTEL Integration Tests" + echo "========================================" + test_jupyter_otel_annotations + test_jupyter_no_otel_annotations_when_disabled + + echo "" + echo "========================================" + echo "Admin/Non-Admin Installation Tests" + echo "========================================" + test_admin_values_otel + test_non_admin_values_otel + + echo "" + echo "========================================" + echo "Namespace Label Tests" + echo "========================================" + test_namespace_label_enabled + test_namespace_label_disabled + test_admin_namespace_label_disabled + test_non_admin_namespace_label_enabled + test_otel_operator_namespace_selector + + echo "" + echo "========================================" + echo "Prometheus Integration Tests" + echo "========================================" + test_prometheus_otel_scrape_config + + echo "" + echo "========================================" + echo "Full Chart Tests" + echo "========================================" + test_full_chart_renders + + echo "" + echo "========================================" + echo "Test Summary" + echo 
"========================================" + echo -e "Passed: ${GREEN}${TESTS_PASSED}${NC}" + echo -e "Failed: ${RED}${TESTS_FAILED}${NC}" + + if [[ ${TESTS_FAILED} -gt 0 ]]; then + log_error "Some tests failed!" + exit 1 + else + log_info "All tests passed!" + exit 0 + fi +} + +main "$@" + + + + diff --git a/tests/kind-test.sh b/tests/kind-test.sh index c99a182e..94b41c7f 100755 --- a/tests/kind-test.sh +++ b/tests/kind-test.sh @@ -93,6 +93,7 @@ setup_helm_repos() { helm repo add spark-operator https://kubeflow.github.io/spark-operator 2>/dev/null || true helm repo add kube-prometheus-stack https://prometheus-community.github.io/helm-charts 2>/dev/null || true helm repo add kafka https://charts.bitnami.com/bitnami 2>/dev/null || true + helm repo add open-telemetry https://open-telemetry.github.io/opentelemetry-helm-charts 2>/dev/null || true helm repo update } @@ -220,6 +221,53 @@ verify_installation() { else log_warn "TimescaleDB pod not found" fi + + # Verify OpenTelemetry CRDs and resources + echo "" + log_info "Verifying OpenTelemetry..." 
+ + # Check if OpenTelemetry Operator is installed (CRDs exist) + if kubectl get crd opentelemetrycollectors.opentelemetry.io &>/dev/null; then + log_info "OpenTelemetryCollector CRD exists" + + # Check for collector CR + local collector + collector=$(kubectl get opentelemetrycollectors -n "${NAMESPACE}" -o jsonpath='{.items[0].metadata.name}' 2>/dev/null || echo "") + if [[ -n "${collector}" ]]; then + log_info "OpenTelemetryCollector CR found: ${collector}" + kubectl get opentelemetrycollectors -n "${NAMESPACE}" "${collector}" -o yaml 2>/dev/null | grep -E "mode:|status:" | head -5 || true + else + log_warn "No OpenTelemetryCollector CR found in namespace ${NAMESPACE}" + fi + else + log_warn "OpenTelemetryCollector CRD not found - operator may not be installed" + fi + + if kubectl get crd instrumentations.opentelemetry.io &>/dev/null; then + log_info "Instrumentation CRD exists" + + # Check for instrumentation CR + local instrumentation + instrumentation=$(kubectl get instrumentations -n "${NAMESPACE}" -o jsonpath='{.items[0].metadata.name}' 2>/dev/null || echo "") + if [[ -n "${instrumentation}" ]]; then + log_info "Instrumentation CR found: ${instrumentation}" + else + log_warn "No Instrumentation CR found in namespace ${NAMESPACE}" + fi + else + log_warn "Instrumentation CRD not found - operator may not be installed" + fi + + # Check if Jupyter pod has OTEL sidecar annotations + echo "" + log_info "Checking Jupyter deployment for OTEL annotations..." 
+ local jupyter_annotations + jupyter_annotations=$(kubectl get deployment -n "${NAMESPACE}" -l app.kubernetes.io/component=jupyter-notebook -o jsonpath='{.items[0].spec.template.metadata.annotations}' 2>/dev/null || echo "") + if echo "${jupyter_annotations}" | grep -q "sidecar.opentelemetry.io/inject"; then + log_info "Jupyter has OTEL sidecar injection annotation" + else + log_warn "Jupyter does not have OTEL sidecar injection annotation" + fi } delete_cluster() { From 40baa802bec47898e2e96415ac27b5d61e61a43c Mon Sep 17 00:00:00 2001 From: royischoss Date: Wed, 25 Mar 2026 11:54:41 +0200 Subject: [PATCH 05/23] fix requirements.lock --- charts/mlrun-ce/requirements.lock | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/charts/mlrun-ce/requirements.lock b/charts/mlrun-ce/requirements.lock index 35c43691..f631004c 100644 --- a/charts/mlrun-ce/requirements.lock +++ b/charts/mlrun-ce/requirements.lock @@ -20,10 +20,8 @@ dependencies: - name: strimzi-kafka-operator repository: https://strimzi.io/charts/ version: 0.48.0 -digest: sha256:f7f2ab0eaec5fb3097c09946f6de510a602293fd7f9c59c40539991b5449a6d1 -generated: "2026-03-08T12:42:39.145588+02:00" - name: opentelemetry-operator repository: https://open-telemetry.github.io/opentelemetry-helm-charts version: 0.78.1 -digest: sha256:4a47a90d97b21b41cd3bb7f7e9b70b56b42b95fe067bb012e4d490fa1912e18f -generated: "2026-03-24T16:04:27.962041+02:00" +digest: sha256:9f6ea4d6c60baabe3a9fb2a9c286f5c70a97bbf76ecba15ddaef7f39c56269ae +generated: "2026-03-25T11:50:15.589709+02:00" From ff914ac4888b27cf26c46c6e968f2e775c90e7d0 Mon Sep 17 00:00:00 2001 From: royischoss Date: Thu, 26 Mar 2026 16:10:34 +0200 Subject: [PATCH 06/23] fixes --- charts/mlrun-ce/templates/_helpers.tpl | 158 ++++++++++++++++++ .../templates/opentelemetry/collector.yaml | 114 ++----------- .../opentelemetry/crd-readiness-job.yaml | 82 +++++++++ .../opentelemetry/instrumentation.yaml | 98 ++--------- .../opentelemetry/namespace-label.yaml | 36 
+++- .../templates/opentelemetry/rbac.yaml | 110 ++++++++++++ charts/mlrun-ce/values.yaml | 12 +- tests/helm-template-test.sh | 71 ++++---- 8 files changed, 447 insertions(+), 234 deletions(-) create mode 100644 charts/mlrun-ce/templates/opentelemetry/crd-readiness-job.yaml diff --git a/charts/mlrun-ce/templates/_helpers.tpl b/charts/mlrun-ce/templates/_helpers.tpl index 292fe3e2..709b5f37 100644 --- a/charts/mlrun-ce/templates/_helpers.tpl +++ b/charts/mlrun-ce/templates/_helpers.tpl @@ -416,3 +416,161 @@ OpenTelemetry selector labels app.kubernetes.io/component: opentelemetry {{- end }} +{{/* +OpenTelemetryCollector CR manifest for use in the CRD readiness job +*/}} +{{- define "mlrun-ce.otel.collector.manifest" -}} +apiVersion: opentelemetry.io/v1beta1 +kind: OpenTelemetryCollector +metadata: + name: {{ include "mlrun-ce.otel.collector.fullname" . }} + namespace: {{ .Release.Namespace }} + labels: + {{- include "mlrun-ce.otel.labels" . | nindent 4 }} +spec: + mode: {{ .Values.opentelemetry.collector.mode }} + upgradeStrategy: automatic + managementState: managed + resources: + {{- toYaml .Values.opentelemetry.collector.resources | nindent 4 }} + podAnnotations: + prometheus.io/scrape: "true" + prometheus.io/port: "{{ .Values.opentelemetry.collector.prometheus.port }}" + prometheus.io/path: "/metrics" + config: + receivers: + otlp: + protocols: + grpc: + endpoint: 0.0.0.0:{{ .Values.opentelemetry.collector.otlp.grpcPort }} + http: + endpoint: 0.0.0.0:{{ .Values.opentelemetry.collector.otlp.httpPort }} + processors: + batch: + send_batch_size: 10000 + timeout: 10s + memory_limiter: + check_interval: 1s + limit_percentage: 80 + spike_limit_percentage: 25 + resourcedetection: + detectors: + - env + - system + timeout: 5s + override: false + exporters: + prometheus: + endpoint: 0.0.0.0:{{ .Values.opentelemetry.collector.prometheus.port }} + namespace: {{ .Values.opentelemetry.collector.prometheus.namespace }} + const_labels: + collector_mode: sidecar + 
metrics_source: otel_collector + resource_to_telemetry_conversion: + enabled: true + debug: + verbosity: basic + sampling_initial: 5 + sampling_thereafter: 200 + extensions: + health_check: + endpoint: 0.0.0.0:13133 + service: + extensions: + - health_check + pipelines: + metrics: + receivers: + - otlp + processors: + - memory_limiter + - resourcedetection + - batch + exporters: + - prometheus + - debug + traces: + receivers: + - otlp + processors: + - memory_limiter + - resourcedetection + - batch + exporters: + - debug + telemetry: + logs: + level: info + metrics: + address: 0.0.0.0:8888 +{{- end }} + +{{/* +Instrumentation CR manifest for use in the CRD readiness job +*/}} +{{- define "mlrun-ce.otel.instrumentation.manifest" -}} +apiVersion: opentelemetry.io/v1alpha1 +kind: Instrumentation +metadata: + name: {{ include "mlrun-ce.otel.instrumentation.fullname" . }} + namespace: {{ .Release.Namespace }} + labels: + {{- include "mlrun-ce.otel.labels" . | nindent 4 }} +spec: + propagators: + {{- toYaml .Values.opentelemetry.instrumentation.propagators | nindent 4 }} + sampler: + type: {{ .Values.opentelemetry.instrumentation.sampler.type }} + argument: {{ .Values.opentelemetry.instrumentation.sampler.argument | quote }} + env: + - name: OTEL_SERVICE_NAME + valueFrom: + fieldRef: + fieldPath: metadata.labels['app.kubernetes.io/name'] + - name: OTEL_RESOURCE_ATTRIBUTES + value: >- + k8s.namespace.name=$(OTEL_RESOURCE_ATTRIBUTES_NAMESPACE), + k8s.pod.name=$(OTEL_RESOURCE_ATTRIBUTES_POD_NAME), + k8s.container.name=$(OTEL_RESOURCE_ATTRIBUTES_CONTAINER_NAME), + service.namespace=$(OTEL_RESOURCE_ATTRIBUTES_NAMESPACE) + - name: OTEL_RESOURCE_ATTRIBUTES_NAMESPACE + valueFrom: + fieldRef: + fieldPath: metadata.namespace + - name: OTEL_RESOURCE_ATTRIBUTES_POD_NAME + valueFrom: + fieldRef: + fieldPath: metadata.name + - name: OTEL_RESOURCE_ATTRIBUTES_CONTAINER_NAME + valueFrom: + fieldRef: + fieldPath: metadata.name + - name: OTEL_METRICS_EXPORTER + value: otlp + - name: 
OTEL_TRACES_EXPORTER + value: otlp + - name: OTEL_LOGS_EXPORTER + value: none + {{- if .Values.opentelemetry.instrumentation.python.enabled }} + python: + image: {{ .Values.opentelemetry.instrumentation.python.image.repository }}:{{ .Values.opentelemetry.instrumentation.python.image.tag }} + resourceRequirements: + {{- toYaml .Values.opentelemetry.instrumentation.python.resources | nindent 6 }} + env: + - name: OTEL_PYTHON_LOG_CORRELATION + value: "true" + - name: OTEL_PYTHON_LOGGING_AUTO_INSTRUMENTATION_ENABLED + value: "false" + - name: OTEL_PYTHON_DISABLED_INSTRUMENTATIONS + value: "" + {{- end }} + {{- if .Values.opentelemetry.instrumentation.java.enabled }} + java: + image: {{ .Values.opentelemetry.instrumentation.java.image.repository }}:{{ .Values.opentelemetry.instrumentation.java.image.tag }} + resourceRequirements: + {{- toYaml .Values.opentelemetry.instrumentation.java.resources | nindent 6 }} + env: + - name: OTEL_INSTRUMENTATION_COMMON_DEFAULT_ENABLED + value: "true" + {{- end }} +{{- end }} diff --git a/charts/mlrun-ce/templates/opentelemetry/collector.yaml b/charts/mlrun-ce/templates/opentelemetry/collector.yaml index e32f3d4a..e1dd53ee 100644 --- a/charts/mlrun-ce/templates/opentelemetry/collector.yaml +++ b/charts/mlrun-ce/templates/opentelemetry/collector.yaml @@ -1,102 +1,14 @@ -{{- if .Values.opentelemetry.collector.enabled }} -apiVersion: opentelemetry.io/v1beta1 -kind: OpenTelemetryCollector -metadata: - name: {{ include "mlrun-ce.otel.collector.fullname" . }} - namespace: {{ .Release.Namespace }} - labels: - {{- include "mlrun-ce.otel.labels" . 
| nindent 4 }} - annotations: - # Delay collector CR creation until after CRDs are installed by the operator - helm.sh/hook: post-install,post-upgrade - helm.sh/hook-weight: "10" -spec: - mode: {{ .Values.opentelemetry.collector.mode }} - resources: - {{- toYaml .Values.opentelemetry.collector.resources | nindent 4 }} - # Pod annotations for Prometheus scraping - podAnnotations: - prometheus.io/scrape: "true" - prometheus.io/port: "{{ .Values.opentelemetry.collector.prometheus.port }}" - prometheus.io/path: "/metrics" - config: - receivers: - otlp: - protocols: - grpc: - endpoint: 0.0.0.0:{{ .Values.opentelemetry.collector.otlp.grpcPort }} - http: - endpoint: 0.0.0.0:{{ .Values.opentelemetry.collector.otlp.httpPort }} - - processors: - # Batch processor for efficient metric export - batch: - send_batch_size: 10000 - timeout: 10s - # Memory limiter to prevent OOM - memory_limiter: - check_interval: 1s - limit_percentage: 80 - spike_limit_percentage: 25 - # Resource detection for Kubernetes metadata - resourcedetection: - detectors: - - env - - system - timeout: 5s - override: false - - exporters: - # Prometheus exporter for metrics - scraped by Prometheus - prometheus: - endpoint: 0.0.0.0:{{ .Values.opentelemetry.collector.prometheus.port }} - # Metric namespace prefix helps distinguish OTEL-collected metrics - # from directly-scraped metrics in Prometheus queries - namespace: {{ .Values.opentelemetry.collector.prometheus.namespace }} - const_labels: - collector_mode: sidecar - metrics_source: otel_collector - resource_to_telemetry_conversion: - enabled: true - # Debug exporter for troubleshooting (logs to stdout) - debug: - verbosity: basic - sampling_initial: 5 - sampling_thereafter: 200 - - extensions: - health_check: - endpoint: 0.0.0.0:13133 - - service: - extensions: - - health_check - pipelines: - # Metrics pipeline: OTLP -> processing -> Prometheus export - metrics: - receivers: - - otlp - processors: - - memory_limiter - - resourcedetection - - batch - 
exporters: - - prometheus - - debug - # Traces pipeline: OTLP -> processing -> debug (no trace backend configured yet) - traces: - receivers: - - otlp - processors: - - memory_limiter - - resourcedetection - - batch - exporters: - - debug - telemetry: - logs: - level: info - metrics: - address: 0.0.0.0:8888 -{{- end }} +{{/* +OpenTelemetryCollector CR is created by the crd-readiness-job.yaml via a Helm post-install/post-upgrade hook. +This solves the race condition between the OpenTelemetry Operator starting up and registering its CRDs, +and Helm trying to create this CR. + +The actual CR manifest is defined in _helpers.tpl as "mlrun-ce.otel.collector.manifest" and is applied +by the job after it confirms the CRD is available. + +To see the CR configuration, check: +- templates/_helpers.tpl: defines the manifest +- templates/opentelemetry/crd-readiness-job.yaml: creates the CR +- values.yaml: opentelemetry.collector.* settings +*/}} diff --git a/charts/mlrun-ce/templates/opentelemetry/crd-readiness-job.yaml b/charts/mlrun-ce/templates/opentelemetry/crd-readiness-job.yaml new file mode 100644 index 00000000..eec873f2 --- /dev/null +++ b/charts/mlrun-ce/templates/opentelemetry/crd-readiness-job.yaml @@ -0,0 +1,82 @@ +{{- if or .Values.opentelemetry.collector.enabled .Values.opentelemetry.instrumentation.enabled }} +--- +# Job to wait for OpenTelemetry CRDs to be available and then create the CRs +# This solves the race condition between the operator starting and CR creation +apiVersion: batch/v1 +kind: Job +metadata: + name: {{ .Release.Name }}-otel-cr-creator + namespace: {{ .Release.Namespace }} + labels: + {{- include "mlrun-ce.otel.labels" . 
| nindent 4 }} + annotations: + # Run as a post-install and post-upgrade hook + "helm.sh/hook": post-install,post-upgrade + "helm.sh/hook-weight": "10" + "helm.sh/hook-delete-policy": before-hook-creation,hook-succeeded +spec: + ttlSecondsAfterFinished: 300 + backoffLimit: 6 + template: + metadata: + labels: + {{- include "mlrun-ce.otel.selectorLabels" . | nindent 8 }} + spec: + restartPolicy: OnFailure + serviceAccountName: {{ .Release.Name }}-otel-cr-creator + containers: + - name: cr-creator + image: bitnami/kubectl:latest + command: + - /bin/bash + - -c + - | + set -e + + echo "Waiting for OpenTelemetry CRDs to be available..." + + # Wait for the OpenTelemetryCollector CRD + {{- if .Values.opentelemetry.collector.enabled }} + echo "Waiting for OpenTelemetryCollector CRD..." + until kubectl get crd opentelemetrycollectors.opentelemetry.io &>/dev/null; do + echo "Waiting for opentelemetrycollectors.opentelemetry.io CRD..." + sleep 5 + done + echo "OpenTelemetryCollector CRD is available!" + {{- end }} + + # Wait for the Instrumentation CRD + {{- if .Values.opentelemetry.instrumentation.enabled }} + echo "Waiting for Instrumentation CRD..." + until kubectl get crd instrumentations.opentelemetry.io &>/dev/null; do + echo "Waiting for instrumentations.opentelemetry.io CRD..." + sleep 5 + done + echo "Instrumentation CRD is available!" + {{- end }} + + # Wait a bit more for the operator to be fully ready + echo "Waiting for operator webhook to be ready..." + sleep 10 + + {{- if .Values.opentelemetry.collector.enabled }} + # Create or update the OpenTelemetryCollector CR + echo "Creating/updating OpenTelemetryCollector CR..." + cat <<'EOF' | kubectl apply -f - + {{- include "mlrun-ce.otel.collector.manifest" . | nindent 14 }} + EOF + echo "OpenTelemetryCollector CR created/updated!" + {{- end }} + + {{- if .Values.opentelemetry.instrumentation.enabled }} + # Create or update the Instrumentation CR + echo "Creating/updating Instrumentation CR..." 
+ cat <<'EOF' | kubectl apply -f - + {{- include "mlrun-ce.otel.instrumentation.manifest" . | nindent 14 }} + EOF + echo "Instrumentation CR created/updated!" + {{- end }} + + echo "All OpenTelemetry CRs have been created successfully!" +{{- end }} + diff --git a/charts/mlrun-ce/templates/opentelemetry/instrumentation.yaml b/charts/mlrun-ce/templates/opentelemetry/instrumentation.yaml index 7b9f0767..b79b9198 100644 --- a/charts/mlrun-ce/templates/opentelemetry/instrumentation.yaml +++ b/charts/mlrun-ce/templates/opentelemetry/instrumentation.yaml @@ -1,86 +1,14 @@ -{{- if .Values.opentelemetry.instrumentation.enabled }} -apiVersion: opentelemetry.io/v1alpha1 -kind: Instrumentation -metadata: - name: {{ include "mlrun-ce.otel.instrumentation.fullname" . }} - namespace: {{ .Release.Namespace }} - labels: - {{- include "mlrun-ce.otel.labels" . | nindent 4 }} - annotations: - # Delay instrumentation CR creation until after CRDs are installed by the operator - helm.sh/hook: post-install,post-upgrade - helm.sh/hook-weight: "10" -spec: - # Propagators for distributed tracing context - propagators: - {{- toYaml .Values.opentelemetry.instrumentation.propagators | nindent 4 }} - - # Sampler configuration - sampler: - type: {{ .Values.opentelemetry.instrumentation.sampler.type }} - argument: {{ .Values.opentelemetry.instrumentation.sampler.argument | quote }} - - # Environment variables injected into instrumented pods - env: - # Service name will be auto-detected from pod metadata - - name: OTEL_SERVICE_NAME - valueFrom: - fieldRef: - fieldPath: metadata.labels['app.kubernetes.io/name'] - # Resource attributes for better observability - - name: OTEL_RESOURCE_ATTRIBUTES - value: >- - k8s.namespace.name=$(OTEL_RESOURCE_ATTRIBUTES_NAMESPACE), - k8s.pod.name=$(OTEL_RESOURCE_ATTRIBUTES_POD_NAME), - k8s.container.name=$(OTEL_RESOURCE_ATTRIBUTES_CONTAINER_NAME), - service.namespace=$(OTEL_RESOURCE_ATTRIBUTES_NAMESPACE) - - name: OTEL_RESOURCE_ATTRIBUTES_NAMESPACE - valueFrom: - 
fieldRef: - fieldPath: metadata.namespace - - name: OTEL_RESOURCE_ATTRIBUTES_POD_NAME - valueFrom: - fieldRef: - fieldPath: metadata.name - - name: OTEL_RESOURCE_ATTRIBUTES_CONTAINER_NAME - valueFrom: - fieldRef: - fieldPath: metadata.name - # Export metrics via OTLP to the sidecar collector - - name: OTEL_METRICS_EXPORTER - value: otlp - - name: OTEL_TRACES_EXPORTER - value: otlp - - name: OTEL_LOGS_EXPORTER - value: none - - # Python auto-instrumentation configuration - {{- if .Values.opentelemetry.instrumentation.python.enabled }} - python: - image: {{ .Values.opentelemetry.instrumentation.python.image.repository }}:{{ .Values.opentelemetry.instrumentation.python.image.tag }} - resourceRequirements: - {{- toYaml .Values.opentelemetry.instrumentation.python.resources | nindent 6 }} - env: - # Python-specific OTEL configuration - - name: OTEL_PYTHON_LOG_CORRELATION - value: "true" - - name: OTEL_PYTHON_LOGGING_AUTO_INSTRUMENTATION_ENABLED - value: "false" - # Disable specific instrumentations that might cause issues - - name: OTEL_PYTHON_DISABLED_INSTRUMENTATIONS - value: "" - {{- end }} - - # Java auto-instrumentation configuration - {{- if .Values.opentelemetry.instrumentation.java.enabled }} - java: - image: {{ .Values.opentelemetry.instrumentation.java.image.repository }}:{{ .Values.opentelemetry.instrumentation.java.image.tag }} - resourceRequirements: - {{- toYaml .Values.opentelemetry.instrumentation.java.resources | nindent 6 }} - env: - # Java-specific OTEL configuration - - name: OTEL_INSTRUMENTATION_COMMON_DEFAULT_ENABLED - value: "true" - {{- end }} -{{- end }} +{{/* +Instrumentation CR is created by the crd-readiness-job.yaml via a Helm post-install/post-upgrade hook. +This solves the race condition between the OpenTelemetry Operator starting up and registering its CRDs, +and Helm trying to create this CR. 
+ +The actual CR manifest is defined in _helpers.tpl as "mlrun-ce.otel.instrumentation.manifest" and is applied +by the job after it confirms the CRD is available. + +To see the CR configuration, check: +- templates/_helpers.tpl: defines the manifest +- templates/opentelemetry/crd-readiness-job.yaml: creates the CR +- values.yaml: opentelemetry.instrumentation.* settings +*/}} diff --git a/charts/mlrun-ce/templates/opentelemetry/namespace-label.yaml b/charts/mlrun-ce/templates/opentelemetry/namespace-label.yaml index 985bd4fb..d86d3402 100644 --- a/charts/mlrun-ce/templates/opentelemetry/namespace-label.yaml +++ b/charts/mlrun-ce/templates/opentelemetry/namespace-label.yaml @@ -1,15 +1,33 @@ {{- if and (or .Values.opentelemetry.collector.enabled .Values.opentelemetry.instrumentation.enabled) .Values.opentelemetry.namespaceLabel.enabled -}} -# Label the namespace for OpenTelemetry operator webhook injection -# The operator will only inject sidecars into namespaces with this label -apiVersion: v1 -kind: Namespace +# This template uses a post-install/post-upgrade hook to label the namespace +# for OpenTelemetry operator webhook injection, avoiding Helm ownership conflicts +apiVersion: batch/v1 +kind: Job metadata: - name: {{ .Release.Namespace }} + name: {{ .Release.Name }}-namespace-label + namespace: {{ .Release.Namespace }} labels: {{ include "mlrun-ce.otel.labels" . 
| indent 4 }} - {{ .Values.opentelemetry.namespaceLabel.key }}: {{ .Values.opentelemetry.namespaceLabel.value | quote }} annotations: - # This resource only patches the existing namespace with the required label - # It does not create the namespace (namespace should already exist) - helm.sh/resource-policy: keep + "helm.sh/hook": post-install,post-upgrade + "helm.sh/hook-weight": "-10" + "helm.sh/hook-delete-policy": before-hook-creation,hook-succeeded +spec: + ttlSecondsAfterFinished: 60 + template: + metadata: + name: {{ .Release.Name }}-namespace-label + spec: + serviceAccountName: {{ .Release.Name }}-otel-cr-creator + restartPolicy: Never + containers: + - name: label-namespace + image: bitnami/kubectl:latest + command: + - /bin/sh + - -c + - | + echo "Labeling namespace {{ .Release.Namespace }} for OpenTelemetry..." + kubectl label namespace {{ .Release.Namespace }} {{ .Values.opentelemetry.namespaceLabel.key }}={{ .Values.opentelemetry.namespaceLabel.value }} --overwrite + echo "Namespace labeled successfully!" {{- end -}} diff --git a/charts/mlrun-ce/templates/opentelemetry/rbac.yaml b/charts/mlrun-ce/templates/opentelemetry/rbac.yaml index 0ffe62b4..5dcf746b 100644 --- a/charts/mlrun-ce/templates/opentelemetry/rbac.yaml +++ b/charts/mlrun-ce/templates/opentelemetry/rbac.yaml @@ -9,6 +9,19 @@ metadata: labels: {{- include "mlrun-ce.otel.labels" . | nindent 4 }} --- +# ServiceAccount for the CR creator job +apiVersion: v1 +kind: ServiceAccount +metadata: + name: {{ .Release.Name }}-otel-cr-creator + namespace: {{ .Release.Namespace }} + labels: + {{- include "mlrun-ce.otel.labels" . 
| nindent 4 }} + annotations: + "helm.sh/hook": post-install,post-upgrade + "helm.sh/hook-weight": "0" + "helm.sh/hook-delete-policy": before-hook-creation,hook-succeeded +--- # Role for OpenTelemetry collector to access Kubernetes resources apiVersion: rbac.authorization.k8s.io/v1 kind: Role @@ -54,5 +67,102 @@ subjects: - kind: ServiceAccount name: otel-collector namespace: {{ .Release.Namespace }} +--- +# ClusterRole for the CR creator job to read CRDs and label namespaces (cluster-scoped) +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: {{ .Release.Name }}-otel-crd-reader + labels: + {{- include "mlrun-ce.otel.labels" . | nindent 4 }} + annotations: + "helm.sh/hook": post-install,post-upgrade + "helm.sh/hook-weight": "0" + "helm.sh/hook-delete-policy": before-hook-creation,hook-succeeded +rules: + # Allow reading CRDs to check availability (CRDs are cluster-scoped) + - apiGroups: + - apiextensions.k8s.io + resources: + - customresourcedefinitions + verbs: + - get + - list + # Allow labeling namespaces for OTEL injection + - apiGroups: + - "" + resources: + - namespaces + verbs: + - get + - patch + - update +--- +# ClusterRoleBinding for the CR creator job +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: {{ .Release.Name }}-otel-crd-reader + labels: + {{- include "mlrun-ce.otel.labels" . 
| nindent 4 }} + annotations: + "helm.sh/hook": post-install,post-upgrade + "helm.sh/hook-weight": "0" + "helm.sh/hook-delete-policy": before-hook-creation,hook-succeeded +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: {{ .Release.Name }}-otel-crd-reader +subjects: + - kind: ServiceAccount + name: {{ .Release.Name }}-otel-cr-creator + namespace: {{ .Release.Namespace }} +--- +# Role for the CR creator job to create OpenTelemetry CRs (namespace-scoped) +apiVersion: rbac.authorization.k8s.io/v1 +kind: Role +metadata: + name: {{ .Release.Name }}-otel-cr-creator + namespace: {{ .Release.Namespace }} + labels: + {{- include "mlrun-ce.otel.labels" . | nindent 4 }} + annotations: + "helm.sh/hook": post-install,post-upgrade + "helm.sh/hook-weight": "0" + "helm.sh/hook-delete-policy": before-hook-creation,hook-succeeded +rules: + # Allow creating/updating OpenTelemetry CRs + - apiGroups: + - opentelemetry.io + resources: + - opentelemetrycollectors + - instrumentations + verbs: + - create + - get + - patch + - update + - list +--- +# RoleBinding for the CR creator job +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: {{ .Release.Name }}-otel-cr-creator + namespace: {{ .Release.Namespace }} + labels: + {{- include "mlrun-ce.otel.labels" . 
| nindent 4 }} + annotations: + "helm.sh/hook": post-install,post-upgrade + "helm.sh/hook-weight": "0" + "helm.sh/hook-delete-policy": before-hook-creation,hook-succeeded +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: Role + name: {{ .Release.Name }}-otel-cr-creator +subjects: + - kind: ServiceAccount + name: {{ .Release.Name }}-otel-cr-creator + namespace: {{ .Release.Namespace }} {{- end }} diff --git a/charts/mlrun-ce/values.yaml b/charts/mlrun-ce/values.yaml index 5c864740..7850c51c 100644 --- a/charts/mlrun-ce/values.yaml +++ b/charts/mlrun-ce/values.yaml @@ -800,8 +800,8 @@ opentelemetry-operator: manager: # Collector image used by the operator when creating collectors collectorImage: - repository: otel/opentelemetry-collector-contrib - tag: 0.115.0 + repository: otel/opentelemetry-collector + tag: 0.116.0 # Auto-instrumentation images autoInstrumentationImage: python: @@ -810,6 +810,8 @@ opentelemetry-operator: java: repository: ghcr.io/open-telemetry/opentelemetry-operator/autoinstrumentation-java tag: 2.10.0 + # Feature gates as string (comma-separated) + featureGates: "" resources: requests: cpu: 50m @@ -830,12 +832,14 @@ opentelemetry: key: "opentelemetry.io/inject" value: "enabled" - # OpenTelemetry Collector configuration (sidecar mode) + # OpenTelemetry Collector configuration (SIDECAR mode) + # In sidecar mode, the collector is injected into pods via webhook + # Not as a standalone deployment collector: enabled: false nameOverride: "" fullnameOverride: "" - # Sidecar mode - collector runs as a sidecar in instrumented pods + # SIDECAR mode - collector is injected into pods by the operator mode: sidecar # Collector sidecar container resources resources: diff --git a/tests/helm-template-test.sh b/tests/helm-template-test.sh index 8b9d79c2..6acd789f 100755 --- a/tests/helm-template-test.sh +++ b/tests/helm-template-test.sh @@ -122,34 +122,40 @@ assert_not_renders() { # ============================================================================ 
test_otel_collector_default() { - log_test "OpenTelemetry Collector - Enabled" + log_test "OpenTelemetry Collector - Enabled (via CRD Readiness Job)" local output - output=$(render_template "templates/opentelemetry/collector.yaml" \ + # The collector CR is now created by the crd-readiness-job, not directly + output=$(render_template "templates/opentelemetry/crd-readiness-job.yaml" \ --set global.registry.url=test.io \ --set opentelemetry.collector.enabled=true) - assert_renders "$output" "Collector CR renders" - assert_contains "$output" "kind: OpenTelemetryCollector" "Has correct kind" + assert_renders "$output" "CRD Readiness Job renders" + assert_contains "$output" "kind: Job" "Has correct kind" + assert_contains "$output" "kind: OpenTelemetryCollector" "Job contains OpenTelemetryCollector CR" assert_contains "$output" "mode: sidecar" "Uses sidecar mode" assert_contains "$output" "prometheus:" "Has Prometheus exporter" assert_contains "$output" "endpoint: 0.0.0.0:8889" "Prometheus on port 8889" assert_contains "$output" "otlp:" "Has OTLP receiver" - assert_contains "$output" "helm.sh/hook: post-install,post-upgrade" "Has Helm hooks" + assert_contains "$output" "helm.sh/hook" "Has Helm hooks" + assert_contains "$output" "post-install,post-upgrade" "Has correct hook triggers" + assert_contains "$output" "upgradeStrategy: automatic" "Has upgradeStrategy" + assert_contains "$output" "managementState: managed" "Has managementState" } test_otel_collector_disabled() { log_test "OpenTelemetry Collector - Disabled (default)" - assert_not_renders "templates/opentelemetry/collector.yaml" \ - "Collector CR does not render when disabled (default)" + # When disabled, the crd-readiness-job should not render + assert_not_renders "templates/opentelemetry/crd-readiness-job.yaml" \ + "CRD Readiness Job does not render when collector disabled (default)" } test_otel_collector_resources() { log_test "OpenTelemetry Collector - Custom resources" local output - output=$(render_template 
"templates/opentelemetry/collector.yaml" \ + output=$(render_template "templates/opentelemetry/crd-readiness-job.yaml" \ --set global.registry.url=test.io \ --set opentelemetry.collector.enabled=true \ --set opentelemetry.collector.resources.requests.cpu=100m \ @@ -164,15 +170,15 @@ test_otel_collector_resources() { } test_otel_instrumentation_default() { - log_test "OpenTelemetry Instrumentation - Enabled" + log_test "OpenTelemetry Instrumentation - Enabled (via CRD Readiness Job)" local output - output=$(render_template "templates/opentelemetry/instrumentation.yaml" \ + output=$(render_template "templates/opentelemetry/crd-readiness-job.yaml" \ --set global.registry.url=test.io \ --set opentelemetry.instrumentation.enabled=true) - assert_renders "$output" "Instrumentation CR renders" - assert_contains "$output" "kind: Instrumentation" "Has correct kind" + assert_renders "$output" "CRD Readiness Job renders for Instrumentation" + assert_contains "$output" "kind: Instrumentation" "Job contains Instrumentation CR" assert_contains "$output" "tracecontext" "Has tracecontext propagator" assert_contains "$output" "baggage" "Has baggage propagator" assert_contains "$output" "parentbased_traceidratio" "Has sampler type" @@ -183,15 +189,16 @@ test_otel_instrumentation_default() { test_otel_instrumentation_disabled() { log_test "OpenTelemetry Instrumentation - Disabled (default)" - assert_not_renders "templates/opentelemetry/instrumentation.yaml" \ - "Instrumentation CR does not render when disabled (default)" + # When both collector and instrumentation are disabled, the job should not render + assert_not_renders "templates/opentelemetry/crd-readiness-job.yaml" \ + "CRD Readiness Job does not render when instrumentation disabled (default)" } test_otel_instrumentation_java_enabled() { log_test "OpenTelemetry Instrumentation - Java enabled" local output - output=$(render_template "templates/opentelemetry/instrumentation.yaml" \ + output=$(render_template 
"templates/opentelemetry/crd-readiness-job.yaml" \ --set global.registry.url=test.io \ --set opentelemetry.instrumentation.enabled=true \ --set opentelemetry.instrumentation.java.enabled=true) @@ -213,6 +220,8 @@ test_otel_rbac_default() { assert_contains "$output" "kind: Role" "Has Role" assert_contains "$output" "kind: RoleBinding" "Has RoleBinding" assert_contains "$output" "name: otel-collector" "Has correct name" + assert_contains "$output" "kind: ClusterRole" "Has ClusterRole for CRD access" + assert_contains "$output" "otel-cr-creator" "Has CR creator ServiceAccount" } test_otel_rbac_disabled() { @@ -256,14 +265,9 @@ test_jupyter_no_otel_annotations_when_disabled() { test_admin_values_otel() { log_test "Admin installation - OTEL operator enabled, CRs disabled" - # Collector should not render - assert_not_renders "templates/opentelemetry/collector.yaml" \ - "Collector CR not rendered with admin values" \ - -f "${CHART_DIR}/admin_installation_values.yaml" - - # Instrumentation should not render - assert_not_renders "templates/opentelemetry/instrumentation.yaml" \ - "Instrumentation CR not rendered with admin values" \ + # CRD readiness job should not render when CRs are disabled + assert_not_renders "templates/opentelemetry/crd-readiness-job.yaml" \ + "CRD Readiness Job not rendered with admin values" \ -f "${CHART_DIR}/admin_installation_values.yaml" } @@ -271,17 +275,13 @@ test_non_admin_values_otel() { log_test "Non-admin installation - OTEL CRs enabled" local output - output=$(render_template "templates/opentelemetry/collector.yaml" \ - --set global.registry.url=test.io \ - -f "${CHART_DIR}/non_admin_installation_values.yaml") - - assert_renders "$output" "Collector CR renders with non-admin values" - - output=$(render_template "templates/opentelemetry/instrumentation.yaml" \ + output=$(render_template "templates/opentelemetry/crd-readiness-job.yaml" \ --set global.registry.url=test.io \ -f "${CHART_DIR}/non_admin_installation_values.yaml") - 
assert_renders "$output" "Instrumentation CR renders with non-admin values" + assert_renders "$output" "CRD Readiness Job renders with non-admin values" + assert_contains "$output" "kind: OpenTelemetryCollector" "Has Collector CR" + assert_contains "$output" "kind: Instrumentation" "Has Instrumentation CR" } test_namespace_label_enabled() { @@ -293,10 +293,11 @@ test_namespace_label_enabled() { --set opentelemetry.namespaceLabel.enabled=true \ --set opentelemetry.collector.enabled=true) - assert_renders "$output" "Namespace label renders" - assert_contains "$output" "kind: Namespace" "Has correct kind" + assert_renders "$output" "Namespace label job renders" + assert_contains "$output" "kind: Job" "Has correct kind (Job)" + assert_contains "$output" "helm.sh/hook" "Has post-install hook annotation" + assert_contains "$output" "kubectl label namespace" "Has kubectl label command" assert_contains "$output" "opentelemetry.io/inject" "Has OTEL inject label key" - assert_contains "$output" '"enabled"' "Has OTEL inject label value" } test_namespace_label_disabled() { @@ -322,7 +323,7 @@ test_non_admin_namespace_label_enabled() { --set global.registry.url=test.io \ -f "${CHART_DIR}/non_admin_installation_values.yaml") - assert_renders "$output" "Namespace label renders with non-admin values" + assert_renders "$output" "Namespace label job renders with non-admin values" assert_contains "$output" "opentelemetry.io/inject" "Has OTEL inject label" } From 78b175a63298b1c8c2e42fb6e39ce9ff58bb5668 Mon Sep 17 00:00:00 2001 From: royischoss Date: Mon, 30 Mar 2026 14:47:26 +0300 Subject: [PATCH 07/23] works --- .gitignore | 4 ++ charts/mlrun-ce/templates/_helpers.tpl | 5 +- .../opentelemetry/crd-readiness-job.yaml | 1 + .../opentelemetry/namespace-label.yaml | 1 + .../templates/opentelemetry/rbac.yaml | 10 ++-- charts/mlrun-ce/values.yaml | 52 +++++++++++-------- tests/package.sh | 36 +++++++++++++ 7 files changed, 77 insertions(+), 32 deletions(-) diff --git a/.gitignore 
b/.gitignore index cc8f86f3..4cda4743 100644 --- a/.gitignore +++ b/.gitignore @@ -4,3 +4,7 @@ charts/mlrun-ce/charts/* **/.DS_Store *.DS_Store **/__pycache__ +# Packaged chart tarballs (generated by make package) +charts/mlrun-ce/mlrun-ce-*.tgz +# MLRun project directories created by test scripts +otlp-pro/ diff --git a/charts/mlrun-ce/templates/_helpers.tpl b/charts/mlrun-ce/templates/_helpers.tpl index a445e846..3cfa5d7b 100644 --- a/charts/mlrun-ce/templates/_helpers.tpl +++ b/charts/mlrun-ce/templates/_helpers.tpl @@ -496,12 +496,9 @@ spec: mode: {{ .Values.opentelemetry.collector.mode }} upgradeStrategy: automatic managementState: managed + image: {{ (index .Values "opentelemetry-operator").manager.collectorImage.repository }}:{{ (index .Values "opentelemetry-operator").manager.collectorImage.tag }} resources: {{- toYaml .Values.opentelemetry.collector.resources | nindent 4 }} - podAnnotations: - prometheus.io/scrape: "true" - prometheus.io/port: "{{ .Values.opentelemetry.collector.prometheus.port }}" - prometheus.io/path: "/metrics" config: receivers: otlp: diff --git a/charts/mlrun-ce/templates/opentelemetry/crd-readiness-job.yaml b/charts/mlrun-ce/templates/opentelemetry/crd-readiness-job.yaml index eec873f2..2a731da3 100644 --- a/charts/mlrun-ce/templates/opentelemetry/crd-readiness-job.yaml +++ b/charts/mlrun-ce/templates/opentelemetry/crd-readiness-job.yaml @@ -14,6 +14,7 @@ metadata: "helm.sh/hook": post-install,post-upgrade "helm.sh/hook-weight": "10" "helm.sh/hook-delete-policy": before-hook-creation,hook-succeeded + "helm.sh/hook-timeout": "300s" spec: ttlSecondsAfterFinished: 300 backoffLimit: 6 diff --git a/charts/mlrun-ce/templates/opentelemetry/namespace-label.yaml b/charts/mlrun-ce/templates/opentelemetry/namespace-label.yaml index d86d3402..11040521 100644 --- a/charts/mlrun-ce/templates/opentelemetry/namespace-label.yaml +++ b/charts/mlrun-ce/templates/opentelemetry/namespace-label.yaml @@ -12,6 +12,7 @@ metadata: "helm.sh/hook": 
post-install,post-upgrade "helm.sh/hook-weight": "-10" "helm.sh/hook-delete-policy": before-hook-creation,hook-succeeded + "helm.sh/hook-timeout": "120s" spec: ttlSecondsAfterFinished: 60 template: diff --git a/charts/mlrun-ce/templates/opentelemetry/rbac.yaml b/charts/mlrun-ce/templates/opentelemetry/rbac.yaml index 5dcf746b..eb360bfd 100644 --- a/charts/mlrun-ce/templates/opentelemetry/rbac.yaml +++ b/charts/mlrun-ce/templates/opentelemetry/rbac.yaml @@ -19,7 +19,7 @@ metadata: {{- include "mlrun-ce.otel.labels" . | nindent 4 }} annotations: "helm.sh/hook": post-install,post-upgrade - "helm.sh/hook-weight": "0" + "helm.sh/hook-weight": "-20" "helm.sh/hook-delete-policy": before-hook-creation,hook-succeeded --- # Role for OpenTelemetry collector to access Kubernetes resources @@ -77,7 +77,7 @@ metadata: {{- include "mlrun-ce.otel.labels" . | nindent 4 }} annotations: "helm.sh/hook": post-install,post-upgrade - "helm.sh/hook-weight": "0" + "helm.sh/hook-weight": "-20" "helm.sh/hook-delete-policy": before-hook-creation,hook-succeeded rules: # Allow reading CRDs to check availability (CRDs are cluster-scoped) @@ -107,7 +107,7 @@ metadata: {{- include "mlrun-ce.otel.labels" . | nindent 4 }} annotations: "helm.sh/hook": post-install,post-upgrade - "helm.sh/hook-weight": "0" + "helm.sh/hook-weight": "-20" "helm.sh/hook-delete-policy": before-hook-creation,hook-succeeded roleRef: apiGroup: rbac.authorization.k8s.io @@ -128,7 +128,7 @@ metadata: {{- include "mlrun-ce.otel.labels" . | nindent 4 }} annotations: "helm.sh/hook": post-install,post-upgrade - "helm.sh/hook-weight": "0" + "helm.sh/hook-weight": "-20" "helm.sh/hook-delete-policy": before-hook-creation,hook-succeeded rules: # Allow creating/updating OpenTelemetry CRs @@ -154,7 +154,7 @@ metadata: {{- include "mlrun-ce.otel.labels" . 
| nindent 4 }} annotations: "helm.sh/hook": post-install,post-upgrade - "helm.sh/hook-weight": "0" + "helm.sh/hook-weight": "-20" "helm.sh/hook-delete-policy": before-hook-creation,hook-succeeded roleRef: apiGroup: rbac.authorization.k8s.io diff --git a/charts/mlrun-ce/values.yaml b/charts/mlrun-ce/values.yaml index 40497364..d4d2c638 100644 --- a/charts/mlrun-ce/values.yaml +++ b/charts/mlrun-ce/values.yaml @@ -574,33 +574,25 @@ kube-prometheus-stack: # This creates clear separation between direct scraping and OTEL-collected metrics additionalScrapeConfigs: # Job for scraping OTEL collector sidecars (metrics on port 8889) - # Only scrapes pods with prometheus.io/scrape-mode: "otel" or "both" + # Discovers any pod with sidecar.opentelemetry.io/inject annotation — no per-function + # Prometheus annotations required. Port 8889 is the standard OTel prometheus exporter port. - job_name: 'otel-collector-sidecars' kubernetes_sd_configs: - role: pod relabel_configs: - # Only scrape pods with OTEL sidecar (port 8889) - - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_port] + # Only scrape pods that have the OTel sidecar injected + - source_labels: [__meta_kubernetes_pod_annotation_sidecar_opentelemetry_io_inject] action: keep - regex: "8889" - # Only scrape if scrape-mode is "otel" or "both" (or legacy scrape=true with port 8889) - - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scrape_mode] - action: keep - regex: (otel|both|) - - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scrape] - action: keep - regex: true - # Set metrics path - - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_path] + regex: .+ + # Use port 8889 (OTel prometheus exporter) regardless of what the pod exposes + - source_labels: [__address__] action: replace - target_label: __metrics_path__ - regex: (.+) - # Set target address with port - - source_labels: [__address__, __meta_kubernetes_pod_annotation_prometheus_io_port] - action: replace 
- regex: ([^:]+)(?::\d+)?;(\d+) - replacement: $1:$2 + regex: ([^:]+)(?::\d+)? + replacement: $1:8889 target_label: __address__ + # Metrics path is always /metrics for the OTel prometheus exporter + - target_label: __metrics_path__ + replacement: /metrics # Add kubernetes labels - action: labelmap regex: __meta_kubernetes_pod_label_(.+) @@ -815,9 +807,11 @@ opentelemetry-operator: manager: # Collector image used by the operator when creating collectors collectorImage: - repository: otel/opentelemetry-collector - tag: 0.116.0 - # Auto-instrumentation images + # Using contrib distribution pinned to 0.108.0 — versions 0.109+ use a dynamically linked + # binary in a distroless image that lacks /lib64/ld-linux-x86-64.so.2 and fails to exec. + repository: otel/opentelemetry-collector-contrib + tag: 0.108.0 + # Auto-instrumentation images (all fields required by the sub-chart schema) autoInstrumentationImage: python: repository: ghcr.io/open-telemetry/opentelemetry-operator/autoinstrumentation-python @@ -825,6 +819,18 @@ opentelemetry-operator: java: repository: ghcr.io/open-telemetry/opentelemetry-operator/autoinstrumentation-java tag: 2.10.0 + nodejs: + repository: "" + tag: "" + dotnet: + repository: "" + tag: "" + go: + repository: "" + tag: "" + apacheHttpd: + repository: "" + tag: "" # Feature gates as string (comma-separated) featureGates: "" resources: diff --git a/tests/package.sh b/tests/package.sh index d8f847bc..fa8150ed 100755 --- a/tests/package.sh +++ b/tests/package.sh @@ -25,6 +25,42 @@ echo "Installing chart dependencies" cd "$dirname"/../charts/mlrun-ce helm dependency update +# Patch opentelemetry-operator sub-chart schema: the upstream chart has +# "examples": "" (string) for featureGates, but JSON Schema requires an array. +# Helm v4 enforces metaschema validation strictly and rejects the install otherwise. +echo "Patching opentelemetry-operator schema (featureGates.examples string -> array)..." 
+python3 - <<'PYEOF' +import json, tarfile, os, shutil, tempfile + +tgz = "charts/opentelemetry-operator-0.78.1.tgz" +if not os.path.exists(tgz): + print(f" {tgz} not found, skipping patch") + exit(0) + +with tempfile.TemporaryDirectory() as tmp: + with tarfile.open(tgz, "r:gz") as t: + t.extractall(tmp) + schema_path = os.path.join(tmp, "opentelemetry-operator", "values.schema.json") + with open(schema_path) as f: + schema = json.load(f) + fg = schema["properties"]["manager"]["properties"]["featureGates"] + if isinstance(fg.get("examples"), str): + fg["examples"] = [fg["examples"]] + with open(schema_path, "w") as f: + json.dump(schema, f, indent=2) + print(" Patched featureGates.examples") + else: + print(" Already correct, no patch needed") + # Repack without macOS metadata + env = os.environ.copy() + env["COPYFILE_DISABLE"] = "1" + import subprocess + subprocess.run( + ["tar", "czf", os.path.abspath(tgz), "opentelemetry-operator"], + cwd=tmp, env=env, check=True + ) +PYEOF + # Create MLRun CE tarball helm package . 
exit 0 From 73ba28781fa92a837b793cfc5536522502ba716a Mon Sep 17 00:00:00 2001 From: royischoss Date: Sun, 5 Apr 2026 17:54:40 +0300 Subject: [PATCH 08/23] Otel with collector works well --- charts/mlrun-ce/README.md | 104 ++++----------- .../non_admin_installation_values.yaml | 4 +- charts/mlrun-ce/templates/NOTES.txt | 41 +++--- charts/mlrun-ce/templates/_helpers.tpl | 37 +++--- .../jupyter-notebook/deployment.yaml | 19 +-- .../opentelemetry/namespace-label.yaml | 10 +- .../templates/opentelemetry/rbac.yaml | 20 +-- .../metadata-envoy-deployment.yaml | 3 + .../deployments/metadata-grpc-deployment.yaml | 3 + .../deployments/metadata-writer.yaml | 3 + .../ml-pipeline-persistenceagent.yaml | 3 + .../ml-pipeline-scheduledworkflow.yaml | 3 + .../pipelines/deployments/ml-pipeline-ui.yaml | 3 + .../deployments/ml-pipeline-viewer-crd.yaml | 3 + .../ml-pipeline-visualizationserver.yaml | 3 + .../pipelines/deployments/ml-pipeline.yaml | 3 + .../pipelines/deployments/mysql.yaml | 3 + .../deployments/workflow-controller.yaml | 3 + .../templates/timescaledb/statefulset.yaml | 3 + charts/mlrun-ce/values.yaml | 122 +++++++----------- tests/helm-template-test.sh | 48 ++++--- tests/kind-test.sh | 14 +- 22 files changed, 195 insertions(+), 260 deletions(-) diff --git a/charts/mlrun-ce/README.md b/charts/mlrun-ce/README.md index 3a8e6157..dce3c7cd 100644 --- a/charts/mlrun-ce/README.md +++ b/charts/mlrun-ce/README.md @@ -107,20 +107,16 @@ helm --namespace mlrun \ --set opentelemetry-operator.enabled=true \ --set opentelemetry.namespaceLabel.enabled=true \ --set opentelemetry.collector.enabled=true \ - --set opentelemetry.collector.scrapeMode=otel \ --set opentelemetry.instrumentation.enabled=true \ mlrun/mlrun-ce ``` -> **Important:** When enabling OpenTelemetry, set `opentelemetry.collector.scrapeMode=otel` to collect metrics -> via the OTEL sidecar and prevent duplicate metrics. The default is `direct` (for when OTEL is disabled). 
- The installation will: - Deploy the OpenTelemetry Operator -- Create an OpenTelemetryCollector CR (sidecar mode) +- Create an OpenTelemetryCollector CR (deployment mode — one collector per namespace) - Create an Instrumentation CR for Python auto-instrumentation -- Label the namespace with `opentelemetry.io/inject=enabled` -- Configure Prometheus to scrape OTEL sidecar metrics (port 8889) +- Label and annotate the namespace so all Python pods are auto-instrumented automatically +- Configure Prometheus to scrape OTEL collector metrics (port 8889) #### Step 5: Verify OpenTelemetry Installation @@ -140,21 +136,14 @@ kubectl -n mlrun get instrumentations kubectl -n mlrun get pods | grep opentelemetry ``` -#### Step 6: Verify Jupyter has OTEL Sidecar Annotations +#### Step 6: Verify OTel Pod Labels and Namespace Annotation ```bash -kubectl -n mlrun get deployment -l app.kubernetes.io/component=jupyter-notebook \ - -o jsonpath='{.items[0].spec.template.metadata.annotations}' | jq . -``` +# Check that the namespace has the instrumentation annotation (enables auto-instrumentation for all Python pods) +kubectl get namespace mlrun -o jsonpath='{.metadata.annotations}' | jq . -You should see annotations like: -```json -{ - "instrumentation.opentelemetry.io/inject-python": "my-mlrun-otel-instrumentation", - "prometheus.io/port": "8889", - "prometheus.io/scrape": "true", - "sidecar.opentelemetry.io/inject": "my-mlrun-otel-collector" -} +# Check pod labels — all chart-managed pods should have mlrun.io/otel=true +kubectl -n mlrun get pods --show-labels | grep mlrun.io/otel ``` ### Installing MLRun-ce on minikube @@ -185,7 +174,7 @@ Override those [in the normal methods](https://helm.sh/docs/chart_template_guide ### Configuring OpenTelemetry (Observability) MLRun CE includes the OpenTelemetry Operator for collecting metrics and traces from your ML workloads. -The operator runs in **sidecar mode**, automatically injecting collector containers into annotated pods. 
+The operator runs one collector **Deployment** per namespace. Instrumented pods send OTLP metrics to the collector, which exports them to Prometheus. > **Note:** OpenTelemetry is **disabled by default**. See below for how to enable it. @@ -212,10 +201,10 @@ kubectl label namespace opentelemetry.io/inject=enabled #### Default Configuration By default, OpenTelemetry is **disabled**. When enabled, it provides: -- Namespace labeling for OTEL operator webhook targeting -- Sidecar collector injection for instrumented pods -- Python auto-instrumentation for Jupyter notebooks -- Prometheus metrics export on port 8889 +- A single OTel Collector Deployment per namespace (OTLP receiver → Prometheus exporter on port 8889) +- Namespace-level Python auto-instrumentation (all Python pods in the namespace are instrumented automatically) +- `mlrun.io/otel: "true"` label on Jupyter, SeaweedFS, and Nuclio function pods +- Prometheus scrapes the collector pod (not individual pods) #### Enabling OpenTelemetry @@ -228,7 +217,6 @@ helm --namespace mlrun install my-mlrun \ --set opentelemetry-operator.enabled=true \ --set opentelemetry.namespaceLabel.enabled=true \ --set opentelemetry.collector.enabled=true \ - --set opentelemetry.collector.scrapeMode=otel \ --set opentelemetry.instrumentation.enabled=true \ mlrun/mlrun-ce ``` @@ -240,7 +228,6 @@ helm --namespace mlrun upgrade my-mlrun \ --set opentelemetry-operator.enabled=true \ --set opentelemetry.namespaceLabel.enabled=true \ --set opentelemetry.collector.enabled=true \ - --set opentelemetry.collector.scrapeMode=otel \ --set opentelemetry.instrumentation.enabled=true \ mlrun/mlrun-ce ``` @@ -253,13 +240,12 @@ helm --namespace mlrun upgrade my-mlrun \ --set opentelemetry.collector.enabled=false \ --set opentelemetry.instrumentation.enabled=false \ --set opentelemetry.namespaceLabel.enabled=false \ - --set opentelemetry.collector.scrapeMode=direct \ mlrun/mlrun-ce ``` #### Custom Resource Limits -Configure collector sidecar resources: 
+Configure collector resources: ```bash helm --namespace mlrun install my-mlrun \ @@ -282,63 +268,23 @@ helm --namespace mlrun install my-mlrun \ #### Adding OpenTelemetry to Custom Workloads -To instrument your own deployments with the OTEL sidecar and Python auto-instrumentation: - -1. Ensure your namespace has the OpenTelemetry label: - ```bash - kubectl label namespace opentelemetry.io/inject=enabled - ``` - -2. Add these annotations to your pod spec: - ```yaml - metadata: - annotations: - sidecar.opentelemetry.io/inject: "-otel-collector" - instrumentation.opentelemetry.io/inject-python: "-otel-instrumentation" - prometheus.io/scrape: "true" - prometheus.io/scrape-mode: "otel" - prometheus.io/port: "8889" - ``` - -#### Preventing Prometheus/OTEL Metric Overlap - -To prevent duplicate metrics when using both Prometheus direct scraping and OpenTelemetry, -MLRun CE uses a **scrape-mode** annotation system: - -| Scrape Mode | Description | Use Case | -|-------------|-------------|----------| -| `direct` | Direct Prometheus scraping only | **Default** - When OTEL is disabled | -| `otel` | Metrics collected via OTEL sidecar only | **Recommended when OTEL enabled** | -| `both` | Both OTEL and direct scraping | Debugging/transition only | - -> **Note:** The default scrape mode is `direct`. When enabling OpenTelemetry, you must set -> `--set opentelemetry.collector.scrapeMode=otel` to collect metrics via the OTEL sidecar. - -**How it works:** -- OTEL-collected metrics have the `mlrun_otel_` prefix and `metrics_source=otel_collector` label -- Direct-scraped metrics have `metrics_source=direct_scrape` label -- Prometheus scrape configs filter based on `prometheus.io/scrape-mode` annotation +Python instrumentation is applied **namespace-wide** — any Python pod in the MLRun namespace is automatically instrumented when OTel is enabled. No per-pod annotations are required. 
-**Configure scrape mode when enabling OTEL:** +For pods in other namespaces, annotate the namespace directly: ```bash -helm --namespace mlrun install my-mlrun \ - --set opentelemetry-operator.enabled=true \ - --set opentelemetry.collector.enabled=true \ - --set opentelemetry.collector.scrapeMode=otel \ - --set opentelemetry.instrumentation.enabled=true \ - mlrun/mlrun-ce +kubectl annotate namespace \ + instrumentation.opentelemetry.io/inject-python=-otel-instrumentation ``` -**Query metrics by source in Prometheus:** -```promql -# OTEL-collected metrics only -{metrics_source="otel_collector"} - -# Direct-scraped metrics only -{metrics_source="direct_scrape"} +The `mlrun.io/otel: "true"` label is applied to: **Jupyter**, **SeaweedFS** (master, volume, filer, s3, admin), and **Nuclio function pods** (via `functionDefaults.metadata.labels`). This label is used for Prometheus metric filtering and enrichment. -# OTEL metrics use prefix +**Query OTEL-collected metrics in Prometheus:** +```promql +# OTEL metrics use the mlrun_otel_ prefix mlrun_otel_http_server_duration_seconds_bucket{...} + +# Filter by source +{metrics_source="otel_collector"} ``` #### Split Installation (Admin/Non-Admin) diff --git a/charts/mlrun-ce/non_admin_installation_values.yaml b/charts/mlrun-ce/non_admin_installation_values.yaml index f7939235..2d32c68c 100644 --- a/charts/mlrun-ce/non_admin_installation_values.yaml +++ b/charts/mlrun-ce/non_admin_installation_values.yaml @@ -88,8 +88,8 @@ opentelemetry-operator: enabled: false # OpenTelemetry CRs - enabled for user namespace -# The namespace will be labeled with opentelemetry.io/inject=enabled -# so the operator can inject sidecars into pods +# The namespace will be labeled and annotated for OTel deployment-mode collection +# and namespace-wide Python auto-instrumentation. 
opentelemetry: namespaceLabel: enabled: true diff --git a/charts/mlrun-ce/templates/NOTES.txt b/charts/mlrun-ce/templates/NOTES.txt index 005dfa1b..31d6d90d 100644 --- a/charts/mlrun-ce/templates/NOTES.txt +++ b/charts/mlrun-ce/templates/NOTES.txt @@ -134,25 +134,19 @@ OpenTelemetry Operator is enabled! - Namespace selector: opentelemetry.io/inject=enabled {{- if .Values.opentelemetry.collector.enabled }} {{- "\n" }} -OpenTelemetry Collector (sidecar mode): -- Collector CR: {{ .Release.Name }}-otel-collector +OpenTelemetry Collector (deployment mode): +- Collector CR: {{ include "mlrun-ce.otel.collector.fullname" . }} - Mode: {{ .Values.opentelemetry.collector.mode }} -- OTLP gRPC endpoint: localhost:{{ .Values.opentelemetry.collector.otlp.grpcPort }} (inside pod) -- OTLP HTTP endpoint: localhost:{{ .Values.opentelemetry.collector.otlp.httpPort }} (inside pod) -- Prometheus metrics port: {{ .Values.opentelemetry.collector.prometheus.port }} -- Prometheus scrape mode: {{ .Values.opentelemetry.collector.scrapeMode }} -{{- if eq .Values.opentelemetry.collector.scrapeMode "direct" }} - -⚠️ WARNING: Scrape mode is "direct" - OTEL sidecar metrics will NOT be collected! - To collect metrics via OTEL, reinstall with: --set opentelemetry.collector.scrapeMode=otel -{{- end }} +- OTLP gRPC endpoint: {{ include "mlrun-ce.otel.collector.fullname" . }}-collector:{{ .Values.opentelemetry.collector.otlp.grpcPort }} +- OTLP HTTP endpoint: {{ include "mlrun-ce.otel.collector.fullname" . }}-collector:{{ .Values.opentelemetry.collector.otlp.httpPort }} +- Prometheus metrics port: {{ .Values.opentelemetry.collector.prometheus.port }} (scraped by Prometheus from the collector pod) {{- end }} {{- if .Values.opentelemetry.instrumentation.enabled }} {{- "\n" }} OpenTelemetry Auto-Instrumentation: -- Instrumentation CR: {{ .Release.Name }}-otel-instrumentation +- Instrumentation CR: {{ include "mlrun-ce.otel.instrumentation.fullname" . 
}} {{- if .Values.opentelemetry.instrumentation.python.enabled }} -- Python auto-instrumentation: enabled +- Python auto-instrumentation: enabled (namespace-wide via namespace annotation) {{- end }} {{- if .Values.opentelemetry.instrumentation.java.enabled }} - Java auto-instrumentation: enabled @@ -160,21 +154,16 @@ OpenTelemetry Auto-Instrumentation: {{- end }} {{- if .Values.opentelemetry.namespaceLabel.enabled }} {{- "\n" }} -Namespace Label: -- Namespace {{ .Release.Namespace }} is labeled with: {{ .Values.opentelemetry.namespaceLabel.key }}={{ .Values.opentelemetry.namespaceLabel.value }} +Namespace OTel configuration: +- Label: {{ .Values.opentelemetry.namespaceLabel.key }}={{ .Values.opentelemetry.namespaceLabel.value }} +{{- if .Values.opentelemetry.instrumentation.enabled }} +- Python instrumentation annotation applied to all pods in namespace {{ .Release.Namespace }} {{- end }} +{{- end }} +{{- if or .Values.opentelemetry.collector.enabled .Values.opentelemetry.instrumentation.enabled }} {{- "\n" }} -Prometheus Scrape Modes: -- "otel" : Metrics collected via OTEL sidecar only (recommended) -- "direct" : Direct Prometheus scraping only (current: {{ .Values.opentelemetry.collector.scrapeMode }}) -- "both" : Both methods active (for debugging) -{{- "\n" }} -To add OTEL instrumentation to your pods, add these annotations: - sidecar.opentelemetry.io/inject: "{{ .Release.Name }}-otel-collector" - instrumentation.opentelemetry.io/inject-python: "{{ .Release.Name }}-otel-instrumentation" - prometheus.io/scrape: "true" - prometheus.io/scrape-mode: "otel" - prometheus.io/port: "{{ .Values.opentelemetry.collector.prometheus.port }}" +Pods labeled with mlrun.io/otel=true: Jupyter, SeaweedFS (master/volume/filer/s3/admin), and Nuclio function pods. +{{- end }} {{- end }} Happy MLOPSing!!! 
:] diff --git a/charts/mlrun-ce/templates/_helpers.tpl b/charts/mlrun-ce/templates/_helpers.tpl index 3cfa5d7b..08bc2499 100644 --- a/charts/mlrun-ce/templates/_helpers.tpl +++ b/charts/mlrun-ce/templates/_helpers.tpl @@ -423,7 +423,7 @@ OpenTelemetry helpers OpenTelemetry Collector name */}} {{- define "mlrun-ce.otel.collector.name" -}} -{{- default "otel-collector" .Values.opentelemetry.collector.nameOverride | trunc 63 | trimSuffix "-" }} +{{- default "otel" .Values.opentelemetry.collector.nameOverride | trunc 63 | trimSuffix "-" }} {{- end }} {{/* @@ -433,7 +433,7 @@ OpenTelemetry Collector fullname {{- if .Values.opentelemetry.collector.fullnameOverride }} {{- .Values.opentelemetry.collector.fullnameOverride | trunc 63 | trimSuffix "-" }} {{- else }} -{{- $name := default "otel-collector" .Values.opentelemetry.collector.nameOverride }} +{{- $name := default "otel" .Values.opentelemetry.collector.nameOverride }} {{- if contains $name .Release.Name }} {{- .Release.Name | trunc 63 | trimSuffix "-" }} {{- else }} @@ -526,7 +526,7 @@ spec: endpoint: 0.0.0.0:{{ .Values.opentelemetry.collector.prometheus.port }} namespace: {{ .Values.opentelemetry.collector.prometheus.namespace }} const_labels: - collector_mode: sidecar + collector_mode: deployment metrics_source: otel_collector resource_to_telemetry_conversion: enabled: true @@ -579,6 +579,8 @@ metadata: labels: {{- include "mlrun-ce.otel.labels" . | nindent 4 }} spec: + exporter: + endpoint: http://{{ include "mlrun-ce.otel.collector.fullname" . 
}}-collector:{{ .Values.opentelemetry.collector.otlp.httpPort }} propagators: {{- toYaml .Values.opentelemetry.instrumentation.propagators | nindent 4 }} sampler: @@ -589,24 +591,6 @@ spec: valueFrom: fieldRef: fieldPath: metadata.labels['app.kubernetes.io/name'] - - name: OTEL_RESOURCE_ATTRIBUTES - value: >- - k8s.namespace.name=$(OTEL_RESOURCE_ATTRIBUTES_NAMESPACE), - k8s.pod.name=$(OTEL_RESOURCE_ATTRIBUTES_POD_NAME), - k8s.container.name=$(OTEL_RESOURCE_ATTRIBUTES_CONTAINER_NAME), - service.namespace=$(OTEL_RESOURCE_ATTRIBUTES_NAMESPACE) - - name: OTEL_RESOURCE_ATTRIBUTES_NAMESPACE - valueFrom: - fieldRef: - fieldPath: metadata.namespace - - name: OTEL_RESOURCE_ATTRIBUTES_POD_NAME - valueFrom: - fieldRef: - fieldPath: metadata.name - - name: OTEL_RESOURCE_ATTRIBUTES_CONTAINER_NAME - valueFrom: - fieldRef: - fieldPath: metadata.name - name: OTEL_METRICS_EXPORTER value: otlp - name: OTEL_TRACES_EXPORTER @@ -624,7 +608,7 @@ spec: - name: OTEL_PYTHON_LOGGING_AUTO_INSTRUMENTATION_ENABLED value: "false" - name: OTEL_PYTHON_DISABLED_INSTRUMENTATIONS - value: "" + value: "aws_lambda" {{- end }} {{- if .Values.opentelemetry.instrumentation.java.enabled }} java: @@ -636,3 +620,12 @@ spec: value: "true" {{- end }} {{- end }} +.. +{{/* +OTel pod label — marks a pod as OTel-monitored for metric enrichment and discovery. +Namespace-level instrumentation annotation (set by namespace-label job) handles Python auto-instrumentation. 
+Wrap usage with: {{- if and .Values.opentelemetry.collector.enabled .Values.opentelemetry.instrumentation.enabled }} +*/}} +{{- define "mlrun-ce.otel.podLabels" -}} +mlrun.io/otel: "true" +{{- end }} diff --git a/charts/mlrun-ce/templates/jupyter-notebook/deployment.yaml b/charts/mlrun-ce/templates/jupyter-notebook/deployment.yaml index 83135a1e..08108481 100644 --- a/charts/mlrun-ce/templates/jupyter-notebook/deployment.yaml +++ b/charts/mlrun-ce/templates/jupyter-notebook/deployment.yaml @@ -14,22 +14,9 @@ spec: metadata: labels: {{- include "mlrun-ce.jupyter.selectorLabels" . | nindent 8 }} - {{- if and .Values.opentelemetry.collector.enabled .Values.opentelemetry.instrumentation.enabled }} - annotations: - # OpenTelemetry sidecar injection - sidecar.opentelemetry.io/inject: "{{ include "mlrun-ce.otel.collector.fullname" . }}" - # Python auto-instrumentation injection - instrumentation.opentelemetry.io/inject-python: "{{ include "mlrun-ce.otel.instrumentation.fullname" . }}" - # Prometheus scraping configuration - # scrape-mode controls how metrics are collected to prevent duplicates: - # "otel" - Only OTEL sidecar metrics (recommended) - # "both" - Both OTEL and direct scraping (debugging) - # "direct" - Only direct scraping (OTEL metrics ignored) - prometheus.io/scrape: "true" - prometheus.io/scrape-mode: {{ .Values.opentelemetry.collector.scrapeMode | quote }} - prometheus.io/port: "{{ .Values.opentelemetry.collector.prometheus.port }}" - prometheus.io/path: "/metrics" - {{- end }} + {{- if and .Values.opentelemetry.collector.enabled .Values.opentelemetry.instrumentation.enabled }} + {{- include "mlrun-ce.otel.podLabels" . 
| nindent 8 }} + {{- end }} spec: {{- with .Values.jupyterNotebook.image.pullSecrets }} imagePullSecrets: diff --git a/charts/mlrun-ce/templates/opentelemetry/namespace-label.yaml b/charts/mlrun-ce/templates/opentelemetry/namespace-label.yaml index 11040521..60cfb3b8 100644 --- a/charts/mlrun-ce/templates/opentelemetry/namespace-label.yaml +++ b/charts/mlrun-ce/templates/opentelemetry/namespace-label.yaml @@ -9,7 +9,7 @@ metadata: labels: {{ include "mlrun-ce.otel.labels" . | indent 4 }} annotations: - "helm.sh/hook": post-install,post-upgrade + "helm.sh/hook": pre-install,pre-upgrade "helm.sh/hook-weight": "-10" "helm.sh/hook-delete-policy": before-hook-creation,hook-succeeded "helm.sh/hook-timeout": "120s" @@ -30,5 +30,11 @@ spec: - | echo "Labeling namespace {{ .Release.Namespace }} for OpenTelemetry..." kubectl label namespace {{ .Release.Namespace }} {{ .Values.opentelemetry.namespaceLabel.key }}={{ .Values.opentelemetry.namespaceLabel.value }} --overwrite - echo "Namespace labeled successfully!" + {{- if .Values.opentelemetry.instrumentation.enabled }} + echo "Annotating namespace for namespace-wide Python auto-instrumentation..." + kubectl annotate namespace {{ .Release.Namespace }} \ + instrumentation.opentelemetry.io/inject-python={{ include "mlrun-ce.otel.instrumentation.fullname" . }} \ + --overwrite + {{- end }} + echo "Namespace configured for OpenTelemetry successfully!" {{- end -}} diff --git a/charts/mlrun-ce/templates/opentelemetry/rbac.yaml b/charts/mlrun-ce/templates/opentelemetry/rbac.yaml index eb360bfd..9eec1971 100644 --- a/charts/mlrun-ce/templates/opentelemetry/rbac.yaml +++ b/charts/mlrun-ce/templates/opentelemetry/rbac.yaml @@ -18,9 +18,9 @@ metadata: labels: {{- include "mlrun-ce.otel.labels" . 
| nindent 4 }} annotations: - "helm.sh/hook": post-install,post-upgrade + "helm.sh/hook": pre-install,pre-upgrade,post-install,post-upgrade "helm.sh/hook-weight": "-20" - "helm.sh/hook-delete-policy": before-hook-creation,hook-succeeded + "helm.sh/hook-delete-policy": before-hook-creation --- # Role for OpenTelemetry collector to access Kubernetes resources apiVersion: rbac.authorization.k8s.io/v1 @@ -76,9 +76,9 @@ metadata: labels: {{- include "mlrun-ce.otel.labels" . | nindent 4 }} annotations: - "helm.sh/hook": post-install,post-upgrade + "helm.sh/hook": pre-install,pre-upgrade,post-install,post-upgrade "helm.sh/hook-weight": "-20" - "helm.sh/hook-delete-policy": before-hook-creation,hook-succeeded + "helm.sh/hook-delete-policy": before-hook-creation rules: # Allow reading CRDs to check availability (CRDs are cluster-scoped) - apiGroups: @@ -106,9 +106,9 @@ metadata: labels: {{- include "mlrun-ce.otel.labels" . | nindent 4 }} annotations: - "helm.sh/hook": post-install,post-upgrade + "helm.sh/hook": pre-install,pre-upgrade,post-install,post-upgrade "helm.sh/hook-weight": "-20" - "helm.sh/hook-delete-policy": before-hook-creation,hook-succeeded + "helm.sh/hook-delete-policy": before-hook-creation roleRef: apiGroup: rbac.authorization.k8s.io kind: ClusterRole @@ -127,9 +127,9 @@ metadata: labels: {{- include "mlrun-ce.otel.labels" . | nindent 4 }} annotations: - "helm.sh/hook": post-install,post-upgrade + "helm.sh/hook": pre-install,pre-upgrade,post-install,post-upgrade "helm.sh/hook-weight": "-20" - "helm.sh/hook-delete-policy": before-hook-creation,hook-succeeded + "helm.sh/hook-delete-policy": before-hook-creation rules: # Allow creating/updating OpenTelemetry CRs - apiGroups: @@ -153,9 +153,9 @@ metadata: labels: {{- include "mlrun-ce.otel.labels" . 
| nindent 4 }} annotations: - "helm.sh/hook": post-install,post-upgrade + "helm.sh/hook": pre-install,pre-upgrade,post-install,post-upgrade "helm.sh/hook-weight": "-20" - "helm.sh/hook-delete-policy": before-hook-creation,hook-succeeded + "helm.sh/hook-delete-policy": before-hook-creation roleRef: apiGroup: rbac.authorization.k8s.io kind: Role diff --git a/charts/mlrun-ce/templates/pipelines/deployments/metadata-envoy-deployment.yaml b/charts/mlrun-ce/templates/pipelines/deployments/metadata-envoy-deployment.yaml index 0801bac9..8a702d9c 100644 --- a/charts/mlrun-ce/templates/pipelines/deployments/metadata-envoy-deployment.yaml +++ b/charts/mlrun-ce/templates/pipelines/deployments/metadata-envoy-deployment.yaml @@ -27,6 +27,9 @@ spec: labels: application-crd-id: kubeflow-pipelines component: metadata-envoy + {{- if and .Values.opentelemetry.collector.enabled .Values.opentelemetry.instrumentation.enabled }} + {{- include "mlrun-ce.otel.podLabels" . | nindent 8 }} + {{- end }} spec: containers: - image: {{ .Values.pipelines.images.metadataEnvoy.repository }}:{{ .Values.pipelines.images.metadataEnvoy.tag }} diff --git a/charts/mlrun-ce/templates/pipelines/deployments/metadata-grpc-deployment.yaml b/charts/mlrun-ce/templates/pipelines/deployments/metadata-grpc-deployment.yaml index 00e7fb9a..f3fae663 100644 --- a/charts/mlrun-ce/templates/pipelines/deployments/metadata-grpc-deployment.yaml +++ b/charts/mlrun-ce/templates/pipelines/deployments/metadata-grpc-deployment.yaml @@ -25,6 +25,9 @@ spec: labels: application-crd-id: kubeflow-pipelines component: metadata-grpc-server + {{- if and .Values.opentelemetry.collector.enabled .Values.opentelemetry.instrumentation.enabled }} + {{- include "mlrun-ce.otel.podLabels" . 
| nindent 8 }} + {{- end }} spec: containers: - args: diff --git a/charts/mlrun-ce/templates/pipelines/deployments/metadata-writer.yaml b/charts/mlrun-ce/templates/pipelines/deployments/metadata-writer.yaml index d2800d1e..04f68b05 100644 --- a/charts/mlrun-ce/templates/pipelines/deployments/metadata-writer.yaml +++ b/charts/mlrun-ce/templates/pipelines/deployments/metadata-writer.yaml @@ -25,6 +25,9 @@ spec: labels: app: metadata-writer application-crd-id: kubeflow-pipelines + {{- if and .Values.opentelemetry.collector.enabled .Values.opentelemetry.instrumentation.enabled }} + {{- include "mlrun-ce.otel.podLabels" . | nindent 8 }} + {{- end }} spec: containers: - env: diff --git a/charts/mlrun-ce/templates/pipelines/deployments/ml-pipeline-persistenceagent.yaml b/charts/mlrun-ce/templates/pipelines/deployments/ml-pipeline-persistenceagent.yaml index 5dbd6604..04af9784 100644 --- a/charts/mlrun-ce/templates/pipelines/deployments/ml-pipeline-persistenceagent.yaml +++ b/charts/mlrun-ce/templates/pipelines/deployments/ml-pipeline-persistenceagent.yaml @@ -27,6 +27,9 @@ spec: labels: app: ml-pipeline-persistenceagent application-crd-id: kubeflow-pipelines + {{- if and .Values.opentelemetry.collector.enabled .Values.opentelemetry.instrumentation.enabled }} + {{- include "mlrun-ce.otel.podLabels" . 
| nindent 8 }} + {{- end }} spec: containers: - env: diff --git a/charts/mlrun-ce/templates/pipelines/deployments/ml-pipeline-scheduledworkflow.yaml b/charts/mlrun-ce/templates/pipelines/deployments/ml-pipeline-scheduledworkflow.yaml index c27442ad..a3634401 100644 --- a/charts/mlrun-ce/templates/pipelines/deployments/ml-pipeline-scheduledworkflow.yaml +++ b/charts/mlrun-ce/templates/pipelines/deployments/ml-pipeline-scheduledworkflow.yaml @@ -27,6 +27,9 @@ spec: labels: app: ml-pipeline-scheduledworkflow application-crd-id: kubeflow-pipelines + {{- if and .Values.opentelemetry.collector.enabled .Values.opentelemetry.instrumentation.enabled }} + {{- include "mlrun-ce.otel.podLabels" . | nindent 8 }} + {{- end }} spec: containers: - env: diff --git a/charts/mlrun-ce/templates/pipelines/deployments/ml-pipeline-ui.yaml b/charts/mlrun-ce/templates/pipelines/deployments/ml-pipeline-ui.yaml index e8cac85f..459223e8 100644 --- a/charts/mlrun-ce/templates/pipelines/deployments/ml-pipeline-ui.yaml +++ b/charts/mlrun-ce/templates/pipelines/deployments/ml-pipeline-ui.yaml @@ -27,6 +27,9 @@ spec: labels: app: ml-pipeline-ui application-crd-id: kubeflow-pipelines + {{- if and .Values.opentelemetry.collector.enabled .Values.opentelemetry.instrumentation.enabled }} + {{- include "mlrun-ce.otel.podLabels" . | nindent 8 }} + {{- end }} spec: containers: - env: diff --git a/charts/mlrun-ce/templates/pipelines/deployments/ml-pipeline-viewer-crd.yaml b/charts/mlrun-ce/templates/pipelines/deployments/ml-pipeline-viewer-crd.yaml index 89f25c23..e34dfb52 100644 --- a/charts/mlrun-ce/templates/pipelines/deployments/ml-pipeline-viewer-crd.yaml +++ b/charts/mlrun-ce/templates/pipelines/deployments/ml-pipeline-viewer-crd.yaml @@ -27,6 +27,9 @@ spec: labels: app: ml-pipeline-viewer-crd application-crd-id: kubeflow-pipelines + {{- if and .Values.opentelemetry.collector.enabled .Values.opentelemetry.instrumentation.enabled }} + {{- include "mlrun-ce.otel.podLabels" . 
| nindent 8 }} + {{- end }} spec: containers: - env: diff --git a/charts/mlrun-ce/templates/pipelines/deployments/ml-pipeline-visualizationserver.yaml b/charts/mlrun-ce/templates/pipelines/deployments/ml-pipeline-visualizationserver.yaml index 6db618a7..b6f79527 100644 --- a/charts/mlrun-ce/templates/pipelines/deployments/ml-pipeline-visualizationserver.yaml +++ b/charts/mlrun-ce/templates/pipelines/deployments/ml-pipeline-visualizationserver.yaml @@ -27,6 +27,9 @@ spec: labels: app: ml-pipeline-visualizationserver application-crd-id: kubeflow-pipelines + {{- if and .Values.opentelemetry.collector.enabled .Values.opentelemetry.instrumentation.enabled }} + {{- include "mlrun-ce.otel.podLabels" . | nindent 8 }} + {{- end }} spec: containers: - image: {{ .Values.pipelines.images.visualizationServer.repository }}:{{ .Values.pipelines.images.visualizationServer.tag }} diff --git a/charts/mlrun-ce/templates/pipelines/deployments/ml-pipeline.yaml b/charts/mlrun-ce/templates/pipelines/deployments/ml-pipeline.yaml index 42ece191..4a4a1a00 100644 --- a/charts/mlrun-ce/templates/pipelines/deployments/ml-pipeline.yaml +++ b/charts/mlrun-ce/templates/pipelines/deployments/ml-pipeline.yaml @@ -27,6 +27,9 @@ spec: labels: app: ml-pipeline application-crd-id: kubeflow-pipelines + {{- if and .Values.opentelemetry.collector.enabled .Values.opentelemetry.instrumentation.enabled }} + {{- include "mlrun-ce.otel.podLabels" . 
| nindent 8 }} + {{- end }} spec: containers: - env: diff --git a/charts/mlrun-ce/templates/pipelines/deployments/mysql.yaml b/charts/mlrun-ce/templates/pipelines/deployments/mysql.yaml index 40791425..db7d4893 100644 --- a/charts/mlrun-ce/templates/pipelines/deployments/mysql.yaml +++ b/charts/mlrun-ce/templates/pipelines/deployments/mysql.yaml @@ -22,6 +22,9 @@ spec: labels: app: mysql application-crd-id: kubeflow-pipelines + {{- if and .Values.opentelemetry.collector.enabled .Values.opentelemetry.instrumentation.enabled }} + {{- include "mlrun-ce.otel.podLabels" . | nindent 8 }} + {{- end }} spec: {{- if .Values.pipelines.db.securityContext }} securityContext: diff --git a/charts/mlrun-ce/templates/pipelines/deployments/workflow-controller.yaml b/charts/mlrun-ce/templates/pipelines/deployments/workflow-controller.yaml index 83be1799..9ec903e1 100644 --- a/charts/mlrun-ce/templates/pipelines/deployments/workflow-controller.yaml +++ b/charts/mlrun-ce/templates/pipelines/deployments/workflow-controller.yaml @@ -24,6 +24,9 @@ spec: labels: app: workflow-controller application-crd-id: kubeflow-pipelines + {{- if and .Values.opentelemetry.collector.enabled .Values.opentelemetry.instrumentation.enabled }} + {{- include "mlrun-ce.otel.podLabels" . | nindent 8 }} + {{- end }} spec: containers: - args: diff --git a/charts/mlrun-ce/templates/timescaledb/statefulset.yaml b/charts/mlrun-ce/templates/timescaledb/statefulset.yaml index 79f5a7f8..7001a93e 100644 --- a/charts/mlrun-ce/templates/timescaledb/statefulset.yaml +++ b/charts/mlrun-ce/templates/timescaledb/statefulset.yaml @@ -15,6 +15,9 @@ spec: metadata: labels: {{- include "mlrun-ce.timescaledb.selectorLabels" . | nindent 8 }} + {{- if and .Values.opentelemetry.collector.enabled .Values.opentelemetry.instrumentation.enabled }} + {{- include "mlrun-ce.otel.podLabels" . 
| nindent 8 }} + {{- end }} spec: {{- with .Values.timescaledb.nodeSelector }} nodeSelector: diff --git a/charts/mlrun-ce/values.yaml b/charts/mlrun-ce/values.yaml index d4d2c638..583bca11 100644 --- a/charts/mlrun-ce/values.yaml +++ b/charts/mlrun-ce/values.yaml @@ -94,6 +94,13 @@ nuclio: kind: mlrun synchronizationInterval: 10m apiAddress: http://mlrun-api-chief:8080/api + # Default labels applied to all Nuclio function pods. + # mlrun.io/otel marks function pods for OTel metric enrichment; the namespace-level + # instrumentation annotation handles Python auto-instrumentation automatically. + functionDefaults: + metadata: + labels: + mlrun.io/otel: "true" mlrun: # set the type of filesystem to use: filesystem, s3 @@ -173,6 +180,13 @@ mlrun: name: mlrun-override-env optional: true extraPersistentVolumeMounts: ~ + # Explicitly expose the Docker image's PYTHONPATH as a K8s env var so that the + # OpenTelemetry operator's PYTHONPATH injection (PYTHONPATH=/otel-auto-instrumentation-python:$(PYTHONPATH)) + # can expand $(PYTHONPATH) correctly. Without this, K8s env var substitution resolves + # $(PYTHONPATH) to an empty string (it cannot see Docker image ENV vars), and the + # mlrun `services` package path is lost, crashing the API on startup. + extraEnvKeyValue: + PYTHONPATH: "/mlrun/server/py:/mlrun/server/py/schemas/proto" # Set mlrun api workers count by setting the minReplicas value. # This is recommended for production environments running at high scale. 
@@ -323,6 +337,8 @@ seaweedfs: # Master server - metadata management master: + podLabels: + mlrun.io/otel: "true" port: 9333 # Storage: use PVC instead of default hostPath data: @@ -336,6 +352,8 @@ seaweedfs: # Volume server - actual data storage volume: + podLabels: + mlrun.io/otel: "true" port: 8080 # Storage: use PVC instead of default hostPath dataDirs: @@ -351,6 +369,8 @@ seaweedfs: # Filer server - file system interface filer: + podLabels: + mlrun.io/otel: "true" port: 8888 # Storage: use PVC instead of default hostPath data: @@ -368,6 +388,8 @@ seaweedfs: # S3 API gateway - MLRun connects to this endpoint s3: + podLabels: + mlrun.io/otel: "true" enabled: true # Default is false port: 8333 enableAuth: true # Default is false @@ -381,6 +403,8 @@ seaweedfs: # Admin server - user and policy management UI admin: + podLabels: + mlrun.io/otel: "true" enabled: true # Default is false port: 23646 secret: @@ -570,30 +594,30 @@ kube-prometheus-stack: type: NodePort nodePort: 30020 prometheusSpec: - # Additional scrape configs for OpenTelemetry collector sidecars - # This creates clear separation between direct scraping and OTEL-collected metrics + # Additional scrape configs for OpenTelemetry collector Deployment. + # In deployment mode, one collector pod runs per namespace and receives OTLP from all + # instrumented pods. Prometheus scrapes only the collector (port 8889), not individual pods. additionalScrapeConfigs: - # Job for scraping OTEL collector sidecars (metrics on port 8889) - # Discovers any pod with sidecar.opentelemetry.io/inject annotation — no per-function - # Prometheus annotations required. Port 8889 is the standard OTel prometheus exporter port. - - job_name: 'otel-collector-sidecars' + # Scrape the OTel Collector Deployment pod. + # Discovers pods with app.kubernetes.io/component=opentelemetry-collector label + # (applied automatically by the OTel operator to collector pods). 
+ - job_name: 'otel-collector' kubernetes_sd_configs: - role: pod relabel_configs: - # Only scrape pods that have the OTel sidecar injected - - source_labels: [__meta_kubernetes_pod_annotation_sidecar_opentelemetry_io_inject] + # Only scrape the OTel collector pod + - source_labels: [__meta_kubernetes_pod_label_app_kubernetes_io_component] action: keep - regex: .+ - # Use port 8889 (OTel prometheus exporter) regardless of what the pod exposes + regex: opentelemetry-collector + # Use port 8889 (OTel prometheus exporter) - source_labels: [__address__] action: replace regex: ([^:]+)(?::\d+)? replacement: $1:8889 target_label: __address__ - # Metrics path is always /metrics for the OTel prometheus exporter - target_label: __metrics_path__ replacement: /metrics - # Add kubernetes labels + # Propagate pod labels as metric labels - action: labelmap regex: __meta_kubernetes_pod_label_(.+) - source_labels: [__meta_kubernetes_namespace] @@ -602,62 +626,10 @@ kube-prometheus-stack: - source_labels: [__meta_kubernetes_pod_name] action: replace target_label: kubernetes_pod_name - # Add metric relabeling to identify OTEL-sourced metrics metric_relabel_configs: - action: replace target_label: metrics_source replacement: otel_collector - - # Job for direct application scraping (non-OTEL) - # Only scrapes pods with prometheus.io/scrape-mode: "direct" or "both" - # Excludes OTEL sidecar port (8889) - - job_name: 'kubernetes-pods-direct' - kubernetes_sd_configs: - - role: pod - relabel_configs: - # Only scrape pods with prometheus.io/scrape=true - - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scrape] - action: keep - regex: true - # Exclude OTEL sidecar port (8889) - those are handled by otel-collector-sidecars job - - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_port] - action: drop - regex: "8889" - # Only scrape if scrape-mode is "direct" or "both" or not set (default to direct) - - source_labels: 
[__meta_kubernetes_pod_annotation_prometheus_io_scrape_mode] - action: keep - regex: (direct|both|) - # Set metrics path - - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_path] - action: replace - target_label: __metrics_path__ - regex: (.+) - replacement: $1 - - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_path] - action: replace - target_label: __metrics_path__ - regex: () - replacement: /metrics - # Set target address with port - - source_labels: [__address__, __meta_kubernetes_pod_annotation_prometheus_io_port] - action: replace - regex: ([^:]+)(?::\d+)?;(\d+) - replacement: $1:$2 - target_label: __address__ - # Add kubernetes labels - - action: labelmap - regex: __meta_kubernetes_pod_label_(.+) - - source_labels: [__meta_kubernetes_namespace] - action: replace - target_label: kubernetes_namespace - - source_labels: [__meta_kubernetes_pod_name] - action: replace - target_label: kubernetes_pod_name - # Add metric relabeling to identify direct-scraped metrics - metric_relabel_configs: - - action: replace - target_label: metrics_source - replacement: direct_scrape kube-state-metrics: fullnameOverride: state-metrics prometheus-node-exporter: @@ -846,22 +818,23 @@ opentelemetry-operator: # These are managed separately from the operator for admin/non-admin split # ============================================================================= opentelemetry: - # Namespace label for enabling OpenTelemetry monitoring - # The namespace must have this label for the operator to inject sidecars + # Namespace label for enabling OpenTelemetry monitoring. + # When enabled, the post-install job labels the namespace and also annotates it with + # instrumentation.opentelemetry.io/inject-python so all Python pods are auto-instrumented. 
namespaceLabel: enabled: false key: "opentelemetry.io/inject" value: "enabled" - # OpenTelemetry Collector configuration (SIDECAR mode) - # In sidecar mode, the collector is injected into pods via webhook - # Not as a standalone deployment + # OpenTelemetry Collector configuration (DEPLOYMENT mode) + # A single collector Deployment runs per namespace, receiving OTLP from instrumented pods + # and exporting metrics to Prometheus. collector: enabled: false nameOverride: "" fullnameOverride: "" - # SIDECAR mode - collector is injected into pods by the operator - mode: sidecar + # DEPLOYMENT mode - one collector pod per namespace, not injected as a sidecar + mode: deployment # Collector sidecar container resources resources: requests: @@ -880,13 +853,6 @@ opentelemetry: otlp: grpcPort: 4317 httpPort: 4318 - # Prometheus scrape mode for OTEL-instrumented pods - # Options: - # - "direct": Direct Prometheus scraping only (default when OTEL disabled) - # - "otel": Metrics collected via OTEL sidecar only (recommended when OTEL enabled) - # - "both": Both OTEL sidecar and direct scraping (for debugging/transition) - # When enabling OTEL, set this to "otel" to prevent duplicate metrics - scrapeMode: "direct" # Instrumentation configuration for auto-instrumentation instrumentation: diff --git a/tests/helm-template-test.sh b/tests/helm-template-test.sh index 6acd789f..ccdef572 100755 --- a/tests/helm-template-test.sh +++ b/tests/helm-template-test.sh @@ -133,7 +133,7 @@ test_otel_collector_default() { assert_renders "$output" "CRD Readiness Job renders" assert_contains "$output" "kind: Job" "Has correct kind" assert_contains "$output" "kind: OpenTelemetryCollector" "Job contains OpenTelemetryCollector CR" - assert_contains "$output" "mode: sidecar" "Uses sidecar mode" + assert_contains "$output" "mode: deployment" "Uses deployment mode" assert_contains "$output" "prometheus:" "Has Prometheus exporter" assert_contains "$output" "endpoint: 0.0.0.0:8889" "Prometheus on port 8889" 
assert_contains "$output" "otlp:" "Has OTLP receiver" @@ -231,8 +231,8 @@ test_otel_rbac_disabled() { "RBAC does not render when OTEL disabled (default)" } -test_jupyter_otel_annotations() { - log_test "Jupyter Deployment - OTEL annotations when enabled" +test_jupyter_otel_labels() { + log_test "Jupyter Deployment - OTel label applied when enabled" local output output=$(render_template "templates/jupyter-notebook/deployment.yaml" \ @@ -240,22 +240,19 @@ test_jupyter_otel_annotations() { --set opentelemetry.collector.enabled=true \ --set opentelemetry.instrumentation.enabled=true) - assert_contains "$output" "sidecar.opentelemetry.io/inject:" "Has sidecar injection annotation" - assert_contains "$output" "instrumentation.opentelemetry.io/inject-python:" "Has Python instrumentation annotation" - assert_contains "$output" 'prometheus.io/scrape: "true"' "Has Prometheus scrape annotation" - assert_contains "$output" 'prometheus.io/scrape-mode:' "Has Prometheus scrape-mode annotation" - assert_contains "$output" 'prometheus.io/port: "8889"' "Has Prometheus port annotation" + assert_contains "$output" 'mlrun.io/otel: "true"' "Has OTel pod label" + assert_not_contains "$output" "sidecar.opentelemetry.io/inject:" "No sidecar annotation (deployment mode)" + assert_not_contains "$output" "prometheus.io/scrape:" "No per-pod Prometheus annotation (collector scrapes)" } -test_jupyter_no_otel_annotations_when_disabled() { - log_test "Jupyter Deployment - No OTEL annotations when disabled (default)" +test_jupyter_no_otel_label_when_disabled() { + log_test "Jupyter Deployment - No OTel label when disabled (default)" local output output=$(render_template "templates/jupyter-notebook/deployment.yaml" \ --set global.registry.url=test.io) - assert_not_contains "$output" "sidecar.opentelemetry.io/inject:" "No sidecar injection when disabled (default)" - assert_not_contains "$output" "instrumentation.opentelemetry.io/inject-python:" "No instrumentation when disabled (default)" + 
assert_not_contains "$output" 'mlrun.io/otel' "No OTel label when disabled (default)" } # ============================================================================ @@ -300,6 +297,20 @@ test_namespace_label_enabled() { assert_contains "$output" "opentelemetry.io/inject" "Has OTEL inject label key" } +test_namespace_label_with_instrumentation_annotation() { + log_test "Namespace Label - Instrumentation annotation added when instrumentation enabled" + + local output + output=$(render_template "templates/opentelemetry/namespace-label.yaml" \ + --set global.registry.url=test.io \ + --set opentelemetry.namespaceLabel.enabled=true \ + --set opentelemetry.collector.enabled=true \ + --set opentelemetry.instrumentation.enabled=true) + + assert_contains "$output" "kubectl annotate namespace" "Has kubectl annotate command" + assert_contains "$output" "instrumentation.opentelemetry.io/inject-python" "Has Python instrumentation namespace annotation" +} + test_namespace_label_disabled() { log_test "Namespace Label - Disabled (default)" @@ -362,16 +373,16 @@ test_prometheus_otel_scrape_config() { local decoded decoded=$(echo "$secret_data" | base64 -d 2>/dev/null || true) - if echo "$decoded" | grep -q "otel-collector-sidecars"; then + if echo "$decoded" | grep -q "otel-collector"; then log_pass "Has OTEL collector scrape job" else log_fail "Has OTEL collector scrape job - not found in decoded config" fi - if echo "$decoded" | grep -q "prometheus_io_port"; then - log_pass "Has pod annotation relabeling" + if echo "$decoded" | grep -q "opentelemetry-collector"; then + log_pass "Has collector pod label selector" else - log_fail "Has pod annotation relabeling - not found in decoded config" + log_fail "Has collector pod label selector - not found in decoded config" fi else log_fail "Prometheus scrape config secret not found" @@ -435,8 +446,8 @@ main() { echo "========================================" echo "Jupyter OTEL Integration Tests" echo 
"========================================" - test_jupyter_otel_annotations - test_jupyter_no_otel_annotations_when_disabled + test_jupyter_otel_labels + test_jupyter_no_otel_label_when_disabled echo "" echo "========================================" @@ -453,6 +464,7 @@ main() { test_namespace_label_disabled test_admin_namespace_label_disabled test_non_admin_namespace_label_enabled + test_namespace_label_with_instrumentation_annotation test_otel_operator_namespace_selector echo "" diff --git a/tests/kind-test.sh b/tests/kind-test.sh index 5ef2b37a..f5ee4333 100755 --- a/tests/kind-test.sh +++ b/tests/kind-test.sh @@ -832,15 +832,15 @@ verify_multi_ns() { log_warn "Instrumentation CRD not found - operator may not be installed" fi - # Check if Jupyter pod has OTEL sidecar annotations + # Check if Jupyter pod has mlrun.io/otel label (deployment mode - no sidecar injection) echo "" - log_info "Checking Jupyter deployment for OTEL annotations..." - local jupyter_annotations - jupyter_annotations=$(kubectl get deployment -n "${NAMESPACE}" -l app.kubernetes.io/component=jupyter-notebook -o jsonpath='{.items[0].spec.template.metadata.annotations}' 2>/dev/null || echo "") - if echo "${jupyter_annotations}" | grep -q "sidecar.opentelemetry.io/inject"; then - log_info "Jupyter has OTEL sidecar injection annotation" + log_info "Checking Jupyter deployment for OTEL pod label..." 
+ local jupyter_labels + jupyter_labels=$(kubectl get deployment -n "${NAMESPACE}" -l app.kubernetes.io/component=jupyter-notebook -o jsonpath='{.items[0].spec.template.metadata.labels}' 2>/dev/null || echo "") + if echo "${jupyter_labels}" | grep -q "mlrun.io/otel"; then + log_info "Jupyter has mlrun.io/otel=true pod label (deployment mode)" else - log_warn "Jupyter does not have OTEL sidecar injection annotation" + log_warn "Jupyter does not have mlrun.io/otel label (OTel may be disabled)" fi } From d71d89310d10ed3c9a1801ed978cb83b96171349 Mon Sep 17 00:00:00 2001 From: royischoss Date: Sun, 5 Apr 2026 18:06:53 +0300 Subject: [PATCH 09/23] bump chart version --- charts/mlrun-ce/Chart.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/charts/mlrun-ce/Chart.yaml b/charts/mlrun-ce/Chart.yaml index d433a0ae..7095d780 100644 --- a/charts/mlrun-ce/Chart.yaml +++ b/charts/mlrun-ce/Chart.yaml @@ -1,6 +1,6 @@ apiVersion: v1 name: mlrun-ce -version: 0.11.0-rc.28 +version: 0.11.0-rc.29 description: MLRun Open Source Stack home: https://iguazio.com icon: https://www.iguazio.com/wp-content/uploads/2019/10/Iguazio-Logo.png From 8b0be8481062faf0af3ec9c28e7517ef9cbcd465 Mon Sep 17 00:00:00 2001 From: royischoss Date: Sun, 5 Apr 2026 18:09:24 +0300 Subject: [PATCH 10/23] bump chart version --- charts/mlrun-ce/Chart.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/charts/mlrun-ce/Chart.yaml b/charts/mlrun-ce/Chart.yaml index 7095d780..1a52ad3f 100644 --- a/charts/mlrun-ce/Chart.yaml +++ b/charts/mlrun-ce/Chart.yaml @@ -1,6 +1,6 @@ apiVersion: v1 name: mlrun-ce -version: 0.11.0-rc.29 +version: 0.11.0-rc.30 description: MLRun Open Source Stack home: https://iguazio.com icon: https://www.iguazio.com/wp-content/uploads/2019/10/Iguazio-Logo.png From aedbbcfb241ed64d122ba1db95c2b9b27682480e Mon Sep 17 00:00:00 2001 From: royischoss Date: Sun, 5 Apr 2026 18:18:00 +0300 Subject: [PATCH 11/23] fix lint --- charts/mlrun-ce/values.yaml | 1 - 1 file 
changed, 1 deletion(-) diff --git a/charts/mlrun-ce/values.yaml b/charts/mlrun-ce/values.yaml index 8d9041e7..8bb76629 100644 --- a/charts/mlrun-ce/values.yaml +++ b/charts/mlrun-ce/values.yaml @@ -893,4 +893,3 @@ opentelemetry: limits: cpu: 500m memory: 512Mi - From 3cc4d4694c64dab0874d900ac7762faa650c36f1 Mon Sep 17 00:00:00 2001 From: royischoss Date: Thu, 9 Apr 2026 10:46:30 +0300 Subject: [PATCH 12/23] documentation fixes --- charts/mlrun-ce/README.md | 82 ----------------------------- charts/mlrun-ce/templates/NOTES.txt | 1 - 2 files changed, 83 deletions(-) diff --git a/charts/mlrun-ce/README.md b/charts/mlrun-ce/README.md index dce3c7cd..7bfbf44b 100644 --- a/charts/mlrun-ce/README.md +++ b/charts/mlrun-ce/README.md @@ -37,7 +37,6 @@ kubectl create namespace mlrun Add the mlrun ce helm chart repo ```bash helm repo add mlrun https://mlrun.github.io/ce -helm repo update ``` To work with the open source MLRun stack, you must an accessible docker-registry. The registry's URL and credentials @@ -206,87 +205,6 @@ By default, OpenTelemetry is **disabled**. 
When enabled, it provides: - `mlrun.io/otel: "true"` label on Jupyter, SeaweedFS, and Nuclio function pods - Prometheus scrapes the collector pod (not individual pods) -#### Enabling OpenTelemetry - -To install **with** OpenTelemetry enabled: - -```bash -helm --namespace mlrun install my-mlrun \ - --set global.registry.url= \ - --set global.registry.secretName=registry-credentials \ - --set opentelemetry-operator.enabled=true \ - --set opentelemetry.namespaceLabel.enabled=true \ - --set opentelemetry.collector.enabled=true \ - --set opentelemetry.instrumentation.enabled=true \ - mlrun/mlrun-ce -``` - -To **enable** OpenTelemetry on an existing installation: - -```bash -helm --namespace mlrun upgrade my-mlrun \ - --set opentelemetry-operator.enabled=true \ - --set opentelemetry.namespaceLabel.enabled=true \ - --set opentelemetry.collector.enabled=true \ - --set opentelemetry.instrumentation.enabled=true \ - mlrun/mlrun-ce -``` - -To **disable** OpenTelemetry (default): - -```bash -helm --namespace mlrun upgrade my-mlrun \ - --set opentelemetry-operator.enabled=false \ - --set opentelemetry.collector.enabled=false \ - --set opentelemetry.instrumentation.enabled=false \ - --set opentelemetry.namespaceLabel.enabled=false \ - mlrun/mlrun-ce -``` - -#### Custom Resource Limits - -Configure collector resources: - -```bash -helm --namespace mlrun install my-mlrun \ - --set opentelemetry.collector.resources.requests.cpu=100m \ - --set opentelemetry.collector.resources.requests.memory=128Mi \ - --set opentelemetry.collector.resources.limits.cpu=500m \ - --set opentelemetry.collector.resources.limits.memory=512Mi \ - mlrun/mlrun-ce -``` - -#### Enabling Java Auto-Instrumentation - -To enable Java auto-instrumentation (disabled by default): - -```bash -helm --namespace mlrun install my-mlrun \ - --set opentelemetry.instrumentation.java.enabled=true \ - mlrun/mlrun-ce -``` - -#### Adding OpenTelemetry to Custom Workloads - -Python instrumentation is applied **namespace-wide** — 
any Python pod in the MLRun namespace is automatically instrumented when OTel is enabled. No per-pod annotations are required. - -For pods in other namespaces, annotate the namespace directly: -```bash -kubectl annotate namespace \ - instrumentation.opentelemetry.io/inject-python=-otel-instrumentation -``` - -The `mlrun.io/otel: "true"` label is applied to: **Jupyter**, **SeaweedFS** (master, volume, filer, s3, admin), and **Nuclio function pods** (via `functionDefaults.metadata.labels`). This label is used for Prometheus metric filtering and enrichment. - -**Query OTEL-collected metrics in Prometheus:** -```promql -# OTEL metrics use the mlrun_otel_ prefix -mlrun_otel_http_server_duration_seconds_bucket{...} - -# Filter by source -{metrics_source="otel_collector"} -``` - #### Split Installation (Admin/Non-Admin) For multi-tenant clusters, install the operator CRDs at the cluster level and collectors in user namespaces: diff --git a/charts/mlrun-ce/templates/NOTES.txt b/charts/mlrun-ce/templates/NOTES.txt index 31d6d90d..bd7997ee 100644 --- a/charts/mlrun-ce/templates/NOTES.txt +++ b/charts/mlrun-ce/templates/NOTES.txt @@ -136,7 +136,6 @@ OpenTelemetry Operator is enabled! {{- "\n" }} OpenTelemetry Collector (deployment mode): - Collector CR: {{ include "mlrun-ce.otel.collector.fullname" . }} -- Mode: {{ .Values.opentelemetry.collector.mode }} - OTLP gRPC endpoint: {{ include "mlrun-ce.otel.collector.fullname" . }}-collector:{{ .Values.opentelemetry.collector.otlp.grpcPort }} - OTLP HTTP endpoint: {{ include "mlrun-ce.otel.collector.fullname" . 
}}-collector:{{ .Values.opentelemetry.collector.otlp.httpPort }} - Prometheus metrics port: {{ .Values.opentelemetry.collector.prometheus.port }} (scraped by Prometheus from the collector pod) From f7cca6a93775d3c14916c6ad74756a8110adbe78 Mon Sep 17 00:00:00 2001 From: royischoss Date: Thu, 9 Apr 2026 17:39:48 +0300 Subject: [PATCH 13/23] fixes --- charts/mlrun-ce/README.md | 101 +++--------------- charts/mlrun-ce/templates/NOTES.txt | 2 +- .../templates/opentelemetry/rbac.yaml | 10 ++ .../metadata-envoy-deployment.yaml | 3 - .../deployments/metadata-grpc-deployment.yaml | 3 - .../deployments/metadata-writer.yaml | 3 - .../ml-pipeline-persistenceagent.yaml | 3 - .../ml-pipeline-scheduledworkflow.yaml | 3 - .../pipelines/deployments/ml-pipeline-ui.yaml | 3 - .../deployments/ml-pipeline-viewer-crd.yaml | 3 - .../ml-pipeline-visualizationserver.yaml | 3 - .../pipelines/deployments/ml-pipeline.yaml | 3 - .../pipelines/deployments/mysql.yaml | 3 - .../deployments/workflow-controller.yaml | 3 - 14 files changed, 25 insertions(+), 121 deletions(-) diff --git a/charts/mlrun-ce/README.md b/charts/mlrun-ce/README.md index 7bfbf44b..69444ac7 100644 --- a/charts/mlrun-ce/README.md +++ b/charts/mlrun-ce/README.md @@ -65,42 +65,16 @@ helm --namespace mlrun \ mlrun/mlrun-ce ``` -### Complete Installation with OpenTelemetry (From Scratch) +### Installing with OpenTelemetry Enabled -This section provides a complete step-by-step guide to install MLRun CE with full OpenTelemetry observability enabled. +> **Note:** OpenTelemetry is **disabled by default**. Follow the standard [Installing the Chart](#installing-the-chart) steps, adding the OTel flags below. -> **Note:** OpenTelemetry is **disabled by default**. Follow these steps to enable it. 
- -#### Step 1: Create the namespace - -```bash -kubectl create namespace mlrun -``` - -#### Step 2: Add the Helm repository - -```bash -helm repo add mlrun https://mlrun.github.io/ce -helm repo update -``` - -#### Step 3: Create the docker registry secret - -```bash -kubectl --namespace mlrun create secret docker-registry registry-credentials \ - --docker-username \ - --docker-password \ - --docker-server \ - --docker-email -``` - -#### Step 4: Install MLRun CE with OpenTelemetry Enabled +To install with OpenTelemetry enabled, append the following flags to the helm install command: ```bash helm --namespace mlrun \ install my-mlrun \ --wait \ - --timeout 15m \ --set global.registry.url= \ --set global.registry.secretName=registry-credentials \ --set opentelemetry-operator.enabled=true \ @@ -110,41 +84,14 @@ helm --namespace mlrun \ mlrun/mlrun-ce ``` -The installation will: -- Deploy the OpenTelemetry Operator -- Create an OpenTelemetryCollector CR (deployment mode — one collector per namespace) -- Create an Instrumentation CR for Python auto-instrumentation -- Label and annotate the namespace so all Python pods are auto-instrumented automatically -- Configure Prometheus to scrape OTEL collector metrics (port 8889) - -#### Step 5: Verify OpenTelemetry Installation - -Check that the OpenTelemetry resources are created: +To verify the OpenTelemetry resources were created: ```bash -# Check the namespace label -kubectl get namespace mlrun --show-labels | grep opentelemetry - -# Check the OpenTelemetry Collector CR kubectl -n mlrun get opentelemetrycollectors - -# Check the Instrumentation CR kubectl -n mlrun get instrumentations - -# Check that the OTEL operator is running kubectl -n mlrun get pods | grep opentelemetry ``` -#### Step 6: Verify OTel Pod Labels and Namespace Annotation - -```bash -# Check that the namespace has the instrumentation annotation (enables auto-instrumentation for all Python pods) -kubectl get namespace mlrun -o 
jsonpath='{.metadata.annotations}' | jq . - -# Check pod labels — all chart-managed pods should have mlrun.io/otel=true -kubectl -n mlrun get pods --show-labels | grep mlrun.io/otel -``` - ### Installing MLRun-ce on minikube The Open source MLRun ce uses node ports for simplicity. If your kubernetes cluster is running inside a VM, @@ -172,46 +119,27 @@ Override those [in the normal methods](https://helm.sh/docs/chart_template_guide ### Configuring OpenTelemetry (Observability) -MLRun CE includes the OpenTelemetry Operator for collecting metrics and traces from your ML workloads. -The operator runs one collector **Deployment** per namespace. Instrumented pods send OTLP metrics to the collector, which exports them to Prometheus. - -> **Note:** OpenTelemetry is **disabled by default**. See below for how to enable it. +MLRun CE includes the OpenTelemetry Operator for collecting metrics and traces. When enabled, it deploys a single collector per namespace (deployment mode) — instrumented pods send OTLP data to the collector, which exports metrics to Prometheus on port 8889. All Python pods in the namespace are auto-instrumented, and the `mlrun.io/otel: "true"` label is applied to Jupyter, SeaweedFS, TimescaleDB, and Nuclio function pods for metric enrichment. -#### Namespace Labeling +For a fresh install with OTel, see [Installing with OpenTelemetry Enabled](#installing-with-opentelemetry-enabled). -The OpenTelemetry Operator **only monitors namespaces** with the label `opentelemetry.io/inject=enabled`. -This is automatically applied to the MLRun namespace when OpenTelemetry is enabled. 
+To enable OTel on an existing installation: -When enabling OpenTelemetry, the namespace is labeled automatically: -```yaml -# Automatically added to your namespace when opentelemetry.namespaceLabel.enabled=true -labels: - opentelemetry.io/inject: "enabled" -``` - -For custom namespaces that need OpenTelemetry instrumentation, add the label manually: ```bash -kubectl label namespace opentelemetry.io/inject=enabled +helm --namespace mlrun upgrade my-mlrun \ + --set opentelemetry-operator.enabled=true \ + --set opentelemetry.namespaceLabel.enabled=true \ + --set opentelemetry.collector.enabled=true \ + --set opentelemetry.instrumentation.enabled=true \ + mlrun/mlrun-ce ``` -> **Note:** The controller namespace (where the operator runs) does **NOT** need this label, -> as only the operator itself runs there - no workloads require instrumentation. - -#### Default Configuration - -By default, OpenTelemetry is **disabled**. When enabled, it provides: -- A single OTel Collector Deployment per namespace (OTLP receiver → Prometheus exporter on port 8889) -- Namespace-level Python auto-instrumentation (all Python pods in the namespace are instrumented automatically) -- `mlrun.io/otel: "true"` label on Jupyter, SeaweedFS, and Nuclio function pods -- Prometheus scrapes the collector pod (not individual pods) - #### Split Installation (Admin/Non-Admin) -For multi-tenant clusters, install the operator CRDs at the cluster level and collectors in user namespaces: +For multi-tenant clusters, install the operator at the cluster level and the collector CRs in each user namespace: **Controller namespace (admin):** ```bash -# Operator only - no namespace label needed (no instrumented workloads here) helm --namespace controller install mlrun-controller \ -f admin_installation_values.yaml \ mlrun/mlrun-ce @@ -219,7 +147,6 @@ helm --namespace controller install mlrun-controller \ **User namespace (non-admin):** ```bash -# Collector CRs + namespace label applied automatically helm 
--namespace mlrun install my-mlrun \ -f non_admin_installation_values.yaml \ mlrun/mlrun-ce diff --git a/charts/mlrun-ce/templates/NOTES.txt b/charts/mlrun-ce/templates/NOTES.txt index bd7997ee..2e7c9598 100644 --- a/charts/mlrun-ce/templates/NOTES.txt +++ b/charts/mlrun-ce/templates/NOTES.txt @@ -161,7 +161,7 @@ Namespace OTel configuration: {{- end }} {{- if or .Values.opentelemetry.collector.enabled .Values.opentelemetry.instrumentation.enabled }} {{- "\n" }} -Pods labeled with mlrun.io/otel=true: Jupyter, SeaweedFS (master/volume/filer/s3/admin), and Nuclio function pods. +Pods labeled with mlrun.io/otel=true: Jupyter, SeaweedFS (master/volume/filer/s3/admin), TimescaleDB, and Nuclio function pods (via functionDefaults). {{- end }} {{- end }} diff --git a/charts/mlrun-ce/templates/opentelemetry/rbac.yaml b/charts/mlrun-ce/templates/opentelemetry/rbac.yaml index 9eec1971..fd978318 100644 --- a/charts/mlrun-ce/templates/opentelemetry/rbac.yaml +++ b/charts/mlrun-ce/templates/opentelemetry/rbac.yaml @@ -143,6 +143,16 @@ rules: - patch - update - list + # Allow rollout restart of instrumented deployments/statefulsets after webhook is ready + - apiGroups: + - apps + resources: + - deployments + - statefulsets + verbs: + - get + - list + - patch --- # RoleBinding for the CR creator job apiVersion: rbac.authorization.k8s.io/v1 diff --git a/charts/mlrun-ce/templates/pipelines/deployments/metadata-envoy-deployment.yaml b/charts/mlrun-ce/templates/pipelines/deployments/metadata-envoy-deployment.yaml index 8a702d9c..0801bac9 100644 --- a/charts/mlrun-ce/templates/pipelines/deployments/metadata-envoy-deployment.yaml +++ b/charts/mlrun-ce/templates/pipelines/deployments/metadata-envoy-deployment.yaml @@ -27,9 +27,6 @@ spec: labels: application-crd-id: kubeflow-pipelines component: metadata-envoy - {{- if and .Values.opentelemetry.collector.enabled .Values.opentelemetry.instrumentation.enabled }} - {{- include "mlrun-ce.otel.podLabels" . 
| nindent 8 }} - {{- end }} spec: containers: - image: {{ .Values.pipelines.images.metadataEnvoy.repository }}:{{ .Values.pipelines.images.metadataEnvoy.tag }} diff --git a/charts/mlrun-ce/templates/pipelines/deployments/metadata-grpc-deployment.yaml b/charts/mlrun-ce/templates/pipelines/deployments/metadata-grpc-deployment.yaml index f3fae663..00e7fb9a 100644 --- a/charts/mlrun-ce/templates/pipelines/deployments/metadata-grpc-deployment.yaml +++ b/charts/mlrun-ce/templates/pipelines/deployments/metadata-grpc-deployment.yaml @@ -25,9 +25,6 @@ spec: labels: application-crd-id: kubeflow-pipelines component: metadata-grpc-server - {{- if and .Values.opentelemetry.collector.enabled .Values.opentelemetry.instrumentation.enabled }} - {{- include "mlrun-ce.otel.podLabels" . | nindent 8 }} - {{- end }} spec: containers: - args: diff --git a/charts/mlrun-ce/templates/pipelines/deployments/metadata-writer.yaml b/charts/mlrun-ce/templates/pipelines/deployments/metadata-writer.yaml index 04f68b05..d2800d1e 100644 --- a/charts/mlrun-ce/templates/pipelines/deployments/metadata-writer.yaml +++ b/charts/mlrun-ce/templates/pipelines/deployments/metadata-writer.yaml @@ -25,9 +25,6 @@ spec: labels: app: metadata-writer application-crd-id: kubeflow-pipelines - {{- if and .Values.opentelemetry.collector.enabled .Values.opentelemetry.instrumentation.enabled }} - {{- include "mlrun-ce.otel.podLabels" . 
| nindent 8 }} - {{- end }} spec: containers: - env: diff --git a/charts/mlrun-ce/templates/pipelines/deployments/ml-pipeline-persistenceagent.yaml b/charts/mlrun-ce/templates/pipelines/deployments/ml-pipeline-persistenceagent.yaml index 04af9784..5dbd6604 100644 --- a/charts/mlrun-ce/templates/pipelines/deployments/ml-pipeline-persistenceagent.yaml +++ b/charts/mlrun-ce/templates/pipelines/deployments/ml-pipeline-persistenceagent.yaml @@ -27,9 +27,6 @@ spec: labels: app: ml-pipeline-persistenceagent application-crd-id: kubeflow-pipelines - {{- if and .Values.opentelemetry.collector.enabled .Values.opentelemetry.instrumentation.enabled }} - {{- include "mlrun-ce.otel.podLabels" . | nindent 8 }} - {{- end }} spec: containers: - env: diff --git a/charts/mlrun-ce/templates/pipelines/deployments/ml-pipeline-scheduledworkflow.yaml b/charts/mlrun-ce/templates/pipelines/deployments/ml-pipeline-scheduledworkflow.yaml index a3634401..c27442ad 100644 --- a/charts/mlrun-ce/templates/pipelines/deployments/ml-pipeline-scheduledworkflow.yaml +++ b/charts/mlrun-ce/templates/pipelines/deployments/ml-pipeline-scheduledworkflow.yaml @@ -27,9 +27,6 @@ spec: labels: app: ml-pipeline-scheduledworkflow application-crd-id: kubeflow-pipelines - {{- if and .Values.opentelemetry.collector.enabled .Values.opentelemetry.instrumentation.enabled }} - {{- include "mlrun-ce.otel.podLabels" . 
| nindent 8 }} - {{- end }} spec: containers: - env: diff --git a/charts/mlrun-ce/templates/pipelines/deployments/ml-pipeline-ui.yaml b/charts/mlrun-ce/templates/pipelines/deployments/ml-pipeline-ui.yaml index 459223e8..e8cac85f 100644 --- a/charts/mlrun-ce/templates/pipelines/deployments/ml-pipeline-ui.yaml +++ b/charts/mlrun-ce/templates/pipelines/deployments/ml-pipeline-ui.yaml @@ -27,9 +27,6 @@ spec: labels: app: ml-pipeline-ui application-crd-id: kubeflow-pipelines - {{- if and .Values.opentelemetry.collector.enabled .Values.opentelemetry.instrumentation.enabled }} - {{- include "mlrun-ce.otel.podLabels" . | nindent 8 }} - {{- end }} spec: containers: - env: diff --git a/charts/mlrun-ce/templates/pipelines/deployments/ml-pipeline-viewer-crd.yaml b/charts/mlrun-ce/templates/pipelines/deployments/ml-pipeline-viewer-crd.yaml index e34dfb52..89f25c23 100644 --- a/charts/mlrun-ce/templates/pipelines/deployments/ml-pipeline-viewer-crd.yaml +++ b/charts/mlrun-ce/templates/pipelines/deployments/ml-pipeline-viewer-crd.yaml @@ -27,9 +27,6 @@ spec: labels: app: ml-pipeline-viewer-crd application-crd-id: kubeflow-pipelines - {{- if and .Values.opentelemetry.collector.enabled .Values.opentelemetry.instrumentation.enabled }} - {{- include "mlrun-ce.otel.podLabels" . | nindent 8 }} - {{- end }} spec: containers: - env: diff --git a/charts/mlrun-ce/templates/pipelines/deployments/ml-pipeline-visualizationserver.yaml b/charts/mlrun-ce/templates/pipelines/deployments/ml-pipeline-visualizationserver.yaml index b6f79527..6db618a7 100644 --- a/charts/mlrun-ce/templates/pipelines/deployments/ml-pipeline-visualizationserver.yaml +++ b/charts/mlrun-ce/templates/pipelines/deployments/ml-pipeline-visualizationserver.yaml @@ -27,9 +27,6 @@ spec: labels: app: ml-pipeline-visualizationserver application-crd-id: kubeflow-pipelines - {{- if and .Values.opentelemetry.collector.enabled .Values.opentelemetry.instrumentation.enabled }} - {{- include "mlrun-ce.otel.podLabels" . 
| nindent 8 }} - {{- end }} spec: containers: - image: {{ .Values.pipelines.images.visualizationServer.repository }}:{{ .Values.pipelines.images.visualizationServer.tag }} diff --git a/charts/mlrun-ce/templates/pipelines/deployments/ml-pipeline.yaml b/charts/mlrun-ce/templates/pipelines/deployments/ml-pipeline.yaml index 4a4a1a00..42ece191 100644 --- a/charts/mlrun-ce/templates/pipelines/deployments/ml-pipeline.yaml +++ b/charts/mlrun-ce/templates/pipelines/deployments/ml-pipeline.yaml @@ -27,9 +27,6 @@ spec: labels: app: ml-pipeline application-crd-id: kubeflow-pipelines - {{- if and .Values.opentelemetry.collector.enabled .Values.opentelemetry.instrumentation.enabled }} - {{- include "mlrun-ce.otel.podLabels" . | nindent 8 }} - {{- end }} spec: containers: - env: diff --git a/charts/mlrun-ce/templates/pipelines/deployments/mysql.yaml b/charts/mlrun-ce/templates/pipelines/deployments/mysql.yaml index db7d4893..40791425 100644 --- a/charts/mlrun-ce/templates/pipelines/deployments/mysql.yaml +++ b/charts/mlrun-ce/templates/pipelines/deployments/mysql.yaml @@ -22,9 +22,6 @@ spec: labels: app: mysql application-crd-id: kubeflow-pipelines - {{- if and .Values.opentelemetry.collector.enabled .Values.opentelemetry.instrumentation.enabled }} - {{- include "mlrun-ce.otel.podLabels" . | nindent 8 }} - {{- end }} spec: {{- if .Values.pipelines.db.securityContext }} securityContext: diff --git a/charts/mlrun-ce/templates/pipelines/deployments/workflow-controller.yaml b/charts/mlrun-ce/templates/pipelines/deployments/workflow-controller.yaml index 9ec903e1..83be1799 100644 --- a/charts/mlrun-ce/templates/pipelines/deployments/workflow-controller.yaml +++ b/charts/mlrun-ce/templates/pipelines/deployments/workflow-controller.yaml @@ -24,9 +24,6 @@ spec: labels: app: workflow-controller application-crd-id: kubeflow-pipelines - {{- if and .Values.opentelemetry.collector.enabled .Values.opentelemetry.instrumentation.enabled }} - {{- include "mlrun-ce.otel.podLabels" . 
| nindent 8 }} - {{- end }} spec: containers: - args: From 6f6193c54c97c517a9afc714440f561e94533e5b Mon Sep 17 00:00:00 2001 From: royischoss Date: Thu, 9 Apr 2026 17:39:54 +0300 Subject: [PATCH 14/23] fixes --- .../opentelemetry/crd-readiness-job.yaml | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/charts/mlrun-ce/templates/opentelemetry/crd-readiness-job.yaml b/charts/mlrun-ce/templates/opentelemetry/crd-readiness-job.yaml index 2a731da3..94ce109c 100644 --- a/charts/mlrun-ce/templates/opentelemetry/crd-readiness-job.yaml +++ b/charts/mlrun-ce/templates/opentelemetry/crd-readiness-job.yaml @@ -79,5 +79,22 @@ spec: {{- end }} echo "All OpenTelemetry CRs have been created successfully!" + + # Wait for the operator webhook to be fully ready before restarting pods + echo "Waiting for OpenTelemetry operator webhook to be ready..." + until kubectl -n {{ .Release.Namespace }} rollout status deployment \ + -l app.kubernetes.io/name=opentelemetry-operator --timeout=60s &>/dev/null; do + sleep 5 + done + + # Restart pods labeled mlrun.io/otel=true so they go through the webhook + # and receive OTel auto-instrumentation injection. + # This solves the race condition where pods start before the webhook is ready. + echo "Restarting instrumented deployments and statefulsets..." + kubectl -n {{ .Release.Namespace }} rollout restart deployment \ + -l mlrun.io/otel=true 2>/dev/null || true + kubectl -n {{ .Release.Namespace }} rollout restart statefulset \ + -l mlrun.io/otel=true 2>/dev/null || true + echo "Rollout restarts triggered — pods will be re-created with OTel injection." 
{{- end }} From 4f56e6d95f396990c3a1ac43a7e18550ea497e7d Mon Sep 17 00:00:00 2001 From: royischoss Date: Sun, 12 Apr 2026 12:19:02 +0300 Subject: [PATCH 15/23] change method to push to prometheus --- charts/mlrun-ce/templates/NOTES.txt | 2 +- charts/mlrun-ce/templates/_helpers.tpl | 14 +++---- charts/mlrun-ce/values.yaml | 57 +++++++------------------- tests/package.sh | 46 +++++++++++++++++++++ 4 files changed, 67 insertions(+), 52 deletions(-) diff --git a/charts/mlrun-ce/templates/NOTES.txt b/charts/mlrun-ce/templates/NOTES.txt index 2e7c9598..efb991bc 100644 --- a/charts/mlrun-ce/templates/NOTES.txt +++ b/charts/mlrun-ce/templates/NOTES.txt @@ -138,7 +138,7 @@ OpenTelemetry Collector (deployment mode): - Collector CR: {{ include "mlrun-ce.otel.collector.fullname" . }} - OTLP gRPC endpoint: {{ include "mlrun-ce.otel.collector.fullname" . }}-collector:{{ .Values.opentelemetry.collector.otlp.grpcPort }} - OTLP HTTP endpoint: {{ include "mlrun-ce.otel.collector.fullname" . }}-collector:{{ .Values.opentelemetry.collector.otlp.httpPort }} -- Prometheus metrics port: {{ .Values.opentelemetry.collector.prometheus.port }} (scraped by Prometheus from the collector pod) +- Metrics export: collector pushes via OTLP to Prometheus at /api/v1/otlp/v1/metrics {{- end }} {{- if .Values.opentelemetry.instrumentation.enabled }} {{- "\n" }} diff --git a/charts/mlrun-ce/templates/_helpers.tpl b/charts/mlrun-ce/templates/_helpers.tpl index 08bc2499..a949c3e8 100644 --- a/charts/mlrun-ce/templates/_helpers.tpl +++ b/charts/mlrun-ce/templates/_helpers.tpl @@ -522,14 +522,10 @@ spec: timeout: 5s override: false exporters: - prometheus: - endpoint: 0.0.0.0:{{ .Values.opentelemetry.collector.prometheus.port }} - namespace: {{ .Values.opentelemetry.collector.prometheus.namespace }} - const_labels: - collector_mode: deployment - metrics_source: otel_collector - resource_to_telemetry_conversion: - enabled: true + otlphttp/prometheus: + endpoint: http://prometheus-operated.{{ 
.Release.Namespace }}.svc:9090/api/v1/otlp + tls: + insecure: true debug: verbosity: basic sampling_initial: 5 @@ -549,7 +545,7 @@ spec: - resourcedetection - batch exporters: - - prometheus + - otlphttp/prometheus - debug traces: receivers: diff --git a/charts/mlrun-ce/values.yaml b/charts/mlrun-ce/values.yaml index 8bb76629..246e0cab 100644 --- a/charts/mlrun-ce/values.yaml +++ b/charts/mlrun-ce/values.yaml @@ -594,42 +594,14 @@ kube-prometheus-stack: type: NodePort nodePort: 30020 prometheusSpec: - # Additional scrape configs for OpenTelemetry collector Deployment. - # In deployment mode, one collector pod runs per namespace and receives OTLP from all - # instrumented pods. Prometheus scrapes only the collector (port 8889), not individual pods. - additionalScrapeConfigs: - # Scrape the OTel Collector Deployment pod. - # Discovers pods with app.kubernetes.io/component=opentelemetry-collector label - # (applied automatically by the OTel operator to collector pods). - - job_name: 'otel-collector' - kubernetes_sd_configs: - - role: pod - relabel_configs: - # Only scrape the OTel collector pod - - source_labels: [__meta_kubernetes_pod_label_app_kubernetes_io_component] - action: keep - regex: opentelemetry-collector - # Use port 8889 (OTel prometheus exporter) - - source_labels: [__address__] - action: replace - regex: ([^:]+)(?::\d+)? 
- replacement: $1:8889 - target_label: __address__ - - target_label: __metrics_path__ - replacement: /metrics - # Propagate pod labels as metric labels - - action: labelmap - regex: __meta_kubernetes_pod_label_(.+) - - source_labels: [__meta_kubernetes_namespace] - action: replace - target_label: kubernetes_namespace - - source_labels: [__meta_kubernetes_pod_name] - action: replace - target_label: kubernetes_pod_name - metric_relabel_configs: - - action: replace - target_label: metrics_source - replacement: otel_collector + # Enable OTLP write receiver so the OTel collector can push metrics directly + # to Prometheus at /api/v1/otlp/v1/metrics instead of Prometheus scraping the collector. + # Prometheus v3 requires both the feature flag AND --web.enable-otlp-receiver. + enableFeatures: + - otlp-write-receiver + additionalArgs: + - name: web.enable-otlp-receiver + value: "" kube-state-metrics: fullnameOverride: state-metrics prometheus-node-exporter: @@ -765,6 +737,13 @@ spark: # ============================================================================= opentelemetry-operator: enabled: false + # CRDs are bootstrapped by the parent chart's crds/ directory (tiny stubs applied + # before any templates so the type is established immediately on fresh install). + # Disable sub-chart CRD rendering to avoid ownership conflicts with the crds/ stubs. + # The stubs use x-kubernetes-preserve-unknown-fields so the operator can still + # manage CRs; the operator's admission webhook handles CR validation. 
+ crds: + create: false # Admission webhooks configuration admissionWebhooks: certManager: @@ -843,12 +822,6 @@ opentelemetry: limits: cpu: 200m memory: 256Mi - # Prometheus exporter port for metrics - prometheus: - port: 8889 - # Metric prefix added to all OTEL-collected metrics - # Helps distinguish OTEL metrics from directly-scraped metrics - namespace: mlrun_otel # OTLP receiver configuration otlp: grpcPort: 4317 diff --git a/tests/package.sh b/tests/package.sh index fa8150ed..575c3546 100755 --- a/tests/package.sh +++ b/tests/package.sh @@ -61,6 +61,52 @@ with tempfile.TemporaryDirectory() as tmp: ) PYEOF +# Slim down the opentelemetry-operator sub-chart by replacing large conf/crds/ files +# with empty stubs. The CRDs are managed by the parent chart's crds/ directory instead +# (crds.create: false in values.yaml). Keeping the full 542 KB CRD files would push +# the Helm release Secret over the Kubernetes 3 MB API request limit. +echo "Slimming opentelemetry-operator conf/crds/ (replacing with empty stubs)..." +python3 - <<'PYEOF' +import tarfile, os, shutil, tempfile, io + +tgz = "charts/opentelemetry-operator-0.78.1.tgz" +if not os.path.exists(tgz): + print(f" {tgz} not found, skipping") + exit(0) + +# Stub content: preserves the {{- if .Values.crds.create }} guard so the template +# renders correctly (empty output) whether crds.create is true or false. 
+STUB = b"{{- if .Values.crds.create }}\n{{- end }}\n" + +crd_files = { + "opentelemetry-operator/conf/crds/crd-opentelemetrycollector.yaml", + "opentelemetry-operator/conf/crds/crd-opentelemetryinstrumentation.yaml", + "opentelemetry-operator/conf/crds/crd-opentelemetry.io_opampbridges.yaml", +} + +with tempfile.TemporaryDirectory() as tmp: + with tarfile.open(tgz, "r:gz") as t: + t.extractall(tmp) + + for rel in crd_files: + path = os.path.join(tmp, rel) + if os.path.exists(path): + orig = os.path.getsize(path) + with open(path, "wb") as f: + f.write(STUB) + print(f" {os.path.basename(rel)}: {orig} -> {len(STUB)} bytes") + else: + print(f" {rel} not found, skipping") + + import subprocess, os as _os + env = _os.environ.copy() + env["COPYFILE_DISABLE"] = "1" + subprocess.run( + ["tar", "czf", os.path.abspath(tgz), "opentelemetry-operator"], + cwd=tmp, env=env, check=True + ) +PYEOF + # Create MLRun CE tarball helm package . exit 0 From 3aa7420439f49aee098a07b0cc182b73506d8cf5 Mon Sep 17 00:00:00 2001 From: royischoss Date: Tue, 14 Apr 2026 10:15:37 +0300 Subject: [PATCH 16/23] change method to push to prometheus --- charts/mlrun-ce/README.md | 18 +----------------- charts/mlrun-ce/templates/_helpers.tpl | 2 +- charts/mlrun-ce/values.yaml | 2 ++ 3 files changed, 4 insertions(+), 18 deletions(-) diff --git a/charts/mlrun-ce/README.md b/charts/mlrun-ce/README.md index 69444ac7..f6b3b308 100644 --- a/charts/mlrun-ce/README.md +++ b/charts/mlrun-ce/README.md @@ -134,23 +134,7 @@ helm --namespace mlrun upgrade my-mlrun \ mlrun/mlrun-ce ``` -#### Split Installation (Admin/Non-Admin) - -For multi-tenant clusters, install the operator at the cluster level and the collector CRs in each user namespace: - -**Controller namespace (admin):** -```bash -helm --namespace controller install mlrun-controller \ - -f admin_installation_values.yaml \ - mlrun/mlrun-ce -``` - -**User namespace (non-admin):** -```bash -helm --namespace mlrun install my-mlrun \ - -f 
non_admin_installation_values.yaml \ - mlrun/mlrun-ce -``` +> **Note:** The above assumes a single-namespace installation. For multi-namespace (admin/non-admin) deployments, refer to the MLRun documentation. ### Working with ECR diff --git a/charts/mlrun-ce/templates/_helpers.tpl b/charts/mlrun-ce/templates/_helpers.tpl index a949c3e8..5918e769 100644 --- a/charts/mlrun-ce/templates/_helpers.tpl +++ b/charts/mlrun-ce/templates/_helpers.tpl @@ -494,7 +494,7 @@ metadata: {{- include "mlrun-ce.otel.labels" . | nindent 4 }} spec: mode: {{ .Values.opentelemetry.collector.mode }} - upgradeStrategy: automatic + upgradeStrategy: {{ .Values.opentelemetry.collector.upgradeStrategy }} managementState: managed image: {{ (index .Values "opentelemetry-operator").manager.collectorImage.repository }}:{{ (index .Values "opentelemetry-operator").manager.collectorImage.tag }} resources: diff --git a/charts/mlrun-ce/values.yaml b/charts/mlrun-ce/values.yaml index 246e0cab..1213c5a7 100644 --- a/charts/mlrun-ce/values.yaml +++ b/charts/mlrun-ce/values.yaml @@ -814,6 +814,8 @@ opentelemetry: fullnameOverride: "" # DEPLOYMENT mode - one collector pod per namespace, not injected as a sidecar mode: deployment + # Set to "none" to prevent automatic collector upgrades when the operator is upgraded + upgradeStrategy: automatic # Collector sidecar container resources resources: requests: From 36c4b3f56f2cbd31b4c708cb9a0012796ce57bd0 Mon Sep 17 00:00:00 2001 From: royischoss Date: Wed, 15 Apr 2026 09:55:41 +0300 Subject: [PATCH 17/23] remove labeling s3 and TimescaleDB fix jupyter bug. update documentation accordingly. 
add request and limit for crdReadinessJob and namespaceLabelJob --- charts/mlrun-ce/README.md | 2 +- charts/mlrun-ce/templates/NOTES.txt | 3 +- .../jupyter-notebook/deployment.yaml | 29 ++++++++++ .../opentelemetry/crd-readiness-job.yaml | 7 +++ .../opentelemetry/namespace-label.yaml | 7 +++ .../templates/timescaledb/statefulset.yaml | 3 - charts/mlrun-ce/values.yaml | 29 ++++++---- tests/helm-template-test.sh | 55 +++++-------------- 8 files changed, 80 insertions(+), 55 deletions(-) diff --git a/charts/mlrun-ce/README.md b/charts/mlrun-ce/README.md index f6b3b308..61aedbe3 100644 --- a/charts/mlrun-ce/README.md +++ b/charts/mlrun-ce/README.md @@ -119,7 +119,7 @@ Override those [in the normal methods](https://helm.sh/docs/chart_template_guide ### Configuring OpenTelemetry (Observability) -MLRun CE includes the OpenTelemetry Operator for collecting metrics and traces. When enabled, it deploys a single collector per namespace (deployment mode) — instrumented pods send OTLP data to the collector, which exports metrics to Prometheus on port 8889. All Python pods in the namespace are auto-instrumented, and the `mlrun.io/otel: "true"` label is applied to Jupyter, SeaweedFS, TimescaleDB, and Nuclio function pods for metric enrichment. +MLRun CE includes the OpenTelemetry Operator for collecting metrics and traces. When enabled, it deploys a single collector per namespace (deployment mode) — instrumented pods push OTLP data to the collector, which forwards metrics to Prometheus via the OTLP endpoint. Python auto-instrumentation is applied namespace-wide via a webhook, and the `mlrun.io/otel: "true"` label is applied to Jupyter and Nuclio function pods to mark them for metric enrichment and trigger OTel injection on restart. For a fresh install with OTel, see [Installing with OpenTelemetry Enabled](#installing-with-opentelemetry-enabled). 
diff --git a/charts/mlrun-ce/templates/NOTES.txt b/charts/mlrun-ce/templates/NOTES.txt index efb991bc..6ce2f2bd 100644 --- a/charts/mlrun-ce/templates/NOTES.txt +++ b/charts/mlrun-ce/templates/NOTES.txt @@ -161,7 +161,8 @@ Namespace OTel configuration: {{- end }} {{- if or .Values.opentelemetry.collector.enabled .Values.opentelemetry.instrumentation.enabled }} {{- "\n" }} -Pods labeled with mlrun.io/otel=true: Jupyter, SeaweedFS (master/volume/filer/s3/admin), TimescaleDB, and Nuclio function pods (via functionDefaults). +Pods labeled with mlrun.io/otel=true: Jupyter and Nuclio function pods (via functionDefaults). +These Python-based pods receive OTel auto-instrumentation (runtime metrics, traces, HTTP metrics for Nuclio functions). {{- end }} {{- end }} diff --git a/charts/mlrun-ce/templates/jupyter-notebook/deployment.yaml b/charts/mlrun-ce/templates/jupyter-notebook/deployment.yaml index 08108481..176cda96 100644 --- a/charts/mlrun-ce/templates/jupyter-notebook/deployment.yaml +++ b/charts/mlrun-ce/templates/jupyter-notebook/deployment.yaml @@ -74,7 +74,36 @@ spec: ports: - containerPort: 8888 name: http + {{- if and .Values.opentelemetry.collector.enabled .Values.opentelemetry.instrumentation.enabled }} + command: + - /bin/bash + - -c + - | + # Extract tar if needed (mirrors mlce-start.sh setup) + file_path="${HOME}/.intdata" + if [ ! -f "$file_path" ]; then + echo "Base data does not exist, extracting home backup..." + cd / && tar -xvf /tmp/basehome.tar + echo "1" > "$file_path" + fi + cd "${HOME}" + # Add OTel sitecustomize.py to PYTHONPATH so Jupyter's own Python + # process bootstraps OTel directly (bypassing start-notebook.py which + # strips the path to prevent re-instrumentation of subprocesses). 
+ OTEL_PATH=/otel-auto-instrumentation-python/opentelemetry/instrumentation/auto_instrumentation + if [ -d "$OTEL_PATH" ]; then + export PYTHONPATH="${OTEL_PATH}:${PYTHONPATH:-/otel-auto-instrumentation-python}" + fi + exec /opt/conda/bin/python3 /opt/conda/bin/jupyter-lab \ + --ip=0.0.0.0 \ + --port=8888 \ + --NotebookApp.token="" \ + --NotebookApp.password="" \ + --NotebookApp.allow_origin="*" \ + --NotebookApp.default_url=/lab + {{- else }} command: ["/bin/bash", "/usr/local/bin/mlce-start.sh"] + {{- end }} {{- with .Values.jupyterNotebook.nodeSelector }} nodeSelector: {{ toYaml . | nindent 8 }} {{- end }} diff --git a/charts/mlrun-ce/templates/opentelemetry/crd-readiness-job.yaml b/charts/mlrun-ce/templates/opentelemetry/crd-readiness-job.yaml index 94ce109c..bb4081c3 100644 --- a/charts/mlrun-ce/templates/opentelemetry/crd-readiness-job.yaml +++ b/charts/mlrun-ce/templates/opentelemetry/crd-readiness-job.yaml @@ -28,6 +28,13 @@ spec: containers: - name: cr-creator image: bitnami/kubectl:latest + resources: + requests: + cpu: {{ .Values.opentelemetry.crdReadinessJob.resources.requests.cpu }} + memory: {{ .Values.opentelemetry.crdReadinessJob.resources.requests.memory }} + limits: + cpu: {{ .Values.opentelemetry.crdReadinessJob.resources.limits.cpu }} + memory: {{ .Values.opentelemetry.crdReadinessJob.resources.limits.memory }} command: - /bin/bash - -c diff --git a/charts/mlrun-ce/templates/opentelemetry/namespace-label.yaml b/charts/mlrun-ce/templates/opentelemetry/namespace-label.yaml index 60cfb3b8..e82b7162 100644 --- a/charts/mlrun-ce/templates/opentelemetry/namespace-label.yaml +++ b/charts/mlrun-ce/templates/opentelemetry/namespace-label.yaml @@ -24,6 +24,13 @@ spec: containers: - name: label-namespace image: bitnami/kubectl:latest + resources: + requests: + cpu: {{ .Values.opentelemetry.namespaceLabelJob.resources.requests.cpu }} + memory: {{ .Values.opentelemetry.namespaceLabelJob.resources.requests.memory }} + limits: + cpu: {{ 
.Values.opentelemetry.namespaceLabelJob.resources.limits.cpu }} + memory: {{ .Values.opentelemetry.namespaceLabelJob.resources.limits.memory }} command: - /bin/sh - -c diff --git a/charts/mlrun-ce/templates/timescaledb/statefulset.yaml b/charts/mlrun-ce/templates/timescaledb/statefulset.yaml index 7001a93e..79f5a7f8 100644 --- a/charts/mlrun-ce/templates/timescaledb/statefulset.yaml +++ b/charts/mlrun-ce/templates/timescaledb/statefulset.yaml @@ -15,9 +15,6 @@ spec: metadata: labels: {{- include "mlrun-ce.timescaledb.selectorLabels" . | nindent 8 }} - {{- if and .Values.opentelemetry.collector.enabled .Values.opentelemetry.instrumentation.enabled }} - {{- include "mlrun-ce.otel.podLabels" . | nindent 8 }} - {{- end }} spec: {{- with .Values.timescaledb.nodeSelector }} nodeSelector: diff --git a/charts/mlrun-ce/values.yaml b/charts/mlrun-ce/values.yaml index 1213c5a7..bb680601 100644 --- a/charts/mlrun-ce/values.yaml +++ b/charts/mlrun-ce/values.yaml @@ -337,8 +337,6 @@ seaweedfs: # Master server - metadata management master: - podLabels: - mlrun.io/otel: "true" port: 9333 # Storage: use PVC instead of default hostPath data: @@ -352,8 +350,6 @@ seaweedfs: # Volume server - actual data storage volume: - podLabels: - mlrun.io/otel: "true" port: 8080 # Storage: use PVC instead of default hostPath dataDirs: @@ -369,8 +365,6 @@ seaweedfs: # Filer server - file system interface filer: - podLabels: - mlrun.io/otel: "true" port: 8888 # Storage: use PVC instead of default hostPath data: @@ -388,8 +382,6 @@ seaweedfs: # S3 API gateway - MLRun connects to this endpoint s3: - podLabels: - mlrun.io/otel: "true" enabled: true # Default is false port: 8333 enableAuth: true # Default is false @@ -403,8 +395,6 @@ seaweedfs: # Admin server - user and policy management UI admin: - podLabels: - mlrun.io/otel: "true" enabled: true # Default is false port: 23646 secret: @@ -868,3 +858,22 @@ opentelemetry: limits: cpu: 500m memory: 512Mi + + # CRD readiness job resources (kubectl-only 
container) + crdReadinessJob: + resources: + requests: + cpu: 50m + memory: 64Mi + limits: + cpu: 100m + memory: 128Mi + # Namespace label job resources (kubectl-only container) + namespaceLabelJob: + resources: + requests: + cpu: 50m + memory: 64Mi + limits: + cpu: 100m + memory: 128Mi diff --git a/tests/helm-template-test.sh b/tests/helm-template-test.sh index ccdef572..0617e192 100755 --- a/tests/helm-template-test.sh +++ b/tests/helm-template-test.sh @@ -134,8 +134,8 @@ test_otel_collector_default() { assert_contains "$output" "kind: Job" "Has correct kind" assert_contains "$output" "kind: OpenTelemetryCollector" "Job contains OpenTelemetryCollector CR" assert_contains "$output" "mode: deployment" "Uses deployment mode" - assert_contains "$output" "prometheus:" "Has Prometheus exporter" - assert_contains "$output" "endpoint: 0.0.0.0:8889" "Prometheus on port 8889" + assert_contains "$output" "otlphttp/prometheus:" "Has OTLP HTTP Prometheus exporter" + assert_contains "$output" "/api/v1/otlp" "Pushes to Prometheus OTLP endpoint" assert_contains "$output" "otlp:" "Has OTLP receiver" assert_contains "$output" "helm.sh/hook" "Has Helm hooks" assert_contains "$output" "post-install,post-upgrade" "Has correct hook triggers" @@ -151,6 +151,18 @@ test_otel_collector_disabled() { "CRD Readiness Job does not render when collector disabled (default)" } +test_otel_collector_upgrade_strategy() { + log_test "OpenTelemetry Collector - upgradeStrategy override" + + local output + output=$(render_template "templates/opentelemetry/crd-readiness-job.yaml" \ + --set global.registry.url=test.io \ + --set opentelemetry.collector.enabled=true \ + --set opentelemetry.collector.upgradeStrategy=none) + + assert_contains "$output" "upgradeStrategy: none" "upgradeStrategy can be overridden to none" +} + test_otel_collector_resources() { log_test "OpenTelemetry Collector - Custom resources" @@ -355,39 +367,6 @@ test_otel_operator_namespace_selector() { fi } -# 
============================================================================ -# Prometheus Integration Tests -# ============================================================================ - -test_prometheus_otel_scrape_config() { - log_test "Prometheus - OTEL scrape configuration" - - local output - output=$(render_all --set global.registry.url=test.io) - - # The scrape config is in a Secret as base64, extract and decode it - local secret_data - secret_data=$(echo "$output" | grep "additional-scrape-configs.yaml:" | head -1 | sed 's/.*: "//' | sed 's/"$//' || true) - - if [[ -n "$secret_data" ]]; then - local decoded - decoded=$(echo "$secret_data" | base64 -d 2>/dev/null || true) - - if echo "$decoded" | grep -q "otel-collector"; then - log_pass "Has OTEL collector scrape job" - else - log_fail "Has OTEL collector scrape job - not found in decoded config" - fi - - if echo "$decoded" | grep -q "opentelemetry-collector"; then - log_pass "Has collector pod label selector" - else - log_fail "Has collector pod label selector - not found in decoded config" - fi - else - log_fail "Prometheus scrape config secret not found" - fi -} # ============================================================================ # Full Chart Render Test @@ -425,6 +404,7 @@ main() { echo "========================================" test_otel_collector_default test_otel_collector_disabled + test_otel_collector_upgrade_strategy test_otel_collector_resources echo "" @@ -467,11 +447,6 @@ main() { test_namespace_label_with_instrumentation_annotation test_otel_operator_namespace_selector - echo "" - echo "========================================" - echo "Prometheus Integration Tests" - echo "========================================" - test_prometheus_otel_scrape_config echo "" echo "========================================" From a5e71ef49d7f801536249461aea4a4b890441715 Mon Sep 17 00:00:00 2001 From: royischoss Date: Wed, 15 Apr 2026 10:48:32 +0300 Subject: [PATCH 18/23] another jupyter timing fix 
--- .../templates/jupyter-notebook/deployment.yaml | 3 +++ .../opentelemetry/crd-readiness-job.yaml | 15 ++++++++------- 2 files changed, 11 insertions(+), 7 deletions(-) diff --git a/charts/mlrun-ce/templates/jupyter-notebook/deployment.yaml b/charts/mlrun-ce/templates/jupyter-notebook/deployment.yaml index 176cda96..aa1e43dd 100644 --- a/charts/mlrun-ce/templates/jupyter-notebook/deployment.yaml +++ b/charts/mlrun-ce/templates/jupyter-notebook/deployment.yaml @@ -5,6 +5,9 @@ metadata: name: {{ include "mlrun-ce.jupyter.fullname" . }} labels: {{- include "mlrun-ce.jupyter.labels" . | nindent 4 }} + {{- if and .Values.opentelemetry.collector.enabled .Values.opentelemetry.instrumentation.enabled }} + {{- include "mlrun-ce.otel.podLabels" . | nindent 4 }} + {{- end }} spec: replicas: 1 selector: diff --git a/charts/mlrun-ce/templates/opentelemetry/crd-readiness-job.yaml b/charts/mlrun-ce/templates/opentelemetry/crd-readiness-job.yaml index bb4081c3..78df75eb 100644 --- a/charts/mlrun-ce/templates/opentelemetry/crd-readiness-job.yaml +++ b/charts/mlrun-ce/templates/opentelemetry/crd-readiness-job.yaml @@ -63,16 +63,17 @@ spec: echo "Instrumentation CRD is available!" {{- end }} - # Wait a bit more for the operator to be fully ready - echo "Waiting for operator webhook to be ready..." - sleep 10 - {{- if .Values.opentelemetry.collector.enabled }} - # Create or update the OpenTelemetryCollector CR - echo "Creating/updating OpenTelemetryCollector CR..." - cat <<'EOF' | kubectl apply -f - + # Write the OpenTelemetryCollector CR to a temp file and retry applying it + # until the operator webhook is ready to accept it (fresh install timing fix). + cat > /tmp/collector-cr.yaml <<'EOF' {{- include "mlrun-ce.otel.collector.manifest" . | nindent 14 }} EOF + echo "Creating/updating OpenTelemetryCollector CR (with webhook readiness retry)..." + until kubectl apply -f /tmp/collector-cr.yaml 2>&1; do + echo "Webhook not ready yet, retrying in 5s..." 
+ sleep 5 + done echo "OpenTelemetryCollector CR created/updated!" {{- end }} From b15ee61cb99cafdefe317b8af3b5ce21838a3733 Mon Sep 17 00:00:00 2001 From: royischoss Date: Wed, 15 Apr 2026 11:12:11 +0300 Subject: [PATCH 19/23] remove redundant loop for crds check --- .../opentelemetry/crd-readiness-job.yaml | 25 +++---------------- 1 file changed, 3 insertions(+), 22 deletions(-) diff --git a/charts/mlrun-ce/templates/opentelemetry/crd-readiness-job.yaml b/charts/mlrun-ce/templates/opentelemetry/crd-readiness-job.yaml index 78df75eb..915b9248 100644 --- a/charts/mlrun-ce/templates/opentelemetry/crd-readiness-job.yaml +++ b/charts/mlrun-ce/templates/opentelemetry/crd-readiness-job.yaml @@ -41,28 +41,9 @@ spec: - | set -e - echo "Waiting for OpenTelemetry CRDs to be available..." - - # Wait for the OpenTelemetryCollector CRD - {{- if .Values.opentelemetry.collector.enabled }} - echo "Waiting for OpenTelemetryCollector CRD..." - until kubectl get crd opentelemetrycollectors.opentelemetry.io &>/dev/null; do - echo "Waiting for opentelemetrycollectors.opentelemetry.io CRD..." - sleep 5 - done - echo "OpenTelemetryCollector CRD is available!" - {{- end }} - - # Wait for the Instrumentation CRD - {{- if .Values.opentelemetry.instrumentation.enabled }} - echo "Waiting for Instrumentation CRD..." - until kubectl get crd instrumentations.opentelemetry.io &>/dev/null; do - echo "Waiting for instrumentations.opentelemetry.io CRD..." - sleep 5 - done - echo "Instrumentation CRD is available!" - {{- end }} - + # CRDs are guaranteed to exist — Helm applies charts/mlrun-ce/crds/ before + # any hooks run, so no polling needed here. + {{- if .Values.opentelemetry.collector.enabled }} # Write the OpenTelemetryCollector CR to a temp file and retry applying it # until the operator webhook is ready to accept it (fresh install timing fix). 
From 4f2bda3ae77b9abe570c41fe40685e64bce2654b Mon Sep 17 00:00:00 2001 From: royischoss Date: Wed, 15 Apr 2026 11:24:32 +0300 Subject: [PATCH 20/23] fix requirements.lock --- charts/mlrun-ce/requirements.lock | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/charts/mlrun-ce/requirements.lock b/charts/mlrun-ce/requirements.lock index 7561e0a0..adda8fc5 100644 --- a/charts/mlrun-ce/requirements.lock +++ b/charts/mlrun-ce/requirements.lock @@ -23,5 +23,5 @@ dependencies: - name: opentelemetry-operator repository: https://open-telemetry.github.io/opentelemetry-helm-charts version: 0.78.1 -digest: sha256:9f6ea4d6c60baabe3a9fb2a9c286f5c70a97bbf76ecba15ddaef7f39c56269ae -generated: "2026-03-25T11:50:15.589709+02:00" +digest: sha256:50ed77fd11e450e243c05eadac99857b4b0aae92ae73ca9a6c00fc1cdc726f70 +generated: "2026-04-15T11:23:19.249332+03:00" From 657cab342e305d62273c699fc0277cecabccd6e9 Mon Sep 17 00:00:00 2001 From: royischoss Date: Wed, 15 Apr 2026 11:26:57 +0300 Subject: [PATCH 21/23] fix rc version --- charts/mlrun-ce/Chart.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/charts/mlrun-ce/Chart.yaml b/charts/mlrun-ce/Chart.yaml index b91431ec..0edfd0d5 100644 --- a/charts/mlrun-ce/Chart.yaml +++ b/charts/mlrun-ce/Chart.yaml @@ -1,6 +1,6 @@ apiVersion: v1 name: mlrun-ce -version: 0.11.0-rc.31 +version: 0.11.0-rc.32 description: MLRun Open Source Stack home: https://iguazio.com icon: https://www.iguazio.com/wp-content/uploads/2019/10/Iguazio-Logo.png From 73c6aa17b2d8c0f2ccc844669ffa6b24452e3f5a Mon Sep 17 00:00:00 2001 From: royischoss Date: Wed, 15 Apr 2026 11:45:14 +0300 Subject: [PATCH 22/23] fix pin kubectl version in jobs, fix documentation for crds readiness, change naming for otel metrics using metadata.name fieldRef --- charts/mlrun-ce/templates/_helpers.tpl | 3 +-- .../mlrun-ce/templates/opentelemetry/crd-readiness-job.yaml | 6 +++--- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git 
a/charts/mlrun-ce/templates/_helpers.tpl b/charts/mlrun-ce/templates/_helpers.tpl index 5918e769..d207c879 100644 --- a/charts/mlrun-ce/templates/_helpers.tpl +++ b/charts/mlrun-ce/templates/_helpers.tpl @@ -586,7 +586,7 @@ spec: - name: OTEL_SERVICE_NAME valueFrom: fieldRef: - fieldPath: metadata.labels['app.kubernetes.io/name'] + fieldPath: metadata.name - name: OTEL_METRICS_EXPORTER value: otlp - name: OTEL_TRACES_EXPORTER @@ -616,7 +616,6 @@ spec: value: "true" {{- end }} {{- end }} -.. {{/* OTel pod label — marks a pod as OTel-monitored for metric enrichment and discovery. Namespace-level instrumentation annotation (set by namespace-label job) handles Python auto-instrumentation. diff --git a/charts/mlrun-ce/templates/opentelemetry/crd-readiness-job.yaml b/charts/mlrun-ce/templates/opentelemetry/crd-readiness-job.yaml index 915b9248..35c6b566 100644 --- a/charts/mlrun-ce/templates/opentelemetry/crd-readiness-job.yaml +++ b/charts/mlrun-ce/templates/opentelemetry/crd-readiness-job.yaml @@ -1,7 +1,7 @@ {{- if or .Values.opentelemetry.collector.enabled .Values.opentelemetry.instrumentation.enabled }} --- -# Job to wait for OpenTelemetry CRDs to be available and then create the CRs -# This solves the race condition between the operator starting and CR creation +# Job to create OpenTelemetry CRs after the operator webhook is ready to accept them. +# Retries kubectl apply until the webhook accepts the CR (fresh install timing fix). 
apiVersion: batch/v1 kind: Job metadata: @@ -27,7 +27,7 @@ spec: serviceAccountName: {{ .Release.Name }}-otel-cr-creator containers: - name: cr-creator - image: bitnami/kubectl:latest + image: bitnami/kubectl:1.32 resources: requests: cpu: {{ .Values.opentelemetry.crdReadinessJob.resources.requests.cpu }} From 28cf7bad305ba20b6f531b2dc67002e8cd0ef1cf Mon Sep 17 00:00:00 2001 From: royischoss Date: Wed, 15 Apr 2026 11:45:56 +0300 Subject: [PATCH 23/23] fix pin kubectl version in jobs, fix documentation for crds readiness, change naming for otel metrics using metadata.name fieldRef --- charts/mlrun-ce/templates/opentelemetry/namespace-label.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/charts/mlrun-ce/templates/opentelemetry/namespace-label.yaml b/charts/mlrun-ce/templates/opentelemetry/namespace-label.yaml index e82b7162..ee8bb5c7 100644 --- a/charts/mlrun-ce/templates/opentelemetry/namespace-label.yaml +++ b/charts/mlrun-ce/templates/opentelemetry/namespace-label.yaml @@ -23,7 +23,7 @@ spec: restartPolicy: Never containers: - name: label-namespace - image: bitnami/kubectl:latest + image: bitnami/kubectl:1.32 resources: requests: cpu: {{ .Values.opentelemetry.namespaceLabelJob.resources.requests.cpu }}