diff --git a/Cargo.lock b/Cargo.lock index 631469b..1039ac1 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -94,6 +94,12 @@ dependencies = [ "windows-sys 0.59.0", ] +[[package]] +name = "anyhow" +version = "1.0.102" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7f202df86484c868dbad7eaa557ef785d5c66295e41b460ef922eca0723b842c" + [[package]] name = "async-broadcast" version = "0.7.1" @@ -1095,6 +1101,15 @@ version = "1.70.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7943c866cc5cd64cbc25b2e01621d07fa8eb2a1a23160ee81ce38704e97b8ecf" +[[package]] +name = "itertools" +version = "0.14.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2b192c782037fadd9cfa75548310488aabdbf3d2da73885b31bd0abd03351285" +dependencies = [ + "either", +] + [[package]] name = "itoa" version = "1.0.11" @@ -1328,6 +1343,15 @@ version = "0.4.22" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a7a70ba024b9dc04c27ea2f0c0548feb474ec5c54bba33a7f72f873a39d07b24" +[[package]] +name = "matchers" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d1525a2a28c7f4fa0fc98bb91ae755d1e2d1505079e05539e35bc876b5d65ae9" +dependencies = [ + "regex-automata", +] + [[package]] name = "memchr" version = "2.7.4" @@ -1467,6 +1491,57 @@ dependencies = [ "tracing", ] +[[package]] +name = "opentelemetry-http" +version = "0.31.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d7a6d09a73194e6b66df7c8f1b680f156d916a1a942abf2de06823dd02b7855d" +dependencies = [ + "async-trait", + "bytes", + "http", + "opentelemetry", + "reqwest", +] + +[[package]] +name = "opentelemetry-otlp" +version = "0.31.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7a2366db2dca4d2ad033cad11e6ee42844fd727007af5ad04a1730f4cb8163bf" +dependencies = [ + "http", + "opentelemetry", + "opentelemetry-http", + 
"opentelemetry-proto", + "opentelemetry_sdk", + "prost", + "reqwest", + "thiserror 2.0.18", + "tokio", + "tonic", + "tracing", +] + +[[package]] +name = "opentelemetry-proto" +version = "0.31.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a7175df06de5eaee9909d4805a3d07e28bb752c34cab57fa9cff549da596b30f" +dependencies = [ + "opentelemetry", + "opentelemetry_sdk", + "prost", + "tonic", + "tonic-prost", +] + +[[package]] +name = "opentelemetry-semantic-conventions" +version = "0.31.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e62e29dfe041afb8ed2a6c9737ab57db4907285d999ef8ad3a59092a36bdc846" + [[package]] name = "opentelemetry_sdk" version = "0.31.0" @@ -1658,6 +1733,29 @@ dependencies = [ "unicode-ident", ] +[[package]] +name = "prost" +version = "0.14.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d2ea70524a2f82d518bce41317d0fae74151505651af45faf1ffbd6fd33f0568" +dependencies = [ + "bytes", + "prost-derive", +] + +[[package]] +name = "prost-derive" +version = "0.14.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "27c6023962132f4b30eb4c172c91ce92d933da334c59c23cddee82358ddafb0b" +dependencies = [ + "anyhow", + "itertools", + "proc-macro2", + "quote", + "syn", +] + [[package]] name = "quote" version = "1.0.44" @@ -1749,6 +1847,7 @@ dependencies = [ "base64", "bytes", "encoding_rs", + "futures-channel", "futures-core", "futures-util", "h2", @@ -1772,7 +1871,7 @@ dependencies = [ "serde", "serde_json", "serde_urlencoded", - "sync_wrapper 1.0.1", + "sync_wrapper 1.0.2", "system-configuration", "tokio", "tokio-native-tls", @@ -1815,6 +1914,8 @@ dependencies = [ "kube", "kube-leader-election", "opentelemetry", + "opentelemetry-otlp", + "opentelemetry-semantic-conventions", "opentelemetry_sdk", "thiserror 2.0.18", "tikv-jemallocator", @@ -1822,6 +1923,7 @@ dependencies = [ "tokio-util", "tower-http", "tracing", + "tracing-opentelemetry", 
"tracing-subscriber", ] @@ -2170,9 +2272,9 @@ checksum = "2047c6ded9c721764247e62cd3b03c09ffc529b2ba5b10ec482ae507a4a70160" [[package]] name = "sync_wrapper" -version = "1.0.1" +version = "1.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a7065abeca94b6a8a577f9bd45aa0867a2238b74e8eb67cf10d492bc39351394" +checksum = "0bf256ce5efdfa370213c1dabab5935a12e49f2c58d15e9eac2870d3b4f27263" dependencies = [ "futures-core", ] @@ -2406,6 +2508,43 @@ dependencies = [ "tokio", ] +[[package]] +name = "tonic" +version = "0.14.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fec7c61a0695dc1887c1b53952990f3ad2e3a31453e1f49f10e75424943a93ec" +dependencies = [ + "async-trait", + "base64", + "bytes", + "http", + "http-body", + "http-body-util", + "hyper", + "hyper-timeout", + "hyper-util", + "percent-encoding", + "pin-project", + "sync_wrapper 1.0.2", + "tokio", + "tokio-stream", + "tower", + "tower-layer", + "tower-service", + "tracing", +] + +[[package]] +name = "tonic-prost" +version = "0.14.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a55376a0bbaa4975a3f10d009ad763d8f4108f067c7c2e74f3001fb49778d309" +dependencies = [ + "bytes", + "prost", + "tonic", +] + [[package]] name = "tower" version = "0.5.1" @@ -2414,7 +2553,9 @@ checksum = "2873938d487c3cfb9aed7546dc9f2711d867c9f90c46b889989a2cb84eba6b4f" dependencies = [ "futures-core", "futures-util", + "indexmap 2.6.0", "pin-project-lite", + "slab", "sync_wrapper 0.1.2", "tokio", "tokio-util", @@ -2498,16 +2639,36 @@ dependencies = [ "tracing-core", ] +[[package]] +name = "tracing-opentelemetry" +version = "0.32.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1ac28f2d093c6c477eaa76b23525478f38de514fa9aeb1285738d4b97a9552fc" +dependencies = [ + "js-sys", + "opentelemetry", + "smallvec", + "tracing", + "tracing-core", + "tracing-log", + "tracing-subscriber", + "web-time", +] + [[package]] name = 
"tracing-subscriber" version = "0.3.22" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2f30143827ddab0d256fd843b7a66d164e9f271cfa0dde49142c5ca0ca291f1e" dependencies = [ + "matchers", "nu-ansi-term", + "once_cell", + "regex-automata", "sharded-slab", "smallvec", "thread_local", + "tracing", "tracing-core", "tracing-log", ] @@ -2712,6 +2873,16 @@ dependencies = [ "wasm-bindgen", ] +[[package]] +name = "web-time" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5a6580f308b1fad9207618087a65c04e7a10bc77e02c8e84e9b00dd4b12fa0bb" +dependencies = [ + "js-sys", + "wasm-bindgen", +] + [[package]] name = "windows-core" version = "0.52.0" diff --git a/Cargo.toml b/Cargo.toml index 2c407f4..283558e 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -19,11 +19,14 @@ kube-leader-election = "0.43.0" thiserror = "2.0.3" opentelemetry = "0.31" opentelemetry_sdk = { version = "0.31", features = ["rt-tokio", "metrics"] } +opentelemetry-otlp = { version = "0.31", features = ["trace", "grpc-tonic"] } +opentelemetry-semantic-conventions = "0.31" tokio = { version = "1.41.1", features = ["macros", "rt-multi-thread", "signal", "net"] } tokio-util = "0.7" tower-http = { version = "0.6.6", features = ["util"] } tracing = "0.1.40" -tracing-subscriber = "0.3.18" +tracing-opentelemetry = "0.32" +tracing-subscriber = { version = "0.3.18", features = ["env-filter"] } [target.'cfg(not(target_env = "msvc"))'.dependencies] tikv-jemallocator = "0.6" diff --git a/docs/opentelemetry-tracing-analysis.md b/docs/opentelemetry-tracing-analysis.md new file mode 100644 index 0000000..bcb3a9f --- /dev/null +++ b/docs/opentelemetry-tracing-analysis.md @@ -0,0 +1,251 @@ +# OpenTelemetry Tracing Integration Trade-off Analysis + +**Issue:** #51 +**Date:** 2026-03-04 +**Status:** Analysis + +## Executive Summary + +This document analyzes the trade-offs of adding OpenTelemetry distributed tracing support to RobotLB. 
The project already uses OpenTelemetry for metrics and the `tracing` crate for structured logging, making the integration relatively straightforward. + +## Current State + +### Existing Observability Stack + +| Component | Implementation | Purpose | +|-----------|---------------|---------| +| Metrics | OpenTelemetry SDK 0.31 + Custom Prometheus Exporter | Operational metrics (reconcile ops, API calls, leader status) | +| Logging | `tracing` + `tracing-subscriber` | Structured logging with log levels | +| Tracing | `#[tracing::instrument]` (logs only) | Function-level instrumentation (logged, not exported) | + +### Current Dependencies (Cargo.toml) + +```toml +opentelemetry = "0.31" +opentelemetry_sdk = { version = "0.31", features = ["rt-tokio", "metrics"] } +tracing = "0.1.40" +tracing-subscriber = "0.3.18" +``` + +## Proposed Integration + +### Option A: Full OTLP Export (Recommended) + +Add distributed tracing with OTLP (OpenTelemetry Protocol) export to compatible backends (Jaeger, Tempo, SigNoz, etc.). + +**New Dependencies:** +```toml +opentelemetry-otlp = { version = "0.31", features = ["trace", "grpc-tonic"] } +opentelemetry-semantic-conventions = "0.31" +tracing-opentelemetry = "0.32" # Bridge tracing crate to OTel +``` + +**Architecture:** +``` +Application Code + ↓ + tracing crate + ↓ +tracing-opentelemetry (bridge layer) + ↓ +opentelemetry-otlp (exporter) + ↓ +OTLP Collector / Backend (Jaeger, Tempo, etc.) +``` + +### Option B: In-Process Jaeger Export + +Direct export to Jaeger without OTLP collector. + +**New Dependencies:** +```toml +opentelemetry-jaeger = "0.31" # Note: deprecated in favor of OTLP +tracing-opentelemetry = "0.32" +``` + +### Option C: Opt-in with Multiple Exporters + +Support multiple backends via configuration (OTLP, stdout for debugging). 
+ +## Trade-off Analysis + +### Benefits + +| Benefit | Impact | Description | +|---------|--------|-------------| +| **End-to-end visibility** | High | Trace requests from Kubernetes API through reconciliation to Hetzner API calls | +| **Performance debugging** | High | Identify slow reconciliations, API bottlenecks | +| **Error correlation** | High | Link errors across service boundaries | +| **Unified observability** | Medium | Same stack for metrics, logs, traces | +| **Existing foundation** | High | Already using `tracing` crate; minimal code changes | +| **Kubernetes ecosystem** | High | Standard in cloud-native deployments | +| **Vendor neutral** | Medium | OTLP works with Jaeger, Tempo, SigNoz, Datadog, etc. | + +### Costs + +| Cost | Impact | Description | +|------|--------|-------------| +| **Binary size increase** | Low | ~500KB-1MB additional | +| **Runtime overhead** | Low | ~1-3% CPU when enabled, negligible when disabled | +| **Dependency complexity** | Medium | 3-4 new crates | +| **Configuration burden** | Medium | New environment variables/options | +| **Operational requirements** | Medium | Need OTLP collector/backend | +| **Learning curve** | Low | Team familiar with tracing concepts | + +### Specific Considerations for RobotLB + +#### Where Tracing Adds Value + +1. **Reconciliation Flow** (`src/controller/mod.rs`) + - Trace full reconciliation lifecycle + - Identify which services take longest to reconcile + - Correlate Hetzner API calls with Kubernetes events + +2. **Hetzner API Calls** (`src/lb/api.rs`) + - Track API latency per operation + - Identify rate limiting or slow responses + - Attribute errors to specific operations + +3. **Leader Election** (`src/main.rs`) + - Trace election cycles + - Debug failover scenarios + +4. 
**Node Discovery** (`src/controller/nodes.rs`) + - Track time spent discovering target nodes + - Identify slow pod lookups + +#### Example Instrumentation Points + +```rust +// Already exists in controller/mod.rs +#[tracing::instrument(skip(svc, context), fields(service = svc.name_any()))] +pub async fn reconcile_service(...) -> RobotLBResult + +// Would benefit from spans in lb/api.rs +pub async fn create_load_balancer(...) -> RobotLBResult { + // tracing::info! emits events here, but no span is created or exported +} + +// Network operations in lb/api.rs +pub async fn attach_to_network(...) +``` + +### Configuration Options + +Recommended environment variables: + +```bash +# Enable/disable tracing (default: disabled) +ROBOTLB_TRACING_ENABLED=false + +# OTLP gRPC endpoint (e.g., http://tempo:4317); the exporter speaks gRPC only, so the OTLP/HTTP port 4318 will not work +ROBOTLB_OTLP_ENDPOINT=http://localhost:4317 + +# Sampling ratio (1.0 = all traces, 0.1 = 10%) +ROBOTLB_TRACING_SAMPLE_RATIO=1.0 + +# Service name for traces +ROBOTLB_SERVICE_NAME=robotlb +``` + +### Implementation Effort + +| Task | Effort | Description | +|------|--------|-------------| +| Add dependencies | 5 min | Update Cargo.toml | +| Create tracing module | 1-2 hours | Initialize OTLP exporter | +| Add configuration | 30 min | CLI args/env vars | +| Instrument key functions | 2-3 hours | Add spans where missing | +| Update Helm chart | 30 min | Add config options | +| Documentation | 1 hour | Update README/tutorial | +| **Total** | **5-7 hours** | | + +## Recommendations + +### Recommended Approach: Option A (Full OTLP Export) + +**Rationale:** +1. Already using OpenTelemetry for metrics +2. OTLP is the standard protocol with wide backend support +3. `tracing-opentelemetry` bridge requires minimal code changes +4. 
Disabled by default, so there is zero overhead unless it is enabled + +### Implementation Phases + +#### Phase 1: Core Integration (MVP) +- Add OTLP exporter with opt-in configuration +- Bridge existing `tracing` instrumentation +- Export traces when `ROBOTLB_TRACING_ENABLED=true` + +#### Phase 2: Enhanced Instrumentation +- Add spans for Hetzner API calls +- Add span attributes for load balancer operations +- Include Kubernetes resource metadata in spans + +#### Phase 3: Advanced Features +- Add span events for state changes +- Correlate traces with metrics +- Add baggage for cross-service correlation (if needed) + +### Security Considerations + +| Concern | Mitigation | +|---------|------------| +| Sensitive data in traces | Sanitize HCLOUD token from span attributes | +| Network exposure | Use internal OTLP endpoints only | +| Storage costs | Use sampling for high-traffic clusters | + +### Backward Compatibility + +- Tracing disabled by default (zero impact) +- No breaking changes to existing functionality +- Opt-in via environment variable + +## Comparison: With vs Without Tracing + +| Scenario | Without Tracing | With Tracing | +|----------|----------------|--------------| +| Debug slow reconciliation | Check logs, grep timestamps | Visual timeline with duration breakdown | +| API rate limit issues | Aggregate metrics only | Per-request latency distribution | +| Multi-service debugging | Correlate logs manually | Distributed trace across services | +| Performance regression | Compare metric averages | Identify specific slow operations | + +## Decision Matrix + +| Factor | Weight | No Action | Add Tracing | +|--------|--------|-----------|-------------| +| Debugging capability | 3 | 1 | 3 | +| Operational overhead | 2 | 3 | 2 | +| Implementation effort | 2 | 3 | 2 | +| Ecosystem alignment | 2 | 1 | 3 | +| **Weighted Score** | | **17** | **23** | + +## Conclusion + +**Recommendation: Proceed with OpenTelemetry tracing integration.** + +The benefits significantly outweigh the costs: 
+- Low implementation effort (~5-7 hours) +- Leverages existing `tracing` infrastructure +- Opt-in (disabled by default), so there is zero overhead for users who don't need it +- Aligns with Kubernetes/cloud-native best practices +- Provides critical debugging capabilities for production issues + +The project is well-positioned for this integration since it already uses the `tracing` crate extensively. Adding distributed tracing is primarily a configuration and exporter setup task rather than a major code refactoring effort. + +## Next Steps + +1. [ ] Approve integration approach +2. [ ] Create feature branch +3. [ ] Implement Phase 1 (Core Integration) +4. [ ] Add Helm chart values for tracing configuration +5. [ ] Update documentation +6. [ ] Create example Grafana Tempo/Jaeger deployment + +## References + +- [OpenTelemetry Rust Documentation](https://docs.rs/opentelemetry/) +- [tracing-opentelemetry Integration](https://docs.rs/tracing-opentelemetry/) +- [OTLP Specification](https://opentelemetry.io/docs/specs/otlp/) +- [Grafana Tempo](https://grafana.com/oss/tempo/) +- [Jaeger](https://www.jaegertracing.io/) diff --git a/helm/templates/deployment.yaml b/helm/templates/deployment.yaml index 69ccddf..6730896 100644 --- a/helm/templates/deployment.yaml +++ b/helm/templates/deployment.yaml @@ -38,12 +38,30 @@ spec: - /usr/local/bin/robotlb resources: {{- toYaml .Values.resources | nindent 12 }} - {{- with .Values.envs }} + {{- $hasEnvs := .Values.envs -}} + {{- $hasTracing := .Values.tracing.enabled -}} + {{- if or $hasEnvs $hasTracing }} env: + {{- with .Values.envs }} {{- range $key, $val := . 
}} - - name: {{ $key | quote }} - value: {{ $val | quote }} - {{ end -}} + - name: {{ $key | quote }} + value: {{ $val | quote }} + {{- end }} + {{- end }} + {{- if $hasTracing }} + - name: ROBOTLB_TRACING_ENABLED + value: "true" + - name: ROBOTLB_OTLP_ENDPOINT + value: {{ .Values.tracing.otlp.endpoint | default "http://localhost:4317" | quote }} + - name: ROBOTLB_TRACING_OTLP_PROTOCOL + value: {{ .Values.tracing.otlp.protocol | quote }} + - name: ROBOTLB_TRACING_SAMPLE_RATIO + value: {{ .Values.tracing.samplingRatio | quote }} + {{- if .Values.tracing.serviceName }} + - name: ROBOTLB_SERVICE_NAME + value: {{ .Values.tracing.serviceName | quote }} + {{- end }} + {{- end }} {{- end }} {{- with .Values.existingSecrets }} envFrom: diff --git a/helm/values.yaml b/helm/values.yaml index e8894d9..d2460e9 100644 --- a/helm/values.yaml +++ b/helm/values.yaml @@ -82,6 +82,9 @@ affinity: {} metrics: enabled: true + service: + enabled: true + annotations: {} serviceMonitor: enabled: false labels: {} @@ -98,3 +101,11 @@ metrics: enabled: false labels: {} annotations: {} + +tracing: + enabled: false + otlp: + endpoint: "" + protocol: grpc + samplingRatio: 1.0 + serviceName: "" diff --git a/src/config.rs b/src/config.rs index 3fa8fac..858f6da 100644 --- a/src/config.rs +++ b/src/config.rs @@ -7,6 +7,7 @@ use clap::Parser; use tracing::level_filters::LevelFilter; #[derive(Debug, Clone, Parser)] +#[allow(clippy::struct_excessive_bools)] pub struct OperatorConfig { /// `HCloud` API token. #[arg(short = 't', long, env = "ROBOTLB_HCLOUD_TOKEN")] @@ -105,6 +106,26 @@ pub struct OperatorConfig { // Log level of the operator. #[arg(long, env = "ROBOTLB_LOG_LEVEL", default_value = "INFO")] pub log_level: LevelFilter, + + // Enable distributed tracing via OpenTelemetry. + #[arg(long, env = "ROBOTLB_TRACING_ENABLED", default_value = "false")] + pub tracing_enabled: bool, + + // OTLP endpoint for trace export (e.g., http://tempo:4317). 
+ #[arg( + long, + env = "ROBOTLB_OTLP_ENDPOINT", + default_value = "http://localhost:4317" + )] + pub otlp_endpoint: String, + + // Sampling ratio for traces (1.0 = all, 0.1 = 10%). + #[arg(long, env = "ROBOTLB_TRACING_SAMPLE_RATIO", default_value = "1.0")] + pub tracing_sample_ratio: f64, + + // Service name for traces. + #[arg(long, env = "ROBOTLB_SERVICE_NAME", default_value = "robotlb")] + pub service_name: String, } #[cfg(test)] @@ -134,6 +155,10 @@ mod tests { assert_eq!(config.leader_election_lease_ttl_secs, 15); assert_eq!(config.leader_election_renew_interval_secs, 5); assert_eq!(config.log_level, LevelFilter::INFO); + assert!(!config.tracing_enabled); + assert_eq!(config.otlp_endpoint, "http://localhost:4317"); + assert!((config.tracing_sample_ratio - 1.0).abs() < f64::EPSILON); + assert_eq!(config.service_name, "robotlb"); } #[test] @@ -168,6 +193,13 @@ mod tests { "10", "--log-level", "DEBUG", + "--tracing-enabled", + "--otlp-endpoint", + "http://tempo:4317", + "--tracing-sample-ratio", + "0.5", + "--service-name", + "robotlb-prod", ]) .expect("config should parse"); @@ -186,5 +218,9 @@ mod tests { assert_eq!(config.leader_election_lease_ttl_secs, 30); assert_eq!(config.leader_election_renew_interval_secs, 10); assert_eq!(config.log_level, LevelFilter::DEBUG); + assert!(config.tracing_enabled); + assert_eq!(config.otlp_endpoint, "http://tempo:4317"); + assert!((config.tracing_sample_ratio - 0.5).abs() < f64::EPSILON); + assert_eq!(config.service_name, "robotlb-prod"); } } diff --git a/src/error.rs b/src/error.rs index b0dc649..0b597d6 100644 --- a/src/error.rs +++ b/src/error.rs @@ -91,4 +91,6 @@ pub enum RobotLBError { HcloudListLoadBalancersError( #[from] hcloud::apis::Error, ), + #[error("{0}")] + Generic(String), } diff --git a/src/lb/config.rs b/src/lb/config.rs index 6dee778..35b4298 100644 --- a/src/lb/config.rs +++ b/src/lb/config.rs @@ -137,6 +137,10 @@ mod tests { leader_election_lease_ttl_secs: 15, leader_election_renew_interval_secs: 5, 
log_level: LevelFilter::INFO, + tracing_enabled: false, + otlp_endpoint: "http://localhost:4317".to_string(), + tracing_sample_ratio: 1.0, + service_name: "robotlb".to_string(), } } diff --git a/src/main.rs b/src/main.rs index 693bed2..3caaaec 100644 --- a/src/main.rs +++ b/src/main.rs @@ -50,6 +50,7 @@ pub mod health; pub mod label_filter; pub mod lb; pub mod metrics; +pub mod otel_tracing; pub mod prometheus_exporter; /// Shared context for the operator. @@ -95,9 +96,10 @@ static GLOBAL: tikv_jemallocator::Jemalloc = tikv_jemallocator::Jemalloc; async fn main() -> RobotLBResult<()> { dotenvy::dotenv().ok(); let operator_config = OperatorConfig::parse(); - tracing_subscriber::fmt() - .with_max_level(operator_config.log_level) - .init(); + + let mut tracing_guard = otel_tracing::init_tracing(&operator_config).map_err(|e| { + crate::error::RobotLBError::Generic(format!("Failed to initialize tracing: {e}")) + })?; let mut hcloud_conf = HCloudConfig::new(); hcloud_conf.bearer_access_token = Some(operator_config.hcloud_token.clone()); @@ -165,6 +167,7 @@ async fn main() -> RobotLBResult<()> { _ = controller_task => {} } + tracing_guard.shutdown(); tracing::info!("Shutdown complete"); Ok(()) } diff --git a/src/otel_tracing.rs b/src/otel_tracing.rs new file mode 100644 index 0000000..a45c570 --- /dev/null +++ b/src/otel_tracing.rs @@ -0,0 +1,71 @@ +//! OpenTelemetry tracing initialization. +//! +//! This module provides distributed tracing support via OpenTelemetry Protocol (OTLP). +//! Tracing is opt-in and disabled by default for zero overhead. 
+ +use crate::config::OperatorConfig; +use opentelemetry::{global, trace::TracerProvider}; +use opentelemetry_otlp::WithExportConfig; +use opentelemetry_sdk::{ + Resource, + trace::{self, Sampler, SdkTracerProvider}, +}; +use tracing_subscriber::{Layer, layer::SubscriberExt, util::SubscriberInitExt}; +pub struct TracingGuard { + tracer_provider: Option, +} +impl TracingGuard { + #[must_use] + pub const fn empty() -> Self { + Self { + tracer_provider: None, + } + } + pub fn shutdown(&mut self) { + if let Some(provider) = self.tracer_provider.take() { + let _ = provider.shutdown(); + } + } +} +/// Initializes the OpenTelemetry tracing subscriber. +/// +/// # Errors +/// +/// Returns an error if the OTLP span exporter fails to build. +pub fn init_tracing(config: &OperatorConfig) -> Result { + let fmt_layer = tracing_subscriber::fmt::layer().with_filter(config.log_level); + if !config.tracing_enabled { + tracing_subscriber::registry().with(fmt_layer).init(); + return Ok(TracingGuard::empty()); + } + let resource = Resource::builder() + .with_service_name(config.service_name.clone()) + .build(); + let exporter = opentelemetry_otlp::SpanExporter::builder() + .with_tonic() + .with_endpoint(format!("{}/v1/traces", config.otlp_endpoint)) + .build() + .map_err(|e| trace::TraceError::from(e.to_string()))?; + let sampler = if (config.tracing_sample_ratio - 1.0).abs() < f64::EPSILON { + Sampler::AlwaysOn + } else { + Sampler::TraceIdRatioBased(config.tracing_sample_ratio) + }; + let tracer_provider = SdkTracerProvider::builder() + .with_resource(resource) + .with_sampler(sampler) + .with_batch_exporter(exporter) + .build(); + let tracer = tracer_provider.tracer("robotlb"); + global::set_tracer_provider(tracer_provider.clone()); + let telemetry_layer = tracing_opentelemetry::layer() + .with_tracer(tracer) + .with_filter(config.log_level); + tracing_subscriber::registry() + .with(fmt_layer) + .with(telemetry_layer) + .init(); + Ok(TracingGuard { + tracer_provider: 
Some(tracer_provider), + }) +}