From 9229eaa3b8abd9b3279ab9f3a14d040035611d23 Mon Sep 17 00:00:00 2001 From: Yu Liu <60283975+LYZJU2019@users.noreply.github.com> Date: Fri, 24 Apr 2026 14:24:57 -0700 Subject: [PATCH 1/2] feat(tonic-xds): add OutlierDetectionConfig types (gRFC A50) Define the validated config types consumed by the outlier-detection algorithm: OutlierDetectionConfig with the global timing/percentage parameters, plus SuccessRateConfig and FailurePercentageConfig for the two ejection algorithms. This PR contains only the type definitions. Proto parsing from envoy.config.cluster.v3.OutlierDetection and the ClusterResource field land in a follow-up PR alongside the load-balancing-pipeline wiring, keeping the algorithm PR self-contained and easy to review. Refs: https://github.com/grpc/proposal/blob/master/A50-xds-outlier-detection.md --- tonic-xds/src/xds/resource/mod.rs | 1 + .../src/xds/resource/outlier_detection.rs | 114 ++++++++++++++++++ 2 files changed, 115 insertions(+) create mode 100644 tonic-xds/src/xds/resource/outlier_detection.rs diff --git a/tonic-xds/src/xds/resource/mod.rs b/tonic-xds/src/xds/resource/mod.rs index fcb3a9295..73cf459e4 100644 --- a/tonic-xds/src/xds/resource/mod.rs +++ b/tonic-xds/src/xds/resource/mod.rs @@ -13,6 +13,7 @@ pub(crate) mod cluster; pub(crate) mod endpoints; pub(crate) mod listener; +pub(crate) mod outlier_detection; pub(crate) mod route_config; pub(crate) use cluster::ClusterResource; diff --git a/tonic-xds/src/xds/resource/outlier_detection.rs b/tonic-xds/src/xds/resource/outlier_detection.rs new file mode 100644 index 000000000..0f2887913 --- /dev/null +++ b/tonic-xds/src/xds/resource/outlier_detection.rs @@ -0,0 +1,114 @@ +//! Outlier-detection configuration types (gRFC A50). +//! +//! These are the validated config inputs consumed by the outlier-detection +//! algorithm. Parsing them from `envoy.config.cluster.v3.OutlierDetection` +//! and exposing them on `ClusterResource` lands in a follow-up PR alongside +//! 
the wiring into the load-balancing pipeline. +//! +//! [gRFC A50]: https://github.com/grpc/proposal/blob/master/A50-xds-outlier-detection.md + +use std::time::Duration; + +/// Validated A50 outlier-detection configuration for a cluster. +#[derive(Debug, Clone, PartialEq, Eq)] +pub(crate) struct OutlierDetectionConfig { + /// How often the ejection sweep runs. + pub interval: Duration, + /// Base duration for a single ejection; actual ejection time is + /// `base_ejection_time * multiplier`, capped by `max_ejection_time`. + pub base_ejection_time: Duration, + /// Upper bound on `base_ejection_time * multiplier`. The spec guarantees + /// this is at least `base_ejection_time`. + pub max_ejection_time: Duration, + /// Maximum percentage of endpoints that may be ejected at any time (0-100). + pub max_ejection_percent: u32, + /// Success-rate ejection parameters. `None` if the algorithm is disabled. + pub success_rate: Option<SuccessRateConfig>, + /// Failure-percentage ejection parameters. `None` if the algorithm is + /// disabled. + pub failure_percentage: Option<FailurePercentageConfig>, +} + +/// Success-rate ejection parameters (gRFC A50). +#[derive(Debug, Clone, PartialEq, Eq)] +pub(crate) struct SuccessRateConfig { + /// Ejection threshold factor, scaled by 1000 (so `1900` means `1.9`). + /// An endpoint is a candidate for ejection when its success rate falls + /// below `mean - stdev * (stdev_factor / 1000.0)`. + pub stdev_factor: u32, + /// Probability (0-100) that a candidate is actually ejected. + pub enforcement_percentage: u32, + /// Minimum number of candidate endpoints required to run the algorithm. + pub minimum_hosts: u32, + /// Minimum number of requests an endpoint must have seen in the last + /// interval to be considered a candidate. + pub request_volume: u32, +} + +/// Failure-percentage ejection parameters (gRFC A50). 
+#[derive(Debug, Clone, PartialEq, Eq)] +pub(crate) struct FailurePercentageConfig { + /// Failure rate (0-100) at or above which an endpoint is a candidate + /// for ejection. + pub threshold: u32, + /// Probability (0-100) that a candidate is actually ejected. + pub enforcement_percentage: u32, + /// Minimum number of candidate endpoints required to run the algorithm. + pub minimum_hosts: u32, + /// Minimum number of requests an endpoint must have seen in the last + /// interval to be considered a candidate. + pub request_volume: u32, +} + +impl OutlierDetectionConfig { + /// True when at least one ejection algorithm is enabled and the detector + /// should do work. If false, the cluster can skip instantiating detection. + pub(crate) fn is_enabled(&self) -> bool { + self.success_rate.is_some() || self.failure_percentage.is_some() + } +} + +#[cfg(test)] +mod tests { + use super::*; + + fn base_config() -> OutlierDetectionConfig { + OutlierDetectionConfig { + interval: Duration::from_secs(10), + base_ejection_time: Duration::from_secs(30), + max_ejection_time: Duration::from_secs(300), + max_ejection_percent: 10, + success_rate: None, + failure_percentage: None, + } + } + + #[test] + fn is_enabled_false_when_both_algorithms_disabled() { + assert!(!base_config().is_enabled()); + } + + #[test] + fn is_enabled_true_when_success_rate_present() { + let mut c = base_config(); + c.success_rate = Some(SuccessRateConfig { + stdev_factor: 1900, + enforcement_percentage: 100, + minimum_hosts: 5, + request_volume: 100, + }); + assert!(c.is_enabled()); + } + + #[test] + fn is_enabled_true_when_failure_percentage_present() { + let mut c = base_config(); + c.failure_percentage = Some(FailurePercentageConfig { + threshold: 85, + enforcement_percentage: 100, + minimum_hosts: 5, + request_volume: 50, + }); + assert!(c.is_enabled()); + } +} From eafbe870fceef5302186fb5dc6bacc11edd592d6 Mon Sep 17 00:00:00 2001 From: Yu Liu <60283975+LYZJU2019@users.noreply.github.com> Date: Mon, 27 
Apr 2026 13:18:17 -0700 Subject: [PATCH 2/2] feat(tonic-xds): address review on OutlierDetectionConfig MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Trim module docstring; drop "lands in a follow-up PR" framing. - Note in the docstring why there is no `child_policy` yet (tonic-xds has only one balancer; the field will land alongside more balancers). - Rename `enforcement_percentage` → `enforcing_success_rate` / `enforcing_failure_percentage` to match the Envoy proto field names. - Introduce a local `Percentage(u8)` newtype with a fallible constructor and use it for `max_ejection_percent`, `enforcing_success_rate`, `threshold`, and `enforcing_failure_percentage` so the 0..=100 invariant is enforced through the type system. Add tests covering the constructor's range checks. --- .../src/xds/resource/outlier_detection.rs | 81 ++++++++++++++----- 1 file changed, 63 insertions(+), 18 deletions(-) diff --git a/tonic-xds/src/xds/resource/outlier_detection.rs b/tonic-xds/src/xds/resource/outlier_detection.rs index 0f2887913..a31fd6c60 100644 --- a/tonic-xds/src/xds/resource/outlier_detection.rs +++ b/tonic-xds/src/xds/resource/outlier_detection.rs @@ -1,14 +1,46 @@ -//! Outlier-detection configuration types (gRFC A50). +//! Validated configuration types for [gRFC A50] outlier detection. //! -//! These are the validated config inputs consumed by the outlier-detection -//! algorithm. Parsing them from `envoy.config.cluster.v3.OutlierDetection` -//! and exposing them on `ClusterResource` lands in a follow-up PR alongside -//! the wiring into the load-balancing pipeline. +//! [`OutlierDetectionConfig`] is the input to the outlier-detection +//! algorithm. The two sub-configs gate which ejection algorithms run. +//! +//! Note: A50 specifies outlier detection as a load-balancing policy +//! wrapping a `child_policy`. `tonic-xds` currently runs P2C as its only +//! load balancer and integrates outlier detection as a filter on the +//! 
`Discover` stream feeding it, so there is no `child_policy` field +//! here yet. It will be added when more balancers are supported. //! //! [gRFC A50]: https://github.com/grpc/proposal/blob/master/A50-xds-outlier-detection.md use std::time::Duration; +/// A 0–100 percentage. Construction is fallible; once held, every +/// `Percentage` is guaranteed to be in range, so the algorithm never +/// has to re-validate. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub(crate) struct Percentage(u8); + +impl Percentage { + /// Construct from a raw value, returning `Err` if it exceeds 100. + /// Accepts `u32` to match the proto wire type without forcing callers + /// to cast at every site. + pub(crate) fn new(value: u32) -> Result<Self, PercentageError> { + if value > 100 { + Err(PercentageError(value)) + } else { + Ok(Self(value as u8)) + } + } + + /// The contained value, in `0..=100`. + pub(crate) fn get(self) -> u8 { + self.0 + } +} + +#[derive(Debug, thiserror::Error, PartialEq, Eq)] +#[error("percentage must be in 0..=100, got {0}")] +pub(crate) struct PercentageError(u32); + /// Validated A50 outlier-detection configuration for a cluster. #[derive(Debug, Clone, PartialEq, Eq)] pub(crate) struct OutlierDetectionConfig { @@ -20,8 +52,8 @@ pub(crate) struct OutlierDetectionConfig { /// Upper bound on `base_ejection_time * multiplier`. The spec guarantees /// this is at least `base_ejection_time`. pub max_ejection_time: Duration, - /// Maximum percentage of endpoints that may be ejected at any time (0-100). - pub max_ejection_percent: u32, + /// Maximum percentage of endpoints that may be ejected at any time. + pub max_ejection_percent: Percentage, /// Success-rate ejection parameters. `None` if the algorithm is disabled. pub success_rate: Option<SuccessRateConfig>, /// Failure-percentage ejection parameters. `None` if the algorithm is @@ -36,8 +68,8 @@ pub(crate) struct SuccessRateConfig { /// An endpoint is a candidate for ejection when its success rate falls /// below `mean - stdev * (stdev_factor / 1000.0)`. 
pub stdev_factor: u32, - /// Probability (0-100) that a candidate is actually ejected. - pub enforcement_percentage: u32, + /// Probability that a candidate is actually ejected. + pub enforcing_success_rate: Percentage, /// Minimum number of candidate endpoints required to run the algorithm. pub minimum_hosts: u32, /// Minimum number of requests an endpoint must have seen in the last @@ -48,11 +80,11 @@ pub(crate) struct SuccessRateConfig { /// Failure-percentage ejection parameters (gRFC A50). #[derive(Debug, Clone, PartialEq, Eq)] pub(crate) struct FailurePercentageConfig { - /// Failure rate (0-100) at or above which an endpoint is a candidate - /// for ejection. - pub threshold: u32, - /// Probability (0-100) that a candidate is actually ejected. - pub enforcement_percentage: u32, + /// Failure rate at or above which an endpoint is a candidate for + /// ejection. + pub threshold: Percentage, + /// Probability that a candidate is actually ejected. + pub enforcing_failure_percentage: Percentage, /// Minimum number of candidate endpoints required to run the algorithm. 
pub minimum_hosts: u32, /// Minimum number of requests an endpoint must have seen in the last @@ -77,7 +109,7 @@ mod tests { interval: Duration::from_secs(10), base_ejection_time: Duration::from_secs(30), max_ejection_time: Duration::from_secs(300), - max_ejection_percent: 10, + max_ejection_percent: Percentage::new(10).unwrap(), success_rate: None, failure_percentage: None, } @@ -93,7 +125,7 @@ mod tests { let mut c = base_config(); c.success_rate = Some(SuccessRateConfig { stdev_factor: 1900, - enforcement_percentage: 100, + enforcing_success_rate: Percentage::new(100).unwrap(), minimum_hosts: 5, request_volume: 100, }); @@ -104,11 +136,24 @@ mod tests { fn is_enabled_true_when_failure_percentage_present() { let mut c = base_config(); c.failure_percentage = Some(FailurePercentageConfig { - threshold: 85, - enforcement_percentage: 100, + threshold: Percentage::new(85).unwrap(), + enforcing_failure_percentage: Percentage::new(100).unwrap(), minimum_hosts: 5, request_volume: 50, }); assert!(c.is_enabled()); } + + #[test] + fn percentage_accepts_zero_to_one_hundred() { + for v in [0, 1, 50, 99, 100] { + assert_eq!(Percentage::new(v).unwrap().get() as u32, v); + } + } + + #[test] + fn percentage_rejects_values_above_one_hundred() { + assert_eq!(Percentage::new(101), Err(PercentageError(101))); + assert_eq!(Percentage::new(u32::MAX), Err(PercentageError(u32::MAX))); + } }