From f061c1329ca9e86f033684dcd788fe8e5f5f39f6 Mon Sep 17 00:00:00 2001 From: omnomburp Date: Sat, 27 Jun 2026 13:38:59 +0800 Subject: [PATCH 1/2] Fix slow decodes "poisoning" sleep times when using power throttling --- ds4.c | 35 +++++++++++++++++++++++++++++++++-- 1 file changed, 33 insertions(+), 2 deletions(-) diff --git a/ds4.c b/ds4.c index 640511eb0..2b7783aa9 100644 --- a/ds4.c +++ b/ds4.c @@ -10485,6 +10485,8 @@ typedef struct { uint32_t power_percent; double prefill_layer_avg_sec[DS4_MAX_LAYER]; double decode_token_avg_sec; + double decode_power_warmup_fastest_sec; + uint32_t decode_power_warmup_tokens; uint32_t streaming_preload_experts; bool quality; bool ssd_streaming; @@ -10525,9 +10527,33 @@ static void graph_power_note_prefill_layer(ds4_gpu_graph *g, } static void graph_power_note_decode_token(ds4_gpu_graph *g, double elapsed_sec) { + const uint32_t warmup_tokens = 5u; + const double outlier_mult = 4.0; + if (!graph_power_throttle_enabled(g)) return; + if (elapsed_sec <= 0.0 || !isfinite(elapsed_sec)) return; + + if (g->decode_power_warmup_tokens < warmup_tokens) { + if (g->decode_power_warmup_tokens == 0 || + elapsed_sec < g->decode_power_warmup_fastest_sec) { + g->decode_power_warmup_fastest_sec = elapsed_sec; + } + g->decode_power_warmup_tokens++; + return; + } + + double sample = elapsed_sec; + if (g->decode_token_avg_sec <= 0.0 || !isfinite(g->decode_token_avg_sec)) { + if (g->decode_power_warmup_fastest_sec < sample) { + sample = g->decode_power_warmup_fastest_sec; + } + } else { + const double cap = g->decode_token_avg_sec * outlier_mult; + if (sample > cap) sample = cap; + } + g->decode_token_avg_sec = - graph_power_update_avg(g->decode_token_avg_sec, elapsed_sec); + graph_power_update_avg(g->decode_token_avg_sec, sample); graph_power_sleep(g->decode_token_avg_sec, g->power_percent); } @@ -26158,7 +26184,12 @@ int ds4_session_set_power(ds4_session *s, int power_percent) { if (!s || !s->engine || power_percent < 1 || power_percent > 100) return 1; s->engine->power_percent = power_percent; #ifndef DS4_NO_GPU - if (!ds4_session_is_cpu(s)) s->graph.power_percent = (uint32_t)power_percent; + if (!ds4_session_is_cpu(s)) { + s->graph.power_percent = (uint32_t)power_percent; + s->graph.decode_token_avg_sec = 0.0; + s->graph.decode_power_warmup_fastest_sec = 0.0; + s->graph.decode_power_warmup_tokens = 0; + } #endif return 0; } From 5e86cc498541f32366c38977ad4573579319a174 Mon Sep 17 00:00:00 2001 From: omnomburp Date: Thu, 2 Jul 2026 12:00:00 +0800 Subject: [PATCH 2/2] Throttle decode on a median sample window instead of a warmup-capped average Replace the decode power-throttle EMA plus warmup/outlier-cap heuristics with the median of the last 16 token times kept in a fixed ring buffer. A decode token can stall for reasons unrelated to steady-state work: scheduler preemption, first-touch page faults, GPU clock ramps after the throttle's own sleeps, or SSD expert streaming when enabled. Such one-off stalls previously inflated the average and the sleeps applied to the fast tokens that follow. The warmup only protected the first tokens of a graph's lifetime, and the 4x cap still let a burst of slow tokens ratchet the average up at every request. The median is unmoved by up to 7 slow samples out of 16, needs no warmup or tunables, and still adopts a genuine sustained slowdown once it fills half the window. The window is a power of two so the ring index is a mask; with an even count the upper middle sample is used, erring toward slightly longer sleeps and staying under the power target. --- ds4.c | 60 ++++++++++++++++++++++++++++++----------------------------- 1 file changed, 31 insertions(+), 29 deletions(-) diff --git a/ds4.c b/ds4.c index 2b7783aa9..441fa6779 100644 --- a/ds4.c +++ b/ds4.c @@ -10309,6 +10309,11 @@ static void print_vec_stats(const char *name, const float *x, uint64_t n) { enum { DS4_STREAMING_PREFILL_CACHE_SEED_MAX_TOKENS = 64 }; +/* Power of two so the ring index is a mask. With an even count the upper + * middle sample is used as the median, erring toward slightly longer sleeps + * (stays under the power target). */ +enum { DS4_POWER_DECODE_WINDOW = 16 }; + typedef struct { /* One-token decode tensors. These stay allocated for the life of a * session; a generated token enters as an embedding in cur_hc and leaves as @@ -10484,9 +10489,8 @@ typedef struct { float directional_steering_ffn_scale; uint32_t power_percent; double prefill_layer_avg_sec[DS4_MAX_LAYER]; - double decode_token_avg_sec; - double decode_power_warmup_fastest_sec; - uint32_t decode_power_warmup_tokens; + double decode_power_sample_sec[DS4_POWER_DECODE_WINDOW]; + uint32_t decode_power_sample_count; uint32_t streaming_preload_experts; bool quality; bool ssd_streaming; @@ -10526,35 +10530,35 @@ static void graph_power_note_prefill_layer(ds4_gpu_graph *g, graph_power_sleep(g->prefill_layer_avg_sec[il], g->power_percent); } +/* Throttle decode on the median of a recent window rather than a running + * average: a token can stall for reasons unrelated to steady-state work + * (scheduler preemption, first-touch page faults, GPU clock ramps after + * throttle sleeps, SSD expert streaming when enabled), and one-off stalls + * must not inflate the sleeps applied to the fast tokens that follow. A + * sustained slowdown still throttles proportionally once it fills half + * the window. */ static void graph_power_note_decode_token(ds4_gpu_graph *g, double elapsed_sec) { - const uint32_t warmup_tokens = 5u; - const double outlier_mult = 4.0; - if (!graph_power_throttle_enabled(g)) return; if (elapsed_sec <= 0.0 || !isfinite(elapsed_sec)) return; - if (g->decode_power_warmup_tokens < warmup_tokens) { - if (g->decode_power_warmup_tokens == 0 || - elapsed_sec < g->decode_power_warmup_fastest_sec) { - g->decode_power_warmup_fastest_sec = elapsed_sec; - } - g->decode_power_warmup_tokens++; - return; - } - - double sample = elapsed_sec; - if (g->decode_token_avg_sec <= 0.0 || !isfinite(g->decode_token_avg_sec)) { - if (g->decode_power_warmup_fastest_sec < sample) { - sample = g->decode_power_warmup_fastest_sec; + g->decode_power_sample_sec[g->decode_power_sample_count & + (DS4_POWER_DECODE_WINDOW - 1u)] = elapsed_sec; + g->decode_power_sample_count++; + + uint32_t n = g->decode_power_sample_count; + if (n > DS4_POWER_DECODE_WINDOW) n = DS4_POWER_DECODE_WINDOW; + double sorted[DS4_POWER_DECODE_WINDOW]; + memcpy(sorted, g->decode_power_sample_sec, n * sizeof(double)); + for (uint32_t i = 1; i < n; i++) { + const double v = sorted[i]; + uint32_t j = i; + while (j > 0 && sorted[j - 1] > v) { + sorted[j] = sorted[j - 1]; + j--; } - } else { - const double cap = g->decode_token_avg_sec * outlier_mult; - if (sample > cap) sample = cap; + sorted[j] = v; } - - g->decode_token_avg_sec = - graph_power_update_avg(g->decode_token_avg_sec, sample); - graph_power_sleep(g->decode_token_avg_sec, g->power_percent); + graph_power_sleep(sorted[n / 2], g->power_percent); } /* Release every Metal tensor owned by the whole-model graph runtime. */ @@ -26186,9 +26190,7 @@ int ds4_session_set_power(ds4_session *s, int power_percent) { #ifndef DS4_NO_GPU if (!ds4_session_is_cpu(s)) { s->graph.power_percent = (uint32_t)power_percent; - s->graph.decode_token_avg_sec = 0.0; - s->graph.decode_power_warmup_fastest_sec = 0.0; - s->graph.decode_power_warmup_tokens = 0; + s->graph.decode_power_sample_count = 0; } #endif return 0;