diff --git a/6.16-nap-v0.4.0.patch b/6.16-nap-v0.5.0.patch similarity index 72% rename from 6.16-nap-v0.4.0.patch rename to 6.16-nap-v0.5.0.patch index 9b40d91..3db1e3b 100644 --- a/6.16-nap-v0.4.0.patch +++ b/6.16-nap-v0.5.0.patch @@ -1,22 +1,24 @@ -From 1d2e8272f288fecce3fd7f762fb8c628ed04b7fe Mon Sep 17 00:00:00 2001 +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 From: Masahito S -Date: Wed, 15 Apr 2026 08:37:01 +0900 -Subject: [PATCH] 6.16 backport: 6.18.3-nap-v0.4.0 +Date: Fri, 5 Jun 2026 13:10:05 +0900 +Subject: [PATCH] 6.16 backport: 6.18.3-nap-v0.5.0 -Backport of NAP cpuidle governor to Linux 6.16. -No functional changes except added RESIDENCY_THRESHOLD_NS definition. +Backport of NAP cpuidle governor v0.5.0 to Linux 6.16. + +Functional changes from v0.4.0 are preserved; 6.16 compatibility keeps +the RESIDENCY_THRESHOLD_NS fallback definition used by the previous +backport. -Signed-off-by: Masahito S --- drivers/cpuidle/Kconfig | 17 + drivers/cpuidle/governors/Makefile | 1 + - drivers/cpuidle/governors/nap/Makefile | 29 + - drivers/cpuidle/governors/nap/nap.c | 671 ++++++++++++++++++++ - drivers/cpuidle/governors/nap/nap.h | 283 +++++++++ - drivers/cpuidle/governors/nap/nap_fpu.c | 572 +++++++++++++++++ - drivers/cpuidle/governors/nap/nap_nn_avx2.c | 135 ++++ - drivers/cpuidle/governors/nap/nap_nn_sse2.c | 136 ++++ - 8 files changed, 1844 insertions(+) + drivers/cpuidle/governors/nap/Makefile | 30 + + drivers/cpuidle/governors/nap/nap.c | 623 ++++++++++++++++++++ + drivers/cpuidle/governors/nap/nap.h | 291 ++++++++++ + drivers/cpuidle/governors/nap/nap_fpu.c | 528 +++++++++++++++++ + drivers/cpuidle/governors/nap/nap_nn_avx2.c | 135 +++++ + drivers/cpuidle/governors/nap/nap_nn_sse2.c | 136 +++++ + 8 files changed, 1761 insertions(+) create mode 100644 drivers/cpuidle/governors/nap/Makefile create mode 100644 drivers/cpuidle/governors/nap/nap.c create mode 100644 drivers/cpuidle/governors/nap/nap.h @@ -63,10 +65,10 @@ index 
63abb5393a..ae688891c0 100644 +obj-$(CONFIG_CPU_IDLE_GOV_NAP) += nap/ diff --git a/drivers/cpuidle/governors/nap/Makefile b/drivers/cpuidle/governors/nap/Makefile new file mode 100644 -index 0000000000..8c4a17d8e2 +index 0000000000..8b85a475a6 --- /dev/null +++ b/drivers/cpuidle/governors/nap/Makefile -@@ -0,0 +1,33 @@ +@@ -0,0 +1,30 @@ +# SPDX-License-Identifier: GPL-2.0-only +# +# Makefile for the NAP cpuidle governor @@ -90,30 +92,28 @@ index 0000000000..8c4a17d8e2 +FPU_KILL_FLAGS := -mno-sse -mno-sse2 -mno-mmx -mno-avx -mno-3dnow \ + -mno-sse4a -msoft-float -mno-80387 -mno-fp-ret-in-387 + -+# LTO FIX: Disables LTO on standalone files to prevent intrusive inlining -+# of FPU instructions and ensure that flags are preserved during linking. -+CFLAGS_REMOVE_nap.o += $(CC_FLAGS_LTO) -+CFLAGS_REMOVE_nap_fpu.o += $(CC_FLAGS_LTO) $(FPU_KILL_FLAGS) -+CFLAGS_REMOVE_nap_nn_sse2.o += $(CC_FLAGS_LTO) $(FPU_KILL_FLAGS) -+CFLAGS_REMOVE_nap_nn_avx2.o += $(CC_FLAGS_LTO) $(FPU_KILL_FLAGS) ++CFLAGS_REMOVE_nap_fpu.o += $(FPU_KILL_FLAGS) ++CFLAGS_REMOVE_nap_nn_sse2.o += $(FPU_KILL_FLAGS) ++CFLAGS_REMOVE_nap_nn_avx2.o += $(FPU_KILL_FLAGS) + +CFLAGS_nap_fpu.o += $(CC_FLAGS_FPU) +CFLAGS_nap_nn_sse2.o += $(CC_FLAGS_FPU) +CFLAGS_nap_nn_avx2.o += $(CC_FLAGS_FPU) -mavx -mavx2 -mfma diff --git a/drivers/cpuidle/governors/nap/nap.c b/drivers/cpuidle/governors/nap/nap.c new file mode 100644 -index 0000000000..c72b67e9c3 +index 0000000000..fc7393e9f4 --- /dev/null +++ b/drivers/cpuidle/governors/nap/nap.c -@@ -0,0 +1,672 @@ +@@ -0,0 +1,623 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * nap.c — Neural Adaptive Predictor cpuidle governor + * -+ * A machine-learning-based cpuidle governor that uses a small MLP (8→8→1) -+ * with 3 Mixture-of-Experts (short/long/deep) to predict a log2 correction -+ * factor for sleep_length. State selection is deterministic threshold -+ * comparison. 
Weights are Xavier-initialized at boot, then refined via ++ * A machine-learning-based cpuidle governor that uses a small MLP trunk and an ++ * ordinal survival head to predict, per idle-state boundary, the probability ++ * that the upcoming idle reaches that state's target_residency. The decision ++ * layer picks the deepest feasible state whose calibrated survival meets a ++ * confidence level. Weights are Xavier-initialized at boot, then refined via + * online learning (deferred backpropagation with SGD). + * + * IMPORTANT: This file is compiled WITHOUT FPU/SSE flags (normal kernel @@ -126,6 +126,7 @@ index 0000000000..c72b67e9c3 + +#include +#include ++#include +#include +#include +#include @@ -149,16 +150,18 @@ index 0000000000..c72b67e9c3 +#define CPUIDLE_NAP_PROGNAME "Nap CPUIdle Governor" +#define CPUIDLE_NAP_AUTHOR "Masahito Suzuki" + -+#define CPUIDLE_NAP_VERSION "0.4.0" ++#define CPUIDLE_NAP_VERSION "0.5.0" + +/* Governor defaults */ +#define NAP_DEFAULT_LR_MILLTHS 1 /* 0.001 = 1 millths */ +#define NAP_DEFAULT_INTERVAL 4 /* learn every 4 reflects */ +#define NAP_DEFAULT_CLAMP_MILLTHS 1000 /* 1.0 = 1000 millths */ -+#define NAP_DEFAULT_PCTL_MILLTHS 100 /* 10th percentile */ ++#define NAP_DEFAULT_CONF_MILLTHS 500 /* 0.5 = balanced survival confidence */ + -+/* Backport: RESIDENCY_THRESHOLD_NS was missing in original patch */ ++/* Backport: RESIDENCY_THRESHOLD_NS is not available in Linux 6.16. 
*/ ++#ifndef RESIDENCY_THRESHOLD_NS +#define RESIDENCY_THRESHOLD_NS TICK_NSEC ++#endif + +/* ================================================================ + * ISA dispatch via static keys (definitions only; dispatch in nap_fpu.c) @@ -194,7 +197,6 @@ index 0000000000..c72b67e9c3 + d->hist_idx = (d->hist_idx + 1) % NAP_HISTORY_SIZE; + if (d->hist_count < NAP_HISTORY_SIZE) + d->hist_count++; -+ +} + +static void nap_update_external_signals(struct nap_cpu_data *d) @@ -206,14 +208,32 @@ index 0000000000..c72b67e9c3 + * Governor callbacks + * ================================================================ */ + ++static int nap_fallback_heuristic(struct cpuidle_driver *drv, ++ struct cpuidle_device *dev) ++{ ++ s64 latency_req = cpuidle_governor_latency_req(dev->cpu); ++ ktime_t delta_tick; ++ u64 sleep_length_ns; ++ int i; ++ ++ sleep_length_ns = ktime_to_ns(tick_nohz_get_sleep_length(&delta_tick)); ++ ++ for (i = drv->state_count - 1; i > 0; i--) { ++ if (dev->states_usage[i].disable) ++ continue; ++ if (drv->states[i].exit_latency_ns > latency_req) ++ continue; ++ if (drv->states[i].target_residency_ns > sleep_length_ns) ++ continue; ++ return i; ++ } ++ return 0; ++} ++ +/* -+ * Return the shallowest C-state index that is both enabled and -+ * satisfies the current latency request. Returns 0 if no such -+ * state exists (caller must treat 0 as "POLL is the only option"). -+ * -+ * Called from the short-circuit path to decide whether the predicted -+ * sleep length is worth entering any C-state at all. Does not -+ * consult the NN. ++ * Return the shallowest enabled C-state that satisfies the current ++ * latency request, or 0 if none exists (POLL is the only option). ++ * Does not consult the NN. + */ +static int nap_find_min_valid_state(struct cpuidle_driver *drv, + struct cpuidle_device *dev, @@ -232,23 +252,16 @@ index 0000000000..c72b67e9c3 +} + +/* -+ * Cached wrapper around nap_find_min_valid_state(). -+ * -+ * Invalidation triggers: -+ * 1. 
latency_req changed since last cached value (immediate; PM QoS -+ * updates propagate on the next nap_select call). -+ * 2. NAP_MIN_STATE_REFRESH_JIFFIES elapsed since last refresh -+ * (bounded staleness for sysfs-driven or runtime-driver state -+ * disable events, which are rare). -+ * -+ * Hot path cost when the cache is valid: ~5-7 cycles (one s64 -+ * compare, one time_after() check, one conditional return). The -+ * uncached loop runs at most once per HZ jiffies per CPU. ++ * Cached wrapper around nap_find_min_valid_state(). Invalidated when ++ * latency_req changes (immediate PM QoS propagation) or every ++ * NAP_MIN_STATE_REFRESH_JIFFIES (bounded staleness for rare sysfs / ++ * runtime-driver state-disable events). Hot-path cost when valid: ++ * one s64 compare plus one time_after() check. + */ +static inline int nap_get_min_valid_state(struct nap_cpu_data *d, -+ struct cpuidle_driver *drv, -+ struct cpuidle_device *dev, -+ s64 latency_req) ++ struct cpuidle_driver *drv, ++ struct cpuidle_device *dev, ++ s64 latency_req) +{ + if (unlikely(latency_req != d->cached_min_state_latency || + time_after(jiffies, @@ -263,22 +276,12 @@ index 0000000000..c72b67e9c3 +} + +/* -+ * Compute dev->poll_limit_ns for the short-circuit path. -+ * -+ * Budget = predicted wake time (sleep_length) + 1 µs safety margin. -+ * The margin absorbs timer jitter so a wake arriving slightly after -+ * the predicted time does not trigger a select/enter/reflect retry -+ * cycle. It is consumed only when the wake is actually late; on-time -+ * and early wakes exit POLL via need_resched without touching the -+ * margin. -+ * -+ * Floor: NAP_POLL_LIMIT_MIN_NS (1 µs). Below this, per-iteration -+ * governor overhead exceeds actual polling, and POLL's own timeout -+ * sampling granularity (~1.3 µs via POLL_IDLE_RELAX_COUNT cpu_relax -+ * iterations) makes smaller limits indistinguishable in practice. -+ * -+ * Ceiling: min_state.target_residency_ns. 
Beyond that point, the -+ * C-state would have been a better choice than polling. ++ * Compute dev->poll_limit_ns for the short-circuit path: predicted ++ * wake time plus a 1 us margin (absorbs timer jitter so a slightly ++ * late wake does not retrigger select/enter/reflect), floored at ++ * NAP_POLL_LIMIT_MIN_NS and capped at the min state's target ++ * residency (beyond which the C-state would have been the better ++ * choice). + */ +static inline u64 nap_compute_poll_limit(u64 sleep_length_ns, + u64 min_state_target_ns) @@ -290,28 +293,6 @@ index 0000000000..c72b67e9c3 + min_state_target_ns); +} + -+static int nap_fallback_heuristic(struct cpuidle_driver *drv, -+ struct cpuidle_device *dev) -+{ -+ s64 latency_req = cpuidle_governor_latency_req(dev->cpu); -+ ktime_t delta_tick; -+ u64 sleep_length_ns; -+ int i; -+ -+ sleep_length_ns = ktime_to_ns(tick_nohz_get_sleep_length(&delta_tick)); -+ -+ for (i = drv->state_count - 1; i > 0; i--) { -+ if (dev->states_usage[i].disable) -+ continue; -+ if (drv->states[i].exit_latency_ns > latency_req) -+ continue; -+ if (drv->states[i].target_residency_ns > sleep_length_ns) -+ continue; -+ return i; -+ } -+ return 0; -+} -+ +static int nap_select(struct cpuidle_driver *drv, + struct cpuidle_device *dev, + bool *stop_tick) @@ -327,20 +308,17 @@ index 0000000000..c72b67e9c3 + + latency_req = cpuidle_governor_latency_req(dev->cpu); + sleep_length_ns = ktime_to_ns(tick_nohz_get_sleep_length(&delta_tick)); -+ + min_state = nap_get_min_valid_state(d, drv, dev, latency_req); + + /* + * Fast path: when no C-state can amortize its target residency + * within the predicted sleep length, the answer is deterministically -+ * POLL. Skip NN inference and feature extraction entirely. -+ * nap_reflect also skips history update and learning for -+ * short-circuited events (see the short_circuited check there). -+ * See spec §3.1. ++ * POLL. 
Skip NN inference and feature extraction entirely; ++ * nap_reflect also skips the feedback path for short-circuited ++ * events (see the short_circuited check there). + */ + if (min_state == 0 || + sleep_length_ns < drv->states[min_state].target_residency_ns) { -+ + if (min_state > 0) + dev->poll_limit_ns = nap_compute_poll_limit( + sleep_length_ns, @@ -356,7 +334,6 @@ index 0000000000..c72b67e9c3 + return 0; + } + -+ /* Normal NN-driven path */ + d->short_circuited = false; + + if (likely(may_use_simd())) { @@ -389,12 +366,10 @@ index 0000000000..c72b67e9c3 + return; + + /* -+ * Short-circuited POLL: NN was not invoked for this idle -+ * event, so the residency does not belong to the NN's -+ * training distribution. Update the aggregate residency -+ * statistic and return — history, hit_intercept, prediction -+ * error, external signals, and learning are all skipped. -+ * See spec §3.4. ++ * Short-circuited POLL: the NN was not invoked for this idle, so ++ * the residency is not part of its training distribution and must ++ * not feed the floor histogram or the weight update. Account only ++ * the aggregate residency and return. + */ + if (d->short_circuited) { + d->stats.total_residency_ns += measured_ns; @@ -406,20 +381,22 @@ index 0000000000..c72b67e9c3 + d->last_prediction_error = d->last_predicted_ns - (s64)measured_ns; + nap_update_external_signals(d); + ++ /* Every idle provides a fresh residency for the floor and reliability EMAs */ ++ d->learn_actual_ns = measured_ns; ++ d->have_sample = true; ++ + /* -+ * Dual gate: learn when both the per-N-reflect counter fires -+ * AND at least learn_jiffies_min jiffies have elapsed since -+ * the last learning step. The time gate prevents sustained -+ * weight churn on workloads with very rapid idle bursts; a -+ * value of 0 disables it (restores the original counter-only -+ * behavior). See spec §3.5. 
++ * Throttle the expensive trunk/score weight update with a dual ++ * gate: the per-N-reflect counter AND a jiffies floor. The time ++ * gate caps the learning rate on workloads with very rapid idle ++ * bursts (e.g. cross-CPU ping-pong); learn_jiffies_min == 0 ++ * disables it and restores counter-only behavior. + */ + if (++d->learn_counter >= d->learn_interval && + time_after_eq(jiffies, + d->last_learn_jiffies + d->learn_jiffies_min)) { + d->learn_counter = 0; + d->last_learn_jiffies = jiffies; -+ d->learn_actual_ns = measured_ns; + d->needs_learn = true; + } + @@ -436,19 +413,6 @@ index 0000000000..c72b67e9c3 + memset(d, 0, sizeof(*d)); + + /* -+ * Force first-call refresh of the min-valid-state cache. -+ * cached_min_state_latency = S64_MIN ensures the first -+ * nap_select() comparison will always trip the invalidation -+ * branch regardless of the actual latency_req value. -+ * cached_min_state itself is already zeroed by the memset above. -+ */ -+ d->cached_min_state_latency = S64_MIN; -+ d->cached_min_state_jiffies = jiffies - NAP_MIN_STATE_REFRESH_JIFFIES; -+ -+ /* Default: allow at most one learning step per jiffy */ -+ d->learn_jiffies_min = 1; -+ -+ /* + * Defer weight initialization to the first nap_select() FPU path + * via reset_pending. nap_enable() is called from cpuidle core + * (cpuidle_enable_device) which may run on a different CPU than @@ -459,7 +423,17 @@ index 0000000000..c72b67e9c3 + d->learning_rate_millths = NAP_DEFAULT_LR_MILLTHS; + d->learn_interval = NAP_DEFAULT_INTERVAL; + d->max_grad_norm_millths = NAP_DEFAULT_CLAMP_MILLTHS; -+ d->overshoot_pctl_millths = NAP_DEFAULT_PCTL_MILLTHS; ++ d->conf_millths = NAP_DEFAULT_CONF_MILLTHS; ++ ++ /* ++ * Force a first-call refresh of the min-valid-state cache: ++ * cached_min_state_latency = S64_MIN guarantees the first ++ * nap_select() comparison trips the invalidation branch. 
++ */ ++ d->cached_min_state_latency = S64_MIN; ++ d->cached_min_state_jiffies = jiffies - NAP_MIN_STATE_REFRESH_JIFFIES; ++ d->learn_jiffies_min = 1; ++ + d->reset_pending = true; + + return 0; @@ -472,7 +446,7 @@ index 0000000000..c72b67e9c3 +} + +/* ================================================================ -+ * sysfs interface (/sys/devices/system/cpu/nap/) ++ * sysfs interface (/sys/devices/system/cpu/cpuidle/nap/) + * ================================================================ */ + +static ssize_t stats_show(struct kobject *kobj, @@ -556,34 +530,6 @@ index 0000000000..c72b67e9c3 + return count; +} + -+static ssize_t learn_jiffies_min_show(struct kobject *kobj, -+ struct kobj_attribute *attr, char *buf) -+{ -+ int cpu; -+ -+ cpu = cpumask_first(cpu_online_mask); -+ if (cpu >= nr_cpu_ids) -+ return sysfs_emit(buf, "0\n"); -+ return sysfs_emit(buf, "%u\n", -+ per_cpu(nap_data, cpu).learn_jiffies_min); -+} -+ -+static ssize_t learn_jiffies_min_store(struct kobject *kobj, -+ struct kobj_attribute *attr, -+ const char *buf, size_t count) -+{ -+ unsigned int val; -+ int cpu; -+ -+ if (kstrtouint(buf, 10, &val) || val > HZ * 3600) -+ return -EINVAL; -+ -+ for_each_online_cpu(cpu) -+ per_cpu(nap_data, cpu).learn_jiffies_min = val; -+ -+ return count; -+} -+ +static ssize_t reset_weights_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) @@ -639,8 +585,14 @@ index 0000000000..c72b67e9c3 + return count; +} + -+static ssize_t overshoot_pctl_show(struct kobject *kobj, -+ struct kobj_attribute *attr, char *buf) ++/* ++ * confidence: decision confidence level in millths (1..999, default 500). ++ * Higher demands more certainty before entering a deeper state, biasing toward ++ * responsiveness (shallower); lower biases toward energy (deeper). This is the ++ * single responsiveness dial and replaces the former overshoot_pctl target. 
++ */ ++static ssize_t confidence_show(struct kobject *kobj, ++ struct kobj_attribute *attr, char *buf) +{ + int cpu; + @@ -648,21 +600,21 @@ index 0000000000..c72b67e9c3 + if (cpu >= nr_cpu_ids) + return sysfs_emit(buf, "0\n"); + return sysfs_emit(buf, "%u\n", -+ per_cpu(nap_data, cpu).overshoot_pctl_millths); ++ per_cpu(nap_data, cpu).conf_millths); +} + -+static ssize_t overshoot_pctl_store(struct kobject *kobj, -+ struct kobj_attribute *attr, -+ const char *buf, size_t count) ++static ssize_t confidence_store(struct kobject *kobj, ++ struct kobj_attribute *attr, ++ const char *buf, size_t count) +{ + unsigned int val; + int cpu; + -+ if (kstrtouint(buf, 10, &val) || val > 500) ++ if (kstrtouint(buf, 10, &val) || val == 0 || val >= 1000) + return -EINVAL; + + for_each_online_cpu(cpu) -+ per_cpu(nap_data, cpu).overshoot_pctl_millths = val; ++ per_cpu(nap_data, cpu).conf_millths = val; + + return count; +} @@ -682,15 +634,14 @@ index 0000000000..c72b67e9c3 + return sysfs_emit(buf, "sse2\n"); +} + -+static struct kobj_attribute version_attr = __ATTR_RO(version); -+static struct kobj_attribute simd_attr = __ATTR_RO(simd); -+static struct kobj_attribute stats_attr = __ATTR_RO(stats); -+static struct kobj_attribute learning_rate_attr = __ATTR_RW(learning_rate); -+static struct kobj_attribute learn_interval_attr = __ATTR_RW(learn_interval); -+static struct kobj_attribute learn_jiffies_min_attr = __ATTR_RW(learn_jiffies_min); -+static struct kobj_attribute overshoot_pctl_attr = __ATTR_RW(overshoot_pctl); -+static struct kobj_attribute reset_weights_attr = __ATTR_WO(reset_weights); -+static struct kobj_attribute reset_stats_attr = __ATTR_WO(reset_stats); ++static struct kobj_attribute version_attr = __ATTR_RO(version); ++static struct kobj_attribute simd_attr = __ATTR_RO(simd); ++static struct kobj_attribute stats_attr = __ATTR_RO(stats); ++static struct kobj_attribute learning_rate_attr = __ATTR_RW(learning_rate); ++static struct kobj_attribute learn_interval_attr = 
__ATTR_RW(learn_interval); ++static struct kobj_attribute confidence_attr = __ATTR_RW(confidence); ++static struct kobj_attribute reset_weights_attr = __ATTR_WO(reset_weights); ++static struct kobj_attribute reset_stats_attr = __ATTR_WO(reset_stats); + +static struct attribute *nap_attrs[] = { + &version_attr.attr, @@ -698,8 +649,7 @@ index 0000000000..c72b67e9c3 + &stats_attr.attr, + &learning_rate_attr.attr, + &learn_interval_attr.attr, -+ &learn_jiffies_min_attr.attr, -+ &overshoot_pctl_attr.attr, ++ &confidence_attr.attr, + &reset_weights_attr.attr, + &reset_stats_attr.attr, + NULL, @@ -780,10 +730,10 @@ index 0000000000..c72b67e9c3 +postcore_initcall(nap_init); diff --git a/drivers/cpuidle/governors/nap/nap.h b/drivers/cpuidle/governors/nap/nap.h new file mode 100644 -index 0000000000..1059db983b +index 0000000000..0f6aae7d17 --- /dev/null +++ b/drivers/cpuidle/governors/nap/nap.h -@@ -0,0 +1,283 @@ +@@ -0,0 +1,291 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef NAP_H +#define NAP_H @@ -798,30 +748,39 @@ index 0000000000..1059db983b + +#define NAP_INPUT_SIZE 8 +#define NAP_HIDDEN_SIZE 8 -+#define NAP_NUM_EXPERTS 3 ++#define NAP_NUM_CUTS (CPUIDLE_STATE_MAX - 1) + +/* -+ * Neural network weight structure for an 8→8→1 MLP (scalar regression). ++ * Neural network weights for an 8-input MLP with an ordinal survival head. + * -+ * The NN outputs a single log2 correction factor applied to sleep_length: -+ * effective_sleep = exp2(log2(sleep_length) + nn_output) -+ * State selection is then deterministic: pick the deepest state whose -+ * cost (target_residency + exit_latency) ≤ effective_sleep. ++ * The trunk maps input[8] → hidden[8] (ReLU), feeding a shared linear score ++ * s = w_out . hidden + b_out ++ * which is the input to a proportional-odds ordinal head. For each idle-state ++ * boundary k the predicted survival probability that the upcoming idle reaches ++ * that state's target_residency is ++ * q_k = sigmoid(s - thr_ord[k-1]). 
++ * With ordered thresholds this represents the idle-duration distribution at ++ * exactly the points the decision needs (the sufficient statistic), rather ++ * than a single point estimate. The decision layer compares q_k against a ++ * calibrated confidence level (see nap_fpu_select()). + * + * Column-major storage: w_h1[j][i] = weight from input j to hidden neuron i. + * This layout enables efficient column-wise matrix-vector products where + * each input broadcasts across all hidden neurons via SIMD FMA. + * -+ * __aligned(32) ensures AVX2 vmovaps (32-byte aligned) loads work -+ * correctly. 8 floats = 32 bytes = one ymm register. ++ * thr_ord is appended after the SIMD-accessed fields so their offsets are ++ * unchanged. __aligned(32) ensures AVX2 vmovaps (32-byte) aligned loads ++ * work correctly (8 floats = 32 bytes = one ymm register). + */ +struct nap_weights { + /* Hidden layer: input[8] → hidden[8] */ + float w_h1[NAP_INPUT_SIZE][NAP_HIDDEN_SIZE]; /* 64 params */ + float b_h1[NAP_HIDDEN_SIZE]; /* 8 params */ -+ /* Output layer: hidden[8] → 1 scalar */ ++ /* Shared score head: hidden[8] → scalar s */ + float w_out[NAP_HIDDEN_SIZE]; /* 8 params */ + float b_out; /* 1 param */ ++ /* Ordinal survival head: one ordered threshold per state boundary */ ++ float thr_ord[NAP_NUM_CUTS]; +} __aligned(32); + +/* ISA-specific forward pass implementations */ @@ -829,6 +788,7 @@ index 0000000000..1059db983b + float *hidden_save, const struct nap_weights *w); +void nap_nn_forward_avx2(const float *input, float *output, + float *hidden_save, const struct nap_weights *w); ++ +/* ISA-specific online learning (backpropagation) */ +struct nap_cpu_data; +void nap_nn_learn_sse2(struct nap_cpu_data *d); @@ -955,20 +915,18 @@ index 0000000000..1059db983b + * POLL short-circuit tunables + * ================================================================ */ + -+/* Minimum and safety-margin values for dev->poll_limit_ns written -+ * by nap_compute_poll_limit(). 
Both are 1 µs: the POLL state -+ * itself checks its timeout only every ~1 µs (POLL_IDLE_RELAX_COUNT -+ * cpu_relax() iterations in drivers/cpuidle/poll_state.c), so -+ * finer-grained values would not produce distinguishable behavior. ++/* dev->poll_limit_ns floor and safety margin written by ++ * nap_compute_poll_limit(). Both 1 us: the POLL state samples its ++ * own timeout only every ~1 us (POLL_IDLE_RELAX_COUNT cpu_relax() ++ * iterations in poll_state.c), so finer values are indistinguishable. + */ +#define NAP_POLL_LIMIT_MIN_NS 1000ULL +#define NAP_POLL_LIMIT_MARGIN_NS 1000ULL + -+/* Refresh interval for the cached minimum-valid-state lookup. -+ * HZ jiffies (= 1 second) bounds the staleness window caused by -+ * sysfs-driven or runtime-driver state disable events. PM QoS -+ * latency changes are detected immediately via the cached -+ * latency_req comparison. ++/* Refresh interval for the cached minimum-valid-state lookup. HZ ++ * jiffies (1 s) bounds staleness from sysfs/runtime state-disable ++ * events; PM QoS latency changes are detected immediately via the ++ * cached latency_req comparison. 
+ */ +#define NAP_MIN_STATE_REFRESH_JIFFIES HZ + @@ -991,21 +949,21 @@ index 0000000000..1059db983b + s64 last_predicted_ns; + s64 last_prediction_error; + -+ /* Short-circuit fast path (§3.1, §3.2, §3.4 of spec) */ ++ /* POLL short-circuit fast path */ + bool short_circuited; /* set in select, read in reflect */ + int cached_min_state; /* cached shallowest valid state */ + s64 cached_min_state_latency; /* latency_req when cache populated */ + unsigned long cached_min_state_jiffies; /* jiffies when cache populated */ + -+ /* Jiffies-based learning rate floor (§3.5 of spec) */ ++ /* Jiffies-based learning rate floor */ + unsigned long last_learn_jiffies; -+ unsigned int learn_jiffies_min; /* sysfs-tunable, 0 = disabled */ ++ unsigned int learn_jiffies_min; /* 0 = disabled */ + + /* select/reflect handoff */ + int last_selected_idx; + -+ /* NN scalar output: log2 correction factor for sleep_length. -+ * effective_sleep = exp2(log2(sleep_length) + nn_output). ++ /* Shared ordinal score s (≈ log2 of the predicted idle duration in ns). ++ * Survival at boundary k is sigmoid(s - thr_ord[k-1]). + */ + float nn_output; + @@ -1015,38 +973,38 @@ index 0000000000..1059db983b + * nap_extract_features(): + * SSE2: movaps (16-byte aligned) + * AVX2: vmovaps (32-byte aligned) -+ * Without __aligned(64), the natural struct offset would be ++ * Without __aligned(32), the natural struct offset would be + * only 4-byte aligned, causing #GP faults in the idle task. + */ + float hidden_out[NAP_HIDDEN_SIZE] __aligned(32); + float features_f32[NAP_INPUT_SIZE] __aligned(32); + + /* Backprop scratch */ -+ float learn_d_out; /* output gradient direction (±1) */ -+ float learn_lr; /* effective lr (base_lr * asymmetric weight) */ ++ float learn_d_out; /* score gradient g = sum_k (q_k - y_k) */ ++ float learn_lr; /* effective learning rate (symmetric) */ + float learn_d_hid[NAP_HIDDEN_SIZE] __aligned(32); + -+ /* Precomputed per-state log2(target_residency) for threshold selection. 
-+ * log2_cost[i] = log2(target_residency_ns). ++ /* Precomputed per-state log2 thresholds. ++ * log2_tres[i] = log2(target_residency_ns) (ordinal thresholds, timer clamp) + */ -+ float log2_cost[CPUIDLE_STATE_MAX]; ++ float log2_tres[CPUIDLE_STATE_MAX]; ++ ++ /* Decayed per-bin idle histogram: robustness-floor survival estimate */ ++ float bin_count[CPUIDLE_STATE_MAX]; + + /* Deferred learning data */ + bool needs_learn; -+ bool output_clamped; /* true if nn_output was clamped to features[0] */ ++ bool have_sample; /* a fresh residency awaits per-idle processing */ + u64 learn_actual_ns; + -+ /* Mixture-of-Experts: 3 experts × 8 neurons each */ -+ struct nap_weights expert_weights[NAP_NUM_EXPERTS]; -+ struct nap_weights *active_w; /* selected expert for current/deferred pass */ -+ int active_expert; /* 0, 1, or 2: which expert is active */ -+ float expert_mid; /* log2 threshold: short ↔ long */ -+ float expert_deep; /* log2 threshold: long ↔ deep */ ++ /* Single network: 16→16 trunk + ordinal survival head */ ++ struct nap_weights weights; ++ struct nap_weights *active_w; /* always &weights; consumed by SIMD forward/learn */ + + /* Online learning */ + unsigned int learning_rate_millths; + unsigned int max_grad_norm_millths; -+ unsigned int overshoot_pctl_millths; /* quantile target (250 = 25th pctl) */ ++ unsigned int conf_millths; /* decision confidence level (500 = 0.5) */ + int learn_interval; + int learn_counter; + bool reset_pending; /* set by sysfs, consumed by nap_select */ @@ -1069,10 +1027,10 @@ index 0000000000..1059db983b +#endif /* NAP_H */ diff --git a/drivers/cpuidle/governors/nap/nap_fpu.c b/drivers/cpuidle/governors/nap/nap_fpu.c new file mode 100644 -index 0000000000..482a06a5d0 +index 0000000000..9465262969 --- /dev/null +++ b/drivers/cpuidle/governors/nap/nap_fpu.c -@@ -0,0 +1,572 @@ +@@ -0,0 +1,528 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * nap_fpu.c — FPU/SIMD code for the NAP cpuidle governor @@ -1140,6 +1098,51 @@ index 
0000000000..482a06a5d0 + return e + p; +} + ++/* ++ * Scalar 2^x approximation: integer part via exponent bits, fractional part ++ * via a minimax cubic on [0,1] (error < 1e-4). Used to build the logistic. ++ */ ++static inline float fast_exp2f(float x) ++{ ++ union { u32 i; float f; } v; ++ int xi; ++ float f; ++ ++ if (x > 60.0f) ++ x = 60.0f; ++ else if (x < -60.0f) ++ x = -60.0f; ++ ++ xi = (int)x; ++ if (x < (float)xi) ++ xi--; /* floor toward negative infinity */ ++ f = x - (float)xi; ++ ++ v.i = (u32)((xi + 127) << 23); /* 2^xi */ ++ return v.f * (1.0f + f * (0.6931472f + ++ f * (0.2402265f + f * 0.0555041f))); ++} ++ ++/* Logistic sigmoid: sigmoid(x) = 1 / (1 + e^-x) = 1 / (1 + 2^(-x*log2(e))) */ ++static inline float nap_sigmoidf(float x) ++{ ++ return 1.0f / (1.0f + fast_exp2f(-1.4426950f * x)); ++} ++ ++/* ++ * Robustness floor and Beta-Binomial shrinkage. ++ * ++ * bin_count[] is an exponentially decayed histogram (window NAP_FLOOR_WIN, in ++ * idles) of which idle-state bin each idle landed in, updated every idle; its ++ * survival estimate is a fast, forgetting-resistant memory. The decision ++ * treats the NN survival as a prior worth NAP_PRIOR_K pseudo-observations and ++ * the decayed histogram as data: ++ * q_k = (NAP_PRIOR_K * q_nn_k + count(>=k)) / (NAP_PRIOR_K + total). ++ * Cold (no data) follows the NN; once the histogram fills it dominates. ++ */ ++#define NAP_FLOOR_WIN 256 ++#define NAP_PRIOR_K 16 ++ +/* ================================================================ + * Deterministic PRNG for weight initialization (LCG) + * ================================================================ */ @@ -1227,106 +1230,31 @@ index 0000000000..482a06a5d0 +} + +/* -+ * Precompute log2(target_residency) per state for threshold-based selection. -+ * -+ * Used in the selection loop: pick deepest state where -+ * log2_cost[i] <= nn_output (predicted sleep time in log2 space). 
-+ * -+ * Only target_residency_ns is used — exit_latency is a wakeup cost, -+ * not a factor in whether the CPU can profitably stay in the state -+ * for the predicted duration. ++ * Precompute log2(target_residency) per state and seed the ordinal ++ * thresholds. log2_tres[k] is the boundary location in score space: it ++ * seeds thr_ord[k-1], bounds its learned drift, and clamps the score ++ * against the timer in the decision layer. + */ -+static void nap_init_log2_cost(struct nap_cpu_data *d, ++static void nap_init_log2_tres(struct nap_cpu_data *d, + struct cpuidle_driver *drv) +{ -+ float log2_tick; -+ int long_start, deep_idx; + int i; + + for (i = 0; i < drv->state_count; i++) { -+ float res = float_max( ++ float tres = float_max( + (float)drv->states[i].target_residency_ns, 1.0f); -+ d->log2_cost[i] = fast_log2f(res); -+ } + -+ /* -+ * MoE expert boundaries — 3-way split. -+ * -+ * Expert 0 (short): tick-bound idles where measured residency -+ * is dominated by the next tick rather than the workload's -+ * true idle duration. Boundary: log2(TICK_NSEC). -+ * -+ * Expert 1 (long): nohz idles in intermediate C-states. -+ * -+ * Expert 2 (deep): idles targeting the deepest C-state. -+ * The deepest state often has qualitatively different -+ * residency characteristics (package C-state, longer -+ * exit latency, power-gated domains) that warrant a -+ * dedicated expert to avoid gradient interference with -+ * intermediate states. -+ * -+ * Safety: with only 2 C-states (+ POLL), expert_deep is -+ * placed equal to expert_mid so the deep expert is never -+ * routed (same behavior as the old 2-expert split). 
-+ */ -+ if (drv->state_count <= 1) { -+ d->expert_mid = 0.0f; -+ d->expert_deep = 0.0f; -+ return; -+ } -+ -+ log2_tick = fast_log2f((float)TICK_NSEC); -+ -+ /* Default: deepest state belongs to long expert (safety) */ -+ long_start = drv->state_count - 1; -+ -+ /* Prefer the first state whose target_residency exceeds one jiffy */ -+ for (i = 1; i < drv->state_count; i++) { -+ if (d->log2_cost[i] > log2_tick) { -+ long_start = i; -+ break; -+ } -+ } -+ -+ if (long_start > 1) { -+ /* Normal case: boundary between last short and first long */ -+ d->expert_mid = (d->log2_cost[long_start - 1] + -+ d->log2_cost[long_start]) / 2.0f; -+ } else { -+ /* -+ * long_start == 1: even the shallowest C-state already -+ * exceeds one jiffy. All NN-handled idles go to the -+ * long expert; place the boundary just below C1's -+ * residency so the short expert remains routable but -+ * unused. -+ */ -+ d->expert_mid = d->log2_cost[1] - 1.0f; ++ d->log2_tres[i] = fast_log2f(tres); + } + + /* -+ * Deep expert boundary — deepest C-state split. -+ * -+ * When there are >= 3 C-states (state_count >= 4, counting POLL), -+ * place the boundary at the midpoint between the second-deepest -+ * and deepest state's log2(target_residency). The deep expert -+ * then exclusively handles sleep durations long enough to reach -+ * the deepest state. -+ * -+ * With only 2 C-states, expert_deep == expert_mid collapses to -+ * the 2-expert regime (expert 2 is never selected). ++ * Seed each ordinal threshold at its boundary's log2(target_residency), ++ * so before learning q_k crosses 0.5 exactly when the score (initially ++ * ~= log2(sleep_length)) reaches that state's target_residency. This ++ * reproduces the deepest-state-that-fits default until learning adapts. 
+ */ -+ deep_idx = drv->state_count - 1; -+ if (deep_idx >= 3) { -+ /* >= 3 C-states: split before the deepest */ -+ d->expert_deep = (d->log2_cost[deep_idx - 1] + -+ d->log2_cost[deep_idx]) / 2.0f; -+ /* Ensure deep > mid ordering */ -+ if (d->expert_deep <= d->expert_mid) -+ d->expert_deep = d->expert_mid; -+ } else { -+ /* <= 2 C-states: collapse deep into long */ -+ d->expert_deep = d->expert_mid; -+ } ++ for (i = 1; i < drv->state_count; i++) ++ d->weights.thr_ord[i - 1] = d->log2_tres[i]; +} + +/* ================================================================ @@ -1400,18 +1328,6 @@ index 0000000000..482a06a5d0 + s->avg = sum / (float)n; +} + -+/* -+ * Extract 8 input features for the MLP. -+ * -+ * [0] log2(sleep_length) — next timer event -+ * [1] log2(last_residency) — actual duration of last idle -+ * [2] log_hist avg — average recent idle duration -+ * [3] log_hist min — shortest recent idle -+ * [4] log_hist max — longest recent idle -+ * [5] signed log2(|pred_error|+1) — prediction feedback -+ * [6] log2(busy_ns) — pre-idle busy duration -+ * [7] log2(lat_req) - log2(deepest_lat) — PM QoS headroom -+ */ +static void nap_extract_features(struct cpuidle_driver *drv, + struct cpuidle_device *dev, + float out[NAP_INPUT_SIZE], @@ -1508,86 +1424,70 @@ index 0000000000..482a06a5d0 + + /* Handle deferred weight reset (set by sysfs or nap_enable) */ + if (unlikely(d->reset_pending)) { -+ int e; -+ -+ for (e = 0; e < NAP_NUM_EXPERTS; e++) -+ nap_init_weights(&d->expert_weights[e]); -+ nap_init_log2_cost(d, drv); ++ nap_init_weights(&d->weights); ++ nap_init_log2_tres(d, drv); ++ memset(d->bin_count, 0, sizeof(d->bin_count)); ++ d->have_sample = false; + d->stats.learn_count = 0; + d->needs_learn = false; + d->reset_pending = false; + } + -+ /* Deferred learning (always, even during warmup) */ -+ if (d->needs_learn) { -+ float log2_eff = d->nn_output; -+ float alpha = (float)d->overshoot_pctl_millths -+ / 1000.0f; -+ int nn_selected = 0; -+ bool is_overshoot; -+ 
int i; -+ -+ /* Simulate which state the NN selected */ -+ for (i = drv->state_count - 1; i > 0; i--) { -+ if (d->log2_cost[i] <= log2_eff) { -+ nn_selected = i; -+ break; -+ } -+ } -+ -+ /* -+ * Direct overshoot loss. -+ * -+ * Base the gradient on whether the simulated state -+ * selection actually caused overshoot -+ * (actual < target_residency). -+ * -+ * The asymmetric weight is encoded in the learning -+ * rate (not in d_out) so that gradient clamping -+ * cannot destroy the asymmetry. d_out is ±1 and -+ * gets clipped symmetrically; the (1-α) vs α ratio -+ * is preserved through learn_lr. -+ * -+ * At equilibrium, P(overshoot) converges to α. -+ * α = overshoot_pctl / 1000. -+ */ -+ { -+ float base_lr = (float)d->learning_rate_millths -+ / 1000.0f; -+ -+ is_overshoot = (nn_selected > 0 && -+ d->learn_actual_ns < -+ drv->states[nn_selected].target_residency_ns); -+ -+ /* -+ * When the output was clamped at the upper -+ * limit (nn_output == features[0]), the NN -+ * is already predicting the maximum possible -+ * sleep time. Non-overshoot events would -+ * push weights UP, but the output cannot -+ * actually increase. Suppress this gradient -+ * to prevent unbounded weight growth in idle -+ * systems where natural overshoot rate < α. -+ * -+ * Overshoot events still learn normally -+ * (push DOWN) even when clamped. -+ */ -+ if (d->output_clamped && !is_overshoot) { -+ d->learn_lr = 0; -+ d->learn_d_out = 0; -+ } else { -+ d->learn_d_out = is_overshoot -+ ? 1.0f : -1.0f; -+ d->learn_lr = is_overshoot -+ ? base_lr * (1.0f - alpha) -+ : base_lr * alpha; ++ /* ++ * Per-idle feedback against the just-realized idle duration. ++ * ++ * Every idle: update the decayed floor histogram so it stays current. ++ * Only every learn_interval (needs_learn): apply the ordinal-threshold ++ * updates and the trunk/score-head backprop, using the previous pass's ++ * stored score, hidden activations and features. 
Under the shared-score ++ * proportional-odds model the gradient w.r.t. the score is the scalar ++ * g = sum_k (q_k - y_k), which drives the existing SIMD backprop unchanged. ++ * The loss is symmetric -- any responsiveness bias lives in the decision ++ * layer, not here. ++ */ ++ if (d->have_sample) { ++ float decay = (float)(NAP_FLOOR_WIN - 1) / (float)NAP_FLOOR_WIN; ++ int k, label_bin = 0; ++ ++ if (d->needs_learn) { ++ float base_lr = (float)d->learning_rate_millths / 1000.0f; ++ float clamp_val = (float)d->max_grad_norm_millths / 1000.0f; ++ float s = d->nn_output; ++ float g = 0.0f; ++ ++ for (k = 1; k < drv->state_count; k++) { ++ float th = d->active_w->thr_ord[k - 1]; ++ float q = nap_sigmoidf(s - th); ++ float y = (d->learn_actual_ns >= ++ drv->states[k].target_residency_ns) ++ ? 1.0f : 0.0f; ++ float err = q - y; ++ float lo = d->log2_tres[k] - 6.0f; ++ float hi = d->log2_tres[k] + 6.0f; ++ ++ g += err; ++ d->active_w->thr_ord[k - 1] = ++ fclampf(th + fclampf(base_lr * err, ++ -clamp_val, clamp_val), ++ lo, hi); + } ++ d->learn_d_out = g; ++ d->learn_lr = base_lr; ++ d->stats.learn_count++; ++ nap_nn_learn(d); ++ d->needs_learn = false; + } + -+ d->stats.learn_count++; ++ /* Floor histogram update, every idle */ ++ for (k = 1; k < drv->state_count; k++) ++ if (d->learn_actual_ns >= ++ drv->states[k].target_residency_ns) ++ label_bin = k; ++ for (k = 0; k < drv->state_count; k++) ++ d->bin_count[k] *= decay; ++ d->bin_count[label_bin] += 1.0f; + -+ nap_nn_learn(d); -+ d->needs_learn = false; ++ d->have_sample = false; + } + + /* @@ -1597,57 +1497,71 @@ index 0000000000..482a06a5d0 + */ + nap_extract_features(drv, dev, d->features_f32, latency_req); + -+ /* MoE: 3-way expert selection based on log2(sleep_length) */ -+ if (d->features_f32[0] >= d->expert_deep) -+ d->active_expert = 2; /* deep: deepest C-state */ -+ else if (d->features_f32[0] >= d->expert_mid) -+ d->active_expert = 1; /* long: nohz intermediate */ -+ else -+ d->active_expert = 0; /* short: 
tick-bound */ -+ d->active_w = &d->expert_weights[d->active_expert]; ++ d->active_w = &d->weights; + + nap_nn_forward(d->features_f32, &d->nn_output, d->hidden_out, + d->active_w); + + /* -+ * Clamp NN output: predicted sleep cannot exceed sleep_length -+ * (next timer event). features_f32[0] = log2(sleep_length). ++ * Decision layer. + * -+ * Track whether the clamp was applied so the learning block -+ * can suppress "push up" gradients when the output is already -+ * at the maximum. Without this, weights diverge unboundedly -+ * in idle systems where the natural overshoot rate < alpha. -+ */ -+ d->output_clamped = (d->nn_output > d->features_f32[0]); -+ if (d->output_clamped) -+ d->nn_output = d->features_f32[0]; -+ -+ /* -+ * Threshold-based selection using NN predicted sleep time. -+ * -+ * The NN directly outputs log2(predicted_sleep) in ns. -+ * Select the deepest feasible state whose cost ≤ predicted_sleep. ++ * For each boundary k the survival probability q_k is a Beta-Binomial ++ * shrinkage of the NN survival sigmoid(s - thr_ord) (a prior worth ++ * NAP_PRIOR_K pseudo-observations) toward the decayed histogram (data): ++ * the NN drives cold start, the floor takes over as it fills. A running ++ * minimum enforces a monotone non-increasing survival curve, and the next ++ * timer event caps the reachable depth (a deeper state cannot be earned ++ * past it). The confidence level is the single responsiveness dial: pick ++ * the deepest feasible state whose survival still meets it. 
+ */ + { -+ float log2_eff = d->nn_output; -+ int idx = 0, i; ++ float conf = (float)d->conf_millths / 1000.0f; ++ float s = d->nn_output; ++ float sleep_log2 = d->features_f32[0]; ++ float suffix[CPUIDLE_STATE_MAX]; ++ float total = 0.0f; ++ float qmin = 1.0f; ++ int k, m = 0, idx = 0; ++ ++ for (k = 0; k < drv->state_count; k++) ++ total += d->bin_count[k]; ++ ++ suffix[drv->state_count - 1] = ++ d->bin_count[drv->state_count - 1]; ++ for (k = drv->state_count - 2; k >= 0; k--) ++ suffix[k] = suffix[k + 1] + d->bin_count[k]; ++ ++ for (k = 1; k < drv->state_count; k++) { ++ float q_nn = nap_sigmoidf(s - d->active_w->thr_ord[k - 1]); ++ float q = ((float)NAP_PRIOR_K * q_nn + suffix[k]) / ++ ((float)NAP_PRIOR_K + total); ++ ++ if (d->log2_tres[k] > sleep_log2) ++ q = 0.0f; /* cannot idle past the next timer */ ++ if (q < qmin) ++ qmin = q; ++ q = qmin; ++ ++ if (q >= conf) ++ m = k; ++ else ++ break; ++ } + -+ for (i = drv->state_count - 1; i > 0; i--) { -+ if (dev->states_usage[i].disable) ++ for (k = m; k >= 1; k--) { ++ if (dev->states_usage[k].disable) + continue; -+ if (drv->states[i].exit_latency_ns > latency_req) ++ if (drv->states[k].exit_latency_ns > latency_req) + continue; -+ if (d->log2_cost[i] <= log2_eff) { -+ idx = i; -+ break; -+ } ++ idx = k; ++ break; + } + return idx; + } +} diff --git a/drivers/cpuidle/governors/nap/nap_nn_avx2.c b/drivers/cpuidle/governors/nap/nap_nn_avx2.c new file mode 100644 -index 0000000000..96e5415423 +index 0000000000..a43091793c --- /dev/null +++ b/drivers/cpuidle/governors/nap/nap_nn_avx2.c @@ -0,0 +1,135 @@ @@ -1655,7 +1569,7 @@ index 0000000000..96e5415423 +/* + * nap_nn_avx2.c — AVX2+FMA forward pass and backpropagation for the nap MLP + * -+ * 8→8→1 scalar regression (log2 correction factor). ++ * 8→8 trunk + scalar score s feeding the ordinal survival head. + * Uses 256-bit ymm registers: 8 hidden neurons = 1 ymm. + * FMA via vfmadd231ps for fused multiply-add. 
+ * @@ -1788,7 +1702,7 @@ index 0000000000..96e5415423 +} diff --git a/drivers/cpuidle/governors/nap/nap_nn_sse2.c b/drivers/cpuidle/governors/nap/nap_nn_sse2.c new file mode 100644 -index 0000000000..a9fffb3b98 +index 0000000000..0f2a6f131f --- /dev/null +++ b/drivers/cpuidle/governors/nap/nap_nn_sse2.c @@ -0,0 +1,136 @@ @@ -1796,7 +1710,7 @@ index 0000000000..a9fffb3b98 +/* + * nap_nn_sse2.c — SSE2 forward pass and backpropagation for the nap MLP + * -+ * 8→8→1 scalar regression (log2 correction factor). ++ * 8→8 trunk + scalar score s feeding the ordinal survival head. + * Baseline implementation using SSE2, which is always available on x86_64. + * No FMA — uses separate mul + add (2 instructions per MAC). + * @@ -1930,3 +1844,4 @@ index 0000000000..a9fffb3b98 +} -- 2.34.1 + diff --git a/99-charcoal-sysctl.conf b/99-charcoal-sysctl.conf new file mode 100644 index 0000000..0bddf16 --- /dev/null +++ b/99-charcoal-sysctl.conf @@ -0,0 +1,2 @@ +vm.kcompressd=256 +vm.vfs_cache_pressure=125 diff --git a/PKGBUILD b/PKGBUILD index e84fdbf..2d1baba 100755 --- a/PKGBUILD +++ b/PKGBUILD @@ -4,8 +4,8 @@ pkgbase=linux-charcoal-616 _nepbase=linux-neptune-616 -_tag=6.16.12-valve23 -_ver=3 +_tag=6.16.12-valve24 +_ver=1 pkgver=${_tag//-/.}.cc$_ver pkgrel=1 pkgdesc='Linux' @@ -54,6 +54,7 @@ source=( charcoal.conf 65-adios.rules 99-charcoal.sh + 99-charcoal-sysctl.conf vangogh_allow_higher_cpu_freq.patch vangogh_higher_max_power_limit.patch drm_sched_rr_default.patch @@ -94,15 +95,16 @@ source=( "git+https://github.com/forkymcforkface/xpad-noone.git#commit=8e903676dd9514c07ce5e06e43c5f7d8cc51cb7d" "git+https://github.com/atar-axis/xpadneo.git#tag=v$_xpadneo_version" 6.16-poc-selector-v2.6.1.patch - 6.16-nap-v0.4.0.patch + 6.16-nap-v0.5.0.patch ) -sha256sums=('4011d16fef57b8f04cbcddc0937819f7fd32225f65d63698afbd5dc6629d0ff0' +sha256sums=('SKIP' '37452b4d09e5e42134ae24a61f2f656790837c327268074cf79d7dab3558b972' 'd88eaf0f94bae470040e4882f334c05b1bb2ab0a99e4b7299aa0b2337810ab8d' 
'fd57213c524e24cd9c72e2fecd9b2005934b6099e209864e5a93eb03406fca21' 'b831de1b98a2f77f636f4780e37ebfcb3a6829f94f5423eb04c4b26e64ac43b8' '52cbbf41450806d766260bc4f1ea055f6f9fdd55d37ad831840b16d505beb0cc' '0a6a7408ccc0c94b5cce50dabc7ee318abcc1b9eaaedd3d83fd7e7d5a73b4d4f' + '7b0a1d962dfbcc1cbec195a8abb5ad1ff1872fde0a2249bd5704367c023c6573' '375c8e17daf9e60bc6c211dd73f0c67ec241bd40a83d812a08eeb42aab6128d9' '1c49146dc5878bfab32b331d11cb66d493670bbe590ff07c2050305911c281c3' '6e510d8b74798944b5cb84ac775156831410c853c8a03c2a3f79e9bc7be9c2e2' @@ -143,7 +145,7 @@ sha256sums=('4011d16fef57b8f04cbcddc0937819f7fd32225f65d63698afbd5dc6629d0ff0' '1055bbbd32985017f4501d375648873bd598db084177d302aeeade56b47920e1' '26b3a811d38471a42229fa037cb6d2bb5ff78f19f45a17c7f263339ee67769a7' '14dabfb0452a3a817e8d809fb28eb7565512e95386d789c627b62baf136e001f' - '99d87a5c9cf47f257df81fabbabdcb9df02ff93c0c9caabf1bbd40d2e50fed6e') + 'f665d6ba6fc18579083bf8ec7ec741d43495f16f9dcbc482a5bd928b1778b2d3') export KBUILD_BUILD_HOST=archlinux export KBUILD_BUILD_USER=$pkgbase @@ -251,6 +253,8 @@ _package() { install -D -m 0644 -t "$pkgdir/etc/profile.d" ../99-charcoal.sh # Charcoal: Install udev rules install -D -m 0644 -t "$pkgdir/etc/udev/rules.d" ../65-adios.rules + # sysctl parameters to fix thrashing under heavy memory pressure + install -D -m 0644 -t "$pkgdir/etc/sysctl.d" ../99-charcoal-sysctl.conf # Charcoal: Install bundles DKMS modules ZSTD_CLEVEL=19 make LLVM=1 M=../ryzen_smu INSTALL_MOD_PATH="$pkgdir/usr" INSTALL_MOD_STRIP=1 DEPMOD=/doesnt/exist modules_install