diff --git a/6.16-nap-v0.4.0.patch b/6.16-nap-v0.5.0.patch
similarity index 72%
rename from 6.16-nap-v0.4.0.patch
rename to 6.16-nap-v0.5.0.patch
index 9b40d91..3db1e3b 100644
--- a/6.16-nap-v0.4.0.patch
+++ b/6.16-nap-v0.5.0.patch
@@ -1,22 +1,24 @@
-From 1d2e8272f288fecce3fd7f762fb8c628ed04b7fe Mon Sep 17 00:00:00 2001
+From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
 From: Masahito S <firelzrd@gmail.com>
-Date: Wed, 15 Apr 2026 08:37:01 +0900
-Subject: [PATCH] 6.16 backport: 6.18.3-nap-v0.4.0
+Date: Fri, 5 Jun 2026 13:10:05 +0900
+Subject: [PATCH] 6.16 backport: 6.18.3-nap-v0.5.0
 
-Backport of NAP cpuidle governor to Linux 6.16.
-No functional changes except added RESIDENCY_THRESHOLD_NS definition.
+Backport of NAP cpuidle governor v0.5.0 to Linux 6.16.
+
+The functional changes of v0.5.0 relative to v0.4.0 are carried over
+unchanged; for 6.16 compatibility, this backport keeps the
+RESIDENCY_THRESHOLD_NS fallback definition from the previous backport.
 
-Signed-off-by: Masahito S <firelzrd@gmail.com>
 ---
  drivers/cpuidle/Kconfig                     |  17 +
  drivers/cpuidle/governors/Makefile          |   1 +
- drivers/cpuidle/governors/nap/Makefile      |  29 +
- drivers/cpuidle/governors/nap/nap.c         | 671 ++++++++++++++++++++
- drivers/cpuidle/governors/nap/nap.h         | 283 +++++++++
- drivers/cpuidle/governors/nap/nap_fpu.c     | 572 +++++++++++++++++
- drivers/cpuidle/governors/nap/nap_nn_avx2.c | 135 ++++
- drivers/cpuidle/governors/nap/nap_nn_sse2.c | 136 ++++
- 8 files changed, 1844 insertions(+)
+ drivers/cpuidle/governors/nap/Makefile      |  30 +
+ drivers/cpuidle/governors/nap/nap.c         | 623 ++++++++++++++++++++
+ drivers/cpuidle/governors/nap/nap.h         | 291 ++++++++++
+ drivers/cpuidle/governors/nap/nap_fpu.c     | 528 +++++++++++++++++
+ drivers/cpuidle/governors/nap/nap_nn_avx2.c | 135 +++++
+ drivers/cpuidle/governors/nap/nap_nn_sse2.c | 136 +++++
+ 8 files changed, 1761 insertions(+)
  create mode 100644 drivers/cpuidle/governors/nap/Makefile
  create mode 100644 drivers/cpuidle/governors/nap/nap.c
  create mode 100644 drivers/cpuidle/governors/nap/nap.h
@@ -63,10 +65,10 @@ index 63abb5393a..ae688891c0 100644
 +obj-$(CONFIG_CPU_IDLE_GOV_NAP) += nap/
 diff --git a/drivers/cpuidle/governors/nap/Makefile b/drivers/cpuidle/governors/nap/Makefile
 new file mode 100644
-index 0000000000..8c4a17d8e2
+index 0000000000..8b85a475a6
 --- /dev/null
 +++ b/drivers/cpuidle/governors/nap/Makefile
-@@ -0,0 +1,33 @@
+@@ -0,0 +1,30 @@
 +# SPDX-License-Identifier: GPL-2.0-only
 +#
 +# Makefile for the NAP cpuidle governor
@@ -90,30 +92,28 @@ index 0000000000..8c4a17d8e2
 +FPU_KILL_FLAGS := -mno-sse -mno-sse2 -mno-mmx -mno-avx -mno-3dnow \
 +                  -mno-sse4a -msoft-float -mno-80387 -mno-fp-ret-in-387
 +
-+# LTO FIX: Disables LTO on standalone files to prevent intrusive inlining
-+# of FPU instructions and ensure that flags are preserved during linking.
-+CFLAGS_REMOVE_nap.o            += $(CC_FLAGS_LTO)
-+CFLAGS_REMOVE_nap_fpu.o        += $(CC_FLAGS_LTO) $(FPU_KILL_FLAGS)
-+CFLAGS_REMOVE_nap_nn_sse2.o    += $(CC_FLAGS_LTO) $(FPU_KILL_FLAGS)
-+CFLAGS_REMOVE_nap_nn_avx2.o    += $(CC_FLAGS_LTO) $(FPU_KILL_FLAGS)
++CFLAGS_REMOVE_nap_fpu.o        += $(FPU_KILL_FLAGS)
++CFLAGS_REMOVE_nap_nn_sse2.o    += $(FPU_KILL_FLAGS)
++CFLAGS_REMOVE_nap_nn_avx2.o    += $(FPU_KILL_FLAGS)
 +
 +CFLAGS_nap_fpu.o       += $(CC_FLAGS_FPU)
 +CFLAGS_nap_nn_sse2.o   += $(CC_FLAGS_FPU)
 +CFLAGS_nap_nn_avx2.o   += $(CC_FLAGS_FPU) -mavx -mavx2 -mfma
 diff --git a/drivers/cpuidle/governors/nap/nap.c b/drivers/cpuidle/governors/nap/nap.c
 new file mode 100644
-index 0000000000..c72b67e9c3
+index 0000000000..fc7393e9f4
 --- /dev/null
 +++ b/drivers/cpuidle/governors/nap/nap.c
-@@ -0,0 +1,672 @@
+@@ -0,0 +1,623 @@
 +// SPDX-License-Identifier: GPL-2.0
 +/*
 + * nap.c — Neural Adaptive Predictor cpuidle governor
 + *
-+ * A machine-learning-based cpuidle governor that uses a small MLP (8→8→1)
-+ * with 3 Mixture-of-Experts (short/long/deep) to predict a log2 correction
-+ * factor for sleep_length.  State selection is deterministic threshold
-+ * comparison.  Weights are Xavier-initialized at boot, then refined via
++ * A machine-learning-based cpuidle governor that uses a small MLP trunk and an
++ * ordinal survival head to predict, per idle-state boundary, the probability
++ * that the upcoming idle reaches that state's target_residency.  The decision
++ * layer picks the deepest feasible state whose calibrated survival meets a
++ * confidence level.  Weights are Xavier-initialized at boot, then refined via
 + * online learning (deferred backpropagation with SGD).
 + *
 + * IMPORTANT: This file is compiled WITHOUT FPU/SSE flags (normal kernel
@@ -126,6 +126,7 @@ index 0000000000..c72b67e9c3
 +
 +#include <linux/cpuidle.h>
 +#include <linux/cpu.h>
++#include <linux/jiffies.h>
 +#include <linux/jump_label.h>
 +#include <linux/kobject.h>
 +#include <linux/math64.h>
@@ -149,16 +150,18 @@ index 0000000000..c72b67e9c3
 +#define CPUIDLE_NAP_PROGNAME "Nap CPUIdle Governor"
 +#define CPUIDLE_NAP_AUTHOR   "Masahito Suzuki"
 +
-+#define CPUIDLE_NAP_VERSION  "0.4.0"
++#define CPUIDLE_NAP_VERSION  "0.5.0"
 +
 +/* Governor defaults */
 +#define NAP_DEFAULT_LR_MILLTHS    1     /* 0.001 = 1 millths */
 +#define NAP_DEFAULT_INTERVAL      4     /* learn every 4 reflects */
 +#define NAP_DEFAULT_CLAMP_MILLTHS 1000  /* 1.0 = 1000 millths */
-+#define NAP_DEFAULT_PCTL_MILLTHS  100   /* 10th percentile */
++#define NAP_DEFAULT_CONF_MILLTHS  500   /* 0.5 = balanced survival confidence */
 +
-+/* Backport: RESIDENCY_THRESHOLD_NS was missing in original patch */
++/* Backport: RESIDENCY_THRESHOLD_NS is not available in Linux 6.16. */
++#ifndef RESIDENCY_THRESHOLD_NS
 +#define RESIDENCY_THRESHOLD_NS TICK_NSEC
++#endif
 +
 +/* ================================================================
 + * ISA dispatch via static keys (definitions only; dispatch in nap_fpu.c)
@@ -194,7 +197,6 @@ index 0000000000..c72b67e9c3
 +	d->hist_idx = (d->hist_idx + 1) % NAP_HISTORY_SIZE;
 +	if (d->hist_count < NAP_HISTORY_SIZE)
 +		d->hist_count++;
-+
 +}
 +
 +static void nap_update_external_signals(struct nap_cpu_data *d)
@@ -206,14 +208,32 @@ index 0000000000..c72b67e9c3
 + * Governor callbacks
 + * ================================================================ */
 +
++static int nap_fallback_heuristic(struct cpuidle_driver *drv,
++				  struct cpuidle_device *dev)
++{
++	s64 latency_req = cpuidle_governor_latency_req(dev->cpu);
++	ktime_t delta_tick;
++	u64 sleep_length_ns;
++	int i;
++
++	sleep_length_ns = ktime_to_ns(tick_nohz_get_sleep_length(&delta_tick));
++
++	for (i = drv->state_count - 1; i > 0; i--) {
++		if (dev->states_usage[i].disable)
++			continue;
++		if (drv->states[i].exit_latency_ns > latency_req)
++			continue;
++		if (drv->states[i].target_residency_ns > sleep_length_ns)
++			continue;
++		return i;
++	}
++	return 0;
++}
++
 +/*
-+ * Return the shallowest C-state index that is both enabled and
-+ * satisfies the current latency request.  Returns 0 if no such
-+ * state exists (caller must treat 0 as "POLL is the only option").
-+ *
-+ * Called from the short-circuit path to decide whether the predicted
-+ * sleep length is worth entering any C-state at all.  Does not
-+ * consult the NN.
++ * Return the shallowest enabled C-state that satisfies the current
++ * latency request, or 0 if none exists (POLL is the only option).
++ * Does not consult the NN.
 + */
 +static int nap_find_min_valid_state(struct cpuidle_driver *drv,
 +				    struct cpuidle_device *dev,
@@ -232,23 +252,16 @@ index 0000000000..c72b67e9c3
 +}
 +
 +/*
-+ * Cached wrapper around nap_find_min_valid_state().
-+ *
-+ * Invalidation triggers:
-+ *   1. latency_req changed since last cached value (immediate; PM QoS
-+ *      updates propagate on the next nap_select call).
-+ *   2. NAP_MIN_STATE_REFRESH_JIFFIES elapsed since last refresh
-+ *      (bounded staleness for sysfs-driven or runtime-driver state
-+ *      disable events, which are rare).
-+ *
-+ * Hot path cost when the cache is valid: ~5-7 cycles (one s64
-+ * compare, one time_after() check, one conditional return).  The
-+ * uncached loop runs at most once per HZ jiffies per CPU.
++ * Cached wrapper around nap_find_min_valid_state().  Invalidated when
++ * latency_req changes (immediate PM QoS propagation) or every
++ * NAP_MIN_STATE_REFRESH_JIFFIES (bounded staleness for rare sysfs /
++ * runtime-driver state-disable events).  Hot-path cost when valid:
++ * one s64 compare plus one time_after() check.
 + */
 +static inline int nap_get_min_valid_state(struct nap_cpu_data *d,
-+					   struct cpuidle_driver *drv,
-+					   struct cpuidle_device *dev,
-+					   s64 latency_req)
++					  struct cpuidle_driver *drv,
++					  struct cpuidle_device *dev,
++					  s64 latency_req)
 +{
 +	if (unlikely(latency_req != d->cached_min_state_latency ||
 +		     time_after(jiffies,
@@ -263,22 +276,12 @@ index 0000000000..c72b67e9c3
 +}
 +
 +/*
-+ * Compute dev->poll_limit_ns for the short-circuit path.
-+ *
-+ * Budget = predicted wake time (sleep_length) + 1 µs safety margin.
-+ * The margin absorbs timer jitter so a wake arriving slightly after
-+ * the predicted time does not trigger a select/enter/reflect retry
-+ * cycle.  It is consumed only when the wake is actually late; on-time
-+ * and early wakes exit POLL via need_resched without touching the
-+ * margin.
-+ *
-+ * Floor: NAP_POLL_LIMIT_MIN_NS (1 µs).  Below this, per-iteration
-+ * governor overhead exceeds actual polling, and POLL's own timeout
-+ * sampling granularity (~1.3 µs via POLL_IDLE_RELAX_COUNT cpu_relax
-+ * iterations) makes smaller limits indistinguishable in practice.
-+ *
-+ * Ceiling: min_state.target_residency_ns.  Beyond that point, the
-+ * C-state would have been a better choice than polling.
++ * Compute dev->poll_limit_ns for the short-circuit path: predicted
++ * wake time plus a 1 us margin (absorbs timer jitter so a slightly
++ * late wake does not retrigger select/enter/reflect), floored at
++ * NAP_POLL_LIMIT_MIN_NS and capped at the min state's target
++ * residency (beyond which the C-state would have been the better
++ * choice).
 + */
 +static inline u64 nap_compute_poll_limit(u64 sleep_length_ns,
 +					 u64 min_state_target_ns)
@@ -290,28 +293,6 @@ index 0000000000..c72b67e9c3
 +		       min_state_target_ns);
 +}
 +
-+static int nap_fallback_heuristic(struct cpuidle_driver *drv,
-+				  struct cpuidle_device *dev)
-+{
-+	s64 latency_req = cpuidle_governor_latency_req(dev->cpu);
-+	ktime_t delta_tick;
-+	u64 sleep_length_ns;
-+	int i;
-+
-+	sleep_length_ns = ktime_to_ns(tick_nohz_get_sleep_length(&delta_tick));
-+
-+	for (i = drv->state_count - 1; i > 0; i--) {
-+		if (dev->states_usage[i].disable)
-+			continue;
-+		if (drv->states[i].exit_latency_ns > latency_req)
-+			continue;
-+		if (drv->states[i].target_residency_ns > sleep_length_ns)
-+			continue;
-+		return i;
-+	}
-+	return 0;
-+}
-+
 +static int nap_select(struct cpuidle_driver *drv,
 +		      struct cpuidle_device *dev,
 +		      bool *stop_tick)
@@ -327,20 +308,17 @@ index 0000000000..c72b67e9c3
 +
 +	latency_req = cpuidle_governor_latency_req(dev->cpu);
 +	sleep_length_ns = ktime_to_ns(tick_nohz_get_sleep_length(&delta_tick));
-+
 +	min_state = nap_get_min_valid_state(d, drv, dev, latency_req);
 +
 +	/*
 +	 * Fast path: when no C-state can amortize its target residency
 +	 * within the predicted sleep length, the answer is deterministically
-+	 * POLL.  Skip NN inference and feature extraction entirely.
-+	 * nap_reflect also skips history update and learning for
-+	 * short-circuited events (see the short_circuited check there).
-+	 * See spec §3.1.
++	 * POLL.  Skip NN inference and feature extraction entirely;
++	 * nap_reflect also skips the feedback path for short-circuited
++	 * events (see the short_circuited check there).
 +	 */
 +	if (min_state == 0 ||
 +	    sleep_length_ns < drv->states[min_state].target_residency_ns) {
-+
 +		if (min_state > 0)
 +			dev->poll_limit_ns = nap_compute_poll_limit(
 +				sleep_length_ns,
@@ -356,7 +334,6 @@ index 0000000000..c72b67e9c3
 +		return 0;
 +	}
 +
-+	/* Normal NN-driven path */
 +	d->short_circuited = false;
 +
 +	if (likely(may_use_simd())) {
@@ -389,12 +366,10 @@ index 0000000000..c72b67e9c3
 +		return;
 +
 +	/*
-+	 * Short-circuited POLL: NN was not invoked for this idle
-+	 * event, so the residency does not belong to the NN's
-+	 * training distribution.  Update the aggregate residency
-+	 * statistic and return — history, hit_intercept, prediction
-+	 * error, external signals, and learning are all skipped.
-+	 * See spec §3.4.
++	 * Short-circuited POLL: the NN was not invoked for this idle, so
++	 * the residency is not part of its training distribution and must
++	 * not feed the floor histogram or the weight update.  Account only
++	 * the aggregate residency and return.
 +	 */
 +	if (d->short_circuited) {
 +		d->stats.total_residency_ns += measured_ns;
@@ -406,20 +381,22 @@ index 0000000000..c72b67e9c3
 +	d->last_prediction_error = d->last_predicted_ns - (s64)measured_ns;
 +	nap_update_external_signals(d);
 +
++	/* Every idle provides a fresh residency for the floor and reliability EMAs */
++	d->learn_actual_ns = measured_ns;
++	d->have_sample = true;
++
 +	/*
-+	 * Dual gate: learn when both the per-N-reflect counter fires
-+	 * AND at least learn_jiffies_min jiffies have elapsed since
-+	 * the last learning step.  The time gate prevents sustained
-+	 * weight churn on workloads with very rapid idle bursts; a
-+	 * value of 0 disables it (restores the original counter-only
-+	 * behavior).  See spec §3.5.
++	 * Throttle the expensive trunk/score weight update with a dual
++	 * gate: the per-N-reflect counter AND a jiffies floor.  The time
++	 * gate caps the learning rate on workloads with very rapid idle
++	 * bursts (e.g. cross-CPU ping-pong); learn_jiffies_min == 0
++	 * disables it and restores counter-only behavior.
 +	 */
 +	if (++d->learn_counter >= d->learn_interval &&
 +	    time_after_eq(jiffies,
 +			  d->last_learn_jiffies + d->learn_jiffies_min)) {
 +		d->learn_counter = 0;
 +		d->last_learn_jiffies = jiffies;
-+		d->learn_actual_ns = measured_ns;
 +		d->needs_learn = true;
 +	}
 +
@@ -436,19 +413,6 @@ index 0000000000..c72b67e9c3
 +	memset(d, 0, sizeof(*d));
 +
 +	/*
-+	 * Force first-call refresh of the min-valid-state cache.
-+	 * cached_min_state_latency = S64_MIN ensures the first
-+	 * nap_select() comparison will always trip the invalidation
-+	 * branch regardless of the actual latency_req value.
-+	 * cached_min_state itself is already zeroed by the memset above.
-+	 */
-+	d->cached_min_state_latency = S64_MIN;
-+	d->cached_min_state_jiffies = jiffies - NAP_MIN_STATE_REFRESH_JIFFIES;
-+
-+	/* Default: allow at most one learning step per jiffy */
-+	d->learn_jiffies_min = 1;
-+
-+	/*
 +	 * Defer weight initialization to the first nap_select() FPU path
 +	 * via reset_pending.  nap_enable() is called from cpuidle core
 +	 * (cpuidle_enable_device) which may run on a different CPU than
@@ -459,7 +423,17 @@ index 0000000000..c72b67e9c3
 +	d->learning_rate_millths  = NAP_DEFAULT_LR_MILLTHS;
 +	d->learn_interval = NAP_DEFAULT_INTERVAL;
 +	d->max_grad_norm_millths  = NAP_DEFAULT_CLAMP_MILLTHS;
-+	d->overshoot_pctl_millths = NAP_DEFAULT_PCTL_MILLTHS;
++	d->conf_millths = NAP_DEFAULT_CONF_MILLTHS;
++
++	/*
++	 * Force a first-call refresh of the min-valid-state cache:
++	 * cached_min_state_latency = S64_MIN guarantees the first
++	 * nap_select() comparison trips the invalidation branch.
++	 */
++	d->cached_min_state_latency = S64_MIN;
++	d->cached_min_state_jiffies = jiffies - NAP_MIN_STATE_REFRESH_JIFFIES;
++	d->learn_jiffies_min = 1;
++
 +	d->reset_pending = true;
 +
 +	return 0;
@@ -472,7 +446,7 @@ index 0000000000..c72b67e9c3
 +}
 +
 +/* ================================================================
-+ * sysfs interface  (/sys/devices/system/cpu/nap/)
++ * sysfs interface  (/sys/devices/system/cpu/cpuidle/nap/)
 + * ================================================================ */
 +
 +static ssize_t stats_show(struct kobject *kobj,
@@ -556,34 +530,6 @@ index 0000000000..c72b67e9c3
 +	return count;
 +}
 +
-+static ssize_t learn_jiffies_min_show(struct kobject *kobj,
-+				      struct kobj_attribute *attr, char *buf)
-+{
-+	int cpu;
-+
-+	cpu = cpumask_first(cpu_online_mask);
-+	if (cpu >= nr_cpu_ids)
-+		return sysfs_emit(buf, "0\n");
-+	return sysfs_emit(buf, "%u\n",
-+			  per_cpu(nap_data, cpu).learn_jiffies_min);
-+}
-+
-+static ssize_t learn_jiffies_min_store(struct kobject *kobj,
-+				       struct kobj_attribute *attr,
-+				       const char *buf, size_t count)
-+{
-+	unsigned int val;
-+	int cpu;
-+
-+	if (kstrtouint(buf, 10, &val) || val > HZ * 3600)
-+		return -EINVAL;
-+
-+	for_each_online_cpu(cpu)
-+		per_cpu(nap_data, cpu).learn_jiffies_min = val;
-+
-+	return count;
-+}
-+
 +static ssize_t reset_weights_store(struct kobject *kobj,
 +				   struct kobj_attribute *attr,
 +				   const char *buf, size_t count)
@@ -639,8 +585,14 @@ index 0000000000..c72b67e9c3
 +	return count;
 +}
 +
-+static ssize_t overshoot_pctl_show(struct kobject *kobj,
-+				    struct kobj_attribute *attr, char *buf)
++/*
++ * confidence: decision confidence level in millths (1..999, default 500).
++ * Higher demands more certainty before entering a deeper state, biasing toward
++ * responsiveness (shallower); lower biases toward energy (deeper).  This is the
++ * single responsiveness dial and replaces the former overshoot_pctl target.
++ */
++static ssize_t confidence_show(struct kobject *kobj,
++			       struct kobj_attribute *attr, char *buf)
 +{
 +	int cpu;
 +
@@ -648,21 +600,21 @@ index 0000000000..c72b67e9c3
 +	if (cpu >= nr_cpu_ids)
 +		return sysfs_emit(buf, "0\n");
 +	return sysfs_emit(buf, "%u\n",
-+			  per_cpu(nap_data, cpu).overshoot_pctl_millths);
++			  per_cpu(nap_data, cpu).conf_millths);
 +}
 +
-+static ssize_t overshoot_pctl_store(struct kobject *kobj,
-+				     struct kobj_attribute *attr,
-+				     const char *buf, size_t count)
++static ssize_t confidence_store(struct kobject *kobj,
++				struct kobj_attribute *attr,
++				const char *buf, size_t count)
 +{
 +	unsigned int val;
 +	int cpu;
 +
-+	if (kstrtouint(buf, 10, &val) || val > 500)
++	if (kstrtouint(buf, 10, &val) || val == 0 || val >= 1000)
 +		return -EINVAL;
 +
 +	for_each_online_cpu(cpu)
-+		per_cpu(nap_data, cpu).overshoot_pctl_millths = val;
++		per_cpu(nap_data, cpu).conf_millths = val;
 +
 +	return count;
 +}
@@ -682,15 +634,14 @@ index 0000000000..c72b67e9c3
 +		return sysfs_emit(buf, "sse2\n");
 +}
 +
-+static struct kobj_attribute version_attr           = __ATTR_RO(version);
-+static struct kobj_attribute simd_attr              = __ATTR_RO(simd);
-+static struct kobj_attribute stats_attr             = __ATTR_RO(stats);
-+static struct kobj_attribute learning_rate_attr     = __ATTR_RW(learning_rate);
-+static struct kobj_attribute learn_interval_attr    = __ATTR_RW(learn_interval);
-+static struct kobj_attribute learn_jiffies_min_attr = __ATTR_RW(learn_jiffies_min);
-+static struct kobj_attribute overshoot_pctl_attr    = __ATTR_RW(overshoot_pctl);
-+static struct kobj_attribute reset_weights_attr     = __ATTR_WO(reset_weights);
-+static struct kobj_attribute reset_stats_attr       = __ATTR_WO(reset_stats);
++static struct kobj_attribute version_attr        = __ATTR_RO(version);
++static struct kobj_attribute simd_attr           = __ATTR_RO(simd);
++static struct kobj_attribute stats_attr          = __ATTR_RO(stats);
++static struct kobj_attribute learning_rate_attr  = __ATTR_RW(learning_rate);
++static struct kobj_attribute learn_interval_attr = __ATTR_RW(learn_interval);
++static struct kobj_attribute confidence_attr     = __ATTR_RW(confidence);
++static struct kobj_attribute reset_weights_attr  = __ATTR_WO(reset_weights);
++static struct kobj_attribute reset_stats_attr    = __ATTR_WO(reset_stats);
 +
 +static struct attribute *nap_attrs[] = {
 +	&version_attr.attr,
@@ -698,8 +649,7 @@ index 0000000000..c72b67e9c3
 +	&stats_attr.attr,
 +	&learning_rate_attr.attr,
 +	&learn_interval_attr.attr,
-+	&learn_jiffies_min_attr.attr,
-+	&overshoot_pctl_attr.attr,
++	&confidence_attr.attr,
 +	&reset_weights_attr.attr,
 +	&reset_stats_attr.attr,
 +	NULL,
@@ -780,10 +730,10 @@ index 0000000000..c72b67e9c3
 +postcore_initcall(nap_init);
 diff --git a/drivers/cpuidle/governors/nap/nap.h b/drivers/cpuidle/governors/nap/nap.h
 new file mode 100644
-index 0000000000..1059db983b
+index 0000000000..0f6aae7d17
 --- /dev/null
 +++ b/drivers/cpuidle/governors/nap/nap.h
-@@ -0,0 +1,283 @@
+@@ -0,0 +1,291 @@
 +/* SPDX-License-Identifier: GPL-2.0 */
 +#ifndef NAP_H
 +#define NAP_H
@@ -798,30 +748,39 @@ index 0000000000..1059db983b
 +
 +#define NAP_INPUT_SIZE    8
 +#define NAP_HIDDEN_SIZE   8
-+#define NAP_NUM_EXPERTS   3
++#define NAP_NUM_CUTS      (CPUIDLE_STATE_MAX - 1)
 +
 +/*
-+ * Neural network weight structure for an 8→8→1 MLP (scalar regression).
++ * Neural network weights for an 8-input MLP with an ordinal survival head.
 + *
-+ * The NN outputs a single log2 correction factor applied to sleep_length:
-+ *   effective_sleep = exp2(log2(sleep_length) + nn_output)
-+ * State selection is then deterministic: pick the deepest state whose
-+ * cost (target_residency + exit_latency) ≤ effective_sleep.
++ * The trunk maps input[8] → hidden[8] (ReLU), feeding a shared linear score
++ *   s = w_out . hidden + b_out
++ * which is the input to a proportional-odds ordinal head. For each idle-state
++ * boundary k the predicted survival probability that the upcoming idle reaches
++ * that state's target_residency is
++ *   q_k = sigmoid(s - thr_ord[k-1]).
++ * With ordered thresholds this represents the idle-duration distribution at
++ * exactly the points the decision needs (the sufficient statistic), rather
++ * than a single point estimate. The decision layer compares q_k against a
++ * calibrated confidence level (see nap_fpu_select()).
 + *
 + * Column-major storage: w_h1[j][i] = weight from input j to hidden neuron i.
 + * This layout enables efficient column-wise matrix-vector products where
 + * each input broadcasts across all hidden neurons via SIMD FMA.
 + *
-+ * __aligned(32) ensures AVX2 vmovaps (32-byte aligned) loads work
-+ * correctly.  8 floats = 32 bytes = one ymm register.
++ * thr_ord is appended after the SIMD-accessed fields so their offsets are
++ * unchanged. __aligned(32) ensures AVX2 vmovaps (32-byte) aligned loads
++ * work correctly (8 floats = 32 bytes = one ymm register).
 + */
 +struct nap_weights {
 +	/* Hidden layer: input[8] → hidden[8] */
 +	float w_h1[NAP_INPUT_SIZE][NAP_HIDDEN_SIZE];  /* 64 params */
 +	float b_h1[NAP_HIDDEN_SIZE];                   /* 8 params  */
-+	/* Output layer: hidden[8] → 1 scalar */
++	/* Shared score head: hidden[8] → scalar s */
 +	float w_out[NAP_HIDDEN_SIZE];                  /* 8 params  */
 +	float b_out;                                   /* 1 param   */
++	/* Ordinal survival head: one ordered threshold per state boundary */
++	float thr_ord[NAP_NUM_CUTS];
 +} __aligned(32);
 +
 +/* ISA-specific forward pass implementations */
@@ -829,6 +788,7 @@ index 0000000000..1059db983b
 +			 float *hidden_save, const struct nap_weights *w);
 +void nap_nn_forward_avx2(const float *input, float *output,
 +			 float *hidden_save, const struct nap_weights *w);
++
 +/* ISA-specific online learning (backpropagation) */
 +struct nap_cpu_data;
 +void nap_nn_learn_sse2(struct nap_cpu_data *d);
@@ -955,20 +915,18 @@ index 0000000000..1059db983b
 + * POLL short-circuit tunables
 + * ================================================================ */
 +
-+/* Minimum and safety-margin values for dev->poll_limit_ns written
-+ * by nap_compute_poll_limit().  Both are 1 µs: the POLL state
-+ * itself checks its timeout only every ~1 µs (POLL_IDLE_RELAX_COUNT
-+ * cpu_relax() iterations in drivers/cpuidle/poll_state.c), so
-+ * finer-grained values would not produce distinguishable behavior.
++/* dev->poll_limit_ns floor and safety margin written by
++ * nap_compute_poll_limit().  Both 1 us: the POLL state samples its
++ * own timeout only every ~1 us (POLL_IDLE_RELAX_COUNT cpu_relax()
++ * iterations in poll_state.c), so finer values are indistinguishable.
 + */
 +#define NAP_POLL_LIMIT_MIN_NS      1000ULL
 +#define NAP_POLL_LIMIT_MARGIN_NS   1000ULL
 +
-+/* Refresh interval for the cached minimum-valid-state lookup.
-+ * HZ jiffies (= 1 second) bounds the staleness window caused by
-+ * sysfs-driven or runtime-driver state disable events.  PM QoS
-+ * latency changes are detected immediately via the cached
-+ * latency_req comparison.
++/* Refresh interval for the cached minimum-valid-state lookup.  HZ
++ * jiffies (1 s) bounds staleness from sysfs/runtime state-disable
++ * events; PM QoS latency changes are detected immediately via the
++ * cached latency_req comparison.
 + */
 +#define NAP_MIN_STATE_REFRESH_JIFFIES  HZ
 +
@@ -991,21 +949,21 @@ index 0000000000..1059db983b
 +	s64     last_predicted_ns;
 +	s64     last_prediction_error;
 +
-+	/* Short-circuit fast path (§3.1, §3.2, §3.4 of spec) */
++	/* POLL short-circuit fast path */
 +	bool short_circuited;			/* set in select, read in reflect */
 +	int  cached_min_state;			/* cached shallowest valid state */
 +	s64  cached_min_state_latency;		/* latency_req when cache populated */
 +	unsigned long cached_min_state_jiffies;	/* jiffies when cache populated */
 +
-+	/* Jiffies-based learning rate floor (§3.5 of spec) */
++	/* Jiffies-based learning rate floor */
 +	unsigned long last_learn_jiffies;
-+	unsigned int  learn_jiffies_min;	/* sysfs-tunable, 0 = disabled */
++	unsigned int  learn_jiffies_min;	/* 0 = disabled */
 +
 +	/* select/reflect handoff */
 +	int   last_selected_idx;
 +
-+	/* NN scalar output: log2 correction factor for sleep_length.
-+	 * effective_sleep = exp2(log2(sleep_length) + nn_output).
++	/* Shared ordinal score s (≈ log2 of the predicted idle duration in ns).
++	 * Survival at boundary k is sigmoid(s - thr_ord[k-1]).
 +	 */
 +	float nn_output;
 +
@@ -1015,38 +973,38 @@ index 0000000000..1059db983b
 +	 * nap_extract_features():
 +	 *   SSE2:    movaps  (16-byte aligned)
 +	 *   AVX2:    vmovaps (32-byte aligned)
-+	 * Without __aligned(64), the natural struct offset would be
++	 * Without __aligned(32), the natural struct offset would be
 +	 * only 4-byte aligned, causing #GP faults in the idle task.
 +	 */
 +	float hidden_out[NAP_HIDDEN_SIZE] __aligned(32);
 +	float features_f32[NAP_INPUT_SIZE] __aligned(32);
 +
 +	/* Backprop scratch */
-+	float learn_d_out;	/* output gradient direction (±1) */
-+	float learn_lr;		/* effective lr (base_lr * asymmetric weight) */
++	float learn_d_out;	/* score gradient g = sum_k (q_k - y_k) */
++	float learn_lr;		/* effective learning rate (symmetric) */
 +	float learn_d_hid[NAP_HIDDEN_SIZE] __aligned(32);
 +
-+	/* Precomputed per-state log2(target_residency) for threshold selection.
-+	 * log2_cost[i] = log2(target_residency_ns).
++	/* Precomputed per-state log2 thresholds.
++	 * log2_tres[i] = log2(target_residency_ns) (ordinal thresholds, timer clamp)
 +	 */
-+	float log2_cost[CPUIDLE_STATE_MAX];
++	float log2_tres[CPUIDLE_STATE_MAX];
++
++	/* Decayed per-bin idle histogram: robustness-floor survival estimate */
++	float bin_count[CPUIDLE_STATE_MAX];
 +
 +	/* Deferred learning data */
 +	bool  needs_learn;
-+	bool  output_clamped;	/* true if nn_output was clamped to features[0] */
++	bool  have_sample;	/* a fresh residency awaits per-idle processing */
 +	u64   learn_actual_ns;
 +
-+	/* Mixture-of-Experts: 3 experts × 8 neurons each */
-+	struct nap_weights expert_weights[NAP_NUM_EXPERTS];
-+	struct nap_weights *active_w;	/* selected expert for current/deferred pass */
-+	int   active_expert;		/* 0, 1, or 2: which expert is active */
-+	float expert_mid;		/* log2 threshold: short ↔ long */
-+	float expert_deep;		/* log2 threshold: long ↔ deep */
++	/* Single network: 8→8 trunk + ordinal survival head */
++	struct nap_weights weights;
++	struct nap_weights *active_w;	/* always &weights; consumed by SIMD forward/learn */
 +
 +	/* Online learning */
 +	unsigned int learning_rate_millths;
 +	unsigned int max_grad_norm_millths;
-+	unsigned int overshoot_pctl_millths; /* quantile target (250 = 25th pctl) */
++	unsigned int conf_millths;	/* decision confidence level (500 = 0.5) */
 +	int   learn_interval;
 +	int   learn_counter;
 +	bool reset_pending;		/* set by sysfs, consumed by nap_select */
@@ -1069,10 +1027,10 @@ index 0000000000..1059db983b
 +#endif /* NAP_H */
 diff --git a/drivers/cpuidle/governors/nap/nap_fpu.c b/drivers/cpuidle/governors/nap/nap_fpu.c
 new file mode 100644
-index 0000000000..482a06a5d0
+index 0000000000..9465262969
 --- /dev/null
 +++ b/drivers/cpuidle/governors/nap/nap_fpu.c
-@@ -0,0 +1,572 @@
+@@ -0,0 +1,528 @@
 +// SPDX-License-Identifier: GPL-2.0
 +/*
 + * nap_fpu.c — FPU/SIMD code for the NAP cpuidle governor
@@ -1140,6 +1098,51 @@ index 0000000000..482a06a5d0
 +	return e + p;
 +}
 +
++/*
++ * Scalar 2^x approximation: integer part via exponent bits, fractional part
++ * via a Taylor cubic on [0,1) (rel. error < 0.6%).  Used to build the logistic.
++ */
++static inline float fast_exp2f(float x)
++{
++	union { u32 i; float f; } v;
++	int xi;
++	float f;
++
++	if (x > 60.0f)
++		x = 60.0f;
++	else if (x < -60.0f)
++		x = -60.0f;
++
++	xi = (int)x;
++	if (x < (float)xi)
++		xi--;			/* floor toward negative infinity */
++	f = x - (float)xi;
++
++	v.i = (u32)((xi + 127) << 23);	/* 2^xi */
++	return v.f * (1.0f + f * (0.6931472f +
++			f * (0.2402265f + f * 0.0555041f)));
++}
++
++/* Logistic sigmoid: sigmoid(x) = 1 / (1 + e^-x) = 1 / (1 + 2^(-x*log2(e))) */
++static inline float nap_sigmoidf(float x)
++{
++	return 1.0f / (1.0f + fast_exp2f(-1.4426950f * x));
++}
++
++/*
++ * Robustness floor and Beta-Binomial shrinkage.
++ *
++ * bin_count[] is an exponentially decayed histogram (window NAP_FLOOR_WIN, in
++ * idles) of which idle-state bin each idle landed in, updated every idle; its
++ * survival estimate is a fast, forgetting-resistant memory.  The decision
++ * treats the NN survival as a prior worth NAP_PRIOR_K pseudo-observations and
++ * the decayed histogram as data:
++ *   q_k = (NAP_PRIOR_K * q_nn_k + count(>=k)) / (NAP_PRIOR_K + total).
++ * Cold (no data) follows the NN; once the histogram fills it dominates.
++ */
++#define NAP_FLOOR_WIN  256
++#define NAP_PRIOR_K    16
++
 +/* ================================================================
 + * Deterministic PRNG for weight initialization (LCG)
 + * ================================================================ */
@@ -1227,106 +1230,31 @@ index 0000000000..482a06a5d0
 +}
 +
 +/*
-+ * Precompute log2(target_residency) per state for threshold-based selection.
-+ *
-+ * Used in the selection loop: pick deepest state where
-+ * log2_cost[i] <= nn_output (predicted sleep time in log2 space).
-+ *
-+ * Only target_residency_ns is used — exit_latency is a wakeup cost,
-+ * not a factor in whether the CPU can profitably stay in the state
-+ * for the predicted duration.
++ * Precompute log2(target_residency) per state and seed the ordinal
++ * thresholds.  log2_tres[k] is the boundary location in score space: it
++ * seeds thr_ord[k-1], bounds its learned drift, and clamps the score
++ * against the timer in the decision layer.
 + */
-+static void nap_init_log2_cost(struct nap_cpu_data *d,
++static void nap_init_log2_tres(struct nap_cpu_data *d,
 +			       struct cpuidle_driver *drv)
 +{
-+	float log2_tick;
-+	int long_start, deep_idx;
 +	int i;
 +
 +	for (i = 0; i < drv->state_count; i++) {
-+		float res = float_max(
++		float tres = float_max(
 +			(float)drv->states[i].target_residency_ns, 1.0f);
-+		d->log2_cost[i] = fast_log2f(res);
-+	}
 +
-+	/*
-+	 * MoE expert boundaries — 3-way split.
-+	 *
-+	 * Expert 0 (short): tick-bound idles where measured residency
-+	 *   is dominated by the next tick rather than the workload's
-+	 *   true idle duration.  Boundary: log2(TICK_NSEC).
-+	 *
-+	 * Expert 1 (long): nohz idles in intermediate C-states.
-+	 *
-+	 * Expert 2 (deep): idles targeting the deepest C-state.
-+	 *   The deepest state often has qualitatively different
-+	 *   residency characteristics (package C-state, longer
-+	 *   exit latency, power-gated domains) that warrant a
-+	 *   dedicated expert to avoid gradient interference with
-+	 *   intermediate states.
-+	 *
-+	 * Safety: with only 2 C-states (+ POLL), expert_deep is
-+	 * placed equal to expert_mid so the deep expert is never
-+	 * routed (same behavior as the old 2-expert split).
-+	 */
-+	if (drv->state_count <= 1) {
-+		d->expert_mid = 0.0f;
-+		d->expert_deep = 0.0f;
-+		return;
-+	}
-+
-+	log2_tick = fast_log2f((float)TICK_NSEC);
-+
-+	/* Default: deepest state belongs to long expert (safety) */
-+	long_start = drv->state_count - 1;
-+
-+	/* Prefer the first state whose target_residency exceeds one jiffy */
-+	for (i = 1; i < drv->state_count; i++) {
-+		if (d->log2_cost[i] > log2_tick) {
-+			long_start = i;
-+			break;
-+		}
-+	}
-+
-+	if (long_start > 1) {
-+		/* Normal case: boundary between last short and first long */
-+		d->expert_mid = (d->log2_cost[long_start - 1] +
-+				 d->log2_cost[long_start]) / 2.0f;
-+	} else {
-+		/*
-+		 * long_start == 1: even the shallowest C-state already
-+		 * exceeds one jiffy.  All NN-handled idles go to the
-+		 * long expert; place the boundary just below C1's
-+		 * residency so the short expert remains routable but
-+		 * unused.
-+		 */
-+		d->expert_mid = d->log2_cost[1] - 1.0f;
++		d->log2_tres[i] = fast_log2f(tres);
 +	}
 +
 +	/*
-+	 * Deep expert boundary — deepest C-state split.
-+	 *
-+	 * When there are >= 3 C-states (state_count >= 4, counting POLL),
-+	 * place the boundary at the midpoint between the second-deepest
-+	 * and deepest state's log2(target_residency).  The deep expert
-+	 * then exclusively handles sleep durations long enough to reach
-+	 * the deepest state.
-+	 *
-+	 * With only 2 C-states, expert_deep == expert_mid collapses to
-+	 * the 2-expert regime (expert 2 is never selected).
++	 * Seed each ordinal threshold at its boundary's log2(target_residency),
++	 * so before learning q_k crosses 0.5 exactly when the score (initially
++	 * ~= log2(sleep_length)) reaches that state's target_residency.  This
++	 * reproduces the deepest-state-that-fits default until learning adapts.
 +	 */
-+	deep_idx = drv->state_count - 1;
-+	if (deep_idx >= 3) {
-+		/* >= 3 C-states: split before the deepest */
-+		d->expert_deep = (d->log2_cost[deep_idx - 1] +
-+				  d->log2_cost[deep_idx]) / 2.0f;
-+		/* Ensure deep > mid ordering */
-+		if (d->expert_deep <= d->expert_mid)
-+			d->expert_deep = d->expert_mid;
-+	} else {
-+		/* <= 2 C-states: collapse deep into long */
-+		d->expert_deep = d->expert_mid;
-+	}
++	for (i = 1; i < drv->state_count; i++)
++		d->weights.thr_ord[i - 1] = d->log2_tres[i];
 +}
 +
 +/* ================================================================
@@ -1400,18 +1328,6 @@ index 0000000000..482a06a5d0
 +	s->avg = sum / (float)n;
 +}
 +
-+/*
-+ * Extract 8 input features for the MLP.
-+ *
-+ *   [0] log2(sleep_length)           — next timer event
-+ *   [1] log2(last_residency)         — actual duration of last idle
-+ *   [2] log_hist avg                 — average recent idle duration
-+ *   [3] log_hist min                 — shortest recent idle
-+ *   [4] log_hist max                 — longest recent idle
-+ *   [5] signed log2(|pred_error|+1)  — prediction feedback
-+ *   [6] log2(busy_ns)               — pre-idle busy duration
-+ *   [7] log2(lat_req) - log2(deepest_lat) — PM QoS headroom
-+ */
 +static void nap_extract_features(struct cpuidle_driver *drv,
 +				 struct cpuidle_device *dev,
 +				 float out[NAP_INPUT_SIZE],
@@ -1508,86 +1424,70 @@ index 0000000000..482a06a5d0
 +
 +	/* Handle deferred weight reset (set by sysfs or nap_enable) */
 +	if (unlikely(d->reset_pending)) {
-+		int e;
-+
-+		for (e = 0; e < NAP_NUM_EXPERTS; e++)
-+			nap_init_weights(&d->expert_weights[e]);
-+		nap_init_log2_cost(d, drv);
++		nap_init_weights(&d->weights);
++		nap_init_log2_tres(d, drv);
++		memset(d->bin_count, 0, sizeof(d->bin_count));
++		d->have_sample = false;
 +		d->stats.learn_count = 0;
 +		d->needs_learn = false;
 +		d->reset_pending = false;
 +	}
 +
-+	/* Deferred learning (always, even during warmup) */
-+	if (d->needs_learn) {
-+		float log2_eff = d->nn_output;
-+		float alpha = (float)d->overshoot_pctl_millths
-+			      / 1000.0f;
-+		int nn_selected = 0;
-+		bool is_overshoot;
-+		int i;
-+
-+		/* Simulate which state the NN selected */
-+		for (i = drv->state_count - 1; i > 0; i--) {
-+			if (d->log2_cost[i] <= log2_eff) {
-+				nn_selected = i;
-+				break;
-+			}
-+		}
-+
-+		/*
-+		 * Direct overshoot loss.
-+		 *
-+		 * Base the gradient on whether the simulated state
-+		 * selection actually caused overshoot
-+		 * (actual < target_residency).
-+		 *
-+		 * The asymmetric weight is encoded in the learning
-+		 * rate (not in d_out) so that gradient clamping
-+		 * cannot destroy the asymmetry.  d_out is ±1 and
-+		 * gets clipped symmetrically; the (1-α) vs α ratio
-+		 * is preserved through learn_lr.
-+		 *
-+		 * At equilibrium, P(overshoot) converges to α.
-+		 * α = overshoot_pctl / 1000.
-+		 */
-+		{
-+			float base_lr = (float)d->learning_rate_millths
-+					/ 1000.0f;
-+
-+			is_overshoot = (nn_selected > 0 &&
-+				d->learn_actual_ns <
-+				drv->states[nn_selected].target_residency_ns);
-+
-+			/*
-+			 * When the output was clamped at the upper
-+			 * limit (nn_output == features[0]), the NN
-+			 * is already predicting the maximum possible
-+			 * sleep time.  Non-overshoot events would
-+			 * push weights UP, but the output cannot
-+			 * actually increase.  Suppress this gradient
-+			 * to prevent unbounded weight growth in idle
-+			 * systems where natural overshoot rate < α.
-+			 *
-+			 * Overshoot events still learn normally
-+			 * (push DOWN) even when clamped.
-+			 */
-+			if (d->output_clamped && !is_overshoot) {
-+				d->learn_lr = 0;
-+				d->learn_d_out = 0;
-+			} else {
-+				d->learn_d_out = is_overshoot
-+					? 1.0f : -1.0f;
-+				d->learn_lr = is_overshoot
-+					? base_lr * (1.0f - alpha)
-+					: base_lr * alpha;
++	/*
++	 * Per-idle feedback against the just-realized idle duration.
++	 *
++	 * Every idle: update the decayed floor histogram so it stays current.
++	 * Only every learn_interval (needs_learn): apply the ordinal-threshold
++	 * updates and the trunk/score-head backprop, using the previous pass's
++	 * stored score, hidden activations and features.  Under the shared-score
++	 * proportional-odds model the gradient w.r.t. the score is the scalar
++	 * g = sum_k (q_k - y_k), which drives the existing SIMD backprop unchanged.
++	 * The loss is symmetric -- any responsiveness bias lives in the decision
++	 * layer, not here.
++	 */
++	if (d->have_sample) {
++		float decay = (float)(NAP_FLOOR_WIN - 1) / (float)NAP_FLOOR_WIN;
++		int k, label_bin = 0;
++
++		if (d->needs_learn) {
++			float base_lr = (float)d->learning_rate_millths / 1000.0f;
++			float clamp_val = (float)d->max_grad_norm_millths / 1000.0f;
++			float s = d->nn_output;
++			float g = 0.0f;
++
++			for (k = 1; k < drv->state_count; k++) {
++				float th = d->active_w->thr_ord[k - 1];
++				float q = nap_sigmoidf(s - th);
++				float y = (d->learn_actual_ns >=
++					   drv->states[k].target_residency_ns)
++					  ? 1.0f : 0.0f;
++				float err = q - y;
++				float lo = d->log2_tres[k] - 6.0f;
++				float hi = d->log2_tres[k] + 6.0f;
++
++				g += err;
++				d->active_w->thr_ord[k - 1] =
++					fclampf(th + fclampf(base_lr * err,
++							     -clamp_val, clamp_val),
++						lo, hi);
 +			}
++			d->learn_d_out = g;
++			d->learn_lr = base_lr;
++			d->stats.learn_count++;
++			nap_nn_learn(d);
++			d->needs_learn = false;
 +		}
 +
-+		d->stats.learn_count++;
++		/* Floor histogram update, every idle */
++		for (k = 1; k < drv->state_count; k++)
++			if (d->learn_actual_ns >=
++			    drv->states[k].target_residency_ns)
++				label_bin = k;
++		for (k = 0; k < drv->state_count; k++)
++			d->bin_count[k] *= decay;
++		d->bin_count[label_bin] += 1.0f;
 +
-+		nap_nn_learn(d);
-+		d->needs_learn = false;
++		d->have_sample = false;
 +	}
 +
 +	/*
@@ -1597,57 +1497,71 @@ index 0000000000..482a06a5d0
 +	 */
 +	nap_extract_features(drv, dev, d->features_f32, latency_req);
 +
-+	/* MoE: 3-way expert selection based on log2(sleep_length) */
-+	if (d->features_f32[0] >= d->expert_deep)
-+		d->active_expert = 2;		/* deep: deepest C-state */
-+	else if (d->features_f32[0] >= d->expert_mid)
-+		d->active_expert = 1;		/* long: nohz intermediate */
-+	else
-+		d->active_expert = 0;		/* short: tick-bound */
-+	d->active_w = &d->expert_weights[d->active_expert];
++	d->active_w = &d->weights;
 +
 +	nap_nn_forward(d->features_f32, &d->nn_output, d->hidden_out,
 +		       d->active_w);
 +
 +	/*
-+	 * Clamp NN output: predicted sleep cannot exceed sleep_length
-+	 * (next timer event).  features_f32[0] = log2(sleep_length).
++	 * Decision layer.
 +	 *
-+	 * Track whether the clamp was applied so the learning block
-+	 * can suppress "push up" gradients when the output is already
-+	 * at the maximum.  Without this, weights diverge unboundedly
-+	 * in idle systems where the natural overshoot rate < alpha.
-+	 */
-+	d->output_clamped = (d->nn_output > d->features_f32[0]);
-+	if (d->output_clamped)
-+		d->nn_output = d->features_f32[0];
-+
-+	/*
-+	 * Threshold-based selection using NN predicted sleep time.
-+	 *
-+	 * The NN directly outputs log2(predicted_sleep) in ns.
-+	 * Select the deepest feasible state whose cost ≤ predicted_sleep.
++	 * For each boundary k the survival probability q_k is a Beta-Binomial
++	 * shrinkage of the NN survival sigmoid(s - thr_ord) (a prior worth
++	 * NAP_PRIOR_K pseudo-observations) toward the decayed histogram (data):
++	 * the NN drives cold start, the floor takes over as it fills.  A running
++	 * minimum enforces a monotone non-increasing survival curve, and the next
++	 * timer event caps the reachable depth (a deeper state cannot be earned
++	 * past it).  The confidence level is the single responsiveness dial: pick
++	 * the deepest feasible state whose survival still meets it.
 +	 */
 +	{
-+		float log2_eff = d->nn_output;
-+		int idx = 0, i;
++		float conf = (float)d->conf_millths / 1000.0f;
++		float s = d->nn_output;
++		float sleep_log2 = d->features_f32[0];
++		float suffix[CPUIDLE_STATE_MAX];
++		float total = 0.0f;
++		float qmin = 1.0f;
++		int k, m = 0, idx = 0;
++
++		for (k = 0; k < drv->state_count; k++)
++			total += d->bin_count[k];
++
++		suffix[drv->state_count - 1] =
++			d->bin_count[drv->state_count - 1];
++		for (k = drv->state_count - 2; k >= 0; k--)
++			suffix[k] = suffix[k + 1] + d->bin_count[k];
++
++		for (k = 1; k < drv->state_count; k++) {
++			float q_nn = nap_sigmoidf(s - d->active_w->thr_ord[k - 1]);
++			float q = ((float)NAP_PRIOR_K * q_nn + suffix[k]) /
++				  ((float)NAP_PRIOR_K + total);
++
++			if (d->log2_tres[k] > sleep_log2)
++				q = 0.0f;	/* cannot idle past the next timer */
++			if (q < qmin)
++				qmin = q;
++			q = qmin;
++
++			if (q >= conf)
++				m = k;
++			else
++				break;
++		}
 +
-+		for (i = drv->state_count - 1; i > 0; i--) {
-+			if (dev->states_usage[i].disable)
++		for (k = m; k >= 1; k--) {
++			if (dev->states_usage[k].disable)
 +				continue;
-+			if (drv->states[i].exit_latency_ns > latency_req)
++			if (drv->states[k].exit_latency_ns > latency_req)
 +				continue;
-+			if (d->log2_cost[i] <= log2_eff) {
-+				idx = i;
-+				break;
-+			}
++			idx = k;
++			break;
 +		}
 +		return idx;
 +	}
 +}
 diff --git a/drivers/cpuidle/governors/nap/nap_nn_avx2.c b/drivers/cpuidle/governors/nap/nap_nn_avx2.c
 new file mode 100644
-index 0000000000..96e5415423
+index 0000000000..a43091793c
 --- /dev/null
 +++ b/drivers/cpuidle/governors/nap/nap_nn_avx2.c
 @@ -0,0 +1,135 @@
@@ -1655,7 +1569,7 @@ index 0000000000..96e5415423
 +/*
 + * nap_nn_avx2.c — AVX2+FMA forward pass and backpropagation for the nap MLP
 + *
-+ * 8→8→1 scalar regression (log2 correction factor).
++ * 8→8 trunk + scalar score s feeding the ordinal survival head.
 + * Uses 256-bit ymm registers: 8 hidden neurons = 1 ymm.
 + * FMA via vfmadd231ps for fused multiply-add.
 + *
@@ -1788,7 +1702,7 @@ index 0000000000..96e5415423
 +}
 diff --git a/drivers/cpuidle/governors/nap/nap_nn_sse2.c b/drivers/cpuidle/governors/nap/nap_nn_sse2.c
 new file mode 100644
-index 0000000000..a9fffb3b98
+index 0000000000..0f2a6f131f
 --- /dev/null
 +++ b/drivers/cpuidle/governors/nap/nap_nn_sse2.c
 @@ -0,0 +1,136 @@
@@ -1796,7 +1710,7 @@ index 0000000000..a9fffb3b98
 +/*
 + * nap_nn_sse2.c — SSE2 forward pass and backpropagation for the nap MLP
 + *
-+ * 8→8→1 scalar regression (log2 correction factor).
++ * 8→8 trunk + scalar score s feeding the ordinal survival head.
 + * Baseline implementation using SSE2, which is always available on x86_64.
 + * No FMA — uses separate mul + add (2 instructions per MAC).
 + *
@@ -1930,3 +1844,4 @@ index 0000000000..a9fffb3b98
 +}
 -- 
 2.34.1
+
diff --git a/99-charcoal-sysctl.conf b/99-charcoal-sysctl.conf
new file mode 100644
index 0000000..0bddf16
--- /dev/null
+++ b/99-charcoal-sysctl.conf
@@ -0,0 +1,2 @@
+vm.kcompressd=256
+vm.vfs_cache_pressure=125
diff --git a/PKGBUILD b/PKGBUILD
index e84fdbf..2d1baba 100755
--- a/PKGBUILD
+++ b/PKGBUILD
@@ -4,8 +4,8 @@
 
 pkgbase=linux-charcoal-616
 _nepbase=linux-neptune-616
-_tag=6.16.12-valve23
-_ver=3
+_tag=6.16.12-valve24
+_ver=1
 pkgver=${_tag//-/.}.cc$_ver
 pkgrel=1
 pkgdesc='Linux'
@@ -54,6 +54,7 @@ source=(
   charcoal.conf
   65-adios.rules
   99-charcoal.sh
+  99-charcoal-sysctl.conf
   vangogh_allow_higher_cpu_freq.patch
   vangogh_higher_max_power_limit.patch
   drm_sched_rr_default.patch
@@ -94,15 +95,16 @@ source=(
   "git+https://github.com/forkymcforkface/xpad-noone.git#commit=8e903676dd9514c07ce5e06e43c5f7d8cc51cb7d"
   "git+https://github.com/atar-axis/xpadneo.git#tag=v$_xpadneo_version"
    6.16-poc-selector-v2.6.1.patch 
-   6.16-nap-v0.4.0.patch
+   6.16-nap-v0.5.0.patch
 )
-sha256sums=('4011d16fef57b8f04cbcddc0937819f7fd32225f65d63698afbd5dc6629d0ff0'
+sha256sums=('SKIP'
             '37452b4d09e5e42134ae24a61f2f656790837c327268074cf79d7dab3558b972'
             'd88eaf0f94bae470040e4882f334c05b1bb2ab0a99e4b7299aa0b2337810ab8d'
             'fd57213c524e24cd9c72e2fecd9b2005934b6099e209864e5a93eb03406fca21'
             'b831de1b98a2f77f636f4780e37ebfcb3a6829f94f5423eb04c4b26e64ac43b8'
             '52cbbf41450806d766260bc4f1ea055f6f9fdd55d37ad831840b16d505beb0cc'
             '0a6a7408ccc0c94b5cce50dabc7ee318abcc1b9eaaedd3d83fd7e7d5a73b4d4f'
+            '7b0a1d962dfbcc1cbec195a8abb5ad1ff1872fde0a2249bd5704367c023c6573' 
             '375c8e17daf9e60bc6c211dd73f0c67ec241bd40a83d812a08eeb42aab6128d9'
             '1c49146dc5878bfab32b331d11cb66d493670bbe590ff07c2050305911c281c3'
             '6e510d8b74798944b5cb84ac775156831410c853c8a03c2a3f79e9bc7be9c2e2'
@@ -143,7 +145,7 @@ sha256sums=('4011d16fef57b8f04cbcddc0937819f7fd32225f65d63698afbd5dc6629d0ff0'
             '1055bbbd32985017f4501d375648873bd598db084177d302aeeade56b47920e1'
             '26b3a811d38471a42229fa037cb6d2bb5ff78f19f45a17c7f263339ee67769a7'
             '14dabfb0452a3a817e8d809fb28eb7565512e95386d789c627b62baf136e001f'
-            '99d87a5c9cf47f257df81fabbabdcb9df02ff93c0c9caabf1bbd40d2e50fed6e')
+            'f665d6ba6fc18579083bf8ec7ec741d43495f16f9dcbc482a5bd928b1778b2d3')
 
 export KBUILD_BUILD_HOST=archlinux
 export KBUILD_BUILD_USER=$pkgbase
@@ -251,6 +253,8 @@ _package() {
   install -D -m 0644 -t "$pkgdir/etc/profile.d" ../99-charcoal.sh
   # Charcoal: Install udev rules
   install -D -m 0644 -t "$pkgdir/etc/udev/rules.d" ../65-adios.rules
+  # sysctl parameters to fix thrashing under heavy memory pressure
+  install -D -m 0644 -t "$pkgdir/etc/sysctl.d" ../99-charcoal-sysctl.conf
 
   # Charcoal: Install bundles DKMS modules
   ZSTD_CLEVEL=19 make LLVM=1 M=../ryzen_smu INSTALL_MOD_PATH="$pkgdir/usr" INSTALL_MOD_STRIP=1 DEPMOD=/doesnt/exist modules_install