From 6cba6b56bdc0270ec00da68a37f99e4a0b110afa Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jorge=20Luis=20Silv=C3=A9rio=20de=20Lima?=
 <jorgezarpon@msn.com>
Date: Sat, 30 May 2026 17:41:42 -0300
Subject: [PATCH 01/10] Change checksum to 'SKIP' in PKGBUILD

---
 PKGBUILD | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/PKGBUILD b/PKGBUILD
index e84fdbf..b5974b6 100755
--- a/PKGBUILD
+++ b/PKGBUILD
@@ -110,7 +110,7 @@ sha256sums=('4011d16fef57b8f04cbcddc0937819f7fd32225f65d63698afbd5dc6629d0ff0'
             'e58e21581a509d0617591311b1d9ab8669f46046f2949e42d6149b0bb11ead87'
             '4bcf61814a6daac8f72c46a425b9ce88c07f6bd95f6a0ac287d73dfd4d5da60b'
             'ff3bbe78d6f072d57f567878e870956242ee78ccddd258b1ec2e4729621138fe'
-            'ab6b17b1f9cc4b322f0050d2e8cede75e44e069854e9bdc22068356530d628e8'
+            'SKIP'
             '11fe52062dedc9c2016fafc98899f4afb4cbd5327bd985c8d813dc72461f503a'
             '9df628fd530950e37d31da854cb314d536f33c83935adf5c47e71266a55f7004'
             '9e7b20068cdfe6a00b64d7488bdc47966fa130a07a3eae02fa57caef5d35d4ec'

From 44bd4d5ed423e0ac42250bf52588c78cc5e0ebef Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jorge=20Luis=20Silv=C3=A9rio=20de=20Lima?=
 <jorgezarpon@msn.com>
Date: Sat, 30 May 2026 18:12:07 -0300
Subject: [PATCH 02/10] Add sysctl configuration for memory management

---
 99-charcoal-sysctl.conf | 2 ++
 1 file changed, 2 insertions(+)
 create mode 100644 99-charcoal-sysctl.conf

diff --git a/99-charcoal-sysctl.conf b/99-charcoal-sysctl.conf
new file mode 100644
index 0000000..0bddf16
--- /dev/null
+++ b/99-charcoal-sysctl.conf
@@ -0,0 +1,2 @@
+vm.kcompressd=256
+vm.vfs_cache_pressure=125

From b9988369ff2eec27593f1ce39706ce16f2b940ca Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jorge=20Luis=20Silv=C3=A9rio=20de=20Lima?=
 <jorgezarpon@msn.com>
Date: Sat, 30 May 2026 18:25:26 -0300
Subject: [PATCH 03/10] Add sysctl configuration for memory pressure handling

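Register 99-charcoal-sysctl.conf in the PKGBUILD sources (with a checksum entry) and install it to /etc/sysctl.d so the memory-management sysctl parameters are applied under heavy memory pressure.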
---
 PKGBUILD | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/PKGBUILD b/PKGBUILD
index b5974b6..59aa391 100755
--- a/PKGBUILD
+++ b/PKGBUILD
@@ -54,6 +54,7 @@ source=(
   charcoal.conf
   65-adios.rules
   99-charcoal.sh
+  99-charcoal-sysctl.conf
   vangogh_allow_higher_cpu_freq.patch
   vangogh_higher_max_power_limit.patch
   drm_sched_rr_default.patch
@@ -103,6 +104,7 @@ sha256sums=('4011d16fef57b8f04cbcddc0937819f7fd32225f65d63698afbd5dc6629d0ff0'
             'b831de1b98a2f77f636f4780e37ebfcb3a6829f94f5423eb04c4b26e64ac43b8'
             '52cbbf41450806d766260bc4f1ea055f6f9fdd55d37ad831840b16d505beb0cc'
             '0a6a7408ccc0c94b5cce50dabc7ee318abcc1b9eaaedd3d83fd7e7d5a73b4d4f'
+            'SKIP' 
             '375c8e17daf9e60bc6c211dd73f0c67ec241bd40a83d812a08eeb42aab6128d9'
             '1c49146dc5878bfab32b331d11cb66d493670bbe590ff07c2050305911c281c3'
             '6e510d8b74798944b5cb84ac775156831410c853c8a03c2a3f79e9bc7be9c2e2'
@@ -251,6 +253,8 @@ _package() {
   install -D -m 0644 -t "$pkgdir/etc/profile.d" ../99-charcoal.sh
   # Charcoal: Install udev rules
   install -D -m 0644 -t "$pkgdir/etc/udev/rules.d" ../65-adios.rules
+  # sysctl parameters to fix thrashing under heavy memory pressure
+  install -D -m 0644 -t "$pkgdir/etc/sysctl.d" ../99-charcoal-sysctl.conf
 
   # Charcoal: Install bundles DKMS modules
   ZSTD_CLEVEL=19 make LLVM=1 M=../ryzen_smu INSTALL_MOD_PATH="$pkgdir/usr" INSTALL_MOD_STRIP=1 DEPMOD=/doesnt/exist modules_install

From 6c7aec40d07eb6d6f9c390534eedd6625bcf4282 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jorge=20Luis=20Silv=C3=A9rio=20de=20Lima?=
 <jorgezarpon@msn.com>
Date: Sat, 30 May 2026 19:05:20 -0300
Subject: [PATCH 04/10] Update checksums in PKGBUILD

---
 PKGBUILD | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/PKGBUILD b/PKGBUILD
index 59aa391..b879d9f 100755
--- a/PKGBUILD
+++ b/PKGBUILD
@@ -104,7 +104,7 @@ sha256sums=('4011d16fef57b8f04cbcddc0937819f7fd32225f65d63698afbd5dc6629d0ff0'
             'b831de1b98a2f77f636f4780e37ebfcb3a6829f94f5423eb04c4b26e64ac43b8'
             '52cbbf41450806d766260bc4f1ea055f6f9fdd55d37ad831840b16d505beb0cc'
             '0a6a7408ccc0c94b5cce50dabc7ee318abcc1b9eaaedd3d83fd7e7d5a73b4d4f'
-            'SKIP' 
+            '7b0a1d962dfbcc1cbec195a8abb5ad1ff1872fde0a2249bd5704367c023c6573' 
             '375c8e17daf9e60bc6c211dd73f0c67ec241bd40a83d812a08eeb42aab6128d9'
             '1c49146dc5878bfab32b331d11cb66d493670bbe590ff07c2050305911c281c3'
             '6e510d8b74798944b5cb84ac775156831410c853c8a03c2a3f79e9bc7be9c2e2'
@@ -112,7 +112,7 @@ sha256sums=('4011d16fef57b8f04cbcddc0937819f7fd32225f65d63698afbd5dc6629d0ff0'
             'e58e21581a509d0617591311b1d9ab8669f46046f2949e42d6149b0bb11ead87'
             '4bcf61814a6daac8f72c46a425b9ce88c07f6bd95f6a0ac287d73dfd4d5da60b'
             'ff3bbe78d6f072d57f567878e870956242ee78ccddd258b1ec2e4729621138fe'
-            'SKIP'
+            'df38dc7a2bd45ebacf34de8182e7df50f7ea871715b0ab4798f40485ba7fd2f0'
             '11fe52062dedc9c2016fafc98899f4afb4cbd5327bd985c8d813dc72461f503a'
             '9df628fd530950e37d31da854cb314d536f33c83935adf5c47e71266a55f7004'
             '9e7b20068cdfe6a00b64d7488bdc47966fa130a07a3eae02fa57caef5d35d4ec'

From 23596af86120aa78bf701cbd08fa7001ea600a46 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jorge=20Luis=20Silv=C3=A9rio=20de=20Lima?=
 <jorgezarpon@msn.com>
Date: Sun, 7 Jun 2026 18:38:51 -0300
Subject: [PATCH 05/10] Delete 6.16-nap-v0.4.0.patch

---
 6.16-nap-v0.4.0.patch | 1932 -----------------------------------------
 1 file changed, 1932 deletions(-)
 delete mode 100644 6.16-nap-v0.4.0.patch

diff --git a/6.16-nap-v0.4.0.patch b/6.16-nap-v0.4.0.patch
deleted file mode 100644
index 9b40d91..0000000
--- a/6.16-nap-v0.4.0.patch
+++ /dev/null
@@ -1,1932 +0,0 @@
-From 1d2e8272f288fecce3fd7f762fb8c628ed04b7fe Mon Sep 17 00:00:00 2001
-From: Masahito S <firelzrd@gmail.com>
-Date: Wed, 15 Apr 2026 08:37:01 +0900
-Subject: [PATCH] 6.16 backport: 6.18.3-nap-v0.4.0
-
-Backport of NAP cpuidle governor to Linux 6.16.
-No functional changes except added RESIDENCY_THRESHOLD_NS definition.
-
-Signed-off-by: Masahito S <firelzrd@gmail.com>
----
- drivers/cpuidle/Kconfig                     |  17 +
- drivers/cpuidle/governors/Makefile          |   1 +
- drivers/cpuidle/governors/nap/Makefile      |  29 +
- drivers/cpuidle/governors/nap/nap.c         | 671 ++++++++++++++++++++
- drivers/cpuidle/governors/nap/nap.h         | 283 +++++++++
- drivers/cpuidle/governors/nap/nap_fpu.c     | 572 +++++++++++++++++
- drivers/cpuidle/governors/nap/nap_nn_avx2.c | 135 ++++
- drivers/cpuidle/governors/nap/nap_nn_sse2.c | 136 ++++
- 8 files changed, 1844 insertions(+)
- create mode 100644 drivers/cpuidle/governors/nap/Makefile
- create mode 100644 drivers/cpuidle/governors/nap/nap.c
- create mode 100644 drivers/cpuidle/governors/nap/nap.h
- create mode 100644 drivers/cpuidle/governors/nap/nap_fpu.c
- create mode 100644 drivers/cpuidle/governors/nap/nap_nn_avx2.c
- create mode 100644 drivers/cpuidle/governors/nap/nap_nn_sse2.c
-
-diff --git a/drivers/cpuidle/Kconfig b/drivers/cpuidle/Kconfig
-index cac5997dca..9b6c50f0d8 100644
---- a/drivers/cpuidle/Kconfig
-+++ b/drivers/cpuidle/Kconfig
-@@ -44,6 +44,23 @@ config CPU_IDLE_GOV_HALTPOLL
- 
- 	  Some virtualized workloads benefit from using it.
- 
-+config CPU_IDLE_GOV_NAP
-+	bool "Neural Adaptive Predictor (NAP) governor"
-+	depends on X86_64
-+	default y
-+	help
-+	  A machine-learning-based cpuidle governor that uses a small
-+	  neural network (MLP 16→16→10) to predict the optimal idle
-+	  state.  Weights are initialized from hardware idle-state
-+	  parameters and refined via online learning (deferred
-+	  backpropagation with SGD).  Requires SSE2 at minimum;
-+	  AVX2/AVX-512 are used when available.
-+
-+	  This is experimental. Select via cpuidle.governor=nap on
-+	  the kernel command line.
-+
-+	  If unsure, say Y.
-+
- config DT_IDLE_STATES
- 	bool
- 
-diff --git a/drivers/cpuidle/governors/Makefile b/drivers/cpuidle/governors/Makefile
-index 63abb5393a..ae688891c0 100644
---- a/drivers/cpuidle/governors/Makefile
-+++ b/drivers/cpuidle/governors/Makefile
-@@ -7,3 +7,4 @@ obj-$(CONFIG_CPU_IDLE_GOV_LADDER) += ladder.o
- obj-$(CONFIG_CPU_IDLE_GOV_MENU) += menu.o
- obj-$(CONFIG_CPU_IDLE_GOV_TEO) += teo.o
- obj-$(CONFIG_CPU_IDLE_GOV_HALTPOLL) += haltpoll.o
-+obj-$(CONFIG_CPU_IDLE_GOV_NAP) += nap/
-diff --git a/drivers/cpuidle/governors/nap/Makefile b/drivers/cpuidle/governors/nap/Makefile
-new file mode 100644
-index 0000000000..8c4a17d8e2
---- /dev/null
-+++ b/drivers/cpuidle/governors/nap/Makefile
-@@ -0,0 +1,33 @@
-+# SPDX-License-Identifier: GPL-2.0-only
-+#
-+# Makefile for the NAP cpuidle governor
-+#
-+
-+obj-$(CONFIG_CPU_IDLE_GOV_NAP) += cpuidle_gov_nap.o
-+
-+cpuidle_gov_nap-y := nap.o nap_fpu.o nap_nn_sse2.o nap_nn_avx2.o
-+
-+# Kernel builds with -mno-sse -mno-sse2 -mno-avx -msoft-float -mno-80387
-+# -mno-fp-ret-in-387.  FPU/SIMD-using files need these removed and ISA
-+# flags explicitly added.
-+#
-+# CRITICAL: nap.o is intentionally compiled with NORMAL kernel flags
-+# (no FPU/SSE).  All floating-point code lives in nap_fpu.o and the
-+# nap_nn_*.o files.  This ensures the compiler cannot emit SSE instructions
-+# in governor callbacks (nap_select, nap_reflect, etc.), which would
-+# silently corrupt userspace FPU register state.
-+#
-+# Do NOT add CFLAGS_REMOVE/CFLAGS for nap.o — it must stay FPU-free.
-+FPU_KILL_FLAGS := -mno-sse -mno-sse2 -mno-mmx -mno-avx -mno-3dnow \
-+                  -mno-sse4a -msoft-float -mno-80387 -mno-fp-ret-in-387
-+
-+# LTO FIX: Disables LTO on standalone files to prevent intrusive inlining
-+# of FPU instructions and ensure that flags are preserved during linking.
-+CFLAGS_REMOVE_nap.o            += $(CC_FLAGS_LTO)
-+CFLAGS_REMOVE_nap_fpu.o        += $(CC_FLAGS_LTO) $(FPU_KILL_FLAGS)
-+CFLAGS_REMOVE_nap_nn_sse2.o    += $(CC_FLAGS_LTO) $(FPU_KILL_FLAGS)
-+CFLAGS_REMOVE_nap_nn_avx2.o    += $(CC_FLAGS_LTO) $(FPU_KILL_FLAGS)
-+
-+CFLAGS_nap_fpu.o       += $(CC_FLAGS_FPU)
-+CFLAGS_nap_nn_sse2.o   += $(CC_FLAGS_FPU)
-+CFLAGS_nap_nn_avx2.o   += $(CC_FLAGS_FPU) -mavx -mavx2 -mfma
-diff --git a/drivers/cpuidle/governors/nap/nap.c b/drivers/cpuidle/governors/nap/nap.c
-new file mode 100644
-index 0000000000..c72b67e9c3
---- /dev/null
-+++ b/drivers/cpuidle/governors/nap/nap.c
-@@ -0,0 +1,672 @@
-+// SPDX-License-Identifier: GPL-2.0
-+/*
-+ * nap.c — Neural Adaptive Predictor cpuidle governor
-+ *
-+ * A machine-learning-based cpuidle governor that uses a small MLP (8→8→1)
-+ * with 3 Mixture-of-Experts (short/long/deep) to predict a log2 correction
-+ * factor for sleep_length.  State selection is deterministic threshold
-+ * comparison.  Weights are Xavier-initialized at boot, then refined via
-+ * online learning (deferred backpropagation with SGD).
-+ *
-+ * IMPORTANT: This file is compiled WITHOUT FPU/SSE flags (normal kernel
-+ * compilation).  All floating-point and SIMD code lives in nap_fpu.c and
-+ * nap_nn_{sse2,avx2}.c, which are compiled with CC_FLAGS_FPU.
-+ * This separation ensures the compiler cannot emit SSE instructions in
-+ * governor callbacks (nap_select, nap_reflect, etc.), which would corrupt
-+ * userspace FPU register state.
-+ */
-+
-+#include <linux/cpuidle.h>
-+#include <linux/cpu.h>
-+#include <linux/jump_label.h>
-+#include <linux/kobject.h>
-+#include <linux/math64.h>
-+#include <linux/percpu.h>
-+#include <linux/sched/clock.h>
-+#include <linux/sysfs.h>
-+#include <linux/string.h>
-+#include <linux/tick.h>
-+#include <asm/simd.h>
-+#include <asm/fpu/api.h>
-+#include <asm/processor.h>
-+
-+#include "nap.h"
-+
-+#include "../gov.h"
-+
-+/**************************************************************
-+ * Version Information:
-+ */
-+
-+#define CPUIDLE_NAP_PROGNAME "Nap CPUIdle Governor"
-+#define CPUIDLE_NAP_AUTHOR   "Masahito Suzuki"
-+
-+#define CPUIDLE_NAP_VERSION  "0.4.0"
-+
-+/* Governor defaults */
-+#define NAP_DEFAULT_LR_MILLTHS    1     /* 0.001 = 1 millths */
-+#define NAP_DEFAULT_INTERVAL      4     /* learn every 4 reflects */
-+#define NAP_DEFAULT_CLAMP_MILLTHS 1000  /* 1.0 = 1000 millths */
-+#define NAP_DEFAULT_PCTL_MILLTHS  100   /* 10th percentile */
-+
-+/* Backport: RESIDENCY_THRESHOLD_NS was missing in original patch */
-+#define RESIDENCY_THRESHOLD_NS TICK_NSEC
-+
-+/* ================================================================
-+ * ISA dispatch via static keys (definitions only; dispatch in nap_fpu.c)
-+ * ================================================================ */
-+
-+DEFINE_STATIC_KEY_FALSE(nap_use_avx2);
-+
-+static void __init nap_detect_simd(void)
-+{
-+	if (boot_cpu_has(X86_FEATURE_FMA) &&
-+	    boot_cpu_has(X86_FEATURE_AVX2)) {
-+		static_branch_enable(&nap_use_avx2);
-+		pr_info("nap: using AVX2+FMA\n");
-+	} else {
-+		pr_info("nap: using SSE2\n");
-+	}
-+}
-+
-+/* ================================================================
-+ * Per-CPU data
-+ * ================================================================ */
-+
-+DEFINE_PER_CPU(struct nap_cpu_data, nap_data);
-+static struct cpuidle_driver *nap_cached_drv;
-+
-+/* ================================================================
-+ * Reflect-time updates (integer-only, no FPU needed)
-+ * ================================================================ */
-+
-+static void nap_history_update(struct nap_cpu_data *d, u64 measured_ns)
-+{
-+	d->history[d->hist_idx] = measured_ns;
-+	d->hist_idx = (d->hist_idx + 1) % NAP_HISTORY_SIZE;
-+	if (d->hist_count < NAP_HISTORY_SIZE)
-+		d->hist_count++;
-+
-+}
-+
-+static void nap_update_external_signals(struct nap_cpu_data *d)
-+{
-+	d->prev_idle_exit = local_clock();
-+}
-+
-+/* ================================================================
-+ * Governor callbacks
-+ * ================================================================ */
-+
-+/*
-+ * Return the shallowest C-state index that is both enabled and
-+ * satisfies the current latency request.  Returns 0 if no such
-+ * state exists (caller must treat 0 as "POLL is the only option").
-+ *
-+ * Called from the short-circuit path to decide whether the predicted
-+ * sleep length is worth entering any C-state at all.  Does not
-+ * consult the NN.
-+ */
-+static int nap_find_min_valid_state(struct cpuidle_driver *drv,
-+				    struct cpuidle_device *dev,
-+				    s64 latency_req)
-+{
-+	int i;
-+
-+	for (i = 1; i < drv->state_count; i++) {
-+		if (dev->states_usage[i].disable)
-+			continue;
-+		if (drv->states[i].exit_latency_ns > latency_req)
-+			continue;
-+		return i;
-+	}
-+	return 0;
-+}
-+
-+/*
-+ * Cached wrapper around nap_find_min_valid_state().
-+ *
-+ * Invalidation triggers:
-+ *   1. latency_req changed since last cached value (immediate; PM QoS
-+ *      updates propagate on the next nap_select call).
-+ *   2. NAP_MIN_STATE_REFRESH_JIFFIES elapsed since last refresh
-+ *      (bounded staleness for sysfs-driven or runtime-driver state
-+ *      disable events, which are rare).
-+ *
-+ * Hot path cost when the cache is valid: ~5-7 cycles (one s64
-+ * compare, one time_after() check, one conditional return).  The
-+ * uncached loop runs at most once per HZ jiffies per CPU.
-+ */
-+static inline int nap_get_min_valid_state(struct nap_cpu_data *d,
-+					   struct cpuidle_driver *drv,
-+					   struct cpuidle_device *dev,
-+					   s64 latency_req)
-+{
-+	if (unlikely(latency_req != d->cached_min_state_latency ||
-+		     time_after(jiffies,
-+				d->cached_min_state_jiffies +
-+				NAP_MIN_STATE_REFRESH_JIFFIES))) {
-+		d->cached_min_state = nap_find_min_valid_state(drv, dev,
-+							       latency_req);
-+		d->cached_min_state_latency = latency_req;
-+		d->cached_min_state_jiffies = jiffies;
-+	}
-+	return d->cached_min_state;
-+}
-+
-+/*
-+ * Compute dev->poll_limit_ns for the short-circuit path.
-+ *
-+ * Budget = predicted wake time (sleep_length) + 1 µs safety margin.
-+ * The margin absorbs timer jitter so a wake arriving slightly after
-+ * the predicted time does not trigger a select/enter/reflect retry
-+ * cycle.  It is consumed only when the wake is actually late; on-time
-+ * and early wakes exit POLL via need_resched without touching the
-+ * margin.
-+ *
-+ * Floor: NAP_POLL_LIMIT_MIN_NS (1 µs).  Below this, per-iteration
-+ * governor overhead exceeds actual polling, and POLL's own timeout
-+ * sampling granularity (~1.3 µs via POLL_IDLE_RELAX_COUNT cpu_relax
-+ * iterations) makes smaller limits indistinguishable in practice.
-+ *
-+ * Ceiling: min_state.target_residency_ns.  Beyond that point, the
-+ * C-state would have been a better choice than polling.
-+ */
-+static inline u64 nap_compute_poll_limit(u64 sleep_length_ns,
-+					 u64 min_state_target_ns)
-+{
-+	u64 budget = sleep_length_ns + NAP_POLL_LIMIT_MARGIN_NS;
-+
-+	return clamp_t(u64, budget,
-+		       NAP_POLL_LIMIT_MIN_NS,
-+		       min_state_target_ns);
-+}
-+
-+static int nap_fallback_heuristic(struct cpuidle_driver *drv,
-+				  struct cpuidle_device *dev)
-+{
-+	s64 latency_req = cpuidle_governor_latency_req(dev->cpu);
-+	ktime_t delta_tick;
-+	u64 sleep_length_ns;
-+	int i;
-+
-+	sleep_length_ns = ktime_to_ns(tick_nohz_get_sleep_length(&delta_tick));
-+
-+	for (i = drv->state_count - 1; i > 0; i--) {
-+		if (dev->states_usage[i].disable)
-+			continue;
-+		if (drv->states[i].exit_latency_ns > latency_req)
-+			continue;
-+		if (drv->states[i].target_residency_ns > sleep_length_ns)
-+			continue;
-+		return i;
-+	}
-+	return 0;
-+}
-+
-+static int nap_select(struct cpuidle_driver *drv,
-+		      struct cpuidle_device *dev,
-+		      bool *stop_tick)
-+{
-+	struct nap_cpu_data *d = this_cpu_ptr(&nap_data);
-+	s64 latency_req;
-+	ktime_t delta_tick;
-+	u64 sleep_length_ns;
-+	int idx, min_state;
-+
-+	if (unlikely(drv->state_count <= 1))
-+		return 0;
-+
-+	latency_req = cpuidle_governor_latency_req(dev->cpu);
-+	sleep_length_ns = ktime_to_ns(tick_nohz_get_sleep_length(&delta_tick));
-+
-+	min_state = nap_get_min_valid_state(d, drv, dev, latency_req);
-+
-+	/*
-+	 * Fast path: when no C-state can amortize its target residency
-+	 * within the predicted sleep length, the answer is deterministically
-+	 * POLL.  Skip NN inference and feature extraction entirely.
-+	 * nap_reflect also skips history update and learning for
-+	 * short-circuited events (see the short_circuited check there).
-+	 * See spec §3.1.
-+	 */
-+	if (min_state == 0 ||
-+	    sleep_length_ns < drv->states[min_state].target_residency_ns) {
-+
-+		if (min_state > 0)
-+			dev->poll_limit_ns = nap_compute_poll_limit(
-+				sleep_length_ns,
-+				drv->states[min_state].target_residency_ns);
-+		else
-+			dev->poll_limit_ns = max_t(u64, sleep_length_ns,
-+						   NAP_POLL_LIMIT_MIN_NS);
-+
-+		*stop_tick = false;
-+		d->last_selected_idx = 0;
-+		d->short_circuited = true;
-+		d->stats.total_selects++;
-+		return 0;
-+	}
-+
-+	/* Normal NN-driven path */
-+	d->short_circuited = false;
-+
-+	if (likely(may_use_simd())) {
-+		kernel_fpu_begin();
-+		idx = nap_fpu_select(drv, dev, d);
-+		kernel_fpu_end();
-+
-+		if (idx < 0)
-+			idx = nap_fallback_heuristic(drv, dev);
-+	} else {
-+		idx = nap_fallback_heuristic(drv, dev);
-+	}
-+
-+	*stop_tick = (drv->states[idx].target_residency_ns >
-+		      RESIDENCY_THRESHOLD_NS);
-+
-+	d->last_selected_idx = idx;
-+	d->stats.total_selects++;
-+
-+	return idx;
-+}
-+
-+static void nap_reflect(struct cpuidle_device *dev, int index)
-+{
-+	struct nap_cpu_data *d = this_cpu_ptr(&nap_data);
-+	struct cpuidle_driver *drv = cpuidle_get_cpu_driver(dev);
-+	u64 measured_ns = dev->last_residency_ns;
-+
-+	if (unlikely(!drv))
-+		return;
-+
-+	/*
-+	 * Short-circuited POLL: NN was not invoked for this idle
-+	 * event, so the residency does not belong to the NN's
-+	 * training distribution.  Update the aggregate residency
-+	 * statistic and return — history, hit_intercept, prediction
-+	 * error, external signals, and learning are all skipped.
-+	 * See spec §3.4.
-+	 */
-+	if (d->short_circuited) {
-+		d->stats.total_residency_ns += measured_ns;
-+		return;
-+	}
-+
-+	nap_history_update(d, measured_ns);
-+
-+	d->last_prediction_error = d->last_predicted_ns - (s64)measured_ns;
-+	nap_update_external_signals(d);
-+
-+	/*
-+	 * Dual gate: learn when both the per-N-reflect counter fires
-+	 * AND at least learn_jiffies_min jiffies have elapsed since
-+	 * the last learning step.  The time gate prevents sustained
-+	 * weight churn on workloads with very rapid idle bursts; a
-+	 * value of 0 disables it (restores the original counter-only
-+	 * behavior).  See spec §3.5.
-+	 */
-+	if (++d->learn_counter >= d->learn_interval &&
-+	    time_after_eq(jiffies,
-+			  d->last_learn_jiffies + d->learn_jiffies_min)) {
-+		d->learn_counter = 0;
-+		d->last_learn_jiffies = jiffies;
-+		d->learn_actual_ns = measured_ns;
-+		d->needs_learn = true;
-+	}
-+
-+	d->stats.total_residency_ns += measured_ns;
-+	if (index > 0 && measured_ns < drv->states[index].target_residency_ns)
-+		d->stats.overshoot_count++;
-+}
-+
-+static int nap_enable(struct cpuidle_driver *drv,
-+		      struct cpuidle_device *dev)
-+{
-+	struct nap_cpu_data *d = per_cpu_ptr(&nap_data, dev->cpu);
-+
-+	memset(d, 0, sizeof(*d));
-+
-+	/*
-+	 * Force first-call refresh of the min-valid-state cache.
-+	 * cached_min_state_latency = S64_MIN ensures the first
-+	 * nap_select() comparison will always trip the invalidation
-+	 * branch regardless of the actual latency_req value.
-+	 * cached_min_state itself is already zeroed by the memset above.
-+	 */
-+	d->cached_min_state_latency = S64_MIN;
-+	d->cached_min_state_jiffies = jiffies - NAP_MIN_STATE_REFRESH_JIFFIES;
-+
-+	/* Default: allow at most one learning step per jiffy */
-+	d->learn_jiffies_min = 1;
-+
-+	/*
-+	 * Defer weight initialization to the first nap_select() FPU path
-+	 * via reset_pending.  nap_enable() is called from cpuidle core
-+	 * (cpuidle_enable_device) which may run on a different CPU than
-+	 * dev->cpu during governor switch.  Deferring ensures FPU init
-+	 * happens on the correct CPU in its own idle context.
-+	 */
-+	WRITE_ONCE(nap_cached_drv, drv);
-+	d->learning_rate_millths  = NAP_DEFAULT_LR_MILLTHS;
-+	d->learn_interval = NAP_DEFAULT_INTERVAL;
-+	d->max_grad_norm_millths  = NAP_DEFAULT_CLAMP_MILLTHS;
-+	d->overshoot_pctl_millths = NAP_DEFAULT_PCTL_MILLTHS;
-+	d->reset_pending = true;
-+
-+	return 0;
-+}
-+
-+static void nap_disable(struct cpuidle_driver *drv,
-+			struct cpuidle_device *dev)
-+{
-+	WRITE_ONCE(nap_cached_drv, NULL);
-+}
-+
-+/* ================================================================
-+ * sysfs interface  (/sys/devices/system/cpu/nap/)
-+ * ================================================================ */
-+
-+static ssize_t stats_show(struct kobject *kobj,
-+			  struct kobj_attribute *attr, char *buf)
-+{
-+	int cpu, len = 0;
-+	u64 total_sel = 0, total_res = 0, total_under = 0, total_learn = 0;
-+
-+	for_each_online_cpu(cpu) {
-+		struct nap_cpu_data *d = &per_cpu(nap_data, cpu);
-+
-+		total_sel   += d->stats.total_selects;
-+		total_res   += d->stats.total_residency_ns;
-+		total_under += d->stats.overshoot_count;
-+		total_learn += d->stats.learn_count;
-+	}
-+
-+	len += sysfs_emit_at(buf, len, "total_selects: %llu\n", total_sel);
-+	len += sysfs_emit_at(buf, len, "total_residency_ms: %llu\n",
-+			     div_u64(total_res, NSEC_PER_MSEC));
-+	len += sysfs_emit_at(buf, len, "overshoot_count: %llu\n", total_under);
-+	len += sysfs_emit_at(buf, len, "overshoot_rate_permil: %llu\n",
-+			     total_sel ? div_u64(total_under * 1000, total_sel) : 0);
-+	len += sysfs_emit_at(buf, len, "learn_count: %llu\n", total_learn);
-+	return len;
-+}
-+
-+static ssize_t learning_rate_show(struct kobject *kobj,
-+				  struct kobj_attribute *attr, char *buf)
-+{
-+	int cpu;
-+
-+	cpu = cpumask_first(cpu_online_mask);
-+	if (cpu >= nr_cpu_ids)
-+		return sysfs_emit(buf, "0\n");
-+	return sysfs_emit(buf, "%u\n",
-+			  per_cpu(nap_data, cpu).learning_rate_millths);
-+}
-+
-+static ssize_t learning_rate_store(struct kobject *kobj,
-+				   struct kobj_attribute *attr,
-+				   const char *buf, size_t count)
-+{
-+	unsigned int val;
-+	int cpu;
-+
-+	if (kstrtouint(buf, 10, &val) || val == 0 || val > 100)
-+		return -EINVAL;
-+
-+	for_each_online_cpu(cpu)
-+		per_cpu(nap_data, cpu).learning_rate_millths = val;
-+
-+	return count;
-+}
-+
-+static ssize_t learn_interval_show(struct kobject *kobj,
-+				   struct kobj_attribute *attr, char *buf)
-+{
-+	int cpu;
-+
-+	cpu = cpumask_first(cpu_online_mask);
-+	if (cpu >= nr_cpu_ids)
-+		return sysfs_emit(buf, "0\n");
-+	return sysfs_emit(buf, "%d\n",
-+			  per_cpu(nap_data, cpu).learn_interval);
-+}
-+
-+static ssize_t learn_interval_store(struct kobject *kobj,
-+				    struct kobj_attribute *attr,
-+				    const char *buf, size_t count)
-+{
-+	unsigned int val;
-+	int cpu;
-+
-+	if (kstrtouint(buf, 10, &val) || val == 0 || val > 10000)
-+		return -EINVAL;
-+
-+	for_each_online_cpu(cpu)
-+		per_cpu(nap_data, cpu).learn_interval = val;
-+
-+	return count;
-+}
-+
-+static ssize_t learn_jiffies_min_show(struct kobject *kobj,
-+				      struct kobj_attribute *attr, char *buf)
-+{
-+	int cpu;
-+
-+	cpu = cpumask_first(cpu_online_mask);
-+	if (cpu >= nr_cpu_ids)
-+		return sysfs_emit(buf, "0\n");
-+	return sysfs_emit(buf, "%u\n",
-+			  per_cpu(nap_data, cpu).learn_jiffies_min);
-+}
-+
-+static ssize_t learn_jiffies_min_store(struct kobject *kobj,
-+				       struct kobj_attribute *attr,
-+				       const char *buf, size_t count)
-+{
-+	unsigned int val;
-+	int cpu;
-+
-+	if (kstrtouint(buf, 10, &val) || val > HZ * 3600)
-+		return -EINVAL;
-+
-+	for_each_online_cpu(cpu)
-+		per_cpu(nap_data, cpu).learn_jiffies_min = val;
-+
-+	return count;
-+}
-+
-+static ssize_t reset_weights_store(struct kobject *kobj,
-+				   struct kobj_attribute *attr,
-+				   const char *buf, size_t count)
-+{
-+	cpumask_var_t mask;
-+	int cpu;
-+
-+	if (!READ_ONCE(nap_cached_drv))
-+		return -ENODEV;
-+
-+	/*
-+	 * Set a per-CPU flag; each CPU will reinitialize its own weights
-+	 * inside nap_select() within its own kernel_fpu_begin/end context.
-+	 * This avoids cross-CPU data races on the weight arrays.
-+	 *
-+	 * Accepts "all" to reset every online CPU, or a cpulist
-+	 * (e.g. "0-3,5,7") to reset specific CPUs.
-+	 */
-+	if (sysfs_streq(buf, "all")) {
-+		for_each_online_cpu(cpu)
-+			per_cpu(nap_data, cpu).reset_pending = true;
-+		pr_info("nap: weight reset scheduled for all CPUs\n");
-+		return count;
-+	}
-+
-+	if (!alloc_cpumask_var(&mask, GFP_KERNEL))
-+		return -ENOMEM;
-+
-+	if (cpulist_parse(buf, mask)) {
-+		free_cpumask_var(mask);
-+		return -EINVAL;
-+	}
-+
-+	for_each_cpu_and(cpu, mask, cpu_online_mask)
-+		per_cpu(nap_data, cpu).reset_pending = true;
-+
-+	pr_info("nap: weight reset scheduled for CPUs %*pbl\n",
-+		cpumask_pr_args(mask));
-+	free_cpumask_var(mask);
-+	return count;
-+}
-+
-+static ssize_t reset_stats_store(struct kobject *kobj,
-+				 struct kobj_attribute *attr,
-+				 const char *buf, size_t count)
-+{
-+	int cpu;
-+
-+	for_each_online_cpu(cpu)
-+		memset(&per_cpu(nap_data, cpu).stats, 0,
-+		       sizeof(struct nap_stats));
-+
-+	return count;
-+}
-+
-+static ssize_t overshoot_pctl_show(struct kobject *kobj,
-+				    struct kobj_attribute *attr, char *buf)
-+{
-+	int cpu;
-+
-+	cpu = cpumask_first(cpu_online_mask);
-+	if (cpu >= nr_cpu_ids)
-+		return sysfs_emit(buf, "0\n");
-+	return sysfs_emit(buf, "%u\n",
-+			  per_cpu(nap_data, cpu).overshoot_pctl_millths);
-+}
-+
-+static ssize_t overshoot_pctl_store(struct kobject *kobj,
-+				     struct kobj_attribute *attr,
-+				     const char *buf, size_t count)
-+{
-+	unsigned int val;
-+	int cpu;
-+
-+	if (kstrtouint(buf, 10, &val) || val > 500)
-+		return -EINVAL;
-+
-+	for_each_online_cpu(cpu)
-+		per_cpu(nap_data, cpu).overshoot_pctl_millths = val;
-+
-+	return count;
-+}
-+
-+static ssize_t version_show(struct kobject *kobj,
-+			    struct kobj_attribute *attr, char *buf)
-+{
-+	return sysfs_emit(buf, "%s\n", CPUIDLE_NAP_VERSION);
-+}
-+
-+static ssize_t simd_show(struct kobject *kobj,
-+			 struct kobj_attribute *attr, char *buf)
-+{
-+	if (static_branch_unlikely(&nap_use_avx2))
-+		return sysfs_emit(buf, "avx2\n");
-+	else
-+		return sysfs_emit(buf, "sse2\n");
-+}
-+
-+static struct kobj_attribute version_attr           = __ATTR_RO(version);
-+static struct kobj_attribute simd_attr              = __ATTR_RO(simd);
-+static struct kobj_attribute stats_attr             = __ATTR_RO(stats);
-+static struct kobj_attribute learning_rate_attr     = __ATTR_RW(learning_rate);
-+static struct kobj_attribute learn_interval_attr    = __ATTR_RW(learn_interval);
-+static struct kobj_attribute learn_jiffies_min_attr = __ATTR_RW(learn_jiffies_min);
-+static struct kobj_attribute overshoot_pctl_attr    = __ATTR_RW(overshoot_pctl);
-+static struct kobj_attribute reset_weights_attr     = __ATTR_WO(reset_weights);
-+static struct kobj_attribute reset_stats_attr       = __ATTR_WO(reset_stats);
-+
-+static struct attribute *nap_attrs[] = {
-+	&version_attr.attr,
-+	&simd_attr.attr,
-+	&stats_attr.attr,
-+	&learning_rate_attr.attr,
-+	&learn_interval_attr.attr,
-+	&learn_jiffies_min_attr.attr,
-+	&overshoot_pctl_attr.attr,
-+	&reset_weights_attr.attr,
-+	&reset_stats_attr.attr,
-+	NULL,
-+};
-+
-+static const struct attribute_group nap_attr_group = {
-+	.attrs = nap_attrs,
-+};
-+
-+static struct kobject *cpuidle_kobj;
-+
-+int nap_sysfs_init(void)
-+{
-+	struct device *dev_root;
-+	int ret;
-+
-+	dev_root = bus_get_dev_root(&cpu_subsys);
-+	if (!dev_root)
-+		return -ENODEV;
-+
-+	cpuidle_kobj = kobject_create_and_add("nap", &dev_root->kobj);
-+	put_device(dev_root);
-+	if (!cpuidle_kobj)
-+		return -ENOMEM;
-+
-+	ret = sysfs_create_group(cpuidle_kobj, &nap_attr_group);
-+	if (ret) {
-+		kobject_put(cpuidle_kobj);
-+		cpuidle_kobj = NULL;
-+	}
-+	return ret;
-+}
-+
-+void nap_sysfs_exit(void)
-+{
-+	if (cpuidle_kobj) {
-+		sysfs_remove_group(cpuidle_kobj, &nap_attr_group);
-+		kobject_put(cpuidle_kobj);
-+		cpuidle_kobj = NULL;
-+	}
-+}
-+
-+/* ================================================================
-+ * Governor registration
-+ * ================================================================ */
-+
-+static struct cpuidle_governor nap_governor = {
-+	.name    = "nap",
-+	.rating  = 26,
-+	.enable  = nap_enable,
-+	.disable = nap_disable,
-+	.select  = nap_select,
-+	.reflect = nap_reflect,
-+};
-+
-+static int __init nap_init(void)
-+{
-+	int ret;
-+
-+	nap_detect_simd();
-+
-+	ret = nap_sysfs_init();
-+	if (ret)
-+		pr_warn("nap: sysfs init failed: %d (continuing without sysfs)\n", ret);
-+
-+	ret = cpuidle_register_governor(&nap_governor);
-+	if (ret) {
-+		pr_err("nap: register_governor failed: %d\n", ret);
-+		nap_sysfs_exit();
-+		return ret;
-+	}
-+
-+	pr_info("%s v%s by %s registered (rating=%u)\n",
-+	       CPUIDLE_NAP_PROGNAME, CPUIDLE_NAP_VERSION,
-+	       CPUIDLE_NAP_AUTHOR, nap_governor.rating);
-+	return 0;
-+}
-+postcore_initcall(nap_init);
-diff --git a/drivers/cpuidle/governors/nap/nap.h b/drivers/cpuidle/governors/nap/nap.h
-new file mode 100644
-index 0000000000..1059db983b
---- /dev/null
-+++ b/drivers/cpuidle/governors/nap/nap.h
-@@ -0,0 +1,283 @@
-+/* SPDX-License-Identifier: GPL-2.0 */
-+#ifndef NAP_H
-+#define NAP_H
-+
-+#include <linux/cpuidle.h>
-+#include <linux/jump_label.h>
-+#include <linux/ktime.h>
-+
-+/* ================================================================
-+ * Neural network dimensions
-+ * ================================================================ */
-+
-+#define NAP_INPUT_SIZE    8
-+#define NAP_HIDDEN_SIZE   8
-+#define NAP_NUM_EXPERTS   3
-+
-+/*
-+ * Neural network weight structure for an 8→8→1 MLP (scalar regression).
-+ *
-+ * The NN outputs a single log2 correction factor applied to sleep_length:
-+ *   effective_sleep = exp2(log2(sleep_length) + nn_output)
-+ * State selection is then deterministic: pick the deepest state whose
-+ * cost (target_residency + exit_latency) ≤ effective_sleep.
-+ *
-+ * Column-major storage: w_h1[j][i] = weight from input j to hidden neuron i.
-+ * This layout enables efficient column-wise matrix-vector products where
-+ * each input broadcasts across all hidden neurons via SIMD FMA.
-+ *
-+ * __aligned(32) ensures AVX2 vmovaps (32-byte aligned) loads work
-+ * correctly.  8 floats = 32 bytes = one ymm register.
-+ */
-+struct nap_weights {
-+	/* Hidden layer: input[8] → hidden[8] */
-+	float w_h1[NAP_INPUT_SIZE][NAP_HIDDEN_SIZE];  /* 64 params */
-+	float b_h1[NAP_HIDDEN_SIZE];                   /* 8 params  */
-+	/* Output layer: hidden[8] → 1 scalar */
-+	float w_out[NAP_HIDDEN_SIZE];                  /* 8 params  */
-+	float b_out;                                   /* 1 param   */
-+} __aligned(32);
-+
-+/* ISA-specific forward pass implementations */
-+void nap_nn_forward_sse2(const float *input, float *output,
-+			 float *hidden_save, const struct nap_weights *w);
-+void nap_nn_forward_avx2(const float *input, float *output,
-+			 float *hidden_save, const struct nap_weights *w);
-+/* ISA-specific online learning (backpropagation) */
-+struct nap_cpu_data;
-+void nap_nn_learn_sse2(struct nap_cpu_data *d);
-+void nap_nn_learn_avx2(struct nap_cpu_data *d);
-+
-+/* Static key for ISA dispatch (defined in nap.c) */
-+DECLARE_STATIC_KEY_FALSE(nap_use_avx2);
-+
-+/* ================================================================
-+ * SIMD type definitions and helpers (GCC vector extensions)
-+ *
-+ * Only available when compiled with FPU/SSE flags (nap_fpu.c,
-+ * nap_nn_*.c).  nap.c is compiled without FPU flags and must
-+ * not see these definitions.
-+ *
-+ * <immintrin.h> is a userspace header and cannot be used in kernel.
-+ * We use __attribute__((__vector_size__())) and __builtin_ia32_*.
-+ * ================================================================ */
-+
-+#ifdef __SSE2__
-+
-+typedef float v4sf  __attribute__((__vector_size__(16)));   /* xmm: 4×float  */
-+typedef int   v4si  __attribute__((__vector_size__(16)));   /* xmm: 4×int32  */
-+typedef float v8sf  __attribute__((__vector_size__(32)));   /* ymm: 8×float  */
-+
-+/* Broadcast helpers */
-+#define V4SF_SET1(x)  ((v4sf){ (x), (x), (x), (x) })
-+#define V4SI_SET1(x)  ((v4si){ (x), (x), (x), (x) })
-+#define V8SF_SET1(x)  ((v8sf){ (x),(x),(x),(x),(x),(x),(x),(x) })
-+#define V8SF_ZERO     V8SF_SET1(0.0f)
-+
-+/* Unaligned load/store helpers */
-+static inline v4sf v4sf_loadu(const float *p)
-+{
-+	v4sf result;
-+	__builtin_memcpy(&result, p, sizeof(result));
-+	return result;
-+}
-+
-+static inline void v4sf_storeu(float *p, v4sf v)
-+{
-+	__builtin_memcpy(p, &v, sizeof(v));
-+}
-+
-+#ifdef __AVX__
-+static inline v8sf v8sf_loadu(const float *p)
-+{
-+	v8sf result;
-+	__builtin_memcpy(&result, p, sizeof(result));
-+	return result;
-+}
-+
-+static inline void v8sf_storeu(float *p, v8sf v)
-+{
-+	__builtin_memcpy(p, &v, sizeof(v));
-+}
-+#endif /* __AVX__ */
-+
-+/* Scalar/vector clamp helpers */
-+static inline float fclampf(float v, float lo, float hi)
-+{
-+	if (v < lo) return lo;
-+	if (v > hi) return hi;
-+	return v;
-+}
-+
-+static inline v4sf v4sf_clamp(v4sf v, v4sf lo, v4sf hi)
-+{
-+	return __builtin_ia32_maxps(__builtin_ia32_minps(v, hi), lo);
-+}
-+
-+/* Type punning: float ↔ int reinterpret (no instruction generated) */
-+static inline v4si v4sf_as_v4si(v4sf v)
-+{
-+	union { v4sf f; v4si i; } u = { .f = v };
-+	return u.i;
-+}
-+
-+static inline v4sf v4si_as_v4sf(v4si v)
-+{
-+	union { v4si i; v4sf f; } u = { .i = v };
-+	return u.f;
-+}
-+
-+/*
-+ * fast_log2f_sse() — Compute log2 of 4 floats simultaneously using SSE2
-+ *
-+ * Cost: ~15 cycles for 4 values (~4 cycles per value)
-+ */
-+static inline v4sf fast_log2f_sse(v4sf x)
-+{
-+	const v4si mask_exp  = V4SI_SET1(0xFF);
-+	const v4si bias      = V4SI_SET1(127);
-+	const v4si mask_mant = V4SI_SET1(0x7FFFFF);
-+	const v4si exp_bias  = V4SI_SET1(127 << 23);
-+
-+	v4si xi    = v4sf_as_v4si(x);
-+	v4si exp_i = (xi >> 23) & mask_exp;
-+	exp_i      = exp_i - bias;
-+	v4sf e     = __builtin_convertvector(exp_i, v4sf);
-+
-+	v4si mant_i = (xi & mask_mant) | exp_bias;
-+	v4sf m      = v4si_as_v4sf(mant_i) - V4SF_SET1(1.0f);
-+
-+	v4sf p;
-+	p = m * V4SF_SET1(0.4808f);
-+	p = V4SF_SET1(0.7213f) - p;
-+	p = m * p;
-+	p = V4SF_SET1(1.4425f) - p;
-+	p = m * p;
-+
-+	return e + p;
-+}
-+
-+#endif /* __SSE2__ */
-+
-+/* ================================================================
-+ * Feature extraction
-+ * ================================================================ */
-+
-+#define NAP_HISTORY_SIZE     8
-+
-+/* ================================================================
-+ * POLL short-circuit tunables
-+ * ================================================================ */
-+
-+/* Minimum and safety-margin values for dev->poll_limit_ns written
-+ * by nap_compute_poll_limit().  Both are 1 µs: the POLL state
-+ * itself checks its timeout only every ~1 µs (POLL_IDLE_RELAX_COUNT
-+ * cpu_relax() iterations in drivers/cpuidle/poll_state.c), so
-+ * finer-grained values would not produce distinguishable behavior.
-+ */
-+#define NAP_POLL_LIMIT_MIN_NS      1000ULL
-+#define NAP_POLL_LIMIT_MARGIN_NS   1000ULL
-+
-+/* Refresh interval for the cached minimum-valid-state lookup.
-+ * HZ jiffies (= 1 second) bounds the staleness window caused by
-+ * sysfs-driven or runtime-driver state disable events.  PM QoS
-+ * latency changes are detected immediately via the cached
-+ * latency_req comparison.
-+ */
-+#define NAP_MIN_STATE_REFRESH_JIFFIES  HZ
-+
-+struct nap_stats {
-+	u64 total_selects;
-+	u64 total_residency_ns;
-+	u64 overshoot_count;
-+	u64 learn_count;
-+};
-+
-+struct nap_cpu_data {
-+	/* Ring buffer */
-+	u64   history[NAP_HISTORY_SIZE];
-+	float log_history[NAP_HISTORY_SIZE];
-+	int   hist_idx;
-+	int   hist_count;
-+
-+	/* External signal tracking */
-+	u64     prev_idle_exit;
-+	s64     last_predicted_ns;
-+	s64     last_prediction_error;
-+
-+	/* Short-circuit fast path (§3.1, §3.2, §3.4 of spec) */
-+	bool short_circuited;			/* set in select, read in reflect */
-+	int  cached_min_state;			/* cached shallowest valid state */
-+	s64  cached_min_state_latency;		/* latency_req when cache populated */
-+	unsigned long cached_min_state_jiffies;	/* jiffies when cache populated */
-+
-+	/* Jiffies-based learning rate floor (§3.5 of spec) */
-+	unsigned long last_learn_jiffies;
-+	unsigned int  learn_jiffies_min;	/* sysfs-tunable, 0 = disabled */
-+
-+	/* select/reflect handoff */
-+	int   last_selected_idx;
-+
-+	/* NN scalar output: log2 correction factor for sleep_length.
-+	 * effective_sleep = exp2(log2(sleep_length) + nn_output).
-+	 */
-+	float nn_output;
-+
-+	/*
-+	 * hidden_out[], features_f32[] are written with aligned SIMD
-+	 * stores in nap_nn_forward_{sse2,avx2}() and
-+	 * nap_extract_features():
-+	 *   SSE2:    movaps  (16-byte aligned)
-+	 *   AVX2:    vmovaps (32-byte aligned)
-+	 * Without __aligned(64), the natural struct offset would be
-+	 * only 4-byte aligned, causing #GP faults in the idle task.
-+	 */
-+	float hidden_out[NAP_HIDDEN_SIZE] __aligned(32);
-+	float features_f32[NAP_INPUT_SIZE] __aligned(32);
-+
-+	/* Backprop scratch */
-+	float learn_d_out;	/* output gradient direction (±1) */
-+	float learn_lr;		/* effective lr (base_lr * asymmetric weight) */
-+	float learn_d_hid[NAP_HIDDEN_SIZE] __aligned(32);
-+
-+	/* Precomputed per-state log2(target_residency) for threshold selection.
-+	 * log2_cost[i] = log2(target_residency_ns).
-+	 */
-+	float log2_cost[CPUIDLE_STATE_MAX];
-+
-+	/* Deferred learning data */
-+	bool  needs_learn;
-+	bool  output_clamped;	/* true if nn_output was clamped to features[0] */
-+	u64   learn_actual_ns;
-+
-+	/* Mixture-of-Experts: 3 experts × 8 neurons each */
-+	struct nap_weights expert_weights[NAP_NUM_EXPERTS];
-+	struct nap_weights *active_w;	/* selected expert for current/deferred pass */
-+	int   active_expert;		/* 0, 1, or 2: which expert is active */
-+	float expert_mid;		/* log2 threshold: short ↔ long */
-+	float expert_deep;		/* log2 threshold: long ↔ deep */
-+
-+	/* Online learning */
-+	unsigned int learning_rate_millths;
-+	unsigned int max_grad_norm_millths;
-+	unsigned int overshoot_pctl_millths; /* quantile target (250 = 25th pctl) */
-+	int   learn_interval;
-+	int   learn_counter;
-+	bool reset_pending;		/* set by sysfs, consumed by nap_select */
-+
-+	/* sysfs statistics */
-+	struct nap_stats stats;
-+};
-+
-+DECLARE_PER_CPU(struct nap_cpu_data, nap_data);
-+
-+/* FPU entry point (nap_fpu.c) — call only within kernel_fpu_begin/end */
-+int nap_fpu_select(struct cpuidle_driver *drv,
-+		   struct cpuidle_device *dev,
-+		   struct nap_cpu_data *d);
-+
-+/* sysfs interface */
-+int  nap_sysfs_init(void);
-+void nap_sysfs_exit(void);
-+
-+#endif /* NAP_H */
-diff --git a/drivers/cpuidle/governors/nap/nap_fpu.c b/drivers/cpuidle/governors/nap/nap_fpu.c
-new file mode 100644
-index 0000000000..482a06a5d0
---- /dev/null
-+++ b/drivers/cpuidle/governors/nap/nap_fpu.c
-@@ -0,0 +1,572 @@
-+// SPDX-License-Identifier: GPL-2.0
-+/*
-+ * nap_fpu.c — FPU/SIMD code for the NAP cpuidle governor
-+ *
-+ * This file is compiled with FPU/SSE flags enabled (CC_FLAGS_FPU).
-+ * ALL functions here MUST be called only from within
-+ * kernel_fpu_begin()/kernel_fpu_end() blocks.
-+ *
-+ * Keeping FPU code in a separate translation unit ensures the compiler
-+ * cannot emit SSE/x87 instructions in non-FPU code paths (nap.c),
-+ * which would silently corrupt userspace FPU register state.
-+ */
-+
-+#include <linux/cpuidle.h>
-+#include <linux/math64.h>
-+#include <linux/percpu.h>
-+#include <linux/pm_qos.h>
-+#include <linux/sched/clock.h>
-+#include <linux/string.h>
-+#include <linux/tick.h>
-+
-+#include "nap.h"
-+
-+/* Clang lacks __builtin_ia32_movhlps; emulate with __builtin_shufflevector */
-+#ifdef __clang__
-+#define __builtin_ia32_movhlps(a, b) \
-+	__builtin_shufflevector(b, a, 2, 3, 6, 7)
-+#endif
-+
-+/* ================================================================
-+ * Float math helpers
-+ * ================================================================ */
-+
-+static inline float float_min(float a, float b) { return a < b ? a : b; }
-+static inline float float_max(float a, float b) { return a > b ? a : b; }
-+
-+/*
-+ * Kernel-safe sqrtf using the SSE sqrtss instruction directly.
-+ * GCC may lower nap_sqrtf to a libm call, which is unavailable
-+ * in the kernel.  This file is always compiled with FPU/SSE enabled.
-+ */
-+static inline float nap_sqrtf(float x)
-+{
-+	asm("sqrtss %1, %0" : "=x"(x) : "x"(x));
-+	return x;
-+}
-+
-+/* Scalar log2 approximation (same algorithm as fast_log2f_sse) */
-+static inline float fast_log2f(float x)
-+{
-+	union { float f; u32 i; } u = { .f = x };
-+	int exp = (int)((u.i >> 23) & 0xFFu) - 127;
-+	float e = (float)exp;
-+	float m, p;
-+
-+	u.i = (u.i & 0x7FFFFFu) | (127u << 23);
-+	m = u.f - 1.0f;
-+
-+	p = m * 0.4808f;
-+	p = 0.7213f - p;
-+	p = m * p;
-+	p = 1.4425f - p;
-+	p = m * p;
-+
-+	return e + p;
-+}
-+
-+/* ================================================================
-+ * Deterministic PRNG for weight initialization (LCG)
-+ * ================================================================ */
-+
-+static inline float nap_prng_float(u32 *state)
-+{
-+	*state = *state * 1664525u + 1013904223u;
-+	return (float)(s32)*state * (1.0f / 2147483648.0f);
-+}
-+
-+/* ================================================================
-+ * ISA dispatch via static keys
-+ * ================================================================ */
-+
-+static inline void nap_nn_forward(const float *input, float *output,
-+				  float *hidden_save,
-+				  const struct nap_weights *w)
-+{
-+	if (static_branch_unlikely(&nap_use_avx2))
-+		nap_nn_forward_avx2(input, output, hidden_save, w);
-+	else
-+		nap_nn_forward_sse2(input, output, hidden_save, w);
-+}
-+
-+static inline void nap_nn_learn(struct nap_cpu_data *d)
-+{
-+	if (static_branch_unlikely(&nap_use_avx2))
-+		nap_nn_learn_avx2(d);
-+	else
-+		nap_nn_learn_sse2(d);
-+}
-+
-+/* ================================================================
-+ * Weight initialization
-+ *
-+ * The NN directly outputs predicted sleep time in log2(ns) space.
-+ * Hidden neuron 0 is initialized as a pass-through for feature[0]
-+ * (log2(sleep_length)), so the initial output ≈ log2(sleep_length).
-+ * This matches the pre-learning behavior of selecting the deepest
-+ * state that fits within sleep_length.
-+ *
-+ * Other hidden neurons are Xavier-initialized with near-zero output
-+ * weights so their initial contribution is negligible.  Biases = 0.
-+ * ================================================================ */
-+
-+#define NAP_PRNG_SEED 42u
-+
-+static void nap_init_weights(struct nap_weights *w)
-+{
-+	u32 rng = NAP_PRNG_SEED;
-+	float scale_h1, scale_out;
-+	int i, j;
-+
-+	/* Xavier uniform: U(-sqrt(6/(fan_in+fan_out)), +sqrt(6/(...))) */
-+	scale_h1  = nap_sqrtf(6.0f / (float)(NAP_INPUT_SIZE + NAP_HIDDEN_SIZE));
-+	scale_out = 0.01f;
-+
-+	/* Hidden layer weights */
-+	for (i = 0; i < NAP_INPUT_SIZE; i++)
-+		for (j = 0; j < NAP_HIDDEN_SIZE; j++)
-+			w->w_h1[i][j] = nap_prng_float(&rng) * scale_h1;
-+
-+	/* Hidden biases: zero (standard) */
-+	memset(w->b_h1, 0, sizeof(w->b_h1));
-+
-+	/* Output weights: near-zero for ~0 initial contribution */
-+	for (j = 0; j < NAP_HIDDEN_SIZE; j++)
-+		w->w_out[j] = nap_prng_float(&rng) * scale_out;
-+
-+	/* Output bias: zero */
-+	w->b_out = 0.0f;
-+
-+	/*
-+	 * Neuron 0: pass-through for feature[0] = log2(sleep_length).
-+	 * hidden[0] = ReLU(1.0 * input[0] + 0) = input[0]  (always > 0)
-+	 * output += 1.0 * hidden[0] = log2(sleep_length)
-+	 *
-+	 * Override the random init above so initial output ≈ input[0].
-+	 */
-+	for (i = 0; i < NAP_INPUT_SIZE; i++)
-+		w->w_h1[i][0] = 0.0f;
-+	w->w_h1[0][0] = 1.0f;
-+	w->b_h1[0] = 0.0f;
-+	w->w_out[0] = 1.0f;
-+}
-+
-+/*
-+ * Precompute log2(target_residency) per state for threshold-based selection.
-+ *
-+ * Used in the selection loop: pick deepest state where
-+ * log2_cost[i] <= nn_output (predicted sleep time in log2 space).
-+ *
-+ * Only target_residency_ns is used — exit_latency is a wakeup cost,
-+ * not a factor in whether the CPU can profitably stay in the state
-+ * for the predicted duration.
-+ */
-+static void nap_init_log2_cost(struct nap_cpu_data *d,
-+			       struct cpuidle_driver *drv)
-+{
-+	float log2_tick;
-+	int long_start, deep_idx;
-+	int i;
-+
-+	for (i = 0; i < drv->state_count; i++) {
-+		float res = float_max(
-+			(float)drv->states[i].target_residency_ns, 1.0f);
-+		d->log2_cost[i] = fast_log2f(res);
-+	}
-+
-+	/*
-+	 * MoE expert boundaries — 3-way split.
-+	 *
-+	 * Expert 0 (short): tick-bound idles where measured residency
-+	 *   is dominated by the next tick rather than the workload's
-+	 *   true idle duration.  Boundary: log2(TICK_NSEC).
-+	 *
-+	 * Expert 1 (long): nohz idles in intermediate C-states.
-+	 *
-+	 * Expert 2 (deep): idles targeting the deepest C-state.
-+	 *   The deepest state often has qualitatively different
-+	 *   residency characteristics (package C-state, longer
-+	 *   exit latency, power-gated domains) that warrant a
-+	 *   dedicated expert to avoid gradient interference with
-+	 *   intermediate states.
-+	 *
-+	 * Safety: with only 2 C-states (+ POLL), expert_deep is
-+	 * placed equal to expert_mid so the deep expert is never
-+	 * routed (same behavior as the old 2-expert split).
-+	 */
-+	if (drv->state_count <= 1) {
-+		d->expert_mid = 0.0f;
-+		d->expert_deep = 0.0f;
-+		return;
-+	}
-+
-+	log2_tick = fast_log2f((float)TICK_NSEC);
-+
-+	/* Default: deepest state belongs to long expert (safety) */
-+	long_start = drv->state_count - 1;
-+
-+	/* Prefer the first state whose target_residency exceeds one jiffy */
-+	for (i = 1; i < drv->state_count; i++) {
-+		if (d->log2_cost[i] > log2_tick) {
-+			long_start = i;
-+			break;
-+		}
-+	}
-+
-+	if (long_start > 1) {
-+		/* Normal case: boundary between last short and first long */
-+		d->expert_mid = (d->log2_cost[long_start - 1] +
-+				 d->log2_cost[long_start]) / 2.0f;
-+	} else {
-+		/*
-+		 * long_start == 1: even the shallowest C-state already
-+		 * exceeds one jiffy.  All NN-handled idles go to the
-+		 * long expert; place the boundary just below C1's
-+		 * residency so the short expert remains routable but
-+		 * unused.
-+		 */
-+		d->expert_mid = d->log2_cost[1] - 1.0f;
-+	}
-+
-+	/*
-+	 * Deep expert boundary — deepest C-state split.
-+	 *
-+	 * When there are >= 3 C-states (state_count >= 4, counting POLL),
-+	 * place the boundary at the midpoint between the second-deepest
-+	 * and deepest state's log2(target_residency).  The deep expert
-+	 * then exclusively handles sleep durations long enough to reach
-+	 * the deepest state.
-+	 *
-+	 * With only 2 C-states, expert_deep == expert_mid collapses to
-+	 * the 2-expert regime (expert 2 is never selected).
-+	 */
-+	deep_idx = drv->state_count - 1;
-+	if (deep_idx >= 3) {
-+		/* >= 3 C-states: split before the deepest */
-+		d->expert_deep = (d->log2_cost[deep_idx - 1] +
-+				  d->log2_cost[deep_idx]) / 2.0f;
-+		/* Ensure deep > mid ordering */
-+		if (d->expert_deep <= d->expert_mid)
-+			d->expert_deep = d->expert_mid;
-+	} else {
-+		/* <= 2 C-states: collapse deep into long */
-+		d->expert_deep = d->expert_mid;
-+	}
-+}
-+
-+/* ================================================================
-+ * Feature extraction helpers
-+ * ================================================================ */
-+
-+struct logring_stats {
-+	float avg;
-+	float min;
-+	float max;
-+};
-+
-+/*
-+ * Compute log_history statistics: avg, min, max.
-+ * SIMD fast path when the ring buffer is full (8 elements = 2 × xmm).
-+ */
-+static void logring_compute(const struct nap_cpu_data *d,
-+			    struct logring_stats *s)
-+{
-+	int i, n = d->hist_count;
-+	float sum;
-+
-+	if (n == 0) {
-+		*s = (struct logring_stats){ 0 };
-+		return;
-+	}
-+
-+	if (n == NAP_HISTORY_SIZE) {
-+		v4sf v0 = *(const v4sf *)&d->log_history[0];
-+		v4sf v1 = *(const v4sf *)&d->log_history[4];
-+		v4sf pmin, pmax, psum, t;
-+
-+		pmin = __builtin_ia32_minps(v0, v1);
-+		pmax = __builtin_ia32_maxps(v0, v1);
-+		psum = v0 + v1;
-+
-+		/* 4 → 2 */
-+		t = __builtin_ia32_movhlps(pmin, pmin);
-+		pmin = __builtin_ia32_minps(pmin, t);
-+		t = __builtin_ia32_movhlps(pmax, pmax);
-+		pmax = __builtin_ia32_maxps(pmax, t);
-+		t = __builtin_ia32_movhlps(psum, psum);
-+		psum = psum + t;
-+
-+		/* 2 → 1 */
-+		t = __builtin_ia32_shufps(pmin, pmin, 0x55);
-+		pmin = __builtin_ia32_minps(pmin, t);
-+		t = __builtin_ia32_shufps(pmax, pmax, 0x55);
-+		pmax = __builtin_ia32_maxps(pmax, t);
-+		t = __builtin_ia32_shufps(psum, psum, 0x55);
-+		psum = psum + t;
-+
-+		sum = psum[0];
-+		s->min = pmin[0];
-+		s->max = pmax[0];
-+	} else {
-+		float val;
-+
-+		sum = d->log_history[0];
-+		s->min = sum;
-+		s->max = sum;
-+
-+		for (i = 1; i < n; i++) {
-+			val = d->log_history[i];
-+			sum += val;
-+			s->min = float_min(s->min, val);
-+			s->max = float_max(s->max, val);
-+		}
-+	}
-+
-+	s->avg = sum / (float)n;
-+}
-+
-+/*
-+ * Extract 8 input features for the MLP.
-+ *
-+ *   [0] log2(sleep_length)           — next timer event
-+ *   [1] log2(last_residency)         — actual duration of last idle
-+ *   [2] log_hist avg                 — average recent idle duration
-+ *   [3] log_hist min                 — shortest recent idle
-+ *   [4] log_hist max                 — longest recent idle
-+ *   [5] signed log2(|pred_error|+1)  — prediction feedback
-+ *   [6] log2(busy_ns)               — pre-idle busy duration
-+ *   [7] log2(lat_req) - log2(deepest_lat) — PM QoS headroom
-+ */
-+static void nap_extract_features(struct cpuidle_driver *drv,
-+				 struct cpuidle_device *dev,
-+				 float out[NAP_INPUT_SIZE],
-+				 s64 latency_req)
-+{
-+	struct nap_cpu_data *d = this_cpu_ptr(&nap_data);
-+	struct logring_stats lr;
-+	ktime_t sleep_length, delta_tick;
-+	u64 busy_ns;
-+	float log_inputs[4] __aligned(16);
-+	float log_results[4] __aligned(16);
-+
-+	sleep_length = tick_nohz_get_sleep_length(&delta_tick);
-+	busy_ns = local_clock() - d->prev_idle_exit;
-+
-+	/*
-+	 * SSE log2 batch: 4 values in one fast_log2f_sse call.
-+	 *   [0] sleep_length   → out[0]
-+	 *   [1] last_residency → out[1], also stored to log_history
-+	 *   [2] busy_ns        → out[6]
-+	 *   [3] |pred_error_us| + 1 → out[5] (sign restored after)
-+	 */
-+	{
-+		float err_f = (float)(d->last_prediction_error / 1000);
-+		float abs_err = (err_f >= 0.0f) ? err_f : -err_f;
-+
-+		log_inputs[0] = float_max((float)ktime_to_ns(sleep_length), 1.0f);
-+		log_inputs[1] = float_max((float)dev->last_residency_ns, 1.0f);
-+		log_inputs[2] = float_max((float)busy_ns, 1.0f);
-+		log_inputs[3] = abs_err + 1.0f;
-+
-+		{
-+			v4sf log_in  = *(const v4sf *)log_inputs;
-+			v4sf log_out = fast_log2f_sse(log_in);
-+			*(v4sf *)log_results = log_out;
-+		}
-+
-+		out[0] = log_results[0];
-+		out[1] = log_results[1];
-+		out[6] = log_results[2];
-+
-+		/* out[5]: sign-preserving log2(|err_us| + 1) */
-+		{
-+			union { float f; u32 i; } res = { .f = log_results[3] };
-+			union { float f; u32 i; } sgn = { .f = err_f };
-+
-+			res.i |= sgn.i & 0x80000000u;
-+			out[5] = res.f;
-+		}
-+	}
-+
-+	/* Update log_history ring buffer */
-+	{
-+		int prev = (d->hist_idx - 1 + NAP_HISTORY_SIZE) % NAP_HISTORY_SIZE;
-+		d->log_history[prev] = log_results[1];
-+	}
-+
-+	/* Compute log_history statistics: avg, min, max */
-+	logring_compute(d, &lr);
-+	out[2] = lr.avg;
-+	out[3] = lr.min;
-+	out[4] = lr.max;
-+
-+	/* out[7]: log2(latency_req) - log2(deepest_lat), 0 if unconstrained */
-+	{
-+		u64 deepest_lat = drv->states[drv->state_count - 1]
-+				      .exit_latency_ns;
-+		bool lat_valid = (latency_req < PM_QOS_LATENCY_ANY_NS &&
-+				  deepest_lat > 0);
-+
-+		if (lat_valid)
-+			out[7] = fast_log2f(float_max((float)latency_req, 1.0f))
-+			       - fast_log2f(float_max((float)deepest_lat, 1.0f));
-+		else
-+			out[7] = 0.0f;
-+	}
-+
-+	d->last_predicted_ns = ktime_to_ns(sleep_length);
-+}
-+
-+/* ================================================================
-+ * FPU entry point for nap_select
-+ *
-+ * Called within kernel_fpu_begin()/kernel_fpu_end().
-+ * Returns: selected idle state index (>= 0), or -1 to fall back
-+ *          to the integer heuristic.
-+ * ================================================================ */
-+
-+int nap_fpu_select(struct cpuidle_driver *drv,
-+		   struct cpuidle_device *dev,
-+		   struct nap_cpu_data *d)
-+{
-+	s64 latency_req = cpuidle_governor_latency_req(dev->cpu);
-+
-+	/* Handle deferred weight reset (set by sysfs or nap_enable) */
-+	if (unlikely(d->reset_pending)) {
-+		int e;
-+
-+		for (e = 0; e < NAP_NUM_EXPERTS; e++)
-+			nap_init_weights(&d->expert_weights[e]);
-+		nap_init_log2_cost(d, drv);
-+		d->stats.learn_count = 0;
-+		d->needs_learn = false;
-+		d->reset_pending = false;
-+	}
-+
-+	/* Deferred learning (always, even during warmup) */
-+	if (d->needs_learn) {
-+		float log2_eff = d->nn_output;
-+		float alpha = (float)d->overshoot_pctl_millths
-+			      / 1000.0f;
-+		int nn_selected = 0;
-+		bool is_overshoot;
-+		int i;
-+
-+		/* Simulate which state the NN selected */
-+		for (i = drv->state_count - 1; i > 0; i--) {
-+			if (d->log2_cost[i] <= log2_eff) {
-+				nn_selected = i;
-+				break;
-+			}
-+		}
-+
-+		/*
-+		 * Direct overshoot loss.
-+		 *
-+		 * Base the gradient on whether the simulated state
-+		 * selection actually caused overshoot
-+		 * (actual < target_residency).
-+		 *
-+		 * The asymmetric weight is encoded in the learning
-+		 * rate (not in d_out) so that gradient clamping
-+		 * cannot destroy the asymmetry.  d_out is ±1 and
-+		 * gets clipped symmetrically; the (1-α) vs α ratio
-+		 * is preserved through learn_lr.
-+		 *
-+		 * At equilibrium, P(overshoot) converges to α.
-+		 * α = overshoot_pctl / 1000.
-+		 */
-+		{
-+			float base_lr = (float)d->learning_rate_millths
-+					/ 1000.0f;
-+
-+			is_overshoot = (nn_selected > 0 &&
-+				d->learn_actual_ns <
-+				drv->states[nn_selected].target_residency_ns);
-+
-+			/*
-+			 * When the output was clamped at the upper
-+			 * limit (nn_output == features[0]), the NN
-+			 * is already predicting the maximum possible
-+			 * sleep time.  Non-overshoot events would
-+			 * push weights UP, but the output cannot
-+			 * actually increase.  Suppress this gradient
-+			 * to prevent unbounded weight growth in idle
-+			 * systems where natural overshoot rate < α.
-+			 *
-+			 * Overshoot events still learn normally
-+			 * (push DOWN) even when clamped.
-+			 */
-+			if (d->output_clamped && !is_overshoot) {
-+				d->learn_lr = 0;
-+				d->learn_d_out = 0;
-+			} else {
-+				d->learn_d_out = is_overshoot
-+					? 1.0f : -1.0f;
-+				d->learn_lr = is_overshoot
-+					? base_lr * (1.0f - alpha)
-+					: base_lr * alpha;
-+			}
-+		}
-+
-+		d->stats.learn_count++;
-+
-+		nap_nn_learn(d);
-+		d->needs_learn = false;
-+	}
-+
-+	/*
-+	 * Feature extraction + NN forward pass.
-+	 * features_f32 is __aligned(64) in nap_cpu_data, satisfying
-+	 * AVX-512 vmovaps requirements.
-+	 */
-+	nap_extract_features(drv, dev, d->features_f32, latency_req);
-+
-+	/* MoE: 3-way expert selection based on log2(sleep_length) */
-+	if (d->features_f32[0] >= d->expert_deep)
-+		d->active_expert = 2;		/* deep: deepest C-state */
-+	else if (d->features_f32[0] >= d->expert_mid)
-+		d->active_expert = 1;		/* long: nohz intermediate */
-+	else
-+		d->active_expert = 0;		/* short: tick-bound */
-+	d->active_w = &d->expert_weights[d->active_expert];
-+
-+	nap_nn_forward(d->features_f32, &d->nn_output, d->hidden_out,
-+		       d->active_w);
-+
-+	/*
-+	 * Clamp NN output: predicted sleep cannot exceed sleep_length
-+	 * (next timer event).  features_f32[0] = log2(sleep_length).
-+	 *
-+	 * Track whether the clamp was applied so the learning block
-+	 * can suppress "push up" gradients when the output is already
-+	 * at the maximum.  Without this, weights diverge unboundedly
-+	 * in idle systems where the natural overshoot rate < alpha.
-+	 */
-+	d->output_clamped = (d->nn_output > d->features_f32[0]);
-+	if (d->output_clamped)
-+		d->nn_output = d->features_f32[0];
-+
-+	/*
-+	 * Threshold-based selection using NN predicted sleep time.
-+	 *
-+	 * The NN directly outputs log2(predicted_sleep) in ns.
-+	 * Select the deepest feasible state whose cost ≤ predicted_sleep.
-+	 */
-+	{
-+		float log2_eff = d->nn_output;
-+		int idx = 0, i;
-+
-+		for (i = drv->state_count - 1; i > 0; i--) {
-+			if (dev->states_usage[i].disable)
-+				continue;
-+			if (drv->states[i].exit_latency_ns > latency_req)
-+				continue;
-+			if (d->log2_cost[i] <= log2_eff) {
-+				idx = i;
-+				break;
-+			}
-+		}
-+		return idx;
-+	}
-+}
-diff --git a/drivers/cpuidle/governors/nap/nap_nn_avx2.c b/drivers/cpuidle/governors/nap/nap_nn_avx2.c
-new file mode 100644
-index 0000000000..96e5415423
---- /dev/null
-+++ b/drivers/cpuidle/governors/nap/nap_nn_avx2.c
-@@ -0,0 +1,135 @@
-+// SPDX-License-Identifier: GPL-2.0
-+/*
-+ * nap_nn_avx2.c — AVX2+FMA forward pass and backpropagation for the nap MLP
-+ *
-+ * 8→8→1 scalar regression (log2 correction factor).
-+ * Uses 256-bit ymm registers: 8 hidden neurons = 1 ymm.
-+ * FMA via vfmadd231ps for fused multiply-add.
-+ *
-+ * Must be called within kernel_fpu_begin/end.
-+ * Compiled with: CFLAGS += -mavx2 -mfma
-+ */
-+
-+#include "nap.h"
-+
-+/* Aligned load/store: GCC translates v8sf* dereference to vmovaps */
-+static inline v8sf v8sf_load(const float *p)   { return *(const v8sf *)p; }
-+static inline void v8sf_store(float *p, v8sf v) { *(v8sf *)p = v; }
-+
-+/* FMA: a*b+c — vfmadd231ps: dest = src1 * src2 + dest */
-+static inline v8sf v8sf_fmadd(v8sf a, v8sf b, v8sf c)
-+{
-+	asm("vfmadd231ps %2, %1, %0" : "+x"(c) : "x"(a), "xm"(b));
-+	return c;
-+}
-+
-+/* ymm clamp: max(min(v, hi), lo) */
-+static inline v8sf v8sf_clamp(v8sf v, v8sf lo, v8sf hi)
-+{
-+	return __builtin_ia32_maxps256(__builtin_ia32_minps256(v, hi), lo);
-+}
-+
-+void nap_nn_forward_avx2(const float *input,
-+			 float *output,
-+			 float *hidden_save,
-+			 const struct nap_weights *w)
-+{
-+	int j;
-+
-+	/* === Hidden layer: 8 outputs = 1×ymm, 2-way accumulator === */
-+	v8sf acc0 = v8sf_load(&w->b_h1[0]);
-+	v8sf acc1 = V8SF_ZERO;
-+
-+	for (j = 0; j < NAP_INPUT_SIZE; j += 2) {
-+		v8sf x0 = V8SF_SET1(input[j]);
-+		v8sf x1 = V8SF_SET1(input[j + 1]);
-+
-+		acc0 = v8sf_fmadd(v8sf_load(&w->w_h1[j][0]),     x0, acc0);
-+		acc1 = v8sf_fmadd(v8sf_load(&w->w_h1[j + 1][0]), x1, acc1);
-+	}
-+
-+	/* Merge accumulators + ReLU */
-+	{
-+		v8sf h = __builtin_ia32_maxps256(acc0 + acc1, V8SF_ZERO);
-+
-+		v8sf_store(hidden_save, h);
-+
-+		/* === Output layer: dot(hidden[8], w_out[8]) + b_out === */
-+		{
-+			v8sf p = v8sf_load(&w->w_out[0]) * h;
-+
-+			/* Horizontal reduce: 8 → 4 → scalar */
-+			v4sf lo = __builtin_ia32_vextractf128_ps256(p, 0);
-+			v4sf hi = __builtin_ia32_vextractf128_ps256(p, 1);
-+			v4sf s4 = lo + hi;
-+
-+			*output = s4[0] + s4[1] + s4[2] + s4[3] + w->b_out;
-+		}
-+	}
-+}
-+
-+/*
-+ * Online learning (backpropagation) — AVX2+FMA
-+ *
-+ * Output: scalar d_out (pre-computed by caller)
-+ * Hidden layer: 8 neurons = 1×ymm
-+ */
-+void nap_nn_learn_avx2(struct nap_cpu_data *d)
-+{
-+	int i;
-+	float d_out_scalar = d->learn_d_out;
-+	float *d_hid = d->learn_d_hid;
-+	float lr = d->learn_lr;
-+	float clamp_val = (float)d->max_grad_norm_millths / 1000.0f;
-+	v8sf v_neg_lr = V8SF_SET1(-lr);
-+	v8sf v_cl_hi  = V8SF_SET1(clamp_val);
-+	v8sf v_cl_lo  = V8SF_SET1(-clamp_val);
-+
-+	/*
-+	 * Hidden gradient: d_hid[j] = relu'(h[j]) * w_out[j] * d_out.
-+	 * vcmpps + vandps: branchless SIMD mask (1×ymm = 8 neurons).
-+	 */
-+	v8sf dh;
-+	{
-+		v8sf vd = V8SF_SET1(d_out_scalar);
-+		v8sf g = v8sf_load(&d->active_w->w_out[0]) * vd;
-+		v8sf mask = __builtin_ia32_cmpps256(
-+				v8sf_load(&d->hidden_out[0]), V8SF_ZERO, 14);
-+
-+		asm("vandps %2, %1, %0" : "=x"(dh) : "x"(g), "xm"(mask));
-+		v8sf_store(d_hid, dh);
-+	}
-+
-+	/* Output weight update: w_out[j] -= lr * clamp(h[j] * d_out) */
-+	{
-+		v8sf vd = V8SF_SET1(d_out_scalar);
-+		v8sf *w = (v8sf *)&d->active_w->w_out[0];
-+
-+		*w = v8sf_fmadd(v_neg_lr,
-+				v8sf_clamp(v8sf_load(&d->hidden_out[0]) * vd,
-+					   v_cl_lo, v_cl_hi),
-+				*w);
-+	}
-+
-+	/* Output bias update (scalar) */
-+	d->active_w->b_out -= lr * fclampf(d_out_scalar, -clamp_val, clamp_val);
-+
-+	/* Hidden weight update: w_h1[i][j] -= lr * clamp(feat[i] * d_hid[j]) */
-+	for (i = 0; i < NAP_INPUT_SIZE; i++) {
-+		v8sf vf = V8SF_SET1(d->features_f32[i]);
-+		v8sf *w = (v8sf *)&d->active_w->w_h1[i][0];
-+
-+		*w = v8sf_fmadd(v_neg_lr,
-+				v8sf_clamp(vf * dh, v_cl_lo, v_cl_hi),
-+				*w);
-+	}
-+
-+	/* Hidden bias update */
-+	{
-+		v8sf *b = (v8sf *)&d->active_w->b_h1[0];
-+
-+		*b = v8sf_fmadd(v_neg_lr,
-+				v8sf_clamp(dh, v_cl_lo, v_cl_hi),
-+				*b);
-+	}
-+}
-diff --git a/drivers/cpuidle/governors/nap/nap_nn_sse2.c b/drivers/cpuidle/governors/nap/nap_nn_sse2.c
-new file mode 100644
-index 0000000000..a9fffb3b98
---- /dev/null
-+++ b/drivers/cpuidle/governors/nap/nap_nn_sse2.c
-@@ -0,0 +1,136 @@
-+// SPDX-License-Identifier: GPL-2.0
-+/*
-+ * nap_nn_sse2.c — SSE2 forward pass and backpropagation for the nap MLP
-+ *
-+ * 8→8→1 scalar regression (log2 correction factor).
-+ * Baseline implementation using SSE2, which is always available on x86_64.
-+ * No FMA — uses separate mul + add (2 instructions per MAC).
-+ *
-+ * Must be called within kernel_fpu_begin/end.
-+ * Compiled with: CFLAGS += -msse2
-+ */
-+
-+#include "nap.h"
-+
-+/* Aligned load/store */
-+static inline v4sf v4sf_load(const float *p)   { return *(const v4sf *)p; }
-+static inline void v4sf_store(float *p, v4sf v) { *(v4sf *)p = v; }
-+
-+/* ReLU helper */
-+static inline v4sf v4sf_max(v4sf a, v4sf b)
-+{
-+	return __builtin_ia32_maxps(a, b);
-+}
-+
-+void nap_nn_forward_sse2(const float *input,
-+			 float *output,
-+			 float *hidden_save,
-+			 const struct nap_weights *w)
-+{
-+	int j;
-+
-+	/* === Hidden layer: 8 outputs = 2×xmm === */
-+	v4sf acc0 = v4sf_load(&w->b_h1[0]);
-+	v4sf acc1 = v4sf_load(&w->b_h1[4]);
-+
-+	for (j = 0; j < NAP_INPUT_SIZE; j++) {
-+		v4sf x = V4SF_SET1(input[j]);
-+		acc0 += v4sf_load(&w->w_h1[j][0]) * x;
-+		acc1 += v4sf_load(&w->w_h1[j][4]) * x;
-+	}
-+
-+	/* ReLU */
-+	{
-+		v4sf zero = V4SF_SET1(0.0f);
-+
-+		acc0 = v4sf_max(acc0, zero);
-+		acc1 = v4sf_max(acc1, zero);
-+	}
-+	v4sf_store(&hidden_save[0], acc0);
-+	v4sf_store(&hidden_save[4], acc1);
-+
-+	/* === Output layer: dot(hidden[8], w_out[8]) + b_out → 1 scalar === */
-+	{
-+		v4sf p0 = v4sf_load(&w->w_out[0]) * acc0;
-+		v4sf p1 = v4sf_load(&w->w_out[4]) * acc1;
-+		v4sf sum = p0 + p1;
-+
-+		*output = sum[0] + sum[1] + sum[2] + sum[3] + w->b_out;
-+	}
-+}
-+
-+/*
-+ * Online learning (backpropagation) — SSE2
-+ *
-+ * Output: scalar d_out (pre-computed by caller)
-+ * Hidden layer: 8 neurons = 2×xmm
-+ */
-+void nap_nn_learn_sse2(struct nap_cpu_data *d)
-+{
-+	int i;
-+	float d_out_scalar = d->learn_d_out;
-+	float *d_hid = d->learn_d_hid;
-+	float lr = d->learn_lr;
-+	float clamp_val = (float)d->max_grad_norm_millths / 1000.0f;
-+	v4sf v_lr    = V4SF_SET1(lr);
-+	v4sf v_cl_hi = V4SF_SET1(clamp_val);
-+	v4sf v_cl_lo = V4SF_SET1(-clamp_val);
-+
-+	/*
-+	 * Hidden gradient: d_hid[j] = relu'(h[j]) * w_out[j] * d_out.
-+	 * Must be computed before output weight update to use pre-update
-+	 * w_out.
-+	 */
-+	{
-+		v4sf vd = V4SF_SET1(d_out_scalar);
-+		v4sf zero = V4SF_SET1(0.0f);
-+		v4sf h, g;
-+		v4si m;
-+
-+		h = v4sf_load(&d->hidden_out[0]);
-+		g = v4sf_load(&d->active_w->w_out[0]) * vd;
-+		m = (v4si)(h > zero);
-+		v4sf_store(&d_hid[0], v4si_as_v4sf(v4sf_as_v4si(g) & m));
-+
-+		h = v4sf_load(&d->hidden_out[4]);
-+		g = v4sf_load(&d->active_w->w_out[4]) * vd;
-+		m = (v4si)(h > zero);
-+		v4sf_store(&d_hid[4], v4si_as_v4sf(v4sf_as_v4si(g) & m));
-+	}
-+
-+	/* Output weight update: w_out[j] -= lr * clamp(h[j] * d_out) */
-+	{
-+		v4sf vd = V4SF_SET1(d_out_scalar);
-+		v4sf *w = (v4sf *)&d->active_w->w_out[0];
-+
-+		w[0] -= v_lr * v4sf_clamp(v4sf_load(&d->hidden_out[0]) * vd,
-+					  v_cl_lo, v_cl_hi);
-+		w[1] -= v_lr * v4sf_clamp(v4sf_load(&d->hidden_out[4]) * vd,
-+					  v_cl_lo, v_cl_hi);
-+	}
-+
-+	/* Output bias update: b_out -= lr * clamp(d_out) */
-+	d->active_w->b_out -= lr * fclampf(d_out_scalar, -clamp_val, clamp_val);
-+
-+	/* Hidden weight update: w_h1[i][j] -= lr * clamp(feat[i] * d_hid[j]) */
-+	{
-+		v4sf dh0 = *(const v4sf *)&d_hid[0];
-+		v4sf dh1 = *(const v4sf *)&d_hid[4];
-+
-+		for (i = 0; i < NAP_INPUT_SIZE; i++) {
-+			v4sf vf = V4SF_SET1(d->features_f32[i]);
-+			v4sf *w = (v4sf *)&d->active_w->w_h1[i][0];
-+
-+			w[0] -= v_lr * v4sf_clamp(vf * dh0, v_cl_lo, v_cl_hi);
-+			w[1] -= v_lr * v4sf_clamp(vf * dh1, v_cl_lo, v_cl_hi);
-+		}
-+
-+		/* Hidden bias update: b_h1[j] -= lr * clamp(d_hid[j]) */
-+		{
-+			v4sf *b = (v4sf *)&d->active_w->b_h1[0];
-+
-+			b[0] -= v_lr * v4sf_clamp(dh0, v_cl_lo, v_cl_hi);
-+			b[1] -= v_lr * v4sf_clamp(dh1, v_cl_lo, v_cl_hi);
-+		}
-+	}
-+}
--- 
-2.34.1

From 67a3de7a27f04bae83c6d8c851061f9976e611bb Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jorge=20Luis=20Silv=C3=A9rio=20de=20Lima?=
 <jorgezarpon@msn.com>
Date: Sun, 7 Jun 2026 18:39:21 -0300
Subject: [PATCH 06/10] Add files via upload

---
 6.16-nap-v0.5.0.patch | 1847 +++++++++++++++++++++++++++++++++++++++++
 1 file changed, 1847 insertions(+)
 create mode 100644 6.16-nap-v0.5.0.patch

diff --git a/6.16-nap-v0.5.0.patch b/6.16-nap-v0.5.0.patch
new file mode 100644
index 0000000..3db1e3b
--- /dev/null
+++ b/6.16-nap-v0.5.0.patch
@@ -0,0 +1,1847 @@
+From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
+From: Masahito S <firelzrd@gmail.com>
+Date: Fri, 5 Jun 2026 13:10:05 +0900
+Subject: [PATCH] 6.16 backport: 6.18.3-nap-v0.5.0
+
+Backport of NAP cpuidle governor v0.5.0 to Linux 6.16.
+
+Functional changes from v0.4.0 are preserved; 6.16 compatibility keeps
+the RESIDENCY_THRESHOLD_NS fallback definition used by the previous
+backport.
+
+---
+ drivers/cpuidle/Kconfig                     |  17 +
+ drivers/cpuidle/governors/Makefile          |   1 +
+ drivers/cpuidle/governors/nap/Makefile      |  30 +
+ drivers/cpuidle/governors/nap/nap.c         | 623 ++++++++++++++++++++
+ drivers/cpuidle/governors/nap/nap.h         | 291 ++++++++++
+ drivers/cpuidle/governors/nap/nap_fpu.c     | 528 +++++++++++++++++
+ drivers/cpuidle/governors/nap/nap_nn_avx2.c | 135 +++++
+ drivers/cpuidle/governors/nap/nap_nn_sse2.c | 136 +++++
+ 8 files changed, 1761 insertions(+)
+ create mode 100644 drivers/cpuidle/governors/nap/Makefile
+ create mode 100644 drivers/cpuidle/governors/nap/nap.c
+ create mode 100644 drivers/cpuidle/governors/nap/nap.h
+ create mode 100644 drivers/cpuidle/governors/nap/nap_fpu.c
+ create mode 100644 drivers/cpuidle/governors/nap/nap_nn_avx2.c
+ create mode 100644 drivers/cpuidle/governors/nap/nap_nn_sse2.c
+
+diff --git a/drivers/cpuidle/Kconfig b/drivers/cpuidle/Kconfig
+index cac5997dca..9b6c50f0d8 100644
+--- a/drivers/cpuidle/Kconfig
++++ b/drivers/cpuidle/Kconfig
+@@ -44,6 +44,23 @@ config CPU_IDLE_GOV_HALTPOLL
+ 
+ 	  Some virtualized workloads benefit from using it.
+ 
++config CPU_IDLE_GOV_NAP
++	bool "Neural Adaptive Predictor (NAP) governor"
++	depends on X86_64
++	default y
++	help
++	  A machine-learning-based cpuidle governor that uses a small
++	  neural network (8-input, 8-hidden MLP with an ordinal survival
++	  head) to predict the optimal idle state.  Weights are
++	  Xavier-initialized and then refined via online learning
++	  (deferred backpropagation with SGD).  Requires SSE2 at minimum;
++	  AVX2+FMA is used when available.
++
++	  This is experimental. Select via cpuidle.governor=nap on
++	  the kernel command line.
++
++	  If unsure, say Y.
++
+ config DT_IDLE_STATES
+ 	bool
+ 
+diff --git a/drivers/cpuidle/governors/Makefile b/drivers/cpuidle/governors/Makefile
+index 63abb5393a..ae688891c0 100644
+--- a/drivers/cpuidle/governors/Makefile
++++ b/drivers/cpuidle/governors/Makefile
+@@ -7,3 +7,4 @@ obj-$(CONFIG_CPU_IDLE_GOV_LADDER) += ladder.o
+ obj-$(CONFIG_CPU_IDLE_GOV_MENU) += menu.o
+ obj-$(CONFIG_CPU_IDLE_GOV_TEO) += teo.o
+ obj-$(CONFIG_CPU_IDLE_GOV_HALTPOLL) += haltpoll.o
++obj-$(CONFIG_CPU_IDLE_GOV_NAP) += nap/
+diff --git a/drivers/cpuidle/governors/nap/Makefile b/drivers/cpuidle/governors/nap/Makefile
+new file mode 100644
+index 0000000000..8b85a475a6
+--- /dev/null
++++ b/drivers/cpuidle/governors/nap/Makefile
+@@ -0,0 +1,30 @@
++# SPDX-License-Identifier: GPL-2.0-only
++#
++# Makefile for the NAP cpuidle governor
++#
++
++obj-$(CONFIG_CPU_IDLE_GOV_NAP) += cpuidle_gov_nap.o
++
++cpuidle_gov_nap-y := nap.o nap_fpu.o nap_nn_sse2.o nap_nn_avx2.o
++
++# Kernel builds with -mno-sse -mno-sse2 -mno-avx -msoft-float -mno-80387
++# -mno-fp-ret-in-387.  FPU/SIMD-using files need these removed and ISA
++# flags explicitly added.
++#
++# CRITICAL: nap.o is intentionally compiled with NORMAL kernel flags
++# (no FPU/SSE).  All floating-point code lives in nap_fpu.o and the
++# nap_nn_*.o files.  This ensures the compiler cannot emit SSE instructions
++# in governor callbacks (nap_select, nap_reflect, etc.), which would
++# silently corrupt userspace FPU register state.
++#
++# Do NOT add CFLAGS_REMOVE/CFLAGS for nap.o — it must stay FPU-free.
++FPU_KILL_FLAGS := -mno-sse -mno-sse2 -mno-mmx -mno-avx -mno-3dnow \
++                  -mno-sse4a -msoft-float -mno-80387 -mno-fp-ret-in-387
++
++CFLAGS_REMOVE_nap_fpu.o        += $(FPU_KILL_FLAGS)
++CFLAGS_REMOVE_nap_nn_sse2.o    += $(FPU_KILL_FLAGS)
++CFLAGS_REMOVE_nap_nn_avx2.o    += $(FPU_KILL_FLAGS)
++
++CFLAGS_nap_fpu.o       += $(CC_FLAGS_FPU)
++CFLAGS_nap_nn_sse2.o   += $(CC_FLAGS_FPU)
++CFLAGS_nap_nn_avx2.o   += $(CC_FLAGS_FPU) -mavx -mavx2 -mfma
+diff --git a/drivers/cpuidle/governors/nap/nap.c b/drivers/cpuidle/governors/nap/nap.c
+new file mode 100644
+index 0000000000..fc7393e9f4
+--- /dev/null
++++ b/drivers/cpuidle/governors/nap/nap.c
+@@ -0,0 +1,623 @@
++// SPDX-License-Identifier: GPL-2.0
++/*
++ * nap.c — Neural Adaptive Predictor cpuidle governor
++ *
++ * A machine-learning-based cpuidle governor that uses a small MLP trunk and an
++ * ordinal survival head to predict, per idle-state boundary, the probability
++ * that the upcoming idle reaches that state's target_residency.  The decision
++ * layer picks the deepest feasible state whose calibrated survival meets a
++ * confidence level.  Weights are Xavier-initialized at boot, then refined via
++ * online learning (deferred backpropagation with SGD).
++ *
++ * IMPORTANT: This file is compiled WITHOUT FPU/SSE flags (normal kernel
++ * compilation).  All floating-point and SIMD code lives in nap_fpu.c and
++ * nap_nn_{sse2,avx2}.c, which are compiled with CC_FLAGS_FPU.
++ * This separation ensures the compiler cannot emit SSE instructions in
++ * governor callbacks (nap_select, nap_reflect, etc.), which would corrupt
++ * userspace FPU register state.
++ */
++
++#include <linux/cpuidle.h>
++#include <linux/cpu.h>
++#include <linux/jiffies.h>
++#include <linux/jump_label.h>
++#include <linux/kobject.h>
++#include <linux/math64.h>
++#include <linux/percpu.h>
++#include <linux/sched/clock.h>
++#include <linux/sysfs.h>
++#include <linux/string.h>
++#include <linux/tick.h>
++#include <asm/simd.h>
++#include <asm/fpu/api.h>
++#include <asm/processor.h>
++
++#include "nap.h"
++
++#include "../gov.h"
++
++/**************************************************************
++ * Version Information:
++ */
++
++#define CPUIDLE_NAP_PROGNAME "Nap CPUIdle Governor"
++#define CPUIDLE_NAP_AUTHOR   "Masahito Suzuki"
++
++#define CPUIDLE_NAP_VERSION  "0.5.0"
++
++/* Governor defaults */
++#define NAP_DEFAULT_LR_MILLTHS    1     /* 0.001 = 1 millths */
++#define NAP_DEFAULT_INTERVAL      4     /* learn every 4 reflects */
++#define NAP_DEFAULT_CLAMP_MILLTHS 1000  /* 1.0 = 1000 millths */
++#define NAP_DEFAULT_CONF_MILLTHS  500   /* 0.5 = balanced survival confidence */
++
++/* Backport: RESIDENCY_THRESHOLD_NS is not available in Linux 6.16. */
++#ifndef RESIDENCY_THRESHOLD_NS
++#define RESIDENCY_THRESHOLD_NS TICK_NSEC
++#endif
++
++/* ================================================================
++ * ISA dispatch via static keys (definitions only; dispatch in nap_fpu.c)
++ * ================================================================ */
++
++DEFINE_STATIC_KEY_FALSE(nap_use_avx2);
++
++static void __init nap_detect_simd(void)
++{
++	if (boot_cpu_has(X86_FEATURE_FMA) &&
++	    boot_cpu_has(X86_FEATURE_AVX2)) {
++		static_branch_enable(&nap_use_avx2);
++		pr_info("nap: using AVX2+FMA\n");
++	} else {
++		pr_info("nap: using SSE2\n");
++	}
++}
++
++/* ================================================================
++ * Per-CPU data
++ * ================================================================ */
++
++DEFINE_PER_CPU(struct nap_cpu_data, nap_data);
++static struct cpuidle_driver *nap_cached_drv;
++
++/* ================================================================
++ * Reflect-time updates (integer-only, no FPU needed)
++ * ================================================================ */
++
++static void nap_history_update(struct nap_cpu_data *d, u64 measured_ns)
++{
++	d->history[d->hist_idx] = measured_ns;
++	d->hist_idx = (d->hist_idx + 1) % NAP_HISTORY_SIZE;
++	if (d->hist_count < NAP_HISTORY_SIZE)
++		d->hist_count++;
++}
++
++static void nap_update_external_signals(struct nap_cpu_data *d)
++{
++	d->prev_idle_exit = local_clock();
++}
++
++/* ================================================================
++ * Governor callbacks
++ * ================================================================ */
++
++static int nap_fallback_heuristic(struct cpuidle_driver *drv,
++				  struct cpuidle_device *dev)
++{
++	s64 latency_req = cpuidle_governor_latency_req(dev->cpu);
++	ktime_t delta_tick;
++	u64 sleep_length_ns;
++	int i;
++
++	sleep_length_ns = ktime_to_ns(tick_nohz_get_sleep_length(&delta_tick));
++
++	for (i = drv->state_count - 1; i > 0; i--) {
++		if (dev->states_usage[i].disable)
++			continue;
++		if (drv->states[i].exit_latency_ns > latency_req)
++			continue;
++		if (drv->states[i].target_residency_ns > sleep_length_ns)
++			continue;
++		return i;
++	}
++	return 0;
++}
++
++/*
++ * Return the shallowest enabled C-state that satisfies the current
++ * latency request, or 0 if none exists (POLL is the only option).
++ * Does not consult the NN.
++ */
++static int nap_find_min_valid_state(struct cpuidle_driver *drv,
++				    struct cpuidle_device *dev,
++				    s64 latency_req)
++{
++	int i;
++
++	for (i = 1; i < drv->state_count; i++) {
++		if (dev->states_usage[i].disable)
++			continue;
++		if (drv->states[i].exit_latency_ns > latency_req)
++			continue;
++		return i;
++	}
++	return 0;
++}
++
++/*
++ * Cached wrapper around nap_find_min_valid_state().  Invalidated when
++ * latency_req changes (immediate PM QoS propagation) or every
++ * NAP_MIN_STATE_REFRESH_JIFFIES (bounded staleness for rare sysfs /
++ * runtime-driver state-disable events).  Hot-path cost when valid:
++ * one s64 compare plus one time_after() check.
++ */
++static inline int nap_get_min_valid_state(struct nap_cpu_data *d,
++					  struct cpuidle_driver *drv,
++					  struct cpuidle_device *dev,
++					  s64 latency_req)
++{
++	if (unlikely(latency_req != d->cached_min_state_latency ||
++		     time_after(jiffies,
++				d->cached_min_state_jiffies +
++				NAP_MIN_STATE_REFRESH_JIFFIES))) {
++		d->cached_min_state = nap_find_min_valid_state(drv, dev,
++							       latency_req);
++		d->cached_min_state_latency = latency_req;
++		d->cached_min_state_jiffies = jiffies;
++	}
++	return d->cached_min_state;
++}
++
++/*
++ * Compute dev->poll_limit_ns for the short-circuit path: predicted
++ * wake time plus a 1 us margin (absorbs timer jitter so a slightly
++ * late wake does not retrigger select/enter/reflect), floored at
++ * NAP_POLL_LIMIT_MIN_NS and capped at the min state's target
++ * residency (beyond which the C-state would have been the better
++ * choice).
++ */
++static inline u64 nap_compute_poll_limit(u64 sleep_length_ns,
++					 u64 min_state_target_ns)
++{
++	u64 budget = sleep_length_ns + NAP_POLL_LIMIT_MARGIN_NS;
++
++	return clamp_t(u64, budget,
++		       NAP_POLL_LIMIT_MIN_NS,
++		       min_state_target_ns);
++}
++
++static int nap_select(struct cpuidle_driver *drv,
++		      struct cpuidle_device *dev,
++		      bool *stop_tick)
++{
++	struct nap_cpu_data *d = this_cpu_ptr(&nap_data);
++	s64 latency_req;
++	ktime_t delta_tick;
++	u64 sleep_length_ns;
++	int idx, min_state;
++
++	if (unlikely(drv->state_count <= 1))
++		return 0;
++
++	latency_req = cpuidle_governor_latency_req(dev->cpu);
++	sleep_length_ns = ktime_to_ns(tick_nohz_get_sleep_length(&delta_tick));
++	min_state = nap_get_min_valid_state(d, drv, dev, latency_req);
++
++	/*
++	 * Fast path: when no C-state can amortize its target residency
++	 * within the predicted sleep length, the answer is deterministically
++	 * POLL.  Skip NN inference and feature extraction entirely;
++	 * nap_reflect also skips the feedback path for short-circuited
++	 * events (see the short_circuited check there).
++	 */
++	if (min_state == 0 ||
++	    sleep_length_ns < drv->states[min_state].target_residency_ns) {
++		if (min_state > 0)
++			dev->poll_limit_ns = nap_compute_poll_limit(
++				sleep_length_ns,
++				drv->states[min_state].target_residency_ns);
++		else
++			dev->poll_limit_ns = max_t(u64, sleep_length_ns,
++						   NAP_POLL_LIMIT_MIN_NS);
++
++		*stop_tick = false;
++		d->last_selected_idx = 0;
++		d->short_circuited = true;
++		d->stats.total_selects++;
++		return 0;
++	}
++
++	d->short_circuited = false;
++
++	if (likely(may_use_simd())) {
++		kernel_fpu_begin();
++		idx = nap_fpu_select(drv, dev, d);
++		kernel_fpu_end();
++
++		if (idx < 0)
++			idx = nap_fallback_heuristic(drv, dev);
++	} else {
++		idx = nap_fallback_heuristic(drv, dev);
++	}
++
++	*stop_tick = (drv->states[idx].target_residency_ns >
++		      RESIDENCY_THRESHOLD_NS);
++
++	d->last_selected_idx = idx;
++	d->stats.total_selects++;
++
++	return idx;
++}
++
++static void nap_reflect(struct cpuidle_device *dev, int index)
++{
++	struct nap_cpu_data *d = this_cpu_ptr(&nap_data);
++	struct cpuidle_driver *drv = cpuidle_get_cpu_driver(dev);
++	u64 measured_ns = dev->last_residency_ns;
++
++	if (unlikely(!drv))
++		return;
++
++	/*
++	 * Short-circuited POLL: the NN was not invoked for this idle, so
++	 * the residency is not part of its training distribution and must
++	 * not feed the floor histogram or the weight update.  Account only
++	 * the aggregate residency and return.
++	 */
++	if (d->short_circuited) {
++		d->stats.total_residency_ns += measured_ns;
++		return;
++	}
++
++	nap_history_update(d, measured_ns);
++
++	d->last_prediction_error = d->last_predicted_ns - (s64)measured_ns;
++	nap_update_external_signals(d);
++
++	/* Every idle provides a fresh residency for the floor and reliability EMAs */
++	d->learn_actual_ns = measured_ns;
++	d->have_sample = true;
++
++	/*
++	 * Throttle the expensive trunk/score weight update with a dual
++	 * gate: the per-N-reflect counter AND a jiffies floor.  The time
++	 * gate caps the learning rate on workloads with very rapid idle
++	 * bursts (e.g. cross-CPU ping-pong); learn_jiffies_min == 0
++	 * disables it and restores counter-only behavior.
++	 */
++	if (++d->learn_counter >= d->learn_interval &&
++	    time_after_eq(jiffies,
++			  d->last_learn_jiffies + d->learn_jiffies_min)) {
++		d->learn_counter = 0;
++		d->last_learn_jiffies = jiffies;
++		d->needs_learn = true;
++	}
++
++	d->stats.total_residency_ns += measured_ns;
++	if (index > 0 && measured_ns < drv->states[index].target_residency_ns)
++		d->stats.overshoot_count++;
++}
++
++static int nap_enable(struct cpuidle_driver *drv,
++		      struct cpuidle_device *dev)
++{
++	struct nap_cpu_data *d = per_cpu_ptr(&nap_data, dev->cpu);
++
++	memset(d, 0, sizeof(*d));
++
++	/*
++	 * Defer weight initialization to the first nap_select() FPU path
++	 * via reset_pending.  nap_enable() is called from cpuidle core
++	 * (cpuidle_enable_device) which may run on a different CPU than
++	 * dev->cpu during governor switch.  Deferring ensures FPU init
++	 * happens on the correct CPU in its own idle context.
++	 */
++	WRITE_ONCE(nap_cached_drv, drv);
++	d->learning_rate_millths  = NAP_DEFAULT_LR_MILLTHS;
++	d->learn_interval = NAP_DEFAULT_INTERVAL;
++	d->max_grad_norm_millths  = NAP_DEFAULT_CLAMP_MILLTHS;
++	d->conf_millths = NAP_DEFAULT_CONF_MILLTHS;
++
++	/*
++	 * Force a first-call refresh of the min-valid-state cache:
++	 * cached_min_state_latency = S64_MIN guarantees the first
++	 * nap_select() comparison trips the invalidation branch.
++	 */
++	d->cached_min_state_latency = S64_MIN;
++	d->cached_min_state_jiffies = jiffies - NAP_MIN_STATE_REFRESH_JIFFIES;
++	d->learn_jiffies_min = 1;
++
++	d->reset_pending = true;
++
++	return 0;
++}
++
++static void nap_disable(struct cpuidle_driver *drv,
++			struct cpuidle_device *dev)
++{
++	WRITE_ONCE(nap_cached_drv, NULL);
++}
++
++/* ================================================================
++ * sysfs interface  (/sys/devices/system/cpu/nap/)
++ * ================================================================ */
++
++static ssize_t stats_show(struct kobject *kobj,
++			  struct kobj_attribute *attr, char *buf)
++{
++	int cpu, len = 0;
++	u64 total_sel = 0, total_res = 0, total_over = 0, total_learn = 0;
++
++	for_each_online_cpu(cpu) {
++		struct nap_cpu_data *d = &per_cpu(nap_data, cpu);
++
++		total_sel   += d->stats.total_selects;
++		total_res   += d->stats.total_residency_ns;
++		total_over  += d->stats.overshoot_count;
++		total_learn += d->stats.learn_count;
++	}
++	/* Rate uses div64_u64(): total_sel is u64, div_u64() divides by u32. */
++	len += sysfs_emit_at(buf, len, "total_selects: %llu\n", total_sel);
++	len += sysfs_emit_at(buf, len, "total_residency_ms: %llu\n",
++			     div_u64(total_res, NSEC_PER_MSEC));
++	len += sysfs_emit_at(buf, len, "overshoot_count: %llu\n", total_over);
++	len += sysfs_emit_at(buf, len, "overshoot_rate_permil: %llu\n",
++			     total_sel ? div64_u64(total_over * 1000, total_sel) : 0);
++	len += sysfs_emit_at(buf, len, "learn_count: %llu\n", total_learn);
++	return len;
++}
++
++static ssize_t learning_rate_show(struct kobject *kobj,
++				  struct kobj_attribute *attr, char *buf)
++{
++	int cpu;
++
++	cpu = cpumask_first(cpu_online_mask);
++	if (cpu >= nr_cpu_ids)
++		return sysfs_emit(buf, "0\n");
++	return sysfs_emit(buf, "%u\n",
++			  per_cpu(nap_data, cpu).learning_rate_millths);
++}
++
++static ssize_t learning_rate_store(struct kobject *kobj,
++				   struct kobj_attribute *attr,
++				   const char *buf, size_t count)
++{
++	unsigned int val;
++	int cpu;
++
++	if (kstrtouint(buf, 10, &val) || val == 0 || val > 100)
++		return -EINVAL;
++
++	for_each_online_cpu(cpu)
++		per_cpu(nap_data, cpu).learning_rate_millths = val;
++
++	return count;
++}
++
++static ssize_t learn_interval_show(struct kobject *kobj,
++				   struct kobj_attribute *attr, char *buf)
++{
++	int cpu;
++
++	cpu = cpumask_first(cpu_online_mask);
++	if (cpu >= nr_cpu_ids)
++		return sysfs_emit(buf, "0\n");
++	return sysfs_emit(buf, "%d\n",
++			  per_cpu(nap_data, cpu).learn_interval);
++}
++
++static ssize_t learn_interval_store(struct kobject *kobj,
++				    struct kobj_attribute *attr,
++				    const char *buf, size_t count)
++{
++	unsigned int val;
++	int cpu;
++
++	if (kstrtouint(buf, 10, &val) || val == 0 || val > 10000)
++		return -EINVAL;
++
++	for_each_online_cpu(cpu)
++		per_cpu(nap_data, cpu).learn_interval = val;
++
++	return count;
++}
++
++static ssize_t reset_weights_store(struct kobject *kobj,
++				   struct kobj_attribute *attr,
++				   const char *buf, size_t count)
++{
++	cpumask_var_t mask;
++	int cpu;
++
++	if (!READ_ONCE(nap_cached_drv))
++		return -ENODEV;
++
++	/*
++	 * Set a per-CPU flag; each CPU will reinitialize its own weights
++	 * inside nap_select() within its own kernel_fpu_begin/end context.
++	 * This avoids cross-CPU data races on the weight arrays.
++	 *
++	 * Accepts "all" to reset every online CPU, or a cpulist
++	 * (e.g. "0-3,5,7") to reset specific CPUs.
++	 */
++	if (sysfs_streq(buf, "all")) {
++		for_each_online_cpu(cpu)
++			per_cpu(nap_data, cpu).reset_pending = true;
++		pr_info("nap: weight reset scheduled for all CPUs\n");
++		return count;
++	}
++
++	if (!alloc_cpumask_var(&mask, GFP_KERNEL))
++		return -ENOMEM;
++
++	if (cpulist_parse(buf, mask)) {
++		free_cpumask_var(mask);
++		return -EINVAL;
++	}
++
++	for_each_cpu_and(cpu, mask, cpu_online_mask)
++		per_cpu(nap_data, cpu).reset_pending = true;
++
++	pr_info("nap: weight reset scheduled for CPUs %*pbl\n",
++		cpumask_pr_args(mask));
++	free_cpumask_var(mask);
++	return count;
++}
++
++static ssize_t reset_stats_store(struct kobject *kobj,
++				 struct kobj_attribute *attr,
++				 const char *buf, size_t count)
++{
++	int cpu;
++
++	for_each_online_cpu(cpu)
++		memset(&per_cpu(nap_data, cpu).stats, 0,
++		       sizeof(struct nap_stats));
++
++	return count;
++}
++
++/*
++ * confidence: decision confidence level in millths (1..999, default 500).
++ * Higher demands more certainty before entering a deeper state, biasing toward
++ * responsiveness (shallower); lower biases toward energy (deeper).  This is the
++ * single responsiveness dial and replaces the former overshoot_pctl target.
++ */
++static ssize_t confidence_show(struct kobject *kobj,
++			       struct kobj_attribute *attr, char *buf)
++{
++	int cpu;
++
++	cpu = cpumask_first(cpu_online_mask);
++	if (cpu >= nr_cpu_ids)
++		return sysfs_emit(buf, "0\n");
++	return sysfs_emit(buf, "%u\n",
++			  per_cpu(nap_data, cpu).conf_millths);
++}
++
++static ssize_t confidence_store(struct kobject *kobj,
++				struct kobj_attribute *attr,
++				const char *buf, size_t count)
++{
++	unsigned int val;
++	int cpu;
++
++	if (kstrtouint(buf, 10, &val) || val == 0 || val >= 1000)
++		return -EINVAL;
++
++	for_each_online_cpu(cpu)
++		per_cpu(nap_data, cpu).conf_millths = val;
++
++	return count;
++}
++
++static ssize_t version_show(struct kobject *kobj,
++			    struct kobj_attribute *attr, char *buf)
++{
++	return sysfs_emit(buf, "%s\n", CPUIDLE_NAP_VERSION);
++}
++
++static ssize_t simd_show(struct kobject *kobj,
++			 struct kobj_attribute *attr, char *buf)
++{
++	if (static_branch_unlikely(&nap_use_avx2))
++		return sysfs_emit(buf, "avx2\n");
++	else
++		return sysfs_emit(buf, "sse2\n");
++}
++
++static struct kobj_attribute version_attr        = __ATTR_RO(version);
++static struct kobj_attribute simd_attr           = __ATTR_RO(simd);
++static struct kobj_attribute stats_attr          = __ATTR_RO(stats);
++static struct kobj_attribute learning_rate_attr  = __ATTR_RW(learning_rate);
++static struct kobj_attribute learn_interval_attr = __ATTR_RW(learn_interval);
++static struct kobj_attribute confidence_attr     = __ATTR_RW(confidence);
++static struct kobj_attribute reset_weights_attr  = __ATTR_WO(reset_weights);
++static struct kobj_attribute reset_stats_attr    = __ATTR_WO(reset_stats);
++
++static struct attribute *nap_attrs[] = {
++	&version_attr.attr,
++	&simd_attr.attr,
++	&stats_attr.attr,
++	&learning_rate_attr.attr,
++	&learn_interval_attr.attr,
++	&confidence_attr.attr,
++	&reset_weights_attr.attr,
++	&reset_stats_attr.attr,
++	NULL,
++};
++
++static const struct attribute_group nap_attr_group = {
++	.attrs = nap_attrs,
++};
++
++static struct kobject *cpuidle_kobj;
++
++int nap_sysfs_init(void)
++{
++	struct device *dev_root;
++	int ret;
++
++	dev_root = bus_get_dev_root(&cpu_subsys);
++	if (!dev_root)
++		return -ENODEV;
++
++	cpuidle_kobj = kobject_create_and_add("nap", &dev_root->kobj);
++	put_device(dev_root);
++	if (!cpuidle_kobj)
++		return -ENOMEM;
++
++	ret = sysfs_create_group(cpuidle_kobj, &nap_attr_group);
++	if (ret) {
++		kobject_put(cpuidle_kobj);
++		cpuidle_kobj = NULL;
++	}
++	return ret;
++}
++
++void nap_sysfs_exit(void)
++{
++	if (cpuidle_kobj) {
++		sysfs_remove_group(cpuidle_kobj, &nap_attr_group);
++		kobject_put(cpuidle_kobj);
++		cpuidle_kobj = NULL;
++	}
++}
++
++/* ================================================================
++ * Governor registration
++ * ================================================================ */
++
++static struct cpuidle_governor nap_governor = {
++	.name    = "nap",
++	.rating  = 26,
++	.enable  = nap_enable,
++	.disable = nap_disable,
++	.select  = nap_select,
++	.reflect = nap_reflect,
++};
++
++static int __init nap_init(void)
++{
++	int ret;
++
++	nap_detect_simd();
++
++	ret = nap_sysfs_init();
++	if (ret)
++		pr_warn("nap: sysfs init failed: %d (continuing without sysfs)\n", ret);
++
++	ret = cpuidle_register_governor(&nap_governor);
++	if (ret) {
++		pr_err("nap: register_governor failed: %d\n", ret);
++		nap_sysfs_exit();
++		return ret;
++	}
++
++	pr_info("%s v%s by %s registered (rating=%u)\n",
++	       CPUIDLE_NAP_PROGNAME, CPUIDLE_NAP_VERSION,
++	       CPUIDLE_NAP_AUTHOR, nap_governor.rating);
++	return 0;
++}
++postcore_initcall(nap_init);
+diff --git a/drivers/cpuidle/governors/nap/nap.h b/drivers/cpuidle/governors/nap/nap.h
+new file mode 100644
+index 0000000000..0f6aae7d17
+--- /dev/null
++++ b/drivers/cpuidle/governors/nap/nap.h
+@@ -0,0 +1,291 @@
++/* SPDX-License-Identifier: GPL-2.0 */
++#ifndef NAP_H
++#define NAP_H
++
++#include <linux/cpuidle.h>
++#include <linux/jump_label.h>
++#include <linux/ktime.h>
++
++/* ================================================================
++ * Neural network dimensions
++ * ================================================================ */
++
++#define NAP_INPUT_SIZE    8
++#define NAP_HIDDEN_SIZE   8
++#define NAP_NUM_CUTS      (CPUIDLE_STATE_MAX - 1)
++
++/*
++ * Neural network weights for an 8-input MLP with an ordinal survival head.
++ *
++ * The trunk maps input[8] → hidden[8] (ReLU), feeding a shared linear score
++ *   s = w_out . hidden + b_out
++ * which is the input to a proportional-odds ordinal head. For each idle-state
++ * boundary k the predicted survival probability that the upcoming idle reaches
++ * that state's target_residency is
++ *   q_k = sigmoid(s - thr_ord[k-1]).
++ * With ordered thresholds this represents the idle-duration distribution at
++ * exactly the points the decision needs (the sufficient statistic), rather
++ * than a single point estimate. The decision layer compares q_k against a
++ * calibrated confidence level (see nap_fpu_select()).
++ *
++ * Column-major storage: w_h1[j][i] = weight from input j to hidden neuron i.
++ * This layout enables efficient column-wise matrix-vector products where
++ * each input broadcasts across all hidden neurons via SIMD FMA.
++ *
++ * thr_ord is appended after the SIMD-accessed fields so their offsets are
++ * unchanged. __aligned(32) ensures AVX2 vmovaps (32-byte) aligned loads
++ * work correctly (8 floats = 32 bytes = one ymm register).
++ */
++struct nap_weights {
++	/* Hidden layer: input[8] → hidden[8] */
++	float w_h1[NAP_INPUT_SIZE][NAP_HIDDEN_SIZE];  /* 64 params */
++	float b_h1[NAP_HIDDEN_SIZE];                   /* 8 params  */
++	/* Shared score head: hidden[8] → scalar s */
++	float w_out[NAP_HIDDEN_SIZE];                  /* 8 params  */
++	float b_out;                                   /* 1 param   */
++	/* Ordinal survival head: one ordered threshold per state boundary */
++	float thr_ord[NAP_NUM_CUTS];
++} __aligned(32);
++
++/* ISA-specific forward pass implementations */
++void nap_nn_forward_sse2(const float *input, float *output,
++			 float *hidden_save, const struct nap_weights *w);
++void nap_nn_forward_avx2(const float *input, float *output,
++			 float *hidden_save, const struct nap_weights *w);
++
++/* ISA-specific online learning (backpropagation) */
++struct nap_cpu_data;
++void nap_nn_learn_sse2(struct nap_cpu_data *d);
++void nap_nn_learn_avx2(struct nap_cpu_data *d);
++
++/* Static key for ISA dispatch (defined in nap.c) */
++DECLARE_STATIC_KEY_FALSE(nap_use_avx2);
++
++/* ================================================================
++ * SIMD type definitions and helpers (GCC vector extensions)
++ *
++ * Only available when compiled with FPU/SSE flags (nap_fpu.c,
++ * nap_nn_*.c).  nap.c is compiled without FPU flags and must
++ * not see these definitions.
++ *
++ * <immintrin.h> is a userspace header and cannot be used in kernel.
++ * We use __attribute__((__vector_size__())) and __builtin_ia32_*.
++ * ================================================================ */
++
++#ifdef __SSE2__
++
++typedef float v4sf  __attribute__((__vector_size__(16)));   /* xmm: 4×float  */
++typedef int   v4si  __attribute__((__vector_size__(16)));   /* xmm: 4×int32  */
++typedef float v8sf  __attribute__((__vector_size__(32)));   /* ymm: 8×float  */
++
++/* Broadcast helpers */
++#define V4SF_SET1(x)  ((v4sf){ (x), (x), (x), (x) })
++#define V4SI_SET1(x)  ((v4si){ (x), (x), (x), (x) })
++#define V8SF_SET1(x)  ((v8sf){ (x),(x),(x),(x),(x),(x),(x),(x) })
++#define V8SF_ZERO     V8SF_SET1(0.0f)
++
++/* Unaligned load/store helpers */
++static inline v4sf v4sf_loadu(const float *p)
++{
++	v4sf result;
++	__builtin_memcpy(&result, p, sizeof(result));
++	return result;
++}
++
++static inline void v4sf_storeu(float *p, v4sf v)
++{
++	__builtin_memcpy(p, &v, sizeof(v));
++}
++
++#ifdef __AVX__
++static inline v8sf v8sf_loadu(const float *p)
++{
++	v8sf result;
++	__builtin_memcpy(&result, p, sizeof(result));
++	return result;
++}
++
++static inline void v8sf_storeu(float *p, v8sf v)
++{
++	__builtin_memcpy(p, &v, sizeof(v));
++}
++#endif /* __AVX__ */
++
++/* Scalar/vector clamp helpers */
++static inline float fclampf(float v, float lo, float hi)
++{
++	if (v < lo) return lo;
++	if (v > hi) return hi;
++	return v;
++}
++
++static inline v4sf v4sf_clamp(v4sf v, v4sf lo, v4sf hi)
++{
++	return __builtin_ia32_maxps(__builtin_ia32_minps(v, hi), lo);
++}
++
++/* Type punning: float ↔ int reinterpret (no instruction generated) */
++static inline v4si v4sf_as_v4si(v4sf v)
++{
++	union { v4sf f; v4si i; } u = { .f = v };
++	return u.i;
++}
++
++static inline v4sf v4si_as_v4sf(v4si v)
++{
++	union { v4si i; v4sf f; } u = { .i = v };
++	return u.f;
++}
++
++/*
++ * fast_log2f_sse() — Compute log2 of 4 floats simultaneously using SSE2
++ *
++ * Cost: ~15 cycles for 4 values (~4 cycles per value)
++ */
++static inline v4sf fast_log2f_sse(v4sf x)
++{
++	const v4si mask_exp  = V4SI_SET1(0xFF);
++	const v4si bias      = V4SI_SET1(127);
++	const v4si mask_mant = V4SI_SET1(0x7FFFFF);
++	const v4si exp_bias  = V4SI_SET1(127 << 23);
++
++	v4si xi    = v4sf_as_v4si(x);
++	v4si exp_i = (xi >> 23) & mask_exp;
++	exp_i      = exp_i - bias;
++	v4sf e     = __builtin_convertvector(exp_i, v4sf);
++
++	v4si mant_i = (xi & mask_mant) | exp_bias;
++	v4sf m      = v4si_as_v4sf(mant_i) - V4SF_SET1(1.0f);
++
++	v4sf p;
++	p = m * V4SF_SET1(0.4808f);
++	p = V4SF_SET1(0.7213f) - p;
++	p = m * p;
++	p = V4SF_SET1(1.4425f) - p;
++	p = m * p;
++
++	return e + p;
++}
++
++#endif /* __SSE2__ */
++
++/* ================================================================
++ * Feature extraction
++ * ================================================================ */
++
++#define NAP_HISTORY_SIZE     8
++
++/* ================================================================
++ * POLL short-circuit tunables
++ * ================================================================ */
++
++/* dev->poll_limit_ns floor and safety margin written by
++ * nap_compute_poll_limit().  Both 1 us: the POLL state samples its
++ * own timeout only every ~1 us (POLL_IDLE_RELAX_COUNT cpu_relax()
++ * iterations in poll_state.c), so finer values are indistinguishable.
++ */
++#define NAP_POLL_LIMIT_MIN_NS      1000ULL
++#define NAP_POLL_LIMIT_MARGIN_NS   1000ULL
++
++/* Refresh interval for the cached minimum-valid-state lookup.  HZ
++ * jiffies (1 s) bounds staleness from sysfs/runtime state-disable
++ * events; PM QoS latency changes are detected immediately via the
++ * cached latency_req comparison.
++ */
++#define NAP_MIN_STATE_REFRESH_JIFFIES  HZ
++
++struct nap_stats {
++	u64 total_selects;
++	u64 total_residency_ns;
++	u64 overshoot_count;
++	u64 learn_count;
++};
++
++struct nap_cpu_data {
++	/* Ring buffer */
++	u64   history[NAP_HISTORY_SIZE];
++	float log_history[NAP_HISTORY_SIZE];
++	int   hist_idx;
++	int   hist_count;
++
++	/* External signal tracking */
++	u64     prev_idle_exit;
++	s64     last_predicted_ns;
++	s64     last_prediction_error;
++
++	/* POLL short-circuit fast path */
++	bool short_circuited;			/* set in select, read in reflect */
++	int  cached_min_state;			/* cached shallowest valid state */
++	s64  cached_min_state_latency;		/* latency_req when cache populated */
++	unsigned long cached_min_state_jiffies;	/* jiffies when cache populated */
++
++	/* Jiffies-based learning rate floor */
++	unsigned long last_learn_jiffies;
++	unsigned int  learn_jiffies_min;	/* 0 = disabled */
++
++	/* select/reflect handoff */
++	int   last_selected_idx;
++
++	/* Shared ordinal score s (≈ log2 of the predicted idle duration in ns).
++	 * Survival at boundary k is sigmoid(s - thr_ord[k-1]).
++	 */
++	float nn_output;
++
++	/*
++	 * hidden_out[], features_f32[] are written with aligned SIMD
++	 * stores in nap_nn_forward_{sse2,avx2}() and
++	 * nap_extract_features():
++	 *   SSE2:    movaps  (16-byte aligned)
++	 *   AVX2:    vmovaps (32-byte aligned)
++	 * Without __aligned(32), the natural struct offset would be
++	 * only 4-byte aligned, causing #GP faults in the idle task.
++	 */
++	float hidden_out[NAP_HIDDEN_SIZE] __aligned(32);
++	float features_f32[NAP_INPUT_SIZE] __aligned(32);
++
++	/* Backprop scratch */
++	float learn_d_out;	/* score gradient g = sum_k (q_k - y_k) */
++	float learn_lr;		/* effective learning rate (symmetric) */
++	float learn_d_hid[NAP_HIDDEN_SIZE] __aligned(32);
++
++	/* Precomputed per-state log2 thresholds.
++	 * log2_tres[i] = log2(target_residency_ns) (ordinal thresholds, timer clamp)
++	 */
++	float log2_tres[CPUIDLE_STATE_MAX];
++
++	/* Decayed per-bin idle histogram: robustness-floor survival estimate */
++	float bin_count[CPUIDLE_STATE_MAX];
++
++	/* Deferred learning data */
++	bool  needs_learn;
++	bool  have_sample;	/* a fresh residency awaits per-idle processing */
++	u64   learn_actual_ns;
++
++	/* Single network: 8→8 trunk + ordinal survival head (see nap_nn_*.c) */
++	struct nap_weights weights;
++	struct nap_weights *active_w;	/* always &weights; consumed by SIMD forward/learn */
++
++	/* Online learning */
++	unsigned int learning_rate_millths;
++	unsigned int max_grad_norm_millths;
++	unsigned int conf_millths;	/* decision confidence level (500 = 0.5) */
++	int   learn_interval;
++	int   learn_counter;
++	bool reset_pending;		/* set by sysfs, consumed by nap_select */
++
++	/* sysfs statistics */
++	struct nap_stats stats;
++};
++
++DECLARE_PER_CPU(struct nap_cpu_data, nap_data);
++
++/* FPU entry point (nap_fpu.c) — call only within kernel_fpu_begin/end */
++int nap_fpu_select(struct cpuidle_driver *drv,
++		   struct cpuidle_device *dev,
++		   struct nap_cpu_data *d);
++
++/* sysfs interface */
++int  nap_sysfs_init(void);
++void nap_sysfs_exit(void);
++
++#endif /* NAP_H */
+diff --git a/drivers/cpuidle/governors/nap/nap_fpu.c b/drivers/cpuidle/governors/nap/nap_fpu.c
+new file mode 100644
+index 0000000000..9465262969
+--- /dev/null
++++ b/drivers/cpuidle/governors/nap/nap_fpu.c
+@@ -0,0 +1,528 @@
++// SPDX-License-Identifier: GPL-2.0
++/*
++ * nap_fpu.c — FPU/SIMD code for the NAP cpuidle governor
++ *
++ * This file is compiled with FPU/SSE flags enabled (CC_FLAGS_FPU).
++ * ALL functions here MUST be called only from within
++ * kernel_fpu_begin()/kernel_fpu_end() blocks.
++ *
++ * Keeping FPU code in a separate translation unit ensures the compiler
++ * cannot emit SSE/x87 instructions in non-FPU code paths (nap.c),
++ * which would silently corrupt userspace FPU register state.
++ */
++
++#include <linux/cpuidle.h>
++#include <linux/math64.h>
++#include <linux/percpu.h>
++#include <linux/pm_qos.h>
++#include <linux/sched/clock.h>
++#include <linux/string.h>
++#include <linux/tick.h>
++
++#include "nap.h"
++
++/* Clang lacks __builtin_ia32_movhlps; emulate with __builtin_shufflevector */
++#ifdef __clang__
++#define __builtin_ia32_movhlps(a, b) \
++	__builtin_shufflevector(b, a, 2, 3, 6, 7)
++#endif
++
++/* ================================================================
++ * Float math helpers
++ * ================================================================ */
++
++static inline float float_min(float a, float b) { return a < b ? a : b; }
++static inline float float_max(float a, float b) { return a > b ? a : b; }
++
++/*
++ * Kernel-safe sqrtf using the SSE sqrtss instruction directly.
++ * GCC may lower sqrtf()/__builtin_sqrtf() to a libm call, which is
++ * unavailable in the kernel.  This file is always compiled with FPU/SSE enabled.
++ */
++static inline float nap_sqrtf(float x)
++{
++	asm("sqrtss %1, %0" : "=x"(x) : "x"(x));
++	return x;
++}
++
++/* Scalar log2 approximation (same algorithm as fast_log2f_sse) */
++static inline float fast_log2f(float x)
++{
++	union { float f; u32 i; } u = { .f = x };
++	int exp = (int)((u.i >> 23) & 0xFFu) - 127;
++	float e = (float)exp;
++	float m, p;
++
++	u.i = (u.i & 0x7FFFFFu) | (127u << 23);
++	m = u.f - 1.0f;
++
++	p = m * 0.4808f;
++	p = 0.7213f - p;
++	p = m * p;
++	p = 1.4425f - p;
++	p = m * p;
++
++	return e + p;
++}
++
++/*
++ * Scalar 2^x approximation: integer part via exponent bits, fraction via the
++ * cubic Taylor series of 2^f on [0,1) (rel. error <= ~0.6%).  Used to build the logistic.
++ */
++static inline float fast_exp2f(float x)
++{
++	union { u32 i; float f; } v;
++	int xi;
++	float f;
++
++	if (x > 60.0f)
++		x = 60.0f;
++	else if (x < -60.0f)
++		x = -60.0f;
++
++	xi = (int)x;
++	if (x < (float)xi)
++		xi--;			/* floor toward negative infinity */
++	f = x - (float)xi;
++
++	v.i = (u32)((xi + 127) << 23);	/* 2^xi */
++	return v.f * (1.0f + f * (0.6931472f +
++			f * (0.2402265f + f * 0.0555041f)));
++}
++
++/* Logistic sigmoid: sigmoid(x) = 1 / (1 + e^-x) = 1 / (1 + 2^(-x*log2(e))) */
++static inline float nap_sigmoidf(float x)
++{
++	return 1.0f / (1.0f + fast_exp2f(-1.4426950f * x));
++}
++
++/*
++ * Robustness floor and Beta-Binomial shrinkage.
++ *
++ * bin_count[] is an exponentially decayed histogram (window NAP_FLOOR_WIN, in
++ * idles) of which idle-state bin each idle landed in, updated every idle; its
++ * survival estimate is a fast, forgetting-resistant memory.  The decision
++ * treats the NN survival as a prior worth NAP_PRIOR_K pseudo-observations and
++ * the decayed histogram as data:
++ *   q_k = (NAP_PRIOR_K * q_nn_k + count(>=k)) / (NAP_PRIOR_K + total).
++ * Cold (no data) follows the NN; once the histogram fills it dominates.
++ */
++#define NAP_FLOOR_WIN  256
++#define NAP_PRIOR_K    16
++
++/* ================================================================
++ * Deterministic PRNG for weight initialization (LCG)
++ * ================================================================ */
++
++static inline float nap_prng_float(u32 *state)
++{
++	*state = *state * 1664525u + 1013904223u;
++	return (float)(s32)*state * (1.0f / 2147483648.0f);
++}
++
++/* ================================================================
++ * ISA dispatch via static keys
++ * ================================================================ */
++
++static inline void nap_nn_forward(const float *input, float *output,
++				  float *hidden_save,
++				  const struct nap_weights *w)
++{
++	if (static_branch_unlikely(&nap_use_avx2))
++		nap_nn_forward_avx2(input, output, hidden_save, w);
++	else
++		nap_nn_forward_sse2(input, output, hidden_save, w);
++}
++
++static inline void nap_nn_learn(struct nap_cpu_data *d)
++{
++	if (static_branch_unlikely(&nap_use_avx2))
++		nap_nn_learn_avx2(d);
++	else
++		nap_nn_learn_sse2(d);
++}
++
++/* ================================================================
++ * Weight initialization
++ *
++ * The NN directly outputs predicted sleep time in log2(ns) space.
++ * Hidden neuron 0 is initialized as a pass-through for feature[0]
++ * (log2(sleep_length)), so the initial output ≈ log2(sleep_length).
++ * This matches the pre-learning behavior of selecting the deepest
++ * state that fits within sleep_length.
++ *
++ * Other hidden neurons are Xavier-initialized with near-zero output
++ * weights so their initial contribution is negligible.  Biases = 0.
++ * ================================================================ */
++
++#define NAP_PRNG_SEED 42u
++
++static void nap_init_weights(struct nap_weights *w)
++{
++	u32 rng = NAP_PRNG_SEED;
++	float scale_h1, scale_out;
++	int i, j;
++
++	/* Xavier uniform: U(-sqrt(6/(fan_in+fan_out)), +sqrt(6/(...))) */
++	scale_h1  = nap_sqrtf(6.0f / (float)(NAP_INPUT_SIZE + NAP_HIDDEN_SIZE));
++	scale_out = 0.01f;
++
++	/* Hidden layer weights */
++	for (i = 0; i < NAP_INPUT_SIZE; i++)
++		for (j = 0; j < NAP_HIDDEN_SIZE; j++)
++			w->w_h1[i][j] = nap_prng_float(&rng) * scale_h1;
++
++	/* Hidden biases: zero (standard) */
++	memset(w->b_h1, 0, sizeof(w->b_h1));
++
++	/* Output weights: near-zero for ~0 initial contribution */
++	for (j = 0; j < NAP_HIDDEN_SIZE; j++)
++		w->w_out[j] = nap_prng_float(&rng) * scale_out;
++
++	/* Output bias: zero */
++	w->b_out = 0.0f;
++
++	/*
++	 * Neuron 0: pass-through for feature[0] = log2(sleep_length).
++	 * hidden[0] = ReLU(1.0 * input[0] + 0) = input[0]  (always > 0)
++	 * output += 1.0 * hidden[0] = log2(sleep_length)
++	 *
++	 * Override the random init above so initial output ≈ input[0].
++	 */
++	for (i = 0; i < NAP_INPUT_SIZE; i++)
++		w->w_h1[i][0] = 0.0f;
++	w->w_h1[0][0] = 1.0f;
++	w->b_h1[0] = 0.0f;
++	w->w_out[0] = 1.0f;
++}
++
++/*
++ * Precompute log2(target_residency) per state and seed the ordinal
++ * thresholds.  log2_tres[k] is the boundary location in score space: it
++ * seeds thr_ord[k-1], bounds its learned drift, and clamps the score
++ * against the timer in the decision layer.
++ */
++static void nap_init_log2_tres(struct nap_cpu_data *d,
++			       struct cpuidle_driver *drv)
++{
++	int i;
++
++	for (i = 0; i < drv->state_count; i++) {
++		float tres = float_max(
++			(float)drv->states[i].target_residency_ns, 1.0f);
++
++		d->log2_tres[i] = fast_log2f(tres);
++	}
++
++	/*
++	 * Seed each ordinal threshold at its boundary's log2(target_residency),
++	 * so before learning q_k crosses 0.5 exactly when the score (initially
++	 * ~= log2(sleep_length)) reaches that state's target_residency.  This
++	 * reproduces the deepest-state-that-fits default until learning adapts.
++	 */
++	for (i = 1; i < drv->state_count; i++)
++		d->weights.thr_ord[i - 1] = d->log2_tres[i];
++}
++
++/* ================================================================
++ * Feature extraction helpers
++ * ================================================================ */
++
++struct logring_stats {
++	float avg;
++	float min;
++	float max;
++};
++
++/*
++ * Compute log_history statistics: avg, min, max.
++ * SIMD fast path when the ring buffer is full (8 elements = 2 × xmm).
++ */
++static void logring_compute(const struct nap_cpu_data *d,
++			    struct logring_stats *s)
++{
++	int i, n = d->hist_count;
++	float sum;
++
++	if (n == 0) {
++		*s = (struct logring_stats){ 0 };
++		return;
++	}
++
++	if (n == NAP_HISTORY_SIZE) {
++		v4sf v0 = *(const v4sf *)&d->log_history[0];
++		v4sf v1 = *(const v4sf *)&d->log_history[4];
++		v4sf pmin, pmax, psum, t;
++
++		pmin = __builtin_ia32_minps(v0, v1);
++		pmax = __builtin_ia32_maxps(v0, v1);
++		psum = v0 + v1;
++
++		/* 4 → 2 */
++		t = __builtin_ia32_movhlps(pmin, pmin);
++		pmin = __builtin_ia32_minps(pmin, t);
++		t = __builtin_ia32_movhlps(pmax, pmax);
++		pmax = __builtin_ia32_maxps(pmax, t);
++		t = __builtin_ia32_movhlps(psum, psum);
++		psum = psum + t;
++
++		/* 2 → 1 */
++		t = __builtin_ia32_shufps(pmin, pmin, 0x55);
++		pmin = __builtin_ia32_minps(pmin, t);
++		t = __builtin_ia32_shufps(pmax, pmax, 0x55);
++		pmax = __builtin_ia32_maxps(pmax, t);
++		t = __builtin_ia32_shufps(psum, psum, 0x55);
++		psum = psum + t;
++
++		sum = psum[0];
++		s->min = pmin[0];
++		s->max = pmax[0];
++	} else {
++		float val;
++
++		sum = d->log_history[0];
++		s->min = sum;
++		s->max = sum;
++
++		for (i = 1; i < n; i++) {
++			val = d->log_history[i];
++			sum += val;
++			s->min = float_min(s->min, val);
++			s->max = float_max(s->max, val);
++		}
++	}
++
++	s->avg = sum / (float)n;
++}
++
++static void nap_extract_features(struct cpuidle_driver *drv,
++				 struct cpuidle_device *dev,
++				 float out[NAP_INPUT_SIZE],
++				 s64 latency_req)
++{
++	struct nap_cpu_data *d = this_cpu_ptr(&nap_data);
++	struct logring_stats lr;
++	ktime_t sleep_length, delta_tick;
++	u64 busy_ns;
++	float log_inputs[4] __aligned(16);
++	float log_results[4] __aligned(16);
++
++	sleep_length = tick_nohz_get_sleep_length(&delta_tick);
++	busy_ns = local_clock() - d->prev_idle_exit;
++
++	/*
++	 * SSE log2 batch: 4 values in one fast_log2f_sse call.
++	 *   [0] sleep_length   → out[0]
++	 *   [1] last_residency → out[1], also stored to log_history
++	 *   [2] busy_ns        → out[6]
++	 *   [3] |pred_error_us| + 1 → out[5] (sign restored after)
++	 */
++	{
++		float err_f = (float)(d->last_prediction_error / 1000);
++		float abs_err = (err_f >= 0.0f) ? err_f : -err_f;
++
++		log_inputs[0] = float_max((float)ktime_to_ns(sleep_length), 1.0f);
++		log_inputs[1] = float_max((float)dev->last_residency_ns, 1.0f);
++		log_inputs[2] = float_max((float)busy_ns, 1.0f);
++		log_inputs[3] = abs_err + 1.0f;
++
++		{
++			v4sf log_in  = *(const v4sf *)log_inputs;
++			v4sf log_out = fast_log2f_sse(log_in);
++			*(v4sf *)log_results = log_out;
++		}
++
++		out[0] = log_results[0];
++		out[1] = log_results[1];
++		out[6] = log_results[2];
++
++		/* out[5]: sign-preserving log2(|err_us| + 1) */
++		{
++			union { float f; u32 i; } res = { .f = log_results[3] };
++			union { float f; u32 i; } sgn = { .f = err_f };
++
++			res.i |= sgn.i & 0x80000000u;
++			out[5] = res.f;
++		}
++	}
++
++	/* Update log_history ring buffer */
++	{
++		int prev = (d->hist_idx - 1 + NAP_HISTORY_SIZE) % NAP_HISTORY_SIZE;
++		d->log_history[prev] = log_results[1];
++	}
++
++	/* Compute log_history statistics: avg, min, max */
++	logring_compute(d, &lr);
++	out[2] = lr.avg;
++	out[3] = lr.min;
++	out[4] = lr.max;
++
++	/* out[7]: log2(latency_req) - log2(deepest_lat), 0 if unconstrained */
++	{
++		u64 deepest_lat = drv->states[drv->state_count - 1]
++				      .exit_latency_ns;
++		bool lat_valid = (latency_req < PM_QOS_LATENCY_ANY_NS &&
++				  deepest_lat > 0);
++
++		if (lat_valid)
++			out[7] = fast_log2f(float_max((float)latency_req, 1.0f))
++			       - fast_log2f(float_max((float)deepest_lat, 1.0f));
++		else
++			out[7] = 0.0f;
++	}
++
++	d->last_predicted_ns = ktime_to_ns(sleep_length);
++}
++
++/* ================================================================
++ * FPU entry point for nap_select
++ *
++ * Called within kernel_fpu_begin()/kernel_fpu_end().
++ * Returns: selected idle state index (>= 0; 0 when no deeper state
++ *          qualifies).  This path never returns a negative value.
++ * ================================================================ */
++
++int nap_fpu_select(struct cpuidle_driver *drv,
++		   struct cpuidle_device *dev,
++		   struct nap_cpu_data *d)
++{
++	s64 latency_req = cpuidle_governor_latency_req(dev->cpu);
++
++	/* Handle deferred weight reset (set by sysfs or nap_enable) */
++	if (unlikely(d->reset_pending)) {
++		nap_init_weights(&d->weights);
++		nap_init_log2_tres(d, drv);
++		memset(d->bin_count, 0, sizeof(d->bin_count));
++		d->have_sample = false;
++		d->stats.learn_count = 0;
++		d->needs_learn = false;
++		d->reset_pending = false;
++	}
++
++	/*
++	 * Per-idle feedback against the just-realized idle duration.
++	 *
++	 * Every idle: update the decayed floor histogram so it stays current.
++	 * Only every learn_interval (needs_learn): apply the ordinal-threshold
++	 * updates and the trunk/score-head backprop, using the previous pass's
++	 * stored score, hidden activations and features.  Under the shared-score
++	 * proportional-odds model the gradient w.r.t. the score is the scalar
++	 * g = sum_k (q_k - y_k), which drives the existing SIMD backprop unchanged.
++	 * The loss is symmetric -- any responsiveness bias lives in the decision
++	 * layer, not here.
++	 */
++	if (d->have_sample) {
++		float decay = (float)(NAP_FLOOR_WIN - 1) / (float)NAP_FLOOR_WIN;
++		int k, label_bin = 0;
++
++		if (d->needs_learn) {
++			float base_lr = (float)d->learning_rate_millths / 1000.0f;
++			float clamp_val = (float)d->max_grad_norm_millths / 1000.0f;
++			float s = d->nn_output;
++			float g = 0.0f;
++
++			for (k = 1; k < drv->state_count; k++) {
++				float th = d->active_w->thr_ord[k - 1];
++				float q = nap_sigmoidf(s - th);
++				float y = (d->learn_actual_ns >=
++					   drv->states[k].target_residency_ns)
++					  ? 1.0f : 0.0f;
++				float err = q - y;
++				float lo = d->log2_tres[k] - 6.0f;
++				float hi = d->log2_tres[k] + 6.0f;
++
++				g += err;
++				d->active_w->thr_ord[k - 1] =
++					fclampf(th + fclampf(base_lr * err,
++							     -clamp_val, clamp_val),
++						lo, hi);
++			}
++			d->learn_d_out = g;
++			d->learn_lr = base_lr;
++			d->stats.learn_count++;
++			nap_nn_learn(d);
++			d->needs_learn = false;
++		}
++
++		/* Floor histogram update, every idle */
++		for (k = 1; k < drv->state_count; k++)
++			if (d->learn_actual_ns >=
++			    drv->states[k].target_residency_ns)
++				label_bin = k;
++		for (k = 0; k < drv->state_count; k++)
++			d->bin_count[k] *= decay;
++		d->bin_count[label_bin] += 1.0f;
++
++		d->have_sample = false;
++	}
++
++	/*
++	 * Feature extraction + NN forward pass.
++	 * features_f32 is __aligned(32) in nap_cpu_data, satisfying the
++	 * SSE2 movaps / AVX2 vmovaps alignment requirements.
++	 */
++	nap_extract_features(drv, dev, d->features_f32, latency_req);
++
++	d->active_w = &d->weights;
++
++	nap_nn_forward(d->features_f32, &d->nn_output, d->hidden_out,
++		       d->active_w);
++
++	/*
++	 * Decision layer.
++	 *
++	 * For each boundary k the survival probability q_k is a Beta-Binomial
++	 * shrinkage of the NN survival sigmoid(s - thr_ord) (a prior worth
++	 * NAP_PRIOR_K pseudo-observations) toward the decayed histogram (data):
++	 * the NN drives cold start, the floor takes over as it fills.  A running
++	 * minimum enforces a monotone non-increasing survival curve, and the next
++	 * timer event caps the reachable depth (a deeper state cannot be earned
++	 * past it).  The confidence level is the single responsiveness dial: pick
++	 * the deepest feasible state whose survival still meets it.
++	 */
++	{
++		float conf = (float)d->conf_millths / 1000.0f;
++		float s = d->nn_output;
++		float sleep_log2 = d->features_f32[0];
++		float suffix[CPUIDLE_STATE_MAX];
++		float total = 0.0f;
++		float qmin = 1.0f;
++		int k, m = 0, idx = 0;
++
++		for (k = 0; k < drv->state_count; k++)
++			total += d->bin_count[k];
++
++		suffix[drv->state_count - 1] =
++			d->bin_count[drv->state_count - 1];
++		for (k = drv->state_count - 2; k >= 0; k--)
++			suffix[k] = suffix[k + 1] + d->bin_count[k];
++
++		for (k = 1; k < drv->state_count; k++) {
++			float q_nn = nap_sigmoidf(s - d->active_w->thr_ord[k - 1]);
++			float q = ((float)NAP_PRIOR_K * q_nn + suffix[k]) /
++				  ((float)NAP_PRIOR_K + total);
++
++			if (d->log2_tres[k] > sleep_log2)
++				q = 0.0f;	/* cannot idle past the next timer */
++			if (q < qmin)
++				qmin = q;
++			q = qmin;
++
++			if (q >= conf)
++				m = k;
++			else
++				break;
++		}
++
++		for (k = m; k >= 1; k--) {
++			if (dev->states_usage[k].disable)
++				continue;
++			if (drv->states[k].exit_latency_ns > latency_req)
++				continue;
++			idx = k;
++			break;
++		}
++		return idx;
++	}
++}
+diff --git a/drivers/cpuidle/governors/nap/nap_nn_avx2.c b/drivers/cpuidle/governors/nap/nap_nn_avx2.c
+new file mode 100644
+index 0000000000..a43091793c
+--- /dev/null
++++ b/drivers/cpuidle/governors/nap/nap_nn_avx2.c
+@@ -0,0 +1,135 @@
++// SPDX-License-Identifier: GPL-2.0
++/*
++ * nap_nn_avx2.c — AVX2+FMA forward pass and backpropagation for the nap MLP
++ *
++ * 8→8 trunk + scalar score s feeding the ordinal survival head.
++ * Uses 256-bit ymm registers: 8 hidden neurons = 1 ymm.
++ * FMA via vfmadd231ps for fused multiply-add.
++ *
++ * Must be called within kernel_fpu_begin/end.
++ * Compiled with: CFLAGS += -mavx2 -mfma
++ */
++
++#include "nap.h"
++
++/* Aligned load/store: GCC translates v8sf* dereference to vmovaps */
++static inline v8sf v8sf_load(const float *p)   { return *(const v8sf *)p; }
++static inline void v8sf_store(float *p, v8sf v) { *(v8sf *)p = v; }
++
++/* FMA: a*b+c — vfmadd231ps: dest = src1 * src2 + dest */
++static inline v8sf v8sf_fmadd(v8sf a, v8sf b, v8sf c)
++{
++	asm("vfmadd231ps %2, %1, %0" : "+x"(c) : "x"(a), "xm"(b));
++	return c;
++}
++
++/* ymm clamp: max(min(v, hi), lo) */
++static inline v8sf v8sf_clamp(v8sf v, v8sf lo, v8sf hi)
++{
++	return __builtin_ia32_maxps256(__builtin_ia32_minps256(v, hi), lo);
++}
++
++void nap_nn_forward_avx2(const float *input,
++			 float *output,
++			 float *hidden_save,
++			 const struct nap_weights *w)
++{
++	int j;
++
++	/* === Hidden layer: 8 outputs = 1×ymm, 2-way accumulator === */
++	v8sf acc0 = v8sf_load(&w->b_h1[0]);
++	v8sf acc1 = V8SF_ZERO;
++
++	for (j = 0; j < NAP_INPUT_SIZE; j += 2) {
++		v8sf x0 = V8SF_SET1(input[j]);
++		v8sf x1 = V8SF_SET1(input[j + 1]);
++
++		acc0 = v8sf_fmadd(v8sf_load(&w->w_h1[j][0]),     x0, acc0);
++		acc1 = v8sf_fmadd(v8sf_load(&w->w_h1[j + 1][0]), x1, acc1);
++	}
++
++	/* Merge accumulators + ReLU */
++	{
++		v8sf h = __builtin_ia32_maxps256(acc0 + acc1, V8SF_ZERO);
++
++		v8sf_store(hidden_save, h);
++
++		/* === Output layer: dot(hidden[8], w_out[8]) + b_out === */
++		{
++			v8sf p = v8sf_load(&w->w_out[0]) * h;
++
++			/* Horizontal reduce: 8 → 4 → scalar */
++			v4sf lo = __builtin_ia32_vextractf128_ps256(p, 0);
++			v4sf hi = __builtin_ia32_vextractf128_ps256(p, 1);
++			v4sf s4 = lo + hi;
++
++			*output = s4[0] + s4[1] + s4[2] + s4[3] + w->b_out;
++		}
++	}
++}
++
++/*
++ * Online learning (backpropagation) — AVX2+FMA
++ *
++ * Output: scalar d_out (pre-computed by caller)
++ * Hidden layer: 8 neurons = 1×ymm
++ */
++void nap_nn_learn_avx2(struct nap_cpu_data *d)
++{
++	int i;
++	float d_out_scalar = d->learn_d_out;
++	float *d_hid = d->learn_d_hid;
++	float lr = d->learn_lr;
++	float clamp_val = (float)d->max_grad_norm_millths / 1000.0f;
++	v8sf v_neg_lr = V8SF_SET1(-lr);
++	v8sf v_cl_hi  = V8SF_SET1(clamp_val);
++	v8sf v_cl_lo  = V8SF_SET1(-clamp_val);
++
++	/*
++	 * Hidden gradient: d_hid[j] = relu'(h[j]) * w_out[j] * d_out.
++	 * vcmpps + vandps: branchless SIMD mask (1×ymm = 8 neurons).
++	 */
++	v8sf dh;
++	{
++		v8sf vd = V8SF_SET1(d_out_scalar);
++		v8sf g = v8sf_load(&d->active_w->w_out[0]) * vd;
++		v8sf mask = __builtin_ia32_cmpps256(
++				v8sf_load(&d->hidden_out[0]), V8SF_ZERO, 14);
++
++		asm("vandps %2, %1, %0" : "=x"(dh) : "x"(g), "xm"(mask));
++		v8sf_store(d_hid, dh);
++	}
++
++	/* Output weight update: w_out[j] -= lr * clamp(h[j] * d_out) */
++	{
++		v8sf vd = V8SF_SET1(d_out_scalar);
++		v8sf *w = (v8sf *)&d->active_w->w_out[0];
++
++		*w = v8sf_fmadd(v_neg_lr,
++				v8sf_clamp(v8sf_load(&d->hidden_out[0]) * vd,
++					   v_cl_lo, v_cl_hi),
++				*w);
++	}
++
++	/* Output bias update (scalar) */
++	d->active_w->b_out -= lr * fclampf(d_out_scalar, -clamp_val, clamp_val);
++
++	/* Hidden weight update: w_h1[i][j] -= lr * clamp(feat[i] * d_hid[j]) */
++	for (i = 0; i < NAP_INPUT_SIZE; i++) {
++		v8sf vf = V8SF_SET1(d->features_f32[i]);
++		v8sf *w = (v8sf *)&d->active_w->w_h1[i][0];
++
++		*w = v8sf_fmadd(v_neg_lr,
++				v8sf_clamp(vf * dh, v_cl_lo, v_cl_hi),
++				*w);
++	}
++
++	/* Hidden bias update */
++	{
++		v8sf *b = (v8sf *)&d->active_w->b_h1[0];
++
++		*b = v8sf_fmadd(v_neg_lr,
++				v8sf_clamp(dh, v_cl_lo, v_cl_hi),
++				*b);
++	}
++}
+diff --git a/drivers/cpuidle/governors/nap/nap_nn_sse2.c b/drivers/cpuidle/governors/nap/nap_nn_sse2.c
+new file mode 100644
+index 0000000000..0f2a6f131f
+--- /dev/null
++++ b/drivers/cpuidle/governors/nap/nap_nn_sse2.c
+@@ -0,0 +1,136 @@
++// SPDX-License-Identifier: GPL-2.0
++/*
++ * nap_nn_sse2.c — SSE2 forward pass and backpropagation for the nap MLP
++ *
++ * 8→8 trunk + scalar score s feeding the ordinal survival head.
++ * Baseline implementation using SSE2, which is always available on x86_64.
++ * No FMA — uses separate mul + add (2 instructions per MAC).
++ *
++ * Must be called within kernel_fpu_begin/end.
++ * Compiled with: CFLAGS += -msse2
++ */
++
++#include "nap.h"
++
++/* Aligned load/store */
++static inline v4sf v4sf_load(const float *p)   { return *(const v4sf *)p; }
++static inline void v4sf_store(float *p, v4sf v) { *(v4sf *)p = v; }
++
++/* ReLU helper */
++static inline v4sf v4sf_max(v4sf a, v4sf b)
++{
++	return __builtin_ia32_maxps(a, b);
++}
++
++void nap_nn_forward_sse2(const float *input,
++			 float *output,
++			 float *hidden_save,
++			 const struct nap_weights *w)
++{
++	int j;
++
++	/* === Hidden layer: 8 outputs = 2×xmm === */
++	v4sf acc0 = v4sf_load(&w->b_h1[0]);
++	v4sf acc1 = v4sf_load(&w->b_h1[4]);
++
++	for (j = 0; j < NAP_INPUT_SIZE; j++) {
++		v4sf x = V4SF_SET1(input[j]);
++		acc0 += v4sf_load(&w->w_h1[j][0]) * x;
++		acc1 += v4sf_load(&w->w_h1[j][4]) * x;
++	}
++
++	/* ReLU */
++	{
++		v4sf zero = V4SF_SET1(0.0f);
++
++		acc0 = v4sf_max(acc0, zero);
++		acc1 = v4sf_max(acc1, zero);
++	}
++	v4sf_store(&hidden_save[0], acc0);
++	v4sf_store(&hidden_save[4], acc1);
++
++	/* === Output layer: dot(hidden[8], w_out[8]) + b_out → 1 scalar === */
++	{
++		v4sf p0 = v4sf_load(&w->w_out[0]) * acc0;
++		v4sf p1 = v4sf_load(&w->w_out[4]) * acc1;
++		v4sf sum = p0 + p1;
++
++		*output = sum[0] + sum[1] + sum[2] + sum[3] + w->b_out;
++	}
++}
++
++/*
++ * Online learning (backpropagation) — SSE2
++ *
++ * Output: scalar d_out (pre-computed by caller)
++ * Hidden layer: 8 neurons = 2×xmm
++ */
++void nap_nn_learn_sse2(struct nap_cpu_data *d)
++{
++	int i;
++	float d_out_scalar = d->learn_d_out;
++	float *d_hid = d->learn_d_hid;
++	float lr = d->learn_lr;
++	float clamp_val = (float)d->max_grad_norm_millths / 1000.0f;
++	v4sf v_lr    = V4SF_SET1(lr);
++	v4sf v_cl_hi = V4SF_SET1(clamp_val);
++	v4sf v_cl_lo = V4SF_SET1(-clamp_val);
++
++	/*
++	 * Hidden gradient: d_hid[j] = relu'(h[j]) * w_out[j] * d_out.
++	 * Must be computed before output weight update to use pre-update
++	 * w_out.
++	 */
++	{
++		v4sf vd = V4SF_SET1(d_out_scalar);
++		v4sf zero = V4SF_SET1(0.0f);
++		v4sf h, g;
++		v4si m;
++
++		h = v4sf_load(&d->hidden_out[0]);
++		g = v4sf_load(&d->active_w->w_out[0]) * vd;
++		m = (v4si)(h > zero);
++		v4sf_store(&d_hid[0], v4si_as_v4sf(v4sf_as_v4si(g) & m));
++
++		h = v4sf_load(&d->hidden_out[4]);
++		g = v4sf_load(&d->active_w->w_out[4]) * vd;
++		m = (v4si)(h > zero);
++		v4sf_store(&d_hid[4], v4si_as_v4sf(v4sf_as_v4si(g) & m));
++	}
++
++	/* Output weight update: w_out[j] -= lr * clamp(h[j] * d_out) */
++	{
++		v4sf vd = V4SF_SET1(d_out_scalar);
++		v4sf *w = (v4sf *)&d->active_w->w_out[0];
++
++		w[0] -= v_lr * v4sf_clamp(v4sf_load(&d->hidden_out[0]) * vd,
++					  v_cl_lo, v_cl_hi);
++		w[1] -= v_lr * v4sf_clamp(v4sf_load(&d->hidden_out[4]) * vd,
++					  v_cl_lo, v_cl_hi);
++	}
++
++	/* Output bias update: b_out -= lr * clamp(d_out) */
++	d->active_w->b_out -= lr * fclampf(d_out_scalar, -clamp_val, clamp_val);
++
++	/* Hidden weight update: w_h1[i][j] -= lr * clamp(feat[i] * d_hid[j]) */
++	{
++		v4sf dh0 = *(const v4sf *)&d_hid[0];
++		v4sf dh1 = *(const v4sf *)&d_hid[4];
++
++		for (i = 0; i < NAP_INPUT_SIZE; i++) {
++			v4sf vf = V4SF_SET1(d->features_f32[i]);
++			v4sf *w = (v4sf *)&d->active_w->w_h1[i][0];
++
++			w[0] -= v_lr * v4sf_clamp(vf * dh0, v_cl_lo, v_cl_hi);
++			w[1] -= v_lr * v4sf_clamp(vf * dh1, v_cl_lo, v_cl_hi);
++		}
++
++		/* Hidden bias update: b_h1[j] -= lr * clamp(d_hid[j]) */
++		{
++			v4sf *b = (v4sf *)&d->active_w->b_h1[0];
++
++			b[0] -= v_lr * v4sf_clamp(dh0, v_cl_lo, v_cl_hi);
++			b[1] -= v_lr * v4sf_clamp(dh1, v_cl_lo, v_cl_hi);
++		}
++	}
++}
+-- 
+2.34.1
+

From 56b394e8250f1fd3e284a6b05a74b902f82a5259 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jorge=20Luis=20Silv=C3=A9rio=20de=20Lima?=
 <jorgezarpon@msn.com>
Date: Sun, 7 Jun 2026 18:41:42 -0300
Subject: [PATCH 07/10] Update patch version from 0.4.0 to 0.5.0

---
 PKGBUILD | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/PKGBUILD b/PKGBUILD
index b879d9f..a6873b7 100755
--- a/PKGBUILD
+++ b/PKGBUILD
@@ -95,7 +95,7 @@ source=(
   "git+https://github.com/forkymcforkface/xpad-noone.git#commit=8e903676dd9514c07ce5e06e43c5f7d8cc51cb7d"
   "git+https://github.com/atar-axis/xpadneo.git#tag=v$_xpadneo_version"
    6.16-poc-selector-v2.6.1.patch 
-   6.16-nap-v0.4.0.patch
+   6.16-nap-v0.5.0.patch
 )
 sha256sums=('4011d16fef57b8f04cbcddc0937819f7fd32225f65d63698afbd5dc6629d0ff0'
             '37452b4d09e5e42134ae24a61f2f656790837c327268074cf79d7dab3558b972'
@@ -145,7 +145,7 @@ sha256sums=('4011d16fef57b8f04cbcddc0937819f7fd32225f65d63698afbd5dc6629d0ff0'
             '1055bbbd32985017f4501d375648873bd598db084177d302aeeade56b47920e1'
             '26b3a811d38471a42229fa037cb6d2bb5ff78f19f45a17c7f263339ee67769a7'
             '14dabfb0452a3a817e8d809fb28eb7565512e95386d789c627b62baf136e001f'
-            '99d87a5c9cf47f257df81fabbabdcb9df02ff93c0c9caabf1bbd40d2e50fed6e')
+            'f665d6ba6fc18579083bf8ec7ec741d43495f16f9dcbc482a5bd928b1778b2d3)
 
 export KBUILD_BUILD_HOST=archlinux
 export KBUILD_BUILD_USER=$pkgbase

From 52f50cd360db8bf0c1fb470332a0fb8f91e2aff1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jorge=20Luis=20Silv=C3=A9rio=20de=20Lima?=
 <jorgezarpon@msn.com>
Date: Sun, 7 Jun 2026 19:41:30 -0300
Subject: [PATCH 08/10] Fix formatting issue in PKGBUILD checksum line

---
 PKGBUILD | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/PKGBUILD b/PKGBUILD
index a6873b7..c9ecf20 100755
--- a/PKGBUILD
+++ b/PKGBUILD
@@ -145,7 +145,7 @@ sha256sums=('4011d16fef57b8f04cbcddc0937819f7fd32225f65d63698afbd5dc6629d0ff0'
             '1055bbbd32985017f4501d375648873bd598db084177d302aeeade56b47920e1'
             '26b3a811d38471a42229fa037cb6d2bb5ff78f19f45a17c7f263339ee67769a7'
             '14dabfb0452a3a817e8d809fb28eb7565512e95386d789c627b62baf136e001f'
-            'f665d6ba6fc18579083bf8ec7ec741d43495f16f9dcbc482a5bd928b1778b2d3)
+            'f665d6ba6fc18579083bf8ec7ec741d43495f16f9dcbc482a5bd928b1778b2d3')
 
 export KBUILD_BUILD_HOST=archlinux
 export KBUILD_BUILD_USER=$pkgbase

From b2c04955794b01350a591daaf602a34f0a621dcb Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jorge=20Luis=20Silv=C3=A9rio=20de=20Lima?=
 <jorgezarpon@msn.com>
Date: Sun, 7 Jun 2026 20:37:49 -0300
Subject: [PATCH 09/10] Update checksum in PKGBUILD

---
 PKGBUILD | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/PKGBUILD b/PKGBUILD
index c9ecf20..ea1e537 100755
--- a/PKGBUILD
+++ b/PKGBUILD
@@ -112,7 +112,7 @@ sha256sums=('4011d16fef57b8f04cbcddc0937819f7fd32225f65d63698afbd5dc6629d0ff0'
             'e58e21581a509d0617591311b1d9ab8669f46046f2949e42d6149b0bb11ead87'
             '4bcf61814a6daac8f72c46a425b9ce88c07f6bd95f6a0ac287d73dfd4d5da60b'
             'ff3bbe78d6f072d57f567878e870956242ee78ccddd258b1ec2e4729621138fe'
-            'df38dc7a2bd45ebacf34de8182e7df50f7ea871715b0ab4798f40485ba7fd2f0'
+            'ab6b17b1f9cc4b322f0050d2e8cede75e44e069854e9bdc22068356530d628e8'
             '11fe52062dedc9c2016fafc98899f4afb4cbd5327bd985c8d813dc72461f503a'
             '9df628fd530950e37d31da854cb314d536f33c83935adf5c47e71266a55f7004'
             '9e7b20068cdfe6a00b64d7488bdc47966fa130a07a3eae02fa57caef5d35d4ec'

From 71b5261442cba4bb65da458b55d3765c01adea1d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jorge=20Luis=20Silv=C3=A9rio=20de=20Lima?=
 <jorgezarpon@msn.com>
Date: Thu, 11 Jun 2026 05:48:01 -0300
Subject: [PATCH 10/10] Update PKGBUILD version and tag for linux-charcoal

---
 PKGBUILD | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/PKGBUILD b/PKGBUILD
index ea1e537..2d1baba 100755
--- a/PKGBUILD
+++ b/PKGBUILD
@@ -4,8 +4,8 @@
 
 pkgbase=linux-charcoal-616
 _nepbase=linux-neptune-616
-_tag=6.16.12-valve23
-_ver=3
+_tag=6.16.12-valve24
+_ver=1
 pkgver=${_tag//-/.}.cc$_ver
 pkgrel=1
 pkgdesc='Linux'
@@ -97,7 +97,7 @@ source=(
    6.16-poc-selector-v2.6.1.patch 
    6.16-nap-v0.5.0.patch
 )
-sha256sums=('4011d16fef57b8f04cbcddc0937819f7fd32225f65d63698afbd5dc6629d0ff0'
+sha256sums=('SKIP'
             '37452b4d09e5e42134ae24a61f2f656790837c327268074cf79d7dab3558b972'
             'd88eaf0f94bae470040e4882f334c05b1bb2ab0a99e4b7299aa0b2337810ab8d'
             'fd57213c524e24cd9c72e2fecd9b2005934b6099e209864e5a93eb03406fca21'