From a6d0396fac324599a52c41dd474d6ca127265190 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jorge=20Luis=20Silv=C3=A9rio=20de=20Lima?=
 <jorgezarpon@msn.com>
Date: Tue, 12 May 2026 16:55:10 -0300
Subject: [PATCH 1/5] Backport POC selector changes from 6.18.3 to 6.16

---
 6.16-poc-selector-v2.6.1.patch | 2575 ++++++++++++++++++++++++++++++++
 PKGBUILD                       |    4 +-
 2 files changed, 2578 insertions(+), 1 deletion(-)
 create mode 100644 6.16-poc-selector-v2.6.1.patch

diff --git a/6.16-poc-selector-v2.6.1.patch b/6.16-poc-selector-v2.6.1.patch
new file mode 100644
index 0000000..53617b5
--- /dev/null
+++ b/6.16-poc-selector-v2.6.1.patch
@@ -0,0 +1,2575 @@
+From 854c284516887b6bdf6a3a3f2507ce873151a4d5 Mon Sep 17 00:00:00 2001
+From: Masahito S <firelzrd@gmail.com>
+Date: Mon, 27 Apr 2026 11:07:36 +0900
+Subject: [PATCH] 6.18.3-poc-selector-v2.6.1
+
+---
+ include/linux/sched/topology.h |   50 +-
+ init/Kconfig                   |   13 +
+ kernel/sched/ext.c             |    7 +
+ kernel/sched/fair.c            |  164 ++-
+ kernel/sched/idle.c            |   10 +
+ kernel/sched/poc_selector.c    | 1788 ++++++++++++++++++++++++++++++++
+ kernel/sched/sched.h           |  110 ++
+ kernel/sched/topology.c        |  226 ++++
+ 8 files changed, 2330 insertions(+), 38 deletions(-)
+ create mode 100644 kernel/sched/poc_selector.c
+
+diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h
+index bbcfdf12aa..c10c871dea 100644
+--- a/include/linux/sched/topology.h
++++ b/include/linux/sched/topology.h
+@@ -66,8 +66,54 @@ struct sched_group;
+ struct sched_domain_shared {
+ 	atomic_t	ref;
+ 	atomic_t	nr_busy_cpus;
+-	int		has_idle_cores;
+-	int		nr_idle_scan;
++	int			has_idle_cores;
++	int			nr_idle_scan;
++#ifdef CONFIG_SCHED_POC_SELECTOR
++	/*
++	 * POC Selector: per-LLC idle CPU tracking
++	 */
++	u64		poc_llc_members;	/* bitmask of valid CPUs (relative to base) */
++	int		poc_cpu_base;		/* smallest CPU ID in this LLC */
++	u8		poc_affinity_shift;	/* bit shift for cpumask alignment */
++	bool	poc_fast_eligible;	/* true when LLC CPU count <= 64 */
++	bool	poc_cluster_valid;	/* true when cluster mask is usable */
++#ifdef CONFIG_SCHED_SMT
++	u8		poc_smt_shift;		/* bit distance between SMT siblings */
++	u64		poc_primary_mask;	/* bitmask of core representative CPUs */
++#endif
++
++	/*
++	 * Hot write path: idle state flag arrays (lock-free mode).
++	 * Each array = exactly 1 cache line (64B).
++	 * Writers: WRITE_ONCE (plain MOV, no LOCK prefix).
++	 * Readers: snapshot to stack, then multiply-and-shift aggregation.
++	 * Active only when sched_poc_lockless_bitmap=1.
++	 */
++	u8		poc_idle_cpus[64] ____cacheline_aligned;
++#ifdef CONFIG_SCHED_SMT
++	u8		poc_idle_cores[64] ____cacheline_aligned;
++#endif /* CONFIG_SCHED_SMT */
++
++	/*
++	 * Hot read/write path: idle state bitmaps (bitmap mode, default).
++	 * Readers: single atomic64_read (MOV on x86).
++	 * Writers: atomic64_or / atomic64_andnot (LOCK'd on x86).
++	 * Active only when sched_poc_lockless_bitmap=0.
++	 */
++	atomic64_t	poc_idle_cpus_mask ____cacheline_aligned;
++#ifdef CONFIG_SCHED_SMT
++	atomic64_t	poc_idle_cores_mask ____cacheline_aligned;
++#endif /* CONFIG_SCHED_SMT */
++
++	/*
++	 * Read-only lookup tables (written once at init).
++	 * Cacheline-aligned for exact prefetch targeting.
++	 */
++	u64		poc_cluster_mask[64] ____cacheline_aligned;
++#ifdef CONFIG_SCHED_SMT
++	u64		poc_smt_mask[64] ____cacheline_aligned;
++#endif /* CONFIG_SCHED_SMT */
++#endif /* CONFIG_SCHED_POC_SELECTOR */
+ };
+ 
+ struct sched_domain {
+diff --git a/init/Kconfig b/init/Kconfig
+index cab3ad28ca..991fe7f8a4 100644
+--- a/init/Kconfig
++++ b/init/Kconfig
+@@ -1435,6 +1435,19 @@ config SCHED_AUTOGROUP
+ 	  desktop applications.  Task group autogeneration is currently based
+ 	  upon task session.
+ 
++config SCHED_POC_SELECTOR
++	bool "Piece-Of-Cake Fast Idle CPU Selector"
++	depends on SMP
++	default y
++	help
++	  Idle CPU selector using cached bitmasks inspired by the scx_cake BPF
++	  scheduler. Reduces select_idle_cpu overhead by using bitmap scanning.
++
++	  This optimization does not affect scheduler fairness - it only
++	  speeds up the process of finding an idle CPU for task wakeup.
++
++	  If unsure, say Y.
++
+ config RELAY
+ 	bool "Kernel->user space relay support (formerly relayfs)"
+ 	select IRQ_WORK
+diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
+index b959a70471..ce89f9627f 100644
+--- a/kernel/sched/ext.c
++++ b/kernel/sched/ext.c
+@@ -4076,6 +4076,9 @@ static void scx_disable_workfn(struct kthread_work *work)
+ 	mutex_unlock(&scx_enable_mutex);
+ 
+ 	WARN_ON_ONCE(scx_set_enable_state(SCX_DISABLED) != SCX_DISABLING);
++#ifdef CONFIG_SCHED_POC_SELECTOR
++	poc_notify_scx(false);
++#endif
+ done:
+ 	scx_bypass(false);
+ }
+@@ -4814,6 +4817,10 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link)
+ 	if (!(ops->flags & SCX_OPS_SWITCH_PARTIAL))
+ 		static_branch_enable(&__scx_switched_all);
+ 
++#ifdef CONFIG_SCHED_POC_SELECTOR
++	poc_notify_scx(true);
++#endif
++
+ 	pr_info("sched_ext: BPF scheduler \"%s\" enabled%s\n",
+ 		sch->ops.name, scx_switched_all() ? "" : " (partial)");
+ 	kobject_uevent(&sch->kobj, KOBJ_ADD);
+diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
+index 967ca52fb2..1afd1838f5 100644
+--- a/kernel/sched/fair.c
++++ b/kernel/sched/fair.c
+@@ -49,6 +49,7 @@
+ #include <linux/ratelimit.h>
+ #include <linux/task_work.h>
+ #include <linux/rbtree_augmented.h>
++#include <linux/prefetch.h>
+ 
+ #include <asm/switch_to.h>
+ 
+@@ -1064,7 +1065,7 @@ static bool update_deadline(struct cfs_rq *cfs_rq, struct sched_entity *se)
+ 
+ #include "pelt.h"
+ 
+-static int select_idle_sibling(struct task_struct *p, int prev_cpu, int cpu);
++static int select_idle_sibling(struct task_struct *p, int prev_cpu, int cpu, int sync);
+ static unsigned long task_h_load(struct task_struct *p);
+ static unsigned long capacity_of(int cpu);
+ 
+@@ -7601,6 +7602,20 @@ void __update_idle_core(struct rq *rq)
+ 	rcu_read_unlock();
+ }
+ 
++/*
++ * Return true iff every CPU in cpu_smt_mask(@cpu) is available_idle_cpu().
++ */
++static inline bool is_idle_core(int cpu)
++{
++	int sibling;
++
++	for_each_cpu(sibling, cpu_smt_mask(cpu)) {
++		if (!available_idle_cpu(sibling))	/* one busy sibling rules the core out */
++			return false;
++	}
++	return true;
++}
++
+ /*
+  * Scan the entire LLC domain for idle cores; this dynamically switches off if
+  * there are no idle cores left in the system; tracked through
+@@ -7668,6 +7683,11 @@ static inline bool test_idle_cores(int cpu)
+ 	return false;
+ }
+ 
++static inline bool is_idle_core(int cpu)
++{
++	return (available_idle_cpu(cpu) || sched_idle_cpu(cpu));	/* sched_idle CPUs qualify too */
++}
++
+ static inline int select_idle_core(struct task_struct *p, int core, struct cpumask *cpus, int *idle_cpu)
+ {
+ 	return __select_idle_cpu(core, p);
+@@ -7817,16 +7837,38 @@ static inline bool asym_fits_cpu(unsigned long util,
+ 	return true;
+ }
+ 
++#ifdef CONFIG_SCHED_POC_SELECTOR
++#include "poc_selector.c"
++#endif
+ /*
+  * Try and locate an idle core/thread in the LLC cache domain.
+  */
+-static int select_idle_sibling(struct task_struct *p, int prev, int target)
++static int select_idle_sibling(struct task_struct *p, int prev, int target, int sync)
+ {
+ 	bool has_idle_core = false;
+ 	struct sched_domain *sd;
+ 	unsigned long task_util, util_min, util_max;
+ 	int i, recent_used_cpu, prev_aff = -1;
+ 
++	/* Check a recently used CPU as a potential idle candidate: */
++	recent_used_cpu = p->recent_used_cpu;
++	p->recent_used_cpu = prev;
++	if (recent_used_cpu != prev &&
++	    recent_used_cpu != target &&
++	    cpus_share_cache(recent_used_cpu, target) &&
++	    (available_idle_cpu(recent_used_cpu) || sched_idle_cpu(recent_used_cpu)) &&
++	    cpumask_test_cpu(recent_used_cpu, p->cpus_ptr)) {
++#ifdef CONFIG_SCHED_POC_SELECTOR
++		if (!static_branch_likely(&poc_selector_active) ||
++			static_branch_unlikely(&sched_poc_early_select))
++#endif
++		if ((unsigned int)recent_used_cpu < nr_cpumask_bits &&
++		    is_idle_core(recent_used_cpu))
++			return recent_used_cpu;
++	} else {
++		recent_used_cpu = -1;
++	}
++
+ 	/*
+ 	 * On asymmetric system, update task utilization because we will check
+ 	 * that the task fits with CPU's capacity.
+@@ -7843,23 +7885,13 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
+ 	 */
+ 	lockdep_assert_irqs_disabled();
+ 
+-	if ((available_idle_cpu(target) || sched_idle_cpu(target)) &&
++#ifdef CONFIG_SCHED_POC_SELECTOR
++	if (static_branch_likely(&poc_selector_active) &&
++	    static_branch_unlikely(&sched_poc_early_select) &&
++	    is_idle_core(target) &&
+ 	    asym_fits_cpu(task_util, util_min, util_max, target))
+ 		return target;
+-
+-	/*
+-	 * If the previous CPU is cache affine and idle, don't be stupid:
+-	 */
+-	if (prev != target && cpus_share_cache(prev, target) &&
+-	    (available_idle_cpu(prev) || sched_idle_cpu(prev)) &&
+-	    asym_fits_cpu(task_util, util_min, util_max, prev)) {
+-
+-		if (!static_branch_unlikely(&sched_cluster_active) ||
+-		    cpus_share_resources(prev, target))
+-			return prev;
+-
+-		prev_aff = prev;
+-	}
++#endif
+ 
+ 	/*
+ 	 * Allow a per-cpu kthread to stack with the wakee if the
+@@ -7877,24 +7909,6 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
+ 		return prev;
+ 	}
+ 
+-	/* Check a recently used CPU as a potential idle candidate: */
+-	recent_used_cpu = p->recent_used_cpu;
+-	p->recent_used_cpu = prev;
+-	if (recent_used_cpu != prev &&
+-	    recent_used_cpu != target &&
+-	    cpus_share_cache(recent_used_cpu, target) &&
+-	    (available_idle_cpu(recent_used_cpu) || sched_idle_cpu(recent_used_cpu)) &&
+-	    cpumask_test_cpu(recent_used_cpu, p->cpus_ptr) &&
+-	    asym_fits_cpu(task_util, util_min, util_max, recent_used_cpu)) {
+-
+-		if (!static_branch_unlikely(&sched_cluster_active) ||
+-		    cpus_share_resources(recent_used_cpu, target))
+-			return recent_used_cpu;
+-
+-	} else {
+-		recent_used_cpu = -1;
+-	}
+-
+ 	/*
+ 	 * For asymmetric CPU capacity systems, our domain of interest is
+ 	 * sd_asym_cpucapacity rather than sd_llc.
+@@ -7919,6 +7933,74 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
+ 	if (!sd)
+ 		return target;
+ 
++#ifdef CONFIG_SCHED_POC_SELECTOR
++	{
++		struct sched_domain_shared *sd_share =
++			rcu_dereference(per_cpu(sd_llc_shared, target));
++		if (static_branch_likely(&poc_selector_active)
++				&& !sched_asym_cpucap_active()
++				&& sd_share && likely(sd_share->poc_fast_eligible)) {
++			int poc_cpu = select_idle_cpu_poc(target, prev,
++					recent_used_cpu, sync,
++					sd_share, p->cpus_ptr);
++			if (poc_cpu >= 0) {
++				return poc_cpu;
++			}
++			/*
++			 * POC returns -2 when the SIS_UTIL overload gate fires
++			 * (smt_fallback=0, greedy_search=0). POC has already checked
++			 * prev's SMT sibling (Level 4) and decided broader
++			 * search is not worthwhile. CFS would reach the same
++			 * conclusion, so skip select_idle_smt/select_idle_cpu.
++			 *
++			 * POC returns -1 for Level 0 saturation (no idle CPUs
++			 * in bitmap), but CFS may still find sched_idle CPUs,
++			 * so we must NOT skip CFS in that case.
++			 */
++			if (poc_cpu == -2)
++				goto give_up;
++		} else {
++			/*
++			 * poc_selector_active is off — POC is either disabled
++			 * by sysctl or suppressed while scx is running.
++			 * If an scx scheduler called us, flip poc_selector_skip
++			 * and schedule a workqueue item to re-enable POC with
++			 * bitmap resync.
++			 */
++			poc_check_skip_fallback();
++		}
++	}
++	poc_count(POC_FALLBACK);
++#endif /* CONFIG_SCHED_POC_SELECTOR */
++
++	if ((unsigned int)recent_used_cpu < nr_cpumask_bits) {
++		if ((available_idle_cpu(recent_used_cpu) || sched_idle_cpu(recent_used_cpu))) {
++			if (is_idle_core(recent_used_cpu))
++				return recent_used_cpu;
++			/* idle CPU but not idle core → preserve for give_up */
++		} else {
++			recent_used_cpu = -1;  /* not idle → discard */
++		}
++	}
++
++	if (sync && is_idle_core(target) &&
++	    asym_fits_cpu(task_util, util_min, util_max, target))
++		return target;
++
++	/*
++	 * If the previous CPU is cache affine and idle, don't be stupid:
++	 */
++	if (prev != target && cpus_share_cache(prev, target) &&
++	    (available_idle_cpu(prev) || sched_idle_cpu(prev)) &&
++	    asym_fits_cpu(task_util, util_min, util_max, prev)) {
++
++		if (!static_branch_unlikely(&sched_cluster_active) ||
++		    cpus_share_resources(prev, target))
++			return prev;
++
++		prev_aff = prev;
++	}
++
+ 	if (sched_smt_active()) {
+ 		has_idle_core = test_idle_cores(target);
+ 
+@@ -7933,6 +8015,9 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
+ 	if ((unsigned)i < nr_cpumask_bits)
+ 		return i;
+ 
++#ifdef CONFIG_SCHED_POC_SELECTOR
++give_up:
++#endif
+ 	/*
+ 	 * For cluster machines which have lower sharing cache like L2 or
+ 	 * LLC Tag, we tend to find an idle CPU in the target's cluster
+@@ -7944,6 +8029,13 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
+ 	if ((unsigned int)recent_used_cpu < nr_cpumask_bits)
+ 		return recent_used_cpu;
+ 
++#ifdef CONFIG_SCHED_POC_SELECTOR
++	/* Last resort: avoid enqueuing behind RT/DL tasks on target */
++	if (static_branch_likely(&poc_selector_active) &&
++			rt_task(cpu_rq(target)->curr) &&
++			prev != target && !rt_task(cpu_rq(prev)->curr))
++		return prev;
++#endif
+ 	return target;
+ }
+ 
+@@ -8628,7 +8720,7 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int wake_flags)
+ 		new_cpu = sched_balance_find_dst_cpu(sd, p, cpu, prev_cpu, sd_flag);
+ 	} else if (wake_flags & WF_TTWU) { /* XXX always ? */
+ 		/* Fast path */
+-		new_cpu = select_idle_sibling(p, prev_cpu, new_cpu);
++		new_cpu = select_idle_sibling(p, prev_cpu, new_cpu, sync);
+ 	}
+ 	rcu_read_unlock();
+ 
+diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
+index c39b089d4f..3fffa1a43f 100644
+--- a/kernel/sched/idle.c
++++ b/kernel/sched/idle.c
+@@ -275,6 +275,11 @@ static void do_idle(void)
+ 	__current_set_polling();
+ 	tick_nohz_idle_enter();
+ 
++#ifdef CONFIG_SCHED_POC_SELECTOR
++	/* POC Selector: mark CPU as idle */
++	set_cpu_idle_state_poc(cpu, 1);
++#endif /* CONFIG_SCHED_POC_SELECTOR */
++
+ 	while (!need_resched()) {
+ 
+ 		/*
+@@ -332,6 +337,11 @@ static void do_idle(void)
+ 		arch_cpu_idle_exit();
+ 	}
+ 
++#ifdef CONFIG_SCHED_POC_SELECTOR
++	/* POC Selector: mark CPU as busy */
++	set_cpu_idle_state_poc(cpu, 0);
++#endif /* CONFIG_SCHED_POC_SELECTOR */
++
+ 	/*
+ 	 * Since we fell out of the loop above, we know TIF_NEED_RESCHED must
+ 	 * be set, propagate it into PREEMPT_NEED_RESCHED.
+diff --git a/kernel/sched/poc_selector.c b/kernel/sched/poc_selector.c
+new file mode 100644
+index 0000000000..5bbd927828
+--- /dev/null
++++ b/kernel/sched/poc_selector.c
+@@ -0,0 +1,1788 @@
++// SPDX-License-Identifier: GPL-2.0
++/*
++ * Piece-Of-Cake (POC) CPU Selector
++ *
++ * Fast idle CPU selector inspired by RitzDaCat's scx_cake scheduler
++ * "Piece of Cake" - making idle CPU search a piece of cake!
++ *
++ * Tracks idle state in per-LLC atomic64_t bitmaps with lock-free
++ * atomic64_read/or/andnot for O(1) idle CPU lookup.
++ * Supports up to 64 CPUs per LLC (single 64-bit word).
++ * Includes affinity-aware filtering via cpumask intersection.
++ *
++ * When the fast path is not eligible (LLC exceeds 64 CPUs),
++ * returns -1 to let CFS standard select_idle_cpu handle it.
++ *
++ * Copyright (C) 2026 Masahito Suzuki
++ *
++ * Acknowledgements:
++ * This work is heavily inspired by RitzDaCat's scx_cake scheduler.
++ *
++ * Special thanks to the algorithm inventors whose research enabled
++ * the O(1) techniques used in this implementation:
++ *
++ * - Prashant Pandey, Michael A. Bender, Rob Johnson
++ * ("A Fast x86 Implementation of Select")
++ *
++ * - Daniel Lemire
++ * ("Fast Random Integer Generation in an Interval")
++ */
++
++#ifdef CONFIG_SCHED_POC_SELECTOR
++
++/**************************************************************
++ * Version Information:
++ */
++
++#define SCHED_POC_SELECTOR_AUTHOR   "Masahito Suzuki"
++#define SCHED_POC_SELECTOR_PROGNAME "Piece-Of-Cake (POC) CPU Selector"
++
++#define SCHED_POC_SELECTOR_VERSION  "2.6.1"
++
++/**************************************************************
++ * Static keys:
++ */
++
++/*
++ * Runtime control: poc_selector_active (static key)
++ * Derived from: sched_poc_selector && !poc_selector_skip
++ *
++ * sched_poc_selector: user-visible sysctl (kernel.sched_poc_selector),
++ * plain bool, default true.
++ * poc_selector_skip:  set true while sched_ext is active to avoid
++ * idle-bitmap overhead in do_idle.
++ * poc_selector_active: the actual static key gating all POC hot paths.
++ * Enabled only when sched_poc_selector && !poc_selector_skip.
++ * On enable transition, poc_resync_idle_state() is called.
++ */
++DEFINE_STATIC_KEY_TRUE(poc_selector_active);
++static bool sched_poc_selector = true;
++static bool poc_selector_skip;
++
++/*
++ * SMT fallback control: sched_poc_smt_fallback
++ * (sysctl kernel.sched_poc_smt_fallback)
++ *
++ * When enabled, POC bails out to CFS when no idle
++ * cores exist (has_idle_cores == false).  CFS then handles
++ * SMT sibling selection via select_idle_smt(prev) and
++ * nr_idle_scan-limited select_idle_cpu().
++ *
++ * When disabled (default), POC handles SMT sibling selection
++ * itself, trying prev's SMT sibling for cache locality, then
++ * LLC-wide RR search for remaining idle CPUs.  Unless
++ * sched_poc_greedy_search is enabled, Level 5/6 is gated by
++ * nr_idle_scan (SIS_UTIL) and skipped above ~85% LLC utilization.
++ */
++DEFINE_STATIC_KEY_FALSE(sched_poc_smt_fallback);
++
++/*
++ * SMT consecutive layout: sched_poc_smt_consecutive
++ *
++ * When true (default), SMT siblings occupy consecutive LLC-relative
++ * positions (e.g., CPU 0,1 / 2,3 / ...).  The idle core mask is
++ * derived from the idle CPU mask via bit-parallel operations:
++ * core_mask = cpu_mask & (cpu_mask >> 1) & 0x5555555555555555ULL
++ *
++ * Disabled at boot if non-consecutive 2-way SMT or >2-way SMT
++ * is detected on any LLC.
++ */
++DEFINE_STATIC_KEY_TRUE(sched_poc_smt_consecutive);
++
++/*
++ * SMT uniform 2-way layout: sched_poc_smt_uniform
++ *
++ * When true (default), all cores in every LLC have uniform 2-way SMT
++ * with a constant stride between siblings.  The idle core mask is
++ * derived at read time via:
++ * core_mask = cpu_mask & (cpu_mask >> poc_smt_shift) & poc_primary_mask
++ *
++ * This covers both consecutive (stride=1) and stride-N (e.g., Intel
++ * Xeon) layouts without write-path overhead.
++ *
++ * When false (>2-way SMT or non-uniform topology), falls back to
++ * write-time maintenance of poc_idle_cores_mask atomic64_t.
++ *
++ * Disabled at boot if any LLC contains non-2-way or non-uniform SMT.
++ */
++DEFINE_STATIC_KEY_TRUE(sched_poc_smt_uniform);
++
++/*
++ * Target CPU sticky: sched_poc_target_sticky
++ * (sysctl kernel.sched_poc_target_sticky)
++ *
++ * When enabled, if the target CPU is idle in the bitmap, return it
++ * immediately — regardless of whether its core is fully idle.
++ * This provides L1 cache affinity: the waking task reuses the CPU
++ * it ran on last, keeping warm TLB/L1/L2 state.
++ *
++ * Checked after Level 0 (saturation) and before core_mask derivation.
++ * Default: disabled.
++ */
++DEFINE_STATIC_KEY_FALSE(sched_poc_target_sticky);
++
++/*
++ * Early select: sched_poc_early_select
++ * (sysctl kernel.sched_poc_early_select)
++ *
++ * When enabled, select_idle_sibling performs idle-core checks
++ * for recent_used_cpu and target BEFORE entering POC search:
++ * - recent_used_cpu with fully idle core → return immediately
++ * (matches upstream CFS Gate 4 behavior)
++ * - target with fully idle core → return immediately
++ * (avoids POC overhead: RCU deref, bitmap read, mask ops)
++ *
++ * These two checks must be toggled together to preserve POC's
++ * internal priority order (Level 1r before 1t).  Enabling only
++ * one would let the pre-POC path return a lower-priority result
++ * before POC can evaluate the higher-priority candidate.
++ *
++ * Default: enabled.
++ */
++DEFINE_STATIC_KEY_TRUE(sched_poc_early_select);
++
++/*
++ * Greedy search: sched_poc_greedy_search
++ * (sysctl kernel.sched_poc_greedy_search)
++ *
++ * When enabled, POC always attempts Level 5/6 (LLC-wide SMT sibling
++ * search) regardless of utilization, ignoring the SIS_UTIL overload
++ * gate (nr_idle_scan == 0).  This may benefit latency-sensitive
++ * workloads that want to find any idle CPU at all costs.
++ *
++ * When disabled, POC skips Level 5/6 under overload,
++ * returning -2 to also skip CFS fallback search.
++ *
++ * Default: enabled.
++ */
++DEFINE_STATIC_KEY_TRUE(sched_poc_greedy_search);
++
++/*
++ * sched_poc_aligned: true when all LLCs have poc_cpu_base aligned to 64
++ *
++ * When true, cpumask-to-POC conversion is a simple word load (zero shift).
++ * When false (e.g., Threadripper CCDs at CPU 8, 16, ...), bit shifting
++ * is needed to align cpumask bits with POC's LLC-relative positions.
++ * Defaults to true; disabled at boot if any LLC has non-aligned base.
++ */
++DEFINE_STATIC_KEY_TRUE(sched_poc_aligned);
++
++/*
++ * Packed priority search: sched_poc_packed
++ *
++ * When true (default), per-LLC CPU count is ≤ 32, enabling packed
++ * priority search.  Cluster candidates (Level 2) and LLC-wide
++ * candidates (Level 3) are packed into a single 64-bit word:
++ *
++ * bits [31:0]:  cluster idle candidates (high priority)
++ * bits [63:32]: all LLC idle candidates (low priority)
++ *
++ * A single TZCNT resolves both levels simultaneously.
++ * ror32-based rotation distributes selections across idle CPUs.
++ *
++ * When false (LLC > 32 CPUs), falls back to separate cluster
++ * search + PTSELECT-based RR.
++ *
++ * Disabled at boot if any LLC has > 32 CPUs.
++ */
++DEFINE_STATIC_KEY_TRUE(sched_poc_packed);
++
++/*
++ * Improved RR strategy: sched_poc_rr_improved
++ * (sysctl kernel.sched_poc_rr_improved)
++ *
++ * When enabled (default), idle CPU selection in poc_select_rr,
++ * poc_cluster_search, and the packed priority search uses an
++ * improved RR strategy combining two techniques:
++ * 1. total size case-split (1/2/>=3): direct / interleave / full
++ * 2. golden-ratio scrambling (Lemire fastrange)
++ *
++ * When disabled, the current strategy is used unchanged:
++ * - poc_select_rr:   poc_rr_step[] table (perfect RR)
++ * - poc_cluster_search: ctz lowest-bit selection (no RR)
++ * - packed search:   ror32(counter & 31)
++ *
++ * The current path is preserved as the A/B-testing baseline;
++ * once the improved path is validated, the legacy code will
++ * be removed in a follow-up.
++ */
++DEFINE_STATIC_KEY_TRUE(sched_poc_rr_improved);
++
++/*
++ * Lockless bitmap mode: sched_poc_lockless_bitmap
++ * (sysctl kernel.sched_poc_lockless_bitmap)
++ *
++ * When enabled, idle state is tracked in u8[64] flag arrays.
++ * Writers use plain WRITE_ONCE (no LOCK prefix); readers snapshot
++ * the 64-byte cache line to the stack, then use multiply-and-shift
++ * aggregation to assemble a u64 bitmask.
++ *
++ * When disabled (default), idle state is tracked in atomic64_t bitmaps.
++ * Readers use a single atomic64_read (MOV on x86); writers use
++ * atomic64_or / atomic64_andnot (LOCK'd on x86).
++ *
++ * Only one representation is maintained at a time (single-write).
++ * Switching via sysctl resyncs the newly-active representation
++ * before readers can observe it.
++ *
++ * Default: disabled.
++ */
++DEFINE_STATIC_KEY_FALSE(sched_poc_lockless_bitmap);
++
++/**************************************************************
++ * Debug counters (sysctl kernel.sched_poc_count):
++ *
++ * Per-CPU counters for each selection level hit.
++ * Guarded by static key — zero overhead when disabled (default).
++ * Aggregated across all CPUs and exposed via sysfs.
++ */
++enum poc_level {
++	POC_LV1S = 0,	/* target CPU sticky (L1/TLB affinity) */
++	POC_LV1T,		/* target core idle */
++	POC_LV1P,		/* prev core idle */
++	POC_LV1R,		/* recent core idle */
++	POC_LV2,		/* idle core in L2 cluster */
++	POC_LV3,		/* idle core across LLC (RR) */
++	POC_LV4S,		/* sync + target CPU idle (no idle cores) */
++	POC_LV4P,		/* prev's SMT sibling (cache locality) */
++	POC_LV4R,		/* recent's SMT sibling (warm cache) */
++	POC_LV4T,		/* target's SMT sibling */
++	POC_LV5,		/* idle CPU in L2 cluster */
++	POC_LV6,		/* idle CPU across LLC (RR) */
++	POC_FALLBACK,	/* no POC pick (POC skipped or returned -1); CFS search runs */
++	POC_NR_LEVELS	/* number of counters; sizes poc_debug_cnt[] */
++};
++
++#define POC_SMT_LEVEL_OFFSET (POC_LV5 - POC_LV2)
++
++DEFINE_STATIC_KEY_FALSE(sched_poc_count_enabled);
++
++static DEFINE_PER_CPU(unsigned long[POC_NR_LEVELS], poc_debug_cnt);
++
++static __always_inline void poc_count(enum poc_level lv)
++{
++	if (static_branch_unlikely(&sched_poc_count_enabled))	/* key is defined false */
++		__this_cpu_inc(poc_debug_cnt[lv]);	/* bump this CPU's counter for @lv */
++}
++
++/**************************************************************
++ * Per-CPU round-robin counter and division-free mapping:
++ */
++
++/*
++ * POC_HASH_MULT / POC_SCRAMBLE — Golden-ratio scrambling
++ *
++ * Multiplying a 32-bit counter by ⌊2^32 / φ⌋ = 0x9E3779B9 scatters
++ * consecutive values across the 32-bit output space with good
++ * avalanche properties (Knuth's multiplicative hash, TAOCP Vol. 3).
++ * The scrambled value feeds POC_FASTRANGE for uniform [0, range)
++ * mapping in the improved RR path, or is used directly with a bit
++ * shift to derive an uncorrelated rotation amount in packed search.
++ */
++#define POC_HASH_MULT 0x9E3779B9U  /* floor(2^32 / golden ratio) */
++#define POC_SCRAMBLE(counter) ((u32)(counter) * POC_HASH_MULT)
++
++/*
++ * Per-CPU round-robin counter for idle CPU selection.
++ * Each CPU starts at a different offset to reduce cross-CPU
++ * collision probability.  Combined with poc_rr_step[] and
++ * POC_FIXED_MOD16, consecutive calls on the same CPU produce
++ * perfect round-robin: each call picks a different idle CPU
++ * until all candidates have been visited.
++ */
++static DEFINE_PER_CPU(u32, poc_rr_counter);
++
++/*
++ * Division-free modulo via 16-bit fixed-point reciprocal multiplication
++ *
++ * The multiply-and-shift technique is inspired by:
++ * D. Lemire, "Fast Random Integer Generation in an Interval",
++ * ACM Trans. Model. Comput. Simul. 29, 1, Article 3, 2019.
++ *
++ * Combined with poc_rr_step[], this replaces modulo with two
++ * multiplications and a shift:
++ * phase = (u16)(counter * poc_rr_step[total - 1])
++ * pick  = POC_FIXED_MOD16(phase, total)
++ *
++ * Proof that pick == counter % total (for total ≤ 64):
++ * Let S = ceil(2^16 / N).  For k in [0, N):
++ * k*S*N / 2^16 ∈ [k, k + kN/2^16)
++ * Since kN < N² ≤ 64² = 4096 ≪ 2^16, floor(kN/2^16) = 0,
++ * so floor(k*S*N / 2^16) = k.  QED.
++ */
++#define POC_FIXED_MOD16(phase, range) ((u32)(((u32)(phase) * (u32)(range)) >> 16))
++
++/*
++ * POC_FASTRANGE — Map a 32-bit scrambled value to [0, range)
++ *
++ * Implements Lemire's fastrange technique:
++ * D. Lemire, "Fast Random Integer Generation in an Interval",
++ * ACM Trans. Model. Comput. Simul. 29, 1, Article 3, 2019.
++ *
++ * Computes (seed * range) >> 32, giving a uniform mapping of
++ * a 32-bit seed into [0, range) using only one 64-bit multiply
++ * and a shift.  Used with golden-ratio hashing for pseudo-random
++ * RR distribution in the improved RR path.
++ */
++#define POC_FASTRANGE(seed, range) ((u32)(((u64)(seed) * (u32)(range)) >> 32))
++
++/*
++ * RR step table: poc_rr_step[n-1] = ceil(2^16 / n) for n = 1..64
++ * (n = 1 is stored as 0: 2^16 does not fit in u16, and POC_FIXED_MOD16(x, 1) is 0).
++ * Indexed by (total - 1) where total = popcount(idle mask).
++ * total == 0 is unreachable (caller guarantees mask != 0).
++ * 64 entries × 2 bytes = 128 bytes = exactly 2 cache lines.
++ */
++static const u16 poc_rr_step[64] = {
++	     0, 0x8000, 0x5556, 0x4000,	0x3334, 0x2AAB, 0x2493, 0x2000,	/* 1.. 8 */
++	0x1C72, 0x199A, 0x1746, 0x1556,	0x13B2, 0x124A, 0x1112, 0x1000,	/* 9..16 */
++	0x0F10, 0x0E39, 0x0D7A, 0x0CCD,	0x0C31, 0x0BA3, 0x0B22, 0x0AAB,	/* 17..24 */
++	0x0A3E, 0x09D9, 0x097C, 0x0925,	0x08D4, 0x0889, 0x0843, 0x0800,	/* 25..32 */
++	0x07C2, 0x0788, 0x0751, 0x071D,	0x06EC, 0x06BD, 0x0691, 0x0667,	/* 33..40 */
++	0x063F, 0x0619, 0x05F5, 0x05D2,	0x05B1, 0x0591, 0x0573, 0x0556,	/* 41..48 */
++	0x053A, 0x051F, 0x0506, 0x04ED,	0x04D5, 0x04BE, 0x04A8, 0x0493,	/* 49..56 */
++	0x047E, 0x046A, 0x0457, 0x0445,	0x0433, 0x0422, 0x0411, 0x0400,	/* 57..64 */
++};
++
++/**************************************************************
++ * Bit manipulation primitives:
++ */
++
++/*
++ * POC_CTZ64 — Portable Count Trailing Zeros (64-bit)
++ *
++ * Three-tier architecture detection:
++ *
++ * Tier 1: Native hardware CTZ with well-defined zero semantics
++ * x86-64 + BMI1 (__BMI__): TZCNT — returns 64 for input 0
++ * ARM64:                   RBIT + CLZ
++ * RISC-V Zbb:              CTZ instruction
++ *
++ * Tier 2: x86-64 without BMI1 (Bulldozer, pre-Haswell, etc.)
++ * BSF is fast (~3 cyc) but UNDEFINED for input 0.
++ * On AMD Bulldozer: BSF(0) leaves dest register unchanged (stale value).
++ * On Intel pre-Haswell: BSF(0) is architecturally undefined.
++ * Wrap with explicit zero check to guarantee returning 64.
++ *
++ * Tier 3: De Bruijn fallback (BPF, unknown architectures)
++ * Software multiply + 64-entry table lookup, branchless O(1).
++ */
++
++/*
++ * POC_CTZ64 is defined in sched.h for use by load balancer functions.
++ * Here we only define POC_CTZ64_NAME for sysfs hardware info display.
++ */
++#if defined(__x86_64__) && defined(__BMI__)
++#define POC_CTZ64_NAME "HW (TZCNT)"
++#elif defined(__aarch64__)
++#define POC_CTZ64_NAME "HW (RBIT+CLZ)"
++#elif defined(__riscv) && defined(__riscv_zbb)
++#define POC_CTZ64_NAME "HW (ctz)"
++#elif defined(__x86_64__)
++#define POC_CTZ64_NAME "HW (BSF)"
++#else
++#define POC_CTZ64_NAME "SW (De Bruijn)"
++#endif
++
++/*
++ * POC_PTSELECT — Select position of the j-th set bit in a 64-bit word
++ *
++ * Based on the algorithm described in:
++ * P. Pandey, M. A. Bender, R. Johnson,
++ * "A Fast x86 Implementation of Select", arXiv:1706.00990, 2017.
++ *
++ * Returns the bit position (0-indexed) of the j-th set bit in v.
++ * Undefined behavior if j >= popcount(v).
++ *
++ * Tier 1 (x86-64 + BMI2, excluding AMD Zen 1/2 slow microcode PDEP):
++ * PDEP + TZCNT — 4 instructions total.
++ * PDEP deposits the j-th source bit at the j-th mask position.
++ *
++ * Tier 2 (fallback): Iterative bit-clear — O(j) iterations
++ * Clears the lowest set bit j times, then CTZ on remainder.
++ */
++
++#if defined(__x86_64__) && defined(__BMI2__) && \
++    !defined(__znver1) && !defined(__znver2)
++static __always_inline int poc_ptselect(u64 v, int j)
++{
++	u64 deposited;	/* receives @v reduced to its j-th (0-indexed) set bit */
++
++	asm("pdep %2, %1, %0" : "=r"(deposited) : "r"(1ULL << j), "rm"(v));
++	return POC_CTZ64(deposited);	/* bit position of that single remaining bit */
++}
++#define POC_PTSELECT(v, j) poc_ptselect(v, j)
++#define POC_PTSELECT_NAME "HW (PDEP)"
++
++/*
++ * Tier 2 (fallback): Iterative bit-clear — O(j) iterations.
++ * Clears the lowest set bit j times, then returns its position via CTZ.
++ */
++#else
++static __always_inline int poc_ptselect_sw(u64 v, int j)
++{
++	int k;
++
++	for (k = 0; k < j; k++)	/* strip the j lowest set bits, one per pass */
++		v &= v - 1;	/* clear lowest set bit */
++	return POC_CTZ64(v);	/* lowest survivor is the j-th (0-indexed) set bit */
++}
++#define POC_PTSELECT(v, j) poc_ptselect_sw(v, j)
++#define POC_PTSELECT_NAME "SW (loop)"
++
++#endif /* POC_PTSELECT */
++
++/**************************************************************
++ * Flag array to bitmask conversion (lock-free mode):
++ */
++
++/*
++ * POC_BYTE_EXTRACT / POC_BYTE_PACK - constants for multiply-and-shift trick.
++ *
++ * Isolates bit 0 of each byte in a u64 word, then packs the 8 bits
++ * into the most significant byte via multiply.
++ */
++#define POC_BYTE_EXTRACT 0x0101010101010101ULL
++#define POC_BYTE_PACK    0x0102040810204080ULL
++
++/*
++ * POC_BMP8 - Convert one 8-byte slice of the flag array to 8 packed bits.
++ *
++ * Tier 1 (x86-64 + BMI2, excluding AMD Zen 1/2 slow microcode PEXT):
++ * PEXT extracts bit 0 of each byte directly into 8 contiguous bits.
++ * Single instruction replaces AND + MUL + SHR.
++ *
++ * Tier 2 (fallback): Multiply-and-shift trick.
++ * Isolates bit 0 of each byte (AND), packs via MUL, shifts to position.
++ */
++#if defined(__x86_64__) && defined(__BMI2__) && \
++    !defined(__znver1) && !defined(__znver2)
++
++static __always_inline u64 poc_bmp8_pext(u64 word, int i)
++{
++	u64 extracted;
++
++	asm("pext %2, %1, %0" : "=r"(extracted) : "r"(word), "r"(POC_BYTE_EXTRACT));
++	return extracted << (i * 8);
++}
++#define POC_BMP8(w, i) poc_bmp8_pext((w)[i], i)
++
++#else
++
++#define POC_BMP8(w, i) \
++	((((w)[i] & POC_BYTE_EXTRACT) * POC_BYTE_PACK >> 56) << ((i) * 8))
++
++#endif /* POC_BMP8 */
++
++/*
++ * poc_flags_to_u64 - Convert u8[64] flag array to u64 bitmask
++ * @flags: pointer to 64-byte flag array (cacheline-aligned)
++ *
++ * Phase 1 (memcpy): snapshot the 64-byte cache line to the stack.
++ * This eliminates the window in which a concurrent MESI invalidation
++ * could cause a re-fetch mid-computation.  All 64 bytes land in one
++ * or two cache line transfers; subsequent computation is purely local.
++ *
++ * Phase 2: pack the stack-local copy into a u64 bitmask via
++ * multiply-and-shift (or PEXT on BMI2 x86).  Always processes all
++ * 8 chunks — the extra iterations for small LLCs are negligible
++ * on stack-local data and avoid the poc_chunks_bit* dispatch tree.
++ *
++ * Returns: u64 bitmask with bit N set iff bit 0 of flags[N] is set
++ */
++static __always_inline u64 poc_flags_to_u64(const u8 *flags)
++{
++	u64 w[8];
++
++	/* Phase 1: snapshot shared cache line to stack */
++	memcpy(w, flags, 64);
++
++	/* Phase 2: pack stack-local copy into bitmask */
++	return POC_BMP8(w, 0) | POC_BMP8(w, 1) | POC_BMP8(w, 2) | POC_BMP8(w, 3) |
++	       POC_BMP8(w, 4) | POC_BMP8(w, 5) | POC_BMP8(w, 6) | POC_BMP8(w, 7);
++}
++
++/**************************************************************
++ * Idle mask accessors:
++ */
++
++/*
++ * poc_idle_cpu_mask - Get idle CPU bitmask filtered by LLC and affinity
++ * @affinity: task's allowed CPU mask (poc-relative, from poc_cpumask_to_u64)
++ * @sd_share: per-LLC shared data
++ *
++ * Returns a snapshot of idle CPUs within this LLC, masked by
++ * llc_members (valid CPUs) and @affinity (task placement).
++ *
++ * bitmap mode (default): single atomic64_read (MOV on x86).
++ * flag array mode: stack-snapshot + multiply-and-shift aggregation.
++ */
++static __always_inline u64 poc_idle_cpu_mask(u64 affinity,
++	struct sched_domain_shared *sd_share)
++{
++	u64 cpus;
++
++	if (static_branch_unlikely(&sched_poc_lockless_bitmap))
++		cpus = poc_flags_to_u64(sd_share->poc_idle_cpus);
++	else
++		cpus = (u64)atomic64_read(&sd_share->poc_idle_cpus_mask);
++
++	return cpus & sd_share->poc_llc_members & affinity;
++}
++
++#ifdef CONFIG_SCHED_SMT
++/*
++ * poc_idle_core_mask - Get idle core bitmask
++ * @cpu_mask: snapshot of idle CPUs (already masked by llc_members & affinity)
++ * @sd_share: per-LLC shared data
++ *
++ * Returns a bitmask with bits set at core representative positions
++ * (lowest-numbered sibling) for cores where ALL SMT siblings are idle.
++ *
++ * Three-tier derivation:
++ *
++ * Tier 1 (consecutive 2-way SMT): 3 register ops with compile-time
++ * constants — AND, SHR 1, AND 0x5555...  No memory loads.
++ *
++ * Tier 2 (uniform stride-N 2-way SMT): 3 register ops with
++ * precomputed per-LLC shift and primary mask — AND, SHR N, AND.
++ * Two extra loads (poc_smt_shift, poc_primary_mask) from sd_share,
++ * but no write-path overhead.
++ *
++ * Tier 3 (exotic: >2-way SMT or non-uniform topology): reads the
++ * separately-maintained poc_idle_cores_mask atomic64_t.  Write path
++ * maintains this bitmap on every idle transition.
++ */
++static __always_inline u64 poc_idle_core_mask(u64 cpu_mask,
++	struct sched_domain_shared *sd_share)
++{
++	/* Tier 1: consecutive — constants only, zero loads */
++	if (static_branch_likely(&sched_poc_smt_consecutive))
++		return cpu_mask & (cpu_mask >> 1) & 0x5555555555555555ULL;
++
++	/* Tier 2: uniform stride-N — precomputed shift + mask */
++	if (static_branch_likely(&sched_poc_smt_uniform))
++		return cpu_mask & (cpu_mask >> sd_share->poc_smt_shift)
++				& sd_share->poc_primary_mask;
++
++	/* Tier 3: exotic — bitmap or flag array based on mode */
++	if (static_branch_unlikely(&sched_poc_lockless_bitmap))
++		return poc_flags_to_u64(sd_share->poc_idle_cores) & cpu_mask;
++
++	return (u64)atomic64_read(&sd_share->poc_idle_cores_mask) & cpu_mask;
++}
++#endif /* CONFIG_SCHED_SMT */
++
++/*
++ * __set_cpu_idle_state_poc - Update idle state in atomic64_t bitmap
++ * @cpu: CPU number
++ * @state: 0=busy, 1=idle
++ *
++ * Updates the atomic64_t cpus bitmap via atomic64_or/andnot (LOCK'd on x86).
++ *
++ * On uniform 2-way SMT (Tier 1 & 2: consecutive or stride-N), only
++ * the cpus state is updated; core idle state is derived at read time
++ * via bit-parallel operations.
++ *
++ * On exotic SMT (Tier 3: >2-way or non-uniform), also maintains the
++ * separate cores state (bitmap or flag array) for O(1) read-time lookup.
++ *
++ * Only one representation is maintained at a time (single-write),
++ * selected by sched_poc_lockless_bitmap.
++ *
++ * Caller (inline wrapper in sched.h) ensures poc_selector_active is on
++ * and sched_asym_cpucap_active() is false before calling here.
++ */
++void __set_cpu_idle_state_poc(int cpu, int state)
++{
++	struct rq *rq = cpu_rq(cpu);
++	if (!static_branch_unlikely(&sched_poc_lockless_bitmap) &&
++			!state && READ_ONCE(rq->poc_idle_committed))
++		return;
++
++	guard(rcu)();
++	struct sched_domain_shared *sd_share =
++		rcu_dereference(per_cpu(sd_llc_shared, cpu));
++	if (!sd_share || !sd_share->poc_fast_eligible)
++		return;
++
++	int bit = cpu - sd_share->poc_cpu_base;
++	u64 bit_mask = 1ULL << bit;
++
++	if (static_branch_unlikely(&sched_poc_lockless_bitmap)) {
++		WRITE_ONCE(sd_share->poc_idle_cpus[bit], state > 0 ? 1 : 0);
++	} else if (state > 0) {
++		/* Entering idle: clear any stale committed flag */
++		WRITE_ONCE(rq->poc_idle_committed, 0);
++		atomic64_or(bit_mask, &sd_share->poc_idle_cpus_mask);
++	} else {
++		/*
++		 * Exiting idle: if a waker already committed (cleared the
++		 * bitmap bit), skip the redundant atomic on the shared
++		 * cacheline.  The flag lives in rq's first cacheline —
++		 * same line the waker already dirtied via ttwu_pending.
++		 */
++		atomic64_andnot(bit_mask, &sd_share->poc_idle_cpus_mask);
++		WRITE_ONCE(rq->poc_idle_committed, 1);
++	}
++
++#ifdef CONFIG_SCHED_SMT
++	if (sched_smt_active()) {
++		/* Tier 1 & 2: read-time derivation, no write-path cost */
++		if (static_branch_likely(&sched_poc_smt_uniform))
++			return;
++		/*
++		 * Tier 3 (exotic SMT): maintain separate cores state.
++		 * Check whether all SMT siblings are idle.
++		 */
++		u64 smt = sd_share->poc_smt_mask[bit];
++		u64 core_bitmask = smt & (-smt); /* core representative */
++		int core_bit = __builtin_ctzll(core_bitmask);
++		bool core_idle;
++
++		if (static_branch_unlikely(&sched_poc_lockless_bitmap)) {
++			/*
++			 * Flag array mode: check siblings via WRITE_ONCE-stored
++			 * flags.  Our store to poc_idle_cpus[] must be ordered
++			 * before the sibling loads (store->load), which only a
++			 * full smp_mb() provides; smp_wmb() orders stores only
++			 * and even x86 TSO may reorder a store past a later load.
++			 */
++			smp_mb();
++			u64 tmp = smt;
++
++			core_idle = state > 0;
++			while (core_idle && tmp) {
++				int s = __builtin_ctzll(tmp);
++
++				if (!READ_ONCE(sd_share->poc_idle_cpus[s]))
++					core_idle = false;
++				tmp &= tmp - 1;
++			}
++			WRITE_ONCE(sd_share->poc_idle_cores[core_bit],
++				   core_idle ? 1 : 0);
++		} else {
++			/*
++			 * smp_mb__after_atomic() ensures our atomic store is
++			 * visible before we read sibling bits.  On x86 TSO this
++			 * is a compiler barrier (~0 cyc); on ARM64: dmb ish.
++			 */
++			smp_mb__after_atomic();
++			u64 cpus = (u64)atomic64_read(&sd_share->poc_idle_cpus_mask);
++			core_idle = (cpus & smt) == smt;
++			u64 cores = (u64)atomic64_read(&sd_share->poc_idle_cores_mask);
++
++			if (core_idle) {
++				if (!(cores & core_bitmask))
++					atomic64_or(core_bitmask,
++						    &sd_share->poc_idle_cores_mask);
++			} else {
++				if (cores & core_bitmask)
++					atomic64_andnot(core_bitmask,
++							&sd_share->poc_idle_cores_mask);
++			}
++		}
++	}
++#endif /* CONFIG_SCHED_SMT */
++}
++
++/**************************************************************
++ * Idle CPU selection helpers:
++ */
++
++/* Test whether a single CPU is idle in a POC bitmap snapshot.
++ * Assumes cpu_mask is in scope — works in any function with that variable. */
++#define POC_IDLE_CPU(bit)	(cpu_mask & (1ULL << (bit)))
++/* Scope-free validity checks — usable in any function. */
++#define POC_CPU_VALID(cpu)	((cpu) >= 0)
++#define POC_CPU_IN_LLC(bit)	((unsigned int)(bit) < 64)
++
++/*
++ * poc_select_rr_improved - Improved round-robin idle CPU selection
++ * @base: poc_cpu_base (smallest CPU ID in this LLC)
++ * @mask: idle bitmask (snapshot, caller guarantees non-zero)
++ * @counter: per-CPU round-robin counter value
++ *
++ * Improved RR with two techniques:
++ * 1. Case-split by total:
++ * total=1: direct ctz
++ * total=2: interleave by counter LSB (guarantees non-repeat),
++ * single CTZ via cmov-selected source mask
++ * total>=3: golden-ratio scramble + Lemire fastrange
++ * 2. Golden-ratio scrambling (counter * 0x9E3779B9) mapped via
++ * Lemire fastrange for pseudo-random uniform distribution.
++ *
++ * eager_commit (unconditional) already prevents burst wake-ups from
++ * re-selecting the same CPU by clearing the bitmap bit at selection
++ * time, so no previous-pick exclusion state is needed here.
++ *
++ * Returns: selected CPU number.
++ */
++static __always_inline int poc_select_rr_improved(
++	int base, u64 mask, unsigned int counter)
++{
++	int total = hweight64(mask);
++
++	if (total <= 2) {
++		/*
++		 * Pick the lower or upper set bit via counter LSB if total == 2.
++		 * Select the mask first (cmov), then one CTZ — halves the
++		 * cost on archs where CTZ64 is a SW fallback (De Bruijn).
++		 */
++		if ((total == 2) && (counter & 1))
++			mask &= mask - 1;
++
++		return base + POC_CTZ64(mask);
++	}
++
++	/* total >= 3: golden-ratio scramble + Lemire fastrange */
++	{
++		u32 scrambled = POC_SCRAMBLE(counter);
++		int pick = POC_FASTRANGE(scrambled, total);
++
++		return base + POC_PTSELECT(mask, pick);
++	}
++}
++
++/*
++ * poc_select_rr - Round-robin idle CPU selection from a single-word mask
++ * @base: poc_cpu_base (smallest CPU ID in this LLC)
++ * @mask: idle bitmask (snapshot)
++ * @counter: per-CPU round-robin counter value
++ *
++ * Division-free perfect round-robin via POC_FIXED_MOD16 + PTSELECT.
++ * Consecutive calls on the same CPU never repeat an idle CPU
++ * until all candidates have been visited.
++ * Caller must ensure at least one bit is set in mask.
++ * Returns: selected CPU number.
++ */
++static __always_inline int poc_select_rr(int base, u64 mask, unsigned int counter)
++{
++	if (static_branch_likely(&sched_poc_rr_improved))
++		return poc_select_rr_improved(base, mask, counter);
++
++	/* Current strategy: poc_rr_step[] table (perfect RR), unchanged */
++	{
++		int total = hweight64(mask);
++		u16 phase = (u16)(counter * (u32)poc_rr_step[total - 1]);
++		int pick  = POC_FIXED_MOD16(phase, total);
++
++		return POC_PTSELECT(mask, pick) + base;
++	}
++}
++
++/*
++ * poc_cluster_search - Search for an idle CPU within the target's L2 cluster
++ * @base: poc_cpu_base (smallest CPU ID in this LLC)
++ * @tgt_bit: target CPU's POC-relative bit position
++ * @sd_share: per-LLC shared data containing cluster geometry
++ * @mask: snapshot of idle bitmask (cores or cpus, caller decides)
++ *
++ * Uses pre-computed cluster mask for O(1) lookup via CTZ.
++ * Returns: idle CPU number if found within cluster, -1 otherwise.
++ */
++static __always_inline int poc_cluster_search(int base, int tgt_bit,
++	struct sched_domain_shared *sd_share, u64 mask)
++{
++	u64 cls_idle = mask & sd_share->poc_cluster_mask[tgt_bit];
++
++	if (!cls_idle)
++		return -1;
++
++	if (static_branch_likely(&sched_poc_rr_improved)) {
++		/* Improved path: inc counter here so LV3 fallback sees fresh value */
++		unsigned int counter = __this_cpu_inc_return(poc_rr_counter);
++		return poc_select_rr_improved(base, cls_idle, counter);
++	}
++
++	/* Current strategy: ctz lowest-bit (no RR), unchanged */
++	return base + POC_CTZ64(cls_idle);
++}
++
++#ifdef CONFIG_SCHED_SMT
++/*
++ * poc_smt_sibling_mask - Get SMT sibling bitmask for a given CPU
++ * @bit: POC-relative bit position
++ * @sd_share: per-LLC shared data
++ *
++ * Three-tier computation matching poc_idle_core_mask():
++ *
++ * Tier 1 (consecutive): 3ULL << (bit & ~1) — shift only, zero loads.
++ *
++ * Tier 2 (uniform stride-N): determine sibling via poc_smt_shift
++ * and poc_primary_mask.  Avoids poc_smt_mask[] array lookup.
++ *
++ * Tier 3 (exotic): loads from pre-computed poc_smt_mask[] table.
++ */
++static __always_inline u64 poc_smt_sibling_mask(int bit,
++	struct sched_domain_shared *sd_share)
++{
++	if (static_branch_likely(&sched_poc_smt_consecutive))
++		return 3ULL << (bit & ~1);
++
++	if (static_branch_likely(&sched_poc_smt_uniform)) {
++		u8 shift = sd_share->poc_smt_shift;
++		int sib = (sd_share->poc_primary_mask & (1ULL << bit))
++				? bit + shift : bit - shift;
++		return (1ULL << bit) | (1ULL << sib);
++	}
++
++	return sd_share->poc_smt_mask[bit];
++}
++
++/*
++ * poc_find_idle_smt_sibling - Find an idle CPU among target and its SMT siblings
++ * @base: poc_cpu_base (smallest CPU ID in this LLC)
++ * @tgt_bit: target CPU's POC-relative bit position
++ * @cpu_mask: snapshot of idle CPU bitmask
++ * @smt_mask: pre-computed SMT sibling mask for target (includes self)
++ *
++ * Searches target itself and its SMT siblings for an idle CPU.
++ * Target is checked first for cache locality.
++ * Returns: idle CPU number if found, -1 otherwise
++ */
++static __always_inline int poc_find_idle_smt_sibling(
++	int base, int tgt_bit, u64 cpu_mask, u64 smt_mask)
++{
++	/* Check target first for cache locality */
++	if (POC_IDLE_CPU(tgt_bit))
++		return base + tgt_bit;
++
++	u64 idle_sibs = cpu_mask & smt_mask;
++
++	if (idle_sibs)
++		return base + POC_CTZ64(idle_sibs);
++
++	return -1;
++}
++/*
++ * poc_try_idle_smt - Find an idle CPU among a CPU and its SMT siblings
++ * @base: poc_cpu_base (smallest CPU ID in this LLC)
++ * @cpu: the CPU to check (and its SMT siblings)
++ * @cpu_mask: snapshot of idle CPU bitmask
++ * @sd_share: per-LLC shared data
++ *
++ * Checks if the given CPU or any of its SMT siblings is idle.
++ * Caller is responsible for poc_count() and poc_commit_selection().
++ * Returns: idle CPU number if found, -1 otherwise
++ */
++static __always_inline int poc_try_idle_smt(int base, int cpu,
++	u64 cpu_mask, struct sched_domain_shared *sd_share)
++{
++	int bit = cpu - base;
++
++	if (sd_share->poc_llc_members & (1ULL << bit)) {
++		int smt_cpu = poc_find_idle_smt_sibling(base, bit,
++			cpu_mask, poc_smt_sibling_mask(bit, sd_share));
++		if (POC_CPU_VALID(smt_cpu))
++			return smt_cpu;
++	}
++	return -1;
++}
++
++#endif /* CONFIG_SCHED_SMT */
++
++/*
++ * poc_commit_selection - Atomically clear selected CPU from idle bitmap
++ * @cpu: the CPU number selected by POC
++ * @sd_share: per-LLC shared data
++ *
++ * Clears the selected CPU's idle bit (or flag byte in flag-array mode) at
++ * selection time to close the race window where multiple waker CPUs read
++ * the same stale snapshot and select the same idle CPU; skipped when the
++ * target rq's nr_running exceeds 2.  The do_idle() exit path performs an
++ * idempotent clear as a safety net for non-POC wakeups; in bitmap mode
++ * poc_idle_committed gates that path so the atomic fires at most once.
++ */
++static __always_inline void poc_commit_selection(int cpu,
++	struct sched_domain_shared *sd_share)
++{
++	if (cpu_rq(cpu)->nr_running <= 2) {
++		int bit = cpu - sd_share->poc_cpu_base;
++
++		if (static_branch_unlikely(&sched_poc_lockless_bitmap)) {
++			WRITE_ONCE(sd_share->poc_idle_cpus[bit], 0);
++			smp_wmb();
++		} else {
++			atomic64_andnot(1ULL << bit, &sd_share->poc_idle_cpus_mask);
++			smp_mb__after_atomic();
++			/* Mark committed so target skips redundant andnot on wakeup */
++			WRITE_ONCE(cpu_rq(cpu)->poc_idle_committed, 1);
++		}
++	}
++}
++
++/*
++ * POC_IDLE_CORE  - Test whether a CPU's core is fully idle.
++ * POC_IDLE_SMT   - Find an idle CPU among @cpu and its SMT siblings.
++ *
++ * POC_RETURN     - Record hit counter, clear bitmap, return selected CPU.
++ * POC_RETURN_IF  - Same, but only if @cpu >= 0 (used after POC_IDLE_SMT).
++ *
++ * These assume core_mask, base, sd_share are in scope
++ * (only used inside select_idle_cpu_poc).
++ */
++#define POC_IDLE_CORE(bit)	(core_mask & poc_smt_sibling_mask((bit), sd_share))
++#define POC_IDLE_SMT(cpu)	poc_try_idle_smt(base, (cpu), cpu_mask, sd_share)
++
++#define POC_RETURN(cpu, level) do { \
++	poc_count(level); \
++	poc_commit_selection(cpu, sd_share); \
++	return cpu; \
++} while (0)
++
++#define POC_RETURN_IF(cpu, level) do { \
++	if ((cpu) >= 0) \
++		POC_RETURN(cpu, level); \
++} while (0)
++
++/**************************************************************
++ * Fast path dispatcher:
++ */
++
++/*
++ * select_idle_cpu_poc - Fast idle CPU selector (bitmap or flag-array mode)
++ * @target: CPU chosen by wake_affine (Level 1 preferred CPU;
++ * search origin for L2/L3/L5/L6)
++ * @prev: task's previous CPU (Level 4 cache locality preference)
++ * @recent: task's recent_used_cpu (-1 if none; pre-filtered by caller)
++ * @sync: 1 if synchronous wakeup (Level 4s: waker yields CPU)
++ * @sd_share: per-LLC shared data (caller provides; never NULL)
++ * @allowed: task's cpumask (p->cpus_ptr) for affinity filtering
++ *
++ * Two operating modes (sysctl kernel.sched_poc_smt_fallback):
++ *
++ * smt_fallback=0 (default): POC handles all idle CPU
++ * selection itself, including SMT siblings.  Prioritizes
++ * prev's SMT sibling for cache locality.  Uses CFS's
++ * nr_idle_scan (SIS_UTIL) to gate Level 5/6 under overload.
++ *
++ * smt_fallback=1: Bails out to CFS when has_idle_cores is
++ * false.  CFS handles SMT sibling selection via
++ * select_idle_smt(prev) and nr_idle_scan-limited
++ * select_idle_cpu().
++ *
++ * Selection levels:
++ *
++ * Level 0:   Saturation check -- no idle CPUs → return -1
++ * (smt_fallback: also when has_idle_cores == false)
++ * Level 1r:  Recent's core is fully idle → return recent (!early_select)
++ * Level 1s:  Target CPU idle in bitmap → return target (L1/TLB affinity)
++ * Level 1t:  Target CPU's core is fully idle → return target (SMT: !early_select)
++ * Level 1p:  Prev's core is fully idle → return prev (prev != target)
++ * --- core_mask != 0: search idle-core bitmap ---
++ * Level 2:   Idle core in L2 cluster (CTZ)
++ * Level 3:   Idle core across LLC (RR PTSELECT)
++ * --- core_mask == 0: search idle-CPU bitmap ---
++ * Level 4s:  sync + target CPU idle (waker frees core)
++ * Level 4p:  Prev's SMT sibling (cache locality)
++ * Level 4t:  Target's SMT sibling
++ * Level 4r:  Recent's SMT sibling (warm cache, always)
++ * [SIS_UTIL gate: nr_idle_scan == 0 → return -2]
++ * Level 5:   Idle CPU in L2 cluster (CTZ)
++ * Level 6:   Idle CPU across LLC (RR PTSELECT)
++ *
++ * Non-SMT: Level 1r → 1t → 1p → Level 2 → Level 3 (core = CPU).
++ *
++ * Returns: idle CPU number if found, -1 if not found (CFS may retry),
++ * -2 if SIS_UTIL overload (caller should skip CFS)
++ */
++static __always_inline int select_idle_cpu_poc(int target, int prev,
++				int recent, int sync,
++				struct sched_domain_shared *sd_share,
++				const struct cpumask *allowed)
++{
++	int base = sd_share->poc_cpu_base;
++	int rct_bit = recent - base;
++	int tgt_bit = target - base;
++	int prv_bit = prev   - base;
++#ifdef CONFIG_SCHED_SMT
++	u64 core_mask __maybe_unused;
++#endif
++	u64 affinity;
++	u64 cpu_mask;
++	int level_offset = 0;
++
++#ifdef CONFIG_SCHED_SMT
++	/* SMT fallback: bail to CFS for SMT sibling selection */
++	if (sched_smt_active() &&
++			static_branch_unlikely(&sched_poc_smt_fallback) &&
++			!READ_ONCE(sd_share->has_idle_cores))
++		return -1;
++#endif
++
++	if (static_branch_unlikely(&sched_poc_lockless_bitmap))
++		prefetch(sd_share->poc_idle_cpus);
++	else
++		prefetch(&sd_share->poc_idle_cpus_mask);
++#ifdef CONFIG_SCHED_SMT
++	if (sched_smt_active()) {
++		if (!static_branch_likely(&sched_poc_smt_uniform)) {
++			if (static_branch_unlikely(&sched_poc_lockless_bitmap))
++				prefetch(sd_share->poc_idle_cores);
++			else
++				prefetch(&sd_share->poc_idle_cores_mask);
++			if (POC_CPU_VALID(recent))
++				prefetch(&sd_share->poc_smt_mask[rct_bit]);
++			prefetch(&sd_share->poc_smt_mask[tgt_bit]);
++			prefetch(&sd_share->poc_smt_mask[prv_bit]);
++		}
++	}
++#endif
++	if (static_branch_likely(&sched_cluster_active))
++		prefetch(&sd_share->poc_cluster_mask[tgt_bit]);
++
++	affinity = poc_cpumask_to_u64(allowed, sd_share);
++	cpu_mask = poc_idle_cpu_mask(affinity, sd_share);
++
++	/* Level 0: Saturation — no idle CPU */
++	if (!cpu_mask)
++		return -1;
++
++#ifdef CONFIG_SCHED_SMT
++	if (sched_smt_active()) {
++		core_mask = poc_idle_core_mask(cpu_mask, sd_share);
++
++		/* Level 1r: recent's core is idle (warm cache) */
++		if (!static_branch_likely(&sched_poc_early_select) &&
++				core_mask && POC_CPU_IN_LLC(rct_bit) && POC_IDLE_CORE(rct_bit))
++			POC_RETURN(recent, POC_LV1R);
++
++		/* Level 1s: target CPU sticky — L1/TLB affinity shortcut */
++		if (static_branch_unlikely(&sched_poc_target_sticky) && POC_IDLE_CPU(tgt_bit))
++			POC_RETURN(target, POC_LV1S);
++
++		if (core_mask) {
++			/*
++			 * Idle core path: T → P order.
++			 * Target first — wake_affine chose it for data sharing
++			 * and the full core is free.
++			 */
++
++			/* Level 1t: target CPU's core is idle → return it */
++			if (!static_branch_likely(&sched_poc_early_select) &&
++					POC_IDLE_CORE(tgt_bit))
++				POC_RETURN(target, POC_LV1T);
++
++			/* Level 1p: prev's core is idle (task's L1/L2 warm) */
++			if (prev != target && POC_CPU_IN_LLC(prv_bit) && POC_IDLE_CORE(prv_bit))
++				POC_RETURN(prev, POC_LV1P);
++
++			cpu_mask = core_mask;
++		} else {
++			int cpu;
++
++			/* Level 4s: sync wakeup + target CPU idle →
++			 * waker will sleep imminently, freeing the core */
++			if (sync && POC_IDLE_CPU(tgt_bit))
++				POC_RETURN(target, POC_LV4S);
++
++			/*
++			 * No-idle-core path: P → T → R order.
++			 * Target itself was already tried at Level 1s/4s;
++			 * prioritize task's own cache (prev, recent) over
++			 * waker locality (target's sibling).
++			 */
++
++			/* Level 4p: prev's SMT sibling (cache locality) */
++			if (prev != target && POC_CPU_IN_LLC(prv_bit)) {
++				cpu = POC_IDLE_SMT(prev);
++				POC_RETURN_IF(cpu, POC_LV4P);
++			}
++
++			/* Level 4t: target's SMT sibling */
++			cpu = POC_IDLE_SMT(target);
++			POC_RETURN_IF(cpu, POC_LV4T);
++
++			/* Level 4r: recent's SMT sibling (warm cache) */
++			if (POC_CPU_IN_LLC(rct_bit)) {
++				cpu = POC_IDLE_SMT(recent);
++				POC_RETURN_IF(cpu, POC_LV4R);
++			}
++
++			/* SIS_UTIL overload gate for Level 5/6 */
++			if (!static_branch_likely(&sched_poc_greedy_search) &&
++			    sched_feat(SIS_UTIL) && !READ_ONCE(sd_share->nr_idle_scan))
++				return -2;
++
++			level_offset = POC_SMT_LEVEL_OFFSET;
++		}
++	}
++	else
++#endif
++	{
++		/* Level 1r: recent CPU is idle (non-SMT) */
++		if (!static_branch_likely(&sched_poc_early_select) &&
++				POC_CPU_IN_LLC(rct_bit) && POC_IDLE_CPU(rct_bit))
++			POC_RETURN(recent, POC_LV1R);
++		/* Level 1t: target CPU is idle → return (non-SMT) */
++		if (POC_IDLE_CPU(tgt_bit))
++			POC_RETURN(target, POC_LV1T);
++		/* Level 1p: prev CPU is idle (non-SMT) */
++		if (prev != target && POC_CPU_IN_LLC(prv_bit) && POC_IDLE_CPU(prv_bit))
++			POC_RETURN(prev, POC_LV1P);
++	}
++
++	if (static_branch_likely(&sched_poc_packed)) {
++		/*
++		* Level 2+3 / 5+6: packed priority search (≤32 CPUs/LLC)
++		*
++		* Packs cluster candidates (high priority) into lower 32 bits
++		* and all LLC candidates (low priority) into upper 32 bits.
++		* A single TZCNT resolves the highest-priority idle CPU.
++		* Level discrimination: (raw >> 5) yields 0 (cluster) or 1 (LLC).
++		*
++		* rr_improved=ON: rotation amount via golden-ratio scramble.
++		* rr_improved=OFF: rotation amount is (counter & 31).
++		*/
++		unsigned int counter = __this_cpu_inc_return(poc_rr_counter);
++		int rot;
++		u32 cls = 0;
++		u32 all;
++		u64 packed;
++		int raw, bit;
++
++		if (static_branch_likely(&sched_poc_rr_improved))
++			rot = (int)(POC_SCRAMBLE(counter) >> 27);
++		else
++			rot = counter & 31;
++
++		if (static_branch_likely(&sched_cluster_active) &&
++				sd_share->poc_cluster_valid)
++			cls = ror32((u32)(cpu_mask &
++				sd_share->poc_cluster_mask[tgt_bit]), rot);
++
++		all = ror32((u32)cpu_mask, rot);
++		packed = (u64)cls | ((u64)all << 32);
++
++		raw = POC_CTZ64(packed);
++		bit = ((raw & 31) + rot) & 31;
++
++		POC_RETURN(base + bit, POC_LV2 + (raw >> 5) + level_offset);
++	} else {
++		/* Level 2/5: idle core/cpu in target's L2 cluster */
++		if (static_branch_likely(&sched_cluster_active)
++				&& sd_share->poc_cluster_valid) {
++			int cpu = poc_cluster_search(
++				base, tgt_bit, sd_share, cpu_mask);
++			if (POC_CPU_VALID(cpu))
++				POC_RETURN(cpu, POC_LV2 + level_offset);
++		}
++
++		/* Level 3/6: idle core/cpu across LLC via RR */
++		{
++			unsigned int counter = __this_cpu_inc_return(poc_rr_counter);
++			int rr_cpu = poc_select_rr(base, cpu_mask, counter);
++			POC_RETURN(rr_cpu, POC_LV3 + level_offset);
++		}
++	}
++}
++
++/**************************************************************
++ * Sysctl interface and initialization:
++ */
++
++#if defined(CONFIG_SYSCTL) || defined(CONFIG_SCHED_CLASS_EXT)
++/*
++ * poc_resync_idle_state - Resync POC idle bitmaps after re-enable
++ *
++ * When POC is re-enabled after a period of being disabled,
++ * the idle bitmaps may be stale.  Walk all online CPUs and push
++ * the current idle state into poc_idle_cpus_mask (and poc_idle_cores_mask
++ * on non-uniform, i.e. Tier 3, SMT only).
++ *
++ * Must be called AFTER static_branch_enable() so that concurrent
++ * idle transitions are also updating the flags.
++ * Caller must hold cpus_read_lock().
++ */
++static void poc_resync_idle_state(void)
++{
++	int cpu;
++
++	for_each_online_cpu(cpu) {
++		WRITE_ONCE(cpu_rq(cpu)->poc_idle_committed, 0);
++		__set_cpu_idle_state_poc(cpu, idle_cpu(cpu));
++	}
++}
++
++/*
++ * poc_reevaluate_active - Recompute poc_selector_active from inputs
++ *
++ * poc_selector_active = sched_poc_selector && !poc_selector_skip
++ *
++ * On transition to active: enable static key, then resync idle bitmaps.
++ * On transition to inactive: disable static key.
++ * Caller must hold cpus_read_lock().
++ */
++static void poc_reevaluate_active(void)
++{
++	bool want = sched_poc_selector && !poc_selector_skip;
++	bool now  = static_branch_likely(&poc_selector_active);
++
++	if (want == now)
++		return;
++
++	if (want) {
++		static_branch_enable_cpuslocked(&poc_selector_active);
++		poc_resync_idle_state();
++	} else {
++		static_branch_disable_cpuslocked(&poc_selector_active);
++	}
++}
++#endif /* CONFIG_SYSCTL || CONFIG_SCHED_CLASS_EXT */
++
++#ifdef CONFIG_SCHED_CLASS_EXT
++/*
++ * poc_notify_scx - Called by sched_ext on enable/disable transitions
++ * @scx_active: true when scx scheduler is being enabled
++ */
++void poc_notify_scx(bool scx_active)
++{
++	cpus_read_lock();
++	poc_selector_skip = scx_active;
++	poc_reevaluate_active();
++	cpus_read_unlock();
++}
++
++/*
++ * poc_skip_fallback_work - Workqueue item to re-enable POC after scx fallback.
++ *
++ * Scheduled by poc_check_skip_fallback() when an scx scheduler calls
++ * select_idle_sibling.  Runs poc_reevaluate_active() outside the hot path
++ * to avoid updating the static key and resyncing bitmaps inline.
++ */
++static void poc_skip_fallback_fn(struct work_struct *work);
++static DECLARE_WORK(poc_skip_fallback_work, poc_skip_fallback_fn);
++
++static void poc_skip_fallback_fn(struct work_struct *work)
++{
++	cpus_read_lock();
++	poc_reevaluate_active();
++	cpus_read_unlock();
++}
++
++/*
++ * poc_check_skip_fallback - Hot-path detection for scx calling select_idle_sibling
++ *
++ * While scx is active, poc_selector_skip=true suppresses idle bitmap updates
++ * in do_idle.  Some scx schedulers still call select_idle_sibling; when that
++ * happens, flip poc_selector_skip back to false and schedule a workqueue item
++ * to re-enable poc_selector_active and resync stale bitmaps.
++ *
++ * WRITE_ONCE(false) is idempotent across concurrent callers; schedule_work()
++ * silently drops duplicate requests when the item is already queued.
++ */
++void poc_check_skip_fallback(void)
++{
++	if (!sched_poc_selector || !READ_ONCE(poc_selector_skip))
++		return;
++	WRITE_ONCE(poc_selector_skip, false);
++	schedule_work(&poc_skip_fallback_work);
++}
++#endif
++
++#ifdef CONFIG_SYSCTL
++static int sched_poc_sysctl_handler(const struct ctl_table *table, int write,
++				    void *buffer, size_t *lenp, loff_t *ppos)
++{
++	unsigned int val = sched_poc_selector ? 1 : 0;
++	struct ctl_table tmp = {
++		.data    = &val,
++		.maxlen  = sizeof(val),
++		.extra1  = SYSCTL_ZERO,
++		.extra2  = SYSCTL_ONE,
++	};
++	int ret = proc_douintvec_minmax(&tmp, write, buffer, lenp, ppos);
++
++	if (!ret && write) {
++		cpus_read_lock();
++		sched_poc_selector = !!val;
++		poc_reevaluate_active();
++		cpus_read_unlock();
++	}
++	return ret;
++}
++
++static int sched_poc_smt_fallback_sysctl_handler(const struct ctl_table *table,
++					       int write, void *buffer,
++					       size_t *lenp, loff_t *ppos)
++{
++	unsigned int val = static_branch_unlikely(&sched_poc_smt_fallback) ? 1 : 0;
++	struct ctl_table tmp = {
++		.data    = &val,
++		.maxlen  = sizeof(val),
++		.extra1  = SYSCTL_ZERO,
++		.extra2  = SYSCTL_ONE,
++	};
++	int ret = proc_douintvec_minmax(&tmp, write, buffer, lenp, ppos);
++
++	if (!ret && write) {
++		if (val)
++			static_branch_enable(&sched_poc_smt_fallback);
++		else
++			static_branch_disable(&sched_poc_smt_fallback);
++	}
++	return ret;
++}
++
++static int sched_poc_rr_improved_sysctl_handler(const struct ctl_table *table,
++					     int write, void *buffer,
++					     size_t *lenp, loff_t *ppos)
++{
++	unsigned int val = static_branch_likely(&sched_poc_rr_improved) ? 1 : 0;
++	struct ctl_table tmp = {
++		.data    = &val,
++		.maxlen  = sizeof(val),
++		.extra1  = SYSCTL_ZERO,
++		.extra2  = SYSCTL_ONE,
++	};
++	int ret = proc_douintvec_minmax(&tmp, write, buffer, lenp, ppos);
++
++	if (!ret && write) {
++		if (val)
++			static_branch_enable(&sched_poc_rr_improved);
++		else
++			static_branch_disable(&sched_poc_rr_improved);
++	}
++	return ret;
++}
++
++static int sched_poc_target_sticky_sysctl_handler(const struct ctl_table *table,
++					       int write, void *buffer,
++					       size_t *lenp, loff_t *ppos)
++{
++	unsigned int val = static_branch_unlikely(&sched_poc_target_sticky) ? 1 : 0;
++	struct ctl_table tmp = {
++		.data    = &val,
++		.maxlen  = sizeof(val),
++		.extra1  = SYSCTL_ZERO,
++		.extra2  = SYSCTL_ONE,
++	};
++	int ret = proc_douintvec_minmax(&tmp, write, buffer, lenp, ppos);
++
++	if (!ret && write) {
++		if (val)
++			static_branch_enable(&sched_poc_target_sticky);
++		else
++			static_branch_disable(&sched_poc_target_sticky);
++	}
++	return ret;
++}
++
++static int sched_poc_early_select_handler(const struct ctl_table *table,
++					  int write, void *buffer,
++					  size_t *lenp, loff_t *ppos)
++{
++	unsigned int on = static_branch_likely(&sched_poc_early_select) ? 1 : 0;
++	struct ctl_table shadow = {	/* on-stack table: bounds input to 0..1 */
++		.data    = &on,
++		.maxlen  = sizeof(on),
++		.extra1  = SYSCTL_ZERO,
++		.extra2  = SYSCTL_ONE,
++	};
++	int err = proc_douintvec_minmax(&shadow, write, buffer, lenp, ppos);
++
++	if (err || !write)
++		return err;	/* reads and failed writes leave the key alone */
++	if (on)
++		static_branch_enable(&sched_poc_early_select);
++	else
++		static_branch_disable(&sched_poc_early_select);
++	return 0;
++}
++
++static int sched_poc_greedy_search_handler(const struct ctl_table *table,
++					       int write, void *buffer,
++					       size_t *lenp, loff_t *ppos)
++{
++	unsigned int on = static_branch_likely(&sched_poc_greedy_search) ? 1 : 0;
++	struct ctl_table shadow = {	/* on-stack table: bounds input to 0..1 */
++		.data    = &on,
++		.maxlen  = sizeof(on),
++		.extra1  = SYSCTL_ZERO,
++		.extra2  = SYSCTL_ONE,
++	};
++	int err = proc_douintvec_minmax(&shadow, write, buffer, lenp, ppos);
++
++	if (err || !write)
++		return err;	/* reads and failed writes leave the key alone */
++	if (on)
++		static_branch_enable(&sched_poc_greedy_search);
++	else
++		static_branch_disable(&sched_poc_greedy_search);
++	return 0;
++}
++
++static int sched_poc_count_sysctl_handler(const struct ctl_table *table,
++					  int write, void *buffer,
++					  size_t *lenp, loff_t *ppos)
++{
++	unsigned int on = static_branch_unlikely(&sched_poc_count_enabled) ? 1 : 0;
++	struct ctl_table shadow = {	/* on-stack table: bounds input to 0..1 */
++		.data    = &on,
++		.maxlen  = sizeof(on),
++		.extra1  = SYSCTL_ZERO,
++		.extra2  = SYSCTL_ONE,
++	};
++	int err = proc_douintvec_minmax(&shadow, write, buffer, lenp, ppos);
++
++	if (err || !write)
++		return err;	/* reads and failed writes leave the key alone */
++	if (on)
++		static_branch_enable(&sched_poc_count_enabled);
++	else
++		static_branch_disable(&sched_poc_count_enabled);
++	return 0;
++}
++
++static int sched_poc_lockless_bitmap_sysctl_handler(const struct ctl_table *table,
++						int write, void *buffer,
++						size_t *lenp, loff_t *ppos)
++{
++	unsigned int on = static_branch_unlikely(&sched_poc_lockless_bitmap) ? 1 : 0;
++	struct ctl_table shadow = {	/* on-stack table: bounds input to 0..1 */
++		.data    = &on,
++		.maxlen  = sizeof(on),
++		.extra1  = SYSCTL_ZERO,
++		.extra2  = SYSCTL_ONE,
++	};
++	int err = proc_douintvec_minmax(&shadow, write, buffer, lenp, ppos);
++
++	if (err || !write)
++		return err;
++	/*
++	 * Flip the key and resync the newly-active representation under one
++	 * hotplug read lock so readers see consistent state after the switch.
++	 */
++	cpus_read_lock();
++	if (on)
++		static_branch_enable_cpuslocked(&sched_poc_lockless_bitmap);
++	else
++		static_branch_disable_cpuslocked(&sched_poc_lockless_bitmap);
++	poc_resync_idle_state();
++	cpus_read_unlock();
++	return 0;
++}
++
++static struct ctl_table sched_poc_sysctls[] = {	/* .data unused: handlers read the keys */
++	{
++		.procname	= "sched_poc_selector",
++		.proc_handler	= sched_poc_sysctl_handler,
++		.mode		= 0644,
++		.maxlen		= sizeof(unsigned int),
++		.data		= NULL,
++	},
++	{
++		.procname	= "sched_poc_smt_fallback",
++		.proc_handler	= sched_poc_smt_fallback_sysctl_handler,
++		.mode		= 0644,
++		.maxlen		= sizeof(unsigned int),
++		.data		= NULL,
++	},
++	{
++		.procname	= "sched_poc_rr_improved",
++		.proc_handler	= sched_poc_rr_improved_sysctl_handler,
++		.mode		= 0644,
++		.maxlen		= sizeof(unsigned int),
++		.data		= NULL,
++	},
++	{
++		.procname	= "sched_poc_target_sticky",
++		.proc_handler	= sched_poc_target_sticky_sysctl_handler,
++		.mode		= 0644,
++		.maxlen		= sizeof(unsigned int),
++		.data		= NULL,
++	},
++	{
++		.procname	= "sched_poc_early_select",
++		.proc_handler	= sched_poc_early_select_handler,
++		.mode		= 0644,
++		.maxlen		= sizeof(unsigned int),
++		.data		= NULL,
++	},
++	{
++		.procname	= "sched_poc_greedy_search",
++		.proc_handler	= sched_poc_greedy_search_handler,
++		.mode		= 0644,
++		.maxlen		= sizeof(unsigned int),
++		.data		= NULL,
++	},
++	{
++		.procname	= "sched_poc_count",
++		.proc_handler	= sched_poc_count_sysctl_handler,
++		.mode		= 0644,
++		.maxlen		= sizeof(unsigned int),
++		.data		= NULL,
++	},
++	{
++		.procname	= "sched_poc_lockless_bitmap",
++		.proc_handler	= sched_poc_lockless_bitmap_sysctl_handler,
++		.mode		= 0644,
++		.maxlen		= sizeof(unsigned int),
++		.data		= NULL,
++	},
++};
++
++static int __init sched_poc_sysctl_init(void)
++{
++	/* Boot banner (program, version, author, CTZ/PTSelect names), then sysctls. */
++	printk(KERN_INFO "%s %s by %s [CTZ: %s, PTSelect: %s]\n",
++		SCHED_POC_SELECTOR_PROGNAME, SCHED_POC_SELECTOR_VERSION,
++		SCHED_POC_SELECTOR_AUTHOR, POC_CTZ64_NAME, POC_PTSELECT_NAME);
++	register_sysctl_init("kernel", sched_poc_sysctls);
++	return 0;
++}
++late_initcall(sched_poc_sysctl_init);
++
++#endif /* CONFIG_SYSCTL */
++
++/*
++ * Seed every possible CPU's round-robin counter with its own CPU number.
++ * Distinct starting values shift the FASTRANGE16 phase per CPU, which
++ * lowers the chance that several CPUs doing burst wakeups against the
++ * same idle bitmap snapshot collide on one target.
++ */
++static int __init sched_poc_rr_init(void)
++{
++	int i;
++
++	for_each_possible_cpu(i)
++		*per_cpu_ptr(&poc_rr_counter, i) = (u32)i;
++	return 0;
++}
++early_initcall(sched_poc_rr_init);
++
++/**************************************************************
++ * Status: sysfs interface (always available)
++ *
++ * Exported at /sys/kernel/poc_selector/status/ for runtime status queries.
++ * Reports whether POC is actually active (combining all conditions).
++ */
++
++#ifdef CONFIG_SYSFS
++
++/* Root kobject shared with debug section */
++static struct kobject *kobj_poc_root;
++
++static bool poc_check_all_llc_eligible(void)
++{
++	int cpu;
++
++	/* CPUs without an LLC-shared domain are ignored, not failed. */
++	guard(rcu)();
++	for_each_online_cpu(cpu) {
++		struct sched_domain_shared *sds =
++			rcu_dereference(per_cpu(sd_llc_shared, cpu));
++
++		if (sds && !sds->poc_fast_eligible)
++			return false;
++	}
++	return true;
++}
++
++static ssize_t active_show(struct kobject *kobj,
++			   struct kobj_attribute *attr, char *buf)
++{
++	/* Short-circuits in order: selector key, capacity symmetry, LLC check. */
++	int on = static_branch_likely(&poc_selector_active) &&
++		 !sched_asym_cpucap_active() && poc_check_all_llc_eligible();
++	return sysfs_emit(buf, "%d\n", on);
++}
++
++static ssize_t symmetric_cpucap_show(struct kobject *kobj,
++				     struct kobj_attribute *attr, char *buf)
++{
++	return sysfs_emit(buf, "%d\n", !sched_asym_cpucap_active()); /* 1 = symmetric */
++}
++
++static ssize_t all_llc_eligible_show(struct kobject *kobj,
++				     struct kobj_attribute *attr, char *buf)
++{
++	return sysfs_emit(buf, "%d\n", (int)poc_check_all_llc_eligible()); /* bool as 0/1 */
++}
++
++static ssize_t version_show(struct kobject *kobj,
++			    struct kobj_attribute *attr, char *buf)
++{
++	return sysfs_emit(buf, "%s\n", SCHED_POC_SELECTOR_VERSION);
++}
++
++static struct kobj_attribute poc_status_active_attr = __ATTR_RO(active);
++static struct kobj_attribute poc_status_asym_attr = __ATTR_RO(symmetric_cpucap);
++static struct kobj_attribute poc_status_eligible_attr = __ATTR_RO(all_llc_eligible);
++static struct kobj_attribute poc_status_version_attr = __ATTR_RO(version);
++
++static struct attribute *poc_status_attrs[] = {
++	&poc_status_active_attr.attr,
++	&poc_status_asym_attr.attr,
++	&poc_status_eligible_attr.attr,
++	&poc_status_version_attr.attr,
++	NULL,
++};
++
++static const struct attribute_group poc_status_group = {
++	.name = "status",
++	.attrs = poc_status_attrs,
++};
++
++/* --- hw_accel: DEFINE_POC_HW_ATTR(f, s) emits a 0444 attribute "f" that prints s --- */
++
++#define DEFINE_POC_HW_ATTR(fname, namestr) \
++static ssize_t poc_hw_##fname##_show(struct kobject *kobj, \
++		struct kobj_attribute *attr, char *buf) \
++{ \
++	return sysfs_emit(buf, "%s\n", namestr); \
++} \
++static struct kobj_attribute poc_hw_attr_##fname = { \
++	.attr = { .name = #fname, .mode = 0444 }, \
++	.show = poc_hw_##fname##_show, \
++}
++
++DEFINE_POC_HW_ATTR(ctz, POC_CTZ64_NAME);
++DEFINE_POC_HW_ATTR(ptselect, POC_PTSELECT_NAME);
++
++/* popcnt: run-time boot_cpu_has() check on x86-64; fixed build-time strings elsewhere */
++static ssize_t poc_hw_popcnt_show(struct kobject *kobj,
++				  struct kobj_attribute *attr, char *buf)
++{
++#if defined(__x86_64__)
++	return sysfs_emit(buf, "%s\n",
++		boot_cpu_has(X86_FEATURE_POPCNT) ? "HW (POPCNT)" : "SW");
++#elif defined(__aarch64__)
++	return sysfs_emit(buf, "HW (CNT)\n");
++#elif defined(__riscv) && defined(__riscv_zbb)
++	return sysfs_emit(buf, "HW (cpop)\n");
++#else
++	return sysfs_emit(buf, "SW\n");
++#endif
++}
++
++static struct kobj_attribute poc_hw_attr_popcnt = {
++	.attr = { .name = "popcnt", .mode = 0444 },
++	.show = poc_hw_popcnt_show,
++};
++
++static struct attribute *poc_hw_attrs[] = {
++	&poc_hw_attr_popcnt.attr,
++	&poc_hw_attr_ctz.attr,
++	&poc_hw_attr_ptselect.attr,
++	NULL,
++};
++
++static const struct attribute_group poc_hw_group = {
++	.name = "hw_accel",
++	.attrs = poc_hw_attrs,
++};
++
++/* --- count: per-level hit counters (sysctl kernel.sched_poc_count) --- */
++
++static unsigned long poc_sum_level(enum poc_level lvl)
++{
++	unsigned long total = 0;	/* plain per-CPU reads, no locking */
++	int i;
++
++	for_each_possible_cpu(i)
++		total += *per_cpu_ptr(&poc_debug_cnt[lvl], i);
++	return total;
++}
++
++#define DEFINE_POC_COUNT_ATTR(fname, level)				\
++static ssize_t poc_count_##fname##_show(struct kobject *kobj,	\
++		struct kobj_attribute *attr, char *buf)			\
++{									\
++	return sysfs_emit(buf, "%lu\n", poc_sum_level(level));		\
++}									\
++static struct kobj_attribute poc_count_##fname##_attr = {		\
++	.attr = { .name = #fname, .mode = 0444 },			\
++	.show = poc_count_##fname##_show,				\
++}
++
++DEFINE_POC_COUNT_ATTR(l1s, POC_LV1S);
++DEFINE_POC_COUNT_ATTR(l1t, POC_LV1T);
++DEFINE_POC_COUNT_ATTR(l1p, POC_LV1P);
++DEFINE_POC_COUNT_ATTR(l1r, POC_LV1R);
++DEFINE_POC_COUNT_ATTR(l2, POC_LV2);
++DEFINE_POC_COUNT_ATTR(l3, POC_LV3);
++DEFINE_POC_COUNT_ATTR(l4s, POC_LV4S);
++DEFINE_POC_COUNT_ATTR(l4p, POC_LV4P);
++DEFINE_POC_COUNT_ATTR(l4r, POC_LV4R);
++DEFINE_POC_COUNT_ATTR(l4t, POC_LV4T);
++DEFINE_POC_COUNT_ATTR(l5, POC_LV5);
++DEFINE_POC_COUNT_ATTR(l6, POC_LV6);
++DEFINE_POC_COUNT_ATTR(fallback, POC_FALLBACK);
++
++static ssize_t poc_count_reset_store(struct kobject *kobj,
++		struct kobj_attribute *attr,
++		const char *buf, size_t count)
++{
++	int i;
++
++	/* Any write, whatever its content, zeroes every CPU's counter array. */
++	for_each_possible_cpu(i)
++		memset(per_cpu_ptr(poc_debug_cnt, i), 0, sizeof(poc_debug_cnt));
++	return count;
++}
++
++static struct kobj_attribute poc_count_reset_attr = {
++	.attr = { .name = "reset", .mode = 0200 },
++	.store = poc_count_reset_store,
++};
++
++static struct attribute *poc_count_attrs[] = {
++	&poc_count_l1s_attr.attr,
++	&poc_count_l1t_attr.attr,
++	&poc_count_l1p_attr.attr,
++	&poc_count_l1r_attr.attr,
++	&poc_count_l2_attr.attr,
++	&poc_count_l3_attr.attr,
++	&poc_count_l4s_attr.attr,
++	&poc_count_l4p_attr.attr,
++	&poc_count_l4r_attr.attr,
++	&poc_count_l4t_attr.attr,
++	&poc_count_l5_attr.attr,
++	&poc_count_l6_attr.attr,
++	&poc_count_fallback_attr.attr,
++	&poc_count_reset_attr.attr,
++	NULL,
++};
++
++static const struct attribute_group poc_count_group = {
++	.name = "count",
++	.attrs = poc_count_attrs,
++};
++
++static int __init sched_poc_status_init(void)
++{
++	int err;
++
++	kobj_poc_root = kobject_create_and_add("poc_selector", kernel_kobj);
++	if (!kobj_poc_root)
++		return -ENOMEM;
++
++	err = sysfs_create_group(kobj_poc_root, &poc_status_group);
++	if (err)
++		goto put_root;
++
++	err = sysfs_create_group(kobj_poc_root, &poc_hw_group);
++	if (err)
++		goto drop_status;
++
++	err = sysfs_create_group(kobj_poc_root, &poc_count_group);
++	if (err)
++		goto drop_hw;
++
++	return 0;
++
++drop_hw:			/* labels name what they undo, in reverse order */
++	sysfs_remove_group(kobj_poc_root, &poc_hw_group);
++drop_status:
++	sysfs_remove_group(kobj_poc_root, &poc_status_group);
++put_root:
++	kobject_put(kobj_poc_root);
++	kobj_poc_root = NULL;
++	return err;
++}
++late_initcall(sched_poc_status_init);
++
++#endif /* CONFIG_SYSFS */
++#endif /* CONFIG_SCHED_POC_SELECTOR */
+diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
+index adfb6e3409..c5676f37a1 100644
+--- a/kernel/sched/sched.h
++++ b/kernel/sched/sched.h
+@@ -1136,2 +1136,5 @@
+ 	unsigned int		ttwu_pending;
++#ifdef CONFIG_SCHED_POC_SELECTOR
++	unsigned int		poc_idle_committed;
++#endif
+ 	u64			nr_switches;
+@@ -2197,6 +2200,112 @@ static inline struct task_group *task_group(struct task_struct *p)
+ 
+ #endif /* !CONFIG_CGROUP_SCHED */
+ 
++#ifdef CONFIG_SCHED_POC_SELECTOR
++extern struct static_key_true poc_selector_active;
++#ifdef CONFIG_SCHED_CLASS_EXT
++extern void poc_notify_scx(bool scx_active);
++extern void poc_check_skip_fallback(void);
++#else
++static inline void poc_check_skip_fallback(void) {}
++#endif
++extern struct static_key_true sched_poc_aligned;
++extern struct static_key_true sched_poc_smt_consecutive;
++extern struct static_key_true sched_poc_smt_uniform;
++extern struct static_key_false sched_poc_target_sticky;
++extern struct static_key_true sched_poc_packed;
++extern struct static_key_false sched_poc_lockless_bitmap;
++extern void __set_cpu_idle_state_poc(int cpu, int state);
++static __always_inline void set_cpu_idle_state_poc(int cpu, int state)
++{
++	if (!static_branch_likely(&poc_selector_active) || sched_asym_cpucap_active())
++		return;	/* selector off or asymmetric CPU capacity: skip the update */
++	__set_cpu_idle_state_poc(cpu, state);
++}
++
++/*
++ * POC_CTZ64 - Count trailing zeros (find first set bit)
++ *
++ * Architecture-optimized CTZ for POC idle CPU selection.
++ * Returns 64 for input 0 on every tier (__builtin_ctzll(0) is undefined).
++ */
++#if defined(__x86_64__) && defined(__BMI__)
++/* Tier 1: x86-64 with BMI1 - explicit zero check; compiler may fold it into TZCNT */
++#define POC_CTZ64(v) ({ u64 __poc_v = (v); __poc_v ? (int)__builtin_ctzll(__poc_v) : 64; })
++
++#elif defined(__aarch64__)
++/* Tier 1: ARM64 - explicit zero check; compiler may fold it into RBIT+CLZ */
++#define POC_CTZ64(v) ({ u64 __poc_v = (v); __poc_v ? (int)__builtin_ctzll(__poc_v) : 64; })
++
++#elif defined(__riscv) && defined(__riscv_zbb)
++/* Tier 1: RISC-V with Zbb - explicit zero check; compiler may fold it into CTZ */
++#define POC_CTZ64(v) ({ u64 __poc_v = (v); __poc_v ? (int)__builtin_ctzll(__poc_v) : 64; })
++
++#elif defined(__x86_64__)
++/* Tier 2: x86-64 without BMI1 - BSF needs zero check */
++static __always_inline int poc_ctz64_bsf(u64 v)
++{
++	if (unlikely(!v))
++		return 64;
++	return (int)__builtin_ctzll(v);
++}
++#define POC_CTZ64(v) poc_ctz64_bsf(v)
++
++#else
++/* Tier 3: De Bruijn fallback for other architectures */
++#define POC_DEBRUIJN_CTZ64_CONST 0x03F79D71B4CA8B09ULL
++static const u8 poc_debruijn_ctz64_tab[64] = {
++	 0,  1, 56,  2, 57, 49, 28,  3,
++	61, 58, 42, 50, 38, 29, 17,  4,
++	62, 47, 59, 36, 45, 43, 51, 22,
++	53, 39, 33, 30, 24, 18, 12,  5,
++	63, 55, 48, 27, 60, 41, 37, 16,
++	46, 35, 44, 21, 52, 32, 23, 11,
++	54, 26, 40, 15, 34, 20, 31, 10,
++	25, 14, 19,  9, 13,  8,  7,  6,
++};
++static __always_inline int poc_debruijn_ctz64(u64 v)
++{
++	u64 lsb;
++	u32 idx;
++
++	if (unlikely(!v))
++		return 64;
++	lsb = v & -v;	/* unsigned negation: no signed overflow when only bit 63 is set */
++	idx = (u32)((lsb * POC_DEBRUIJN_CTZ64_CONST) >> 58);
++	return (int)poc_debruijn_ctz64_tab[idx & 63];
++}
++#define POC_CTZ64(v) poc_debruijn_ctz64(v)
++
++#endif /* POC_CTZ64 */
++
++/*
++ * POC helper: convert cpumask region to POC-relative u64
++ *
++ * Returns the 64 bits of @mask starting at @sd_share->poc_cpu_base, so
++ * bit N of the result is CPU (base + N).  sched_poc_aligned is global:
++ * once it is off, LLCs with a 64-aligned base (shift == 0) also take the
++ * slow path, where "hi << 64" would be undefined; they return lo as-is.
++ * NOTE(review): for shift != 0 the word after base_word is always read;
++ * confirm every @mask passed here is long enough to contain it.
++ */
++static __always_inline u64 poc_cpumask_to_u64(const struct cpumask *mask,
++					      struct sched_domain_shared *sd_share)
++{
++	int base = sd_share->poc_cpu_base;
++	int base_word = base >> 6;
++
++	if (static_branch_likely(&sched_poc_aligned)) {
++		/* Fast path: no shift needed (base is 64-aligned) */
++		return cpumask_bits(mask)[base_word];
++	} else {
++		/* Slow path: splice two words; skip the high word when shift is 0 */
++		int shift = sd_share->poc_affinity_shift;
++		u64 lo = cpumask_bits(mask)[base_word];
++		u64 hi = shift ? cpumask_bits(mask)[base_word + 1] : 0;
++		return shift ? (lo >> shift) | (hi << (64 - shift)) : lo;
++	}
++}
++#endif /* CONFIG_SCHED_POC_SELECTOR */
++
+ static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
+ {
+ 	set_task_rq(p, cpu);
+@@ -3134,6 +3243,7 @@ extern void nohz_run_idle_balance(int cpu);
+ static inline void nohz_run_idle_balance(int cpu) { }
+ #endif
+ 
++
+ #include "stats.h"
+ 
+ #if defined(CONFIG_SCHED_CORE) && defined(CONFIG_SCHEDSTATS)
+diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
+index 444bdfdab7..510b96abcd 100644
+--- a/kernel/sched/topology.c
++++ b/kernel/sched/topology.c
+@@ -1717,6 +1717,232 @@ sd_init(struct sched_domain_topology_level *tl,
+ 		sd->shared = *per_cpu_ptr(sdd->sds, sd_id);
+ 		atomic_inc(&sd->shared->ref);
+ 		atomic_set(&sd->shared->nr_busy_cpus, sd_weight);
++
++#ifdef CONFIG_SCHED_POC_SELECTOR
++		int range = cpumask_last(sd_span) - sd_id + 1;
++
++		sd->shared->poc_cpu_base = sd_id;
++		sd->shared->poc_affinity_shift = sd_id & 63;
++
++		if (range <= 64) {
++			sd->shared->poc_fast_eligible = true;
++			/*
++			 * Disable aligned optimization if this LLC's base CPU
++			 * is not 64-aligned (e.g., Threadripper CCDs).
++			 */
++			if (sd_id & 63)
++				static_branch_disable_cpuslocked(&sched_poc_aligned);
++			/*
++			 * Disable packed priority search if this LLC
++			 * has more than 32 CPUs.
++			 */
++			if (range > 32)
++				static_branch_disable_cpuslocked(&sched_poc_packed);
++		} else {
++			sd->shared->poc_fast_eligible = false;
++			static_branch_disable_cpuslocked(&sched_poc_packed);
++		}
++		memset(sd->shared->poc_idle_cpus, 0,
++		       sizeof(sd->shared->poc_idle_cpus));
++		atomic64_set(&sd->shared->poc_idle_cpus_mask, 0);
++#ifdef CONFIG_SCHED_SMT
++		memset(sd->shared->poc_idle_cores, 0,
++		       sizeof(sd->shared->poc_idle_cores));
++		atomic64_set(&sd->shared->poc_idle_cores_mask, 0);
++#endif
++
++		/* Build LLC member bitmask for reader-side aggregation */
++		{
++			u64 members = 0;
++			int cpu_iter;
++
++			for_each_cpu(cpu_iter, sd_span) {
++				int bit = cpu_iter - sd_id;
++
++				if ((unsigned int)bit < 64)
++					members |= 1ULL << bit;
++			}
++			sd->shared->poc_llc_members = members;
++
++		}
++
++#ifdef CONFIG_SCHED_SMT
++		/*
++		 * Pre-compute SMT sibling masks for Level 4.
++		 * Each entry contains a bitmask of SMT siblings (including self)
++		 * for O(1) lookup via CTZ during wakeup.
++		 */
++		memset(sd->shared->poc_smt_mask, 0,
++		       sizeof(sd->shared->poc_smt_mask));
++		if (sd->shared->poc_fast_eligible) {
++			int cpu_iter;
++
++			for_each_cpu(cpu_iter, sd_span) {
++				int bit = cpu_iter - sd_id;
++				int sibling;
++				u64 mask = 0;
++
++				for_each_cpu(sibling, cpu_smt_mask(cpu_iter)) {
++					int sib_bit;
++
++					sib_bit = sibling - sd_id;
++					if (sib_bit >= 0 && sib_bit < 64)
++						mask |= 1ULL << sib_bit;
++				}
++				if (bit >= 0 && bit < 64)
++					sd->shared->poc_smt_mask[bit] = mask;
++			}
++		}
++
++		/*
++		 * Detect SMT topology and classify for poc_idle_core_mask():
++		 *
++		 * Tier 1 (consecutive): uniform 2-way SMT, siblings at
++		 * consecutive bit positions (e.g., 0,1 / 2,3).
++		 * Uses compile-time constants: shift=1, mask=0x5555...
++		 *
++		 * Tier 2 (uniform stride-N): uniform 2-way SMT with
++		 * constant stride between siblings (e.g., Intel Xeon
++		 * stride-8: CPU 0,8 / 1,9 / ...).  Uses precomputed
++		 * poc_smt_shift and poc_primary_mask for read-time
++		 * derivation without write-path overhead.
++		 *
++		 * Tier 3 (exotic): >2-way SMT, non-uniform topology,
++		 * or mixed SMT ways.  Falls back to write-time
++		 * maintenance of poc_idle_cores_mask atomic64_t.
++		 *
++		 * On pure non-SMT systems, the key values are irrelevant
++		 * because sched_smt_active() gates all SMT paths.
++		 */
++		sd->shared->poc_smt_shift = 1;
++		sd->shared->poc_primary_mask = 0;
++
++		if (sd->shared->poc_fast_eligible) {
++			int cpu_iter;
++			bool all_2way = true;
++			bool all_consecutive = true;
++			int uniform_stride = -1;
++			u64 primary_mask = 0;
++
++			for_each_cpu(cpu_iter, sd_span) {
++				int bit = cpu_iter - sd_id;
++
++				if (bit < 0 || bit >= 64)
++					continue;
++				u64 mask = sd->shared->poc_smt_mask[bit];
++				int ways = hweight64(mask);
++
++				if (ways != 2) {
++					all_2way = false;
++					all_consecutive = false;
++					break;
++				}
++
++				int lo = __ffs(mask);
++				int hi = __fls(mask);
++				int stride = hi - lo;
++
++				/* Track primary (lowest-numbered sibling) */
++				primary_mask |= 1ULL << lo;
++
++				/* Check consecutive: 0b11 at even position */
++				if ((lo & 1) || mask != (3ULL << lo))
++					all_consecutive = false;
++
++				/* Check uniform stride */
++				if (uniform_stride < 0)
++					uniform_stride = stride;
++				else if (stride != uniform_stride)
++					all_2way = false;
++			}
++
++			if (!all_consecutive)
++				static_branch_disable_cpuslocked(
++					&sched_poc_smt_consecutive);
++
++			if (all_2way && uniform_stride > 0) {
++				sd->shared->poc_smt_shift =
++					(u8)uniform_stride;
++				sd->shared->poc_primary_mask = primary_mask;
++			} else {
++				static_branch_disable_cpuslocked(
++					&sched_poc_smt_consecutive);
++				static_branch_disable_cpuslocked(
++					&sched_poc_smt_uniform);
++			}
++		}
++#endif /* CONFIG_SCHED_SMT */
++
++		memset(sd->shared->poc_cluster_mask, 0,
++		       sizeof(sd->shared->poc_cluster_mask));
++
++		sd->shared->poc_cluster_valid = false;
++
++#ifdef CONFIG_SCHED_CLUSTER
++		/*
++		 * Detect cluster (L2-sharing) topology for Level 2/5
++		 * cluster-local search in POC selector.
++		 *
++		 * Uses cpu_clustergroup_mask() which returns the L2
++		 * cache sharing mask on x86.  Validates that all
++		 * clusters are uniform (same size, power-of-2, and
++		 * naturally aligned in POC bit space).
++		 */
++		if (sd->shared->poc_fast_eligible) {
++			const struct cpumask *cls_mask =
++				cpu_clustergroup_mask(sd_id);
++			int cls_size = cpumask_weight(cls_mask);
++			int smt_size = cpumask_weight(cpu_smt_mask(sd_id));
++
++			if (cls_size > smt_size &&
++			    is_power_of_2(cls_size)) {
++				bool valid = true;
++				int cpu_iter;
++
++				for_each_cpu(cpu_iter, sd_span) {
++					const struct cpumask *m =
++						cpu_clustergroup_mask(cpu_iter);
++					int first = cpumask_first(m);
++					int rel = first - sd_id;
++
++					if (cpumask_weight(m) != cls_size ||
++					    (rel & (cls_size - 1)) != 0) {
++						valid = false;
++						break;
++					}
++				}
++				if (valid) {
++					sd->shared->poc_cluster_valid = true;
++
++					/*
++					 * Pre-compute cluster masks for O(1) lookup.
++					 * Each entry contains a bitmask of cluster
++					 * members (excluding self) for fast search.
++					 */
++					for_each_cpu(cpu_iter, sd_span) {
++						const struct cpumask *m =
++							cpu_clustergroup_mask(cpu_iter);
++						int bit = cpu_iter - sd_id;
++						int member;
++						u64 cmask = 0;
++
++						for_each_cpu(member, m) {
++							int mbit;
++
++							if (member == cpu_iter)
++								continue;
++							mbit = member - sd_id;
++							if (mbit >= 0 && mbit < 64)
++								cmask |= 1ULL << mbit;
++						}
++						if (bit >= 0 && bit < 64)
++							sd->shared->poc_cluster_mask[bit] = cmask;
++					}
++				}
++			}
++		}
++#endif /* CONFIG_SCHED_CLUSTER */
++#endif /* CONFIG_SCHED_POC_SELECTOR */
+ 	}
+ 
+ 	sd->private = sdd;
+-- 
+2.34.1
diff --git a/PKGBUILD b/PKGBUILD
index 1e8134f..943b69b 100755
--- a/PKGBUILD
+++ b/PKGBUILD
@@ -93,6 +93,7 @@ source=(
   "git+https://github.com/dlundqvist/xone.git#tag=v0.5.8"
   "git+https://github.com/forkymcforkface/xpad-noone.git#commit=8e903676dd9514c07ce5e06e43c5f7d8cc51cb7d"
   "git+https://github.com/atar-axis/xpadneo.git#tag=v$_xpadneo_version"
+   6.16-poc-selector-v2.6.1.patch 
 )
 sha256sums=('a69eea3b189ab64e65608140d6cd7c57823d1b39b361e876197eec1b4d1db957'
             '37452b4d09e5e42134ae24a61f2f656790837c327268074cf79d7dab3558b972'
@@ -139,7 +140,8 @@ sha256sums=('a69eea3b189ab64e65608140d6cd7c57823d1b39b361e876197eec1b4d1db957'
             '26aed703ca1a74aa33bd76e632a63810840f7549849435c2a8e893985ff6e2c9'
             '7ba61ccf2ddb508d6adb30906d3d57dc0ce1bc64a6d1a41796eb94a8584ea63b'
             '1055bbbd32985017f4501d375648873bd598db084177d302aeeade56b47920e1'
-            '26b3a811d38471a42229fa037cb6d2bb5ff78f19f45a17c7f263339ee67769a7')
+            '26b3a811d38471a42229fa037cb6d2bb5ff78f19f45a17c7f263339ee67769a7'
+            '14dabfb0452a3a817e8d809fb28eb7565512e95386d789c627b62baf136e001f')
 
 export KBUILD_BUILD_HOST=archlinux
 export KBUILD_BUILD_USER=$pkgbase

From 44a0969aaceeb3123e1d25e009efedee860071ae Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jorge=20Luis=20Silv=C3=A9rio=20de=20Lima?=
 <jorgezarpon@msn.com>
Date: Fri, 15 May 2026 13:11:15 -0300
Subject: [PATCH 2/5] Backport NAP cpuidle governor to Linux 6.16

---
 6.16-nap-v0.4.0.patch | 1928 +++++++++++++++++++++++++++++++++++++++++
 PKGBUILD              |    4 +-
 2 files changed, 1931 insertions(+), 1 deletion(-)
 create mode 100644 6.16-nap-v0.4.0.patch

diff --git a/6.16-nap-v0.4.0.patch b/6.16-nap-v0.4.0.patch
new file mode 100644
index 0000000..1fd94eb
--- /dev/null
+++ b/6.16-nap-v0.4.0.patch
@@ -0,0 +1,1928 @@
+From 1d2e8272f288fecce3fd7f762fb8c628ed04b7fe Mon Sep 17 00:00:00 2001
+From: Masahito S <firelzrd@gmail.com>
+Date: Wed, 15 Apr 2026 08:37:01 +0900
+Subject: [PATCH] 6.16 backport: 6.18.3-nap-v0.4.0
+
+Backport of NAP cpuidle governor to Linux 6.16.
+No functional changes except added RESIDENCY_THRESHOLD_NS definition.
+
+Signed-off-by: Masahito S <firelzrd@gmail.com>
+---
+ drivers/cpuidle/Kconfig                     |  17 +
+ drivers/cpuidle/governors/Makefile          |   1 +
+ drivers/cpuidle/governors/nap/Makefile      |  29 +
+ drivers/cpuidle/governors/nap/nap.c         | 671 ++++++++++++++++++++
+ drivers/cpuidle/governors/nap/nap.h         | 283 +++++++++
+ drivers/cpuidle/governors/nap/nap_fpu.c     | 572 +++++++++++++++++
+ drivers/cpuidle/governors/nap/nap_nn_avx2.c | 135 ++++
+ drivers/cpuidle/governors/nap/nap_nn_sse2.c | 136 ++++
+ 8 files changed, 1844 insertions(+)
+ create mode 100644 drivers/cpuidle/governors/nap/Makefile
+ create mode 100644 drivers/cpuidle/governors/nap/nap.c
+ create mode 100644 drivers/cpuidle/governors/nap/nap.h
+ create mode 100644 drivers/cpuidle/governors/nap/nap_fpu.c
+ create mode 100644 drivers/cpuidle/governors/nap/nap_nn_avx2.c
+ create mode 100644 drivers/cpuidle/governors/nap/nap_nn_sse2.c
+
+diff --git a/drivers/cpuidle/Kconfig b/drivers/cpuidle/Kconfig
+index cac5997dca..9b6c50f0d8 100644
+--- a/drivers/cpuidle/Kconfig
++++ b/drivers/cpuidle/Kconfig
+@@ -44,6 +44,23 @@ config CPU_IDLE_GOV_HALTPOLL
+ 
+ 	  Some virtualized workloads benefit from using it.
+ 
++config CPU_IDLE_GOV_NAP
++	bool "Neural Adaptive Predictor (NAP) governor"
++	depends on X86_64
++	default y
++	help
++	  A machine-learning-based cpuidle governor that uses a small
++	  neural network (8→8→1 MLP with 3 experts) to predict a
++	  correction to the expected sleep length, which then selects
++	  the idle state.  Weights are Xavier-initialized and refined
++	  via online learning (deferred backpropagation with SGD).
++	  Requires SSE2 at minimum; AVX2+FMA is used when available.
++
++	  This is experimental. Select via cpuidle.governor=nap on
++	  the kernel command line.
++
++	  If unsure, say Y.
++
+ config DT_IDLE_STATES
+ 	bool
+ 
+diff --git a/drivers/cpuidle/governors/Makefile b/drivers/cpuidle/governors/Makefile
+index 63abb5393a..ae688891c0 100644
+--- a/drivers/cpuidle/governors/Makefile
++++ b/drivers/cpuidle/governors/Makefile
+@@ -7,3 +7,4 @@ obj-$(CONFIG_CPU_IDLE_GOV_LADDER) += ladder.o
+ obj-$(CONFIG_CPU_IDLE_GOV_MENU) += menu.o
+ obj-$(CONFIG_CPU_IDLE_GOV_TEO) += teo.o
+ obj-$(CONFIG_CPU_IDLE_GOV_HALTPOLL) += haltpoll.o
++obj-$(CONFIG_CPU_IDLE_GOV_NAP) += nap/
+diff --git a/drivers/cpuidle/governors/nap/Makefile b/drivers/cpuidle/governors/nap/Makefile
+new file mode 100644
+index 0000000000..6d48cd5384
+--- /dev/null
++++ b/drivers/cpuidle/governors/nap/Makefile
+@@ -0,0 +1,29 @@
++# SPDX-License-Identifier: GPL-2.0-only
++#
++# Makefile for the NAP cpuidle governor
++#
++
++obj-$(CONFIG_CPU_IDLE_GOV_NAP) += cpuidle_gov_nap.o
++
++cpuidle_gov_nap-y := nap.o nap_fpu.o nap_nn_sse2.o nap_nn_avx2.o
++
++# Kernel builds with -mno-sse -mno-sse2 -mno-avx -msoft-float -mno-80387
++# -mno-fp-ret-in-387.  FPU/SIMD-using files need these removed and ISA
++# flags explicitly added.
++#
++# CRITICAL: nap.o is intentionally compiled with NORMAL kernel flags
++# (no FPU/SSE).  All floating-point code lives in nap_fpu.o and the
++# nap_nn_*.o files.  This ensures the compiler cannot emit SSE instructions
++# in governor callbacks (nap_select, nap_reflect, etc.), which would
++# silently corrupt userspace FPU register state.
++#
++# Do NOT add CFLAGS_REMOVE/CFLAGS for nap.o — it must stay FPU-free.
++FPU_KILL_FLAGS := -mno-sse -mno-sse2 -mno-mmx -mno-avx -mno-3dnow \
++                  -mno-sse4a -msoft-float -mno-80387 -mno-fp-ret-in-387
++
++CFLAGS_REMOVE_nap_fpu.o        += $(FPU_KILL_FLAGS)
++CFLAGS_REMOVE_nap_nn_sse2.o    += $(FPU_KILL_FLAGS)
++CFLAGS_REMOVE_nap_nn_avx2.o    += $(FPU_KILL_FLAGS)
++CFLAGS_nap_fpu.o       += $(CC_FLAGS_FPU)
++CFLAGS_nap_nn_sse2.o   += $(CC_FLAGS_FPU)
++CFLAGS_nap_nn_avx2.o   += $(CC_FLAGS_FPU) -mavx -mavx2 -mfma
+diff --git a/drivers/cpuidle/governors/nap/nap.c b/drivers/cpuidle/governors/nap/nap.c
+new file mode 100644
+index 0000000000..c72b67e9c3
+--- /dev/null
++++ b/drivers/cpuidle/governors/nap/nap.c
+@@ -0,0 +1,671 @@
++// SPDX-License-Identifier: GPL-2.0
++/*
++ * nap.c — Neural Adaptive Predictor cpuidle governor
++ *
++ * A machine-learning-based cpuidle governor that uses a small MLP (8→8→1)
++ * with 3 Mixture-of-Experts (short/long/deep) to predict a log2 correction
++ * factor for sleep_length.  State selection is deterministic threshold
++ * comparison.  Weights are Xavier-initialized at boot, then refined via
++ * online learning (deferred backpropagation with SGD).
++ *
++ * IMPORTANT: This file is compiled WITHOUT FPU/SSE flags (normal kernel
++ * compilation).  All floating-point and SIMD code lives in nap_fpu.c and
++ * nap_nn_{sse2,avx2}.c, which are compiled with CC_FLAGS_FPU.
++ * This separation ensures the compiler cannot emit SSE instructions in
++ * governor callbacks (nap_select, nap_reflect, etc.), which would corrupt
++ * userspace FPU register state.
++ */
++
++#include <linux/cpuidle.h>
++#include <linux/cpu.h>
++#include <linux/jump_label.h>
++#include <linux/kobject.h>
++#include <linux/math64.h>
++#include <linux/percpu.h>
++#include <linux/sched/clock.h>
++#include <linux/sysfs.h>
++#include <linux/string.h>
++#include <linux/tick.h>
++#include <asm/simd.h>
++#include <asm/fpu/api.h>
++#include <asm/processor.h>
++
++#include "nap.h"
++
++#include "../gov.h"
++
++/**************************************************************
++ * Version Information:
++ */
++
++#define CPUIDLE_NAP_PROGNAME "Nap CPUIdle Governor"
++#define CPUIDLE_NAP_AUTHOR   "Masahito Suzuki"
++
++#define CPUIDLE_NAP_VERSION  "0.4.0"
++
++/* Governor defaults */
++#define NAP_DEFAULT_LR_MILLTHS    1     /* 0.001 = 1 millths */
++#define NAP_DEFAULT_INTERVAL      4     /* learn every 4 reflects */
++#define NAP_DEFAULT_CLAMP_MILLTHS 1000  /* 1.0 = 1000 millths */
++#define NAP_DEFAULT_PCTL_MILLTHS  100   /* 10th percentile */
++
++/* Backport: RESIDENCY_THRESHOLD_NS was missing in original patch */
++#define RESIDENCY_THRESHOLD_NS TICK_NSEC
++
++/* ================================================================
++ * ISA dispatch via static keys (definitions only; dispatch in nap_fpu.c)
++ * ================================================================ */
++
++DEFINE_STATIC_KEY_FALSE(nap_use_avx2);
++
++static void __init nap_detect_simd(void)
++{
++	if (boot_cpu_has(X86_FEATURE_FMA) &&
++	    boot_cpu_has(X86_FEATURE_AVX2)) {
++		static_branch_enable(&nap_use_avx2);
++		pr_info("nap: using AVX2+FMA\n");
++	} else {
++		pr_info("nap: using SSE2\n");
++	}
++}
++
++/* ================================================================
++ * Per-CPU data
++ * ================================================================ */
++
++DEFINE_PER_CPU(struct nap_cpu_data, nap_data);
++static struct cpuidle_driver *nap_cached_drv;
++
++/* ================================================================
++ * Reflect-time updates (integer-only, no FPU needed)
++ * ================================================================ */
++
++static void nap_history_update(struct nap_cpu_data *d, u64 measured_ns)
++{
++	d->history[d->hist_idx] = measured_ns;
++	d->hist_idx = (d->hist_idx + 1) % NAP_HISTORY_SIZE;
++	if (d->hist_count < NAP_HISTORY_SIZE)
++		d->hist_count++;
++
++}
++
++static void nap_update_external_signals(struct nap_cpu_data *d)
++{
++	d->prev_idle_exit = local_clock();
++}
++
++/* ================================================================
++ * Governor callbacks
++ * ================================================================ */
++
++/*
++ * Return the shallowest C-state index that is both enabled and
++ * satisfies the current latency request.  Returns 0 if no such
++ * state exists (caller must treat 0 as "POLL is the only option").
++ *
++ * Called from the short-circuit path to decide whether the predicted
++ * sleep length is worth entering any C-state at all.  Does not
++ * consult the NN.
++ */
++static int nap_find_min_valid_state(struct cpuidle_driver *drv,
++				    struct cpuidle_device *dev,
++				    s64 latency_req)
++{
++	int i;
++
++	for (i = 1; i < drv->state_count; i++) {
++		if (dev->states_usage[i].disable)
++			continue;
++		if (drv->states[i].exit_latency_ns > latency_req)
++			continue;
++		return i;
++	}
++	return 0;
++}
++
++/*
++ * Cached wrapper around nap_find_min_valid_state().
++ *
++ * Invalidation triggers:
++ *   1. latency_req changed since last cached value (immediate; PM QoS
++ *      updates propagate on the next nap_select call).
++ *   2. NAP_MIN_STATE_REFRESH_JIFFIES elapsed since last refresh
++ *      (bounded staleness for sysfs-driven or runtime-driver state
++ *      disable events, which are rare).
++ *
++ * Hot path cost when the cache is valid: ~5-7 cycles (one s64
++ * compare, one time_after() check, one conditional return).  With a
++ * stable latency_req, the scan runs at most once per HZ jiffies per CPU.
++ */
++static inline int nap_get_min_valid_state(struct nap_cpu_data *d,
++					   struct cpuidle_driver *drv,
++					   struct cpuidle_device *dev,
++					   s64 latency_req)
++{
++	if (unlikely(latency_req != d->cached_min_state_latency ||
++		     time_after(jiffies,
++				d->cached_min_state_jiffies +
++				NAP_MIN_STATE_REFRESH_JIFFIES))) {
++		d->cached_min_state = nap_find_min_valid_state(drv, dev,
++							       latency_req);
++		d->cached_min_state_latency = latency_req;
++		d->cached_min_state_jiffies = jiffies;
++	}
++	return d->cached_min_state;
++}
++
++/*
++ * Compute dev->poll_limit_ns for the short-circuit path.
++ *
++ * Budget = predicted wake time (sleep_length) + 1 µs safety margin.
++ * The margin absorbs timer jitter so a wake arriving slightly after
++ * the predicted time does not trigger a select/enter/reflect retry
++ * cycle.  It is consumed only when the wake is actually late; on-time
++ * and early wakes exit POLL via need_resched without touching the
++ * margin.
++ *
++ * Floor: NAP_POLL_LIMIT_MIN_NS (1 µs).  Below this, per-iteration
++ * governor overhead exceeds actual polling, and POLL's own timeout
++ * sampling granularity (~1.3 µs via POLL_IDLE_RELAX_COUNT cpu_relax
++ * iterations) makes smaller limits indistinguishable in practice.
++ *
++ * Ceiling: min_state.target_residency_ns.  Beyond that point, the
++ * C-state would have been a better choice than polling.
++ */
++static inline u64 nap_compute_poll_limit(u64 sleep_length_ns,
++					 u64 min_state_target_ns)
++{
++	u64 budget = sleep_length_ns + NAP_POLL_LIMIT_MARGIN_NS;
++
++	return clamp_t(u64, budget,
++		       NAP_POLL_LIMIT_MIN_NS,
++		       min_state_target_ns);
++}
++
++static int nap_fallback_heuristic(struct cpuidle_driver *drv,
++				  struct cpuidle_device *dev)
++{
++	s64 latency_req = cpuidle_governor_latency_req(dev->cpu);
++	ktime_t delta_tick;
++	u64 sleep_length_ns;
++	int i;
++
++	sleep_length_ns = ktime_to_ns(tick_nohz_get_sleep_length(&delta_tick));
++
++	for (i = drv->state_count - 1; i > 0; i--) {
++		if (dev->states_usage[i].disable)
++			continue;
++		if (drv->states[i].exit_latency_ns > latency_req)
++			continue;
++		if (drv->states[i].target_residency_ns > sleep_length_ns)
++			continue;
++		return i;
++	}
++	return 0;
++}
++
++static int nap_select(struct cpuidle_driver *drv,
++		      struct cpuidle_device *dev,
++		      bool *stop_tick)
++{
++	struct nap_cpu_data *d = this_cpu_ptr(&nap_data);
++	s64 latency_req;
++	ktime_t delta_tick;
++	u64 sleep_length_ns;
++	int idx, min_state;
++
++	if (unlikely(drv->state_count <= 1))
++		return 0;
++
++	latency_req = cpuidle_governor_latency_req(dev->cpu);
++	sleep_length_ns = ktime_to_ns(tick_nohz_get_sleep_length(&delta_tick));
++
++	min_state = nap_get_min_valid_state(d, drv, dev, latency_req);
++
++	/*
++	 * Fast path: when no C-state can amortize its target residency
++	 * within the predicted sleep length, the answer is deterministically
++	 * POLL.  Skip NN inference and feature extraction entirely.
++	 * nap_reflect also skips history update and learning for
++	 * short-circuited events (see the short_circuited check there).
++	 * See spec §3.1.
++	 */
++	if (min_state == 0 ||
++	    sleep_length_ns < drv->states[min_state].target_residency_ns) {
++
++		if (min_state > 0)
++			dev->poll_limit_ns = nap_compute_poll_limit(
++				sleep_length_ns,
++				drv->states[min_state].target_residency_ns);
++		else
++			dev->poll_limit_ns = max_t(u64, sleep_length_ns,
++						   NAP_POLL_LIMIT_MIN_NS);
++
++		*stop_tick = false;
++		d->last_selected_idx = 0;
++		d->short_circuited = true;
++		d->stats.total_selects++;
++		return 0;
++	}
++
++	/* Normal NN-driven path */
++	d->short_circuited = false;
++
++	if (likely(may_use_simd())) {
++		kernel_fpu_begin();
++		idx = nap_fpu_select(drv, dev, d);
++		kernel_fpu_end();
++
++		if (idx < 0)
++			idx = nap_fallback_heuristic(drv, dev);
++	} else {
++		idx = nap_fallback_heuristic(drv, dev);
++	}
++
++	*stop_tick = (drv->states[idx].target_residency_ns >
++		      RESIDENCY_THRESHOLD_NS);
++
++	d->last_selected_idx = idx;
++	d->stats.total_selects++;
++
++	return idx;
++}
++
++static void nap_reflect(struct cpuidle_device *dev, int index)
++{
++	struct nap_cpu_data *d = this_cpu_ptr(&nap_data);
++	struct cpuidle_driver *drv = cpuidle_get_cpu_driver(dev);
++	u64 measured_ns = dev->last_residency_ns;
++
++	if (unlikely(!drv))
++		return;
++
++	/*
++	 * Short-circuited POLL: NN was not invoked for this idle
++	 * event, so the residency does not belong to the NN's
++	 * training distribution.  Update the aggregate residency
++	 * statistic and return — history, hit_intercept, prediction
++	 * error, external signals, and learning are all skipped.
++	 * See spec §3.4.
++	 */
++	if (d->short_circuited) {
++		d->stats.total_residency_ns += measured_ns;
++		return;
++	}
++
++	nap_history_update(d, measured_ns);
++
++	d->last_prediction_error = d->last_predicted_ns - (s64)measured_ns;
++	nap_update_external_signals(d);
++
++	/*
++	 * Dual gate: learn when both the per-N-reflect counter fires
++	 * AND at least learn_jiffies_min jiffies have elapsed since
++	 * the last learning step.  The time gate prevents sustained
++	 * weight churn on workloads with very rapid idle bursts; a
++	 * value of 0 disables it (restores the original counter-only
++	 * behavior).  See spec §3.5.
++	 */
++	if (++d->learn_counter >= d->learn_interval &&
++	    time_after_eq(jiffies,
++			  d->last_learn_jiffies + d->learn_jiffies_min)) {
++		d->learn_counter = 0;
++		d->last_learn_jiffies = jiffies;
++		d->learn_actual_ns = measured_ns;
++		d->needs_learn = true;
++	}
++
++	d->stats.total_residency_ns += measured_ns;
++	if (index > 0 && measured_ns < drv->states[index].target_residency_ns)
++		d->stats.overshoot_count++;
++}
++
++static int nap_enable(struct cpuidle_driver *drv,
++		      struct cpuidle_device *dev)
++{
++	struct nap_cpu_data *d = per_cpu_ptr(&nap_data, dev->cpu);
++
++	memset(d, 0, sizeof(*d));
++
++	/*
++	 * Force first-call refresh of the min-valid-state cache.
++	 * cached_min_state_latency = S64_MIN ensures the first
++	 * nap_select() comparison will always trip the invalidation
++	 * branch regardless of the actual latency_req value.
++	 * cached_min_state itself is already zeroed by the memset above.
++	 */
++	d->cached_min_state_latency = S64_MIN;
++	d->cached_min_state_jiffies = jiffies - NAP_MIN_STATE_REFRESH_JIFFIES;
++
++	/* Default: allow at most one learning step per jiffy */
++	d->learn_jiffies_min = 1;
++
++	/*
++	 * Defer weight initialization to the first nap_select() FPU path
++	 * via reset_pending.  nap_enable() is called from cpuidle core
++	 * (cpuidle_enable_device) which may run on a different CPU than
++	 * dev->cpu during governor switch.  Deferring ensures FPU init
++	 * happens on the correct CPU in its own idle context.
++	 */
++	WRITE_ONCE(nap_cached_drv, drv);
++	d->learning_rate_millths  = NAP_DEFAULT_LR_MILLTHS;
++	d->learn_interval = NAP_DEFAULT_INTERVAL;
++	d->max_grad_norm_millths  = NAP_DEFAULT_CLAMP_MILLTHS;
++	d->overshoot_pctl_millths = NAP_DEFAULT_PCTL_MILLTHS;
++	d->reset_pending = true;
++
++	return 0;
++}
++
++static void nap_disable(struct cpuidle_driver *drv,
++			struct cpuidle_device *dev)
++{
++	WRITE_ONCE(nap_cached_drv, NULL);
++}
++
++/* ================================================================
++ * sysfs interface  (/sys/devices/system/cpu/nap/)
++ * ================================================================ */
++
++static ssize_t stats_show(struct kobject *kobj,
++			  struct kobj_attribute *attr, char *buf)
++{
++	int cpu, len = 0;
++	u64 total_sel = 0, total_res = 0, total_under = 0, total_learn = 0;
++
++	for_each_online_cpu(cpu) {
++		struct nap_cpu_data *d = &per_cpu(nap_data, cpu);
++
++		total_sel   += d->stats.total_selects;
++		total_res   += d->stats.total_residency_ns;
++		total_under += d->stats.overshoot_count;
++		total_learn += d->stats.learn_count;
++	}
++
++	len += sysfs_emit_at(buf, len, "total_selects: %llu\n", total_sel);
++	len += sysfs_emit_at(buf, len, "total_residency_ms: %llu\n",
++			     div_u64(total_res, NSEC_PER_MSEC));
++	len += sysfs_emit_at(buf, len, "overshoot_count: %llu\n", total_under);
++	len += sysfs_emit_at(buf, len, "overshoot_rate_permil: %llu\n",
++			     total_sel ? div_u64(total_under * 1000, total_sel) : 0);
++	len += sysfs_emit_at(buf, len, "learn_count: %llu\n", total_learn);
++	return len;
++}
++
++static ssize_t learning_rate_show(struct kobject *kobj,
++				  struct kobj_attribute *attr, char *buf)
++{
++	int cpu;
++
++	cpu = cpumask_first(cpu_online_mask);
++	if (cpu >= nr_cpu_ids)
++		return sysfs_emit(buf, "0\n");
++	return sysfs_emit(buf, "%u\n",
++			  per_cpu(nap_data, cpu).learning_rate_millths);
++}
++
++static ssize_t learning_rate_store(struct kobject *kobj,
++				   struct kobj_attribute *attr,
++				   const char *buf, size_t count)
++{
++	unsigned int val;
++	int cpu;
++
++	if (kstrtouint(buf, 10, &val) || val == 0 || val > 100)
++		return -EINVAL;
++
++	for_each_online_cpu(cpu)
++		per_cpu(nap_data, cpu).learning_rate_millths = val;
++
++	return count;
++}
++
++static ssize_t learn_interval_show(struct kobject *kobj,
++				   struct kobj_attribute *attr, char *buf)
++{
++	int cpu;
++
++	cpu = cpumask_first(cpu_online_mask);
++	if (cpu >= nr_cpu_ids)
++		return sysfs_emit(buf, "0\n");
++	return sysfs_emit(buf, "%d\n",
++			  per_cpu(nap_data, cpu).learn_interval);
++}
++
++static ssize_t learn_interval_store(struct kobject *kobj,
++				    struct kobj_attribute *attr,
++				    const char *buf, size_t count)
++{
++	unsigned int val;
++	int cpu;
++
++	if (kstrtouint(buf, 10, &val) || val == 0 || val > 10000)
++		return -EINVAL;
++
++	for_each_online_cpu(cpu)
++		per_cpu(nap_data, cpu).learn_interval = val;
++
++	return count;
++}
++
++static ssize_t learn_jiffies_min_show(struct kobject *kobj,
++				      struct kobj_attribute *attr, char *buf)
++{
++	int cpu;
++
++	cpu = cpumask_first(cpu_online_mask);
++	if (cpu >= nr_cpu_ids)
++		return sysfs_emit(buf, "0\n");
++	return sysfs_emit(buf, "%u\n",
++			  per_cpu(nap_data, cpu).learn_jiffies_min);
++}
++
++static ssize_t learn_jiffies_min_store(struct kobject *kobj,
++				       struct kobj_attribute *attr,
++				       const char *buf, size_t count)
++{
++	unsigned int val;
++	int cpu;
++
++	if (kstrtouint(buf, 10, &val) || val > HZ * 3600)
++		return -EINVAL;
++
++	for_each_online_cpu(cpu)
++		per_cpu(nap_data, cpu).learn_jiffies_min = val;
++
++	return count;
++}
++
++static ssize_t reset_weights_store(struct kobject *kobj,
++				   struct kobj_attribute *attr,
++				   const char *buf, size_t count)
++{
++	cpumask_var_t mask;
++	int cpu;
++
++	if (!READ_ONCE(nap_cached_drv))
++		return -ENODEV;
++
++	/*
++	 * Set a per-CPU flag; each CPU will reinitialize its own weights
++	 * inside nap_select() within its own kernel_fpu_begin/end context.
++	 * This avoids cross-CPU data races on the weight arrays.
++	 *
++	 * Accepts "all" to reset every online CPU, or a cpulist
++	 * (e.g. "0-3,5,7") to reset specific CPUs.
++	 */
++	if (sysfs_streq(buf, "all")) {
++		for_each_online_cpu(cpu)
++			per_cpu(nap_data, cpu).reset_pending = true;
++		pr_info("nap: weight reset scheduled for all CPUs\n");
++		return count;
++	}
++
++	if (!alloc_cpumask_var(&mask, GFP_KERNEL))
++		return -ENOMEM;
++
++	if (cpulist_parse(buf, mask)) {
++		free_cpumask_var(mask);
++		return -EINVAL;
++	}
++
++	for_each_cpu_and(cpu, mask, cpu_online_mask)
++		per_cpu(nap_data, cpu).reset_pending = true;
++
++	pr_info("nap: weight reset scheduled for CPUs %*pbl\n",
++		cpumask_pr_args(mask));
++	free_cpumask_var(mask);
++	return count;
++}
++
++static ssize_t reset_stats_store(struct kobject *kobj,
++				 struct kobj_attribute *attr,
++				 const char *buf, size_t count)
++{
++	int cpu;
++
++	for_each_online_cpu(cpu)
++		memset(&per_cpu(nap_data, cpu).stats, 0,
++		       sizeof(struct nap_stats));
++
++	return count;
++}
++
++static ssize_t overshoot_pctl_show(struct kobject *kobj,
++				    struct kobj_attribute *attr, char *buf)
++{
++	int cpu;
++
++	cpu = cpumask_first(cpu_online_mask);
++	if (cpu >= nr_cpu_ids)
++		return sysfs_emit(buf, "0\n");
++	return sysfs_emit(buf, "%u\n",
++			  per_cpu(nap_data, cpu).overshoot_pctl_millths);
++}
++
++static ssize_t overshoot_pctl_store(struct kobject *kobj,
++				     struct kobj_attribute *attr,
++				     const char *buf, size_t count)
++{
++	unsigned int val;
++	int cpu;
++
++	if (kstrtouint(buf, 10, &val) || val > 500)
++		return -EINVAL;
++
++	for_each_online_cpu(cpu)
++		per_cpu(nap_data, cpu).overshoot_pctl_millths = val;
++
++	return count;
++}
++
++static ssize_t version_show(struct kobject *kobj,
++			    struct kobj_attribute *attr, char *buf)
++{
++	return sysfs_emit(buf, "%s\n", CPUIDLE_NAP_VERSION);
++}
++
++static ssize_t simd_show(struct kobject *kobj,
++			 struct kobj_attribute *attr, char *buf)
++{
++	if (static_branch_unlikely(&nap_use_avx2))
++		return sysfs_emit(buf, "avx2\n");
++	else
++		return sysfs_emit(buf, "sse2\n");
++}
++
++static struct kobj_attribute version_attr           = __ATTR_RO(version);
++static struct kobj_attribute simd_attr              = __ATTR_RO(simd);
++static struct kobj_attribute stats_attr             = __ATTR_RO(stats);
++static struct kobj_attribute learning_rate_attr     = __ATTR_RW(learning_rate);
++static struct kobj_attribute learn_interval_attr    = __ATTR_RW(learn_interval);
++static struct kobj_attribute learn_jiffies_min_attr = __ATTR_RW(learn_jiffies_min);
++static struct kobj_attribute overshoot_pctl_attr    = __ATTR_RW(overshoot_pctl);
++static struct kobj_attribute reset_weights_attr     = __ATTR_WO(reset_weights);
++static struct kobj_attribute reset_stats_attr       = __ATTR_WO(reset_stats);
++
++static struct attribute *nap_attrs[] = {
++	&version_attr.attr,
++	&simd_attr.attr,
++	&stats_attr.attr,
++	&learning_rate_attr.attr,
++	&learn_interval_attr.attr,
++	&learn_jiffies_min_attr.attr,
++	&overshoot_pctl_attr.attr,
++	&reset_weights_attr.attr,
++	&reset_stats_attr.attr,
++	NULL,
++};
++
++static const struct attribute_group nap_attr_group = {
++	.attrs = nap_attrs,
++};
++
++static struct kobject *cpuidle_kobj;
++
++int nap_sysfs_init(void)
++{
++	struct device *dev_root;
++	int ret;
++
++	dev_root = bus_get_dev_root(&cpu_subsys);
++	if (!dev_root)
++		return -ENODEV;
++
++	cpuidle_kobj = kobject_create_and_add("nap", &dev_root->kobj);
++	put_device(dev_root);
++	if (!cpuidle_kobj)
++		return -ENOMEM;
++
++	ret = sysfs_create_group(cpuidle_kobj, &nap_attr_group);
++	if (ret) {
++		kobject_put(cpuidle_kobj);
++		cpuidle_kobj = NULL;
++	}
++	return ret;
++}
++
++void nap_sysfs_exit(void)
++{
++	if (cpuidle_kobj) {
++		sysfs_remove_group(cpuidle_kobj, &nap_attr_group);
++		kobject_put(cpuidle_kobj);
++		cpuidle_kobj = NULL;
++	}
++}
++
++/* ================================================================
++ * Governor registration
++ * ================================================================ */
++
++static struct cpuidle_governor nap_governor = {
++	.name    = "nap",
++	.rating  = 26,
++	.enable  = nap_enable,
++	.disable = nap_disable,
++	.select  = nap_select,
++	.reflect = nap_reflect,
++};
++
++static int __init nap_init(void)
++{
++	int ret;
++
++	nap_detect_simd();
++
++	ret = nap_sysfs_init();
++	if (ret)
++		pr_warn("nap: sysfs init failed: %d (continuing without sysfs)\n", ret);
++
++	ret = cpuidle_register_governor(&nap_governor);
++	if (ret) {
++		pr_err("nap: register_governor failed: %d\n", ret);
++		nap_sysfs_exit();
++		return ret;
++	}
++
++	pr_info("%s v%s by %s registered (rating=%u)\n",
++	       CPUIDLE_NAP_PROGNAME, CPUIDLE_NAP_VERSION,
++	       CPUIDLE_NAP_AUTHOR, nap_governor.rating);
++	return 0;
++}
++postcore_initcall(nap_init);
+diff --git a/drivers/cpuidle/governors/nap/nap.h b/drivers/cpuidle/governors/nap/nap.h
+new file mode 100644
+index 0000000000..1059db983b
+--- /dev/null
++++ b/drivers/cpuidle/governors/nap/nap.h
+@@ -0,0 +1,283 @@
++/* SPDX-License-Identifier: GPL-2.0 */
++#ifndef NAP_H
++#define NAP_H
++
++#include <linux/cpuidle.h>
++#include <linux/jump_label.h>
++#include <linux/ktime.h>
++
++/* ================================================================
++ * Neural network dimensions
++ * ================================================================ */
++
++#define NAP_INPUT_SIZE    8
++#define NAP_HIDDEN_SIZE   8
++#define NAP_NUM_EXPERTS   3
++
++/*
++ * Neural network weight structure for an 8→8→1 MLP (scalar regression).
++ *
++ * The NN outputs a single log2 correction factor applied to sleep_length:
++ *   effective_sleep = exp2(log2(sleep_length) + nn_output)
++ * State selection is then deterministic: pick the deepest state whose
++ * cost (target_residency + exit_latency) ≤ effective_sleep.
++ *
++ * Column-major storage: w_h1[j][i] = weight from input j to hidden neuron i.
++ * This layout enables efficient column-wise matrix-vector products where
++ * each input broadcasts across all hidden neurons via SIMD FMA.
++ *
++ * __aligned(32) ensures AVX2 vmovaps (32-byte aligned) loads work
++ * correctly.  8 floats = 32 bytes = one ymm register.
++ */
++struct nap_weights {
++	/* Hidden layer: input[8] → hidden[8] */
++	float w_h1[NAP_INPUT_SIZE][NAP_HIDDEN_SIZE];  /* 64 params */
++	float b_h1[NAP_HIDDEN_SIZE];                   /* 8 params  */
++	/* Output layer: hidden[8] → 1 scalar */
++	float w_out[NAP_HIDDEN_SIZE];                  /* 8 params  */
++	float b_out;                                   /* 1 param   */
++} __aligned(32);
++
++/* ISA-specific forward pass implementations */
++void nap_nn_forward_sse2(const float *input, float *output,
++			 float *hidden_save, const struct nap_weights *w);
++void nap_nn_forward_avx2(const float *input, float *output,
++			 float *hidden_save, const struct nap_weights *w);
++/* ISA-specific online learning (backpropagation) */
++struct nap_cpu_data;
++void nap_nn_learn_sse2(struct nap_cpu_data *d);
++void nap_nn_learn_avx2(struct nap_cpu_data *d);
++
++/* Static key for ISA dispatch (defined in nap.c) */
++DECLARE_STATIC_KEY_FALSE(nap_use_avx2);
++
++/* ================================================================
++ * SIMD type definitions and helpers (GCC vector extensions)
++ *
++ * Only available when compiled with FPU/SSE flags (nap_fpu.c,
++ * nap_nn_*.c).  nap.c is compiled without FPU flags and must
++ * not see these definitions.
++ *
++ * <immintrin.h> is a userspace header and cannot be used in kernel.
++ * We use __attribute__((__vector_size__())) and __builtin_ia32_*.
++ * ================================================================ */
++
++#ifdef __SSE2__
++
++typedef float v4sf  __attribute__((__vector_size__(16)));   /* xmm: 4×float  */
++typedef int   v4si  __attribute__((__vector_size__(16)));   /* xmm: 4×int32  */
++typedef float v8sf  __attribute__((__vector_size__(32)));   /* ymm: 8×float  */
++
++/* Broadcast helpers */
++#define V4SF_SET1(x)  ((v4sf){ (x), (x), (x), (x) })
++#define V4SI_SET1(x)  ((v4si){ (x), (x), (x), (x) })
++#define V8SF_SET1(x)  ((v8sf){ (x),(x),(x),(x),(x),(x),(x),(x) })
++#define V8SF_ZERO     V8SF_SET1(0.0f)
++
++/* Unaligned load/store helpers */
++static inline v4sf v4sf_loadu(const float *p)
++{
++	v4sf result;
++	__builtin_memcpy(&result, p, sizeof(result));
++	return result;
++}
++
++static inline void v4sf_storeu(float *p, v4sf v)
++{
++	__builtin_memcpy(p, &v, sizeof(v));
++}
++
++#ifdef __AVX__
++static inline v8sf v8sf_loadu(const float *p)
++{
++	v8sf result;
++	__builtin_memcpy(&result, p, sizeof(result));
++	return result;
++}
++
++static inline void v8sf_storeu(float *p, v8sf v)
++{
++	__builtin_memcpy(p, &v, sizeof(v));
++}
++#endif /* __AVX__ */
++
++/* Scalar/vector clamp helpers */
++static inline float fclampf(float v, float lo, float hi)
++{
++	if (v < lo) return lo;
++	if (v > hi) return hi;
++	return v;
++}
++
++static inline v4sf v4sf_clamp(v4sf v, v4sf lo, v4sf hi)
++{
++	return __builtin_ia32_maxps(__builtin_ia32_minps(v, hi), lo);
++}
++
++/* Type punning: float ↔ int reinterpret (no instruction generated) */
++static inline v4si v4sf_as_v4si(v4sf v)
++{
++	union { v4sf f; v4si i; } u = { .f = v };
++	return u.i;
++}
++
++static inline v4sf v4si_as_v4sf(v4si v)
++{
++	union { v4si i; v4sf f; } u = { .i = v };
++	return u.f;
++}
++
++/*
++ * fast_log2f_sse() — Compute log2 of 4 floats simultaneously using SSE2
++ *
++ * Cost: ~15 cycles for 4 values (~4 cycles per value)
++ */
++static inline v4sf fast_log2f_sse(v4sf x)
++{
++	const v4si mask_exp  = V4SI_SET1(0xFF);
++	const v4si bias      = V4SI_SET1(127);
++	const v4si mask_mant = V4SI_SET1(0x7FFFFF);
++	const v4si exp_bias  = V4SI_SET1(127 << 23);
++
++	v4si xi    = v4sf_as_v4si(x);
++	v4si exp_i = (xi >> 23) & mask_exp;
++	exp_i      = exp_i - bias;
++	v4sf e     = __builtin_convertvector(exp_i, v4sf);
++
++	v4si mant_i = (xi & mask_mant) | exp_bias;
++	v4sf m      = v4si_as_v4sf(mant_i) - V4SF_SET1(1.0f);
++
++	v4sf p;
++	p = m * V4SF_SET1(0.4808f);
++	p = V4SF_SET1(0.7213f) - p;
++	p = m * p;
++	p = V4SF_SET1(1.4425f) - p;
++	p = m * p;
++
++	return e + p;
++}
++
++#endif /* __SSE2__ */
++
++/* ================================================================
++ * Feature extraction
++ * ================================================================ */
++
++#define NAP_HISTORY_SIZE     8
++
++/* ================================================================
++ * POLL short-circuit tunables
++ * ================================================================ */
++
++/* Minimum and safety-margin values for dev->poll_limit_ns written
++ * by nap_compute_poll_limit().  Both are 1 µs: the POLL state
++ * itself checks its timeout only every ~1 µs (POLL_IDLE_RELAX_COUNT
++ * cpu_relax() iterations in drivers/cpuidle/poll_state.c), so
++ * finer-grained values would not produce distinguishable behavior.
++ */
++#define NAP_POLL_LIMIT_MIN_NS      1000ULL
++#define NAP_POLL_LIMIT_MARGIN_NS   1000ULL
++
++/* Refresh interval for the cached minimum-valid-state lookup.
++ * HZ jiffies (= 1 second) bounds the staleness window caused by
++ * sysfs-driven or runtime-driver state disable events.  PM QoS
++ * latency changes are detected immediately via the cached
++ * latency_req comparison.
++ */
++#define NAP_MIN_STATE_REFRESH_JIFFIES  HZ
++
++struct nap_stats {
++	u64 total_selects;
++	u64 total_residency_ns;
++	u64 overshoot_count;
++	u64 learn_count;
++};
++
++struct nap_cpu_data {
++	/* Ring buffer */
++	u64   history[NAP_HISTORY_SIZE];
++	float log_history[NAP_HISTORY_SIZE];
++	int   hist_idx;
++	int   hist_count;
++
++	/* External signal tracking */
++	u64     prev_idle_exit;
++	s64     last_predicted_ns;
++	s64     last_prediction_error;
++
++	/* Short-circuit fast path (§3.1, §3.2, §3.4 of spec) */
++	bool short_circuited;			/* set in select, read in reflect */
++	int  cached_min_state;			/* cached shallowest valid state */
++	s64  cached_min_state_latency;		/* latency_req when cache populated */
++	unsigned long cached_min_state_jiffies;	/* jiffies when cache populated */
++
++	/* Jiffies-based learning rate floor (§3.5 of spec) */
++	unsigned long last_learn_jiffies;
++	unsigned int  learn_jiffies_min;	/* sysfs-tunable, 0 = disabled */
++
++	/* select/reflect handoff */
++	int   last_selected_idx;
++
++	/* NN scalar output: predicted sleep time in log2(ns) space,
++	 * clamped by nap_fpu_select() to log2(sleep_length).
++	 */
++	float nn_output;
++
++	/*
++	 * hidden_out[], features_f32[] are written with aligned SIMD
++	 * stores in nap_nn_forward_{sse2,avx2}() and
++	 * nap_extract_features():
++	 *   SSE2:    movaps  (16-byte aligned)
++	 *   AVX2:    vmovaps (32-byte aligned)
++	 * Without __aligned(32), the natural struct offset would be
++	 * only 4-byte aligned, causing #GP faults in the idle task.
++	 */
++	float hidden_out[NAP_HIDDEN_SIZE] __aligned(32);
++	float features_f32[NAP_INPUT_SIZE] __aligned(32);
++
++	/* Backprop scratch */
++	float learn_d_out;	/* output gradient direction (±1) */
++	float learn_lr;		/* effective lr (base_lr * asymmetric weight) */
++	float learn_d_hid[NAP_HIDDEN_SIZE] __aligned(32);
++
++	/* Precomputed per-state log2(target_residency) for threshold selection.
++	 * log2_cost[i] = log2(target_residency_ns).
++	 */
++	float log2_cost[CPUIDLE_STATE_MAX];
++
++	/* Deferred learning data */
++	bool  needs_learn;
++	bool  output_clamped;	/* true if nn_output was clamped to features[0] */
++	u64   learn_actual_ns;
++
++	/* Mixture-of-Experts: 3 experts × 8 neurons each */
++	struct nap_weights expert_weights[NAP_NUM_EXPERTS];
++	struct nap_weights *active_w;	/* selected expert for current/deferred pass */
++	int   active_expert;		/* 0, 1, or 2: which expert is active */
++	float expert_mid;		/* log2 threshold: short ↔ long */
++	float expert_deep;		/* log2 threshold: long ↔ deep */
++
++	/* Online learning */
++	unsigned int learning_rate_millths;
++	unsigned int max_grad_norm_millths;
++	unsigned int overshoot_pctl_millths; /* quantile target (250 = 25th pctl) */
++	int   learn_interval;
++	int   learn_counter;
++	bool reset_pending;		/* set by sysfs, consumed by nap_select */
++
++	/* sysfs statistics */
++	struct nap_stats stats;
++};
++
++DECLARE_PER_CPU(struct nap_cpu_data, nap_data);
++
++/* FPU entry point (nap_fpu.c) — call only within kernel_fpu_begin/end */
++int nap_fpu_select(struct cpuidle_driver *drv,
++		   struct cpuidle_device *dev,
++		   struct nap_cpu_data *d);
++
++/* sysfs interface */
++int  nap_sysfs_init(void);
++void nap_sysfs_exit(void);
++
++#endif /* NAP_H */
+diff --git a/drivers/cpuidle/governors/nap/nap_fpu.c b/drivers/cpuidle/governors/nap/nap_fpu.c
+new file mode 100644
+index 0000000000..482a06a5d0
+--- /dev/null
++++ b/drivers/cpuidle/governors/nap/nap_fpu.c
+@@ -0,0 +1,572 @@
++// SPDX-License-Identifier: GPL-2.0
++/*
++ * nap_fpu.c — FPU/SIMD code for the NAP cpuidle governor
++ *
++ * This file is compiled with FPU/SSE flags enabled (CC_FLAGS_FPU).
++ * ALL functions here MUST be called only from within
++ * kernel_fpu_begin()/kernel_fpu_end() blocks.
++ *
++ * Keeping FPU code in a separate translation unit ensures the compiler
++ * cannot emit SSE/x87 instructions in non-FPU code paths (nap.c),
++ * which would silently corrupt userspace FPU register state.
++ */
++
++#include <linux/cpuidle.h>
++#include <linux/math64.h>
++#include <linux/percpu.h>
++#include <linux/pm_qos.h>
++#include <linux/sched/clock.h>
++#include <linux/string.h>
++#include <linux/tick.h>
++
++#include "nap.h"
++
++/* Clang lacks __builtin_ia32_movhlps; emulate with __builtin_shufflevector */
++#ifdef __clang__
++#define __builtin_ia32_movhlps(a, b) \
++	__builtin_shufflevector(b, a, 2, 3, 6, 7)
++#endif
++
++/* ================================================================
++ * Float math helpers
++ * ================================================================ */
++
++static inline float float_min(float a, float b) { return a < b ? a : b; }
++static inline float float_max(float a, float b) { return a > b ? a : b; }
++
++/*
++ * Kernel-safe sqrtf using the SSE sqrtss instruction directly.
++ * GCC may lower __builtin_sqrtf() to a libm call, which is unavailable
++ * in the kernel.  This file is always compiled with FPU/SSE enabled.
++ */
++static inline float nap_sqrtf(float x)
++{
++	asm("sqrtss %1, %0" : "=x"(x) : "x"(x));
++	return x;
++}
++
++/* Scalar log2 approximation (same algorithm as fast_log2f_sse) */
++static inline float fast_log2f(float x)
++{
++	union { float f; u32 i; } u = { .f = x };
++	int exp = (int)((u.i >> 23) & 0xFFu) - 127;
++	float e = (float)exp;
++	float m, p;
++
++	u.i = (u.i & 0x7FFFFFu) | (127u << 23);
++	m = u.f - 1.0f;
++
++	p = m * 0.4808f;
++	p = 0.7213f - p;
++	p = m * p;
++	p = 1.4425f - p;
++	p = m * p;
++
++	return e + p;
++}
++
++/* ================================================================
++ * Deterministic PRNG for weight initialization (LCG)
++ * ================================================================ */
++
++static inline float nap_prng_float(u32 *state)
++{
++	*state = *state * 1664525u + 1013904223u;
++	return (float)(s32)*state * (1.0f / 2147483648.0f);
++}
++
++/* ================================================================
++ * ISA dispatch via static keys
++ * ================================================================ */
++
++static inline void nap_nn_forward(const float *input, float *output,
++				  float *hidden_save,
++				  const struct nap_weights *w)
++{
++	if (static_branch_unlikely(&nap_use_avx2))
++		nap_nn_forward_avx2(input, output, hidden_save, w);
++	else
++		nap_nn_forward_sse2(input, output, hidden_save, w);
++}
++
++static inline void nap_nn_learn(struct nap_cpu_data *d)
++{
++	if (static_branch_unlikely(&nap_use_avx2))
++		nap_nn_learn_avx2(d);
++	else
++		nap_nn_learn_sse2(d);
++}
++
++/* ================================================================
++ * Weight initialization
++ *
++ * The NN directly outputs predicted sleep time in log2(ns) space.
++ * Hidden neuron 0 is initialized as a pass-through for feature[0]
++ * (log2(sleep_length)), so the initial output ≈ log2(sleep_length).
++ * This matches the pre-learning behavior of selecting the deepest
++ * state that fits within sleep_length.
++ *
++ * Other hidden neurons are Xavier-initialized with near-zero output
++ * weights so their initial contribution is negligible.  Biases = 0.
++ * ================================================================ */
++
++#define NAP_PRNG_SEED 42u
++
++static void nap_init_weights(struct nap_weights *w)
++{
++	u32 rng = NAP_PRNG_SEED;
++	float scale_h1, scale_out;
++	int i, j;
++
++	/* Xavier uniform: U(-sqrt(6/(fan_in+fan_out)), +sqrt(6/(...))) */
++	scale_h1  = nap_sqrtf(6.0f / (float)(NAP_INPUT_SIZE + NAP_HIDDEN_SIZE));
++	scale_out = 0.01f;
++
++	/* Hidden layer weights */
++	for (i = 0; i < NAP_INPUT_SIZE; i++)
++		for (j = 0; j < NAP_HIDDEN_SIZE; j++)
++			w->w_h1[i][j] = nap_prng_float(&rng) * scale_h1;
++
++	/* Hidden biases: zero (standard) */
++	memset(w->b_h1, 0, sizeof(w->b_h1));
++
++	/* Output weights: near-zero for ~0 initial contribution */
++	for (j = 0; j < NAP_HIDDEN_SIZE; j++)
++		w->w_out[j] = nap_prng_float(&rng) * scale_out;
++
++	/* Output bias: zero */
++	w->b_out = 0.0f;
++
++	/*
++	 * Neuron 0: pass-through for feature[0] = log2(sleep_length).
++	 * hidden[0] = ReLU(1.0 * input[0] + 0) = input[0]  (always > 0)
++	 * output += 1.0 * hidden[0] = log2(sleep_length)
++	 *
++	 * Override the random init above so initial output ≈ input[0].
++	 */
++	for (i = 0; i < NAP_INPUT_SIZE; i++)
++		w->w_h1[i][0] = 0.0f;
++	w->w_h1[0][0] = 1.0f;
++	w->b_h1[0] = 0.0f;
++	w->w_out[0] = 1.0f;
++}
++
++/*
++ * Precompute log2(target_residency) per state for threshold-based selection.
++ *
++ * Used in the selection loop: pick deepest state where
++ * log2_cost[i] <= nn_output (predicted sleep time in log2 space).
++ *
++ * Only target_residency_ns is used — exit_latency is a wakeup cost,
++ * not a factor in whether the CPU can profitably stay in the state
++ * for the predicted duration.
++ */
++static void nap_init_log2_cost(struct nap_cpu_data *d,
++			       struct cpuidle_driver *drv)
++{
++	float log2_tick;
++	int long_start, deep_idx;
++	int i;
++
++	for (i = 0; i < drv->state_count; i++) {
++		float res = float_max(
++			(float)drv->states[i].target_residency_ns, 1.0f);
++		d->log2_cost[i] = fast_log2f(res);
++	}
++
++	/*
++	 * MoE expert boundaries — 3-way split.
++	 *
++	 * Expert 0 (short): tick-bound idles where measured residency
++	 *   is dominated by the next tick rather than the workload's
++	 *   true idle duration.  Boundary: log2(TICK_NSEC).
++	 *
++	 * Expert 1 (long): nohz idles in intermediate C-states.
++	 *
++	 * Expert 2 (deep): idles targeting the deepest C-state.
++	 *   The deepest state often has qualitatively different
++	 *   residency characteristics (package C-state, longer
++	 *   exit latency, power-gated domains) that warrant a
++	 *   dedicated expert to avoid gradient interference with
++	 *   intermediate states.
++	 *
++	 * Safety: with only 2 C-states (+ POLL), expert_deep is
++	 * placed equal to expert_mid so only two experts are ever
++	 * routed (same behavior as the old 2-expert split).
++	 */
++	if (drv->state_count <= 1) {
++		d->expert_mid = 0.0f;
++		d->expert_deep = 0.0f;
++		return;
++	}
++
++	log2_tick = fast_log2f((float)TICK_NSEC);
++
++	/* Default: deepest state belongs to long expert (safety) */
++	long_start = drv->state_count - 1;
++
++	/* Prefer the first state whose target_residency exceeds one jiffy */
++	for (i = 1; i < drv->state_count; i++) {
++		if (d->log2_cost[i] > log2_tick) {
++			long_start = i;
++			break;
++		}
++	}
++
++	if (long_start > 1) {
++		/* Normal case: boundary between last short and first long */
++		d->expert_mid = (d->log2_cost[long_start - 1] +
++				 d->log2_cost[long_start]) / 2.0f;
++	} else {
++		/*
++		 * long_start == 1: even the shallowest C-state already
++		 * exceeds one jiffy.  All NN-handled idles go to the
++		 * long expert; place the boundary just below C1's
++		 * residency so the short expert remains routable but
++		 * unused.
++		 */
++		d->expert_mid = d->log2_cost[1] - 1.0f;
++	}
++
++	/*
++	 * Deep expert boundary — deepest C-state split.
++	 *
++	 * When there are >= 3 C-states (state_count >= 4, counting POLL),
++	 * place the boundary at the midpoint between the second-deepest
++	 * and deepest state's log2(target_residency).  The deep expert
++	 * then exclusively handles sleep durations long enough to reach
++	 * the deepest state.
++	 *
++	 * With only 2 C-states, expert_deep == expert_mid collapses to
++	 * the 2-expert regime (nap_fpu_select() then never picks expert 1).
++	 */
++	deep_idx = drv->state_count - 1;
++	if (deep_idx >= 3) {
++		/* >= 3 C-states: split before the deepest */
++		d->expert_deep = (d->log2_cost[deep_idx - 1] +
++				  d->log2_cost[deep_idx]) / 2.0f;
++		/* Ensure deep >= mid ordering */
++		if (d->expert_deep <= d->expert_mid)
++			d->expert_deep = d->expert_mid;
++	} else {
++		/* <= 2 C-states: collapse deep into long */
++		d->expert_deep = d->expert_mid;
++	}
++}
++
++/* ================================================================
++ * Feature extraction helpers
++ * ================================================================ */
++
++struct logring_stats {
++	float avg;
++	float min;
++	float max;
++};
++
++/*
++ * Compute log_history statistics: avg, min, max.
++ * SIMD fast path when the ring buffer is full (8 elements = 2 × xmm).
++ */
++static void logring_compute(const struct nap_cpu_data *d,
++			    struct logring_stats *s)
++{
++	int i, n = d->hist_count;
++	float sum;
++
++	if (n == 0) {
++		*s = (struct logring_stats){ 0 };
++		return;
++	}
++
++	if (n == NAP_HISTORY_SIZE) {
++		v4sf v0 = *(const v4sf *)&d->log_history[0];
++		v4sf v1 = *(const v4sf *)&d->log_history[4];
++		v4sf pmin, pmax, psum, t;
++
++		pmin = __builtin_ia32_minps(v0, v1);
++		pmax = __builtin_ia32_maxps(v0, v1);
++		psum = v0 + v1;
++
++		/* 4 → 2 */
++		t = __builtin_ia32_movhlps(pmin, pmin);
++		pmin = __builtin_ia32_minps(pmin, t);
++		t = __builtin_ia32_movhlps(pmax, pmax);
++		pmax = __builtin_ia32_maxps(pmax, t);
++		t = __builtin_ia32_movhlps(psum, psum);
++		psum = psum + t;
++
++		/* 2 → 1 */
++		t = __builtin_ia32_shufps(pmin, pmin, 0x55);
++		pmin = __builtin_ia32_minps(pmin, t);
++		t = __builtin_ia32_shufps(pmax, pmax, 0x55);
++		pmax = __builtin_ia32_maxps(pmax, t);
++		t = __builtin_ia32_shufps(psum, psum, 0x55);
++		psum = psum + t;
++
++		sum = psum[0];
++		s->min = pmin[0];
++		s->max = pmax[0];
++	} else {
++		float val;
++
++		sum = d->log_history[0];
++		s->min = sum;
++		s->max = sum;
++
++		for (i = 1; i < n; i++) {
++			val = d->log_history[i];
++			sum += val;
++			s->min = float_min(s->min, val);
++			s->max = float_max(s->max, val);
++		}
++	}
++
++	s->avg = sum / (float)n;
++}
++
++/*
++ * Extract 8 input features for the MLP.
++ *
++ *   [0] log2(sleep_length)           — next timer event
++ *   [1] log2(last_residency)         — actual duration of last idle
++ *   [2] log_hist avg                 — average recent idle duration
++ *   [3] log_hist min                 — shortest recent idle
++ *   [4] log_hist max                 — longest recent idle
++ *   [5] signed log2(|pred_error|+1)  — prediction feedback
++ *   [6] log2(busy_ns)               — pre-idle busy duration
++ *   [7] log2(lat_req) - log2(deepest_lat) — PM QoS headroom
++ */
++static void nap_extract_features(struct cpuidle_driver *drv,
++				 struct cpuidle_device *dev,
++				 float out[NAP_INPUT_SIZE],
++				 s64 latency_req)
++{
++	struct nap_cpu_data *d = this_cpu_ptr(&nap_data);
++	struct logring_stats lr;
++	ktime_t sleep_length, delta_tick;
++	u64 busy_ns;
++	float log_inputs[4] __aligned(16);
++	float log_results[4] __aligned(16);
++
++	sleep_length = tick_nohz_get_sleep_length(&delta_tick);
++	busy_ns = local_clock() - d->prev_idle_exit;
++
++	/*
++	 * SSE log2 batch: 4 values in one fast_log2f_sse call.
++	 *   [0] sleep_length   → out[0]
++	 *   [1] last_residency → out[1], also stored to log_history
++	 *   [2] busy_ns        → out[6]
++	 *   [3] |pred_error_us| + 1 → out[5] (sign restored after)
++	 */
++	{
++		float err_f = (float)(d->last_prediction_error / 1000);
++		float abs_err = (err_f >= 0.0f) ? err_f : -err_f;
++
++		log_inputs[0] = float_max((float)ktime_to_ns(sleep_length), 1.0f);
++		log_inputs[1] = float_max((float)dev->last_residency_ns, 1.0f);
++		log_inputs[2] = float_max((float)busy_ns, 1.0f);
++		log_inputs[3] = abs_err + 1.0f;
++
++		{
++			v4sf log_in  = *(const v4sf *)log_inputs;
++			v4sf log_out = fast_log2f_sse(log_in);
++			*(v4sf *)log_results = log_out;
++		}
++
++		out[0] = log_results[0];
++		out[1] = log_results[1];
++		out[6] = log_results[2];
++
++		/* out[5]: sign-preserving log2(|err_us| + 1) */
++		{
++			union { float f; u32 i; } res = { .f = log_results[3] };
++			union { float f; u32 i; } sgn = { .f = err_f };
++
++			res.i |= sgn.i & 0x80000000u;
++			out[5] = res.f;
++		}
++	}
++
++	/* Update log_history ring buffer */
++	{
++		int prev = (d->hist_idx - 1 + NAP_HISTORY_SIZE) % NAP_HISTORY_SIZE;
++		d->log_history[prev] = log_results[1];
++	}
++
++	/* Compute log_history statistics: avg, min, max */
++	logring_compute(d, &lr);
++	out[2] = lr.avg;
++	out[3] = lr.min;
++	out[4] = lr.max;
++
++	/* out[7]: log2(latency_req) - log2(deepest_lat), 0 if unconstrained */
++	{
++		u64 deepest_lat = drv->states[drv->state_count - 1]
++				      .exit_latency_ns;
++		bool lat_valid = (latency_req < PM_QOS_LATENCY_ANY_NS &&
++				  deepest_lat > 0);
++
++		if (lat_valid)
++			out[7] = fast_log2f(float_max((float)latency_req, 1.0f))
++			       - fast_log2f(float_max((float)deepest_lat, 1.0f));
++		else
++			out[7] = 0.0f;
++	}
++
++	d->last_predicted_ns = ktime_to_ns(sleep_length);
++}
++
++/* ================================================================
++ * FPU entry point for nap_select
++ *
++ * Called within kernel_fpu_begin()/kernel_fpu_end().
++ * Returns: selected idle state index (>= 0); 0 when no deeper
++ *          enabled state fits the latency limit and prediction.
++ * ================================================================ */
++
++int nap_fpu_select(struct cpuidle_driver *drv,
++		   struct cpuidle_device *dev,
++		   struct nap_cpu_data *d)
++{
++	s64 latency_req = cpuidle_governor_latency_req(dev->cpu);
++
++	/* Handle deferred weight reset (set by sysfs or nap_enable) */
++	if (unlikely(d->reset_pending)) {
++		int e;
++
++		for (e = 0; e < NAP_NUM_EXPERTS; e++)
++			nap_init_weights(&d->expert_weights[e]);
++		nap_init_log2_cost(d, drv);
++		d->stats.learn_count = 0;
++		d->needs_learn = false;
++		d->reset_pending = false;
++	}
++
++	/* Deferred learning (always, even during warmup) */
++	if (d->needs_learn) {
++		float log2_eff = d->nn_output;
++		float alpha = (float)d->overshoot_pctl_millths
++			      / 1000.0f;
++		int nn_selected = 0;
++		bool is_overshoot;
++		int i;
++
++		/* Simulate which state the NN selected */
++		for (i = drv->state_count - 1; i > 0; i--) {
++			if (d->log2_cost[i] <= log2_eff) {
++				nn_selected = i;
++				break;
++			}
++		}
++
++		/*
++		 * Direct overshoot loss.
++		 *
++		 * Base the gradient on whether the simulated state
++		 * selection actually caused overshoot
++		 * (actual < target_residency).
++		 *
++		 * The asymmetric weight is encoded in the learning
++		 * rate (not in d_out) so that gradient clamping
++		 * cannot destroy the asymmetry.  d_out is ±1 and
++		 * gets clipped symmetrically; the (1-α) vs α ratio
++		 * is preserved through learn_lr.
++		 *
++		 * At equilibrium, P(overshoot) converges to α.
++		 * α = overshoot_pctl / 1000.
++		 */
++		{
++			float base_lr = (float)d->learning_rate_millths
++					/ 1000.0f;
++
++			is_overshoot = (nn_selected > 0 &&
++				d->learn_actual_ns <
++				drv->states[nn_selected].target_residency_ns);
++
++			/*
++			 * When the output was clamped at the upper
++			 * limit (nn_output == features[0]), the NN
++			 * is already predicting the maximum possible
++			 * sleep time.  Non-overshoot events would
++			 * push weights UP, but the output cannot
++			 * actually increase.  Suppress this gradient
++			 * to prevent unbounded weight growth in idle
++			 * systems where natural overshoot rate < α.
++			 *
++			 * Overshoot events still learn normally
++			 * (push DOWN) even when clamped.
++			 */
++			if (d->output_clamped && !is_overshoot) {
++				d->learn_lr = 0;
++				d->learn_d_out = 0;
++			} else {
++				d->learn_d_out = is_overshoot
++					? 1.0f : -1.0f;
++				d->learn_lr = is_overshoot
++					? base_lr * (1.0f - alpha)
++					: base_lr * alpha;
++			}
++		}
++
++		d->stats.learn_count++;
++
++		nap_nn_learn(d);
++		d->needs_learn = false;
++	}
++
++	/*
++	 * Feature extraction + NN forward pass.
++	 * features_f32 is __aligned(32) in nap_cpu_data, satisfying
++	 * AVX2 vmovaps (32-byte) requirements.
++	 */
++	nap_extract_features(drv, dev, d->features_f32, latency_req);
++
++	/* MoE: 3-way expert selection based on log2(sleep_length) */
++	if (d->features_f32[0] >= d->expert_deep)
++		d->active_expert = 2;		/* deep: deepest C-state */
++	else if (d->features_f32[0] >= d->expert_mid)
++		d->active_expert = 1;		/* long: nohz intermediate */
++	else
++		d->active_expert = 0;		/* short: tick-bound */
++	d->active_w = &d->expert_weights[d->active_expert];
++
++	nap_nn_forward(d->features_f32, &d->nn_output, d->hidden_out,
++		       d->active_w);
++
++	/*
++	 * Clamp NN output: predicted sleep cannot exceed sleep_length
++	 * (next timer event).  features_f32[0] = log2(sleep_length).
++	 *
++	 * Track whether the clamp was applied so the learning block
++	 * can suppress "push up" gradients when the output is already
++	 * at the maximum.  Without this, weights diverge unboundedly
++	 * in idle systems where the natural overshoot rate < alpha.
++	 */
++	d->output_clamped = (d->nn_output > d->features_f32[0]);
++	if (d->output_clamped)
++		d->nn_output = d->features_f32[0];
++
++	/*
++	 * Threshold-based selection using NN predicted sleep time.
++	 *
++	 * The NN directly outputs log2(predicted_sleep) in ns.
++	 * Select the deepest feasible state whose cost ≤ predicted_sleep.
++	 */
++	{
++		float log2_eff = d->nn_output;
++		int idx = 0, i;
++
++		for (i = drv->state_count - 1; i > 0; i--) {
++			if (dev->states_usage[i].disable)
++				continue;
++			if (drv->states[i].exit_latency_ns > latency_req)
++				continue;
++			if (d->log2_cost[i] <= log2_eff) {
++				idx = i;
++				break;
++			}
++		}
++		return idx;
++	}
++}
+diff --git a/drivers/cpuidle/governors/nap/nap_nn_avx2.c b/drivers/cpuidle/governors/nap/nap_nn_avx2.c
+new file mode 100644
+index 0000000000..96e5415423
+--- /dev/null
++++ b/drivers/cpuidle/governors/nap/nap_nn_avx2.c
+@@ -0,0 +1,135 @@
++// SPDX-License-Identifier: GPL-2.0
++/*
++ * nap_nn_avx2.c — AVX2+FMA forward pass and backpropagation for the nap MLP
++ *
++ * 8→8→1 scalar regression (predicted sleep time in log2(ns)).
++ * Uses 256-bit ymm registers: 8 hidden neurons = 1 ymm.
++ * FMA via vfmadd231ps for fused multiply-add.
++ *
++ * Must be called within kernel_fpu_begin/end.
++ * Compiled with: CFLAGS += -mavx2 -mfma
++ */
++
++#include "nap.h"
++
++/* Aligned load/store: GCC translates v8sf* dereference to vmovaps */
++static inline v8sf v8sf_load(const float *p)   { return *(const v8sf *)p; }
++static inline void v8sf_store(float *p, v8sf v) { *(v8sf *)p = v; }
++
++/* FMA: a*b+c — vfmadd231ps: dest = src1 * src2 + dest */
++static inline v8sf v8sf_fmadd(v8sf a, v8sf b, v8sf c)
++{
++	asm("vfmadd231ps %2, %1, %0" : "+x"(c) : "x"(a), "xm"(b));
++	return c;
++}
++
++/* ymm clamp: max(min(v, hi), lo) */
++static inline v8sf v8sf_clamp(v8sf v, v8sf lo, v8sf hi)
++{
++	return __builtin_ia32_maxps256(__builtin_ia32_minps256(v, hi), lo);
++}
++
++void nap_nn_forward_avx2(const float *input,
++			 float *output,
++			 float *hidden_save,
++			 const struct nap_weights *w)
++{
++	int j;
++
++	/* === Hidden layer: 8 outputs = 1×ymm, 2-way accumulator === */
++	v8sf acc0 = v8sf_load(&w->b_h1[0]);
++	v8sf acc1 = V8SF_ZERO;
++
++	for (j = 0; j < NAP_INPUT_SIZE; j += 2) {
++		v8sf x0 = V8SF_SET1(input[j]);
++		v8sf x1 = V8SF_SET1(input[j + 1]);
++
++		acc0 = v8sf_fmadd(v8sf_load(&w->w_h1[j][0]),     x0, acc0);
++		acc1 = v8sf_fmadd(v8sf_load(&w->w_h1[j + 1][0]), x1, acc1);
++	}
++
++	/* Merge accumulators + ReLU */
++	{
++		v8sf h = __builtin_ia32_maxps256(acc0 + acc1, V8SF_ZERO);
++
++		v8sf_store(hidden_save, h);
++
++		/* === Output layer: dot(hidden[8], w_out[8]) + b_out === */
++		{
++			v8sf p = v8sf_load(&w->w_out[0]) * h;
++
++			/* Horizontal reduce: 8 → 4 → scalar */
++			v4sf lo = __builtin_ia32_vextractf128_ps256(p, 0);
++			v4sf hi = __builtin_ia32_vextractf128_ps256(p, 1);
++			v4sf s4 = lo + hi;
++
++			*output = s4[0] + s4[1] + s4[2] + s4[3] + w->b_out;
++		}
++	}
++}
++
++/*
++ * Online learning (backpropagation) — AVX2+FMA
++ *
++ * Output: scalar d_out (pre-computed by caller)
++ * Hidden layer: 8 neurons = 1×ymm
++ */
++void nap_nn_learn_avx2(struct nap_cpu_data *d)
++{
++	int i;
++	float d_out_scalar = d->learn_d_out;
++	float *d_hid = d->learn_d_hid;
++	float lr = d->learn_lr;
++	float clamp_val = (float)d->max_grad_norm_millths / 1000.0f;
++	v8sf v_neg_lr = V8SF_SET1(-lr);
++	v8sf v_cl_hi  = V8SF_SET1(clamp_val);
++	v8sf v_cl_lo  = V8SF_SET1(-clamp_val);
++
++	/*
++	 * Hidden gradient: d_hid[j] = relu'(h[j]) * w_out[j] * d_out.
++	 * vcmpps + vandps: branchless SIMD mask (1×ymm = 8 neurons).
++	 */
++	v8sf dh;
++	{
++		v8sf vd = V8SF_SET1(d_out_scalar);
++		v8sf g = v8sf_load(&d->active_w->w_out[0]) * vd;
++		v8sf mask = __builtin_ia32_cmpps256(
++				v8sf_load(&d->hidden_out[0]), V8SF_ZERO, 14);
++
++		asm("vandps %2, %1, %0" : "=x"(dh) : "x"(g), "xm"(mask));
++		v8sf_store(d_hid, dh);
++	}
++
++	/* Output weight update: w_out[j] -= lr * clamp(h[j] * d_out) */
++	{
++		v8sf vd = V8SF_SET1(d_out_scalar);
++		v8sf *w = (v8sf *)&d->active_w->w_out[0];
++
++		*w = v8sf_fmadd(v_neg_lr,
++				v8sf_clamp(v8sf_load(&d->hidden_out[0]) * vd,
++					   v_cl_lo, v_cl_hi),
++				*w);
++	}
++
++	/* Output bias update (scalar) */
++	d->active_w->b_out -= lr * fclampf(d_out_scalar, -clamp_val, clamp_val);
++
++	/* Hidden weight update: w_h1[i][j] -= lr * clamp(feat[i] * d_hid[j]) */
++	for (i = 0; i < NAP_INPUT_SIZE; i++) {
++		v8sf vf = V8SF_SET1(d->features_f32[i]);
++		v8sf *w = (v8sf *)&d->active_w->w_h1[i][0];
++
++		*w = v8sf_fmadd(v_neg_lr,
++				v8sf_clamp(vf * dh, v_cl_lo, v_cl_hi),
++				*w);
++	}
++
++	/* Hidden bias update */
++	{
++		v8sf *b = (v8sf *)&d->active_w->b_h1[0];
++
++		*b = v8sf_fmadd(v_neg_lr,
++				v8sf_clamp(dh, v_cl_lo, v_cl_hi),
++				*b);
++	}
++}
+diff --git a/drivers/cpuidle/governors/nap/nap_nn_sse2.c b/drivers/cpuidle/governors/nap/nap_nn_sse2.c
+new file mode 100644
+index 0000000000..a9fffb3b98
+--- /dev/null
++++ b/drivers/cpuidle/governors/nap/nap_nn_sse2.c
+@@ -0,0 +1,136 @@
++// SPDX-License-Identifier: GPL-2.0
++/*
++ * nap_nn_sse2.c — SSE2 forward pass and backpropagation for the nap MLP
++ *
++ * 8→8→1 scalar regression (predicted sleep time in log2(ns)).
++ * Baseline implementation using SSE2, which is always available on x86_64.
++ * No FMA — uses separate mul + add (2 instructions per MAC).
++ *
++ * Must be called within kernel_fpu_begin/end.
++ * Compiled with: CFLAGS += -msse2
++ */
++
++#include "nap.h"
++
++/* Aligned load/store */
++static inline v4sf v4sf_load(const float *p)   { return *(const v4sf *)p; }
++static inline void v4sf_store(float *p, v4sf v) { *(v4sf *)p = v; }
++
++/* ReLU helper */
++static inline v4sf v4sf_max(v4sf a, v4sf b)
++{
++	return __builtin_ia32_maxps(a, b);
++}
++
++void nap_nn_forward_sse2(const float *input,
++			 float *output,
++			 float *hidden_save,
++			 const struct nap_weights *w)
++{
++	int j;
++
++	/* === Hidden layer: 8 outputs = 2×xmm === */
++	v4sf acc0 = v4sf_load(&w->b_h1[0]);
++	v4sf acc1 = v4sf_load(&w->b_h1[4]);
++
++	for (j = 0; j < NAP_INPUT_SIZE; j++) {
++		v4sf x = V4SF_SET1(input[j]);
++		acc0 += v4sf_load(&w->w_h1[j][0]) * x;
++		acc1 += v4sf_load(&w->w_h1[j][4]) * x;
++	}
++
++	/* ReLU */
++	{
++		v4sf zero = V4SF_SET1(0.0f);
++
++		acc0 = v4sf_max(acc0, zero);
++		acc1 = v4sf_max(acc1, zero);
++	}
++	v4sf_store(&hidden_save[0], acc0);
++	v4sf_store(&hidden_save[4], acc1);
++
++	/* === Output layer: dot(hidden[8], w_out[8]) + b_out → 1 scalar === */
++	{
++		v4sf p0 = v4sf_load(&w->w_out[0]) * acc0;
++		v4sf p1 = v4sf_load(&w->w_out[4]) * acc1;
++		v4sf sum = p0 + p1;
++
++		*output = sum[0] + sum[1] + sum[2] + sum[3] + w->b_out;
++	}
++}
++
++/*
++ * Online learning (backpropagation) — SSE2
++ *
++ * Output: scalar d_out (pre-computed by caller)
++ * Hidden layer: 8 neurons = 2×xmm
++ */
++void nap_nn_learn_sse2(struct nap_cpu_data *d)
++{
++	int i;
++	float d_out_scalar = d->learn_d_out;
++	float *d_hid = d->learn_d_hid;
++	float lr = d->learn_lr;
++	float clamp_val = (float)d->max_grad_norm_millths / 1000.0f;
++	v4sf v_lr    = V4SF_SET1(lr);
++	v4sf v_cl_hi = V4SF_SET1(clamp_val);
++	v4sf v_cl_lo = V4SF_SET1(-clamp_val);
++
++	/*
++	 * Hidden gradient: d_hid[j] = relu'(h[j]) * w_out[j] * d_out.
++	 * Must be computed before output weight update to use pre-update
++	 * w_out.
++	 */
++	{
++		v4sf vd = V4SF_SET1(d_out_scalar);
++		v4sf zero = V4SF_SET1(0.0f);
++		v4sf h, g;
++		v4si m;
++
++		h = v4sf_load(&d->hidden_out[0]);
++		g = v4sf_load(&d->active_w->w_out[0]) * vd;
++		m = (v4si)(h > zero);
++		v4sf_store(&d_hid[0], v4si_as_v4sf(v4sf_as_v4si(g) & m));
++
++		h = v4sf_load(&d->hidden_out[4]);
++		g = v4sf_load(&d->active_w->w_out[4]) * vd;
++		m = (v4si)(h > zero);
++		v4sf_store(&d_hid[4], v4si_as_v4sf(v4sf_as_v4si(g) & m));
++	}
++
++	/* Output weight update: w_out[j] -= lr * clamp(h[j] * d_out) */
++	{
++		v4sf vd = V4SF_SET1(d_out_scalar);
++		v4sf *w = (v4sf *)&d->active_w->w_out[0];
++
++		w[0] -= v_lr * v4sf_clamp(v4sf_load(&d->hidden_out[0]) * vd,
++					  v_cl_lo, v_cl_hi);
++		w[1] -= v_lr * v4sf_clamp(v4sf_load(&d->hidden_out[4]) * vd,
++					  v_cl_lo, v_cl_hi);
++	}
++
++	/* Output bias update: b_out -= lr * clamp(d_out) */
++	d->active_w->b_out -= lr * fclampf(d_out_scalar, -clamp_val, clamp_val);
++
++	/* Hidden weight update: w_h1[i][j] -= lr * clamp(feat[i] * d_hid[j]) */
++	{
++		v4sf dh0 = *(const v4sf *)&d_hid[0];
++		v4sf dh1 = *(const v4sf *)&d_hid[4];
++
++		for (i = 0; i < NAP_INPUT_SIZE; i++) {
++			v4sf vf = V4SF_SET1(d->features_f32[i]);
++			v4sf *w = (v4sf *)&d->active_w->w_h1[i][0];
++
++			w[0] -= v_lr * v4sf_clamp(vf * dh0, v_cl_lo, v_cl_hi);
++			w[1] -= v_lr * v4sf_clamp(vf * dh1, v_cl_lo, v_cl_hi);
++		}
++
++		/* Hidden bias update: b_h1[j] -= lr * clamp(d_hid[j]) */
++		{
++			v4sf *b = (v4sf *)&d->active_w->b_h1[0];
++
++			b[0] -= v_lr * v4sf_clamp(dh0, v_cl_lo, v_cl_hi);
++			b[1] -= v_lr * v4sf_clamp(dh1, v_cl_lo, v_cl_hi);
++		}
++	}
++}
+-- 
+2.34.1
diff --git a/PKGBUILD b/PKGBUILD
index 943b69b..9017452 100755
--- a/PKGBUILD
+++ b/PKGBUILD
@@ -94,6 +94,7 @@ source=(
   "git+https://github.com/forkymcforkface/xpad-noone.git#commit=8e903676dd9514c07ce5e06e43c5f7d8cc51cb7d"
   "git+https://github.com/atar-axis/xpadneo.git#tag=v$_xpadneo_version"
    6.16-poc-selector-v2.6.1.patch 
+   6.16-nap-v0.4.0.patch
 )
 sha256sums=('a69eea3b189ab64e65608140d6cd7c57823d1b39b361e876197eec1b4d1db957'
             '37452b4d09e5e42134ae24a61f2f656790837c327268074cf79d7dab3558b972'
@@ -141,7 +142,8 @@ sha256sums=('a69eea3b189ab64e65608140d6cd7c57823d1b39b361e876197eec1b4d1db957'
             '7ba61ccf2ddb508d6adb30906d3d57dc0ce1bc64a6d1a41796eb94a8584ea63b'
             '1055bbbd32985017f4501d375648873bd598db084177d302aeeade56b47920e1'
             '26b3a811d38471a42229fa037cb6d2bb5ff78f19f45a17c7f263339ee67769a7'
-            '14dabfb0452a3a817e8d809fb28eb7565512e95386d789c627b62baf136e001f')
+            '14dabfb0452a3a817e8d809fb28eb7565512e95386d789c627b62baf136e001f'
+            'e3a353432be799ba938f6cb2495f07e531ba456818500008f09bf6b6a8632862')
 
 export KBUILD_BUILD_HOST=archlinux
 export KBUILD_BUILD_USER=$pkgbase

From 8ca236f29872207953684b524d82dc82718eb190 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jorge=20Luis=20Silv=C3=A9rio=20de=20Lima?=
 <jorgezarpon@msn.com>
Date: Fri, 15 May 2026 17:50:08 -0300
Subject: [PATCH 3/5] Enable high-performance idle CPU selector

---
 PKGBUILD        | 2 +-
 config-charcoal | 5 ++++-
 2 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/PKGBUILD b/PKGBUILD
index 9017452..e6a7ba5 100755
--- a/PKGBUILD
+++ b/PKGBUILD
@@ -99,7 +99,7 @@ source=(
 sha256sums=('a69eea3b189ab64e65608140d6cd7c57823d1b39b361e876197eec1b4d1db957'
             '37452b4d09e5e42134ae24a61f2f656790837c327268074cf79d7dab3558b972'
             'd88eaf0f94bae470040e4882f334c05b1bb2ab0a99e4b7299aa0b2337810ab8d'
-            'fd18272b72f7226a9b00bf676ed74b961c666402d0fdea0846aa50ff7a8f3758'
+            '5e04417ff3a3416b64ff26112825842ccc6ad353a2f86d27a6412a5b7fda30d1'
             'b831de1b98a2f77f636f4780e37ebfcb3a6829f94f5423eb04c4b26e64ac43b8'
             '52cbbf41450806d766260bc4f1ea055f6f9fdd55d37ad831840b16d505beb0cc'
             '0a6a7408ccc0c94b5cce50dabc7ee318abcc1b9eaaedd3d83fd7e7d5a73b4d4f'
diff --git a/config-charcoal b/config-charcoal
index 2e16643..9542d7d 100755
--- a/config-charcoal
+++ b/config-charcoal
@@ -31,9 +31,12 @@ CONFIG_LTO_CLANG_FULL=y
 # Enable Polly
 CONFIG_POLLY_CLANG=y
 
-# Change CPU IDLE goernor
+# Change CPU IDLE governor
 CONFIG_CPU_IDLE_GOV_LADDER=n
 CONFIG_CPU_IDLE_GOV_MENU=n
+CONFIG_CPU_IDLE_GOV_NAP=y
+CONFIG_CPU_IDLE_DEFAULT_GOVERNOR="nap"
+CONFIG_SCHED_POC_SELECTOR=y
 
 # Enable Zen kernel tuning
 CONFIG_ZEN_INTERACTIVE=y

From 8e5af07c97dbbc2afea35bfc102829e0e0cb57a5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jorge=20Luis=20Silv=C3=A9rio=20de=20Lima?=
 <jorgezarpon@msn.com>
Date: Wed, 13 May 2026 02:09:08 -0300
Subject: [PATCH 4/5] Add poc link to README

---
 README.md | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/README.md b/README.md
index 19a9199..4418a65 100644
--- a/README.md
+++ b/README.md
@@ -13,6 +13,8 @@ Works on Steam Deck and possibly other AMD based handheld PCs.
 - Add [re-swappiness](https://github.com/firelzrd/re-swappiness)
 - Add [zram-ir](https://github.com/firelzrd/zram-ir)
 - Add [kcompressd-unofficial](https://github.com/firelzrd/kcompressd-unofficial)
+
+- Add [poc selectors](https://github.com/firelzrd/poc-selector)
 - Switch default DRM scheduling policy to round-robin
 - Optimize kernel with -O3 (from tkg)
 - Optimize for Zen 2 (from Gentoo)

From 4bfb080e38336591231d5db5a02a0c923701a922 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jorge=20Luis=20Silv=C3=A9rio=20de=20Lima?=
 <jorgezarpon@msn.com>
Date: Fri, 15 May 2026 14:10:22 -0300
Subject: [PATCH 5/5] Add nap link to README

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 4418a65..21f9b3e 100644
--- a/README.md
+++ b/README.md
@@ -13,8 +13,8 @@ Works on Steam Deck and possibly other AMD based handheld PCs.
 - Add [re-swappiness](https://github.com/firelzrd/re-swappiness)
 - Add [zram-ir](https://github.com/firelzrd/zram-ir)
 - Add [kcompressd-unofficial](https://github.com/firelzrd/kcompressd-unofficial)
-
 - Add [poc selectors](https://github.com/firelzrd/poc-selector)
+- Add [nap](https://github.com/firelzrd/nap)
 - Switch default DRM scheduling policy to round-robin
 - Optimize kernel with -O3 (from tkg)
 - Optimize for Zen 2 (from Gentoo)