From a6d0396fac324599a52c41dd474d6ca127265190 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jorge=20Luis=20Silv=C3=A9rio=20de=20Lima?= Date: Tue, 12 May 2026 16:55:10 -0300 Subject: [PATCH 1/5] Backport POC selector changes from 6.18.3 to 6.16 --- 6.16-poc-selector-v2.6.1.patch | 2575 ++++++++++++++++++++++++++++++++ PKGBUILD | 4 +- 2 files changed, 2578 insertions(+), 1 deletion(-) create mode 100644 6.16-poc-selector-v2.6.1.patch diff --git a/6.16-poc-selector-v2.6.1.patch b/6.16-poc-selector-v2.6.1.patch new file mode 100644 index 0000000..53617b5 --- /dev/null +++ b/6.16-poc-selector-v2.6.1.patch @@ -0,0 +1,2575 @@ +From 854c284516887b6bdf6a3a3f2507ce873151a4d5 Mon Sep 17 00:00:00 2001 +From: Masahito S +Date: Mon, 27 Apr 2026 11:07:36 +0900 +Subject: [PATCH] 6.18.3-poc-selector-v2.6.1 + +--- + include/linux/sched/topology.h | 50 +- + init/Kconfig | 13 + + kernel/sched/ext.c | 7 + + kernel/sched/fair.c | 164 ++- + kernel/sched/idle.c | 10 + + kernel/sched/poc_selector.c | 1788 ++++++++++++++++++++++++++++++++ + kernel/sched/sched.h | 110 ++ + kernel/sched/topology.c | 226 ++++ + 8 files changed, 2330 insertions(+), 38 deletions(-) + create mode 100644 kernel/sched/poc_selector.c + +diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h +index bbcfdf12aa..c10c871dea 100644 +--- a/include/linux/sched/topology.h ++++ b/include/linux/sched/topology.h +@@ -66,8 +66,54 @@ struct sched_group; + struct sched_domain_shared { + atomic_t ref; + atomic_t nr_busy_cpus; +- int has_idle_cores; +- int nr_idle_scan; ++ int has_idle_cores; ++ int nr_idle_scan; ++#ifdef CONFIG_SCHED_POC_SELECTOR ++ /* ++ * POC Selector: per-LLC idle CPU tracking ++ */ ++ u64 poc_llc_members; /* bitmask of valid CPUs (relative to base) */ ++ int poc_cpu_base; /* smallest CPU ID in this LLC */ ++ u8 poc_affinity_shift; /* bit shift for cpumask alignment */ ++ bool poc_fast_eligible; /* true when LLC CPU count <= 64 */ ++ bool poc_cluster_valid; /* true when cluster mask is usable 
*/ ++#ifdef CONFIG_SCHED_SMT ++ u8 poc_smt_shift; /* bit distance between SMT siblings */ ++ u64 poc_primary_mask; /* bitmask of core representative CPUs */ ++#endif ++ ++ /* ++ * Hot write path: idle state flag arrays (lock-free mode). ++ * Each array = exactly 1 cache line (64B). ++ * Writers: WRITE_ONCE (plain MOV, no LOCK prefix). ++ * Readers: snapshot to stack, then multiply-and-shift aggregation. ++ * Active only when sched_poc_lockless_bitmap=1. ++ */ ++ u8 poc_idle_cpus[64] ____cacheline_aligned; ++#ifdef CONFIG_SCHED_SMT ++ u8 poc_idle_cores[64] ____cacheline_aligned; ++#endif /* CONFIG_SCHED_SMT */ ++ ++ /* ++ * Hot read/write path: idle state bitmaps (bitmap mode, default). ++ * Readers: single atomic64_read (MOV on x86). ++ * Writers: atomic64_or / atomic64_andnot (LOCK'd on x86). ++ * Active only when sched_poc_lockless_bitmap=0. ++ */ ++ atomic64_t poc_idle_cpus_mask ____cacheline_aligned; ++#ifdef CONFIG_SCHED_SMT ++ atomic64_t poc_idle_cores_mask ____cacheline_aligned; ++#endif /* CONFIG_SCHED_SMT */ ++ ++ /* ++ * Read-only lookup tables (written once at init). ++ * Cacheline-aligned for exact prefetch targeting. ++ */ ++ u64 poc_cluster_mask[64] ____cacheline_aligned; ++#ifdef CONFIG_SCHED_SMT ++ u64 poc_smt_mask[64] ____cacheline_aligned; ++#endif /* CONFIG_SCHED_SMT */ ++#endif /* CONFIG_SCHED_POC_SELECTOR */ + }; + + struct sched_domain { +diff --git a/init/Kconfig b/init/Kconfig +index cab3ad28ca..991fe7f8a4 100644 +--- a/init/Kconfig ++++ b/init/Kconfig +@@ -1435,6 +1435,19 @@ config SCHED_AUTOGROUP + desktop applications. Task group autogeneration is currently based + upon task session. + ++config SCHED_POC_SELECTOR ++ bool "Piece-Of-Cake Fast Idle CPU Selector" ++ depends on SMP ++ default y ++ help ++ Idle CPU selector using cached bitmasks inspired by the scx_cake BPF ++ scheduler. Reduces select_idle_cpu overhead by using bitmap scanning. 
++ ++ This optimization does not affect scheduler fairness - it only ++ speeds up the process of finding an idle CPU for task wakeup. ++ ++ If unsure, say Y. ++ + config RELAY + bool "Kernel->user space relay support (formerly relayfs)" + select IRQ_WORK +diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c +index b959a70471..ce89f9627f 100644 +--- a/kernel/sched/ext.c ++++ b/kernel/sched/ext.c +@@ -4076,6 +4076,9 @@ static void scx_disable_workfn(struct kthread_work *work) + mutex_unlock(&scx_enable_mutex); + + WARN_ON_ONCE(scx_set_enable_state(SCX_DISABLED) != SCX_DISABLING); ++#ifdef CONFIG_SCHED_POC_SELECTOR ++ poc_notify_scx(false); ++#endif + done: + scx_bypass(false); + } +@@ -4814,6 +4817,10 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link) + if (!(ops->flags & SCX_OPS_SWITCH_PARTIAL)) + static_branch_enable(&__scx_switched_all); + ++#ifdef CONFIG_SCHED_POC_SELECTOR ++ poc_notify_scx(true); ++#endif ++ + pr_info("sched_ext: BPF scheduler \"%s\" enabled%s\n", + sch->ops.name, scx_switched_all() ? "" : " (partial)"); + kobject_uevent(&sch->kobj, KOBJ_ADD); +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index 967ca52fb2..1afd1838f5 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -49,6 +49,7 @@ + #include + #include + #include ++#include + + #include + +@@ -1064,7 +1065,7 @@ static bool update_deadline(struct cfs_rq *cfs_rq, struct sched_entity *se) + + #include "pelt.h" + +-static int select_idle_sibling(struct task_struct *p, int prev_cpu, int cpu); ++static int select_idle_sibling(struct task_struct *p, int prev_cpu, int cpu, int sync); + static unsigned long task_h_load(struct task_struct *p); + static unsigned long capacity_of(int cpu); + +@@ -7601,6 +7602,20 @@ void __update_idle_core(struct rq *rq) + rcu_read_unlock(); + } + ++/* ++ * Check if the entire core (all SMT siblings) containing @cpu is idle. 
++ */ ++static inline bool is_idle_core(int cpu) ++{ ++ int sibling; ++ ++ for_each_cpu(sibling, cpu_smt_mask(cpu)) { ++ if (!available_idle_cpu(sibling)) ++ return false; ++ } ++ return true; ++} ++ + /* + * Scan the entire LLC domain for idle cores; this dynamically switches off if + * there are no idle cores left in the system; tracked through +@@ -7668,6 +7683,11 @@ static inline bool test_idle_cores(int cpu) + return false; + } + ++static inline bool is_idle_core(int cpu) ++{ ++ return (available_idle_cpu(cpu) || sched_idle_cpu(cpu)); ++} ++ + static inline int select_idle_core(struct task_struct *p, int core, struct cpumask *cpus, int *idle_cpu) + { + return __select_idle_cpu(core, p); +@@ -7817,16 +7837,38 @@ static inline bool asym_fits_cpu(unsigned long util, + return true; + } + ++#ifdef CONFIG_SCHED_POC_SELECTOR ++#include "poc_selector.c" ++#endif + /* + * Try and locate an idle core/thread in the LLC cache domain. + */ +-static int select_idle_sibling(struct task_struct *p, int prev, int target) ++static int select_idle_sibling(struct task_struct *p, int prev, int target, int sync) + { + bool has_idle_core = false; + struct sched_domain *sd; + unsigned long task_util, util_min, util_max; + int i, recent_used_cpu, prev_aff = -1; + ++ /* Check a recently used CPU as a potential idle candidate: */ ++ recent_used_cpu = p->recent_used_cpu; ++ p->recent_used_cpu = prev; ++ if (recent_used_cpu != prev && ++ recent_used_cpu != target && ++ cpus_share_cache(recent_used_cpu, target) && ++ (available_idle_cpu(recent_used_cpu) || sched_idle_cpu(recent_used_cpu)) && ++ cpumask_test_cpu(recent_used_cpu, p->cpus_ptr)) { ++#ifdef CONFIG_SCHED_POC_SELECTOR ++ if (!static_branch_likely(&poc_selector_active) || ++ static_branch_unlikely(&sched_poc_early_select)) ++#endif ++ if ((unsigned int)recent_used_cpu < nr_cpumask_bits && ++ is_idle_core(recent_used_cpu)) ++ return recent_used_cpu; ++ } else { ++ recent_used_cpu = -1; ++ } ++ + /* + * On asymmetric system, update 
task utilization because we will check + * that the task fits with CPU's capacity. +@@ -7843,23 +7885,13 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) + */ + lockdep_assert_irqs_disabled(); + +- if ((available_idle_cpu(target) || sched_idle_cpu(target)) && ++#ifdef CONFIG_SCHED_POC_SELECTOR ++ if (static_branch_likely(&poc_selector_active) && ++ static_branch_unlikely(&sched_poc_early_select) && ++ is_idle_core(target) && + asym_fits_cpu(task_util, util_min, util_max, target)) + return target; +- +- /* +- * If the previous CPU is cache affine and idle, don't be stupid: +- */ +- if (prev != target && cpus_share_cache(prev, target) && +- (available_idle_cpu(prev) || sched_idle_cpu(prev)) && +- asym_fits_cpu(task_util, util_min, util_max, prev)) { +- +- if (!static_branch_unlikely(&sched_cluster_active) || +- cpus_share_resources(prev, target)) +- return prev; +- +- prev_aff = prev; +- } ++#endif + + /* + * Allow a per-cpu kthread to stack with the wakee if the +@@ -7877,24 +7909,6 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) + return prev; + } + +- /* Check a recently used CPU as a potential idle candidate: */ +- recent_used_cpu = p->recent_used_cpu; +- p->recent_used_cpu = prev; +- if (recent_used_cpu != prev && +- recent_used_cpu != target && +- cpus_share_cache(recent_used_cpu, target) && +- (available_idle_cpu(recent_used_cpu) || sched_idle_cpu(recent_used_cpu)) && +- cpumask_test_cpu(recent_used_cpu, p->cpus_ptr) && +- asym_fits_cpu(task_util, util_min, util_max, recent_used_cpu)) { +- +- if (!static_branch_unlikely(&sched_cluster_active) || +- cpus_share_resources(recent_used_cpu, target)) +- return recent_used_cpu; +- +- } else { +- recent_used_cpu = -1; +- } +- + /* + * For asymmetric CPU capacity systems, our domain of interest is + * sd_asym_cpucapacity rather than sd_llc. 
+@@ -7919,6 +7933,74 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) + if (!sd) + return target; + ++#ifdef CONFIG_SCHED_POC_SELECTOR ++ { ++ struct sched_domain_shared *sd_share = ++ rcu_dereference(per_cpu(sd_llc_shared, target)); ++ if (static_branch_likely(&poc_selector_active) ++ && !sched_asym_cpucap_active() ++ && sd_share && likely(sd_share->poc_fast_eligible)) { ++ int poc_cpu = select_idle_cpu_poc(target, prev, ++ recent_used_cpu, sync, ++ sd_share, p->cpus_ptr); ++ if (poc_cpu >= 0) { ++ return poc_cpu; ++ } ++ /* ++ * POC returns -2 when the SIS_UTIL overload gate fires ++ * (smt_fallback=0 only). POC has already checked ++ * prev's SMT sibling (Level 4) and decided broader ++ * search is not worthwhile. CFS would reach the same ++ * conclusion, so skip select_idle_smt/select_idle_cpu. ++ * ++ * POC returns -1 for Level 0 saturation (no idle CPUs ++ * in bitmap), but CFS may still find sched_idle CPUs, ++ * so we must NOT skip CFS in that case. ++ */ ++ if (poc_cpu == -2) ++ goto give_up; ++ } else { ++ /* ++ * poc_selector_active is off — POC is either disabled ++ * by sysctl or suppressed while scx is running. ++ * If an scx scheduler called us, flip poc_selector_skip ++ * and schedule a workqueue item to re-enable POC with ++ * bitmap resync. 
++ */ ++ poc_check_skip_fallback(); ++ } ++ } ++ poc_count(POC_FALLBACK); ++#endif /* CONFIG_SCHED_POC_SELECTOR */ ++ ++ if ((unsigned int)recent_used_cpu < nr_cpumask_bits) { ++ if ((available_idle_cpu(recent_used_cpu) || sched_idle_cpu(recent_used_cpu))) { ++ if (is_idle_core(recent_used_cpu)) ++ return recent_used_cpu; ++ /* idle CPU but not idle core → preserve for give_up */ ++ } else { ++ recent_used_cpu = -1; /* not idle → discard */ ++ } ++ } ++ ++ if (sync && is_idle_core(target) && ++ asym_fits_cpu(task_util, util_min, util_max, target)) ++ return target; ++ ++ /* ++ * If the previous CPU is cache affine and idle, don't be stupid: ++ */ ++ if (prev != target && cpus_share_cache(prev, target) && ++ (available_idle_cpu(prev) || sched_idle_cpu(prev)) && ++ asym_fits_cpu(task_util, util_min, util_max, prev)) { ++ ++ if (!static_branch_unlikely(&sched_cluster_active) || ++ cpus_share_resources(prev, target)) ++ return prev; ++ ++ prev_aff = prev; ++ } ++ + if (sched_smt_active()) { + has_idle_core = test_idle_cores(target); + +@@ -7933,6 +8015,9 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) + if ((unsigned)i < nr_cpumask_bits) + return i; + ++#ifdef CONFIG_SCHED_POC_SELECTOR ++give_up: ++#endif + /* + * For cluster machines which have lower sharing cache like L2 or + * LLC Tag, we tend to find an idle CPU in the target's cluster +@@ -7944,6 +8029,13 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) + if ((unsigned int)recent_used_cpu < nr_cpumask_bits) + return recent_used_cpu; + ++#ifdef CONFIG_SCHED_POC_SELECTOR ++ /* Last resort: avoid enqueuing behind RT/DL tasks on target */ ++ if (static_branch_likely(&poc_selector_active) && ++ rt_task(cpu_rq(target)->curr) && ++ prev != target && !rt_task(cpu_rq(prev)->curr)) ++ return prev; ++#endif + return target; + } + +@@ -8628,7 +8720,7 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int wake_flags) + new_cpu = 
sched_balance_find_dst_cpu(sd, p, cpu, prev_cpu, sd_flag); + } else if (wake_flags & WF_TTWU) { /* XXX always ? */ + /* Fast path */ +- new_cpu = select_idle_sibling(p, prev_cpu, new_cpu); ++ new_cpu = select_idle_sibling(p, prev_cpu, new_cpu, sync); + } + rcu_read_unlock(); + +diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c +index c39b089d4f..3fffa1a43f 100644 +--- a/kernel/sched/idle.c ++++ b/kernel/sched/idle.c +@@ -275,6 +275,11 @@ static void do_idle(void) + __current_set_polling(); + tick_nohz_idle_enter(); + ++#ifdef CONFIG_SCHED_POC_SELECTOR ++ /* POC Selector: mark CPU as idle */ ++ set_cpu_idle_state_poc(cpu, 1); ++#endif /* CONFIG_SCHED_POC_SELECTOR */ ++ + while (!need_resched()) { + + /* +@@ -332,6 +337,11 @@ static void do_idle(void) + arch_cpu_idle_exit(); + } + ++#ifdef CONFIG_SCHED_POC_SELECTOR ++ /* POC Selector: mark CPU as busy */ ++ set_cpu_idle_state_poc(cpu, 0); ++#endif /* CONFIG_SCHED_POC_SELECTOR */ ++ + /* + * Since we fell out of the loop above, we know TIF_NEED_RESCHED must + * be set, propagate it into PREEMPT_NEED_RESCHED. +diff --git a/kernel/sched/poc_selector.c b/kernel/sched/poc_selector.c +new file mode 100644 +index 0000000000..5bbd927828 +--- /dev/null ++++ b/kernel/sched/poc_selector.c +@@ -0,0 +1,1788 @@ ++// SPDX-License-Identifier: GPL-2.0 ++/* ++ * Piece-Of-Cake (POC) CPU Selector ++ * ++ * Fast idle CPU selector inspired by RitzDaCat's scx_cake scheduler ++ * "Piece of Cake" - making idle CPU search a piece of cake! ++ * ++ * Tracks idle state in per-LLC atomic64_t bitmaps with lock-free ++ * atomic64_read/or/andnot for O(1) idle CPU lookup. ++ * Supports up to 64 CPUs per LLC (single 64-bit word). ++ * Includes affinity-aware filtering via cpumask intersection. ++ * ++ * When the fast path is not eligible (LLC exceeds 64 CPUs), ++ * returns -1 to let CFS standard select_idle_cpu handle it. 
++ * ++ * Copyright (C) 2026 Masahito Suzuki ++ * ++ * Acknowledgements: ++ * This work is heavily inspired by RitzDaCat's scx_cake scheduler. ++ * ++ * Special thanks to the algorithm inventors whose research enabled ++ * the O(1) techniques used in this implementation: ++ * ++ * - Prashant Pandey, Michael A. Bender, Rob Johnson ++ * ("A Fast x86 Implementation of Select") ++ * ++ * - Daniel Lemire ++ * ("Fast Random Integer Generation in an Interval") ++ */ ++ ++#ifdef CONFIG_SCHED_POC_SELECTOR ++ ++/************************************************************** ++ * Version Information: ++ */ ++ ++#define SCHED_POC_SELECTOR_AUTHOR "Masahito Suzuki" ++#define SCHED_POC_SELECTOR_PROGNAME "Piece-Of-Cake (POC) CPU Selector" ++ ++#define SCHED_POC_SELECTOR_VERSION "2.6.1" ++ ++/************************************************************** ++ * Static keys: ++ */ ++ ++/* ++ * Runtime control: poc_selector_active (static key) ++ * Derived from: sched_poc_selector && !poc_selector_skip ++ * ++ * sched_poc_selector: user-visible sysctl (kernel.sched_poc_selector), ++ * plain bool, default true. ++ * poc_selector_skip: set true while sched_ext is active to avoid ++ * idle-bitmap overhead in do_idle. ++ * poc_selector_active: the actual static key gating all POC hot paths. ++ * Enabled only when sched_poc_selector && !poc_selector_skip. ++ * On enable transition, poc_resync_idle_state() is called. ++ */ ++DEFINE_STATIC_KEY_TRUE(poc_selector_active); ++static bool sched_poc_selector = true; ++static bool poc_selector_skip; ++ ++/* ++ * SMT fallback control: sched_poc_smt_fallback ++ * (sysctl kernel.sched_poc_smt_fallback) ++ * ++ * When enabled, POC bails out to CFS when no idle ++ * cores exist (has_idle_cores == false). CFS then handles ++ * SMT sibling selection via select_idle_smt(prev) and ++ * nr_idle_scan-limited select_idle_cpu(). 
++ * ++ * When disabled (default), POC handles SMT sibling selection ++ * itself, trying prev's SMT sibling for cache locality, then ++ * LLC-wide RR search for remaining idle CPUs. Level 5/6 is ++ * gated by nr_idle_scan (SIS_UTIL): when LLC utilization ++ * exceeds ~85%, broader SMT search is skipped. ++ */ ++DEFINE_STATIC_KEY_FALSE(sched_poc_smt_fallback); ++ ++/* ++ * SMT consecutive layout: sched_poc_smt_consecutive ++ * ++ * When true (default), SMT siblings occupy consecutive LLC-relative ++ * positions (e.g., CPU 0,1 / 2,3 / ...). The idle core mask is ++ * derived from the idle CPU mask via bit-parallel operations: ++ * core_mask = cpu_mask & (cpu_mask >> 1) & 0x5555555555555555ULL ++ * ++ * Disabled at boot if non-consecutive 2-way SMT or >2-way SMT ++ * is detected on any LLC. ++ */ ++DEFINE_STATIC_KEY_TRUE(sched_poc_smt_consecutive); ++ ++/* ++ * SMT uniform 2-way layout: sched_poc_smt_uniform ++ * ++ * When true (default), all cores in every LLC have uniform 2-way SMT ++ * with a constant stride between siblings. The idle core mask is ++ * derived at read time via: ++ * core_mask = cpu_mask & (cpu_mask >> poc_smt_shift) & poc_primary_mask ++ * ++ * This covers both consecutive (stride=1) and stride-N (e.g., Intel ++ * Xeon) layouts without write-path overhead. ++ * ++ * When false (>2-way SMT or non-uniform topology), falls back to ++ * write-time maintenance of poc_idle_cores_mask atomic64_t. ++ * ++ * Disabled at boot if any LLC contains non-2-way or non-uniform SMT. ++ */ ++DEFINE_STATIC_KEY_TRUE(sched_poc_smt_uniform); ++ ++/* ++ * Target CPU sticky: sched_poc_target_sticky ++ * (sysctl kernel.sched_poc_target_sticky) ++ * ++ * When enabled, if the target CPU is idle in the bitmap, return it ++ * immediately — regardless of whether its core is fully idle. ++ * This provides L1 cache affinity: the waking task reuses the CPU ++ * it ran on last, keeping warm TLB/L1/L2 state. 
++ * ++ * Checked after Level 0 (saturation) and before core_mask derivation. ++ * Default: disabled. ++ */ ++DEFINE_STATIC_KEY_FALSE(sched_poc_target_sticky); ++ ++/* ++ * Early select: sched_poc_early_select ++ * (sysctl kernel.sched_poc_early_select) ++ * ++ * When enabled, select_idle_sibling performs idle-core checks ++ * for recent_used_cpu and target BEFORE entering POC search: ++ * - recent_used_cpu with fully idle core → return immediately ++ * (matches upstream CFS Gate 4 behavior) ++ * - target with fully idle core → return immediately ++ * (avoids POC overhead: RCU deref, bitmap read, mask ops) ++ * ++ * These two checks must be toggled together to preserve POC's ++ * internal priority order (Level 1r before 1t). Enabling only ++ * one would let the pre-POC path return a lower-priority result ++ * before POC can evaluate the higher-priority candidate. ++ * ++ * Default: enabled. ++ */ ++DEFINE_STATIC_KEY_TRUE(sched_poc_early_select); ++ ++/* ++ * Greedy search: sched_poc_greedy_search ++ * (sysctl kernel.sched_poc_greedy_search) ++ * ++ * When enabled, POC always attempts Level 5/6 (LLC-wide SMT sibling ++ * search) regardless of utilization, ignoring the SIS_UTIL overload ++ * gate (nr_idle_scan == 0). This may benefit latency-sensitive ++ * workloads that want to find any idle CPU at all costs. ++ * ++ * When disabled, POC skips Level 5/6 under overload, ++ * returning -2 to also skip CFS fallback search. ++ * ++ * Default: enabled. ++ */ ++DEFINE_STATIC_KEY_TRUE(sched_poc_greedy_search); ++ ++/* ++ * sched_poc_aligned: true when all LLCs have poc_cpu_base aligned to 64 ++ * ++ * When true, cpumask-to-POC conversion is a simple word load (zero shift). ++ * When false (e.g., Threadripper CCDs at CPU 8, 16, ...), bit shifting ++ * is needed to align cpumask bits with POC's LLC-relative positions. ++ * Defaults to true; disabled at boot if any LLC has non-aligned base. 
++ */ ++DEFINE_STATIC_KEY_TRUE(sched_poc_aligned); ++ ++/* ++ * Packed priority search: sched_poc_packed ++ * ++ * When true (default), per-LLC CPU count is ≤ 32, enabling packed ++ * priority search. Cluster candidates (Level 2) and LLC-wide ++ * candidates (Level 3) are packed into a single 64-bit word: ++ * ++ * bits [31:0]: cluster idle candidates (high priority) ++ * bits [63:32]: all LLC idle candidates (low priority) ++ * ++ * A single TZCNT resolves both levels simultaneously. ++ * ror32-based rotation distributes selections across idle CPUs. ++ * ++ * When false (LLC > 32 CPUs), falls back to separate cluster ++ * search + PTSELECT-based RR. ++ * ++ * Disabled at boot if any LLC has > 32 CPUs. ++ */ ++DEFINE_STATIC_KEY_TRUE(sched_poc_packed); ++ ++/* ++ * Improved RR strategy: sched_poc_rr_improved ++ * (sysctl kernel.sched_poc_rr_improved) ++ * ++ * When enabled (default), idle CPU selection in poc_select_rr, ++ * poc_cluster_search, and the packed priority search uses an ++ * improved RR strategy combining two techniques: ++ * 1. total size case-split (1/2/>=3): direct / interleave / full ++ * 2. golden-ratio scrambling (Lemire fastrange) ++ * ++ * When disabled, the current strategy is used unchanged: ++ * - poc_select_rr: poc_rr_step[] table (perfect RR) ++ * - poc_cluster_search: ctz lowest-bit selection (no RR) ++ * - packed search: ror32(counter & 31) ++ * ++ * The current path is preserved as the A/B-testing baseline; ++ * once the improved path is validated, the legacy code will ++ * be removed in a follow-up. ++ */ ++DEFINE_STATIC_KEY_TRUE(sched_poc_rr_improved); ++ ++/* ++ * Lockless bitmap mode: sched_poc_lockless_bitmap ++ * (sysctl kernel.sched_poc_lockless_bitmap) ++ * ++ * When enabled, idle state is tracked in u8[64] flag arrays. ++ * Writers use plain WRITE_ONCE (no LOCK prefix); readers snapshot ++ * the 64-byte cache line to the stack, then use multiply-and-shift ++ * aggregation to assemble a u64 bitmask. 
++ * ++ * When disabled (default), idle state is tracked in atomic64_t bitmaps. ++ * Readers use a single atomic64_read (MOV on x86); writers use ++ * atomic64_or / atomic64_andnot (LOCK'd on x86). ++ * ++ * Only one representation is maintained at a time (single-write). ++ * Switching via sysctl resyncs the newly-active representation ++ * before readers can observe it. ++ * ++ * Default: disabled. ++ */ ++DEFINE_STATIC_KEY_FALSE(sched_poc_lockless_bitmap); ++ ++/************************************************************** ++ * Debug counters (sysctl kernel.sched_poc_count): ++ * ++ * Per-CPU counters for each selection level hit. ++ * Guarded by static key — zero overhead when disabled (default). ++ * Aggregated across all CPUs and exposed via sysfs. ++ */ ++enum poc_level { ++ POC_LV1S = 0, /* target CPU sticky (L1/TLB affinity) */ ++ POC_LV1T, /* target core idle */ ++ POC_LV1P, /* prev core idle */ ++ POC_LV1R, /* recent core idle */ ++ POC_LV2, /* idle core in L2 cluster */ ++ POC_LV3, /* idle core across LLC (RR) */ ++ POC_LV4S, /* sync + target CPU idle (no idle cores) */ ++ POC_LV4P, /* prev's SMT sibling (cache locality) */ ++ POC_LV4R, /* recent's SMT sibling (warm cache) */ ++ POC_LV4T, /* target's SMT sibling */ ++ POC_LV5, /* idle CPU in L2 cluster */ ++ POC_LV6, /* idle CPU across LLC (RR) */ ++ POC_FALLBACK, /* POC returned -1, CFS fallback */ ++ POC_NR_LEVELS ++}; ++ ++#define POC_SMT_LEVEL_OFFSET (POC_LV5 - POC_LV2) ++ ++DEFINE_STATIC_KEY_FALSE(sched_poc_count_enabled); ++ ++static DEFINE_PER_CPU(unsigned long[POC_NR_LEVELS], poc_debug_cnt); ++ ++static __always_inline void poc_count(enum poc_level lv) ++{ ++ if (static_branch_unlikely(&sched_poc_count_enabled)) ++ __this_cpu_inc(poc_debug_cnt[lv]); ++} ++ ++/************************************************************** ++ * Per-CPU round-robin counter and division-free mapping: ++ */ ++ ++/* ++ * POC_HASH_MULT / POC_SCRAMBLE — Golden-ratio scrambling ++ * ++ * Multiplying a 32-bit counter by 
⌊2^32 / φ⌋ = 0x9E3779B9 scatters ++ * consecutive values across the 32-bit output space with good ++ * avalanche properties (Knuth's multiplicative hash, TAOCP Vol. 3). ++ * The scrambled value feeds POC_FASTRANGE for uniform [0, range) ++ * mapping in the improved RR path, or is used directly with a bit ++ * shift to derive an uncorrelated rotation amount in packed search. ++ */ ++#define POC_HASH_MULT 0x9E3779B9U /* golden ratio * 2^32 */ ++#define POC_SCRAMBLE(counter) ((u32)(counter) * POC_HASH_MULT) ++ ++/* ++ * Per-CPU round-robin counter for idle CPU selection. ++ * Each CPU starts at a different offset to reduce cross-CPU ++ * collision probability. Combined with poc_rr_step[] and ++ * POC_FIXED_MOD16, consecutive calls on the same CPU produce ++ * perfect round-robin: each call picks a different idle CPU ++ * until all candidates have been visited. ++ */ ++static DEFINE_PER_CPU(u32, poc_rr_counter); ++ ++/* ++ * Division-free modulo via 16-bit fixed-point reciprocal multiplication ++ * ++ * The multiply-and-shift technique is inspired by: ++ * D. Lemire, "Fast Random Integer Generation in an Interval", ++ * ACM Trans. Model. Comput. Simul. 29, 1, Article 3, 2019. ++ * ++ * Combined with poc_rr_step[], this replaces modulo with two ++ * multiplications and a shift: ++ * phase = (u16)(counter * poc_rr_step[total - 1]) ++ * pick = POC_FIXED_MOD16(phase, total) ++ * ++ * Proof that pick == counter % total (for total ≤ 64): ++ * Let S = ceil(2^16 / N). For k in [0, N): ++ * k*S*N / 2^16 ∈ [k, k + kN/2^16) ++ * Since kN < N² ≤ 64² = 4096 ≪ 2^16, floor(kN/2^16) = 0, ++ * so floor(k*S*N / 2^16) = k. QED. ++ */ ++#define POC_FIXED_MOD16(phase, range) ((u32)(((u32)(phase) * (u32)(range)) >> 16)) ++ ++/* ++ * POC_FASTRANGE — Map a 32-bit scrambled value to [0, range) ++ * ++ * Implements Lemire's fastrange technique: ++ * D. Lemire, "Fast Random Integer Generation in an Interval", ++ * ACM Trans. Model. Comput. Simul. 29, 1, Article 3, 2019. 
++ * ++ * Computes (seed * range) >> 32, giving a uniform mapping of ++ * a 32-bit seed into [0, range) using only one 64-bit multiply ++ * and a shift. Used with golden-ratio hashing for pseudo-random ++ * RR distribution in the improved RR path. ++ */ ++#define POC_FASTRANGE(seed, range) ((u32)(((u64)(seed) * (u32)(range)) >> 32)) ++ ++/* ++ * RR step table: poc_rr_step[n-1] = ceil(2^16 / n) for n = 1..64 ++ * ++ * Indexed by (total - 1) where total = popcount(idle mask). ++ * total == 0 is unreachable (caller guarantees mask != 0). ++ * 64 entries × 2 bytes = 128 bytes = exactly 2 cache lines. ++ */ ++static const u16 poc_rr_step[64] = { ++ 0, 0x8000, 0x5556, 0x4000, 0x3334, 0x2AAB, 0x2493, 0x2000, /* 1.. 8 */ ++ 0x1C72, 0x199A, 0x1746, 0x1556, 0x13B2, 0x124A, 0x1112, 0x1000, /* 9..16 */ ++ 0x0F10, 0x0E39, 0x0D7A, 0x0CCD, 0x0C31, 0x0BA3, 0x0B22, 0x0AAB, /* 17..24 */ ++ 0x0A3E, 0x09D9, 0x097C, 0x0925, 0x08D4, 0x0889, 0x0843, 0x0800, /* 25..32 */ ++ 0x07C2, 0x0788, 0x0751, 0x071D, 0x06EC, 0x06BD, 0x0691, 0x0667, /* 33..40 */ ++ 0x063F, 0x0619, 0x05F5, 0x05D2, 0x05B1, 0x0591, 0x0573, 0x0556, /* 41..48 */ ++ 0x053A, 0x051F, 0x0506, 0x04ED, 0x04D5, 0x04BE, 0x04A8, 0x0493, /* 49..56 */ ++ 0x047E, 0x046A, 0x0457, 0x0445, 0x0433, 0x0422, 0x0411, 0x0400, /* 57..64 */ ++}; ++ ++/************************************************************** ++ * Bit manipulation primitives: ++ */ ++ ++/* ++ * POC_CTZ64 — Portable Count Trailing Zeros (64-bit) ++ * ++ * Three-tier architecture detection: ++ * ++ * Tier 1: Native hardware CTZ with well-defined zero semantics ++ * x86-64 + BMI1 (__BMI__): TZCNT — returns 64 for input 0 ++ * ARM64: RBIT + CLZ ++ * RISC-V Zbb: CTZ instruction ++ * ++ * Tier 2: x86-64 without BMI1 (Bulldozer, pre-Haswell, etc.) ++ * BSF is fast (~3 cyc) but UNDEFINED for input 0. ++ * On AMD Bulldozer: BSF(0) leaves dest register unchanged (stale value). ++ * On Intel pre-Haswell: BSF(0) is architecturally undefined. 
++ * Wrap with explicit zero check to guarantee returning 64. ++ * ++ * Tier 3: De Bruijn fallback (BPF, unknown architectures) ++ * Software multiply + 64-entry table lookup, branchless O(1). ++ */ ++ ++/* ++ * POC_CTZ64 is defined in sched.h for use by load balancer functions. ++ * Here we only define POC_CTZ64_NAME for sysfs hardware info display. ++ */ ++#if defined(__x86_64__) && defined(__BMI__) ++#define POC_CTZ64_NAME "HW (TZCNT)" ++#elif defined(__aarch64__) ++#define POC_CTZ64_NAME "HW (RBIT+CLZ)" ++#elif defined(__riscv) && defined(__riscv_zbb) ++#define POC_CTZ64_NAME "HW (ctz)" ++#elif defined(__x86_64__) ++#define POC_CTZ64_NAME "HW (BSF)" ++#else ++#define POC_CTZ64_NAME "SW (De Bruijn)" ++#endif ++ ++/* ++ * POC_PTSELECT — Select position of the j-th set bit in a 64-bit word ++ * ++ * Based on the algorithm described in: ++ * P. Pandey, M. A. Bender, R. Johnson, ++ * "A Fast x86 Implementation of Select", arXiv:1706.00990, 2017. ++ * ++ * Returns the bit position (0-indexed) of the j-th set bit in v. ++ * Undefined behavior if j >= popcount(v). ++ * ++ * Tier 1 (x86-64 + BMI2, excluding AMD Zen 1/2 slow microcode PDEP): ++ * PDEP + TZCNT — 4 instructions total. ++ * PDEP deposits the j-th source bit at the j-th mask position. ++ * ++ * Tier 2 (fallback): Iterative bit-clear — O(j) iterations ++ * Clears the lowest set bit j times, then CTZ on remainder. ++ */ ++ ++#if defined(__x86_64__) && defined(__BMI2__) && \ ++ !defined(__znver1) && !defined(__znver2) ++static __always_inline int poc_ptselect(u64 v, int j) ++{ ++ u64 deposited; ++ ++ asm("pdep %2, %1, %0" : "=r"(deposited) : "r"(1ULL << j), "rm"(v)); ++ return POC_CTZ64(deposited); ++} ++#define POC_PTSELECT(v, j) poc_ptselect(v, j) ++#define POC_PTSELECT_NAME "HW (PDEP)" ++ ++/* ++ * Tier 2 (fallback): Iterative bit-clear — O(j) iterations. ++ * Clears the lowest set bit j times, then returns its position via CTZ. 
++ */ ++#else ++static __always_inline int poc_ptselect_sw(u64 v, int j) ++{ ++ int k; ++ ++ for (k = 0; k < j; k++) ++ v &= v - 1; /* clear lowest set bit */ ++ return POC_CTZ64(v); /* lowest remaining bit is the j-th set bit of the input */ ++} ++#define POC_PTSELECT(v, j) poc_ptselect_sw(v, j) ++#define POC_PTSELECT_NAME "SW (loop)" ++ ++#endif /* POC_PTSELECT */ ++ ++/************************************************************** ++ * Flag array to bitmask conversion (lock-free mode): ++ */ ++ ++/* ++ * POC_BYTE_EXTRACT / POC_BYTE_PACK - constants for multiply-and-shift trick. ++ * ++ * POC_BYTE_EXTRACT isolates bit 0 of each byte in a u64 word; multiplying by ++ * POC_BYTE_PACK then gathers those 8 bits into the most significant byte. ++ */ ++#define POC_BYTE_EXTRACT 0x0101010101010101ULL ++#define POC_BYTE_PACK 0x0102040810204080ULL ++ ++/* ++ * POC_BMP8 - Convert one 8-byte slice of the flag array to 8 packed bits. ++ * ++ * Tier 1 (x86-64 + BMI2, excluding AMD Zen 1/2 slow microcode PEXT): ++ * PEXT extracts bit 0 of each byte directly into 8 contiguous bits. ++ * Single instruction replaces AND + MUL + SHR. ++ * ++ * Tier 2 (fallback): Multiply-and-shift trick. ++ * Isolates bit 0 of each byte (AND), packs via MUL, shifts to position (i * 8). ++ */ ++#if defined(__x86_64__) && defined(__BMI2__) && \ ++ !defined(__znver1) && !defined(__znver2) ++ ++static __always_inline u64 poc_bmp8_pext(u64 word, int i) ++{ ++ u64 extracted; ++ ++ asm("pext %2, %1, %0" : "=r"(extracted) : "r"(word), "r"(POC_BYTE_EXTRACT)); /* AT&T operand order: mask, source, dest */ ++ return extracted << (i * 8); /* slice i lands in bits 8*i .. 8*i+7 */ ++} ++#define POC_BMP8(w, i) poc_bmp8_pext((w)[i], i) ++ ++#else ++ ++#define POC_BMP8(w, i) \ ++ ((((w)[i] & POC_BYTE_EXTRACT) * POC_BYTE_PACK >> 56) << ((i) * 8)) ++ ++#endif /* POC_BMP8 */ ++ ++/* ++ * poc_flags_to_u64 - Convert u8[64] flag array to u64 bitmask ++ * @flags: pointer to 64-byte flag array (cacheline-aligned) ++ * ++ * Phase 1 (memcpy): snapshot the 64-byte cache line to the stack. ++ * This eliminates the window in which a concurrent MESI invalidation ++ * could cause a re-fetch mid-computation. 
All 64 bytes land in one ++ * or two cache line transfers; subsequent computation is purely local. ++ * ++ * Phase 2: pack the stack-local copy into a u64 bitmask via ++ * multiply-and-shift (or PEXT on BMI2 x86). Always processes all ++ * 8 chunks — the extra iterations for small LLCs are negligible ++ * on stack-local data and avoid any size-based dispatch. ++ * ++ * Returns: u64 bitmask with bit N set iff bit 0 of flags[N] is set (NOTE(review): bit N maps to flags[N] only for little-endian u64 loads) ++ */ ++static __always_inline u64 poc_flags_to_u64(const u8 *flags) ++{ ++ u64 w[8]; ++ ++ /* Phase 1: snapshot shared cache line to stack */ ++ memcpy(w, flags, 64); ++ ++ /* Phase 2: pack stack-local copy into bitmask */ ++ return POC_BMP8(w, 0) | POC_BMP8(w, 1) | POC_BMP8(w, 2) | POC_BMP8(w, 3) | ++ POC_BMP8(w, 4) | POC_BMP8(w, 5) | POC_BMP8(w, 6) | POC_BMP8(w, 7); ++} ++ ++/************************************************************** ++ * Idle mask accessors: ++ */ ++ ++/* ++ * poc_idle_cpu_mask - Get idle CPU bitmask filtered by LLC and affinity ++ * @affinity: task's allowed CPU mask (poc-relative, from poc_cpumask_to_u64) ++ * @sd_share: per-LLC shared data ++ * ++ * Returns a snapshot of idle CPUs within this LLC, masked by ++ * llc_members (valid CPUs) and @affinity (task placement). ++ * ++ * bitmap mode (default): single atomic64_read (MOV on x86). ++ * flag array mode: stack-snapshot + multiply-and-shift aggregation. 
++ */ ++static __always_inline u64 poc_idle_cpu_mask(u64 affinity, ++ struct sched_domain_shared *sd_share) ++{ ++ u64 cpus; ++ ++ if (static_branch_unlikely(&sched_poc_lockless_bitmap)) ++ cpus = poc_flags_to_u64(sd_share->poc_idle_cpus); /* flag array mode */ ++ else ++ cpus = (u64)atomic64_read(&sd_share->poc_idle_cpus_mask); /* bitmap mode */ ++ ++ return cpus & sd_share->poc_llc_members & affinity; ++} ++ ++#ifdef CONFIG_SCHED_SMT ++/* ++ * poc_idle_core_mask - Get idle core bitmask ++ * @cpu_mask: snapshot of idle CPUs (already masked by llc_members & affinity) ++ * @sd_share: per-LLC shared data ++ * ++ * Returns a bitmask with bits set at core representative positions ++ * (lowest-numbered sibling) for cores where ALL SMT siblings are idle. ++ * ++ * Three-tier derivation: ++ * ++ * Tier 1 (consecutive 2-way SMT): 3 register ops with compile-time ++ * constants — AND, SHR 1, AND 0x5555... No memory loads. ++ * ++ * Tier 2 (uniform stride-N 2-way SMT): 3 register ops with ++ * precomputed per-LLC shift and primary mask — AND, SHR N, AND. ++ * Two extra loads (poc_smt_shift, poc_primary_mask) from sd_share, ++ * but no write-path overhead. ++ * ++ * Tier 3 (exotic: >2-way SMT or non-uniform topology): reads the ++ * separately-maintained core state (poc_idle_cores_mask, or the ++ * poc_idle_cores[] flag array in lock-free mode) ANDed with @cpu_mask. ++ */ ++static __always_inline u64 poc_idle_core_mask(u64 cpu_mask, ++ struct sched_domain_shared *sd_share) ++{ ++ /* Tier 1: consecutive — constants only, zero loads; even bits are the core representatives */ ++ if (static_branch_likely(&sched_poc_smt_consecutive)) ++ return cpu_mask & (cpu_mask >> 1) & 0x5555555555555555ULL; ++ ++ /* Tier 2: uniform stride-N — precomputed shift + mask */ ++ if (static_branch_likely(&sched_poc_smt_uniform)) ++ return cpu_mask & (cpu_mask >> sd_share->poc_smt_shift) ++ & sd_share->poc_primary_mask; ++ ++ /* Tier 3: exotic — bitmap or flag array based on mode */ ++ if (static_branch_unlikely(&sched_poc_lockless_bitmap)) ++ return poc_flags_to_u64(sd_share->poc_idle_cores) & cpu_mask; ++ ++ return (u64)atomic64_read(&sd_share->poc_idle_cores_mask) & cpu_mask; ++} ++#endif /* CONFIG_SCHED_SMT */ ++ ++/* ++ * __set_cpu_idle_state_poc - Update idle state in atomic64_t bitmap ++ * @cpu: CPU number ++ * @state: 0=busy, 1=idle ++ * ++ * Updates the atomic64_t cpus bitmap via atomic64_or/andnot (LOCK'd on x86). ++ * ++ * On uniform 2-way SMT (Tier 1 & 2: consecutive or stride-N), only ++ * the cpus state is updated; core idle state is derived at read time ++ * via bit-parallel operations. ++ * ++ * On exotic SMT (Tier 3: >2-way or non-uniform), also maintains the ++ * separate cores state (bitmap or flag array) for O(1) read-time lookup. ++ * ++ * Only one representation is maintained at a time (single-write), ++ * selected by sched_poc_lockless_bitmap. ++ * ++ * Caller (inline wrapper in sched.h) ensures poc_selector_active is on ++ * and sched_asym_cpucap_active() is false before calling here. 
++ */ ++void __set_cpu_idle_state_poc(int cpu, int state) ++{ ++ struct rq *rq = cpu_rq(cpu); ++ if (!static_branch_unlikely(&sched_poc_lockless_bitmap) && ++ !state && READ_ONCE(rq->poc_idle_committed)) ++ return; /* waker already cleared our bit. NOTE(review): this also skips the Tier 3 cores update below — confirm intended */ ++ ++ guard(rcu)(); ++ struct sched_domain_shared *sd_share = ++ rcu_dereference(per_cpu(sd_llc_shared, cpu)); ++ if (!sd_share || !sd_share->poc_fast_eligible) ++ return; ++ ++ int bit = cpu - sd_share->poc_cpu_base; /* POC-relative bit position */ ++ u64 bit_mask = 1ULL << bit; ++ ++ if (static_branch_unlikely(&sched_poc_lockless_bitmap)) { ++ WRITE_ONCE(sd_share->poc_idle_cpus[bit], state > 0 ? 1 : 0); ++ } else if (state > 0) { ++ /* Entering idle: clear any stale committed flag */ ++ WRITE_ONCE(rq->poc_idle_committed, 0); ++ atomic64_or(bit_mask, &sd_share->poc_idle_cpus_mask); ++ } else { ++ /* ++ * Exiting idle with no prior waker commit (a committed exit ++ * returned early above): clear the bit now and set the flag so ++ * repeats skip the atomic. The flag lives in rq's first cacheline — ++ * same line the waker already dirtied via ttwu_pending. ++ */ ++ atomic64_andnot(bit_mask, &sd_share->poc_idle_cpus_mask); ++ WRITE_ONCE(rq->poc_idle_committed, 1); ++ } ++ ++#ifdef CONFIG_SCHED_SMT ++ if (sched_smt_active()) { ++ /* Tier 1 & 2: read-time derivation, no write-path cost */ ++ if (static_branch_likely(&sched_poc_smt_uniform)) ++ return; ++ /* ++ * Tier 3 (exotic SMT): maintain separate cores state. ++ * Check whether all SMT siblings are idle. ++ */ ++ u64 smt = sd_share->poc_smt_mask[bit]; ++ u64 core_bitmask = smt & (-smt); /* core representative */ ++ int core_bit = __builtin_ctzll(core_bitmask); ++ bool core_idle; ++ ++ if (static_branch_unlikely(&sched_poc_lockless_bitmap)) { ++ /* ++ * Flag array mode: check siblings via WRITE_ONCE-stored ++ * flags. Store->load ordering needs a full barrier: smp_mb() ++ * orders our poc_idle_cpus[] store before the sibling loads. ++ * smp_wmb() only orders stores, so two siblings idling at ++ * once could each miss the other and leave the core busy. 
++ */ ++ smp_mb(); ++ u64 tmp = smt; ++ ++ core_idle = state > 0; ++ while (core_idle && tmp) { ++ int s = __builtin_ctzll(tmp); ++ ++ if (!READ_ONCE(sd_share->poc_idle_cpus[s])) ++ core_idle = false; ++ tmp &= tmp - 1; ++ } ++ WRITE_ONCE(sd_share->poc_idle_cores[core_bit], ++ core_idle ? 1 : 0); ++ } else { ++ /* ++ * smp_mb__after_atomic() ensures our atomic store is ++ * visible before we read sibling bits. On x86 TSO this ++ * is a compiler barrier (~0 cyc); on ARM64: dmb ish. ++ */ ++ smp_mb__after_atomic(); ++ u64 cpus = (u64)atomic64_read(&sd_share->poc_idle_cpus_mask); ++ core_idle = (cpus & smt) == smt; ++ u64 cores = (u64)atomic64_read(&sd_share->poc_idle_cores_mask); ++ ++ if (core_idle) { ++ if (!(cores & core_bitmask)) ++ atomic64_or(core_bitmask, ++ &sd_share->poc_idle_cores_mask); ++ } else { ++ if (cores & core_bitmask) ++ atomic64_andnot(core_bitmask, ++ &sd_share->poc_idle_cores_mask); ++ } ++ } ++ } ++#endif /* CONFIG_SCHED_SMT */ ++} ++ ++/************************************************************** ++ * Idle CPU selection helpers: ++ */ ++ ++/* Test whether a single CPU is idle in a POC bitmap snapshot. ++ * Assumes cpu_mask is in scope — works in any function with that variable. */ ++#define POC_IDLE_CPU(bit) (cpu_mask & (1ULL << (bit))) ++/* Scope-free validity checks — usable in any function. */ ++#define POC_CPU_VALID(cpu) ((cpu) >= 0) ++#define POC_CPU_IN_LLC(bit) ((unsigned int)(bit) < 64) ++ ++/* ++ * poc_select_rr_improved - Improved round-robin idle CPU selection ++ * @base: poc_cpu_base (smallest CPU ID in this LLC) ++ * @mask: idle bitmask (snapshot, caller guarantees non-zero) ++ * @counter: per-CPU round-robin counter value ++ * ++ * Improved RR with two techniques: ++ * 1. Case-split by total: ++ * total=1: direct ctz ++ * total=2: interleave by counter LSB (guarantees non-repeat), ++ * single CTZ via cmov-selected source mask ++ * total>=3: golden-ratio scramble + Lemire fastrange ++ * 2. 
Golden-ratio scrambling (counter * 0x9E3779B9) mapped via ++ * Lemire fastrange for pseudo-random uniform distribution. ++ * ++ * eager_commit (unconditional) already prevents burst wake-ups from ++ * re-selecting the same CPU by clearing the bitmap bit at selection ++ * time, so no previous-pick exclusion state is needed here. ++ * ++ * Returns: selected CPU number. ++ */ ++static __always_inline int poc_select_rr_improved( ++ int base, u64 mask, unsigned int counter) ++{ ++ int total = hweight64(mask); ++ ++ if (total <= 2) { ++ /* ++ * Pick the lower or upper set bit via counter LSB if total == 2. ++ * Select the mask first (cmov), then one CTZ — halves the ++ * cost on archs where CTZ64 is a SW fallback (De Bruijn). ++ */ ++ if ((total == 2) && (counter & 1)) ++ mask &= mask - 1; /* drop the lowest set bit so CTZ picks the upper one */ ++ ++ return base + POC_CTZ64(mask); ++ } ++ ++ /* total >= 3: golden-ratio scramble + Lemire fastrange */ ++ { ++ u32 scrambled = POC_SCRAMBLE(counter); ++ int pick = POC_FASTRANGE(scrambled, total); /* index of the set bit to select; presumably in [0, total) — see POC_FASTRANGE */ ++ ++ return base + POC_PTSELECT(mask, pick); ++ } ++} ++ ++/* ++ * poc_select_rr - Round-robin idle CPU selection from a single-word mask ++ * @base: poc_cpu_base (smallest CPU ID in this LLC) ++ * @mask: idle bitmask (snapshot) ++ * @counter: per-CPU round-robin counter value ++ * ++ * Dispatches to poc_select_rr_improved() when sched_poc_rr_improved is on. ++ * Otherwise: division-free RR via poc_rr_step[], POC_FIXED_MOD16 and PTSELECT, ++ * intended so that one CPU's consecutive calls visit every candidate before repeating. ++ * Caller must ensure at least one bit is set in mask. ++ * Returns: selected CPU number. 
++ */ ++static __always_inline int poc_select_rr(int base, u64 mask, unsigned int counter) ++{ ++ if (static_branch_likely(&sched_poc_rr_improved)) ++ return poc_select_rr_improved(base, mask, counter); ++ ++ /* Legacy strategy: poc_rr_step[] table (perfect RR) */ ++ { ++ int total = hweight64(mask); ++ u16 phase = (u16)(counter * (u32)poc_rr_step[total - 1]); /* total >= 1 per the caller contract above */ ++ int pick = POC_FIXED_MOD16(phase, total); ++ ++ return POC_PTSELECT(mask, pick) + base; ++ } ++} ++ ++/* ++ * poc_cluster_search - Search for an idle CPU within the target's L2 cluster ++ * @base: poc_cpu_base (smallest CPU ID in this LLC) ++ * @tgt_bit: target CPU's POC-relative bit position ++ * @sd_share: per-LLC shared data containing cluster geometry ++ * @mask: snapshot of idle bitmask (cores or cpus, caller decides) ++ * ++ * Uses the pre-computed cluster mask; picks via improved RR when sched_poc_rr_improved is on, else via CTZ. ++ * Returns: idle CPU number if found within cluster, -1 otherwise. ++ */ ++static __always_inline int poc_cluster_search(int base, int tgt_bit, ++ struct sched_domain_shared *sd_share, u64 mask) ++{ ++ u64 cls_idle = mask & sd_share->poc_cluster_mask[tgt_bit]; ++ ++ if (!cls_idle) ++ return -1; ++ ++ if (static_branch_likely(&sched_poc_rr_improved)) { ++ /* Improved path: advance this CPU's RR counter, then pick inside the cluster */ ++ unsigned int counter = __this_cpu_inc_return(poc_rr_counter); ++ return poc_select_rr_improved(base, cls_idle, counter); ++ } ++ ++ /* Legacy strategy: ctz lowest-bit (no RR) */ ++ return base + POC_CTZ64(cls_idle); ++} ++ ++#ifdef CONFIG_SCHED_SMT ++/* ++ * poc_smt_sibling_mask - Get SMT sibling bitmask for a given CPU ++ * @bit: POC-relative bit position ++ * @sd_share: per-LLC shared data ++ * ++ * Three-tier computation matching poc_idle_core_mask(): ++ * ++ * Tier 1 (consecutive): 3ULL << (bit & ~1) — shift only, zero loads. ++ * ++ * Tier 2 (uniform stride-N): determine sibling via poc_smt_shift ++ * and poc_primary_mask. Avoids poc_smt_mask[] array lookup. 
++ * ++ * Tier 3 (exotic): loads from pre-computed poc_smt_mask[] table. ++ */ ++static __always_inline u64 poc_smt_sibling_mask(int bit, ++ struct sched_domain_shared *sd_share) ++{ ++ if (static_branch_likely(&sched_poc_smt_consecutive)) ++ return 3ULL << (bit & ~1); /* siblings occupy bits 2k and 2k+1 */ ++ ++ if (static_branch_likely(&sched_poc_smt_uniform)) { ++ u8 shift = sd_share->poc_smt_shift; ++ int sib = (sd_share->poc_primary_mask & (1ULL << bit)) ++ ? bit + shift : bit - shift; /* primary's sibling sits above it, secondary's below */ ++ return (1ULL << bit) | (1ULL << sib); ++ } ++ ++ return sd_share->poc_smt_mask[bit]; ++} ++ ++/* ++ * poc_find_idle_smt_sibling - Find an idle CPU among target and its SMT siblings ++ * @base: poc_cpu_base (smallest CPU ID in this LLC) ++ * @tgt_bit: target CPU's POC-relative bit position ++ * @cpu_mask: snapshot of idle CPU bitmask ++ * @smt_mask: pre-computed SMT sibling mask for target (includes self) ++ * ++ * Searches target itself and its SMT siblings for an idle CPU. ++ * Target is checked first for cache locality; otherwise the lowest-numbered idle sibling wins. ++ * Returns: idle CPU number if found, -1 otherwise ++ */ ++static __always_inline int poc_find_idle_smt_sibling( ++ int base, int tgt_bit, u64 cpu_mask, u64 smt_mask) ++{ ++ /* Check target first for cache locality */ ++ if (POC_IDLE_CPU(tgt_bit)) ++ return base + tgt_bit; ++ ++ u64 idle_sibs = cpu_mask & smt_mask; ++ ++ if (idle_sibs) ++ return base + POC_CTZ64(idle_sibs); ++ ++ return -1; ++} ++/* ++ * poc_try_idle_smt - Find an idle CPU among a CPU and its SMT siblings ++ * @base: poc_cpu_base (smallest CPU ID in this LLC) ++ * @cpu: the CPU to check (and its SMT siblings) ++ * @cpu_mask: snapshot of idle CPU bitmask ++ * @sd_share: per-LLC shared data ++ * ++ * Checks if the given CPU or any of its SMT siblings is idle; a @cpu outside poc_llc_members yields -1. ++ * Caller is responsible for poc_count() and poc_commit_selection(). 
++ * Returns: idle CPU number if found, -1 otherwise ++ */ ++static __always_inline int poc_try_idle_smt(int base, int cpu, ++ u64 cpu_mask, struct sched_domain_shared *sd_share) ++{ ++ int bit = cpu - base; ++ ++ if (sd_share->poc_llc_members & (1ULL << bit)) { ++ int smt_cpu = poc_find_idle_smt_sibling(base, bit, ++ cpu_mask, poc_smt_sibling_mask(bit, sd_share)); ++ if (POC_CPU_VALID(smt_cpu)) ++ return smt_cpu; ++ } ++ return -1; ++} ++ ++#endif /* CONFIG_SCHED_SMT */ ++ ++/* ++ * poc_commit_selection - Clear selected CPU from the active idle representation ++ * @cpu: the CPU number selected by POC ++ * @sd_share: per-LLC shared data ++ * ++ * Clears the selected CPU's bit in poc_idle_cpus_mask (or its ++ * poc_idle_cpus[] flag in flag array mode) at selection time to close the ++ * race where several wakers read the same stale snapshot and pick the same ++ * idle CPU. The do_idle() exit path performs an idempotent clear as a safety ++ * net for non-POC wakeups; in bitmap mode poc_idle_committed gates that path ++ * so the atomic fires at most once per selection. ++ */ ++static __always_inline void poc_commit_selection(int cpu, ++ struct sched_domain_shared *sd_share) ++{ ++ if (cpu_rq(cpu)->nr_running <= 2) { /* NOTE(review): nothing is committed when nr_running > 2 — rationale not visible here */ ++ int bit = cpu - sd_share->poc_cpu_base; ++ ++ if (static_branch_unlikely(&sched_poc_lockless_bitmap)) { ++ WRITE_ONCE(sd_share->poc_idle_cpus[bit], 0); ++ smp_wmb(); ++ } else { ++ atomic64_andnot(1ULL << bit, &sd_share->poc_idle_cpus_mask); ++ smp_mb__after_atomic(); ++ /* Mark committed so target skips redundant andnot on wakeup */ ++ WRITE_ONCE(cpu_rq(cpu)->poc_idle_committed, 1); ++ } ++ } ++} ++ ++/* ++ * POC_IDLE_CORE - Test whether a CPU's core is fully idle. ++ * POC_IDLE_SMT - Find an idle CPU among @cpu and its SMT siblings. ++ * ++ * POC_RETURN - Record hit counter, clear bitmap, return selected CPU. ++ * POC_RETURN_IF - Same, but only if @cpu >= 0 (used after POC_IDLE_SMT). ++ * ++ * These assume core_mask, base, sd_share are in scope ++ * (only used inside select_idle_cpu_poc). 
++ */ ++#define POC_IDLE_CORE(bit) (core_mask & poc_smt_sibling_mask((bit), sd_share)) ++#define POC_IDLE_SMT(cpu) poc_try_idle_smt(base, (cpu), cpu_mask, sd_share) ++ ++#define POC_RETURN(cpu, level) do { \ ++ poc_count(level); \ ++ poc_commit_selection(cpu, sd_share); \ ++ return cpu; \ ++} while (0) ++ ++#define POC_RETURN_IF(cpu, level) do { \ ++ if ((cpu) >= 0) \ ++ POC_RETURN(cpu, level); \ ++} while (0) ++ ++/************************************************************** ++ * Fast path dispatcher: ++ */ ++ ++/* ++ * select_idle_cpu_poc - Fast idle CPU selector (bitmap or flag-array snapshot) ++ * @target: CPU chosen by wake_affine (Level 1 preferred CPU; ++ * search origin for L2/L3/L5/L6) ++ * @prev: task's previous CPU (Level 1p / 4p cache locality preference) ++ * @recent: task's recent_used_cpu (-1 if none; pre-filtered by caller) ++ * @sync: 1 if synchronous wakeup (Level 4s: waker yields CPU) ++ * @sd_share: per-LLC shared data (caller provides; never NULL) ++ * @allowed: task's cpumask (p->cpus_ptr) for affinity filtering ++ * ++ * Two operating modes (sysctl kernel.sched_poc_smt_fallback): ++ * ++ * smt_fallback=0 (default): POC handles all idle CPU ++ * selection itself, including SMT siblings. Prioritizes ++ * prev's SMT sibling for cache locality. Uses CFS's ++ * nr_idle_scan (SIS_UTIL) to gate Level 5/6 under overload. ++ * ++ * smt_fallback=1: Bails out to CFS when has_idle_cores is ++ * false. CFS handles SMT sibling selection via ++ * select_idle_smt(prev) and nr_idle_scan-limited ++ * select_idle_cpu(). 
++ * ++ * Selection levels: ++ * ++ * Level 0: Saturation check -- no idle CPUs → return -1 ++ * (smt_fallback: also when has_idle_cores == false) ++ * Level 1r: Recent's core is fully idle → return recent (!early_select) ++ * Level 1s: Target CPU idle in bitmap → return target (target_sticky only; L1/TLB affinity) ++ * Level 1t: Target CPU's core is fully idle → return target (SMT: !early_select) ++ * Level 1p: Prev's core is fully idle → return prev (prev != target) ++ * --- core_mask != 0: search idle-core bitmap --- ++ * Level 2: Idle core in L2 cluster (CTZ) ++ * Level 3: Idle core across LLC (RR PTSELECT) ++ * --- core_mask == 0: search idle-CPU bitmap --- ++ * Level 4s: sync + target CPU idle (waker frees core) ++ * Level 4p: Prev's SMT sibling (cache locality) ++ * Level 4t: Target's SMT sibling ++ * Level 4r: Recent's SMT sibling (warm cache, always) ++ * [SIS_UTIL gate: nr_idle_scan == 0 → return -2, unless greedy_search] ++ * Level 5: Idle CPU in L2 cluster (CTZ) ++ * Level 6: Idle CPU across LLC (RR PTSELECT) ++ * ++ * Non-SMT: Level 1r → 1t → 1p → Level 2 → Level 3 (core = CPU). 
++ * ++ * Returns: idle CPU number if found, -1 if not found (CFS may retry), ++ * -2 if SIS_UTIL overload (caller should skip CFS) ++ */ ++static __always_inline int select_idle_cpu_poc(int target, int prev, ++ int recent, int sync, ++ struct sched_domain_shared *sd_share, ++ const struct cpumask *allowed) ++{ ++ int base = sd_share->poc_cpu_base; ++ int rct_bit = recent - base; /* POC-relative; may fall outside 0..63, tested with POC_CPU_IN_LLC before use as a bit */ ++ int tgt_bit = target - base; /* NOTE(review): never range-checked below — presumably target is always in this LLC */ ++ int prv_bit = prev - base; ++#ifdef CONFIG_SCHED_SMT ++ u64 core_mask __maybe_unused; ++#endif ++ u64 affinity; ++ u64 cpu_mask; ++ int level_offset = 0; /* shifts Level 2/3 counters to 5/6 on the no-idle-core path */ ++ ++#ifdef CONFIG_SCHED_SMT ++ /* SMT fallback: bail to CFS for SMT sibling selection */ ++ if (sched_smt_active() && ++ static_branch_unlikely(&sched_poc_smt_fallback) && ++ !READ_ONCE(sd_share->has_idle_cores)) ++ return -1; ++#endif ++ ++ if (static_branch_unlikely(&sched_poc_lockless_bitmap)) ++ prefetch(sd_share->poc_idle_cpus); ++ else ++ prefetch(&sd_share->poc_idle_cpus_mask); ++#ifdef CONFIG_SCHED_SMT ++ if (sched_smt_active()) { ++ if (!static_branch_likely(&sched_poc_smt_uniform)) { ++ if (static_branch_unlikely(&sched_poc_lockless_bitmap)) ++ prefetch(sd_share->poc_idle_cores); ++ else ++ prefetch(&sd_share->poc_idle_cores_mask); ++ if (POC_CPU_VALID(recent)) ++ prefetch(&sd_share->poc_smt_mask[rct_bit]); ++ prefetch(&sd_share->poc_smt_mask[tgt_bit]); ++ prefetch(&sd_share->poc_smt_mask[prv_bit]); /* NOTE(review): prv_bit is not range-checked before this address is formed */ ++ } ++ } ++#endif ++ if (static_branch_likely(&sched_cluster_active)) ++ prefetch(&sd_share->poc_cluster_mask[tgt_bit]); ++ ++ affinity = poc_cpumask_to_u64(allowed, sd_share); ++ cpu_mask = poc_idle_cpu_mask(affinity, sd_share); ++ ++ /* Level 0: Saturation — no idle CPU */ ++ if (!cpu_mask) ++ return -1; ++ ++#ifdef CONFIG_SCHED_SMT ++ if (sched_smt_active()) { ++ core_mask = poc_idle_core_mask(cpu_mask, sd_share); ++ ++ /* Level 1r: recent's core is idle (warm cache) */ ++ if (!static_branch_likely(&sched_poc_early_select) && ++ core_mask && POC_CPU_IN_LLC(rct_bit) && POC_IDLE_CORE(rct_bit)) ++ 
POC_RETURN(recent, POC_LV1R); ++ ++ /* Level 1s: target CPU sticky — L1/TLB affinity shortcut */ ++ if (static_branch_unlikely(&sched_poc_target_sticky) && POC_IDLE_CPU(tgt_bit)) ++ POC_RETURN(target, POC_LV1S); ++ ++ if (core_mask) { ++ /* ++ * Idle core path: T → P order. ++ * Target first — wake_affine chose it for data sharing ++ * and the full core is free. ++ */ ++ ++ /* Level 1t: target CPU's core is idle → return it */ ++ if (!static_branch_likely(&sched_poc_early_select) && ++ POC_IDLE_CORE(tgt_bit)) ++ POC_RETURN(target, POC_LV1T); ++ ++ /* Level 1p: prev's core is idle (task's L1/L2 warm) */ ++ if (prev != target && POC_CPU_IN_LLC(prv_bit) && POC_IDLE_CORE(prv_bit)) ++ POC_RETURN(prev, POC_LV1P); ++ ++ cpu_mask = core_mask; /* Levels 2/3 below search idle cores only */ ++ } else { ++ int cpu; ++ ++ /* Level 4s: sync wakeup + target CPU idle → ++ * waker will sleep imminently, freeing the core */ ++ if (sync && POC_IDLE_CPU(tgt_bit)) ++ POC_RETURN(target, POC_LV4S); ++ ++ /* ++ * No-idle-core path: P → T → R order. ++ * Target itself is pre-checked only at Level 1s (sticky) / ++ * 4s (sync); Level 4t re-tests it first. prev's SMT group is ++ * preferred over target's; recent's SMT group comes last. 
++ */ ++ ++ /* Level 4p: prev's SMT sibling (cache locality) */ ++ if (prev != target && POC_CPU_IN_LLC(prv_bit)) { ++ cpu = POC_IDLE_SMT(prev); ++ POC_RETURN_IF(cpu, POC_LV4P); ++ } ++ ++ /* Level 4t: target itself, then target's SMT sibling */ ++ cpu = POC_IDLE_SMT(target); ++ POC_RETURN_IF(cpu, POC_LV4T); ++ ++ /* Level 4r: recent's SMT sibling (warm cache) */ ++ if (POC_CPU_IN_LLC(rct_bit)) { ++ cpu = POC_IDLE_SMT(recent); ++ POC_RETURN_IF(cpu, POC_LV4R); ++ } ++ ++ /* SIS_UTIL overload gate for Level 5/6 (bypassed when greedy_search is on) */ ++ if (!static_branch_likely(&sched_poc_greedy_search) && ++ sched_feat(SIS_UTIL) && !READ_ONCE(sd_share->nr_idle_scan)) ++ return -2; ++ ++ level_offset = POC_SMT_LEVEL_OFFSET; ++ } ++ } ++ else ++#endif ++ { ++ /* Level 1r: recent CPU is idle (non-SMT) */ ++ if (!static_branch_likely(&sched_poc_early_select) && ++ POC_CPU_IN_LLC(rct_bit) && POC_IDLE_CPU(rct_bit)) ++ POC_RETURN(recent, POC_LV1R); ++ /* Level 1t: target CPU is idle → return (non-SMT) */ ++ if (POC_IDLE_CPU(tgt_bit)) ++ POC_RETURN(target, POC_LV1T); ++ /* Level 1p: prev CPU is idle (non-SMT) */ ++ if (prev != target && POC_CPU_IN_LLC(prv_bit) && POC_IDLE_CPU(prv_bit)) ++ POC_RETURN(prev, POC_LV1P); ++ } ++ ++ if (static_branch_likely(&sched_poc_packed)) { ++ /* ++ * Level 2+3 / 5+6: packed priority search (≤32 CPUs/LLC) ++ * ++ * Packs cluster candidates (high priority) into lower 32 bits ++ * and all LLC candidates (low priority) into upper 32 bits. ++ * A single TZCNT resolves the highest-priority idle CPU. ++ * Level discrimination: (raw >> 5) yields 0 (cluster) or 1 (LLC). ++ * ++ * rr_improved=ON: rotation amount via golden-ratio scramble. ++ * rr_improved=OFF: rotation amount is (counter & 31). 
++ */ ++ unsigned int counter = __this_cpu_inc_return(poc_rr_counter); ++ int rot; ++ u32 cls = 0; ++ u32 all; ++ u64 packed; ++ int raw, bit; ++ ++ if (static_branch_likely(&sched_poc_rr_improved)) ++ rot = (int)(POC_SCRAMBLE(counter) >> 27); /* top 5 bits, i.e. 0..31 if POC_SCRAMBLE yields a u32 — TODO confirm */ ++ else ++ rot = counter & 31; ++ ++ if (static_branch_likely(&sched_cluster_active) && ++ sd_share->poc_cluster_valid) ++ cls = ror32((u32)(cpu_mask & ++ sd_share->poc_cluster_mask[tgt_bit]), rot); ++ ++ all = ror32((u32)cpu_mask, rot); ++ packed = (u64)cls | ((u64)all << 32); ++ ++ raw = POC_CTZ64(packed); /* packed != 0: cpu_mask is non-zero here and, per the ≤32-CPU premise, fits in 32 bits */ ++ bit = ((raw & 31) + rot) & 31; /* undo the rotation to recover the POC-relative bit */ ++ ++ POC_RETURN(base + bit, POC_LV2 + (raw >> 5) + level_offset); /* relies on POC_LV3 == POC_LV2 + 1 — TODO confirm */ ++ } else { ++ /* Level 2/5: idle core/cpu in target's L2 cluster */ ++ if (static_branch_likely(&sched_cluster_active) ++ && sd_share->poc_cluster_valid) { ++ int cpu = poc_cluster_search( ++ base, tgt_bit, sd_share, cpu_mask); ++ if (POC_CPU_VALID(cpu)) ++ POC_RETURN(cpu, POC_LV2 + level_offset); ++ } ++ ++ /* Level 3/6: idle core/cpu across LLC via RR */ ++ { ++ unsigned int counter = __this_cpu_inc_return(poc_rr_counter); ++ int rr_cpu = poc_select_rr(base, cpu_mask, counter); ++ POC_RETURN(rr_cpu, POC_LV3 + level_offset); ++ } ++ } ++} ++ ++/************************************************************** ++ * Sysctl interface and initialization: ++ */ ++ ++#if defined(CONFIG_SYSCTL) || defined(CONFIG_SCHED_CLASS_EXT) ++/* ++ * poc_resync_idle_state - Resync POC idle bitmaps after re-enable ++ * ++ * When POC is re-enabled after a period of being disabled, ++ * the idle bitmaps may be stale. Walk all online CPUs and push ++ * the current idle state into poc_idle_cpus_mask (and poc_idle_cores_mask ++ * on non-consecutive SMT). ++ * ++ * Must be called AFTER static_branch_enable() so that concurrent ++ * idle transitions are also updating the flags. ++ * Caller must hold cpus_read_lock(). 
++ */ ++static void poc_resync_idle_state(void) ++{ ++ int cpu; ++ ++ for_each_online_cpu(cpu) { ++ WRITE_ONCE(cpu_rq(cpu)->poc_idle_committed, 0); /* drop any stale commit so the call below is not short-circuited */ ++ __set_cpu_idle_state_poc(cpu, idle_cpu(cpu)); ++ } ++} ++ ++/* ++ * poc_reevaluate_active - Recompute poc_selector_active from inputs ++ * ++ * poc_selector_active = sched_poc_selector && !poc_selector_skip ++ * ++ * On transition to active: enable static key, then resync idle bitmaps. ++ * On transition to inactive: disable static key. ++ * Caller must hold cpus_read_lock(). ++ */ ++static void poc_reevaluate_active(void) ++{ ++ bool want = sched_poc_selector && !READ_ONCE(poc_selector_skip); /* marked load: poc_check_skip_fallback() stores this flag locklessly */ ++ bool now = static_branch_likely(&poc_selector_active); ++ ++ if (want == now) ++ return; ++ ++ if (want) { ++ static_branch_enable_cpuslocked(&poc_selector_active); ++ poc_resync_idle_state(); ++ } else { ++ static_branch_disable_cpuslocked(&poc_selector_active); ++ } ++} ++#endif /* CONFIG_SYSCTL || CONFIG_SCHED_CLASS_EXT */ ++ ++#ifdef CONFIG_SCHED_CLASS_EXT ++/* ++ * poc_notify_scx - Called by sched_ext on enable/disable transitions ++ * @scx_active: true when scx scheduler is being enabled ++ */ ++void poc_notify_scx(bool scx_active) ++{ ++ cpus_read_lock(); ++ WRITE_ONCE(poc_selector_skip, scx_active); /* pairs with READ_ONCE()/WRITE_ONCE() in poc_check_skip_fallback(); the shared cpus lock does not exclude it */ ++ poc_reevaluate_active(); ++ cpus_read_unlock(); ++} ++ ++/* ++ * poc_skip_fallback_work - Workqueue item to re-enable POC after scx fallback. ++ * ++ * Scheduled by poc_check_skip_fallback() when an scx scheduler calls ++ * select_idle_sibling. Runs poc_reevaluate_active() outside the hot path ++ * to avoid updating the static key and resyncing bitmaps inline. 
++ */ ++static void poc_skip_fallback_fn(struct work_struct *work); ++static DECLARE_WORK(poc_skip_fallback_work, poc_skip_fallback_fn); ++ ++static void poc_skip_fallback_fn(struct work_struct *work) ++{ ++ cpus_read_lock(); ++ poc_reevaluate_active(); ++ cpus_read_unlock(); ++} ++ ++/* ++ * poc_check_skip_fallback - Hot-path detection for scx calling select_idle_sibling ++ * ++ * While scx is active, poc_selector_skip=true suppresses idle bitmap updates ++ * in do_idle. Some scx schedulers still call select_idle_sibling; when that ++ * happens, flip poc_selector_skip back to false and schedule a workqueue item ++ * to re-enable poc_selector_active and resync stale bitmaps. ++ * ++ * WRITE_ONCE(false) is idempotent across concurrent callers; schedule_work() ++ * silently drops duplicate requests when the item is already queued. ++ */ ++void poc_check_skip_fallback(void) ++{ ++ if (!sched_poc_selector || !READ_ONCE(poc_selector_skip)) ++ return; ++ WRITE_ONCE(poc_selector_skip, false); ++ schedule_work(&poc_skip_fallback_work); ++} ++#endif /* CONFIG_SCHED_CLASS_EXT */ ++ ++#ifdef CONFIG_SYSCTL ++static int sched_poc_sysctl_handler(const struct ctl_table *table, int write, ++ void *buffer, size_t *lenp, loff_t *ppos) ++{ ++ unsigned int val = sched_poc_selector ? 1 : 0; /* shadow copy handed to proc_douintvec_minmax() */ ++ struct ctl_table tmp = { ++ .data = &val, ++ .maxlen = sizeof(val), ++ .extra1 = SYSCTL_ZERO, ++ .extra2 = SYSCTL_ONE, ++ }; ++ int ret = proc_douintvec_minmax(&tmp, write, buffer, lenp, ppos); ++ ++ if (!ret && write) { ++ cpus_read_lock(); ++ sched_poc_selector = !!val; ++ poc_reevaluate_active(); /* propagate the new setting to the poc_selector_active key */ ++ cpus_read_unlock(); ++ } ++ return ret; ++} ++ ++static int sched_poc_smt_fallback_sysctl_handler(const struct ctl_table *table, ++ int write, void *buffer, ++ size_t *lenp, loff_t *ppos) ++{ ++ unsigned int val = static_branch_unlikely(&sched_poc_smt_fallback) ? 
1 : 0; ++ struct ctl_table tmp = { ++ .data = &val, ++ .maxlen = sizeof(val), ++ .extra1 = SYSCTL_ZERO, ++ .extra2 = SYSCTL_ONE, ++ }; ++ int ret = proc_douintvec_minmax(&tmp, write, buffer, lenp, ppos); ++ ++ if (!ret && write) { ++ if (val) ++ static_branch_enable(&sched_poc_smt_fallback); ++ else ++ static_branch_disable(&sched_poc_smt_fallback); ++ } ++ return ret; ++} ++ ++static int sched_poc_rr_improved_sysctl_handler(const struct ctl_table *table, ++ int write, void *buffer, ++ size_t *lenp, loff_t *ppos) ++{ ++ unsigned int val = static_branch_likely(&sched_poc_rr_improved) ? 1 : 0; ++ struct ctl_table tmp = { ++ .data = &val, ++ .maxlen = sizeof(val), ++ .extra1 = SYSCTL_ZERO, ++ .extra2 = SYSCTL_ONE, ++ }; ++ int ret = proc_douintvec_minmax(&tmp, write, buffer, lenp, ppos); ++ ++ if (!ret && write) { ++ if (val) ++ static_branch_enable(&sched_poc_rr_improved); ++ else ++ static_branch_disable(&sched_poc_rr_improved); ++ } ++ return ret; ++} ++ ++static int sched_poc_target_sticky_sysctl_handler(const struct ctl_table *table, ++ int write, void *buffer, ++ size_t *lenp, loff_t *ppos) ++{ ++ unsigned int val = static_branch_unlikely(&sched_poc_target_sticky) ? 1 : 0; ++ struct ctl_table tmp = { ++ .data = &val, ++ .maxlen = sizeof(val), ++ .extra1 = SYSCTL_ZERO, ++ .extra2 = SYSCTL_ONE, ++ }; ++ int ret = proc_douintvec_minmax(&tmp, write, buffer, lenp, ppos); ++ ++ if (!ret && write) { ++ if (val) ++ static_branch_enable(&sched_poc_target_sticky); ++ else ++ static_branch_disable(&sched_poc_target_sticky); ++ } ++ return ret; ++} ++ ++static int sched_poc_early_select_handler(const struct ctl_table *table, ++ int write, void *buffer, ++ size_t *lenp, loff_t *ppos) ++{ ++ unsigned int val = static_branch_likely(&sched_poc_early_select) ? 
1 : 0; ++ struct ctl_table tmp = { ++ .data = &val, ++ .maxlen = sizeof(val), ++ .extra1 = SYSCTL_ZERO, ++ .extra2 = SYSCTL_ONE, ++ }; ++ int ret = proc_douintvec_minmax(&tmp, write, buffer, lenp, ppos); ++ ++ if (!ret && write) { ++ if (val) ++ static_branch_enable(&sched_poc_early_select); ++ else ++ static_branch_disable(&sched_poc_early_select); ++ } ++ return ret; ++} ++ ++static int sched_poc_greedy_search_handler(const struct ctl_table *table, ++ int write, void *buffer, ++ size_t *lenp, loff_t *ppos) ++{ ++ unsigned int val = static_branch_likely(&sched_poc_greedy_search) ? 1 : 0; ++ struct ctl_table tmp = { ++ .data = &val, ++ .maxlen = sizeof(val), ++ .extra1 = SYSCTL_ZERO, ++ .extra2 = SYSCTL_ONE, ++ }; ++ int ret = proc_douintvec_minmax(&tmp, write, buffer, lenp, ppos); ++ ++ if (!ret && write) { ++ if (val) ++ static_branch_enable(&sched_poc_greedy_search); ++ else ++ static_branch_disable(&sched_poc_greedy_search); ++ } ++ return ret; ++} ++ ++static int sched_poc_count_sysctl_handler(const struct ctl_table *table, ++ int write, void *buffer, ++ size_t *lenp, loff_t *ppos) ++{ ++ unsigned int val = static_branch_unlikely(&sched_poc_count_enabled) ? 1 : 0; ++ struct ctl_table tmp = { ++ .data = &val, ++ .maxlen = sizeof(val), ++ .extra1 = SYSCTL_ZERO, ++ .extra2 = SYSCTL_ONE, ++ }; ++ int ret = proc_douintvec_minmax(&tmp, write, buffer, lenp, ppos); ++ ++ if (!ret && write) { ++ if (val) ++ static_branch_enable(&sched_poc_count_enabled); ++ else ++ static_branch_disable(&sched_poc_count_enabled); ++ } ++ return ret; ++} ++ ++static int sched_poc_lockless_bitmap_sysctl_handler(const struct ctl_table *table, ++ int write, void *buffer, ++ size_t *lenp, loff_t *ppos) ++{ ++ unsigned int val = static_branch_unlikely(&sched_poc_lockless_bitmap) ? 
1 : 0; ++ struct ctl_table tmp = { ++ .data = &val, ++ .maxlen = sizeof(val), ++ .extra1 = SYSCTL_ZERO, ++ .extra2 = SYSCTL_ONE, ++ }; ++ int ret = proc_douintvec_minmax(&tmp, write, buffer, lenp, ppos); ++ ++ if (!ret && write) { ++ cpus_read_lock(); ++ if (val) ++ static_branch_enable_cpuslocked(&sched_poc_lockless_bitmap); ++ else ++ static_branch_disable_cpuslocked(&sched_poc_lockless_bitmap); ++ /* ++ * Resync the newly-active representation so readers see ++ * consistent state immediately after the mode switch. ++ */ ++ poc_resync_idle_state(); /* runs on every successful write, even if the mode did not change */ ++ cpus_read_unlock(); ++ } ++ return ret; ++} ++ ++static struct ctl_table sched_poc_sysctls[] = { ++ { ++ .procname = "sched_poc_selector", ++ .data = NULL, ++ .maxlen = sizeof(unsigned int), ++ .mode = 0644, ++ .proc_handler = sched_poc_sysctl_handler, ++ }, ++ { ++ .procname = "sched_poc_smt_fallback", ++ .data = NULL, ++ .maxlen = sizeof(unsigned int), ++ .mode = 0644, ++ .proc_handler = sched_poc_smt_fallback_sysctl_handler, ++ }, ++ { ++ .procname = "sched_poc_rr_improved", ++ .data = NULL, ++ .maxlen = sizeof(unsigned int), ++ .mode = 0644, ++ .proc_handler = sched_poc_rr_improved_sysctl_handler, ++ }, ++ { ++ .procname = "sched_poc_target_sticky", ++ .data = NULL, ++ .maxlen = sizeof(unsigned int), ++ .mode = 0644, ++ .proc_handler = sched_poc_target_sticky_sysctl_handler, ++ }, ++ { ++ .procname = "sched_poc_early_select", ++ .data = NULL, ++ .maxlen = sizeof(unsigned int), ++ .mode = 0644, ++ .proc_handler = sched_poc_early_select_handler, ++ }, ++ { ++ .procname = "sched_poc_greedy_search", ++ .data = NULL, ++ .maxlen = sizeof(unsigned int), ++ .mode = 0644, ++ .proc_handler = sched_poc_greedy_search_handler, ++ }, ++ { ++ .procname = "sched_poc_count", ++ .data = NULL, ++ .maxlen = sizeof(unsigned int), ++ .mode = 0644, ++ .proc_handler = sched_poc_count_sysctl_handler, ++ }, ++ { ++ .procname = "sched_poc_lockless_bitmap", ++ .data = NULL, ++ .maxlen = sizeof(unsigned int), ++ .mode = 0644, ++ 
.proc_handler = sched_poc_lockless_bitmap_sysctl_handler, ++ }, ++}; ++ ++static int __init sched_poc_sysctl_init(void) ++{ ++ printk(KERN_INFO "%s %s by %s [CTZ: %s, PTSelect: %s]\n", ++ SCHED_POC_SELECTOR_PROGNAME, SCHED_POC_SELECTOR_VERSION, ++ SCHED_POC_SELECTOR_AUTHOR, POC_CTZ64_NAME, POC_PTSELECT_NAME); ++ ++ register_sysctl_init("kernel", sched_poc_sysctls); /* entries appear under the "kernel" sysctl directory */ ++ return 0; ++} ++late_initcall(sched_poc_sysctl_init); ++ ++#endif /* CONFIG_SYSCTL */ ++ ++/* ++ * Initialize per-CPU RR counters with CPU ID offset. ++ * Different starting values shift the FASTRANGE16 phase per CPU, ++ * reducing cross-CPU collision probability when multiple CPUs ++ * perform burst wakeups against the same idle bitmap snapshot. ++ */ ++static int __init sched_poc_rr_init(void) ++{ ++ int cpu; ++ ++ for_each_possible_cpu(cpu) ++ per_cpu(poc_rr_counter, cpu) = (u32)cpu; ++ return 0; ++} ++early_initcall(sched_poc_rr_init); ++ ++/************************************************************** ++ * Status: sysfs interface (built only when CONFIG_SYSFS is set) ++ * ++ * Exported at /sys/kernel/poc_selector/status/ for runtime status queries. ++ * Reports whether POC is actually active (combining all conditions). ++ */ ++ ++#ifdef CONFIG_SYSFS ++ ++/* Root kobject shared with debug section */ ++static struct kobject *kobj_poc_root; ++ ++static bool poc_check_all_llc_eligible(void) ++{ ++ int cpu; ++ ++ for_each_online_cpu(cpu) { ++ struct sched_domain_shared *sd_share; ++ ++ scoped_guard(rcu) { ++ sd_share = rcu_dereference(per_cpu(sd_llc_shared, cpu)); ++ if (sd_share && !sd_share->poc_fast_eligible) /* a CPU with no LLC shared data does not count as ineligible */ ++ return false; ++ } ++ } ++ return true; ++} ++ ++static ssize_t active_show(struct kobject *kobj, ++ struct kobj_attribute *attr, char *buf) ++{ ++ bool active = static_branch_likely(&poc_selector_active) && ++ !sched_asym_cpucap_active() && ++ poc_check_all_llc_eligible(); ++ return sysfs_emit(buf, "%d\n", active ? 
1 : 0); ++} ++ ++static ssize_t symmetric_cpucap_show(struct kobject *kobj, ++ struct kobj_attribute *attr, char *buf) ++{ ++ return sysfs_emit(buf, "%d\n", sched_asym_cpucap_active() ? 0 : 1); /* 1 = symmetric CPU capacity, 0 = asymmetric */ ++} ++ ++static ssize_t all_llc_eligible_show(struct kobject *kobj, ++ struct kobj_attribute *attr, char *buf) ++{ ++ return sysfs_emit(buf, "%d\n", poc_check_all_llc_eligible() ? 1 : 0); ++} ++ ++static ssize_t version_show(struct kobject *kobj, ++ struct kobj_attribute *attr, char *buf) ++{ ++ return sysfs_emit(buf, "%s\n", SCHED_POC_SELECTOR_VERSION); ++} ++ ++static struct kobj_attribute poc_status_active_attr = __ATTR_RO(active); ++static struct kobj_attribute poc_status_asym_attr = __ATTR_RO(symmetric_cpucap); /* file name is "symmetric_cpucap" despite the variable name */ ++static struct kobj_attribute poc_status_eligible_attr = __ATTR_RO(all_llc_eligible); ++static struct kobj_attribute poc_status_version_attr = __ATTR_RO(version); ++ ++static struct attribute *poc_status_attrs[] = { ++ &poc_status_active_attr.attr, ++ &poc_status_asym_attr.attr, ++ &poc_status_eligible_attr.attr, ++ &poc_status_version_attr.attr, ++ NULL, ++}; ++ ++static const struct attribute_group poc_status_group = { ++ .name = "status", ++ .attrs = poc_status_attrs, ++}; ++ ++/* --- hw_accel: expose which hardware acceleration is in use --- */ ++ ++#define DEFINE_POC_HW_ATTR(fname, namestr) \ ++static ssize_t poc_hw_##fname##_show(struct kobject *kobj, \ ++ struct kobj_attribute *attr, char *buf) \ ++{ \ ++ return sysfs_emit(buf, "%s\n", namestr); \ ++} \ ++static struct kobj_attribute poc_hw_attr_##fname = { \ ++ .attr = { .name = #fname, .mode = 0444 }, \ ++ .show = poc_hw_##fname##_show, \ ++} ++ ++DEFINE_POC_HW_ATTR(ctz, POC_CTZ64_NAME); ++DEFINE_POC_HW_ATTR(ptselect, POC_PTSELECT_NAME); ++ ++/* popcnt: x86 uses runtime alternatives, detect via boot_cpu_has */ ++static ssize_t poc_hw_popcnt_show(struct kobject *kobj, ++ struct kobj_attribute *attr, char *buf) ++{ ++#if defined(__x86_64__) ++ return sysfs_emit(buf, "%s\n", ++ 
boot_cpu_has(X86_FEATURE_POPCNT) ? "HW (POPCNT)" : "SW"); ++#elif defined(__aarch64__) ++ return sysfs_emit(buf, "HW (CNT)\n"); ++#elif defined(__riscv) && defined(__riscv_zbb) ++ return sysfs_emit(buf, "HW (cpop)\n"); ++#else ++ return sysfs_emit(buf, "SW\n"); ++#endif ++} ++ ++static struct kobj_attribute poc_hw_attr_popcnt = { ++ .attr = { .name = "popcnt", .mode = 0444 }, ++ .show = poc_hw_popcnt_show, ++}; ++ ++static struct attribute *poc_hw_attrs[] = { ++ &poc_hw_attr_popcnt.attr, ++ &poc_hw_attr_ctz.attr, ++ &poc_hw_attr_ptselect.attr, ++ NULL, ++}; ++ ++static const struct attribute_group poc_hw_group = { ++ .name = "hw_accel", ++ .attrs = poc_hw_attrs, ++}; ++ ++/* --- count: per-level hit counters (sysctl kernel.sched_poc_count) --- */ ++ ++static unsigned long poc_sum_level(enum poc_level lvl) ++{ ++ unsigned long sum = 0; ++ int cpu; ++ ++ for_each_possible_cpu(cpu) ++ sum += per_cpu(poc_debug_cnt[lvl], cpu); ++ return sum; ++} ++ ++#define DEFINE_POC_COUNT_ATTR(fname, level) \ ++static ssize_t poc_count_##fname##_show(struct kobject *kobj, \ ++ struct kobj_attribute *attr, char *buf) \ ++{ \ ++ return sysfs_emit(buf, "%lu\n", poc_sum_level(level)); \ ++} \ ++static struct kobj_attribute poc_count_##fname##_attr = { \ ++ .attr = { .name = #fname, .mode = 0444 }, \ ++ .show = poc_count_##fname##_show, \ ++} ++ ++DEFINE_POC_COUNT_ATTR(l1s, POC_LV1S); ++DEFINE_POC_COUNT_ATTR(l1t, POC_LV1T); ++DEFINE_POC_COUNT_ATTR(l1p, POC_LV1P); ++DEFINE_POC_COUNT_ATTR(l1r, POC_LV1R); ++DEFINE_POC_COUNT_ATTR(l2, POC_LV2); ++DEFINE_POC_COUNT_ATTR(l3, POC_LV3); ++DEFINE_POC_COUNT_ATTR(l4s, POC_LV4S); ++DEFINE_POC_COUNT_ATTR(l4p, POC_LV4P); ++DEFINE_POC_COUNT_ATTR(l4r, POC_LV4R); ++DEFINE_POC_COUNT_ATTR(l4t, POC_LV4T); ++DEFINE_POC_COUNT_ATTR(l5, POC_LV5); ++DEFINE_POC_COUNT_ATTR(l6, POC_LV6); ++DEFINE_POC_COUNT_ATTR(fallback, POC_FALLBACK); ++ ++static ssize_t poc_count_reset_store(struct kobject *kobj, ++ struct kobj_attribute *attr, ++ const char *buf, size_t count) ++{ 
++ int cpu; ++ ++ for_each_possible_cpu(cpu) ++ memset(per_cpu_ptr(poc_debug_cnt, cpu), 0, ++ sizeof(poc_debug_cnt)); ++ return count; ++} ++ ++static struct kobj_attribute poc_count_reset_attr = { ++ .attr = { .name = "reset", .mode = 0200 }, ++ .store = poc_count_reset_store, ++}; ++ ++static struct attribute *poc_count_attrs[] = { ++ &poc_count_l1s_attr.attr, ++ &poc_count_l1t_attr.attr, ++ &poc_count_l1p_attr.attr, ++ &poc_count_l1r_attr.attr, ++ &poc_count_l2_attr.attr, ++ &poc_count_l3_attr.attr, ++ &poc_count_l4s_attr.attr, ++ &poc_count_l4p_attr.attr, ++ &poc_count_l4r_attr.attr, ++ &poc_count_l4t_attr.attr, ++ &poc_count_l5_attr.attr, ++ &poc_count_l6_attr.attr, ++ &poc_count_fallback_attr.attr, ++ &poc_count_reset_attr.attr, ++ NULL, ++}; ++ ++static const struct attribute_group poc_count_group = { ++ .name = "count", ++ .attrs = poc_count_attrs, ++}; ++ ++static int __init sched_poc_status_init(void) ++{ ++ int ret; ++ ++ kobj_poc_root = kobject_create_and_add("poc_selector", kernel_kobj); ++ if (!kobj_poc_root) ++ return -ENOMEM; ++ ++ ret = sysfs_create_group(kobj_poc_root, &poc_status_group); ++ if (ret) ++ goto err_status; ++ ++ ret = sysfs_create_group(kobj_poc_root, &poc_hw_group); ++ if (ret) ++ goto err_hw; ++ ++ ret = sysfs_create_group(kobj_poc_root, &poc_count_group); ++ if (ret) ++ goto err_selected; ++ ++ return 0; ++ ++err_selected: ++ sysfs_remove_group(kobj_poc_root, &poc_hw_group); ++err_hw: ++ sysfs_remove_group(kobj_poc_root, &poc_status_group); ++err_status: ++ kobject_put(kobj_poc_root); ++ kobj_poc_root = NULL; ++ return ret; ++} ++late_initcall(sched_poc_status_init); ++ ++#endif /* CONFIG_SYSFS */ ++#endif /* CONFIG_SCHED_POC_SELECTOR */ +diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h +index adfb6e3409..c5676f37a1 100644 +--- a/kernel/sched/sched.h ++++ b/kernel/sched/sched.h +@@ -1136,2 +1136,5 @@ + unsigned int ttwu_pending; ++#ifdef CONFIG_SCHED_POC_SELECTOR ++ unsigned int poc_idle_committed; ++#endif + u64 
nr_switches; +@@ -2197,6 +2200,112 @@ static inline struct task_group *task_group(struct task_struct *p) + + #endif /* !CONFIG_CGROUP_SCHED */ + ++#ifdef CONFIG_SCHED_POC_SELECTOR ++extern struct static_key_true poc_selector_active; ++#ifdef CONFIG_SCHED_CLASS_EXT ++extern void poc_notify_scx(bool scx_active); ++extern void poc_check_skip_fallback(void); ++#else ++static inline void poc_check_skip_fallback(void) {} ++#endif ++extern struct static_key_true sched_poc_aligned; ++extern struct static_key_true sched_poc_smt_consecutive; ++extern struct static_key_true sched_poc_smt_uniform; ++extern struct static_key_false sched_poc_target_sticky; ++extern struct static_key_true sched_poc_packed; ++extern struct static_key_false sched_poc_lockless_bitmap; ++extern void __set_cpu_idle_state_poc(int cpu, int state); ++static __always_inline void set_cpu_idle_state_poc(int cpu, int state) ++{ ++ if (static_branch_likely(&poc_selector_active) && ++ !sched_asym_cpucap_active()) ++ __set_cpu_idle_state_poc(cpu, state); ++} ++ ++/* ++ * POC_CTZ64 - Count trailing zeros (find first set bit) ++ * ++ * Architecture-optimized CTZ for POC idle CPU selection. ++ * Returns 64 for input 0 (important for BSF-based implementations). 
++ */ ++#if defined(__x86_64__) && defined(__BMI__) ++/* Tier 1: x86-64 with BMI1 - TZCNT is zero-safe, the builtin is not */ ++#define POC_CTZ64(v) ({ u64 __poc_v = (v); __poc_v ? (int)__builtin_ctzll(__poc_v) : 64; }) ++ ++#elif defined(__aarch64__) ++/* Tier 1: ARM64 - RBIT+CLZ is zero-safe, the builtin is not */ ++#define POC_CTZ64(v) ({ u64 __poc_v = (v); __poc_v ? (int)__builtin_ctzll(__poc_v) : 64; }) ++ ++#elif defined(__riscv) && defined(__riscv_zbb) ++/* Tier 1: RISC-V with Zbb - CTZ is zero-safe, the builtin is not */ ++#define POC_CTZ64(v) ({ u64 __poc_v = (v); __poc_v ? (int)__builtin_ctzll(__poc_v) : 64; }) ++ ++#elif defined(__x86_64__) ++/* Tier 2: x86-64 without BMI1 - BSF needs zero check */ ++static __always_inline int poc_ctz64_bsf(u64 v) ++{ ++ if (unlikely(!v)) ++ return 64; ++ return (int)__builtin_ctzll(v); ++} ++#define POC_CTZ64(v) poc_ctz64_bsf(v) ++ ++#else ++/* Tier 3: De Bruijn fallback for other architectures */ ++#define POC_DEBRUIJN_CTZ64_CONST 0x03F79D71B4CA8B09ULL ++static const u8 poc_debruijn_ctz64_tab[64] = { ++ 0, 1, 56, 2, 57, 49, 28, 3, ++ 61, 58, 42, 50, 38, 29, 17, 4, ++ 62, 47, 59, 36, 45, 43, 51, 22, ++ 53, 39, 33, 30, 24, 18, 12, 5, ++ 63, 55, 48, 27, 60, 41, 37, 16, ++ 46, 35, 44, 21, 52, 32, 23, 11, ++ 54, 26, 40, 15, 34, 20, 31, 10, ++ 25, 14, 19, 9, 13, 8, 7, 6, ++}; ++static __always_inline int poc_debruijn_ctz64(u64 v) ++{ ++ u64 lsb; ++ u32 idx; ++ ++ if (unlikely(!v)) ++ return 64; ++ lsb = v & -v; ++ idx = (u32)((lsb * POC_DEBRUIJN_CTZ64_CONST) >> 58); ++ return (int)poc_debruijn_ctz64_tab[idx & 63]; ++} ++#define POC_CTZ64(v) poc_debruijn_ctz64(v) ++ ++#endif /* POC_CTZ64 */ ++ ++/* ++ * POC helper: convert cpumask region to POC-relative u64 ++ * ++ * Extracts the 64-bit region of @mask corresponding to this LLC's ++ * CPU range and shifts it to align with POC's bit positions. ++ * ++ * Used by load balancer functions that need to intersect cpumasks ++ * with POC idle bitmaps. 
++ */ ++static __always_inline u64 poc_cpumask_to_u64(const struct cpumask *mask, ++ struct sched_domain_shared *sd_share) ++{ ++ int base = sd_share->poc_cpu_base; ++ int base_word = base >> 6; ++ ++ if (static_branch_likely(&sched_poc_aligned)) { ++ /* Fast path: no shift needed (base is 64-aligned) */ ++ return cpumask_bits(mask)[base_word]; ++ } else { ++ /* Slow path: shift can be 0 here, so never shift by 64 (UB) */ ++ int shift = sd_share->poc_affinity_shift; ++ bool more = base_word + 1 < BITS_TO_LONGS(nr_cpu_ids); ++ u64 hi = more ? cpumask_bits(mask)[base_word + 1] : 0; ++ return (cpumask_bits(mask)[base_word] >> shift) | ((hi << 1) << (63 - shift)); ++ } ++} ++#endif /* CONFIG_SCHED_POC_SELECTOR */ ++ + static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) + { + set_task_rq(p, cpu); +@@ -3134,6 +3243,7 @@ extern void nohz_run_idle_balance(int cpu); + static inline void nohz_run_idle_balance(int cpu) { } + #endif + ++ + #include "stats.h" + + #if defined(CONFIG_SCHED_CORE) && defined(CONFIG_SCHEDSTATS) +diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c +index 444bdfdab7..510b96abcd 100644 +--- a/kernel/sched/topology.c ++++ b/kernel/sched/topology.c +@@ -1717,6 +1717,232 @@ sd_init(struct sched_domain_topology_level *tl, + sd->shared = *per_cpu_ptr(sdd->sds, sd_id); + atomic_inc(&sd->shared->ref); + atomic_set(&sd->shared->nr_busy_cpus, sd_weight); ++ ++#ifdef CONFIG_SCHED_POC_SELECTOR ++ int range = cpumask_last(sd_span) - sd_id + 1; ++ ++ sd->shared->poc_cpu_base = sd_id; ++ sd->shared->poc_affinity_shift = sd_id & 63; ++ ++ if (range <= 64) { ++ sd->shared->poc_fast_eligible = true; ++ /* ++ * Disable aligned optimization if this LLC's base CPU ++ * is not 64-aligned (e.g., Threadripper CCDs). ++ */ ++ if (sd_id & 63) ++ static_branch_disable_cpuslocked(&sched_poc_aligned); ++ /* ++ * Disable packed priority search if this LLC ++ * has more than 32 CPUs. 
++ */ ++ if (range > 32) ++ static_branch_disable_cpuslocked(&sched_poc_packed); ++ } else { ++ sd->shared->poc_fast_eligible = false; ++ static_branch_disable_cpuslocked(&sched_poc_packed); ++ } ++ memset(sd->shared->poc_idle_cpus, 0, ++ sizeof(sd->shared->poc_idle_cpus)); ++ atomic64_set(&sd->shared->poc_idle_cpus_mask, 0); ++#ifdef CONFIG_SCHED_SMT ++ memset(sd->shared->poc_idle_cores, 0, ++ sizeof(sd->shared->poc_idle_cores)); ++ atomic64_set(&sd->shared->poc_idle_cores_mask, 0); ++#endif ++ ++ /* Build LLC member bitmask for reader-side aggregation */ ++ { ++ u64 members = 0; ++ int cpu_iter; ++ ++ for_each_cpu(cpu_iter, sd_span) { ++ int bit = cpu_iter - sd_id; ++ ++ if ((unsigned int)bit < 64) ++ members |= 1ULL << bit; ++ } ++ sd->shared->poc_llc_members = members; ++ ++ } ++ ++#ifdef CONFIG_SCHED_SMT ++ /* ++ * Pre-compute SMT sibling masks for Level 4. ++ * Each entry contains a bitmask of SMT siblings (including self) ++ * for O(1) lookup via CTZ during wakeup. ++ */ ++ memset(sd->shared->poc_smt_mask, 0, ++ sizeof(sd->shared->poc_smt_mask)); ++ if (sd->shared->poc_fast_eligible) { ++ int cpu_iter; ++ ++ for_each_cpu(cpu_iter, sd_span) { ++ int bit = cpu_iter - sd_id; ++ int sibling; ++ u64 mask = 0; ++ ++ for_each_cpu(sibling, cpu_smt_mask(cpu_iter)) { ++ int sib_bit; ++ ++ sib_bit = sibling - sd_id; ++ if (sib_bit >= 0 && sib_bit < 64) ++ mask |= 1ULL << sib_bit; ++ } ++ if (bit >= 0 && bit < 64) ++ sd->shared->poc_smt_mask[bit] = mask; ++ } ++ } ++ ++ /* ++ * Detect SMT topology and classify for poc_idle_core_mask(): ++ * ++ * Tier 1 (consecutive): uniform 2-way SMT, siblings at ++ * consecutive bit positions (e.g., 0,1 / 2,3). ++ * Uses compile-time constants: shift=1, mask=0x5555... ++ * ++ * Tier 2 (uniform stride-N): uniform 2-way SMT with ++ * constant stride between siblings (e.g., Intel Xeon ++ * stride-8: CPU 0,8 / 1,9 / ...). Uses precomputed ++ * poc_smt_shift and poc_primary_mask for read-time ++ * derivation without write-path overhead. 
++ * ++ * Tier 3 (exotic): >2-way SMT, non-uniform topology, ++ * or mixed SMT ways. Falls back to write-time ++ * maintenance of poc_idle_cores_mask atomic64_t. ++ * ++ * On pure non-SMT systems, the key values are irrelevant ++ * because sched_smt_active() gates all SMT paths. ++ */ ++ sd->shared->poc_smt_shift = 1; ++ sd->shared->poc_primary_mask = 0; ++ ++ if (sd->shared->poc_fast_eligible) { ++ int cpu_iter; ++ bool all_2way = true; ++ bool all_consecutive = true; ++ int uniform_stride = -1; ++ u64 primary_mask = 0; ++ ++ for_each_cpu(cpu_iter, sd_span) { ++ int bit = cpu_iter - sd_id; ++ ++ if (bit < 0 || bit >= 64) ++ continue; ++ u64 mask = sd->shared->poc_smt_mask[bit]; ++ int ways = hweight64(mask); ++ ++ if (ways != 2) { ++ all_2way = false; ++ all_consecutive = false; ++ break; ++ } ++ ++ int lo = __ffs(mask); ++ int hi = __fls(mask); ++ int stride = hi - lo; ++ ++ /* Track primary (lowest-numbered sibling) */ ++ primary_mask |= 1ULL << lo; ++ ++ /* Check consecutive: 0b11 at even position */ ++ if ((lo & 1) || mask != (3ULL << lo)) ++ all_consecutive = false; ++ ++ /* Check uniform stride */ ++ if (uniform_stride < 0) ++ uniform_stride = stride; ++ else if (stride != uniform_stride) ++ all_2way = false; ++ } ++ ++ if (!all_consecutive) ++ static_branch_disable_cpuslocked( ++ &sched_poc_smt_consecutive); ++ ++ if (all_2way && uniform_stride > 0) { ++ sd->shared->poc_smt_shift = ++ (u8)uniform_stride; ++ sd->shared->poc_primary_mask = primary_mask; ++ } else { ++ static_branch_disable_cpuslocked( ++ &sched_poc_smt_consecutive); ++ static_branch_disable_cpuslocked( ++ &sched_poc_smt_uniform); ++ } ++ } ++#endif /* CONFIG_SCHED_SMT */ ++ ++ memset(sd->shared->poc_cluster_mask, 0, ++ sizeof(sd->shared->poc_cluster_mask)); ++ ++ sd->shared->poc_cluster_valid = false; ++ ++#ifdef CONFIG_SCHED_CLUSTER ++ /* ++ * Detect cluster (L2-sharing) topology for Level 2/5 ++ * cluster-local search in POC selector. 
++ * ++ * Uses cpu_clustergroup_mask() which returns the L2 ++ * cache sharing mask on x86. Validates that all ++ * clusters are uniform (same size, power-of-2, and ++ * naturally aligned in POC bit space). ++ */ ++ if (sd->shared->poc_fast_eligible) { ++ const struct cpumask *cls_mask = ++ cpu_clustergroup_mask(sd_id); ++ int cls_size = cpumask_weight(cls_mask); ++ int smt_size = cpumask_weight(cpu_smt_mask(sd_id)); ++ ++ if (cls_size > smt_size && ++ is_power_of_2(cls_size)) { ++ bool valid = true; ++ int cpu_iter; ++ ++ for_each_cpu(cpu_iter, sd_span) { ++ const struct cpumask *m = ++ cpu_clustergroup_mask(cpu_iter); ++ int first = cpumask_first(m); ++ int rel = first - sd_id; ++ ++ if (cpumask_weight(m) != cls_size || ++ (rel & (cls_size - 1)) != 0) { ++ valid = false; ++ break; ++ } ++ } ++ if (valid) { ++ sd->shared->poc_cluster_valid = true; ++ ++ /* ++ * Pre-compute cluster masks for O(1) lookup. ++ * Each entry contains a bitmask of cluster ++ * members (excluding self) for fast search. 
++ */ ++ for_each_cpu(cpu_iter, sd_span) { ++ const struct cpumask *m = ++ cpu_clustergroup_mask(cpu_iter); ++ int bit = cpu_iter - sd_id; ++ int member; ++ u64 cmask = 0; ++ ++ for_each_cpu(member, m) { ++ int mbit; ++ ++ if (member == cpu_iter) ++ continue; ++ mbit = member - sd_id; ++ if (mbit >= 0 && mbit < 64) ++ cmask |= 1ULL << mbit; ++ } ++ if (bit >= 0 && bit < 64) ++ sd->shared->poc_cluster_mask[bit] = cmask; ++ } ++ } ++ } ++ } ++#endif /* CONFIG_SCHED_CLUSTER */ ++#endif /* CONFIG_SCHED_POC_SELECTOR */ + } + + sd->private = sdd; +-- +2.34.1 diff --git a/PKGBUILD b/PKGBUILD index 1e8134f..943b69b 100755 --- a/PKGBUILD +++ b/PKGBUILD @@ -93,6 +93,7 @@ source=( "git+https://github.com/dlundqvist/xone.git#tag=v0.5.8" "git+https://github.com/forkymcforkface/xpad-noone.git#commit=8e903676dd9514c07ce5e06e43c5f7d8cc51cb7d" "git+https://github.com/atar-axis/xpadneo.git#tag=v$_xpadneo_version" + 6.16-poc-selector-v2.6.1.patch ) sha256sums=('a69eea3b189ab64e65608140d6cd7c57823d1b39b361e876197eec1b4d1db957' '37452b4d09e5e42134ae24a61f2f656790837c327268074cf79d7dab3558b972' @@ -139,7 +140,8 @@ sha256sums=('a69eea3b189ab64e65608140d6cd7c57823d1b39b361e876197eec1b4d1db957' '26aed703ca1a74aa33bd76e632a63810840f7549849435c2a8e893985ff6e2c9' '7ba61ccf2ddb508d6adb30906d3d57dc0ce1bc64a6d1a41796eb94a8584ea63b' '1055bbbd32985017f4501d375648873bd598db084177d302aeeade56b47920e1' - '26b3a811d38471a42229fa037cb6d2bb5ff78f19f45a17c7f263339ee67769a7') + '26b3a811d38471a42229fa037cb6d2bb5ff78f19f45a17c7f263339ee67769a7' + '14dabfb0452a3a817e8d809fb28eb7565512e95386d789c627b62baf136e001f') export KBUILD_BUILD_HOST=archlinux export KBUILD_BUILD_USER=$pkgbase From 44a0969aaceeb3123e1d25e009efedee860071ae Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jorge=20Luis=20Silv=C3=A9rio=20de=20Lima?= Date: Fri, 15 May 2026 13:11:15 -0300 Subject: [PATCH 2/5] Backport NAP cpuidle governor to Linux 6.16 --- 6.16-nap-v0.4.0.patch | 1928 +++++++++++++++++++++++++++++++++++++++++ PKGBUILD | 4 +- 2 files 
changed, 1931 insertions(+), 1 deletion(-) create mode 100644 6.16-nap-v0.4.0.patch diff --git a/6.16-nap-v0.4.0.patch b/6.16-nap-v0.4.0.patch new file mode 100644 index 0000000..1fd94eb --- /dev/null +++ b/6.16-nap-v0.4.0.patch @@ -0,0 +1,1928 @@ +From 1d2e8272f288fecce3fd7f762fb8c628ed04b7fe Mon Sep 17 00:00:00 2001 +From: Masahito S +Date: Wed, 15 Apr 2026 08:37:01 +0900 +Subject: [PATCH] 6.16 backport: 6.18.3-nap-v0.4.0 + +Backport of NAP cpuidle governor to Linux 6.16. +No functional changes except added RESIDENCY_THRESHOLD_NS definition. + +Signed-off-by: Masahito S +--- + drivers/cpuidle/Kconfig | 17 + + drivers/cpuidle/governors/Makefile | 1 + + drivers/cpuidle/governors/nap/Makefile | 29 + + drivers/cpuidle/governors/nap/nap.c | 671 ++++++++++++++++++++ + drivers/cpuidle/governors/nap/nap.h | 283 +++++++++ + drivers/cpuidle/governors/nap/nap_fpu.c | 572 +++++++++++++++++ + drivers/cpuidle/governors/nap/nap_nn_avx2.c | 135 ++++ + drivers/cpuidle/governors/nap/nap_nn_sse2.c | 136 ++++ + 8 files changed, 1844 insertions(+) + create mode 100644 drivers/cpuidle/governors/nap/Makefile + create mode 100644 drivers/cpuidle/governors/nap/nap.c + create mode 100644 drivers/cpuidle/governors/nap/nap.h + create mode 100644 drivers/cpuidle/governors/nap/nap_fpu.c + create mode 100644 drivers/cpuidle/governors/nap/nap_nn_avx2.c + create mode 100644 drivers/cpuidle/governors/nap/nap_nn_sse2.c + +diff --git a/drivers/cpuidle/Kconfig b/drivers/cpuidle/Kconfig +index cac5997dca..9b6c50f0d8 100644 +--- a/drivers/cpuidle/Kconfig ++++ b/drivers/cpuidle/Kconfig +@@ -44,6 +44,23 @@ config CPU_IDLE_GOV_HALTPOLL + + Some virtualized workloads benefit from using it. + ++config CPU_IDLE_GOV_NAP ++ bool "Neural Adaptive Predictor (NAP) governor" ++ depends on X86_64 ++ default y ++ help ++ A machine-learning-based cpuidle governor that uses a small ++ neural network (MLP 16→16→10) to predict the optimal idle ++ state. 
Weights are initialized from hardware idle-state ++ parameters and refined via online learning (deferred ++ backpropagation with SGD). Requires SSE2 at minimum; ++ AVX2+FMA is used when available. ++ ++ This is experimental. Select via cpuidle.governor=nap on ++ the kernel command line. ++ ++ If unsure, say Y. ++ + config DT_IDLE_STATES + bool + +diff --git a/drivers/cpuidle/governors/Makefile b/drivers/cpuidle/governors/Makefile +index 63abb5393a..ae688891c0 100644 +--- a/drivers/cpuidle/governors/Makefile ++++ b/drivers/cpuidle/governors/Makefile +@@ -7,3 +7,4 @@ obj-$(CONFIG_CPU_IDLE_GOV_LADDER) += ladder.o + obj-$(CONFIG_CPU_IDLE_GOV_MENU) += menu.o + obj-$(CONFIG_CPU_IDLE_GOV_TEO) += teo.o + obj-$(CONFIG_CPU_IDLE_GOV_HALTPOLL) += haltpoll.o ++obj-$(CONFIG_CPU_IDLE_GOV_NAP) += nap/ +diff --git a/drivers/cpuidle/governors/nap/Makefile b/drivers/cpuidle/governors/nap/Makefile +new file mode 100644 +index 0000000000..6d48cd5384 +--- /dev/null ++++ b/drivers/cpuidle/governors/nap/Makefile +@@ -0,0 +1,29 @@ ++# SPDX-License-Identifier: GPL-2.0-only ++# ++# Makefile for the NAP cpuidle governor ++# ++ ++obj-$(CONFIG_CPU_IDLE_GOV_NAP) += cpuidle_gov_nap.o ++ ++cpuidle_gov_nap-y := nap.o nap_fpu.o nap_nn_sse2.o nap_nn_avx2.o ++ ++# Kernel builds with -mno-sse -mno-sse2 -mno-avx -msoft-float -mno-80387 ++# -mno-fp-ret-in-387. FPU/SIMD-using files need these removed and ISA ++# flags explicitly added. ++# ++# CRITICAL: nap.o is intentionally compiled with NORMAL kernel flags ++# (no FPU/SSE). All floating-point code lives in nap_fpu.o and the ++# nap_nn_*.o files. This ensures the compiler cannot emit SSE instructions ++# in governor callbacks (nap_select, nap_reflect, etc.), which would ++# silently corrupt userspace FPU register state. ++# ++# Do NOT add CFLAGS_REMOVE/CFLAGS for nap.o — it must stay FPU-free. 
++FPU_KILL_FLAGS := -mno-sse -mno-sse2 -mno-mmx -mno-avx -mno-3dnow \ ++ -mno-sse4a -msoft-float -mno-80387 -mno-fp-ret-in-387 ++ ++CFLAGS_REMOVE_nap_fpu.o += $(FPU_KILL_FLAGS) ++CFLAGS_REMOVE_nap_nn_sse2.o += $(FPU_KILL_FLAGS) ++CFLAGS_REMOVE_nap_nn_avx2.o += $(FPU_KILL_FLAGS) ++CFLAGS_nap_fpu.o += $(CC_FLAGS_FPU) ++CFLAGS_nap_nn_sse2.o += $(CC_FLAGS_FPU) ++CFLAGS_nap_nn_avx2.o += $(CC_FLAGS_FPU) -mavx -mavx2 -mfma +diff --git a/drivers/cpuidle/governors/nap/nap.c b/drivers/cpuidle/governors/nap/nap.c +new file mode 100644 +index 0000000000..c72b67e9c3 +--- /dev/null ++++ b/drivers/cpuidle/governors/nap/nap.c +@@ -0,0 +1,671 @@ ++// SPDX-License-Identifier: GPL-2.0 ++/* ++ * nap.c — Neural Adaptive Predictor cpuidle governor ++ * ++ * A machine-learning-based cpuidle governor that uses a small MLP (8→8→1) ++ * with 3 Mixture-of-Experts (short/long/deep) to predict a log2 correction ++ * factor for sleep_length. State selection is deterministic threshold ++ * comparison. Weights are Xavier-initialized at boot, then refined via ++ * online learning (deferred backpropagation with SGD). ++ * ++ * IMPORTANT: This file is compiled WITHOUT FPU/SSE flags (normal kernel ++ * compilation). All floating-point and SIMD code lives in nap_fpu.c and ++ * nap_nn_{sse2,avx2}.c, which are compiled with CC_FLAGS_FPU. ++ * This separation ensures the compiler cannot emit SSE instructions in ++ * governor callbacks (nap_select, nap_reflect, etc.), which would corrupt ++ * userspace FPU register state. 
++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include "nap.h" ++ ++#include "../gov.h" ++ ++/************************************************************** ++ * Version Information: ++ */ ++ ++#define CPUIDLE_NAP_PROGNAME "Nap CPUIdle Governor" ++#define CPUIDLE_NAP_AUTHOR "Masahito Suzuki" ++ ++#define CPUIDLE_NAP_VERSION "0.4.0" ++ ++/* Governor defaults */ ++#define NAP_DEFAULT_LR_MILLTHS 1 /* 0.001 = 1 millths */ ++#define NAP_DEFAULT_INTERVAL 4 /* learn every 4 reflects */ ++#define NAP_DEFAULT_CLAMP_MILLTHS 1000 /* 1.0 = 1000 millths */ ++#define NAP_DEFAULT_PCTL_MILLTHS 100 /* 10th percentile */ ++ ++/* Backport: RESIDENCY_THRESHOLD_NS was missing in original patch */ ++#define RESIDENCY_THRESHOLD_NS TICK_NSEC ++ ++/* ================================================================ ++ * ISA dispatch via static keys (definitions only; dispatch in nap_fpu.c) ++ * ================================================================ */ ++ ++DEFINE_STATIC_KEY_FALSE(nap_use_avx2); ++ ++static void __init nap_detect_simd(void) ++{ ++ if (boot_cpu_has(X86_FEATURE_FMA) && ++ boot_cpu_has(X86_FEATURE_AVX2)) { ++ static_branch_enable(&nap_use_avx2); ++ pr_info("nap: using AVX2+FMA\n"); ++ } else { ++ pr_info("nap: using SSE2\n"); ++ } ++} ++ ++/* ================================================================ ++ * Per-CPU data ++ * ================================================================ */ ++ ++DEFINE_PER_CPU(struct nap_cpu_data, nap_data); ++static struct cpuidle_driver *nap_cached_drv; ++ ++/* ================================================================ ++ * Reflect-time updates (integer-only, no FPU needed) ++ * ================================================================ */ ++ ++static void nap_history_update(struct nap_cpu_data *d, u64 measured_ns) ++{ ++ d->history[d->hist_idx] = measured_ns; ++ d->hist_idx = (d->hist_idx + 1) % 
NAP_HISTORY_SIZE; ++ if (d->hist_count < NAP_HISTORY_SIZE) ++ d->hist_count++; ++ ++} ++ ++static void nap_update_external_signals(struct nap_cpu_data *d) ++{ ++ d->prev_idle_exit = local_clock(); ++} ++ ++/* ================================================================ ++ * Governor callbacks ++ * ================================================================ */ ++ ++/* ++ * Return the shallowest C-state index that is both enabled and ++ * satisfies the current latency request. Returns 0 if no such ++ * state exists (caller must treat 0 as "POLL is the only option"). ++ * ++ * Called from the short-circuit path to decide whether the predicted ++ * sleep length is worth entering any C-state at all. Does not ++ * consult the NN. ++ */ ++static int nap_find_min_valid_state(struct cpuidle_driver *drv, ++ struct cpuidle_device *dev, ++ s64 latency_req) ++{ ++ int i; ++ ++ for (i = 1; i < drv->state_count; i++) { ++ if (dev->states_usage[i].disable) ++ continue; ++ if (drv->states[i].exit_latency_ns > latency_req) ++ continue; ++ return i; ++ } ++ return 0; ++} ++ ++/* ++ * Cached wrapper around nap_find_min_valid_state(). ++ * ++ * Invalidation triggers: ++ * 1. latency_req changed since last cached value (immediate; PM QoS ++ * updates propagate on the next nap_select call). ++ * 2. NAP_MIN_STATE_REFRESH_JIFFIES elapsed since last refresh ++ * (bounded staleness for sysfs-driven or runtime-driver state ++ * disable events, which are rare). ++ * ++ * Hot path cost when the cache is valid: ~5-7 cycles (one s64 ++ * compare, one time_after() check, one conditional return). The ++ * uncached loop runs at most once per HZ jiffies per CPU. 
++ */ ++static inline int nap_get_min_valid_state(struct nap_cpu_data *d, ++ struct cpuidle_driver *drv, ++ struct cpuidle_device *dev, ++ s64 latency_req) ++{ ++ if (unlikely(latency_req != d->cached_min_state_latency || ++ time_after(jiffies, ++ d->cached_min_state_jiffies + ++ NAP_MIN_STATE_REFRESH_JIFFIES))) { ++ d->cached_min_state = nap_find_min_valid_state(drv, dev, ++ latency_req); ++ d->cached_min_state_latency = latency_req; ++ d->cached_min_state_jiffies = jiffies; ++ } ++ return d->cached_min_state; ++} ++ ++/* ++ * Compute dev->poll_limit_ns for the short-circuit path. ++ * ++ * Budget = predicted wake time (sleep_length) + 1 µs safety margin. ++ * The margin absorbs timer jitter so a wake arriving slightly after ++ * the predicted time does not trigger a select/enter/reflect retry ++ * cycle. It is consumed only when the wake is actually late; on-time ++ * and early wakes exit POLL via need_resched without touching the ++ * margin. ++ * ++ * Floor: NAP_POLL_LIMIT_MIN_NS (1 µs). Below this, per-iteration ++ * governor overhead exceeds actual polling, and POLL's own timeout ++ * sampling granularity (~1.3 µs via POLL_IDLE_RELAX_COUNT cpu_relax ++ * iterations) makes smaller limits indistinguishable in practice. ++ * ++ * Ceiling: min_state.target_residency_ns. Beyond that point, the ++ * C-state would have been a better choice than polling. 
++ */ ++static inline u64 nap_compute_poll_limit(u64 sleep_length_ns, ++ u64 min_state_target_ns) ++{ ++ u64 budget = sleep_length_ns + NAP_POLL_LIMIT_MARGIN_NS; ++ ++ return clamp_t(u64, budget, ++ NAP_POLL_LIMIT_MIN_NS, ++ min_state_target_ns); ++} ++ ++static int nap_fallback_heuristic(struct cpuidle_driver *drv, ++ struct cpuidle_device *dev) ++{ ++ s64 latency_req = cpuidle_governor_latency_req(dev->cpu); ++ ktime_t delta_tick; ++ u64 sleep_length_ns; ++ int i; ++ ++ sleep_length_ns = ktime_to_ns(tick_nohz_get_sleep_length(&delta_tick)); ++ ++ for (i = drv->state_count - 1; i > 0; i--) { ++ if (dev->states_usage[i].disable) ++ continue; ++ if (drv->states[i].exit_latency_ns > latency_req) ++ continue; ++ if (drv->states[i].target_residency_ns > sleep_length_ns) ++ continue; ++ return i; ++ } ++ return 0; ++} ++ ++static int nap_select(struct cpuidle_driver *drv, ++ struct cpuidle_device *dev, ++ bool *stop_tick) ++{ ++ struct nap_cpu_data *d = this_cpu_ptr(&nap_data); ++ s64 latency_req; ++ ktime_t delta_tick; ++ u64 sleep_length_ns; ++ int idx, min_state; ++ ++ if (unlikely(drv->state_count <= 1)) ++ return 0; ++ ++ latency_req = cpuidle_governor_latency_req(dev->cpu); ++ sleep_length_ns = ktime_to_ns(tick_nohz_get_sleep_length(&delta_tick)); ++ ++ min_state = nap_get_min_valid_state(d, drv, dev, latency_req); ++ ++ /* ++ * Fast path: when no C-state can amortize its target residency ++ * within the predicted sleep length, the answer is deterministically ++ * POLL. Skip NN inference and feature extraction entirely. ++ * nap_reflect also skips history update and learning for ++ * short-circuited events (see the short_circuited check there). ++ * See spec §3.1. 
++ */ ++ if (min_state == 0 || ++ sleep_length_ns < drv->states[min_state].target_residency_ns) { ++ ++ if (min_state > 0) ++ dev->poll_limit_ns = nap_compute_poll_limit( ++ sleep_length_ns, ++ drv->states[min_state].target_residency_ns); ++ else ++ dev->poll_limit_ns = max_t(u64, sleep_length_ns, ++ NAP_POLL_LIMIT_MIN_NS); ++ ++ *stop_tick = false; ++ d->last_selected_idx = 0; ++ d->short_circuited = true; ++ d->stats.total_selects++; ++ return 0; ++ } ++ ++ /* Normal NN-driven path */ ++ d->short_circuited = false; ++ ++ if (likely(may_use_simd())) { ++ kernel_fpu_begin(); ++ idx = nap_fpu_select(drv, dev, d); ++ kernel_fpu_end(); ++ ++ if (idx < 0) ++ idx = nap_fallback_heuristic(drv, dev); ++ } else { ++ idx = nap_fallback_heuristic(drv, dev); ++ } ++ ++ *stop_tick = (drv->states[idx].target_residency_ns > ++ RESIDENCY_THRESHOLD_NS); ++ ++ d->last_selected_idx = idx; ++ d->stats.total_selects++; ++ ++ return idx; ++} ++ ++static void nap_reflect(struct cpuidle_device *dev, int index) ++{ ++ struct nap_cpu_data *d = this_cpu_ptr(&nap_data); ++ struct cpuidle_driver *drv = cpuidle_get_cpu_driver(dev); ++ u64 measured_ns = dev->last_residency_ns; ++ ++ if (unlikely(!drv)) ++ return; ++ ++ /* ++ * Short-circuited POLL: NN was not invoked for this idle ++ * event, so the residency does not belong to the NN's ++ * training distribution. Update the aggregate residency ++ * statistic and return — history, hit_intercept, prediction ++ * error, external signals, and learning are all skipped. ++ * See spec §3.4. ++ */ ++ if (d->short_circuited) { ++ d->stats.total_residency_ns += measured_ns; ++ return; ++ } ++ ++ nap_history_update(d, measured_ns); ++ ++ d->last_prediction_error = d->last_predicted_ns - (s64)measured_ns; ++ nap_update_external_signals(d); ++ ++ /* ++ * Dual gate: learn when both the per-N-reflect counter fires ++ * AND at least learn_jiffies_min jiffies have elapsed since ++ * the last learning step. 
The time gate prevents sustained ++ * weight churn on workloads with very rapid idle bursts; a ++ * value of 0 disables it (restores the original counter-only ++ * behavior). See spec §3.5. ++ */ ++ if (++d->learn_counter >= d->learn_interval && ++ time_after_eq(jiffies, ++ d->last_learn_jiffies + d->learn_jiffies_min)) { ++ d->learn_counter = 0; ++ d->last_learn_jiffies = jiffies; ++ d->learn_actual_ns = measured_ns; ++ d->needs_learn = true; ++ } ++ ++ d->stats.total_residency_ns += measured_ns; ++ if (index > 0 && measured_ns < drv->states[index].target_residency_ns) ++ d->stats.overshoot_count++; ++} ++ ++static int nap_enable(struct cpuidle_driver *drv, ++ struct cpuidle_device *dev) ++{ ++ struct nap_cpu_data *d = per_cpu_ptr(&nap_data, dev->cpu); ++ ++ memset(d, 0, sizeof(*d)); ++ ++ /* ++ * Force first-call refresh of the min-valid-state cache. ++ * cached_min_state_latency = S64_MIN ensures the first ++ * nap_select() comparison will always trip the invalidation ++ * branch regardless of the actual latency_req value. ++ * cached_min_state itself is already zeroed by the memset above. ++ */ ++ d->cached_min_state_latency = S64_MIN; ++ d->cached_min_state_jiffies = jiffies - NAP_MIN_STATE_REFRESH_JIFFIES; ++ ++ /* Default: allow at most one learning step per jiffy */ ++ d->learn_jiffies_min = 1; ++ ++ /* ++ * Defer weight initialization to the first nap_select() FPU path ++ * via reset_pending. nap_enable() is called from cpuidle core ++ * (cpuidle_enable_device) which may run on a different CPU than ++ * dev->cpu during governor switch. Deferring ensures FPU init ++ * happens on the correct CPU in its own idle context. 
++ */ ++ WRITE_ONCE(nap_cached_drv, drv); ++ d->learning_rate_millths = NAP_DEFAULT_LR_MILLTHS; ++ d->learn_interval = NAP_DEFAULT_INTERVAL; ++ d->max_grad_norm_millths = NAP_DEFAULT_CLAMP_MILLTHS; ++ d->overshoot_pctl_millths = NAP_DEFAULT_PCTL_MILLTHS; ++ d->reset_pending = true; ++ ++ return 0; ++} ++ ++static void nap_disable(struct cpuidle_driver *drv, ++ struct cpuidle_device *dev) ++{ ++ WRITE_ONCE(nap_cached_drv, NULL); ++} ++ ++/* ================================================================ ++ * sysfs interface (/sys/devices/system/cpu/nap/) ++ * ================================================================ */ ++ ++static ssize_t stats_show(struct kobject *kobj, ++ struct kobj_attribute *attr, char *buf) ++{ ++ int cpu, len = 0; ++ u64 total_sel = 0, total_res = 0, total_under = 0, total_learn = 0; ++ ++ for_each_online_cpu(cpu) { ++ struct nap_cpu_data *d = &per_cpu(nap_data, cpu); ++ ++ total_sel += d->stats.total_selects; ++ total_res += d->stats.total_residency_ns; ++ total_under += d->stats.overshoot_count; ++ total_learn += d->stats.learn_count; ++ } ++ ++ len += sysfs_emit_at(buf, len, "total_selects: %llu\n", total_sel); ++ len += sysfs_emit_at(buf, len, "total_residency_ms: %llu\n", ++ div_u64(total_res, NSEC_PER_MSEC)); ++ len += sysfs_emit_at(buf, len, "overshoot_count: %llu\n", total_under); ++ len += sysfs_emit_at(buf, len, "overshoot_rate_permil: %llu\n", ++ total_sel ? 
div_u64(total_under * 1000, total_sel) : 0); ++ len += sysfs_emit_at(buf, len, "learn_count: %llu\n", total_learn); ++ return len; ++} ++ ++static ssize_t learning_rate_show(struct kobject *kobj, ++ struct kobj_attribute *attr, char *buf) ++{ ++ int cpu; ++ ++ cpu = cpumask_first(cpu_online_mask); ++ if (cpu >= nr_cpu_ids) ++ return sysfs_emit(buf, "0\n"); ++ return sysfs_emit(buf, "%u\n", ++ per_cpu(nap_data, cpu).learning_rate_millths); ++} ++ ++static ssize_t learning_rate_store(struct kobject *kobj, ++ struct kobj_attribute *attr, ++ const char *buf, size_t count) ++{ ++ unsigned int val; ++ int cpu; ++ ++ if (kstrtouint(buf, 10, &val) || val == 0 || val > 100) ++ return -EINVAL; ++ ++ for_each_online_cpu(cpu) ++ per_cpu(nap_data, cpu).learning_rate_millths = val; ++ ++ return count; ++} ++ ++static ssize_t learn_interval_show(struct kobject *kobj, ++ struct kobj_attribute *attr, char *buf) ++{ ++ int cpu; ++ ++ cpu = cpumask_first(cpu_online_mask); ++ if (cpu >= nr_cpu_ids) ++ return sysfs_emit(buf, "0\n"); ++ return sysfs_emit(buf, "%d\n", ++ per_cpu(nap_data, cpu).learn_interval); ++} ++ ++static ssize_t learn_interval_store(struct kobject *kobj, ++ struct kobj_attribute *attr, ++ const char *buf, size_t count) ++{ ++ unsigned int val; ++ int cpu; ++ ++ if (kstrtouint(buf, 10, &val) || val == 0 || val > 10000) ++ return -EINVAL; ++ ++ for_each_online_cpu(cpu) ++ per_cpu(nap_data, cpu).learn_interval = val; ++ ++ return count; ++} ++ ++static ssize_t learn_jiffies_min_show(struct kobject *kobj, ++ struct kobj_attribute *attr, char *buf) ++{ ++ int cpu; ++ ++ cpu = cpumask_first(cpu_online_mask); ++ if (cpu >= nr_cpu_ids) ++ return sysfs_emit(buf, "0\n"); ++ return sysfs_emit(buf, "%u\n", ++ per_cpu(nap_data, cpu).learn_jiffies_min); ++} ++ ++static ssize_t learn_jiffies_min_store(struct kobject *kobj, ++ struct kobj_attribute *attr, ++ const char *buf, size_t count) ++{ ++ unsigned int val; ++ int cpu; ++ ++ if (kstrtouint(buf, 10, &val) || val > HZ * 3600) 
++ return -EINVAL; ++ ++ for_each_online_cpu(cpu) ++ per_cpu(nap_data, cpu).learn_jiffies_min = val; ++ ++ return count; ++} ++ ++static ssize_t reset_weights_store(struct kobject *kobj, ++ struct kobj_attribute *attr, ++ const char *buf, size_t count) ++{ ++ cpumask_var_t mask; ++ int cpu; ++ ++ if (!READ_ONCE(nap_cached_drv)) ++ return -ENODEV; ++ ++ /* ++ * Set a per-CPU flag; each CPU will reinitialize its own weights ++ * inside nap_select() within its own kernel_fpu_begin/end context. ++ * This avoids cross-CPU data races on the weight arrays. ++ * ++ * Accepts "all" to reset every online CPU, or a cpulist ++ * (e.g. "0-3,5,7") to reset specific CPUs. ++ */ ++ if (sysfs_streq(buf, "all")) { ++ for_each_online_cpu(cpu) ++ per_cpu(nap_data, cpu).reset_pending = true; ++ pr_info("nap: weight reset scheduled for all CPUs\n"); ++ return count; ++ } ++ ++ if (!alloc_cpumask_var(&mask, GFP_KERNEL)) ++ return -ENOMEM; ++ ++ if (cpulist_parse(buf, mask)) { ++ free_cpumask_var(mask); ++ return -EINVAL; ++ } ++ ++ for_each_cpu_and(cpu, mask, cpu_online_mask) ++ per_cpu(nap_data, cpu).reset_pending = true; ++ ++ pr_info("nap: weight reset scheduled for CPUs %*pbl\n", ++ cpumask_pr_args(mask)); ++ free_cpumask_var(mask); ++ return count; ++} ++ ++static ssize_t reset_stats_store(struct kobject *kobj, ++ struct kobj_attribute *attr, ++ const char *buf, size_t count) ++{ ++ int cpu; ++ ++ for_each_online_cpu(cpu) ++ memset(&per_cpu(nap_data, cpu).stats, 0, ++ sizeof(struct nap_stats)); ++ ++ return count; ++} ++ ++static ssize_t overshoot_pctl_show(struct kobject *kobj, ++ struct kobj_attribute *attr, char *buf) ++{ ++ int cpu; ++ ++ cpu = cpumask_first(cpu_online_mask); ++ if (cpu >= nr_cpu_ids) ++ return sysfs_emit(buf, "0\n"); ++ return sysfs_emit(buf, "%u\n", ++ per_cpu(nap_data, cpu).overshoot_pctl_millths); ++} ++ ++static ssize_t overshoot_pctl_store(struct kobject *kobj, ++ struct kobj_attribute *attr, ++ const char *buf, size_t count) ++{ ++ unsigned int val; ++ 
int cpu; ++ ++ if (kstrtouint(buf, 10, &val) || val > 500) ++ return -EINVAL; ++ ++ for_each_online_cpu(cpu) ++ per_cpu(nap_data, cpu).overshoot_pctl_millths = val; ++ ++ return count; ++} ++ ++static ssize_t version_show(struct kobject *kobj, ++ struct kobj_attribute *attr, char *buf) ++{ ++ return sysfs_emit(buf, "%s\n", CPUIDLE_NAP_VERSION); ++} ++ ++static ssize_t simd_show(struct kobject *kobj, ++ struct kobj_attribute *attr, char *buf) ++{ ++ if (static_branch_unlikely(&nap_use_avx2)) ++ return sysfs_emit(buf, "avx2\n"); ++ else ++ return sysfs_emit(buf, "sse2\n"); ++} ++ ++static struct kobj_attribute version_attr = __ATTR_RO(version); ++static struct kobj_attribute simd_attr = __ATTR_RO(simd); ++static struct kobj_attribute stats_attr = __ATTR_RO(stats); ++static struct kobj_attribute learning_rate_attr = __ATTR_RW(learning_rate); ++static struct kobj_attribute learn_interval_attr = __ATTR_RW(learn_interval); ++static struct kobj_attribute learn_jiffies_min_attr = __ATTR_RW(learn_jiffies_min); ++static struct kobj_attribute overshoot_pctl_attr = __ATTR_RW(overshoot_pctl); ++static struct kobj_attribute reset_weights_attr = __ATTR_WO(reset_weights); ++static struct kobj_attribute reset_stats_attr = __ATTR_WO(reset_stats); ++ ++static struct attribute *nap_attrs[] = { ++ &version_attr.attr, ++ &simd_attr.attr, ++ &stats_attr.attr, ++ &learning_rate_attr.attr, ++ &learn_interval_attr.attr, ++ &learn_jiffies_min_attr.attr, ++ &overshoot_pctl_attr.attr, ++ &reset_weights_attr.attr, ++ &reset_stats_attr.attr, ++ NULL, ++}; ++ ++static const struct attribute_group nap_attr_group = { ++ .attrs = nap_attrs, ++}; ++ ++static struct kobject *cpuidle_kobj; ++ ++int nap_sysfs_init(void) ++{ ++ struct device *dev_root; ++ int ret; ++ ++ dev_root = bus_get_dev_root(&cpu_subsys); ++ if (!dev_root) ++ return -ENODEV; ++ ++ cpuidle_kobj = kobject_create_and_add("nap", &dev_root->kobj); ++ put_device(dev_root); ++ if (!cpuidle_kobj) ++ return -ENOMEM; ++ ++ ret = 
sysfs_create_group(cpuidle_kobj, &nap_attr_group); ++ if (ret) { ++ kobject_put(cpuidle_kobj); ++ cpuidle_kobj = NULL; ++ } ++ return ret; ++} ++ ++void nap_sysfs_exit(void) ++{ ++ if (cpuidle_kobj) { ++ sysfs_remove_group(cpuidle_kobj, &nap_attr_group); ++ kobject_put(cpuidle_kobj); ++ cpuidle_kobj = NULL; ++ } ++} ++ ++/* ================================================================ ++ * Governor registration ++ * ================================================================ */ ++ ++static struct cpuidle_governor nap_governor = { ++ .name = "nap", ++ .rating = 26, ++ .enable = nap_enable, ++ .disable = nap_disable, ++ .select = nap_select, ++ .reflect = nap_reflect, ++}; ++ ++static int __init nap_init(void) ++{ ++ int ret; ++ ++ nap_detect_simd(); ++ ++ ret = nap_sysfs_init(); ++ if (ret) ++ pr_warn("nap: sysfs init failed: %d (continuing without sysfs)\n", ret); ++ ++ ret = cpuidle_register_governor(&nap_governor); ++ if (ret) { ++ pr_err("nap: register_governor failed: %d\n", ret); ++ nap_sysfs_exit(); ++ return ret; ++ } ++ ++ pr_info("%s v%s by %s registered (rating=%u)\n", ++ CPUIDLE_NAP_PROGNAME, CPUIDLE_NAP_VERSION, ++ CPUIDLE_NAP_AUTHOR, nap_governor.rating); ++ return 0; ++} ++postcore_initcall(nap_init); +diff --git a/drivers/cpuidle/governors/nap/nap.h b/drivers/cpuidle/governors/nap/nap.h +new file mode 100644 +index 0000000000..1059db983b +--- /dev/null ++++ b/drivers/cpuidle/governors/nap/nap.h +@@ -0,0 +1,283 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef NAP_H ++#define NAP_H ++ ++#include ++#include ++#include ++ ++/* ================================================================ ++ * Neural network dimensions ++ * ================================================================ */ ++ ++#define NAP_INPUT_SIZE 8 ++#define NAP_HIDDEN_SIZE 8 ++#define NAP_NUM_EXPERTS 3 ++ ++/* ++ * Neural network weight structure for an 8→8→1 MLP (scalar regression). 
++ * ++ * The NN outputs a single log2 correction factor applied to sleep_length: ++ * effective_sleep = exp2(log2(sleep_length) + nn_output) ++ * State selection is then deterministic: pick the deepest state whose ++ * cost (target_residency + exit_latency) ≤ effective_sleep. ++ * ++ * Column-major storage: w_h1[j][i] = weight from input j to hidden neuron i. ++ * This layout enables efficient column-wise matrix-vector products where ++ * each input broadcasts across all hidden neurons via SIMD FMA. ++ * ++ * __aligned(32) ensures AVX2 vmovaps (32-byte aligned) loads work ++ * correctly. 8 floats = 32 bytes = one ymm register. ++ */ ++struct nap_weights { ++ /* Hidden layer: input[8] → hidden[8] */ ++ float w_h1[NAP_INPUT_SIZE][NAP_HIDDEN_SIZE]; /* 64 params */ ++ float b_h1[NAP_HIDDEN_SIZE]; /* 8 params */ ++ /* Output layer: hidden[8] → 1 scalar */ ++ float w_out[NAP_HIDDEN_SIZE]; /* 8 params */ ++ float b_out; /* 1 param */ ++} __aligned(32); ++ ++/* ISA-specific forward pass implementations */ ++void nap_nn_forward_sse2(const float *input, float *output, ++ float *hidden_save, const struct nap_weights *w); ++void nap_nn_forward_avx2(const float *input, float *output, ++ float *hidden_save, const struct nap_weights *w); ++/* ISA-specific online learning (backpropagation) */ ++struct nap_cpu_data; ++void nap_nn_learn_sse2(struct nap_cpu_data *d); ++void nap_nn_learn_avx2(struct nap_cpu_data *d); ++ ++/* Static key for ISA dispatch (defined in nap.c) */ ++DECLARE_STATIC_KEY_FALSE(nap_use_avx2); ++ ++/* ================================================================ ++ * SIMD type definitions and helpers (GCC vector extensions) ++ * ++ * Only available when compiled with FPU/SSE flags (nap_fpu.c, ++ * nap_nn_*.c). nap.c is compiled without FPU flags and must ++ * not see these definitions. ++ * ++ * is a userspace header and cannot be used in kernel. ++ * We use __attribute__((__vector_size__())) and __builtin_ia32_*. 
++ * ================================================================ */ ++ ++#ifdef __SSE2__ ++ ++typedef float v4sf __attribute__((__vector_size__(16))); /* xmm: 4×float */ ++typedef int v4si __attribute__((__vector_size__(16))); /* xmm: 4×int32 */ ++typedef float v8sf __attribute__((__vector_size__(32))); /* ymm: 8×float */ ++ ++/* Broadcast helpers */ ++#define V4SF_SET1(x) ((v4sf){ (x), (x), (x), (x) }) ++#define V4SI_SET1(x) ((v4si){ (x), (x), (x), (x) }) ++#define V8SF_SET1(x) ((v8sf){ (x),(x),(x),(x),(x),(x),(x),(x) }) ++#define V8SF_ZERO V8SF_SET1(0.0f) ++ ++/* Unaligned load/store helpers */ ++static inline v4sf v4sf_loadu(const float *p) ++{ ++ v4sf result; ++ __builtin_memcpy(&result, p, sizeof(result)); ++ return result; ++} ++ ++static inline void v4sf_storeu(float *p, v4sf v) ++{ ++ __builtin_memcpy(p, &v, sizeof(v)); ++} ++ ++#ifdef __AVX__ ++static inline v8sf v8sf_loadu(const float *p) ++{ ++ v8sf result; ++ __builtin_memcpy(&result, p, sizeof(result)); ++ return result; ++} ++ ++static inline void v8sf_storeu(float *p, v8sf v) ++{ ++ __builtin_memcpy(p, &v, sizeof(v)); ++} ++#endif /* __AVX__ */ ++ ++/* Scalar/vector clamp helpers */ ++static inline float fclampf(float v, float lo, float hi) ++{ ++ if (v < lo) return lo; ++ if (v > hi) return hi; ++ return v; ++} ++ ++static inline v4sf v4sf_clamp(v4sf v, v4sf lo, v4sf hi) ++{ ++ return __builtin_ia32_maxps(__builtin_ia32_minps(v, hi), lo); ++} ++ ++/* Type punning: float ↔ int reinterpret (no instruction generated) */ ++static inline v4si v4sf_as_v4si(v4sf v) ++{ ++ union { v4sf f; v4si i; } u = { .f = v }; ++ return u.i; ++} ++ ++static inline v4sf v4si_as_v4sf(v4si v) ++{ ++ union { v4si i; v4sf f; } u = { .i = v }; ++ return u.f; ++} ++ ++/* ++ * fast_log2f_sse() — Compute log2 of 4 floats simultaneously using SSE2 ++ * ++ * Cost: ~15 cycles for 4 values (~4 cycles per value) ++ */ ++static inline v4sf fast_log2f_sse(v4sf x) ++{ ++ const v4si mask_exp = V4SI_SET1(0xFF); ++ const v4si bias = 
V4SI_SET1(127); ++ const v4si mask_mant = V4SI_SET1(0x7FFFFF); ++ const v4si exp_bias = V4SI_SET1(127 << 23); ++ ++ v4si xi = v4sf_as_v4si(x); ++ v4si exp_i = (xi >> 23) & mask_exp; ++ exp_i = exp_i - bias; ++ v4sf e = __builtin_convertvector(exp_i, v4sf); ++ ++ v4si mant_i = (xi & mask_mant) | exp_bias; ++ v4sf m = v4si_as_v4sf(mant_i) - V4SF_SET1(1.0f); ++ ++ v4sf p; ++ p = m * V4SF_SET1(0.4808f); ++ p = V4SF_SET1(0.7213f) - p; ++ p = m * p; ++ p = V4SF_SET1(1.4425f) - p; ++ p = m * p; ++ ++ return e + p; ++} ++ ++#endif /* __SSE2__ */ ++ ++/* ================================================================ ++ * Feature extraction ++ * ================================================================ */ ++ ++#define NAP_HISTORY_SIZE 8 ++ ++/* ================================================================ ++ * POLL short-circuit tunables ++ * ================================================================ */ ++ ++/* Minimum and safety-margin values for dev->poll_limit_ns written ++ * by nap_compute_poll_limit(). Both are 1 µs: the POLL state ++ * itself checks its timeout only every ~1 µs (POLL_IDLE_RELAX_COUNT ++ * cpu_relax() iterations in drivers/cpuidle/poll_state.c), so ++ * finer-grained values would not produce distinguishable behavior. ++ */ ++#define NAP_POLL_LIMIT_MIN_NS 1000ULL ++#define NAP_POLL_LIMIT_MARGIN_NS 1000ULL ++ ++/* Refresh interval for the cached minimum-valid-state lookup. ++ * HZ jiffies (= 1 second) bounds the staleness window caused by ++ * sysfs-driven or runtime-driver state disable events. PM QoS ++ * latency changes are detected immediately via the cached ++ * latency_req comparison. 
++ */ ++#define NAP_MIN_STATE_REFRESH_JIFFIES HZ ++ ++struct nap_stats { ++ u64 total_selects; ++ u64 total_residency_ns; ++ u64 overshoot_count; ++ u64 learn_count; ++}; ++ ++struct nap_cpu_data { ++ /* Ring buffer */ ++ u64 history[NAP_HISTORY_SIZE]; ++ float log_history[NAP_HISTORY_SIZE]; ++ int hist_idx; ++ int hist_count; ++ ++ /* External signal tracking */ ++ u64 prev_idle_exit; ++ s64 last_predicted_ns; ++ s64 last_prediction_error; ++ ++ /* Short-circuit fast path (§3.1, §3.2, §3.4 of spec) */ ++ bool short_circuited; /* set in select, read in reflect */ ++ int cached_min_state; /* cached shallowest valid state */ ++ s64 cached_min_state_latency; /* latency_req when cache populated */ ++ unsigned long cached_min_state_jiffies; /* jiffies when cache populated */ ++ ++ /* Jiffies-based learning rate floor (§3.5 of spec) */ ++ unsigned long last_learn_jiffies; ++ unsigned int learn_jiffies_min; /* sysfs-tunable, 0 = disabled */ ++ ++ /* select/reflect handoff */ ++ int last_selected_idx; ++ ++ /* NN scalar output: log2 of the predicted sleep time in ns. ++ * nap_fpu_select() clamps it to features_f32[0] = log2(sleep_length). ++ */ ++ float nn_output; ++ ++ /* ++ * hidden_out[], features_f32[] are written with aligned SIMD ++ * stores in nap_nn_forward_{sse2,avx2}() and ++ * nap_extract_features(): ++ * SSE2: movaps (16-byte aligned) ++ * AVX2: vmovaps (32-byte aligned) ++ * Without __aligned(32), the natural struct offset would be ++ * only 4-byte aligned, causing #GP faults in the idle task. ++ */ ++ float hidden_out[NAP_HIDDEN_SIZE] __aligned(32); ++ float features_f32[NAP_INPUT_SIZE] __aligned(32); ++ ++ /* Backprop scratch */ ++ float learn_d_out; /* output gradient direction (±1) */ ++ float learn_lr; /* effective lr (base_lr * asymmetric weight) */ ++ float learn_d_hid[NAP_HIDDEN_SIZE] __aligned(32); ++ ++ /* Precomputed per-state log2(target_residency) for threshold selection. ++ * log2_cost[i] = log2(target_residency_ns).
++ */ ++ float log2_cost[CPUIDLE_STATE_MAX]; ++ ++ /* Deferred learning data */ ++ bool needs_learn; ++ bool output_clamped; /* true if nn_output was clamped to features[0] */ ++ u64 learn_actual_ns; ++ ++ /* Mixture-of-Experts: 3 experts × 8 neurons each */ ++ struct nap_weights expert_weights[NAP_NUM_EXPERTS]; ++ struct nap_weights *active_w; /* selected expert for current/deferred pass */ ++ int active_expert; /* 0, 1, or 2: which expert is active */ ++ float expert_mid; /* log2 threshold: short ↔ long */ ++ float expert_deep; /* log2 threshold: long ↔ deep */ ++ ++ /* Online learning */ ++ unsigned int learning_rate_millths; ++ unsigned int max_grad_norm_millths; ++ unsigned int overshoot_pctl_millths; /* quantile target (250 = 25th pctl) */ ++ int learn_interval; ++ int learn_counter; ++ bool reset_pending; /* set by sysfs, consumed by nap_select */ ++ ++ /* sysfs statistics */ ++ struct nap_stats stats; ++}; ++ ++DECLARE_PER_CPU(struct nap_cpu_data, nap_data); ++ ++/* FPU entry point (nap_fpu.c) — call only within kernel_fpu_begin/end */ ++int nap_fpu_select(struct cpuidle_driver *drv, ++ struct cpuidle_device *dev, ++ struct nap_cpu_data *d); ++ ++/* sysfs interface */ ++int nap_sysfs_init(void); ++void nap_sysfs_exit(void); ++ ++#endif /* NAP_H */ +diff --git a/drivers/cpuidle/governors/nap/nap_fpu.c b/drivers/cpuidle/governors/nap/nap_fpu.c +new file mode 100644 +index 0000000000..482a06a5d0 +--- /dev/null ++++ b/drivers/cpuidle/governors/nap/nap_fpu.c +@@ -0,0 +1,572 @@ ++// SPDX-License-Identifier: GPL-2.0 ++/* ++ * nap_fpu.c — FPU/SIMD code for the NAP cpuidle governor ++ * ++ * This file is compiled with FPU/SSE flags enabled (CC_FLAGS_FPU). ++ * ALL functions here MUST be called only from within ++ * kernel_fpu_begin()/kernel_fpu_end() blocks. ++ * ++ * Keeping FPU code in a separate translation unit ensures the compiler ++ * cannot emit SSE/x87 instructions in non-FPU code paths (nap.c), ++ * which would silently corrupt userspace FPU register state. 
++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include "nap.h" ++ ++/* Clang lacks __builtin_ia32_movhlps; emulate with __builtin_shufflevector */ ++#ifdef __clang__ ++#define __builtin_ia32_movhlps(a, b) \ ++ __builtin_shufflevector(b, a, 2, 3, 6, 7) ++#endif ++ ++/* ================================================================ ++ * Float math helpers ++ * ================================================================ */ ++ ++static inline float float_min(float a, float b) { return a < b ? a : b; } ++static inline float float_max(float a, float b) { return a > b ? a : b; } ++ ++/* ++ * Kernel-safe sqrtf using the SSE sqrtss instruction directly. ++ * GCC may lower nap_sqrtf to a libm call, which is unavailable ++ * in the kernel. This file is always compiled with FPU/SSE enabled. ++ */ ++static inline float nap_sqrtf(float x) ++{ ++ asm("sqrtss %1, %0" : "=x"(x) : "x"(x)); ++ return x; ++} ++ ++/* Scalar log2 approximation (same algorithm as fast_log2f_sse) */ ++static inline float fast_log2f(float x) ++{ ++ union { float f; u32 i; } u = { .f = x }; ++ int exp = (int)((u.i >> 23) & 0xFFu) - 127; ++ float e = (float)exp; ++ float m, p; ++ ++ u.i = (u.i & 0x7FFFFFu) | (127u << 23); ++ m = u.f - 1.0f; ++ ++ p = m * 0.4808f; ++ p = 0.7213f - p; ++ p = m * p; ++ p = 1.4425f - p; ++ p = m * p; ++ ++ return e + p; ++} ++ ++/* ================================================================ ++ * Deterministic PRNG for weight initialization (LCG) ++ * ================================================================ */ ++ ++static inline float nap_prng_float(u32 *state) ++{ ++ *state = *state * 1664525u + 1013904223u; ++ return (float)(s32)*state * (1.0f / 2147483648.0f); ++} ++ ++/* ================================================================ ++ * ISA dispatch via static keys ++ * ================================================================ */ ++ ++static inline void nap_nn_forward(const float *input, float *output, 
++ float *hidden_save, ++ const struct nap_weights *w) ++{ ++ if (static_branch_unlikely(&nap_use_avx2)) ++ nap_nn_forward_avx2(input, output, hidden_save, w); ++ else ++ nap_nn_forward_sse2(input, output, hidden_save, w); ++} ++ ++static inline void nap_nn_learn(struct nap_cpu_data *d) ++{ ++ if (static_branch_unlikely(&nap_use_avx2)) ++ nap_nn_learn_avx2(d); ++ else ++ nap_nn_learn_sse2(d); ++} ++ ++/* ================================================================ ++ * Weight initialization ++ * ++ * The NN directly outputs predicted sleep time in log2(ns) space. ++ * Hidden neuron 0 is initialized as a pass-through for feature[0] ++ * (log2(sleep_length)), so the initial output ≈ log2(sleep_length). ++ * This matches the pre-learning behavior of selecting the deepest ++ * state that fits within sleep_length. ++ * ++ * Other hidden neurons are Xavier-initialized with near-zero output ++ * weights so their initial contribution is negligible. Biases = 0. ++ * ================================================================ */ ++ ++#define NAP_PRNG_SEED 42u ++ ++static void nap_init_weights(struct nap_weights *w) ++{ ++ u32 rng = NAP_PRNG_SEED; ++ float scale_h1, scale_out; ++ int i, j; ++ ++ /* Xavier uniform: U(-sqrt(6/(fan_in+fan_out)), +sqrt(6/(...))) */ ++ scale_h1 = nap_sqrtf(6.0f / (float)(NAP_INPUT_SIZE + NAP_HIDDEN_SIZE)); ++ scale_out = 0.01f; ++ ++ /* Hidden layer weights */ ++ for (i = 0; i < NAP_INPUT_SIZE; i++) ++ for (j = 0; j < NAP_HIDDEN_SIZE; j++) ++ w->w_h1[i][j] = nap_prng_float(&rng) * scale_h1; ++ ++ /* Hidden biases: zero (standard) */ ++ memset(w->b_h1, 0, sizeof(w->b_h1)); ++ ++ /* Output weights: near-zero for ~0 initial contribution */ ++ for (j = 0; j < NAP_HIDDEN_SIZE; j++) ++ w->w_out[j] = nap_prng_float(&rng) * scale_out; ++ ++ /* Output bias: zero */ ++ w->b_out = 0.0f; ++ ++ /* ++ * Neuron 0: pass-through for feature[0] = log2(sleep_length). 
++ * hidden[0] = ReLU(1.0 * input[0] + 0) = input[0] (always > 0) ++ * output += 1.0 * hidden[0] = log2(sleep_length) ++ * ++ * Override the random init above so initial output ≈ input[0]. ++ */ ++ for (i = 0; i < NAP_INPUT_SIZE; i++) ++ w->w_h1[i][0] = 0.0f; ++ w->w_h1[0][0] = 1.0f; ++ w->b_h1[0] = 0.0f; ++ w->w_out[0] = 1.0f; ++} ++ ++/* ++ * Precompute log2(target_residency) per state for threshold-based selection. ++ * ++ * Used in the selection loop: pick deepest state where ++ * log2_cost[i] <= nn_output (predicted sleep time in log2 space). ++ * ++ * Only target_residency_ns is used — exit_latency is a wakeup cost, ++ * not a factor in whether the CPU can profitably stay in the state ++ * for the predicted duration. ++ */ ++static void nap_init_log2_cost(struct nap_cpu_data *d, ++ struct cpuidle_driver *drv) ++{ ++ float log2_tick; ++ int long_start, deep_idx; ++ int i; ++ ++ for (i = 0; i < drv->state_count; i++) { ++ float res = float_max( ++ (float)drv->states[i].target_residency_ns, 1.0f); ++ d->log2_cost[i] = fast_log2f(res); ++ } ++ ++ /* ++ * MoE expert boundaries — 3-way split. ++ * ++ * Expert 0 (short): tick-bound idles where measured residency ++ * is dominated by the next tick rather than the workload's ++ * true idle duration. Boundary: log2(TICK_NSEC). ++ * ++ * Expert 1 (long): nohz idles in intermediate C-states. ++ * ++ * Expert 2 (deep): idles targeting the deepest C-state. ++ * The deepest state often has qualitatively different ++ * residency characteristics (package C-state, longer ++ * exit latency, power-gated domains) that warrant a ++ * dedicated expert to avoid gradient interference with ++ * intermediate states. ++ * ++ * Safety: with only 2 C-states (+ POLL), expert_deep is ++ * placed equal to expert_mid so the deep expert is never ++ * routed (same behavior as the old 2-expert split). 
++ */ ++ if (drv->state_count <= 1) { ++ d->expert_mid = 0.0f; ++ d->expert_deep = 0.0f; ++ return; ++ } ++ ++ log2_tick = fast_log2f((float)TICK_NSEC); ++ ++ /* Default: deepest state belongs to long expert (safety) */ ++ long_start = drv->state_count - 1; ++ ++ /* Prefer the first state whose target_residency exceeds one jiffy */ ++ for (i = 1; i < drv->state_count; i++) { ++ if (d->log2_cost[i] > log2_tick) { ++ long_start = i; ++ break; ++ } ++ } ++ ++ if (long_start > 1) { ++ /* Normal case: boundary between last short and first long */ ++ d->expert_mid = (d->log2_cost[long_start - 1] + ++ d->log2_cost[long_start]) / 2.0f; ++ } else { ++ /* ++ * long_start == 1: even the shallowest C-state already ++ * exceeds one jiffy. All NN-handled idles go to the ++ * long expert; place the boundary just below C1's ++ * residency so the short expert remains routable but ++ * unused. ++ */ ++ d->expert_mid = d->log2_cost[1] - 1.0f; ++ } ++ ++ /* ++ * Deep expert boundary — deepest C-state split. ++ * ++ * When there are >= 3 C-states (state_count >= 4, counting POLL), ++ * place the boundary at the midpoint between the second-deepest ++ * and deepest state's log2(target_residency). The deep expert ++ * then exclusively handles sleep durations long enough to reach ++ * the deepest state. ++ * ++ * With only 2 C-states, expert_deep == expert_mid collapses to ++ * the 2-expert regime (expert 2 is never selected). 
++ */ ++ deep_idx = drv->state_count - 1; ++ if (deep_idx >= 3) { ++ /* >= 3 C-states: split before the deepest */ ++ d->expert_deep = (d->log2_cost[deep_idx - 1] + ++ d->log2_cost[deep_idx]) / 2.0f; ++ /* Keep deep >= mid so the two thresholds stay ordered */ ++ if (d->expert_deep <= d->expert_mid) ++ d->expert_deep = d->expert_mid; ++ } else { ++ /* <= 2 C-states: collapse deep into long */ ++ d->expert_deep = d->expert_mid; ++ } ++} ++ ++/* ================================================================ ++ * Feature extraction helpers ++ * ================================================================ */ ++ ++struct logring_stats { ++ float avg; ++ float min; ++ float max; ++}; ++ ++/* ++ * Compute log_history statistics: avg, min, max. ++ * SIMD fast path when the ring buffer is full (8 elements = 2 × xmm). ++ */ ++static void logring_compute(const struct nap_cpu_data *d, ++ struct logring_stats *s) ++{ ++ int i, n = d->hist_count; ++ float sum; ++ ++ if (n == 0) { ++ *s = (struct logring_stats){ 0 }; ++ return; ++ } ++ ++ if (n == NAP_HISTORY_SIZE) { ++ v4sf v0 = *(const v4sf *)&d->log_history[0]; ++ v4sf v1 = *(const v4sf *)&d->log_history[4]; ++ v4sf pmin, pmax, psum, t; ++ ++ pmin = __builtin_ia32_minps(v0, v1); ++ pmax = __builtin_ia32_maxps(v0, v1); ++ psum = v0 + v1; ++ ++ /* 4 → 2 */ ++ t = __builtin_ia32_movhlps(pmin, pmin); ++ pmin = __builtin_ia32_minps(pmin, t); ++ t = __builtin_ia32_movhlps(pmax, pmax); ++ pmax = __builtin_ia32_maxps(pmax, t); ++ t = __builtin_ia32_movhlps(psum, psum); ++ psum = psum + t; ++ ++ /* 2 → 1 */ ++ t = __builtin_ia32_shufps(pmin, pmin, 0x55); ++ pmin = __builtin_ia32_minps(pmin, t); ++ t = __builtin_ia32_shufps(pmax, pmax, 0x55); ++ pmax = __builtin_ia32_maxps(pmax, t); ++ t = __builtin_ia32_shufps(psum, psum, 0x55); ++ psum = psum + t; ++ ++ sum = psum[0]; ++ s->min = pmin[0]; ++ s->max = pmax[0]; ++ } else { ++ float val; ++ ++ sum = d->log_history[0]; ++ s->min = sum; ++ s->max = sum; ++ ++ for (i = 1; i < n; i++) { ++ val =
d->log_history[i]; ++ sum += val; ++ s->min = float_min(s->min, val); ++ s->max = float_max(s->max, val); ++ } ++ } ++ ++ s->avg = sum / (float)n; ++} ++ ++/* ++ * Extract 8 input features for the MLP. ++ * ++ * [0] log2(sleep_length) — next timer event ++ * [1] log2(last_residency) — actual duration of last idle ++ * [2] log_hist avg — average recent idle duration ++ * [3] log_hist min — shortest recent idle ++ * [4] log_hist max — longest recent idle ++ * [5] signed log2(|pred_error|+1) — prediction feedback ++ * [6] log2(busy_ns) — pre-idle busy duration ++ * [7] log2(lat_req) - log2(deepest_lat) — PM QoS headroom ++ */ ++static void nap_extract_features(struct cpuidle_driver *drv, ++ struct cpuidle_device *dev, ++ float out[NAP_INPUT_SIZE], ++ s64 latency_req) ++{ ++ struct nap_cpu_data *d = this_cpu_ptr(&nap_data); ++ struct logring_stats lr; ++ ktime_t sleep_length, delta_tick; ++ u64 busy_ns; ++ float log_inputs[4] __aligned(16); ++ float log_results[4] __aligned(16); ++ ++ sleep_length = tick_nohz_get_sleep_length(&delta_tick); ++ busy_ns = local_clock() - d->prev_idle_exit; ++ ++ /* ++ * SSE log2 batch: 4 values in one fast_log2f_sse call. ++ * [0] sleep_length → out[0] ++ * [1] last_residency → out[1], also stored to log_history ++ * [2] busy_ns → out[6] ++ * [3] |pred_error_us| + 1 → out[5] (sign restored after) ++ */ ++ { ++ float err_f = (float)(d->last_prediction_error / 1000); ++ float abs_err = (err_f >= 0.0f) ? 
err_f : -err_f; ++ ++ log_inputs[0] = float_max((float)ktime_to_ns(sleep_length), 1.0f); ++ log_inputs[1] = float_max((float)dev->last_residency_ns, 1.0f); ++ log_inputs[2] = float_max((float)busy_ns, 1.0f); ++ log_inputs[3] = abs_err + 1.0f; ++ ++ { ++ v4sf log_in = *(const v4sf *)log_inputs; ++ v4sf log_out = fast_log2f_sse(log_in); ++ *(v4sf *)log_results = log_out; ++ } ++ ++ out[0] = log_results[0]; ++ out[1] = log_results[1]; ++ out[6] = log_results[2]; ++ ++ /* out[5]: sign-preserving log2(|err_us| + 1) */ ++ { ++ union { float f; u32 i; } res = { .f = log_results[3] }; ++ union { float f; u32 i; } sgn = { .f = err_f }; ++ ++ res.i |= sgn.i & 0x80000000u; ++ out[5] = res.f; ++ } ++ } ++ ++ /* Update log_history ring buffer */ ++ { ++ int prev = (d->hist_idx - 1 + NAP_HISTORY_SIZE) % NAP_HISTORY_SIZE; ++ d->log_history[prev] = log_results[1]; ++ } ++ ++ /* Compute log_history statistics: avg, min, max */ ++ logring_compute(d, &lr); ++ out[2] = lr.avg; ++ out[3] = lr.min; ++ out[4] = lr.max; ++ ++ /* out[7]: log2(latency_req) - log2(deepest_lat), 0 if unconstrained */ ++ { ++ u64 deepest_lat = drv->states[drv->state_count - 1] ++ .exit_latency_ns; ++ bool lat_valid = (latency_req < PM_QOS_LATENCY_ANY_NS && ++ deepest_lat > 0); ++ ++ if (lat_valid) ++ out[7] = fast_log2f(float_max((float)latency_req, 1.0f)) ++ - fast_log2f(float_max((float)deepest_lat, 1.0f)); ++ else ++ out[7] = 0.0f; ++ } ++ ++ d->last_predicted_ns = ktime_to_ns(sleep_length); ++} ++ ++/* ================================================================ ++ * FPU entry point for nap_select ++ * ++ * Called within kernel_fpu_begin()/kernel_fpu_end(). ++ * Returns: selected idle state index (>= 0), or -1 to fall back ++ * to the integer heuristic. 
++ * ================================================================ */ ++ ++int nap_fpu_select(struct cpuidle_driver *drv, ++ struct cpuidle_device *dev, ++ struct nap_cpu_data *d) ++{ ++ s64 latency_req = cpuidle_governor_latency_req(dev->cpu); ++ ++ /* Handle deferred weight reset (set by sysfs or nap_enable) */ ++ if (unlikely(d->reset_pending)) { ++ int e; ++ ++ for (e = 0; e < NAP_NUM_EXPERTS; e++) ++ nap_init_weights(&d->expert_weights[e]); ++ nap_init_log2_cost(d, drv); ++ d->stats.learn_count = 0; ++ d->needs_learn = false; ++ d->reset_pending = false; ++ } ++ ++ /* Deferred learning (always, even during warmup) */ ++ if (d->needs_learn) { ++ float log2_eff = d->nn_output; ++ float alpha = (float)d->overshoot_pctl_millths ++ / 1000.0f; ++ int nn_selected = 0; ++ bool is_overshoot; ++ int i; ++ ++ /* Simulate which state the NN selected */ ++ for (i = drv->state_count - 1; i > 0; i--) { ++ if (d->log2_cost[i] <= log2_eff) { ++ nn_selected = i; ++ break; ++ } ++ } ++ ++ /* ++ * Direct overshoot loss. ++ * ++ * Base the gradient on whether the simulated state ++ * selection actually caused overshoot ++ * (actual < target_residency). ++ * ++ * The asymmetric weight is encoded in the learning ++ * rate (not in d_out) so that gradient clamping ++ * cannot destroy the asymmetry. d_out is ±1 and ++ * gets clipped symmetrically; the (1-α) vs α ratio ++ * is preserved through learn_lr. ++ * ++ * At equilibrium, P(overshoot) converges to α. ++ * α = overshoot_pctl / 1000. ++ */ ++ { ++ float base_lr = (float)d->learning_rate_millths ++ / 1000.0f; ++ ++ is_overshoot = (nn_selected > 0 && ++ d->learn_actual_ns < ++ drv->states[nn_selected].target_residency_ns); ++ ++ /* ++ * When the output was clamped at the upper ++ * limit (nn_output == features[0]), the NN ++ * is already predicting the maximum possible ++ * sleep time. Non-overshoot events would ++ * push weights UP, but the output cannot ++ * actually increase. 
Suppress this gradient ++ * to prevent unbounded weight growth in idle ++ * systems where natural overshoot rate < α. ++ * ++ * Overshoot events still learn normally ++ * (push DOWN) even when clamped. ++ */ ++ if (d->output_clamped && !is_overshoot) { ++ d->learn_lr = 0; ++ d->learn_d_out = 0; ++ } else { ++ d->learn_d_out = is_overshoot ++ ? 1.0f : -1.0f; ++ d->learn_lr = is_overshoot ++ ? base_lr * (1.0f - alpha) ++ : base_lr * alpha; ++ } ++ } ++ ++ d->stats.learn_count++; ++ ++ nap_nn_learn(d); ++ d->needs_learn = false; ++ } ++ ++ /* ++ * Feature extraction + NN forward pass. ++ * features_f32 is __aligned(64) in nap_cpu_data, satisfying ++ * AVX-512 vmovaps requirements. ++ */ ++ nap_extract_features(drv, dev, d->features_f32, latency_req); ++ ++ /* MoE: 3-way expert selection based on log2(sleep_length) */ ++ if (d->features_f32[0] >= d->expert_deep) ++ d->active_expert = 2; /* deep: deepest C-state */ ++ else if (d->features_f32[0] >= d->expert_mid) ++ d->active_expert = 1; /* long: nohz intermediate */ ++ else ++ d->active_expert = 0; /* short: tick-bound */ ++ d->active_w = &d->expert_weights[d->active_expert]; ++ ++ nap_nn_forward(d->features_f32, &d->nn_output, d->hidden_out, ++ d->active_w); ++ ++ /* ++ * Clamp NN output: predicted sleep cannot exceed sleep_length ++ * (next timer event). features_f32[0] = log2(sleep_length). ++ * ++ * Track whether the clamp was applied so the learning block ++ * can suppress "push up" gradients when the output is already ++ * at the maximum. Without this, weights diverge unboundedly ++ * in idle systems where the natural overshoot rate < alpha. ++ */ ++ d->output_clamped = (d->nn_output > d->features_f32[0]); ++ if (d->output_clamped) ++ d->nn_output = d->features_f32[0]; ++ ++ /* ++ * Threshold-based selection using NN predicted sleep time. ++ * ++ * The NN directly outputs log2(predicted_sleep) in ns. ++ * Select the deepest feasible state whose cost ≤ predicted_sleep. 
++ */ ++ { ++ float log2_eff = d->nn_output; ++ int idx = 0, i; ++ ++ for (i = drv->state_count - 1; i > 0; i--) { ++ if (dev->states_usage[i].disable) ++ continue; ++ if (drv->states[i].exit_latency_ns > latency_req) ++ continue; ++ if (d->log2_cost[i] <= log2_eff) { ++ idx = i; ++ break; ++ } ++ } ++ return idx; ++ } ++} +diff --git a/drivers/cpuidle/governors/nap/nap_nn_avx2.c b/drivers/cpuidle/governors/nap/nap_nn_avx2.c +new file mode 100644 +index 0000000000..96e5415423 +--- /dev/null ++++ b/drivers/cpuidle/governors/nap/nap_nn_avx2.c +@@ -0,0 +1,135 @@ ++// SPDX-License-Identifier: GPL-2.0 ++/* ++ * nap_nn_avx2.c — AVX2+FMA forward pass and backpropagation for the nap MLP ++ * ++ * 8→8→1 scalar regression (log2 correction factor). ++ * Uses 256-bit ymm registers: 8 hidden neurons = 1 ymm. ++ * FMA via vfmadd231ps for fused multiply-add. ++ * ++ * Must be called within kernel_fpu_begin/end. ++ * Compiled with: CFLAGS += -mavx2 -mfma ++ */ ++ ++#include "nap.h" ++ ++/* Aligned load/store: GCC translates v8sf* dereference to vmovaps */ ++static inline v8sf v8sf_load(const float *p) { return *(const v8sf *)p; } ++static inline void v8sf_store(float *p, v8sf v) { *(v8sf *)p = v; } ++ ++/* FMA: a*b+c — vfmadd231ps: dest = src1 * src2 + dest */ ++static inline v8sf v8sf_fmadd(v8sf a, v8sf b, v8sf c) ++{ ++ asm("vfmadd231ps %2, %1, %0" : "+x"(c) : "x"(a), "xm"(b)); ++ return c; ++} ++ ++/* ymm clamp: max(min(v, hi), lo) */ ++static inline v8sf v8sf_clamp(v8sf v, v8sf lo, v8sf hi) ++{ ++ return __builtin_ia32_maxps256(__builtin_ia32_minps256(v, hi), lo); ++} ++ ++void nap_nn_forward_avx2(const float *input, ++ float *output, ++ float *hidden_save, ++ const struct nap_weights *w) ++{ ++ int j; ++ ++ /* === Hidden layer: 8 outputs = 1×ymm, 2-way accumulator === */ ++ v8sf acc0 = v8sf_load(&w->b_h1[0]); ++ v8sf acc1 = V8SF_ZERO; ++ ++ for (j = 0; j < NAP_INPUT_SIZE; j += 2) { ++ v8sf x0 = V8SF_SET1(input[j]); ++ v8sf x1 = V8SF_SET1(input[j + 1]); ++ ++ acc0 = 
v8sf_fmadd(v8sf_load(&w->w_h1[j][0]), x0, acc0); ++ acc1 = v8sf_fmadd(v8sf_load(&w->w_h1[j + 1][0]), x1, acc1); ++ } ++ ++ /* Merge accumulators + ReLU */ ++ { ++ v8sf h = __builtin_ia32_maxps256(acc0 + acc1, V8SF_ZERO); ++ ++ v8sf_store(hidden_save, h); ++ ++ /* === Output layer: dot(hidden[8], w_out[8]) + b_out === */ ++ { ++ v8sf p = v8sf_load(&w->w_out[0]) * h; ++ ++ /* Horizontal reduce: 8 → 4 → scalar */ ++ v4sf lo = __builtin_ia32_vextractf128_ps256(p, 0); ++ v4sf hi = __builtin_ia32_vextractf128_ps256(p, 1); ++ v4sf s4 = lo + hi; ++ ++ *output = s4[0] + s4[1] + s4[2] + s4[3] + w->b_out; ++ } ++ } ++} ++ ++/* ++ * Online learning (backpropagation) — AVX2+FMA ++ * ++ * Output: scalar d_out (pre-computed by caller) ++ * Hidden layer: 8 neurons = 1×ymm ++ */ ++void nap_nn_learn_avx2(struct nap_cpu_data *d) ++{ ++ int i; ++ float d_out_scalar = d->learn_d_out; ++ float *d_hid = d->learn_d_hid; ++ float lr = d->learn_lr; ++ float clamp_val = (float)d->max_grad_norm_millths / 1000.0f; ++ v8sf v_neg_lr = V8SF_SET1(-lr); ++ v8sf v_cl_hi = V8SF_SET1(clamp_val); ++ v8sf v_cl_lo = V8SF_SET1(-clamp_val); ++ ++ /* ++ * Hidden gradient: d_hid[j] = relu'(h[j]) * w_out[j] * d_out. ++ * vcmpps + vandps: branchless SIMD mask (1×ymm = 8 neurons). 
++ */ ++ v8sf dh; ++ { ++ v8sf vd = V8SF_SET1(d_out_scalar); ++ v8sf g = v8sf_load(&d->active_w->w_out[0]) * vd; ++ v8sf mask = __builtin_ia32_cmpps256( ++ v8sf_load(&d->hidden_out[0]), V8SF_ZERO, 14); ++ ++ asm("vandps %2, %1, %0" : "=x"(dh) : "x"(g), "xm"(mask)); ++ v8sf_store(d_hid, dh); ++ } ++ ++ /* Output weight update: w_out[j] -= lr * clamp(h[j] * d_out) */ ++ { ++ v8sf vd = V8SF_SET1(d_out_scalar); ++ v8sf *w = (v8sf *)&d->active_w->w_out[0]; ++ ++ *w = v8sf_fmadd(v_neg_lr, ++ v8sf_clamp(v8sf_load(&d->hidden_out[0]) * vd, ++ v_cl_lo, v_cl_hi), ++ *w); ++ } ++ ++ /* Output bias update (scalar) */ ++ d->active_w->b_out -= lr * fclampf(d_out_scalar, -clamp_val, clamp_val); ++ ++ /* Hidden weight update: w_h1[i][j] -= lr * clamp(feat[i] * d_hid[j]) */ ++ for (i = 0; i < NAP_INPUT_SIZE; i++) { ++ v8sf vf = V8SF_SET1(d->features_f32[i]); ++ v8sf *w = (v8sf *)&d->active_w->w_h1[i][0]; ++ ++ *w = v8sf_fmadd(v_neg_lr, ++ v8sf_clamp(vf * dh, v_cl_lo, v_cl_hi), ++ *w); ++ } ++ ++ /* Hidden bias update */ ++ { ++ v8sf *b = (v8sf *)&d->active_w->b_h1[0]; ++ ++ *b = v8sf_fmadd(v_neg_lr, ++ v8sf_clamp(dh, v_cl_lo, v_cl_hi), ++ *b); ++ } ++} +diff --git a/drivers/cpuidle/governors/nap/nap_nn_sse2.c b/drivers/cpuidle/governors/nap/nap_nn_sse2.c +new file mode 100644 +index 0000000000..a9fffb3b98 +--- /dev/null ++++ b/drivers/cpuidle/governors/nap/nap_nn_sse2.c +@@ -0,0 +1,136 @@ ++// SPDX-License-Identifier: GPL-2.0 ++/* ++ * nap_nn_sse2.c — SSE2 forward pass and backpropagation for the nap MLP ++ * ++ * 8→8→1 scalar regression (log2 correction factor). ++ * Baseline implementation using SSE2, which is always available on x86_64. ++ * No FMA — uses separate mul + add (2 instructions per MAC). ++ * ++ * Must be called within kernel_fpu_begin/end. 
++ * Compiled with: CFLAGS += -msse2 ++ */ ++ ++#include "nap.h" ++ ++/* Aligned load/store */ ++static inline v4sf v4sf_load(const float *p) { return *(const v4sf *)p; } ++static inline void v4sf_store(float *p, v4sf v) { *(v4sf *)p = v; } ++ ++/* ReLU helper */ ++static inline v4sf v4sf_max(v4sf a, v4sf b) ++{ ++ return __builtin_ia32_maxps(a, b); ++} ++ ++void nap_nn_forward_sse2(const float *input, ++ float *output, ++ float *hidden_save, ++ const struct nap_weights *w) ++{ ++ int j; ++ ++ /* === Hidden layer: 8 outputs = 2×xmm === */ ++ v4sf acc0 = v4sf_load(&w->b_h1[0]); ++ v4sf acc1 = v4sf_load(&w->b_h1[4]); ++ ++ for (j = 0; j < NAP_INPUT_SIZE; j++) { ++ v4sf x = V4SF_SET1(input[j]); ++ acc0 += v4sf_load(&w->w_h1[j][0]) * x; ++ acc1 += v4sf_load(&w->w_h1[j][4]) * x; ++ } ++ ++ /* ReLU */ ++ { ++ v4sf zero = V4SF_SET1(0.0f); ++ ++ acc0 = v4sf_max(acc0, zero); ++ acc1 = v4sf_max(acc1, zero); ++ } ++ v4sf_store(&hidden_save[0], acc0); ++ v4sf_store(&hidden_save[4], acc1); ++ ++ /* === Output layer: dot(hidden[8], w_out[8]) + b_out → 1 scalar === */ ++ { ++ v4sf p0 = v4sf_load(&w->w_out[0]) * acc0; ++ v4sf p1 = v4sf_load(&w->w_out[4]) * acc1; ++ v4sf sum = p0 + p1; ++ ++ *output = sum[0] + sum[1] + sum[2] + sum[3] + w->b_out; ++ } ++} ++ ++/* ++ * Online learning (backpropagation) — SSE2 ++ * ++ * Output: scalar d_out (pre-computed by caller) ++ * Hidden layer: 8 neurons = 2×xmm ++ */ ++void nap_nn_learn_sse2(struct nap_cpu_data *d) ++{ ++ int i; ++ float d_out_scalar = d->learn_d_out; ++ float *d_hid = d->learn_d_hid; ++ float lr = d->learn_lr; ++ float clamp_val = (float)d->max_grad_norm_millths / 1000.0f; ++ v4sf v_lr = V4SF_SET1(lr); ++ v4sf v_cl_hi = V4SF_SET1(clamp_val); ++ v4sf v_cl_lo = V4SF_SET1(-clamp_val); ++ ++ /* ++ * Hidden gradient: d_hid[j] = relu'(h[j]) * w_out[j] * d_out. ++ * Must be computed before output weight update to use pre-update ++ * w_out. 
++ */ ++ { ++ v4sf vd = V4SF_SET1(d_out_scalar); ++ v4sf zero = V4SF_SET1(0.0f); ++ v4sf h, g; ++ v4si m; ++ ++ h = v4sf_load(&d->hidden_out[0]); ++ g = v4sf_load(&d->active_w->w_out[0]) * vd; ++ m = (v4si)(h > zero); ++ v4sf_store(&d_hid[0], v4si_as_v4sf(v4sf_as_v4si(g) & m)); ++ ++ h = v4sf_load(&d->hidden_out[4]); ++ g = v4sf_load(&d->active_w->w_out[4]) * vd; ++ m = (v4si)(h > zero); ++ v4sf_store(&d_hid[4], v4si_as_v4sf(v4sf_as_v4si(g) & m)); ++ } ++ ++ /* Output weight update: w_out[j] -= lr * clamp(h[j] * d_out) */ ++ { ++ v4sf vd = V4SF_SET1(d_out_scalar); ++ v4sf *w = (v4sf *)&d->active_w->w_out[0]; ++ ++ w[0] -= v_lr * v4sf_clamp(v4sf_load(&d->hidden_out[0]) * vd, ++ v_cl_lo, v_cl_hi); ++ w[1] -= v_lr * v4sf_clamp(v4sf_load(&d->hidden_out[4]) * vd, ++ v_cl_lo, v_cl_hi); ++ } ++ ++ /* Output bias update: b_out -= lr * clamp(d_out) */ ++ d->active_w->b_out -= lr * fclampf(d_out_scalar, -clamp_val, clamp_val); ++ ++ /* Hidden weight update: w_h1[i][j] -= lr * clamp(feat[i] * d_hid[j]) */ ++ { ++ v4sf dh0 = *(const v4sf *)&d_hid[0]; ++ v4sf dh1 = *(const v4sf *)&d_hid[4]; ++ ++ for (i = 0; i < NAP_INPUT_SIZE; i++) { ++ v4sf vf = V4SF_SET1(d->features_f32[i]); ++ v4sf *w = (v4sf *)&d->active_w->w_h1[i][0]; ++ ++ w[0] -= v_lr * v4sf_clamp(vf * dh0, v_cl_lo, v_cl_hi); ++ w[1] -= v_lr * v4sf_clamp(vf * dh1, v_cl_lo, v_cl_hi); ++ } ++ ++ /* Hidden bias update: b_h1[j] -= lr * clamp(d_hid[j]) */ ++ { ++ v4sf *b = (v4sf *)&d->active_w->b_h1[0]; ++ ++ b[0] -= v_lr * v4sf_clamp(dh0, v_cl_lo, v_cl_hi); ++ b[1] -= v_lr * v4sf_clamp(dh1, v_cl_lo, v_cl_hi); ++ } ++ } ++} +-- +2.34.1 diff --git a/PKGBUILD b/PKGBUILD index 943b69b..9017452 100755 --- a/PKGBUILD +++ b/PKGBUILD @@ -94,6 +94,7 @@ source=( "git+https://github.com/forkymcforkface/xpad-noone.git#commit=8e903676dd9514c07ce5e06e43c5f7d8cc51cb7d" "git+https://github.com/atar-axis/xpadneo.git#tag=v$_xpadneo_version" 6.16-poc-selector-v2.6.1.patch + 6.16-nap-v0.4.0.patch ) 
sha256sums=('a69eea3b189ab64e65608140d6cd7c57823d1b39b361e876197eec1b4d1db957' '37452b4d09e5e42134ae24a61f2f656790837c327268074cf79d7dab3558b972' @@ -141,7 +142,8 @@ sha256sums=('a69eea3b189ab64e65608140d6cd7c57823d1b39b361e876197eec1b4d1db957' '7ba61ccf2ddb508d6adb30906d3d57dc0ce1bc64a6d1a41796eb94a8584ea63b' '1055bbbd32985017f4501d375648873bd598db084177d302aeeade56b47920e1' '26b3a811d38471a42229fa037cb6d2bb5ff78f19f45a17c7f263339ee67769a7' - '14dabfb0452a3a817e8d809fb28eb7565512e95386d789c627b62baf136e001f') + '14dabfb0452a3a817e8d809fb28eb7565512e95386d789c627b62baf136e001f' + 'e3a353432be799ba938f6cb2495f07e531ba456818500008f09bf6b6a8632862') export KBUILD_BUILD_HOST=archlinux export KBUILD_BUILD_USER=$pkgbase From 8ca236f29872207953684b524d82dc82718eb190 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jorge=20Luis=20Silv=C3=A9rio=20de=20Lima?= Date: Fri, 15 May 2026 17:50:08 -0300 Subject: [PATCH 3/5] Enable high-performance idle CPU selector --- PKGBUILD | 2 +- config-charcoal | 5 ++++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/PKGBUILD b/PKGBUILD index 9017452..e6a7ba5 100755 --- a/PKGBUILD +++ b/PKGBUILD @@ -99,7 +99,7 @@ source=( sha256sums=('a69eea3b189ab64e65608140d6cd7c57823d1b39b361e876197eec1b4d1db957' '37452b4d09e5e42134ae24a61f2f656790837c327268074cf79d7dab3558b972' 'd88eaf0f94bae470040e4882f334c05b1bb2ab0a99e4b7299aa0b2337810ab8d' - 'fd18272b72f7226a9b00bf676ed74b961c666402d0fdea0846aa50ff7a8f3758' + '5e04417ff3a3416b64ff26112825842ccc6ad353a2f86d27a6412a5b7fda30d1' 'b831de1b98a2f77f636f4780e37ebfcb3a6829f94f5423eb04c4b26e64ac43b8' '52cbbf41450806d766260bc4f1ea055f6f9fdd55d37ad831840b16d505beb0cc' '0a6a7408ccc0c94b5cce50dabc7ee318abcc1b9eaaedd3d83fd7e7d5a73b4d4f' diff --git a/config-charcoal b/config-charcoal index 2e16643..9542d7d 100755 --- a/config-charcoal +++ b/config-charcoal @@ -31,9 +31,12 @@ CONFIG_LTO_CLANG_FULL=y # Enable Polly CONFIG_POLLY_CLANG=y -# Change CPU IDLE goernor +# Change CPU IDLE governor 
CONFIG_CPU_IDLE_GOV_LADDER=n CONFIG_CPU_IDLE_GOV_MENU=n +CONFIG_CPU_IDLE_GOV_NAP=y +CONFIG_CPU_IDLE_DEFAULT_GOVERNOR="nap" +CONFIG_SCHED_POC_SELECTOR=y # Enable Zen kernel tuning CONFIG_ZEN_INTERACTIVE=y From 8e5af07c97dbbc2afea35bfc102829e0e0cb57a5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jorge=20Luis=20Silv=C3=A9rio=20de=20Lima?= Date: Wed, 13 May 2026 02:09:08 -0300 Subject: [PATCH 4/5] Add poc link to README --- README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.md b/README.md index 19a9199..4418a65 100644 --- a/README.md +++ b/README.md @@ -13,6 +13,8 @@ Works on Steam Deck and possibly other AMD based handheld PCs. - Add [re-swappiness](https://github.com/firelzrd/re-swappiness) - Add [zram-ir](https://github.com/firelzrd/zram-ir) - Add [kcompressd-unofficial](https://github.com/firelzrd/kcompressd-unofficial) + +- Add [poc selectors](https://github.com/firelzrd/poc-selector) - Switch default DRM scheduling policy to round-robin - Optimize kernel with -O3 (from tkg) - Optimize for Zen 2 (from Gentoo) From 4bfb080e38336591231d5db5a02a0c923701a922 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jorge=20Luis=20Silv=C3=A9rio=20de=20Lima?= Date: Fri, 15 May 2026 14:10:22 -0300 Subject: [PATCH 5/5] Add nap link to README --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 4418a65..21f9b3e 100644 --- a/README.md +++ b/README.md @@ -13,8 +13,8 @@ Works on Steam Deck and possibly other AMD based handheld PCs. - Add [re-swappiness](https://github.com/firelzrd/re-swappiness) - Add [zram-ir](https://github.com/firelzrd/zram-ir) - Add [kcompressd-unofficial](https://github.com/firelzrd/kcompressd-unofficial) - - Add [poc selectors](https://github.com/firelzrd/poc-selector) +- Add [nap](https://github.com/firelzrd/nap) - Switch default DRM scheduling policy to round-robin - Optimize kernel with -O3 (from tkg) - Optimize for Zen 2 (from Gentoo)