From a869cffd0b010f0976c416a91a0f61b26ed7107a Mon Sep 17 00:00:00 2001
From: Steve Sistare <steven.sistare@oracle.com>
Date: Fri, 20 Mar 2026 05:59:12 +0000
Subject: [PATCH 01/10] sched: Provide sparsemask, a reduced contention bitmap

Provide struct sparsemask and functions to manipulate it.  A sparsemask is
a sparse bitmap.  It reduces cache contention vs the usual bitmap when many
threads concurrently set, clear, and visit elements, by reducing the number
of significant bits per cacheline.  For each cacheline chunk of the mask,
only the first K bits of the first word are used, and the remaining bits
are ignored, where K is a creation time parameter.  Thus a sparsemask that
can represent a set of N elements is approximately (N/K * CACHELINE) bytes
in size.

This type is simpler and more efficient than the struct sbitmap used by
block drivers.

Signed-off-by: Steve Sistare <steven.sistare@oracle.com>
Signed-off-by: Chen Jinghuang <chenjinghuang2@huawei.com>
Signed-off-by: WangYuli <wangyl5933@chinaunicom.cn>
---
 kernel/sched/sparsemask.h | 210 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 210 insertions(+)
 create mode 100644 kernel/sched/sparsemask.h

diff --git a/kernel/sched/sparsemask.h b/kernel/sched/sparsemask.h
new file mode 100644
index 000000000000..11948620a1a2
--- /dev/null
+++ b/kernel/sched/sparsemask.h
@@ -0,0 +1,210 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * sparsemask.h - sparse bitmap operations
+ *
+ * Copyright (c) 2018 Oracle Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+
+#ifndef __LINUX_SPARSEMASK_H
+#define __LINUX_SPARSEMASK_H
+
+#include <linux/kernel.h>
+#include <linux/bitmap.h>
+#include <linux/bug.h>
+
+/*
+ * A sparsemask is a sparse bitmap.  It reduces cache contention vs the usual
+ * bitmap when many threads concurrently set, clear, and visit elements.  For
+ * each cacheline chunk of the mask, only the first K bits of the first word are
+ * used, and the remaining bits are ignored, where K is a creation time
+ * parameter.  Thus a sparsemask that can represent a set of N elements is
+ * approximately (N/K * CACHELINE) bytes in size.
+ *
+ * Clients pass and receive element numbers in the public API, and the
+ * implementation translates them to bit numbers to perform the bitmap
+ * operations.
+ */
+
+struct sparsemask_chunk {
+	unsigned long word;	/* the significant bits */
+} ____cacheline_aligned_in_smp;
+
+struct sparsemask {
+	short nelems;		/* current number of elements */
+	short density;		/* store 2^density elements per chunk */
+	struct sparsemask_chunk chunks[];  /* flexible array of chunks */
+};
+
+#define _SMASK_INDEX(density, elem)	((elem) >> (density))
+#define _SMASK_BIT(density, elem)	((elem) & ((1U << (density)) - 1U))
+#define SMASK_INDEX(mask, elem)		_SMASK_INDEX((mask)->density, elem)
+#define SMASK_BIT(mask, elem)		_SMASK_BIT((mask)->density, elem)
+#define SMASK_WORD(mask, elem)		\
+	(&(mask)->chunks[SMASK_INDEX((mask), (elem))].word)
+
+/*
+ * sparsemask_next() - Return the next one bit in a bitmap, starting at a
+ * specified position and wrapping from the last bit to the first, up to but
+ * not including a specified origin.  This is a helper, so do not call it
+ * directly.
+ *
+ * @mask: Bitmap to search.
+ * @origin: Origin.
+ * @prev: Previous bit. Start search after this bit number.
+ *	  If -1, start search at @origin.
+ *
+ * Return: the bit number, else mask->nelems if no bits are set in the range.
+ */
+static inline int
+sparsemask_next(const struct sparsemask *mask, int origin, int prev)
+{
+	int density = mask->density;
+	int bits_per_word = 1U << density;
+	const struct sparsemask_chunk *chunk;
+	int nelems = mask->nelems;
+	int next, bit, nbits;
+	unsigned long word;
+
+	/* Calculate number of bits to be searched. */
+	if (prev == -1) {
+		nbits = nelems;
+		next = origin;
+	} else if (prev < origin) {
+		nbits = origin - prev;
+		next = prev + 1;
+	} else {
+		nbits = nelems - prev + origin - 1;
+		next = prev + 1;
+	}
+
+	if (unlikely(next >= nelems))
+		return nelems;
+
+	/*
+	 * Fetch and adjust first word.  Clear word bits below @next, and round
+	 * @next down to @bits_per_word boundary because later ffs will add
+	 * those bits back.
+	 */
+	chunk = &mask->chunks[_SMASK_INDEX(density, next)];
+	bit = _SMASK_BIT(density, next);
+	word = chunk->word & (~0UL << bit);
+	next -= bit;
+	nbits += bit;
+
+	while (!word) {
+		next += bits_per_word;
+		nbits -= bits_per_word;
+		if (nbits <= 0)
+			return nelems;
+
+		if (next >= nelems) {
+			chunk = mask->chunks;
+			nbits -= (next - nelems);
+			next = 0;
+		} else {
+			chunk++;
+		}
+		word = chunk->word;
+	}
+
+	next += __ffs(word);
+	if (next >= origin && prev != -1)
+		return nelems;
+	return next;
+}
+
+/****************** The public API ********************/
+
+/*
+ * Max value for the density parameter, limited by 64 bits in the chunk word.
+ */
+#define SMASK_DENSITY_MAX		6
+
+/*
+ * Return bytes to allocate for a sparsemask, for custom allocators.
+ */
+static inline size_t sparsemask_size(int nelems, int density)
+{
+	int index = _SMASK_INDEX(density, nelems) + 1;
+
+	return offsetof(struct sparsemask, chunks[index]);
+}
+
+/*
+ * Initialize an allocated sparsemask, for custom allocators.
+ */
+static inline void
+sparsemask_init(struct sparsemask *mask, int nelems, int density)
+{
+	WARN_ON(density < 0 || density > SMASK_DENSITY_MAX || nelems < 0 || nelems > SHRT_MAX);
+	mask->nelems = nelems;
+	mask->density = density;
+}
+
+/*
+ * sparsemask_alloc_node() - Allocate, initialize, and return a sparsemask.
+ *
+ * @nelems - maximum number of elements.
+ * @density - store 2^density elements per cacheline chunk.
+ *	      values from 0 to SMASK_DENSITY_MAX inclusive.
+ * @flags - kmalloc allocation flags
+ * @node - numa node
+ */
+static inline struct sparsemask *
+sparsemask_alloc_node(int nelems, int density, gfp_t flags, int node)
+{
+	size_t nbytes = sparsemask_size(nelems, density);
+	struct sparsemask *mask = kmalloc_node(nbytes, flags, node);
+
+	if (mask)
+		sparsemask_init(mask, nelems, density);
+	return mask;
+}
+
+static inline void sparsemask_free(struct sparsemask *mask)
+{
+	kfree(mask);
+}
+
+static inline void sparsemask_set_elem(struct sparsemask *dst, int elem)
+{
+	set_bit(SMASK_BIT(dst, elem), SMASK_WORD(dst, elem));
+}
+
+static inline void sparsemask_clear_elem(struct sparsemask *dst, int elem)
+{
+	clear_bit(SMASK_BIT(dst, elem), SMASK_WORD(dst, elem));
+}
+
+static inline int sparsemask_test_elem(const struct sparsemask *mask, int elem)
+{
+	return test_bit(SMASK_BIT(mask, elem), SMASK_WORD(mask, elem));
+}
+
+/*
+ * sparsemask_for_each() - iterate over each set bit in a bitmap, starting at a
+ *   specified position, and wrapping from the last bit to the first.
+ *
+ * @mask: Bitmap to iterate over.
+ * @origin: Bit number at which to start searching.
+ * @elem: Iterator.  Can be signed or unsigned integer.
+ *
+ * The implementation does not assume any bit in @mask is set, including
+ * @origin.  After the loop, @elem = @mask->nelems.
+ */
+#define sparsemask_for_each(mask, origin, elem)				\
+	for ((elem) = -1;						\
+	     (elem) = sparsemask_next((mask), (origin), (elem)),	\
+		(elem) < (mask)->nelems;)
+
+#endif /* __LINUX_SPARSEMASK_H */

From e6f119d217786d44f59ca120abf357ee96eb8986 Mon Sep 17 00:00:00 2001
From: Steve Sistare <steven.sistare@oracle.com>
Date: Fri, 20 Mar 2026 05:59:13 +0000
Subject: [PATCH 02/10] sched/topology: Provide hooks to allocate data shared
 per LLC

Add functions sd_llc_alloc_all() and sd_llc_free_all() to allocate and
free data pointed to by struct sched_domain_shared at the last-level-cache
domain.  sd_llc_alloc_all() is called after the SD hierarchy is known, to
eliminate the unnecessary allocations that would occur if we instead
allocated in __sdt_alloc() and then figured out which shared nodes are
redundant.

Signed-off-by: Steve Sistare <steven.sistare@oracle.com>
Signed-off-by: Chen Jinghuang <chenjinghuang2@huawei.com>
Signed-off-by: WangYuli <wangyl5933@chinaunicom.cn>
---
 kernel/sched/topology.c | 75 ++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 74 insertions(+), 1 deletion(-)

diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index 48f671b07af7..b797c154eb40 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -21,6 +21,12 @@ void sched_domains_mutex_unlock(void)
 static cpumask_var_t sched_domains_tmpmask;
 static cpumask_var_t sched_domains_tmpmask2;
 
+struct s_data;
+static int sd_llc_alloc(struct sched_domain *sd);
+static void sd_llc_free(struct sched_domain *sd);
+static int sd_llc_alloc_all(const struct cpumask *cpu_map, struct s_data *d);
+static void sd_llc_free_all(const struct cpumask *cpu_map);
+
 static int __init sched_debug_setup(char *str)
 {
 	sched_debug_verbose = true;
@@ -625,8 +631,10 @@ static void destroy_sched_domain(struct sched_domain *sd)
 	 */
 	free_sched_groups(sd->groups, 1);
 
-	if (sd->shared && atomic_dec_and_test(&sd->shared->ref))
+	if (sd->shared && atomic_dec_and_test(&sd->shared->ref)) {
+		sd_llc_free(sd);
 		kfree(sd->shared);
+	}
 	kfree(sd);
 }
 
@@ -1541,6 +1549,7 @@ static void __free_domain_allocs(struct s_data *d, enum s_alloc what,
 		free_percpu(d->sd);
 		fallthrough;
 	case sa_sd_storage:
+		sd_llc_free_all(cpu_map);
 		__sdt_free(cpu_map);
 		fallthrough;
 	case sa_none:
@@ -2394,6 +2403,62 @@ static void __sdt_free(const struct cpumask *cpu_map)
 	}
 }
 
+static int sd_llc_alloc(struct sched_domain *sd)
+{
+	/* Allocate sd->shared data here. Empty for now. */
+
+	return 0;
+}
+
+static void sd_llc_free(struct sched_domain *sd)
+{
+	struct sched_domain_shared *sds = sd->shared;
+
+	if (!sds)
+		return;
+
+	/* Free data here. Empty for now. */
+}
+
+static int sd_llc_alloc_all(const struct cpumask *cpu_map, struct s_data *d)
+{
+	struct sched_domain *sd, *hsd;
+	int i;
+
+	for_each_cpu(i, cpu_map) {
+		/* Find highest domain that shares resources */
+		hsd = NULL;
+		for (sd = *per_cpu_ptr(d->sd, i); sd; sd = sd->parent) {
+			if (!(sd->flags & SD_SHARE_LLC))
+				break;
+			hsd = sd;
+		}
+		if (hsd && sd_llc_alloc(hsd))
+			return 1;
+	}
+
+	return 0;
+}
+
+static void sd_llc_free_all(const struct cpumask *cpu_map)
+{
+	struct sched_domain_topology_level *tl;
+	struct sched_domain *sd;
+	struct sd_data *sdd;
+	int j;
+
+	for_each_sd_topology(tl) {
+		sdd = &tl->data;
+		if (!sdd || !sdd->sd)
+			continue;
+		for_each_cpu(j, cpu_map) {
+			sd = *per_cpu_ptr(sdd->sd, j);
+			if (sd)
+				sd_llc_free(sd);
+		}
+	}
+}
+
 static struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl,
 		const struct cpumask *cpu_map, struct sched_domain_attr *attr,
 		struct sched_domain *child, int cpu)
@@ -2616,6 +2681,14 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
 		}
 	}
 
+	/*
+	 * Allocate shared sd data at last level cache.  Must be done after
+	 * domains are built above, but before the data is used in
+	 * cpu_attach_domain and descendants below.
+	 */
+	if (sd_llc_alloc_all(cpu_map, &d))
+		goto error;
+
 	/* Attach the domains */
 	rcu_read_lock();
 	for_each_cpu(i, cpu_map) {

From 35ee846bd12513b9c28c7ff357325376aeaa2a2c Mon Sep 17 00:00:00 2001
From: Steve Sistare <steve.sistare@oracle.com>
Date: Fri, 20 Mar 2026 05:59:14 +0000
Subject: [PATCH 03/10] sched/topology: Provide cfs_overload_cpus bitmap

Define and initialize a sparse bitmap of overloaded CPUs, per
last-level-cache scheduling domain, for use by the CFS scheduling class.
Save a pointer to cfs_overload_cpus in the rq for efficient access.

Signed-off-by: Steve Sistare <steve.sistare@oracle.com>
Signed-off-by: Chen Jinghuang <chenjinghuang2@huawei.com>
Signed-off-by: WangYuli <wangyl5933@chinaunicom.cn>
---
 include/linux/sched/topology.h |  1 +
 kernel/sched/sched.h           |  2 ++
 kernel/sched/topology.c        | 25 +++++++++++++++++++++++--
 3 files changed, 26 insertions(+), 2 deletions(-)

diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h
index 45c0022b91ce..472c3dcf5a34 100644
--- a/include/linux/sched/topology.h
+++ b/include/linux/sched/topology.h
@@ -67,6 +67,7 @@ struct sched_domain_shared {
 	atomic_t	ref;
 	atomic_t	nr_busy_cpus;
 	int		has_idle_cores;
+	struct sparsemask *cfs_overload_cpus;
 	int		nr_idle_scan;
 };
 
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index e87064bc0c32..1d507d3da2d3 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -82,6 +82,7 @@ struct cfs_rq;
 struct rt_rq;
 struct sched_group;
 struct cpuidle_state;
+struct sparsemask;
 
 #ifdef CONFIG_PARAVIRT
 # include <asm/paravirt.h>
@@ -1172,6 +1173,7 @@ struct rq {
 	struct cfs_rq		cfs;
 	struct rt_rq		rt;
 	struct dl_rq		dl;
+	struct sparsemask	*cfs_overload_cpus;
 #ifdef CONFIG_SCHED_CLASS_EXT
 	struct scx_rq		scx;
 #endif
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index b797c154eb40..f0cfa782cbb1 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -6,6 +6,7 @@
 #include <linux/sched/isolation.h>
 #include <linux/bsearch.h>
 #include "sched.h"
+#include "sparsemask.h"
 
 DEFINE_MUTEX(sched_domains_mutex);
 void sched_domains_mutex_lock(void)
@@ -678,7 +679,9 @@ DEFINE_STATIC_KEY_FALSE(sched_cluster_active);
 
 static void update_top_cache_domain(int cpu)
 {
+	struct sparsemask *cfs_overload_cpus = NULL;
 	struct sched_domain_shared *sds = NULL;
+	struct rq *rq = cpu_rq(cpu);
 	struct sched_domain *sd;
 	int id = cpu;
 	int size = 1;
@@ -688,8 +691,10 @@ static void update_top_cache_domain(int cpu)
 		id = cpumask_first(sched_domain_span(sd));
 		size = cpumask_weight(sched_domain_span(sd));
 		sds = sd->shared;
+		cfs_overload_cpus = sds->cfs_overload_cpus;
 	}
 
+	rcu_assign_pointer(rq->cfs_overload_cpus, cfs_overload_cpus);
 	rcu_assign_pointer(per_cpu(sd_llc, cpu), sd);
 	per_cpu(sd_llc_size, cpu) = size;
 	per_cpu(sd_llc_id, cpu) = id;
@@ -2405,7 +2410,22 @@ static void __sdt_free(const struct cpumask *cpu_map)
 
 static int sd_llc_alloc(struct sched_domain *sd)
 {
-	/* Allocate sd->shared data here. Empty for now. */
+	struct sched_domain_shared *sds = sd->shared;
+	struct cpumask *span = sched_domain_span(sd);
+	int nid = cpu_to_node(cpumask_first(span));
+	gfp_t flags = __GFP_ZERO | GFP_KERNEL;
+	struct sparsemask *mask;
+
+	/*
+	 * Allocate the bitmap if not already allocated.  This is called for
+	 * every CPU in the LLC but only allocates once per sd_llc_shared.
+	 */
+	if (!sds->cfs_overload_cpus) {
+		mask = sparsemask_alloc_node(nr_cpu_ids, 3, flags, nid);
+		if (!mask)
+			return 1;
+		sds->cfs_overload_cpus = mask;
+	}
 
 	return 0;
 }
@@ -2417,7 +2437,8 @@ static void sd_llc_free(struct sched_domain *sd)
 	if (!sds)
 		return;
 
-	/* Free data here. Empty for now. */
+	sparsemask_free(sds->cfs_overload_cpus);
+	sds->cfs_overload_cpus = NULL;
 }
 
 static int sd_llc_alloc_all(const struct cpumask *cpu_map, struct s_data *d)

From 7c0847ae13fff90762d1b8e62c43edee5c7ac3a3 Mon Sep 17 00:00:00 2001
From: Steve Sistare <steven.sistare@oracle.com>
Date: Fri, 20 Mar 2026 05:59:15 +0000
Subject: [PATCH 04/10] sched/fair: Dynamically update cfs_overload_cpus

An overloaded CPU has more than 1 runnable task.  When a CFS task wakes
on a CPU, if h_nr_runnable transitions from 1 to more, then set the CPU in
the cfs_overload_cpus bitmap.  When a CFS task sleeps, if h_nr_runnable
transitions from 2 to less, then clear the CPU in cfs_overload_cpus.

Signed-off-by: Steve Sistare <steven.sistare@oracle.com>
Signed-off-by: Chen Jinghuang <chenjinghuang2@huawei.com>
Signed-off-by: WangYuli <wangyl5933@chinaunicom.cn>
---
 kernel/sched/fair.c | 45 ++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 44 insertions(+), 1 deletion(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 241505fda915..3874e197213a 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -55,6 +55,7 @@
 #include <uapi/linux/sched/types.h>
 
 #include "sched.h"
+#include "sparsemask.h"
 #include "stats.h"
 #include "autogroup.h"
 
@@ -5217,6 +5218,33 @@ static inline void update_misfit_status(struct task_struct *p, struct rq *rq)
 	rq->misfit_task_load = max_t(unsigned long, task_h_load(p), 1);
 }
 
+#ifdef CONFIG_SMP
+static void overload_clear(struct rq *rq)
+{
+	struct sparsemask *overload_cpus;
+
+	rcu_read_lock();
+	overload_cpus = rcu_dereference(rq->cfs_overload_cpus);
+	if (overload_cpus)
+		sparsemask_clear_elem(overload_cpus, rq->cpu);
+	rcu_read_unlock();
+}
+
+static void overload_set(struct rq *rq)
+{
+	struct sparsemask *overload_cpus;
+
+	rcu_read_lock();
+	overload_cpus = rcu_dereference(rq->cfs_overload_cpus);
+	if (overload_cpus)
+		sparsemask_set_elem(overload_cpus, rq->cpu);
+	rcu_read_unlock();
+}
+#else /* CONFIG_SMP */
+static inline void overload_clear(struct rq *rq) {}
+static inline void overload_set(struct rq *rq) {}
+#endif
+
 void __setparam_fair(struct task_struct *p, const struct sched_attr *attr)
 {
 	struct sched_entity *se = &p->se;
@@ -6129,6 +6157,7 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq)
 	if (!dequeue)
 		return false;  /* Throttle no longer required. */
 
+
 	/* freeze hierarchy runnable averages while throttled */
 	rcu_read_lock();
 	walk_tg_tree_from(cfs_rq->tg, tg_throttle_down, tg_nop, (void *)rq);
@@ -7057,6 +7086,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 	int h_nr_idle = task_has_idle_policy(p);
 	int h_nr_runnable = 1;
 	int task_new = !(flags & ENQUEUE_WAKEUP);
+	unsigned int prev_nr = rq->cfs.h_nr_runnable;
 	int rq_h_nr_queued = rq->cfs.h_nr_queued;
 	u64 slice = 0;
 
@@ -7074,6 +7104,10 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 
 	if (flags & ENQUEUE_DELAYED) {
 		requeue_delayed_entity(se, flags);
+
+		if (prev_nr <= 1 && rq->cfs.h_nr_runnable >= 2)
+			overload_set(rq);
+
 		return;
 	}
 
@@ -7147,6 +7181,8 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 
 	/* At this point se is NULL and we are at root level*/
 	add_nr_running(rq, 1);
+	if (prev_nr <= 1 && rq->cfs.h_nr_runnable >= 2)
+		overload_set(rq);
 
 	/*
 	 * Since new tasks are assigned an initial util_avg equal to
@@ -7191,6 +7227,7 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags)
 	int h_nr_idle = 0;
 	int h_nr_queued = 0;
 	int h_nr_runnable = 0;
+	unsigned int prev_nr = rq->cfs.h_nr_runnable;
 	struct cfs_rq *cfs_rq;
 	u64 slice = 0;
 
@@ -7206,8 +7243,12 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags)
 		cfs_rq = cfs_rq_of(se);
 
 		if (!dequeue_entity(cfs_rq, se, flags)) {
-			if (p && &p->se == se)
+			if (p && &p->se == se) {
+				if (prev_nr >= 2 && rq->cfs.h_nr_runnable <= 1)
+					overload_clear(rq);
+
 				return -1;
+			}
 
 			slice = cfs_rq_min_slice(cfs_rq);
 			break;
@@ -7265,6 +7306,8 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags)
 	}
 
 	sub_nr_running(rq, h_nr_queued);
+	if (prev_nr >= 2 && rq->cfs.h_nr_runnable <= 1)
+		overload_clear(rq);
 
 	/* balance early to pull high priority tasks */
 	if (unlikely(!was_sched_idle && sched_idle_rq(rq)))

From e3a9c6bde88332a1ead0cf144eaef0f555c17536 Mon Sep 17 00:00:00 2001
From: Steve Sistare <steven.sistare@oracle.com>
Date: Fri, 20 Mar 2026 05:59:16 +0000
Subject: [PATCH 05/10] sched/fair: Hoist idle_stamp up from idle_balance

Move the update of idle_stamp from idle_balance to the call site in
pick_next_task_fair, to prepare for a future patch that adds work to
pick_next_task_fair which must be included in the idle_stamp interval.
No functional change.

Signed-off-by: Steve Sistare <steven.sistare@oracle.com>
Signed-off-by: Chen Jinghuang <chenjinghuang2@huawei.com>
Signed-off-by: WangYuli <wangyl5933@chinaunicom.cn>
---
 kernel/sched/fair.c | 32 ++++++++++++++++++++++----------
 1 file changed, 22 insertions(+), 10 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 3874e197213a..8fafb4cedf20 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5219,6 +5219,16 @@ static inline void update_misfit_status(struct task_struct *p, struct rq *rq)
 }
 
 #ifdef CONFIG_SMP
+static inline void rq_idle_stamp_update(struct rq *rq)
+{
+	rq->idle_stamp = rq_clock(rq);
+}
+
+static inline void rq_idle_stamp_clear(struct rq *rq)
+{
+	rq->idle_stamp = 0;
+}
+
 static void overload_clear(struct rq *rq)
 {
 	struct sparsemask *overload_cpus;
@@ -5241,6 +5251,8 @@ static void overload_set(struct rq *rq)
 	rcu_read_unlock();
 }
 #else /* CONFIG_SMP */
+static inline void rq_idle_stamp_update(struct rq *rq) {}
+static inline void rq_idle_stamp_clear(struct rq *rq) {}
 static inline void overload_clear(struct rq *rq) {}
 static inline void overload_set(struct rq *rq) {}
 #endif
@@ -9115,8 +9127,17 @@ pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf
 
 idle:
 	if (rf) {
+		/*
+		 * We must set idle_stamp _before_ calling idle_balance(), such that we
+		 * measure the duration of idle_balance() as idle time.
+		 */
+		rq_idle_stamp_update(rq);
+
 		new_tasks = sched_balance_newidle(rq, rf);
 
+		if (new_tasks)
+			rq_idle_stamp_clear(rq);
+
 		/*
 		 * Because sched_balance_newidle() releases (and re-acquires)
 		 * rq->lock, it is possible for any higher priority task to
@@ -13017,13 +13038,6 @@ static int sched_balance_newidle(struct rq *this_rq, struct rq_flags *rf)
 	if (this_rq->ttwu_pending)
 		return 0;
 
-	/*
-	 * We must set idle_stamp _before_ calling sched_balance_rq()
-	 * for CPU_NEWLY_IDLE, such that we measure the this duration
-	 * as idle time.
-	 */
-	this_rq->idle_stamp = rq_clock(this_rq);
-
 	/*
 	 * Do not pull tasks towards !active CPUs...
 	 */
@@ -13133,9 +13147,7 @@ static int sched_balance_newidle(struct rq *this_rq, struct rq_flags *rf)
 	if (time_after(this_rq->next_balance, next_balance))
 		this_rq->next_balance = next_balance;
 
-	if (pulled_task)
-		this_rq->idle_stamp = 0;
-	else
+	if (!pulled_task)
 		nohz_newidle_balance(this_rq);
 
 	rq_repin_lock(this_rq, rf);

From 89c246cd50159ef95de89e63c87c827060519c67 Mon Sep 17 00:00:00 2001
From: Steve Sistare <steven.sistare@oracle.com>
Date: Fri, 20 Mar 2026 05:59:17 +0000
Subject: [PATCH 06/10] sched/fair: Generalize the detach_task interface

The detach_task function takes a struct lb_env argument, but only needs a
few of its members.  Add detach_task_steal(), a variant that takes the rq
and cpu arguments explicitly so it may be called from code that is not
based on lb_env.  detach_task() itself is unchanged.  No functional change.

Signed-off-by: Steve Sistare <steven.sistare@oracle.com>
Signed-off-by: Chen Jinghuang <chenjinghuang2@huawei.com>
Signed-off-by: WangYuli <wangyl5933@chinaunicom.cn>
---
 kernel/sched/fair.c | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 8fafb4cedf20..a83e65e3a41b 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -9719,6 +9719,17 @@ static void detach_task(struct task_struct *p, struct lb_env *env)
 	set_task_cpu(p, env->dst_cpu);
 }
 
+/*
+ * detach_task_steal() -- detach the task for the migration from @src_rq to @dst_cpu.
+ */
+static void detach_task_steal(struct task_struct *p, struct rq *src_rq, int dst_cpu)
+{
+	lockdep_assert_rq_held(src_rq);
+
+	deactivate_task(src_rq, p, DEQUEUE_NOCLOCK);
+	set_task_cpu(p, dst_cpu);
+}
+
 /*
  * detach_one_task() -- tries to dequeue exactly one task from env->src_rq, as
  * part of active balancing operations within "domain".

From cb0bdaa231204e37c628be0410bdfbd0cc13f8be Mon Sep 17 00:00:00 2001
From: Steve Sistare <steven.sistare@oracle.com>
Date: Fri, 20 Mar 2026 05:59:18 +0000
Subject: [PATCH 07/10] sched/fair: Provide can_migrate_task_llc

Define a simpler version of can_migrate_task called can_migrate_task_llc
which does not require a struct lb_env argument, and judges whether a
migration from one CPU to another within the same LLC should be allowed.

Signed-off-by: Steve Sistare <steven.sistare@oracle.com>
Signed-off-by: Chen Jinghuang <chenjinghuang2@huawei.com>
Signed-off-by: WangYuli <wangyl5933@chinaunicom.cn>
---
 kernel/sched/fair.c | 28 ++++++++++++++++++++++++++++
 1 file changed, 28 insertions(+)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index a83e65e3a41b..bdca53d7cc1b 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -9699,6 +9699,34 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
 	return 0;
 }
 
+/*
+ * Return true if task @p can migrate from @rq to @dst_rq in the same LLC.
+ * No need to test for co-locality, and no need to test task_hot(), as sharing
+ * LLC provides cache warmth at that level.
+ */
+static bool
+can_migrate_task_llc(struct task_struct *p, struct rq *rq, struct rq *dst_rq)
+{
+	int dst_cpu = dst_rq->cpu;
+
+	lockdep_assert_rq_held(rq);
+
+	if (!cpumask_test_cpu(dst_cpu, p->cpus_ptr)) {
+		schedstat_inc(p->stats.nr_failed_migrations_affine);
+		return false;
+	}
+
+	if (task_on_cpu(rq, p)) {
+		schedstat_inc(p->stats.nr_failed_migrations_running);
+		return false;
+	}
+
+	if (p->se.sched_delayed)
+		return false;
+
+	return true;
+}
+
 /*
  * detach_task() -- detach the task for the migration specified in env
  */

From 48df6e45f0272ec3435d289ae2175840b5cc824a Mon Sep 17 00:00:00 2001
From: Steve Sistare <steven.sistare@oracle.com>
Date: Fri, 20 Mar 2026 05:59:19 +0000
Subject: [PATCH 08/10] sched/fair: Steal work from an overloaded CPU when CPU
 goes idle

When a CPU has no more CFS tasks to run, and idle_balance() fails to find a
task, then attempt to steal a task from an overloaded CPU in the same LLC,
using the cfs_overload_cpus bitmap to efficiently identify candidates.  To
minimize search time, steal the first migratable task that is found when
the bitmap is traversed.  For fairness, search for migratable tasks on an
overloaded CPU in order of next to run.

This simple stealing yields a higher CPU utilization than idle_balance()
alone, because the search is cheap, so it may be called every time the CPU
is about to go idle.  idle_balance() does more work because it searches
widely for the busiest queue, so to limit its CPU consumption, it declines
to search if the system is too busy.  Simple stealing does not offload the
globally busiest queue, but it is much better than running nothing at all.

Stealing is controlled by the sched feature STEAL, which is enabled
by default. Note that all test results presented below are based on the
NO_DELAY_DEQUEUE implementation.

Stealing improves utilization with only a modest CPU overhead in scheduler
code.  In the following experiment, hackbench is run with varying numbers
of groups (40 tasks per group), and the delta in /proc/schedstat is shown
for each run, averaged per CPU, augmented with these non-standard stats:

  steal - number of times a task is stolen from another CPU.

X6-2: 2 socket * 40 cores * 2 hyperthreads = 160 CPUs
Intel(R) Xeon(R) Platinum 8380 CPU @ 2.30GHz
hackbench <grps> process 100000

  baseline
  grps  time   %busy  sched   idle    wake   steal
  1     2.182  20.00  35876   17905   17958  0
  2     2.391  39.00  67753   33808   33921  0
  3     2.871  47.00  100944  48966   51538  0
  4     2.928  62.00  114489  55171   59059  0
  8     4.852  83.00  219907  92961   121703 0

  new
  grps  time   %busy  sched   idle    wake   steal   %speedup
  1     2.229  18.00  45450   22691   22751  52      -2.1
  2     2.123  40.00  49975   24977   24990  6       12.6
  3     2.690  61.00  56118   22641   32780  9073    6.7
  4     2.828  80.00  37927   12828   24165  8442    3.5
  8     4.120  95.00  85929   8613    57858  11098   17.8

Elapsed time improves by up to 17.8%, and CPU busy utilization is up
by 1 to 18%, hitting 95% at peak load.

Signed-off-by: Steve Sistare <steven.sistare@oracle.com>
Signed-off-by: Chen Jinghuang <chenjinghuang2@huawei.com>
Signed-off-by: WangYuli <wangyl5933@chinaunicom.cn>
---
 kernel/sched/fair.c     | 174 ++++++++++++++++++++++++++++++++++++++--
 kernel/sched/features.h |   6 ++
 2 files changed, 174 insertions(+), 6 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index bdca53d7cc1b..c4786568d5d0 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5233,6 +5233,9 @@ static void overload_clear(struct rq *rq)
 {
 	struct sparsemask *overload_cpus;
 
+	if (!sched_feat(STEAL))
+		return;
+
 	rcu_read_lock();
 	overload_cpus = rcu_dereference(rq->cfs_overload_cpus);
 	if (overload_cpus)
@@ -5244,17 +5247,29 @@ static void overload_set(struct rq *rq)
 {
 	struct sparsemask *overload_cpus;
 
+	if (!sched_feat(STEAL))
+		return;
+
 	rcu_read_lock();
 	overload_cpus = rcu_dereference(rq->cfs_overload_cpus);
 	if (overload_cpus)
 		sparsemask_set_elem(overload_cpus, rq->cpu);
 	rcu_read_unlock();
 }
+
+static int try_steal(struct rq *this_rq, struct rq_flags *rf);
+
 #else /* CONFIG_SMP */
 static inline void rq_idle_stamp_update(struct rq *rq) {}
 static inline void rq_idle_stamp_clear(struct rq *rq) {}
 static inline void overload_clear(struct rq *rq) {}
 static inline void overload_set(struct rq *rq) {}
+
+static inline int try_steal(struct rq *this_rq, struct rq_flags *rf)
+{
+	return 0;
+}
+
 #endif
 
 void __setparam_fair(struct task_struct *p, const struct sched_attr *attr)
@@ -9128,21 +9143,24 @@ pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf
 idle:
 	if (rf) {
 		/*
-		 * We must set idle_stamp _before_ calling idle_balance(), such that we
-		 * measure the duration of idle_balance() as idle time.
+		 * We must set idle_stamp _before_ calling try_steal() or
+		 * sched_balance_newidle(), such that we measure the duration
+		 * as idle time.
 		 */
 		rq_idle_stamp_update(rq);
 
 		new_tasks = sched_balance_newidle(rq, rf);
+		if (new_tasks == 0)
+			new_tasks = try_steal(rq, rf);
 
 		if (new_tasks)
 			rq_idle_stamp_clear(rq);
 
 		/*
-		 * Because sched_balance_newidle() releases (and re-acquires)
-		 * rq->lock, it is possible for any higher priority task to
-		 * appear. In that case we must re-start the pick_next_entity()
-		 * loop.
+		 * Because try_steal() and sched_balance_newidle() release
+		 * (and re-acquire) rq->lock, it is possible for any higher priority
+		 * task to appear. In that case we must re-start the
+		 * pick_next_entity() loop.
 		 */
 		if (new_tasks < 0)
 			return RETRY_TASK;
@@ -13240,6 +13258,150 @@ void sched_balance_trigger(struct rq *rq)
 	nohz_balancer_kick(rq);
 }
 
+/*
+ * Scan the cfs_tasks list of @cfs_rq's rq in reverse (next-to-run order) for
+ * the first task that can be migrated to @dst_rq.  The rq lock must be held.
+ * On success, detach the task from its rq and return it, else return NULL.
+ */
+static struct task_struct *
+detach_next_task(struct cfs_rq *cfs_rq, struct rq *dst_rq)
+{
+	int dst_cpu = dst_rq->cpu;
+	struct task_struct *p;
+	struct rq *rq = rq_of(cfs_rq);
+
+	lockdep_assert_rq_held(rq);
+
+	list_for_each_entry_reverse(p, &rq->cfs_tasks, se.group_node) {
+		if (can_migrate_task_llc(p, rq, dst_rq)) {
+			detach_task_steal(p, rq, dst_cpu);
+			return p;
+		}
+	}
+	return NULL;
+}
+
+/*
+ * Attempt to migrate a CFS task from @src_cpu to @dst_rq.  @locked indicates
+ * whether @dst_rq is already locked on entry.  This function may lock or
+ * unlock @dst_rq, and updates @locked to indicate the locked state on return.
+ * The locking protocol is based on sched_balance_newidle().
+ * Returns 1 on success and 0 on failure.
+ */
+static int steal_from(struct rq *dst_rq, struct rq_flags *dst_rf, bool *locked,
+		      int src_cpu)
+{
+	struct task_struct *p;
+	struct rq_flags rf;
+	int stolen = 0;
+	int dst_cpu = dst_rq->cpu;
+	struct rq *src_rq = cpu_rq(src_cpu);
+
+	if (dst_cpu == src_cpu || src_rq->cfs.h_nr_runnable < 2)
+		return 0;
+
+	if (*locked) {
+		rq_unpin_lock(dst_rq, dst_rf);
+		raw_spin_rq_unlock(dst_rq);
+		*locked = false;
+	}
+	rq_lock_irqsave(src_rq, &rf);
+	update_rq_clock(src_rq);
+
+	if (src_rq->cfs.h_nr_runnable < 2 || !cpu_active(src_cpu))
+		p = NULL;
+	else
+		p = detach_next_task(&src_rq->cfs, dst_rq);
+
+	rq_unlock(src_rq, &rf);
+
+	if (p) {
+		raw_spin_rq_lock(dst_rq);
+		rq_repin_lock(dst_rq, dst_rf);
+		*locked = true;
+		update_rq_clock(dst_rq);
+		attach_task(dst_rq, p);
+		stolen = 1;
+	}
+	local_irq_restore(rf.flags);	/* pairs with rq_lock_irqsave() above */
+
+	return stolen;
+}
+
+/*
+ * Conservative upper bound on the max cost of a steal, in nsecs (the typical
+ * cost is 1-2 microsec).  try_steal() gives up if rq->avg_idle is less.
+ */
+#define SCHED_STEAL_COST 10000
+
+/*
+ * Try to steal a runnable CFS task from a CPU in the same LLC as @dst_rq and
+ * migrate it to @dst_rq.  rq_lock is held on entry and return, but may be
+ * dropped in between.  Return 1 if a task was stolen or @dst_rq has CFS tasks
+ * after the search, 0 if not, and -1 if a non-CFS task is runnable on @dst_rq.
+ */
+static int try_steal(struct rq *dst_rq, struct rq_flags *dst_rf)
+{
+	int src_cpu;
+	int dst_cpu = dst_rq->cpu;
+	bool locked = true;
+	int stolen = 0;
+	struct sparsemask *overload_cpus;
+
+	if (!sched_feat(STEAL))
+		return 0;
+
+	if (!cpu_active(dst_cpu))
+		return 0;
+
+	if (dst_rq->avg_idle < SCHED_STEAL_COST)
+		return 0;
+
+	/* Get bitmap of overloaded CPUs in the same LLC as @dst_rq */
+
+	rcu_read_lock();
+	overload_cpus = rcu_dereference(dst_rq->cfs_overload_cpus);
+	if (!overload_cpus) {
+		rcu_read_unlock();
+		return 0;
+	}
+
+#ifdef CONFIG_SCHED_SMT
+	/*
+	 * First try overloaded CPUs on the same core to preserve cache warmth.
+	 */
+	if (static_branch_likely(&sched_smt_present)) {
+		for_each_cpu(src_cpu, cpu_smt_mask(dst_cpu)) {
+			if (sparsemask_test_elem(overload_cpus, src_cpu) &&
+			    steal_from(dst_rq, dst_rf, &locked, src_cpu)) {
+				stolen = 1;
+				goto out;
+			}
+		}
+	}
+#endif	/* CONFIG_SCHED_SMT */
+
+	/* Accept any suitable task in the LLC */
+
+	sparsemask_for_each(overload_cpus, dst_cpu, src_cpu) {
+		if (steal_from(dst_rq, dst_rf, &locked, src_cpu)) {
+			stolen = 1;
+			goto out;
+		}
+	}
+
+out:
+	rcu_read_unlock();
+	if (!locked) {
+		raw_spin_rq_lock(dst_rq);
+		rq_repin_lock(dst_rq, dst_rf);
+	}
+	stolen |= (dst_rq->cfs.h_nr_runnable > 0);
+	if (dst_rq->nr_running != dst_rq->cfs.h_nr_runnable)
+		stolen = -1;
+	return stolen;
+}
+
 static void rq_online_fair(struct rq *rq)
 {
 	update_sysctl();
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index 136a6584be79..e8c3e19bf585 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -87,6 +87,12 @@ SCHED_FEAT(TTWU_QUEUE, true)
  */
 SCHED_FEAT(SIS_UTIL, true)
 
+/*
+ * Steal a CFS task from another CPU when going idle.
+ * Improves CPU utilization.
+ */
+SCHED_FEAT(STEAL, true)
+
 /*
  * Issue a WARN when we do multiple update_rq_clock() calls
  * in a single rq->lock section. Default disabled because the

From de4e397a4c00a9ad9abd11fb359b8f948c99a6d0 Mon Sep 17 00:00:00 2001
From: WangYuli <wangyl5933@chinaunicom.cn>
Date: Mon, 13 Apr 2026 19:56:47 +0800
Subject: [PATCH 09/10] deepin: sched/sparsemask: fix UBSAN
 array-index-out-of-bounds

Replace zero-length array chunks[0] with C99 flexible array member
chunks[] in struct sparsemask to fix UBSAN warning:

  UBSAN: array-index-out-of-bounds in kernel/sched/sparsemask.h:181:32
  index 0 is out of range for type 'struct sparsemask_chunk[0]'

The zero-length array is a deprecated GCC extension. Using a proper
flexible array member eliminates the UBSAN false positive while
maintaining the same runtime behavior.

Reported-by: kernel test robot <oliver.sang@intel.com>
Closes: https://lore.kernel.org/oe-lkp/202603242133.f66e336f-lkp@intel.com
Cc: Chen Jinghuang <chenjinghuang2@huawei.com>
Signed-off-by: WangYuli <wangyl5933@chinaunicom.cn>
---
 kernel/sched/sparsemask.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/sched/sparsemask.h b/kernel/sched/sparsemask.h
index 11948620a1a2..048443c7dae4 100644
--- a/kernel/sched/sparsemask.h
+++ b/kernel/sched/sparsemask.h
@@ -42,7 +42,7 @@ struct sparsemask_chunk {
 struct sparsemask {
 	short nelems;		/* current number of elements */
 	short density;		/* store 2^density elements per chunk */
-	struct sparsemask_chunk chunks[0];  /* embedded array of chunks */
+	struct sparsemask_chunk chunks[];  /* embedded array of chunks */
 };
 
 #define _SMASK_INDEX(density, elem)	((elem) >> (density))

From 5092be92940ba7fc099f0f68ca1f88fa3e48d0a3 Mon Sep 17 00:00:00 2001
From: WangYuli <wangyl5933@chinaunicom.cn>
Date: Mon, 13 Apr 2026 20:19:03 +0800
Subject: [PATCH 10/10] deepin: sched/fair: Default disable SCHED_FEAT(STEAL)
 for safety

Disable the STEAL scheduler feature by default to minimize the risk of
unexpected regressions on existing workloads. The steal-task mechanism
is still compiled in and can be dynamically enabled at runtime via:

  echo STEAL > /sys/kernel/debug/sched/features

Once sufficient testing evidence demonstrates that this feature is
universally beneficial with no adverse side effects, this commit can
simply be reverted to re-enable it by default.

Signed-off-by: WangYuli <wangyl5933@chinaunicom.cn>
---
 kernel/sched/features.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index e8c3e19bf585..a3fc8dd7c824 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -91,7 +91,7 @@ SCHED_FEAT(SIS_UTIL, true)
  * Steal a CFS task from another CPU when going idle.
  * Improves CPU utilization.
  */
-SCHED_FEAT(STEAL, true)
+SCHED_FEAT(STEAL, false)
 
 /*
  * Issue a WARN when we do multiple update_rq_clock() calls