diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h
index 45c0022b91ce..472c3dcf5a34 100644
--- a/include/linux/sched/topology.h
+++ b/include/linux/sched/topology.h
@@ -67,6 +67,7 @@ struct sched_domain_shared {
 	atomic_t	ref;
 	atomic_t	nr_busy_cpus;
 	int		has_idle_cores;
+	struct sparsemask *cfs_overload_cpus;
 	int		nr_idle_scan;
 };
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 241505fda915..c4786568d5d0 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -55,6 +55,7 @@
 #include
 #include "sched.h"
+#include "sparsemask.h"
 #include "stats.h"
 #include "autogroup.h"
@@ -5217,6 +5218,60 @@ static inline void update_misfit_status(struct task_struct *p, struct rq *rq)
 	rq->misfit_task_load = max_t(unsigned long, task_h_load(p), 1);
 }
 
+#ifdef CONFIG_SMP
+static inline void rq_idle_stamp_update(struct rq *rq)
+{
+	rq->idle_stamp = rq_clock(rq);
+}
+
+static inline void rq_idle_stamp_clear(struct rq *rq)
+{
+	rq->idle_stamp = 0;
+}
+
+static void overload_clear(struct rq *rq)
+{
+	struct sparsemask *overload_cpus;
+
+	if (!sched_feat(STEAL))
+		return;
+
+	rcu_read_lock();
+	overload_cpus = rcu_dereference(rq->cfs_overload_cpus);
+	if (overload_cpus)
+		sparsemask_clear_elem(overload_cpus, rq->cpu);
+	rcu_read_unlock();
+}
+
+static void overload_set(struct rq *rq)
+{
+	struct sparsemask *overload_cpus;
+
+	if (!sched_feat(STEAL))
+		return;
+
+	rcu_read_lock();
+	overload_cpus = rcu_dereference(rq->cfs_overload_cpus);
+	if (overload_cpus)
+		sparsemask_set_elem(overload_cpus, rq->cpu);
+	rcu_read_unlock();
+}
+
+static int try_steal(struct rq *this_rq, struct rq_flags *rf);
+
+#else /* CONFIG_SMP */
+static inline void rq_idle_stamp_update(struct rq *rq) {}
+static inline void rq_idle_stamp_clear(struct rq *rq) {}
+static inline void overload_clear(struct rq *rq) {}
+static inline void overload_set(struct rq *rq) {}
+
+static inline int try_steal(struct rq *this_rq, struct rq_flags *rf)
+{
+	return 0;
+}
+
+#endif
+
 void __setparam_fair(struct task_struct *p, const struct sched_attr *attr)
 {
 	struct sched_entity *se = &p->se;
@@ -6129,6 +6184,7 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq)
 	if (!dequeue)
 		return false;  /* Throttle no longer required. */
+	/* freeze hierarchy runnable averages while throttled */
 	rcu_read_lock();
 	walk_tg_tree_from(cfs_rq->tg, tg_throttle_down, tg_nop, (void *)rq);
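The overload_set()/overload_clear() helpers above publish or withdraw a CPU in the shared per-LLC bitmap under rcu_read_lock(). The enqueue_task_fair() and dequeue_entities() hunks that follow only call them when cfs.h_nr_runnable crosses the 1<->2 boundary, so the shared cacheline is written on overload transitions rather than on every enqueue or dequeue. A minimal userspace sketch of that edge test (illustrative only, not part of the patch; all names are invented):

#include <stdbool.h>
#include <stdio.h>

/* Model of the transition test used around overload_set()/overload_clear(). */
static bool overloaded;

static void update_overload(unsigned int prev_nr, unsigned int now_nr)
{
	if (prev_nr <= 1 && now_nr >= 2)
		overloaded = true;	/* stands in for overload_set(rq) */
	else if (prev_nr >= 2 && now_nr <= 1)
		overloaded = false;	/* stands in for overload_clear(rq) */
}

int main(void)
{
	unsigned int nr[] = { 1, 2, 3, 2, 1 };

	for (int i = 1; i < 5; i++) {
		update_overload(nr[i - 1], nr[i]);
		printf("%u -> %u: overloaded=%d\n", nr[i - 1], nr[i], overloaded);
	}
	return 0;
}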
@@ -7057,6 +7113,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 	int h_nr_idle = task_has_idle_policy(p);
 	int h_nr_runnable = 1;
 	int task_new = !(flags & ENQUEUE_WAKEUP);
+	unsigned int prev_nr = rq->cfs.h_nr_runnable;
 	int rq_h_nr_queued = rq->cfs.h_nr_queued;
 	u64 slice = 0;
@@ -7074,6 +7131,10 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 
 	if (flags & ENQUEUE_DELAYED) {
 		requeue_delayed_entity(se, flags);
+
+		if (prev_nr <= 1 && rq->cfs.h_nr_runnable >= 2)
+			overload_set(rq);
+
 		return;
 	}
@@ -7147,6 +7208,8 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 
 	/* At this point se is NULL and we are at root level*/
 	add_nr_running(rq, 1);
+	if (prev_nr <= 1 && rq->cfs.h_nr_runnable >= 2)
+		overload_set(rq);
 
 	/*
 	 * Since new tasks are assigned an initial util_avg equal to
@@ -7191,6 +7254,7 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags)
 	int h_nr_idle = 0;
 	int h_nr_queued = 0;
 	int h_nr_runnable = 0;
+	unsigned int prev_nr = rq->cfs.h_nr_runnable;
 	struct cfs_rq *cfs_rq;
 	u64 slice = 0;
@@ -7206,8 +7270,12 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags)
 		cfs_rq = cfs_rq_of(se);
 
 		if (!dequeue_entity(cfs_rq, se, flags)) {
-			if (p && &p->se == se)
+			if (p && &p->se == se) {
+				if (prev_nr >= 2 && rq->cfs.h_nr_runnable <= 1)
+					overload_clear(rq);
+
 				return -1;
+			}
 
 			slice = cfs_rq_min_slice(cfs_rq);
 			break;
@@ -7265,6 +7333,8 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags)
 	}
 
 	sub_nr_running(rq, h_nr_queued);
+	if (prev_nr >= 2 && rq->cfs.h_nr_runnable <= 1)
+		overload_clear(rq);
 
 	/* balance early to pull high priority tasks */
 	if (unlikely(!was_sched_idle && sched_idle_rq(rq)))
@@ -9072,13 +9142,25 @@ pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf
 idle:
 	if (rf) {
+		/*
+		 * We must set idle_stamp _before_ calling try_steal() or
+		 * sched_balance_newidle(), such that we measure the duration
+		 * as idle time.
+		 */
+		rq_idle_stamp_update(rq);
+
 		new_tasks = sched_balance_newidle(rq, rf);
+		if (new_tasks == 0)
+			new_tasks = try_steal(rq, rf);
+
+		if (new_tasks)
+			rq_idle_stamp_clear(rq);
 
 		/*
-		 * Because sched_balance_newidle() releases (and re-acquires)
-		 * rq->lock, it is possible for any higher priority task to
-		 * appear. In that case we must re-start the pick_next_entity()
-		 * loop.
+		 * Because try_steal() and sched_balance_newidle() release
+		 * (and re-acquire) rq->lock, it is possible for any higher priority
+		 * task to appear. In that case we must re-start the
+		 * pick_next_entity() loop.
 		 */
 		if (new_tasks < 0)
 			return RETRY_TASK;
@@ -9635,6 +9717,34 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
 	return 0;
 }
 
+/*
+ * Return true if task @p can migrate from @rq to @dst_rq in the same LLC.
+ * No need to test for co-locality, and no need to test task_hot(), as sharing
+ * LLC provides cache warmth at that level.
+ */
+static bool
+can_migrate_task_llc(struct task_struct *p, struct rq *rq, struct rq *dst_rq)
+{
+	int dst_cpu = dst_rq->cpu;
+
+	lockdep_assert_rq_held(rq);
+
+	if (!cpumask_test_cpu(dst_cpu, p->cpus_ptr)) {
+		schedstat_inc(p->stats.nr_failed_migrations_affine);
+		return false;
+	}
+
+	if (task_on_cpu(rq, p)) {
+		schedstat_inc(p->stats.nr_failed_migrations_running);
+		return false;
+	}
+
+	if (p->se.sched_delayed)
+		return false;
+
+	return true;
+}
+
 /*
  * detach_task() -- detach the task for the migration specified in env
  */
@@ -9655,6 +9765,17 @@ static void detach_task(struct task_struct *p, struct lb_env *env)
 	set_task_cpu(p, env->dst_cpu);
 }
 
+/*
+ * detach_task_steal() -- detach the task for the migration from @src_rq to @dst_cpu.
+ */
+static void detach_task_steal(struct task_struct *p, struct rq *src_rq, int dst_cpu)
+{
+	lockdep_assert_rq_held(src_rq);
+
+	deactivate_task(src_rq, p, DEQUEUE_NOCLOCK);
+	set_task_cpu(p, dst_cpu);
+}
+
 /*
  * detach_one_task() -- tries to dequeue exactly one task from env->src_rq, as
  * part of active balancing operations within "domain".
@@ -12974,13 +13095,6 @@ static int sched_balance_newidle(struct rq *this_rq, struct rq_flags *rf)
 	if (this_rq->ttwu_pending)
 		return 0;
 
-	/*
-	 * We must set idle_stamp _before_ calling sched_balance_rq()
-	 * for CPU_NEWLY_IDLE, such that we measure the this duration
-	 * as idle time.
-	 */
-	this_rq->idle_stamp = rq_clock(this_rq);
-
 	/*
 	 * Do not pull tasks towards !active CPUs...
 	 */
@@ -13090,9 +13204,7 @@ static int sched_balance_newidle(struct rq *this_rq, struct rq_flags *rf)
 	if (time_after(this_rq->next_balance, next_balance))
 		this_rq->next_balance = next_balance;
 
-	if (pulled_task)
-		this_rq->idle_stamp = 0;
-	else
+	if (!pulled_task)
 		nohz_newidle_balance(this_rq);
 
 	rq_repin_lock(this_rq, rf);
@@ -13146,6 +13258,150 @@ void sched_balance_trigger(struct rq *rq)
 		nohz_balancer_kick(rq);
 }
 
+/*
+ * Search the runnable tasks in @cfs_rq in order of next to run, and find
+ * the first one that can be migrated to @dst_rq. @cfs_rq is locked on entry.
+ * On success, dequeue the task from @cfs_rq and return it, else return NULL.
+ */
+static struct task_struct *
+detach_next_task(struct cfs_rq *cfs_rq, struct rq *dst_rq)
+{
+	int dst_cpu = dst_rq->cpu;
+	struct task_struct *p;
+	struct rq *rq = rq_of(cfs_rq);
+
+	lockdep_assert_rq_held(rq);
+
+	list_for_each_entry_reverse(p, &rq->cfs_tasks, se.group_node) {
+		if (can_migrate_task_llc(p, rq, dst_rq)) {
+			detach_task_steal(p, rq, dst_cpu);
+			return p;
+		}
+	}
+	return NULL;
+}
+
+/*
+ * Attempt to migrate a CFS task from @src_cpu to @dst_rq. @locked indicates
+ * whether @dst_rq is already locked on entry. This function may lock or
+ * unlock @dst_rq, and updates @locked to indicate the locked state on return.
+ * The locking protocol is based on sched_balance_newidle().
+ * Returns 1 on success and 0 on failure.
+ */
+static int steal_from(struct rq *dst_rq, struct rq_flags *dst_rf, bool *locked,
+		      int src_cpu)
+{
+	struct task_struct *p;
+	struct rq_flags rf;
+	int stolen = 0;
+	int dst_cpu = dst_rq->cpu;
+	struct rq *src_rq = cpu_rq(src_cpu);
+
+	if (dst_cpu == src_cpu || src_rq->cfs.h_nr_runnable < 2)
+		return 0;
+
+	if (*locked) {
+		rq_unpin_lock(dst_rq, dst_rf);
+		raw_spin_rq_unlock(dst_rq);
+		*locked = false;
+	}
+	rq_lock_irqsave(src_rq, &rf);
+	update_rq_clock(src_rq);
+
+	if (src_rq->cfs.h_nr_runnable < 2 || !cpu_active(src_cpu))
+		p = NULL;
+	else
+		p = detach_next_task(&src_rq->cfs, dst_rq);
+
+	rq_unlock(src_rq, &rf);
+
+	if (p) {
+		raw_spin_rq_lock(dst_rq);
+		rq_repin_lock(dst_rq, dst_rf);
+		*locked = true;
+		update_rq_clock(dst_rq);
+		attach_task(dst_rq, p);
+		stolen = 1;
+	}
+	local_irq_restore(rf.flags);
+
+	return stolen;
+}
+
+/*
+ * Conservative upper bound on the max cost of a steal, in nsecs (the typical
+ * cost is 1-2 microsec). Do not steal if average idle time is less.
+ */
+#define SCHED_STEAL_COST 10000
+
+/*
+ * Try to steal a runnable CFS task from a CPU in the same LLC as @dst_rq,
+ * and migrate it to @dst_rq. rq_lock is held on entry and return, but
+ * may be dropped in between. Return 1 on success, 0 on failure, and -1
+ * if a task in a different scheduling class has become runnable on @dst_rq.
+ */
+static int try_steal(struct rq *dst_rq, struct rq_flags *dst_rf)
+{
+	int src_cpu;
+	int dst_cpu = dst_rq->cpu;
+	bool locked = true;
+	int stolen = 0;
+	struct sparsemask *overload_cpus;
+
+	if (!sched_feat(STEAL))
+		return 0;
+
+	if (!cpu_active(dst_cpu))
+		return 0;
+
+	if (dst_rq->avg_idle < SCHED_STEAL_COST)
+		return 0;
+
+	/* Get bitmap of overloaded CPUs in the same LLC as @dst_rq */
+
+	rcu_read_lock();
+	overload_cpus = rcu_dereference(dst_rq->cfs_overload_cpus);
+	if (!overload_cpus) {
+		rcu_read_unlock();
+		return 0;
+	}
+
+#ifdef CONFIG_SCHED_SMT
+	/*
+	 * First try overloaded CPUs on the same core to preserve cache warmth.
+	 */
+	if (static_branch_likely(&sched_smt_present)) {
+		for_each_cpu(src_cpu, cpu_smt_mask(dst_cpu)) {
+			if (sparsemask_test_elem(overload_cpus, src_cpu) &&
+			    steal_from(dst_rq, dst_rf, &locked, src_cpu)) {
+				stolen = 1;
+				goto out;
+			}
+		}
+	}
+#endif	/* CONFIG_SCHED_SMT */
+
+	/* Accept any suitable task in the LLC */
+
+	sparsemask_for_each(overload_cpus, dst_cpu, src_cpu) {
+		if (steal_from(dst_rq, dst_rf, &locked, src_cpu)) {
+			stolen = 1;
+			goto out;
+		}
+	}
+
+out:
+	rcu_read_unlock();
+	if (!locked) {
+		raw_spin_rq_lock(dst_rq);
+		rq_repin_lock(dst_rq, dst_rf);
+	}
+	stolen |= (dst_rq->cfs.h_nr_runnable > 0);
+	if (dst_rq->nr_running != dst_rq->cfs.h_nr_runnable)
+		stolen = -1;
+	return stolen;
+}
+
 static void rq_online_fair(struct rq *rq)
 {
 	update_sysctl();
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index 136a6584be79..a3fc8dd7c824 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -87,6 +87,12 @@ SCHED_FEAT(TTWU_QUEUE, true)
  */
 SCHED_FEAT(SIS_UTIL, true)
 
+/*
+ * Steal a CFS task from another CPU when going idle.
+ * Improves CPU utilization.
+ */
+SCHED_FEAT(STEAL, false)
+
 /*
  * Issue a WARN when we do multiple update_rq_clock() calls
  * in a single rq->lock section. Default disabled because the
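The steal_from()/try_steal() pair added to fair.c above never holds two rq locks at once: steal_from() drops the destination lock, detaches a task while holding only the source lock, then re-takes the destination lock to attach. A compilable userspace analogue of that handoff using pthread mutexes (a sketch only; all names are invented and error handling is omitted):

#include <pthread.h>
#include <stdio.h>

struct toy_rq {
	pthread_mutex_t lock;	/* stands in for rq->lock */
	int nr_tasks;
};

/* dst->lock is held on entry and on return, but dropped in between. */
static int toy_steal(struct toy_rq *dst, struct toy_rq *src)
{
	int stolen = 0;

	pthread_mutex_unlock(&dst->lock);	/* never hold both locks */

	pthread_mutex_lock(&src->lock);
	if (src->nr_tasks >= 2) {		/* re-check under src->lock */
		src->nr_tasks--;		/* models detach_next_task() */
		stolen = 1;
	}
	pthread_mutex_unlock(&src->lock);

	pthread_mutex_lock(&dst->lock);		/* re-take dst before attaching */
	if (stolen)
		dst->nr_tasks++;		/* models attach_task() */
	return stolen;
}

int main(void)
{
	struct toy_rq dst = { PTHREAD_MUTEX_INITIALIZER, 0 };
	struct toy_rq src = { PTHREAD_MUTEX_INITIALIZER, 3 };

	pthread_mutex_lock(&dst.lock);
	printf("stolen=%d dst=%d src=%d\n",
	       toy_steal(&dst, &src), dst.nr_tasks, src.nr_tasks);
	pthread_mutex_unlock(&dst.lock);
	return 0;
}

The same ordering is why try_steal() tracks *locked and re-acquires dst_rq's lock before returning, and why SCHED_STEAL_COST (10 usec) gates the whole attempt on rqs whose average idle period is long enough to pay for it.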
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index e87064bc0c32..1d507d3da2d3 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -82,6 +82,7 @@ struct cfs_rq;
 struct rt_rq;
 struct sched_group;
 struct cpuidle_state;
+struct sparsemask;
 
 #ifdef CONFIG_PARAVIRT
 # include
@@ -1172,6 +1173,7 @@ struct rq {
 	struct cfs_rq cfs;
 	struct rt_rq rt;
 	struct dl_rq dl;
+	struct sparsemask *cfs_overload_cpus;
 #ifdef CONFIG_SCHED_CLASS_EXT
 	struct scx_rq scx;
 #endif
diff --git a/kernel/sched/sparsemask.h b/kernel/sched/sparsemask.h
new file mode 100644
index 000000000000..048443c7dae4
--- /dev/null
+++ b/kernel/sched/sparsemask.h
@@ -0,0 +1,210 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * sparsemask.h - sparse bitmap operations
+ *
+ * Copyright (c) 2018 Oracle Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+
+#ifndef __LINUX_SPARSEMASK_H
+#define __LINUX_SPARSEMASK_H
+
+#include
+#include
+#include
+
+/*
+ * A sparsemask is a sparse bitmap. It reduces cache contention vs the usual
+ * bitmap when many threads concurrently set, clear, and visit elements. For
+ * each cacheline chunk of the mask, only the first K bits of the first word are
+ * used, and the remaining bits are ignored, where K is a creation time
+ * parameter. Thus a sparsemask that can represent a set of N elements is
+ * approximately (N/K * CACHELINE) bytes in size.
+ *
+ * Clients pass and receive element numbers in the public API, and the
+ * implementation translates them to bit numbers to perform the bitmap
+ * operations.
+ */
+
+struct sparsemask_chunk {
+	unsigned long word;	/* the significant bits */
+} ____cacheline_aligned_in_smp;
+
+struct sparsemask {
+	short nelems;		/* maximum number of elements */
+	short density;		/* store 2^density elements per chunk */
+	struct sparsemask_chunk chunks[];	/* embedded array of chunks */
+};
+
+#define _SMASK_INDEX(density, elem)	((elem) >> (density))
+#define _SMASK_BIT(density, elem)	((elem) & ((1U << (density)) - 1U))
+#define SMASK_INDEX(mask, elem)		_SMASK_INDEX((mask)->density, elem)
+#define SMASK_BIT(mask, elem)		_SMASK_BIT((mask)->density, elem)
+#define SMASK_WORD(mask, elem)		\
+	(&(mask)->chunks[SMASK_INDEX((mask), (elem))].word)
+
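The overview comment above claims a sparsemask of N elements occupies roughly N/K * CACHELINE bytes, and the _SMASK_* macros split an element number into a chunk index and a bit offset. Both are easy to check with a few lines of plain C (a sketch assuming 64-byte cachelines; the toy_* helpers below are invented, not part of the API):

#include <stddef.h>
#include <stdio.h>

#define TOY_CACHELINE	64	/* assumed cacheline size */

/* Userspace mirrors of _SMASK_INDEX() and _SMASK_BIT(). */
static int toy_index(int density, int elem) { return elem >> density; }
static int toy_bit(int density, int elem)   { return elem & ((1 << density) - 1); }

/* Rough equivalent of sparsemask_size(): header plus one chunk per index. */
static size_t toy_size(int nelems, int density)
{
	return (size_t)(toy_index(density, nelems) + 2) * TOY_CACHELINE;
}

int main(void)
{
	int density = 3;	/* 2^3 = 8 elements per chunk, as sd_llc_alloc() uses later */

	/* CPU 13 lands in chunk 1, bit 5: 13 >> 3 == 1, 13 & 7 == 5 */
	printf("cpu 13 -> chunk %d, bit %d\n",
	       toy_index(density, 13), toy_bit(density, 13));

	/* 256 possible CPUs at density 3: about 34 cachelines, ~2 KB */
	printf("256 cpus -> ~%zu bytes\n", toy_size(256, density));
	return 0;
}

At density 3 only 8 bits of each 64-byte chunk are significant; the waste is deliberate, since it keeps CPUs that live in different chunks from bouncing the same cacheline.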
+/*
+ * sparsemask_next() - Return the next one bit in a bitmap, starting at a
+ * specified position and wrapping from the last bit to the first, up to but
+ * not including a specified origin. This is a helper, so do not call it
+ * directly.
+ *
+ * @mask: Bitmap to search.
+ * @origin: Origin.
+ * @prev: Previous bit. Start search after this bit number.
+ *        If -1, start search at @origin.
+ *
+ * Return: the bit number, else mask->nelems if no bits are set in the range.
+ */
+static inline int
+sparsemask_next(const struct sparsemask *mask, int origin, int prev)
+{
+	int density = mask->density;
+	int bits_per_word = 1U << density;
+	const struct sparsemask_chunk *chunk;
+	int nelems = mask->nelems;
+	int next, bit, nbits;
+	unsigned long word;
+
+	/* Calculate number of bits to be searched. */
+	if (prev == -1) {
+		nbits = nelems;
+		next = origin;
+	} else if (prev < origin) {
+		nbits = origin - prev;
+		next = prev + 1;
+	} else {
+		nbits = nelems - prev + origin - 1;
+		next = prev + 1;
+	}
+
+	if (unlikely(next >= nelems))
+		return nelems;
+
+	/*
+	 * Fetch and adjust first word. Clear word bits below @next, and round
+	 * @next down to @bits_per_word boundary because later ffs will add
+	 * those bits back.
+	 */
+	chunk = &mask->chunks[_SMASK_INDEX(density, next)];
+	bit = _SMASK_BIT(density, next);
+	word = chunk->word & (~0UL << bit);
+	next -= bit;
+	nbits += bit;
+
+	while (!word) {
+		next += bits_per_word;
+		nbits -= bits_per_word;
+		if (nbits <= 0)
+			return nelems;
+
+		if (next >= nelems) {
+			chunk = mask->chunks;
+			nbits -= (next - nelems);
+			next = 0;
+		} else {
+			chunk++;
+		}
+		word = chunk->word;
+	}
+
+	next += __ffs(word);
+	if (next >= origin && prev != -1)
+		return nelems;
+	return next;
+}
+
+/****************** The public API ********************/
+
+/*
+ * Max value for the density parameter, limited by 64 bits in the chunk word.
+ */
+#define SMASK_DENSITY_MAX	6
+
+/*
+ * Return bytes to allocate for a sparsemask, for custom allocators.
+ */
+static inline size_t sparsemask_size(int nelems, int density)
+{
+	int index = _SMASK_INDEX(density, nelems) + 1;
+
+	return offsetof(struct sparsemask, chunks[index]);
+}
+
+/*
+ * Initialize an allocated sparsemask, for custom allocators.
+ */
+static inline void
+sparsemask_init(struct sparsemask *mask, int nelems, int density)
+{
+	WARN_ON(density < 0 || density > SMASK_DENSITY_MAX || nelems < 0);
+	mask->nelems = nelems;
+	mask->density = density;
+}
+
+/*
+ * sparsemask_alloc_node() - Allocate, initialize, and return a sparsemask.
+ *
+ * @nelems - maximum number of elements.
+ * @density - store 2^density elements per cacheline chunk.
+ *            values from 0 to SMASK_DENSITY_MAX inclusive.
+ * @flags - kmalloc allocation flags
+ * @node - numa node
+ */
+static inline struct sparsemask *
+sparsemask_alloc_node(int nelems, int density, gfp_t flags, int node)
+{
+	int nbytes = sparsemask_size(nelems, density);
+	struct sparsemask *mask = kmalloc_node(nbytes, flags, node);
+
+	if (mask)
+		sparsemask_init(mask, nelems, density);
+	return mask;
+}
+
+static inline void sparsemask_free(struct sparsemask *mask)
+{
+	kfree(mask);
+}
+
+static inline void sparsemask_set_elem(struct sparsemask *dst, int elem)
+{
+	set_bit(SMASK_BIT(dst, elem), SMASK_WORD(dst, elem));
+}
+
+static inline void sparsemask_clear_elem(struct sparsemask *dst, int elem)
+{
+	clear_bit(SMASK_BIT(dst, elem), SMASK_WORD(dst, elem));
+}
+
+static inline int sparsemask_test_elem(const struct sparsemask *mask, int elem)
+{
+	return test_bit(SMASK_BIT(mask, elem), SMASK_WORD(mask, elem));
+}
+
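sparsemask_for_each(), defined just below, starts at @origin and wraps past the last element, so a stealing CPU passes its own number as the origin (see try_steal() above) and probes from different CPUs spread out instead of all starting at element 0. The visit order can be modelled with a plain array (illustrative only, not part of the patch):

#include <stdbool.h>
#include <stdio.h>

#define NELEMS 8

int main(void)
{
	/* model of a sparsemask with elements 1, 4 and 6 set */
	bool set[NELEMS] = { [1] = true, [4] = true, [6] = true };
	int origin = 5;

	/* same visit order as sparsemask_for_each(mask, 5, elem) */
	for (int i = 0; i < NELEMS; i++) {
		int elem = (origin + i) % NELEMS;

		if (set[elem])
			printf("visit %d\n", elem);	/* prints 6, then 1, then 4 */
	}
	return 0;
}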
+/*
+ * sparsemask_for_each() - iterate over each set bit in a bitmap, starting at a
+ * specified position, and wrapping from the last bit to the first.
+ *
+ * @mask: Bitmap to iterate over.
+ * @origin: Bit number at which to start searching.
+ * @elem: Iterator. Can be signed or unsigned integer.
+ *
+ * The implementation does not assume any bit in @mask is set, including
+ * @origin. After the loop, @elem = @mask->nelems.
+ */
+#define sparsemask_for_each(mask, origin, elem)				\
+	for ((elem) = -1;						\
+	     (elem) = sparsemask_next((mask), (origin), (elem)),	\
+	     (elem) < (mask)->nelems;)
+
+#endif /* __LINUX_SPARSEMASK_H */
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index 48f671b07af7..f0cfa782cbb1 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -6,6 +6,7 @@
 #include
 #include
 #include "sched.h"
+#include "sparsemask.h"
 
 DEFINE_MUTEX(sched_domains_mutex);
 void sched_domains_mutex_lock(void)
@@ -21,6 +22,12 @@ void sched_domains_mutex_unlock(void)
 static cpumask_var_t sched_domains_tmpmask;
 static cpumask_var_t sched_domains_tmpmask2;
 
+struct s_data;
+static int sd_llc_alloc(struct sched_domain *sd);
+static void sd_llc_free(struct sched_domain *sd);
+static int sd_llc_alloc_all(const struct cpumask *cpu_map, struct s_data *d);
+static void sd_llc_free_all(const struct cpumask *cpu_map);
+
 static int __init sched_debug_setup(char *str)
 {
 	sched_debug_verbose = true;
@@ -625,8 +632,10 @@ static void destroy_sched_domain(struct sched_domain *sd)
 	 */
 	free_sched_groups(sd->groups, 1);
 
-	if (sd->shared && atomic_dec_and_test(&sd->shared->ref))
+	if (sd->shared && atomic_dec_and_test(&sd->shared->ref)) {
+		sd_llc_free(sd);
 		kfree(sd->shared);
+	}
 	kfree(sd);
 }
 
@@ -670,7 +679,9 @@ DEFINE_STATIC_KEY_FALSE(sched_cluster_active);
 
 static void update_top_cache_domain(int cpu)
 {
+	struct sparsemask *cfs_overload_cpus = NULL;
 	struct sched_domain_shared *sds = NULL;
+	struct rq *rq = cpu_rq(cpu);
 	struct sched_domain *sd;
 	int id = cpu;
 	int size = 1;
@@ -680,8 +691,10 @@ static void update_top_cache_domain(int cpu)
 		id = cpumask_first(sched_domain_span(sd));
 		size = cpumask_weight(sched_domain_span(sd));
 		sds = sd->shared;
+		cfs_overload_cpus = sds->cfs_overload_cpus;
 	}
 
+	rcu_assign_pointer(rq->cfs_overload_cpus, cfs_overload_cpus);
 	rcu_assign_pointer(per_cpu(sd_llc, cpu), sd);
 	per_cpu(sd_llc_size, cpu) = size;
 	per_cpu(sd_llc_id, cpu) = id;
@@ -1541,6 +1554,7 @@ static void __free_domain_allocs(struct s_data *d, enum s_alloc what,
 		free_percpu(d->sd);
 		fallthrough;
 	case sa_sd_storage:
+		sd_llc_free_all(cpu_map);
 		__sdt_free(cpu_map);
 		fallthrough;
 	case sa_none:
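update_top_cache_domain() above publishes the per-LLC bitmap with rcu_assign_pointer(), and the readers added in fair.c (overload_set(), overload_clear(), try_steal()) fetch it with rcu_dereference() inside an RCU read-side section, so a NULL pointer simply means "no bitmap yet". At minimum that is a release-store/acquire-load pairing; the userspace sketch below models only that ordering with C11 atomics (the grace-period half of RCU is not modelled, and all names are invented):

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

struct toy_mask { unsigned long word; };

/* stands in for rq->cfs_overload_cpus */
static _Atomic(struct toy_mask *) overload_cpus;

static void *reader(void *arg)
{
	/* like overload_set(): the pointer may legitimately still be NULL */
	struct toy_mask *m = atomic_load_explicit(&overload_cpus,
						  memory_order_acquire);

	if (m)
		m->word |= 1UL << 3;	/* the kernel uses atomic set_bit() here */
	return NULL;
}

int main(void)
{
	struct toy_mask *m = calloc(1, sizeof(*m));
	pthread_t t;

	if (!m)
		return 1;

	/* like update_top_cache_domain(): publish a fully initialized mask */
	atomic_store_explicit(&overload_cpus, m, memory_order_release);

	pthread_create(&t, NULL, reader, NULL);
	pthread_join(t, NULL);
	printf("word=%#lx\n", m->word);
	free(m);
	return 0;
}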
@@ -2394,6 +2408,78 @@ static void __sdt_free(const struct cpumask *cpu_map)
 	}
 }
 
+static int sd_llc_alloc(struct sched_domain *sd)
+{
+	struct sched_domain_shared *sds = sd->shared;
+	struct cpumask *span = sched_domain_span(sd);
+	int nid = cpu_to_node(cpumask_first(span));
+	int flags = __GFP_ZERO | GFP_KERNEL;
+	struct sparsemask *mask;
+
+	/*
+	 * Allocate the bitmap if not already allocated. This is called for
+	 * every CPU in the LLC but only allocates once per sd_llc_shared.
+	 */
+	if (!sds->cfs_overload_cpus) {
+		mask = sparsemask_alloc_node(nr_cpu_ids, 3, flags, nid);
+		if (!mask)
+			return 1;
+		sds->cfs_overload_cpus = mask;
+	}
+
+	return 0;
+}
+
+static void sd_llc_free(struct sched_domain *sd)
+{
+	struct sched_domain_shared *sds = sd->shared;
+
+	if (!sds)
+		return;
+
+	sparsemask_free(sds->cfs_overload_cpus);
+	sds->cfs_overload_cpus = NULL;
+}
+
+static int sd_llc_alloc_all(const struct cpumask *cpu_map, struct s_data *d)
+{
+	struct sched_domain *sd, *hsd;
+	int i;
+
+	for_each_cpu(i, cpu_map) {
+		/* Find highest domain that shares resources */
+		hsd = NULL;
+		for (sd = *per_cpu_ptr(d->sd, i); sd; sd = sd->parent) {
+			if (!(sd->flags & SD_SHARE_LLC))
+				break;
+			hsd = sd;
+		}
+		if (hsd && sd_llc_alloc(hsd))
+			return 1;
+	}
+
+	return 0;
+}
+
+static void sd_llc_free_all(const struct cpumask *cpu_map)
+{
+	struct sched_domain_topology_level *tl;
+	struct sched_domain *sd;
+	struct sd_data *sdd;
+	int j;
+
+	for_each_sd_topology(tl) {
+		sdd = &tl->data;
+		if (!sdd || !sdd->sd)
+			continue;
+		for_each_cpu(j, cpu_map) {
+			sd = *per_cpu_ptr(sdd->sd, j);
+			if (sd)
+				sd_llc_free(sd);
+		}
+	}
+}
+
 static struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl,
 		const struct cpumask *cpu_map, struct sched_domain_attr *attr,
 		struct sched_domain *child, int cpu)
@@ -2616,6 +2702,14 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
 		}
 	}
 
+	/*
+	 * Allocate shared sd data at last level cache. Must be done after
+	 * domains are built above, but before the data is used in
+	 * cpu_attach_domain and descendants below.
+	 */
+	if (sd_llc_alloc_all(cpu_map, &d))
+		goto error;
+
 	/* Attach the domains */
 	rcu_read_lock();
 	for_each_cpu(i, cpu_map) {