diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h
index 45c0022b91ce..472c3dcf5a34 100644
--- a/include/linux/sched/topology.h
+++ b/include/linux/sched/topology.h
@@ -67,6 +67,7 @@ struct sched_domain_shared {
 	atomic_t	ref;
 	atomic_t	nr_busy_cpus;
 	int		has_idle_cores;
+	struct sparsemask *cfs_overload_cpus;
 	int		nr_idle_scan;
 };
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 241505fda915..c4786568d5d0 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -55,6 +55,7 @@
 #include
 #include "sched.h"
+#include "sparsemask.h"
 #include "stats.h"
 #include "autogroup.h"
@@ -5217,6 +5218,60 @@ static inline void update_misfit_status(struct task_struct *p, struct rq *rq)
 	rq->misfit_task_load = max_t(unsigned long, task_h_load(p), 1);
 }
 
+#ifdef CONFIG_SMP
+static inline void rq_idle_stamp_update(struct rq *rq)
+{
+	rq->idle_stamp = rq_clock(rq);
+}
+
+static inline void rq_idle_stamp_clear(struct rq *rq)
+{
+	rq->idle_stamp = 0;
+}
+
+static void overload_clear(struct rq *rq)
+{
+	struct sparsemask *overload_cpus;
+
+	if (!sched_feat(STEAL))
+		return;
+
+	rcu_read_lock();
+	overload_cpus = rcu_dereference(rq->cfs_overload_cpus);
+	if (overload_cpus)
+		sparsemask_clear_elem(overload_cpus, rq->cpu);
+	rcu_read_unlock();
+}
+
+static void overload_set(struct rq *rq)
+{
+	struct sparsemask *overload_cpus;
+
+	if (!sched_feat(STEAL))
+		return;
+
+	rcu_read_lock();
+	overload_cpus = rcu_dereference(rq->cfs_overload_cpus);
+	if (overload_cpus)
+		sparsemask_set_elem(overload_cpus, rq->cpu);
+	rcu_read_unlock();
+}
+
+static int try_steal(struct rq *this_rq, struct rq_flags *rf);
+
+#else /* CONFIG_SMP */
+static inline void rq_idle_stamp_update(struct rq *rq) {}
+static inline void rq_idle_stamp_clear(struct rq *rq) {}
+static inline void overload_clear(struct rq *rq) {}
+static inline void overload_set(struct rq *rq) {}
+
+static inline int try_steal(struct rq *this_rq, struct rq_flags *rf)
+{
+	return 0;
+}
+
+#endif
+
 void __setparam_fair(struct task_struct *p, const struct sched_attr *attr)
 {
 	struct sched_entity *se = &p->se;
@@ -6129,6 +6184,7 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq)
 	if (!dequeue)
 		return false;  /* Throttle no longer required. */
+	/* freeze hierarchy runnable averages while throttled */
 	rcu_read_lock();
 	walk_tg_tree_from(cfs_rq->tg, tg_throttle_down, tg_nop, (void *)rq);
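The overload_set()/overload_clear() helpers above publish or withdraw a CPU in the shared per-LLC bitmap under rcu_read_lock(). The enqueue_task_fair() and dequeue_entities() hunks that follow only call them when cfs.h_nr_runnable crosses the 1<->2 boundary, so the shared cacheline is written on overload transitions rather than on every enqueue or dequeue. A minimal userspace sketch of that edge test (illustrative only, not part of the patch; all names are invented):

#include <stdbool.h>
#include <stdio.h>

/* Model of the transition test used around overload_set()/overload_clear(). */
static bool overloaded;

static void update_overload(unsigned int prev_nr, unsigned int now_nr)
{
	if (prev_nr <= 1 && now_nr >= 2)
		overloaded = true;	/* stands in for overload_set(rq) */
	else if (prev_nr >= 2 && now_nr <= 1)
		overloaded = false;	/* stands in for overload_clear(rq) */
}

int main(void)
{
	unsigned int nr[] = { 1, 2, 3, 2, 1 };

	for (int i = 1; i < 5; i++) {
		update_overload(nr[i - 1], nr[i]);
		printf("%u -> %u: overloaded=%d\n", nr[i - 1], nr[i], overloaded);
	}
	return 0;
}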
@@ -7057,6 +7113,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 	int h_nr_idle = task_has_idle_policy(p);
 	int h_nr_runnable = 1;
 	int task_new = !(flags & ENQUEUE_WAKEUP);
+	unsigned int prev_nr = rq->cfs.h_nr_runnable;
 	int rq_h_nr_queued = rq->cfs.h_nr_queued;
 	u64 slice = 0;
@@ -7074,6 +7131,10 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 
 	if (flags & ENQUEUE_DELAYED) {
 		requeue_delayed_entity(se, flags);
+
+		if (prev_nr <= 1 && rq->cfs.h_nr_runnable >= 2)
+			overload_set(rq);
+
 		return;
 	}
@@ -7147,6 +7208,8 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 
 	/* At this point se is NULL and we are at root level*/
 	add_nr_running(rq, 1);
+	if (prev_nr <= 1 && rq->cfs.h_nr_runnable >= 2)
+		overload_set(rq);
 
 	/*
 	 * Since new tasks are assigned an initial util_avg equal to
@@ -7191,6 +7254,7 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags)
 	int h_nr_idle = 0;
 	int h_nr_queued = 0;
 	int h_nr_runnable = 0;
+	unsigned int prev_nr = rq->cfs.h_nr_runnable;
 	struct cfs_rq *cfs_rq;
 	u64 slice = 0;
@@ -7206,8 +7270,12 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags)
 		cfs_rq = cfs_rq_of(se);
 
 		if (!dequeue_entity(cfs_rq, se, flags)) {
-			if (p && &p->se == se)
+			if (p && &p->se == se) {
+				if (prev_nr >= 2 && rq->cfs.h_nr_runnable <= 1)
+					overload_clear(rq);
+
 				return -1;
+			}
 
 			slice = cfs_rq_min_slice(cfs_rq);
 			break;
@@ -7265,6 +7333,8 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags)
 	}
 
 	sub_nr_running(rq, h_nr_queued);
+	if (prev_nr >= 2 && rq->cfs.h_nr_runnable <= 1)
+		overload_clear(rq);
 
 	/* balance early to pull high priority tasks */
 	if (unlikely(!was_sched_idle && sched_idle_rq(rq)))
@@ -9072,13 +9142,25 @@ pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf
 idle:
 	if (rf) {
+		/*
+		 * We must set idle_stamp _before_ calling try_steal() or
+		 * sched_balance_newidle(), such that we measure the duration
+		 * as idle time.
+		 */
+		rq_idle_stamp_update(rq);
+
 		new_tasks = sched_balance_newidle(rq, rf);
+		if (new_tasks == 0)
+			new_tasks = try_steal(rq, rf);
+
+		if (new_tasks)
+			rq_idle_stamp_clear(rq);
 
 		/*
-		 * Because sched_balance_newidle() releases (and re-acquires)
-		 * rq->lock, it is possible for any higher priority task to
-		 * appear. In that case we must re-start the pick_next_entity()
-		 * loop.
+		 * Because try_steal() and sched_balance_newidle() release
+		 * (and re-acquire) rq->lock, it is possible for any higher priority
+		 * task to appear. In that case we must re-start the
+		 * pick_next_entity() loop.
 		 */
 		if (new_tasks < 0)
 			return RETRY_TASK;
@@ -9635,6 +9717,34 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
 	return 0;
 }
 
+/*
+ * Return true if task @p can migrate from @rq to @dst_rq in the same LLC.
+ * No need to test for co-locality, and no need to test task_hot(), as sharing
+ * LLC provides cache warmth at that level.
+ */
+static bool
+can_migrate_task_llc(struct task_struct *p, struct rq *rq, struct rq *dst_rq)
+{
+	int dst_cpu = dst_rq->cpu;
+
+	lockdep_assert_rq_held(rq);
+
+	if (!cpumask_test_cpu(dst_cpu, p->cpus_ptr)) {
+		schedstat_inc(p->stats.nr_failed_migrations_affine);
+		return false;
+	}
+
+	if (task_on_cpu(rq, p)) {
+		schedstat_inc(p->stats.nr_failed_migrations_running);
+		return false;
+	}
+
+	if (p->se.sched_delayed)
+		return false;
+
+	return true;
+}
+
 /*
  * detach_task() -- detach the task for the migration specified in env
  */
@@ -9655,6 +9765,17 @@ static void detach_task(struct task_struct *p, struct lb_env *env)
 	set_task_cpu(p, env->dst_cpu);
 }
 
+/*
+ * detach_task_steal() -- detach the task for the migration from @src_rq to @dst_cpu.
+ */
+static void detach_task_steal(struct task_struct *p, struct rq *src_rq, int dst_cpu)
+{
+	lockdep_assert_rq_held(src_rq);
+
+	deactivate_task(src_rq, p, DEQUEUE_NOCLOCK);
+	set_task_cpu(p, dst_cpu);
+}
+
 /*
  * detach_one_task() -- tries to dequeue exactly one task from env->src_rq, as
  * part of active balancing operations within "domain".
@@ -12974,13 +13095,6 @@ static int sched_balance_newidle(struct rq *this_rq, struct rq_flags *rf)
 	if (this_rq->ttwu_pending)
 		return 0;
 
-	/*
-	 * We must set idle_stamp _before_ calling sched_balance_rq()
-	 * for CPU_NEWLY_IDLE, such that we measure the this duration
-	 * as idle time.
-	 */
-	this_rq->idle_stamp = rq_clock(this_rq);
-
 	/*
 	 * Do not pull tasks towards !active CPUs...
 	 */
@@ -13090,9 +13204,7 @@ static int sched_balance_newidle(struct rq *this_rq, struct rq_flags *rf)
 	if (time_after(this_rq->next_balance, next_balance))
 		this_rq->next_balance = next_balance;
 
-	if (pulled_task)
-		this_rq->idle_stamp = 0;
-	else
+	if (!pulled_task)
 		nohz_newidle_balance(this_rq);
 
 	rq_repin_lock(this_rq, rf);
@@ -13146,6 +13258,150 @@ void sched_balance_trigger(struct rq *rq)
 		nohz_balancer_kick(rq);
 }
 
+/*
+ * Search the runnable tasks in @cfs_rq in order of next to run, and find
+ * the first one that can be migrated to @dst_rq. @cfs_rq is locked on entry.
+ * On success, dequeue the task from @cfs_rq and return it, else return NULL.
+ */
+static struct task_struct *
+detach_next_task(struct cfs_rq *cfs_rq, struct rq *dst_rq)
+{
+	int dst_cpu = dst_rq->cpu;
+	struct task_struct *p;
+	struct rq *rq = rq_of(cfs_rq);
+
+	lockdep_assert_rq_held(rq);
+
+	list_for_each_entry_reverse(p, &rq->cfs_tasks, se.group_node) {
+		if (can_migrate_task_llc(p, rq, dst_rq)) {
+			detach_task_steal(p, rq, dst_cpu);
+			return p;
+		}
+	}
+	return NULL;
+}
+
+/*
+ * Attempt to migrate a CFS task from @src_cpu to @dst_rq. @locked indicates
+ * whether @dst_rq is already locked on entry. This function may lock or
+ * unlock @dst_rq, and updates @locked to indicate the locked state on return.
+ * The locking protocol is based on sched_balance_newidle().
+ * Returns 1 on success and 0 on failure.
+ */
+static int steal_from(struct rq *dst_rq, struct rq_flags *dst_rf, bool *locked,
+		      int src_cpu)
+{
+	struct task_struct *p;
+	struct rq_flags rf;
+	int stolen = 0;
+	int dst_cpu = dst_rq->cpu;
+	struct rq *src_rq = cpu_rq(src_cpu);
+
+	if (dst_cpu == src_cpu || src_rq->cfs.h_nr_runnable < 2)
+		return 0;
+
+	if (*locked) {
+		rq_unpin_lock(dst_rq, dst_rf);
+		raw_spin_rq_unlock(dst_rq);
+		*locked = false;
+	}
+	rq_lock_irqsave(src_rq, &rf);
+	update_rq_clock(src_rq);
+
+	if (src_rq->cfs.h_nr_runnable < 2 || !cpu_active(src_cpu))
+		p = NULL;
+	else
+		p = detach_next_task(&src_rq->cfs, dst_rq);
+
+	rq_unlock(src_rq, &rf);
+
+	if (p) {
+		raw_spin_rq_lock(dst_rq);
+		rq_repin_lock(dst_rq, dst_rf);
+		*locked = true;
+		update_rq_clock(dst_rq);
+		attach_task(dst_rq, p);
+		stolen = 1;
+	}
+	local_irq_restore(rf.flags);
+
+	return stolen;
+}
+
+/*
+ * Conservative upper bound on the max cost of a steal, in nsecs (the typical
+ * cost is 1-2 microsec). Do not steal if average idle time is less.
+ */
+#define SCHED_STEAL_COST 10000
+
+/*
+ * Try to steal a runnable CFS task from a CPU in the same LLC as @dst_rq,
+ * and migrate it to @dst_rq. rq_lock is held on entry and return, but
+ * may be dropped in between. Return 1 on success, 0 on failure, and -1
+ * if a task in a different scheduling class has become runnable on @dst_rq.
+ */
+static int try_steal(struct rq *dst_rq, struct rq_flags *dst_rf)
+{
+	int src_cpu;
+	int dst_cpu = dst_rq->cpu;
+	bool locked = true;
+	int stolen = 0;
+	struct sparsemask *overload_cpus;
+
+	if (!sched_feat(STEAL))
+		return 0;
+
+	if (!cpu_active(dst_cpu))
+		return 0;
+
+	if (dst_rq->avg_idle < SCHED_STEAL_COST)
+		return 0;
+
+	/* Get bitmap of overloaded CPUs in the same LLC as @dst_rq */
+
+	rcu_read_lock();
+	overload_cpus = rcu_dereference(dst_rq->cfs_overload_cpus);
+	if (!overload_cpus) {
+		rcu_read_unlock();
+		return 0;
+	}
+
+#ifdef CONFIG_SCHED_SMT
+	/*
+	 * First try overloaded CPUs on the same core to preserve cache warmth.
+	 */
+	if (static_branch_likely(&sched_smt_present)) {
+		for_each_cpu(src_cpu, cpu_smt_mask(dst_cpu)) {
+			if (sparsemask_test_elem(overload_cpus, src_cpu) &&
+			    steal_from(dst_rq, dst_rf, &locked, src_cpu)) {
+				stolen = 1;
+				goto out;
+			}
+		}
+	}
+#endif	/* CONFIG_SCHED_SMT */
+
+	/* Accept any suitable task in the LLC */
+
+	sparsemask_for_each(overload_cpus, dst_cpu, src_cpu) {
+		if (steal_from(dst_rq, dst_rf, &locked, src_cpu)) {
+			stolen = 1;
+			goto out;
+		}
+	}
+
+out:
+	rcu_read_unlock();
+	if (!locked) {
+		raw_spin_rq_lock(dst_rq);
+		rq_repin_lock(dst_rq, dst_rf);
+	}
+	stolen |= (dst_rq->cfs.h_nr_runnable > 0);
+	if (dst_rq->nr_running != dst_rq->cfs.h_nr_runnable)
+		stolen = -1;
+	return stolen;
+}
+
 static void rq_online_fair(struct rq *rq)
 {
 	update_sysctl();
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index 136a6584be79..a3fc8dd7c824 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -87,6 +87,12 @@ SCHED_FEAT(TTWU_QUEUE, true)
  */
 SCHED_FEAT(SIS_UTIL, true)
 
+/*
+ * Steal a CFS task from another CPU when going idle.
+ * Improves CPU utilization.
+ */
+SCHED_FEAT(STEAL, false)
+
 /*
  * Issue a WARN when we do multiple update_rq_clock() calls
  * in a single rq->lock section. Default disabled because the
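The steal_from()/try_steal() pair added to fair.c above never holds two rq locks at once: steal_from() drops the destination lock, detaches a task while holding only the source lock, then re-takes the destination lock to attach. A compilable userspace analogue of that handoff using pthread mutexes (a sketch only; all names are invented and error handling is omitted):

#include <pthread.h>
#include <stdio.h>

struct toy_rq {
	pthread_mutex_t lock;	/* stands in for rq->lock */
	int nr_tasks;
};

/* dst->lock is held on entry and on return, but dropped in between. */
static int toy_steal(struct toy_rq *dst, struct toy_rq *src)
{
	int stolen = 0;

	pthread_mutex_unlock(&dst->lock);	/* never hold both locks */

	pthread_mutex_lock(&src->lock);
	if (src->nr_tasks >= 2) {		/* re-check under src->lock */
		src->nr_tasks--;		/* models detach_next_task() */
		stolen = 1;
	}
	pthread_mutex_unlock(&src->lock);

	pthread_mutex_lock(&dst->lock);		/* re-take dst before attaching */
	if (stolen)
		dst->nr_tasks++;		/* models attach_task() */
	return stolen;
}

int main(void)
{
	struct toy_rq dst = { PTHREAD_MUTEX_INITIALIZER, 0 };
	struct toy_rq src = { PTHREAD_MUTEX_INITIALIZER, 3 };

	pthread_mutex_lock(&dst.lock);
	printf("stolen=%d dst=%d src=%d\n",
	       toy_steal(&dst, &src), dst.nr_tasks, src.nr_tasks);
	pthread_mutex_unlock(&dst.lock);
	return 0;
}

The same ordering is why try_steal() tracks *locked and re-acquires dst_rq's lock before returning, and why SCHED_STEAL_COST (10 usec) gates the whole attempt on rqs whose average idle period is long enough to pay for it.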
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index e87064bc0c32..1d507d3da2d3 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -82,6 +82,7 @@ struct cfs_rq;
 struct rt_rq;
 struct sched_group;
 struct cpuidle_state;
+struct sparsemask;
 
 #ifdef CONFIG_PARAVIRT
 # include
@@ -1172,6 +1173,7 @@ struct rq {
 	struct cfs_rq cfs;
 	struct rt_rq rt;
 	struct dl_rq dl;
+	struct sparsemask *cfs_overload_cpus;
 #ifdef CONFIG_SCHED_CLASS_EXT
 	struct scx_rq scx;
 #endif
diff --git a/kernel/sched/sparsemask.h b/kernel/sched/sparsemask.h
new file mode 100644
index 000000000000..048443c7dae4
--- /dev/null
+++ b/kernel/sched/sparsemask.h
@@ -0,0 +1,210 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * sparsemask.h - sparse bitmap operations
+ *
+ * Copyright (c) 2018 Oracle Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+
+#ifndef __LINUX_SPARSEMASK_H
+#define __LINUX_SPARSEMASK_H
+
+#include
+#include
+#include
+
+/*
+ * A sparsemask is a sparse bitmap. It reduces cache contention vs the usual
+ * bitmap when many threads concurrently set, clear, and visit elements. For
+ * each cacheline chunk of the mask, only the first K bits of the first word are
+ * used, and the remaining bits are ignored, where K is a creation time
+ * parameter. Thus a sparsemask that can represent a set of N elements is
+ * approximately (N/K * CACHELINE) bytes in size.
+ *
+ * Clients pass and receive element numbers in the public API, and the
+ * implementation translates them to bit numbers to perform the bitmap
+ * operations.
+ */
+
+struct sparsemask_chunk {
+	unsigned long word;	/* the significant bits */
+} ____cacheline_aligned_in_smp;
+
+struct sparsemask {
+	short nelems;		/* maximum number of elements */
+	short density;		/* store 2^density elements per chunk */
+	struct sparsemask_chunk chunks[];	/* embedded array of chunks */
+};
+
+#define _SMASK_INDEX(density, elem)	((elem) >> (density))
+#define _SMASK_BIT(density, elem)	((elem) & ((1U << (density)) - 1U))
+#define SMASK_INDEX(mask, elem)		_SMASK_INDEX((mask)->density, elem)
+#define SMASK_BIT(mask, elem)		_SMASK_BIT((mask)->density, elem)
+#define SMASK_WORD(mask, elem)		\
+	(&(mask)->chunks[SMASK_INDEX((mask), (elem))].word)
+
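The overview comment above claims a sparsemask of N elements occupies roughly N/K * CACHELINE bytes, and the _SMASK_* macros split an element number into a chunk index and a bit offset. Both are easy to check with a few lines of plain C (a sketch assuming 64-byte cachelines; the toy_* helpers below are invented, not part of the API):

#include <stddef.h>
#include <stdio.h>

#define TOY_CACHELINE	64	/* assumed cacheline size */

/* Userspace mirrors of _SMASK_INDEX() and _SMASK_BIT(). */
static int toy_index(int density, int elem) { return elem >> density; }
static int toy_bit(int density, int elem)   { return elem & ((1 << density) - 1); }

/* Rough equivalent of sparsemask_size(): header plus one chunk per index. */
static size_t toy_size(int nelems, int density)
{
	return (size_t)(toy_index(density, nelems) + 2) * TOY_CACHELINE;
}

int main(void)
{
	int density = 3;	/* 2^3 = 8 elements per chunk, as sd_llc_alloc() uses later */

	/* CPU 13 lands in chunk 1, bit 5: 13 >> 3 == 1, 13 & 7 == 5 */
	printf("cpu 13 -> chunk %d, bit %d\n",
	       toy_index(density, 13), toy_bit(density, 13));

	/* 256 possible CPUs at density 3: about 34 cachelines, ~2 KB */
	printf("256 cpus -> ~%zu bytes\n", toy_size(256, density));
	return 0;
}

At density 3 only 8 bits of each 64-byte chunk are significant; the waste is deliberate, since it keeps CPUs that live in different chunks from bouncing the same cacheline.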
+/*
+ * sparsemask_next() - Return the next one bit in a bitmap, starting at a
+ * specified position and wrapping from the last bit to the first, up to but
+ * not including a specified origin. This is a helper, so do not call it
+ * directly.
+ *
+ * @mask: Bitmap to search.
+ * @origin: Origin.
+ * @prev: Previous bit. Start search after this bit number.
+ *        If -1, start search at @origin.
+ *
+ * Return: the bit number, else mask->nelems if no bits are set in the range.
+ */
+static inline int
+sparsemask_next(const struct sparsemask *mask, int origin, int prev)
+{
+	int density = mask->density;
+	int bits_per_word = 1U << density;
+	const struct sparsemask_chunk *chunk;
+	int nelems = mask->nelems;
+	int next, bit, nbits;
+	unsigned long word;
+
+	/* Calculate number of bits to be searched. */
+	if (prev == -1) {
+		nbits = nelems;
+		next = origin;
+	} else if (prev < origin) {
+		nbits = origin - prev;
+		next = prev + 1;
+	} else {
+		nbits = nelems - prev + origin - 1;
+		next = prev + 1;
+	}
+
+	if (unlikely(next >= nelems))
+		return nelems;
+
+	/*
+	 * Fetch and adjust first word. Clear word bits below @next, and round
+	 * @next down to @bits_per_word boundary because later ffs will add
+	 * those bits back.
+	 */
+	chunk = &mask->chunks[_SMASK_INDEX(density, next)];
+	bit = _SMASK_BIT(density, next);
+	word = chunk->word & (~0UL << bit);
+	next -= bit;
+	nbits += bit;
+
+	while (!word) {
+		next += bits_per_word;
+		nbits -= bits_per_word;
+		if (nbits <= 0)
+			return nelems;
+
+		if (next >= nelems) {
+			chunk = mask->chunks;
+			nbits -= (next - nelems);
+			next = 0;
+		} else {
+			chunk++;
+		}
+		word = chunk->word;
+	}
+
+	next += __ffs(word);
+	if (next >= origin && prev != -1)
+		return nelems;
+	return next;
+}
+
+/****************** The public API ********************/
+
+/*
+ * Max value for the density parameter, limited by 64 bits in the chunk word.
+ */
+#define SMASK_DENSITY_MAX	6
+
+/*
+ * Return bytes to allocate for a sparsemask, for custom allocators.
+ */
+static inline size_t sparsemask_size(int nelems, int density)
+{
+	int index = _SMASK_INDEX(density, nelems) + 1;
+
+	return offsetof(struct sparsemask, chunks[index]);
+}
+
+/*
+ * Initialize an allocated sparsemask, for custom allocators.
+ */
+static inline void
+sparsemask_init(struct sparsemask *mask, int nelems, int density)
+{
+	WARN_ON(density < 0 || density > SMASK_DENSITY_MAX || nelems < 0);
+	mask->nelems = nelems;
+	mask->density = density;
+}
+
+/*
+ * sparsemask_alloc_node() - Allocate, initialize, and return a sparsemask.
+ *
+ * @nelems - maximum number of elements.
+ * @density - store 2^density elements per cacheline chunk.
+ *            values from 0 to SMASK_DENSITY_MAX inclusive.
+ * @flags - kmalloc allocation flags
+ * @node - numa node
+ */
+static inline struct sparsemask *
+sparsemask_alloc_node(int nelems, int density, gfp_t flags, int node)
+{
+	int nbytes = sparsemask_size(nelems, density);
+	struct sparsemask *mask = kmalloc_node(nbytes, flags, node);
+
+	if (mask)
+		sparsemask_init(mask, nelems, density);
+	return mask;
+}
+
+static inline void sparsemask_free(struct sparsemask *mask)
+{
+	kfree(mask);
+}
+
+static inline void sparsemask_set_elem(struct sparsemask *dst, int elem)
+{
+	set_bit(SMASK_BIT(dst, elem), SMASK_WORD(dst, elem));
+}
+
+static inline void sparsemask_clear_elem(struct sparsemask *dst, int elem)
+{
+	clear_bit(SMASK_BIT(dst, elem), SMASK_WORD(dst, elem));
+}
+
+static inline int sparsemask_test_elem(const struct sparsemask *mask, int elem)
+{
+	return test_bit(SMASK_BIT(mask, elem), SMASK_WORD(mask, elem));
+}
+
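sparsemask_for_each(), defined just below, starts at @origin and wraps past the last element, so a stealing CPU passes its own number as the origin (see try_steal() above) and probes from different CPUs spread out instead of all starting at element 0. The visit order can be modelled with a plain array (illustrative only, not part of the patch):

#include <stdbool.h>
#include <stdio.h>

#define NELEMS 8

int main(void)
{
	/* model of a sparsemask with elements 1, 4 and 6 set */
	bool set[NELEMS] = { [1] = true, [4] = true, [6] = true };
	int origin = 5;

	/* same visit order as sparsemask_for_each(mask, 5, elem) */
	for (int i = 0; i < NELEMS; i++) {
		int elem = (origin + i) % NELEMS;

		if (set[elem])
			printf("visit %d\n", elem);	/* prints 6, then 1, then 4 */
	}
	return 0;
}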
+/*
+ * sparsemask_for_each() - iterate over each set bit in a bitmap, starting at a
+ * specified position, and wrapping from the last bit to the first.
+ *
+ * @mask: Bitmap to iterate over.
+ * @origin: Bit number at which to start searching.
+ * @elem: Iterator. Can be signed or unsigned integer.
+ *
+ * The implementation does not assume any bit in @mask is set, including
+ * @origin. After the loop, @elem = @mask->nelems.
+ */
+#define sparsemask_for_each(mask, origin, elem)				\
+	for ((elem) = -1;						\
+	     (elem) = sparsemask_next((mask), (origin), (elem)),	\
+	     (elem) < (mask)->nelems;)
+
+#endif /* __LINUX_SPARSEMASK_H */
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index 48f671b07af7..f0cfa782cbb1 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -6,6 +6,7 @@
 #include
 #include
 #include "sched.h"
+#include "sparsemask.h"
 
 DEFINE_MUTEX(sched_domains_mutex);
 void sched_domains_mutex_lock(void)
@@ -21,6 +22,12 @@ void sched_domains_mutex_unlock(void)
 static cpumask_var_t sched_domains_tmpmask;
 static cpumask_var_t sched_domains_tmpmask2;
 
+struct s_data;
+static int sd_llc_alloc(struct sched_domain *sd);
+static void sd_llc_free(struct sched_domain *sd);
+static int sd_llc_alloc_all(const struct cpumask *cpu_map, struct s_data *d);
+static void sd_llc_free_all(const struct cpumask *cpu_map);
+
 static int __init sched_debug_setup(char *str)
 {
 	sched_debug_verbose = true;
@@ -625,8 +632,10 @@ static void destroy_sched_domain(struct sched_domain *sd)
 	 */
 	free_sched_groups(sd->groups, 1);
 
-	if (sd->shared && atomic_dec_and_test(&sd->shared->ref))
+	if (sd->shared && atomic_dec_and_test(&sd->shared->ref)) {
+		sd_llc_free(sd);
 		kfree(sd->shared);
+	}
 	kfree(sd);
 }
 
@@ -670,7 +679,9 @@ DEFINE_STATIC_KEY_FALSE(sched_cluster_active);
 
 static void update_top_cache_domain(int cpu)
 {
+	struct sparsemask *cfs_overload_cpus = NULL;
 	struct sched_domain_shared *sds = NULL;
+	struct rq *rq = cpu_rq(cpu);
 	struct sched_domain *sd;
 	int id = cpu;
 	int size = 1;
@@ -680,8 +691,10 @@ static void update_top_cache_domain(int cpu)
 		id = cpumask_first(sched_domain_span(sd));
 		size = cpumask_weight(sched_domain_span(sd));
 		sds = sd->shared;
+		cfs_overload_cpus = sds->cfs_overload_cpus;
 	}
 
+	rcu_assign_pointer(rq->cfs_overload_cpus, cfs_overload_cpus);
 	rcu_assign_pointer(per_cpu(sd_llc, cpu), sd);
 	per_cpu(sd_llc_size, cpu) = size;
 	per_cpu(sd_llc_id, cpu) = id;
@@ -1541,6 +1554,7 @@ static void __free_domain_allocs(struct s_data *d, enum s_alloc what,
 		free_percpu(d->sd);
 		fallthrough;
 	case sa_sd_storage:
+		sd_llc_free_all(cpu_map);
 		__sdt_free(cpu_map);
 		fallthrough;
 	case sa_none:
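update_top_cache_domain() above publishes the per-LLC bitmap with rcu_assign_pointer(), and the readers added in fair.c (overload_set(), overload_clear(), try_steal()) fetch it with rcu_dereference() inside an RCU read-side section, so a NULL pointer simply means "no bitmap yet". At minimum that is a release-store/acquire-load pairing; the userspace sketch below models only that ordering with C11 atomics (the grace-period half of RCU is not modelled, and all names are invented):

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

struct toy_mask { unsigned long word; };

/* stands in for rq->cfs_overload_cpus */
static _Atomic(struct toy_mask *) overload_cpus;

static void *reader(void *arg)
{
	/* like overload_set(): the pointer may legitimately still be NULL */
	struct toy_mask *m = atomic_load_explicit(&overload_cpus,
						  memory_order_acquire);

	if (m)
		m->word |= 1UL << 3;	/* the kernel uses atomic set_bit() here */
	return NULL;
}

int main(void)
{
	struct toy_mask *m = calloc(1, sizeof(*m));
	pthread_t t;

	if (!m)
		return 1;

	/* like update_top_cache_domain(): publish a fully initialized mask */
	atomic_store_explicit(&overload_cpus, m, memory_order_release);

	pthread_create(&t, NULL, reader, NULL);
	pthread_join(t, NULL);
	printf("word=%#lx\n", m->word);
	free(m);
	return 0;
}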
@@ -2394,6 +2408,78 @@ static void __sdt_free(const struct cpumask *cpu_map)
 	}
 }
 
+static int sd_llc_alloc(struct sched_domain *sd)
+{
+	struct sched_domain_shared *sds = sd->shared;
+	struct cpumask *span = sched_domain_span(sd);
+	int nid = cpu_to_node(cpumask_first(span));
+	int flags = __GFP_ZERO | GFP_KERNEL;
+	struct sparsemask *mask;
+
+	/*
+	 * Allocate the bitmap if not already allocated. This is called for
+	 * every CPU in the LLC but only allocates once per sd_llc_shared.
+	 */
+	if (!sds->cfs_overload_cpus) {
+		mask = sparsemask_alloc_node(nr_cpu_ids, 3, flags, nid);
+		if (!mask)
+			return 1;
+		sds->cfs_overload_cpus = mask;
+	}
+
+	return 0;
+}
+
+static void sd_llc_free(struct sched_domain *sd)
+{
+	struct sched_domain_shared *sds = sd->shared;
+
+	if (!sds)
+		return;
+
+	sparsemask_free(sds->cfs_overload_cpus);
+	sds->cfs_overload_cpus = NULL;
+}
+
+static int sd_llc_alloc_all(const struct cpumask *cpu_map, struct s_data *d)
+{
+	struct sched_domain *sd, *hsd;
+	int i;
+
+	for_each_cpu(i, cpu_map) {
+		/* Find highest domain that shares resources */
+		hsd = NULL;
+		for (sd = *per_cpu_ptr(d->sd, i); sd; sd = sd->parent) {
+			if (!(sd->flags & SD_SHARE_LLC))
+				break;
+			hsd = sd;
+		}
+		if (hsd && sd_llc_alloc(hsd))
+			return 1;
+	}
+
+	return 0;
+}
+
+static void sd_llc_free_all(const struct cpumask *cpu_map)
+{
+	struct sched_domain_topology_level *tl;
+	struct sched_domain *sd;
+	struct sd_data *sdd;
+	int j;
+
+	for_each_sd_topology(tl) {
+		sdd = &tl->data;
+		if (!sdd || !sdd->sd)
+			continue;
+		for_each_cpu(j, cpu_map) {
+			sd = *per_cpu_ptr(sdd->sd, j);
+			if (sd)
+				sd_llc_free(sd);
+		}
+	}
+}
+
 static struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl,
 		const struct cpumask *cpu_map, struct sched_domain_attr *attr,
 		struct sched_domain *child, int cpu)
@@ -2616,6 +2702,14 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
 		}
 	}
 
+	/*
+	 * Allocate shared sd data at last level cache. Must be done after
+	 * domains are built above, but before the data is used in
+	 * cpu_attach_domain and descendants below.
+	 */
+	if (sd_llc_alloc_all(cpu_map, &d))
+		goto error;
+
 	/* Attach the domains */
 	rcu_read_lock();
 	for_each_cpu(i, cpu_map) {