From 7b590155c4fb87397a72236a73368e8c424ab30d Mon Sep 17 00:00:00 2001 From: Wentao Guan Date: Thu, 16 Apr 2026 09:41:20 +0800 Subject: [PATCH 1/2] Revert "FROMEXT: sched: linux6.18.18-bore-6.6.2" This reverts commit 39c5dcbebe7dca0d22c94159eb434d06c48e0301. --- include/linux/sched.h | 34 --- include/linux/sched/bore.h | 41 ---- init/Kconfig | 17 -- kernel/Kconfig.hz | 17 -- kernel/exit.c | 4 - kernel/fork.c | 13 -- kernel/futex/waitwake.c | 11 - kernel/sched/Makefile | 1 - kernel/sched/bore.c | 428 ------------------------------------- kernel/sched/core.c | 12 -- kernel/sched/debug.c | 61 ------ kernel/sched/fair.c | 110 +--------- kernel/sched/sched.h | 9 - 13 files changed, 11 insertions(+), 747 deletions(-) delete mode 100644 include/linux/sched/bore.h delete mode 100644 kernel/sched/bore.c diff --git a/include/linux/sched.h b/include/linux/sched.h index fd47e4572c737..3e2005e9e2f0b 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -817,37 +817,6 @@ struct kmap_ctrl { #endif }; -#ifdef CONFIG_SCHED_BORE -#define BORE_BC_TIMESTAMP_SHIFT 16 - -struct bore_bc { - union { - struct { - u64 timestamp: 48; - u64 penalty: 16; - }; - u64 value; - }; -}; - -struct bore_ctx { - u64 burst_time; - u16 prev_penalty; - u16 curr_penalty; - union { - u16 penalty; - struct { - u8 _; - u8 score; - }; - }; - bool stop_update; - bool futex_waiting; - struct bore_bc subtree; - struct bore_bc group; -}; -#endif /* CONFIG_SCHED_BORE */ - struct task_struct { #ifdef CONFIG_THREAD_INFO_IN_TASK /* @@ -906,9 +875,6 @@ struct task_struct { #ifdef CONFIG_SCHED_CLASS_EXT struct sched_ext_entity scx; #endif -#ifdef CONFIG_SCHED_BORE - struct bore_ctx bore; -#endif /* CONFIG_SCHED_BORE */ const struct sched_class *sched_class; #ifdef CONFIG_SCHED_CORE diff --git a/include/linux/sched/bore.h b/include/linux/sched/bore.h deleted file mode 100644 index fbb73a64d082c..0000000000000 --- a/include/linux/sched/bore.h +++ /dev/null @@ -1,41 +0,0 @@ -#ifndef _KERNEL_SCHED_BORE_H -#define _KERNEL_SCHED_BORE_H - -#include -#include -#include -#include -#include -#include - -#define SCHED_BORE_AUTHOR "Masahito Suzuki" -#define SCHED_BORE_PROGNAME "BORE CPU Scheduler modification" - -#define SCHED_BORE_VERSION "6.6.2" - -extern u8 __read_mostly sched_bore; -DECLARE_STATIC_KEY_TRUE(sched_bore_key); -extern u8 __read_mostly sched_burst_inherit_type; -extern u8 __read_mostly sched_burst_smoothness; -extern u8 __read_mostly sched_burst_penalty_offset; -extern uint __read_mostly sched_burst_penalty_scale; -extern uint __read_mostly sched_burst_cache_lifetime; - -extern u8 effective_prio_bore(struct task_struct *p); -extern void update_curr_bore(struct task_struct *p, u64 delta_exec); -extern void restart_burst_bore(struct task_struct *p); -extern void restart_burst_rescale_deadline_bore(struct task_struct *p); -extern void task_fork_bore(struct task_struct *p, struct task_struct *parent, - u64 clone_flags, u64 now); -extern void sched_init_bore(void); -extern void reset_task_bore(struct task_struct *p); - -extern int sched_bore_update_handler(const struct ctl_table *table, - int write, void __user *buffer, size_t *lenp, loff_t *ppos); -extern int sched_burst_inherit_type_update_handler(const struct ctl_table *table, - int write, void __user *buffer, size_t *lenp, loff_t *ppos); - -extern void reweight_entity( - struct cfs_rq *cfs_rq, struct sched_entity *se, unsigned long weight); - -#endif /* _KERNEL_SCHED_BORE_H */ diff --git a/init/Kconfig b/init/Kconfig index 5f30738ba2a66..5e15735e1d4a1 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1423,23 +1423,6 @@ config CHECKPOINT_RESTORE If unsure, say N here. -config SCHED_BORE - bool "Burst-Oriented Response Enhancer" - default y - help - In Desktop and Mobile computing, one might prefer interactive - tasks to keep responsive no matter what they run in the background. - - Enabling this kernel feature modifies the scheduler to discriminate - tasks by their burst time (runtime since it last went sleeping or - yielding state) and prioritize those that run less bursty. - Such tasks usually include window compositor, widgets backend, - terminal emulator, video playback, games and so on. - With a little impact to scheduling fairness, it may improve - responsiveness especially under heavy background workload. - - If unsure, say Y here. - config SCHED_AUTOGROUP bool "Automatic process group scheduling" select CGROUPS diff --git a/kernel/Kconfig.hz b/kernel/Kconfig.hz index 9eee2005e25f0..ce1435cb08b1e 100644 --- a/kernel/Kconfig.hz +++ b/kernel/Kconfig.hz @@ -57,20 +57,3 @@ config HZ config SCHED_HRTICK def_bool HIGH_RES_TIMERS - -config MIN_BASE_SLICE_NS - int "Default value for min_base_slice_ns" - default 2000000 - help - The BORE Scheduler automatically calculates the optimal base - slice for the configured HZ using the following equation: - - base_slice_ns = - 1000000000/HZ * DIV_ROUNDUP(min_base_slice_ns, 1000000000/HZ) - - This option sets the default lower bound limit of the base slice - to prevent the loss of task throughput due to overscheduling. - - Setting this value too high can cause the system to boot with - an unnecessarily large base slice, resulting in high scheduling - latency and poor system responsiveness. diff --git a/kernel/exit.c b/kernel/exit.c index 0ec0d3db779b2..c8c3ff935a84b 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -147,11 +147,7 @@ static void __unhash_process(struct release_task_post *post, struct task_struct detach_pid(post->pids, p, PIDTYPE_SID); list_del_rcu(&p->tasks); -#ifdef CONFIG_SCHED_BORE - list_del_rcu(&p->sibling); -#else /* !CONFIG_SCHED_BORE */ list_del_init(&p->sibling); -#endif /* CONFIG_SCHED_BORE */ __this_cpu_dec(process_counts); } list_del_rcu(&p->thread_node); diff --git a/kernel/fork.c b/kernel/fork.c index 9a36106454405..924a9e10106b3 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -116,10 +116,6 @@ /* For dup_mmap(). */ #include "../mm/internal.h" -#ifdef CONFIG_SCHED_BORE -#include -#endif /* CONFIG_SCHED_BORE */ - #include #define CREATE_TRACE_POINTS @@ -2324,11 +2320,6 @@ __latent_entropy struct task_struct *copy_process( p->start_time = ktime_get_ns(); p->start_boottime = ktime_get_boottime_ns(); -#ifdef CONFIG_SCHED_BORE - if (likely(p->pid)) - task_fork_bore(p, current, clone_flags, p->start_time); -#endif /* CONFIG_SCHED_BORE */ - /* * Make it visible to the rest of the system, but dont wake it up yet. * Need tasklist lock for parent etc handling! @@ -2402,11 +2393,7 @@ __latent_entropy struct task_struct *copy_process( */ p->signal->has_child_subreaper = p->real_parent->signal->has_child_subreaper || p->real_parent->signal->is_child_subreaper; -#ifdef CONFIG_SCHED_BORE - list_add_tail_rcu(&p->sibling, &p->real_parent->children); -#else /* !CONFIG_SCHED_BORE */ list_add_tail(&p->sibling, &p->real_parent->children); -#endif /* CONFIG_SCHED_BORE */ list_add_tail_rcu(&p->tasks, &init_task.tasks); attach_pid(p, PIDTYPE_TGID); attach_pid(p, PIDTYPE_PGID); diff --git a/kernel/futex/waitwake.c b/kernel/futex/waitwake.c index 6484ad583f3bf..e2bbe5509ec27 100644 --- a/kernel/futex/waitwake.c +++ b/kernel/futex/waitwake.c @@ -4,9 +4,6 @@ #include #include #include -#ifdef CONFIG_SCHED_BORE -#include -#endif /* CONFIG_SCHED_BORE */ #include "futex.h" @@ -358,15 +355,7 @@ void futex_do_wait(struct futex_q *q, struct hrtimer_sleeper *timeout) * is no timeout, or if it has yet to expire. */ if (!timeout || timeout->task) -#ifdef CONFIG_SCHED_BORE - { - current->bore.futex_waiting = true; -#endif /* CONFIG_SCHED_BORE */ schedule(); -#ifdef CONFIG_SCHED_BORE - current->bore.futex_waiting = false; - } -#endif /* CONFIG_SCHED_BORE */ } __set_current_state(TASK_RUNNING); } diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile index b688084bcecc7..8ae86371ddcdd 100644 --- a/kernel/sched/Makefile +++ b/kernel/sched/Makefile @@ -37,4 +37,3 @@ obj-y += core.o obj-y += fair.o obj-y += build_policy.o obj-y += build_utility.o -obj-$(CONFIG_SCHED_BORE) += bore.o diff --git a/kernel/sched/bore.c b/kernel/sched/bore.c deleted file mode 100644 index 759eee843ca32..0000000000000 --- a/kernel/sched/bore.c +++ /dev/null @@ -1,428 +0,0 @@ -/* - * Burst-Oriented Response Enhancer (BORE) CPU Scheduler - * Copyright (C) 2021-2025 Masahito Suzuki - */ -#include -#include -#include -#include "sched.h" - -#ifdef CONFIG_SCHED_BORE -DEFINE_STATIC_KEY_TRUE(sched_bore_key); -u8 __read_mostly sched_bore = 1; -u8 __read_mostly sched_burst_inherit_type = 2; -u8 __read_mostly sched_burst_smoothness = 1; -u8 __read_mostly sched_burst_penalty_offset = 24; -uint __read_mostly sched_burst_penalty_scale = 1536; -uint __read_mostly sched_burst_cache_lifetime = 75000000; -static int __maybe_unused maxval_prio = 39; -static int __maybe_unused maxval_6_bits = 63; -static int __maybe_unused maxval_8_bits = 255; -static int __maybe_unused maxval_12_bits = 4095; - -#define MAX_BURST_PENALTY ((40U << 8) - 1) -#define BURST_CACHE_SAMPLE_LIMIT 63 -#define BURST_CACHE_SCAN_LIMIT (BURST_CACHE_SAMPLE_LIMIT * 2) - -static u32 bore_reciprocal_lut[BURST_CACHE_SAMPLE_LIMIT + 1]; - -DEFINE_STATIC_KEY_TRUE(sched_burst_inherit_key); -DEFINE_STATIC_KEY_TRUE(sched_burst_ancestor_key); - -static inline u32 log2p1_u64_u32fp(u64 v, u8 fp) { - if (unlikely(!v)) return 0; - int clz = __builtin_clzll(v); - int exponent = 64 - clz; - u32 mantissa = (u32)((v << clz) << 1 >> (64 - fp)); - return exponent << fp | mantissa; -} - -static inline u32 calc_burst_penalty(u64 burst_time) { - u32 greed = log2p1_u64_u32fp(burst_time, 8), - tolerance = sched_burst_penalty_offset << 8; - s32 diff = (s32)(greed - tolerance); - u32 penalty = diff & ~(diff >> 31); - u32 scaled_penalty = penalty * sched_burst_penalty_scale >> 10; - s32 overflow = scaled_penalty - MAX_BURST_PENALTY; - return scaled_penalty - (overflow & ~(overflow >> 31)); -} - -static inline u64 rescale_slice(u64 delta, u8 old_prio, u8 new_prio) { - u64 unscaled, rescaled; - unscaled = mul_u64_u32_shr(delta , sched_prio_to_weight[old_prio], 10); - rescaled = mul_u64_u32_shr(unscaled, sched_prio_to_wmult [new_prio], 22); - return rescaled; -} - -static inline u32 binary_smooth(u32 new, u32 old) { - u32 is_growing = (new > old); - u32 increment = (new - old) * is_growing; - u32 shift = sched_burst_smoothness; - u32 smoothed = old + ((increment + (1U << shift) - 1) >> shift); - return (new & ~(-is_growing)) | (smoothed & (-is_growing)); -} - -static void reweight_task_by_prio(struct task_struct *p, int prio) { - if (task_has_idle_policy(p)) return; - - struct sched_entity *se = &p->se; - unsigned long weight = scale_load(sched_prio_to_weight[prio]); - - if (se->on_rq) { - p->bore.stop_update = true; - reweight_entity(cfs_rq_of(se), se, weight); - p->bore.stop_update = false; - } else - se->load.weight = weight; - se->load.inv_weight = sched_prio_to_wmult[prio]; -} - -u8 effective_prio_bore(struct task_struct *p) { - int prio = p->static_prio - MAX_RT_PRIO; - if (static_branch_likely(&sched_bore_key)) - prio += p->bore.score; - prio &= ~(prio >> 31); - s32 diff = prio - maxval_prio; - prio -= (diff & ~(diff >> 31)); - return (u8)prio; -} - -static void update_penalty(struct task_struct *p) { - struct bore_ctx *ctx = &p->bore; - - u8 prev_prio = effective_prio_bore(p); - - s32 diff = (s32)ctx->curr_penalty - (s32)ctx->prev_penalty; - u16 max_val = ctx->curr_penalty - (diff & (diff >> 31)); - u32 is_kthread = !!(p->flags & PF_KTHREAD); - ctx->penalty = max_val & -(s32)(!is_kthread); - - u8 new_prio = effective_prio_bore(p); - if (new_prio != prev_prio) - reweight_task_by_prio(p, new_prio); -} - -void update_curr_bore(struct task_struct *p, u64 delta_exec) { - struct bore_ctx *ctx = &p->bore; - if (ctx->stop_update) return; - - ctx->burst_time += delta_exec; - u32 curr_penalty = ctx->curr_penalty = calc_burst_penalty(ctx->burst_time); - - if (curr_penalty <= ctx->prev_penalty) return; - update_penalty(p); -} - -void restart_burst_bore(struct task_struct *p) { - struct bore_ctx *ctx = &p->bore; - u32 new_penalty = binary_smooth(ctx->curr_penalty, ctx->prev_penalty); - ctx->prev_penalty = new_penalty; - ctx->curr_penalty = 0; - ctx->burst_time = 0; - update_penalty(p); -} - -void restart_burst_rescale_deadline_bore(struct task_struct *p) { - struct sched_entity *se = &p->se; - s64 vscaled, vremain = se->deadline - se->vruntime; - - u8 old_prio = effective_prio_bore(p); - restart_burst_bore(p); - u8 new_prio = effective_prio_bore(p); - - if (old_prio > new_prio) { - vscaled = rescale_slice(abs(vremain), old_prio, new_prio); - if (unlikely(vremain < 0)) - vscaled = -vscaled; - se->deadline = se->vruntime + vscaled; - } -} - -static inline bool task_is_bore_eligible(struct task_struct *p) -{return p && p->sched_class == &fair_sched_class && !p->exit_state;} - -#ifndef for_each_child_task -#define for_each_child_task(p, t) \ - list_for_each_entry_rcu(t, &(p)->children, sibling) -#endif - -static inline u32 count_children_upto2(struct task_struct *p) { - struct list_head *head = &p->children; - struct list_head *next = head->next; - return (next != head) + (next->next != head); -} - -static inline bool burst_cache_expired(struct bore_bc *bc, u64 now) { - struct bore_bc bc_val = { .value = READ_ONCE(bc->value) }; - u64 timestamp = (u64)bc_val.timestamp << BORE_BC_TIMESTAMP_SHIFT; - return now - timestamp > (u64)sched_burst_cache_lifetime; -} - -static void update_burst_cache(struct bore_bc *bc, - struct task_struct *p, u32 count, u32 total, u64 now) { - u32 average = (count == 1) ? total : - (u32)(((u64)total * bore_reciprocal_lut[count]) >> 32); - - struct bore_bc new_bc = { - .penalty = max(average, p->bore.penalty), - .timestamp = now >> BORE_BC_TIMESTAMP_SHIFT - }; - WRITE_ONCE(bc->value, new_bc.value); -} - -static u32 inherit_from_parent(struct task_struct *parent, - u64 clone_flags, u64 now) { - struct bore_bc bc_val; - - if (clone_flags & CLONE_PARENT) - parent = rcu_dereference(parent->real_parent); - - struct bore_bc *bc = &parent->bore.subtree; - - if (burst_cache_expired(bc, now)) { - struct task_struct *child; - u32 count = 0, total = 0, scan_count = 0; - for_each_child_task(parent, child) { - if (count >= BURST_CACHE_SAMPLE_LIMIT) break; - if (scan_count++ >= BURST_CACHE_SCAN_LIMIT) break; - - if (!task_is_bore_eligible(child)) continue; - count++; - total += child->bore.penalty; - } - - update_burst_cache(bc, parent, count, total, now); - } - - bc_val.value = READ_ONCE(bc->value); - return (u32)bc_val.penalty; -} - -static u32 inherit_from_ancestor_hub(struct task_struct *parent, - u64 clone_flags, u64 now) { - struct bore_bc bc_val; - struct task_struct *ancestor = parent; - u32 sole_child_count = 0; - - if (clone_flags & CLONE_PARENT) { - ancestor = rcu_dereference(ancestor->real_parent); - sole_child_count = 1; - } - - for (struct task_struct *next; - (next = rcu_dereference(ancestor->real_parent)) != ancestor && - count_children_upto2(ancestor) <= sole_child_count; - ancestor = next, sole_child_count = 1) {} - - struct bore_bc *bc = &ancestor->bore.subtree; - - if (burst_cache_expired(bc, now)) { - struct task_struct *direct_child; - u32 count = 0, total = 0, scan_count = 0; - for_each_child_task(ancestor, direct_child) { - if (count >= BURST_CACHE_SAMPLE_LIMIT) break; - if (scan_count++ >= BURST_CACHE_SCAN_LIMIT) break; - - struct task_struct *descendant = direct_child; - while (count_children_upto2(descendant) == 1) - descendant = list_first_entry(&descendant->children, - struct task_struct, sibling); - - if (!task_is_bore_eligible(descendant)) continue; - count++; - total += descendant->bore.penalty; - } - - update_burst_cache(bc, ancestor, count, total, now); - } - - bc_val.value = READ_ONCE(bc->value); - return (u32)bc_val.penalty; -} - -static u32 inherit_from_thread_group(struct task_struct *p, u64 now) { - struct bore_bc bc_val; - struct task_struct *leader = p->group_leader; - struct bore_bc *bc = &leader->bore.group; - - if (burst_cache_expired(bc, now)) { - struct task_struct *sibling; - u32 count = 0, total = 0; - - for_each_thread(leader, sibling) { - if (count >= BURST_CACHE_SAMPLE_LIMIT) break; - - if (!task_is_bore_eligible(sibling)) continue; - count++; - total += sibling->bore.penalty; - } - - update_burst_cache(bc, leader, count, total, now); - } - - bc_val.value = READ_ONCE(bc->value); - return (u32)bc_val.penalty; -} - -void task_fork_bore(struct task_struct *p, - struct task_struct *parent, u64 clone_flags, u64 now) { - if (!static_branch_likely(&sched_bore_key) || !task_is_bore_eligible(p)) return; - - rcu_read_lock(); - struct bore_ctx *ctx = &p->bore; - u32 inherited_penalty; - if (clone_flags & CLONE_THREAD) - inherited_penalty = inherit_from_thread_group(parent, now); - else if (static_branch_likely(&sched_burst_inherit_key)) - inherited_penalty = static_branch_likely(&sched_burst_ancestor_key)? - inherit_from_ancestor_hub(parent, clone_flags, now): - inherit_from_parent(parent, clone_flags, now); - else - inherited_penalty = 0; - - if (ctx->prev_penalty < inherited_penalty) - ctx->prev_penalty = inherited_penalty; - ctx->curr_penalty = 0; - ctx->burst_time = 0; - ctx->stop_update = false; - ctx->futex_waiting = false; - update_penalty(p); - rcu_read_unlock(); -} - -void reset_task_bore(struct task_struct *p) -{ memset(&p->bore, 0, sizeof(struct bore_ctx)); } - -static void update_inherit_type(void) { - switch(sched_burst_inherit_type) { - case 1: - static_branch_enable(&sched_burst_inherit_key); - static_branch_disable(&sched_burst_ancestor_key); - break; - case 2: - static_branch_enable(&sched_burst_inherit_key); - static_branch_enable(&sched_burst_ancestor_key); - break; - default: - static_branch_disable(&sched_burst_inherit_key); - break; - } -} - -void __init sched_init_bore(void) { - printk(KERN_INFO "%s %s by %s\n", - SCHED_BORE_PROGNAME, SCHED_BORE_VERSION, SCHED_BORE_AUTHOR); - - for (int i = 1; i <= BURST_CACHE_SAMPLE_LIMIT; i++) - bore_reciprocal_lut[i] = (u32)div64_u64(0xffffffffULL + i, i); - - reset_task_bore(&init_task); - update_inherit_type(); -} - -static void readjust_all_task_weights(void) { - struct task_struct *task; - struct rq *rq; - struct rq_flags rf; - - scoped_guard(write_lock_irq, &tasklist_lock) - for_each_process(task) { - if (!task_is_bore_eligible(task)) continue; - rq = task_rq_lock(task, &rf); - update_rq_clock(rq); - reweight_task_by_prio(task, effective_prio_bore(task)); - task_rq_unlock(rq, task, &rf); - } -} - -int sched_bore_update_handler(const struct ctl_table *table, - int write, void __user *buffer, size_t *lenp, loff_t *ppos) { - int ret = proc_dou8vec_minmax(table, write, buffer, lenp, ppos); - if (ret || !write) - return ret; - - if (sched_bore) - static_branch_enable(&sched_bore_key); - else - static_branch_disable(&sched_bore_key); - - readjust_all_task_weights(); - - return 0; -} - -int sched_burst_inherit_type_update_handler(const struct ctl_table *table, - int write, void __user *buffer, size_t *lenp, loff_t *ppos) { - int ret = proc_dou8vec_minmax(table, write, buffer, lenp, ppos); - if (ret || !write) - return ret; - - update_inherit_type(); - - return 0; -} - -#ifdef CONFIG_SYSCTL -static struct ctl_table sched_bore_sysctls[] = { - { - .procname = "sched_bore", - .data = &sched_bore, - .maxlen = sizeof(u8), - .mode = 0644, - .proc_handler = sched_bore_update_handler, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, - }, - { - .procname = "sched_burst_inherit_type", - .data = &sched_burst_inherit_type, - .maxlen = sizeof(u8), - .mode = 0644, - .proc_handler = sched_burst_inherit_type_update_handler, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_TWO, - }, - { - .procname = "sched_burst_smoothness", - .data = &sched_burst_smoothness, - .maxlen = sizeof(u8), - .mode = 0644, - .proc_handler = proc_dou8vec_minmax, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_THREE, - }, - { - .procname = "sched_burst_penalty_offset", - .data = &sched_burst_penalty_offset, - .maxlen = sizeof(u8), - .mode = 0644, - .proc_handler = proc_dou8vec_minmax, - .extra1 = SYSCTL_ZERO, - .extra2 = &maxval_6_bits, - }, - { - .procname = "sched_burst_penalty_scale", - .data = &sched_burst_penalty_scale, - .maxlen = sizeof(uint), - .mode = 0644, - .proc_handler = proc_douintvec_minmax, - .extra1 = SYSCTL_ZERO, - .extra2 = &maxval_12_bits, - }, - { - .procname = "sched_burst_cache_lifetime", - .data = &sched_burst_cache_lifetime, - .maxlen = sizeof(uint), - .mode = 0644, - .proc_handler = proc_douintvec, - }, -}; - -static int __init sched_bore_sysctl_init(void) { - register_sysctl_init("kernel", sched_bore_sysctls); - return 0; -} -late_initcall(sched_bore_sysctl_init); - -#endif // CONFIG_SYSCTL -#endif /* CONFIG_SCHED_BORE */ diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 87fe3af6bc3a9..582c3847f483a 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -100,10 +100,6 @@ #include "../smpboot.h" #include "../locking/mutex.h" -#ifdef CONFIG_SCHED_BORE -#include -#endif /* CONFIG_SCHED_BORE */ - EXPORT_TRACEPOINT_SYMBOL_GPL(ipi_send_cpu); EXPORT_TRACEPOINT_SYMBOL_GPL(ipi_send_cpumask); @@ -1440,11 +1436,7 @@ int tg_nop(struct task_group *tg, void *data) void set_load_weight(struct task_struct *p, bool update_load) { -#ifdef CONFIG_SCHED_BORE - int prio = effective_prio_bore(p); -#else /* !CONFIG_SCHED_BORE */ int prio = p->static_prio - MAX_RT_PRIO; -#endif /* CONFIG_SCHED_BORE */ struct load_weight lw; if (task_has_idle_policy(p)) { @@ -8670,10 +8662,6 @@ void __init sched_init(void) BUG_ON(!sched_class_above(&ext_sched_class, &idle_sched_class)); #endif -#ifdef CONFIG_SCHED_BORE - sched_init_bore(); -#endif /* CONFIG_SCHED_BORE */ - wait_bit_init(); #ifdef CONFIG_FAIR_GROUP_SCHED diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index ed90b4f942936..93f009e1076d8 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -169,53 +169,6 @@ static const struct file_operations sched_feat_fops = { .release = single_release, }; -#ifdef CONFIG_SCHED_BORE -#define DEFINE_SYSCTL_SCHED_FUNC(name, update_func) \ -static ssize_t sched_##name##_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) \ -{ \ - char buf[16]; \ - unsigned int value; \ -\ - if (cnt > 15) \ - cnt = 15; \ -\ - if (copy_from_user(&buf, ubuf, cnt)) \ - return -EFAULT; \ - buf[cnt] = '\0'; \ -\ - if (kstrtouint(buf, 10, &value)) \ - return -EINVAL; \ -\ - sysctl_sched_##name = value; \ - sched_update_##update_func(); \ -\ - *ppos += cnt; \ - return cnt; \ -} \ -\ -static int sched_##name##_show(struct seq_file *m, void *v) \ -{ \ - seq_printf(m, "%d\n", sysctl_sched_##name); \ - return 0; \ -} \ -\ -static int sched_##name##_open(struct inode *inode, struct file *filp) \ -{ \ - return single_open(filp, sched_##name##_show, NULL); \ -} \ -\ -static const struct file_operations sched_##name##_fops = { \ - .open = sched_##name##_open, \ - .write = sched_##name##_write, \ - .read = seq_read, \ - .llseek = seq_lseek, \ - .release = single_release, \ -}; - -DEFINE_SYSCTL_SCHED_FUNC(min_base_slice, min_base_slice) - -#undef DEFINE_SYSCTL_SCHED_FUNC -#else /* !CONFIG_SCHED_BORE */ static ssize_t sched_scaling_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) { @@ -261,7 +214,6 @@ static const struct file_operations sched_scaling_fops = { .llseek = seq_lseek, .release = single_release, }; -#endif /* CONFIG_SCHED_BORE */ #ifdef CONFIG_PREEMPT_DYNAMIC @@ -549,19 +501,12 @@ static __init int sched_init_debug(void) debugfs_create_file("preempt", 0644, debugfs_sched, NULL, &sched_dynamic_fops); #endif -#ifdef CONFIG_SCHED_BORE - debugfs_create_file("min_base_slice_ns", 0644, debugfs_sched, NULL, &sched_min_base_slice_fops); - debugfs_create_u32("base_slice_ns", 0444, debugfs_sched, &sysctl_sched_base_slice); -#else /* !CONFIG_SCHED_BORE */ debugfs_create_u32("base_slice_ns", 0644, debugfs_sched, &sysctl_sched_base_slice); -#endif /* CONFIG_SCHED_BORE */ debugfs_create_u32("latency_warn_ms", 0644, debugfs_sched, &sysctl_resched_latency_warn_ms); debugfs_create_u32("latency_warn_once", 0644, debugfs_sched, &sysctl_resched_latency_warn_once); -#if !defined(CONFIG_SCHED_BORE) debugfs_create_file("tunable_scaling", 0644, debugfs_sched, NULL, &sched_scaling_fops); -#endif /* CONFIG_SCHED_BORE */ debugfs_create_u32("migration_cost_ns", 0644, debugfs_sched, &sysctl_sched_migration_cost); debugfs_create_u32("nr_migrate", 0644, debugfs_sched, &sysctl_sched_nr_migrate); @@ -803,9 +748,6 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p) SPLIT_NS(schedstat_val_or_zero(p->stats.sum_sleep_runtime)), SPLIT_NS(schedstat_val_or_zero(p->stats.sum_block_runtime))); -#ifdef CONFIG_SCHED_BORE - SEQ_printf(m, " %2d", p->bore.score); -#endif /* CONFIG_SCHED_BORE */ #ifdef CONFIG_NUMA_BALANCING SEQ_printf(m, " %d %d", task_node(p), task_numa_group_id(p)); #endif @@ -1283,9 +1225,6 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns, __PS("nr_involuntary_switches", p->nivcsw); P(se.load.weight); -#ifdef CONFIG_SCHED_BORE - P(bore.score); -#endif /* CONFIG_SCHED_BORE */ P(se.avg.load_sum); P(se.avg.runnable_sum); P(se.avg.util_sum); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 0fcdadad8d67a..d9777c81db0da 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -58,10 +58,6 @@ #include "stats.h" #include "autogroup.h" -#ifdef CONFIG_SCHED_BORE -#include -#endif /* CONFIG_SCHED_BORE */ - /* * The initial- and re-scaling of tunables is configurable * @@ -71,30 +67,17 @@ * SCHED_TUNABLESCALING_LOG - scaled logarithmically, *1+ilog(ncpus) * SCHED_TUNABLESCALING_LINEAR - scaled linear, *ncpus * - * BORE : default SCHED_TUNABLESCALING_NONE = *1 constant - * EEVDF: default SCHED_TUNABLESCALING_LOG = *(1+ilog(ncpus)) + * (default SCHED_TUNABLESCALING_LOG = *(1+ilog(ncpus)) */ -#ifdef CONFIG_SCHED_BORE -unsigned int sysctl_sched_tunable_scaling = SCHED_TUNABLESCALING_NONE; -#else /* !CONFIG_SCHED_BORE */ unsigned int sysctl_sched_tunable_scaling = SCHED_TUNABLESCALING_LOG; -#endif /* CONFIG_SCHED_BORE */ /* * Minimal preemption granularity for CPU-bound tasks: * - * BORE : base_slice = minimum multiple of nsecs_per_tick >= min_base_slice - * (default min_base_slice = 2000000 constant, units: nanoseconds) - * EEVDF: default 0.70 msec * (1 + ilog(ncpus)), units: nanoseconds + * (default: 0.70 msec * (1 + ilog(ncpus)), units: nanoseconds) */ -#ifdef CONFIG_SCHED_BORE -static const unsigned int nsecs_per_tick = 1000000000ULL / HZ; -unsigned int sysctl_sched_min_base_slice = CONFIG_MIN_BASE_SLICE_NS; -__read_mostly uint sysctl_sched_base_slice = nsecs_per_tick; -#else /* !CONFIG_SCHED_BORE */ unsigned int sysctl_sched_base_slice = 700000ULL; static unsigned int normalized_sysctl_sched_base_slice = 700000ULL; -#endif /* CONFIG_SCHED_BORE */ __read_mostly unsigned int sysctl_sched_migration_cost = 500000UL; @@ -206,13 +189,6 @@ static inline void update_load_set(struct load_weight *lw, unsigned long w) * * This idea comes from the SD scheduler of Con Kolivas: */ -#ifdef CONFIG_SCHED_BORE -static void update_sysctl(void) { - sysctl_sched_base_slice = nsecs_per_tick * - max(1UL, DIV_ROUND_UP(sysctl_sched_min_base_slice, nsecs_per_tick)); -} -void sched_update_min_base_slice(void) { update_sysctl(); } -#else /* !CONFIG_SCHED_BORE */ static unsigned int get_update_sysctl_factor(void) { unsigned int cpus = min_t(unsigned int, num_online_cpus(), 8); @@ -243,7 +219,6 @@ static void update_sysctl(void) SET_SYSCTL(sched_base_slice); #undef SET_SYSCTL } -#endif /* CONFIG_SCHED_BORE */ void __init sched_init_granularity(void) { @@ -982,11 +957,7 @@ struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq) */ static inline void set_protect_slice(struct cfs_rq *cfs_rq, struct sched_entity *se) { -#ifdef CONFIG_SCHED_BORE - u64 slice = sysctl_sched_base_slice; -#else /* CONFIG_SCHED_BORE */ u64 slice = normalized_sysctl_sched_base_slice; -#endif /* CONFIG_SCHED_BORE */ u64 vprot = se->deadline; if (sched_feat(RUN_TO_PARITY)) @@ -1054,11 +1025,6 @@ static struct sched_entity *__pick_eevdf(struct cfs_rq *cfs_rq, bool protect) curr = NULL; if (curr && protect && protect_slice(curr)) -#ifdef CONFIG_SCHED_BORE - if (!static_branch_likely(&sched_bore_key) || - !entity_is_task(curr) || - !task_of(curr)->bore.futex_waiting) -#endif /* CONFIG_SCHED_BORE */ return curr; /* Pick the leftmost entity if it's eligible */ @@ -1120,7 +1086,6 @@ struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq) /************************************************************** * Scheduling class statistics methods: */ -#if !defined(CONFIG_SCHED_BORE) int sched_update_scaling(void) { unsigned int factor = get_update_sysctl_factor(); @@ -1132,7 +1097,6 @@ int sched_update_scaling(void) return 0; } -#endif /* CONFIG_SCHED_BORE */ static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se); @@ -1331,11 +1295,6 @@ static void update_curr(struct cfs_rq *cfs_rq) resched = update_deadline(cfs_rq, curr); if (entity_is_task(curr)) { -#ifdef CONFIG_SCHED_BORE - struct task_struct *p = task_of(curr); - update_curr_bore(p, delta_exec); -#endif /* CONFIG_SCHED_BORE */ - /* * If the fair_server is active, we need to account for the * fair_server time whether or not the task is running on @@ -3874,7 +3833,7 @@ dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) static void place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags); -void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, +static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, unsigned long weight) { bool curr = cfs_rq->curr == se; @@ -5237,11 +5196,12 @@ void __setparam_fair(struct task_struct *p, const struct sched_attr *attr) static void place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) { - u64 vslice = 0, vruntime = avg_vruntime(cfs_rq); + u64 vslice, vruntime = avg_vruntime(cfs_rq); s64 lag = 0; if (!se->custom_slice) se->slice = sysctl_sched_base_slice; + vslice = calc_delta_fair(se->slice, se); /* * Due to how V is constructed as the weighted average of entities, @@ -5326,18 +5286,7 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) se->rel_deadline = 0; return; } -#ifdef CONFIG_SCHED_BORE - if (static_branch_likely(&sched_bore_key) && - entity_is_task(se) && - task_of(se)->bore.futex_waiting) - goto vslice_found; -#endif /* !CONFIG_SCHED_BORE */ - vslice = calc_delta_fair(se->slice, se); -#ifdef CONFIG_SCHED_BORE - if (static_branch_likely(&sched_bore_key)) - vslice >>= !!(flags & (ENQUEUE_INITIAL | ENQUEUE_WAKEUP)); - else -#endif /* CONFIG_SCHED_BORE */ + /* * When joining the competition; the existing tasks will be, * on average, halfway through their slice, as such start tasks @@ -5346,9 +5295,6 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) if (sched_feat(PLACE_DEADLINE_INITIAL) && (flags & ENQUEUE_INITIAL)) vslice /= 2; -#ifdef CONFIG_SCHED_BORE -vslice_found: -#endif /* CONFIG_SCHED_BORE */ /* * EEVDF: vd_i = ve_i + r_i/w_i */ @@ -5359,7 +5305,7 @@ static void check_enqueue_throttle(struct cfs_rq *cfs_rq); static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq); static void -requeue_delayed_entity(struct sched_entity *se, int flags); +requeue_delayed_entity(struct sched_entity *se); static void enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) @@ -5517,10 +5463,6 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) if (sched_feat(DELAY_DEQUEUE) && delay && !entity_eligible(cfs_rq, se)) { update_load_avg(cfs_rq, se, 0); -#ifdef CONFIG_SCHED_BORE - if (static_branch_likely(&sched_bore_key) && sched_feat(DELAY_ZERO)) - update_entity_lag(cfs_rq, se); -#endif /* CONFIG_SCHED_BORE */ set_delayed(se); return false; } @@ -7001,7 +6943,7 @@ static int sched_idle_cpu(int cpu) } static void -requeue_delayed_entity(struct sched_entity *se, int flags) +requeue_delayed_entity(struct sched_entity *se) { struct cfs_rq *cfs_rq = cfs_rq_of(se); @@ -7014,22 +6956,13 @@ requeue_delayed_entity(struct sched_entity *se, int flags) WARN_ON_ONCE(!se->on_rq); if (sched_feat(DELAY_ZERO)) { -#ifdef CONFIG_SCHED_BORE - if (static_branch_likely(&sched_bore_key)) - flags |= ENQUEUE_WAKEUP; - else { -#endif /* CONFIG_SCHED_BORE */ - flags = 0; update_entity_lag(cfs_rq, se); -#ifdef CONFIG_SCHED_BORE - } -#endif /* CONFIG_SCHED_BORE */ if (se->vlag > 0) { cfs_rq->nr_queued--; if (se != cfs_rq->curr) __dequeue_entity(cfs_rq, se); se->vlag = 0; - place_entity(cfs_rq, se, flags); + place_entity(cfs_rq, se, 0); if (se != cfs_rq->curr) __enqueue_entity(cfs_rq, se); cfs_rq->nr_queued++; @@ -7069,7 +7002,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) util_est_enqueue(&rq->cfs, p); if (flags & ENQUEUE_DELAYED) { - requeue_delayed_entity(se, flags); + requeue_delayed_entity(se); return; } @@ -7087,7 +7020,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) for_each_sched_entity(se) { if (se->on_rq) { if (se->sched_delayed) - requeue_delayed_entity(se, flags); + requeue_delayed_entity(se); break; } cfs_rq = cfs_rq_of(se); @@ -7300,15 +7233,6 @@ static bool dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) util_est_dequeue(&rq->cfs, p); util_est_update(&rq->cfs, p, flags & DEQUEUE_SLEEP); -#ifdef CONFIG_SCHED_BORE - struct cfs_rq *cfs_rq = cfs_rq_of(&p->se); - struct sched_entity *se = &p->se; - if ((flags & DEQUEUE_SLEEP) && entity_is_task(se)) { - if (cfs_rq->curr == se) - update_curr(cfs_rq); - restart_burst_bore(p); - } -#endif /* CONFIG_SCHED_BORE */ if (dequeue_entities(rq, &p->se, flags) < 0) return false; @@ -9131,25 +9055,16 @@ static void yield_task_fair(struct rq *rq) /* * Are we the only task in the tree? */ -#if !defined(CONFIG_SCHED_BORE) if (unlikely(rq->nr_running == 1)) return; clear_buddies(cfs_rq, se); -#endif /* CONFIG_SCHED_BORE */ update_rq_clock(rq); /* * Update run-time statistics of the 'current'. */ update_curr(cfs_rq); -#ifdef CONFIG_SCHED_BORE - restart_burst_rescale_deadline_bore(curr); - if (unlikely(rq->nr_running == 1)) - return; - - clear_buddies(cfs_rq, se); -#endif /* CONFIG_SCHED_BORE */ /* * Tell update_rq_clock() that we've just updated, * so we don't do microscopic update in schedule() @@ -13603,9 +13518,6 @@ static void switched_to_fair(struct rq *rq, struct task_struct *p) WARN_ON_ONCE(p->se.sched_delayed); attach_task_cfs_rq(p); -#ifdef CONFIG_SCHED_BORE - reset_task_bore(p); -#endif /* CONFIG_SCHED_BORE */ set_task_max_allowed_capacity(p); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index e87064bc0c32e..e2f27239a87af 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -2156,11 +2156,7 @@ extern int group_balance_cpu(struct sched_group *sg); extern void update_sched_domain_debugfs(void); extern void dirty_sched_domain_sysctl(int cpu); -#ifdef CONFIG_SCHED_BORE -extern void sched_update_min_base_slice(void); -#else /* !CONFIG_SCHED_BORE */ extern int sched_update_scaling(void); -#endif /* CONFIG_SCHED_BORE */ static inline const struct cpumask *task_user_cpus(struct task_struct *p) { @@ -2839,12 +2835,7 @@ extern void wakeup_preempt(struct rq *rq, struct task_struct *p, int flags); extern __read_mostly unsigned int sysctl_sched_nr_migrate; extern __read_mostly unsigned int sysctl_sched_migration_cost; -#ifdef CONFIG_SCHED_BORE -extern unsigned int sysctl_sched_min_base_slice; -extern __read_mostly uint sysctl_sched_base_slice; -#else /* !CONFIG_SCHED_BORE */ extern unsigned int sysctl_sched_base_slice; -#endif /* CONFIG_SCHED_BORE */ extern int sysctl_resched_latency_warn_ms; extern int sysctl_resched_latency_warn_once; From 9f1b95a18b046ded872ec66365ff13e1ade6b40e Mon Sep 17 00:00:00 2001 From: Masahito S Date: Thu, 16 Apr 2026 05:24:16 +0900 Subject: [PATCH 2/2] FROMEXT: linux6.18.22-bore-6.6.3 Signed-off-by: Wentao Guan --- include/linux/sched.h | 34 +++ include/linux/sched/bore.h | 41 ++++ init/Kconfig | 17 ++ kernel/Kconfig.hz | 17 ++ kernel/exit.c | 4 + kernel/fork.c | 13 ++ kernel/futex/waitwake.c | 11 + kernel/sched/Makefile | 1 + kernel/sched/bore.c | 434 +++++++++++++++++++++++++++++++++++++ kernel/sched/core.c | 12 + kernel/sched/debug.c | 61 ++++++ kernel/sched/fair.c | 126 ++++++++++- kernel/sched/sched.h | 9 + 13 files changed, 769 insertions(+), 11 deletions(-) create mode 100644 include/linux/sched/bore.h create mode 100644 kernel/sched/bore.c diff --git a/include/linux/sched.h b/include/linux/sched.h index 3e2005e9e2f0b..fd47e4572c737 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -817,6 +817,37 @@ struct kmap_ctrl { #endif }; +#ifdef CONFIG_SCHED_BORE +#define BORE_BC_TIMESTAMP_SHIFT 16 + +struct bore_bc { + union { + struct { + u64 timestamp: 48; + u64 penalty: 16; + }; + u64 value; + }; +}; + +struct bore_ctx { + u64 burst_time; + u16 prev_penalty; + u16 curr_penalty; + union { + u16 penalty; + struct { + u8 _; + u8 score; + }; + }; + bool stop_update; + bool futex_waiting; + struct bore_bc subtree; + struct bore_bc group; +}; +#endif /* CONFIG_SCHED_BORE */ + struct task_struct { #ifdef CONFIG_THREAD_INFO_IN_TASK /* @@ -875,6 +906,9 @@ struct task_struct { #ifdef CONFIG_SCHED_CLASS_EXT struct sched_ext_entity scx; #endif +#ifdef CONFIG_SCHED_BORE + struct bore_ctx bore; +#endif /* CONFIG_SCHED_BORE */ const struct sched_class *sched_class; #ifdef CONFIG_SCHED_CORE diff --git a/include/linux/sched/bore.h b/include/linux/sched/bore.h new file mode 100644 index 0000000000000..9215c13a91a88 --- /dev/null +++ b/include/linux/sched/bore.h @@ -0,0 +1,41 @@ +#ifndef _KERNEL_SCHED_BORE_H +#define _KERNEL_SCHED_BORE_H + +#include +#include +#include +#include +#include +#include + +#define SCHED_BORE_AUTHOR "Masahito Suzuki" +#define SCHED_BORE_PROGNAME "BORE CPU Scheduler modification" + +#define SCHED_BORE_VERSION "6.6.3" + +extern u8 __read_mostly sched_bore; +DECLARE_STATIC_KEY_TRUE(sched_bore_key); +extern u8 __read_mostly sched_burst_inherit_type; +extern u8 __read_mostly sched_burst_smoothness; +extern u8 __read_mostly sched_burst_penalty_offset; +extern uint __read_mostly sched_burst_penalty_scale; +extern uint __read_mostly sched_burst_cache_lifetime; + +extern u8 effective_prio_bore(struct task_struct *p); +extern void update_curr_bore(struct task_struct *p, u64 delta_exec); +extern void restart_burst_bore(struct task_struct *p); +extern void restart_burst_rescale_deadline_bore(struct task_struct *p); +extern void task_fork_bore(struct task_struct *p, struct task_struct *parent, + u64 clone_flags, u64 now); +extern void sched_init_bore(void); +extern void reset_task_bore(struct task_struct *p); + +extern int sched_bore_update_handler(const struct ctl_table *table, + int write, void __user *buffer, size_t *lenp, loff_t *ppos); +extern int sched_burst_inherit_type_update_handler(const struct ctl_table *table, + int write, void __user *buffer, size_t *lenp, loff_t *ppos); + +extern void reweight_entity( + struct cfs_rq *cfs_rq, struct sched_entity *se, unsigned long weight); + +#endif /* _KERNEL_SCHED_BORE_H */ diff --git a/init/Kconfig b/init/Kconfig index 5e15735e1d4a1..5f30738ba2a66 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1423,6 +1423,23 @@ config CHECKPOINT_RESTORE If unsure, say N here. +config SCHED_BORE + bool "Burst-Oriented Response Enhancer" + default y + help + In Desktop and Mobile computing, one might prefer interactive + tasks to keep responsive no matter what they run in the background. + + Enabling this kernel feature modifies the scheduler to discriminate + tasks by their burst time (runtime since it last went sleeping or + yielding state) and prioritize those that run less bursty. + Such tasks usually include window compositor, widgets backend, + terminal emulator, video playback, games and so on. + With a little impact to scheduling fairness, it may improve + responsiveness especially under heavy background workload. + + If unsure, say Y here. + config SCHED_AUTOGROUP bool "Automatic process group scheduling" select CGROUPS diff --git a/kernel/Kconfig.hz b/kernel/Kconfig.hz index ce1435cb08b1e..9eee2005e25f0 100644 --- a/kernel/Kconfig.hz +++ b/kernel/Kconfig.hz @@ -57,3 +57,20 @@ config HZ config SCHED_HRTICK def_bool HIGH_RES_TIMERS + +config MIN_BASE_SLICE_NS + int "Default value for min_base_slice_ns" + default 2000000 + help + The BORE Scheduler automatically calculates the optimal base + slice for the configured HZ using the following equation: + + base_slice_ns = + 1000000000/HZ * DIV_ROUNDUP(min_base_slice_ns, 1000000000/HZ) + + This option sets the default lower bound limit of the base slice + to prevent the loss of task throughput due to overscheduling. + + Setting this value too high can cause the system to boot with + an unnecessarily large base slice, resulting in high scheduling + latency and poor system responsiveness. diff --git a/kernel/exit.c b/kernel/exit.c index c8c3ff935a84b..0ec0d3db779b2 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -147,7 +147,11 @@ static void __unhash_process(struct release_task_post *post, struct task_struct detach_pid(post->pids, p, PIDTYPE_SID); list_del_rcu(&p->tasks); +#ifdef CONFIG_SCHED_BORE + list_del_rcu(&p->sibling); +#else /* !CONFIG_SCHED_BORE */ list_del_init(&p->sibling); +#endif /* CONFIG_SCHED_BORE */ __this_cpu_dec(process_counts); } list_del_rcu(&p->thread_node); diff --git a/kernel/fork.c b/kernel/fork.c index 924a9e10106b3..9a36106454405 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -116,6 +116,10 @@ /* For dup_mmap(). */ #include "../mm/internal.h" +#ifdef CONFIG_SCHED_BORE +#include +#endif /* CONFIG_SCHED_BORE */ + #include #define CREATE_TRACE_POINTS @@ -2320,6 +2324,11 @@ __latent_entropy struct task_struct *copy_process( p->start_time = ktime_get_ns(); p->start_boottime = ktime_get_boottime_ns(); +#ifdef CONFIG_SCHED_BORE + if (likely(p->pid)) + task_fork_bore(p, current, clone_flags, p->start_time); +#endif /* CONFIG_SCHED_BORE */ + /* * Make it visible to the rest of the system, but dont wake it up yet. * Need tasklist lock for parent etc handling! @@ -2393,7 +2402,11 @@ __latent_entropy struct task_struct *copy_process( */ p->signal->has_child_subreaper = p->real_parent->signal->has_child_subreaper || p->real_parent->signal->is_child_subreaper; +#ifdef CONFIG_SCHED_BORE + list_add_tail_rcu(&p->sibling, &p->real_parent->children); +#else /* !CONFIG_SCHED_BORE */ list_add_tail(&p->sibling, &p->real_parent->children); +#endif /* CONFIG_SCHED_BORE */ list_add_tail_rcu(&p->tasks, &init_task.tasks); attach_pid(p, PIDTYPE_TGID); attach_pid(p, PIDTYPE_PGID); diff --git a/kernel/futex/waitwake.c b/kernel/futex/waitwake.c index e2bbe5509ec27..6484ad583f3bf 100644 --- a/kernel/futex/waitwake.c +++ b/kernel/futex/waitwake.c @@ -4,6 +4,9 @@ #include #include #include +#ifdef CONFIG_SCHED_BORE +#include +#endif /* CONFIG_SCHED_BORE */ #include "futex.h" @@ -355,7 +358,15 @@ void futex_do_wait(struct futex_q *q, struct hrtimer_sleeper *timeout) * is no timeout, or if it has yet to expire. */ if (!timeout || timeout->task) +#ifdef CONFIG_SCHED_BORE + { + current->bore.futex_waiting = true; +#endif /* CONFIG_SCHED_BORE */ schedule(); +#ifdef CONFIG_SCHED_BORE + current->bore.futex_waiting = false; + } +#endif /* CONFIG_SCHED_BORE */ } __set_current_state(TASK_RUNNING); } diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile index 8ae86371ddcdd..b688084bcecc7 100644 --- a/kernel/sched/Makefile +++ b/kernel/sched/Makefile @@ -37,3 +37,4 @@ obj-y += core.o obj-y += fair.o obj-y += build_policy.o obj-y += build_utility.o +obj-$(CONFIG_SCHED_BORE) += bore.o diff --git a/kernel/sched/bore.c b/kernel/sched/bore.c new file mode 100644 index 0000000000000..c27a22cd63d60 --- /dev/null +++ b/kernel/sched/bore.c @@ -0,0 +1,434 @@ +/* + * Burst-Oriented Response Enhancer (BORE) CPU Scheduler + * Copyright (C) 2021-2025 Masahito Suzuki + */ +#include +#include +#include +#include "sched.h" + +#ifdef CONFIG_SCHED_BORE +DEFINE_STATIC_KEY_TRUE(sched_bore_key); +u8 __read_mostly sched_bore = 1; +u8 __read_mostly sched_burst_inherit_type = 2; +u8 __read_mostly sched_burst_smoothness = 1; +u8 __read_mostly sched_burst_penalty_offset = 24; +uint __read_mostly sched_burst_penalty_scale = 1536; +uint __read_mostly sched_burst_cache_lifetime = 75000000; +static int __maybe_unused maxval_prio = 39; +static int __maybe_unused maxval_6_bits = 63; +static int __maybe_unused maxval_8_bits = 255; +static int __maybe_unused maxval_12_bits = 4095; + +#define MAX_BURST_PENALTY ((40U << 8) - 1) +#define BURST_CACHE_SAMPLE_LIMIT 63 +#define BURST_CACHE_SCAN_LIMIT (BURST_CACHE_SAMPLE_LIMIT * 2) + +static u32 bore_reciprocal_lut[BURST_CACHE_SAMPLE_LIMIT + 1]; + +DEFINE_STATIC_KEY_TRUE(sched_burst_inherit_key); +DEFINE_STATIC_KEY_TRUE(sched_burst_ancestor_key); + +static inline u32 log2p1_u64_u32fp(u64 v, u8 fp) { + if (unlikely(!v)) return 0; + int clz = __builtin_clzll(v); + int exponent = 64 - clz; + u32 mantissa = (u32)((v << clz) << 1 >> (64 - fp)); + return exponent << fp | mantissa; +} + +static inline u32 calc_burst_penalty(u64 burst_time) { + u32 greed = log2p1_u64_u32fp(burst_time, 8), + tolerance = sched_burst_penalty_offset << 8; + s32 diff = (s32)(greed - tolerance); + u32 penalty = diff & ~(diff >> 31); + u32 scaled_penalty = penalty * sched_burst_penalty_scale >> 10; + s32 overflow = scaled_penalty - MAX_BURST_PENALTY; + return scaled_penalty - (overflow & ~(overflow >> 31)); +} + +static inline u64 rescale_slice(u64 delta, u8 old_prio, u8 new_prio) { + u64 unscaled, rescaled; + unscaled = mul_u64_u32_shr(delta , sched_prio_to_weight[old_prio], 10); + rescaled = mul_u64_u32_shr(unscaled, sched_prio_to_wmult [new_prio], 22); + return rescaled; +} + +static inline u32 binary_smooth(u32 new, u32 old) { + u32 is_growing = (new > old); + u32 increment = (new - old) * is_growing; + u32 shift = sched_burst_smoothness; + u32 smoothed = old + ((increment + (1U << shift) - 1) >> shift); + return (new & ~(-is_growing)) | (smoothed & (-is_growing)); +} + +static void reweight_task_by_prio(struct task_struct *p, int prio) { + if (task_has_idle_policy(p)) return; + + struct sched_entity *se = &p->se; + unsigned long weight = scale_load(sched_prio_to_weight[prio]); + + if (se->on_rq) { + p->bore.stop_update = true; + reweight_entity(cfs_rq_of(se), se, weight); + p->bore.stop_update = false; + } else + se->load.weight = weight; + se->load.inv_weight = sched_prio_to_wmult[prio]; +} + +u8 effective_prio_bore(struct task_struct *p) { + int prio = p->static_prio - MAX_RT_PRIO; + if (static_branch_likely(&sched_bore_key)) + prio += p->bore.score; + prio &= ~(prio >> 31); + s32 diff = prio - maxval_prio; + prio -= (diff & ~(diff >> 31)); + return (u8)prio; +} + +static void update_penalty(struct task_struct *p) { + struct bore_ctx *ctx = &p->bore; + + u8 prev_prio = effective_prio_bore(p); + + s32 diff = (s32)ctx->curr_penalty - (s32)ctx->prev_penalty; + u16 max_val = ctx->curr_penalty - (diff & (diff >> 31)); + u32 is_kthread = !!(p->flags & PF_KTHREAD); + ctx->penalty = max_val & -(s32)(!is_kthread); + + u8 new_prio = effective_prio_bore(p); + if (new_prio != prev_prio) + reweight_task_by_prio(p, new_prio); +} + +void update_curr_bore(struct task_struct *p, u64 delta_exec) { + struct bore_ctx *ctx = &p->bore; + if (ctx->stop_update) return; + + ctx->burst_time += delta_exec; + u32 curr_penalty = ctx->curr_penalty = calc_burst_penalty(ctx->burst_time); + + if (curr_penalty <= ctx->prev_penalty) return; + update_penalty(p); +} + +void restart_burst_bore(struct task_struct *p) { + struct bore_ctx *ctx = &p->bore; + u32 new_penalty = binary_smooth(ctx->curr_penalty, ctx->prev_penalty); + ctx->prev_penalty = new_penalty; + ctx->curr_penalty = 0; + ctx->burst_time = 0; + update_penalty(p); +} + +void restart_burst_rescale_deadline_bore(struct task_struct *p) { + struct sched_entity *se = &p->se; + s64 vscaled, vremain = se->deadline - se->vruntime; + + u8 old_prio = effective_prio_bore(p); + restart_burst_bore(p); + u8 new_prio = effective_prio_bore(p); + + if (old_prio > new_prio) { + vscaled = rescale_slice(abs(vremain), old_prio, new_prio); + if (unlikely(vremain < 0)) + vscaled = -vscaled; + se->deadline = se->vruntime + vscaled; + } +} + +static inline bool task_is_bore_eligible(struct task_struct *p) +{return p && p->sched_class == &fair_sched_class && !p->exit_state;} + +#ifndef for_each_child_task +#define for_each_child_task(p, t) \ + list_for_each_entry_rcu(t, &(p)->children, sibling) +#endif + +static inline u32 count_children_upto2(struct task_struct *p) { + struct list_head *head = &p->children; + struct list_head *first = READ_ONCE(head->next); + struct list_head *second = READ_ONCE(first->next); + return (first != head) + (second != head); +} + +static inline bool burst_cache_expired(struct bore_bc *bc, u64 now) { + struct bore_bc bc_val = { .value = READ_ONCE(bc->value) }; + u64 timestamp = (u64)bc_val.timestamp << BORE_BC_TIMESTAMP_SHIFT; + return now - timestamp > (u64)sched_burst_cache_lifetime; +} + +static void update_burst_cache(struct bore_bc *bc, + struct task_struct *p, u32 count, u32 total, u64 now) { + u32 average = (count == 1) ? total : + (u32)(((u64)total * bore_reciprocal_lut[count]) >> 32); + + struct bore_bc new_bc = { + .penalty = max(average, p->bore.penalty), + .timestamp = now >> BORE_BC_TIMESTAMP_SHIFT + }; + WRITE_ONCE(bc->value, new_bc.value); +} + +static u32 inherit_from_parent(struct task_struct *parent, + u64 clone_flags, u64 now) { + struct bore_bc bc_val; + + if (clone_flags & CLONE_PARENT) + parent = rcu_dereference(parent->real_parent); + + struct bore_bc *bc = &parent->bore.subtree; + + if (burst_cache_expired(bc, now)) { + struct task_struct *child; + u32 count = 0, total = 0, scan_count = 0; + for_each_child_task(parent, child) { + if (count >= BURST_CACHE_SAMPLE_LIMIT) break; + if (scan_count++ >= BURST_CACHE_SCAN_LIMIT) break; + + if (!task_is_bore_eligible(child)) continue; + count++; + total += child->bore.penalty; + } + + update_burst_cache(bc, parent, count, total, now); + } + + bc_val.value = READ_ONCE(bc->value); + return (u32)bc_val.penalty; +} + +static u32 inherit_from_ancestor_hub(struct task_struct *parent, + u64 clone_flags, u64 now) { + struct bore_bc bc_val; + struct task_struct *ancestor = parent; + u32 sole_child_count = 0; + + if (clone_flags & CLONE_PARENT) { + ancestor = rcu_dereference(ancestor->real_parent); + sole_child_count = 1; + } + + for (struct task_struct *next; + (next = rcu_dereference(ancestor->real_parent)) != ancestor && + count_children_upto2(ancestor) <= sole_child_count; + ancestor = next, sole_child_count = 1) {} + + struct bore_bc *bc = &ancestor->bore.subtree; + + if (burst_cache_expired(bc, now)) { + struct task_struct *direct_child; + u32 count = 0, total = 0, scan_count = 0; + for_each_child_task(ancestor, direct_child) { + if (count >= BURST_CACHE_SAMPLE_LIMIT) break; + if (scan_count++ >= BURST_CACHE_SCAN_LIMIT) break; + + struct task_struct *descendant = direct_child; + while (count_children_upto2(descendant) == 1) { + struct task_struct *next_descendant = + list_first_or_null_rcu(&descendant->children, + struct task_struct, sibling); + if (!next_descendant) break; + descendant = next_descendant; + } + + if (!task_is_bore_eligible(descendant)) continue; + count++; + total += descendant->bore.penalty; + } + + update_burst_cache(bc, ancestor, count, total, now); + } + + bc_val.value = READ_ONCE(bc->value); + return (u32)bc_val.penalty; +} + +static u32 inherit_from_thread_group(struct task_struct *p, u64 now) { + struct bore_bc bc_val; + struct task_struct *leader = p->group_leader; + struct bore_bc *bc = &leader->bore.group; + + if (burst_cache_expired(bc, now)) { + struct task_struct *sibling; + u32 count = 0, total = 0, scan_count = 0; + + for_each_thread(leader, sibling) { + if (count >= BURST_CACHE_SAMPLE_LIMIT) break; + if (scan_count++ >= BURST_CACHE_SCAN_LIMIT) break; + + if (!task_is_bore_eligible(sibling)) continue; + count++; + total += sibling->bore.penalty; + } + + update_burst_cache(bc, leader, count, total, now); + } + + bc_val.value = READ_ONCE(bc->value); + return (u32)bc_val.penalty; +} + +void task_fork_bore(struct task_struct *p, + struct task_struct *parent, u64 clone_flags, u64 now) { + if (!static_branch_likely(&sched_bore_key) || !task_is_bore_eligible(p)) return; + + rcu_read_lock(); + struct bore_ctx *ctx = &p->bore; + u32 inherited_penalty; + if (clone_flags & CLONE_THREAD) + inherited_penalty = inherit_from_thread_group(parent, now); + else if (static_branch_likely(&sched_burst_inherit_key)) + inherited_penalty = static_branch_likely(&sched_burst_ancestor_key)? + inherit_from_ancestor_hub(parent, clone_flags, now): + inherit_from_parent(parent, clone_flags, now); + else + inherited_penalty = 0; + + if (ctx->prev_penalty < inherited_penalty) + ctx->prev_penalty = inherited_penalty; + ctx->curr_penalty = 0; + ctx->burst_time = 0; + ctx->stop_update = false; + ctx->futex_waiting = false; + update_penalty(p); + rcu_read_unlock(); +} + +void reset_task_bore(struct task_struct *p) +{ memset(&p->bore, 0, sizeof(struct bore_ctx)); } + +static void update_inherit_type(void) { + switch(sched_burst_inherit_type) { + case 1: + static_branch_enable(&sched_burst_inherit_key); + static_branch_disable(&sched_burst_ancestor_key); + break; + case 2: + static_branch_enable(&sched_burst_inherit_key); + static_branch_enable(&sched_burst_ancestor_key); + break; + default: + static_branch_disable(&sched_burst_inherit_key); + break; + } +} + +void __init sched_init_bore(void) { + printk(KERN_INFO "%s %s by %s\n", + SCHED_BORE_PROGNAME, SCHED_BORE_VERSION, SCHED_BORE_AUTHOR); + + for (int i = 1; i <= BURST_CACHE_SAMPLE_LIMIT; i++) + bore_reciprocal_lut[i] = (u32)div64_u64(0xffffffffULL + i, i); + + reset_task_bore(&init_task); + update_inherit_type(); +} + +static void readjust_all_task_weights(void) { + struct task_struct *task; + struct rq *rq; + struct rq_flags rf; + + scoped_guard(write_lock_irq, &tasklist_lock) + for_each_process(task) { + if (!task_is_bore_eligible(task)) continue; + rq = task_rq_lock(task, &rf); + update_rq_clock(rq); + reweight_task_by_prio(task, effective_prio_bore(task)); + task_rq_unlock(rq, task, &rf); + } +} + +int sched_bore_update_handler(const struct ctl_table *table, + int write, void __user *buffer, size_t *lenp, loff_t *ppos) { + int ret = proc_dou8vec_minmax(table, write, buffer, lenp, ppos); + if (ret || !write) + return ret; + + if (sched_bore) + static_branch_enable(&sched_bore_key); + else + static_branch_disable(&sched_bore_key); + + readjust_all_task_weights(); + + return 0; +} + +int sched_burst_inherit_type_update_handler(const struct ctl_table *table, + int write, void __user *buffer, size_t *lenp, loff_t *ppos) { + int ret = proc_dou8vec_minmax(table, write, buffer, lenp, ppos); + if (ret || !write) + return ret; + + update_inherit_type(); + + return 0; +} + +#ifdef CONFIG_SYSCTL +static struct ctl_table sched_bore_sysctls[] = { + { + .procname = "sched_bore", + .data = &sched_bore, + .maxlen = sizeof(u8), + .mode = 0644, + .proc_handler = sched_bore_update_handler, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, + { + .procname = "sched_burst_inherit_type", + .data = &sched_burst_inherit_type, + .maxlen = sizeof(u8), + .mode = 0644, + .proc_handler = sched_burst_inherit_type_update_handler, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_TWO, + }, + { + .procname = "sched_burst_smoothness", + .data = &sched_burst_smoothness, + .maxlen = sizeof(u8), + .mode = 0644, + .proc_handler = proc_dou8vec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_THREE, + }, + { + .procname = "sched_burst_penalty_offset", + .data = &sched_burst_penalty_offset, + .maxlen = sizeof(u8), + .mode = 0644, + .proc_handler = proc_dou8vec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = &maxval_6_bits, + }, + { + .procname = "sched_burst_penalty_scale", + .data = &sched_burst_penalty_scale, + .maxlen = sizeof(uint), + .mode = 0644, + .proc_handler = proc_douintvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = &maxval_12_bits, + }, + { + .procname = "sched_burst_cache_lifetime", + .data = &sched_burst_cache_lifetime, + .maxlen = sizeof(uint), + .mode = 0644, + .proc_handler = proc_douintvec, + }, +}; + +static int __init sched_bore_sysctl_init(void) { + register_sysctl_init("kernel", sched_bore_sysctls); + return 0; +} +late_initcall(sched_bore_sysctl_init); + +#endif // CONFIG_SYSCTL +#endif /* CONFIG_SCHED_BORE */ diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 582c3847f483a..87fe3af6bc3a9 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -100,6 +100,10 @@ #include "../smpboot.h" #include "../locking/mutex.h" +#ifdef CONFIG_SCHED_BORE +#include +#endif /* CONFIG_SCHED_BORE */ + EXPORT_TRACEPOINT_SYMBOL_GPL(ipi_send_cpu); EXPORT_TRACEPOINT_SYMBOL_GPL(ipi_send_cpumask); @@ -1436,7 +1440,11 @@ int tg_nop(struct task_group *tg, void *data) void set_load_weight(struct task_struct *p, bool update_load) { +#ifdef CONFIG_SCHED_BORE + int prio = effective_prio_bore(p); +#else /* !CONFIG_SCHED_BORE */ int prio = p->static_prio - MAX_RT_PRIO; +#endif /* CONFIG_SCHED_BORE */ struct load_weight lw; if (task_has_idle_policy(p)) { @@ -8662,6 +8670,10 @@ void __init sched_init(void) BUG_ON(!sched_class_above(&ext_sched_class, &idle_sched_class)); #endif +#ifdef CONFIG_SCHED_BORE + sched_init_bore(); +#endif /* CONFIG_SCHED_BORE */ + wait_bit_init(); #ifdef CONFIG_FAIR_GROUP_SCHED diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 93f009e1076d8..ed90b4f942936 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -169,6 +169,53 @@ static const struct file_operations sched_feat_fops = { .release = single_release, }; +#ifdef CONFIG_SCHED_BORE +#define DEFINE_SYSCTL_SCHED_FUNC(name, update_func) \ +static ssize_t sched_##name##_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) \ +{ \ + char buf[16]; \ + unsigned int value; \ +\ + if (cnt > 15) \ + cnt = 15; \ +\ + if (copy_from_user(&buf, ubuf, cnt)) \ + return -EFAULT; \ + buf[cnt] = '\0'; \ +\ + if (kstrtouint(buf, 10, &value)) \ + return -EINVAL; \ +\ + sysctl_sched_##name = value; \ + sched_update_##update_func(); \ +\ + *ppos += cnt; \ + return cnt; \ +} \ +\ +static int sched_##name##_show(struct seq_file *m, void *v) \ +{ \ + seq_printf(m, "%d\n", sysctl_sched_##name); \ + return 0; \ +} \ +\ +static int sched_##name##_open(struct inode *inode, struct file *filp) \ +{ \ + return single_open(filp, sched_##name##_show, NULL); \ +} \ +\ +static const struct file_operations sched_##name##_fops = { \ + .open = sched_##name##_open, \ + .write = sched_##name##_write, \ + .read = seq_read, \ + .llseek = seq_lseek, \ + .release = single_release, \ +}; + +DEFINE_SYSCTL_SCHED_FUNC(min_base_slice, min_base_slice) + +#undef DEFINE_SYSCTL_SCHED_FUNC +#else /* !CONFIG_SCHED_BORE */ static ssize_t sched_scaling_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) { @@ -214,6 +261,7 @@ static const struct file_operations sched_scaling_fops = { .llseek = seq_lseek, .release = single_release, }; +#endif /* CONFIG_SCHED_BORE */ #ifdef CONFIG_PREEMPT_DYNAMIC @@ -501,12 +549,19 @@ static __init int sched_init_debug(void) debugfs_create_file("preempt", 0644, debugfs_sched, NULL, &sched_dynamic_fops); #endif +#ifdef CONFIG_SCHED_BORE + debugfs_create_file("min_base_slice_ns", 0644, debugfs_sched, NULL, &sched_min_base_slice_fops); + debugfs_create_u32("base_slice_ns", 0444, debugfs_sched, &sysctl_sched_base_slice); +#else /* !CONFIG_SCHED_BORE */ debugfs_create_u32("base_slice_ns", 0644, debugfs_sched, &sysctl_sched_base_slice); +#endif /* CONFIG_SCHED_BORE */ debugfs_create_u32("latency_warn_ms", 0644, debugfs_sched, &sysctl_resched_latency_warn_ms); debugfs_create_u32("latency_warn_once", 0644, debugfs_sched, &sysctl_resched_latency_warn_once); +#if !defined(CONFIG_SCHED_BORE) debugfs_create_file("tunable_scaling", 0644, debugfs_sched, NULL, &sched_scaling_fops); +#endif /* CONFIG_SCHED_BORE */ debugfs_create_u32("migration_cost_ns", 0644, debugfs_sched, &sysctl_sched_migration_cost); debugfs_create_u32("nr_migrate", 0644, debugfs_sched, &sysctl_sched_nr_migrate); @@ -748,6 +803,9 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p) SPLIT_NS(schedstat_val_or_zero(p->stats.sum_sleep_runtime)), SPLIT_NS(schedstat_val_or_zero(p->stats.sum_block_runtime))); +#ifdef CONFIG_SCHED_BORE + SEQ_printf(m, " %2d", p->bore.score); +#endif /* CONFIG_SCHED_BORE */ #ifdef CONFIG_NUMA_BALANCING SEQ_printf(m, " %d %d", task_node(p), task_numa_group_id(p)); #endif @@ -1225,6 +1283,9 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns, __PS("nr_involuntary_switches", p->nivcsw); P(se.load.weight); +#ifdef CONFIG_SCHED_BORE + P(bore.score); +#endif /* CONFIG_SCHED_BORE */ P(se.avg.load_sum); P(se.avg.runnable_sum); P(se.avg.util_sum); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index d9777c81db0da..2310dd7c2adc5 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -58,6 +58,10 @@ #include "stats.h" #include "autogroup.h" +#ifdef CONFIG_SCHED_BORE +#include +#endif /* CONFIG_SCHED_BORE */ + /* * The initial- and re-scaling of tunables is configurable * @@ -67,17 +71,30 @@ * SCHED_TUNABLESCALING_LOG - scaled logarithmically, *1+ilog(ncpus) * SCHED_TUNABLESCALING_LINEAR - scaled linear, *ncpus * - * (default SCHED_TUNABLESCALING_LOG = *(1+ilog(ncpus)) + * BORE : default SCHED_TUNABLESCALING_NONE = *1 constant + * EEVDF: default SCHED_TUNABLESCALING_LOG = *(1+ilog(ncpus)) */ +#ifdef CONFIG_SCHED_BORE +unsigned int sysctl_sched_tunable_scaling = SCHED_TUNABLESCALING_NONE; +#else /* !CONFIG_SCHED_BORE */ unsigned int sysctl_sched_tunable_scaling = SCHED_TUNABLESCALING_LOG; +#endif /* CONFIG_SCHED_BORE */ /* * Minimal preemption granularity for CPU-bound tasks: * - * (default: 0.70 msec * (1 + ilog(ncpus)), units: nanoseconds) + * BORE : base_slice = minimum multiple of nsecs_per_tick >= min_base_slice + * (default min_base_slice = 2000000 constant, units: nanoseconds) + * EEVDF: default 0.70 msec * (1 + ilog(ncpus)), units: nanoseconds */ +#ifdef CONFIG_SCHED_BORE +static const unsigned int nsecs_per_tick = 1000000000ULL / HZ; +unsigned int sysctl_sched_min_base_slice = CONFIG_MIN_BASE_SLICE_NS; +__read_mostly uint sysctl_sched_base_slice = nsecs_per_tick; +#else /* !CONFIG_SCHED_BORE */ unsigned int sysctl_sched_base_slice = 700000ULL; static unsigned int normalized_sysctl_sched_base_slice = 700000ULL; +#endif /* CONFIG_SCHED_BORE */ __read_mostly unsigned int sysctl_sched_migration_cost = 500000UL; @@ -189,6 +206,13 @@ static inline void update_load_set(struct load_weight *lw, unsigned long w) * * This idea comes from the SD scheduler of Con Kolivas: */ +#ifdef CONFIG_SCHED_BORE +static void update_sysctl(void) { + sysctl_sched_base_slice = nsecs_per_tick * + max(1UL, DIV_ROUND_UP(sysctl_sched_min_base_slice, nsecs_per_tick)); +} +void sched_update_min_base_slice(void) { update_sysctl(); } +#else /* !CONFIG_SCHED_BORE */ static unsigned int get_update_sysctl_factor(void) { unsigned int cpus = min_t(unsigned int, num_online_cpus(), 8); @@ -219,6 +243,7 @@ static void update_sysctl(void) SET_SYSCTL(sched_base_slice); #undef SET_SYSCTL } +#endif /* CONFIG_SCHED_BORE */ void __init sched_init_granularity(void) { @@ -957,7 +982,11 @@ struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq) */ static inline void set_protect_slice(struct cfs_rq *cfs_rq, struct sched_entity *se) { +#ifdef CONFIG_SCHED_BORE + u64 slice = sysctl_sched_base_slice; +#else /* CONFIG_SCHED_BORE */ u64 slice = normalized_sysctl_sched_base_slice; +#endif /* CONFIG_SCHED_BORE */ u64 vprot = se->deadline; if (sched_feat(RUN_TO_PARITY)) @@ -1025,6 +1054,11 @@ static struct sched_entity *__pick_eevdf(struct cfs_rq *cfs_rq, bool protect) curr = NULL; if (curr && protect && protect_slice(curr)) +#ifdef CONFIG_SCHED_BORE + if (!static_branch_likely(&sched_bore_key) || + !entity_is_task(curr) || + !task_of(curr)->bore.futex_waiting) +#endif /* CONFIG_SCHED_BORE */ return curr; /* Pick the leftmost entity if it's eligible */ @@ -1086,6 +1120,7 @@ struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq) /************************************************************** * Scheduling class statistics methods: */ +#if !defined(CONFIG_SCHED_BORE) int sched_update_scaling(void) { unsigned int factor = get_update_sysctl_factor(); @@ -1097,6 +1132,7 @@ int sched_update_scaling(void) return 0; } +#endif /* CONFIG_SCHED_BORE */ static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se); @@ -1295,6 +1331,11 @@ static void update_curr(struct cfs_rq *cfs_rq) resched = update_deadline(cfs_rq, curr); if (entity_is_task(curr)) { +#ifdef CONFIG_SCHED_BORE + struct task_struct *p = task_of(curr); + update_curr_bore(p, delta_exec); +#endif /* CONFIG_SCHED_BORE */ + /* * If the fair_server is active, we need to account for the * fair_server time whether or not the task is running on @@ -3833,17 +3874,23 @@ dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) static void place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags); -static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, +void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, unsigned long weight) { bool curr = cfs_rq->curr == se; bool rel_vprot = false; u64 vprot; +#ifdef CONFIG_SCHED_BORE + s64 vlag_unscaled = 0; +#endif /* !CONFIG_SCHED_BORE */ if (se->on_rq) { /* commit outstanding execution time */ update_curr(cfs_rq); update_entity_lag(cfs_rq, se); +#ifdef CONFIG_SCHED_BORE + vlag_unscaled = se->vlag; +#endif /* !CONFIG_SCHED_BORE */ se->deadline -= se->vruntime; se->rel_deadline = 1; if (curr && protect_slice(se)) { @@ -3879,6 +3926,16 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, enqueue_load_avg(cfs_rq, se); if (se->on_rq) { +#ifdef CONFIG_SCHED_BORE + if (curr) { + se->vruntime += vlag_unscaled - se->vlag; + if (se->rel_deadline) { + se->deadline += se->vruntime; + se->rel_deadline = 0; + } + } + else +#endif /* !CONFIG_SCHED_BORE */ place_entity(cfs_rq, se, 0); if (rel_vprot) se->vprot = se->vruntime + vprot; @@ -5196,12 +5253,11 @@ void __setparam_fair(struct task_struct *p, const struct sched_attr *attr) static void place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) { - u64 vslice, vruntime = avg_vruntime(cfs_rq); + u64 vslice = 0, vruntime = avg_vruntime(cfs_rq); s64 lag = 0; if (!se->custom_slice) se->slice = sysctl_sched_base_slice; - vslice = calc_delta_fair(se->slice, se); /* * Due to how V is constructed as the weighted average of entities, @@ -5286,7 +5342,18 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) se->rel_deadline = 0; return; } - +#ifdef CONFIG_SCHED_BORE + if (static_branch_likely(&sched_bore_key) && + entity_is_task(se) && + task_of(se)->bore.futex_waiting) + goto vslice_found; +#endif /* !CONFIG_SCHED_BORE */ + vslice = calc_delta_fair(se->slice, se); +#ifdef CONFIG_SCHED_BORE + if (static_branch_likely(&sched_bore_key)) + vslice >>= !!(flags & (ENQUEUE_INITIAL | ENQUEUE_WAKEUP)); + else +#endif /* CONFIG_SCHED_BORE */ /* * When joining the competition; the existing tasks will be, * on average, halfway through their slice, as such start tasks @@ -5295,6 +5362,9 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) if (sched_feat(PLACE_DEADLINE_INITIAL) && (flags & ENQUEUE_INITIAL)) vslice /= 2; +#ifdef CONFIG_SCHED_BORE +vslice_found: +#endif /* CONFIG_SCHED_BORE */ /* * EEVDF: vd_i = ve_i + r_i/w_i */ @@ -5305,7 +5375,7 @@ static void check_enqueue_throttle(struct cfs_rq *cfs_rq); static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq); static void -requeue_delayed_entity(struct sched_entity *se); +requeue_delayed_entity(struct sched_entity *se, int flags); static void enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) @@ -5463,6 +5533,10 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) if (sched_feat(DELAY_DEQUEUE) && delay && !entity_eligible(cfs_rq, se)) { update_load_avg(cfs_rq, se, 0); +#ifdef CONFIG_SCHED_BORE + if (static_branch_likely(&sched_bore_key) && sched_feat(DELAY_ZERO)) + update_entity_lag(cfs_rq, se); +#endif /* CONFIG_SCHED_BORE */ set_delayed(se); return false; } @@ -6943,7 +7017,7 @@ static int sched_idle_cpu(int cpu) } static void -requeue_delayed_entity(struct sched_entity *se) +requeue_delayed_entity(struct sched_entity *se, int flags) { struct cfs_rq *cfs_rq = cfs_rq_of(se); @@ -6956,13 +7030,22 @@ requeue_delayed_entity(struct sched_entity *se) WARN_ON_ONCE(!se->on_rq); if (sched_feat(DELAY_ZERO)) { +#ifdef CONFIG_SCHED_BORE + if (static_branch_likely(&sched_bore_key)) + flags |= ENQUEUE_WAKEUP; + else { +#endif /* CONFIG_SCHED_BORE */ + flags = 0; update_entity_lag(cfs_rq, se); +#ifdef CONFIG_SCHED_BORE + } +#endif /* CONFIG_SCHED_BORE */ if (se->vlag > 0) { cfs_rq->nr_queued--; if (se != cfs_rq->curr) __dequeue_entity(cfs_rq, se); se->vlag = 0; - place_entity(cfs_rq, se, 0); + place_entity(cfs_rq, se, flags); if (se != cfs_rq->curr) __enqueue_entity(cfs_rq, se); cfs_rq->nr_queued++; @@ -7002,7 +7085,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) util_est_enqueue(&rq->cfs, p); if (flags & ENQUEUE_DELAYED) { - requeue_delayed_entity(se); + requeue_delayed_entity(se, flags); return; } @@ -7020,7 +7103,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) for_each_sched_entity(se) { if (se->on_rq) { if (se->sched_delayed) - requeue_delayed_entity(se); + requeue_delayed_entity(se, flags); break; } cfs_rq = cfs_rq_of(se); @@ -7233,6 +7316,15 @@ static bool dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) util_est_dequeue(&rq->cfs, p); util_est_update(&rq->cfs, p, flags & DEQUEUE_SLEEP); +#ifdef CONFIG_SCHED_BORE + struct cfs_rq *cfs_rq = cfs_rq_of(&p->se); + struct sched_entity *se = &p->se; + if ((flags & DEQUEUE_SLEEP) && entity_is_task(se)) { + if (cfs_rq->curr == se) + update_curr(cfs_rq); + restart_burst_bore(p); + } +#endif /* CONFIG_SCHED_BORE */ if (dequeue_entities(rq, &p->se, flags) < 0) return false; @@ -9055,16 +9147,25 @@ static void yield_task_fair(struct rq *rq) /* * Are we the only task in the tree? */ +#if !defined(CONFIG_SCHED_BORE) if (unlikely(rq->nr_running == 1)) return; clear_buddies(cfs_rq, se); +#endif /* CONFIG_SCHED_BORE */ update_rq_clock(rq); /* * Update run-time statistics of the 'current'. */ update_curr(cfs_rq); +#ifdef CONFIG_SCHED_BORE + restart_burst_rescale_deadline_bore(curr); + if (unlikely(rq->nr_running == 1)) + return; + + clear_buddies(cfs_rq, se); +#endif /* CONFIG_SCHED_BORE */ /* * Tell update_rq_clock() that we've just updated, * so we don't do microscopic update in schedule() @@ -13518,6 +13619,9 @@ static void switched_to_fair(struct rq *rq, struct task_struct *p) WARN_ON_ONCE(p->se.sched_delayed); attach_task_cfs_rq(p); +#ifdef CONFIG_SCHED_BORE + reset_task_bore(p); +#endif /* CONFIG_SCHED_BORE */ set_task_max_allowed_capacity(p); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index e2f27239a87af..e87064bc0c32e 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -2156,7 +2156,11 @@ extern int group_balance_cpu(struct sched_group *sg); extern void update_sched_domain_debugfs(void); extern void dirty_sched_domain_sysctl(int cpu); +#ifdef CONFIG_SCHED_BORE +extern void sched_update_min_base_slice(void); +#else /* !CONFIG_SCHED_BORE */ extern int sched_update_scaling(void); +#endif /* CONFIG_SCHED_BORE */ static inline const struct cpumask *task_user_cpus(struct task_struct *p) { @@ -2835,7 +2839,12 @@ extern void wakeup_preempt(struct rq *rq, struct task_struct *p, int flags); extern __read_mostly unsigned int sysctl_sched_nr_migrate; extern __read_mostly unsigned int sysctl_sched_migration_cost; +#ifdef CONFIG_SCHED_BORE +extern unsigned int sysctl_sched_min_base_slice; +extern __read_mostly uint sysctl_sched_base_slice; +#else /* !CONFIG_SCHED_BORE */ extern unsigned int sysctl_sched_base_slice; +#endif /* CONFIG_SCHED_BORE */ extern int sysctl_resched_latency_warn_ms; extern int sysctl_resched_latency_warn_once;