diff --git a/scheds/rust/scx_mitosis/src/bpf/dsq.bpf.h b/scheds/rust/scx_mitosis/src/bpf/dsq.bpf.h index a1fc7c1d66..6af255deed 100644 --- a/scheds/rust/scx_mitosis/src/bpf/dsq.bpf.h +++ b/scheds/rust/scx_mitosis/src/bpf/dsq.bpf.h @@ -176,7 +176,7 @@ static inline s32 get_cpu_from_dsq(dsq_id_t dsq_id) } /* Helper functions to construct DSQ IDs */ -static inline dsq_id_t get_cpu_dsq_id(u32 cpu) +static __always_inline dsq_id_t get_cpu_dsq_id(u32 cpu) { // Check for valid CPU range, 0 indexed so >=. if (cpu >= MAX_CPUS) { @@ -187,7 +187,8 @@ static inline dsq_id_t get_cpu_dsq_id(u32 cpu) return (dsq_id_t){ .cpu_dsq = { .cpu = cpu, .type = DSQ_TYPE_CPU } }; } -static inline dsq_id_t get_cell_llc_dsq_id(u32 cell, u32 llc) +/* __always_inline is technically required for a fn w/ aggregate return */ +static __always_inline dsq_id_t get_cell_llc_dsq_id(u32 cell, u32 llc) { if (cell >= MAX_CELLS || llc >= MAX_LLCS) { scx_bpf_error("cell %u or llc %u too large", cell, llc); diff --git a/scheds/rust/scx_mitosis/src/bpf/intf.h b/scheds/rust/scx_mitosis/src/bpf/intf.h index dc41f6df87..a549f12031 100644 --- a/scheds/rust/scx_mitosis/src/bpf/intf.h +++ b/scheds/rust/scx_mitosis/src/bpf/intf.h @@ -65,6 +65,8 @@ struct debug_event { }; }; +enum migration_source { MIG_SELECT_CPU, MIG_ENQUEUE, MIG_DISPATCH, NR_MIG_SOURCES }; + /* Statistics */ enum cell_stat_idx { CSTAT_LOCAL, diff --git a/scheds/rust/scx_mitosis/src/bpf/llc_aware.bpf.h b/scheds/rust/scx_mitosis/src/bpf/llc_aware.bpf.h index 2f81adaf1f..db8b8a21fa 100644 --- a/scheds/rust/scx_mitosis/src/bpf/llc_aware.bpf.h +++ b/scheds/rust/scx_mitosis/src/bpf/llc_aware.bpf.h @@ -24,6 +24,20 @@ typedef u32 llc_id_t; */ extern u32 cpu_to_llc[MAX_CPUS]; extern struct llc_cpumask llc_to_cpus[MAX_LLCS]; +extern u32 per_cell_steal_min_queued[MAX_CELLS]; + +/* Global migration counters - 2x3 cross-product (read by userspace). + * Same-vs-cross is classified against cpu_to_llc[] above. */ +u64 nr_mig_same_select; +u64 nr_mig_same_enqueue; +u64 nr_mig_same_dispatch; +u64 nr_mig_cross_select; +u64 nr_mig_cross_enqueue; +u64 nr_mig_cross_dispatch; + +/* Counter for tasks moved by the cell-shrink drain in + * recalc_cell_llc_counts. Read by userspace. */ +u64 nr_drained_tasks; static inline bool llc_is_valid(u32 llc_id) { @@ -77,6 +91,7 @@ static __always_inline int recalc_cell_llc_counts(u32 cell_idx, const struct cpu u32 llc, llcs_present = 0, total_cpus = 0; // Just so we don't hold the lock longer than necessary u32 llc_cpu_cnt_tmp[MAX_LLCS] = { 0 }; + u32 llc_cpu_cnt_old[MAX_LLCS] = { 0 }; const struct cpumask *cell_mask; if (explicit_mask) { @@ -97,6 +112,9 @@ static __always_inline int recalc_cell_llc_counts(u32 cell_idx, const struct cpu u32 cnt = bpf_cpumask_weight((const struct cpumask *)tmp_mask); + /* Snapshot old count BEFORE the publish below so the drain's + * >0 -> 0 transition check still works after the reorder. */ + llc_cpu_cnt_old[llc] = READ_ONCE(cell->llcs[llc].cpu_cnt); llc_cpu_cnt_tmp[llc] = cnt; // These are counted across the whole cell @@ -107,7 +125,21 @@ static __always_inline int recalc_cell_llc_counts(u32 cell_idx, const struct cpu llcs_present++; } - // Write to cell + /* + * Publish cpu_cnt FIRST, then drain. Enqueues that read cpu_cnt + * after the publish see 0 and re-pick via update_task_cpumask + * (mitosis_enqueue's cpu_cnt==0 guard); the drain then sweeps + * anything in the DSQ at iterator time. + * Order matters: dispatch never peeks cpu_cnt==0 LLCs and stealing + * skips them, so any task that lands in an orphan (cell, llc) DSQ + * stays stranded indefinitely. + * + * Residual race (not eliminated): an enqueue between its cpu_cnt + * read and its scx_bpf_dsq_insert_vtime call may have read the + * old cpu_cnt and complete its insert after the drain iterator. + * BPF lacks the synchronize_rcu / atomic-insert-if-predicate + * primitives needed to close this fully. + */ scoped_guard(spin_lock, &cell->lock) { for (u32 llc_idx = 0; llc_idx < nr_llc; llc_idx++) { @@ -117,6 +149,70 @@ static __always_inline int recalc_cell_llc_counts(u32 cell_idx, const struct cpu cell->llc_present_cnt = llcs_present; cell->cpu_cnt = total_cpus; } + + /* + * Drain orphan: if an LLC's count just dropped from >0 to 0, any + * tasks queued on (cell, llc) DSQ would be stranded (no consumer). + * Move them to a sibling LLC's DSQ that the cell still has CPUs in. + * + * Done outside the spin_lock above because bpf_for_each(scx_dsq,..) + * and scx_bpf_dsq_move_vtime cannot be called under cell->lock. + * The transition check uses the snapshotted llc_cpu_cnt_old[] + * because cell->llcs[].cpu_cnt now holds the new value. + */ + bpf_for(llc, 0, nr_llc) + { + u32 old_cnt = llc_cpu_cnt_old[llc]; + u32 new_cnt = llc_cpu_cnt_tmp[llc]; + if (!(old_cnt > 0 && new_cnt == 0)) + continue; + + /* Pick destination LLC: first one the cell still has CPUs in. + * TODO: spread across surviving LLCs (round-robin or + * least-loaded); currently we dump all stranded tasks onto + * one LLC. */ + u32 dest_llc = LLC_INVALID; + u32 d; + bpf_for(d, 0, nr_llc) + { + if (llc_cpu_cnt_tmp[d] > 0) { + dest_llc = d; + break; + } + } + if (dest_llc == LLC_INVALID) { + scx_bpf_error( + "recalc_cell_llc_counts: cell %u has no CPUs in any LLC; cannot drain (cell, llc=%u)", + cell_idx, llc); + return -EINVAL; + } + + dsq_id_t src_dsq = get_cell_llc_dsq_id(cell_idx, llc); + dsq_id_t dst_dsq = get_cell_llc_dsq_id(cell_idx, dest_llc); + if (dsq_is_invalid(src_dsq) || dsq_is_invalid(dst_dsq)) + return -EINVAL; + + struct task_struct *p; + u64 dst_vtime = READ_ONCE(cell->llcs[dest_llc].vtime_now); + /* Caller must hold RCU read lock for bpf_iter_scx_dsq_next. */ + bpf_for_each(scx_dsq, p, src_dsq.raw, 0) { + struct task_ctx *tctx = lookup_task_ctx(p); + if (!tctx) + return -ENOENT; /* error already raised by lookup_task_ctx */ + tctx->llc = dest_llc; + tctx->dsq = dst_dsq; + /* Use dest LLC's vtime_now for the move so the task lands + * at the tail of the dest DSQ instead of the head. We can't + * write p->scx.dsq_vtime directly here (iterator-returned + * pointers are RCU read-only), so dispatch's cross-DSQ + * comparison may still see the source-domain dsq_vtime + * until the task next stops and dsq_vtime is recharged. */ + scx_bpf_dsq_move_set_vtime(BPF_FOR_EACH_ITER, dst_vtime); + scx_bpf_dsq_move_vtime(BPF_FOR_EACH_ITER, p, dst_dsq.raw, 0); + __sync_fetch_and_add(&nr_drained_tasks, 1); + } + } + return 0; } @@ -211,9 +307,10 @@ static inline int maybe_retag_stolen_task(struct task_struct *p, struct task_ctx if (tctx->llc == cctx->llc) return 0; - /* Task was stolen to a different LLC - update accounting */ + /* Task was stolen to a different LLC - update accounting. */ tctx->steal_count++; tctx->last_stolen_at = scx_bpf_now(); + tctx->last_cpu_source = MIG_DISPATCH; /* Assign task to new LLC */ tctx->llc = cctx->llc; @@ -225,6 +322,46 @@ static inline int maybe_retag_stolen_task(struct task_struct *p, struct task_ctx return update_task_cpumask(p, tctx); } +/* Bump the appropriate (same|cross)_(select|enqueue|dispatch) counter + * based on whether the new CPU is in the same LLC as the previous one + * and how this task arrived (last_cpu_source). Updates last_ran_cpu. + * First run on a CPU (prev < 0) is uncounted by design. */ +static inline void track_cpu_migration(struct task_struct *p, struct task_ctx *tctx) +{ + s32 cpu = scx_bpf_task_cpu(p); + s32 prev = tctx->last_ran_cpu; + tctx->last_ran_cpu = cpu; + + if (!(prev >= 0 && prev != cpu && prev < nr_possible_cpus && cpu < nr_possible_cpus)) + return; + + if (cpu_to_llc[prev] == cpu_to_llc[cpu]) { + switch (tctx->last_cpu_source) { + case MIG_SELECT_CPU: + __sync_fetch_and_add(&nr_mig_same_select, 1); + break; + case MIG_ENQUEUE: + __sync_fetch_and_add(&nr_mig_same_enqueue, 1); + break; + case MIG_DISPATCH: + __sync_fetch_and_add(&nr_mig_same_dispatch, 1); + break; + } + } else { + switch (tctx->last_cpu_source) { + case MIG_SELECT_CPU: + __sync_fetch_and_add(&nr_mig_cross_select, 1); + break; + case MIG_ENQUEUE: + __sync_fetch_and_add(&nr_mig_cross_enqueue, 1); + break; + case MIG_DISPATCH: + __sync_fetch_and_add(&nr_mig_cross_dispatch, 1); + break; + } + } +} + /* Work stealing: * Scan sibling (cell,LLC) DSQs in the same cell and steal the first queued task if it can run on this cpu * Returns: @@ -244,50 +381,55 @@ static inline s32 try_stealing_work(u32 cell, s32 local_llc) return -EINVAL; // Loop over all other LLCs, looking for a queued task to steal + /* Find the most backed-up sibling LLC and steal from it, but only + * if its queue depth exceeds per_cell_steal_min_queued[cell] (the + * per-cell threshold maintained by userspace's adaptive controller; + * default 0 means "steal whenever a sibling has any queued task"). */ u32 i; + u32 max_queued = 0; + dsq_id_t max_dsq = DSQ_INVALID; + bpf_for(i, 1, nr_llc) { - // Start with the next one to spread out the load u32 candidate_llc = (local_llc + i) % nr_llc; - // Prevents the optimizer from removing the following conditional return - // so that the verifier knows the read will be safe barrier_var(candidate_llc); if (candidate_llc >= MAX_LLCS) continue; /* - * Skip if the cell doesn't have CPUs in this LLC. - * Note: rechecking cell_ptr for verifier. - * This is racy with try_stealing_this_task, but we don't care - - * if the LLC actually doesn't have CPUs come steal time, - * we will fail the steal and continue to the next LLC. - */ + * Skip LLCs the cell has no CPUs in. Read is racy with reconfig + * but harmless: a stale-positive just means the steal will fail. + * cell_ptr re-check is for the verifier. + */ if (cell_ptr && READ_ONCE(cell_ptr->llcs[candidate_llc].cpu_cnt) == 0) continue; dsq_id_t candidate_dsq = get_cell_llc_dsq_id(cell, candidate_llc); if (dsq_is_invalid(candidate_dsq)) - return -EINVAL; // already errored in get_cell_llc_dsq_id - - // Optimization: skip if faster than constructing an iterator - // Not redundant with later checking if task found (race) - if (!scx_bpf_dsq_nr_queued(candidate_dsq.raw)) - continue; + return -EINVAL; - /* - * Attempt the steal - can fail because it's a race. - * We don't update task_ctx here because the peeked task_ctx - * may be stale (a different task may now be at head of DSQ). - * Actual retag and accounting happens in running() via - * mismatch detection. - */ - if (!scx_bpf_dsq_move_to_local(candidate_dsq.raw, 0)) - continue; + u32 nr = scx_bpf_dsq_nr_queued(candidate_dsq.raw); + if (nr > max_queued) { + max_queued = nr; + max_dsq = candidate_dsq; + } + } - // Success, we got a task - return true; + /* + * Single steal attempt; can fail under race (e.g. another CPU + * consumed the head first). Stolen task's LLC retag happens + * lazily in mitosis_running via maybe_retag_stolen_task. + * + * TODO: on race the CPU goes idle even if other siblings have + * queued tasks. Track second-best during the scan and try it on + * failure (cheap, ~6 lines, single-pass), or fall back to a full + * sibling sweep. + */ + if (max_queued > per_cell_steal_min_queued[cell] && !dsq_is_invalid(max_dsq)) { + if (scx_bpf_dsq_move_to_local(max_dsq.raw, 0)) + return true; } return false; } diff --git a/scheds/rust/scx_mitosis/src/bpf/mitosis.bpf.c b/scheds/rust/scx_mitosis/src/bpf/mitosis.bpf.c index d363a5fd66..7c8ec337d1 100644 --- a/scheds/rust/scx_mitosis/src/bpf/mitosis.bpf.c +++ b/scheds/rust/scx_mitosis/src/bpf/mitosis.bpf.c @@ -50,6 +50,8 @@ const volatile bool userspace_managed_cell_mode = false; const volatile bool enable_borrowing = false; const volatile bool use_lockless_peek = false; const volatile bool dynamic_affinity_cpu_selection = false; +/* Per-cell sibling-LLC queue-depth threshold gating work-stealing; tuned by userspace. */ +u32 per_cell_steal_min_queued[MAX_CELLS]; /* * Global arrays for LLC topology, populated by userspace before load. @@ -66,6 +68,7 @@ struct llc_cpumask llc_to_cpus[MAX_LLCS]; u32 configuration_seq; u32 applied_configuration_seq; u32 cpuset_seq; + u32 applied_cpuset_seq; /* @@ -584,10 +587,7 @@ static __always_inline s32 try_pick_idle_cpu(struct task_struct *p, s32 prev_cpu * enqueue), SCX_DSQ_LOCAL resolves to task_rq(p) -- not * the idle CPU we picked. */ - tctx->vtime_charge_cell = tctx->cell; - scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL_ON | cpu, slice_ns, 0); - if (kick) - scx_bpf_kick_cpu(cpu, SCX_KICK_IDLE); + place_task_on_cpu(p, tctx, cpu, kick); return cpu; } if (cpu == -1) @@ -608,10 +608,7 @@ static __always_inline s32 try_pick_idle_cpu(struct task_struct *p, s32 prev_cpu if (cpu >= 0) { tctx->borrowed = true; cstat_inc(CSTAT_BORROWED, tctx->cell, cctx); - tctx->vtime_charge_cell = tctx->cell; - scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL_ON | cpu, slice_ns, 0); - if (kick) - scx_bpf_kick_cpu(cpu, SCX_KICK_IDLE); + place_task_on_cpu(p, tctx, cpu, kick); return cpu; } } @@ -684,6 +681,8 @@ s32 BPF_STRUCT_OPS(mitosis_select_cpu, struct task_struct *p, s32 prev_cpu, u64 if (!(cctx = lookup_cpu_ctx(-1)) || !(tctx = lookup_task_ctx(p))) return prev_cpu; + tctx->last_cpu_source = MIG_SELECT_CPU; + if (maybe_refresh_cell(p, tctx) < 0) return prev_cpu; @@ -772,6 +771,8 @@ void BPF_STRUCT_OPS(mitosis_enqueue, struct task_struct *p, u64 enq_flags) if (!(tctx = lookup_task_ctx(p)) || !(cctx = lookup_cpu_ctx(-1))) return; + tctx->last_cpu_source = MIG_ENQUEUE; + if (maybe_refresh_cell(p, tctx) < 0) return; @@ -841,6 +842,22 @@ void BPF_STRUCT_OPS(mitosis_enqueue, struct task_struct *p, u64 enq_flags) return; } + /* If the cell no longer owns CPUs in this LLC, the + * (cell, llc) DSQ has no consumer. Re-pick by + * resetting cpumask and re-running LLC assignment. + * update_task_cpumask resets tctx->cpumask to + * (cell & affinity) before re-narrowing by LLC. */ + if (READ_ONCE(cell->llcs[tctx->llc].cpu_cnt) == 0) { + if (update_task_cpumask(p, tctx) < 0) + return; + /* re-validate for verifier; vtime was reset */ + if (!llc_is_valid(tctx->llc)) { + scx_bpf_error("Invalid LLC after re-pick: %d", tctx->llc); + return; + } + vtime = p->scx.dsq_vtime; + } + basis_vtime = READ_ONCE(cell->llcs[tctx->llc].vtime_now); } else { basis_vtime = READ_ONCE(cell->llcs[FAKE_FLAT_CELL_LLC].vtime_now); @@ -917,6 +934,10 @@ void BPF_STRUCT_OPS(mitosis_dispatch, s32 cpu, struct task_struct *prev) /* Peek at cell-LLC DSQ head */ p = dsq_peek(cell_dsq.raw); if (p) { + struct task_ctx *dtctx = lookup_task_ctx(p); + if (!dtctx) + return; + dtctx->last_cpu_source = MIG_DISPATCH; min_vtime = p->scx.dsq_vtime; min_vtime_dsq = cell_dsq; found = true; @@ -925,6 +946,10 @@ void BPF_STRUCT_OPS(mitosis_dispatch, s32 cpu, struct task_struct *prev) /* Peek at CPU DSQ head, prefer if lower vtime */ p = dsq_peek(cpu_dsq.raw); if (p && (!found || time_before(p->scx.dsq_vtime, min_vtime))) { + struct task_ctx *dtctx = lookup_task_ctx(p); + if (!dtctx) + return; + dtctx->last_cpu_source = MIG_DISPATCH; min_vtime = p->scx.dsq_vtime; min_vtime_dsq = cpu_dsq; found = true; @@ -936,15 +961,15 @@ void BPF_STRUCT_OPS(mitosis_dispatch, s32 cpu, struct task_struct *prev) * SCX_TASK_QUEUED (we don't set SCX_OPS_ENQ_LAST), and otherwise go idle. */ if (!found) { - /* Try work stealing if enabled */ + /* We can't stamp the task with MIG_DISPATCH here, but we + * detect the LLC change in maybe_retag_stolen_task */ if (enable_llc_awareness && enable_work_stealing) { /* Returns: <0 error, 0 no steal, >0 stole work */ s32 ret = try_stealing_work(cell, llc); if (ret < 0) return; - if (ret > 0) { + if (ret > 0) cstat_inc(CSTAT_STEAL, cell, cctx); - } } return; } @@ -1342,12 +1367,18 @@ void BPF_STRUCT_OPS(mitosis_running, struct task_struct *p) if (!(cctx = lookup_cpu_ctx(-1)) || !(tctx = lookup_task_ctx(p))) return; + /* Stolen tasks skip select_cpu/enqueue, so refresh here as a catch-all. */ + if (maybe_refresh_cell(p, tctx) < 0) + return; + /* Handle stolen task retag (LLC-aware mode only) */ if (enable_llc_awareness && enable_work_stealing) { if (maybe_retag_stolen_task(p, tctx, cctx) < 0) return; } + track_cpu_migration(p, tctx); + /* Record the running slice start time. */ tctx->started_running_at = scx_bpf_now(); @@ -1740,6 +1771,8 @@ static int init_task_impl(struct task_struct *p, struct cgroup *cgrp) return -EINVAL; } + tctx->last_ran_cpu = -1; + /* Initialize LLC assignment fields */ if (enable_llc_awareness) init_task_llc(tctx); @@ -1826,14 +1859,28 @@ void BPF_STRUCT_OPS(mitosis_dump, struct scx_dump_ctx *dctx) scx_bpf_dump("CELL[%d] CPUS=", i); dump_cell_cpumask(i); scx_bpf_dump("\n"); - /* Per-LLC stats deferred: FAKE_FLAT_CELL_LLC used for now */ - dsq_id_t dsq_id = get_cell_llc_dsq_id(i, FAKE_FLAT_CELL_LLC); - if (dsq_is_invalid(dsq_id)) - return; - - scx_bpf_dump("CELL[%d] vtime=%llu nr_queued=%d\n", i, - READ_ONCE(cell->llcs[FAKE_FLAT_CELL_LLC].vtime_now), - scx_bpf_dsq_nr_queued(dsq_id.raw)); + if (enable_llc_awareness) { + u32 llc; + bpf_for(llc, 0, nr_llc) + { + if (llc >= MAX_LLCS) + break; + dsq_id_t llc_dsq = get_cell_llc_dsq_id(i, llc); + if (dsq_is_invalid(llc_dsq)) + continue; + scx_bpf_dump( + "CELL[%d] LLC[%d] cpu_cnt=%u vtime=%llu nr_queued=%d\n", i, + llc, READ_ONCE(cell->llcs[llc].cpu_cnt), + READ_ONCE(cell->llcs[llc].vtime_now), + scx_bpf_dsq_nr_queued(llc_dsq.raw)); + } + } else { + dsq_id_t dsq_id = get_cell_llc_dsq_id(i, FAKE_FLAT_CELL_LLC); + if (!dsq_is_invalid(dsq_id)) + scx_bpf_dump("CELL[%d] vtime=%llu nr_queued=%d\n", i, + READ_ONCE(cell->llcs[FAKE_FLAT_CELL_LLC].vtime_now), + scx_bpf_dsq_nr_queued(dsq_id.raw)); + } } bpf_for(i, 0, nr_possible_cpus) @@ -2248,6 +2295,20 @@ int apply_cell_config(void *ctx) cctx->cell = cell_id; } + if (enable_llc_awareness) { + int rc; + /* recalc_cell_llc_counts may iterate per-cell-LLC DSQs to + * drain orphaned tasks via bpf_for_each(scx_dsq, ...) which + * requires an RCU read CS. */ + scoped_guard(rcu) + { + rc = recalc_cell_llc_counts(cell_id, + (const struct cpumask *)new_cpumask); + } + if (rc) + return -EINVAL; + } + /* Swap the new cpumask into place */ if (publish_prepared_cpumask(&cpumaskw->primary, &new_cpumask)) { scx_bpf_error("failed to publish cpumask for cell_id %d", cell_id); @@ -2272,21 +2333,6 @@ int apply_cell_config(void *ctx) } } - /* Phase 2.5: Recompute per-LLC CPU counts for all cells */ - if (enable_llc_awareness) { - u32 c; - scoped_guard(rcu) - { - bpf_for(c, 0, MAX_CELLS) - { - if (c >= config->num_cells) - break; - if (recalc_cell_llc_counts(c, NULL)) - return -EINVAL; - } - } - } - /* Phase 3: Apply cell-to-cgroup assignments for owner cgroups */ if (config->num_cell_assignments > MAX_CELLS) return -EINVAL; diff --git a/scheds/rust/scx_mitosis/src/bpf/mitosis.bpf.h b/scheds/rust/scx_mitosis/src/bpf/mitosis.bpf.h index bea6af53dc..de7afa48ed 100644 --- a/scheds/rust/scx_mitosis/src/bpf/mitosis.bpf.h +++ b/scheds/rust/scx_mitosis/src/bpf/mitosis.bpf.h @@ -24,6 +24,7 @@ #include extern const volatile u32 nr_llc; +extern const volatile u64 slice_ns; extern struct cell_map cells; @@ -89,6 +90,8 @@ struct task_ctx { s32 llc; u64 avg_runtime_ns; /* EWMA of per-wake runtimes (ns), init to 0 */ + s32 last_ran_cpu; /* CPU this task last ran on (-1 = never) */ + u32 last_cpu_source; /* enum migration_source: who picked the CPU */ u32 steal_count; /* how many times this task has been stolen */ u64 last_stolen_at; /* ns timestamp of the last steal (scx_bpf_now) */ @@ -144,6 +147,19 @@ static inline void cstat_inc(enum cell_stat_idx idx, u32 cell, struct cpu_ctx *c cstat_add(idx, cell, cctx, 1); } +/* Common dispatch ritual: charge cell vtime to this cell, insert onto + * cpu's local DSQ, optionally kick the CPU. Caller is responsible for + * any per-path bookkeeping (cstat counters, tctx->borrowed) BEFORE + * invoking this. */ +static __always_inline void place_task_on_cpu(struct task_struct *p, struct task_ctx *tctx, s32 cpu, + bool kick) +{ + tctx->vtime_charge_cell = tctx->cell; + scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL_ON | cpu, slice_ns, 0); + if (kick) + scx_bpf_kick_cpu(cpu, SCX_KICK_IDLE); +} + struct cell_map { __uint(type, BPF_MAP_TYPE_ARRAY); __type(key, u32); diff --git a/scheds/rust/scx_mitosis/src/main.rs b/scheds/rust/scx_mitosis/src/main.rs index 80cc4041b6..c8556b331c 100644 --- a/scheds/rust/scx_mitosis/src/main.rs +++ b/scheds/rust/scx_mitosis/src/main.rs @@ -209,6 +209,23 @@ struct Opts { #[clap(long, default_value = "500")] slice_shrink_min_us: u64, + /// Enable adaptive steal threshold controller. Modulates per-cell queue + /// depth threshold to target --steal-target-pct. Requires --enable-work-stealing. + #[clap(long, action = clap::ArgAction::SetTrue)] + enable_adaptive_stealing: bool, + + /// Minimum value for adaptive steal queue depth threshold. + #[clap(long, default_value = "0")] + steal_queued_min: u32, + + /// Maximum value for adaptive steal queue depth threshold. + #[clap(long, default_value = "20")] + steal_queued_max: u32, + + /// Target steal percentage for adaptive threshold controller. + #[clap(long, default_value = "2.0")] + steal_target_pct: f64, + #[clap(flatten, next_help_heading = "Libbpf Options")] pub libbpf: LibbpfOpts, } @@ -264,10 +281,19 @@ struct Scheduler<'a> { last_rebalance: Instant, /// Number of rebalancing events rebalance_count: u64, + /// Previous nr_drained_tasks value for per-rebalance delta reporting + prev_drained_tasks: u64, /// Epoll instance for waiting on multiple fds (inotify, stats wakeup) epoll: Epoll, /// EventFd to wake up main loop when stats are requested stats_waker: EventFd, + /// Previous migration counters (2x3 cross-product) + prev_mig: [u64; 6], // same_{select,enqueue,dispatch}, cross_{select,enqueue,dispatch} + /// Adaptive steal threshold range + enable_adaptive_stealing: bool, + steal_queued_min: u32, + steal_queued_max: u32, + steal_target_pct: f64, } struct DistributionStats { @@ -321,6 +347,9 @@ impl<'a> Scheduler<'a> { if opts.enable_work_stealing && !opts.enable_llc_awareness { bail!("Work stealing requires LLC-aware mode (--enable-llc-awareness)"); } + if opts.enable_adaptive_stealing && !opts.enable_work_stealing { + bail!("--enable-adaptive-stealing requires --enable-work-stealing"); + } Ok(()) } @@ -486,8 +515,14 @@ impl<'a> Scheduler<'a> { smoothed_util: [0.0; MAX_CELLS], last_rebalance: Instant::now(), rebalance_count: 0, + prev_drained_tasks: 0, epoll, stats_waker, + prev_mig: [0; 6], + enable_adaptive_stealing: opts.enable_adaptive_stealing, + steal_queued_min: opts.steal_queued_min, + steal_queued_max: opts.steal_queued_max, + steal_target_pct: opts.steal_target_pct, }) } @@ -791,14 +826,19 @@ impl<'a> Scheduler<'a> { self.rebalance_count += 1; self.metrics.rebalance_count = self.rebalance_count; + let cur_drained = self.skel.maps.bss_data.as_ref().unwrap().nr_drained_tasks; + let drained_this_event = cur_drained - self.prev_drained_tasks; + self.prev_drained_tasks = cur_drained; + let cell_manager = self .cell_manager .as_ref() .expect("BUG: cell_manager missing after apply_cell_config in maybe_rebalance"); info!( - "Rebalanced CPUs (spread={:.1}%, count={}): {}", + "Rebalanced CPUs (spread={:.1}%, count={}, drained={}): {}", spread, self.rebalance_count, + drained_this_event, cell_manager.format_cell_config(&cpu_assignments) ); @@ -1094,6 +1134,35 @@ impl<'a> Scheduler<'a> { + cell_metrics.slice_shrink_proportional + cell_metrics.slice_shrink_min; + let steal_pct = stats.steal_pct; + + if self.enable_adaptive_stealing { + let bss = self.skel.maps.bss_data.as_mut().unwrap(); + let cur = bss.per_cell_steal_min_queued[cell]; + let new_val = if steal_pct > self.steal_target_pct { + (cur + 1).min(self.steal_queued_max) + } else if steal_pct < self.steal_target_pct && cur > self.steal_queued_min { + cur - 1 + } else { + cur + }; + if new_val != cur { + unsafe { + let ptr = &mut bss.per_cell_steal_min_queued[cell] as *mut u32; + std::ptr::write_volatile(ptr, new_val); + } + } + cell_metrics.steal_threshold = new_val; + trace!( + " Cell {:>2} steal: S%={:.1}% thresh={}", + cell, + steal_pct, + new_val + ); + } else { + trace!(" Cell {:>2} steal: S%={:.1}%", cell, steal_pct); + } + trace!("{} {}", prefix, stats); } Ok(()) @@ -1164,6 +1233,43 @@ impl<'a> Scheduler<'a> { .context("collecting demand metrics")?; } + // Log migration counters (2x3 cross-product) + { + let bss = self.skel.maps.bss_data.as_ref().unwrap(); + let cur = [ + bss.nr_mig_same_select, + bss.nr_mig_same_enqueue, + bss.nr_mig_same_dispatch, + bss.nr_mig_cross_select, + bss.nr_mig_cross_enqueue, + bss.nr_mig_cross_dispatch, + ]; + let d: Vec = cur + .iter() + .zip(self.prev_mig.iter()) + .map(|(c, p)| c - p) + .collect(); + self.prev_mig = cur; + + // d[0..3] = same_{select,enqueue,dispatch}, d[3..6] = cross_{select,enqueue,dispatch} + let same = d[0] + d[1] + d[2]; + let cross = d[3] + d[4] + d[5]; + let total = same + cross; + let sel = d[0] + d[3]; + let enq = d[1] + d[4]; + let dis = d[2] + d[5]; + + let k = |n: u64| n / 1000; + let pct = |n: u64, of: u64| if of > 0 { (100 * n / of) as u64 } else { 0 }; + + trace!("Migrations:{:>5}k select_cpu:{:>5}k ({:>2}%) enqueue:{:>5}k ({:>2}%) dispatch:{:>5}k ({:>2}%)", + k(total), k(sel), pct(sel, total), k(enq), pct(enq, total), k(dis), pct(dis, total)); + trace!(" same_llc:{:>5}k ({:>2}%) select_cpu:{:>5}k ({:>2}%) enqueue:{:>5}k ({:>2}%) dispatch:{:>5}k ({:>2}%)", + k(same), pct(same, total), k(d[0]), pct(d[0], same), k(d[1]), pct(d[1], same), k(d[2]), pct(d[2], same)); + trace!(" cross_llc:{:>4}k ({:>2}%) select_cpu:{:>5}k ({:>2}%) enqueue:{:>5}k ({:>2}%) dispatch:{:>5}k ({:>2}%)", + k(cross), pct(cross, total), k(d[3]), pct(d[3], cross), k(d[4]), pct(d[4], cross), k(d[5]), pct(d[5], cross)); + } + for (cell_id, cell) in &self.cells { trace!("CELL[{}]: {}", cell_id, cell.cpus); } diff --git a/scheds/rust/scx_mitosis/src/stats.rs b/scheds/rust/scx_mitosis/src/stats.rs index cce441f0fa..149f95877b 100644 --- a/scheds/rust/scx_mitosis/src/stats.rs +++ b/scheds/rust/scx_mitosis/src/stats.rs @@ -34,6 +34,8 @@ pub struct CellMetrics { pub affn_violations_pct: f64, #[stat(desc = "Steal %")] pub steal_pct: f64, + #[stat(desc = "Adaptive steal queue-depth threshold")] + pub steal_threshold: u32, #[stat(desc = "Pin reject skipped %")] pub pin_skip_pct: f64, #[stat(desc = "Slice shrink events")]