diff --git a/scheds/rust/scx_mitosis/src/bpf/dsq.bpf.h b/scheds/rust/scx_mitosis/src/bpf/dsq.bpf.h
index a1fc7c1d66..6af255deed 100644
--- a/scheds/rust/scx_mitosis/src/bpf/dsq.bpf.h
+++ b/scheds/rust/scx_mitosis/src/bpf/dsq.bpf.h
@@ -176,7 +176,7 @@ static inline s32 get_cpu_from_dsq(dsq_id_t dsq_id)
 }
 
 /* Helper functions to construct DSQ IDs */
-static inline dsq_id_t get_cpu_dsq_id(u32 cpu)
+static __always_inline dsq_id_t get_cpu_dsq_id(u32 cpu)
 {
 	// Check for valid CPU range, 0 indexed so >=.
 	if (cpu >= MAX_CPUS) {
@@ -187,7 +187,8 @@ static inline dsq_id_t get_cpu_dsq_id(u32 cpu)
 	return (dsq_id_t){ .cpu_dsq = { .cpu = cpu, .type = DSQ_TYPE_CPU } };
 }
 
-static inline dsq_id_t get_cell_llc_dsq_id(u32 cell, u32 llc)
+/* __always_inline is technically required for a fn w/ aggregate return */
+static __always_inline dsq_id_t get_cell_llc_dsq_id(u32 cell, u32 llc)
 {
 	if (cell >= MAX_CELLS || llc >= MAX_LLCS) {
 		scx_bpf_error("cell %u or llc %u too large", cell, llc);
diff --git a/scheds/rust/scx_mitosis/src/bpf/intf.h b/scheds/rust/scx_mitosis/src/bpf/intf.h
index dc41f6df87..a549f12031 100644
--- a/scheds/rust/scx_mitosis/src/bpf/intf.h
+++ b/scheds/rust/scx_mitosis/src/bpf/intf.h
@@ -65,6 +65,8 @@ struct debug_event {
 	};
 };
 
+enum migration_source { MIG_SELECT_CPU, MIG_ENQUEUE, MIG_DISPATCH, NR_MIG_SOURCES };
+
 /* Statistics */
 enum cell_stat_idx {
 	CSTAT_LOCAL,
diff --git a/scheds/rust/scx_mitosis/src/bpf/llc_aware.bpf.h b/scheds/rust/scx_mitosis/src/bpf/llc_aware.bpf.h
index 2f81adaf1f..db8b8a21fa 100644
--- a/scheds/rust/scx_mitosis/src/bpf/llc_aware.bpf.h
+++ b/scheds/rust/scx_mitosis/src/bpf/llc_aware.bpf.h
@@ -24,6 +24,20 @@ typedef u32 llc_id_t;
  */
 extern u32 cpu_to_llc[MAX_CPUS];
 extern struct llc_cpumask llc_to_cpus[MAX_LLCS];
+extern u32 per_cell_steal_min_queued[MAX_CELLS];
+
+/* Global migration counters - 2x3 cross-product (read by userspace).
+ * Same-vs-cross is classified against cpu_to_llc[] above. */
+u64 nr_mig_same_select;
+u64 nr_mig_same_enqueue;
+u64 nr_mig_same_dispatch;
+u64 nr_mig_cross_select;
+u64 nr_mig_cross_enqueue;
+u64 nr_mig_cross_dispatch;
+
+/* Counter for tasks moved by the cell-shrink drain in
+ * recalc_cell_llc_counts. Read by userspace. */
+u64 nr_drained_tasks;
 
 static inline bool llc_is_valid(u32 llc_id)
 {
@@ -77,6 +91,7 @@ static __always_inline int recalc_cell_llc_counts(u32 cell_idx, const struct cpu
 	u32 llc, llcs_present = 0, total_cpus = 0;
 	// Just so we don't hold the lock longer than necessary
 	u32 llc_cpu_cnt_tmp[MAX_LLCS] = { 0 };
+	u32 llc_cpu_cnt_old[MAX_LLCS] = { 0 };
 
 	const struct cpumask *cell_mask;
 	if (explicit_mask) {
@@ -97,6 +112,9 @@ static __always_inline int recalc_cell_llc_counts(u32 cell_idx, const struct cpu
 
 		u32 cnt = bpf_cpumask_weight((const struct cpumask *)tmp_mask);
 
+		/* Snapshot old count BEFORE the publish below so the drain's
+		 * >0 -> 0 transition check still works after the reorder. */
+		llc_cpu_cnt_old[llc] = READ_ONCE(cell->llcs[llc].cpu_cnt);
 		llc_cpu_cnt_tmp[llc] = cnt;
 
 		// These are counted across the whole cell
@@ -107,7 +125,21 @@ static __always_inline int recalc_cell_llc_counts(u32 cell_idx, const struct cpu
 			llcs_present++;
 	}
 
-	// Write to cell
+	/*
+	 * Publish cpu_cnt FIRST, then drain. Enqueues that read cpu_cnt
+	 * after the publish see 0 and re-pick via update_task_cpumask
+	 * (mitosis_enqueue's cpu_cnt==0 guard); the drain then sweeps
+	 * anything in the DSQ at iterator time.
+	 * Order matters: dispatch never peeks cpu_cnt==0 LLCs and stealing
+	 * skips them, so any task that lands in an orphan (cell, llc) DSQ
+	 * stays stranded indefinitely.
+	 *
+	 * Residual race (not eliminated): an enqueue between its cpu_cnt
+	 * read and its scx_bpf_dsq_insert_vtime call may have read the
+	 * old cpu_cnt and complete its insert after the drain iterator.
+	 * BPF lacks the synchronize_rcu / atomic-insert-if-predicate
+	 * primitives needed to close this fully.
+	 */
 	scoped_guard(spin_lock, &cell->lock)
 	{
 		for (u32 llc_idx = 0; llc_idx < nr_llc; llc_idx++) {
@@ -117,6 +149,70 @@ static __always_inline int recalc_cell_llc_counts(u32 cell_idx, const struct cpu
 		cell->llc_present_cnt = llcs_present;
 		cell->cpu_cnt = total_cpus;
 	}
+
+	/*
+	 * Drain orphan: if an LLC's count just dropped from >0 to 0, any
+	 * tasks queued on (cell, llc) DSQ would be stranded (no consumer).
+	 * Move them to a sibling LLC's DSQ that the cell still has CPUs in.
+	 *
+	 * Done outside the spin_lock above because bpf_for_each(scx_dsq,..)
+	 * and scx_bpf_dsq_move_vtime cannot be called under cell->lock.
+	 * The transition check uses the snapshotted llc_cpu_cnt_old[]
+	 * because cell->llcs[].cpu_cnt now holds the new value.
+	 */
+	bpf_for(llc, 0, nr_llc)
+	{
+		u32 old_cnt = llc_cpu_cnt_old[llc];
+		u32 new_cnt = llc_cpu_cnt_tmp[llc];
+		if (!(old_cnt > 0 && new_cnt == 0))
+			continue;
+
+		/* Pick destination LLC: first one the cell still has CPUs in.
+		 * TODO: spread across surviving LLCs (round-robin or
+		 * least-loaded); currently we dump all stranded tasks onto
+		 * one LLC. */
+		u32 dest_llc = LLC_INVALID;
+		u32 d;
+		bpf_for(d, 0, nr_llc)
+		{
+			if (llc_cpu_cnt_tmp[d] > 0) {
+				dest_llc = d;
+				break;
+			}
+		}
+		if (dest_llc == LLC_INVALID) {
+			scx_bpf_error(
+				"recalc_cell_llc_counts: cell %u has no CPUs in any LLC; cannot drain (cell, llc=%u)",
+				cell_idx, llc);
+			return -EINVAL;
+		}
+
+		dsq_id_t src_dsq = get_cell_llc_dsq_id(cell_idx, llc);
+		dsq_id_t dst_dsq = get_cell_llc_dsq_id(cell_idx, dest_llc);
+		if (dsq_is_invalid(src_dsq) || dsq_is_invalid(dst_dsq))
+			return -EINVAL;
+
+		struct task_struct *p;
+		u64 dst_vtime = READ_ONCE(cell->llcs[dest_llc].vtime_now);
+		/* Caller must hold RCU read lock for bpf_iter_scx_dsq_next. */
+		bpf_for_each(scx_dsq, p, src_dsq.raw, 0) {
+			struct task_ctx *tctx = lookup_task_ctx(p);
+			if (!tctx)
+				return -ENOENT; /* error already raised by lookup_task_ctx */
+			tctx->llc = dest_llc;
+			tctx->dsq = dst_dsq;
+			/* Use dest LLC's vtime_now for the move so the task lands
+			 * at the tail of the dest DSQ instead of the head. We can't
+			 * write p->scx.dsq_vtime directly here (iterator-returned
+			 * pointers are RCU read-only), so dispatch's cross-DSQ
+			 * comparison may still see the source-domain dsq_vtime
+			 * until the task next stops and dsq_vtime is recharged. */
+			scx_bpf_dsq_move_set_vtime(BPF_FOR_EACH_ITER, dst_vtime);
+			scx_bpf_dsq_move_vtime(BPF_FOR_EACH_ITER, p, dst_dsq.raw, 0);
+			__sync_fetch_and_add(&nr_drained_tasks, 1);
+		}
+	}
+
 	return 0;
 }
 
@@ -211,9 +307,10 @@ static inline int maybe_retag_stolen_task(struct task_struct *p, struct task_ctx
 	if (tctx->llc == cctx->llc)
 		return 0;
 
-	/* Task was stolen to a different LLC - update accounting */
+	/* Task was stolen to a different LLC - update accounting. */
 	tctx->steal_count++;
 	tctx->last_stolen_at = scx_bpf_now();
+	tctx->last_cpu_source = MIG_DISPATCH;
 
 	/* Assign task to new LLC */
 	tctx->llc = cctx->llc;
@@ -225,6 +322,46 @@ static inline int maybe_retag_stolen_task(struct task_struct *p, struct task_ctx
 	return update_task_cpumask(p, tctx);
 }
 
+/* Bump the appropriate (same|cross)_(select|enqueue|dispatch) counter
+ * based on whether the new CPU is in the same LLC as the previous one
+ * and how this task arrived (last_cpu_source). Updates last_ran_cpu.
+ * First run on a CPU (prev < 0) is uncounted by design. */
+static inline void track_cpu_migration(struct task_struct *p, struct task_ctx *tctx)
+{
+	s32 cpu = scx_bpf_task_cpu(p);
+	s32 prev = tctx->last_ran_cpu;
+	tctx->last_ran_cpu = cpu;
+
+	if (!(prev >= 0 && prev != cpu && prev < nr_possible_cpus && cpu < nr_possible_cpus))
+		return;
+
+	if (cpu_to_llc[prev] == cpu_to_llc[cpu]) {
+		switch (tctx->last_cpu_source) {
+		case MIG_SELECT_CPU:
+			__sync_fetch_and_add(&nr_mig_same_select, 1);
+			break;
+		case MIG_ENQUEUE:
+			__sync_fetch_and_add(&nr_mig_same_enqueue, 1);
+			break;
+		case MIG_DISPATCH:
+			__sync_fetch_and_add(&nr_mig_same_dispatch, 1);
+			break;
+		}
+	} else {
+		switch (tctx->last_cpu_source) {
+		case MIG_SELECT_CPU:
+			__sync_fetch_and_add(&nr_mig_cross_select, 1);
+			break;
+		case MIG_ENQUEUE:
+			__sync_fetch_and_add(&nr_mig_cross_enqueue, 1);
+			break;
+		case MIG_DISPATCH:
+			__sync_fetch_and_add(&nr_mig_cross_dispatch, 1);
+			break;
+		}
+	}
+}
+
 /* Work stealing:
  * Scan sibling (cell,LLC) DSQs in the same cell and steal the first queued task if it can run on this cpu
  * Returns:
@@ -244,50 +381,55 @@ static inline s32 try_stealing_work(u32 cell, s32 local_llc)
 		return -EINVAL;
 
 	// Loop over all other LLCs, looking for a queued task to steal
+	/* Find the most backed-up sibling LLC and steal from it, but only
+	 * if its queue depth exceeds per_cell_steal_min_queued[cell] (the
+	 * per-cell threshold maintained by userspace's adaptive controller;
+	 * default 0 means "steal whenever a sibling has any queued task"). */
 	u32 i;
+	u32 max_queued = 0;
+	dsq_id_t max_dsq = DSQ_INVALID;
+
 	bpf_for(i, 1, nr_llc)
 	{
-		// Start with the next one to spread out the load
 		u32 candidate_llc = (local_llc + i) % nr_llc;
 
-		// Prevents the optimizer from removing the following conditional return
-		// so that the verifier knows the read will be safe
 		barrier_var(candidate_llc);
 
 		if (candidate_llc >= MAX_LLCS)
 			continue;
 
 		/*
-    * Skip if the cell doesn't have CPUs in this LLC.
-    * Note: rechecking cell_ptr for verifier.
-    * This is racy with try_stealing_this_task, but we don't care -
-    * if the LLC actually doesn't have CPUs come steal time,
-    * we will fail the steal and continue to the next LLC.
-    */
+		 * Skip LLCs the cell has no CPUs in. Read is racy with reconfig
+		 * but harmless: a stale-positive just means the steal will fail.
+		 * cell_ptr re-check is for the verifier.
+		 */
 		if (cell_ptr && READ_ONCE(cell_ptr->llcs[candidate_llc].cpu_cnt) == 0)
 			continue;
 
 		dsq_id_t candidate_dsq = get_cell_llc_dsq_id(cell, candidate_llc);
 		if (dsq_is_invalid(candidate_dsq))
-			return -EINVAL; // already errored in get_cell_llc_dsq_id
-
-		// Optimization: skip if faster than constructing an iterator
-		// Not redundant with later checking if task found (race)
-		if (!scx_bpf_dsq_nr_queued(candidate_dsq.raw))
-			continue;
+			return -EINVAL;
 
-		/*
-		 * Attempt the steal - can fail because it's a race.
-		 * We don't update task_ctx here because the peeked task_ctx
-		 * may be stale (a different task may now be at head of DSQ).
-		 * Actual retag and accounting happens in running() via
-		 * mismatch detection.
-		 */
-		if (!scx_bpf_dsq_move_to_local(candidate_dsq.raw, 0))
-			continue;
+		u32 nr = scx_bpf_dsq_nr_queued(candidate_dsq.raw);
+		if (nr > max_queued) {
+			max_queued = nr;
+			max_dsq = candidate_dsq;
+		}
+	}
 
-		// Success, we got a task
-		return true;
+	/*
+	 * Single steal attempt; can fail under race (e.g. another CPU
+	 * consumed the head first). Stolen task's LLC retag happens
+	 * lazily in mitosis_running via maybe_retag_stolen_task.
+	 *
+	 * TODO: on race the CPU goes idle even if other siblings have
+	 * queued tasks. Track second-best during the scan and try it on
+	 * failure (cheap, ~6 lines, single-pass), or fall back to a full
+	 * sibling sweep.
+	 */
+	if (max_queued > per_cell_steal_min_queued[cell] && !dsq_is_invalid(max_dsq)) {
+		if (scx_bpf_dsq_move_to_local(max_dsq.raw, 0))
+			return true;
 	}
 	return false;
 }
diff --git a/scheds/rust/scx_mitosis/src/bpf/mitosis.bpf.c b/scheds/rust/scx_mitosis/src/bpf/mitosis.bpf.c
index d363a5fd66..7c8ec337d1 100644
--- a/scheds/rust/scx_mitosis/src/bpf/mitosis.bpf.c
+++ b/scheds/rust/scx_mitosis/src/bpf/mitosis.bpf.c
@@ -50,6 +50,8 @@ const volatile bool userspace_managed_cell_mode = false;
 const volatile bool enable_borrowing = false;
 const volatile bool use_lockless_peek = false;
 const volatile bool dynamic_affinity_cpu_selection = false;
+/* Per-cell sibling-LLC queue-depth threshold gating work-stealing; tuned by userspace. */
+u32 per_cell_steal_min_queued[MAX_CELLS];
 
 /*
  * Global arrays for LLC topology, populated by userspace before load.
@@ -66,6 +68,7 @@ struct llc_cpumask llc_to_cpus[MAX_LLCS];
 u32 configuration_seq;
 u32 applied_configuration_seq;
 u32 cpuset_seq;
+
 u32 applied_cpuset_seq;
 
 /*
@@ -584,10 +587,7 @@ static __always_inline s32 try_pick_idle_cpu(struct task_struct *p, s32 prev_cpu
 		 * enqueue), SCX_DSQ_LOCAL resolves to task_rq(p) -- not
 		 * the idle CPU we picked.
 		 */
-		tctx->vtime_charge_cell = tctx->cell;
-		scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL_ON | cpu, slice_ns, 0);
-		if (kick)
-			scx_bpf_kick_cpu(cpu, SCX_KICK_IDLE);
+		place_task_on_cpu(p, tctx, cpu, kick);
 		return cpu;
 	}
 	if (cpu == -1)
@@ -608,10 +608,7 @@ static __always_inline s32 try_pick_idle_cpu(struct task_struct *p, s32 prev_cpu
 		if (cpu >= 0) {
 			tctx->borrowed = true;
 			cstat_inc(CSTAT_BORROWED, tctx->cell, cctx);
-			tctx->vtime_charge_cell = tctx->cell;
-			scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL_ON | cpu, slice_ns, 0);
-			if (kick)
-				scx_bpf_kick_cpu(cpu, SCX_KICK_IDLE);
+			place_task_on_cpu(p, tctx, cpu, kick);
 			return cpu;
 		}
 	}
@@ -684,6 +681,8 @@ s32 BPF_STRUCT_OPS(mitosis_select_cpu, struct task_struct *p, s32 prev_cpu, u64
 	if (!(cctx = lookup_cpu_ctx(-1)) || !(tctx = lookup_task_ctx(p)))
 		return prev_cpu;
 
+	tctx->last_cpu_source = MIG_SELECT_CPU;
+
 	if (maybe_refresh_cell(p, tctx) < 0)
 		return prev_cpu;
 
@@ -772,6 +771,8 @@ void BPF_STRUCT_OPS(mitosis_enqueue, struct task_struct *p, u64 enq_flags)
 	if (!(tctx = lookup_task_ctx(p)) || !(cctx = lookup_cpu_ctx(-1)))
 		return;
 
+	tctx->last_cpu_source = MIG_ENQUEUE;
+
 	if (maybe_refresh_cell(p, tctx) < 0)
 		return;
 
@@ -841,6 +842,22 @@ void BPF_STRUCT_OPS(mitosis_enqueue, struct task_struct *p, u64 enq_flags)
 				return;
 			}
 
+			/* If the cell no longer owns CPUs in this LLC, the
+			 * (cell, llc) DSQ has no consumer. Re-pick by
+			 * resetting cpumask and re-running LLC assignment.
+			 * update_task_cpumask resets tctx->cpumask to
+			 * (cell & affinity) before re-narrowing by LLC. */
+			if (READ_ONCE(cell->llcs[tctx->llc].cpu_cnt) == 0) {
+				if (update_task_cpumask(p, tctx) < 0)
+					return;
+				/* re-validate for verifier; vtime was reset */
+				if (!llc_is_valid(tctx->llc)) {
+					scx_bpf_error("Invalid LLC after re-pick: %d", tctx->llc);
+					return;
+				}
+				vtime = p->scx.dsq_vtime;
+			}
+
 			basis_vtime = READ_ONCE(cell->llcs[tctx->llc].vtime_now);
 		} else {
 			basis_vtime = READ_ONCE(cell->llcs[FAKE_FLAT_CELL_LLC].vtime_now);
@@ -917,6 +934,10 @@ void BPF_STRUCT_OPS(mitosis_dispatch, s32 cpu, struct task_struct *prev)
 	/* Peek at cell-LLC DSQ head */
 	p = dsq_peek(cell_dsq.raw);
 	if (p) {
+		struct task_ctx *dtctx = lookup_task_ctx(p);
+		if (!dtctx)
+			return;
+		dtctx->last_cpu_source = MIG_DISPATCH;
 		min_vtime = p->scx.dsq_vtime;
 		min_vtime_dsq = cell_dsq;
 		found = true;
@@ -925,6 +946,10 @@ void BPF_STRUCT_OPS(mitosis_dispatch, s32 cpu, struct task_struct *prev)
 	/* Peek at CPU DSQ head, prefer if lower vtime */
 	p = dsq_peek(cpu_dsq.raw);
 	if (p && (!found || time_before(p->scx.dsq_vtime, min_vtime))) {
+		struct task_ctx *dtctx = lookup_task_ctx(p);
+		if (!dtctx)
+			return;
+		dtctx->last_cpu_source = MIG_DISPATCH;
 		min_vtime = p->scx.dsq_vtime;
 		min_vtime_dsq = cpu_dsq;
 		found = true;
@@ -936,15 +961,15 @@ void BPF_STRUCT_OPS(mitosis_dispatch, s32 cpu, struct task_struct *prev)
 	 * SCX_TASK_QUEUED (we don't set SCX_OPS_ENQ_LAST), and otherwise go idle.
 	 */
 	if (!found) {
-		/* Try work stealing if enabled */
+		/* We can't stamp the task with MIG_DISPATCH here, but we
+		 * detect the LLC change in maybe_retag_stolen_task */
 		if (enable_llc_awareness && enable_work_stealing) {
 			/* Returns: <0 error, 0 no steal, >0 stole work */
 			s32 ret = try_stealing_work(cell, llc);
 			if (ret < 0)
 				return;
-			if (ret > 0) {
+			if (ret > 0)
 				cstat_inc(CSTAT_STEAL, cell, cctx);
-			}
 		}
 		return;
 	}
@@ -1342,12 +1367,18 @@ void BPF_STRUCT_OPS(mitosis_running, struct task_struct *p)
 	if (!(cctx = lookup_cpu_ctx(-1)) || !(tctx = lookup_task_ctx(p)))
 		return;
 
+	/* Stolen tasks skip select_cpu/enqueue, so refresh here as a catch-all. */
+	if (maybe_refresh_cell(p, tctx) < 0)
+		return;
+
 	/* Handle stolen task retag (LLC-aware mode only) */
 	if (enable_llc_awareness && enable_work_stealing) {
 		if (maybe_retag_stolen_task(p, tctx, cctx) < 0)
 			return;
 	}
 
+	track_cpu_migration(p, tctx);
+
 	/* Record the running slice start time. */
 	tctx->started_running_at = scx_bpf_now();
 
@@ -1740,6 +1771,8 @@ static int init_task_impl(struct task_struct *p, struct cgroup *cgrp)
 		return -EINVAL;
 	}
 
+	tctx->last_ran_cpu = -1;
+
 	/* Initialize LLC assignment fields */
 	if (enable_llc_awareness)
 		init_task_llc(tctx);
@@ -1826,14 +1859,28 @@ void BPF_STRUCT_OPS(mitosis_dump, struct scx_dump_ctx *dctx)
 		scx_bpf_dump("CELL[%d] CPUS=", i);
 		dump_cell_cpumask(i);
 		scx_bpf_dump("\n");
-		/* Per-LLC stats deferred: FAKE_FLAT_CELL_LLC used for now */
-		dsq_id_t dsq_id = get_cell_llc_dsq_id(i, FAKE_FLAT_CELL_LLC);
-		if (dsq_is_invalid(dsq_id))
-			return;
-
-		scx_bpf_dump("CELL[%d] vtime=%llu nr_queued=%d\n", i,
-			     READ_ONCE(cell->llcs[FAKE_FLAT_CELL_LLC].vtime_now),
-			     scx_bpf_dsq_nr_queued(dsq_id.raw));
+		if (enable_llc_awareness) {
+			u32 llc;
+			bpf_for(llc, 0, nr_llc)
+			{
+				if (llc >= MAX_LLCS)
+					break;
+				dsq_id_t llc_dsq = get_cell_llc_dsq_id(i, llc);
+				if (dsq_is_invalid(llc_dsq))
+					continue;
+				scx_bpf_dump(
+					"CELL[%d] LLC[%d] cpu_cnt=%u vtime=%llu nr_queued=%d\n", i,
+					llc, READ_ONCE(cell->llcs[llc].cpu_cnt),
+					READ_ONCE(cell->llcs[llc].vtime_now),
+					scx_bpf_dsq_nr_queued(llc_dsq.raw));
+			}
+		} else {
+			dsq_id_t dsq_id = get_cell_llc_dsq_id(i, FAKE_FLAT_CELL_LLC);
+			if (!dsq_is_invalid(dsq_id))
+				scx_bpf_dump("CELL[%d] vtime=%llu nr_queued=%d\n", i,
+					     READ_ONCE(cell->llcs[FAKE_FLAT_CELL_LLC].vtime_now),
+					     scx_bpf_dsq_nr_queued(dsq_id.raw));
+		}
 	}
 
 	bpf_for(i, 0, nr_possible_cpus)
@@ -2248,6 +2295,20 @@ int apply_cell_config(void *ctx)
 			cctx->cell = cell_id;
 		}
 
+		if (enable_llc_awareness) {
+			int rc;
+			/* recalc_cell_llc_counts may iterate per-cell-LLC DSQs to
+			 * drain orphaned tasks via bpf_for_each(scx_dsq, ...) which
+			 * requires an RCU read CS. */
+			scoped_guard(rcu)
+			{
+				rc = recalc_cell_llc_counts(cell_id,
+							    (const struct cpumask *)new_cpumask);
+			}
+			if (rc)
+				return -EINVAL;
+		}
+
 		/* Swap the new cpumask into place */
 		if (publish_prepared_cpumask(&cpumaskw->primary, &new_cpumask)) {
 			scx_bpf_error("failed to publish cpumask for cell_id %d", cell_id);
@@ -2272,21 +2333,6 @@ int apply_cell_config(void *ctx)
 		}
 	}
 
-	/* Phase 2.5: Recompute per-LLC CPU counts for all cells */
-	if (enable_llc_awareness) {
-		u32 c;
-		scoped_guard(rcu)
-		{
-			bpf_for(c, 0, MAX_CELLS)
-			{
-				if (c >= config->num_cells)
-					break;
-				if (recalc_cell_llc_counts(c, NULL))
-					return -EINVAL;
-			}
-		}
-	}
-
 	/* Phase 3: Apply cell-to-cgroup assignments for owner cgroups */
 	if (config->num_cell_assignments > MAX_CELLS)
 		return -EINVAL;
diff --git a/scheds/rust/scx_mitosis/src/bpf/mitosis.bpf.h b/scheds/rust/scx_mitosis/src/bpf/mitosis.bpf.h
index bea6af53dc..de7afa48ed 100644
--- a/scheds/rust/scx_mitosis/src/bpf/mitosis.bpf.h
+++ b/scheds/rust/scx_mitosis/src/bpf/mitosis.bpf.h
@@ -24,6 +24,7 @@
 #include <lib/cleanup.bpf.h>
 
 extern const volatile u32 nr_llc;
+extern const volatile u64 slice_ns;
 
 extern struct cell_map cells;
 
@@ -89,6 +90,8 @@ struct task_ctx {
 	s32 llc;
 
 	u64 avg_runtime_ns; /* EWMA of per-wake runtimes (ns), init to 0 */
+	s32 last_ran_cpu; /* CPU this task last ran on (-1 = never) */
+	u32 last_cpu_source; /* enum migration_source: who picked the CPU */
 
 	u32 steal_count; /* how many times this task has been stolen */
 	u64 last_stolen_at; /* ns timestamp of the last steal (scx_bpf_now) */
@@ -144,6 +147,19 @@ static inline void cstat_inc(enum cell_stat_idx idx, u32 cell, struct cpu_ctx *c
 	cstat_add(idx, cell, cctx, 1);
 }
 
+/* Common dispatch ritual: charge cell vtime to this cell, insert onto
+ * cpu's local DSQ, optionally kick the CPU. Caller is responsible for
+ * any per-path bookkeeping (cstat counters, tctx->borrowed) BEFORE
+ * invoking this. */
+static __always_inline void place_task_on_cpu(struct task_struct *p, struct task_ctx *tctx, s32 cpu,
+					      bool kick)
+{
+	tctx->vtime_charge_cell = tctx->cell;
+	scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL_ON | cpu, slice_ns, 0);
+	if (kick)
+		scx_bpf_kick_cpu(cpu, SCX_KICK_IDLE);
+}
+
 struct cell_map {
 	__uint(type, BPF_MAP_TYPE_ARRAY);
 	__type(key, u32);
diff --git a/scheds/rust/scx_mitosis/src/main.rs b/scheds/rust/scx_mitosis/src/main.rs
index 80cc4041b6..c8556b331c 100644
--- a/scheds/rust/scx_mitosis/src/main.rs
+++ b/scheds/rust/scx_mitosis/src/main.rs
@@ -209,6 +209,23 @@ struct Opts {
     #[clap(long, default_value = "500")]
     slice_shrink_min_us: u64,
 
+    /// Enable adaptive steal threshold controller. Modulates per-cell queue
+    /// depth threshold to target --steal-target-pct. Requires --enable-work-stealing.
+    #[clap(long, action = clap::ArgAction::SetTrue)]
+    enable_adaptive_stealing: bool,
+
+    /// Minimum value for adaptive steal queue depth threshold.
+    #[clap(long, default_value = "0")]
+    steal_queued_min: u32,
+
+    /// Maximum value for adaptive steal queue depth threshold.
+    #[clap(long, default_value = "20")]
+    steal_queued_max: u32,
+
+    /// Target steal percentage for adaptive threshold controller.
+    #[clap(long, default_value = "2.0")]
+    steal_target_pct: f64,
+
     #[clap(flatten, next_help_heading = "Libbpf Options")]
     pub libbpf: LibbpfOpts,
 }
@@ -264,10 +281,19 @@ struct Scheduler<'a> {
     last_rebalance: Instant,
     /// Number of rebalancing events
     rebalance_count: u64,
+    /// Previous nr_drained_tasks value for per-rebalance delta reporting
+    prev_drained_tasks: u64,
     /// Epoll instance for waiting on multiple fds (inotify, stats wakeup)
     epoll: Epoll,
     /// EventFd to wake up main loop when stats are requested
     stats_waker: EventFd,
+    /// Previous migration counters (2x3 cross-product)
+    prev_mig: [u64; 6], // same_{select,enqueue,dispatch}, cross_{select,enqueue,dispatch}
+    /// Adaptive steal threshold range
+    enable_adaptive_stealing: bool,
+    steal_queued_min: u32,
+    steal_queued_max: u32,
+    steal_target_pct: f64,
 }
 
 struct DistributionStats {
@@ -321,6 +347,9 @@ impl<'a> Scheduler<'a> {
         if opts.enable_work_stealing && !opts.enable_llc_awareness {
             bail!("Work stealing requires LLC-aware mode (--enable-llc-awareness)");
         }
+        if opts.enable_adaptive_stealing && !opts.enable_work_stealing {
+            bail!("--enable-adaptive-stealing requires --enable-work-stealing");
+        }
 
         Ok(())
     }
@@ -486,8 +515,14 @@ impl<'a> Scheduler<'a> {
             smoothed_util: [0.0; MAX_CELLS],
             last_rebalance: Instant::now(),
             rebalance_count: 0,
+            prev_drained_tasks: 0,
             epoll,
             stats_waker,
+            prev_mig: [0; 6],
+            enable_adaptive_stealing: opts.enable_adaptive_stealing,
+            steal_queued_min: opts.steal_queued_min,
+            steal_queued_max: opts.steal_queued_max,
+            steal_target_pct: opts.steal_target_pct,
         })
     }
 
@@ -791,14 +826,19 @@ impl<'a> Scheduler<'a> {
         self.rebalance_count += 1;
         self.metrics.rebalance_count = self.rebalance_count;
 
+        let cur_drained = self.skel.maps.bss_data.as_ref().unwrap().nr_drained_tasks;
+        let drained_this_event = cur_drained - self.prev_drained_tasks;
+        self.prev_drained_tasks = cur_drained;
+
         let cell_manager = self
             .cell_manager
             .as_ref()
             .expect("BUG: cell_manager missing after apply_cell_config in maybe_rebalance");
         info!(
-            "Rebalanced CPUs (spread={:.1}%, count={}): {}",
+            "Rebalanced CPUs (spread={:.1}%, count={}, drained={}): {}",
             spread,
             self.rebalance_count,
+            drained_this_event,
             cell_manager.format_cell_config(&cpu_assignments)
         );
 
@@ -1094,6 +1134,35 @@ impl<'a> Scheduler<'a> {
                 + cell_metrics.slice_shrink_proportional
                 + cell_metrics.slice_shrink_min;
 
+            let steal_pct = stats.steal_pct;
+
+            if self.enable_adaptive_stealing {
+                let bss = self.skel.maps.bss_data.as_mut().unwrap();
+                let cur = bss.per_cell_steal_min_queued[cell];
+                let new_val = if steal_pct > self.steal_target_pct {
+                    (cur + 1).min(self.steal_queued_max)
+                } else if steal_pct < self.steal_target_pct && cur > self.steal_queued_min {
+                    cur - 1
+                } else {
+                    cur
+                };
+                if new_val != cur {
+                    unsafe {
+                        let ptr = &mut bss.per_cell_steal_min_queued[cell] as *mut u32;
+                        std::ptr::write_volatile(ptr, new_val);
+                    }
+                }
+                cell_metrics.steal_threshold = new_val;
+                trace!(
+                    "  Cell {:>2} steal: S%={:.1}% thresh={}",
+                    cell,
+                    steal_pct,
+                    new_val
+                );
+            } else {
+                trace!("  Cell {:>2} steal: S%={:.1}%", cell, steal_pct);
+            }
+
             trace!("{} {}", prefix, stats);
         }
         Ok(())
@@ -1164,6 +1233,43 @@ impl<'a> Scheduler<'a> {
                 .context("collecting demand metrics")?;
         }
 
+        // Log migration counters (2x3 cross-product)
+        {
+            let bss = self.skel.maps.bss_data.as_ref().unwrap();
+            let cur = [
+                bss.nr_mig_same_select,
+                bss.nr_mig_same_enqueue,
+                bss.nr_mig_same_dispatch,
+                bss.nr_mig_cross_select,
+                bss.nr_mig_cross_enqueue,
+                bss.nr_mig_cross_dispatch,
+            ];
+            let d: Vec<u64> = cur
+                .iter()
+                .zip(self.prev_mig.iter())
+                .map(|(c, p)| c - p)
+                .collect();
+            self.prev_mig = cur;
+
+            // d[0..3] = same_{select,enqueue,dispatch}, d[3..6] = cross_{select,enqueue,dispatch}
+            let same = d[0] + d[1] + d[2];
+            let cross = d[3] + d[4] + d[5];
+            let total = same + cross;
+            let sel = d[0] + d[3];
+            let enq = d[1] + d[4];
+            let dis = d[2] + d[5];
+
+            let k = |n: u64| n / 1000;
+            let pct = |n: u64, of: u64| if of > 0 { (100 * n / of) as u64 } else { 0 };
+
+            trace!("Migrations:{:>5}k        select_cpu:{:>5}k ({:>2}%)   enqueue:{:>5}k ({:>2}%)   dispatch:{:>5}k ({:>2}%)",
+                k(total), k(sel), pct(sel, total), k(enq), pct(enq, total), k(dis), pct(dis, total));
+            trace!("  same_llc:{:>5}k ({:>2}%)  select_cpu:{:>5}k ({:>2}%)   enqueue:{:>5}k ({:>2}%)   dispatch:{:>5}k ({:>2}%)",
+                k(same), pct(same, total), k(d[0]), pct(d[0], same), k(d[1]), pct(d[1], same), k(d[2]), pct(d[2], same));
+            trace!("  cross_llc:{:>4}k ({:>2}%)  select_cpu:{:>5}k ({:>2}%)   enqueue:{:>5}k ({:>2}%)   dispatch:{:>5}k ({:>2}%)",
+                k(cross), pct(cross, total), k(d[3]), pct(d[3], cross), k(d[4]), pct(d[4], cross), k(d[5]), pct(d[5], cross));
+        }
+
         for (cell_id, cell) in &self.cells {
             trace!("CELL[{}]: {}", cell_id, cell.cpus);
         }
diff --git a/scheds/rust/scx_mitosis/src/stats.rs b/scheds/rust/scx_mitosis/src/stats.rs
index cce441f0fa..149f95877b 100644
--- a/scheds/rust/scx_mitosis/src/stats.rs
+++ b/scheds/rust/scx_mitosis/src/stats.rs
@@ -34,6 +34,8 @@ pub struct CellMetrics {
     pub affn_violations_pct: f64,
     #[stat(desc = "Steal %")]
     pub steal_pct: f64,
+    #[stat(desc = "Adaptive steal queue-depth threshold")]
+    pub steal_threshold: u32,
     #[stat(desc = "Pin reject skipped %")]
     pub pin_skip_pct: f64,
     #[stat(desc = "Slice shrink events")]