@@ -63,6 +63,8 @@ const volatile u64 kprobe_delays_max_ns = 2;
 #define MIN(x, y) ((x) < (y) ? (x) : (y))
 #define MAX(x, y) ((x) > (y) ? (x) : (y))
 
+#define U64_MAX ((u64)~0ULL)
+
 enum chaos_timer_callbacks {
 	CHAOS_TIMER_CHECK_QUEUES,
 	CHAOS_MAX_TIMERS,
@@ -143,6 +145,46 @@ static __always_inline void chaos_stat_inc(enum chaos_stat_idx stat)
 	(*cnt_p)++;
 }
 
+/*
+ * Get the next time a delay DSQ needs processing.
+ *
+ * Safe for delay DSQs which use monotonic time (vtimes won't wrap to U64_MAX).
+ * Must be called with RCU read lock held.
+ */
+static __always_inline u64 delay_dsq_next_time(u64 dsq_id)
+{
+	struct task_struct *first_p;
+	u64 vtime;
+
+	// If we don't have native peek, fall back to always iterating
+	if (!bpf_ksym_exists(scx_bpf_dsq_peek)) {
+		chaos_stat_inc(CHAOS_STAT_PEEK_NEEDS_PROCESSING);
+		return 0;
+	}
+
+	first_p = scx_bpf_dsq_peek(dsq_id);
+	if (!first_p) {
+		chaos_stat_inc(CHAOS_STAT_PEEK_EMPTY_DSQ);
+		return U64_MAX;
+	}
+
+	first_p = bpf_task_from_pid(first_p->pid);
+	if (!first_p)
+		return 0;
+
+	vtime = first_p->scx.dsq_vtime;
+	bpf_task_release(first_p);
+
+	if (vtime > bpf_ktime_get_ns()) {
+		chaos_stat_inc(CHAOS_STAT_PEEK_NOT_READY);
+		return vtime;
+	}
+
+	// First task is ready, need to iterate
+	chaos_stat_inc(CHAOS_STAT_PEEK_NEEDS_PROCESSING);
+	return 0;
+}
+
 static __always_inline enum chaos_trait_kind
 choose_chaos(struct chaos_task_ctx *taskc)
 {
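Note: the bpf_ksym_exists(scx_bpf_dsq_peek) fallback above relies on the kfunc being declared as a weak ksym, so the program still loads on kernels that do not export it. A minimal sketch of the assumed declaration (not shown in this diff):

    /* Assumed declaration; with __weak, a missing kfunc resolves to NULL
     * and bpf_ksym_exists() evaluates to false at load time. */
    struct task_struct *scx_bpf_dsq_peek(u64 dsq_id) __ksym __weak;
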
@@ -362,9 +404,22 @@ __weak u64 check_dsq_times(int cpu_idx)
 	u64 next_trigger_time = 0;
 	u64 now = bpf_ktime_get_ns();
 	bool has_kicked = false;
+	u64 dsq_id = get_cpu_delay_dsq(cpu_idx);
 
 	bpf_rcu_read_lock();
-	bpf_for_each(scx_dsq, p, get_cpu_delay_dsq(cpu_idx), 0) {
+
+	next_trigger_time = delay_dsq_next_time(dsq_id);
+	if (next_trigger_time > now + chaos_timer_check_queues_slack_ns) {
+		// DSQ empty (U64_MAX) or first task beyond the slack window
+		bpf_rcu_read_unlock();
+		return next_trigger_time == U64_MAX ? 0 : next_trigger_time;
+	}
+
+	// Need to iterate: no peek support (0), task ready, or task within slack window
+	next_trigger_time = 0;
+
+	// Walk the DSQ to kick any tasks whose delay has expired
+	bpf_for_each(scx_dsq, p, dsq_id, 0) {
 		p = bpf_task_from_pid(p->pid);
 		if (!p)
 			break;
@@ -387,8 +442,8 @@ __weak u64 check_dsq_times(int cpu_idx)
 		if (next_trigger_time > now + chaos_timer_check_queues_slack_ns)
 			break;
 	}
-	bpf_rcu_read_unlock();
 
+	bpf_rcu_read_unlock();
 	return next_trigger_time;
 }
 
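For context, a sketch of how a timer callback might consume check_dsq_times(): a return of 0 means "no known expiry, poll again at the default interval", while any other value is the absolute time of the next expected expiry. The callback name and poll_interval_ns below are assumptions, not part of this diff:

    // Hypothetical timer callback sketch; assumes the usual vmlinux.h and
    // bpf_helpers includes. poll_interval_ns is an assumed fallback period.
    const u64 poll_interval_ns = 10 * 1000 * 1000; /* 10ms, assumption */

    static int chaos_timer_cb(void *map, int *key, struct bpf_timer *timer)
    {
    	u64 next = check_dsq_times(0 /* cpu_idx, illustrative */);
    	u64 now = bpf_ktime_get_ns();

    	// Sleep until the known expiry, or poll if none is known (next == 0)
    	bpf_timer_start(timer, next > now ? next - now : poll_interval_ns, 0);
    	return 0;
    }
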
@@ -531,9 +586,14 @@ void BPF_STRUCT_OPS(chaos_dispatch, s32 cpu, struct task_struct *prev)
 	struct enqueue_promise promise;
 	struct chaos_task_ctx *taskc;
 	struct task_struct *p;
-	u64 now = bpf_ktime_get_ns();
+	u64 now = bpf_ktime_get_ns();
+	u64 dsq_id = get_cpu_delay_dsq(-1);
 
-	bpf_for_each(scx_dsq, p, get_cpu_delay_dsq(-1), 0) {
+	// Check if we need to process the delay DSQ
+	if (delay_dsq_next_time(dsq_id) > now)
+		goto p2dq;
+
+	bpf_for_each(scx_dsq, p, dsq_id, 0) {
 		p = bpf_task_from_pid(p->pid);
 		if (!p)
 			continue;
@@ -557,6 +617,7 @@ void BPF_STRUCT_OPS(chaos_dispatch, s32 cpu, struct task_struct *prev)
 		bpf_task_release(p);
 	}
 
+p2dq:
 	return p2dq_dispatch_impl(cpu, prev);
 }
 
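Unlike the timer path, chaos_dispatch compares delay_dsq_next_time() against plain now with no slack: dispatch only cares whether a delayed task is runnable at this instant. The sentinel values compose cleanly with that comparison, as this standalone plain-C illustration of the skip condition (not BPF code) shows:

    #include <stdint.h>
    #include <stdio.h>

    #define U64_MAX ((uint64_t)~0ULL)

    int main(void)
    {
    	uint64_t now = 1000;
    	/* 0 = head ready or no peek, 2000 = future expiry, U64_MAX = empty DSQ */
    	uint64_t cases[] = { 0, 2000, U64_MAX };

    	for (int i = 0; i < 3; i++)
    		printf("next=%llu -> %s\n", (unsigned long long)cases[i],
    		       cases[i] > now ? "skip to p2dq" : "iterate DSQ");
    	return 0;
    }
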