From 6d2aac1591bafd3b4a6298d5eb64758f12cae7a8 Mon Sep 17 00:00:00 2001
From: Andrea Righi
Date: Mon, 22 Apr 2024 08:52:13 +0200
Subject: [PATCH 1/5] scx_rustland_core: introduce dispatch flags

Reserve some bits of the `cpu` attribute of a task to store special
dispatch flags.

Initially, let's introduce just RL_CPU_ANY to replace the special value
NO_CPU, indicating that the task can be dispatched on any CPU,
specifically the first CPU that becomes available.

This makes it possible to keep the CPU value assigned by the built-in
idle selection logic, which can potentially be used later for further
optimizations.

Moreover, the ability to specify dispatch flags gives more flexibility:
new scheduling features can be mapped to such flags.

Signed-off-by: Andrea Righi
---
 rust/scx_rustland_core/assets/bpf.rs         | 10 ++++--
 rust/scx_rustland_core/assets/bpf/intf.h     | 33 ++++++++++++++++++++
 rust/scx_rustland_core/assets/bpf/main.bpf.c | 17 +++++-----
 scheds/rust/scx_rustland/src/main.rs         |  8 +++--
 4 files changed, 54 insertions(+), 14 deletions(-)

diff --git a/rust/scx_rustland_core/assets/bpf.rs b/rust/scx_rustland_core/assets/bpf.rs
index 4918ac9..fd5643e 100644
--- a/rust/scx_rustland_core/assets/bpf.rs
+++ b/rust/scx_rustland_core/assets/bpf.rs
@@ -26,11 +26,11 @@ use scx_rustland_core::ALLOCATOR;
 // Defined in UAPI
 const SCHED_EXT: i32 = 7;
 
-// Do not assign any specific CPU to the task.
+// Allow to dispatch the task on any CPU.
 //
 // The task will be dispatched to the global shared DSQ and it will run on the first CPU available.
 #[allow(dead_code)]
-pub const NO_CPU: i32 = -1;
+pub const RL_CPU_ANY: i32 = bpf_intf::RL_CPU_ANY as i32;
 
 /// High-level Rust abstraction to interact with a generic sched-ext BPF component.
 ///
@@ -95,6 +95,12 @@ impl DispatchedTask {
         self.cpu = cpu;
     }
 
+    // Assign a specific dispatch flag to a task.
+    #[allow(dead_code)]
+    pub fn set_flag(&mut self, flag: i32) {
+        self.cpu |= flag;
+    }
+
     // Assign a specific time slice to a task.
     #[allow(dead_code)]
     pub fn set_slice_ns(&mut self, slice_ns: u64) {
diff --git a/rust/scx_rustland_core/assets/bpf/intf.h b/rust/scx_rustland_core/assets/bpf/intf.h
index 138824d..1b0cfe4 100644
--- a/rust/scx_rustland_core/assets/bpf/intf.h
+++ b/rust/scx_rustland_core/assets/bpf/intf.h
@@ -26,6 +26,39 @@ typedef unsigned long long u64;
 typedef long long s64;
 #endif
 
+/* Check a condition at build time */
+#define BUILD_BUG_ON(expr) \
+	do { \
+		extern char __build_assert__[(expr) ? -1 : 1] \
+			__attribute__((unused)); \
+	} while(0)
+
+/*
+ * Maximum amount of CPUs supported by this scheduler (this defines the size of
+ * cpu_map that is used to store the idle state and CPU ownership).
+ */
+#define MAX_CPUS 1024
+
+/* Isolate target CPU from dispatch flags. */
+#define CPU_MASK (MAX_CPUS - 1)
+
+/* Use extra bits in the CPU attribute to store dispatch flags. */
+#define RL_BASE_FLAG __builtin_ctz(MAX_CPUS)
+
+/* Define dispatch flags using macros. */
+#define RL_FLAG(flag) (1U << (RL_BASE_FLAG + flag))
+
+/* Dispatch flags */
+enum {
+	/*
+	 * Do not assign any specific CPU to the task.
+	 *
+	 * The task will be dispatched to the global shared DSQ and it will run
+	 * on the first CPU available.
+	 */
+	RL_CPU_ANY = RL_FLAG(0),
+};
+
 /*
  * Task sent to the user-space scheduler by the BPF dispatcher.
  *
diff --git a/rust/scx_rustland_core/assets/bpf/main.bpf.c b/rust/scx_rustland_core/assets/bpf/main.bpf.c
index 9ac1803..575a2cd 100644
--- a/rust/scx_rustland_core/assets/bpf/main.bpf.c
+++ b/rust/scx_rustland_core/assets/bpf/main.bpf.c
@@ -33,12 +33,6 @@ char _license[] SEC("license") = "GPL";
 
 UEI_DEFINE(uei);
 
-/*
- * Maximum amount of CPUs supported by this scheduler (this defines the size of
- * cpu_map that is used to store the idle state and CPU ownership).
- */
-#define MAX_CPUS 1024
-
 /*
  * Introduce a custom DSQ shared across all the CPUs, where we can dispatch
  * tasks that will be executed on the first CPU available.
@@ -592,11 +586,12 @@ void BPF_STRUCT_OPS(rustland_dispatch, s32 cpu, struct task_struct *prev)
 		 */
 		dbg_msg("usersched: pid=%d cpu=%d cpumask_cnt=%llu slice_ns=%llu",
 			task.pid, task.cpu, task.cpumask_cnt, task.slice_ns);
-		if (task.cpu < 0)
+
+		if (task.cpu & RL_CPU_ANY)
 			dispatch_task(p, SHARED_DSQ, 0, task.slice_ns, 0);
 		else
-			dispatch_task(p, cpu_to_dsq(task.cpu), task.cpumask_cnt,
-				      task.slice_ns, 0);
+			dispatch_task(p, cpu_to_dsq(task.cpu & CPU_MASK),
+				      task.cpumask_cnt, task.slice_ns, 0);
 		bpf_task_release(p);
 		__sync_fetch_and_add(&nr_user_dispatches, 1);
 	}
@@ -856,6 +851,10 @@ s32 BPF_STRUCT_OPS_SLEEPABLE(rustland_init)
 {
 	int err;
 
+	/* Compile-time checks */
+	BUILD_BUG_ON((MAX_CPUS % 2));
+
+	/* Initialize rustland core */
 	err = dsq_init();
 	if (err)
 		return err;
diff --git a/scheds/rust/scx_rustland/src/main.rs b/scheds/rust/scx_rustland/src/main.rs
index 33cf43b..5983219 100644
--- a/scheds/rust/scx_rustland/src/main.rs
+++ b/scheds/rust/scx_rustland/src/main.rs
@@ -173,8 +173,8 @@ impl TaskInfoMap {
 
 #[derive(Debug, PartialEq, Eq, PartialOrd, Clone)]
 struct Task {
-    qtask: QueuedTask, // queued task
-    vruntime: u64, // total vruntime (that determines the order how tasks are dispatched)
+    qtask: QueuedTask,     // queued task
+    vruntime: u64,         // total vruntime (that determines the order how tasks are dispatched)
 }
 
 // Make sure tasks are ordered by vruntime, if multiple tasks have the same vruntime order by pid.
@@ -505,8 +505,10 @@ impl<'a> Scheduler<'a> {
         // If built-in idle selection logic is disabled, dispatch on the first CPU
         // available.
         let mut dispatched_task = DispatchedTask::new(&task.qtask);
+
+        // Set special dispatch flags.
         if !self.builtin_idle {
-            dispatched_task.set_cpu(NO_CPU);
+            dispatched_task.set_flag(RL_CPU_ANY);
         }
 
         // Send task to the BPF dispatcher.
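A note on the encoding introduced by this patch: with MAX_CPUS = 1024,
the low 10 bits of the `cpu` attribute carry the target CPU and the
bits from RL_BASE_FLAG = __builtin_ctz(1024) = 10 upwards carry the
dispatch flags. The following standalone Rust sketch is not part of the
series; it merely mirrors the intf.h macros above to show the round
trip:

    // Rust mirror of the intf.h flag-encoding macros (illustrative only).
    const MAX_CPUS: i32 = 1024;
    const CPU_MASK: i32 = MAX_CPUS - 1;                  // 0x3ff: isolate the CPU id
    const RL_BASE_FLAG: u32 = MAX_CPUS.trailing_zeros(); // 10: first free bit
    const RL_CPU_ANY: i32 = 1 << RL_BASE_FLAG;           // RL_FLAG(0)

    fn main() {
        // Target CPU 3, plus the "run on any CPU" dispatch flag.
        let cpu = 3 | RL_CPU_ANY;

        assert_eq!(cpu & CPU_MASK, 3);   // what cpu_to_dsq() would see
        assert_ne!(cpu & RL_CPU_ANY, 0); // what rustland_dispatch() tests
        println!("encoded cpu = {:#x}", cpu); // 0x403
    }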
From 27c1f9c329caacf73cdf7a04a3cb575aaad8785d Mon Sep 17 00:00:00 2001
From: Andrea Righi
Date: Mon, 22 Apr 2024 09:29:40 +0200
Subject: [PATCH 2/5] scx_rustland_core: introduce preemption

Introduce a new dispatch flag, RL_PREEMPT_CPU, that can be used to
dispatch tasks that may preempt others.

Tasks with this flag set will be dispatched by the BPF part using
SCX_ENQ_PREEMPT, so they can potentially preempt any other task running
on the target CPU.

Signed-off-by: Andrea Righi
---
 rust/scx_rustland_core/assets/bpf.rs         |  4 ++++
 rust/scx_rustland_core/assets/bpf/intf.h     |  5 +++++
 rust/scx_rustland_core/assets/bpf/main.bpf.c | 23 +++++++++++++-------
 3 files changed, 24 insertions(+), 8 deletions(-)

diff --git a/rust/scx_rustland_core/assets/bpf.rs b/rust/scx_rustland_core/assets/bpf.rs
index fd5643e..9e1e552 100644
--- a/rust/scx_rustland_core/assets/bpf.rs
+++ b/rust/scx_rustland_core/assets/bpf.rs
@@ -32,6 +32,10 @@ const SCHED_EXT: i32 = 7;
 #[allow(dead_code)]
 pub const RL_CPU_ANY: i32 = bpf_intf::RL_CPU_ANY as i32;
 
+// Allow to preempt the target CPU when dispatching the task.
+#[allow(dead_code)]
+pub const RL_PREEMPT_CPU: i32 = bpf_intf::RL_PREEMPT_CPU as i32;
+
 /// High-level Rust abstraction to interact with a generic sched-ext BPF component.
 ///
 /// Overview
diff --git a/rust/scx_rustland_core/assets/bpf/intf.h b/rust/scx_rustland_core/assets/bpf/intf.h
index 1b0cfe4..e7544cc 100644
--- a/rust/scx_rustland_core/assets/bpf/intf.h
+++ b/rust/scx_rustland_core/assets/bpf/intf.h
@@ -57,6 +57,11 @@ enum {
 	 * on the first CPU available.
 	 */
 	RL_CPU_ANY = RL_FLAG(0),
+
+	/*
+	 * Allow to preempt the target CPU when dispatching the task.
+	 */
+	RL_PREEMPT_CPU = RL_FLAG(1),
 };
 
 /*
diff --git a/rust/scx_rustland_core/assets/bpf/main.bpf.c b/rust/scx_rustland_core/assets/bpf/main.bpf.c
index 575a2cd..3d0fb09 100644
--- a/rust/scx_rustland_core/assets/bpf/main.bpf.c
+++ b/rust/scx_rustland_core/assets/bpf/main.bpf.c
@@ -564,6 +564,7 @@ void BPF_STRUCT_OPS(rustland_dispatch, s32 cpu, struct task_struct *prev)
 	bpf_repeat(MAX_ENQUEUED_TASKS) {
 		struct task_struct *p;
 		struct dispatched_task_ctx task;
+		u64 enq_flags = 0;
 
 		/*
 		 * Pop first task from the dispatched queue, stop if dispatch
@@ -576,22 +577,28 @@ void BPF_STRUCT_OPS(rustland_dispatch, s32 cpu, struct task_struct *prev)
 		p = bpf_task_from_pid(task.pid);
 		if (!p)
 			continue;
+
+		dbg_msg("usersched: pid=%d cpu=%d cpumask_cnt=%llu slice_ns=%llu",
+			task.pid, task.cpu, task.cpumask_cnt, task.slice_ns);
+		/*
+		 * Map RL_PREEMPT_CPU to SCX_ENQ_PREEMPT and allow this task to
+		 * preempt others.
+		 */
+		if (task.cpu & RL_PREEMPT_CPU)
+			enq_flags = SCX_ENQ_PREEMPT;
 		/*
 		 * Check whether the user-space scheduler assigned a different
 		 * CPU to the task and migrate (if possible).
 		 *
-		 * If no CPU has been specified (task.cpu < 0), then dispatch
-		 * the task to the shared DSQ and rely on the built-in idle CPU
-		 * selection.
+		 * If the task has been submitted with RL_CPU_ANY, then
+		 * dispatch it to the shared DSQ and run it on the first CPU
+		 * available.
 		 */
-		dbg_msg("usersched: pid=%d cpu=%d cpumask_cnt=%llu slice_ns=%llu",
-			task.pid, task.cpu, task.cpumask_cnt, task.slice_ns);
-		if (task.cpu & RL_CPU_ANY)
-			dispatch_task(p, SHARED_DSQ, 0, task.slice_ns, 0);
+		if (task.cpu & RL_CPU_ANY)
+			dispatch_task(p, SHARED_DSQ, 0, task.slice_ns, enq_flags);
 		else
 			dispatch_task(p, cpu_to_dsq(task.cpu & CPU_MASK),
-				      task.cpumask_cnt, task.slice_ns, 0);
+				      task.cpumask_cnt, task.slice_ns, enq_flags);
 		bpf_task_release(p);
 		__sync_fetch_and_add(&nr_user_dispatches, 1);
 	}
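To show how a scheduler consumes this from user space, here is a small
sketch of a dispatch helper (a sketch only, assuming the BpfScheduler,
QueuedTask and DispatchedTask types from the scx_rustland_core API
above; error handling reduced to an expect()):

    // Dispatch one task, optionally requesting preemption. The BPF side
    // maps RL_PREEMPT_CPU to SCX_ENQ_PREEMPT, so the task may kick out
    // whatever is running on the target CPU.
    fn dispatch_one(bpf: &mut BpfScheduler, task: &QueuedTask, preempt: bool) {
        let mut dispatched = DispatchedTask::new(task);
        if preempt {
            dispatched.set_flag(RL_PREEMPT_CPU);
        }
        bpf.dispatch_task(&dispatched).expect("dispatch failed");
    }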
From 0ffaaac6db28ddfe88f6d9ea1752f470f9d8eaf9 Mon Sep 17 00:00:00 2001
From: Andrea Righi
Date: Mon, 22 Apr 2024 09:32:07 +0200
Subject: [PATCH 3/5] scx_rustland: enable preemption

Use the new scx_rustland_core dispatch flag RL_PREEMPT_CPU to allow
interactive tasks to preempt other tasks with scx_rustland.

If the built-in idle selection logic is enforced (option `-i`), the
scheduler prioritizes keeping tasks on the target CPU designated by
this logic. With preemption enabled, these tasks have a higher
likelihood of reusing their cached working set, potentially improving
performance.

Alternatively, when tasks are dispatched to the first available CPU
(default behavior), interactive tasks benefit from running more
promptly by kicking out other tasks before their assigned time slice
expires.

This also potentially allows increasing the default time slice in the
future, improving overall system throughput while still maintaining a
good level of responsiveness: interactive tasks can now run almost
immediately, independently of the time slice remaining to the other
tasks contending for the CPUs in the system.

= Results =

Measuring the performance of the usual benchmark "playing a video game
while running a parallel kernel build in the background" shows a boost
of around 2-10% in fps with preemption enabled, depending on the
particular video game.

Results were obtained by running a `make -j32` kernel build on an
8-core AMD Ryzen 7 5800X with 16GB of RAM, while testing video games
such as Baldur's Gate 3 (a solid +10% fps), Counter-Strike 2 (around
+5%) and Team Fortress 2 (a +2% boost).

Moreover, some WebGL applications (such as
https://webglsamples.org/aquarium/aquarium.html) seem to benefit even
more from preemption, providing up to a +15% fps boost.

Signed-off-by: Andrea Righi
---
 scheds/rust/scx_rustland/src/main.rs | 34 ++++++++++++++++++++--------
 1 file changed, 24 insertions(+), 10 deletions(-)

diff --git a/scheds/rust/scx_rustland/src/main.rs b/scheds/rust/scx_rustland/src/main.rs
index 5983219..f2dcf81 100644
--- a/scheds/rust/scx_rustland/src/main.rs
+++ b/scheds/rust/scx_rustland/src/main.rs
@@ -175,6 +175,7 @@
 struct Task {
     qtask: QueuedTask,     // queued task
     vruntime: u64,         // total vruntime (that determines the order how tasks are dispatched)
+    is_interactive: bool,  // task can preempt other tasks
 }
 
 // Make sure tasks are ordered by vruntime, if multiple tasks have the same vruntime order by pid.
@@ -327,11 +328,12 @@ impl<'a> Scheduler<'a> {
         ts.as_nanos() as u64
     }
 
-    // Update task's vruntime based on the information collected from the kernel and return the
-    // evaluated weighted time slice to the caller.
+    // Update task's vruntime based on the information collected from the kernel and return to the
+    // caller the evaluated weighted time slice along with a flag indicating whether the task is
+    // interactive or not (interactive tasks are allowed to preempt other tasks).
     //
     // This method implements the main task ordering logic of the scheduler.
-    fn update_enqueued(&mut self, task: &QueuedTask) -> u64 {
+    fn update_enqueued(&mut self, task: &QueuedTask) -> (u64, bool) {
         // Determine if a task is new or old, based on their current runtime and previous runtime
         // counters.
         //
@@ -389,18 +391,19 @@ impl<'a> Scheduler<'a> {
             task.sum_exec_runtime - task_info.sum_exec_runtime
         };
 
-        // Apply the slice boost to interactive tasks.
-        //
         // Determine if a task is interactive, based on the moving average of voluntary context
         // switches over time.
        //
         // NOTE: we should make this threshold a tunable, but for now let's assume that a moving
         // average of 10 voluntary context switch per second is enough to classify the task as
         // interactive.
+        let is_interactive = task_info.avg_nvcsw >= 10;
+
+        // Apply the slice boost to interactive tasks.
         //
         // NOTE: some tasks may have a very high weight, that can potentially disrupt our slice
         // boost optimizations, therefore always limit the task priority to a max of 1000.
-        let weight = if task_info.avg_nvcsw >= 10 {
+        let weight = if is_interactive {
             task.weight.min(1000) * self.slice_boost.max(1)
         } else {
             task.weight.min(1000)
@@ -435,8 +438,8 @@ impl<'a> Scheduler<'a> {
             task_info.nvcsw_ts = now;
         }
 
-        // Return the task vruntime.
-        task_info.vruntime
+        // Return the task vruntime and a flag indicating if the task is interactive.
+        (task_info.vruntime, is_interactive)
     }
 
     // Drain all the tasks from the queued list, update their vruntime (Self::update_enqueued()),
@@ -452,13 +455,14 @@ impl<'a> Scheduler<'a> {
                     continue;
                 }
 
-                // Update task information.
-                let vruntime = self.update_enqueued(&task);
+                // Update task information and determine vruntime and interactiveness.
+                let (vruntime, is_interactive) = self.update_enqueued(&task);
 
                 // Insert task in the task pool (ordered by vruntime).
                 self.task_pool.push(Task {
                     qtask: task,
                     vruntime,
+                    is_interactive,
                 });
             }
             Ok(None) => {
@@ -510,6 +514,16 @@ impl<'a> Scheduler<'a> {
         if !self.builtin_idle {
             dispatched_task.set_flag(RL_CPU_ANY);
         }
+        if task.is_interactive {
+            // Assign the maximum time slice to this task and allow to preempt others.
+            //
+            // NOTE: considering that, with preemption enabled, interactive tasks can
+            // preempt each other (for now) and they are also more likely to release
+            // the CPU before its assigned time slice expires, always give them the
+            // maximum static time slice allowed.
+            dispatched_task.set_slice_ns(self.slice_ns);
+            dispatched_task.set_flag(RL_PREEMPT_CPU);
+        }
 
         // Send task to the BPF dispatcher.
         match self.bpf.dispatch_task(&dispatched_task) {
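The is_interactive signal that gates RL_PREEMPT_CPU above comes from
the avg_nvcsw moving average in update_enqueued(). As a toy model of
the idea (standalone, with hypothetical samples and an illustrative
smoothing formula -- the exact averaging in main.rs differs):

    fn main() {
        // Moving average of voluntary context switches per second.
        let mut avg_nvcsw: u64 = 0;

        // Hypothetical once-per-second samples of voluntary context
        // switches: a CPU hog would stay near 0, a game or editor higher.
        for nvcsw in [2u64, 8, 16, 24, 30] {
            avg_nvcsw = (avg_nvcsw + nvcsw) / 2; // illustrative smoothing

            // Same threshold as the patch: >= 10 => interactive, i.e. the
            // task is dispatched with RL_PREEMPT_CPU (and a full slice).
            let interactive = avg_nvcsw >= 10;
            println!("avg_nvcsw={avg_nvcsw:2} interactive={interactive}");
        }
    }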
From fbe9a80af8063217ecb61e1a7fb200b902639c88 Mon Sep 17 00:00:00 2001
From: Andrea Righi
Date: Mon, 22 Apr 2024 09:52:31 +0200
Subject: [PATCH 4/5] scx_rustland: introduce --no-preemption

Provide a run-time option to disable task preemption.

This option can be used to improve the throughput of CPU-intensive
tasks while still providing a good level of responsiveness in the
system.

Preemption is enabled by default, to provide a higher level of
responsiveness to interactive tasks.

Signed-off-by: Andrea Righi
---
 scheds/rust/scx_rustland/src/main.rs | 19 +++++++++++++++++--
 1 file changed, 17 insertions(+), 2 deletions(-)

diff --git a/scheds/rust/scx_rustland/src/main.rs b/scheds/rust/scx_rustland/src/main.rs
index f2dcf81..6370a0e 100644
--- a/scheds/rust/scx_rustland/src/main.rs
+++ b/scheds/rust/scx_rustland/src/main.rs
@@ -108,6 +108,16 @@ struct Opts {
     #[clap(short = 'i', long, action = clap::ArgAction::SetTrue)]
     builtin_idle: bool,
 
+    /// If specified, disable task preemption.
+    ///
+    /// Disabling task preemption can help to improve the throughput of CPU-intensive tasks, while
+    /// still providing a good level of system responsiveness.
+    ///
+    /// Preemption is enabled by default to provide a higher level of responsiveness to the
+    /// interactive tasks.
+    #[clap(short = 'n', long, action = clap::ArgAction::SetTrue)]
+    no_preemption: bool,
+
     /// If specified, all the scheduling events and actions will be processed in user-space,
     /// disabling any form of in-kernel optimization.
     ///
@@ -238,7 +248,8 @@ struct Scheduler<'a> {
     slice_boost: u64,       // Slice booster
     eff_slice_boost: u64,   // Effective slice booster
     init_page_faults: u64,  // Initial page faults counter
-    builtin_idle: bool,     // Use sched-ext built-in idle selection logic
+    builtin_idle: bool,     // Use sched-ext built-in idle selection logic
+    no_preemption: bool,    // Disable task preemption
 }
 
 impl<'a> Scheduler<'a> {
@@ -256,6 +267,9 @@ impl<'a> Scheduler<'a> {
         // Use built-in idle selection logic.
         let builtin_idle = opts.builtin_idle;
 
+        // Disable task preemption.
+        let no_preemption = opts.no_preemption;
+
         // Scheduler task pool to sort tasks by vruntime.
         let task_pool = TaskTree::new();
 
@@ -292,6 +306,7 @@ impl<'a> Scheduler<'a> {
             eff_slice_boost,
             init_page_faults,
             builtin_idle,
+            no_preemption,
         })
     }
 
@@ -514,7 +529,7 @@ impl<'a> Scheduler<'a> {
         if !self.builtin_idle {
             dispatched_task.set_flag(RL_CPU_ANY);
         }
-        if task.is_interactive {
+        if task.is_interactive && !self.no_preemption {
             // Assign the maximum time slice to this task and allow to preempt others.
             //
             // NOTE: considering that, with preemption enabled, interactive tasks can
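Usage-wise, the new switch composes with the rest of the CLI, e.g.
`scx_rustland -n` (or `--no-preemption`). As a minimal standalone
sketch of the same clap pattern and the gate it controls (hypothetical
classification value; the real gate lives in the dispatch path above):

    use clap::Parser; // same clap derive style as main.rs

    #[derive(Debug, Parser)]
    struct Opts {
        /// If specified, disable task preemption.
        #[clap(short = 'n', long, action = clap::ArgAction::SetTrue)]
        no_preemption: bool,
    }

    fn main() {
        let opts = Opts::parse();

        // Mirrors the gate added above: interactive tasks get
        // RL_PREEMPT_CPU only while preemption is allowed.
        let is_interactive = true; // hypothetical classification result
        let preempt = is_interactive && !opts.no_preemption;
        println!("dispatch with RL_PREEMPT_CPU: {preempt}");
    }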
From f02e9b072cf6ba774e5622add07ed6fa8432a08a Mon Sep 17 00:00:00 2001
From: Andrea Righi
Date: Tue, 23 Apr 2024 09:04:27 +0200
Subject: [PATCH 5/5] scx_rustland_core: use a separate field to store dispatch flags

Do not encode dispatch flags in the cpu field; use a separate "flags"
field instead.

This makes the code much simpler, and it increases the size of
dispatched_task_ctx from 24 to 32 bytes, which is probably better in
terms of cacheline allocation / performance.

Signed-off-by: Andrea Righi
---
 rust/scx_rustland_core/assets/bpf.rs         | 11 +++++++----
 rust/scx_rustland_core/assets/bpf/intf.h     | 16 ++++------------
 rust/scx_rustland_core/assets/bpf/main.bpf.c | 10 +++++-----
 3 files changed, 16 insertions(+), 21 deletions(-)
diff --git a/rust/scx_rustland_core/assets/bpf.rs b/rust/scx_rustland_core/assets/bpf.rs
index 9e1e552..2b7a6a6 100644
--- a/rust/scx_rustland_core/assets/bpf.rs
+++ b/rust/scx_rustland_core/assets/bpf.rs
@@ -30,11 +30,11 @@ const SCHED_EXT: i32 = 7;
 //
 // The task will be dispatched to the global shared DSQ and it will run on the first CPU available.
 #[allow(dead_code)]
-pub const RL_CPU_ANY: i32 = bpf_intf::RL_CPU_ANY as i32;
+pub const RL_CPU_ANY: u64 = bpf_intf::RL_CPU_ANY as u64;
 
 // Allow to preempt the target CPU when dispatching the task.
 #[allow(dead_code)]
-pub const RL_PREEMPT_CPU: i32 = bpf_intf::RL_PREEMPT_CPU as i32;
+pub const RL_PREEMPT_CPU: u64 = bpf_intf::RL_PREEMPT_CPU as u64;
 
 /// High-level Rust abstraction to interact with a generic sched-ext BPF component.
 ///
@@ -75,6 +75,7 @@ pub struct QueuedTask {
 pub struct DispatchedTask {
     pid: i32,         // pid that uniquely identifies a task
     cpu: i32,         // target CPU selected by the scheduler
+    flags: u64,       // special dispatch flags
     slice_ns: u64,    // time slice assigned to the task (0 = default)
     cpumask_cnt: u64, // cpumask generation counter (private)
 }
@@ -88,6 +89,7 @@ impl DispatchedTask {
         DispatchedTask {
             pid: task.pid,
             cpu: task.cpu,
+            flags: 0,
             cpumask_cnt: task.cpumask_cnt,
             slice_ns: 0, // use default time slice
         }
@@ -101,8 +103,8 @@ impl DispatchedTask {
 
     // Assign a specific dispatch flag to a task.
     #[allow(dead_code)]
-    pub fn set_flag(&mut self, flag: i32) {
-        self.cpu |= flag;
+    pub fn set_flag(&mut self, flag: u64) {
+        self.flags |= flag;
     }
 
     // Assign a specific time slice to a task.
@@ -151,6 +153,7 @@ impl DispatchedMessage {
         let dispatched_task_struct = bpf_intf::dispatched_task_ctx {
             pid: task.pid,
             cpu: task.cpu,
+            flags: task.flags,
             cpumask_cnt: task.cpumask_cnt,
             slice_ns: task.slice_ns,
         };
diff --git a/rust/scx_rustland_core/assets/bpf/intf.h b/rust/scx_rustland_core/assets/bpf/intf.h
index e7544cc..fbbc4fd 100644
--- a/rust/scx_rustland_core/assets/bpf/intf.h
+++ b/rust/scx_rustland_core/assets/bpf/intf.h
@@ -39,16 +39,7 @@ typedef long long s64;
  */
 #define MAX_CPUS 1024
 
-/* Isolate target CPU from dispatch flags. */
-#define CPU_MASK (MAX_CPUS - 1)
-
-/* Use extra bits in the CPU attribute to store dispatch flags. */
-#define RL_BASE_FLAG __builtin_ctz(MAX_CPUS)
-
-/* Define dispatch flags using macros. */
-#define RL_FLAG(flag) (1U << (RL_BASE_FLAG + flag))
-
-/* Dispatch flags */
+/* Special dispatch flags */
 enum {
 	/*
 	 * Do not assign any specific CPU to the task.
@@ -56,12 +47,12 @@ enum {
 	 * The task will be dispatched to the global shared DSQ and it will run
 	 * on the first CPU available.
 	 */
-	RL_CPU_ANY = RL_FLAG(0),
+	RL_CPU_ANY = 1 << 0,
 
 	/*
 	 * Allow to preempt the target CPU when dispatching the task.
 	 */
-	RL_PREEMPT_CPU = RL_FLAG(1),
+	RL_PREEMPT_CPU = 1 << 1,
 };
 
 /*
@@ -87,6 +78,7 @@ struct queued_task_ctx {
 struct dispatched_task_ctx {
 	s32 pid;
 	s32 cpu; /* CPU where the task should be dispatched */
+	u64 flags; /* special dispatch flags */
 	u64 cpumask_cnt; /* cpumask generation counter */
 	u64 slice_ns; /* time slice assigned to the task (0=default) */
 };
diff --git a/rust/scx_rustland_core/assets/bpf/main.bpf.c b/rust/scx_rustland_core/assets/bpf/main.bpf.c
index 3d0fb09..7c7acc7 100644
--- a/rust/scx_rustland_core/assets/bpf/main.bpf.c
+++ b/rust/scx_rustland_core/assets/bpf/main.bpf.c
@@ -578,13 +578,13 @@ void BPF_STRUCT_OPS(rustland_dispatch, s32 cpu, struct task_struct *prev)
 		if (!p)
 			continue;
 
-		dbg_msg("usersched: pid=%d cpu=%d cpumask_cnt=%llu slice_ns=%llu",
-			task.pid, task.cpu, task.cpumask_cnt, task.slice_ns);
+		dbg_msg("usersched: pid=%d cpu=%d cpumask_cnt=%llu slice_ns=%llu flags=%llx",
+			task.pid, task.cpu, task.cpumask_cnt, task.slice_ns, task.flags);
 		/*
 		 * Map RL_PREEMPT_CPU to SCX_ENQ_PREEMPT and allow this task to
 		 * preempt others.
 		 */
-		if (task.cpu & RL_PREEMPT_CPU)
+		if (task.flags & RL_PREEMPT_CPU)
 			enq_flags = SCX_ENQ_PREEMPT;
 		/*
 		 * Check whether the user-space scheduler assigned a different
@@ -594,10 +594,10 @@ void BPF_STRUCT_OPS(rustland_dispatch, s32 cpu, struct task_struct *prev)
 		 * dispatch it to the shared DSQ and run it on the first CPU
 		 * available.
 		 */
-		if (task.cpu & RL_CPU_ANY)
+		if (task.flags & RL_CPU_ANY)
 			dispatch_task(p, SHARED_DSQ, 0, task.slice_ns, enq_flags);
 		else
-			dispatch_task(p, cpu_to_dsq(task.cpu & CPU_MASK),
+			dispatch_task(p, cpu_to_dsq(task.cpu),
 				      task.cpumask_cnt, task.slice_ns, enq_flags);
 		bpf_task_release(p);
 		__sync_fetch_and_add(&nr_user_dispatches, 1);
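For completeness, the resulting dispatched_task_ctx layout can be
sanity-checked from Rust; a standalone sketch (field order mirrors the
intf.h struct above; the 24 -> 32 byte growth is the one mentioned in
the commit message):

    #[repr(C)]
    struct DispatchedTaskCtx {
        pid: i32,         // s32 pid
        cpu: i32,         // s32 target CPU
        flags: u64,       // special dispatch flags
        cpumask_cnt: u64, // cpumask generation counter
        slice_ns: u64,    // time slice (0 = default)
    }

    fn main() {
        // 4 + 4 + 8 + 8 + 8 = 32 bytes (24 before the new field).
        assert_eq!(std::mem::size_of::<DispatchedTaskCtx>(), 32);

        // With a dedicated field, flags no longer overlap the CPU id:
        let flags: u64 = (1 << 0) | (1 << 1); // RL_CPU_ANY | RL_PREEMPT_CPU
        assert_eq!(flags, 0b11);
        println!("layout ok, flags = {flags:#x}");
    }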