1
0
Fork 0
mirror of synced 2025-03-06 20:59:54 +01:00
linux/tools/sched_ext/scx_qmap.bpf.c
Tejun Heo 2a52ca7c98 sched_ext: Add scx_simple and scx_example_qmap example schedulers
Add two simple example BPF schedulers - simple and qmap.

* simple: In terms of scheduling, it behaves identical to not having any
  operation implemented at all. The two operations it implements are only to
  improve visibility and exit handling. On certain homogeneous
  configurations, this actually can perform pretty well.

* qmap: A fixed five level priority scheduler to demonstrate queueing PIDs
  on BPF maps for scheduling. While not very practical, this is useful as a
  simple example and will be used to demonstrate different features.

v7: - Compat helpers stripped out in prepartion of upstreaming as the
      upstreamed patchset will be the baselinfe. Utility macros that can be
      used to implement compat features are kept.

    - Explicitly disable map autoattach on struct_ops to avoid trying to
      attach twice while maintaining compatbility with older libbpf.

v6: - Common header files reorganized and cleaned up. Compat helpers are
      added to demonstrate how schedulers can maintain backward
      compatibility with older kernels while making use of newly added
      features.

    - simple_select_cpu() added to keep track of the number of local
      dispatches. This is needed because the default ops.select_cpu()
      implementation is updated to dispatch directly and won't call
      ops.enqueue().

    - Updated to reflect the sched_ext API changes. Switching all tasks is
      the default behavior now and scx_qmap supports partial switching when
      `-p` is specified.

    - tools/sched_ext/Kconfig dropped. This will be included in the doc
      instead.

v5: - Improve Makefile. Build artifects are now collected into a separate
      dir which change be changed. Install and help targets are added and
      clean actually cleans everything.

    - MEMBER_VPTR() improved to improve access to structs. ARRAY_ELEM_PTR()
      and RESIZEABLE_ARRAY() are added to support resizable arrays in .bss.

    - Add scx_common.h which provides common utilities to user code such as
      SCX_BUG[_ON]() and RESIZE_ARRAY().

    - Use SCX_BUG[_ON]() to simplify error handling.

v4: - Dropped _example prefix from scheduler names.

v3: - Rename scx_example_dummy to scx_example_simple and restructure a bit
      to ease later additions. Comment updates.

    - Added declarations for BPF inline iterators. In the future, hopefully,
      these will be consolidated into a generic BPF header so that they
      don't need to be replicated here.

v2: - Updated with the generic BPF cpumask helpers.

Signed-off-by: Tejun Heo <tj@kernel.org>
Reviewed-by: David Vernet <dvernet@meta.com>
Acked-by: Josh Don <joshdon@google.com>
Acked-by: Hao Luo <haoluo@google.com>
Acked-by: Barret Rhoden <brho@google.com>
2024-06-18 10:09:17 -10:00

264 lines
6 KiB
C

/* SPDX-License-Identifier: GPL-2.0 */
/*
* A simple five-level FIFO queue scheduler.
*
* There are five FIFOs implemented using BPF_MAP_TYPE_QUEUE. A task gets
* assigned to one depending on its compound weight. Each CPU round robins
* through the FIFOs and dispatches more from FIFOs with higher indices - 1 from
* queue0, 2 from queue1, 4 from queue2 and so on.
*
* This scheduler demonstrates:
*
* - BPF-side queueing using PIDs.
* - Sleepable per-task storage allocation using ops.prep_enable().
*
* This scheduler is primarily for demonstration and testing of sched_ext
* features and unlikely to be useful for actual workloads.
*
* Copyright (c) 2022 Meta Platforms, Inc. and affiliates.
* Copyright (c) 2022 Tejun Heo <tj@kernel.org>
* Copyright (c) 2022 David Vernet <dvernet@meta.com>
*/
#include <scx/common.bpf.h>
enum consts {
ONE_SEC_IN_NS = 1000000000,
SHARED_DSQ = 0,
};
char _license[] SEC("license") = "GPL";
const volatile u64 slice_ns = SCX_SLICE_DFL;
const volatile u32 dsp_batch;
u32 test_error_cnt;
UEI_DEFINE(uei);
struct qmap {
__uint(type, BPF_MAP_TYPE_QUEUE);
__uint(max_entries, 4096);
__type(value, u32);
} queue0 SEC(".maps"),
queue1 SEC(".maps"),
queue2 SEC(".maps"),
queue3 SEC(".maps"),
queue4 SEC(".maps");
struct {
__uint(type, BPF_MAP_TYPE_ARRAY_OF_MAPS);
__uint(max_entries, 5);
__type(key, int);
__array(values, struct qmap);
} queue_arr SEC(".maps") = {
.values = {
[0] = &queue0,
[1] = &queue1,
[2] = &queue2,
[3] = &queue3,
[4] = &queue4,
},
};
/* Per-task scheduling context */
struct task_ctx {
bool force_local; /* Dispatch directly to local_dsq */
};
struct {
__uint(type, BPF_MAP_TYPE_TASK_STORAGE);
__uint(map_flags, BPF_F_NO_PREALLOC);
__type(key, int);
__type(value, struct task_ctx);
} task_ctx_stor SEC(".maps");
struct cpu_ctx {
u64 dsp_idx; /* dispatch index */
u64 dsp_cnt; /* remaining count */
};
struct {
__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
__uint(max_entries, 1);
__type(key, u32);
__type(value, struct cpu_ctx);
} cpu_ctx_stor SEC(".maps");
/* Statistics */
u64 nr_enqueued, nr_dispatched, nr_dequeued;
s32 BPF_STRUCT_OPS(qmap_select_cpu, struct task_struct *p,
s32 prev_cpu, u64 wake_flags)
{
struct task_ctx *tctx;
s32 cpu;
tctx = bpf_task_storage_get(&task_ctx_stor, p, 0, 0);
if (!tctx) {
scx_bpf_error("task_ctx lookup failed");
return -ESRCH;
}
if (p->nr_cpus_allowed == 1 ||
scx_bpf_test_and_clear_cpu_idle(prev_cpu)) {
tctx->force_local = true;
return prev_cpu;
}
cpu = scx_bpf_pick_idle_cpu(p->cpus_ptr, 0);
if (cpu >= 0)
return cpu;
return prev_cpu;
}
static int weight_to_idx(u32 weight)
{
/* Coarsely map the compound weight to a FIFO. */
if (weight <= 25)
return 0;
else if (weight <= 50)
return 1;
else if (weight < 200)
return 2;
else if (weight < 400)
return 3;
else
return 4;
}
void BPF_STRUCT_OPS(qmap_enqueue, struct task_struct *p, u64 enq_flags)
{
struct task_ctx *tctx;
u32 pid = p->pid;
int idx = weight_to_idx(p->scx.weight);
void *ring;
if (test_error_cnt && !--test_error_cnt)
scx_bpf_error("test triggering error");
tctx = bpf_task_storage_get(&task_ctx_stor, p, 0, 0);
if (!tctx) {
scx_bpf_error("task_ctx lookup failed");
return;
}
/* Is select_cpu() is telling us to enqueue locally? */
if (tctx->force_local) {
tctx->force_local = false;
scx_bpf_dispatch(p, SCX_DSQ_LOCAL, slice_ns, enq_flags);
return;
}
ring = bpf_map_lookup_elem(&queue_arr, &idx);
if (!ring) {
scx_bpf_error("failed to find ring %d", idx);
return;
}
/* Queue on the selected FIFO. If the FIFO overflows, punt to global. */
if (bpf_map_push_elem(ring, &pid, 0)) {
scx_bpf_dispatch(p, SHARED_DSQ, slice_ns, enq_flags);
return;
}
__sync_fetch_and_add(&nr_enqueued, 1);
}
/*
* The BPF queue map doesn't support removal and sched_ext can handle spurious
* dispatches. qmap_dequeue() is only used to collect statistics.
*/
void BPF_STRUCT_OPS(qmap_dequeue, struct task_struct *p, u64 deq_flags)
{
__sync_fetch_and_add(&nr_dequeued, 1);
}
void BPF_STRUCT_OPS(qmap_dispatch, s32 cpu, struct task_struct *prev)
{
struct task_struct *p;
struct cpu_ctx *cpuc;
u32 zero = 0, batch = dsp_batch ?: 1;
void *fifo;
s32 i, pid;
if (scx_bpf_consume(SHARED_DSQ))
return;
if (!(cpuc = bpf_map_lookup_elem(&cpu_ctx_stor, &zero))) {
scx_bpf_error("failed to look up cpu_ctx");
return;
}
for (i = 0; i < 5; i++) {
/* Advance the dispatch cursor and pick the fifo. */
if (!cpuc->dsp_cnt) {
cpuc->dsp_idx = (cpuc->dsp_idx + 1) % 5;
cpuc->dsp_cnt = 1 << cpuc->dsp_idx;
}
fifo = bpf_map_lookup_elem(&queue_arr, &cpuc->dsp_idx);
if (!fifo) {
scx_bpf_error("failed to find ring %llu", cpuc->dsp_idx);
return;
}
/* Dispatch or advance. */
bpf_repeat(BPF_MAX_LOOPS) {
if (bpf_map_pop_elem(fifo, &pid))
break;
p = bpf_task_from_pid(pid);
if (!p)
continue;
__sync_fetch_and_add(&nr_dispatched, 1);
scx_bpf_dispatch(p, SHARED_DSQ, slice_ns, 0);
bpf_task_release(p);
batch--;
cpuc->dsp_cnt--;
if (!batch || !scx_bpf_dispatch_nr_slots()) {
scx_bpf_consume(SHARED_DSQ);
return;
}
if (!cpuc->dsp_cnt)
break;
}
cpuc->dsp_cnt = 0;
}
}
s32 BPF_STRUCT_OPS(qmap_init_task, struct task_struct *p,
struct scx_init_task_args *args)
{
/*
* @p is new. Let's ensure that its task_ctx is available. We can sleep
* in this function and the following will automatically use GFP_KERNEL.
*/
if (bpf_task_storage_get(&task_ctx_stor, p, 0,
BPF_LOCAL_STORAGE_GET_F_CREATE))
return 0;
else
return -ENOMEM;
}
s32 BPF_STRUCT_OPS_SLEEPABLE(qmap_init)
{
return scx_bpf_create_dsq(SHARED_DSQ, -1);
}
void BPF_STRUCT_OPS(qmap_exit, struct scx_exit_info *ei)
{
UEI_RECORD(uei, ei);
}
SCX_OPS_DEFINE(qmap_ops,
.select_cpu = (void *)qmap_select_cpu,
.enqueue = (void *)qmap_enqueue,
.dequeue = (void *)qmap_dequeue,
.dispatch = (void *)qmap_dispatch,
.init_task = (void *)qmap_init_task,
.init = (void *)qmap_init,
.exit = (void *)qmap_exit,
.name = "qmap");