Merge branch 'tip/sched/core' into sched_ext/for-6.12
Pull in tip/sched/core to resolve two merge conflicts:

- 96fd6c65ef ("sched: Factor out update_other_load_avgs() from __update_blocked_others()")
  5d871a6399 ("sched/fair: Move effective_cpu_util() and effective_cpu_util() in fair.c")

  A simple context conflict. The former added __update_blocked_others() in the
  same #ifdef CONFIG_SMP block that effective_cpu_util() and sched_cpu_util()
  are in and the latter moved those functions to fair.c. This makes
  __update_blocked_others() more out of place. Will follow up with a patch to
  relocate.

- 96fd6c65ef ("sched: Factor out update_other_load_avgs() from __update_blocked_others()")
  84d265281d ("sched/pelt: Use rq_clock_task() for hw_pressure")

  The former factored out the body of __update_blocked_others() into
  update_other_load_avgs(). The latter changed how update_hw_load_avg() is
  called in the body. Resolved by applying the change to
  update_other_load_avgs() instead.

Signed-off-by: Tejun Heo <tj@kernel.org>
commit 0b1777f0fa
9 changed files with 189 additions and 152 deletions
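For quick reference, the resolution of the second conflict is visible in the final hunk of this diff: the rq_clock_task() change from 84d265281d is applied inside the factored-out update_other_load_avgs(), whose body now ends with the lines quoted below (verbatim from that hunk; the surrounding local variables are not shown there):

    /* hw_pressure doesn't care about invariance */
    return update_rt_rq_load_avg(now, rq, curr_class == &rt_sched_class) |
           update_dl_rq_load_avg(now, rq, curr_class == &dl_sched_class) |
           update_hw_load_avg(rq_clock_task(rq), rq, hw_pressure) |
           update_irq_load_avg(rq, 0);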
@@ -749,21 +749,19 @@ Appendix A. Test suite
 of the command line options. Please refer to rt-app documentation for more
 details (`<rt-app-sources>/doc/*.json`).
 
-The second testing application is a modification of schedtool, called
-schedtool-dl, which can be used to setup SCHED_DEADLINE parameters for a
-certain pid/application. schedtool-dl is available at:
-https://github.com/scheduler-tools/schedtool-dl.git.
+The second testing application is done using chrt which has support
+for SCHED_DEADLINE.
 
 The usage is straightforward::
 
-# schedtool -E -t 10000000:100000000 -e ./my_cpuhog_app
+# chrt -d -T 10000000 -D 100000000 0 ./my_cpuhog_app
 
 With this, my_cpuhog_app is put to run inside a SCHED_DEADLINE reservation
-of 10ms every 100ms (note that parameters are expressed in microseconds).
-You can also use schedtool to create a reservation for an already running
+of 10ms every 100ms (note that parameters are expressed in nanoseconds).
+You can also use chrt to create a reservation for an already running
 application, given that you know its pid::
 
-# schedtool -E -t 10000000:100000000 my_app_pid
+# chrt -d -T 10000000 -D 100000000 -p 0 my_app_pid
 
 Appendix B. Minimal main()
 ==========================
@@ -224,9 +224,9 @@ static void __init cppc_freq_invariance_init(void)
 * Fake (unused) bandwidth; workaround to "fix"
 * priority inheritance.
 */
-.sched_runtime = 1000000,
-.sched_deadline = 10000000,
-.sched_period = 10000000,
+.sched_runtime = NSEC_PER_MSEC,
+.sched_deadline = 10 * NSEC_PER_MSEC,
+.sched_period = 10 * NSEC_PER_MSEC,
 };
 int ret;
 
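The switch to symbolic constants in this hunk (and in the identical schedutil hunk further down) does not change the programmed values: the kernel defines NSEC_PER_MSEC as 1000000L, so the attributes still encode a 1 ms runtime and a 10 ms deadline/period. A standalone, purely illustrative userspace check:

    #include <stdio.h>

    #define NSEC_PER_MSEC 1000000L  /* same value the kernel uses */

    int main(void)
    {
            /* prints 1000000 10000000, identical to the literals being replaced */
            printf("%ld %ld\n", NSEC_PER_MSEC, 10 * NSEC_PER_MSEC);
            return 0;
    }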
@@ -58,9 +58,9 @@
 *
 * This is reflected by the following fields of the sched_attr structure:
 *
-* @sched_deadline representative of the task's deadline
-* @sched_runtime representative of the task's runtime
-* @sched_period representative of the task's period
+* @sched_deadline representative of the task's deadline in nanoseconds
+* @sched_runtime representative of the task's runtime in nanoseconds
+* @sched_period representative of the task's period in nanoseconds
 *
 * Given this task model, there are a multiplicity of scheduling algorithms
 * and policies, that can be used to ensure all the tasks will make their
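The comment change above spells out that the sched_attr time fields are in nanoseconds, matching the chrt example in the documentation hunk earlier (a 10 ms runtime out of every 100 ms). A minimal userspace sketch in the spirit of the documentation's "Appendix B. Minimal main()"; the struct is declared locally and the raw syscall is used because older libc versions provide neither:

    #define _GNU_SOURCE
    #include <stdio.h>
    #include <unistd.h>
    #include <sys/syscall.h>
    #include <sys/types.h>
    #include <linux/types.h>

    #ifndef SCHED_DEADLINE
    #define SCHED_DEADLINE 6
    #endif

    /* Local copy of the UAPI layout; older libcs do not export it. */
    struct sched_attr {
            __u32 size;
            __u32 sched_policy;
            __u64 sched_flags;
            __s32 sched_nice;
            __u32 sched_priority;
            /* SCHED_DEADLINE parameters, all in nanoseconds */
            __u64 sched_runtime;
            __u64 sched_deadline;
            __u64 sched_period;
    };

    int main(void)
    {
            struct sched_attr attr = {
                    .size           = sizeof(attr),
                    .sched_policy   = SCHED_DEADLINE,
                    .sched_runtime  = 10 * 1000 * 1000,   /*  10 ms */
                    .sched_deadline = 100 * 1000 * 1000,  /* 100 ms */
                    .sched_period   = 100 * 1000 * 1000,  /* 100 ms */
            };

            /* pid 0 means the calling thread; there is no glibc wrapper here */
            if (syscall(SYS_sched_setattr, 0, &attr, 0)) {
                    perror("sched_setattr");
                    return 1;
            }

            for (;;)
                    ;  /* CPU hog now running inside the 10ms/100ms reservation */
    }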
@@ -845,8 +845,16 @@ repeat:
 * event only cares about the address.
 */
 trace_sched_kthread_work_execute_end(work, func);
-} else if (!freezing(current))
+} else if (!freezing(current)) {
 schedule();
+} else {
+/*
+* Handle the case where the current remains
+* TASK_INTERRUPTIBLE. try_to_freeze() expects
+* the current to be TASK_RUNNING.
+*/
+__set_current_state(TASK_RUNNING);
+}
 
 try_to_freeze();
 cond_resched();
@@ -267,6 +267,9 @@ static inline int rb_sched_core_cmp(const void *key, const struct rb_node *node)
 
 void sched_core_enqueue(struct rq *rq, struct task_struct *p)
 {
+if (p->se.sched_delayed)
+return;
+
 rq->core->core_task_seq++;
 
 if (!p->core_cookie)
@@ -277,6 +280,9 @@ void sched_core_enqueue(struct rq *rq, struct task_struct *p)
 
 void sched_core_dequeue(struct rq *rq, struct task_struct *p, int flags)
 {
+if (p->se.sched_delayed)
+return;
+
 rq->core->core_task_seq++;
 
 if (sched_core_enqueued(p)) {
@@ -6477,19 +6483,12 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
 * Constants for the sched_mode argument of __schedule().
 *
 * The mode argument allows RT enabled kernels to differentiate a
-* preemption from blocking on an 'sleeping' spin/rwlock. Note that
-* SM_MASK_PREEMPT for !RT has all bits set, which allows the compiler to
-* optimize the AND operation out and just check for zero.
+* preemption from blocking on an 'sleeping' spin/rwlock.
 */
-#define SM_NONE 0x0
-#define SM_PREEMPT 0x1
-#define SM_RTLOCK_WAIT 0x2
-#ifndef CONFIG_PREEMPT_RT
-# define SM_MASK_PREEMPT (~0U)
-#else
-# define SM_MASK_PREEMPT SM_PREEMPT
-#endif
+#define SM_IDLE (-1)
+#define SM_NONE 0
+#define SM_PREEMPT 1
+#define SM_RTLOCK_WAIT 2
 
 /*
 * __schedule() is the main scheduler function.
@@ -6530,9 +6529,14 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
 *
 * WARNING: must be called with preemption disabled!
 */
-static void __sched notrace __schedule(unsigned int sched_mode)
+static void __sched notrace __schedule(int sched_mode)
 {
 struct task_struct *prev, *next;
+/*
+* On PREEMPT_RT kernel, SM_RTLOCK_WAIT is noted
+* as a preemption by schedule_debug() and RCU.
+*/
+bool preempt = sched_mode > SM_NONE;
 unsigned long *switch_count;
 unsigned long prev_state;
 struct rq_flags rf;
@@ -6543,13 +6547,13 @@ static void __sched notrace __schedule(unsigned int sched_mode)
 rq = cpu_rq(cpu);
 prev = rq->curr;
 
-schedule_debug(prev, !!sched_mode);
+schedule_debug(prev, preempt);
 
 if (sched_feat(HRTICK) || sched_feat(HRTICK_DL))
 hrtick_clear(rq);
 
 local_irq_disable();
-rcu_note_context_switch(!!sched_mode);
+rcu_note_context_switch(preempt);
 
 /*
 * Make sure that signal_pending_state()->signal_pending() below
@@ -6578,12 +6582,20 @@ static void __sched notrace __schedule(unsigned int sched_mode)
 
 switch_count = &prev->nivcsw;
 
+/* Task state changes only considers SM_PREEMPT as preemption */
+preempt = sched_mode == SM_PREEMPT;
+
 /*
 * We must load prev->state once (task_struct::state is volatile), such
 * that we form a control dependency vs deactivate_task() below.
 */
 prev_state = READ_ONCE(prev->__state);
-if (!(sched_mode & SM_MASK_PREEMPT) && prev_state) {
+if (sched_mode == SM_IDLE) {
+if (!rq->nr_running) {
+next = prev;
+goto picked;
+}
+} else if (!preempt && prev_state) {
 if (signal_pending_state(prev_state, prev)) {
 WRITE_ONCE(prev->__state, TASK_RUNNING);
 } else {
@@ -6614,6 +6626,7 @@ static void __sched notrace __schedule(unsigned int sched_mode)
 }
 
 next = pick_next_task(rq, prev, &rf);
+picked:
 clear_tsk_need_resched(prev);
 clear_preempt_need_resched();
 #ifdef CONFIG_SCHED_DEBUG
@@ -6655,7 +6668,7 @@ static void __sched notrace __schedule(unsigned int sched_mode)
 psi_account_irqtime(rq, prev, next);
 psi_sched_switch(prev, next, !task_on_rq_queued(prev));
 
-trace_sched_switch(sched_mode & SM_MASK_PREEMPT, prev, next, prev_state);
+trace_sched_switch(preempt, prev, next, prev_state);
 
 /* Also unlocks the rq: */
 rq = context_switch(rq, prev, next, &rf);
@@ -6731,7 +6744,7 @@ static void sched_update_worker(struct task_struct *tsk)
 }
 }
 
-static __always_inline void __schedule_loop(unsigned int sched_mode)
+static __always_inline void __schedule_loop(int sched_mode)
 {
 do {
 preempt_disable();
@@ -6776,7 +6789,7 @@ void __sched schedule_idle(void)
 */
 WARN_ON_ONCE(current->__state);
 do {
-__schedule(SM_NONE);
+__schedule(SM_IDLE);
 } while (need_resched());
 }
 
@@ -662,9 +662,9 @@ static int sugov_kthread_create(struct sugov_policy *sg_policy)
 * Fake (unused) bandwidth; workaround to "fix"
 * priority inheritance.
 */
-.sched_runtime = 1000000,
-.sched_deadline = 10000000,
-.sched_period = 10000000,
+.sched_runtime = NSEC_PER_MSEC,
+.sched_deadline = 10 * NSEC_PER_MSEC,
+.sched_period = 10 * NSEC_PER_MSEC,
 };
 struct cpufreq_policy *policy = sg_policy->policy;
 int ret;
@@ -739,7 +739,7 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p)
 else
 SEQ_printf(m, " %c", task_state_to_char(p));
 
-SEQ_printf(m, "%15s %5d %9Ld.%06ld %c %9Ld.%06ld %c %9Ld.%06ld %9Ld.%06ld %9Ld %5d ",
+SEQ_printf(m, " %15s %5d %9Ld.%06ld %c %9Ld.%06ld %c %9Ld.%06ld %9Ld.%06ld %9Ld %5d ",
 p->comm, task_pid_nr(p),
 SPLIT_NS(p->se.vruntime),
 entity_eligible(cfs_rq_of(&p->se), &p->se) ? 'E' : 'N',
@@ -750,17 +750,16 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p)
 (long long)(p->nvcsw + p->nivcsw),
 p->prio);
 
-SEQ_printf(m, "%9lld.%06ld %9lld.%06ld %9lld.%06ld %9lld.%06ld",
+SEQ_printf(m, "%9lld.%06ld %9lld.%06ld %9lld.%06ld",
 SPLIT_NS(schedstat_val_or_zero(p->stats.wait_sum)),
-SPLIT_NS(p->se.sum_exec_runtime),
 SPLIT_NS(schedstat_val_or_zero(p->stats.sum_sleep_runtime)),
 SPLIT_NS(schedstat_val_or_zero(p->stats.sum_block_runtime)));
 
 #ifdef CONFIG_NUMA_BALANCING
 SEQ_printf(m, " %d %d", task_node(p), task_numa_group_id(p));
 #endif
 #ifdef CONFIG_CGROUP_SCHED
 SEQ_printf_task_group_path(m, task_group(p), " %s")
 #endif
 
 SEQ_printf(m, "\n");
@@ -772,10 +771,26 @@ static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu)
 
 SEQ_printf(m, "\n");
 SEQ_printf(m, "runnable tasks:\n");
-SEQ_printf(m, " S task PID tree-key switches prio"
-" wait-time sum-exec sum-sleep\n");
+SEQ_printf(m, " S task PID vruntime eligible "
+"deadline slice sum-exec switches "
+"prio wait-time sum-sleep sum-block"
+#ifdef CONFIG_NUMA_BALANCING
+" node group-id"
+#endif
+#ifdef CONFIG_CGROUP_SCHED
+" group-path"
+#endif
+"\n");
 SEQ_printf(m, "-------------------------------------------------------"
-"------------------------------------------------------\n");
+"------------------------------------------------------"
+"------------------------------------------------------"
+#ifdef CONFIG_NUMA_BALANCING
+"--------------"
+#endif
+#ifdef CONFIG_CGROUP_SCHED
+"--------------"
+#endif
+"\n");
 
 rcu_read_lock();
 for_each_process_thread(g, p) {
@@ -6949,18 +6949,19 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 int rq_h_nr_running = rq->cfs.h_nr_running;
 u64 slice = 0;
 
-if (flags & ENQUEUE_DELAYED) {
-requeue_delayed_entity(se);
-return;
-}
-
 /*
 * The code below (indirectly) updates schedutil which looks at
 * the cfs_rq utilization to select a frequency.
 * Let's add the task's estimated utilization to the cfs_rq's
 * estimated utilization, before we update schedutil.
 */
-util_est_enqueue(&rq->cfs, p);
+if (!(p->se.sched_delayed && (task_on_rq_migrating(p) || (flags & ENQUEUE_RESTORE))))
+util_est_enqueue(&rq->cfs, p);
+
+if (flags & ENQUEUE_DELAYED) {
+requeue_delayed_entity(se);
+return;
+}
 
 /*
 * If in_iowait is set, the code below may not trigger any cpufreq
@@ -7178,7 +7179,8 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags)
 */
 static bool dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 {
-util_est_dequeue(&rq->cfs, p);
+if (!(p->se.sched_delayed && (task_on_rq_migrating(p) || (flags & DEQUEUE_SAVE))))
+util_est_dequeue(&rq->cfs, p);
 
 if (dequeue_entities(rq, &p->se, flags) < 0) {
 util_est_update(&rq->cfs, p, DEQUEUE_SLEEP);
@@ -8085,6 +8087,105 @@ static unsigned long cpu_util_without(int cpu, struct task_struct *p)
 return cpu_util(cpu, p, -1, 0);
 }
 
+/*
+* This function computes an effective utilization for the given CPU, to be
+* used for frequency selection given the linear relation: f = u * f_max.
+*
+* The scheduler tracks the following metrics:
+*
+* cpu_util_{cfs,rt,dl,irq}()
+* cpu_bw_dl()
+*
+* Where the cfs,rt and dl util numbers are tracked with the same metric and
+* synchronized windows and are thus directly comparable.
+*
+* The cfs,rt,dl utilization are the running times measured with rq->clock_task
+* which excludes things like IRQ and steal-time. These latter are then accrued
+* in the IRQ utilization.
+*
+* The DL bandwidth number OTOH is not a measured metric but a value computed
+* based on the task model parameters and gives the minimal utilization
+* required to meet deadlines.
+*/
+unsigned long effective_cpu_util(int cpu, unsigned long util_cfs,
+unsigned long *min,
+unsigned long *max)
+{
+unsigned long util, irq, scale;
+struct rq *rq = cpu_rq(cpu);
+
+scale = arch_scale_cpu_capacity(cpu);
+
+/*
+* Early check to see if IRQ/steal time saturates the CPU, can be
+* because of inaccuracies in how we track these -- see
+* update_irq_load_avg().
+*/
+irq = cpu_util_irq(rq);
+if (unlikely(irq >= scale)) {
+if (min)
+*min = scale;
+if (max)
+*max = scale;
+return scale;
+}
+
+if (min) {
+/*
+* The minimum utilization returns the highest level between:
+* - the computed DL bandwidth needed with the IRQ pressure which
+* steals time to the deadline task.
+* - The minimum performance requirement for CFS and/or RT.
+*/
+*min = max(irq + cpu_bw_dl(rq), uclamp_rq_get(rq, UCLAMP_MIN));
+
+/*
+* When an RT task is runnable and uclamp is not used, we must
+* ensure that the task will run at maximum compute capacity.
+*/
+if (!uclamp_is_used() && rt_rq_is_runnable(&rq->rt))
+*min = max(*min, scale);
+}
+
+/*
+* Because the time spend on RT/DL tasks is visible as 'lost' time to
+* CFS tasks and we use the same metric to track the effective
+* utilization (PELT windows are synchronized) we can directly add them
+* to obtain the CPU's actual utilization.
+*/
+util = util_cfs + cpu_util_rt(rq);
+util += cpu_util_dl(rq);
+
+/*
+* The maximum hint is a soft bandwidth requirement, which can be lower
+* than the actual utilization because of uclamp_max requirements.
+*/
+if (max)
+*max = min(scale, uclamp_rq_get(rq, UCLAMP_MAX));
+
+if (util >= scale)
+return scale;
+
+/*
+* There is still idle time; further improve the number by using the
+* IRQ metric. Because IRQ/steal time is hidden from the task clock we
+* need to scale the task numbers:
+*
+* max - irq
+* U' = irq + --------- * U
+* max
+*/
+util = scale_irq_capacity(util, irq, scale);
+util += irq;
+
+return min(scale, util);
+}
+
+unsigned long sched_cpu_util(int cpu)
+{
+return effective_cpu_util(cpu, cpu_util_cfs(cpu), NULL, NULL);
+}
+
 /*
 * energy_env - Utilization landscape for energy estimation.
 * @task_busy_time: Utilization contribution by the task for which we test the
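The comment block added above ends with the IRQ scaling formula U' = irq + (max - irq) / max * U. A standalone, purely illustrative rendition of that arithmetic (scale_for_irq() below is a hypothetical helper, not the kernel's scale_irq_capacity()); with scale = 1024, irq = 128 and a combined cfs+rt+dl utilization of 512 it yields 128 + 896 * 512 / 1024 = 576:

    #include <stdio.h>

    /* Hypothetical helper mirroring the formula in the comment above. */
    static unsigned long scale_for_irq(unsigned long util, unsigned long irq,
                                       unsigned long scale)
    {
            return irq + (scale - irq) * util / scale;
    }

    int main(void)
    {
            /* 128 + (1024 - 128) * 512 / 1024 = 128 + 448 = 576 */
            printf("%lu\n", scale_for_irq(512, 128, 1024));
            return 0;
    }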
@@ -272,110 +272,12 @@ bool update_other_load_avgs(struct rq *rq)
 
 lockdep_assert_rq_held(rq);
 
+/* hw_pressure doesn't care about invariance */
 return update_rt_rq_load_avg(now, rq, curr_class == &rt_sched_class) |
 update_dl_rq_load_avg(now, rq, curr_class == &dl_sched_class) |
-update_hw_load_avg(now, rq, hw_pressure) |
+update_hw_load_avg(rq_clock_task(rq), rq, hw_pressure) |
 update_irq_load_avg(rq, 0);
 }
-
-/*
-* This function computes an effective utilization for the given CPU, to be
-* used for frequency selection given the linear relation: f = u * f_max.
-*
-* The scheduler tracks the following metrics:
-*
-* cpu_util_{cfs,rt,dl,irq}()
-* cpu_bw_dl()
-*
-* Where the cfs,rt and dl util numbers are tracked with the same metric and
-* synchronized windows and are thus directly comparable.
-*
-* The cfs,rt,dl utilization are the running times measured with rq->clock_task
-* which excludes things like IRQ and steal-time. These latter are then accrued
-* in the IRQ utilization.
-*
-* The DL bandwidth number OTOH is not a measured metric but a value computed
-* based on the task model parameters and gives the minimal utilization
-* required to meet deadlines.
-*/
-unsigned long effective_cpu_util(int cpu, unsigned long util_cfs,
-unsigned long *min,
-unsigned long *max)
-{
-unsigned long util, irq, scale;
-struct rq *rq = cpu_rq(cpu);
-
-scale = arch_scale_cpu_capacity(cpu);
-
-/*
-* Early check to see if IRQ/steal time saturates the CPU, can be
-* because of inaccuracies in how we track these -- see
-* update_irq_load_avg().
-*/
-irq = cpu_util_irq(rq);
-if (unlikely(irq >= scale)) {
-if (min)
-*min = scale;
-if (max)
-*max = scale;
-return scale;
-}
-
-if (min) {
-/*
-* The minimum utilization returns the highest level between:
-* - the computed DL bandwidth needed with the IRQ pressure which
-* steals time to the deadline task.
-* - The minimum performance requirement for CFS and/or RT.
-*/
-*min = max(irq + cpu_bw_dl(rq), uclamp_rq_get(rq, UCLAMP_MIN));
-
-/*
-* When an RT task is runnable and uclamp is not used, we must
-* ensure that the task will run at maximum compute capacity.
-*/
-if (!uclamp_is_used() && rt_rq_is_runnable(&rq->rt))
-*min = max(*min, scale);
-}
-
-/*
-* Because the time spend on RT/DL tasks is visible as 'lost' time to
-* CFS tasks and we use the same metric to track the effective
-* utilization (PELT windows are synchronized) we can directly add them
-* to obtain the CPU's actual utilization.
-*/
-util = util_cfs + cpu_util_rt(rq);
-util += cpu_util_dl(rq);
-
-/*
-* The maximum hint is a soft bandwidth requirement, which can be lower
-* than the actual utilization because of uclamp_max requirements.
-*/
-if (max)
-*max = min(scale, uclamp_rq_get(rq, UCLAMP_MAX));
-
-if (util >= scale)
-return scale;
-
-/*
-* There is still idle time; further improve the number by using the
-* IRQ metric. Because IRQ/steal time is hidden from the task clock we
-* need to scale the task numbers:
-*
-* max - irq
-* U' = irq + --------- * U
-* max
-*/
-util = scale_irq_capacity(util, irq, scale);
-util += irq;
-
-return min(scale, util);
-}
-
-unsigned long sched_cpu_util(int cpu)
-{
-return effective_cpu_util(cpu, cpu_util_cfs(cpu), NULL, NULL);
-}
 #endif /* CONFIG_SMP */
 
 /**