VMAs are skipped if there is no recent fault activity, but this represents
a chicken-and-egg problem as there may be no fault activity if the PTEs are
never updated to trap NUMA hints. There is an indirect reliance on scanning
being forced early in the lifetime of a task, but this may fail to detect
changes in phase behaviour. Force inactive VMAs to be scanned when all other
eligible VMAs have been updated within the same scan sequence.

Test results in general look good, with some changes in performance, both
negative and positive, depending on whether the additional scanning and
faulting was beneficial to the workload or not. The autonuma benchmark
workload NUMA01_THREADLOCAL was picked for closer examination. The workload
creates two processes with numerous threads and thread-local storage that is
zero-filled in a loop. It exercises the corner case where unrelated threads
may skip VMAs that are thread-local to another thread, and it still has some
VMAs that are inactive while the workload executes.

The VMA skipping activity frequency with and without the patch:

	6.6.0-rc2-sched-numabtrace-v1
	=============================
	      649 reason=scan_delay
	    9,094 reason=unsuitable
	   48,915 reason=shared_ro
	  143,919 reason=inaccessible
	  193,050 reason=pid_inactive

	6.6.0-rc2-sched-numabselective-v1
	=============================
	      146 reason=seq_completed
	      622 reason=ignore_pid_inactive
	      624 reason=scan_delay
	    6,570 reason=unsuitable
	   16,101 reason=shared_ro
	   27,608 reason=inaccessible
	   41,939 reason=pid_inactive

Note that with the patch applied, the PID activity is ignored
(ignore_pid_inactive) to ensure a VMA with some activity is completely
scanned. In addition, a small number of VMAs are scanned when no other
eligible VMA is available during a single scan window (seq_completed).
The number of times a VMA is skipped due to no PID activity from the
scanning task (pid_inactive) drops dramatically. It is expected that this
will increase the number of PTEs updated for NUMA hinting faults as well as
the hinting faults themselves, but these represent PTEs that would otherwise
have been missed. The tradeoff is scan+fault overhead versus improving
locality due to migration.

On a 2-socket Cascade Lake test machine, the time to complete the workload
is as follows:

	                                    6.6.0-rc2              6.6.0-rc2
	                          sched-numabtrace-v1 sched-numabselective-v1
	Min      elsp-NUMA01_THREADLOCAL   174.22 (   0.00%)   117.64 (  32.48%)
	Amean    elsp-NUMA01_THREADLOCAL   175.68 (   0.00%)   123.34 *  29.79%*
	Stddev   elsp-NUMA01_THREADLOCAL     1.20 (   0.00%)     4.06 (-238.20%)
	CoeffVar elsp-NUMA01_THREADLOCAL     0.68 (   0.00%)     3.29 (-381.70%)
	Max      elsp-NUMA01_THREADLOCAL   177.18 (   0.00%)   128.03 (  27.74%)

The time to complete the workload is reduced by almost 30%:

	                       6.6.0-rc2   6.6.0-rc2
	             sched-numabtrace-v1 sched-numabselective-v1
	Duration User           91201.80    63506.64
	Duration System          2015.53     1819.78
	Duration Elapsed         1234.77      868.37

In this specific case, system CPU time was not increased, but that is not
universally true. From vmstat, the NUMA scanning and fault activity is as
follows:

	                                       6.6.0-rc2      6.6.0-rc2
	                             sched-numabtrace-v1 sched-numabselective-v1
	Ops NUMA base-page range updates    64272.00    26374386.00
	Ops NUMA PTE updates                36624.00       55538.00
	Ops NUMA PMD updates                   54.00       51404.00
	Ops NUMA hint faults                15504.00       75786.00
	Ops NUMA hint local faults %        14860.00       56763.00
	Ops NUMA hint local percent            95.85          74.90
	Ops NUMA pages migrated              1629.00     6469222.00

Both the number of PTE updates and hint faults are dramatically increased.
While this is superficially unfortunate, it represents ranges that were
simply skipped without the patch.
As a result of the scanning and hinting faults, many more pages were also
migrated, but as the time to completion is reduced, the overhead is offset
by the gain.

Signed-off-by: Mel Gorman <mgorman@techsingularity.net>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Tested-by: Raghavendra K T <raghavendra.kt@amd.com>
Link: https://lore.kernel.org/r/20231010083143.19593-7-mgorman@techsingularity.net
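For illustration only, the scanning policy described above can be modelled as
a small self-contained C sketch. All names below are hypothetical and this is
not the kernel implementation; it only shows the decision the patch changes:
an inactive VMA is no longer skipped indefinitely, but is scanned once every
other eligible VMA has completed the current scan sequence.

#include <stdbool.h>

/* Hypothetical, simplified per-VMA scan bookkeeping. */
struct vma_scan_state {
	bool pid_active;	/* this task recently faulted in the VMA */
	unsigned int scan_seq;	/* last scan sequence that covered the VMA */
};

/*
 * Sketch of the policy: a VMA with no recent fault activity is normally
 * skipped (pid_inactive), but once all other eligible VMAs have been
 * updated within the current scan sequence, the inactive stragglers are
 * scanned anyway (seq_completed) so that phase changes are eventually
 * detected.
 */
static bool should_scan_vma(const struct vma_scan_state *vs,
			    unsigned int cur_seq, bool all_others_done)
{
	if (vs->pid_active)
		return true;		/* normal case: recent activity */
	if (vs->scan_seq == cur_seq)
		return false;		/* already covered this sequence */
	return all_others_done;		/* force-scan when nothing else is left */
}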
/* SPDX-License-Identifier: GPL-2.0 */
#undef TRACE_SYSTEM
#define TRACE_SYSTEM sched

#if !defined(_TRACE_SCHED_H) || defined(TRACE_HEADER_MULTI_READ)
#define _TRACE_SCHED_H

#include <linux/kthread.h>
#include <linux/sched/numa_balancing.h>
#include <linux/tracepoint.h>
#include <linux/binfmts.h>

/*
 * Tracepoint for calling kthread_stop, performed to end a kthread:
 */
TRACE_EVENT(sched_kthread_stop,

	TP_PROTO(struct task_struct *t),

	TP_ARGS(t),

	TP_STRUCT__entry(
		__array( char, comm, TASK_COMM_LEN )
		__field( pid_t, pid )
	),

	TP_fast_assign(
		memcpy(__entry->comm, t->comm, TASK_COMM_LEN);
		__entry->pid = t->pid;
	),

	TP_printk("comm=%s pid=%d", __entry->comm, __entry->pid)
);

/*
 * Tracepoint for the return value of the kthread stopping:
 */
TRACE_EVENT(sched_kthread_stop_ret,

	TP_PROTO(int ret),

	TP_ARGS(ret),

	TP_STRUCT__entry(
		__field( int, ret )
	),

	TP_fast_assign(
		__entry->ret = ret;
	),

	TP_printk("ret=%d", __entry->ret)
);

/**
 * sched_kthread_work_queue_work - called when a work gets queued
 * @worker: pointer to the kthread_worker
 * @work: pointer to struct kthread_work
 *
 * This event occurs when a work is queued immediately or once a
 * delayed work is actually queued (ie: once the delay has been
 * reached).
 */
TRACE_EVENT(sched_kthread_work_queue_work,

	TP_PROTO(struct kthread_worker *worker,
		 struct kthread_work *work),

	TP_ARGS(worker, work),

	TP_STRUCT__entry(
		__field( void *, work )
		__field( void *, function)
		__field( void *, worker)
	),

	TP_fast_assign(
		__entry->work = work;
		__entry->function = work->func;
		__entry->worker = worker;
	),

	TP_printk("work struct=%p function=%ps worker=%p",
		  __entry->work, __entry->function, __entry->worker)
);

/**
 * sched_kthread_work_execute_start - called immediately before the work callback
 * @work: pointer to struct kthread_work
 *
 * Allows to track kthread work execution.
 */
TRACE_EVENT(sched_kthread_work_execute_start,

	TP_PROTO(struct kthread_work *work),

	TP_ARGS(work),

	TP_STRUCT__entry(
		__field( void *, work )
		__field( void *, function)
	),

	TP_fast_assign(
		__entry->work = work;
		__entry->function = work->func;
	),

	TP_printk("work struct %p: function %ps", __entry->work, __entry->function)
);

/**
 * sched_kthread_work_execute_end - called immediately after the work callback
 * @work: pointer to struct work_struct
 * @function: pointer to worker function
 *
 * Allows to track workqueue execution.
 */
TRACE_EVENT(sched_kthread_work_execute_end,

	TP_PROTO(struct kthread_work *work, kthread_work_func_t function),

	TP_ARGS(work, function),

	TP_STRUCT__entry(
		__field( void *, work )
		__field( void *, function)
	),

	TP_fast_assign(
		__entry->work = work;
		__entry->function = function;
	),

	TP_printk("work struct %p: function %ps", __entry->work, __entry->function)
);

/*
 * Tracepoint for waking up a task:
 */
DECLARE_EVENT_CLASS(sched_wakeup_template,

	TP_PROTO(struct task_struct *p),

	TP_ARGS(__perf_task(p)),

	TP_STRUCT__entry(
		__array( char, comm, TASK_COMM_LEN )
		__field( pid_t, pid )
		__field( int, prio )
		__field( int, target_cpu )
	),

	TP_fast_assign(
		memcpy(__entry->comm, p->comm, TASK_COMM_LEN);
		__entry->pid = p->pid;
		__entry->prio = p->prio; /* XXX SCHED_DEADLINE */
		__entry->target_cpu = task_cpu(p);
	),

	TP_printk("comm=%s pid=%d prio=%d target_cpu=%03d",
		  __entry->comm, __entry->pid, __entry->prio,
		  __entry->target_cpu)
);

/*
 * Tracepoint called when waking a task; this tracepoint is guaranteed to be
 * called from the waking context.
 */
DEFINE_EVENT(sched_wakeup_template, sched_waking,
	     TP_PROTO(struct task_struct *p),
	     TP_ARGS(p));

/*
 * Tracepoint called when the task is actually woken; p->state == TASK_RUNNING.
 * It is not always called from the waking context.
 */
DEFINE_EVENT(sched_wakeup_template, sched_wakeup,
	     TP_PROTO(struct task_struct *p),
	     TP_ARGS(p));

/*
 * Tracepoint for waking up a new task:
 */
DEFINE_EVENT(sched_wakeup_template, sched_wakeup_new,
	     TP_PROTO(struct task_struct *p),
	     TP_ARGS(p));

#ifdef CREATE_TRACE_POINTS
static inline long __trace_sched_switch_state(bool preempt,
					      unsigned int prev_state,
					      struct task_struct *p)
{
	unsigned int state;

#ifdef CONFIG_SCHED_DEBUG
	BUG_ON(p != current);
#endif /* CONFIG_SCHED_DEBUG */

	/*
	 * Preemption ignores task state, therefore preempted tasks are always
	 * RUNNING (we will not have dequeued if state != RUNNING).
	 */
	if (preempt)
		return TASK_REPORT_MAX;

	/*
	 * task_state_index() uses fls() and returns a value from 0-8 range.
	 * Decrement it by 1 (except TASK_RUNNING state i.e 0) before using
	 * it for left shift operation to get the correct task->state
	 * mapping.
	 */
	state = __task_state_index(prev_state, p->exit_state);

	return state ? (1 << (state - 1)) : state;
}
#endif /* CREATE_TRACE_POINTS */

/*
 * Tracepoint for task switches, performed by the scheduler:
 */
TRACE_EVENT(sched_switch,

	TP_PROTO(bool preempt,
		 struct task_struct *prev,
		 struct task_struct *next,
		 unsigned int prev_state),

	TP_ARGS(preempt, prev, next, prev_state),

	TP_STRUCT__entry(
		__array( char, prev_comm, TASK_COMM_LEN )
		__field( pid_t, prev_pid )
		__field( int, prev_prio )
		__field( long, prev_state )
		__array( char, next_comm, TASK_COMM_LEN )
		__field( pid_t, next_pid )
		__field( int, next_prio )
	),

	TP_fast_assign(
		memcpy(__entry->next_comm, next->comm, TASK_COMM_LEN);
		__entry->prev_pid = prev->pid;
		__entry->prev_prio = prev->prio;
		__entry->prev_state = __trace_sched_switch_state(preempt, prev_state, prev);
		memcpy(__entry->prev_comm, prev->comm, TASK_COMM_LEN);
		__entry->next_pid = next->pid;
		__entry->next_prio = next->prio;
		/* XXX SCHED_DEADLINE */
	),

	TP_printk("prev_comm=%s prev_pid=%d prev_prio=%d prev_state=%s%s ==> next_comm=%s next_pid=%d next_prio=%d",
		__entry->prev_comm, __entry->prev_pid, __entry->prev_prio,

		(__entry->prev_state & (TASK_REPORT_MAX - 1)) ?
		  __print_flags(__entry->prev_state & (TASK_REPORT_MAX - 1), "|",
				{ TASK_INTERRUPTIBLE, "S" },
				{ TASK_UNINTERRUPTIBLE, "D" },
				{ __TASK_STOPPED, "T" },
				{ __TASK_TRACED, "t" },
				{ EXIT_DEAD, "X" },
				{ EXIT_ZOMBIE, "Z" },
				{ TASK_PARKED, "P" },
				{ TASK_DEAD, "I" }) :
		  "R",

		__entry->prev_state & TASK_REPORT_MAX ? "+" : "",
		__entry->next_comm, __entry->next_pid, __entry->next_prio)
);

/*
 * Tracepoint for a task being migrated:
 */
TRACE_EVENT(sched_migrate_task,

	TP_PROTO(struct task_struct *p, int dest_cpu),

	TP_ARGS(p, dest_cpu),

	TP_STRUCT__entry(
		__array( char, comm, TASK_COMM_LEN )
		__field( pid_t, pid )
		__field( int, prio )
		__field( int, orig_cpu )
		__field( int, dest_cpu )
	),

	TP_fast_assign(
		memcpy(__entry->comm, p->comm, TASK_COMM_LEN);
		__entry->pid = p->pid;
		__entry->prio = p->prio; /* XXX SCHED_DEADLINE */
		__entry->orig_cpu = task_cpu(p);
		__entry->dest_cpu = dest_cpu;
	),

	TP_printk("comm=%s pid=%d prio=%d orig_cpu=%d dest_cpu=%d",
		  __entry->comm, __entry->pid, __entry->prio,
		  __entry->orig_cpu, __entry->dest_cpu)
);

DECLARE_EVENT_CLASS(sched_process_template,

	TP_PROTO(struct task_struct *p),

	TP_ARGS(p),

	TP_STRUCT__entry(
		__array( char, comm, TASK_COMM_LEN )
		__field( pid_t, pid )
		__field( int, prio )
	),

	TP_fast_assign(
		memcpy(__entry->comm, p->comm, TASK_COMM_LEN);
		__entry->pid = p->pid;
		__entry->prio = p->prio; /* XXX SCHED_DEADLINE */
	),

	TP_printk("comm=%s pid=%d prio=%d",
		  __entry->comm, __entry->pid, __entry->prio)
);

/*
 * Tracepoint for freeing a task:
 */
DEFINE_EVENT(sched_process_template, sched_process_free,
	     TP_PROTO(struct task_struct *p),
	     TP_ARGS(p));

/*
 * Tracepoint for a task exiting:
 */
DEFINE_EVENT(sched_process_template, sched_process_exit,
	     TP_PROTO(struct task_struct *p),
	     TP_ARGS(p));

/*
 * Tracepoint for waiting on task to unschedule:
 */
DEFINE_EVENT(sched_process_template, sched_wait_task,
	     TP_PROTO(struct task_struct *p),
	     TP_ARGS(p));

/*
 * Tracepoint for a waiting task:
 */
TRACE_EVENT(sched_process_wait,

	TP_PROTO(struct pid *pid),

	TP_ARGS(pid),

	TP_STRUCT__entry(
		__array( char, comm, TASK_COMM_LEN )
		__field( pid_t, pid )
		__field( int, prio )
	),

	TP_fast_assign(
		memcpy(__entry->comm, current->comm, TASK_COMM_LEN);
		__entry->pid = pid_nr(pid);
		__entry->prio = current->prio; /* XXX SCHED_DEADLINE */
	),

	TP_printk("comm=%s pid=%d prio=%d",
		  __entry->comm, __entry->pid, __entry->prio)
);

/*
 * Tracepoint for kernel_clone:
 */
TRACE_EVENT(sched_process_fork,

	TP_PROTO(struct task_struct *parent, struct task_struct *child),

	TP_ARGS(parent, child),

	TP_STRUCT__entry(
		__array( char, parent_comm, TASK_COMM_LEN )
		__field( pid_t, parent_pid )
		__array( char, child_comm, TASK_COMM_LEN )
		__field( pid_t, child_pid )
	),

	TP_fast_assign(
		memcpy(__entry->parent_comm, parent->comm, TASK_COMM_LEN);
		__entry->parent_pid = parent->pid;
		memcpy(__entry->child_comm, child->comm, TASK_COMM_LEN);
		__entry->child_pid = child->pid;
	),

	TP_printk("comm=%s pid=%d child_comm=%s child_pid=%d",
		  __entry->parent_comm, __entry->parent_pid,
		  __entry->child_comm, __entry->child_pid)
);

/*
 * Tracepoint for exec:
 */
TRACE_EVENT(sched_process_exec,

	TP_PROTO(struct task_struct *p, pid_t old_pid,
		 struct linux_binprm *bprm),

	TP_ARGS(p, old_pid, bprm),

	TP_STRUCT__entry(
		__string( filename, bprm->filename )
		__field( pid_t, pid )
		__field( pid_t, old_pid )
	),

	TP_fast_assign(
		__assign_str(filename, bprm->filename);
		__entry->pid = p->pid;
		__entry->old_pid = old_pid;
	),

	TP_printk("filename=%s pid=%d old_pid=%d", __get_str(filename),
		  __entry->pid, __entry->old_pid)
);


#ifdef CONFIG_SCHEDSTATS
#define DEFINE_EVENT_SCHEDSTAT DEFINE_EVENT
#define DECLARE_EVENT_CLASS_SCHEDSTAT DECLARE_EVENT_CLASS
#else
#define DEFINE_EVENT_SCHEDSTAT DEFINE_EVENT_NOP
#define DECLARE_EVENT_CLASS_SCHEDSTAT DECLARE_EVENT_CLASS_NOP
#endif

/*
 * XXX the below sched_stat tracepoints only apply to SCHED_OTHER/BATCH/IDLE
 *     adding sched_stat support to SCHED_FIFO/RR would be welcome.
 */
DECLARE_EVENT_CLASS_SCHEDSTAT(sched_stat_template,

	TP_PROTO(struct task_struct *tsk, u64 delay),

	TP_ARGS(__perf_task(tsk), __perf_count(delay)),

	TP_STRUCT__entry(
		__array( char, comm, TASK_COMM_LEN )
		__field( pid_t, pid )
		__field( u64, delay )
	),

	TP_fast_assign(
		memcpy(__entry->comm, tsk->comm, TASK_COMM_LEN);
		__entry->pid = tsk->pid;
		__entry->delay = delay;
	),

	TP_printk("comm=%s pid=%d delay=%Lu [ns]",
		  __entry->comm, __entry->pid,
		  (unsigned long long)__entry->delay)
);

/*
 * Tracepoint for accounting wait time (time the task is runnable
 * but not actually running due to scheduler contention).
 */
DEFINE_EVENT_SCHEDSTAT(sched_stat_template, sched_stat_wait,
	     TP_PROTO(struct task_struct *tsk, u64 delay),
	     TP_ARGS(tsk, delay));

/*
 * Tracepoint for accounting sleep time (time the task is not runnable,
 * including iowait, see below).
 */
DEFINE_EVENT_SCHEDSTAT(sched_stat_template, sched_stat_sleep,
	     TP_PROTO(struct task_struct *tsk, u64 delay),
	     TP_ARGS(tsk, delay));

/*
 * Tracepoint for accounting iowait time (time the task is not runnable
 * due to waiting on IO to complete).
 */
DEFINE_EVENT_SCHEDSTAT(sched_stat_template, sched_stat_iowait,
	     TP_PROTO(struct task_struct *tsk, u64 delay),
	     TP_ARGS(tsk, delay));

/*
 * Tracepoint for accounting blocked time (time the task is in uninterruptible).
 */
DEFINE_EVENT_SCHEDSTAT(sched_stat_template, sched_stat_blocked,
	     TP_PROTO(struct task_struct *tsk, u64 delay),
	     TP_ARGS(tsk, delay));

/*
 * Tracepoint for accounting runtime (time the task is executing
 * on a CPU).
 */
DECLARE_EVENT_CLASS(sched_stat_runtime,

	TP_PROTO(struct task_struct *tsk, u64 runtime, u64 vruntime),

	TP_ARGS(tsk, __perf_count(runtime), vruntime),

	TP_STRUCT__entry(
		__array( char, comm, TASK_COMM_LEN )
		__field( pid_t, pid )
		__field( u64, runtime )
		__field( u64, vruntime )
	),

	TP_fast_assign(
		memcpy(__entry->comm, tsk->comm, TASK_COMM_LEN);
		__entry->pid = tsk->pid;
		__entry->runtime = runtime;
		__entry->vruntime = vruntime;
	),

	TP_printk("comm=%s pid=%d runtime=%Lu [ns] vruntime=%Lu [ns]",
		  __entry->comm, __entry->pid,
		  (unsigned long long)__entry->runtime,
		  (unsigned long long)__entry->vruntime)
);

DEFINE_EVENT(sched_stat_runtime, sched_stat_runtime,
	     TP_PROTO(struct task_struct *tsk, u64 runtime, u64 vruntime),
	     TP_ARGS(tsk, runtime, vruntime));

/*
 * Tracepoint for showing priority inheritance modifying a tasks
 * priority.
 */
TRACE_EVENT(sched_pi_setprio,

	TP_PROTO(struct task_struct *tsk, struct task_struct *pi_task),

	TP_ARGS(tsk, pi_task),

	TP_STRUCT__entry(
		__array( char, comm, TASK_COMM_LEN )
		__field( pid_t, pid )
		__field( int, oldprio )
		__field( int, newprio )
	),

	TP_fast_assign(
		memcpy(__entry->comm, tsk->comm, TASK_COMM_LEN);
		__entry->pid = tsk->pid;
		__entry->oldprio = tsk->prio;
		__entry->newprio = pi_task ?
				min(tsk->normal_prio, pi_task->prio) :
				tsk->normal_prio;
		/* XXX SCHED_DEADLINE bits missing */
	),

	TP_printk("comm=%s pid=%d oldprio=%d newprio=%d",
		  __entry->comm, __entry->pid,
		  __entry->oldprio, __entry->newprio)
);

#ifdef CONFIG_DETECT_HUNG_TASK
TRACE_EVENT(sched_process_hang,
	TP_PROTO(struct task_struct *tsk),
	TP_ARGS(tsk),

	TP_STRUCT__entry(
		__array( char, comm, TASK_COMM_LEN )
		__field( pid_t, pid )
	),

	TP_fast_assign(
		memcpy(__entry->comm, tsk->comm, TASK_COMM_LEN);
		__entry->pid = tsk->pid;
	),

	TP_printk("comm=%s pid=%d", __entry->comm, __entry->pid)
);
#endif /* CONFIG_DETECT_HUNG_TASK */

/*
 * Tracks migration of tasks from one runqueue to another. Can be used to
 * detect if automatic NUMA balancing is bouncing between nodes.
 */
TRACE_EVENT(sched_move_numa,

	TP_PROTO(struct task_struct *tsk, int src_cpu, int dst_cpu),

	TP_ARGS(tsk, src_cpu, dst_cpu),

	TP_STRUCT__entry(
		__field( pid_t, pid )
		__field( pid_t, tgid )
		__field( pid_t, ngid )
		__field( int, src_cpu )
		__field( int, src_nid )
		__field( int, dst_cpu )
		__field( int, dst_nid )
	),

	TP_fast_assign(
		__entry->pid = task_pid_nr(tsk);
		__entry->tgid = task_tgid_nr(tsk);
		__entry->ngid = task_numa_group_id(tsk);
		__entry->src_cpu = src_cpu;
		__entry->src_nid = cpu_to_node(src_cpu);
		__entry->dst_cpu = dst_cpu;
		__entry->dst_nid = cpu_to_node(dst_cpu);
	),

	TP_printk("pid=%d tgid=%d ngid=%d src_cpu=%d src_nid=%d dst_cpu=%d dst_nid=%d",
		  __entry->pid, __entry->tgid, __entry->ngid,
		  __entry->src_cpu, __entry->src_nid,
		  __entry->dst_cpu, __entry->dst_nid)
);

DECLARE_EVENT_CLASS(sched_numa_pair_template,

	TP_PROTO(struct task_struct *src_tsk, int src_cpu,
		 struct task_struct *dst_tsk, int dst_cpu),

	TP_ARGS(src_tsk, src_cpu, dst_tsk, dst_cpu),

	TP_STRUCT__entry(
		__field( pid_t, src_pid )
		__field( pid_t, src_tgid )
		__field( pid_t, src_ngid )
		__field( int, src_cpu )
		__field( int, src_nid )
		__field( pid_t, dst_pid )
		__field( pid_t, dst_tgid )
		__field( pid_t, dst_ngid )
		__field( int, dst_cpu )
		__field( int, dst_nid )
	),

	TP_fast_assign(
		__entry->src_pid = task_pid_nr(src_tsk);
		__entry->src_tgid = task_tgid_nr(src_tsk);
		__entry->src_ngid = task_numa_group_id(src_tsk);
		__entry->src_cpu = src_cpu;
		__entry->src_nid = cpu_to_node(src_cpu);
		__entry->dst_pid = dst_tsk ? task_pid_nr(dst_tsk) : 0;
		__entry->dst_tgid = dst_tsk ? task_tgid_nr(dst_tsk) : 0;
		__entry->dst_ngid = dst_tsk ? task_numa_group_id(dst_tsk) : 0;
		__entry->dst_cpu = dst_cpu;
		__entry->dst_nid = dst_cpu >= 0 ? cpu_to_node(dst_cpu) : -1;
	),

	TP_printk("src_pid=%d src_tgid=%d src_ngid=%d src_cpu=%d src_nid=%d dst_pid=%d dst_tgid=%d dst_ngid=%d dst_cpu=%d dst_nid=%d",
		  __entry->src_pid, __entry->src_tgid, __entry->src_ngid,
		  __entry->src_cpu, __entry->src_nid,
		  __entry->dst_pid, __entry->dst_tgid, __entry->dst_ngid,
		  __entry->dst_cpu, __entry->dst_nid)
);

DEFINE_EVENT(sched_numa_pair_template, sched_stick_numa,

	TP_PROTO(struct task_struct *src_tsk, int src_cpu,
		 struct task_struct *dst_tsk, int dst_cpu),

	TP_ARGS(src_tsk, src_cpu, dst_tsk, dst_cpu)
);

DEFINE_EVENT(sched_numa_pair_template, sched_swap_numa,

	TP_PROTO(struct task_struct *src_tsk, int src_cpu,
		 struct task_struct *dst_tsk, int dst_cpu),

	TP_ARGS(src_tsk, src_cpu, dst_tsk, dst_cpu)
);

#ifdef CONFIG_NUMA_BALANCING
#define NUMAB_SKIP_REASON					\
	EM( NUMAB_SKIP_UNSUITABLE,	"unsuitable" )		\
	EM( NUMAB_SKIP_SHARED_RO,	"shared_ro" )		\
	EM( NUMAB_SKIP_INACCESSIBLE,	"inaccessible" )	\
	EM( NUMAB_SKIP_SCAN_DELAY,	"scan_delay" )		\
	EM( NUMAB_SKIP_PID_INACTIVE,	"pid_inactive" )	\
	EM( NUMAB_SKIP_IGNORE_PID,	"ignore_pid_inactive" )	\
	EMe(NUMAB_SKIP_SEQ_COMPLETED,	"seq_completed" )

/* Redefine for export. */
#undef EM
#undef EMe
#define EM(a, b)	TRACE_DEFINE_ENUM(a);
#define EMe(a, b)	TRACE_DEFINE_ENUM(a);

NUMAB_SKIP_REASON

/* Redefine for symbolic printing. */
#undef EM
#undef EMe
#define EM(a, b)	{ a, b },
#define EMe(a, b)	{ a, b }

TRACE_EVENT(sched_skip_vma_numa,

	TP_PROTO(struct mm_struct *mm, struct vm_area_struct *vma,
		 enum numa_vmaskip_reason reason),

	TP_ARGS(mm, vma, reason),

	TP_STRUCT__entry(
		__field(unsigned long, numa_scan_offset)
		__field(unsigned long, vm_start)
		__field(unsigned long, vm_end)
		__field(enum numa_vmaskip_reason, reason)
	),

	TP_fast_assign(
		__entry->numa_scan_offset = mm->numa_scan_offset;
		__entry->vm_start = vma->vm_start;
		__entry->vm_end = vma->vm_end;
		__entry->reason = reason;
	),

	TP_printk("numa_scan_offset=%lX vm_start=%lX vm_end=%lX reason=%s",
		  __entry->numa_scan_offset,
		  __entry->vm_start,
		  __entry->vm_end,
		  __print_symbolic(__entry->reason, NUMAB_SKIP_REASON))
);
#endif /* CONFIG_NUMA_BALANCING */

/*
 * Tracepoint for waking a polling cpu without an IPI.
 */
TRACE_EVENT(sched_wake_idle_without_ipi,

	TP_PROTO(int cpu),

	TP_ARGS(cpu),

	TP_STRUCT__entry(
		__field( int, cpu )
	),

	TP_fast_assign(
		__entry->cpu = cpu;
	),

	TP_printk("cpu=%d", __entry->cpu)
);

/*
 * Following tracepoints are not exported in tracefs and provide hooking
 * mechanisms only for testing and debugging purposes.
 *
 * Postfixed with _tp to make them easily identifiable in the code.
 */
DECLARE_TRACE(pelt_cfs_tp,
	TP_PROTO(struct cfs_rq *cfs_rq),
	TP_ARGS(cfs_rq));

DECLARE_TRACE(pelt_rt_tp,
	TP_PROTO(struct rq *rq),
	TP_ARGS(rq));

DECLARE_TRACE(pelt_dl_tp,
	TP_PROTO(struct rq *rq),
	TP_ARGS(rq));

DECLARE_TRACE(pelt_thermal_tp,
	TP_PROTO(struct rq *rq),
	TP_ARGS(rq));

DECLARE_TRACE(pelt_irq_tp,
	TP_PROTO(struct rq *rq),
	TP_ARGS(rq));

DECLARE_TRACE(pelt_se_tp,
	TP_PROTO(struct sched_entity *se),
	TP_ARGS(se));

DECLARE_TRACE(sched_cpu_capacity_tp,
	TP_PROTO(struct rq *rq),
	TP_ARGS(rq));

DECLARE_TRACE(sched_overutilized_tp,
	TP_PROTO(struct root_domain *rd, bool overutilized),
	TP_ARGS(rd, overutilized));

DECLARE_TRACE(sched_util_est_cfs_tp,
	TP_PROTO(struct cfs_rq *cfs_rq),
	TP_ARGS(cfs_rq));

DECLARE_TRACE(sched_util_est_se_tp,
	TP_PROTO(struct sched_entity *se),
	TP_ARGS(se));

DECLARE_TRACE(sched_update_nr_running_tp,
	TP_PROTO(struct rq *rq, int change),
	TP_ARGS(rq, change));

DECLARE_TRACE(sched_compute_energy_tp,
	TP_PROTO(struct task_struct *p, int dst_cpu, unsigned long energy,
		 unsigned long max_util, unsigned long busy_time),
	TP_ARGS(p, dst_cpu, energy, max_util, busy_time));

#endif /* _TRACE_SCHED_H */

/* This part must be outside protection */
#include <trace/define_trace.h>