When there're many lock contentions in the system, people sometimes want to know who caused the contention, IOW who's the owner of the locks. The -o/--lock-owner option tries to follow the lock owners for the contended mutexes and rwsems from BPF, and then attributes the contention time to the owner instead of the waiter. It's a best effort approach to get the owner info at the time of the contention and doesn't guarantee to have the precise tracking of owners if it's changing over time. Currently it only handles mutex and rwsem that have owner field in their struct and it basically points to a task_struct that owns the lock at the moment. Technically its type is atomic_long_t and it comes with some LSB bits used for other meanings. So it needs to clear them when casting it to a pointer to task_struct. Also the atomic_long_t is a typedef of the atomic 32 or 64 bit types depending on arch which is a wrapper struct for the counter value. I'm not aware of proper ways to access those kernel atomic types from BPF so I just read the internal counter value directly. Please let me know if there's a better way. When -o/--lock-owner option is used, it goes to the task aggregation mode like -t/--threads option does. However it cannot get the owner for other lock types like spinlock and sometimes even for mutex. $ sudo ./perf lock con -abo -- ./perf bench sched pipe # Running 'sched/pipe' benchmark: # Executed 1000000 pipe operations between two processes Total time: 4.766 [sec] 4.766540 usecs/op 209795 ops/sec contended total wait max wait avg wait pid owner 403 565.32 us 26.81 us 1.40 us -1 Unknown 4 27.99 us 8.57 us 7.00 us 1583145 sched-pipe 1 8.25 us 8.25 us 8.25 us 1583144 sched-pipe 1 2.03 us 2.03 us 2.03 us 5068 chrome As you can see, the owner is unknown in most cases. But if we filter only for the mutex locks, it'd more likely get the owners. 
$ sudo ./perf lock con -abo -Y mutex -- ./perf bench sched pipe # Running 'sched/pipe' benchmark: # Executed 1000000 pipe operations between two processes Total time: 4.910 [sec] 4.910435 usecs/op 203647 ops/sec contended total wait max wait avg wait pid owner 2 15.50 us 8.29 us 7.75 us 1582852 sched-pipe 7 7.20 us 2.47 us 1.03 us -1 Unknown 1 6.74 us 6.74 us 6.74 us 1582851 sched-pipe Signed-off-by: Namhyung Kim <namhyung@kernel.org> Cc: Adrian Hunter <adrian.hunter@intel.com> Cc: Boqun Feng <boqun.feng@gmail.com> Cc: Davidlohr Bueso <dave@stgolabs.net> Cc: Hao Luo <haoluo@google.com> Cc: Ian Rogers <irogers@google.com> Cc: Ingo Molnar <mingo@kernel.org> Cc: Jiri Olsa <jolsa@kernel.org> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Song Liu <song@kernel.org> Cc: Waiman Long <longman@redhat.com> Cc: Will Deacon <will@kernel.org> Cc: bpf@vger.kernel.org Link: https://lore.kernel.org/r/20230207002403.63590-3-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
343 lines
8.2 KiB
C
343 lines
8.2 KiB
C
// SPDX-License-Identifier: GPL-2.0
|
|
#include "util/debug.h"
|
|
#include "util/evlist.h"
|
|
#include "util/machine.h"
|
|
#include "util/map.h"
|
|
#include "util/symbol.h"
|
|
#include "util/target.h"
|
|
#include "util/thread.h"
|
|
#include "util/thread_map.h"
|
|
#include "util/lock-contention.h"
|
|
#include <linux/zalloc.h>
|
|
#include <linux/string.h>
|
|
#include <bpf/bpf.h>
|
|
|
|
#include "bpf_skel/lock_contention.skel.h"
|
|
#include "bpf_skel/lock_data.h"
|
|
|
|
/*
 * Handle to the lock-contention BPF skeleton shared by every function in
 * this file: assigned in lock_contention_prepare() and destroyed in
 * lock_contention_finish().
 */
static struct lock_contention_bpf *skel;
|
|
|
|
int lock_contention_prepare(struct lock_contention *con)
|
|
{
|
|
int i, fd;
|
|
int ncpus = 1, ntasks = 1, ntypes = 1, naddrs = 1;
|
|
struct evlist *evlist = con->evlist;
|
|
struct target *target = con->target;
|
|
|
|
skel = lock_contention_bpf__open();
|
|
if (!skel) {
|
|
pr_err("Failed to open lock-contention BPF skeleton\n");
|
|
return -1;
|
|
}
|
|
|
|
bpf_map__set_value_size(skel->maps.stacks, con->max_stack * sizeof(u64));
|
|
bpf_map__set_max_entries(skel->maps.lock_stat, con->map_nr_entries);
|
|
bpf_map__set_max_entries(skel->maps.tstamp, con->map_nr_entries);
|
|
|
|
if (con->aggr_mode == LOCK_AGGR_TASK)
|
|
bpf_map__set_max_entries(skel->maps.task_data, con->map_nr_entries);
|
|
else
|
|
bpf_map__set_max_entries(skel->maps.task_data, 1);
|
|
|
|
if (con->save_callstack)
|
|
bpf_map__set_max_entries(skel->maps.stacks, con->map_nr_entries);
|
|
else
|
|
bpf_map__set_max_entries(skel->maps.stacks, 1);
|
|
|
|
if (target__has_cpu(target))
|
|
ncpus = perf_cpu_map__nr(evlist->core.user_requested_cpus);
|
|
if (target__has_task(target))
|
|
ntasks = perf_thread_map__nr(evlist->core.threads);
|
|
if (con->filters->nr_types)
|
|
ntypes = con->filters->nr_types;
|
|
|
|
/* resolve lock name filters to addr */
|
|
if (con->filters->nr_syms) {
|
|
struct symbol *sym;
|
|
struct map *kmap;
|
|
unsigned long *addrs;
|
|
|
|
for (i = 0; i < con->filters->nr_syms; i++) {
|
|
sym = machine__find_kernel_symbol_by_name(con->machine,
|
|
con->filters->syms[i],
|
|
&kmap);
|
|
if (sym == NULL) {
|
|
pr_warning("ignore unknown symbol: %s\n",
|
|
con->filters->syms[i]);
|
|
continue;
|
|
}
|
|
|
|
addrs = realloc(con->filters->addrs,
|
|
(con->filters->nr_addrs + 1) * sizeof(*addrs));
|
|
if (addrs == NULL) {
|
|
pr_warning("memory allocation failure\n");
|
|
continue;
|
|
}
|
|
|
|
addrs[con->filters->nr_addrs++] = kmap->unmap_ip(kmap, sym->start);
|
|
con->filters->addrs = addrs;
|
|
}
|
|
naddrs = con->filters->nr_addrs;
|
|
}
|
|
|
|
bpf_map__set_max_entries(skel->maps.cpu_filter, ncpus);
|
|
bpf_map__set_max_entries(skel->maps.task_filter, ntasks);
|
|
bpf_map__set_max_entries(skel->maps.type_filter, ntypes);
|
|
bpf_map__set_max_entries(skel->maps.addr_filter, naddrs);
|
|
|
|
if (lock_contention_bpf__load(skel) < 0) {
|
|
pr_err("Failed to load lock-contention BPF skeleton\n");
|
|
return -1;
|
|
}
|
|
|
|
if (target__has_cpu(target)) {
|
|
u32 cpu;
|
|
u8 val = 1;
|
|
|
|
skel->bss->has_cpu = 1;
|
|
fd = bpf_map__fd(skel->maps.cpu_filter);
|
|
|
|
for (i = 0; i < ncpus; i++) {
|
|
cpu = perf_cpu_map__cpu(evlist->core.user_requested_cpus, i).cpu;
|
|
bpf_map_update_elem(fd, &cpu, &val, BPF_ANY);
|
|
}
|
|
}
|
|
|
|
if (target__has_task(target)) {
|
|
u32 pid;
|
|
u8 val = 1;
|
|
|
|
skel->bss->has_task = 1;
|
|
fd = bpf_map__fd(skel->maps.task_filter);
|
|
|
|
for (i = 0; i < ntasks; i++) {
|
|
pid = perf_thread_map__pid(evlist->core.threads, i);
|
|
bpf_map_update_elem(fd, &pid, &val, BPF_ANY);
|
|
}
|
|
}
|
|
|
|
if (target__none(target) && evlist->workload.pid > 0) {
|
|
u32 pid = evlist->workload.pid;
|
|
u8 val = 1;
|
|
|
|
skel->bss->has_task = 1;
|
|
fd = bpf_map__fd(skel->maps.task_filter);
|
|
bpf_map_update_elem(fd, &pid, &val, BPF_ANY);
|
|
}
|
|
|
|
if (con->filters->nr_types) {
|
|
u8 val = 1;
|
|
|
|
skel->bss->has_type = 1;
|
|
fd = bpf_map__fd(skel->maps.type_filter);
|
|
|
|
for (i = 0; i < con->filters->nr_types; i++)
|
|
bpf_map_update_elem(fd, &con->filters->types[i], &val, BPF_ANY);
|
|
}
|
|
|
|
if (con->filters->nr_addrs) {
|
|
u8 val = 1;
|
|
|
|
skel->bss->has_addr = 1;
|
|
fd = bpf_map__fd(skel->maps.addr_filter);
|
|
|
|
for (i = 0; i < con->filters->nr_addrs; i++)
|
|
bpf_map_update_elem(fd, &con->filters->addrs[i], &val, BPF_ANY);
|
|
}
|
|
|
|
/* these don't work well if in the rodata section */
|
|
skel->bss->stack_skip = con->stack_skip;
|
|
skel->bss->aggr_mode = con->aggr_mode;
|
|
skel->bss->needs_callstack = con->save_callstack;
|
|
skel->bss->lock_owner = con->owner;
|
|
|
|
lock_contention_bpf__attach(skel);
|
|
return 0;
|
|
}
|
|
|
|
int lock_contention_start(void)
|
|
{
|
|
skel->bss->enabled = 1;
|
|
return 0;
|
|
}
|
|
|
|
int lock_contention_stop(void)
|
|
{
|
|
skel->bss->enabled = 0;
|
|
return 0;
|
|
}
|
|
|
|
static const char *lock_contention_get_name(struct lock_contention *con,
|
|
struct contention_key *key,
|
|
u64 *stack_trace)
|
|
{
|
|
int idx = 0;
|
|
u64 addr;
|
|
const char *name = "";
|
|
static char name_buf[KSYM_NAME_LEN];
|
|
struct symbol *sym;
|
|
struct map *kmap;
|
|
struct machine *machine = con->machine;
|
|
|
|
if (con->aggr_mode == LOCK_AGGR_TASK) {
|
|
struct contention_task_data task;
|
|
int pid = key->pid;
|
|
int task_fd = bpf_map__fd(skel->maps.task_data);
|
|
|
|
/* do not update idle comm which contains CPU number */
|
|
if (pid) {
|
|
struct thread *t = __machine__findnew_thread(machine, /*pid=*/-1, pid);
|
|
|
|
if (t == NULL)
|
|
return name;
|
|
if (!bpf_map_lookup_elem(task_fd, &pid, &task) &&
|
|
thread__set_comm(t, task.comm, /*timestamp=*/0))
|
|
name = task.comm;
|
|
}
|
|
return name;
|
|
}
|
|
|
|
if (con->aggr_mode == LOCK_AGGR_ADDR) {
|
|
sym = machine__find_kernel_symbol(machine, key->lock_addr, &kmap);
|
|
if (sym)
|
|
name = sym->name;
|
|
return name;
|
|
}
|
|
|
|
/* LOCK_AGGR_CALLER: skip lock internal functions */
|
|
while (machine__is_lock_function(machine, stack_trace[idx]) &&
|
|
idx < con->max_stack - 1)
|
|
idx++;
|
|
|
|
addr = stack_trace[idx];
|
|
sym = machine__find_kernel_symbol(machine, addr, &kmap);
|
|
|
|
if (sym) {
|
|
unsigned long offset;
|
|
|
|
offset = kmap->map_ip(kmap, addr) - sym->start;
|
|
|
|
if (offset == 0)
|
|
return sym->name;
|
|
|
|
snprintf(name_buf, sizeof(name_buf), "%s+%#lx", sym->name, offset);
|
|
} else {
|
|
snprintf(name_buf, sizeof(name_buf), "%#lx", (unsigned long)addr);
|
|
}
|
|
|
|
return name_buf;
|
|
}
|
|
|
|
int lock_contention_read(struct lock_contention *con)
|
|
{
|
|
int fd, stack, err = 0;
|
|
struct contention_key *prev_key, key;
|
|
struct contention_data data = {};
|
|
struct lock_stat *st = NULL;
|
|
struct machine *machine = con->machine;
|
|
u64 *stack_trace;
|
|
size_t stack_size = con->max_stack * sizeof(*stack_trace);
|
|
|
|
fd = bpf_map__fd(skel->maps.lock_stat);
|
|
stack = bpf_map__fd(skel->maps.stacks);
|
|
|
|
con->lost = skel->bss->lost;
|
|
|
|
stack_trace = zalloc(stack_size);
|
|
if (stack_trace == NULL)
|
|
return -1;
|
|
|
|
if (con->aggr_mode == LOCK_AGGR_TASK) {
|
|
struct thread *idle = __machine__findnew_thread(machine,
|
|
/*pid=*/0,
|
|
/*tid=*/0);
|
|
thread__set_comm(idle, "swapper", /*timestamp=*/0);
|
|
}
|
|
|
|
/* make sure it loads the kernel map */
|
|
map__load(maps__first(machine->kmaps));
|
|
|
|
prev_key = NULL;
|
|
while (!bpf_map_get_next_key(fd, prev_key, &key)) {
|
|
s64 ls_key;
|
|
const char *name;
|
|
|
|
/* to handle errors in the loop body */
|
|
err = -1;
|
|
|
|
bpf_map_lookup_elem(fd, &key, &data);
|
|
if (con->save_callstack) {
|
|
bpf_map_lookup_elem(stack, &key.stack_id, stack_trace);
|
|
|
|
if (!match_callstack_filter(machine, stack_trace))
|
|
goto next;
|
|
}
|
|
|
|
switch (con->aggr_mode) {
|
|
case LOCK_AGGR_CALLER:
|
|
ls_key = key.stack_id;
|
|
break;
|
|
case LOCK_AGGR_TASK:
|
|
ls_key = key.pid;
|
|
break;
|
|
case LOCK_AGGR_ADDR:
|
|
ls_key = key.lock_addr;
|
|
break;
|
|
default:
|
|
goto next;
|
|
}
|
|
|
|
st = lock_stat_find(ls_key);
|
|
if (st != NULL) {
|
|
st->wait_time_total += data.total_time;
|
|
if (st->wait_time_max < data.max_time)
|
|
st->wait_time_max = data.max_time;
|
|
if (st->wait_time_min > data.min_time)
|
|
st->wait_time_min = data.min_time;
|
|
|
|
st->nr_contended += data.count;
|
|
if (st->nr_contended)
|
|
st->avg_wait_time = st->wait_time_total / st->nr_contended;
|
|
goto next;
|
|
}
|
|
|
|
name = lock_contention_get_name(con, &key, stack_trace);
|
|
st = lock_stat_findnew(ls_key, name, data.flags);
|
|
if (st == NULL)
|
|
break;
|
|
|
|
st->nr_contended = data.count;
|
|
st->wait_time_total = data.total_time;
|
|
st->wait_time_max = data.max_time;
|
|
st->wait_time_min = data.min_time;
|
|
|
|
if (data.count)
|
|
st->avg_wait_time = data.total_time / data.count;
|
|
|
|
if (con->save_callstack) {
|
|
st->callstack = memdup(stack_trace, stack_size);
|
|
if (st->callstack == NULL)
|
|
break;
|
|
}
|
|
|
|
next:
|
|
prev_key = &key;
|
|
|
|
/* we're fine now, reset the error */
|
|
err = 0;
|
|
}
|
|
|
|
free(stack_trace);
|
|
|
|
return err;
|
|
}
|
|
|
|
int lock_contention_finish(void)
|
|
{
|
|
if (skel) {
|
|
skel->bss->enabled = 0;
|
|
lock_contention_bpf__destroy(skel);
|
|
}
|
|
|
|
return 0;
|
|
}
|