diff --git a/Documentation/vm/slub.rst b/Documentation/vm/slub.rst index d3028554b1e9..43063ade737a 100644 --- a/Documentation/vm/slub.rst +++ b/Documentation/vm/slub.rst @@ -384,5 +384,69 @@ c) Execute ``slabinfo-gnuplot.sh`` in '-t' mode, passing all of the 40,60`` range will plot only samples collected between 40th and 60th seconds). + +DebugFS files for SLUB +====================== + +For more information about the current state of SLUB caches with the user tracking +debug option enabled, debugfs files are available, typically under +/sys/kernel/debug/slab/<cache>/ (created only for caches with enabled user +tracking). There are 2 types of these files with the following debug +information: + +1. alloc_traces:: + + Prints information about unique allocation traces of the currently + allocated objects. The output is sorted by frequency of each trace. + + Information in the output: + Number of objects, allocating function, minimal/average/maximal jiffies since alloc, + pid range of the allocating processes, cpu mask of allocating cpus, and stack trace. + + Example::: + + 1085 populate_error_injection_list+0x97/0x110 age=166678/166680/166682 pid=1 cpus=1:: + __slab_alloc+0x6d/0x90 + kmem_cache_alloc_trace+0x2eb/0x300 + populate_error_injection_list+0x97/0x110 + init_error_injection+0x1b/0x71 + do_one_initcall+0x5f/0x2d0 + kernel_init_freeable+0x26f/0x2d7 + kernel_init+0xe/0x118 + ret_from_fork+0x22/0x30 + + +2. free_traces:: + + Prints information about unique freeing traces of the currently allocated + objects. The freeing traces thus come from the previous life-cycle of the + objects and are reported as not available for objects allocated for the first + time. The output is sorted by frequency of each trace. + + Information in the output: + Number of objects, freeing function, minimal/average/maximal jiffies since free, + pid range of the freeing processes, cpu mask of freeing cpus, and stack trace. 
+ + Example::: + + 1980 age=4294912290 pid=0 cpus=0 + 51 acpi_ut_update_ref_count+0x6a6/0x782 age=236886/237027/237772 pid=1 cpus=1 + kfree+0x2db/0x420 + acpi_ut_update_ref_count+0x6a6/0x782 + acpi_ut_update_object_reference+0x1ad/0x234 + acpi_ut_remove_reference+0x7d/0x84 + acpi_rs_get_prt_method_data+0x97/0xd6 + acpi_get_irq_routing_table+0x82/0xc4 + acpi_pci_irq_find_prt_entry+0x8e/0x2e0 + acpi_pci_irq_lookup+0x3a/0x1e0 + acpi_pci_irq_enable+0x77/0x240 + pcibios_enable_device+0x39/0x40 + do_pci_enable_device.part.0+0x5d/0xe0 + pci_enable_device_flags+0xfc/0x120 + pci_enable_device+0x13/0x20 + virtio_pci_probe+0x9e/0x170 + local_pci_probe+0x48/0x80 + pci_device_probe+0x105/0x1c0 + Christoph Lameter, May 30, 2007 Sergey Senozhatsky, October 23, 2015 diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h index 33c5c0e3bd8d..f9c68a9dac04 100644 --- a/include/linux/slub_def.h +++ b/include/linux/slub_def.h @@ -105,7 +105,6 @@ struct kmem_cache { struct kmem_cache_order_objects oo; /* Allocation and freeing of slabs */ - struct kmem_cache_order_objects max; struct kmem_cache_order_objects min; gfp_t allocflags; /* gfp flags to use on each alloc */ int refcount; /* Refcount for slab cache destroy */ diff --git a/include/linux/stackdepot.h b/include/linux/stackdepot.h index 17f992fe6355..bc2797955de9 100644 --- a/include/linux/stackdepot.h +++ b/include/linux/stackdepot.h @@ -20,18 +20,36 @@ depot_stack_handle_t __stack_depot_save(unsigned long *entries, gfp_t gfp_flags, bool can_alloc); /* - * Every user of stack depot has to call this during its own init when it's - * decided that it will be calling stack_depot_save() later. + * Every user of stack depot has to call stack_depot_init() during its own init + * when it's decided that it will be calling stack_depot_save() later. This is + * recommended for e.g. modules initialized later in the boot process, when + * slab_is_available() is true. 
* * The alternative is to select STACKDEPOT_ALWAYS_INIT to have stack depot * enabled as part of mm_init(), for subsystems where it's known at compile time * that stack depot will be used. + * + * Another alternative is to call stack_depot_want_early_init(), when the + * decision to use stack depot is taken e.g. when evaluating kernel boot + * parameters, which precedes the enablement point in mm_init(). + * + * stack_depot_init() and stack_depot_want_early_init() can be called regardless + * of CONFIG_STACKDEPOT and are no-op when disabled. The actual save/fetch/print + * functions should only be called from code that makes sure CONFIG_STACKDEPOT + * is enabled. */ +#ifdef CONFIG_STACKDEPOT int stack_depot_init(void); -#ifdef CONFIG_STACKDEPOT_ALWAYS_INIT -static inline int stack_depot_early_init(void) { return stack_depot_init(); } +void __init stack_depot_want_early_init(void); + +/* This is supposed to be called only from mm_init() */ +int __init stack_depot_early_init(void); #else +static inline int stack_depot_init(void) { return 0; } + +static inline void stack_depot_want_early_init(void) { } + static inline int stack_depot_early_init(void) { return 0; } #endif diff --git a/init/Kconfig b/init/Kconfig index ddcbefe535e9..adc57f989d87 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1875,6 +1875,7 @@ config SLUB_DEBUG default y bool "Enable SLUB debugging support" if EXPERT depends on SLUB && SYSFS + select STACKDEPOT if STACKTRACE_SUPPORT help SLUB has extensive debug support features. Disabling these can result in significant savings in code size. This also disables diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 075cd25363ac..78d6139111cd 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -709,6 +709,7 @@ config DEBUG_SLAB config SLUB_DEBUG_ON bool "SLUB debugging on by default" depends on SLUB && SLUB_DEBUG + select STACKDEPOT_ALWAYS_INIT if STACKTRACE_SUPPORT default n help Boot with debugging on by default. 
SLUB boots by default with diff --git a/lib/stackdepot.c b/lib/stackdepot.c index bf5ba9af0500..5ca0d086ef4a 100644 --- a/lib/stackdepot.c +++ b/lib/stackdepot.c @@ -66,6 +66,9 @@ struct stack_record { unsigned long entries[]; /* Variable-sized array of entries. */ }; +static bool __stack_depot_want_early_init __initdata = IS_ENABLED(CONFIG_STACKDEPOT_ALWAYS_INIT); +static bool __stack_depot_early_init_passed __initdata; + static void *stack_slabs[STACK_ALLOC_MAX_SLABS]; static int depot_index; @@ -162,38 +165,58 @@ static int __init is_stack_depot_disabled(char *str) } early_param("stack_depot_disable", is_stack_depot_disabled); -/* - * __ref because of memblock_alloc(), which will not be actually called after - * the __init code is gone, because at that point slab_is_available() is true - */ -__ref int stack_depot_init(void) +void __init stack_depot_want_early_init(void) +{ + /* Too late to request early init now */ + WARN_ON(__stack_depot_early_init_passed); + + __stack_depot_want_early_init = true; +} + +int __init stack_depot_early_init(void) +{ + size_t size; + + /* This is supposed to be called only once, from mm_init() */ + if (WARN_ON(__stack_depot_early_init_passed)) + return 0; + + __stack_depot_early_init_passed = true; + + if (!__stack_depot_want_early_init || stack_depot_disable) + return 0; + + size = (STACK_HASH_SIZE * sizeof(struct stack_record *)); + pr_info("Stack Depot early init allocating hash table with memblock_alloc, %zu bytes\n", + size); + stack_table = memblock_alloc(size, SMP_CACHE_BYTES); + + if (!stack_table) { + pr_err("Stack Depot hash table allocation failed, disabling\n"); + stack_depot_disable = true; + return -ENOMEM; + } + + return 0; +} + +int stack_depot_init(void) { static DEFINE_MUTEX(stack_depot_init_mutex); + int ret = 0; mutex_lock(&stack_depot_init_mutex); if (!stack_depot_disable && !stack_table) { - size_t size = (STACK_HASH_SIZE * sizeof(struct stack_record *)); - int i; - - if (slab_is_available()) { - 
pr_info("Stack Depot allocating hash table with kvmalloc\n"); - stack_table = kvmalloc(size, GFP_KERNEL); - } else { - pr_info("Stack Depot allocating hash table with memblock_alloc\n"); - stack_table = memblock_alloc(size, SMP_CACHE_BYTES); - } - if (stack_table) { - for (i = 0; i < STACK_HASH_SIZE; i++) - stack_table[i] = NULL; - } else { + pr_info("Stack Depot allocating hash table with kvcalloc\n"); + stack_table = kvcalloc(STACK_HASH_SIZE, sizeof(struct stack_record *), GFP_KERNEL); + if (!stack_table) { pr_err("Stack Depot hash table allocation failed, disabling\n"); stack_depot_disable = true; - mutex_unlock(&stack_depot_init_mutex); - return -ENOMEM; + ret = -ENOMEM; } } mutex_unlock(&stack_depot_init_mutex); - return 0; + return ret; } EXPORT_SYMBOL_GPL(stack_depot_init); diff --git a/mm/page_owner.c b/mm/page_owner.c index fb3a05fdebdb..2743062e92c2 100644 --- a/mm/page_owner.c +++ b/mm/page_owner.c @@ -45,7 +45,12 @@ static void init_early_allocated_pages(void); static int __init early_page_owner_param(char *buf) { - return kstrtobool(buf, &page_owner_enabled); + int ret = kstrtobool(buf, &page_owner_enabled); + + if (page_owner_enabled) + stack_depot_want_early_init(); + + return ret; } early_param("page_owner", early_page_owner_param); @@ -83,8 +88,6 @@ static __init void init_page_owner(void) if (!page_owner_enabled) return; - stack_depot_init(); - register_dummy_stack(); register_failure_stack(); register_early_stack(); diff --git a/mm/slab_common.c b/mm/slab_common.c index 6ee64d6208b3..c4d63f2c78b8 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -24,6 +24,7 @@ #include #include #include +#include #define CREATE_TRACE_POINTS #include @@ -314,9 +315,13 @@ kmem_cache_create_usercopy(const char *name, * If no slub_debug was enabled globally, the static key is not yet * enabled by setup_slub_debug(). Enable it if the cache is being * created with any of the debugging flags passed explicitly. 
+ * It's also possible that this is the first cache created with + * SLAB_STORE_USER and we should init stack_depot for it. */ if (flags & SLAB_DEBUG_FLAGS) static_branch_enable(&slub_debug_enabled); + if (flags & SLAB_STORE_USER) + stack_depot_init(); #endif mutex_lock(&slab_mutex); @@ -849,6 +854,8 @@ new_kmalloc_cache(int idx, enum kmalloc_cache_type type, slab_flags_t flags) return; } flags |= SLAB_ACCOUNT; + } else if (IS_ENABLED(CONFIG_ZONE_DMA) && (type == KMALLOC_DMA)) { + flags |= SLAB_CACHE_DMA; } kmalloc_caches[type][idx] = create_kmalloc_cache( @@ -877,7 +884,7 @@ void __init create_kmalloc_caches(slab_flags_t flags) /* * Including KMALLOC_CGROUP if CONFIG_MEMCG_KMEM defined */ - for (type = KMALLOC_NORMAL; type <= KMALLOC_RECLAIM; type++) { + for (type = KMALLOC_NORMAL; type < NR_KMALLOC_TYPES; type++) { for (i = KMALLOC_SHIFT_LOW; i <= KMALLOC_SHIFT_HIGH; i++) { if (!kmalloc_caches[type][i]) new_kmalloc_cache(i, type, flags); @@ -898,20 +905,6 @@ void __init create_kmalloc_caches(slab_flags_t flags) /* Kmalloc array is now usable */ slab_state = UP; - -#ifdef CONFIG_ZONE_DMA - for (i = 0; i <= KMALLOC_SHIFT_HIGH; i++) { - struct kmem_cache *s = kmalloc_caches[KMALLOC_NORMAL][i]; - - if (s) { - kmalloc_caches[KMALLOC_DMA][i] = create_kmalloc_cache( - kmalloc_info[i].name[KMALLOC_DMA], - kmalloc_info[i].size, - SLAB_CACHE_DMA | flags, 0, - kmalloc_info[i].size); - } - } -#endif } #endif /* !CONFIG_SLOB */ diff --git a/mm/slub.c b/mm/slub.c index bfed9cfba36a..d8d5abf49f5f 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -26,6 +26,7 @@ #include #include #include +#include #include #include #include @@ -37,6 +38,7 @@ #include #include #include +#include #include #include @@ -264,8 +266,8 @@ static inline bool kmem_cache_has_cpu_partial(struct kmem_cache *s) #define TRACK_ADDRS_COUNT 16 struct track { unsigned long addr; /* Called from address */ -#ifdef CONFIG_STACKTRACE - unsigned long addrs[TRACK_ADDRS_COUNT]; /* Called from address */ +#ifdef 
CONFIG_STACKDEPOT + depot_stack_handle_t handle; #endif int cpu; /* Was running on cpu */ int pid; /* Pid context */ @@ -724,57 +726,51 @@ static struct track *get_track(struct kmem_cache *s, void *object, return kasan_reset_tag(p + alloc); } -static void set_track(struct kmem_cache *s, void *object, +static void noinline set_track(struct kmem_cache *s, void *object, enum track_item alloc, unsigned long addr) { struct track *p = get_track(s, object, alloc); - if (addr) { -#ifdef CONFIG_STACKTRACE - unsigned int nr_entries; +#ifdef CONFIG_STACKDEPOT + unsigned long entries[TRACK_ADDRS_COUNT]; + unsigned int nr_entries; - metadata_access_enable(); - nr_entries = stack_trace_save(kasan_reset_tag(p->addrs), - TRACK_ADDRS_COUNT, 3); - metadata_access_disable(); - - if (nr_entries < TRACK_ADDRS_COUNT) - p->addrs[nr_entries] = 0; + nr_entries = stack_trace_save(entries, ARRAY_SIZE(entries), 3); + p->handle = stack_depot_save(entries, nr_entries, GFP_NOWAIT); #endif - p->addr = addr; - p->cpu = smp_processor_id(); - p->pid = current->pid; - p->when = jiffies; - } else { - memset(p, 0, sizeof(struct track)); - } + + p->addr = addr; + p->cpu = smp_processor_id(); + p->pid = current->pid; + p->when = jiffies; } static void init_tracking(struct kmem_cache *s, void *object) { + struct track *p; + if (!(s->flags & SLAB_STORE_USER)) return; - set_track(s, object, TRACK_FREE, 0UL); - set_track(s, object, TRACK_ALLOC, 0UL); + p = get_track(s, object, TRACK_ALLOC); + memset(p, 0, 2*sizeof(struct track)); } static void print_track(const char *s, struct track *t, unsigned long pr_time) { + depot_stack_handle_t handle __maybe_unused; + if (!t->addr) return; pr_err("%s in %pS age=%lu cpu=%u pid=%d\n", s, (void *)t->addr, pr_time - t->when, t->cpu, t->pid); -#ifdef CONFIG_STACKTRACE - { - int i; - for (i = 0; i < TRACK_ADDRS_COUNT; i++) - if (t->addrs[i]) - pr_err("\t%pS\n", (void *)t->addrs[i]); - else - break; - } +#ifdef CONFIG_STACKDEPOT + handle = READ_ONCE(t->handle); + if (handle) 
+ stack_depot_print(handle); + else + pr_err("object allocation/free stack trace missing\n"); #endif } @@ -1532,6 +1528,8 @@ static int __init setup_slub_debug(char *str) global_slub_debug_changed = true; } else { slab_list_specified = true; + if (flags & SLAB_STORE_USER) + stack_depot_want_early_init(); } } @@ -1549,6 +1547,8 @@ static int __init setup_slub_debug(char *str) } out: slub_debug = global_flags; + if (slub_debug & SLAB_STORE_USER) + stack_depot_want_early_init(); if (slub_debug != 0 || slub_debug_string) static_branch_enable(&slub_debug_enabled); else @@ -4162,8 +4162,6 @@ static int calculate_sizes(struct kmem_cache *s) */ s->oo = oo_make(order, size); s->min = oo_make(get_order(size), size); - if (oo_objects(s->oo) > oo_objects(s->max)) - s->max = s->oo; return !!oo_objects(s->oo); } @@ -4341,18 +4339,26 @@ void kmem_obj_info(struct kmem_obj_info *kpp, void *object, struct slab *slab) objp = fixup_red_left(s, objp); trackp = get_track(s, objp, TRACK_ALLOC); kpp->kp_ret = (void *)trackp->addr; -#ifdef CONFIG_STACKTRACE - for (i = 0; i < KS_ADDRS_COUNT && i < TRACK_ADDRS_COUNT; i++) { - kpp->kp_stack[i] = (void *)trackp->addrs[i]; - if (!kpp->kp_stack[i]) - break; - } +#ifdef CONFIG_STACKDEPOT + { + depot_stack_handle_t handle; + unsigned long *entries; + unsigned int nr_entries; - trackp = get_track(s, objp, TRACK_FREE); - for (i = 0; i < KS_ADDRS_COUNT && i < TRACK_ADDRS_COUNT; i++) { - kpp->kp_free_stack[i] = (void *)trackp->addrs[i]; - if (!kpp->kp_free_stack[i]) - break; + handle = READ_ONCE(trackp->handle); + if (handle) { + nr_entries = stack_depot_fetch(handle, &entries); + for (i = 0; i < KS_ADDRS_COUNT && i < nr_entries; i++) + kpp->kp_stack[i] = (void *)entries[i]; + } + + trackp = get_track(s, objp, TRACK_FREE); + handle = READ_ONCE(trackp->handle); + if (handle) { + nr_entries = stack_depot_fetch(handle, &entries); + for (i = 0; i < KS_ADDRS_COUNT && i < nr_entries; i++) + kpp->kp_free_stack[i] = (void *)entries[i]; + } } #endif #endif @@ 
-5054,6 +5060,7 @@ EXPORT_SYMBOL(validate_slab_cache); */ struct location { + depot_stack_handle_t handle; unsigned long count; unsigned long addr; long long sum_time; @@ -5106,9 +5113,13 @@ static int add_location(struct loc_track *t, struct kmem_cache *s, { long start, end, pos; struct location *l; - unsigned long caddr; + unsigned long caddr, chandle; unsigned long age = jiffies - track->when; + depot_stack_handle_t handle = 0; +#ifdef CONFIG_STACKDEPOT + handle = READ_ONCE(track->handle); +#endif start = -1; end = t->count; @@ -5123,7 +5134,8 @@ static int add_location(struct loc_track *t, struct kmem_cache *s, break; caddr = t->loc[pos].addr; - if (track->addr == caddr) { + chandle = t->loc[pos].handle; + if ((track->addr == caddr) && (handle == chandle)) { l = &t->loc[pos]; l->count++; @@ -5148,6 +5160,8 @@ static int add_location(struct loc_track *t, struct kmem_cache *s, if (track->addr < caddr) end = pos; + else if (track->addr == caddr && handle < chandle) + end = pos; else start = pos; } @@ -5170,6 +5184,7 @@ static int add_location(struct loc_track *t, struct kmem_cache *s, l->max_time = age; l->min_pid = track->pid; l->max_pid = track->pid; + l->handle = handle; cpumask_clear(to_cpumask(l->cpus)); cpumask_set_cpu(track->cpu, to_cpumask(l->cpus)); nodes_clear(l->nodes); @@ -6079,6 +6094,21 @@ static int slab_debugfs_show(struct seq_file *seq, void *v) seq_printf(seq, " nodes=%*pbl", nodemask_pr_args(&l->nodes)); +#ifdef CONFIG_STACKDEPOT + { + depot_stack_handle_t handle; + unsigned long *entries; + unsigned int nr_entries, j; + + handle = READ_ONCE(l->handle); + if (handle) { + nr_entries = stack_depot_fetch(handle, &entries); + seq_puts(seq, "\n"); + for (j = 0; j < nr_entries; j++) + seq_printf(seq, " %pS\n", (void *)entries[j]); + } + } +#endif seq_puts(seq, "\n"); } @@ -6103,6 +6133,17 @@ static void *slab_debugfs_next(struct seq_file *seq, void *v, loff_t *ppos) return NULL; } +static int cmp_loc_by_count(const void *a, const void *b, const void 
*data) +{ + struct location *loc1 = (struct location *)a; + struct location *loc2 = (struct location *)b; + + if (loc1->count > loc2->count) + return -1; + else + return 1; +} + static void *slab_debugfs_start(struct seq_file *seq, loff_t *ppos) { struct loc_track *t = seq->private; @@ -6164,6 +6205,10 @@ static int slab_debug_trace_open(struct inode *inode, struct file *filep) spin_unlock_irqrestore(&n->list_lock, flags); } + /* Sort locations by count */ + sort_r(t->loc, t->count, sizeof(struct location), + cmp_loc_by_count, NULL, NULL); + bitmap_free(obj_map); return 0; }