1
0
Fork 0
mirror of synced 2025-03-06 20:59:54 +01:00
linux/mm/debug.c
Lorenzo Stoakes b0d66d82fc mm/debug: introduce VM_WARN_ON_VMG() to dump VMA merge state
Patch series "mm/debug: introduce and use VM_WARN_ON_VMG()".

We use a number of asserts, enabled only when CONFIG_DEBUG_VM is set,
during VMA merge operations to ensure state is as expected.

However, when syzkaller or the like encounters these asserts, often the
information provided by the report is insufficient to narrow down what the
problem is.

We noticed this recently in [0], where a non-repro issue resisted
debugging due to simply not having sufficient information to go on.

This series improves the situation by providing VM_WARN_ON_VMG() which
acts like VM_WARN_ON() (i.e.  only actually being invoked if
CONFIG_DEBUG_VM is set), while dumping significant information about the
VMA merge state, the mm_struct describing the virtual address space, all
associated VMAs and, if CONFIG_DEBUG_VM_MAPLE_TREE is set, the associated
maple tree.

[0]:https://lore.kernel.org/all/6774c98f.050a0220.25abdd.0991.GAE@google.com/


This patch (of 2):

We use a number of asserts, enabled only when CONFIG_DEBUG_VM is set,
during VMA merge operations to ensure state is as expected.

However, when syzkaller or the like encounters these asserts, often the
information provided by the report is insufficient to narrow down what the
problem is.

This might not be so much of an issue if the reported problem is
reproducible, but if it is a rarely encountered race or some other case
which precludes a repro, it is a very big problem (see [0] for the
motivating case).

It is therefore sensible to provide a means by which we can easily and
conveniently dump a lot more information in these circumstances.

The aggregation of merge state into a single struct threaded through the
operation makes this trivial - we can simply introduce a variant on
VM_WARN_ON() which takes the VMA merge state object (vmg) and use that to
dump information.

This patch therefore introduces VM_WARN_ON_VMG() which provides this
functionality.

It additionally dumps full mm state, VMA state for each of the three VMAs
the vmg contains (prev, next, vma) and if CONFIG_DEBUG_VM_MAPLE_TREE is
enabled, dumps the maple tree from the provided VMA iterator if non-NULL.

This patch has no functional impact if CONFIG_DEBUG_VM is not set.

[0]:https://lore.kernel.org/all/6774c98f.050a0220.25abdd.0991.GAE@google.com/

Link: https://lkml.kernel.org/r/cover.1735932169.git.lorenzo.stoakes@oracle.com
Link: https://lkml.kernel.org/r/13b09b52d4d103ee86acaf0ae612539648ae29e0.1735932169.git.lorenzo.stoakes@oracle.com
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Jann Horn <jannh@google.com>
Cc: Liam R. Howlett <Liam.Howlett@Oracle.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-01-25 20:22:23 -08:00

376 lines
9.1 KiB
C

// SPDX-License-Identifier: GPL-2.0
/*
* mm/debug.c
*
* mm/ specific debug routines.
*
*/
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/trace_events.h>
#include <linux/memcontrol.h>
#include <trace/events/mmflags.h>
#include <linux/migrate.h>
#include <linux/page_owner.h>
#include <linux/ctype.h>
#include "internal.h"
#include <trace/events/migrate.h>
/*
* Define EM() and EMe() so that MIGRATE_REASON from trace/events/migrate.h can
* be used to populate migrate_reason_names[].
*/
#undef EM
#undef EMe
#define EM(a, b) b,
#define EMe(a, b) b
const char *migrate_reason_names[MR_TYPES] = {
MIGRATE_REASON
};
const struct trace_print_flags pageflag_names[] = {
__def_pageflag_names,
{0, NULL}
};
const struct trace_print_flags gfpflag_names[] = {
__def_gfpflag_names,
{0, NULL}
};
const struct trace_print_flags vmaflag_names[] = {
__def_vmaflag_names,
{0, NULL}
};
#define DEF_PAGETYPE_NAME(_name) [PGTY_##_name - 0xf0] = __stringify(_name)
static const char *page_type_names[] = {
DEF_PAGETYPE_NAME(slab),
DEF_PAGETYPE_NAME(hugetlb),
DEF_PAGETYPE_NAME(offline),
DEF_PAGETYPE_NAME(guard),
DEF_PAGETYPE_NAME(table),
DEF_PAGETYPE_NAME(buddy),
DEF_PAGETYPE_NAME(unaccepted),
};
static const char *page_type_name(unsigned int page_type)
{
unsigned i = (page_type >> 24) - 0xf0;
if (i >= ARRAY_SIZE(page_type_names))
return "unknown";
return page_type_names[i];
}
static void __dump_folio(struct folio *folio, struct page *page,
unsigned long pfn, unsigned long idx)
{
struct address_space *mapping = folio_mapping(folio);
int mapcount = atomic_read(&page->_mapcount);
char *type = "";
mapcount = page_mapcount_is_type(mapcount) ? 0 : mapcount + 1;
pr_warn("page: refcount:%d mapcount:%d mapping:%p index:%#lx pfn:%#lx\n",
folio_ref_count(folio), mapcount, mapping,
folio->index + idx, pfn);
if (folio_test_large(folio)) {
pr_warn("head: order:%u mapcount:%d entire_mapcount:%d nr_pages_mapped:%d pincount:%d\n",
folio_order(folio),
folio_mapcount(folio),
folio_entire_mapcount(folio),
folio_nr_pages_mapped(folio),
atomic_read(&folio->_pincount));
}
#ifdef CONFIG_MEMCG
if (folio->memcg_data)
pr_warn("memcg:%lx\n", folio->memcg_data);
#endif
if (folio_test_ksm(folio))
type = "ksm ";
else if (folio_test_anon(folio))
type = "anon ";
else if (mapping)
dump_mapping(mapping);
BUILD_BUG_ON(ARRAY_SIZE(pageflag_names) != __NR_PAGEFLAGS + 1);
/*
* Accessing the pageblock without the zone lock. It could change to
* "isolate" again in the meantime, but since we are just dumping the
* state for debugging, it should be fine to accept a bit of
* inaccuracy here due to racing.
*/
pr_warn("%sflags: %pGp%s\n", type, &folio->flags,
is_migrate_cma_folio(folio, pfn) ? " CMA" : "");
if (page_has_type(&folio->page))
pr_warn("page_type: %x(%s)\n", folio->page.page_type >> 24,
page_type_name(folio->page.page_type));
print_hex_dump(KERN_WARNING, "raw: ", DUMP_PREFIX_NONE, 32,
sizeof(unsigned long), page,
sizeof(struct page), false);
if (folio_test_large(folio))
print_hex_dump(KERN_WARNING, "head: ", DUMP_PREFIX_NONE, 32,
sizeof(unsigned long), folio,
2 * sizeof(struct page), false);
}
static void __dump_page(const struct page *page)
{
struct folio *foliop, folio;
struct page precise;
unsigned long head;
unsigned long pfn = page_to_pfn(page);
unsigned long idx, nr_pages = 1;
int loops = 5;
again:
memcpy(&precise, page, sizeof(*page));
head = precise.compound_head;
if ((head & 1) == 0) {
foliop = (struct folio *)&precise;
idx = 0;
if (!folio_test_large(foliop))
goto dump;
foliop = (struct folio *)page;
} else {
foliop = (struct folio *)(head - 1);
idx = folio_page_idx(foliop, page);
}
if (idx < MAX_FOLIO_NR_PAGES) {
memcpy(&folio, foliop, 2 * sizeof(struct page));
nr_pages = folio_nr_pages(&folio);
foliop = &folio;
}
if (idx > nr_pages) {
if (loops-- > 0)
goto again;
pr_warn("page does not match folio\n");
precise.compound_head &= ~1UL;
foliop = (struct folio *)&precise;
idx = 0;
}
dump:
__dump_folio(foliop, &precise, pfn, idx);
}
void dump_page(const struct page *page, const char *reason)
{
if (PagePoisoned(page))
pr_warn("page:%p is uninitialized and poisoned", page);
else
__dump_page(page);
if (reason)
pr_warn("page dumped because: %s\n", reason);
dump_page_owner(page);
}
EXPORT_SYMBOL(dump_page);
#ifdef CONFIG_DEBUG_VM
void dump_vma(const struct vm_area_struct *vma)
{
pr_emerg("vma %px start %px end %px mm %px\n"
"prot %lx anon_vma %px vm_ops %px\n"
"pgoff %lx file %px private_data %px\n"
"flags: %#lx(%pGv)\n",
vma, (void *)vma->vm_start, (void *)vma->vm_end, vma->vm_mm,
(unsigned long)pgprot_val(vma->vm_page_prot),
vma->anon_vma, vma->vm_ops, vma->vm_pgoff,
vma->vm_file, vma->vm_private_data,
vma->vm_flags, &vma->vm_flags);
}
EXPORT_SYMBOL(dump_vma);
void dump_mm(const struct mm_struct *mm)
{
pr_emerg("mm %px task_size %lu\n"
"mmap_base %lu mmap_legacy_base %lu\n"
"pgd %px mm_users %d mm_count %d pgtables_bytes %lu map_count %d\n"
"hiwater_rss %lx hiwater_vm %lx total_vm %lx locked_vm %lx\n"
"pinned_vm %llx data_vm %lx exec_vm %lx stack_vm %lx\n"
"start_code %lx end_code %lx start_data %lx end_data %lx\n"
"start_brk %lx brk %lx start_stack %lx\n"
"arg_start %lx arg_end %lx env_start %lx env_end %lx\n"
"binfmt %px flags %lx\n"
#ifdef CONFIG_AIO
"ioctx_table %px\n"
#endif
#ifdef CONFIG_MEMCG
"owner %px "
#endif
"exe_file %px\n"
#ifdef CONFIG_MMU_NOTIFIER
"notifier_subscriptions %px\n"
#endif
#ifdef CONFIG_NUMA_BALANCING
"numa_next_scan %lu numa_scan_offset %lu numa_scan_seq %d\n"
#endif
"tlb_flush_pending %d\n"
"def_flags: %#lx(%pGv)\n",
mm, mm->task_size,
mm->mmap_base, mm->mmap_legacy_base,
mm->pgd, atomic_read(&mm->mm_users),
atomic_read(&mm->mm_count),
mm_pgtables_bytes(mm),
mm->map_count,
mm->hiwater_rss, mm->hiwater_vm, mm->total_vm, mm->locked_vm,
(u64)atomic64_read(&mm->pinned_vm),
mm->data_vm, mm->exec_vm, mm->stack_vm,
mm->start_code, mm->end_code, mm->start_data, mm->end_data,
mm->start_brk, mm->brk, mm->start_stack,
mm->arg_start, mm->arg_end, mm->env_start, mm->env_end,
mm->binfmt, mm->flags,
#ifdef CONFIG_AIO
mm->ioctx_table,
#endif
#ifdef CONFIG_MEMCG
mm->owner,
#endif
mm->exe_file,
#ifdef CONFIG_MMU_NOTIFIER
mm->notifier_subscriptions,
#endif
#ifdef CONFIG_NUMA_BALANCING
mm->numa_next_scan, mm->numa_scan_offset, mm->numa_scan_seq,
#endif
atomic_read(&mm->tlb_flush_pending),
mm->def_flags, &mm->def_flags
);
}
EXPORT_SYMBOL(dump_mm);
void dump_vmg(const struct vma_merge_struct *vmg, const char *reason)
{
if (reason)
pr_warn("vmg %px dumped because: %s\n", vmg, reason);
if (!vmg) {
pr_warn("vmg %px state: (NULL)\n", vmg);
return;
}
pr_warn("vmg %px state: mm %px pgoff %lx\n"
"vmi %px [%lx,%lx)\n"
"prev %px next %px vma %px\n"
"start %lx end %lx flags %lx\n"
"file %px anon_vma %px policy %px\n"
"uffd_ctx %px\n"
"anon_name %px\n"
"merge_flags %x state %x\n",
vmg, vmg->mm, vmg->pgoff,
vmg->vmi, vmg->vmi ? vma_iter_addr(vmg->vmi) : 0,
vmg->vmi ? vma_iter_end(vmg->vmi) : 0,
vmg->prev, vmg->next, vmg->vma,
vmg->start, vmg->end, vmg->flags,
vmg->file, vmg->anon_vma, vmg->policy,
#ifdef CONFIG_USERFAULTFD
vmg->uffd_ctx.ctx,
#else
(void *)0,
#endif
vmg->anon_name,
(int)vmg->merge_flags, (int)vmg->state);
if (vmg->mm) {
pr_warn("vmg %px mm:\n", vmg);
dump_mm(vmg->mm);
} else {
pr_warn("vmg %px mm: (NULL)\n", vmg);
}
if (vmg->vma) {
pr_warn("vmg %px vma:\n", vmg);
dump_vma(vmg->vma);
} else {
pr_warn("vmg %px vma: (NULL)\n", vmg);
}
if (vmg->prev) {
pr_warn("vmg %px prev:\n", vmg);
dump_vma(vmg->prev);
} else {
pr_warn("vmg %px prev: (NULL)\n", vmg);
}
if (vmg->next) {
pr_warn("vmg %px next:\n", vmg);
dump_vma(vmg->next);
} else {
pr_warn("vmg %px next: (NULL)\n", vmg);
}
#ifdef CONFIG_DEBUG_VM_MAPLE_TREE
if (vmg->vmi) {
pr_warn("vmg %px vmi:\n", vmg);
vma_iter_dump_tree(vmg->vmi);
} else {
pr_warn("vmg %px vmi: (NULL)\n", vmg);
}
#endif
}
EXPORT_SYMBOL(dump_vmg);
static bool page_init_poisoning __read_mostly = true;
static int __init setup_vm_debug(char *str)
{
bool __page_init_poisoning = true;
/*
* Calling vm_debug with no arguments is equivalent to requesting
* to enable all debugging options we can control.
*/
if (*str++ != '=' || !*str)
goto out;
__page_init_poisoning = false;
if (*str == '-')
goto out;
while (*str) {
switch (tolower(*str)) {
case'p':
__page_init_poisoning = true;
break;
default:
pr_err("vm_debug option '%c' unknown. skipped\n",
*str);
}
str++;
}
out:
if (page_init_poisoning && !__page_init_poisoning)
pr_warn("Page struct poisoning disabled by kernel command line option 'vm_debug'\n");
page_init_poisoning = __page_init_poisoning;
return 1;
}
__setup("vm_debug", setup_vm_debug);
void page_init_poison(struct page *page, size_t size)
{
if (page_init_poisoning)
memset(page, PAGE_POISON_PATTERN, size);
}
void vma_iter_dump_tree(const struct vma_iterator *vmi)
{
#if defined(CONFIG_DEBUG_VM_MAPLE_TREE)
mas_dump(&vmi->mas);
mt_dump(vmi->mas.tree, mt_dump_hex);
#endif /* CONFIG_DEBUG_VM_MAPLE_TREE */
}
#endif /* CONFIG_DEBUG_VM */