Commit 91fc957c9b
("arm64/bpf: don't allocate BPF JIT programs in module memory") restricts BPF JIT program allocation to a 128MB region to ensure BPF programs are still in branching range of each other. However, this restriction should not apply to the aarch64 JIT, since BPF_JMP | BPF_CALL are implemented as a 64-bit move into a register and then a BLR instruction - which has the effect of being able to call anything without proximity limitation. The practical reason to relax this restriction on JIT memory is that 128MB of JIT memory can be quickly exhausted, especially where PAGE_SIZE is 64KB - one page is needed per program. In cases where seccomp filters are applied to multiple VMs on VM launch - such filters are classic BPF but converted to eBPF - this can severely limit the number of VMs that can be launched. In a world where we support BPF JIT always on, turning off the JIT isn't always an option either. Fixes: 91fc957c9b
("arm64/bpf: don't allocate BPF JIT programs in module memory") Suggested-by: Ard Biesheuvel <ard.biesheuvel@linaro.org> Signed-off-by: Russell King <russell.king@oracle.com> Signed-off-by: Daniel Borkmann <daniel@iogearbox.net> Tested-by: Alan Maguire <alan.maguire@oracle.com> Link: https://lore.kernel.org/bpf/1636131046-5982-2-git-send-email-alan.maguire@oracle.com
369 lines
12 KiB
C
369 lines
12 KiB
C
/* SPDX-License-Identifier: GPL-2.0-only */
|
|
/*
|
|
* Based on arch/arm/include/asm/memory.h
|
|
*
|
|
* Copyright (C) 2000-2002 Russell King
|
|
* Copyright (C) 2012 ARM Ltd.
|
|
*
|
|
* Note: this file should not be included by non-asm/.h files
|
|
*/
|
|
#ifndef __ASM_MEMORY_H
|
|
#define __ASM_MEMORY_H
|
|
|
|
#include <linux/const.h>
|
|
#include <linux/sizes.h>
|
|
#include <asm/page-def.h>
|
|
|
|
/*
|
|
* Size of the PCI I/O space. This must remain a power of two so that
|
|
* IO_SPACE_LIMIT acts as a mask for the low bits of I/O addresses.
|
|
*/
|
|
#define PCI_IO_SIZE SZ_16M
|
|
|
|
/*
|
|
* VMEMMAP_SIZE - allows the whole linear region to be covered by
|
|
* a struct page array
|
|
*
|
|
* If we are configured with a 52-bit kernel VA then our VMEMMAP_SIZE
|
|
* needs to cover the memory region from the beginning of the 52-bit
|
|
* PAGE_OFFSET all the way to PAGE_END for 48-bit. This allows us to
|
|
* keep a constant PAGE_OFFSET and "fallback" to using the higher end
|
|
* of the VMEMMAP where 52-bit support is not available in hardware.
|
|
*/
|
|
/*
 * Right-shift that scales a linear-map byte span down to the size of the
 * struct page array describing it (presumably STRUCT_PAGE_MAX_SHIFT is log2
 * of the largest struct page size - confirm against its definition).
 */
#define VMEMMAP_SHIFT (PAGE_SHIFT - STRUCT_PAGE_MAX_SHIFT)
|
|
/* Span from PAGE_OFFSET up to _PAGE_END(VA_BITS_MIN), scaled by VMEMMAP_SHIFT. */
#define VMEMMAP_SIZE ((_PAGE_END(VA_BITS_MIN) - PAGE_OFFSET) >> VMEMMAP_SHIFT)
|
|
|
|
/*
|
|
* PAGE_OFFSET - the virtual address of the start of the linear map, at the
|
|
* start of the TTBR1 address space.
|
|
* PAGE_END - the end of the linear map, where all other kernel mappings begin.
|
|
* KIMAGE_VADDR - the virtual address of the start of the kernel image.
|
|
* VA_BITS - the maximum number of bits for virtual addresses.
|
|
*/
|
|
/* Compile-time VA width, taken straight from Kconfig. */
#define VA_BITS (CONFIG_ARM64_VA_BITS)
|
|
/* -(1 << va): the lowest address of the top (negative) 2^va bytes of the address space. */
#define _PAGE_OFFSET(va) (-(UL(1) << (va)))
|
|
#define PAGE_OFFSET (_PAGE_OFFSET(VA_BITS))
|
|
/* The kernel image link address is the first byte after the module region. */
#define KIMAGE_VADDR (MODULES_END)
|
|
#define MODULES_END (MODULES_VADDR + MODULES_VSIZE)
|
|
/* The module region starts at _PAGE_END(VA_BITS_MIN) and is MODULES_VSIZE (128 MiB) long. */
#define MODULES_VADDR (_PAGE_END(VA_BITS_MIN))
|
|
#define MODULES_VSIZE (SZ_128M)
|
|
/* -(1 << (VA_BITS - VMEMMAP_SHIFT)); the struct page array spans VMEMMAP_SIZE bytes from here. */
#define VMEMMAP_START (-(UL(1) << (VA_BITS - VMEMMAP_SHIFT)))
|
|
#define VMEMMAP_END (VMEMMAP_START + VMEMMAP_SIZE)
|
|
/* PCI I/O window: ends 8 MiB below VMEMMAP_START and is PCI_IO_SIZE bytes long. */
#define PCI_IO_END (VMEMMAP_START - SZ_8M)
|
|
#define PCI_IO_START (PCI_IO_END - PCI_IO_SIZE)
|
|
/* 32 MiB below VMEMMAP_START. */
#define FIXADDR_TOP (VMEMMAP_START - SZ_32M)
|
|
|
|
/* VA_BITS_MIN is VA_BITS capped at 48. */
#if VA_BITS > 48
|
|
#define VA_BITS_MIN (48)
|
|
#else
|
|
#define VA_BITS_MIN (VA_BITS)
|
|
#endif
|
|
|
|
/*
 * -(1 << (va - 1)): the start of the top 2^(va - 1) bytes, i.e. the midpoint
 * of the range that begins at _PAGE_OFFSET(va).
 */
#define _PAGE_END(va) (-(UL(1) << ((va) - 1)))
|
|
|
|
/* Kernel image bounds, aliased to the _text and _end symbols (presumably linker-provided - confirm). */
#define KERNEL_START _text
|
|
#define KERNEL_END _end
|
|
|
|
/*
|
|
* Generic and tag-based KASAN require 1/8th and 1/16th of the kernel virtual
|
|
* address space for the shadow region respectively. They can bloat the stack
|
|
* significantly, so double the (minimum) stack size when they are in use.
|
|
*/
|
|
/*
 * With generic or software-tag KASAN, KASAN_SHADOW_END is
 * (1 << (64 - KASAN_SHADOW_SCALE_SHIFT)) + KASAN_SHADOW_OFFSET and PAGE_END is
 * derived from it at run time using vabits_actual. Otherwise PAGE_END is the
 * compile-time constant _PAGE_END(VA_BITS_MIN).
 * KASAN_THREAD_SHIFT is 1 in the KASAN case and 0 otherwise.
 */
#if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS)
|
|
#define KASAN_SHADOW_OFFSET _AC(CONFIG_KASAN_SHADOW_OFFSET, UL)
|
|
#define KASAN_SHADOW_END ((UL(1) << (64 - KASAN_SHADOW_SCALE_SHIFT)) \
|
|
+ KASAN_SHADOW_OFFSET)
|
|
#define PAGE_END (KASAN_SHADOW_END - (1UL << (vabits_actual - KASAN_SHADOW_SCALE_SHIFT)))
|
|
#define KASAN_THREAD_SHIFT 1
|
|
#else
|
|
#define KASAN_THREAD_SHIFT 0
|
|
#define PAGE_END (_PAGE_END(VA_BITS_MIN))
|
|
#endif /* CONFIG_KASAN_GENERIC || CONFIG_KASAN_SW_TAGS */
|
|
|
|
/* log2 of the minimum stack size: 14 (16 KiB), or 15 (32 KiB) when KASAN_THREAD_SHIFT is 1. */
#define MIN_THREAD_SHIFT (14 + KASAN_THREAD_SHIFT)
|
|
|
|
/*
|
|
* VMAP'd stacks are allocated at page granularity, so we must ensure that such
|
|
* stacks are a multiple of page size.
|
|
*/
|
|
/* With VMAP stacks, raise THREAD_SHIFT to PAGE_SHIFT if the minimum is below one page. */
#if defined(CONFIG_VMAP_STACK) && (MIN_THREAD_SHIFT < PAGE_SHIFT)
|
|
#define THREAD_SHIFT PAGE_SHIFT
|
|
#else
|
|
#define THREAD_SHIFT MIN_THREAD_SHIFT
|
|
#endif
|
|
|
|
/* THREAD_SIZE_ORDER is defined only when a stack is at least one page. */
#if THREAD_SHIFT >= PAGE_SHIFT
|
|
#define THREAD_SIZE_ORDER (THREAD_SHIFT - PAGE_SHIFT)
|
|
#endif
|
|
|
|
/* Stack size in bytes. */
#define THREAD_SIZE (UL(1) << THREAD_SHIFT)
|
|
|
|
/*
|
|
* By aligning VMAP'd stacks to 2 * THREAD_SIZE, we can detect overflow by
|
|
* checking sp & (1 << THREAD_SHIFT), which we can do cheaply in the entry
|
|
* assembly.
|
|
*/
|
|
/* Stack alignment: twice THREAD_SIZE with VMAP stacks, THREAD_SIZE otherwise. */
#ifdef CONFIG_VMAP_STACK
|
|
#define THREAD_ALIGN (2 * THREAD_SIZE)
|
|
#else
|
|
#define THREAD_ALIGN THREAD_SIZE
|
|
#endif
|
|
|
|
/* IRQ stacks are the same size as thread stacks. */
#define IRQ_STACK_SIZE THREAD_SIZE
|
|
|
|
/* The overflow stack is fixed at 4 KiB, independent of THREAD_SIZE. */
#define OVERFLOW_STACK_SIZE SZ_4K
|
|
|
|
/*
|
|
* Alignment of kernel segments (e.g. .text, .data).
|
|
*
|
|
* 4 KB granule: 16 level 3 entries, with contiguous bit
|
|
* 16 KB granule: 4 level 3 entries, without contiguous bit
|
|
* 64 KB granule: 1 level 3 entry
|
|
*/
|
|
#define SEGMENT_ALIGN SZ_64K
|
|
|
|
/*
|
|
* Memory types available.
|
|
*
|
|
* IMPORTANT: MT_NORMAL must be index 0 since vm_get_page_prot() may 'or' in
|
|
* the MT_NORMAL_TAGGED memory type for PROT_MTE mappings. Note
|
|
* that protection_map[] only contains MT_NORMAL attributes.
|
|
*/
|
|
#define MT_NORMAL 0
|
|
#define MT_NORMAL_TAGGED 1
|
|
#define MT_NORMAL_NC 2
|
|
#define MT_DEVICE_nGnRnE 3
|
|
#define MT_DEVICE_nGnRE 4
|
|
|
|
/*
|
|
* Memory types for Stage-2 translation
|
|
*/
|
|
#define MT_S2_NORMAL 0xf
|
|
#define MT_S2_DEVICE_nGnRE 0x1
|
|
|
|
/*
|
|
* Memory types for Stage-2 translation when ID_AA64MMFR2_EL1.FWB is 0001
|
|
* Stage-2 enforces Normal-WB and Device-nGnRE
|
|
*/
|
|
#define MT_S2_FWB_NORMAL 6
|
|
#define MT_S2_FWB_DEVICE_nGnRE 1
|
|
|
|
/* IOREMAP_MAX_ORDER is PUD_SHIFT with 4K pages and PMD_SHIFT for every other page size. */
#ifdef CONFIG_ARM64_4K_PAGES
|
|
#define IOREMAP_MAX_ORDER (PUD_SHIFT)
|
|
#else
|
|
#define IOREMAP_MAX_ORDER (PMD_SHIFT)
|
|
#endif
|
|
|
|
/*
|
|
* Open-coded (swapper_pg_dir - reserved_pg_dir) as this cannot be calculated
|
|
* until link time.
|
|
*/
|
|
#define RESERVED_SWAPPER_OFFSET (PAGE_SIZE)
|
|
|
|
/*
|
|
* Open-coded (swapper_pg_dir - tramp_pg_dir) as this cannot be calculated
|
|
* until link time.
|
|
*/
|
|
#define TRAMP_SWAPPER_OFFSET (2 * PAGE_SIZE)
|
|
|
|
#ifndef __ASSEMBLY__
|
|
|
|
#include <linux/bitops.h>
|
|
#include <linux/compiler.h>
|
|
#include <linux/mmdebug.h>
|
|
#include <linux/types.h>
|
|
#include <asm/bug.h>
|
|
|
|
/* VA width in use at run time; read by the KASAN variant of PAGE_END. */
extern u64 vabits_actual;
|
|
|
|
/* Backing variable for PHYS_OFFSET; signed 64-bit. */
extern s64 memstart_addr;
|
|
/*
 * PHYS_OFFSET - the physical address of the start of memory.
 * Every use evaluates VM_BUG_ON(memstart_addr & 1) first.
 * NOTE(review): presumably bit 0 flags a not-yet-initialised value - confirm.
 */
|
|
#define PHYS_OFFSET ({ VM_BUG_ON(memstart_addr & 1); memstart_addr; })
|
|
|
|
/* the virtual base of the kernel image */
|
|
extern u64 kimage_vaddr;
|
|
|
|
/* the offset between the kernel virtual and physical mappings */
|
|
extern u64 kimage_voffset;
|
|
|
|
static inline unsigned long kaslr_offset(void)
|
|
{
|
|
return kimage_vaddr - KIMAGE_VADDR;
|
|
}
|
|
|
|
/*
|
|
* Allow all memory at the discovery stage. We will clip it later.
|
|
*/
|
|
#define MIN_MEMBLOCK_ADDR 0
|
|
#define MAX_MEMBLOCK_ADDR U64_MAX
|
|
|
|
/*
|
|
* PFNs are used to describe any physical page; this means
|
|
* PFN 0 == physical address 0.
|
|
*
|
|
* This is the PFN of the first RAM page in the kernel
|
|
* direct-mapped view. We assume this is the first page
|
|
* of RAM in the mem_map as well.
|
|
*/
|
|
#define PHYS_PFN_OFFSET (PHYS_OFFSET >> PAGE_SHIFT)
|
|
|
|
/*
|
|
* When dealing with data aborts, watchpoints, or instruction traps we may end
|
|
* up with a tagged userland pointer. Clear the tag to get a sane pointer to
|
|
* pass on to access_ok(), for instance.
|
|
*/
|
|
/*
 * __untagged_addr(addr): sign-extend addr from bit 55 (assuming
 * sign_extend64() treats its second argument as the sign-bit index), so bits
 * 63:56 become copies of bit 55. The result keeps the type of addr.
 */
#define __untagged_addr(addr) \
|
|
((__force __typeof__(addr))sign_extend64((__force u64)(addr), 55))
|
|
|
|
/*
 * untagged_addr(addr): AND the original value with its sign-extended form.
 * When bit 55 is clear the top byte is zeroed; when bit 55 is set the
 * sign-extended value has an all-ones top byte, so the original top byte
 * is kept. addr is evaluated once; the result keeps the type of addr.
 */
#define untagged_addr(addr) ({ \
|
|
u64 __addr = (__force u64)(addr); \
|
|
__addr &= __untagged_addr(__addr); \
|
|
(__force __typeof__(addr))__addr; \
|
|
})
|
|
|
|
/*
 * Pointer-tag helpers. The tag occupies bits 63:56 of an address.
 *
 *  __tag_shifted(tag) - the tag value moved into bits 63:56
 *  __tag_reset(addr)  - addr with the tag byte replaced by sign-extension
 *  __tag_get(addr)    - the tag byte of addr
 *
 * Without software- or hardware-tag KASAN these collapse to 0, the unchanged
 * address and 0 respectively.
 */
#if defined(CONFIG_KASAN_SW_TAGS) || defined(CONFIG_KASAN_HW_TAGS)
#define __tag_shifted(tag)	((u64)(tag) << 56)
#define __tag_reset(addr)	__untagged_addr(addr)
/* Fully parenthesized so the cast cannot bind to surrounding operators. */
#define __tag_get(addr)		((__u8)((u64)(addr) >> 56))
#else
#define __tag_shifted(tag)	0UL
#define __tag_reset(addr)	(addr)
#define __tag_get(addr)		0
#endif /* CONFIG_KASAN_SW_TAGS || CONFIG_KASAN_HW_TAGS */
|
|
|
|
/*
 * __tag_set - return @addr with its tag field replaced by @tag.
 *
 * The bits selected by __tag_shifted(0xff) are cleared and __tag_shifted(tag)
 * is OR-ed in. When __tag_shifted() expands to 0 the address is returned
 * unchanged.
 */
static inline const void *__tag_set(const void *addr, u8 tag)
{
	const u64 tag_mask = __tag_shifted(0xff);
	u64 untagged = (u64)addr & ~tag_mask;

	return (const void *)(untagged | __tag_shifted(tag));
}
|
|
|
|
/*
 * With CONFIG_KASAN_HW_TAGS, each arch_* tagging hook below is an alias for
 * the mte_* helper of the matching name; arguments are forwarded unchanged.
 * None of these macros is defined when the option is off.
 */
#ifdef CONFIG_KASAN_HW_TAGS
|
|
#define arch_enable_tagging_sync() mte_enable_kernel_sync()
|
|
#define arch_enable_tagging_async() mte_enable_kernel_async()
|
|
#define arch_enable_tagging_asymm() mte_enable_kernel_asymm()
|
|
#define arch_force_async_tag_fault() mte_check_tfsr_exit()
|
|
#define arch_get_random_tag() mte_get_random_tag()
|
|
#define arch_get_mem_tag(addr) mte_get_mem_tag(addr)
|
|
#define arch_set_mem_tag_range(addr, size, tag, init) \
|
|
mte_set_mem_tag_range((addr), (size), (tag), (init))
|
|
#endif /* CONFIG_KASAN_HW_TAGS */
|
|
|
|
/*
|
|
* Physical vs virtual RAM address space conversion. These are
|
|
* private definitions which should NOT be used outside memory.h
|
|
* files. Use virt_to_phys/phys_to_virt/__pa/__va instead.
|
|
*/
|
|
|
|
|
|
/*
|
|
* Check whether an arbitrary address is within the linear map, which
|
|
* lives in the [PAGE_OFFSET, PAGE_END) interval at the bottom of the
|
|
* kernel's TTBR1 address range.
|
|
*/
|
|
/*
 * One unsigned comparison checks both bounds: an address below PAGE_OFFSET
 * wraps to a large value after the subtraction and fails the test.
 */
#define __is_lm_address(addr) (((u64)(addr) - PAGE_OFFSET) < (PAGE_END - PAGE_OFFSET))
|
|
|
|
/* Linear-map address -> physical: rebase from PAGE_OFFSET to PHYS_OFFSET. */
#define __lm_to_phys(addr) (((addr) - PAGE_OFFSET) + PHYS_OFFSET)
|
|
/* Kernel-image address -> physical: subtract kimage_voffset. */
#define __kimg_to_phys(addr) ((addr) - kimage_voffset)
|
|
|
|
/*
 * Strip the tag from x, then translate it as a linear-map address if it lies
 * in [PAGE_OFFSET, PAGE_END), otherwise as a kernel-image address.
 * x is evaluated once.
 */
#define __virt_to_phys_nodebug(x) ({ \
|
|
phys_addr_t __x = (phys_addr_t)(__tag_reset(x)); \
|
|
__is_lm_address(__x) ? __lm_to_phys(__x) : __kimg_to_phys(__x); \
|
|
})
|
|
|
|
/* Always uses the kernel-image translation; no linear-map check and no tag reset. */
#define __pa_symbol_nodebug(x) __kimg_to_phys((phys_addr_t)(x))
|
|
|
|
/*
 * With CONFIG_DEBUG_VIRTUAL the translations are out-of-line functions
 * declared here; otherwise they are aliases for the *_nodebug macros.
 */
#ifdef CONFIG_DEBUG_VIRTUAL
|
|
extern phys_addr_t __virt_to_phys(unsigned long x);
|
|
extern phys_addr_t __phys_addr_symbol(unsigned long x);
|
|
#else
|
|
#define __virt_to_phys(x) __virt_to_phys_nodebug(x)
|
|
#define __phys_addr_symbol(x) __pa_symbol_nodebug(x)
|
|
#endif /* CONFIG_DEBUG_VIRTUAL */
|
|
|
|
/*
 * Physical -> linear-map address. PAGE_OFFSET is OR-ed in rather than added.
 * NOTE(review): this matches addition only while (x - PHYS_OFFSET) has no
 * bits in common with PAGE_OFFSET - confirm callers stay in that range.
 */
#define __phys_to_virt(x) ((unsigned long)((x) - PHYS_OFFSET) | PAGE_OFFSET)
|
|
/* Physical -> kernel-image address: add kimage_voffset. */
#define __phys_to_kimg(x) ((unsigned long)((x) + kimage_voffset))
|
|
|
|
/*
|
|
* Convert a page to/from a physical address
|
|
*/
|
|
#define page_to_phys(page) (__pfn_to_phys(page_to_pfn(page)))
|
|
#define phys_to_page(phys) (pfn_to_page(__phys_to_pfn(phys)))
|
|
|
|
/*
|
|
* Note: Drivers should NOT use these. They are the wrong
|
|
* translation for translating DMA addresses. Use the driver
|
|
* DMA support - see dma-mapping.h.
|
|
*/
|
|
#define virt_to_phys virt_to_phys
|
|
static inline phys_addr_t virt_to_phys(const volatile void *x)
|
|
{
|
|
return __virt_to_phys((unsigned long)(x));
|
|
}
|
|
|
|
#define phys_to_virt phys_to_virt
|
|
static inline void *phys_to_virt(phys_addr_t x)
|
|
{
|
|
return (void *)(__phys_to_virt(x));
|
|
}
|
|
|
|
/*
|
|
* Drivers should NOT use these either.
|
|
*/
|
|
/*
 * Address/PFN conversion shorthands. Each one casts its argument and
 * forwards to the __virt_to_phys / __phys_to_virt family.
 *
 *  __pa(x)           - virtual address -> physical address
 *  __pa_symbol(x)    - address of a kernel symbol -> physical address
 *  __pa_nodebug(x)   - __pa() using the _nodebug translation directly
 *  __va(x)           - physical address -> void * in the linear map
 *  pfn_to_kaddr(pfn) - page frame number -> void * in the linear map
 *  virt_to_pfn(x)    - virtual address -> page frame number
 *  sym_to_pfn(x)     - address of a kernel symbol -> page frame number
 */
#define __pa(x)			__virt_to_phys((unsigned long)(x))
#define __pa_symbol(x)		__phys_addr_symbol(RELOC_HIDE((unsigned long)(x), 0))
#define __pa_nodebug(x)		__virt_to_phys_nodebug((unsigned long)(x))
#define __va(x)			((void *)__phys_to_virt((phys_addr_t)(x)))
/*
 * Widen the PFN to phys_addr_t before shifting so that a PFN held in a
 * 32-bit type cannot lose its high bits to the shift.
 */
#define pfn_to_kaddr(pfn)	__va((phys_addr_t)(pfn) << PAGE_SHIFT)
#define virt_to_pfn(x)		__phys_to_pfn(__virt_to_phys((unsigned long)(x)))
#define sym_to_pfn(x)		__phys_to_pfn(__pa_symbol(x))
|
|
|
|
/*
|
|
* virt_to_page(x) convert a _valid_ virtual address to struct page *
|
|
* virt_addr_valid(x) indicates whether a virtual address is valid
|
|
*/
|
|
/*
 * PHYS_PFN_OFFSET as an unsigned long.
 * NOTE(review): presumably consumed by <asm-generic/memory_model.h>, which is
 * included at the end of this file - confirm.
 */
#define ARCH_PFN_OFFSET ((unsigned long)PHYS_PFN_OFFSET)
|
|
|
|
/*
 * page_to_virt(x) - struct page pointer -> linear-map virtual address, with
 *                   the page's KASAN tag applied via __tag_set()
 * virt_to_page(x) - linear-map virtual address -> struct page pointer
 *
 * With CONFIG_DEBUG_VIRTUAL both go through the physical address / PFN
 * helpers. Otherwise they are computed arithmetically: the page index is the
 * offset from VMEMMAP_START in units of sizeof(struct page) on one side and
 * the offset from PAGE_OFFSET in units of PAGE_SIZE on the other.
 *
 * The macro argument is parenthesized wherever it is expanded, so an
 * expression such as "ptr + n" is converted as a whole.
 */
#if defined(CONFIG_DEBUG_VIRTUAL)
#define page_to_virt(x)	({						\
	__typeof__(x) __page = (x);					\
	void *__addr = __va(page_to_phys(__page));			\
	(void *)__tag_set((const void *)__addr, page_kasan_tag(__page));\
})
#define virt_to_page(x)		pfn_to_page(virt_to_pfn(x))
#else
#define page_to_virt(x)	({						\
	__typeof__(x) __page = (x);					\
	u64 __idx = ((u64)__page - VMEMMAP_START) / sizeof(struct page);\
	u64 __addr = PAGE_OFFSET + (__idx * PAGE_SIZE);			\
	(void *)__tag_set((const void *)__addr, page_kasan_tag(__page));\
})

#define virt_to_page(x)	({						\
	u64 __idx = (__tag_reset((u64)(x)) - PAGE_OFFSET) / PAGE_SIZE;	\
	u64 __addr = VMEMMAP_START + (__idx * sizeof(struct page));	\
	(struct page *)__addr;						\
})
#endif /* CONFIG_DEBUG_VIRTUAL */
|
|
|
|
/*
 * True when addr, with its tag reset, lies in the linear map AND its PFN
 * passes pfn_is_map_memory(). The && short-circuits, so the PFN lookup is
 * skipped for addresses outside the linear map. addr is evaluated once.
 */
#define virt_addr_valid(addr) ({ \
|
|
__typeof__(addr) __addr = __tag_reset(addr); \
|
|
__is_lm_address(__addr) && pfn_is_map_memory(virt_to_pfn(__addr)); \
|
|
})
|
|
|
|
/* Declaration only; the definition lives outside this header. */
void dump_mem_limit(void);
|
|
#endif /* !__ASSEMBLY__ */
|
|
|
|
/*
|
|
* Given that the GIC architecture permits ITS implementations that can only be
|
|
* configured with a LPI table address once, GICv3 systems with many CPUs may
|
|
* end up reserving a lot of different regions after a kexec for their LPI
|
|
* tables (one per CPU), as we are forced to reuse the same memory after kexec
|
|
* (and thus reserve it persistently with EFI beforehand)
|
|
*/
|
|
#if defined(CONFIG_EFI) && defined(CONFIG_ARM_GIC_V3_ITS)
|
|
# define INIT_MEMBLOCK_RESERVED_REGIONS (INIT_MEMBLOCK_REGIONS + NR_CPUS + 1)
|
|
#endif
|
|
|
|
#include <asm-generic/memory_model.h>
|
|
|
|
#endif /* __ASM_MEMORY_H */
|