- Rework kfence support for the HPT MMU to work on systems with >= 16TB of RAM. - Remove the powerpc "maple" platform, used by the "Yellow Dog Powerstation". - Add support for DYNAMIC_FTRACE_WITH_CALL_OPS, DYNAMIC_FTRACE_WITH_DIRECT_CALLS & BPF Trampolines. - Add support for running KVM nested guests on Power11. - Other small features, cleanups and fixes. Thanks to: Amit Machhiwal, Arnd Bergmann, Christophe Leroy, Costa Shulyupin, David Hunter, David Wang, Disha Goel, Gautam Menghani, Geert Uytterhoeven, Hari Bathini, Julia Lawall, Kajol Jain, Keith Packard, Lukas Bulwahn, Madhavan Srinivasan, Markus Elfring, Michal Suchanek, Ming Lei, Mukesh Kumar Chaurasiya, Nathan Chancellor, Naveen N Rao, Nicholas Piggin, Nysal Jan K.A, Paulo Miguel Almeida, Pavithra Prakash, Ritesh Harjani (IBM), Rob Herring (Arm), Sachin P Bappalige, Shen Lichuan, Simon Horman, Sourabh Jain, Thomas Weißschuh, Thorsten Blum, Thorsten Leemhuis, Venkat Rao Bagalkote, Zhang Zekun, zhang jiao. -----BEGIN PGP SIGNATURE----- iHUEABYKAB0WIQRjvi15rv0TSTaE+SIF0oADX8seIQUCZ0Fi5AAKCRAF0oADX8se IeI0AQCAkNWRYzGNzPM6aMwDpq5qdeZzvp0rZxuNsRSnIKJlxAD+PAOxOietgjbQ Lxt3oizg+UcH/304Y/iyT8IrwI4n+gE= =xNtu -----END PGP SIGNATURE----- Merge tag 'powerpc-6.13-1' of git://git.kernel.org/pub/scm/linux/kernel/git/powerpc/linux Pull powerpc updates from Michael Ellerman: - Rework kfence support for the HPT MMU to work on systems with >= 16TB of RAM. - Remove the powerpc "maple" platform, used by the "Yellow Dog Powerstation". - Add support for DYNAMIC_FTRACE_WITH_CALL_OPS, DYNAMIC_FTRACE_WITH_DIRECT_CALLS & BPF Trampolines. - Add support for running KVM nested guests on Power11. - Other small features, cleanups and fixes. Thanks to Amit Machhiwal, Arnd Bergmann, Christophe Leroy, Costa Shulyupin, David Hunter, David Wang, Disha Goel, Gautam Menghani, Geert Uytterhoeven, Hari Bathini, Julia Lawall, Kajol Jain, Keith Packard, Lukas Bulwahn, Madhavan Srinivasan, Markus Elfring, Michal Suchanek, Ming Lei, Mukesh Kumar Chaurasiya, Nathan Chancellor, Naveen N Rao, Nicholas Piggin, Nysal Jan K.A, Paulo Miguel Almeida, Pavithra Prakash, Ritesh Harjani (IBM), Rob Herring (Arm), Sachin P Bappalige, Shen Lichuan, Simon Horman, Sourabh Jain, Thomas Weißschuh, Thorsten Blum, Thorsten Leemhuis, Venkat Rao Bagalkote, Zhang Zekun, and zhang jiao. * tag 'powerpc-6.13-1' of git://git.kernel.org/pub/scm/linux/kernel/git/powerpc/linux: (89 commits) EDAC/powerpc: Remove PPC_MAPLE drivers powerpc/perf: Add per-task/process monitoring to vpa_pmu driver powerpc/kvm: Add vpa latency counters to kvm_vcpu_arch docs: ABI: sysfs-bus-event_source-devices-vpa-pmu: Document sysfs event format entries for vpa_pmu powerpc/perf: Add perf interface to expose vpa counters MAINTAINERS: powerpc: Mark Maddy as "M" powerpc/Makefile: Allow overriding CPP powerpc-km82xx.c: replace of_node_put() with __free ps3: Correct some typos in comments powerpc/kexec: Fix return of uninitialized variable macintosh: Use common error handling code in via_pmu_led_init() powerpc/powermac: Use of_property_match_string() in pmac_has_backlight_type() powerpc: remove dead config options for MPC85xx platform support powerpc/xive: Use cpumask_intersects() selftests/powerpc: Remove the path after initialization. powerpc/xmon: symbol lookup length fixed powerpc/ep8248e: Use %pa to format resource_size_t powerpc/ps3: Reorganize kerneldoc parameter names KVM: PPC: Book3S HV: Fix kmv -> kvm typo powerpc/sstep: make emulate_vsx_load and emulate_vsx_store static ...
375 lines
9.9 KiB
C
375 lines
9.9 KiB
C
// SPDX-License-Identifier: GPL-2.0-or-later
|
|
|
|
/*
|
|
* Copyright (C) 2004 Benjamin Herrenschmidt, IBM Corp.
|
|
* <benh@kernel.crashing.org>
|
|
*/
|
|
|
|
#include <linux/errno.h>
|
|
#include <linux/sched.h>
|
|
#include <linux/kernel.h>
|
|
#include <linux/mm.h>
|
|
#include <linux/smp.h>
|
|
#include <linux/stddef.h>
|
|
#include <linux/unistd.h>
|
|
#include <linux/slab.h>
|
|
#include <linux/user.h>
|
|
#include <linux/elf.h>
|
|
#include <linux/security.h>
|
|
#include <linux/syscalls.h>
|
|
#include <linux/time_namespace.h>
|
|
#include <vdso/datapage.h>
|
|
|
|
#include <asm/syscall.h>
|
|
#include <asm/processor.h>
|
|
#include <asm/mmu.h>
|
|
#include <asm/mmu_context.h>
|
|
#include <asm/machdep.h>
|
|
#include <asm/cputable.h>
|
|
#include <asm/sections.h>
|
|
#include <asm/firmware.h>
|
|
#include <asm/vdso.h>
|
|
#include <asm/vdso_datapage.h>
|
|
#include <asm/setup.h>
|
|
|
|
/* The alignment of the vDSO */
|
|
#define VDSO_ALIGNMENT (1 << 16)
|
|
|
|
extern char vdso32_start, vdso32_end;
|
|
extern char vdso64_start, vdso64_end;
|
|
|
|
long sys_ni_syscall(void);
|
|
|
|
/*
|
|
* The vdso data page (aka. systemcfg for old ppc64 fans) is here.
|
|
* Once the early boot kernel code no longer needs to muck around
|
|
* with it, it will become dynamically allocated
|
|
*/
|
|
static union {
|
|
struct vdso_arch_data data;
|
|
u8 page[2 * PAGE_SIZE];
|
|
} vdso_data_store __page_aligned_data;
|
|
struct vdso_arch_data *vdso_data = &vdso_data_store.data;
|
|
|
|
enum vvar_pages {
|
|
VVAR_BASE_PAGE_OFFSET,
|
|
VVAR_TIME_PAGE_OFFSET,
|
|
VVAR_TIMENS_PAGE_OFFSET,
|
|
VVAR_NR_PAGES,
|
|
};
|
|
|
|
static int vdso_mremap(const struct vm_special_mapping *sm, struct vm_area_struct *new_vma,
|
|
unsigned long text_size)
|
|
{
|
|
unsigned long new_size = new_vma->vm_end - new_vma->vm_start;
|
|
|
|
if (new_size != text_size)
|
|
return -EINVAL;
|
|
|
|
current->mm->context.vdso = (void __user *)new_vma->vm_start;
|
|
|
|
return 0;
|
|
}
|
|
|
|
static int vdso32_mremap(const struct vm_special_mapping *sm, struct vm_area_struct *new_vma)
|
|
{
|
|
return vdso_mremap(sm, new_vma, &vdso32_end - &vdso32_start);
|
|
}
|
|
|
|
static int vdso64_mremap(const struct vm_special_mapping *sm, struct vm_area_struct *new_vma)
|
|
{
|
|
return vdso_mremap(sm, new_vma, &vdso64_end - &vdso64_start);
|
|
}
|
|
|
|
static void vdso_close(const struct vm_special_mapping *sm, struct vm_area_struct *vma)
|
|
{
|
|
struct mm_struct *mm = vma->vm_mm;
|
|
|
|
/*
|
|
* close() is called for munmap() but also for mremap(). In the mremap()
|
|
* case the vdso pointer has already been updated by the mremap() hook
|
|
* above, so it must not be set to NULL here.
|
|
*/
|
|
if (vma->vm_start != (unsigned long)mm->context.vdso)
|
|
return;
|
|
|
|
mm->context.vdso = NULL;
|
|
}
|
|
|
|
static vm_fault_t vvar_fault(const struct vm_special_mapping *sm,
|
|
struct vm_area_struct *vma, struct vm_fault *vmf);
|
|
|
|
static struct vm_special_mapping vvar_spec __ro_after_init = {
|
|
.name = "[vvar]",
|
|
.fault = vvar_fault,
|
|
};
|
|
|
|
static struct vm_special_mapping vdso32_spec __ro_after_init = {
|
|
.name = "[vdso]",
|
|
.mremap = vdso32_mremap,
|
|
.close = vdso_close,
|
|
};
|
|
|
|
static struct vm_special_mapping vdso64_spec __ro_after_init = {
|
|
.name = "[vdso]",
|
|
.mremap = vdso64_mremap,
|
|
.close = vdso_close,
|
|
};
|
|
|
|
#ifdef CONFIG_TIME_NS
|
|
struct vdso_data *arch_get_vdso_data(void *vvar_page)
|
|
{
|
|
return vvar_page;
|
|
}
|
|
|
|
/*
|
|
* The vvar mapping contains data for a specific time namespace, so when a task
|
|
* changes namespace we must unmap its vvar data for the old namespace.
|
|
* Subsequent faults will map in data for the new namespace.
|
|
*
|
|
* For more details see timens_setup_vdso_data().
|
|
*/
|
|
int vdso_join_timens(struct task_struct *task, struct time_namespace *ns)
|
|
{
|
|
struct mm_struct *mm = task->mm;
|
|
VMA_ITERATOR(vmi, mm, 0);
|
|
struct vm_area_struct *vma;
|
|
|
|
mmap_read_lock(mm);
|
|
for_each_vma(vmi, vma) {
|
|
if (vma_is_special_mapping(vma, &vvar_spec))
|
|
zap_vma_pages(vma);
|
|
}
|
|
mmap_read_unlock(mm);
|
|
|
|
return 0;
|
|
}
|
|
#endif
|
|
|
|
static vm_fault_t vvar_fault(const struct vm_special_mapping *sm,
|
|
struct vm_area_struct *vma, struct vm_fault *vmf)
|
|
{
|
|
struct page *timens_page = find_timens_vvar_page(vma);
|
|
unsigned long pfn;
|
|
|
|
switch (vmf->pgoff) {
|
|
case VVAR_BASE_PAGE_OFFSET:
|
|
pfn = virt_to_pfn(vdso_data);
|
|
break;
|
|
case VVAR_TIME_PAGE_OFFSET:
|
|
if (timens_page)
|
|
pfn = page_to_pfn(timens_page);
|
|
else
|
|
pfn = virt_to_pfn(vdso_data->data);
|
|
break;
|
|
#ifdef CONFIG_TIME_NS
|
|
case VVAR_TIMENS_PAGE_OFFSET:
|
|
/*
|
|
* If a task belongs to a time namespace then a namespace
|
|
* specific VVAR is mapped with the VVAR_DATA_PAGE_OFFSET and
|
|
* the real VVAR page is mapped with the VVAR_TIMENS_PAGE_OFFSET
|
|
* offset.
|
|
* See also the comment near timens_setup_vdso_data().
|
|
*/
|
|
if (!timens_page)
|
|
return VM_FAULT_SIGBUS;
|
|
pfn = virt_to_pfn(vdso_data->data);
|
|
break;
|
|
#endif /* CONFIG_TIME_NS */
|
|
default:
|
|
return VM_FAULT_SIGBUS;
|
|
}
|
|
|
|
return vmf_insert_pfn(vma, vmf->address, pfn);
|
|
}
|
|
|
|
/*
|
|
* This is called from binfmt_elf, we create the special vma for the
|
|
* vDSO and insert it into the mm struct tree
|
|
*/
|
|
static int __arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp)
|
|
{
|
|
unsigned long vdso_size, vdso_base, mappings_size;
|
|
struct vm_special_mapping *vdso_spec;
|
|
unsigned long vvar_size = VVAR_NR_PAGES * PAGE_SIZE;
|
|
struct mm_struct *mm = current->mm;
|
|
struct vm_area_struct *vma;
|
|
|
|
if (is_32bit_task()) {
|
|
vdso_spec = &vdso32_spec;
|
|
vdso_size = &vdso32_end - &vdso32_start;
|
|
} else {
|
|
vdso_spec = &vdso64_spec;
|
|
vdso_size = &vdso64_end - &vdso64_start;
|
|
}
|
|
|
|
mappings_size = vdso_size + vvar_size;
|
|
mappings_size += (VDSO_ALIGNMENT - 1) & PAGE_MASK;
|
|
|
|
/*
|
|
* Pick a base address for the vDSO in process space.
|
|
* Add enough to the size so that the result can be aligned.
|
|
*/
|
|
vdso_base = get_unmapped_area(NULL, 0, mappings_size, 0, 0);
|
|
if (IS_ERR_VALUE(vdso_base))
|
|
return vdso_base;
|
|
|
|
/* Add required alignment. */
|
|
vdso_base = ALIGN(vdso_base, VDSO_ALIGNMENT);
|
|
|
|
vma = _install_special_mapping(mm, vdso_base, vvar_size,
|
|
VM_READ | VM_MAYREAD | VM_IO |
|
|
VM_DONTDUMP | VM_PFNMAP, &vvar_spec);
|
|
if (IS_ERR(vma))
|
|
return PTR_ERR(vma);
|
|
|
|
/*
|
|
* our vma flags don't have VM_WRITE so by default, the process isn't
|
|
* allowed to write those pages.
|
|
* gdb can break that with ptrace interface, and thus trigger COW on
|
|
* those pages but it's then your responsibility to never do that on
|
|
* the "data" page of the vDSO or you'll stop getting kernel updates
|
|
* and your nice userland gettimeofday will be totally dead.
|
|
* It's fine to use that for setting breakpoints in the vDSO code
|
|
* pages though.
|
|
*/
|
|
vma = _install_special_mapping(mm, vdso_base + vvar_size, vdso_size,
|
|
VM_READ | VM_EXEC | VM_MAYREAD |
|
|
VM_MAYWRITE | VM_MAYEXEC, vdso_spec);
|
|
if (IS_ERR(vma)) {
|
|
do_munmap(mm, vdso_base, vvar_size, NULL);
|
|
return PTR_ERR(vma);
|
|
}
|
|
|
|
// Now that the mappings are in place, set the mm VDSO pointer
|
|
mm->context.vdso = (void __user *)vdso_base + vvar_size;
|
|
|
|
return 0;
|
|
}
|
|
|
|
int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp)
|
|
{
|
|
struct mm_struct *mm = current->mm;
|
|
int rc;
|
|
|
|
mm->context.vdso = NULL;
|
|
|
|
if (mmap_write_lock_killable(mm))
|
|
return -EINTR;
|
|
|
|
rc = __arch_setup_additional_pages(bprm, uses_interp);
|
|
|
|
mmap_write_unlock(mm);
|
|
return rc;
|
|
}
|
|
|
|
#define VDSO_DO_FIXUPS(type, value, bits, sec) do { \
|
|
void *__start = (void *)VDSO##bits##_SYMBOL(&vdso##bits##_start, sec##_start); \
|
|
void *__end = (void *)VDSO##bits##_SYMBOL(&vdso##bits##_start, sec##_end); \
|
|
\
|
|
do_##type##_fixups((value), __start, __end); \
|
|
} while (0)
|
|
|
|
static void __init vdso_fixup_features(void)
|
|
{
|
|
#ifdef CONFIG_PPC64
|
|
VDSO_DO_FIXUPS(feature, cur_cpu_spec->cpu_features, 64, ftr_fixup);
|
|
VDSO_DO_FIXUPS(feature, cur_cpu_spec->mmu_features, 64, mmu_ftr_fixup);
|
|
VDSO_DO_FIXUPS(feature, powerpc_firmware_features, 64, fw_ftr_fixup);
|
|
VDSO_DO_FIXUPS(lwsync, cur_cpu_spec->cpu_features, 64, lwsync_fixup);
|
|
#endif /* CONFIG_PPC64 */
|
|
|
|
#ifdef CONFIG_VDSO32
|
|
VDSO_DO_FIXUPS(feature, cur_cpu_spec->cpu_features, 32, ftr_fixup);
|
|
VDSO_DO_FIXUPS(feature, cur_cpu_spec->mmu_features, 32, mmu_ftr_fixup);
|
|
#ifdef CONFIG_PPC64
|
|
VDSO_DO_FIXUPS(feature, powerpc_firmware_features, 32, fw_ftr_fixup);
|
|
#endif /* CONFIG_PPC64 */
|
|
VDSO_DO_FIXUPS(lwsync, cur_cpu_spec->cpu_features, 32, lwsync_fixup);
|
|
#endif
|
|
}
|
|
|
|
/*
|
|
* Called from setup_arch to initialize the bitmap of available
|
|
* syscalls in the systemcfg page
|
|
*/
|
|
static void __init vdso_setup_syscall_map(void)
|
|
{
|
|
unsigned int i;
|
|
|
|
for (i = 0; i < NR_syscalls; i++) {
|
|
if (sys_call_table[i] != (void *)&sys_ni_syscall)
|
|
vdso_data->syscall_map[i >> 5] |= 0x80000000UL >> (i & 0x1f);
|
|
if (IS_ENABLED(CONFIG_COMPAT) &&
|
|
compat_sys_call_table[i] != (void *)&sys_ni_syscall)
|
|
vdso_data->compat_syscall_map[i >> 5] |= 0x80000000UL >> (i & 0x1f);
|
|
}
|
|
}
|
|
|
|
#ifdef CONFIG_PPC64
|
|
int vdso_getcpu_init(void)
|
|
{
|
|
unsigned long cpu, node, val;
|
|
|
|
/*
|
|
* SPRG_VDSO contains the CPU in the bottom 16 bits and the NUMA node
|
|
* in the next 16 bits. The VDSO uses this to implement getcpu().
|
|
*/
|
|
cpu = get_cpu();
|
|
WARN_ON_ONCE(cpu > 0xffff);
|
|
|
|
node = cpu_to_node(cpu);
|
|
WARN_ON_ONCE(node > 0xffff);
|
|
|
|
val = (cpu & 0xffff) | ((node & 0xffff) << 16);
|
|
mtspr(SPRN_SPRG_VDSO_WRITE, val);
|
|
get_paca()->sprg_vdso = val;
|
|
|
|
put_cpu();
|
|
|
|
return 0;
|
|
}
|
|
/* We need to call this before SMP init */
|
|
early_initcall(vdso_getcpu_init);
|
|
#endif
|
|
|
|
static struct page ** __init vdso_setup_pages(void *start, void *end)
|
|
{
|
|
int i;
|
|
struct page **pagelist;
|
|
int pages = (end - start) >> PAGE_SHIFT;
|
|
|
|
pagelist = kcalloc(pages + 1, sizeof(struct page *), GFP_KERNEL);
|
|
if (!pagelist)
|
|
panic("%s: Cannot allocate page list for VDSO", __func__);
|
|
|
|
for (i = 0; i < pages; i++)
|
|
pagelist[i] = virt_to_page(start + i * PAGE_SIZE);
|
|
|
|
return pagelist;
|
|
}
|
|
|
|
static int __init vdso_init(void)
|
|
{
|
|
#ifdef CONFIG_PPC64
|
|
vdso_data->dcache_block_size = ppc64_caches.l1d.block_size;
|
|
vdso_data->icache_block_size = ppc64_caches.l1i.block_size;
|
|
vdso_data->dcache_log_block_size = ppc64_caches.l1d.log_block_size;
|
|
vdso_data->icache_log_block_size = ppc64_caches.l1i.log_block_size;
|
|
#endif /* CONFIG_PPC64 */
|
|
|
|
vdso_setup_syscall_map();
|
|
|
|
vdso_fixup_features();
|
|
|
|
if (IS_ENABLED(CONFIG_VDSO32))
|
|
vdso32_spec.pages = vdso_setup_pages(&vdso32_start, &vdso32_end);
|
|
|
|
if (IS_ENABLED(CONFIG_PPC64))
|
|
vdso64_spec.pages = vdso_setup_pages(&vdso64_start, &vdso64_end);
|
|
|
|
smp_wmb();
|
|
|
|
return 0;
|
|
}
|
|
arch_initcall(vdso_init);
|