
Merge branch 'linus' into x86/cleanups, to pick up dependent commits

Signed-off-by: Ingo Molnar <mingo@kernel.org>
Ingo Molnar 2021-05-12 19:59:37 +02:00
commit 6f0d271d21
62 changed files with 865 additions and 396 deletions

View file

@ -4803,7 +4803,7 @@ KVM_PV_VM_VERIFY
4.126 KVM_X86_SET_MSR_FILTER 4.126 KVM_X86_SET_MSR_FILTER
---------------------------- ----------------------------
:Capability: KVM_X86_SET_MSR_FILTER :Capability: KVM_CAP_X86_MSR_FILTER
:Architectures: x86 :Architectures: x86
:Type: vm ioctl :Type: vm ioctl
:Parameters: struct kvm_msr_filter :Parameters: struct kvm_msr_filter
@ -6715,7 +6715,7 @@ accesses that would usually trigger a #GP by KVM into the guest will
instead get bounced to user space through the KVM_EXIT_X86_RDMSR and instead get bounced to user space through the KVM_EXIT_X86_RDMSR and
KVM_EXIT_X86_WRMSR exit notifications. KVM_EXIT_X86_WRMSR exit notifications.
8.27 KVM_X86_SET_MSR_FILTER 8.27 KVM_CAP_X86_MSR_FILTER
--------------------------- ---------------------------
:Architectures: x86 :Architectures: x86
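For context, a minimal userspace sketch (not part of this diff) of driving the ioctl documented above; vm_fd, the chosen MSR index and the bitmap polarity are illustrative assumptions, see the full KVM API document for the exact filtering semantics.

#include <linux/kvm.h>
#include <string.h>
#include <sys/ioctl.h>

/* Filter guest writes to a single MSR; everything else keeps the default-allow policy. */
static int set_example_msr_filter(int vm_fd)
{
	__u8 bitmap[1] = { 0 };			/* one MSR covered by the range, bit 0 clear */
	struct kvm_msr_filter filter;

	if (ioctl(vm_fd, KVM_CHECK_EXTENSION, KVM_CAP_X86_MSR_FILTER) <= 0)
		return -1;			/* MSR filtering not supported on this host */

	memset(&filter, 0, sizeof(filter));
	filter.flags = KVM_MSR_FILTER_DEFAULT_ALLOW;
	filter.ranges[0].flags = KVM_MSR_FILTER_WRITE;
	filter.ranges[0].base = 0x1fc;		/* MSR_IA32_POWER_CTL, as an example */
	filter.ranges[0].nmsrs = 1;
	filter.ranges[0].bitmap = bitmap;

	return ioctl(vm_fd, KVM_X86_SET_MSR_FILTER, &filter);
}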

View file

@ -113,6 +113,7 @@
#define VALID_PAGE(x) ((x) != INVALID_PAGE) #define VALID_PAGE(x) ((x) != INVALID_PAGE)
#define UNMAPPED_GVA (~(gpa_t)0) #define UNMAPPED_GVA (~(gpa_t)0)
#define INVALID_GPA (~(gpa_t)0)
/* KVM Hugepage definitions for x86 */ /* KVM Hugepage definitions for x86 */
#define KVM_MAX_HUGEPAGE_LEVEL PG_LEVEL_1G #define KVM_MAX_HUGEPAGE_LEVEL PG_LEVEL_1G
@ -199,6 +200,7 @@ enum x86_intercept_stage;
#define KVM_NR_DB_REGS 4 #define KVM_NR_DB_REGS 4
#define DR6_BUS_LOCK (1 << 11)
#define DR6_BD (1 << 13) #define DR6_BD (1 << 13)
#define DR6_BS (1 << 14) #define DR6_BS (1 << 14)
#define DR6_BT (1 << 15) #define DR6_BT (1 << 15)
@ -212,7 +214,7 @@ enum x86_intercept_stage;
* DR6_ACTIVE_LOW is also used as the init/reset value for DR6. * DR6_ACTIVE_LOW is also used as the init/reset value for DR6.
*/ */
#define DR6_ACTIVE_LOW 0xffff0ff0 #define DR6_ACTIVE_LOW 0xffff0ff0
#define DR6_VOLATILE 0x0001e00f #define DR6_VOLATILE 0x0001e80f
#define DR6_FIXED_1 (DR6_ACTIVE_LOW & ~DR6_VOLATILE) #define DR6_FIXED_1 (DR6_ACTIVE_LOW & ~DR6_VOLATILE)
#define DR7_BP_EN_MASK 0x000000ff #define DR7_BP_EN_MASK 0x000000ff
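As a quick check of the constants touched here (a sketch, not part of the patch): folding the new bus-lock bit into DR6_VOLATILE drops bit 11 from DR6_FIXED_1, since DR6_FIXED_1 is derived from the active-low mask.

/* Values implied by the hunk above, evaluated by hand. */
#define DR6_BUS_LOCK	(1 << 11)				/* 0x00000800 */
#define DR6_ACTIVE_LOW	0xffff0ff0
#define DR6_VOLATILE	0x0001e80f				/* was 0x0001e00f */
#define DR6_FIXED_1	(DR6_ACTIVE_LOW & ~DR6_VOLATILE)	/* now 0xfffe07f0, was 0xfffe0ff0 */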
@ -407,7 +409,7 @@ struct kvm_mmu {
u32 pkru_mask; u32 pkru_mask;
u64 *pae_root; u64 *pae_root;
u64 *lm_root; u64 *pml4_root;
/* /*
* check zero bits on shadow page table entries, these * check zero bits on shadow page table entries, these
@ -1417,6 +1419,7 @@ struct kvm_arch_async_pf {
bool direct_map; bool direct_map;
}; };
extern u32 __read_mostly kvm_nr_uret_msrs;
extern u64 __read_mostly host_efer; extern u64 __read_mostly host_efer;
extern bool __read_mostly allow_smaller_maxphyaddr; extern bool __read_mostly allow_smaller_maxphyaddr;
extern struct kvm_x86_ops kvm_x86_ops; extern struct kvm_x86_ops kvm_x86_ops;
@ -1775,9 +1778,15 @@ int kvm_pv_send_ipi(struct kvm *kvm, unsigned long ipi_bitmap_low,
unsigned long ipi_bitmap_high, u32 min, unsigned long ipi_bitmap_high, u32 min,
unsigned long icr, int op_64_bit); unsigned long icr, int op_64_bit);
void kvm_define_user_return_msr(unsigned index, u32 msr); int kvm_add_user_return_msr(u32 msr);
int kvm_find_user_return_msr(u32 msr);
int kvm_set_user_return_msr(unsigned index, u64 val, u64 mask); int kvm_set_user_return_msr(unsigned index, u64 val, u64 mask);
static inline bool kvm_is_supported_user_return_msr(u32 msr)
{
return kvm_find_user_return_msr(msr) >= 0;
}
u64 kvm_scale_tsc(struct kvm_vcpu *vcpu, u64 tsc); u64 kvm_scale_tsc(struct kvm_vcpu *vcpu, u64 tsc);
u64 kvm_read_l1_tsc(struct kvm_vcpu *vcpu, u64 host_tsc); u64 kvm_read_l1_tsc(struct kvm_vcpu *vcpu, u64 host_tsc);
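The declarations above replace kvm_define_user_return_msr() with a register/lookup interface. Below is a condensed sketch of the intended usage, modeled on the SVM changes later in this diff; the example_* names and the includes are placeholders, not code from the patch.

#include <asm/kvm_host.h>	/* kvm_add_user_return_msr(), kvm_set_user_return_msr() */
#include <asm/msr-index.h>	/* MSR_TSC_AUX */

/* Registered once at hardware setup; -1 means the host probe (RDMSR/WRMSR) failed. */
static int example_tsc_aux_uret_slot __read_mostly = -1;

static void example_hardware_setup(void)
{
	example_tsc_aux_uret_slot = kvm_add_user_return_msr(MSR_TSC_AUX);
}

/* Before entering the guest: queue the guest value; the user-return notifier
 * restores the host value when the CPU returns to userspace. */
static void example_prepare_guest_switch(u64 guest_tsc_aux)
{
	if (example_tsc_aux_uret_slot >= 0)
		kvm_set_user_return_msr(example_tsc_aux_uret_slot, guest_tsc_aux, -1ull);
}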

View file

@ -7,8 +7,6 @@
#include <linux/interrupt.h> #include <linux/interrupt.h>
#include <uapi/asm/kvm_para.h> #include <uapi/asm/kvm_para.h>
extern void kvmclock_init(void);
#ifdef CONFIG_KVM_GUEST #ifdef CONFIG_KVM_GUEST
bool kvm_check_and_clear_guest_paused(void); bool kvm_check_and_clear_guest_paused(void);
#else #else
@ -86,13 +84,14 @@ static inline long kvm_hypercall4(unsigned int nr, unsigned long p1,
} }
#ifdef CONFIG_KVM_GUEST #ifdef CONFIG_KVM_GUEST
void kvmclock_init(void);
void kvmclock_disable(void);
bool kvm_para_available(void); bool kvm_para_available(void);
unsigned int kvm_arch_para_features(void); unsigned int kvm_arch_para_features(void);
unsigned int kvm_arch_para_hints(void); unsigned int kvm_arch_para_hints(void);
void kvm_async_pf_task_wait_schedule(u32 token); void kvm_async_pf_task_wait_schedule(u32 token);
void kvm_async_pf_task_wake(u32 token); void kvm_async_pf_task_wake(u32 token);
u32 kvm_read_and_reset_apf_flags(void); u32 kvm_read_and_reset_apf_flags(void);
void kvm_disable_steal_time(void);
bool __kvm_handle_async_pf(struct pt_regs *regs, u32 token); bool __kvm_handle_async_pf(struct pt_regs *regs, u32 token);
DECLARE_STATIC_KEY_FALSE(kvm_async_pf_enabled); DECLARE_STATIC_KEY_FALSE(kvm_async_pf_enabled);
@ -137,11 +136,6 @@ static inline u32 kvm_read_and_reset_apf_flags(void)
return 0; return 0;
} }
static inline void kvm_disable_steal_time(void)
{
return;
}
static __always_inline bool kvm_handle_async_pf(struct pt_regs *regs, u32 token) static __always_inline bool kvm_handle_async_pf(struct pt_regs *regs, u32 token)
{ {
return false; return false;

View file

@ -437,6 +437,8 @@ struct kvm_vmx_nested_state_hdr {
__u16 flags; __u16 flags;
} smm; } smm;
__u16 pad;
__u32 flags; __u32 flags;
__u64 preemption_timer_deadline; __u64 preemption_timer_deadline;
}; };

View file

@ -26,6 +26,7 @@
#include <linux/kprobes.h> #include <linux/kprobes.h>
#include <linux/nmi.h> #include <linux/nmi.h>
#include <linux/swait.h> #include <linux/swait.h>
#include <linux/syscore_ops.h>
#include <asm/timer.h> #include <asm/timer.h>
#include <asm/cpu.h> #include <asm/cpu.h>
#include <asm/traps.h> #include <asm/traps.h>
@ -37,6 +38,7 @@
#include <asm/tlb.h> #include <asm/tlb.h>
#include <asm/cpuidle_haltpoll.h> #include <asm/cpuidle_haltpoll.h>
#include <asm/ptrace.h> #include <asm/ptrace.h>
#include <asm/reboot.h>
#include <asm/svm.h> #include <asm/svm.h>
DEFINE_STATIC_KEY_FALSE(kvm_async_pf_enabled); DEFINE_STATIC_KEY_FALSE(kvm_async_pf_enabled);
@ -345,7 +347,7 @@ static void kvm_guest_cpu_init(void)
wrmsrl(MSR_KVM_ASYNC_PF_EN, pa); wrmsrl(MSR_KVM_ASYNC_PF_EN, pa);
__this_cpu_write(apf_reason.enabled, 1); __this_cpu_write(apf_reason.enabled, 1);
pr_info("KVM setup async PF for cpu %d\n", smp_processor_id()); pr_info("setup async PF for cpu %d\n", smp_processor_id());
} }
if (kvm_para_has_feature(KVM_FEATURE_PV_EOI)) { if (kvm_para_has_feature(KVM_FEATURE_PV_EOI)) {
@ -371,34 +373,17 @@ static void kvm_pv_disable_apf(void)
wrmsrl(MSR_KVM_ASYNC_PF_EN, 0); wrmsrl(MSR_KVM_ASYNC_PF_EN, 0);
__this_cpu_write(apf_reason.enabled, 0); __this_cpu_write(apf_reason.enabled, 0);
pr_info("Unregister pv shared memory for cpu %d\n", smp_processor_id()); pr_info("disable async PF for cpu %d\n", smp_processor_id());
} }
static void kvm_pv_guest_cpu_reboot(void *unused) static void kvm_disable_steal_time(void)
{ {
/* if (!has_steal_clock)
* We disable PV EOI before we load a new kernel by kexec, return;
* since MSR_KVM_PV_EOI_EN stores a pointer into old kernel's memory.
* New kernel can re-enable when it boots.
*/
if (kvm_para_has_feature(KVM_FEATURE_PV_EOI))
wrmsrl(MSR_KVM_PV_EOI_EN, 0);
kvm_pv_disable_apf();
kvm_disable_steal_time();
}
static int kvm_pv_reboot_notify(struct notifier_block *nb, wrmsr(MSR_KVM_STEAL_TIME, 0, 0);
unsigned long code, void *unused)
{
if (code == SYS_RESTART)
on_each_cpu(kvm_pv_guest_cpu_reboot, NULL, 1);
return NOTIFY_DONE;
} }
static struct notifier_block kvm_pv_reboot_nb = {
.notifier_call = kvm_pv_reboot_notify,
};
static u64 kvm_steal_clock(int cpu) static u64 kvm_steal_clock(int cpu)
{ {
u64 steal; u64 steal;
@ -416,14 +401,6 @@ static u64 kvm_steal_clock(int cpu)
return steal; return steal;
} }
void kvm_disable_steal_time(void)
{
if (!has_steal_clock)
return;
wrmsr(MSR_KVM_STEAL_TIME, 0, 0);
}
static inline void __set_percpu_decrypted(void *ptr, unsigned long size) static inline void __set_percpu_decrypted(void *ptr, unsigned long size)
{ {
early_set_memory_decrypted((unsigned long) ptr, size); early_set_memory_decrypted((unsigned long) ptr, size);
@ -451,6 +428,27 @@ static void __init sev_map_percpu_data(void)
} }
} }
static void kvm_guest_cpu_offline(bool shutdown)
{
kvm_disable_steal_time();
if (kvm_para_has_feature(KVM_FEATURE_PV_EOI))
wrmsrl(MSR_KVM_PV_EOI_EN, 0);
kvm_pv_disable_apf();
if (!shutdown)
apf_task_wake_all();
kvmclock_disable();
}
static int kvm_cpu_online(unsigned int cpu)
{
unsigned long flags;
local_irq_save(flags);
kvm_guest_cpu_init();
local_irq_restore(flags);
return 0;
}
#ifdef CONFIG_SMP #ifdef CONFIG_SMP
static DEFINE_PER_CPU(cpumask_var_t, __pv_cpu_mask); static DEFINE_PER_CPU(cpumask_var_t, __pv_cpu_mask);
@ -635,33 +633,66 @@ static void __init kvm_smp_prepare_boot_cpu(void)
kvm_spinlock_init(); kvm_spinlock_init();
} }
static void kvm_guest_cpu_offline(void)
{
kvm_disable_steal_time();
if (kvm_para_has_feature(KVM_FEATURE_PV_EOI))
wrmsrl(MSR_KVM_PV_EOI_EN, 0);
kvm_pv_disable_apf();
apf_task_wake_all();
}
static int kvm_cpu_online(unsigned int cpu)
{
local_irq_disable();
kvm_guest_cpu_init();
local_irq_enable();
return 0;
}
static int kvm_cpu_down_prepare(unsigned int cpu) static int kvm_cpu_down_prepare(unsigned int cpu)
{ {
local_irq_disable(); unsigned long flags;
kvm_guest_cpu_offline();
local_irq_enable(); local_irq_save(flags);
kvm_guest_cpu_offline(false);
local_irq_restore(flags);
return 0; return 0;
} }
#endif #endif
static int kvm_suspend(void)
{
kvm_guest_cpu_offline(false);
return 0;
}
static void kvm_resume(void)
{
kvm_cpu_online(raw_smp_processor_id());
}
static struct syscore_ops kvm_syscore_ops = {
.suspend = kvm_suspend,
.resume = kvm_resume,
};
static void kvm_pv_guest_cpu_reboot(void *unused)
{
kvm_guest_cpu_offline(true);
}
static int kvm_pv_reboot_notify(struct notifier_block *nb,
unsigned long code, void *unused)
{
if (code == SYS_RESTART)
on_each_cpu(kvm_pv_guest_cpu_reboot, NULL, 1);
return NOTIFY_DONE;
}
static struct notifier_block kvm_pv_reboot_nb = {
.notifier_call = kvm_pv_reboot_notify,
};
/*
* After a PV feature is registered, the host will keep writing to the
* registered memory location. If the guest happens to shutdown, this memory
* won't be valid. In cases like kexec, in which you install a new kernel, this
* means a random memory location will be kept being written.
*/
#ifdef CONFIG_KEXEC_CORE
static void kvm_crash_shutdown(struct pt_regs *regs)
{
kvm_guest_cpu_offline(true);
native_machine_crash_shutdown(regs);
}
#endif
static void __init kvm_guest_init(void) static void __init kvm_guest_init(void)
{ {
int i; int i;
@ -704,6 +735,12 @@ static void __init kvm_guest_init(void)
kvm_guest_cpu_init(); kvm_guest_cpu_init();
#endif #endif
#ifdef CONFIG_KEXEC_CORE
machine_ops.crash_shutdown = kvm_crash_shutdown;
#endif
register_syscore_ops(&kvm_syscore_ops);
/* /*
* Hard lockup detection is enabled by default. Disable it, as guests * Hard lockup detection is enabled by default. Disable it, as guests
* can get false positives too easily, for example if the host is * can get false positives too easily, for example if the host is

View file

@ -20,7 +20,6 @@
#include <asm/hypervisor.h> #include <asm/hypervisor.h>
#include <asm/mem_encrypt.h> #include <asm/mem_encrypt.h>
#include <asm/x86_init.h> #include <asm/x86_init.h>
#include <asm/reboot.h>
#include <asm/kvmclock.h> #include <asm/kvmclock.h>
static int kvmclock __initdata = 1; static int kvmclock __initdata = 1;
@ -203,28 +202,9 @@ static void kvm_setup_secondary_clock(void)
} }
#endif #endif
/* void kvmclock_disable(void)
* After the clock is registered, the host will keep writing to the
* registered memory location. If the guest happens to shutdown, this memory
* won't be valid. In cases like kexec, in which you install a new kernel, this
* means a random memory location will be kept being written. So before any
* kind of shutdown from our side, we unregister the clock by writing anything
* that does not have the 'enable' bit set in the msr
*/
#ifdef CONFIG_KEXEC_CORE
static void kvm_crash_shutdown(struct pt_regs *regs)
{ {
native_write_msr(msr_kvm_system_time, 0, 0); native_write_msr(msr_kvm_system_time, 0, 0);
kvm_disable_steal_time();
native_machine_crash_shutdown(regs);
}
#endif
static void kvm_shutdown(void)
{
native_write_msr(msr_kvm_system_time, 0, 0);
kvm_disable_steal_time();
native_machine_shutdown();
} }
static void __init kvmclock_init_mem(void) static void __init kvmclock_init_mem(void)
@ -351,10 +331,6 @@ void __init kvmclock_init(void)
#endif #endif
x86_platform.save_sched_clock_state = kvm_save_sched_clock_state; x86_platform.save_sched_clock_state = kvm_save_sched_clock_state;
x86_platform.restore_sched_clock_state = kvm_restore_sched_clock_state; x86_platform.restore_sched_clock_state = kvm_restore_sched_clock_state;
machine_ops.shutdown = kvm_shutdown;
#ifdef CONFIG_KEXEC_CORE
machine_ops.crash_shutdown = kvm_crash_shutdown;
#endif
kvm_get_preset_lpj(); kvm_get_preset_lpj();
/* /*

View file

@ -458,7 +458,7 @@ void kvm_set_cpu_caps(void)
F(AVX512_VPOPCNTDQ) | F(UMIP) | F(AVX512_VBMI2) | F(GFNI) | F(AVX512_VPOPCNTDQ) | F(UMIP) | F(AVX512_VBMI2) | F(GFNI) |
F(VAES) | F(VPCLMULQDQ) | F(AVX512_VNNI) | F(AVX512_BITALG) | F(VAES) | F(VPCLMULQDQ) | F(AVX512_VNNI) | F(AVX512_BITALG) |
F(CLDEMOTE) | F(MOVDIRI) | F(MOVDIR64B) | 0 /*WAITPKG*/ | F(CLDEMOTE) | F(MOVDIRI) | F(MOVDIR64B) | 0 /*WAITPKG*/ |
F(SGX_LC) F(SGX_LC) | F(BUS_LOCK_DETECT)
); );
/* Set LA57 based on hardware capability. */ /* Set LA57 based on hardware capability. */
if (cpuid_ecx(7) & F(LA57)) if (cpuid_ecx(7) & F(LA57))
@ -567,6 +567,21 @@ void kvm_set_cpu_caps(void)
F(ACE2) | F(ACE2_EN) | F(PHE) | F(PHE_EN) | F(ACE2) | F(ACE2_EN) | F(PHE) | F(PHE_EN) |
F(PMM) | F(PMM_EN) F(PMM) | F(PMM_EN)
); );
/*
* Hide RDTSCP and RDPID if either feature is reported as supported but
* probing MSR_TSC_AUX failed. This is purely a sanity check and
* should never happen, but the guest will likely crash if RDTSCP or
* RDPID is misreported, and KVM has botched MSR_TSC_AUX emulation in
* the past. For example, the sanity check may fire if this instance of
* KVM is running as L1 on top of an older, broken KVM.
*/
if (WARN_ON((kvm_cpu_cap_has(X86_FEATURE_RDTSCP) ||
kvm_cpu_cap_has(X86_FEATURE_RDPID)) &&
!kvm_is_supported_user_return_msr(MSR_TSC_AUX))) {
kvm_cpu_cap_clear(X86_FEATURE_RDTSCP);
kvm_cpu_cap_clear(X86_FEATURE_RDPID);
}
} }
EXPORT_SYMBOL_GPL(kvm_set_cpu_caps); EXPORT_SYMBOL_GPL(kvm_set_cpu_caps);
@ -637,7 +652,8 @@ static int __do_cpuid_func_emulated(struct kvm_cpuid_array *array, u32 func)
case 7: case 7:
entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX; entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
entry->eax = 0; entry->eax = 0;
entry->ecx = F(RDPID); if (kvm_cpu_cap_has(X86_FEATURE_RDTSCP))
entry->ecx = F(RDPID);
++array->nent; ++array->nent;
default: default:
break; break;

View file

@ -4502,7 +4502,7 @@ static const struct opcode group8[] = {
* from the register case of group9. * from the register case of group9.
*/ */
static const struct gprefix pfx_0f_c7_7 = { static const struct gprefix pfx_0f_c7_7 = {
N, N, N, II(DstMem | ModRM | Op3264 | EmulateOnUD, em_rdpid, rdtscp), N, N, N, II(DstMem | ModRM | Op3264 | EmulateOnUD, em_rdpid, rdpid),
}; };

View file

@ -468,6 +468,7 @@ enum x86_intercept {
x86_intercept_clgi, x86_intercept_clgi,
x86_intercept_skinit, x86_intercept_skinit,
x86_intercept_rdtscp, x86_intercept_rdtscp,
x86_intercept_rdpid,
x86_intercept_icebp, x86_intercept_icebp,
x86_intercept_wbinvd, x86_intercept_wbinvd,
x86_intercept_monitor, x86_intercept_monitor,

View file

@ -1913,8 +1913,8 @@ void kvm_lapic_expired_hv_timer(struct kvm_vcpu *vcpu)
if (!apic->lapic_timer.hv_timer_in_use) if (!apic->lapic_timer.hv_timer_in_use)
goto out; goto out;
WARN_ON(rcuwait_active(&vcpu->wait)); WARN_ON(rcuwait_active(&vcpu->wait));
cancel_hv_timer(apic);
apic_timer_expired(apic, false); apic_timer_expired(apic, false);
cancel_hv_timer(apic);
if (apic_lvtt_period(apic) && apic->lapic_timer.period) { if (apic_lvtt_period(apic) && apic->lapic_timer.period) {
advance_periodic_target_expiration(apic); advance_periodic_target_expiration(apic);

View file

@ -3310,12 +3310,12 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
if (mmu->shadow_root_level == PT64_ROOT_4LEVEL) { if (mmu->shadow_root_level == PT64_ROOT_4LEVEL) {
pm_mask |= PT_ACCESSED_MASK | PT_WRITABLE_MASK | PT_USER_MASK; pm_mask |= PT_ACCESSED_MASK | PT_WRITABLE_MASK | PT_USER_MASK;
if (WARN_ON_ONCE(!mmu->lm_root)) { if (WARN_ON_ONCE(!mmu->pml4_root)) {
r = -EIO; r = -EIO;
goto out_unlock; goto out_unlock;
} }
mmu->lm_root[0] = __pa(mmu->pae_root) | pm_mask; mmu->pml4_root[0] = __pa(mmu->pae_root) | pm_mask;
} }
for (i = 0; i < 4; ++i) { for (i = 0; i < 4; ++i) {
@ -3335,7 +3335,7 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
} }
if (mmu->shadow_root_level == PT64_ROOT_4LEVEL) if (mmu->shadow_root_level == PT64_ROOT_4LEVEL)
mmu->root_hpa = __pa(mmu->lm_root); mmu->root_hpa = __pa(mmu->pml4_root);
else else
mmu->root_hpa = __pa(mmu->pae_root); mmu->root_hpa = __pa(mmu->pae_root);
@ -3350,7 +3350,7 @@ out_unlock:
static int mmu_alloc_special_roots(struct kvm_vcpu *vcpu) static int mmu_alloc_special_roots(struct kvm_vcpu *vcpu)
{ {
struct kvm_mmu *mmu = vcpu->arch.mmu; struct kvm_mmu *mmu = vcpu->arch.mmu;
u64 *lm_root, *pae_root; u64 *pml4_root, *pae_root;
/* /*
* When shadowing 32-bit or PAE NPT with 64-bit NPT, the PML4 and PDP * When shadowing 32-bit or PAE NPT with 64-bit NPT, the PML4 and PDP
@ -3369,14 +3369,14 @@ static int mmu_alloc_special_roots(struct kvm_vcpu *vcpu)
if (WARN_ON_ONCE(mmu->shadow_root_level != PT64_ROOT_4LEVEL)) if (WARN_ON_ONCE(mmu->shadow_root_level != PT64_ROOT_4LEVEL))
return -EIO; return -EIO;
if (mmu->pae_root && mmu->lm_root) if (mmu->pae_root && mmu->pml4_root)
return 0; return 0;
/* /*
* The special roots should always be allocated in concert. Yell and * The special roots should always be allocated in concert. Yell and
* bail if KVM ends up in a state where only one of the roots is valid. * bail if KVM ends up in a state where only one of the roots is valid.
*/ */
if (WARN_ON_ONCE(!tdp_enabled || mmu->pae_root || mmu->lm_root)) if (WARN_ON_ONCE(!tdp_enabled || mmu->pae_root || mmu->pml4_root))
return -EIO; return -EIO;
/* /*
@ -3387,14 +3387,14 @@ static int mmu_alloc_special_roots(struct kvm_vcpu *vcpu)
if (!pae_root) if (!pae_root)
return -ENOMEM; return -ENOMEM;
lm_root = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT); pml4_root = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT);
if (!lm_root) { if (!pml4_root) {
free_page((unsigned long)pae_root); free_page((unsigned long)pae_root);
return -ENOMEM; return -ENOMEM;
} }
mmu->pae_root = pae_root; mmu->pae_root = pae_root;
mmu->lm_root = lm_root; mmu->pml4_root = pml4_root;
return 0; return 0;
} }
@ -5261,7 +5261,7 @@ static void free_mmu_pages(struct kvm_mmu *mmu)
if (!tdp_enabled && mmu->pae_root) if (!tdp_enabled && mmu->pae_root)
set_memory_encrypted((unsigned long)mmu->pae_root, 1); set_memory_encrypted((unsigned long)mmu->pae_root, 1);
free_page((unsigned long)mmu->pae_root); free_page((unsigned long)mmu->pae_root);
free_page((unsigned long)mmu->lm_root); free_page((unsigned long)mmu->pml4_root);
} }
static int __kvm_mmu_create(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu) static int __kvm_mmu_create(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu)

View file

@ -388,7 +388,7 @@ static void handle_removed_tdp_mmu_page(struct kvm *kvm, tdp_ptep_t pt,
} }
/** /**
* handle_changed_spte - handle bookkeeping associated with an SPTE change * __handle_changed_spte - handle bookkeeping associated with an SPTE change
* @kvm: kvm instance * @kvm: kvm instance
* @as_id: the address space of the paging structure the SPTE was a part of * @as_id: the address space of the paging structure the SPTE was a part of
* @gfn: the base GFN that was mapped by the SPTE * @gfn: the base GFN that was mapped by the SPTE
@ -444,6 +444,13 @@ static void __handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
trace_kvm_tdp_mmu_spte_changed(as_id, gfn, level, old_spte, new_spte); trace_kvm_tdp_mmu_spte_changed(as_id, gfn, level, old_spte, new_spte);
if (is_large_pte(old_spte) != is_large_pte(new_spte)) {
if (is_large_pte(old_spte))
atomic64_sub(1, (atomic64_t*)&kvm->stat.lpages);
else
atomic64_add(1, (atomic64_t*)&kvm->stat.lpages);
}
/* /*
* The only times a SPTE should be changed from a non-present to * The only times a SPTE should be changed from a non-present to
* non-present state is when an MMIO entry is installed/modified/ * non-present state is when an MMIO entry is installed/modified/
@ -1009,6 +1016,14 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
} }
if (!is_shadow_present_pte(iter.old_spte)) { if (!is_shadow_present_pte(iter.old_spte)) {
/*
* If the SPTE has been frozen by another thread, just
* give up and retry, avoiding unnecessary page table
* allocation and free.
*/
if (is_removed_spte(iter.old_spte))
break;
sp = alloc_tdp_mmu_page(vcpu, iter.gfn, iter.level); sp = alloc_tdp_mmu_page(vcpu, iter.gfn, iter.level);
child_pt = sp->spt; child_pt = sp->spt;

View file

@ -764,7 +764,6 @@ int nested_svm_vmexit(struct vcpu_svm *svm)
nested_svm_copy_common_state(svm->nested.vmcb02.ptr, svm->vmcb01.ptr); nested_svm_copy_common_state(svm->nested.vmcb02.ptr, svm->vmcb01.ptr);
svm_switch_vmcb(svm, &svm->vmcb01); svm_switch_vmcb(svm, &svm->vmcb01);
WARN_ON_ONCE(svm->vmcb->control.exit_code != SVM_EXIT_VMRUN);
/* /*
* On vmexit the GIF is set to false and * On vmexit the GIF is set to false and
@ -872,6 +871,15 @@ void svm_free_nested(struct vcpu_svm *svm)
__free_page(virt_to_page(svm->nested.vmcb02.ptr)); __free_page(virt_to_page(svm->nested.vmcb02.ptr));
svm->nested.vmcb02.ptr = NULL; svm->nested.vmcb02.ptr = NULL;
/*
* When last_vmcb12_gpa matches the current vmcb12 gpa,
* some vmcb12 fields are not loaded if they are marked clean
* in the vmcb12, since in this case they are up to date already.
*
* When the vmcb02 is freed, this optimization becomes invalid.
*/
svm->nested.last_vmcb12_gpa = INVALID_GPA;
svm->nested.initialized = false; svm->nested.initialized = false;
} }
@ -884,9 +892,11 @@ void svm_leave_nested(struct vcpu_svm *svm)
if (is_guest_mode(vcpu)) { if (is_guest_mode(vcpu)) {
svm->nested.nested_run_pending = 0; svm->nested.nested_run_pending = 0;
svm->nested.vmcb12_gpa = INVALID_GPA;
leave_guest_mode(vcpu); leave_guest_mode(vcpu);
svm_switch_vmcb(svm, &svm->nested.vmcb02); svm_switch_vmcb(svm, &svm->vmcb01);
nested_svm_uninit_mmu_context(vcpu); nested_svm_uninit_mmu_context(vcpu);
vmcb_mark_all_dirty(svm->vmcb); vmcb_mark_all_dirty(svm->vmcb);
@ -1298,12 +1308,17 @@ static int svm_set_nested_state(struct kvm_vcpu *vcpu,
* L2 registers if needed are moved from the current VMCB to VMCB02. * L2 registers if needed are moved from the current VMCB to VMCB02.
*/ */
if (is_guest_mode(vcpu))
svm_leave_nested(svm);
else
svm->nested.vmcb02.ptr->save = svm->vmcb01.ptr->save;
svm_set_gif(svm, !!(kvm_state->flags & KVM_STATE_NESTED_GIF_SET));
svm->nested.nested_run_pending = svm->nested.nested_run_pending =
!!(kvm_state->flags & KVM_STATE_NESTED_RUN_PENDING); !!(kvm_state->flags & KVM_STATE_NESTED_RUN_PENDING);
svm->nested.vmcb12_gpa = kvm_state->hdr.svm.vmcb_pa; svm->nested.vmcb12_gpa = kvm_state->hdr.svm.vmcb_pa;
if (svm->current_vmcb == &svm->vmcb01)
svm->nested.vmcb02.ptr->save = svm->vmcb01.ptr->save;
svm->vmcb01.ptr->save.es = save->es; svm->vmcb01.ptr->save.es = save->es;
svm->vmcb01.ptr->save.cs = save->cs; svm->vmcb01.ptr->save.cs = save->cs;

View file

@ -763,7 +763,7 @@ static int __sev_dbg_decrypt(struct kvm *kvm, unsigned long src_paddr,
} }
static int __sev_dbg_decrypt_user(struct kvm *kvm, unsigned long paddr, static int __sev_dbg_decrypt_user(struct kvm *kvm, unsigned long paddr,
unsigned long __user dst_uaddr, void __user *dst_uaddr,
unsigned long dst_paddr, unsigned long dst_paddr,
int size, int *err) int size, int *err)
{ {
@ -787,8 +787,7 @@ static int __sev_dbg_decrypt_user(struct kvm *kvm, unsigned long paddr,
if (tpage) { if (tpage) {
offset = paddr & 15; offset = paddr & 15;
if (copy_to_user((void __user *)(uintptr_t)dst_uaddr, if (copy_to_user(dst_uaddr, page_address(tpage) + offset, size))
page_address(tpage) + offset, size))
ret = -EFAULT; ret = -EFAULT;
} }
@ -800,9 +799,9 @@ e_free:
} }
static int __sev_dbg_encrypt_user(struct kvm *kvm, unsigned long paddr, static int __sev_dbg_encrypt_user(struct kvm *kvm, unsigned long paddr,
unsigned long __user vaddr, void __user *vaddr,
unsigned long dst_paddr, unsigned long dst_paddr,
unsigned long __user dst_vaddr, void __user *dst_vaddr,
int size, int *error) int size, int *error)
{ {
struct page *src_tpage = NULL; struct page *src_tpage = NULL;
@ -810,13 +809,12 @@ static int __sev_dbg_encrypt_user(struct kvm *kvm, unsigned long paddr,
int ret, len = size; int ret, len = size;
/* If source buffer is not aligned then use an intermediate buffer */ /* If source buffer is not aligned then use an intermediate buffer */
if (!IS_ALIGNED(vaddr, 16)) { if (!IS_ALIGNED((unsigned long)vaddr, 16)) {
src_tpage = alloc_page(GFP_KERNEL); src_tpage = alloc_page(GFP_KERNEL);
if (!src_tpage) if (!src_tpage)
return -ENOMEM; return -ENOMEM;
if (copy_from_user(page_address(src_tpage), if (copy_from_user(page_address(src_tpage), vaddr, size)) {
(void __user *)(uintptr_t)vaddr, size)) {
__free_page(src_tpage); __free_page(src_tpage);
return -EFAULT; return -EFAULT;
} }
@ -830,7 +828,7 @@ static int __sev_dbg_encrypt_user(struct kvm *kvm, unsigned long paddr,
* - copy the source buffer in an intermediate buffer * - copy the source buffer in an intermediate buffer
* - use the intermediate buffer as source buffer * - use the intermediate buffer as source buffer
*/ */
if (!IS_ALIGNED(dst_vaddr, 16) || !IS_ALIGNED(size, 16)) { if (!IS_ALIGNED((unsigned long)dst_vaddr, 16) || !IS_ALIGNED(size, 16)) {
int dst_offset; int dst_offset;
dst_tpage = alloc_page(GFP_KERNEL); dst_tpage = alloc_page(GFP_KERNEL);
@ -855,7 +853,7 @@ static int __sev_dbg_encrypt_user(struct kvm *kvm, unsigned long paddr,
page_address(src_tpage), size); page_address(src_tpage), size);
else { else {
if (copy_from_user(page_address(dst_tpage) + dst_offset, if (copy_from_user(page_address(dst_tpage) + dst_offset,
(void __user *)(uintptr_t)vaddr, size)) { vaddr, size)) {
ret = -EFAULT; ret = -EFAULT;
goto e_free; goto e_free;
} }
@ -935,15 +933,15 @@ static int sev_dbg_crypt(struct kvm *kvm, struct kvm_sev_cmd *argp, bool dec)
if (dec) if (dec)
ret = __sev_dbg_decrypt_user(kvm, ret = __sev_dbg_decrypt_user(kvm,
__sme_page_pa(src_p[0]) + s_off, __sme_page_pa(src_p[0]) + s_off,
dst_vaddr, (void __user *)dst_vaddr,
__sme_page_pa(dst_p[0]) + d_off, __sme_page_pa(dst_p[0]) + d_off,
len, &argp->error); len, &argp->error);
else else
ret = __sev_dbg_encrypt_user(kvm, ret = __sev_dbg_encrypt_user(kvm,
__sme_page_pa(src_p[0]) + s_off, __sme_page_pa(src_p[0]) + s_off,
vaddr, (void __user *)vaddr,
__sme_page_pa(dst_p[0]) + d_off, __sme_page_pa(dst_p[0]) + d_off,
dst_vaddr, (void __user *)dst_vaddr,
len, &argp->error); len, &argp->error);
sev_unpin_memory(kvm, src_p, n); sev_unpin_memory(kvm, src_p, n);
@ -1764,7 +1762,8 @@ e_mirror_unlock:
e_source_unlock: e_source_unlock:
mutex_unlock(&source_kvm->lock); mutex_unlock(&source_kvm->lock);
e_source_put: e_source_put:
fput(source_kvm_file); if (source_kvm_file)
fput(source_kvm_file);
return ret; return ret;
} }
@ -2198,7 +2197,7 @@ vmgexit_err:
return -EINVAL; return -EINVAL;
} }
static void pre_sev_es_run(struct vcpu_svm *svm) void sev_es_unmap_ghcb(struct vcpu_svm *svm)
{ {
if (!svm->ghcb) if (!svm->ghcb)
return; return;
@ -2234,9 +2233,6 @@ void pre_sev_run(struct vcpu_svm *svm, int cpu)
struct svm_cpu_data *sd = per_cpu(svm_data, cpu); struct svm_cpu_data *sd = per_cpu(svm_data, cpu);
int asid = sev_get_asid(svm->vcpu.kvm); int asid = sev_get_asid(svm->vcpu.kvm);
/* Perform any SEV-ES pre-run actions */
pre_sev_es_run(svm);
/* Assign the asid allocated with this SEV guest */ /* Assign the asid allocated with this SEV guest */
svm->asid = asid; svm->asid = asid;

View file

@ -212,7 +212,7 @@ DEFINE_PER_CPU(struct svm_cpu_data *, svm_data);
* RDTSCP and RDPID are not used in the kernel, specifically to allow KVM to * RDTSCP and RDPID are not used in the kernel, specifically to allow KVM to
* defer the restoration of TSC_AUX until the CPU returns to userspace. * defer the restoration of TSC_AUX until the CPU returns to userspace.
*/ */
#define TSC_AUX_URET_SLOT 0 static int tsc_aux_uret_slot __read_mostly = -1;
static const u32 msrpm_ranges[] = {0, 0xc0000000, 0xc0010000}; static const u32 msrpm_ranges[] = {0, 0xc0000000, 0xc0010000};
@ -447,6 +447,11 @@ static int has_svm(void)
return 0; return 0;
} }
if (pgtable_l5_enabled()) {
pr_info("KVM doesn't yet support 5-level paging on AMD SVM\n");
return 0;
}
return 1; return 1;
} }
@ -959,8 +964,7 @@ static __init int svm_hardware_setup(void)
kvm_tsc_scaling_ratio_frac_bits = 32; kvm_tsc_scaling_ratio_frac_bits = 32;
} }
if (boot_cpu_has(X86_FEATURE_RDTSCP)) tsc_aux_uret_slot = kvm_add_user_return_msr(MSR_TSC_AUX);
kvm_define_user_return_msr(TSC_AUX_URET_SLOT, MSR_TSC_AUX);
/* Check for pause filtering support */ /* Check for pause filtering support */
if (!boot_cpu_has(X86_FEATURE_PAUSEFILTER)) { if (!boot_cpu_has(X86_FEATURE_PAUSEFILTER)) {
@ -1100,7 +1104,9 @@ static u64 svm_write_l1_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
return svm->vmcb->control.tsc_offset; return svm->vmcb->control.tsc_offset;
} }
static void svm_check_invpcid(struct vcpu_svm *svm) /* Evaluate instruction intercepts that depend on guest CPUID features. */
static void svm_recalc_instruction_intercepts(struct kvm_vcpu *vcpu,
struct vcpu_svm *svm)
{ {
/* /*
* Intercept INVPCID if shadow paging is enabled to sync/free shadow * Intercept INVPCID if shadow paging is enabled to sync/free shadow
@ -1113,6 +1119,13 @@ static void svm_check_invpcid(struct vcpu_svm *svm)
else else
svm_clr_intercept(svm, INTERCEPT_INVPCID); svm_clr_intercept(svm, INTERCEPT_INVPCID);
} }
if (kvm_cpu_cap_has(X86_FEATURE_RDTSCP)) {
if (guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP))
svm_clr_intercept(svm, INTERCEPT_RDTSCP);
else
svm_set_intercept(svm, INTERCEPT_RDTSCP);
}
} }
static void init_vmcb(struct kvm_vcpu *vcpu) static void init_vmcb(struct kvm_vcpu *vcpu)
@ -1235,8 +1248,8 @@ static void init_vmcb(struct kvm_vcpu *vcpu)
svm->current_vmcb->asid_generation = 0; svm->current_vmcb->asid_generation = 0;
svm->asid = 0; svm->asid = 0;
svm->nested.vmcb12_gpa = 0; svm->nested.vmcb12_gpa = INVALID_GPA;
svm->nested.last_vmcb12_gpa = 0; svm->nested.last_vmcb12_gpa = INVALID_GPA;
vcpu->arch.hflags = 0; vcpu->arch.hflags = 0;
if (!kvm_pause_in_guest(vcpu->kvm)) { if (!kvm_pause_in_guest(vcpu->kvm)) {
@ -1248,7 +1261,7 @@ static void init_vmcb(struct kvm_vcpu *vcpu)
svm_clr_intercept(svm, INTERCEPT_PAUSE); svm_clr_intercept(svm, INTERCEPT_PAUSE);
} }
svm_check_invpcid(svm); svm_recalc_instruction_intercepts(vcpu, svm);
/* /*
* If the host supports V_SPEC_CTRL then disable the interception * If the host supports V_SPEC_CTRL then disable the interception
@ -1424,6 +1437,9 @@ static void svm_prepare_guest_switch(struct kvm_vcpu *vcpu)
struct vcpu_svm *svm = to_svm(vcpu); struct vcpu_svm *svm = to_svm(vcpu);
struct svm_cpu_data *sd = per_cpu(svm_data, vcpu->cpu); struct svm_cpu_data *sd = per_cpu(svm_data, vcpu->cpu);
if (sev_es_guest(vcpu->kvm))
sev_es_unmap_ghcb(svm);
if (svm->guest_state_loaded) if (svm->guest_state_loaded)
return; return;
@ -1445,8 +1461,8 @@ static void svm_prepare_guest_switch(struct kvm_vcpu *vcpu)
} }
} }
if (static_cpu_has(X86_FEATURE_RDTSCP)) if (likely(tsc_aux_uret_slot >= 0))
kvm_set_user_return_msr(TSC_AUX_URET_SLOT, svm->tsc_aux, -1ull); kvm_set_user_return_msr(tsc_aux_uret_slot, svm->tsc_aux, -1ull);
svm->guest_state_loaded = true; svm->guest_state_loaded = true;
} }
@ -2655,11 +2671,6 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
msr_info->data |= (u64)svm->sysenter_esp_hi << 32; msr_info->data |= (u64)svm->sysenter_esp_hi << 32;
break; break;
case MSR_TSC_AUX: case MSR_TSC_AUX:
if (!boot_cpu_has(X86_FEATURE_RDTSCP))
return 1;
if (!msr_info->host_initiated &&
!guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP))
return 1;
msr_info->data = svm->tsc_aux; msr_info->data = svm->tsc_aux;
break; break;
/* /*
@ -2876,30 +2887,13 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
svm->sysenter_esp_hi = guest_cpuid_is_intel(vcpu) ? (data >> 32) : 0; svm->sysenter_esp_hi = guest_cpuid_is_intel(vcpu) ? (data >> 32) : 0;
break; break;
case MSR_TSC_AUX: case MSR_TSC_AUX:
if (!boot_cpu_has(X86_FEATURE_RDTSCP))
return 1;
if (!msr->host_initiated &&
!guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP))
return 1;
/*
* Per Intel's SDM, bits 63:32 are reserved, but AMD's APM has
* incomplete and conflicting architectural behavior. Current
* AMD CPUs completely ignore bits 63:32, i.e. they aren't
* reserved and always read as zeros. Emulate AMD CPU behavior
* to avoid explosions if the vCPU is migrated from an AMD host
* to an Intel host.
*/
data = (u32)data;
/* /*
* TSC_AUX is usually changed only during boot and never read * TSC_AUX is usually changed only during boot and never read
* directly. Intercept TSC_AUX instead of exposing it to the * directly. Intercept TSC_AUX instead of exposing it to the
* guest via direct_access_msrs, and switch it via user return. * guest via direct_access_msrs, and switch it via user return.
*/ */
preempt_disable(); preempt_disable();
r = kvm_set_user_return_msr(TSC_AUX_URET_SLOT, data, -1ull); r = kvm_set_user_return_msr(tsc_aux_uret_slot, data, -1ull);
preempt_enable(); preempt_enable();
if (r) if (r)
return 1; return 1;
@ -3084,6 +3078,7 @@ static int (*const svm_exit_handlers[])(struct kvm_vcpu *vcpu) = {
[SVM_EXIT_STGI] = stgi_interception, [SVM_EXIT_STGI] = stgi_interception,
[SVM_EXIT_CLGI] = clgi_interception, [SVM_EXIT_CLGI] = clgi_interception,
[SVM_EXIT_SKINIT] = skinit_interception, [SVM_EXIT_SKINIT] = skinit_interception,
[SVM_EXIT_RDTSCP] = kvm_handle_invalid_op,
[SVM_EXIT_WBINVD] = kvm_emulate_wbinvd, [SVM_EXIT_WBINVD] = kvm_emulate_wbinvd,
[SVM_EXIT_MONITOR] = kvm_emulate_monitor, [SVM_EXIT_MONITOR] = kvm_emulate_monitor,
[SVM_EXIT_MWAIT] = kvm_emulate_mwait, [SVM_EXIT_MWAIT] = kvm_emulate_mwait,
@ -3972,8 +3967,7 @@ static void svm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
svm->nrips_enabled = kvm_cpu_cap_has(X86_FEATURE_NRIPS) && svm->nrips_enabled = kvm_cpu_cap_has(X86_FEATURE_NRIPS) &&
guest_cpuid_has(vcpu, X86_FEATURE_NRIPS); guest_cpuid_has(vcpu, X86_FEATURE_NRIPS);
/* Check again if INVPCID interception if required */ svm_recalc_instruction_intercepts(vcpu, svm);
svm_check_invpcid(svm);
/* For sev guests, the memory encryption bit is not reserved in CR3. */ /* For sev guests, the memory encryption bit is not reserved in CR3. */
if (sev_guest(vcpu->kvm)) { if (sev_guest(vcpu->kvm)) {

View file

@ -581,6 +581,7 @@ void sev_es_init_vmcb(struct vcpu_svm *svm);
void sev_es_create_vcpu(struct vcpu_svm *svm); void sev_es_create_vcpu(struct vcpu_svm *svm);
void sev_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector); void sev_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector);
void sev_es_prepare_guest_switch(struct vcpu_svm *svm, unsigned int cpu); void sev_es_prepare_guest_switch(struct vcpu_svm *svm, unsigned int cpu);
void sev_es_unmap_ghcb(struct vcpu_svm *svm);
/* vmenter.S */ /* vmenter.S */

View file

@ -398,6 +398,9 @@ static inline u64 vmx_supported_debugctl(void)
{ {
u64 debugctl = 0; u64 debugctl = 0;
if (boot_cpu_has(X86_FEATURE_BUS_LOCK_DETECT))
debugctl |= DEBUGCTLMSR_BUS_LOCK_DETECT;
if (vmx_get_perf_capabilities() & PMU_CAP_LBR_FMT) if (vmx_get_perf_capabilities() & PMU_CAP_LBR_FMT)
debugctl |= DEBUGCTLMSR_LBR_MASK; debugctl |= DEBUGCTLMSR_LBR_MASK;

View file

@ -3098,15 +3098,8 @@ static bool nested_get_evmcs_page(struct kvm_vcpu *vcpu)
nested_vmx_handle_enlightened_vmptrld(vcpu, false); nested_vmx_handle_enlightened_vmptrld(vcpu, false);
if (evmptrld_status == EVMPTRLD_VMFAIL || if (evmptrld_status == EVMPTRLD_VMFAIL ||
evmptrld_status == EVMPTRLD_ERROR) { evmptrld_status == EVMPTRLD_ERROR)
pr_debug_ratelimited("%s: enlightened vmptrld failed\n",
__func__);
vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
vcpu->run->internal.suberror =
KVM_INTERNAL_ERROR_EMULATION;
vcpu->run->internal.ndata = 0;
return false; return false;
}
} }
return true; return true;
@ -3194,8 +3187,16 @@ static bool nested_get_vmcs12_pages(struct kvm_vcpu *vcpu)
static bool vmx_get_nested_state_pages(struct kvm_vcpu *vcpu) static bool vmx_get_nested_state_pages(struct kvm_vcpu *vcpu)
{ {
if (!nested_get_evmcs_page(vcpu)) if (!nested_get_evmcs_page(vcpu)) {
pr_debug_ratelimited("%s: enlightened vmptrld failed\n",
__func__);
vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
vcpu->run->internal.suberror =
KVM_INTERNAL_ERROR_EMULATION;
vcpu->run->internal.ndata = 0;
return false; return false;
}
if (is_guest_mode(vcpu) && !nested_get_vmcs12_pages(vcpu)) if (is_guest_mode(vcpu) && !nested_get_vmcs12_pages(vcpu))
return false; return false;
@ -4435,7 +4436,15 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason,
/* Similarly, triple faults in L2 should never escape. */ /* Similarly, triple faults in L2 should never escape. */
WARN_ON_ONCE(kvm_check_request(KVM_REQ_TRIPLE_FAULT, vcpu)); WARN_ON_ONCE(kvm_check_request(KVM_REQ_TRIPLE_FAULT, vcpu));
kvm_clear_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu); if (kvm_check_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu)) {
/*
* KVM_REQ_GET_NESTED_STATE_PAGES is also used to map
* Enlightened VMCS after migration and we still need to
* do that when something is forcing L2->L1 exit prior to
* the first L2 run.
*/
(void)nested_get_evmcs_page(vcpu);
}
/* Service the TLB flush request for L2 before switching to L1. */ /* Service the TLB flush request for L2 before switching to L1. */
if (kvm_check_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu)) if (kvm_check_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu))

View file

@ -455,21 +455,6 @@ static inline void vmx_segment_cache_clear(struct vcpu_vmx *vmx)
static unsigned long host_idt_base; static unsigned long host_idt_base;
/*
* Though SYSCALL is only supported in 64-bit mode on Intel CPUs, kvm
* will emulate SYSCALL in legacy mode if the vendor string in guest
* CPUID.0:{EBX,ECX,EDX} is "AuthenticAMD" or "AMDisbetter!" To
* support this emulation, IA32_STAR must always be included in
* vmx_uret_msrs_list[], even in i386 builds.
*/
static const u32 vmx_uret_msrs_list[] = {
#ifdef CONFIG_X86_64
MSR_SYSCALL_MASK, MSR_LSTAR, MSR_CSTAR,
#endif
MSR_EFER, MSR_TSC_AUX, MSR_STAR,
MSR_IA32_TSX_CTRL,
};
#if IS_ENABLED(CONFIG_HYPERV) #if IS_ENABLED(CONFIG_HYPERV)
static bool __read_mostly enlightened_vmcs = true; static bool __read_mostly enlightened_vmcs = true;
module_param(enlightened_vmcs, bool, 0444); module_param(enlightened_vmcs, bool, 0444);
@ -697,21 +682,11 @@ static bool is_valid_passthrough_msr(u32 msr)
return r; return r;
} }
static inline int __vmx_find_uret_msr(struct vcpu_vmx *vmx, u32 msr)
{
int i;
for (i = 0; i < vmx->nr_uret_msrs; ++i)
if (vmx_uret_msrs_list[vmx->guest_uret_msrs[i].slot] == msr)
return i;
return -1;
}
struct vmx_uret_msr *vmx_find_uret_msr(struct vcpu_vmx *vmx, u32 msr) struct vmx_uret_msr *vmx_find_uret_msr(struct vcpu_vmx *vmx, u32 msr)
{ {
int i; int i;
i = __vmx_find_uret_msr(vmx, msr); i = kvm_find_user_return_msr(msr);
if (i >= 0) if (i >= 0)
return &vmx->guest_uret_msrs[i]; return &vmx->guest_uret_msrs[i];
return NULL; return NULL;
@ -720,13 +695,14 @@ struct vmx_uret_msr *vmx_find_uret_msr(struct vcpu_vmx *vmx, u32 msr)
static int vmx_set_guest_uret_msr(struct vcpu_vmx *vmx, static int vmx_set_guest_uret_msr(struct vcpu_vmx *vmx,
struct vmx_uret_msr *msr, u64 data) struct vmx_uret_msr *msr, u64 data)
{ {
unsigned int slot = msr - vmx->guest_uret_msrs;
int ret = 0; int ret = 0;
u64 old_msr_data = msr->data; u64 old_msr_data = msr->data;
msr->data = data; msr->data = data;
if (msr - vmx->guest_uret_msrs < vmx->nr_active_uret_msrs) { if (msr->load_into_hardware) {
preempt_disable(); preempt_disable();
ret = kvm_set_user_return_msr(msr->slot, msr->data, msr->mask); ret = kvm_set_user_return_msr(slot, msr->data, msr->mask);
preempt_enable(); preempt_enable();
if (ret) if (ret)
msr->data = old_msr_data; msr->data = old_msr_data;
@ -1078,7 +1054,7 @@ static bool update_transition_efer(struct vcpu_vmx *vmx)
return false; return false;
} }
i = __vmx_find_uret_msr(vmx, MSR_EFER); i = kvm_find_user_return_msr(MSR_EFER);
if (i < 0) if (i < 0)
return false; return false;
@ -1240,11 +1216,14 @@ void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu)
*/ */
if (!vmx->guest_uret_msrs_loaded) { if (!vmx->guest_uret_msrs_loaded) {
vmx->guest_uret_msrs_loaded = true; vmx->guest_uret_msrs_loaded = true;
for (i = 0; i < vmx->nr_active_uret_msrs; ++i) for (i = 0; i < kvm_nr_uret_msrs; ++i) {
kvm_set_user_return_msr(vmx->guest_uret_msrs[i].slot, if (!vmx->guest_uret_msrs[i].load_into_hardware)
continue;
kvm_set_user_return_msr(i,
vmx->guest_uret_msrs[i].data, vmx->guest_uret_msrs[i].data,
vmx->guest_uret_msrs[i].mask); vmx->guest_uret_msrs[i].mask);
}
} }
if (vmx->nested.need_vmcs12_to_shadow_sync) if (vmx->nested.need_vmcs12_to_shadow_sync)
@ -1751,19 +1730,16 @@ static void vmx_queue_exception(struct kvm_vcpu *vcpu)
vmx_clear_hlt(vcpu); vmx_clear_hlt(vcpu);
} }
static void vmx_setup_uret_msr(struct vcpu_vmx *vmx, unsigned int msr) static void vmx_setup_uret_msr(struct vcpu_vmx *vmx, unsigned int msr,
bool load_into_hardware)
{ {
struct vmx_uret_msr tmp; struct vmx_uret_msr *uret_msr;
int from, to;
from = __vmx_find_uret_msr(vmx, msr); uret_msr = vmx_find_uret_msr(vmx, msr);
if (from < 0) if (!uret_msr)
return; return;
to = vmx->nr_active_uret_msrs++;
tmp = vmx->guest_uret_msrs[to]; uret_msr->load_into_hardware = load_into_hardware;
vmx->guest_uret_msrs[to] = vmx->guest_uret_msrs[from];
vmx->guest_uret_msrs[from] = tmp;
} }
/* /*
@ -1773,29 +1749,42 @@ static void vmx_setup_uret_msr(struct vcpu_vmx *vmx, unsigned int msr)
*/ */
static void setup_msrs(struct vcpu_vmx *vmx) static void setup_msrs(struct vcpu_vmx *vmx)
{ {
vmx->guest_uret_msrs_loaded = false;
vmx->nr_active_uret_msrs = 0;
#ifdef CONFIG_X86_64 #ifdef CONFIG_X86_64
bool load_syscall_msrs;
/* /*
* The SYSCALL MSRs are only needed on long mode guests, and only * The SYSCALL MSRs are only needed on long mode guests, and only
* when EFER.SCE is set. * when EFER.SCE is set.
*/ */
if (is_long_mode(&vmx->vcpu) && (vmx->vcpu.arch.efer & EFER_SCE)) { load_syscall_msrs = is_long_mode(&vmx->vcpu) &&
vmx_setup_uret_msr(vmx, MSR_STAR); (vmx->vcpu.arch.efer & EFER_SCE);
vmx_setup_uret_msr(vmx, MSR_LSTAR);
vmx_setup_uret_msr(vmx, MSR_SYSCALL_MASK); vmx_setup_uret_msr(vmx, MSR_STAR, load_syscall_msrs);
} vmx_setup_uret_msr(vmx, MSR_LSTAR, load_syscall_msrs);
vmx_setup_uret_msr(vmx, MSR_SYSCALL_MASK, load_syscall_msrs);
#endif #endif
if (update_transition_efer(vmx)) vmx_setup_uret_msr(vmx, MSR_EFER, update_transition_efer(vmx));
vmx_setup_uret_msr(vmx, MSR_EFER);
if (guest_cpuid_has(&vmx->vcpu, X86_FEATURE_RDTSCP)) vmx_setup_uret_msr(vmx, MSR_TSC_AUX,
vmx_setup_uret_msr(vmx, MSR_TSC_AUX); guest_cpuid_has(&vmx->vcpu, X86_FEATURE_RDTSCP) ||
guest_cpuid_has(&vmx->vcpu, X86_FEATURE_RDPID));
vmx_setup_uret_msr(vmx, MSR_IA32_TSX_CTRL); /*
* hle=0, rtm=0, tsx_ctrl=1 can be found with some combinations of new
* kernel and old userspace. If those guests run on a tsx=off host, do
* allow guests to use TSX_CTRL, but don't change the value in hardware
* so that TSX remains always disabled.
*/
vmx_setup_uret_msr(vmx, MSR_IA32_TSX_CTRL, boot_cpu_has(X86_FEATURE_RTM));
if (cpu_has_vmx_msr_bitmap()) if (cpu_has_vmx_msr_bitmap())
vmx_update_msr_bitmap(&vmx->vcpu); vmx_update_msr_bitmap(&vmx->vcpu);
/*
* The set of MSRs to load may have changed, reload MSRs before the
* next VM-Enter.
*/
vmx->guest_uret_msrs_loaded = false;
} }
static u64 vmx_write_l1_tsc_offset(struct kvm_vcpu *vcpu, u64 offset) static u64 vmx_write_l1_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
@ -1993,11 +1982,6 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
else else
msr_info->data = vmx->pt_desc.guest.addr_a[index / 2]; msr_info->data = vmx->pt_desc.guest.addr_a[index / 2];
break; break;
case MSR_TSC_AUX:
if (!msr_info->host_initiated &&
!guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP))
return 1;
goto find_uret_msr;
case MSR_IA32_DEBUGCTLMSR: case MSR_IA32_DEBUGCTLMSR:
msr_info->data = vmcs_read64(GUEST_IA32_DEBUGCTL); msr_info->data = vmcs_read64(GUEST_IA32_DEBUGCTL);
break; break;
@ -2031,6 +2015,9 @@ static u64 vcpu_supported_debugctl(struct kvm_vcpu *vcpu)
if (!intel_pmu_lbr_is_enabled(vcpu)) if (!intel_pmu_lbr_is_enabled(vcpu))
debugctl &= ~DEBUGCTLMSR_LBR_MASK; debugctl &= ~DEBUGCTLMSR_LBR_MASK;
if (!guest_cpuid_has(vcpu, X86_FEATURE_BUS_LOCK_DETECT))
debugctl &= ~DEBUGCTLMSR_BUS_LOCK_DETECT;
return debugctl; return debugctl;
} }
@ -2313,14 +2300,6 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
else else
vmx->pt_desc.guest.addr_a[index / 2] = data; vmx->pt_desc.guest.addr_a[index / 2] = data;
break; break;
case MSR_TSC_AUX:
if (!msr_info->host_initiated &&
!guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP))
return 1;
/* Check reserved bit, higher 32 bits should be zero */
if ((data >> 32) != 0)
return 1;
goto find_uret_msr;
case MSR_IA32_PERF_CAPABILITIES: case MSR_IA32_PERF_CAPABILITIES:
if (data && !vcpu_to_pmu(vcpu)->version) if (data && !vcpu_to_pmu(vcpu)->version)
return 1; return 1;
@ -4369,7 +4348,23 @@ static void vmx_compute_secondary_exec_control(struct vcpu_vmx *vmx)
xsaves_enabled, false); xsaves_enabled, false);
} }
vmx_adjust_sec_exec_feature(vmx, &exec_control, rdtscp, RDTSCP); /*
* RDPID is also gated by ENABLE_RDTSCP, turn on the control if either
* feature is exposed to the guest. This creates a virtualization hole
* if both are supported in hardware but only one is exposed to the
* guest, but letting the guest execute RDTSCP or RDPID when either one
* is advertised is preferable to emulating the advertised instruction
* in KVM on #UD, and obviously better than incorrectly injecting #UD.
*/
if (cpu_has_vmx_rdtscp()) {
bool rdpid_or_rdtscp_enabled =
guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP) ||
guest_cpuid_has(vcpu, X86_FEATURE_RDPID);
vmx_adjust_secondary_exec_control(vmx, &exec_control,
SECONDARY_EXEC_ENABLE_RDTSCP,
rdpid_or_rdtscp_enabled, false);
}
vmx_adjust_sec_exec_feature(vmx, &exec_control, invpcid, INVPCID); vmx_adjust_sec_exec_feature(vmx, &exec_control, invpcid, INVPCID);
vmx_adjust_sec_exec_exiting(vmx, &exec_control, rdrand, RDRAND); vmx_adjust_sec_exec_exiting(vmx, &exec_control, rdrand, RDRAND);
@ -6855,6 +6850,7 @@ static void vmx_free_vcpu(struct kvm_vcpu *vcpu)
static int vmx_create_vcpu(struct kvm_vcpu *vcpu) static int vmx_create_vcpu(struct kvm_vcpu *vcpu)
{ {
struct vmx_uret_msr *tsx_ctrl;
struct vcpu_vmx *vmx; struct vcpu_vmx *vmx;
int i, cpu, err; int i, cpu, err;
@ -6877,43 +6873,19 @@ static int vmx_create_vcpu(struct kvm_vcpu *vcpu)
goto free_vpid; goto free_vpid;
} }
BUILD_BUG_ON(ARRAY_SIZE(vmx_uret_msrs_list) != MAX_NR_USER_RETURN_MSRS); for (i = 0; i < kvm_nr_uret_msrs; ++i) {
vmx->guest_uret_msrs[i].data = 0;
for (i = 0; i < ARRAY_SIZE(vmx_uret_msrs_list); ++i) { vmx->guest_uret_msrs[i].mask = -1ull;
u32 index = vmx_uret_msrs_list[i]; }
u32 data_low, data_high; if (boot_cpu_has(X86_FEATURE_RTM)) {
int j = vmx->nr_uret_msrs; /*
* TSX_CTRL_CPUID_CLEAR is handled in the CPUID interception.
if (rdmsr_safe(index, &data_low, &data_high) < 0) * Keep the host value unchanged to avoid changing CPUID bits
continue; * under the host kernel's feet.
if (wrmsr_safe(index, data_low, data_high) < 0) */
continue; tsx_ctrl = vmx_find_uret_msr(vmx, MSR_IA32_TSX_CTRL);
if (tsx_ctrl)
vmx->guest_uret_msrs[j].slot = i; vmx->guest_uret_msrs[i].mask = ~(u64)TSX_CTRL_CPUID_CLEAR;
vmx->guest_uret_msrs[j].data = 0;
switch (index) {
case MSR_IA32_TSX_CTRL:
/*
* TSX_CTRL_CPUID_CLEAR is handled in the CPUID
* interception. Keep the host value unchanged to avoid
* changing CPUID bits under the host kernel's feet.
*
* hle=0, rtm=0, tsx_ctrl=1 can be found with some
* combinations of new kernel and old userspace. If
* those guests run on a tsx=off host, do allow guests
* to use TSX_CTRL, but do not change the value on the
* host so that TSX remains always disabled.
*/
if (boot_cpu_has(X86_FEATURE_RTM))
vmx->guest_uret_msrs[j].mask = ~(u64)TSX_CTRL_CPUID_CLEAR;
else
vmx->guest_uret_msrs[j].mask = 0;
break;
default:
vmx->guest_uret_msrs[j].mask = -1ull;
break;
}
++vmx->nr_uret_msrs;
} }
err = alloc_loaded_vmcs(&vmx->vmcs01); err = alloc_loaded_vmcs(&vmx->vmcs01);
@ -7344,9 +7316,11 @@ static __init void vmx_set_cpu_caps(void)
if (!cpu_has_vmx_xsaves()) if (!cpu_has_vmx_xsaves())
kvm_cpu_cap_clear(X86_FEATURE_XSAVES); kvm_cpu_cap_clear(X86_FEATURE_XSAVES);
/* CPUID 0x80000001 */ /* CPUID 0x80000001 and 0x7 (RDPID) */
if (!cpu_has_vmx_rdtscp()) if (!cpu_has_vmx_rdtscp()) {
kvm_cpu_cap_clear(X86_FEATURE_RDTSCP); kvm_cpu_cap_clear(X86_FEATURE_RDTSCP);
kvm_cpu_cap_clear(X86_FEATURE_RDPID);
}
if (cpu_has_vmx_waitpkg()) if (cpu_has_vmx_waitpkg())
kvm_cpu_cap_check_and_set(X86_FEATURE_WAITPKG); kvm_cpu_cap_check_and_set(X86_FEATURE_WAITPKG);
@ -7402,8 +7376,9 @@ static int vmx_check_intercept(struct kvm_vcpu *vcpu,
/* /*
* RDPID causes #UD if disabled through secondary execution controls. * RDPID causes #UD if disabled through secondary execution controls.
* Because it is marked as EmulateOnUD, we need to intercept it here. * Because it is marked as EmulateOnUD, we need to intercept it here.
* Note, RDPID is hidden behind ENABLE_RDTSCP.
*/ */
case x86_intercept_rdtscp: case x86_intercept_rdpid:
if (!nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_RDTSCP)) { if (!nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_RDTSCP)) {
exception->vector = UD_VECTOR; exception->vector = UD_VECTOR;
exception->error_code_valid = false; exception->error_code_valid = false;
@ -7769,17 +7744,42 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = {
.vcpu_deliver_sipi_vector = kvm_vcpu_deliver_sipi_vector, .vcpu_deliver_sipi_vector = kvm_vcpu_deliver_sipi_vector,
}; };
static __init void vmx_setup_user_return_msrs(void)
{
/*
* Though SYSCALL is only supported in 64-bit mode on Intel CPUs, kvm
* will emulate SYSCALL in legacy mode if the vendor string in guest
* CPUID.0:{EBX,ECX,EDX} is "AuthenticAMD" or "AMDisbetter!" To
* support this emulation, MSR_STAR is included in the list for i386,
* but is never loaded into hardware. MSR_CSTAR is also never loaded
* into hardware and is here purely for emulation purposes.
*/
const u32 vmx_uret_msrs_list[] = {
#ifdef CONFIG_X86_64
MSR_SYSCALL_MASK, MSR_LSTAR, MSR_CSTAR,
#endif
MSR_EFER, MSR_TSC_AUX, MSR_STAR,
MSR_IA32_TSX_CTRL,
};
int i;
BUILD_BUG_ON(ARRAY_SIZE(vmx_uret_msrs_list) != MAX_NR_USER_RETURN_MSRS);
for (i = 0; i < ARRAY_SIZE(vmx_uret_msrs_list); ++i)
kvm_add_user_return_msr(vmx_uret_msrs_list[i]);
}
static __init int hardware_setup(void) static __init int hardware_setup(void)
{ {
unsigned long host_bndcfgs; unsigned long host_bndcfgs;
struct desc_ptr dt; struct desc_ptr dt;
int r, i, ept_lpage_level; int r, ept_lpage_level;
store_idt(&dt); store_idt(&dt);
host_idt_base = dt.address; host_idt_base = dt.address;
for (i = 0; i < ARRAY_SIZE(vmx_uret_msrs_list); ++i) vmx_setup_user_return_msrs();
kvm_define_user_return_msr(i, vmx_uret_msrs_list[i]);
if (setup_vmcs_config(&vmcs_config, &vmx_capability) < 0) if (setup_vmcs_config(&vmcs_config, &vmx_capability) < 0)
return -EIO; return -EIO;

View file

@ -36,7 +36,7 @@ struct vmx_msrs {
}; };
struct vmx_uret_msr { struct vmx_uret_msr {
unsigned int slot; /* The MSR's slot in kvm_user_return_msrs. */ bool load_into_hardware;
u64 data; u64 data;
u64 mask; u64 mask;
}; };
@ -245,8 +245,16 @@ struct vcpu_vmx {
u32 idt_vectoring_info; u32 idt_vectoring_info;
ulong rflags; ulong rflags;
/*
* User return MSRs are always emulated when enabled in the guest, but
* only loaded into hardware when necessary, e.g. SYSCALL #UDs outside
* of 64-bit mode or if EFER.SCE=1, thus the SYSCALL MSRs don't need to
* be loaded into hardware if those conditions aren't met.
* nr_active_uret_msrs tracks the number of MSRs that need to be loaded
* into hardware when running the guest. guest_uret_msrs[] is resorted
* whenever the number of "active" uret MSRs is modified.
*/
struct vmx_uret_msr guest_uret_msrs[MAX_NR_USER_RETURN_MSRS]; struct vmx_uret_msr guest_uret_msrs[MAX_NR_USER_RETURN_MSRS];
int nr_uret_msrs;
int nr_active_uret_msrs; int nr_active_uret_msrs;
bool guest_uret_msrs_loaded; bool guest_uret_msrs_loaded;
#ifdef CONFIG_X86_64 #ifdef CONFIG_X86_64

View file

@ -184,11 +184,6 @@ module_param(pi_inject_timer, bint, S_IRUGO | S_IWUSR);
*/ */
#define KVM_MAX_NR_USER_RETURN_MSRS 16 #define KVM_MAX_NR_USER_RETURN_MSRS 16
struct kvm_user_return_msrs_global {
int nr;
u32 msrs[KVM_MAX_NR_USER_RETURN_MSRS];
};
struct kvm_user_return_msrs { struct kvm_user_return_msrs {
struct user_return_notifier urn; struct user_return_notifier urn;
bool registered; bool registered;
@ -198,7 +193,9 @@ struct kvm_user_return_msrs {
} values[KVM_MAX_NR_USER_RETURN_MSRS]; } values[KVM_MAX_NR_USER_RETURN_MSRS];
}; };
static struct kvm_user_return_msrs_global __read_mostly user_return_msrs_global; u32 __read_mostly kvm_nr_uret_msrs;
EXPORT_SYMBOL_GPL(kvm_nr_uret_msrs);
static u32 __read_mostly kvm_uret_msrs_list[KVM_MAX_NR_USER_RETURN_MSRS];
static struct kvm_user_return_msrs __percpu *user_return_msrs; static struct kvm_user_return_msrs __percpu *user_return_msrs;
#define KVM_SUPPORTED_XCR0 (XFEATURE_MASK_FP | XFEATURE_MASK_SSE \ #define KVM_SUPPORTED_XCR0 (XFEATURE_MASK_FP | XFEATURE_MASK_SSE \
@ -330,23 +327,53 @@ static void kvm_on_user_return(struct user_return_notifier *urn)
user_return_notifier_unregister(urn); user_return_notifier_unregister(urn);
} }
local_irq_restore(flags); local_irq_restore(flags);
for (slot = 0; slot < user_return_msrs_global.nr; ++slot) { for (slot = 0; slot < kvm_nr_uret_msrs; ++slot) {
values = &msrs->values[slot]; values = &msrs->values[slot];
if (values->host != values->curr) { if (values->host != values->curr) {
wrmsrl(user_return_msrs_global.msrs[slot], values->host); wrmsrl(kvm_uret_msrs_list[slot], values->host);
values->curr = values->host; values->curr = values->host;
} }
} }
} }
void kvm_define_user_return_msr(unsigned slot, u32 msr) static int kvm_probe_user_return_msr(u32 msr)
{ {
BUG_ON(slot >= KVM_MAX_NR_USER_RETURN_MSRS); u64 val;
user_return_msrs_global.msrs[slot] = msr; int ret;
if (slot >= user_return_msrs_global.nr)
user_return_msrs_global.nr = slot + 1; preempt_disable();
ret = rdmsrl_safe(msr, &val);
if (ret)
goto out;
ret = wrmsrl_safe(msr, val);
out:
preempt_enable();
return ret;
} }
EXPORT_SYMBOL_GPL(kvm_define_user_return_msr);
int kvm_add_user_return_msr(u32 msr)
{
BUG_ON(kvm_nr_uret_msrs >= KVM_MAX_NR_USER_RETURN_MSRS);
if (kvm_probe_user_return_msr(msr))
return -1;
kvm_uret_msrs_list[kvm_nr_uret_msrs] = msr;
return kvm_nr_uret_msrs++;
}
EXPORT_SYMBOL_GPL(kvm_add_user_return_msr);
int kvm_find_user_return_msr(u32 msr)
{
int i;
for (i = 0; i < kvm_nr_uret_msrs; ++i) {
if (kvm_uret_msrs_list[i] == msr)
return i;
}
return -1;
}
EXPORT_SYMBOL_GPL(kvm_find_user_return_msr);
static void kvm_user_return_msr_cpu_online(void) static void kvm_user_return_msr_cpu_online(void)
{ {
@ -355,8 +382,8 @@ static void kvm_user_return_msr_cpu_online(void)
u64 value; u64 value;
int i; int i;
for (i = 0; i < user_return_msrs_global.nr; ++i) { for (i = 0; i < kvm_nr_uret_msrs; ++i) {
rdmsrl_safe(user_return_msrs_global.msrs[i], &value); rdmsrl_safe(kvm_uret_msrs_list[i], &value);
msrs->values[i].host = value; msrs->values[i].host = value;
msrs->values[i].curr = value; msrs->values[i].curr = value;
} }
@ -371,7 +398,7 @@ int kvm_set_user_return_msr(unsigned slot, u64 value, u64 mask)
value = (value & mask) | (msrs->values[slot].host & ~mask); value = (value & mask) | (msrs->values[slot].host & ~mask);
if (value == msrs->values[slot].curr) if (value == msrs->values[slot].curr)
return 0; return 0;
err = wrmsrl_safe(user_return_msrs_global.msrs[slot], value); err = wrmsrl_safe(kvm_uret_msrs_list[slot], value);
if (err) if (err)
return 1; return 1;
@ -1149,6 +1176,9 @@ static u64 kvm_dr6_fixed(struct kvm_vcpu *vcpu)
if (!guest_cpuid_has(vcpu, X86_FEATURE_RTM)) if (!guest_cpuid_has(vcpu, X86_FEATURE_RTM))
fixed |= DR6_RTM; fixed |= DR6_RTM;
if (!guest_cpuid_has(vcpu, X86_FEATURE_BUS_LOCK_DETECT))
fixed |= DR6_BUS_LOCK;
return fixed; return fixed;
} }
@ -1615,6 +1645,30 @@ static int __kvm_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 data,
* invokes 64-bit SYSENTER. * invokes 64-bit SYSENTER.
*/ */
data = get_canonical(data, vcpu_virt_addr_bits(vcpu)); data = get_canonical(data, vcpu_virt_addr_bits(vcpu));
break;
case MSR_TSC_AUX:
if (!kvm_is_supported_user_return_msr(MSR_TSC_AUX))
return 1;
if (!host_initiated &&
!guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP) &&
!guest_cpuid_has(vcpu, X86_FEATURE_RDPID))
return 1;
/*
* Per Intel's SDM, bits 63:32 are reserved, but AMD's APM has
* incomplete and conflicting architectural behavior. Current
* AMD CPUs completely ignore bits 63:32, i.e. they aren't
* reserved and always read as zeros. Enforce Intel's reserved
* bits check if and only if the guest CPU is Intel, and clear
* the bits in all other cases. This ensures cross-vendor
* migration will provide consistent behavior for the guest.
*/
if (guest_cpuid_is_intel(vcpu) && (data >> 32) != 0)
return 1;
data = (u32)data;
break;
} }
msr.data = data; msr.data = data;
@ -1651,6 +1705,18 @@ int __kvm_get_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data,
if (!host_initiated && !kvm_msr_allowed(vcpu, index, KVM_MSR_FILTER_READ)) if (!host_initiated && !kvm_msr_allowed(vcpu, index, KVM_MSR_FILTER_READ))
return KVM_MSR_RET_FILTERED; return KVM_MSR_RET_FILTERED;
switch (index) {
case MSR_TSC_AUX:
if (!kvm_is_supported_user_return_msr(MSR_TSC_AUX))
return 1;
if (!host_initiated &&
!guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP) &&
!guest_cpuid_has(vcpu, X86_FEATURE_RDPID))
return 1;
break;
}
msr.index = index; msr.index = index;
msr.host_initiated = host_initiated; msr.host_initiated = host_initiated;
@ -5468,14 +5534,18 @@ static void kvm_free_msr_filter(struct kvm_x86_msr_filter *msr_filter)
static int kvm_add_msr_filter(struct kvm_x86_msr_filter *msr_filter, static int kvm_add_msr_filter(struct kvm_x86_msr_filter *msr_filter,
struct kvm_msr_filter_range *user_range) struct kvm_msr_filter_range *user_range)
{ {
struct msr_bitmap_range range;
unsigned long *bitmap = NULL; unsigned long *bitmap = NULL;
size_t bitmap_size; size_t bitmap_size;
int r;
if (!user_range->nmsrs) if (!user_range->nmsrs)
return 0; return 0;
if (user_range->flags & ~(KVM_MSR_FILTER_READ | KVM_MSR_FILTER_WRITE))
return -EINVAL;
if (!user_range->flags)
return -EINVAL;
bitmap_size = BITS_TO_LONGS(user_range->nmsrs) * sizeof(long); bitmap_size = BITS_TO_LONGS(user_range->nmsrs) * sizeof(long);
if (!bitmap_size || bitmap_size > KVM_MSR_FILTER_MAX_BITMAP_SIZE) if (!bitmap_size || bitmap_size > KVM_MSR_FILTER_MAX_BITMAP_SIZE)
return -EINVAL; return -EINVAL;
@ -5484,31 +5554,15 @@ static int kvm_add_msr_filter(struct kvm_x86_msr_filter *msr_filter,
if (IS_ERR(bitmap)) if (IS_ERR(bitmap))
return PTR_ERR(bitmap); return PTR_ERR(bitmap);
range = (struct msr_bitmap_range) { msr_filter->ranges[msr_filter->count] = (struct msr_bitmap_range) {
.flags = user_range->flags, .flags = user_range->flags,
.base = user_range->base, .base = user_range->base,
.nmsrs = user_range->nmsrs, .nmsrs = user_range->nmsrs,
.bitmap = bitmap, .bitmap = bitmap,
}; };
if (range.flags & ~(KVM_MSR_FILTER_READ | KVM_MSR_FILTER_WRITE)) {
r = -EINVAL;
goto err;
}
if (!range.flags) {
r = -EINVAL;
goto err;
}
/* Everything ok, add this range identifier. */
msr_filter->ranges[msr_filter->count] = range;
msr_filter->count++; msr_filter->count++;
return 0; return 0;
err:
kfree(bitmap);
return r;
} }
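Because the range flags are now validated before the bitmap is copied in, the error-unwind path disappears. Below is a hedged userspace sketch of the calling side, assuming a vm_fd obtained from KVM_CREATE_VM; the helper name, MSR base and all-zero bitmap are arbitrary examples, not part of the patch.

    /*
     * Userspace sketch: a range with flags == 0 or with unknown flag bits
     * is now rejected early with -EINVAL.
     */
    #include <linux/kvm.h>
    #include <sys/ioctl.h>

    static int set_example_msr_filter(int vm_fd)
    {
    	static __u8 bitmap[8];	/* one bit per MSR; 0 = deny/bounce to userspace */
    	struct kvm_msr_filter filter = {
    		.flags = KVM_MSR_FILTER_DEFAULT_ALLOW,
    	};

    	filter.ranges[0] = (struct kvm_msr_filter_range) {
    		.flags  = KVM_MSR_FILTER_READ | KVM_MSR_FILTER_WRITE,
    		.base   = 0xc0000100,	/* purely illustrative MSR range */
    		.nmsrs  = 3,
    		.bitmap = bitmap,
    	};

    	return ioctl(vm_fd, KVM_X86_SET_MSR_FILTER, &filter);
    }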
static int kvm_vm_ioctl_set_msr_filter(struct kvm *kvm, void __user *argp) static int kvm_vm_ioctl_set_msr_filter(struct kvm *kvm, void __user *argp)
@ -5937,7 +5991,8 @@ static void kvm_init_msr_list(void)
continue; continue;
break; break;
case MSR_TSC_AUX: case MSR_TSC_AUX:
if (!kvm_cpu_cap_has(X86_FEATURE_RDTSCP)) if (!kvm_cpu_cap_has(X86_FEATURE_RDTSCP) &&
!kvm_cpu_cap_has(X86_FEATURE_RDPID))
continue; continue;
break; break;
case MSR_IA32_UMWAIT_CONTROL: case MSR_IA32_UMWAIT_CONTROL:
@ -8039,6 +8094,18 @@ static void pvclock_gtod_update_fn(struct work_struct *work)
static DECLARE_WORK(pvclock_gtod_work, pvclock_gtod_update_fn); static DECLARE_WORK(pvclock_gtod_work, pvclock_gtod_update_fn);
/*
* Indirection to move queue_work() out of the tk_core.seq write held
* region to prevent possible deadlocks against time accessors which
* are invoked with work related locks held.
*/
static void pvclock_irq_work_fn(struct irq_work *w)
{
queue_work(system_long_wq, &pvclock_gtod_work);
}
static DEFINE_IRQ_WORK(pvclock_irq_work, pvclock_irq_work_fn);
/* /*
* Notification about pvclock gtod data update. * Notification about pvclock gtod data update.
*/ */
@ -8050,13 +8117,14 @@ static int pvclock_gtod_notify(struct notifier_block *nb, unsigned long unused,
update_pvclock_gtod(tk); update_pvclock_gtod(tk);
/* disable master clock if host does not trust, or does not
 * use, TSC based clocksource.
 */
/*
 * Disable master clock if host does not trust, or does not use,
 * TSC based clocksource. Delegate queue_work() to irq_work as
 * this is invoked with tk_core.seq write held.
 */
if (!gtod_is_based_on_tsc(gtod->clock.vclock_mode) && if (!gtod_is_based_on_tsc(gtod->clock.vclock_mode) &&
atomic_read(&kvm_guest_has_master_clock) != 0) atomic_read(&kvm_guest_has_master_clock) != 0)
queue_work(system_long_wq, &pvclock_gtod_work); irq_work_queue(&pvclock_irq_work);
return 0; return 0;
} }
@ -8118,6 +8186,7 @@ int kvm_arch_init(void *opaque)
printk(KERN_ERR "kvm: failed to allocate percpu kvm_user_return_msrs\n"); printk(KERN_ERR "kvm: failed to allocate percpu kvm_user_return_msrs\n");
goto out_free_x86_emulator_cache; goto out_free_x86_emulator_cache;
} }
kvm_nr_uret_msrs = 0;
r = kvm_mmu_module_init(); r = kvm_mmu_module_init();
if (r) if (r)
@ -8168,6 +8237,8 @@ void kvm_arch_exit(void)
cpuhp_remove_state_nocalls(CPUHP_AP_X86_KVM_CLK_ONLINE); cpuhp_remove_state_nocalls(CPUHP_AP_X86_KVM_CLK_ONLINE);
#ifdef CONFIG_X86_64 #ifdef CONFIG_X86_64
pvclock_gtod_unregister_notifier(&pvclock_gtod_notifier); pvclock_gtod_unregister_notifier(&pvclock_gtod_notifier);
irq_work_sync(&pvclock_irq_work);
cancel_work_sync(&pvclock_gtod_work);
#endif #endif
kvm_x86_ops.hardware_enable = NULL; kvm_x86_ops.hardware_enable = NULL;
kvm_mmu_module_exit(); kvm_mmu_module_exit();

View file

@ -3127,7 +3127,7 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
struct btrfs_inode *inode, u64 new_size, struct btrfs_inode *inode, u64 new_size,
u32 min_type); u32 min_type);
int btrfs_start_delalloc_snapshot(struct btrfs_root *root); int btrfs_start_delalloc_snapshot(struct btrfs_root *root, bool in_reclaim_context);
int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, long nr, int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, long nr,
bool in_reclaim_context); bool in_reclaim_context);
int btrfs_set_extent_delalloc(struct btrfs_inode *inode, u64 start, u64 end, int btrfs_set_extent_delalloc(struct btrfs_inode *inode, u64 start, u64 end,

View file

@ -1340,12 +1340,16 @@ int btrfs_discard_extent(struct btrfs_fs_info *fs_info, u64 bytenr,
stripe = bbio->stripes; stripe = bbio->stripes;
for (i = 0; i < bbio->num_stripes; i++, stripe++) { for (i = 0; i < bbio->num_stripes; i++, stripe++) {
u64 bytes; u64 bytes;
struct btrfs_device *device = stripe->dev;
if (!stripe->dev->bdev) { if (!device->bdev) {
ASSERT(btrfs_test_opt(fs_info, DEGRADED)); ASSERT(btrfs_test_opt(fs_info, DEGRADED));
continue; continue;
} }
if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state))
continue;
ret = do_discard_extent(stripe, &bytes); ret = do_discard_extent(stripe, &bytes);
if (!ret) { if (!ret) {
discarded_bytes += bytes; discarded_bytes += bytes;

View file

@ -2067,6 +2067,30 @@ static int start_ordered_ops(struct inode *inode, loff_t start, loff_t end)
return ret; return ret;
} }
static inline bool skip_inode_logging(const struct btrfs_log_ctx *ctx)
{
struct btrfs_inode *inode = BTRFS_I(ctx->inode);
struct btrfs_fs_info *fs_info = inode->root->fs_info;
if (btrfs_inode_in_log(inode, fs_info->generation) &&
list_empty(&ctx->ordered_extents))
return true;
/*
* If we are doing a fast fsync we cannot bail out if the inode's
* last_trans is <= the last committed transaction, because we only
* update the last_trans of the inode during ordered extent completion,
* and for a fast fsync we don't wait for that, we only wait for the
* writeback to complete.
*/
if (inode->last_trans <= fs_info->last_trans_committed &&
(test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags) ||
list_empty(&ctx->ordered_extents)))
return true;
return false;
}
/* /*
* fsync call for both files and directories. This logs the inode into * fsync call for both files and directories. This logs the inode into
* the tree log instead of forcing full commits whenever possible. * the tree log instead of forcing full commits whenever possible.
@ -2185,17 +2209,8 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
atomic_inc(&root->log_batch); atomic_inc(&root->log_batch);
/*
* If we are doing a fast fsync we can not bail out if the inode's
* last_trans is <= then the last committed transaction, because we only
* update the last_trans of the inode during ordered extent completion,
* and for a fast fsync we don't wait for that, we only wait for the
* writeback to complete.
*/
smp_mb(); smp_mb();
if (btrfs_inode_in_log(BTRFS_I(inode), fs_info->generation) || if (skip_inode_logging(&ctx)) {
(BTRFS_I(inode)->last_trans <= fs_info->last_trans_committed &&
(full_sync || list_empty(&ctx.ordered_extents)))) {
/* /*
* We've had everything committed since the last time we were * We've had everything committed since the last time we were
* modified so clear this flag in case it was set for whatever * modified so clear this flag in case it was set for whatever

View file

@ -3949,7 +3949,7 @@ static int cleanup_free_space_cache_v1(struct btrfs_fs_info *fs_info,
{ {
struct btrfs_block_group *block_group; struct btrfs_block_group *block_group;
struct rb_node *node; struct rb_node *node;
int ret; int ret = 0;
btrfs_info(fs_info, "cleaning free space cache v1"); btrfs_info(fs_info, "cleaning free space cache v1");

View file

@ -9678,7 +9678,7 @@ out:
return ret; return ret;
} }
int btrfs_start_delalloc_snapshot(struct btrfs_root *root) int btrfs_start_delalloc_snapshot(struct btrfs_root *root, bool in_reclaim_context)
{ {
struct writeback_control wbc = { struct writeback_control wbc = {
.nr_to_write = LONG_MAX, .nr_to_write = LONG_MAX,
@ -9691,7 +9691,7 @@ int btrfs_start_delalloc_snapshot(struct btrfs_root *root)
if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state))
return -EROFS; return -EROFS;
return start_delalloc_inodes(root, &wbc, true, false); return start_delalloc_inodes(root, &wbc, true, in_reclaim_context);
} }
int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, long nr, int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, long nr,

View file

@ -259,6 +259,8 @@ int btrfs_fileattr_set(struct user_namespace *mnt_userns,
if (!fa->flags_valid) { if (!fa->flags_valid) {
/* 1 item for the inode */ /* 1 item for the inode */
trans = btrfs_start_transaction(root, 1); trans = btrfs_start_transaction(root, 1);
if (IS_ERR(trans))
return PTR_ERR(trans);
goto update_flags; goto update_flags;
} }
@ -907,7 +909,7 @@ static noinline int btrfs_mksnapshot(const struct path *parent,
*/ */
btrfs_drew_read_lock(&root->snapshot_lock); btrfs_drew_read_lock(&root->snapshot_lock);
ret = btrfs_start_delalloc_snapshot(root); ret = btrfs_start_delalloc_snapshot(root, false);
if (ret) if (ret)
goto out; goto out;

View file

@ -984,7 +984,7 @@ int btrfs_split_ordered_extent(struct btrfs_ordered_extent *ordered, u64 pre,
if (pre) if (pre)
ret = clone_ordered_extent(ordered, 0, pre); ret = clone_ordered_extent(ordered, 0, pre);
if (post) if (ret == 0 && post)
ret = clone_ordered_extent(ordered, pre + ordered->disk_num_bytes, ret = clone_ordered_extent(ordered, pre + ordered->disk_num_bytes,
post); post);

View file

@ -3545,11 +3545,15 @@ static int try_flush_qgroup(struct btrfs_root *root)
struct btrfs_trans_handle *trans; struct btrfs_trans_handle *trans;
int ret; int ret;
/* Can't hold an open transaction or we run the risk of deadlocking */
ASSERT(current->journal_info == NULL ||
       current->journal_info == BTRFS_SEND_TRANS_STUB);
if (WARN_ON(current->journal_info &&
	    current->journal_info != BTRFS_SEND_TRANS_STUB))
	return 0;
/*
 * Can't hold an open transaction or we run the risk of deadlocking,
 * and can't either be under the context of a send operation (where
 * current->journal_info is set to BTRFS_SEND_TRANS_STUB), as that
 * would result in a crash when starting a transaction and does not
 * make sense either (send is a read-only operation).
 */
ASSERT(current->journal_info == NULL);
if (WARN_ON(current->journal_info))
	return 0;
/* /*
@ -3562,7 +3566,7 @@ static int try_flush_qgroup(struct btrfs_root *root)
return 0; return 0;
} }
ret = btrfs_start_delalloc_snapshot(root); ret = btrfs_start_delalloc_snapshot(root, true);
if (ret < 0) if (ret < 0)
goto out; goto out;
btrfs_wait_ordered_extents(root, U64_MAX, 0, (u64)-1); btrfs_wait_ordered_extents(root, U64_MAX, 0, (u64)-1);

View file

@ -7170,7 +7170,7 @@ static int flush_delalloc_roots(struct send_ctx *sctx)
int i; int i;
if (root) { if (root) {
ret = btrfs_start_delalloc_snapshot(root); ret = btrfs_start_delalloc_snapshot(root, false);
if (ret) if (ret)
return ret; return ret;
btrfs_wait_ordered_extents(root, U64_MAX, 0, U64_MAX); btrfs_wait_ordered_extents(root, U64_MAX, 0, U64_MAX);
@ -7178,7 +7178,7 @@ static int flush_delalloc_roots(struct send_ctx *sctx)
for (i = 0; i < sctx->clone_roots_cnt; i++) { for (i = 0; i < sctx->clone_roots_cnt; i++) {
root = sctx->clone_roots[i].root; root = sctx->clone_roots[i].root;
ret = btrfs_start_delalloc_snapshot(root); ret = btrfs_start_delalloc_snapshot(root, false);
if (ret) if (ret)
return ret; return ret;
btrfs_wait_ordered_extents(root, U64_MAX, 0, U64_MAX); btrfs_wait_ordered_extents(root, U64_MAX, 0, U64_MAX);

View file

@ -6061,7 +6061,8 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
* (since logging them is pointless, a link count of 0 means they * (since logging them is pointless, a link count of 0 means they
* will never be accessible). * will never be accessible).
*/ */
if (btrfs_inode_in_log(inode, trans->transid) || if ((btrfs_inode_in_log(inode, trans->transid) &&
list_empty(&ctx->ordered_extents)) ||
inode->vfs_inode.i_nlink == 0) { inode->vfs_inode.i_nlink == 0) {
ret = BTRFS_NO_LOG_SYNC; ret = BTRFS_NO_LOG_SYNC;
goto end_no_trans; goto end_no_trans;

View file

@ -1126,6 +1126,11 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new)
goto out; goto out;
} }
if (zone.type == BLK_ZONE_TYPE_CONVENTIONAL) {
ret = -EIO;
goto out;
}
switch (zone.cond) { switch (zone.cond) {
case BLK_ZONE_COND_OFFLINE: case BLK_ZONE_COND_OFFLINE:
case BLK_ZONE_COND_READONLY: case BLK_ZONE_COND_READONLY:

View file

@ -2,6 +2,7 @@
#ifndef _ASM_POWERPC_ERRNO_H #ifndef _ASM_POWERPC_ERRNO_H
#define _ASM_POWERPC_ERRNO_H #define _ASM_POWERPC_ERRNO_H
#undef EDEADLOCK
#include <asm-generic/errno.h> #include <asm-generic/errno.h>
#undef EDEADLOCK #undef EDEADLOCK

View file

@ -84,7 +84,7 @@
/* CPU types for specific tunings: */ /* CPU types for specific tunings: */
#define X86_FEATURE_K8 ( 3*32+ 4) /* "" Opteron, Athlon64 */ #define X86_FEATURE_K8 ( 3*32+ 4) /* "" Opteron, Athlon64 */
#define X86_FEATURE_K7 ( 3*32+ 5) /* "" Athlon */ /* FREE, was #define X86_FEATURE_K7 ( 3*32+ 5) "" Athlon */
#define X86_FEATURE_P3 ( 3*32+ 6) /* "" P3 */ #define X86_FEATURE_P3 ( 3*32+ 6) /* "" P3 */
#define X86_FEATURE_P4 ( 3*32+ 7) /* "" P4 */ #define X86_FEATURE_P4 ( 3*32+ 7) /* "" P4 */
#define X86_FEATURE_CONSTANT_TSC ( 3*32+ 8) /* TSC ticks at a constant rate */ #define X86_FEATURE_CONSTANT_TSC ( 3*32+ 8) /* TSC ticks at a constant rate */
@ -236,6 +236,8 @@
#define X86_FEATURE_EPT_AD ( 8*32+17) /* Intel Extended Page Table access-dirty bit */ #define X86_FEATURE_EPT_AD ( 8*32+17) /* Intel Extended Page Table access-dirty bit */
#define X86_FEATURE_VMCALL ( 8*32+18) /* "" Hypervisor supports the VMCALL instruction */ #define X86_FEATURE_VMCALL ( 8*32+18) /* "" Hypervisor supports the VMCALL instruction */
#define X86_FEATURE_VMW_VMMCALL ( 8*32+19) /* "" VMware prefers VMMCALL hypercall instruction */ #define X86_FEATURE_VMW_VMMCALL ( 8*32+19) /* "" VMware prefers VMMCALL hypercall instruction */
#define X86_FEATURE_PVUNLOCK ( 8*32+20) /* "" PV unlock function */
#define X86_FEATURE_VCPUPREEMPT ( 8*32+21) /* "" PV vcpu_is_preempted function */
/* Intel-defined CPU features, CPUID level 0x00000007:0 (EBX), word 9 */ /* Intel-defined CPU features, CPUID level 0x00000007:0 (EBX), word 9 */
#define X86_FEATURE_FSGSBASE ( 9*32+ 0) /* RDFSBASE, WRFSBASE, RDGSBASE, WRGSBASE instructions*/ #define X86_FEATURE_FSGSBASE ( 9*32+ 0) /* RDFSBASE, WRFSBASE, RDGSBASE, WRGSBASE instructions*/
@ -290,6 +292,8 @@
#define X86_FEATURE_FENCE_SWAPGS_KERNEL (11*32+ 5) /* "" LFENCE in kernel entry SWAPGS path */ #define X86_FEATURE_FENCE_SWAPGS_KERNEL (11*32+ 5) /* "" LFENCE in kernel entry SWAPGS path */
#define X86_FEATURE_SPLIT_LOCK_DETECT (11*32+ 6) /* #AC for split lock */ #define X86_FEATURE_SPLIT_LOCK_DETECT (11*32+ 6) /* #AC for split lock */
#define X86_FEATURE_PER_THREAD_MBA (11*32+ 7) /* "" Per-thread Memory Bandwidth Allocation */ #define X86_FEATURE_PER_THREAD_MBA (11*32+ 7) /* "" Per-thread Memory Bandwidth Allocation */
#define X86_FEATURE_SGX1 (11*32+ 8) /* "" Basic SGX */
#define X86_FEATURE_SGX2 (11*32+ 9) /* "" SGX Enclave Dynamic Memory Management (EDMM) */
/* Intel-defined CPU features, CPUID level 0x00000007:1 (EAX), word 12 */ /* Intel-defined CPU features, CPUID level 0x00000007:1 (EAX), word 12 */
#define X86_FEATURE_AVX_VNNI (12*32+ 4) /* AVX VNNI instructions */ #define X86_FEATURE_AVX_VNNI (12*32+ 4) /* AVX VNNI instructions */
@ -336,6 +340,7 @@
#define X86_FEATURE_AVIC (15*32+13) /* Virtual Interrupt Controller */ #define X86_FEATURE_AVIC (15*32+13) /* Virtual Interrupt Controller */
#define X86_FEATURE_V_VMSAVE_VMLOAD (15*32+15) /* Virtual VMSAVE VMLOAD */ #define X86_FEATURE_V_VMSAVE_VMLOAD (15*32+15) /* Virtual VMSAVE VMLOAD */
#define X86_FEATURE_VGIF (15*32+16) /* Virtual GIF */ #define X86_FEATURE_VGIF (15*32+16) /* Virtual GIF */
#define X86_FEATURE_V_SPEC_CTRL (15*32+20) /* Virtual SPEC_CTRL */
#define X86_FEATURE_SVME_ADDR_CHK (15*32+28) /* "" SVME addr check */ #define X86_FEATURE_SVME_ADDR_CHK (15*32+28) /* "" SVME addr check */
/* Intel-defined CPU features, CPUID level 0x00000007:0 (ECX), word 16 */ /* Intel-defined CPU features, CPUID level 0x00000007:0 (ECX), word 16 */
@ -354,6 +359,7 @@
#define X86_FEATURE_AVX512_VPOPCNTDQ (16*32+14) /* POPCNT for vectors of DW/QW */ #define X86_FEATURE_AVX512_VPOPCNTDQ (16*32+14) /* POPCNT for vectors of DW/QW */
#define X86_FEATURE_LA57 (16*32+16) /* 5-level page tables */ #define X86_FEATURE_LA57 (16*32+16) /* 5-level page tables */
#define X86_FEATURE_RDPID (16*32+22) /* RDPID instruction */ #define X86_FEATURE_RDPID (16*32+22) /* RDPID instruction */
#define X86_FEATURE_BUS_LOCK_DETECT (16*32+24) /* Bus Lock detect */
#define X86_FEATURE_CLDEMOTE (16*32+25) /* CLDEMOTE instruction */ #define X86_FEATURE_CLDEMOTE (16*32+25) /* CLDEMOTE instruction */
#define X86_FEATURE_MOVDIRI (16*32+27) /* MOVDIRI instruction */ #define X86_FEATURE_MOVDIRI (16*32+27) /* MOVDIRI instruction */
#define X86_FEATURE_MOVDIR64B (16*32+28) /* MOVDIR64B instruction */ #define X86_FEATURE_MOVDIR64B (16*32+28) /* MOVDIR64B instruction */
@ -374,6 +380,7 @@
#define X86_FEATURE_MD_CLEAR (18*32+10) /* VERW clears CPU buffers */ #define X86_FEATURE_MD_CLEAR (18*32+10) /* VERW clears CPU buffers */
#define X86_FEATURE_TSX_FORCE_ABORT (18*32+13) /* "" TSX_FORCE_ABORT */ #define X86_FEATURE_TSX_FORCE_ABORT (18*32+13) /* "" TSX_FORCE_ABORT */
#define X86_FEATURE_SERIALIZE (18*32+14) /* SERIALIZE instruction */ #define X86_FEATURE_SERIALIZE (18*32+14) /* SERIALIZE instruction */
#define X86_FEATURE_HYBRID_CPU (18*32+15) /* "" This part has CPUs of more than one type */
#define X86_FEATURE_TSXLDTRK (18*32+16) /* TSX Suspend Load Address Tracking */ #define X86_FEATURE_TSXLDTRK (18*32+16) /* TSX Suspend Load Address Tracking */
#define X86_FEATURE_PCONFIG (18*32+18) /* Intel PCONFIG */ #define X86_FEATURE_PCONFIG (18*32+18) /* Intel PCONFIG */
#define X86_FEATURE_ARCH_LBR (18*32+19) /* Intel ARCH LBR */ #define X86_FEATURE_ARCH_LBR (18*32+19) /* Intel ARCH LBR */

View file

@ -185,6 +185,9 @@
#define MSR_PEBS_DATA_CFG 0x000003f2 #define MSR_PEBS_DATA_CFG 0x000003f2
#define MSR_IA32_DS_AREA 0x00000600 #define MSR_IA32_DS_AREA 0x00000600
#define MSR_IA32_PERF_CAPABILITIES 0x00000345 #define MSR_IA32_PERF_CAPABILITIES 0x00000345
#define PERF_CAP_METRICS_IDX 15
#define PERF_CAP_PT_IDX 16
#define MSR_PEBS_LD_LAT_THRESHOLD 0x000003f6 #define MSR_PEBS_LD_LAT_THRESHOLD 0x000003f6
#define MSR_IA32_RTIT_CTL 0x00000570 #define MSR_IA32_RTIT_CTL 0x00000570
@ -265,6 +268,7 @@
#define DEBUGCTLMSR_LBR (1UL << 0) /* last branch recording */ #define DEBUGCTLMSR_LBR (1UL << 0) /* last branch recording */
#define DEBUGCTLMSR_BTF_SHIFT 1 #define DEBUGCTLMSR_BTF_SHIFT 1
#define DEBUGCTLMSR_BTF (1UL << 1) /* single-step on branches */ #define DEBUGCTLMSR_BTF (1UL << 1) /* single-step on branches */
#define DEBUGCTLMSR_BUS_LOCK_DETECT (1UL << 2)
#define DEBUGCTLMSR_TR (1UL << 6) #define DEBUGCTLMSR_TR (1UL << 6)
#define DEBUGCTLMSR_BTS (1UL << 7) #define DEBUGCTLMSR_BTS (1UL << 7)
#define DEBUGCTLMSR_BTINT (1UL << 8) #define DEBUGCTLMSR_BTINT (1UL << 8)

View file

@ -27,6 +27,7 @@
#define VMX_EXIT_REASONS_FAILED_VMENTRY 0x80000000 #define VMX_EXIT_REASONS_FAILED_VMENTRY 0x80000000
#define VMX_EXIT_REASONS_SGX_ENCLAVE_MODE 0x08000000
#define EXIT_REASON_EXCEPTION_NMI 0 #define EXIT_REASON_EXCEPTION_NMI 0
#define EXIT_REASON_EXTERNAL_INTERRUPT 1 #define EXIT_REASON_EXTERNAL_INTERRUPT 1

View file

@ -4,7 +4,7 @@
#include <linux/linkage.h> #include <linux/linkage.h>
#include <asm/errno.h> #include <asm/errno.h>
#include <asm/cpufeatures.h> #include <asm/cpufeatures.h>
#include <asm/alternative-asm.h> #include <asm/alternative.h>
#include <asm/export.h> #include <asm/export.h>
.pushsection .noinstr.text, "ax" .pushsection .noinstr.text, "ax"

View file

@ -3,7 +3,7 @@
#include <linux/linkage.h> #include <linux/linkage.h>
#include <asm/cpufeatures.h> #include <asm/cpufeatures.h>
#include <asm/alternative-asm.h> #include <asm/alternative.h>
#include <asm/export.h> #include <asm/export.h>
/* /*

View file

@ -863,9 +863,18 @@ __SYSCALL(__NR_process_madvise, sys_process_madvise)
__SC_COMP(__NR_epoll_pwait2, sys_epoll_pwait2, compat_sys_epoll_pwait2) __SC_COMP(__NR_epoll_pwait2, sys_epoll_pwait2, compat_sys_epoll_pwait2)
#define __NR_mount_setattr 442 #define __NR_mount_setattr 442
__SYSCALL(__NR_mount_setattr, sys_mount_setattr) __SYSCALL(__NR_mount_setattr, sys_mount_setattr)
#define __NR_quotactl_path 443
__SYSCALL(__NR_quotactl_path, sys_quotactl_path)
#define __NR_landlock_create_ruleset 444
__SYSCALL(__NR_landlock_create_ruleset, sys_landlock_create_ruleset)
#define __NR_landlock_add_rule 445
__SYSCALL(__NR_landlock_add_rule, sys_landlock_add_rule)
#define __NR_landlock_restrict_self 446
__SYSCALL(__NR_landlock_restrict_self, sys_landlock_restrict_self)
#undef __NR_syscalls #undef __NR_syscalls
#define __NR_syscalls 443 #define __NR_syscalls 447
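A hedged userspace sketch of the three new Landlock syscalls being wired up here, assuming <linux/landlock.h> and the new __NR_* numbers come from a matching kernel; the access-right choice is arbitrary and most error handling is omitted.

    #include <linux/landlock.h>
    #include <sys/prctl.h>
    #include <sys/syscall.h>
    #include <unistd.h>

    static int restrict_to_read_only_files(void)
    {
    	struct landlock_ruleset_attr attr = {
    		.handled_access_fs = LANDLOCK_ACCESS_FS_WRITE_FILE,
    	};
    	int ruleset_fd;

    	ruleset_fd = syscall(__NR_landlock_create_ruleset, &attr, sizeof(attr), 0);
    	if (ruleset_fd < 0)
    		return -1;

    	/* Landlock requires no_new_privs before self-restriction. */
    	if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) ||
    	    syscall(__NR_landlock_restrict_self, ruleset_fd, 0))
    		return -1;

    	return 0;
    }

Rules would normally be added with __NR_landlock_add_rule between creating the ruleset and self-restricting; this sketch leaves the ruleset empty for brevity.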
/* /*
* 32 bit systems traditionally used different * 32 bit systems traditionally used different

View file

@ -625,30 +625,147 @@ struct drm_gem_open {
__u64 size; __u64 size;
}; };
/**
* DRM_CAP_DUMB_BUFFER
*
* If set to 1, the driver supports creating dumb buffers via the
* &DRM_IOCTL_MODE_CREATE_DUMB ioctl.
*/
#define DRM_CAP_DUMB_BUFFER 0x1 #define DRM_CAP_DUMB_BUFFER 0x1
/**
* DRM_CAP_VBLANK_HIGH_CRTC
*
* If set to 1, the kernel supports specifying a CRTC index in the high bits of
* &drm_wait_vblank_request.type.
*
* Starting kernel version 2.6.39, this capability is always set to 1.
*/
#define DRM_CAP_VBLANK_HIGH_CRTC 0x2 #define DRM_CAP_VBLANK_HIGH_CRTC 0x2
/**
* DRM_CAP_DUMB_PREFERRED_DEPTH
*
* The preferred bit depth for dumb buffers.
*
* The bit depth is the number of bits used to indicate the color of a single
* pixel excluding any padding. This is different from the number of bits per
* pixel. For instance, XRGB8888 has a bit depth of 24 but has 32 bits per
* pixel.
*
* Note that this preference only applies to dumb buffers, it's irrelevant for
* other types of buffers.
*/
#define DRM_CAP_DUMB_PREFERRED_DEPTH 0x3 #define DRM_CAP_DUMB_PREFERRED_DEPTH 0x3
/**
* DRM_CAP_DUMB_PREFER_SHADOW
*
* If set to 1, the driver prefers userspace to render to a shadow buffer
* instead of directly rendering to a dumb buffer. For best speed, userspace
* should do streaming ordered memory copies into the dumb buffer and never
* read from it.
*
* Note that this preference only applies to dumb buffers, it's irrelevant for
* other types of buffers.
*/
#define DRM_CAP_DUMB_PREFER_SHADOW 0x4 #define DRM_CAP_DUMB_PREFER_SHADOW 0x4
/**
* DRM_CAP_PRIME
*
* Bitfield of supported PRIME sharing capabilities. See &DRM_PRIME_CAP_IMPORT
* and &DRM_PRIME_CAP_EXPORT.
*
* PRIME buffers are exposed as dma-buf file descriptors. See
* Documentation/gpu/drm-mm.rst, section "PRIME Buffer Sharing".
*/
#define DRM_CAP_PRIME 0x5 #define DRM_CAP_PRIME 0x5
/**
* DRM_PRIME_CAP_IMPORT
*
* If this bit is set in &DRM_CAP_PRIME, the driver supports importing PRIME
* buffers via the &DRM_IOCTL_PRIME_FD_TO_HANDLE ioctl.
*/
#define DRM_PRIME_CAP_IMPORT 0x1 #define DRM_PRIME_CAP_IMPORT 0x1
/**
* DRM_PRIME_CAP_EXPORT
*
* If this bit is set in &DRM_CAP_PRIME, the driver supports exporting PRIME
* buffers via the &DRM_IOCTL_PRIME_HANDLE_TO_FD ioctl.
*/
#define DRM_PRIME_CAP_EXPORT 0x2 #define DRM_PRIME_CAP_EXPORT 0x2
/**
* DRM_CAP_TIMESTAMP_MONOTONIC
*
* If set to 0, the kernel will report timestamps with ``CLOCK_REALTIME`` in
* struct drm_event_vblank. If set to 1, the kernel will report timestamps with
* ``CLOCK_MONOTONIC``. See ``clock_gettime(2)`` for the definition of these
* clocks.
*
* Starting from kernel version 2.6.39, the default value for this capability
* is 1. Starting kernel version 4.15, this capability is always set to 1.
*/
#define DRM_CAP_TIMESTAMP_MONOTONIC 0x6 #define DRM_CAP_TIMESTAMP_MONOTONIC 0x6
/**
* DRM_CAP_ASYNC_PAGE_FLIP
*
* If set to 1, the driver supports &DRM_MODE_PAGE_FLIP_ASYNC.
*/
#define DRM_CAP_ASYNC_PAGE_FLIP 0x7 #define DRM_CAP_ASYNC_PAGE_FLIP 0x7
/*
 * The CURSOR_WIDTH and CURSOR_HEIGHT capabilities return a valid widthxheight
 * combination for the hardware cursor. The intention is that a hardware
 * agnostic userspace can query a cursor plane size to use.
 *
 * Note that the cross-driver contract is to merely return a valid size;
 * drivers are free to attach another meaning on top, eg. i915 returns the
 * maximum plane size.
 */
/**
 * DRM_CAP_CURSOR_WIDTH
 *
 * The ``CURSOR_WIDTH`` and ``CURSOR_HEIGHT`` capabilities return a valid
 * width x height combination for the hardware cursor. The intention is that a
 * hardware agnostic userspace can query a cursor plane size to use.
 *
 * Note that the cross-driver contract is to merely return a valid size;
 * drivers are free to attach another meaning on top, eg. i915 returns the
 * maximum plane size.
 */
#define DRM_CAP_CURSOR_WIDTH 0x8 #define DRM_CAP_CURSOR_WIDTH 0x8
/**
* DRM_CAP_CURSOR_HEIGHT
*
* See &DRM_CAP_CURSOR_WIDTH.
*/
#define DRM_CAP_CURSOR_HEIGHT 0x9 #define DRM_CAP_CURSOR_HEIGHT 0x9
/**
* DRM_CAP_ADDFB2_MODIFIERS
*
* If set to 1, the driver supports supplying modifiers in the
* &DRM_IOCTL_MODE_ADDFB2 ioctl.
*/
#define DRM_CAP_ADDFB2_MODIFIERS 0x10 #define DRM_CAP_ADDFB2_MODIFIERS 0x10
/**
* DRM_CAP_PAGE_FLIP_TARGET
*
* If set to 1, the driver supports the &DRM_MODE_PAGE_FLIP_TARGET_ABSOLUTE and
* &DRM_MODE_PAGE_FLIP_TARGET_RELATIVE flags in
* &drm_mode_crtc_page_flip_target.flags for the &DRM_IOCTL_MODE_PAGE_FLIP
* ioctl.
*/
#define DRM_CAP_PAGE_FLIP_TARGET 0x11 #define DRM_CAP_PAGE_FLIP_TARGET 0x11
/**
* DRM_CAP_CRTC_IN_VBLANK_EVENT
*
* If set to 1, the kernel supports reporting the CRTC ID in
* &drm_event_vblank.crtc_id for the &DRM_EVENT_VBLANK and
* &DRM_EVENT_FLIP_COMPLETE events.
*
* Starting kernel version 4.12, this capability is always set to 1.
*/
#define DRM_CAP_CRTC_IN_VBLANK_EVENT 0x12 #define DRM_CAP_CRTC_IN_VBLANK_EVENT 0x12
/**
* DRM_CAP_SYNCOBJ
*
* If set to 1, the driver supports sync objects. See
* Documentation/gpu/drm-mm.rst, section "DRM Sync Objects".
*/
#define DRM_CAP_SYNCOBJ 0x13 #define DRM_CAP_SYNCOBJ 0x13
/**
* DRM_CAP_SYNCOBJ_TIMELINE
*
* If set to 1, the driver supports timeline operations on sync objects. See
* Documentation/gpu/drm-mm.rst, section "DRM Sync Objects".
*/
#define DRM_CAP_SYNCOBJ_TIMELINE 0x14 #define DRM_CAP_SYNCOBJ_TIMELINE 0x14
/* DRM_IOCTL_GET_CAP ioctl argument type */ /* DRM_IOCTL_GET_CAP ioctl argument type */
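A small userspace sketch of how the capabilities documented above are queried (illustrative only; the device path and the assumption that the UAPI header is installed as <drm/drm.h> are environment-dependent).

    #include <fcntl.h>
    #include <stdio.h>
    #include <sys/ioctl.h>
    #include <unistd.h>
    #include <drm/drm.h>

    int main(void)
    {
    	struct drm_get_cap cap = { .capability = DRM_CAP_PRIME };
    	int fd = open("/dev/dri/card0", O_RDWR);

    	if (fd < 0 || ioctl(fd, DRM_IOCTL_GET_CAP, &cap)) {
    		perror("drm cap query");
    		return 1;
    	}

    	printf("PRIME import: %s, export: %s\n",
    	       (cap.value & DRM_PRIME_CAP_IMPORT) ? "yes" : "no",
    	       (cap.value & DRM_PRIME_CAP_EXPORT) ? "yes" : "no");
    	close(fd);
    	return 0;
    }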

View file

@ -943,6 +943,7 @@ struct drm_i915_gem_exec_object {
__u64 offset; __u64 offset;
}; };
/* DRM_IOCTL_I915_GEM_EXECBUFFER was removed in Linux 5.13 */
struct drm_i915_gem_execbuffer { struct drm_i915_gem_execbuffer {
/** /**
* List of buffers to be validated with their relocations to be * List of buffers to be validated with their relocations to be

View file

@ -1078,6 +1078,10 @@ struct kvm_ppc_resize_hpt {
#define KVM_CAP_DIRTY_LOG_RING 192 #define KVM_CAP_DIRTY_LOG_RING 192
#define KVM_CAP_X86_BUS_LOCK_EXIT 193 #define KVM_CAP_X86_BUS_LOCK_EXIT 193
#define KVM_CAP_PPC_DAWR1 194 #define KVM_CAP_PPC_DAWR1 194
#define KVM_CAP_SET_GUEST_DEBUG2 195
#define KVM_CAP_SGX_ATTRIBUTE 196
#define KVM_CAP_VM_COPY_ENC_CONTEXT_FROM 197
#define KVM_CAP_PTP_KVM 198
#ifdef KVM_CAP_IRQ_ROUTING #ifdef KVM_CAP_IRQ_ROUTING
@ -1671,6 +1675,8 @@ enum sev_cmd_id {
KVM_SEV_CERT_EXPORT, KVM_SEV_CERT_EXPORT,
/* Attestation report */ /* Attestation report */
KVM_SEV_GET_ATTESTATION_REPORT, KVM_SEV_GET_ATTESTATION_REPORT,
/* Guest Migration Extension */
KVM_SEV_SEND_CANCEL,
KVM_SEV_NR_MAX, KVM_SEV_NR_MAX,
}; };
@ -1729,6 +1735,45 @@ struct kvm_sev_attestation_report {
__u32 len; __u32 len;
}; };
struct kvm_sev_send_start {
__u32 policy;
__u64 pdh_cert_uaddr;
__u32 pdh_cert_len;
__u64 plat_certs_uaddr;
__u32 plat_certs_len;
__u64 amd_certs_uaddr;
__u32 amd_certs_len;
__u64 session_uaddr;
__u32 session_len;
};
struct kvm_sev_send_update_data {
__u64 hdr_uaddr;
__u32 hdr_len;
__u64 guest_uaddr;
__u32 guest_len;
__u64 trans_uaddr;
__u32 trans_len;
};
struct kvm_sev_receive_start {
__u32 handle;
__u32 policy;
__u64 pdh_uaddr;
__u32 pdh_len;
__u64 session_uaddr;
__u32 session_len;
};
struct kvm_sev_receive_update_data {
__u64 hdr_uaddr;
__u32 hdr_len;
__u64 guest_uaddr;
__u32 guest_len;
__u64 trans_uaddr;
__u32 trans_len;
};
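These structures are passed through the existing KVM_MEMORY_ENCRYPT_OP ioctl. A hedged userspace sketch for the send-start step, assuming vm_fd and sev_fd were opened elsewhere; buffer allocation, sizing and error handling are deliberately elided, and the helper name is a placeholder.

    #include <linux/kvm.h>
    #include <sys/ioctl.h>

    static int sev_send_start(int vm_fd, int sev_fd, struct kvm_sev_send_start *start)
    {
    	struct kvm_sev_cmd cmd = {
    		.id     = KVM_SEV_SEND_START,
    		.data   = (__u64)(unsigned long)start,
    		.sev_fd = sev_fd,
    	};

    	/* The user buffers referenced by *start are filled in on success. */
    	return ioctl(vm_fd, KVM_MEMORY_ENCRYPT_OP, &cmd);
    }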
#define KVM_DEV_ASSIGN_ENABLE_IOMMU (1 << 0) #define KVM_DEV_ASSIGN_ENABLE_IOMMU (1 << 0)
#define KVM_DEV_ASSIGN_PCI_2_3 (1 << 1) #define KVM_DEV_ASSIGN_PCI_2_3 (1 << 1)
#define KVM_DEV_ASSIGN_MASK_INTX (1 << 2) #define KVM_DEV_ASSIGN_MASK_INTX (1 << 2)

View file

@ -127,6 +127,7 @@ enum perf_sw_ids {
PERF_COUNT_SW_EMULATION_FAULTS = 8, PERF_COUNT_SW_EMULATION_FAULTS = 8,
PERF_COUNT_SW_DUMMY = 9, PERF_COUNT_SW_DUMMY = 9,
PERF_COUNT_SW_BPF_OUTPUT = 10, PERF_COUNT_SW_BPF_OUTPUT = 10,
PERF_COUNT_SW_CGROUP_SWITCHES = 11,
PERF_COUNT_SW_MAX, /* non-ABI */ PERF_COUNT_SW_MAX, /* non-ABI */
}; };
@ -326,6 +327,7 @@ enum perf_event_read_format {
#define PERF_ATTR_SIZE_VER4 104 /* add: sample_regs_intr */ #define PERF_ATTR_SIZE_VER4 104 /* add: sample_regs_intr */
#define PERF_ATTR_SIZE_VER5 112 /* add: aux_watermark */ #define PERF_ATTR_SIZE_VER5 112 /* add: aux_watermark */
#define PERF_ATTR_SIZE_VER6 120 /* add: aux_sample_size */ #define PERF_ATTR_SIZE_VER6 120 /* add: aux_sample_size */
#define PERF_ATTR_SIZE_VER7 128 /* add: sig_data */
/* /*
* Hardware event_id to monitor via a performance monitoring event: * Hardware event_id to monitor via a performance monitoring event:
@ -404,7 +406,10 @@ struct perf_event_attr {
cgroup : 1, /* include cgroup events */ cgroup : 1, /* include cgroup events */
text_poke : 1, /* include text poke events */ text_poke : 1, /* include text poke events */
build_id : 1, /* use build id in mmap2 events */ build_id : 1, /* use build id in mmap2 events */
__reserved_1 : 29; inherit_thread : 1, /* children only inherit if cloned with CLONE_THREAD */
remove_on_exec : 1, /* event is removed from task on exec */
sigtrap : 1, /* send synchronous SIGTRAP on event */
__reserved_1 : 26;
union { union {
__u32 wakeup_events; /* wakeup every n events */ __u32 wakeup_events; /* wakeup every n events */
@ -456,6 +461,12 @@ struct perf_event_attr {
__u16 __reserved_2; __u16 __reserved_2;
__u32 aux_sample_size; __u32 aux_sample_size;
__u32 __reserved_3; __u32 __reserved_3;
/*
* User provided data if sigtrap=1, passed back to user via
* siginfo_t::si_perf, e.g. to permit user to identify the event.
*/
__u64 sig_data;
}; };
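A hedged sketch of a consumer of the new bits: a self-monitoring event that raises SIGTRAP on overflow and hands sig_data back via siginfo (si_perf, per the comment above). The sample period is arbitrary, and pairing sigtrap with remove_on_exec is an assumption based on the constraint added in the same series; see perf_event_open(2) for the authoritative rules.

    #include <linux/perf_event.h>
    #include <string.h>
    #include <sys/syscall.h>
    #include <unistd.h>

    static int open_sigtrap_counter(void)
    {
    	struct perf_event_attr attr;

    	memset(&attr, 0, sizeof(attr));
    	attr.size = sizeof(attr);		/* PERF_ATTR_SIZE_VER7 (128) or later */
    	attr.type = PERF_TYPE_HARDWARE;
    	attr.config = PERF_COUNT_HW_INSTRUCTIONS;
    	attr.sample_period = 1000000;
    	attr.sigtrap = 1;			/* synchronous SIGTRAP on overflow */
    	attr.remove_on_exec = 1;		/* assumed prerequisite for sigtrap */
    	attr.sig_data = 0xdeadbeef;		/* surfaces as si_perf in the handler */

    	/* pid = 0, cpu = -1: monitor the calling thread on any CPU. */
    	return syscall(SYS_perf_event_open, &attr, 0, -1, -1, 0);
    }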
/* /*
@ -1171,10 +1182,15 @@ enum perf_callchain_context {
/** /**
* PERF_RECORD_AUX::flags bits * PERF_RECORD_AUX::flags bits
*/ */
#define PERF_AUX_FLAG_TRUNCATED 0x01 /* record was truncated to fit */ #define PERF_AUX_FLAG_TRUNCATED 0x01 /* record was truncated to fit */
#define PERF_AUX_FLAG_OVERWRITE 0x02 /* snapshot from overwrite mode */ #define PERF_AUX_FLAG_OVERWRITE 0x02 /* snapshot from overwrite mode */
#define PERF_AUX_FLAG_PARTIAL 0x04 /* record contains gaps */ #define PERF_AUX_FLAG_PARTIAL 0x04 /* record contains gaps */
#define PERF_AUX_FLAG_COLLISION 0x08 /* sample collided with another */ #define PERF_AUX_FLAG_COLLISION 0x08 /* sample collided with another */
#define PERF_AUX_FLAG_PMU_FORMAT_TYPE_MASK 0xff00 /* PMU specific trace format type */
/* CoreSight PMU AUX buffer formats */
#define PERF_AUX_FLAG_CORESIGHT_FORMAT_CORESIGHT 0x0000 /* Default for backward compatibility */
#define PERF_AUX_FLAG_CORESIGHT_FORMAT_RAW 0x0100 /* Raw format of the source */
#define PERF_FLAG_FD_NO_GROUP (1UL << 0) #define PERF_FLAG_FD_NO_GROUP (1UL << 0)
#define PERF_FLAG_FD_OUTPUT (1UL << 1) #define PERF_FLAG_FD_OUTPUT (1UL << 1)

View file

@ -255,4 +255,8 @@ struct prctl_mm_map {
# define SYSCALL_DISPATCH_FILTER_ALLOW 0 # define SYSCALL_DISPATCH_FILTER_ALLOW 0
# define SYSCALL_DISPATCH_FILTER_BLOCK 1 # define SYSCALL_DISPATCH_FILTER_BLOCK 1
/* Set/get enabled arm64 pointer authentication keys */
#define PR_PAC_SET_ENABLED_KEYS 60
#define PR_PAC_GET_ENABLED_KEYS 61
#endif /* _LINUX_PRCTL_H */ #endif /* _LINUX_PRCTL_H */
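A minimal sketch of the new prctl pair in use (illustrative; the PR_PAC_AP*KEY mask macros come from this same header, everything else is a placeholder).

    #include <linux/prctl.h>
    #include <sys/prctl.h>

    static int disable_insn_b_key(void)
    {
    	/* arg2: keys affected; arg3: which of those stay enabled (none here). */
    	return prctl(PR_PAC_SET_ENABLED_KEYS, PR_PAC_APIBKEY, 0, 0, 0);
    }

The current state can be read back with prctl(PR_PAC_GET_ENABLED_KEYS, 0, 0, 0, 0).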

View file

@ -111,7 +111,7 @@ OPTIONS
--tracepoints:: --tracepoints::
retrieve statistics from tracepoints retrieve statistics from tracepoints
*z*:: -z::
--skip-zero-records:: --skip-zero-records::
omit records with all zeros in logging mode omit records with all zeros in logging mode

View file

@ -540,6 +540,7 @@ ifndef NO_LIBELF
ifdef LIBBPF_DYNAMIC ifdef LIBBPF_DYNAMIC
ifeq ($(feature-libbpf), 1) ifeq ($(feature-libbpf), 1)
EXTLIBS += -lbpf EXTLIBS += -lbpf
$(call detected,CONFIG_LIBBPF_DYNAMIC)
else else
dummy := $(error Error: No libbpf devel library found, please install libbpf-devel); dummy := $(error Error: No libbpf devel library found, please install libbpf-devel);
endif endif

View file

@ -71,7 +71,7 @@ struct kvm_reg_events_ops kvm_reg_events_ops[] = {
.name = "vmexit", .name = "vmexit",
.ops = &exit_events, .ops = &exit_events,
}, },
{ NULL }, { NULL, NULL },
}; };
const char * const kvm_skip_events[] = { const char * const kvm_skip_events[] = {

View file

@ -356,3 +356,8 @@
439 n64 faccessat2 sys_faccessat2 439 n64 faccessat2 sys_faccessat2
440 n64 process_madvise sys_process_madvise 440 n64 process_madvise sys_process_madvise
441 n64 epoll_pwait2 sys_epoll_pwait2 441 n64 epoll_pwait2 sys_epoll_pwait2
442 n64 mount_setattr sys_mount_setattr
443 n64 quotactl_path sys_quotactl_path
444 n64 landlock_create_ruleset sys_landlock_create_ruleset
445 n64 landlock_add_rule sys_landlock_add_rule
446 n64 landlock_restrict_self sys_landlock_restrict_self

View file

@ -522,3 +522,7 @@
440 common process_madvise sys_process_madvise 440 common process_madvise sys_process_madvise
441 common epoll_pwait2 sys_epoll_pwait2 compat_sys_epoll_pwait2 441 common epoll_pwait2 sys_epoll_pwait2 compat_sys_epoll_pwait2
442 common mount_setattr sys_mount_setattr 442 common mount_setattr sys_mount_setattr
443 common quotactl_path sys_quotactl_path
444 common landlock_create_ruleset sys_landlock_create_ruleset
445 common landlock_add_rule sys_landlock_add_rule
446 common landlock_restrict_self sys_landlock_restrict_self

View file

@ -445,3 +445,7 @@
440 common process_madvise sys_process_madvise sys_process_madvise 440 common process_madvise sys_process_madvise sys_process_madvise
441 common epoll_pwait2 sys_epoll_pwait2 compat_sys_epoll_pwait2 441 common epoll_pwait2 sys_epoll_pwait2 compat_sys_epoll_pwait2
442 common mount_setattr sys_mount_setattr sys_mount_setattr 442 common mount_setattr sys_mount_setattr sys_mount_setattr
443 common quotactl_path sys_quotactl_path sys_quotactl_path
444 common landlock_create_ruleset sys_landlock_create_ruleset sys_landlock_create_ruleset
445 common landlock_add_rule sys_landlock_add_rule sys_landlock_add_rule
446 common landlock_restrict_self sys_landlock_restrict_self sys_landlock_restrict_self

View file

@ -364,6 +364,10 @@
440 common process_madvise sys_process_madvise 440 common process_madvise sys_process_madvise
441 common epoll_pwait2 sys_epoll_pwait2 441 common epoll_pwait2 sys_epoll_pwait2
442 common mount_setattr sys_mount_setattr 442 common mount_setattr sys_mount_setattr
443 common quotactl_path sys_quotactl_path
444 common landlock_create_ruleset sys_landlock_create_ruleset
445 common landlock_add_rule sys_landlock_add_rule
446 common landlock_restrict_self sys_landlock_restrict_self
# #
# Due to a historical design error, certain syscalls are numbered differently # Due to a historical design error, certain syscalls are numbered differently

View file

@ -1123,8 +1123,10 @@ static int process_one_file(const char *fpath, const struct stat *sb,
mapfile = strdup(fpath); mapfile = strdup(fpath);
return 0; return 0;
} }
if (is_json_file(bname))
pr_info("%s: Ignoring file %s\n", prog, fpath); pr_debug("%s: ArchStd json is preprocessed %s\n", prog, fpath);
else
pr_info("%s: Ignoring file %s\n", prog, fpath);
return 0; return 0;
} }

View file

@ -5,7 +5,7 @@ group_fd=-1
flags=0|8 flags=0|8
cpu=* cpu=*
type=0|1 type=0|1
size=120 size=128
config=0 config=0
sample_period=* sample_period=*
sample_type=263 sample_type=263

View file

@ -5,7 +5,7 @@ group_fd=-1
flags=0|8 flags=0|8
cpu=* cpu=*
type=0 type=0
size=120 size=128
config=0 config=0
sample_period=0 sample_period=0
sample_type=65536 sample_type=65536

View file

@ -7,7 +7,7 @@ cpu=*
pid=-1 pid=-1
flags=8 flags=8
type=1 type=1
size=120 size=128
config=9 config=9
sample_period=4000 sample_period=4000
sample_type=455 sample_type=455

View file

@ -145,7 +145,14 @@ perf-$(CONFIG_LIBELF) += symbol-elf.o
perf-$(CONFIG_LIBELF) += probe-file.o perf-$(CONFIG_LIBELF) += probe-file.o
perf-$(CONFIG_LIBELF) += probe-event.o perf-$(CONFIG_LIBELF) += probe-event.o
ifdef CONFIG_LIBBPF_DYNAMIC
hashmap := 1
endif
ifndef CONFIG_LIBBPF ifndef CONFIG_LIBBPF
hashmap := 1
endif
ifdef hashmap
perf-y += hashmap.o perf-y += hashmap.o
endif endif

View file

@ -157,9 +157,15 @@ static int get_max_rate(unsigned int *rate)
static int record_opts__config_freq(struct record_opts *opts) static int record_opts__config_freq(struct record_opts *opts)
{ {
bool user_freq = opts->user_freq != UINT_MAX; bool user_freq = opts->user_freq != UINT_MAX;
bool user_interval = opts->user_interval != ULLONG_MAX;
unsigned int max_rate; unsigned int max_rate;
if (opts->user_interval != ULLONG_MAX) if (user_interval && user_freq) {
pr_err("cannot set frequency and period at the same time\n");
return -1;
}
if (user_interval)
opts->default_interval = opts->user_interval; opts->default_interval = opts->user_interval;
if (user_freq) if (user_freq)
opts->freq = opts->user_freq; opts->freq = opts->user_freq;

View file

@ -904,7 +904,7 @@ static void perf_event__cpu_map_swap(union perf_event *event,
struct perf_record_record_cpu_map *mask; struct perf_record_record_cpu_map *mask;
unsigned i; unsigned i;
data->type = bswap_64(data->type); data->type = bswap_16(data->type);
switch (data->type) { switch (data->type) {
case PERF_CPU_MAP__CPUS: case PERF_CPU_MAP__CPUS:
@ -937,7 +937,7 @@ static void perf_event__stat_config_swap(union perf_event *event,
{ {
u64 size; u64 size;
size = event->stat_config.nr * sizeof(event->stat_config.data[0]); size = bswap_64(event->stat_config.nr) * sizeof(event->stat_config.data[0]);
size += 1; /* nr item itself */ size += 1; /* nr item itself */
mem_bswap_64(&event->stat_config.nr, size); mem_bswap_64(&event->stat_config.nr, size);
} }

View file

@ -54,9 +54,9 @@ idt_handlers:
.align 8 .align 8
/* Fetch current address and append it to idt_handlers. */ /* Fetch current address and append it to idt_handlers. */
current_handler = . 666 :
.pushsection .rodata .pushsection .rodata
.quad current_handler .quad 666b
.popsection .popsection
.if ! \has_error .if ! \has_error

View file

@ -18,6 +18,28 @@
#include "vmx.h" #include "vmx.h"
#define VCPU_ID 5 #define VCPU_ID 5
#define NMI_VECTOR 2
static int ud_count;
void enable_x2apic(void)
{
uint32_t spiv_reg = APIC_BASE_MSR + (APIC_SPIV >> 4);
wrmsr(MSR_IA32_APICBASE, rdmsr(MSR_IA32_APICBASE) |
MSR_IA32_APICBASE_ENABLE | MSR_IA32_APICBASE_EXTD);
wrmsr(spiv_reg, rdmsr(spiv_reg) | APIC_SPIV_APIC_ENABLED);
}
static void guest_ud_handler(struct ex_regs *regs)
{
ud_count++;
regs->rip += 3; /* VMLAUNCH */
}
static void guest_nmi_handler(struct ex_regs *regs)
{
}
void l2_guest_code(void) void l2_guest_code(void)
{ {
@ -25,15 +47,23 @@ void l2_guest_code(void)
GUEST_SYNC(8); GUEST_SYNC(8);
/* Forced exit to L1 upon restore */
GUEST_SYNC(9);
/* Done, exit to L1 and never come back. */ /* Done, exit to L1 and never come back. */
vmcall(); vmcall();
} }
void l1_guest_code(struct vmx_pages *vmx_pages) void guest_code(struct vmx_pages *vmx_pages)
{ {
#define L2_GUEST_STACK_SIZE 64 #define L2_GUEST_STACK_SIZE 64
unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE]; unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE];
enable_x2apic();
GUEST_SYNC(1);
GUEST_SYNC(2);
enable_vp_assist(vmx_pages->vp_assist_gpa, vmx_pages->vp_assist); enable_vp_assist(vmx_pages->vp_assist_gpa, vmx_pages->vp_assist);
GUEST_ASSERT(vmx_pages->vmcs_gpa); GUEST_ASSERT(vmx_pages->vmcs_gpa);
@ -55,27 +85,40 @@ void l1_guest_code(struct vmx_pages *vmx_pages)
current_evmcs->revision_id = EVMCS_VERSION; current_evmcs->revision_id = EVMCS_VERSION;
GUEST_SYNC(6); GUEST_SYNC(6);
current_evmcs->pin_based_vm_exec_control |=
PIN_BASED_NMI_EXITING;
GUEST_ASSERT(!vmlaunch()); GUEST_ASSERT(!vmlaunch());
GUEST_ASSERT(vmptrstz() == vmx_pages->enlightened_vmcs_gpa); GUEST_ASSERT(vmptrstz() == vmx_pages->enlightened_vmcs_gpa);
GUEST_SYNC(9);
/*
* NMI forces an L2->L1 exit; we then resume L2 and hope that the eVMCS is
* up-to-date (RIP points where it should, not at the beginning of
* l2_guest_code()). GUEST_SYNC(9) checks that.
*/
GUEST_ASSERT(!vmresume()); GUEST_ASSERT(!vmresume());
GUEST_ASSERT(vmreadz(VM_EXIT_REASON) == EXIT_REASON_VMCALL);
GUEST_SYNC(10); GUEST_SYNC(10);
}
void guest_code(struct vmx_pages *vmx_pages) GUEST_ASSERT(vmreadz(VM_EXIT_REASON) == EXIT_REASON_VMCALL);
{ GUEST_SYNC(11);
GUEST_SYNC(1);
GUEST_SYNC(2);
if (vmx_pages)
l1_guest_code(vmx_pages);
GUEST_DONE();
/* Try enlightened vmptrld with an incorrect GPA */ /* Try enlightened vmptrld with an incorrect GPA */
evmcs_vmptrld(0xdeadbeef, vmx_pages->enlightened_vmcs); evmcs_vmptrld(0xdeadbeef, vmx_pages->enlightened_vmcs);
GUEST_ASSERT(vmlaunch()); GUEST_ASSERT(vmlaunch());
GUEST_ASSERT(ud_count == 1);
GUEST_DONE();
}
void inject_nmi(struct kvm_vm *vm)
{
struct kvm_vcpu_events events;
vcpu_events_get(vm, VCPU_ID, &events);
events.nmi.pending = 1;
events.flags |= KVM_VCPUEVENT_VALID_NMI_PENDING;
vcpu_events_set(vm, VCPU_ID, &events);
} }
int main(int argc, char *argv[]) int main(int argc, char *argv[])
@ -109,6 +152,13 @@ int main(int argc, char *argv[])
vcpu_alloc_vmx(vm, &vmx_pages_gva); vcpu_alloc_vmx(vm, &vmx_pages_gva);
vcpu_args_set(vm, VCPU_ID, 1, vmx_pages_gva); vcpu_args_set(vm, VCPU_ID, 1, vmx_pages_gva);
vm_init_descriptor_tables(vm);
vcpu_init_descriptor_tables(vm, VCPU_ID);
vm_handle_exception(vm, UD_VECTOR, guest_ud_handler);
vm_handle_exception(vm, NMI_VECTOR, guest_nmi_handler);
pr_info("Running L1 which uses EVMCS to run L2\n");
for (stage = 1;; stage++) { for (stage = 1;; stage++) {
_vcpu_run(vm, VCPU_ID); _vcpu_run(vm, VCPU_ID);
TEST_ASSERT(run->exit_reason == KVM_EXIT_IO, TEST_ASSERT(run->exit_reason == KVM_EXIT_IO,
@ -124,7 +174,7 @@ int main(int argc, char *argv[])
case UCALL_SYNC: case UCALL_SYNC:
break; break;
case UCALL_DONE: case UCALL_DONE:
goto part1_done; goto done;
default: default:
TEST_FAIL("Unknown ucall %lu", uc.cmd); TEST_FAIL("Unknown ucall %lu", uc.cmd);
} }
@ -154,12 +204,14 @@ int main(int argc, char *argv[])
TEST_ASSERT(!memcmp(&regs1, &regs2, sizeof(regs2)), TEST_ASSERT(!memcmp(&regs1, &regs2, sizeof(regs2)),
"Unexpected register values after vcpu_load_state; rdi: %lx rsi: %lx", "Unexpected register values after vcpu_load_state; rdi: %lx rsi: %lx",
(ulong) regs2.rdi, (ulong) regs2.rsi); (ulong) regs2.rdi, (ulong) regs2.rsi);
/* Force immediate L2->L1 exit before resuming */
if (stage == 8) {
pr_info("Injecting NMI into L1 before L2 had a chance to run after restore\n");
inject_nmi(vm);
}
} }
part1_done: done:
_vcpu_run(vm, VCPU_ID);
TEST_ASSERT(run->exit_reason == KVM_EXIT_SHUTDOWN,
"Unexpected successful VMEnter with invalid eVMCS pointer!");
kvm_vm_free(vm); kvm_vm_free(vm);
} }

View file

@ -2893,8 +2893,8 @@ static void grow_halt_poll_ns(struct kvm_vcpu *vcpu)
if (val < grow_start) if (val < grow_start)
val = grow_start; val = grow_start;
if (val > halt_poll_ns) if (val > vcpu->kvm->max_halt_poll_ns)
val = halt_poll_ns; val = vcpu->kvm->max_halt_poll_ns;
vcpu->halt_poll_ns = val; vcpu->halt_poll_ns = val;
out: out:
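grow_halt_poll_ns() now clamps against the per-VM limit rather than the module parameter. A hedged sketch of how userspace sets that per-VM ceiling, assuming vm_fd came from KVM_CREATE_VM and that KVM_CAP_HALT_POLL is reported as available; the helper name is a placeholder.

    #include <linux/kvm.h>
    #include <sys/ioctl.h>

    static int set_max_halt_poll(int vm_fd, __u64 max_ns)
    {
    	struct kvm_enable_cap cap = {
    		.cap  = KVM_CAP_HALT_POLL,
    		.args = { max_ns },	/* new per-VM max_halt_poll_ns */
    	};

    	return ioctl(vm_fd, KVM_ENABLE_CAP, &cap);
    }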
@ -2973,7 +2973,8 @@ void kvm_vcpu_block(struct kvm_vcpu *vcpu)
goto out; goto out;
} }
poll_end = cur = ktime_get(); poll_end = cur = ktime_get();
} while (single_task_running() && ktime_before(cur, stop)); } while (single_task_running() && !need_resched() &&
ktime_before(cur, stop));
} }
prepare_to_rcuwait(&vcpu->wait); prepare_to_rcuwait(&vcpu->wait);