
Merge branch 'linus' into x86/cleanups, to pick up dependent commits

Signed-off-by: Ingo Molnar <mingo@kernel.org>
Ingo Molnar 2021-05-12 19:59:37 +02:00
commit 6f0d271d21
62 changed files with 865 additions and 396 deletions

View file

@ -4803,7 +4803,7 @@ KVM_PV_VM_VERIFY
4.126 KVM_X86_SET_MSR_FILTER 4.126 KVM_X86_SET_MSR_FILTER
---------------------------- ----------------------------
:Capability: KVM_X86_SET_MSR_FILTER :Capability: KVM_CAP_X86_MSR_FILTER
:Architectures: x86 :Architectures: x86
:Type: vm ioctl :Type: vm ioctl
:Parameters: struct kvm_msr_filter :Parameters: struct kvm_msr_filter
@ -6715,7 +6715,7 @@ accesses that would usually trigger a #GP by KVM into the guest will
instead get bounced to user space through the KVM_EXIT_X86_RDMSR and instead get bounced to user space through the KVM_EXIT_X86_RDMSR and
KVM_EXIT_X86_WRMSR exit notifications. KVM_EXIT_X86_WRMSR exit notifications.
8.27 KVM_X86_SET_MSR_FILTER 8.27 KVM_CAP_X86_MSR_FILTER
--------------------------- ---------------------------
:Architectures: x86 :Architectures: x86
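For context, a minimal userspace sketch (not part of this diff) of driving the ioctl documented above; vm_fd, the chosen MSR index and the bitmap polarity are illustrative assumptions, see the full KVM API document for the exact filtering semantics.

#include <linux/kvm.h>
#include <string.h>
#include <sys/ioctl.h>

/* Filter guest writes to a single MSR; everything else keeps the default-allow policy. */
static int set_example_msr_filter(int vm_fd)
{
	__u8 bitmap[1] = { 0 };			/* one MSR covered by the range, bit 0 clear */
	struct kvm_msr_filter filter;

	if (ioctl(vm_fd, KVM_CHECK_EXTENSION, KVM_CAP_X86_MSR_FILTER) <= 0)
		return -1;			/* MSR filtering not supported on this host */

	memset(&filter, 0, sizeof(filter));
	filter.flags = KVM_MSR_FILTER_DEFAULT_ALLOW;
	filter.ranges[0].flags = KVM_MSR_FILTER_WRITE;
	filter.ranges[0].base = 0x1fc;		/* MSR_IA32_POWER_CTL, as an example */
	filter.ranges[0].nmsrs = 1;
	filter.ranges[0].bitmap = bitmap;

	return ioctl(vm_fd, KVM_X86_SET_MSR_FILTER, &filter);
}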

View file

@ -113,6 +113,7 @@
#define VALID_PAGE(x) ((x) != INVALID_PAGE) #define VALID_PAGE(x) ((x) != INVALID_PAGE)
#define UNMAPPED_GVA (~(gpa_t)0) #define UNMAPPED_GVA (~(gpa_t)0)
#define INVALID_GPA (~(gpa_t)0)
/* KVM Hugepage definitions for x86 */ /* KVM Hugepage definitions for x86 */
#define KVM_MAX_HUGEPAGE_LEVEL PG_LEVEL_1G #define KVM_MAX_HUGEPAGE_LEVEL PG_LEVEL_1G
@ -199,6 +200,7 @@ enum x86_intercept_stage;
#define KVM_NR_DB_REGS 4 #define KVM_NR_DB_REGS 4
#define DR6_BUS_LOCK (1 << 11)
#define DR6_BD (1 << 13) #define DR6_BD (1 << 13)
#define DR6_BS (1 << 14) #define DR6_BS (1 << 14)
#define DR6_BT (1 << 15) #define DR6_BT (1 << 15)
@ -212,7 +214,7 @@ enum x86_intercept_stage;
* DR6_ACTIVE_LOW is also used as the init/reset value for DR6. * DR6_ACTIVE_LOW is also used as the init/reset value for DR6.
*/ */
#define DR6_ACTIVE_LOW 0xffff0ff0 #define DR6_ACTIVE_LOW 0xffff0ff0
#define DR6_VOLATILE 0x0001e00f #define DR6_VOLATILE 0x0001e80f
#define DR6_FIXED_1 (DR6_ACTIVE_LOW & ~DR6_VOLATILE) #define DR6_FIXED_1 (DR6_ACTIVE_LOW & ~DR6_VOLATILE)
#define DR7_BP_EN_MASK 0x000000ff #define DR7_BP_EN_MASK 0x000000ff
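As a quick check of the constants touched here (a sketch, not part of the patch): folding the new bus-lock bit into DR6_VOLATILE drops bit 11 from DR6_FIXED_1, since DR6_FIXED_1 is derived from the active-low mask.

/* Values implied by the hunk above, evaluated by hand. */
#define DR6_BUS_LOCK	(1 << 11)				/* 0x00000800 */
#define DR6_ACTIVE_LOW	0xffff0ff0
#define DR6_VOLATILE	0x0001e80f				/* was 0x0001e00f */
#define DR6_FIXED_1	(DR6_ACTIVE_LOW & ~DR6_VOLATILE)	/* now 0xfffe07f0, was 0xfffe0ff0 */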
@ -407,7 +409,7 @@ struct kvm_mmu {
u32 pkru_mask; u32 pkru_mask;
u64 *pae_root; u64 *pae_root;
u64 *lm_root; u64 *pml4_root;
/* /*
* check zero bits on shadow page table entries, these * check zero bits on shadow page table entries, these
@ -1417,6 +1419,7 @@ struct kvm_arch_async_pf {
bool direct_map; bool direct_map;
}; };
extern u32 __read_mostly kvm_nr_uret_msrs;
extern u64 __read_mostly host_efer; extern u64 __read_mostly host_efer;
extern bool __read_mostly allow_smaller_maxphyaddr; extern bool __read_mostly allow_smaller_maxphyaddr;
extern struct kvm_x86_ops kvm_x86_ops; extern struct kvm_x86_ops kvm_x86_ops;
@ -1775,9 +1778,15 @@ int kvm_pv_send_ipi(struct kvm *kvm, unsigned long ipi_bitmap_low,
unsigned long ipi_bitmap_high, u32 min, unsigned long ipi_bitmap_high, u32 min,
unsigned long icr, int op_64_bit); unsigned long icr, int op_64_bit);
void kvm_define_user_return_msr(unsigned index, u32 msr); int kvm_add_user_return_msr(u32 msr);
int kvm_find_user_return_msr(u32 msr);
int kvm_set_user_return_msr(unsigned index, u64 val, u64 mask); int kvm_set_user_return_msr(unsigned index, u64 val, u64 mask);
static inline bool kvm_is_supported_user_return_msr(u32 msr)
{
return kvm_find_user_return_msr(msr) >= 0;
}
u64 kvm_scale_tsc(struct kvm_vcpu *vcpu, u64 tsc); u64 kvm_scale_tsc(struct kvm_vcpu *vcpu, u64 tsc);
u64 kvm_read_l1_tsc(struct kvm_vcpu *vcpu, u64 host_tsc); u64 kvm_read_l1_tsc(struct kvm_vcpu *vcpu, u64 host_tsc);
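The declarations above replace kvm_define_user_return_msr() with a register/lookup interface. Below is a condensed sketch of the intended usage, modeled on the SVM changes later in this diff; the example_* names and the includes are placeholders, not code from the patch.

#include <asm/kvm_host.h>	/* kvm_add_user_return_msr(), kvm_set_user_return_msr() */
#include <asm/msr-index.h>	/* MSR_TSC_AUX */

/* Registered once at hardware setup; -1 means the host probe (RDMSR/WRMSR) failed. */
static int example_tsc_aux_uret_slot __read_mostly = -1;

static void example_hardware_setup(void)
{
	example_tsc_aux_uret_slot = kvm_add_user_return_msr(MSR_TSC_AUX);
}

/* Before entering the guest: queue the guest value; the user-return notifier
 * restores the host value when the CPU returns to userspace. */
static void example_prepare_guest_switch(u64 guest_tsc_aux)
{
	if (example_tsc_aux_uret_slot >= 0)
		kvm_set_user_return_msr(example_tsc_aux_uret_slot, guest_tsc_aux, -1ull);
}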

View file

@ -7,8 +7,6 @@
#include <linux/interrupt.h> #include <linux/interrupt.h>
#include <uapi/asm/kvm_para.h> #include <uapi/asm/kvm_para.h>
extern void kvmclock_init(void);
#ifdef CONFIG_KVM_GUEST #ifdef CONFIG_KVM_GUEST
bool kvm_check_and_clear_guest_paused(void); bool kvm_check_and_clear_guest_paused(void);
#else #else
@ -86,13 +84,14 @@ static inline long kvm_hypercall4(unsigned int nr, unsigned long p1,
} }
#ifdef CONFIG_KVM_GUEST #ifdef CONFIG_KVM_GUEST
void kvmclock_init(void);
void kvmclock_disable(void);
bool kvm_para_available(void); bool kvm_para_available(void);
unsigned int kvm_arch_para_features(void); unsigned int kvm_arch_para_features(void);
unsigned int kvm_arch_para_hints(void); unsigned int kvm_arch_para_hints(void);
void kvm_async_pf_task_wait_schedule(u32 token); void kvm_async_pf_task_wait_schedule(u32 token);
void kvm_async_pf_task_wake(u32 token); void kvm_async_pf_task_wake(u32 token);
u32 kvm_read_and_reset_apf_flags(void); u32 kvm_read_and_reset_apf_flags(void);
void kvm_disable_steal_time(void);
bool __kvm_handle_async_pf(struct pt_regs *regs, u32 token); bool __kvm_handle_async_pf(struct pt_regs *regs, u32 token);
DECLARE_STATIC_KEY_FALSE(kvm_async_pf_enabled); DECLARE_STATIC_KEY_FALSE(kvm_async_pf_enabled);
@ -137,11 +136,6 @@ static inline u32 kvm_read_and_reset_apf_flags(void)
return 0; return 0;
} }
static inline void kvm_disable_steal_time(void)
{
return;
}
static __always_inline bool kvm_handle_async_pf(struct pt_regs *regs, u32 token) static __always_inline bool kvm_handle_async_pf(struct pt_regs *regs, u32 token)
{ {
return false; return false;

View file

@ -437,6 +437,8 @@ struct kvm_vmx_nested_state_hdr {
__u16 flags; __u16 flags;
} smm; } smm;
__u16 pad;
__u32 flags; __u32 flags;
__u64 preemption_timer_deadline; __u64 preemption_timer_deadline;
}; };

View file

@ -26,6 +26,7 @@
#include <linux/kprobes.h> #include <linux/kprobes.h>
#include <linux/nmi.h> #include <linux/nmi.h>
#include <linux/swait.h> #include <linux/swait.h>
#include <linux/syscore_ops.h>
#include <asm/timer.h> #include <asm/timer.h>
#include <asm/cpu.h> #include <asm/cpu.h>
#include <asm/traps.h> #include <asm/traps.h>
@ -37,6 +38,7 @@
#include <asm/tlb.h> #include <asm/tlb.h>
#include <asm/cpuidle_haltpoll.h> #include <asm/cpuidle_haltpoll.h>
#include <asm/ptrace.h> #include <asm/ptrace.h>
#include <asm/reboot.h>
#include <asm/svm.h> #include <asm/svm.h>
DEFINE_STATIC_KEY_FALSE(kvm_async_pf_enabled); DEFINE_STATIC_KEY_FALSE(kvm_async_pf_enabled);
@ -345,7 +347,7 @@ static void kvm_guest_cpu_init(void)
wrmsrl(MSR_KVM_ASYNC_PF_EN, pa); wrmsrl(MSR_KVM_ASYNC_PF_EN, pa);
__this_cpu_write(apf_reason.enabled, 1); __this_cpu_write(apf_reason.enabled, 1);
pr_info("KVM setup async PF for cpu %d\n", smp_processor_id()); pr_info("setup async PF for cpu %d\n", smp_processor_id());
} }
if (kvm_para_has_feature(KVM_FEATURE_PV_EOI)) { if (kvm_para_has_feature(KVM_FEATURE_PV_EOI)) {
@ -371,34 +373,17 @@ static void kvm_pv_disable_apf(void)
wrmsrl(MSR_KVM_ASYNC_PF_EN, 0); wrmsrl(MSR_KVM_ASYNC_PF_EN, 0);
__this_cpu_write(apf_reason.enabled, 0); __this_cpu_write(apf_reason.enabled, 0);
pr_info("Unregister pv shared memory for cpu %d\n", smp_processor_id()); pr_info("disable async PF for cpu %d\n", smp_processor_id());
} }
static void kvm_pv_guest_cpu_reboot(void *unused) static void kvm_disable_steal_time(void)
{ {
/* if (!has_steal_clock)
* We disable PV EOI before we load a new kernel by kexec, return;
* since MSR_KVM_PV_EOI_EN stores a pointer into old kernel's memory.
* New kernel can re-enable when it boots.
*/
if (kvm_para_has_feature(KVM_FEATURE_PV_EOI))
wrmsrl(MSR_KVM_PV_EOI_EN, 0);
kvm_pv_disable_apf();
kvm_disable_steal_time();
}
static int kvm_pv_reboot_notify(struct notifier_block *nb, wrmsr(MSR_KVM_STEAL_TIME, 0, 0);
unsigned long code, void *unused)
{
if (code == SYS_RESTART)
on_each_cpu(kvm_pv_guest_cpu_reboot, NULL, 1);
return NOTIFY_DONE;
} }
static struct notifier_block kvm_pv_reboot_nb = {
.notifier_call = kvm_pv_reboot_notify,
};
static u64 kvm_steal_clock(int cpu) static u64 kvm_steal_clock(int cpu)
{ {
u64 steal; u64 steal;
@ -416,14 +401,6 @@ static u64 kvm_steal_clock(int cpu)
return steal; return steal;
} }
void kvm_disable_steal_time(void)
{
if (!has_steal_clock)
return;
wrmsr(MSR_KVM_STEAL_TIME, 0, 0);
}
static inline void __set_percpu_decrypted(void *ptr, unsigned long size) static inline void __set_percpu_decrypted(void *ptr, unsigned long size)
{ {
early_set_memory_decrypted((unsigned long) ptr, size); early_set_memory_decrypted((unsigned long) ptr, size);
@ -451,6 +428,27 @@ static void __init sev_map_percpu_data(void)
} }
} }
static void kvm_guest_cpu_offline(bool shutdown)
{
kvm_disable_steal_time();
if (kvm_para_has_feature(KVM_FEATURE_PV_EOI))
wrmsrl(MSR_KVM_PV_EOI_EN, 0);
kvm_pv_disable_apf();
if (!shutdown)
apf_task_wake_all();
kvmclock_disable();
}
static int kvm_cpu_online(unsigned int cpu)
{
unsigned long flags;
local_irq_save(flags);
kvm_guest_cpu_init();
local_irq_restore(flags);
return 0;
}
#ifdef CONFIG_SMP #ifdef CONFIG_SMP
static DEFINE_PER_CPU(cpumask_var_t, __pv_cpu_mask); static DEFINE_PER_CPU(cpumask_var_t, __pv_cpu_mask);
@ -635,33 +633,66 @@ static void __init kvm_smp_prepare_boot_cpu(void)
kvm_spinlock_init(); kvm_spinlock_init();
} }
static void kvm_guest_cpu_offline(void)
{
kvm_disable_steal_time();
if (kvm_para_has_feature(KVM_FEATURE_PV_EOI))
wrmsrl(MSR_KVM_PV_EOI_EN, 0);
kvm_pv_disable_apf();
apf_task_wake_all();
}
static int kvm_cpu_online(unsigned int cpu)
{
local_irq_disable();
kvm_guest_cpu_init();
local_irq_enable();
return 0;
}
static int kvm_cpu_down_prepare(unsigned int cpu) static int kvm_cpu_down_prepare(unsigned int cpu)
{ {
local_irq_disable(); unsigned long flags;
kvm_guest_cpu_offline();
local_irq_enable(); local_irq_save(flags);
kvm_guest_cpu_offline(false);
local_irq_restore(flags);
return 0; return 0;
} }
#endif #endif
static int kvm_suspend(void)
{
kvm_guest_cpu_offline(false);
return 0;
}
static void kvm_resume(void)
{
kvm_cpu_online(raw_smp_processor_id());
}
static struct syscore_ops kvm_syscore_ops = {
.suspend = kvm_suspend,
.resume = kvm_resume,
};
static void kvm_pv_guest_cpu_reboot(void *unused)
{
kvm_guest_cpu_offline(true);
}
static int kvm_pv_reboot_notify(struct notifier_block *nb,
unsigned long code, void *unused)
{
if (code == SYS_RESTART)
on_each_cpu(kvm_pv_guest_cpu_reboot, NULL, 1);
return NOTIFY_DONE;
}
static struct notifier_block kvm_pv_reboot_nb = {
.notifier_call = kvm_pv_reboot_notify,
};
/*
* After a PV feature is registered, the host will keep writing to the
* registered memory location. If the guest happens to shutdown, this memory
* won't be valid. In cases like kexec, in which you install a new kernel, this
* means a random memory location will be kept being written.
*/
#ifdef CONFIG_KEXEC_CORE
static void kvm_crash_shutdown(struct pt_regs *regs)
{
kvm_guest_cpu_offline(true);
native_machine_crash_shutdown(regs);
}
#endif
static void __init kvm_guest_init(void) static void __init kvm_guest_init(void)
{ {
int i; int i;
@ -704,6 +735,12 @@ static void __init kvm_guest_init(void)
kvm_guest_cpu_init(); kvm_guest_cpu_init();
#endif #endif
#ifdef CONFIG_KEXEC_CORE
machine_ops.crash_shutdown = kvm_crash_shutdown;
#endif
register_syscore_ops(&kvm_syscore_ops);
/* /*
* Hard lockup detection is enabled by default. Disable it, as guests * Hard lockup detection is enabled by default. Disable it, as guests
* can get false positives too easily, for example if the host is * can get false positives too easily, for example if the host is

View file

@ -20,7 +20,6 @@
#include <asm/hypervisor.h> #include <asm/hypervisor.h>
#include <asm/mem_encrypt.h> #include <asm/mem_encrypt.h>
#include <asm/x86_init.h> #include <asm/x86_init.h>
#include <asm/reboot.h>
#include <asm/kvmclock.h> #include <asm/kvmclock.h>
static int kvmclock __initdata = 1; static int kvmclock __initdata = 1;
@ -203,28 +202,9 @@ static void kvm_setup_secondary_clock(void)
} }
#endif #endif
/* void kvmclock_disable(void)
* After the clock is registered, the host will keep writing to the
* registered memory location. If the guest happens to shutdown, this memory
* won't be valid. In cases like kexec, in which you install a new kernel, this
* means a random memory location will be kept being written. So before any
* kind of shutdown from our side, we unregister the clock by writing anything
* that does not have the 'enable' bit set in the msr
*/
#ifdef CONFIG_KEXEC_CORE
static void kvm_crash_shutdown(struct pt_regs *regs)
{ {
native_write_msr(msr_kvm_system_time, 0, 0); native_write_msr(msr_kvm_system_time, 0, 0);
kvm_disable_steal_time();
native_machine_crash_shutdown(regs);
}
#endif
static void kvm_shutdown(void)
{
native_write_msr(msr_kvm_system_time, 0, 0);
kvm_disable_steal_time();
native_machine_shutdown();
} }
static void __init kvmclock_init_mem(void) static void __init kvmclock_init_mem(void)
@ -351,10 +331,6 @@ void __init kvmclock_init(void)
#endif #endif
x86_platform.save_sched_clock_state = kvm_save_sched_clock_state; x86_platform.save_sched_clock_state = kvm_save_sched_clock_state;
x86_platform.restore_sched_clock_state = kvm_restore_sched_clock_state; x86_platform.restore_sched_clock_state = kvm_restore_sched_clock_state;
machine_ops.shutdown = kvm_shutdown;
#ifdef CONFIG_KEXEC_CORE
machine_ops.crash_shutdown = kvm_crash_shutdown;
#endif
kvm_get_preset_lpj(); kvm_get_preset_lpj();
/* /*

View file

@ -458,7 +458,7 @@ void kvm_set_cpu_caps(void)
F(AVX512_VPOPCNTDQ) | F(UMIP) | F(AVX512_VBMI2) | F(GFNI) | F(AVX512_VPOPCNTDQ) | F(UMIP) | F(AVX512_VBMI2) | F(GFNI) |
F(VAES) | F(VPCLMULQDQ) | F(AVX512_VNNI) | F(AVX512_BITALG) | F(VAES) | F(VPCLMULQDQ) | F(AVX512_VNNI) | F(AVX512_BITALG) |
F(CLDEMOTE) | F(MOVDIRI) | F(MOVDIR64B) | 0 /*WAITPKG*/ | F(CLDEMOTE) | F(MOVDIRI) | F(MOVDIR64B) | 0 /*WAITPKG*/ |
F(SGX_LC) F(SGX_LC) | F(BUS_LOCK_DETECT)
); );
/* Set LA57 based on hardware capability. */ /* Set LA57 based on hardware capability. */
if (cpuid_ecx(7) & F(LA57)) if (cpuid_ecx(7) & F(LA57))
@ -567,6 +567,21 @@ void kvm_set_cpu_caps(void)
F(ACE2) | F(ACE2_EN) | F(PHE) | F(PHE_EN) | F(ACE2) | F(ACE2_EN) | F(PHE) | F(PHE_EN) |
F(PMM) | F(PMM_EN) F(PMM) | F(PMM_EN)
); );
/*
* Hide RDTSCP and RDPID if either feature is reported as supported but
* probing MSR_TSC_AUX failed. This is purely a sanity check and
* should never happen, but the guest will likely crash if RDTSCP or
* RDPID is misreported, and KVM has botched MSR_TSC_AUX emulation in
* the past. For example, the sanity check may fire if this instance of
* KVM is running as L1 on top of an older, broken KVM.
*/
if (WARN_ON((kvm_cpu_cap_has(X86_FEATURE_RDTSCP) ||
kvm_cpu_cap_has(X86_FEATURE_RDPID)) &&
!kvm_is_supported_user_return_msr(MSR_TSC_AUX))) {
kvm_cpu_cap_clear(X86_FEATURE_RDTSCP);
kvm_cpu_cap_clear(X86_FEATURE_RDPID);
}
} }
EXPORT_SYMBOL_GPL(kvm_set_cpu_caps); EXPORT_SYMBOL_GPL(kvm_set_cpu_caps);
@ -637,7 +652,8 @@ static int __do_cpuid_func_emulated(struct kvm_cpuid_array *array, u32 func)
case 7: case 7:
entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX; entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
entry->eax = 0; entry->eax = 0;
entry->ecx = F(RDPID); if (kvm_cpu_cap_has(X86_FEATURE_RDTSCP))
entry->ecx = F(RDPID);
++array->nent; ++array->nent;
default: default:
break; break;

View file

@ -4502,7 +4502,7 @@ static const struct opcode group8[] = {
* from the register case of group9. * from the register case of group9.
*/ */
static const struct gprefix pfx_0f_c7_7 = { static const struct gprefix pfx_0f_c7_7 = {
N, N, N, II(DstMem | ModRM | Op3264 | EmulateOnUD, em_rdpid, rdtscp), N, N, N, II(DstMem | ModRM | Op3264 | EmulateOnUD, em_rdpid, rdpid),
}; };

View file

@ -468,6 +468,7 @@ enum x86_intercept {
x86_intercept_clgi, x86_intercept_clgi,
x86_intercept_skinit, x86_intercept_skinit,
x86_intercept_rdtscp, x86_intercept_rdtscp,
x86_intercept_rdpid,
x86_intercept_icebp, x86_intercept_icebp,
x86_intercept_wbinvd, x86_intercept_wbinvd,
x86_intercept_monitor, x86_intercept_monitor,

View file

@ -1913,8 +1913,8 @@ void kvm_lapic_expired_hv_timer(struct kvm_vcpu *vcpu)
if (!apic->lapic_timer.hv_timer_in_use) if (!apic->lapic_timer.hv_timer_in_use)
goto out; goto out;
WARN_ON(rcuwait_active(&vcpu->wait)); WARN_ON(rcuwait_active(&vcpu->wait));
cancel_hv_timer(apic);
apic_timer_expired(apic, false); apic_timer_expired(apic, false);
cancel_hv_timer(apic);
if (apic_lvtt_period(apic) && apic->lapic_timer.period) { if (apic_lvtt_period(apic) && apic->lapic_timer.period) {
advance_periodic_target_expiration(apic); advance_periodic_target_expiration(apic);

View file

@ -3310,12 +3310,12 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
if (mmu->shadow_root_level == PT64_ROOT_4LEVEL) { if (mmu->shadow_root_level == PT64_ROOT_4LEVEL) {
pm_mask |= PT_ACCESSED_MASK | PT_WRITABLE_MASK | PT_USER_MASK; pm_mask |= PT_ACCESSED_MASK | PT_WRITABLE_MASK | PT_USER_MASK;
if (WARN_ON_ONCE(!mmu->lm_root)) { if (WARN_ON_ONCE(!mmu->pml4_root)) {
r = -EIO; r = -EIO;
goto out_unlock; goto out_unlock;
} }
mmu->lm_root[0] = __pa(mmu->pae_root) | pm_mask; mmu->pml4_root[0] = __pa(mmu->pae_root) | pm_mask;
} }
for (i = 0; i < 4; ++i) { for (i = 0; i < 4; ++i) {
@ -3335,7 +3335,7 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
} }
if (mmu->shadow_root_level == PT64_ROOT_4LEVEL) if (mmu->shadow_root_level == PT64_ROOT_4LEVEL)
mmu->root_hpa = __pa(mmu->lm_root); mmu->root_hpa = __pa(mmu->pml4_root);
else else
mmu->root_hpa = __pa(mmu->pae_root); mmu->root_hpa = __pa(mmu->pae_root);
@ -3350,7 +3350,7 @@ out_unlock:
static int mmu_alloc_special_roots(struct kvm_vcpu *vcpu) static int mmu_alloc_special_roots(struct kvm_vcpu *vcpu)
{ {
struct kvm_mmu *mmu = vcpu->arch.mmu; struct kvm_mmu *mmu = vcpu->arch.mmu;
u64 *lm_root, *pae_root; u64 *pml4_root, *pae_root;
/* /*
* When shadowing 32-bit or PAE NPT with 64-bit NPT, the PML4 and PDP * When shadowing 32-bit or PAE NPT with 64-bit NPT, the PML4 and PDP
@ -3369,14 +3369,14 @@ static int mmu_alloc_special_roots(struct kvm_vcpu *vcpu)
if (WARN_ON_ONCE(mmu->shadow_root_level != PT64_ROOT_4LEVEL)) if (WARN_ON_ONCE(mmu->shadow_root_level != PT64_ROOT_4LEVEL))
return -EIO; return -EIO;
if (mmu->pae_root && mmu->lm_root) if (mmu->pae_root && mmu->pml4_root)
return 0; return 0;
/* /*
* The special roots should always be allocated in concert. Yell and * The special roots should always be allocated in concert. Yell and
* bail if KVM ends up in a state where only one of the roots is valid. * bail if KVM ends up in a state where only one of the roots is valid.
*/ */
if (WARN_ON_ONCE(!tdp_enabled || mmu->pae_root || mmu->lm_root)) if (WARN_ON_ONCE(!tdp_enabled || mmu->pae_root || mmu->pml4_root))
return -EIO; return -EIO;
/* /*
@ -3387,14 +3387,14 @@ static int mmu_alloc_special_roots(struct kvm_vcpu *vcpu)
if (!pae_root) if (!pae_root)
return -ENOMEM; return -ENOMEM;
lm_root = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT); pml4_root = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT);
if (!lm_root) { if (!pml4_root) {
free_page((unsigned long)pae_root); free_page((unsigned long)pae_root);
return -ENOMEM; return -ENOMEM;
} }
mmu->pae_root = pae_root; mmu->pae_root = pae_root;
mmu->lm_root = lm_root; mmu->pml4_root = pml4_root;
return 0; return 0;
} }
@ -5261,7 +5261,7 @@ static void free_mmu_pages(struct kvm_mmu *mmu)
if (!tdp_enabled && mmu->pae_root) if (!tdp_enabled && mmu->pae_root)
set_memory_encrypted((unsigned long)mmu->pae_root, 1); set_memory_encrypted((unsigned long)mmu->pae_root, 1);
free_page((unsigned long)mmu->pae_root); free_page((unsigned long)mmu->pae_root);
free_page((unsigned long)mmu->lm_root); free_page((unsigned long)mmu->pml4_root);
} }
static int __kvm_mmu_create(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu) static int __kvm_mmu_create(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu)

View file

@ -388,7 +388,7 @@ static void handle_removed_tdp_mmu_page(struct kvm *kvm, tdp_ptep_t pt,
} }
/** /**
* handle_changed_spte - handle bookkeeping associated with an SPTE change * __handle_changed_spte - handle bookkeeping associated with an SPTE change
* @kvm: kvm instance * @kvm: kvm instance
* @as_id: the address space of the paging structure the SPTE was a part of * @as_id: the address space of the paging structure the SPTE was a part of
* @gfn: the base GFN that was mapped by the SPTE * @gfn: the base GFN that was mapped by the SPTE
@ -444,6 +444,13 @@ static void __handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
trace_kvm_tdp_mmu_spte_changed(as_id, gfn, level, old_spte, new_spte); trace_kvm_tdp_mmu_spte_changed(as_id, gfn, level, old_spte, new_spte);
if (is_large_pte(old_spte) != is_large_pte(new_spte)) {
if (is_large_pte(old_spte))
atomic64_sub(1, (atomic64_t*)&kvm->stat.lpages);
else
atomic64_add(1, (atomic64_t*)&kvm->stat.lpages);
}
/* /*
* The only times a SPTE should be changed from a non-present to * The only times a SPTE should be changed from a non-present to
* non-present state is when an MMIO entry is installed/modified/ * non-present state is when an MMIO entry is installed/modified/
@ -1009,6 +1016,14 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
} }
if (!is_shadow_present_pte(iter.old_spte)) { if (!is_shadow_present_pte(iter.old_spte)) {
/*
* If the SPTE has been frozen by another thread, just
* give up and retry, avoiding unnecessary page table
* allocation and free.
*/
if (is_removed_spte(iter.old_spte))
break;
sp = alloc_tdp_mmu_page(vcpu, iter.gfn, iter.level); sp = alloc_tdp_mmu_page(vcpu, iter.gfn, iter.level);
child_pt = sp->spt; child_pt = sp->spt;

View file

@ -764,7 +764,6 @@ int nested_svm_vmexit(struct vcpu_svm *svm)
nested_svm_copy_common_state(svm->nested.vmcb02.ptr, svm->vmcb01.ptr); nested_svm_copy_common_state(svm->nested.vmcb02.ptr, svm->vmcb01.ptr);
svm_switch_vmcb(svm, &svm->vmcb01); svm_switch_vmcb(svm, &svm->vmcb01);
WARN_ON_ONCE(svm->vmcb->control.exit_code != SVM_EXIT_VMRUN);
/* /*
* On vmexit the GIF is set to false and * On vmexit the GIF is set to false and
@ -872,6 +871,15 @@ void svm_free_nested(struct vcpu_svm *svm)
__free_page(virt_to_page(svm->nested.vmcb02.ptr)); __free_page(virt_to_page(svm->nested.vmcb02.ptr));
svm->nested.vmcb02.ptr = NULL; svm->nested.vmcb02.ptr = NULL;
/*
* When last_vmcb12_gpa matches the current vmcb12 gpa,
* some vmcb12 fields are not loaded if they are marked clean
* in the vmcb12, since in this case they are up to date already.
*
* When the vmcb02 is freed, this optimization becomes invalid.
*/
svm->nested.last_vmcb12_gpa = INVALID_GPA;
svm->nested.initialized = false; svm->nested.initialized = false;
} }
@ -884,9 +892,11 @@ void svm_leave_nested(struct vcpu_svm *svm)
if (is_guest_mode(vcpu)) { if (is_guest_mode(vcpu)) {
svm->nested.nested_run_pending = 0; svm->nested.nested_run_pending = 0;
svm->nested.vmcb12_gpa = INVALID_GPA;
leave_guest_mode(vcpu); leave_guest_mode(vcpu);
svm_switch_vmcb(svm, &svm->nested.vmcb02); svm_switch_vmcb(svm, &svm->vmcb01);
nested_svm_uninit_mmu_context(vcpu); nested_svm_uninit_mmu_context(vcpu);
vmcb_mark_all_dirty(svm->vmcb); vmcb_mark_all_dirty(svm->vmcb);
@ -1298,12 +1308,17 @@ static int svm_set_nested_state(struct kvm_vcpu *vcpu,
* L2 registers if needed are moved from the current VMCB to VMCB02. * L2 registers if needed are moved from the current VMCB to VMCB02.
*/ */
if (is_guest_mode(vcpu))
svm_leave_nested(svm);
else
svm->nested.vmcb02.ptr->save = svm->vmcb01.ptr->save;
svm_set_gif(svm, !!(kvm_state->flags & KVM_STATE_NESTED_GIF_SET));
svm->nested.nested_run_pending = svm->nested.nested_run_pending =
!!(kvm_state->flags & KVM_STATE_NESTED_RUN_PENDING); !!(kvm_state->flags & KVM_STATE_NESTED_RUN_PENDING);
svm->nested.vmcb12_gpa = kvm_state->hdr.svm.vmcb_pa; svm->nested.vmcb12_gpa = kvm_state->hdr.svm.vmcb_pa;
if (svm->current_vmcb == &svm->vmcb01)
svm->nested.vmcb02.ptr->save = svm->vmcb01.ptr->save;
svm->vmcb01.ptr->save.es = save->es; svm->vmcb01.ptr->save.es = save->es;
svm->vmcb01.ptr->save.cs = save->cs; svm->vmcb01.ptr->save.cs = save->cs;

View file

@ -763,7 +763,7 @@ static int __sev_dbg_decrypt(struct kvm *kvm, unsigned long src_paddr,
} }
static int __sev_dbg_decrypt_user(struct kvm *kvm, unsigned long paddr, static int __sev_dbg_decrypt_user(struct kvm *kvm, unsigned long paddr,
unsigned long __user dst_uaddr, void __user *dst_uaddr,
unsigned long dst_paddr, unsigned long dst_paddr,
int size, int *err) int size, int *err)
{ {
@ -787,8 +787,7 @@ static int __sev_dbg_decrypt_user(struct kvm *kvm, unsigned long paddr,
if (tpage) { if (tpage) {
offset = paddr & 15; offset = paddr & 15;
if (copy_to_user((void __user *)(uintptr_t)dst_uaddr, if (copy_to_user(dst_uaddr, page_address(tpage) + offset, size))
page_address(tpage) + offset, size))
ret = -EFAULT; ret = -EFAULT;
} }
@ -800,9 +799,9 @@ e_free:
} }
static int __sev_dbg_encrypt_user(struct kvm *kvm, unsigned long paddr, static int __sev_dbg_encrypt_user(struct kvm *kvm, unsigned long paddr,
unsigned long __user vaddr, void __user *vaddr,
unsigned long dst_paddr, unsigned long dst_paddr,
unsigned long __user dst_vaddr, void __user *dst_vaddr,
int size, int *error) int size, int *error)
{ {
struct page *src_tpage = NULL; struct page *src_tpage = NULL;
@ -810,13 +809,12 @@ static int __sev_dbg_encrypt_user(struct kvm *kvm, unsigned long paddr,
int ret, len = size; int ret, len = size;
/* If source buffer is not aligned then use an intermediate buffer */ /* If source buffer is not aligned then use an intermediate buffer */
if (!IS_ALIGNED(vaddr, 16)) { if (!IS_ALIGNED((unsigned long)vaddr, 16)) {
src_tpage = alloc_page(GFP_KERNEL); src_tpage = alloc_page(GFP_KERNEL);
if (!src_tpage) if (!src_tpage)
return -ENOMEM; return -ENOMEM;
if (copy_from_user(page_address(src_tpage), if (copy_from_user(page_address(src_tpage), vaddr, size)) {
(void __user *)(uintptr_t)vaddr, size)) {
__free_page(src_tpage); __free_page(src_tpage);
return -EFAULT; return -EFAULT;
} }
@ -830,7 +828,7 @@ static int __sev_dbg_encrypt_user(struct kvm *kvm, unsigned long paddr,
* - copy the source buffer in an intermediate buffer * - copy the source buffer in an intermediate buffer
* - use the intermediate buffer as source buffer * - use the intermediate buffer as source buffer
*/ */
if (!IS_ALIGNED(dst_vaddr, 16) || !IS_ALIGNED(size, 16)) { if (!IS_ALIGNED((unsigned long)dst_vaddr, 16) || !IS_ALIGNED(size, 16)) {
int dst_offset; int dst_offset;
dst_tpage = alloc_page(GFP_KERNEL); dst_tpage = alloc_page(GFP_KERNEL);
@ -855,7 +853,7 @@ static int __sev_dbg_encrypt_user(struct kvm *kvm, unsigned long paddr,
page_address(src_tpage), size); page_address(src_tpage), size);
else { else {
if (copy_from_user(page_address(dst_tpage) + dst_offset, if (copy_from_user(page_address(dst_tpage) + dst_offset,
(void __user *)(uintptr_t)vaddr, size)) { vaddr, size)) {
ret = -EFAULT; ret = -EFAULT;
goto e_free; goto e_free;
} }
@ -935,15 +933,15 @@ static int sev_dbg_crypt(struct kvm *kvm, struct kvm_sev_cmd *argp, bool dec)
if (dec) if (dec)
ret = __sev_dbg_decrypt_user(kvm, ret = __sev_dbg_decrypt_user(kvm,
__sme_page_pa(src_p[0]) + s_off, __sme_page_pa(src_p[0]) + s_off,
dst_vaddr, (void __user *)dst_vaddr,
__sme_page_pa(dst_p[0]) + d_off, __sme_page_pa(dst_p[0]) + d_off,
len, &argp->error); len, &argp->error);
else else
ret = __sev_dbg_encrypt_user(kvm, ret = __sev_dbg_encrypt_user(kvm,
__sme_page_pa(src_p[0]) + s_off, __sme_page_pa(src_p[0]) + s_off,
vaddr, (void __user *)vaddr,
__sme_page_pa(dst_p[0]) + d_off, __sme_page_pa(dst_p[0]) + d_off,
dst_vaddr, (void __user *)dst_vaddr,
len, &argp->error); len, &argp->error);
sev_unpin_memory(kvm, src_p, n); sev_unpin_memory(kvm, src_p, n);
@ -1764,7 +1762,8 @@ e_mirror_unlock:
e_source_unlock: e_source_unlock:
mutex_unlock(&source_kvm->lock); mutex_unlock(&source_kvm->lock);
e_source_put: e_source_put:
fput(source_kvm_file); if (source_kvm_file)
fput(source_kvm_file);
return ret; return ret;
} }
@ -2198,7 +2197,7 @@ vmgexit_err:
return -EINVAL; return -EINVAL;
} }
static void pre_sev_es_run(struct vcpu_svm *svm) void sev_es_unmap_ghcb(struct vcpu_svm *svm)
{ {
if (!svm->ghcb) if (!svm->ghcb)
return; return;
@ -2234,9 +2233,6 @@ void pre_sev_run(struct vcpu_svm *svm, int cpu)
struct svm_cpu_data *sd = per_cpu(svm_data, cpu); struct svm_cpu_data *sd = per_cpu(svm_data, cpu);
int asid = sev_get_asid(svm->vcpu.kvm); int asid = sev_get_asid(svm->vcpu.kvm);
/* Perform any SEV-ES pre-run actions */
pre_sev_es_run(svm);
/* Assign the asid allocated with this SEV guest */ /* Assign the asid allocated with this SEV guest */
svm->asid = asid; svm->asid = asid;

View file

@ -212,7 +212,7 @@ DEFINE_PER_CPU(struct svm_cpu_data *, svm_data);
* RDTSCP and RDPID are not used in the kernel, specifically to allow KVM to * RDTSCP and RDPID are not used in the kernel, specifically to allow KVM to
* defer the restoration of TSC_AUX until the CPU returns to userspace. * defer the restoration of TSC_AUX until the CPU returns to userspace.
*/ */
#define TSC_AUX_URET_SLOT 0 static int tsc_aux_uret_slot __read_mostly = -1;
static const u32 msrpm_ranges[] = {0, 0xc0000000, 0xc0010000}; static const u32 msrpm_ranges[] = {0, 0xc0000000, 0xc0010000};
@ -447,6 +447,11 @@ static int has_svm(void)
return 0; return 0;
} }
if (pgtable_l5_enabled()) {
pr_info("KVM doesn't yet support 5-level paging on AMD SVM\n");
return 0;
}
return 1; return 1;
} }
@ -959,8 +964,7 @@ static __init int svm_hardware_setup(void)
kvm_tsc_scaling_ratio_frac_bits = 32; kvm_tsc_scaling_ratio_frac_bits = 32;
} }
if (boot_cpu_has(X86_FEATURE_RDTSCP)) tsc_aux_uret_slot = kvm_add_user_return_msr(MSR_TSC_AUX);
kvm_define_user_return_msr(TSC_AUX_URET_SLOT, MSR_TSC_AUX);
/* Check for pause filtering support */ /* Check for pause filtering support */
if (!boot_cpu_has(X86_FEATURE_PAUSEFILTER)) { if (!boot_cpu_has(X86_FEATURE_PAUSEFILTER)) {
@ -1100,7 +1104,9 @@ static u64 svm_write_l1_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
return svm->vmcb->control.tsc_offset; return svm->vmcb->control.tsc_offset;
} }
static void svm_check_invpcid(struct vcpu_svm *svm) /* Evaluate instruction intercepts that depend on guest CPUID features. */
static void svm_recalc_instruction_intercepts(struct kvm_vcpu *vcpu,
struct vcpu_svm *svm)
{ {
/* /*
* Intercept INVPCID if shadow paging is enabled to sync/free shadow * Intercept INVPCID if shadow paging is enabled to sync/free shadow
@ -1113,6 +1119,13 @@ static void svm_check_invpcid(struct vcpu_svm *svm)
else else
svm_clr_intercept(svm, INTERCEPT_INVPCID); svm_clr_intercept(svm, INTERCEPT_INVPCID);
} }
if (kvm_cpu_cap_has(X86_FEATURE_RDTSCP)) {
if (guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP))
svm_clr_intercept(svm, INTERCEPT_RDTSCP);
else
svm_set_intercept(svm, INTERCEPT_RDTSCP);
}
} }
static void init_vmcb(struct kvm_vcpu *vcpu) static void init_vmcb(struct kvm_vcpu *vcpu)
@ -1235,8 +1248,8 @@ static void init_vmcb(struct kvm_vcpu *vcpu)
svm->current_vmcb->asid_generation = 0; svm->current_vmcb->asid_generation = 0;
svm->asid = 0; svm->asid = 0;
svm->nested.vmcb12_gpa = 0; svm->nested.vmcb12_gpa = INVALID_GPA;
svm->nested.last_vmcb12_gpa = 0; svm->nested.last_vmcb12_gpa = INVALID_GPA;
vcpu->arch.hflags = 0; vcpu->arch.hflags = 0;
if (!kvm_pause_in_guest(vcpu->kvm)) { if (!kvm_pause_in_guest(vcpu->kvm)) {
@ -1248,7 +1261,7 @@ static void init_vmcb(struct kvm_vcpu *vcpu)
svm_clr_intercept(svm, INTERCEPT_PAUSE); svm_clr_intercept(svm, INTERCEPT_PAUSE);
} }
svm_check_invpcid(svm); svm_recalc_instruction_intercepts(vcpu, svm);
/* /*
* If the host supports V_SPEC_CTRL then disable the interception * If the host supports V_SPEC_CTRL then disable the interception
@ -1424,6 +1437,9 @@ static void svm_prepare_guest_switch(struct kvm_vcpu *vcpu)
struct vcpu_svm *svm = to_svm(vcpu); struct vcpu_svm *svm = to_svm(vcpu);
struct svm_cpu_data *sd = per_cpu(svm_data, vcpu->cpu); struct svm_cpu_data *sd = per_cpu(svm_data, vcpu->cpu);
if (sev_es_guest(vcpu->kvm))
sev_es_unmap_ghcb(svm);
if (svm->guest_state_loaded) if (svm->guest_state_loaded)
return; return;
@ -1445,8 +1461,8 @@ static void svm_prepare_guest_switch(struct kvm_vcpu *vcpu)
} }
} }
if (static_cpu_has(X86_FEATURE_RDTSCP)) if (likely(tsc_aux_uret_slot >= 0))
kvm_set_user_return_msr(TSC_AUX_URET_SLOT, svm->tsc_aux, -1ull); kvm_set_user_return_msr(tsc_aux_uret_slot, svm->tsc_aux, -1ull);
svm->guest_state_loaded = true; svm->guest_state_loaded = true;
} }
@ -2655,11 +2671,6 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
msr_info->data |= (u64)svm->sysenter_esp_hi << 32; msr_info->data |= (u64)svm->sysenter_esp_hi << 32;
break; break;
case MSR_TSC_AUX: case MSR_TSC_AUX:
if (!boot_cpu_has(X86_FEATURE_RDTSCP))
return 1;
if (!msr_info->host_initiated &&
!guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP))
return 1;
msr_info->data = svm->tsc_aux; msr_info->data = svm->tsc_aux;
break; break;
/* /*
@ -2876,30 +2887,13 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
svm->sysenter_esp_hi = guest_cpuid_is_intel(vcpu) ? (data >> 32) : 0; svm->sysenter_esp_hi = guest_cpuid_is_intel(vcpu) ? (data >> 32) : 0;
break; break;
case MSR_TSC_AUX: case MSR_TSC_AUX:
if (!boot_cpu_has(X86_FEATURE_RDTSCP))
return 1;
if (!msr->host_initiated &&
!guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP))
return 1;
/*
* Per Intel's SDM, bits 63:32 are reserved, but AMD's APM has
* incomplete and conflicting architectural behavior. Current
* AMD CPUs completely ignore bits 63:32, i.e. they aren't
* reserved and always read as zeros. Emulate AMD CPU behavior
* to avoid explosions if the vCPU is migrated from an AMD host
* to an Intel host.
*/
data = (u32)data;
/* /*
* TSC_AUX is usually changed only during boot and never read * TSC_AUX is usually changed only during boot and never read
* directly. Intercept TSC_AUX instead of exposing it to the * directly. Intercept TSC_AUX instead of exposing it to the
* guest via direct_access_msrs, and switch it via user return. * guest via direct_access_msrs, and switch it via user return.
*/ */
preempt_disable(); preempt_disable();
r = kvm_set_user_return_msr(TSC_AUX_URET_SLOT, data, -1ull); r = kvm_set_user_return_msr(tsc_aux_uret_slot, data, -1ull);
preempt_enable(); preempt_enable();
if (r) if (r)
return 1; return 1;
@ -3084,6 +3078,7 @@ static int (*const svm_exit_handlers[])(struct kvm_vcpu *vcpu) = {
[SVM_EXIT_STGI] = stgi_interception, [SVM_EXIT_STGI] = stgi_interception,
[SVM_EXIT_CLGI] = clgi_interception, [SVM_EXIT_CLGI] = clgi_interception,
[SVM_EXIT_SKINIT] = skinit_interception, [SVM_EXIT_SKINIT] = skinit_interception,
[SVM_EXIT_RDTSCP] = kvm_handle_invalid_op,
[SVM_EXIT_WBINVD] = kvm_emulate_wbinvd, [SVM_EXIT_WBINVD] = kvm_emulate_wbinvd,
[SVM_EXIT_MONITOR] = kvm_emulate_monitor, [SVM_EXIT_MONITOR] = kvm_emulate_monitor,
[SVM_EXIT_MWAIT] = kvm_emulate_mwait, [SVM_EXIT_MWAIT] = kvm_emulate_mwait,
@ -3972,8 +3967,7 @@ static void svm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
svm->nrips_enabled = kvm_cpu_cap_has(X86_FEATURE_NRIPS) && svm->nrips_enabled = kvm_cpu_cap_has(X86_FEATURE_NRIPS) &&
guest_cpuid_has(vcpu, X86_FEATURE_NRIPS); guest_cpuid_has(vcpu, X86_FEATURE_NRIPS);
/* Check again if INVPCID interception if required */ svm_recalc_instruction_intercepts(vcpu, svm);
svm_check_invpcid(svm);
/* For sev guests, the memory encryption bit is not reserved in CR3. */ /* For sev guests, the memory encryption bit is not reserved in CR3. */
if (sev_guest(vcpu->kvm)) { if (sev_guest(vcpu->kvm)) {

View file

@ -581,6 +581,7 @@ void sev_es_init_vmcb(struct vcpu_svm *svm);
void sev_es_create_vcpu(struct vcpu_svm *svm); void sev_es_create_vcpu(struct vcpu_svm *svm);
void sev_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector); void sev_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector);
void sev_es_prepare_guest_switch(struct vcpu_svm *svm, unsigned int cpu); void sev_es_prepare_guest_switch(struct vcpu_svm *svm, unsigned int cpu);
void sev_es_unmap_ghcb(struct vcpu_svm *svm);
/* vmenter.S */ /* vmenter.S */

View file

@ -398,6 +398,9 @@ static inline u64 vmx_supported_debugctl(void)
{ {
u64 debugctl = 0; u64 debugctl = 0;
if (boot_cpu_has(X86_FEATURE_BUS_LOCK_DETECT))
debugctl |= DEBUGCTLMSR_BUS_LOCK_DETECT;
if (vmx_get_perf_capabilities() & PMU_CAP_LBR_FMT) if (vmx_get_perf_capabilities() & PMU_CAP_LBR_FMT)
debugctl |= DEBUGCTLMSR_LBR_MASK; debugctl |= DEBUGCTLMSR_LBR_MASK;

View file

@ -3098,15 +3098,8 @@ static bool nested_get_evmcs_page(struct kvm_vcpu *vcpu)
nested_vmx_handle_enlightened_vmptrld(vcpu, false); nested_vmx_handle_enlightened_vmptrld(vcpu, false);
if (evmptrld_status == EVMPTRLD_VMFAIL || if (evmptrld_status == EVMPTRLD_VMFAIL ||
evmptrld_status == EVMPTRLD_ERROR) { evmptrld_status == EVMPTRLD_ERROR)
pr_debug_ratelimited("%s: enlightened vmptrld failed\n",
__func__);
vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
vcpu->run->internal.suberror =
KVM_INTERNAL_ERROR_EMULATION;
vcpu->run->internal.ndata = 0;
return false; return false;
}
} }
return true; return true;
@ -3194,8 +3187,16 @@ static bool nested_get_vmcs12_pages(struct kvm_vcpu *vcpu)
static bool vmx_get_nested_state_pages(struct kvm_vcpu *vcpu) static bool vmx_get_nested_state_pages(struct kvm_vcpu *vcpu)
{ {
if (!nested_get_evmcs_page(vcpu)) if (!nested_get_evmcs_page(vcpu)) {
pr_debug_ratelimited("%s: enlightened vmptrld failed\n",
__func__);
vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
vcpu->run->internal.suberror =
KVM_INTERNAL_ERROR_EMULATION;
vcpu->run->internal.ndata = 0;
return false; return false;
}
if (is_guest_mode(vcpu) && !nested_get_vmcs12_pages(vcpu)) if (is_guest_mode(vcpu) && !nested_get_vmcs12_pages(vcpu))
return false; return false;
@ -4435,7 +4436,15 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason,
/* Similarly, triple faults in L2 should never escape. */ /* Similarly, triple faults in L2 should never escape. */
WARN_ON_ONCE(kvm_check_request(KVM_REQ_TRIPLE_FAULT, vcpu)); WARN_ON_ONCE(kvm_check_request(KVM_REQ_TRIPLE_FAULT, vcpu));
kvm_clear_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu); if (kvm_check_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu)) {
/*
* KVM_REQ_GET_NESTED_STATE_PAGES is also used to map
* Enlightened VMCS after migration and we still need to
* do that when something is forcing L2->L1 exit prior to
* the first L2 run.
*/
(void)nested_get_evmcs_page(vcpu);
}
/* Service the TLB flush request for L2 before switching to L1. */ /* Service the TLB flush request for L2 before switching to L1. */
if (kvm_check_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu)) if (kvm_check_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu))

View file

@ -455,21 +455,6 @@ static inline void vmx_segment_cache_clear(struct vcpu_vmx *vmx)
static unsigned long host_idt_base; static unsigned long host_idt_base;
/*
* Though SYSCALL is only supported in 64-bit mode on Intel CPUs, kvm
* will emulate SYSCALL in legacy mode if the vendor string in guest
* CPUID.0:{EBX,ECX,EDX} is "AuthenticAMD" or "AMDisbetter!" To
* support this emulation, IA32_STAR must always be included in
* vmx_uret_msrs_list[], even in i386 builds.
*/
static const u32 vmx_uret_msrs_list[] = {
#ifdef CONFIG_X86_64
MSR_SYSCALL_MASK, MSR_LSTAR, MSR_CSTAR,
#endif
MSR_EFER, MSR_TSC_AUX, MSR_STAR,
MSR_IA32_TSX_CTRL,
};
#if IS_ENABLED(CONFIG_HYPERV) #if IS_ENABLED(CONFIG_HYPERV)
static bool __read_mostly enlightened_vmcs = true; static bool __read_mostly enlightened_vmcs = true;
module_param(enlightened_vmcs, bool, 0444); module_param(enlightened_vmcs, bool, 0444);
@ -697,21 +682,11 @@ static bool is_valid_passthrough_msr(u32 msr)
return r; return r;
} }
static inline int __vmx_find_uret_msr(struct vcpu_vmx *vmx, u32 msr)
{
int i;
for (i = 0; i < vmx->nr_uret_msrs; ++i)
if (vmx_uret_msrs_list[vmx->guest_uret_msrs[i].slot] == msr)
return i;
return -1;
}
struct vmx_uret_msr *vmx_find_uret_msr(struct vcpu_vmx *vmx, u32 msr) struct vmx_uret_msr *vmx_find_uret_msr(struct vcpu_vmx *vmx, u32 msr)
{ {
int i; int i;
i = __vmx_find_uret_msr(vmx, msr); i = kvm_find_user_return_msr(msr);
if (i >= 0) if (i >= 0)
return &vmx->guest_uret_msrs[i]; return &vmx->guest_uret_msrs[i];
return NULL; return NULL;
@ -720,13 +695,14 @@ struct vmx_uret_msr *vmx_find_uret_msr(struct vcpu_vmx *vmx, u32 msr)
static int vmx_set_guest_uret_msr(struct vcpu_vmx *vmx, static int vmx_set_guest_uret_msr(struct vcpu_vmx *vmx,
struct vmx_uret_msr *msr, u64 data) struct vmx_uret_msr *msr, u64 data)
{ {
unsigned int slot = msr - vmx->guest_uret_msrs;
int ret = 0; int ret = 0;
u64 old_msr_data = msr->data; u64 old_msr_data = msr->data;
msr->data = data; msr->data = data;
if (msr - vmx->guest_uret_msrs < vmx->nr_active_uret_msrs) { if (msr->load_into_hardware) {
preempt_disable(); preempt_disable();
ret = kvm_set_user_return_msr(msr->slot, msr->data, msr->mask); ret = kvm_set_user_return_msr(slot, msr->data, msr->mask);
preempt_enable(); preempt_enable();
if (ret) if (ret)
msr->data = old_msr_data; msr->data = old_msr_data;
@ -1078,7 +1054,7 @@ static bool update_transition_efer(struct vcpu_vmx *vmx)
return false; return false;
} }
i = __vmx_find_uret_msr(vmx, MSR_EFER); i = kvm_find_user_return_msr(MSR_EFER);
if (i < 0) if (i < 0)
return false; return false;
@ -1240,11 +1216,14 @@ void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu)
*/ */
if (!vmx->guest_uret_msrs_loaded) { if (!vmx->guest_uret_msrs_loaded) {
vmx->guest_uret_msrs_loaded = true; vmx->guest_uret_msrs_loaded = true;
for (i = 0; i < vmx->nr_active_uret_msrs; ++i) for (i = 0; i < kvm_nr_uret_msrs; ++i) {
kvm_set_user_return_msr(vmx->guest_uret_msrs[i].slot, if (!vmx->guest_uret_msrs[i].load_into_hardware)
continue;
kvm_set_user_return_msr(i,
vmx->guest_uret_msrs[i].data, vmx->guest_uret_msrs[i].data,
vmx->guest_uret_msrs[i].mask); vmx->guest_uret_msrs[i].mask);
}
} }
if (vmx->nested.need_vmcs12_to_shadow_sync) if (vmx->nested.need_vmcs12_to_shadow_sync)
@ -1751,19 +1730,16 @@ static void vmx_queue_exception(struct kvm_vcpu *vcpu)
vmx_clear_hlt(vcpu); vmx_clear_hlt(vcpu);
} }
static void vmx_setup_uret_msr(struct vcpu_vmx *vmx, unsigned int msr) static void vmx_setup_uret_msr(struct vcpu_vmx *vmx, unsigned int msr,
bool load_into_hardware)
{ {
struct vmx_uret_msr tmp; struct vmx_uret_msr *uret_msr;
int from, to;
from = __vmx_find_uret_msr(vmx, msr); uret_msr = vmx_find_uret_msr(vmx, msr);
if (from < 0) if (!uret_msr)
return; return;
to = vmx->nr_active_uret_msrs++;
tmp = vmx->guest_uret_msrs[to]; uret_msr->load_into_hardware = load_into_hardware;
vmx->guest_uret_msrs[to] = vmx->guest_uret_msrs[from];
vmx->guest_uret_msrs[from] = tmp;
} }
/* /*
@ -1773,29 +1749,42 @@ static void vmx_setup_uret_msr(struct vcpu_vmx *vmx, unsigned int msr)
*/ */
static void setup_msrs(struct vcpu_vmx *vmx) static void setup_msrs(struct vcpu_vmx *vmx)
{ {
vmx->guest_uret_msrs_loaded = false;
vmx->nr_active_uret_msrs = 0;
#ifdef CONFIG_X86_64 #ifdef CONFIG_X86_64
bool load_syscall_msrs;
/* /*
* The SYSCALL MSRs are only needed on long mode guests, and only * The SYSCALL MSRs are only needed on long mode guests, and only
* when EFER.SCE is set. * when EFER.SCE is set.
*/ */
if (is_long_mode(&vmx->vcpu) && (vmx->vcpu.arch.efer & EFER_SCE)) { load_syscall_msrs = is_long_mode(&vmx->vcpu) &&
vmx_setup_uret_msr(vmx, MSR_STAR); (vmx->vcpu.arch.efer & EFER_SCE);
vmx_setup_uret_msr(vmx, MSR_LSTAR);
vmx_setup_uret_msr(vmx, MSR_SYSCALL_MASK); vmx_setup_uret_msr(vmx, MSR_STAR, load_syscall_msrs);
} vmx_setup_uret_msr(vmx, MSR_LSTAR, load_syscall_msrs);
vmx_setup_uret_msr(vmx, MSR_SYSCALL_MASK, load_syscall_msrs);
#endif #endif
if (update_transition_efer(vmx)) vmx_setup_uret_msr(vmx, MSR_EFER, update_transition_efer(vmx));
vmx_setup_uret_msr(vmx, MSR_EFER);
if (guest_cpuid_has(&vmx->vcpu, X86_FEATURE_RDTSCP)) vmx_setup_uret_msr(vmx, MSR_TSC_AUX,
vmx_setup_uret_msr(vmx, MSR_TSC_AUX); guest_cpuid_has(&vmx->vcpu, X86_FEATURE_RDTSCP) ||
guest_cpuid_has(&vmx->vcpu, X86_FEATURE_RDPID));
vmx_setup_uret_msr(vmx, MSR_IA32_TSX_CTRL); /*
* hle=0, rtm=0, tsx_ctrl=1 can be found with some combinations of new
* kernel and old userspace. If those guests run on a tsx=off host, do
* allow guests to use TSX_CTRL, but don't change the value in hardware
* so that TSX remains always disabled.
*/
vmx_setup_uret_msr(vmx, MSR_IA32_TSX_CTRL, boot_cpu_has(X86_FEATURE_RTM));
if (cpu_has_vmx_msr_bitmap()) if (cpu_has_vmx_msr_bitmap())
vmx_update_msr_bitmap(&vmx->vcpu); vmx_update_msr_bitmap(&vmx->vcpu);
/*
* The set of MSRs to load may have changed, reload MSRs before the
* next VM-Enter.
*/
vmx->guest_uret_msrs_loaded = false;
} }
static u64 vmx_write_l1_tsc_offset(struct kvm_vcpu *vcpu, u64 offset) static u64 vmx_write_l1_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
@ -1993,11 +1982,6 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
else else
msr_info->data = vmx->pt_desc.guest.addr_a[index / 2]; msr_info->data = vmx->pt_desc.guest.addr_a[index / 2];
break; break;
case MSR_TSC_AUX:
if (!msr_info->host_initiated &&
!guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP))
return 1;
goto find_uret_msr;
case MSR_IA32_DEBUGCTLMSR: case MSR_IA32_DEBUGCTLMSR:
msr_info->data = vmcs_read64(GUEST_IA32_DEBUGCTL); msr_info->data = vmcs_read64(GUEST_IA32_DEBUGCTL);
break; break;
@ -2031,6 +2015,9 @@ static u64 vcpu_supported_debugctl(struct kvm_vcpu *vcpu)
if (!intel_pmu_lbr_is_enabled(vcpu)) if (!intel_pmu_lbr_is_enabled(vcpu))
debugctl &= ~DEBUGCTLMSR_LBR_MASK; debugctl &= ~DEBUGCTLMSR_LBR_MASK;
if (!guest_cpuid_has(vcpu, X86_FEATURE_BUS_LOCK_DETECT))
debugctl &= ~DEBUGCTLMSR_BUS_LOCK_DETECT;
return debugctl; return debugctl;
} }
@ -2313,14 +2300,6 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
else else
vmx->pt_desc.guest.addr_a[index / 2] = data; vmx->pt_desc.guest.addr_a[index / 2] = data;
break; break;
case MSR_TSC_AUX:
if (!msr_info->host_initiated &&
!guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP))
return 1;
/* Check reserved bit, higher 32 bits should be zero */
if ((data >> 32) != 0)
return 1;
goto find_uret_msr;
case MSR_IA32_PERF_CAPABILITIES: case MSR_IA32_PERF_CAPABILITIES:
if (data && !vcpu_to_pmu(vcpu)->version) if (data && !vcpu_to_pmu(vcpu)->version)
return 1; return 1;
@ -4369,7 +4348,23 @@ static void vmx_compute_secondary_exec_control(struct vcpu_vmx *vmx)
xsaves_enabled, false); xsaves_enabled, false);
} }
vmx_adjust_sec_exec_feature(vmx, &exec_control, rdtscp, RDTSCP); /*
* RDPID is also gated by ENABLE_RDTSCP, turn on the control if either
* feature is exposed to the guest. This creates a virtualization hole
* if both are supported in hardware but only one is exposed to the
* guest, but letting the guest execute RDTSCP or RDPID when either one
* is advertised is preferable to emulating the advertised instruction
* in KVM on #UD, and obviously better than incorrectly injecting #UD.
*/
if (cpu_has_vmx_rdtscp()) {
bool rdpid_or_rdtscp_enabled =
guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP) ||
guest_cpuid_has(vcpu, X86_FEATURE_RDPID);
vmx_adjust_secondary_exec_control(vmx, &exec_control,
SECONDARY_EXEC_ENABLE_RDTSCP,
rdpid_or_rdtscp_enabled, false);
}
vmx_adjust_sec_exec_feature(vmx, &exec_control, invpcid, INVPCID); vmx_adjust_sec_exec_feature(vmx, &exec_control, invpcid, INVPCID);
vmx_adjust_sec_exec_exiting(vmx, &exec_control, rdrand, RDRAND); vmx_adjust_sec_exec_exiting(vmx, &exec_control, rdrand, RDRAND);
@ -6855,6 +6850,7 @@ static void vmx_free_vcpu(struct kvm_vcpu *vcpu)
static int vmx_create_vcpu(struct kvm_vcpu *vcpu) static int vmx_create_vcpu(struct kvm_vcpu *vcpu)
{ {
struct vmx_uret_msr *tsx_ctrl;
struct vcpu_vmx *vmx; struct vcpu_vmx *vmx;
int i, cpu, err; int i, cpu, err;
@ -6877,43 +6873,19 @@ static int vmx_create_vcpu(struct kvm_vcpu *vcpu)
goto free_vpid; goto free_vpid;
} }
BUILD_BUG_ON(ARRAY_SIZE(vmx_uret_msrs_list) != MAX_NR_USER_RETURN_MSRS); for (i = 0; i < kvm_nr_uret_msrs; ++i) {
vmx->guest_uret_msrs[i].data = 0;
for (i = 0; i < ARRAY_SIZE(vmx_uret_msrs_list); ++i) { vmx->guest_uret_msrs[i].mask = -1ull;
u32 index = vmx_uret_msrs_list[i]; }
u32 data_low, data_high; if (boot_cpu_has(X86_FEATURE_RTM)) {
int j = vmx->nr_uret_msrs; /*
* TSX_CTRL_CPUID_CLEAR is handled in the CPUID interception.
if (rdmsr_safe(index, &data_low, &data_high) < 0) * Keep the host value unchanged to avoid changing CPUID bits
continue; * under the host kernel's feet.
if (wrmsr_safe(index, data_low, data_high) < 0) */
continue; tsx_ctrl = vmx_find_uret_msr(vmx, MSR_IA32_TSX_CTRL);
if (tsx_ctrl)
vmx->guest_uret_msrs[j].slot = i; vmx->guest_uret_msrs[i].mask = ~(u64)TSX_CTRL_CPUID_CLEAR;
vmx->guest_uret_msrs[j].data = 0;
switch (index) {
case MSR_IA32_TSX_CTRL:
/*
* TSX_CTRL_CPUID_CLEAR is handled in the CPUID
* interception. Keep the host value unchanged to avoid
* changing CPUID bits under the host kernel's feet.
*
* hle=0, rtm=0, tsx_ctrl=1 can be found with some
* combinations of new kernel and old userspace. If
* those guests run on a tsx=off host, do allow guests
* to use TSX_CTRL, but do not change the value on the
* host so that TSX remains always disabled.
*/
if (boot_cpu_has(X86_FEATURE_RTM))
vmx->guest_uret_msrs[j].mask = ~(u64)TSX_CTRL_CPUID_CLEAR;
else
vmx->guest_uret_msrs[j].mask = 0;
break;
default:
vmx->guest_uret_msrs[j].mask = -1ull;
break;
}
++vmx->nr_uret_msrs;
} }
err = alloc_loaded_vmcs(&vmx->vmcs01); err = alloc_loaded_vmcs(&vmx->vmcs01);
@ -7344,9 +7316,11 @@ static __init void vmx_set_cpu_caps(void)
if (!cpu_has_vmx_xsaves()) if (!cpu_has_vmx_xsaves())
kvm_cpu_cap_clear(X86_FEATURE_XSAVES); kvm_cpu_cap_clear(X86_FEATURE_XSAVES);
/* CPUID 0x80000001 */ /* CPUID 0x80000001 and 0x7 (RDPID) */
if (!cpu_has_vmx_rdtscp()) if (!cpu_has_vmx_rdtscp()) {
kvm_cpu_cap_clear(X86_FEATURE_RDTSCP); kvm_cpu_cap_clear(X86_FEATURE_RDTSCP);
kvm_cpu_cap_clear(X86_FEATURE_RDPID);
}
if (cpu_has_vmx_waitpkg()) if (cpu_has_vmx_waitpkg())
kvm_cpu_cap_check_and_set(X86_FEATURE_WAITPKG); kvm_cpu_cap_check_and_set(X86_FEATURE_WAITPKG);
@ -7402,8 +7376,9 @@ static int vmx_check_intercept(struct kvm_vcpu *vcpu,
/* /*
* RDPID causes #UD if disabled through secondary execution controls. * RDPID causes #UD if disabled through secondary execution controls.
* Because it is marked as EmulateOnUD, we need to intercept it here. * Because it is marked as EmulateOnUD, we need to intercept it here.
* Note, RDPID is hidden behind ENABLE_RDTSCP.
*/ */
case x86_intercept_rdtscp: case x86_intercept_rdpid:
if (!nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_RDTSCP)) { if (!nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_RDTSCP)) {
exception->vector = UD_VECTOR; exception->vector = UD_VECTOR;
exception->error_code_valid = false; exception->error_code_valid = false;
@ -7769,17 +7744,42 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = {
.vcpu_deliver_sipi_vector = kvm_vcpu_deliver_sipi_vector, .vcpu_deliver_sipi_vector = kvm_vcpu_deliver_sipi_vector,
}; };
static __init void vmx_setup_user_return_msrs(void)
{
/*
* Though SYSCALL is only supported in 64-bit mode on Intel CPUs, kvm
* will emulate SYSCALL in legacy mode if the vendor string in guest
* CPUID.0:{EBX,ECX,EDX} is "AuthenticAMD" or "AMDisbetter!" To
* support this emulation, MSR_STAR is included in the list for i386,
* but is never loaded into hardware. MSR_CSTAR is also never loaded
* into hardware and is here purely for emulation purposes.
*/
const u32 vmx_uret_msrs_list[] = {
#ifdef CONFIG_X86_64
MSR_SYSCALL_MASK, MSR_LSTAR, MSR_CSTAR,
#endif
MSR_EFER, MSR_TSC_AUX, MSR_STAR,
MSR_IA32_TSX_CTRL,
};
int i;
BUILD_BUG_ON(ARRAY_SIZE(vmx_uret_msrs_list) != MAX_NR_USER_RETURN_MSRS);
for (i = 0; i < ARRAY_SIZE(vmx_uret_msrs_list); ++i)
kvm_add_user_return_msr(vmx_uret_msrs_list[i]);
}
static __init int hardware_setup(void) static __init int hardware_setup(void)
{ {
unsigned long host_bndcfgs; unsigned long host_bndcfgs;
struct desc_ptr dt; struct desc_ptr dt;
int r, i, ept_lpage_level; int r, ept_lpage_level;
store_idt(&dt); store_idt(&dt);
host_idt_base = dt.address; host_idt_base = dt.address;
for (i = 0; i < ARRAY_SIZE(vmx_uret_msrs_list); ++i) vmx_setup_user_return_msrs();
kvm_define_user_return_msr(i, vmx_uret_msrs_list[i]);
if (setup_vmcs_config(&vmcs_config, &vmx_capability) < 0) if (setup_vmcs_config(&vmcs_config, &vmx_capability) < 0)
return -EIO; return -EIO;

View file

@ -36,7 +36,7 @@ struct vmx_msrs {
}; };
struct vmx_uret_msr { struct vmx_uret_msr {
unsigned int slot; /* The MSR's slot in kvm_user_return_msrs. */ bool load_into_hardware;
u64 data; u64 data;
u64 mask; u64 mask;
}; };
@ -245,8 +245,16 @@ struct vcpu_vmx {
u32 idt_vectoring_info; u32 idt_vectoring_info;
ulong rflags; ulong rflags;
/*
* User return MSRs are always emulated when enabled in the guest, but
* only loaded into hardware when necessary, e.g. SYSCALL #UDs outside
* of 64-bit mode or if EFER.SCE=1, thus the SYSCALL MSRs don't need to
* be loaded into hardware if those conditions aren't met.
* nr_active_uret_msrs tracks the number of MSRs that need to be loaded
* into hardware when running the guest. guest_uret_msrs[] is resorted
* whenever the number of "active" uret MSRs is modified.
*/
struct vmx_uret_msr guest_uret_msrs[MAX_NR_USER_RETURN_MSRS]; struct vmx_uret_msr guest_uret_msrs[MAX_NR_USER_RETURN_MSRS];
int nr_uret_msrs;
int nr_active_uret_msrs; int nr_active_uret_msrs;
bool guest_uret_msrs_loaded; bool guest_uret_msrs_loaded;
#ifdef CONFIG_X86_64 #ifdef CONFIG_X86_64

View file

@ -184,11 +184,6 @@ module_param(pi_inject_timer, bint, S_IRUGO | S_IWUSR);
*/ */
#define KVM_MAX_NR_USER_RETURN_MSRS 16 #define KVM_MAX_NR_USER_RETURN_MSRS 16
struct kvm_user_return_msrs_global {
int nr;
u32 msrs[KVM_MAX_NR_USER_RETURN_MSRS];
};
struct kvm_user_return_msrs { struct kvm_user_return_msrs {
struct user_return_notifier urn; struct user_return_notifier urn;
bool registered; bool registered;
@ -198,7 +193,9 @@ struct kvm_user_return_msrs {
} values[KVM_MAX_NR_USER_RETURN_MSRS]; } values[KVM_MAX_NR_USER_RETURN_MSRS];
}; };
static struct kvm_user_return_msrs_global __read_mostly user_return_msrs_global; u32 __read_mostly kvm_nr_uret_msrs;
EXPORT_SYMBOL_GPL(kvm_nr_uret_msrs);
static u32 __read_mostly kvm_uret_msrs_list[KVM_MAX_NR_USER_RETURN_MSRS];
static struct kvm_user_return_msrs __percpu *user_return_msrs; static struct kvm_user_return_msrs __percpu *user_return_msrs;
#define KVM_SUPPORTED_XCR0 (XFEATURE_MASK_FP | XFEATURE_MASK_SSE \ #define KVM_SUPPORTED_XCR0 (XFEATURE_MASK_FP | XFEATURE_MASK_SSE \
@ -330,23 +327,53 @@ static void kvm_on_user_return(struct user_return_notifier *urn)
user_return_notifier_unregister(urn); user_return_notifier_unregister(urn);
} }
local_irq_restore(flags); local_irq_restore(flags);
for (slot = 0; slot < user_return_msrs_global.nr; ++slot) { for (slot = 0; slot < kvm_nr_uret_msrs; ++slot) {
values = &msrs->values[slot]; values = &msrs->values[slot];
if (values->host != values->curr) { if (values->host != values->curr) {
wrmsrl(user_return_msrs_global.msrs[slot], values->host); wrmsrl(kvm_uret_msrs_list[slot], values->host);
values->curr = values->host; values->curr = values->host;
} }
} }
} }
void kvm_define_user_return_msr(unsigned slot, u32 msr) static int kvm_probe_user_return_msr(u32 msr)
{ {
BUG_ON(slot >= KVM_MAX_NR_USER_RETURN_MSRS); u64 val;
user_return_msrs_global.msrs[slot] = msr; int ret;
if (slot >= user_return_msrs_global.nr)
user_return_msrs_global.nr = slot + 1; preempt_disable();
ret = rdmsrl_safe(msr, &val);
if (ret)
goto out;
ret = wrmsrl_safe(msr, val);
out:
preempt_enable();
return ret;
} }
EXPORT_SYMBOL_GPL(kvm_define_user_return_msr);
int kvm_add_user_return_msr(u32 msr)
{
BUG_ON(kvm_nr_uret_msrs >= KVM_MAX_NR_USER_RETURN_MSRS);
if (kvm_probe_user_return_msr(msr))
return -1;
kvm_uret_msrs_list[kvm_nr_uret_msrs] = msr;
return kvm_nr_uret_msrs++;
}
EXPORT_SYMBOL_GPL(kvm_add_user_return_msr);
int kvm_find_user_return_msr(u32 msr)
{
int i;
for (i = 0; i < kvm_nr_uret_msrs; ++i) {
if (kvm_uret_msrs_list[i] == msr)
return i;
}
return -1;
}
EXPORT_SYMBOL_GPL(kvm_find_user_return_msr);
static void kvm_user_return_msr_cpu_online(void) static void kvm_user_return_msr_cpu_online(void)
{ {
@ -355,8 +382,8 @@ static void kvm_user_return_msr_cpu_online(void)
u64 value; u64 value;
int i; int i;
for (i = 0; i < user_return_msrs_global.nr; ++i) { for (i = 0; i < kvm_nr_uret_msrs; ++i) {
rdmsrl_safe(user_return_msrs_global.msrs[i], &value); rdmsrl_safe(kvm_uret_msrs_list[i], &value);
msrs->values[i].host = value; msrs->values[i].host = value;
msrs->values[i].curr = value; msrs->values[i].curr = value;
} }
@ -371,7 +398,7 @@ int kvm_set_user_return_msr(unsigned slot, u64 value, u64 mask)
value = (value & mask) | (msrs->values[slot].host & ~mask); value = (value & mask) | (msrs->values[slot].host & ~mask);
if (value == msrs->values[slot].curr) if (value == msrs->values[slot].curr)
return 0; return 0;
err = wrmsrl_safe(user_return_msrs_global.msrs[slot], value); err = wrmsrl_safe(kvm_uret_msrs_list[slot], value);
if (err) if (err)
return 1; return 1;
@ -1149,6 +1176,9 @@ static u64 kvm_dr6_fixed(struct kvm_vcpu *vcpu)
if (!guest_cpuid_has(vcpu, X86_FEATURE_RTM)) if (!guest_cpuid_has(vcpu, X86_FEATURE_RTM))
fixed |= DR6_RTM; fixed |= DR6_RTM;
if (!guest_cpuid_has(vcpu, X86_FEATURE_BUS_LOCK_DETECT))
fixed |= DR6_BUS_LOCK;
return fixed; return fixed;
} }
@ -1615,6 +1645,30 @@ static int __kvm_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 data,
* invokes 64-bit SYSENTER. * invokes 64-bit SYSENTER.
*/ */
data = get_canonical(data, vcpu_virt_addr_bits(vcpu)); data = get_canonical(data, vcpu_virt_addr_bits(vcpu));
break;
case MSR_TSC_AUX:
if (!kvm_is_supported_user_return_msr(MSR_TSC_AUX))
return 1;
if (!host_initiated &&
!guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP) &&
!guest_cpuid_has(vcpu, X86_FEATURE_RDPID))
return 1;
/*
* Per Intel's SDM, bits 63:32 are reserved, but AMD's APM has
* incomplete and conflicting architectural behavior. Current
* AMD CPUs completely ignore bits 63:32, i.e. they aren't
* reserved and always read as zeros. Enforce Intel's reserved
* bits check if and only if the guest CPU is Intel, and clear
* the bits in all other cases. This ensures cross-vendor
* migration will provide consistent behavior for the guest.
*/
if (guest_cpuid_is_intel(vcpu) && (data >> 32) != 0)
return 1;
data = (u32)data;
break;
} }
msr.data = data; msr.data = data;
@ -1651,6 +1705,18 @@ int __kvm_get_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data,
if (!host_initiated && !kvm_msr_allowed(vcpu, index, KVM_MSR_FILTER_READ)) if (!host_initiated && !kvm_msr_allowed(vcpu, index, KVM_MSR_FILTER_READ))
return KVM_MSR_RET_FILTERED; return KVM_MSR_RET_FILTERED;
switch (index) {
case MSR_TSC_AUX:
if (!kvm_is_supported_user_return_msr(MSR_TSC_AUX))
return 1;
if (!host_initiated &&
!guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP) &&
!guest_cpuid_has(vcpu, X86_FEATURE_RDPID))
return 1;
break;
}
msr.index = index; msr.index = index;
msr.host_initiated = host_initiated; msr.host_initiated = host_initiated;
@ -5468,14 +5534,18 @@ static void kvm_free_msr_filter(struct kvm_x86_msr_filter *msr_filter)
static int kvm_add_msr_filter(struct kvm_x86_msr_filter *msr_filter, static int kvm_add_msr_filter(struct kvm_x86_msr_filter *msr_filter,
struct kvm_msr_filter_range *user_range) struct kvm_msr_filter_range *user_range)
{ {
struct msr_bitmap_range range;
unsigned long *bitmap = NULL; unsigned long *bitmap = NULL;
size_t bitmap_size; size_t bitmap_size;
int r;
if (!user_range->nmsrs) if (!user_range->nmsrs)
return 0; return 0;
if (user_range->flags & ~(KVM_MSR_FILTER_READ | KVM_MSR_FILTER_WRITE))
return -EINVAL;
if (!user_range->flags)
return -EINVAL;
bitmap_size = BITS_TO_LONGS(user_range->nmsrs) * sizeof(long); bitmap_size = BITS_TO_LONGS(user_range->nmsrs) * sizeof(long);
if (!bitmap_size || bitmap_size > KVM_MSR_FILTER_MAX_BITMAP_SIZE) if (!bitmap_size || bitmap_size > KVM_MSR_FILTER_MAX_BITMAP_SIZE)
return -EINVAL; return -EINVAL;
@ -5484,31 +5554,15 @@ static int kvm_add_msr_filter(struct kvm_x86_msr_filter *msr_filter,
if (IS_ERR(bitmap)) if (IS_ERR(bitmap))
return PTR_ERR(bitmap); return PTR_ERR(bitmap);
range = (struct msr_bitmap_range) { msr_filter->ranges[msr_filter->count] = (struct msr_bitmap_range) {
.flags = user_range->flags, .flags = user_range->flags,
.base = user_range->base, .base = user_range->base,
.nmsrs = user_range->nmsrs, .nmsrs = user_range->nmsrs,
.bitmap = bitmap, .bitmap = bitmap,
}; };
if (range.flags & ~(KVM_MSR_FILTER_READ | KVM_MSR_FILTER_WRITE)) {
r = -EINVAL;
goto err;
}
if (!range.flags) {
r = -EINVAL;
goto err;
}
/* Everything ok, add this range identifier. */
msr_filter->ranges[msr_filter->count] = range;
msr_filter->count++; msr_filter->count++;
return 0; return 0;
err:
kfree(bitmap);
return r;
} }
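Because the range flags are now validated before the bitmap is copied in, the error-unwind path disappears. Below is a hedged userspace sketch of the calling side, assuming a vm_fd obtained from KVM_CREATE_VM; the helper name, MSR base and all-zero bitmap are arbitrary examples, not part of the patch.

    /*
     * Userspace sketch: a range with flags == 0 or with unknown flag bits
     * is now rejected early with -EINVAL.
     */
    #include <linux/kvm.h>
    #include <sys/ioctl.h>

    static int set_example_msr_filter(int vm_fd)
    {
    	static __u8 bitmap[8];	/* one bit per MSR; 0 = deny/bounce to userspace */
    	struct kvm_msr_filter filter = {
    		.flags = KVM_MSR_FILTER_DEFAULT_ALLOW,
    	};

    	filter.ranges[0] = (struct kvm_msr_filter_range) {
    		.flags  = KVM_MSR_FILTER_READ | KVM_MSR_FILTER_WRITE,
    		.base   = 0xc0000100,	/* purely illustrative MSR range */
    		.nmsrs  = 3,
    		.bitmap = bitmap,
    	};

    	return ioctl(vm_fd, KVM_X86_SET_MSR_FILTER, &filter);
    }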
static int kvm_vm_ioctl_set_msr_filter(struct kvm *kvm, void __user *argp) static int kvm_vm_ioctl_set_msr_filter(struct kvm *kvm, void __user *argp)
@ -5937,7 +5991,8 @@ static void kvm_init_msr_list(void)
continue; continue;
break; break;
case MSR_TSC_AUX: case MSR_TSC_AUX:
if (!kvm_cpu_cap_has(X86_FEATURE_RDTSCP)) if (!kvm_cpu_cap_has(X86_FEATURE_RDTSCP) &&
!kvm_cpu_cap_has(X86_FEATURE_RDPID))
continue; continue;
break; break;
case MSR_IA32_UMWAIT_CONTROL: case MSR_IA32_UMWAIT_CONTROL:
@ -8039,6 +8094,18 @@ static void pvclock_gtod_update_fn(struct work_struct *work)
static DECLARE_WORK(pvclock_gtod_work, pvclock_gtod_update_fn); static DECLARE_WORK(pvclock_gtod_work, pvclock_gtod_update_fn);
/*
* Indirection to move queue_work() out of the tk_core.seq write held
* region to prevent possible deadlocks against time accessors which
* are invoked with work related locks held.
*/
static void pvclock_irq_work_fn(struct irq_work *w)
{
queue_work(system_long_wq, &pvclock_gtod_work);
}
static DEFINE_IRQ_WORK(pvclock_irq_work, pvclock_irq_work_fn);
/* /*
* Notification about pvclock gtod data update. * Notification about pvclock gtod data update.
*/ */
@ -8050,13 +8117,14 @@ static int pvclock_gtod_notify(struct notifier_block *nb, unsigned long unused,
update_pvclock_gtod(tk); update_pvclock_gtod(tk);
/* disable master clock if host does not trust, or does not
 * use, TSC based clocksource.
 */
/*
 * Disable master clock if host does not trust, or does not use,
 * TSC based clocksource. Delegate queue_work() to irq_work as
 * this is invoked with tk_core.seq write held.
 */
if (!gtod_is_based_on_tsc(gtod->clock.vclock_mode) && if (!gtod_is_based_on_tsc(gtod->clock.vclock_mode) &&
atomic_read(&kvm_guest_has_master_clock) != 0) atomic_read(&kvm_guest_has_master_clock) != 0)
queue_work(system_long_wq, &pvclock_gtod_work); irq_work_queue(&pvclock_irq_work);
return 0; return 0;
} }
@ -8118,6 +8186,7 @@ int kvm_arch_init(void *opaque)
printk(KERN_ERR "kvm: failed to allocate percpu kvm_user_return_msrs\n"); printk(KERN_ERR "kvm: failed to allocate percpu kvm_user_return_msrs\n");
goto out_free_x86_emulator_cache; goto out_free_x86_emulator_cache;
} }
kvm_nr_uret_msrs = 0;
r = kvm_mmu_module_init(); r = kvm_mmu_module_init();
if (r) if (r)
@ -8168,6 +8237,8 @@ void kvm_arch_exit(void)
cpuhp_remove_state_nocalls(CPUHP_AP_X86_KVM_CLK_ONLINE); cpuhp_remove_state_nocalls(CPUHP_AP_X86_KVM_CLK_ONLINE);
#ifdef CONFIG_X86_64 #ifdef CONFIG_X86_64
pvclock_gtod_unregister_notifier(&pvclock_gtod_notifier); pvclock_gtod_unregister_notifier(&pvclock_gtod_notifier);
irq_work_sync(&pvclock_irq_work);
cancel_work_sync(&pvclock_gtod_work);
#endif #endif
kvm_x86_ops.hardware_enable = NULL; kvm_x86_ops.hardware_enable = NULL;
kvm_mmu_module_exit(); kvm_mmu_module_exit();

View file

@ -3127,7 +3127,7 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
struct btrfs_inode *inode, u64 new_size, struct btrfs_inode *inode, u64 new_size,
u32 min_type); u32 min_type);
int btrfs_start_delalloc_snapshot(struct btrfs_root *root); int btrfs_start_delalloc_snapshot(struct btrfs_root *root, bool in_reclaim_context);
int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, long nr, int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, long nr,
bool in_reclaim_context); bool in_reclaim_context);
int btrfs_set_extent_delalloc(struct btrfs_inode *inode, u64 start, u64 end, int btrfs_set_extent_delalloc(struct btrfs_inode *inode, u64 start, u64 end,

View file

@ -1340,12 +1340,16 @@ int btrfs_discard_extent(struct btrfs_fs_info *fs_info, u64 bytenr,
stripe = bbio->stripes; stripe = bbio->stripes;
for (i = 0; i < bbio->num_stripes; i++, stripe++) { for (i = 0; i < bbio->num_stripes; i++, stripe++) {
u64 bytes; u64 bytes;
struct btrfs_device *device = stripe->dev;
if (!stripe->dev->bdev) { if (!device->bdev) {
ASSERT(btrfs_test_opt(fs_info, DEGRADED)); ASSERT(btrfs_test_opt(fs_info, DEGRADED));
continue; continue;
} }
if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state))
continue;
ret = do_discard_extent(stripe, &bytes); ret = do_discard_extent(stripe, &bytes);
if (!ret) { if (!ret) {
discarded_bytes += bytes; discarded_bytes += bytes;

View file

@ -2067,6 +2067,30 @@ static int start_ordered_ops(struct inode *inode, loff_t start, loff_t end)
return ret; return ret;
} }
static inline bool skip_inode_logging(const struct btrfs_log_ctx *ctx)
{
struct btrfs_inode *inode = BTRFS_I(ctx->inode);
struct btrfs_fs_info *fs_info = inode->root->fs_info;
if (btrfs_inode_in_log(inode, fs_info->generation) &&
list_empty(&ctx->ordered_extents))
return true;
/*
* If we are doing a fast fsync we cannot bail out if the inode's
* last_trans is <= the last committed transaction, because we only
* update the last_trans of the inode during ordered extent completion,
* and for a fast fsync we don't wait for that, we only wait for the
* writeback to complete.
*/
if (inode->last_trans <= fs_info->last_trans_committed &&
(test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags) ||
list_empty(&ctx->ordered_extents)))
return true;
return false;
}
/* /*
* fsync call for both files and directories. This logs the inode into * fsync call for both files and directories. This logs the inode into
* the tree log instead of forcing full commits whenever possible. * the tree log instead of forcing full commits whenever possible.
@ -2185,17 +2209,8 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
atomic_inc(&root->log_batch); atomic_inc(&root->log_batch);
/*
* If we are doing a fast fsync we can not bail out if the inode's
* last_trans is <= then the last committed transaction, because we only
* update the last_trans of the inode during ordered extent completion,
* and for a fast fsync we don't wait for that, we only wait for the
* writeback to complete.
*/
smp_mb(); smp_mb();
if (btrfs_inode_in_log(BTRFS_I(inode), fs_info->generation) || if (skip_inode_logging(&ctx)) {
(BTRFS_I(inode)->last_trans <= fs_info->last_trans_committed &&
(full_sync || list_empty(&ctx.ordered_extents)))) {
/* /*
* We've had everything committed since the last time we were * We've had everything committed since the last time we were
* modified so clear this flag in case it was set for whatever * modified so clear this flag in case it was set for whatever

View file

@ -3949,7 +3949,7 @@ static int cleanup_free_space_cache_v1(struct btrfs_fs_info *fs_info,
{ {
struct btrfs_block_group *block_group; struct btrfs_block_group *block_group;
struct rb_node *node; struct rb_node *node;
int ret; int ret = 0;
btrfs_info(fs_info, "cleaning free space cache v1"); btrfs_info(fs_info, "cleaning free space cache v1");

View file

@ -9678,7 +9678,7 @@ out:
return ret; return ret;
} }
int btrfs_start_delalloc_snapshot(struct btrfs_root *root) int btrfs_start_delalloc_snapshot(struct btrfs_root *root, bool in_reclaim_context)
{ {
struct writeback_control wbc = { struct writeback_control wbc = {
.nr_to_write = LONG_MAX, .nr_to_write = LONG_MAX,
@ -9691,7 +9691,7 @@ int btrfs_start_delalloc_snapshot(struct btrfs_root *root)
if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state))
return -EROFS; return -EROFS;
return start_delalloc_inodes(root, &wbc, true, false); return start_delalloc_inodes(root, &wbc, true, in_reclaim_context);
} }
int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, long nr, int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, long nr,

View file

@ -259,6 +259,8 @@ int btrfs_fileattr_set(struct user_namespace *mnt_userns,
if (!fa->flags_valid) { if (!fa->flags_valid) {
/* 1 item for the inode */ /* 1 item for the inode */
trans = btrfs_start_transaction(root, 1); trans = btrfs_start_transaction(root, 1);
if (IS_ERR(trans))
return PTR_ERR(trans);
goto update_flags; goto update_flags;
} }
@ -907,7 +909,7 @@ static noinline int btrfs_mksnapshot(const struct path *parent,
*/ */
btrfs_drew_read_lock(&root->snapshot_lock); btrfs_drew_read_lock(&root->snapshot_lock);
ret = btrfs_start_delalloc_snapshot(root); ret = btrfs_start_delalloc_snapshot(root, false);
if (ret) if (ret)
goto out; goto out;

View file

@ -984,7 +984,7 @@ int btrfs_split_ordered_extent(struct btrfs_ordered_extent *ordered, u64 pre,
if (pre) if (pre)
ret = clone_ordered_extent(ordered, 0, pre); ret = clone_ordered_extent(ordered, 0, pre);
if (post) if (ret == 0 && post)
ret = clone_ordered_extent(ordered, pre + ordered->disk_num_bytes, ret = clone_ordered_extent(ordered, pre + ordered->disk_num_bytes,
post); post);

View file

@ -3545,11 +3545,15 @@ static int try_flush_qgroup(struct btrfs_root *root)
struct btrfs_trans_handle *trans; struct btrfs_trans_handle *trans;
int ret; int ret;
/* Can't hold an open transaction or we run the risk of deadlocking */
ASSERT(current->journal_info == NULL ||
       current->journal_info == BTRFS_SEND_TRANS_STUB);
if (WARN_ON(current->journal_info &&
	    current->journal_info != BTRFS_SEND_TRANS_STUB))
	return 0;
/*
 * Can't hold an open transaction or we run the risk of deadlocking,
 * and can't either be under the context of a send operation (where
 * current->journal_info is set to BTRFS_SEND_TRANS_STUB), as that
 * would result in a crash when starting a transaction and does not
 * make sense either (send is a read-only operation).
 */
ASSERT(current->journal_info == NULL);
if (WARN_ON(current->journal_info))
	return 0;
/* /*
@ -3562,7 +3566,7 @@ static int try_flush_qgroup(struct btrfs_root *root)
return 0; return 0;
} }
ret = btrfs_start_delalloc_snapshot(root); ret = btrfs_start_delalloc_snapshot(root, true);
if (ret < 0) if (ret < 0)
goto out; goto out;
btrfs_wait_ordered_extents(root, U64_MAX, 0, (u64)-1); btrfs_wait_ordered_extents(root, U64_MAX, 0, (u64)-1);

View file

@ -7170,7 +7170,7 @@ static int flush_delalloc_roots(struct send_ctx *sctx)
int i; int i;
if (root) { if (root) {
ret = btrfs_start_delalloc_snapshot(root); ret = btrfs_start_delalloc_snapshot(root, false);
if (ret) if (ret)
return ret; return ret;
btrfs_wait_ordered_extents(root, U64_MAX, 0, U64_MAX); btrfs_wait_ordered_extents(root, U64_MAX, 0, U64_MAX);
@ -7178,7 +7178,7 @@ static int flush_delalloc_roots(struct send_ctx *sctx)
for (i = 0; i < sctx->clone_roots_cnt; i++) { for (i = 0; i < sctx->clone_roots_cnt; i++) {
root = sctx->clone_roots[i].root; root = sctx->clone_roots[i].root;
ret = btrfs_start_delalloc_snapshot(root); ret = btrfs_start_delalloc_snapshot(root, false);
if (ret) if (ret)
return ret; return ret;
btrfs_wait_ordered_extents(root, U64_MAX, 0, U64_MAX); btrfs_wait_ordered_extents(root, U64_MAX, 0, U64_MAX);

View file

@ -6061,7 +6061,8 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
* (since logging them is pointless, a link count of 0 means they * (since logging them is pointless, a link count of 0 means they
* will never be accessible). * will never be accessible).
*/ */
if (btrfs_inode_in_log(inode, trans->transid) || if ((btrfs_inode_in_log(inode, trans->transid) &&
list_empty(&ctx->ordered_extents)) ||
inode->vfs_inode.i_nlink == 0) { inode->vfs_inode.i_nlink == 0) {
ret = BTRFS_NO_LOG_SYNC; ret = BTRFS_NO_LOG_SYNC;
goto end_no_trans; goto end_no_trans;

View file

@ -1126,6 +1126,11 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new)
goto out; goto out;
} }
if (zone.type == BLK_ZONE_TYPE_CONVENTIONAL) {
ret = -EIO;
goto out;
}
switch (zone.cond) { switch (zone.cond) {
case BLK_ZONE_COND_OFFLINE: case BLK_ZONE_COND_OFFLINE:
case BLK_ZONE_COND_READONLY: case BLK_ZONE_COND_READONLY:

View file

@ -2,6 +2,7 @@
#ifndef _ASM_POWERPC_ERRNO_H #ifndef _ASM_POWERPC_ERRNO_H
#define _ASM_POWERPC_ERRNO_H #define _ASM_POWERPC_ERRNO_H
#undef EDEADLOCK
#include <asm-generic/errno.h> #include <asm-generic/errno.h>
#undef EDEADLOCK #undef EDEADLOCK

View file

@ -84,7 +84,7 @@
/* CPU types for specific tunings: */ /* CPU types for specific tunings: */
#define X86_FEATURE_K8 ( 3*32+ 4) /* "" Opteron, Athlon64 */ #define X86_FEATURE_K8 ( 3*32+ 4) /* "" Opteron, Athlon64 */
#define X86_FEATURE_K7 ( 3*32+ 5) /* "" Athlon */ /* FREE, was #define X86_FEATURE_K7 ( 3*32+ 5) "" Athlon */
#define X86_FEATURE_P3 ( 3*32+ 6) /* "" P3 */ #define X86_FEATURE_P3 ( 3*32+ 6) /* "" P3 */
#define X86_FEATURE_P4 ( 3*32+ 7) /* "" P4 */ #define X86_FEATURE_P4 ( 3*32+ 7) /* "" P4 */
#define X86_FEATURE_CONSTANT_TSC ( 3*32+ 8) /* TSC ticks at a constant rate */ #define X86_FEATURE_CONSTANT_TSC ( 3*32+ 8) /* TSC ticks at a constant rate */
@ -236,6 +236,8 @@
#define X86_FEATURE_EPT_AD ( 8*32+17) /* Intel Extended Page Table access-dirty bit */ #define X86_FEATURE_EPT_AD ( 8*32+17) /* Intel Extended Page Table access-dirty bit */
#define X86_FEATURE_VMCALL ( 8*32+18) /* "" Hypervisor supports the VMCALL instruction */ #define X86_FEATURE_VMCALL ( 8*32+18) /* "" Hypervisor supports the VMCALL instruction */
#define X86_FEATURE_VMW_VMMCALL ( 8*32+19) /* "" VMware prefers VMMCALL hypercall instruction */ #define X86_FEATURE_VMW_VMMCALL ( 8*32+19) /* "" VMware prefers VMMCALL hypercall instruction */
#define X86_FEATURE_PVUNLOCK ( 8*32+20) /* "" PV unlock function */
#define X86_FEATURE_VCPUPREEMPT ( 8*32+21) /* "" PV vcpu_is_preempted function */
/* Intel-defined CPU features, CPUID level 0x00000007:0 (EBX), word 9 */ /* Intel-defined CPU features, CPUID level 0x00000007:0 (EBX), word 9 */
#define X86_FEATURE_FSGSBASE ( 9*32+ 0) /* RDFSBASE, WRFSBASE, RDGSBASE, WRGSBASE instructions*/ #define X86_FEATURE_FSGSBASE ( 9*32+ 0) /* RDFSBASE, WRFSBASE, RDGSBASE, WRGSBASE instructions*/
@ -290,6 +292,8 @@
#define X86_FEATURE_FENCE_SWAPGS_KERNEL (11*32+ 5) /* "" LFENCE in kernel entry SWAPGS path */ #define X86_FEATURE_FENCE_SWAPGS_KERNEL (11*32+ 5) /* "" LFENCE in kernel entry SWAPGS path */
#define X86_FEATURE_SPLIT_LOCK_DETECT (11*32+ 6) /* #AC for split lock */ #define X86_FEATURE_SPLIT_LOCK_DETECT (11*32+ 6) /* #AC for split lock */
#define X86_FEATURE_PER_THREAD_MBA (11*32+ 7) /* "" Per-thread Memory Bandwidth Allocation */ #define X86_FEATURE_PER_THREAD_MBA (11*32+ 7) /* "" Per-thread Memory Bandwidth Allocation */
#define X86_FEATURE_SGX1 (11*32+ 8) /* "" Basic SGX */
#define X86_FEATURE_SGX2 (11*32+ 9) /* "" SGX Enclave Dynamic Memory Management (EDMM) */
/* Intel-defined CPU features, CPUID level 0x00000007:1 (EAX), word 12 */ /* Intel-defined CPU features, CPUID level 0x00000007:1 (EAX), word 12 */
#define X86_FEATURE_AVX_VNNI (12*32+ 4) /* AVX VNNI instructions */ #define X86_FEATURE_AVX_VNNI (12*32+ 4) /* AVX VNNI instructions */
@ -336,6 +340,7 @@
#define X86_FEATURE_AVIC (15*32+13) /* Virtual Interrupt Controller */ #define X86_FEATURE_AVIC (15*32+13) /* Virtual Interrupt Controller */
#define X86_FEATURE_V_VMSAVE_VMLOAD (15*32+15) /* Virtual VMSAVE VMLOAD */ #define X86_FEATURE_V_VMSAVE_VMLOAD (15*32+15) /* Virtual VMSAVE VMLOAD */
#define X86_FEATURE_VGIF (15*32+16) /* Virtual GIF */ #define X86_FEATURE_VGIF (15*32+16) /* Virtual GIF */
#define X86_FEATURE_V_SPEC_CTRL (15*32+20) /* Virtual SPEC_CTRL */
#define X86_FEATURE_SVME_ADDR_CHK (15*32+28) /* "" SVME addr check */ #define X86_FEATURE_SVME_ADDR_CHK (15*32+28) /* "" SVME addr check */
/* Intel-defined CPU features, CPUID level 0x00000007:0 (ECX), word 16 */ /* Intel-defined CPU features, CPUID level 0x00000007:0 (ECX), word 16 */
@ -354,6 +359,7 @@
#define X86_FEATURE_AVX512_VPOPCNTDQ (16*32+14) /* POPCNT for vectors of DW/QW */ #define X86_FEATURE_AVX512_VPOPCNTDQ (16*32+14) /* POPCNT for vectors of DW/QW */
#define X86_FEATURE_LA57 (16*32+16) /* 5-level page tables */ #define X86_FEATURE_LA57 (16*32+16) /* 5-level page tables */
#define X86_FEATURE_RDPID (16*32+22) /* RDPID instruction */ #define X86_FEATURE_RDPID (16*32+22) /* RDPID instruction */
#define X86_FEATURE_BUS_LOCK_DETECT (16*32+24) /* Bus Lock detect */
#define X86_FEATURE_CLDEMOTE (16*32+25) /* CLDEMOTE instruction */ #define X86_FEATURE_CLDEMOTE (16*32+25) /* CLDEMOTE instruction */
#define X86_FEATURE_MOVDIRI (16*32+27) /* MOVDIRI instruction */ #define X86_FEATURE_MOVDIRI (16*32+27) /* MOVDIRI instruction */
#define X86_FEATURE_MOVDIR64B (16*32+28) /* MOVDIR64B instruction */ #define X86_FEATURE_MOVDIR64B (16*32+28) /* MOVDIR64B instruction */
@ -374,6 +380,7 @@
#define X86_FEATURE_MD_CLEAR (18*32+10) /* VERW clears CPU buffers */ #define X86_FEATURE_MD_CLEAR (18*32+10) /* VERW clears CPU buffers */
#define X86_FEATURE_TSX_FORCE_ABORT (18*32+13) /* "" TSX_FORCE_ABORT */ #define X86_FEATURE_TSX_FORCE_ABORT (18*32+13) /* "" TSX_FORCE_ABORT */
#define X86_FEATURE_SERIALIZE (18*32+14) /* SERIALIZE instruction */ #define X86_FEATURE_SERIALIZE (18*32+14) /* SERIALIZE instruction */
#define X86_FEATURE_HYBRID_CPU (18*32+15) /* "" This part has CPUs of more than one type */
#define X86_FEATURE_TSXLDTRK (18*32+16) /* TSX Suspend Load Address Tracking */ #define X86_FEATURE_TSXLDTRK (18*32+16) /* TSX Suspend Load Address Tracking */
#define X86_FEATURE_PCONFIG (18*32+18) /* Intel PCONFIG */ #define X86_FEATURE_PCONFIG (18*32+18) /* Intel PCONFIG */
#define X86_FEATURE_ARCH_LBR (18*32+19) /* Intel ARCH LBR */ #define X86_FEATURE_ARCH_LBR (18*32+19) /* Intel ARCH LBR */

View file

@ -185,6 +185,9 @@
#define MSR_PEBS_DATA_CFG 0x000003f2 #define MSR_PEBS_DATA_CFG 0x000003f2
#define MSR_IA32_DS_AREA 0x00000600 #define MSR_IA32_DS_AREA 0x00000600
#define MSR_IA32_PERF_CAPABILITIES 0x00000345 #define MSR_IA32_PERF_CAPABILITIES 0x00000345
#define PERF_CAP_METRICS_IDX 15
#define PERF_CAP_PT_IDX 16
#define MSR_PEBS_LD_LAT_THRESHOLD 0x000003f6 #define MSR_PEBS_LD_LAT_THRESHOLD 0x000003f6
#define MSR_IA32_RTIT_CTL 0x00000570 #define MSR_IA32_RTIT_CTL 0x00000570
@ -265,6 +268,7 @@
#define DEBUGCTLMSR_LBR (1UL << 0) /* last branch recording */ #define DEBUGCTLMSR_LBR (1UL << 0) /* last branch recording */
#define DEBUGCTLMSR_BTF_SHIFT 1 #define DEBUGCTLMSR_BTF_SHIFT 1
#define DEBUGCTLMSR_BTF (1UL << 1) /* single-step on branches */ #define DEBUGCTLMSR_BTF (1UL << 1) /* single-step on branches */
#define DEBUGCTLMSR_BUS_LOCK_DETECT (1UL << 2)
#define DEBUGCTLMSR_TR (1UL << 6) #define DEBUGCTLMSR_TR (1UL << 6)
#define DEBUGCTLMSR_BTS (1UL << 7) #define DEBUGCTLMSR_BTS (1UL << 7)
#define DEBUGCTLMSR_BTINT (1UL << 8) #define DEBUGCTLMSR_BTINT (1UL << 8)

View file

@ -27,6 +27,7 @@
#define VMX_EXIT_REASONS_FAILED_VMENTRY 0x80000000 #define VMX_EXIT_REASONS_FAILED_VMENTRY 0x80000000
#define VMX_EXIT_REASONS_SGX_ENCLAVE_MODE 0x08000000
#define EXIT_REASON_EXCEPTION_NMI 0 #define EXIT_REASON_EXCEPTION_NMI 0
#define EXIT_REASON_EXTERNAL_INTERRUPT 1 #define EXIT_REASON_EXTERNAL_INTERRUPT 1

View file

@ -4,7 +4,7 @@
#include <linux/linkage.h> #include <linux/linkage.h>
#include <asm/errno.h> #include <asm/errno.h>
#include <asm/cpufeatures.h> #include <asm/cpufeatures.h>
#include <asm/alternative-asm.h> #include <asm/alternative.h>
#include <asm/export.h> #include <asm/export.h>
.pushsection .noinstr.text, "ax" .pushsection .noinstr.text, "ax"

View file

@ -3,7 +3,7 @@
#include <linux/linkage.h> #include <linux/linkage.h>
#include <asm/cpufeatures.h> #include <asm/cpufeatures.h>
#include <asm/alternative-asm.h> #include <asm/alternative.h>
#include <asm/export.h> #include <asm/export.h>
/* /*

View file

@ -863,9 +863,18 @@ __SYSCALL(__NR_process_madvise, sys_process_madvise)
__SC_COMP(__NR_epoll_pwait2, sys_epoll_pwait2, compat_sys_epoll_pwait2) __SC_COMP(__NR_epoll_pwait2, sys_epoll_pwait2, compat_sys_epoll_pwait2)
#define __NR_mount_setattr 442 #define __NR_mount_setattr 442
__SYSCALL(__NR_mount_setattr, sys_mount_setattr) __SYSCALL(__NR_mount_setattr, sys_mount_setattr)
#define __NR_quotactl_path 443
__SYSCALL(__NR_quotactl_path, sys_quotactl_path)
#define __NR_landlock_create_ruleset 444
__SYSCALL(__NR_landlock_create_ruleset, sys_landlock_create_ruleset)
#define __NR_landlock_add_rule 445
__SYSCALL(__NR_landlock_add_rule, sys_landlock_add_rule)
#define __NR_landlock_restrict_self 446
__SYSCALL(__NR_landlock_restrict_self, sys_landlock_restrict_self)
#undef __NR_syscalls #undef __NR_syscalls
#define __NR_syscalls 443 #define __NR_syscalls 447
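A hedged userspace sketch of the three new Landlock syscalls being wired up here, assuming <linux/landlock.h> and the new __NR_* numbers come from a matching kernel; the access-right choice is arbitrary and most error handling is omitted.

    #include <linux/landlock.h>
    #include <sys/prctl.h>
    #include <sys/syscall.h>
    #include <unistd.h>

    static int restrict_to_read_only_files(void)
    {
    	struct landlock_ruleset_attr attr = {
    		.handled_access_fs = LANDLOCK_ACCESS_FS_WRITE_FILE,
    	};
    	int ruleset_fd;

    	ruleset_fd = syscall(__NR_landlock_create_ruleset, &attr, sizeof(attr), 0);
    	if (ruleset_fd < 0)
    		return -1;

    	/* Landlock requires no_new_privs before self-restriction. */
    	if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) ||
    	    syscall(__NR_landlock_restrict_self, ruleset_fd, 0))
    		return -1;

    	return 0;
    }

Rules would normally be added with __NR_landlock_add_rule between creating the ruleset and self-restricting; this sketch leaves the ruleset empty for brevity.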
/* /*
* 32 bit systems traditionally used different * 32 bit systems traditionally used different

View file

@ -625,30 +625,147 @@ struct drm_gem_open {
__u64 size; __u64 size;
}; };
/**
* DRM_CAP_DUMB_BUFFER
*
* If set to 1, the driver supports creating dumb buffers via the
* &DRM_IOCTL_MODE_CREATE_DUMB ioctl.
*/
#define DRM_CAP_DUMB_BUFFER 0x1 #define DRM_CAP_DUMB_BUFFER 0x1
/**
* DRM_CAP_VBLANK_HIGH_CRTC
*
* If set to 1, the kernel supports specifying a CRTC index in the high bits of
* &drm_wait_vblank_request.type.
*
* Starting kernel version 2.6.39, this capability is always set to 1.
*/
#define DRM_CAP_VBLANK_HIGH_CRTC 0x2 #define DRM_CAP_VBLANK_HIGH_CRTC 0x2
/**
* DRM_CAP_DUMB_PREFERRED_DEPTH
*
* The preferred bit depth for dumb buffers.
*
* The bit depth is the number of bits used to indicate the color of a single
* pixel excluding any padding. This is different from the number of bits per
* pixel. For instance, XRGB8888 has a bit depth of 24 but has 32 bits per
* pixel.
*
* Note that this preference only applies to dumb buffers, it's irrelevant for
* other types of buffers.
*/
#define DRM_CAP_DUMB_PREFERRED_DEPTH 0x3 #define DRM_CAP_DUMB_PREFERRED_DEPTH 0x3
/**
* DRM_CAP_DUMB_PREFER_SHADOW
*
* If set to 1, the driver prefers userspace to render to a shadow buffer
* instead of directly rendering to a dumb buffer. For best speed, userspace
* should do streaming ordered memory copies into the dumb buffer and never
* read from it.
*
* Note that this preference only applies to dumb buffers, it's irrelevant for
* other types of buffers.
*/
#define DRM_CAP_DUMB_PREFER_SHADOW 0x4 #define DRM_CAP_DUMB_PREFER_SHADOW 0x4
/**
* DRM_CAP_PRIME
*
* Bitfield of supported PRIME sharing capabilities. See &DRM_PRIME_CAP_IMPORT
* and &DRM_PRIME_CAP_EXPORT.
*
* PRIME buffers are exposed as dma-buf file descriptors. See
* Documentation/gpu/drm-mm.rst, section "PRIME Buffer Sharing".
*/
#define DRM_CAP_PRIME 0x5 #define DRM_CAP_PRIME 0x5
/**
* DRM_PRIME_CAP_IMPORT
*
* If this bit is set in &DRM_CAP_PRIME, the driver supports importing PRIME
* buffers via the &DRM_IOCTL_PRIME_FD_TO_HANDLE ioctl.
*/
#define DRM_PRIME_CAP_IMPORT 0x1 #define DRM_PRIME_CAP_IMPORT 0x1
/**
* DRM_PRIME_CAP_EXPORT
*
* If this bit is set in &DRM_CAP_PRIME, the driver supports exporting PRIME
* buffers via the &DRM_IOCTL_PRIME_HANDLE_TO_FD ioctl.
*/
#define DRM_PRIME_CAP_EXPORT 0x2 #define DRM_PRIME_CAP_EXPORT 0x2
/**
* DRM_CAP_TIMESTAMP_MONOTONIC
*
* If set to 0, the kernel will report timestamps with ``CLOCK_REALTIME`` in
* struct drm_event_vblank. If set to 1, the kernel will report timestamps with
* ``CLOCK_MONOTONIC``. See ``clock_gettime(2)`` for the definition of these
* clocks.
*
* Starting from kernel version 2.6.39, the default value for this capability
* is 1. Starting kernel version 4.15, this capability is always set to 1.
*/
#define DRM_CAP_TIMESTAMP_MONOTONIC 0x6 #define DRM_CAP_TIMESTAMP_MONOTONIC 0x6
/**
* DRM_CAP_ASYNC_PAGE_FLIP
*
* If set to 1, the driver supports &DRM_MODE_PAGE_FLIP_ASYNC.
*/
#define DRM_CAP_ASYNC_PAGE_FLIP 0x7 #define DRM_CAP_ASYNC_PAGE_FLIP 0x7
/*
 * The CURSOR_WIDTH and CURSOR_HEIGHT capabilities return a valid widthxheight
 * combination for the hardware cursor. The intention is that a hardware
 * agnostic userspace can query a cursor plane size to use.
 *
 * Note that the cross-driver contract is to merely return a valid size;
 * drivers are free to attach another meaning on top, eg. i915 returns the
 * maximum plane size.
 */
/**
 * DRM_CAP_CURSOR_WIDTH
 *
 * The ``CURSOR_WIDTH`` and ``CURSOR_HEIGHT`` capabilities return a valid
 * width x height combination for the hardware cursor. The intention is that a
 * hardware agnostic userspace can query a cursor plane size to use.
 *
 * Note that the cross-driver contract is to merely return a valid size;
 * drivers are free to attach another meaning on top, eg. i915 returns the
 * maximum plane size.
 */
#define DRM_CAP_CURSOR_WIDTH 0x8 #define DRM_CAP_CURSOR_WIDTH 0x8
/**
* DRM_CAP_CURSOR_HEIGHT
*
* See &DRM_CAP_CURSOR_WIDTH.
*/
#define DRM_CAP_CURSOR_HEIGHT 0x9 #define DRM_CAP_CURSOR_HEIGHT 0x9
/**
* DRM_CAP_ADDFB2_MODIFIERS
*
* If set to 1, the driver supports supplying modifiers in the
* &DRM_IOCTL_MODE_ADDFB2 ioctl.
*/
#define DRM_CAP_ADDFB2_MODIFIERS 0x10 #define DRM_CAP_ADDFB2_MODIFIERS 0x10
/**
* DRM_CAP_PAGE_FLIP_TARGET
*
* If set to 1, the driver supports the &DRM_MODE_PAGE_FLIP_TARGET_ABSOLUTE and
* &DRM_MODE_PAGE_FLIP_TARGET_RELATIVE flags in
* &drm_mode_crtc_page_flip_target.flags for the &DRM_IOCTL_MODE_PAGE_FLIP
* ioctl.
*/
#define DRM_CAP_PAGE_FLIP_TARGET 0x11 #define DRM_CAP_PAGE_FLIP_TARGET 0x11
/**
* DRM_CAP_CRTC_IN_VBLANK_EVENT
*
* If set to 1, the kernel supports reporting the CRTC ID in
* &drm_event_vblank.crtc_id for the &DRM_EVENT_VBLANK and
* &DRM_EVENT_FLIP_COMPLETE events.
*
* Starting kernel version 4.12, this capability is always set to 1.
*/
#define DRM_CAP_CRTC_IN_VBLANK_EVENT 0x12 #define DRM_CAP_CRTC_IN_VBLANK_EVENT 0x12
/**
* DRM_CAP_SYNCOBJ
*
* If set to 1, the driver supports sync objects. See
* Documentation/gpu/drm-mm.rst, section "DRM Sync Objects".
*/
#define DRM_CAP_SYNCOBJ 0x13 #define DRM_CAP_SYNCOBJ 0x13
/**
* DRM_CAP_SYNCOBJ_TIMELINE
*
* If set to 1, the driver supports timeline operations on sync objects. See
* Documentation/gpu/drm-mm.rst, section "DRM Sync Objects".
*/
#define DRM_CAP_SYNCOBJ_TIMELINE 0x14 #define DRM_CAP_SYNCOBJ_TIMELINE 0x14
/* DRM_IOCTL_GET_CAP ioctl argument type */ /* DRM_IOCTL_GET_CAP ioctl argument type */
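A small userspace sketch of how the capabilities documented above are queried (illustrative only; the device path and the assumption that the UAPI header is installed as <drm/drm.h> are environment-dependent).

    #include <fcntl.h>
    #include <stdio.h>
    #include <sys/ioctl.h>
    #include <unistd.h>
    #include <drm/drm.h>

    int main(void)
    {
    	struct drm_get_cap cap = { .capability = DRM_CAP_PRIME };
    	int fd = open("/dev/dri/card0", O_RDWR);

    	if (fd < 0 || ioctl(fd, DRM_IOCTL_GET_CAP, &cap)) {
    		perror("drm cap query");
    		return 1;
    	}

    	printf("PRIME import: %s, export: %s\n",
    	       (cap.value & DRM_PRIME_CAP_IMPORT) ? "yes" : "no",
    	       (cap.value & DRM_PRIME_CAP_EXPORT) ? "yes" : "no");
    	close(fd);
    	return 0;
    }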

View file

@ -943,6 +943,7 @@ struct drm_i915_gem_exec_object {
__u64 offset; __u64 offset;
}; };
/* DRM_IOCTL_I915_GEM_EXECBUFFER was removed in Linux 5.13 */
struct drm_i915_gem_execbuffer { struct drm_i915_gem_execbuffer {
/** /**
* List of buffers to be validated with their relocations to be * List of buffers to be validated with their relocations to be

View file

@ -1078,6 +1078,10 @@ struct kvm_ppc_resize_hpt {
#define KVM_CAP_DIRTY_LOG_RING 192 #define KVM_CAP_DIRTY_LOG_RING 192
#define KVM_CAP_X86_BUS_LOCK_EXIT 193 #define KVM_CAP_X86_BUS_LOCK_EXIT 193
#define KVM_CAP_PPC_DAWR1 194 #define KVM_CAP_PPC_DAWR1 194
#define KVM_CAP_SET_GUEST_DEBUG2 195
#define KVM_CAP_SGX_ATTRIBUTE 196
#define KVM_CAP_VM_COPY_ENC_CONTEXT_FROM 197
#define KVM_CAP_PTP_KVM 198
#ifdef KVM_CAP_IRQ_ROUTING #ifdef KVM_CAP_IRQ_ROUTING
@ -1671,6 +1675,8 @@ enum sev_cmd_id {
KVM_SEV_CERT_EXPORT, KVM_SEV_CERT_EXPORT,
/* Attestation report */ /* Attestation report */
KVM_SEV_GET_ATTESTATION_REPORT, KVM_SEV_GET_ATTESTATION_REPORT,
/* Guest Migration Extension */
KVM_SEV_SEND_CANCEL,
KVM_SEV_NR_MAX, KVM_SEV_NR_MAX,
}; };
@ -1729,6 +1735,45 @@ struct kvm_sev_attestation_report {
__u32 len; __u32 len;
}; };
struct kvm_sev_send_start {
__u32 policy;
__u64 pdh_cert_uaddr;
__u32 pdh_cert_len;
__u64 plat_certs_uaddr;
__u32 plat_certs_len;
__u64 amd_certs_uaddr;
__u32 amd_certs_len;
__u64 session_uaddr;
__u32 session_len;
};
struct kvm_sev_send_update_data {
__u64 hdr_uaddr;
__u32 hdr_len;
__u64 guest_uaddr;
__u32 guest_len;
__u64 trans_uaddr;
__u32 trans_len;
};
struct kvm_sev_receive_start {
__u32 handle;
__u32 policy;
__u64 pdh_uaddr;
__u32 pdh_len;
__u64 session_uaddr;
__u32 session_len;
};
struct kvm_sev_receive_update_data {
__u64 hdr_uaddr;
__u32 hdr_len;
__u64 guest_uaddr;
__u32 guest_len;
__u64 trans_uaddr;
__u32 trans_len;
};
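These structures are passed through the existing KVM_MEMORY_ENCRYPT_OP ioctl. A hedged userspace sketch for the send-start step, assuming vm_fd and sev_fd were opened elsewhere; buffer allocation, sizing and error handling are deliberately elided, and the helper name is a placeholder.

    #include <linux/kvm.h>
    #include <sys/ioctl.h>

    static int sev_send_start(int vm_fd, int sev_fd, struct kvm_sev_send_start *start)
    {
    	struct kvm_sev_cmd cmd = {
    		.id     = KVM_SEV_SEND_START,
    		.data   = (__u64)(unsigned long)start,
    		.sev_fd = sev_fd,
    	};

    	/* The user buffers referenced by *start are filled in on success. */
    	return ioctl(vm_fd, KVM_MEMORY_ENCRYPT_OP, &cmd);
    }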
#define KVM_DEV_ASSIGN_ENABLE_IOMMU (1 << 0) #define KVM_DEV_ASSIGN_ENABLE_IOMMU (1 << 0)
#define KVM_DEV_ASSIGN_PCI_2_3 (1 << 1) #define KVM_DEV_ASSIGN_PCI_2_3 (1 << 1)
#define KVM_DEV_ASSIGN_MASK_INTX (1 << 2) #define KVM_DEV_ASSIGN_MASK_INTX (1 << 2)

View file

@ -127,6 +127,7 @@ enum perf_sw_ids {
PERF_COUNT_SW_EMULATION_FAULTS = 8, PERF_COUNT_SW_EMULATION_FAULTS = 8,
PERF_COUNT_SW_DUMMY = 9, PERF_COUNT_SW_DUMMY = 9,
PERF_COUNT_SW_BPF_OUTPUT = 10, PERF_COUNT_SW_BPF_OUTPUT = 10,
PERF_COUNT_SW_CGROUP_SWITCHES = 11,
PERF_COUNT_SW_MAX, /* non-ABI */ PERF_COUNT_SW_MAX, /* non-ABI */
}; };
@ -326,6 +327,7 @@ enum perf_event_read_format {
#define PERF_ATTR_SIZE_VER4 104 /* add: sample_regs_intr */ #define PERF_ATTR_SIZE_VER4 104 /* add: sample_regs_intr */
#define PERF_ATTR_SIZE_VER5 112 /* add: aux_watermark */ #define PERF_ATTR_SIZE_VER5 112 /* add: aux_watermark */
#define PERF_ATTR_SIZE_VER6 120 /* add: aux_sample_size */ #define PERF_ATTR_SIZE_VER6 120 /* add: aux_sample_size */
#define PERF_ATTR_SIZE_VER7 128 /* add: sig_data */
/* /*
* Hardware event_id to monitor via a performance monitoring event: * Hardware event_id to monitor via a performance monitoring event:
@ -404,7 +406,10 @@ struct perf_event_attr {
cgroup : 1, /* include cgroup events */ cgroup : 1, /* include cgroup events */
text_poke : 1, /* include text poke events */ text_poke : 1, /* include text poke events */
build_id : 1, /* use build id in mmap2 events */ build_id : 1, /* use build id in mmap2 events */
__reserved_1 : 29; inherit_thread : 1, /* children only inherit if cloned with CLONE_THREAD */
remove_on_exec : 1, /* event is removed from task on exec */
sigtrap : 1, /* send synchronous SIGTRAP on event */
__reserved_1 : 26;
union { union {
__u32 wakeup_events; /* wakeup every n events */ __u32 wakeup_events; /* wakeup every n events */
@ -456,6 +461,12 @@ struct perf_event_attr {
__u16 __reserved_2; __u16 __reserved_2;
__u32 aux_sample_size; __u32 aux_sample_size;
__u32 __reserved_3; __u32 __reserved_3;
/*
* User provided data if sigtrap=1, passed back to user via
* siginfo_t::si_perf, e.g. to permit user to identify the event.
*/
__u64 sig_data;
}; };
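A hedged sketch of a consumer of the new bits: a self-monitoring event that raises SIGTRAP on overflow and hands sig_data back via siginfo (si_perf, per the comment above). The sample period is arbitrary, and pairing sigtrap with remove_on_exec is an assumption based on the constraint added in the same series; see perf_event_open(2) for the authoritative rules.

    #include <linux/perf_event.h>
    #include <string.h>
    #include <sys/syscall.h>
    #include <unistd.h>

    static int open_sigtrap_counter(void)
    {
    	struct perf_event_attr attr;

    	memset(&attr, 0, sizeof(attr));
    	attr.size = sizeof(attr);		/* PERF_ATTR_SIZE_VER7 (128) or later */
    	attr.type = PERF_TYPE_HARDWARE;
    	attr.config = PERF_COUNT_HW_INSTRUCTIONS;
    	attr.sample_period = 1000000;
    	attr.sigtrap = 1;			/* synchronous SIGTRAP on overflow */
    	attr.remove_on_exec = 1;		/* assumed prerequisite for sigtrap */
    	attr.sig_data = 0xdeadbeef;		/* surfaces as si_perf in the handler */

    	/* pid = 0, cpu = -1: monitor the calling thread on any CPU. */
    	return syscall(SYS_perf_event_open, &attr, 0, -1, -1, 0);
    }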
/* /*
@ -1171,10 +1182,15 @@ enum perf_callchain_context {
/** /**
* PERF_RECORD_AUX::flags bits * PERF_RECORD_AUX::flags bits
*/ */
#define PERF_AUX_FLAG_TRUNCATED 0x01 /* record was truncated to fit */ #define PERF_AUX_FLAG_TRUNCATED 0x01 /* record was truncated to fit */
#define PERF_AUX_FLAG_OVERWRITE 0x02 /* snapshot from overwrite mode */ #define PERF_AUX_FLAG_OVERWRITE 0x02 /* snapshot from overwrite mode */
#define PERF_AUX_FLAG_PARTIAL 0x04 /* record contains gaps */ #define PERF_AUX_FLAG_PARTIAL 0x04 /* record contains gaps */
#define PERF_AUX_FLAG_COLLISION 0x08 /* sample collided with another */ #define PERF_AUX_FLAG_COLLISION 0x08 /* sample collided with another */
#define PERF_AUX_FLAG_PMU_FORMAT_TYPE_MASK 0xff00 /* PMU specific trace format type */
/* CoreSight PMU AUX buffer formats */
#define PERF_AUX_FLAG_CORESIGHT_FORMAT_CORESIGHT 0x0000 /* Default for backward compatibility */
#define PERF_AUX_FLAG_CORESIGHT_FORMAT_RAW 0x0100 /* Raw format of the source */
#define PERF_FLAG_FD_NO_GROUP (1UL << 0) #define PERF_FLAG_FD_NO_GROUP (1UL << 0)
#define PERF_FLAG_FD_OUTPUT (1UL << 1) #define PERF_FLAG_FD_OUTPUT (1UL << 1)

View file

@ -255,4 +255,8 @@ struct prctl_mm_map {
# define SYSCALL_DISPATCH_FILTER_ALLOW 0 # define SYSCALL_DISPATCH_FILTER_ALLOW 0
# define SYSCALL_DISPATCH_FILTER_BLOCK 1 # define SYSCALL_DISPATCH_FILTER_BLOCK 1
/* Set/get enabled arm64 pointer authentication keys */
#define PR_PAC_SET_ENABLED_KEYS 60
#define PR_PAC_GET_ENABLED_KEYS 61
#endif /* _LINUX_PRCTL_H */ #endif /* _LINUX_PRCTL_H */
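A minimal sketch of the new prctl pair in use (illustrative; the PR_PAC_AP*KEY mask macros come from this same header, everything else is a placeholder).

    #include <linux/prctl.h>
    #include <sys/prctl.h>

    static int disable_insn_b_key(void)
    {
    	/* arg2: keys affected; arg3: which of those stay enabled (none here). */
    	return prctl(PR_PAC_SET_ENABLED_KEYS, PR_PAC_APIBKEY, 0, 0, 0);
    }

The current state can be read back with prctl(PR_PAC_GET_ENABLED_KEYS, 0, 0, 0, 0).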

View file

@ -111,7 +111,7 @@ OPTIONS
--tracepoints:: --tracepoints::
retrieve statistics from tracepoints retrieve statistics from tracepoints
*z*:: -z::
--skip-zero-records:: --skip-zero-records::
omit records with all zeros in logging mode omit records with all zeros in logging mode

View file

@ -540,6 +540,7 @@ ifndef NO_LIBELF
ifdef LIBBPF_DYNAMIC ifdef LIBBPF_DYNAMIC
ifeq ($(feature-libbpf), 1) ifeq ($(feature-libbpf), 1)
EXTLIBS += -lbpf EXTLIBS += -lbpf
$(call detected,CONFIG_LIBBPF_DYNAMIC)
else else
dummy := $(error Error: No libbpf devel library found, please install libbpf-devel); dummy := $(error Error: No libbpf devel library found, please install libbpf-devel);
endif endif

View file

@ -71,7 +71,7 @@ struct kvm_reg_events_ops kvm_reg_events_ops[] = {
.name = "vmexit", .name = "vmexit",
.ops = &exit_events, .ops = &exit_events,
}, },
{ NULL }, { NULL, NULL },
}; };
const char * const kvm_skip_events[] = { const char * const kvm_skip_events[] = {

View file

@ -356,3 +356,8 @@
439 n64 faccessat2 sys_faccessat2 439 n64 faccessat2 sys_faccessat2
440 n64 process_madvise sys_process_madvise 440 n64 process_madvise sys_process_madvise
441 n64 epoll_pwait2 sys_epoll_pwait2 441 n64 epoll_pwait2 sys_epoll_pwait2
442 n64 mount_setattr sys_mount_setattr
443 n64 quotactl_path sys_quotactl_path
444 n64 landlock_create_ruleset sys_landlock_create_ruleset
445 n64 landlock_add_rule sys_landlock_add_rule
446 n64 landlock_restrict_self sys_landlock_restrict_self

View file

@ -522,3 +522,7 @@
440 common process_madvise sys_process_madvise 440 common process_madvise sys_process_madvise
441 common epoll_pwait2 sys_epoll_pwait2 compat_sys_epoll_pwait2 441 common epoll_pwait2 sys_epoll_pwait2 compat_sys_epoll_pwait2
442 common mount_setattr sys_mount_setattr 442 common mount_setattr sys_mount_setattr
443 common quotactl_path sys_quotactl_path
444 common landlock_create_ruleset sys_landlock_create_ruleset
445 common landlock_add_rule sys_landlock_add_rule
446 common landlock_restrict_self sys_landlock_restrict_self

View file

@ -445,3 +445,7 @@
440 common process_madvise sys_process_madvise sys_process_madvise 440 common process_madvise sys_process_madvise sys_process_madvise
441 common epoll_pwait2 sys_epoll_pwait2 compat_sys_epoll_pwait2 441 common epoll_pwait2 sys_epoll_pwait2 compat_sys_epoll_pwait2
442 common mount_setattr sys_mount_setattr sys_mount_setattr 442 common mount_setattr sys_mount_setattr sys_mount_setattr
443 common quotactl_path sys_quotactl_path sys_quotactl_path
444 common landlock_create_ruleset sys_landlock_create_ruleset sys_landlock_create_ruleset
445 common landlock_add_rule sys_landlock_add_rule sys_landlock_add_rule
446 common landlock_restrict_self sys_landlock_restrict_self sys_landlock_restrict_self

View file

@ -364,6 +364,10 @@
440 common process_madvise sys_process_madvise 440 common process_madvise sys_process_madvise
441 common epoll_pwait2 sys_epoll_pwait2 441 common epoll_pwait2 sys_epoll_pwait2
442 common mount_setattr sys_mount_setattr 442 common mount_setattr sys_mount_setattr
443 common quotactl_path sys_quotactl_path
444 common landlock_create_ruleset sys_landlock_create_ruleset
445 common landlock_add_rule sys_landlock_add_rule
446 common landlock_restrict_self sys_landlock_restrict_self
# #
# Due to a historical design error, certain syscalls are numbered differently # Due to a historical design error, certain syscalls are numbered differently

View file

@ -1123,8 +1123,10 @@ static int process_one_file(const char *fpath, const struct stat *sb,
mapfile = strdup(fpath); mapfile = strdup(fpath);
return 0; return 0;
} }
if (is_json_file(bname))
pr_info("%s: Ignoring file %s\n", prog, fpath); pr_debug("%s: ArchStd json is preprocessed %s\n", prog, fpath);
else
pr_info("%s: Ignoring file %s\n", prog, fpath);
return 0; return 0;
} }

View file

@ -5,7 +5,7 @@ group_fd=-1
flags=0|8 flags=0|8
cpu=* cpu=*
type=0|1 type=0|1
size=120 size=128
config=0 config=0
sample_period=* sample_period=*
sample_type=263 sample_type=263

View file

@ -5,7 +5,7 @@ group_fd=-1
flags=0|8 flags=0|8
cpu=* cpu=*
type=0 type=0
size=120 size=128
config=0 config=0
sample_period=0 sample_period=0
sample_type=65536 sample_type=65536

View file

@ -7,7 +7,7 @@ cpu=*
pid=-1 pid=-1
flags=8 flags=8
type=1 type=1
size=120 size=128
config=9 config=9
sample_period=4000 sample_period=4000
sample_type=455 sample_type=455

View file

@ -145,7 +145,14 @@ perf-$(CONFIG_LIBELF) += symbol-elf.o
perf-$(CONFIG_LIBELF) += probe-file.o perf-$(CONFIG_LIBELF) += probe-file.o
perf-$(CONFIG_LIBELF) += probe-event.o perf-$(CONFIG_LIBELF) += probe-event.o
ifdef CONFIG_LIBBPF_DYNAMIC
hashmap := 1
endif
ifndef CONFIG_LIBBPF ifndef CONFIG_LIBBPF
hashmap := 1
endif
ifdef hashmap
perf-y += hashmap.o perf-y += hashmap.o
endif endif

View file

@ -157,9 +157,15 @@ static int get_max_rate(unsigned int *rate)
static int record_opts__config_freq(struct record_opts *opts) static int record_opts__config_freq(struct record_opts *opts)
{ {
bool user_freq = opts->user_freq != UINT_MAX; bool user_freq = opts->user_freq != UINT_MAX;
bool user_interval = opts->user_interval != ULLONG_MAX;
unsigned int max_rate; unsigned int max_rate;
if (opts->user_interval != ULLONG_MAX) if (user_interval && user_freq) {
pr_err("cannot set frequency and period at the same time\n");
return -1;
}
if (user_interval)
opts->default_interval = opts->user_interval; opts->default_interval = opts->user_interval;
if (user_freq) if (user_freq)
opts->freq = opts->user_freq; opts->freq = opts->user_freq;

View file

@ -904,7 +904,7 @@ static void perf_event__cpu_map_swap(union perf_event *event,
struct perf_record_record_cpu_map *mask; struct perf_record_record_cpu_map *mask;
unsigned i; unsigned i;
data->type = bswap_64(data->type); data->type = bswap_16(data->type);
switch (data->type) { switch (data->type) {
case PERF_CPU_MAP__CPUS: case PERF_CPU_MAP__CPUS:
@ -937,7 +937,7 @@ static void perf_event__stat_config_swap(union perf_event *event,
{ {
u64 size; u64 size;
size = event->stat_config.nr * sizeof(event->stat_config.data[0]); size = bswap_64(event->stat_config.nr) * sizeof(event->stat_config.data[0]);
size += 1; /* nr item itself */ size += 1; /* nr item itself */
mem_bswap_64(&event->stat_config.nr, size); mem_bswap_64(&event->stat_config.nr, size);
} }

View file

@ -54,9 +54,9 @@ idt_handlers:
.align 8 .align 8
/* Fetch current address and append it to idt_handlers. */ /* Fetch current address and append it to idt_handlers. */
current_handler = . 666 :
.pushsection .rodata .pushsection .rodata
.quad current_handler .quad 666b
.popsection .popsection
.if ! \has_error .if ! \has_error

View file

@ -18,6 +18,28 @@
#include "vmx.h" #include "vmx.h"
#define VCPU_ID 5 #define VCPU_ID 5
#define NMI_VECTOR 2
static int ud_count;
void enable_x2apic(void)
{
uint32_t spiv_reg = APIC_BASE_MSR + (APIC_SPIV >> 4);
wrmsr(MSR_IA32_APICBASE, rdmsr(MSR_IA32_APICBASE) |
MSR_IA32_APICBASE_ENABLE | MSR_IA32_APICBASE_EXTD);
wrmsr(spiv_reg, rdmsr(spiv_reg) | APIC_SPIV_APIC_ENABLED);
}
static void guest_ud_handler(struct ex_regs *regs)
{
ud_count++;
regs->rip += 3; /* VMLAUNCH */
}
static void guest_nmi_handler(struct ex_regs *regs)
{
}
void l2_guest_code(void) void l2_guest_code(void)
{ {
@ -25,15 +47,23 @@ void l2_guest_code(void)
GUEST_SYNC(8); GUEST_SYNC(8);
/* Forced exit to L1 upon restore */
GUEST_SYNC(9);
/* Done, exit to L1 and never come back. */ /* Done, exit to L1 and never come back. */
vmcall(); vmcall();
} }
void l1_guest_code(struct vmx_pages *vmx_pages) void guest_code(struct vmx_pages *vmx_pages)
{ {
#define L2_GUEST_STACK_SIZE 64 #define L2_GUEST_STACK_SIZE 64
unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE]; unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE];
enable_x2apic();
GUEST_SYNC(1);
GUEST_SYNC(2);
enable_vp_assist(vmx_pages->vp_assist_gpa, vmx_pages->vp_assist); enable_vp_assist(vmx_pages->vp_assist_gpa, vmx_pages->vp_assist);
GUEST_ASSERT(vmx_pages->vmcs_gpa); GUEST_ASSERT(vmx_pages->vmcs_gpa);
@ -55,27 +85,40 @@ void l1_guest_code(struct vmx_pages *vmx_pages)
current_evmcs->revision_id = EVMCS_VERSION; current_evmcs->revision_id = EVMCS_VERSION;
GUEST_SYNC(6); GUEST_SYNC(6);
current_evmcs->pin_based_vm_exec_control |=
PIN_BASED_NMI_EXITING;
GUEST_ASSERT(!vmlaunch()); GUEST_ASSERT(!vmlaunch());
GUEST_ASSERT(vmptrstz() == vmx_pages->enlightened_vmcs_gpa); GUEST_ASSERT(vmptrstz() == vmx_pages->enlightened_vmcs_gpa);
GUEST_SYNC(9);
/*
* NMI forces an L2->L1 exit; we then resume L2 and hope that the eVMCS is
* up-to-date (RIP points where it should, not at the beginning of
* l2_guest_code()). GUEST_SYNC(9) checks that.
*/
GUEST_ASSERT(!vmresume()); GUEST_ASSERT(!vmresume());
GUEST_ASSERT(vmreadz(VM_EXIT_REASON) == EXIT_REASON_VMCALL);
GUEST_SYNC(10); GUEST_SYNC(10);
}
void guest_code(struct vmx_pages *vmx_pages) GUEST_ASSERT(vmreadz(VM_EXIT_REASON) == EXIT_REASON_VMCALL);
{ GUEST_SYNC(11);
GUEST_SYNC(1);
GUEST_SYNC(2);
if (vmx_pages)
l1_guest_code(vmx_pages);
GUEST_DONE();
/* Try enlightened vmptrld with an incorrect GPA */ /* Try enlightened vmptrld with an incorrect GPA */
evmcs_vmptrld(0xdeadbeef, vmx_pages->enlightened_vmcs); evmcs_vmptrld(0xdeadbeef, vmx_pages->enlightened_vmcs);
GUEST_ASSERT(vmlaunch()); GUEST_ASSERT(vmlaunch());
GUEST_ASSERT(ud_count == 1);
GUEST_DONE();
}
void inject_nmi(struct kvm_vm *vm)
{
struct kvm_vcpu_events events;
vcpu_events_get(vm, VCPU_ID, &events);
events.nmi.pending = 1;
events.flags |= KVM_VCPUEVENT_VALID_NMI_PENDING;
vcpu_events_set(vm, VCPU_ID, &events);
} }
int main(int argc, char *argv[]) int main(int argc, char *argv[])
@ -109,6 +152,13 @@ int main(int argc, char *argv[])
vcpu_alloc_vmx(vm, &vmx_pages_gva); vcpu_alloc_vmx(vm, &vmx_pages_gva);
vcpu_args_set(vm, VCPU_ID, 1, vmx_pages_gva); vcpu_args_set(vm, VCPU_ID, 1, vmx_pages_gva);
vm_init_descriptor_tables(vm);
vcpu_init_descriptor_tables(vm, VCPU_ID);
vm_handle_exception(vm, UD_VECTOR, guest_ud_handler);
vm_handle_exception(vm, NMI_VECTOR, guest_nmi_handler);
pr_info("Running L1 which uses EVMCS to run L2\n");
for (stage = 1;; stage++) { for (stage = 1;; stage++) {
_vcpu_run(vm, VCPU_ID); _vcpu_run(vm, VCPU_ID);
TEST_ASSERT(run->exit_reason == KVM_EXIT_IO, TEST_ASSERT(run->exit_reason == KVM_EXIT_IO,
@ -124,7 +174,7 @@ int main(int argc, char *argv[])
case UCALL_SYNC: case UCALL_SYNC:
break; break;
case UCALL_DONE: case UCALL_DONE:
goto part1_done; goto done;
default: default:
TEST_FAIL("Unknown ucall %lu", uc.cmd); TEST_FAIL("Unknown ucall %lu", uc.cmd);
} }
@ -154,12 +204,14 @@ int main(int argc, char *argv[])
TEST_ASSERT(!memcmp(&regs1, &regs2, sizeof(regs2)), TEST_ASSERT(!memcmp(&regs1, &regs2, sizeof(regs2)),
"Unexpected register values after vcpu_load_state; rdi: %lx rsi: %lx", "Unexpected register values after vcpu_load_state; rdi: %lx rsi: %lx",
(ulong) regs2.rdi, (ulong) regs2.rsi); (ulong) regs2.rdi, (ulong) regs2.rsi);
/* Force immediate L2->L1 exit before resuming */
if (stage == 8) {
pr_info("Injecting NMI into L1 before L2 had a chance to run after restore\n");
inject_nmi(vm);
}
} }
part1_done: done:
_vcpu_run(vm, VCPU_ID);
TEST_ASSERT(run->exit_reason == KVM_EXIT_SHUTDOWN,
"Unexpected successful VMEnter with invalid eVMCS pointer!");
kvm_vm_free(vm); kvm_vm_free(vm);
} }

View file

@ -2893,8 +2893,8 @@ static void grow_halt_poll_ns(struct kvm_vcpu *vcpu)
if (val < grow_start) if (val < grow_start)
val = grow_start; val = grow_start;
if (val > halt_poll_ns) if (val > vcpu->kvm->max_halt_poll_ns)
val = halt_poll_ns; val = vcpu->kvm->max_halt_poll_ns;
vcpu->halt_poll_ns = val; vcpu->halt_poll_ns = val;
out: out:
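grow_halt_poll_ns() now clamps against the per-VM limit rather than the module parameter. A hedged sketch of how userspace sets that per-VM ceiling, assuming vm_fd came from KVM_CREATE_VM and that KVM_CAP_HALT_POLL is reported as available; the helper name is a placeholder.

    #include <linux/kvm.h>
    #include <sys/ioctl.h>

    static int set_max_halt_poll(int vm_fd, __u64 max_ns)
    {
    	struct kvm_enable_cap cap = {
    		.cap  = KVM_CAP_HALT_POLL,
    		.args = { max_ns },	/* new per-VM max_halt_poll_ns */
    	};

    	return ioctl(vm_fd, KVM_ENABLE_CAP, &cap);
    }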
@ -2973,7 +2973,8 @@ void kvm_vcpu_block(struct kvm_vcpu *vcpu)
goto out; goto out;
} }
poll_end = cur = ktime_get(); poll_end = cur = ktime_get();
} while (single_task_running() && ktime_before(cur, stop)); } while (single_task_running() && !need_resched() &&
ktime_before(cur, stop));
} }
prepare_to_rcuwait(&vcpu->wait); prepare_to_rcuwait(&vcpu->wait);