Use hypercall to emulate MSR read/write for the TDX platform.

There are two viable approaches for handling MSRs in a TD guest:

1. Execute the RDMSR/WRMSR instructions like most VMs and bare metal do. Some will succeed, others will cause a #VE. All of those that cause a #VE will be handled with a TDCALL.

2. Use paravirt infrastructure. The paravirt hook has to keep a list of which MSRs would cause a #VE and use a TDCALL. All other MSRs execute RDMSR/WRMSR instructions directly.

The second option can be ruled out because the list of MSRs was challenging to maintain. That leaves option #1 as the only viable solution for the minimal TDX support.

The kernel relies on the exception fixup machinery to handle MSR access errors. The #VE handler uses the same exception fixup code as #GP; it covers MSR accesses along with other types of fixups.

For performance-critical MSR writes (like TSC_DEADLINE), future patches will replace the WRMSR/#VE sequence with a direct TDCALL.

RDMSR and WRMSR specification details can be found in the Guest-Host-Communication Interface (GHCI) for Intel Trust Domain Extensions (Intel TDX) specification, in the sections titled "TDG.VP.VMCALL<Instruction.RDMSR>" and "TDG.VP.VMCALL<Instruction.WRMSR>".

Co-developed-by: Kuppuswamy Sathyanarayanan <sathyanarayanan.kuppuswamy@linux.intel.com>
Signed-off-by: Kuppuswamy Sathyanarayanan <sathyanarayanan.kuppuswamy@linux.intel.com>
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com>
Reviewed-by: Andi Kleen <ak@linux.intel.com>
Reviewed-by: Tony Luck <tony.luck@intel.com>
Reviewed-by: Dave Hansen <dave.hansen@linux.intel.com>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Link: https://lkml.kernel.org/r/20220405232939.73860-10-kirill.shutemov@linux.intel.com
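As a rough illustration of what approach #1 means for callers (a minimal sketch, not part of this patch; the helper name and the choice of MSR are arbitrary), a TD guest keeps using the ordinary *_safe() MSR accessors. If the RDMSR raises a #VE and the VMM also rejects the emulating hypercall, the #VE handler falls back to the same exception fixup used for #GP and the accessor simply returns an error:

#include <linux/printk.h>
#include <asm/msr.h>

/* Hypothetical example, not part of this file; the MSR choice is arbitrary. */
static void example_msr_probe(void)
{
        u64 val;

        /*
         * In a TD guest this RDMSR may raise #VE.  read_msr() then emulates
         * it via TDG.VP.VMCALL<Instruction.RDMSR>.  If the VMM refuses the
         * access, the #VE handler reports failure and the #GP-style fixup
         * makes rdmsrl_safe() return a non-zero error instead of crashing.
         */
        if (rdmsrl_safe(MSR_IA32_TSC_DEADLINE, &val))
                pr_warn("TSC_DEADLINE read rejected by the VMM\n");
}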
// SPDX-License-Identifier: GPL-2.0
/* Copyright (C) 2021-2022 Intel Corporation */

#undef pr_fmt
#define pr_fmt(fmt) "tdx: " fmt

#include <linux/cpufeature.h>
#include <asm/coco.h>
#include <asm/tdx.h>
#include <asm/vmx.h>

/* TDX module Call Leaf IDs */
#define TDX_GET_INFO 1
#define TDX_GET_VEINFO 3

/*
 * Wrapper for standard use of __tdx_hypercall with no output aside from
 * return code.
 */
static inline u64 _tdx_hypercall(u64 fn, u64 r12, u64 r13, u64 r14, u64 r15)
{
        struct tdx_hypercall_args args = {
                .r10 = TDX_HYPERCALL_STANDARD,
                .r11 = fn,
                .r12 = r12,
                .r13 = r13,
                .r14 = r14,
                .r15 = r15,
        };

        return __tdx_hypercall(&args, 0);
}

/* Called from __tdx_hypercall() for unrecoverable failure */
void __tdx_hypercall_failed(void)
{
        panic("TDVMCALL failed. TDX module bug?");
}

/*
 * The TDG.VP.VMCALL-Instruction-execution sub-functions are defined
 * independently from but are currently matched 1:1 with VMX EXIT_REASONs.
 * Reusing the KVM EXIT_REASON macros makes it easier to connect the host and
 * guest sides of these calls.
 */
static u64 hcall_func(u64 exit_reason)
{
        return exit_reason;
}

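/*
 * Illustration (values from <asm/vmx.h>): EXIT_REASON_HLT is 12 and
 * EXIT_REASON_MSR_READ is 31.  The same numbers are passed in R11 as the
 * TDG.VP.VMCALL sub-function, which is what makes the 1:1 mapping between
 * EXIT_REASONs and hypercall sub-functions work.
 */
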
/*
 * Used for TDX guests to make calls directly to the TD module. This
 * should only be used for calls that have no legitimate reason to fail
 * or where the kernel can not survive the call failing.
 */
static inline void tdx_module_call(u64 fn, u64 rcx, u64 rdx, u64 r8, u64 r9,
                                   struct tdx_module_output *out)
{
        if (__tdx_module_call(fn, rcx, rdx, r8, r9, out))
                panic("TDCALL %lld failed (Buggy TDX module!)\n", fn);
}

static u64 get_cc_mask(void)
{
        struct tdx_module_output out;
        unsigned int gpa_width;

        /*
         * TDINFO TDX module call is used to get the TD execution environment
         * information like GPA width, number of available vcpus, debug mode
         * information, etc. More details about the ABI can be found in TDX
         * Guest-Host-Communication Interface (GHCI), section 2.4.2 TDCALL
         * [TDG.VP.INFO].
         *
         * The GPA width that comes out of this call is critical. TDX guests
         * can not meaningfully run without it.
         */
        tdx_module_call(TDX_GET_INFO, 0, 0, 0, 0, &out);

        gpa_width = out.rcx & GENMASK(5, 0);

        /*
         * The highest bit of a guest physical address is the "sharing" bit.
         * Set it for shared pages and clear it for private pages.
         */
        return BIT_ULL(gpa_width - 1);
}

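/*
 * Worked example (illustrative): if TDG.VP.INFO reports a GPA width of 52,
 * get_cc_mask() returns BIT_ULL(51), i.e. bit 51 is the shared/private bit.
 * tdx_early_init() then masks physical_mask with (cc_mask - 1), leaving
 * bits 0-50 as usable physical address bits.
 */
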
static u64 __cpuidle __halt(const bool irq_disabled, const bool do_sti)
{
        struct tdx_hypercall_args args = {
                .r10 = TDX_HYPERCALL_STANDARD,
                .r11 = hcall_func(EXIT_REASON_HLT),
                .r12 = irq_disabled,
        };

        /*
         * Emulate HLT operation via hypercall. More info about ABI
         * can be found in TDX Guest-Host-Communication Interface
         * (GHCI), section 3.8 TDG.VP.VMCALL<Instruction.HLT>.
         *
         * The VMM uses the "IRQ disabled" param to understand IRQ
         * enabled status (RFLAGS.IF) of the TD guest and to determine
         * whether or not it should schedule the halted vCPU if an
         * IRQ becomes pending. E.g. if IRQs are disabled, the VMM
         * can keep the vCPU in virtual HLT, even if an IRQ is
         * pending, without hanging/breaking the guest.
         */
        return __tdx_hypercall(&args, do_sti ? TDX_HCALL_ISSUE_STI : 0);
}

static bool handle_halt(void)
{
        /*
         * Since non-safe halt is mainly used in CPU offlining
         * and the guest will always stay in the halt state, don't
         * call the STI instruction (set do_sti as false).
         */
        const bool irq_disabled = irqs_disabled();
        const bool do_sti = false;

        if (__halt(irq_disabled, do_sti))
                return false;

        return true;
}

void __cpuidle tdx_safe_halt(void)
{
        /*
         * For do_sti=true case, __tdx_hypercall() function enables
         * interrupts using the STI instruction before the TDCALL. So
         * set irq_disabled as false.
         */
        const bool irq_disabled = false;
        const bool do_sti = true;

        /*
         * Use WARN_ONCE() to report the failure.
         */
        if (__halt(irq_disabled, do_sti))
                WARN_ONCE(1, "HLT instruction emulation failed\n");
}

static bool read_msr(struct pt_regs *regs)
{
        struct tdx_hypercall_args args = {
                .r10 = TDX_HYPERCALL_STANDARD,
                .r11 = hcall_func(EXIT_REASON_MSR_READ),
                .r12 = regs->cx,
        };

        /*
         * Emulate the MSR read via hypercall. More info about ABI
         * can be found in TDX Guest-Host-Communication Interface
         * (GHCI), section titled "TDG.VP.VMCALL<Instruction.RDMSR>".
         */
        if (__tdx_hypercall(&args, TDX_HCALL_HAS_OUTPUT))
                return false;

        regs->ax = lower_32_bits(args.r11);
        regs->dx = upper_32_bits(args.r11);
        return true;
}

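/*
 * Example flow (illustrative): a guest RDMSR of MSR_IA32_TSC_DEADLINE
 * (0x6e0) raises #VE.  read_msr() issues TDG.VP.VMCALL<Instruction.RDMSR>
 * with the MSR index in R12; on success the VMM returns the 64-bit value
 * in R11, which is split back into EDX:EAX before the #VE handler advances
 * RIP past the faulting instruction.
 */
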
static bool write_msr(struct pt_regs *regs)
{
        struct tdx_hypercall_args args = {
                .r10 = TDX_HYPERCALL_STANDARD,
                .r11 = hcall_func(EXIT_REASON_MSR_WRITE),
                .r12 = regs->cx,
                .r13 = (u64)regs->dx << 32 | regs->ax,
        };

        /*
         * Emulate the MSR write via hypercall. More info about ABI
         * can be found in TDX Guest-Host-Communication Interface
         * (GHCI) section titled "TDG.VP.VMCALL<Instruction.WRMSR>".
         */
        return !__tdx_hypercall(&args, 0);
}

void tdx_get_ve_info(struct ve_info *ve)
{
        struct tdx_module_output out;

        /*
         * Called during #VE handling to retrieve the #VE info from the
         * TDX module.
         *
         * This has to be called early in #VE handling. A "nested" #VE which
         * occurs before this will raise a #DF and is not recoverable.
         *
         * The call retrieves the #VE info from the TDX module, which also
         * clears the "#VE valid" flag. This must be done before anything else
         * because any #VE that occurs while the valid flag is set will lead to
         * #DF.
         *
         * Note, the TDX module treats virtual NMIs as inhibited if the #VE
         * valid flag is set. It means that NMI=>#VE will not result in a #DF.
         */
        tdx_module_call(TDX_GET_VEINFO, 0, 0, 0, 0, &out);

        /* Transfer the output parameters */
        ve->exit_reason = out.rcx;
        ve->exit_qual = out.rdx;
        ve->gla = out.r8;
        ve->gpa = out.r9;
        ve->instr_len = lower_32_bits(out.r10);
        ve->instr_info = upper_32_bits(out.r10);
}

/* Handle the kernel #VE */
static bool virt_exception_kernel(struct pt_regs *regs, struct ve_info *ve)
{
        switch (ve->exit_reason) {
        case EXIT_REASON_HLT:
                return handle_halt();
        case EXIT_REASON_MSR_READ:
                return read_msr(regs);
        case EXIT_REASON_MSR_WRITE:
                return write_msr(regs);
        default:
                pr_warn("Unexpected #VE: %lld\n", ve->exit_reason);
                return false;
        }
}

bool tdx_handle_virt_exception(struct pt_regs *regs, struct ve_info *ve)
{
        bool ret;

        if (user_mode(regs))
                ret = false;
        else
                ret = virt_exception_kernel(regs, ve);

        /* After successful #VE handling, move the IP */
        if (ret)
                regs->ip += ve->instr_len;

        return ret;
}

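/*
 * Note (see the commit message above): when this returns false for a
 * kernel-mode #VE, the caller falls back to the same exception fixup
 * machinery used for #GP, so e.g. a rejected MSR access is reported as an
 * error by the *_safe() accessors rather than being fatal.
 */
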
void __init tdx_early_init(void)
{
        u64 cc_mask;
        u32 eax, sig[3];

        cpuid_count(TDX_CPUID_LEAF_ID, 0, &eax, &sig[0], &sig[2], &sig[1]);

        if (memcmp(TDX_IDENT, sig, sizeof(sig)))
                return;

        setup_force_cpu_cap(X86_FEATURE_TDX_GUEST);

        cc_set_vendor(CC_VENDOR_INTEL);
        cc_mask = get_cc_mask();
        cc_set_mask(cc_mask);

        /*
         * All bits above GPA width are reserved and kernel treats shared bit
         * as flag, not as part of physical address.
         *
         * Adjust physical mask to only cover valid GPA bits.
         */
        physical_mask &= cc_mask - 1;

        pr_info("Guest detected\n");
}