1
0
Fork 0
mirror of synced 2025-03-06 20:59:54 +01:00
linux/arch/x86/coco/tdx/tdx.c
Kirill A. Shutemov ae87f609cd x86/tdx: Add MSR support for TDX guests
Use hypercall to emulate MSR read/write for the TDX platform.

There are two viable approaches for doing MSRs in a TD guest:

1. Execute the RDMSR/WRMSR instructions like most VMs and bare metal
   do. Some will succeed, others will cause a #VE. All of those that
   cause a #VE will be handled with a TDCALL.
2. Use paravirt infrastructure.  The paravirt hook has to keep a list
   of which MSRs would cause a #VE and use a TDCALL.  All other MSRs
   execute RDMSR/WRMSR instructions directly.

The second option can be ruled out because the list of MSRs was
challenging to maintain. That leaves option #1 as the only viable
solution for the minimal TDX support.

Kernel relies on the exception fixup machinery to handle MSR access
errors. #VE handler uses the same exception fixup code as #GP. It
covers MSR accesses along with other types of fixups.

For performance-critical MSR writes (like TSC_DEADLINE), future patches
will replace the WRMSR/#VE sequence with the direct TDCALL.

RDMSR and WRMSR specification details can be found in
Guest-Host-Communication Interface (GHCI) for Intel Trust Domain
Extensions (Intel TDX) specification, sec titled "TDG.VP.
VMCALL<Instruction.RDMSR>" and "TDG.VP.VMCALL<Instruction.WRMSR>".

Co-developed-by: Kuppuswamy Sathyanarayanan <sathyanarayanan.kuppuswamy@linux.intel.com>
Signed-off-by: Kuppuswamy Sathyanarayanan <sathyanarayanan.kuppuswamy@linux.intel.com>
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com>
Reviewed-by: Andi Kleen <ak@linux.intel.com>
Reviewed-by: Tony Luck <tony.luck@intel.com>
Reviewed-by: Dave Hansen <dave.hansen@linux.intel.com>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Link: https://lkml.kernel.org/r/20220405232939.73860-10-kirill.shutemov@linux.intel.com
2022-04-07 08:27:51 -07:00

270 lines
6.9 KiB
C

// SPDX-License-Identifier: GPL-2.0
/* Copyright (C) 2021-2022 Intel Corporation */
#undef pr_fmt
#define pr_fmt(fmt) "tdx: " fmt
#include <linux/cpufeature.h>
#include <asm/coco.h>
#include <asm/tdx.h>
#include <asm/vmx.h>
/* TDX module Call Leaf IDs */
#define TDX_GET_INFO 1
#define TDX_GET_VEINFO 3
/*
* Wrapper for standard use of __tdx_hypercall with no output aside from
* return code.
*/
static inline u64 _tdx_hypercall(u64 fn, u64 r12, u64 r13, u64 r14, u64 r15)
{
struct tdx_hypercall_args args = {
.r10 = TDX_HYPERCALL_STANDARD,
.r11 = fn,
.r12 = r12,
.r13 = r13,
.r14 = r14,
.r15 = r15,
};
return __tdx_hypercall(&args, 0);
}
/* Called from __tdx_hypercall() for unrecoverable failure */
void __tdx_hypercall_failed(void)
{
panic("TDVMCALL failed. TDX module bug?");
}
/*
* The TDG.VP.VMCALL-Instruction-execution sub-functions are defined
* independently from but are currently matched 1:1 with VMX EXIT_REASONs.
* Reusing the KVM EXIT_REASON macros makes it easier to connect the host and
* guest sides of these calls.
*/
static u64 hcall_func(u64 exit_reason)
{
return exit_reason;
}
/*
* Used for TDX guests to make calls directly to the TD module. This
* should only be used for calls that have no legitimate reason to fail
* or where the kernel can not survive the call failing.
*/
static inline void tdx_module_call(u64 fn, u64 rcx, u64 rdx, u64 r8, u64 r9,
struct tdx_module_output *out)
{
if (__tdx_module_call(fn, rcx, rdx, r8, r9, out))
panic("TDCALL %lld failed (Buggy TDX module!)\n", fn);
}
static u64 get_cc_mask(void)
{
struct tdx_module_output out;
unsigned int gpa_width;
/*
* TDINFO TDX module call is used to get the TD execution environment
* information like GPA width, number of available vcpus, debug mode
* information, etc. More details about the ABI can be found in TDX
* Guest-Host-Communication Interface (GHCI), section 2.4.2 TDCALL
* [TDG.VP.INFO].
*
* The GPA width that comes out of this call is critical. TDX guests
* can not meaningfully run without it.
*/
tdx_module_call(TDX_GET_INFO, 0, 0, 0, 0, &out);
gpa_width = out.rcx & GENMASK(5, 0);
/*
* The highest bit of a guest physical address is the "sharing" bit.
* Set it for shared pages and clear it for private pages.
*/
return BIT_ULL(gpa_width - 1);
}
static u64 __cpuidle __halt(const bool irq_disabled, const bool do_sti)
{
struct tdx_hypercall_args args = {
.r10 = TDX_HYPERCALL_STANDARD,
.r11 = hcall_func(EXIT_REASON_HLT),
.r12 = irq_disabled,
};
/*
* Emulate HLT operation via hypercall. More info about ABI
* can be found in TDX Guest-Host-Communication Interface
* (GHCI), section 3.8 TDG.VP.VMCALL<Instruction.HLT>.
*
* The VMM uses the "IRQ disabled" param to understand IRQ
* enabled status (RFLAGS.IF) of the TD guest and to determine
* whether or not it should schedule the halted vCPU if an
* IRQ becomes pending. E.g. if IRQs are disabled, the VMM
* can keep the vCPU in virtual HLT, even if an IRQ is
* pending, without hanging/breaking the guest.
*/
return __tdx_hypercall(&args, do_sti ? TDX_HCALL_ISSUE_STI : 0);
}
static bool handle_halt(void)
{
/*
* Since non safe halt is mainly used in CPU offlining
* and the guest will always stay in the halt state, don't
* call the STI instruction (set do_sti as false).
*/
const bool irq_disabled = irqs_disabled();
const bool do_sti = false;
if (__halt(irq_disabled, do_sti))
return false;
return true;
}
void __cpuidle tdx_safe_halt(void)
{
/*
* For do_sti=true case, __tdx_hypercall() function enables
* interrupts using the STI instruction before the TDCALL. So
* set irq_disabled as false.
*/
const bool irq_disabled = false;
const bool do_sti = true;
/*
* Use WARN_ONCE() to report the failure.
*/
if (__halt(irq_disabled, do_sti))
WARN_ONCE(1, "HLT instruction emulation failed\n");
}
static bool read_msr(struct pt_regs *regs)
{
struct tdx_hypercall_args args = {
.r10 = TDX_HYPERCALL_STANDARD,
.r11 = hcall_func(EXIT_REASON_MSR_READ),
.r12 = regs->cx,
};
/*
* Emulate the MSR read via hypercall. More info about ABI
* can be found in TDX Guest-Host-Communication Interface
* (GHCI), section titled "TDG.VP.VMCALL<Instruction.RDMSR>".
*/
if (__tdx_hypercall(&args, TDX_HCALL_HAS_OUTPUT))
return false;
regs->ax = lower_32_bits(args.r11);
regs->dx = upper_32_bits(args.r11);
return true;
}
static bool write_msr(struct pt_regs *regs)
{
struct tdx_hypercall_args args = {
.r10 = TDX_HYPERCALL_STANDARD,
.r11 = hcall_func(EXIT_REASON_MSR_WRITE),
.r12 = regs->cx,
.r13 = (u64)regs->dx << 32 | regs->ax,
};
/*
* Emulate the MSR write via hypercall. More info about ABI
* can be found in TDX Guest-Host-Communication Interface
* (GHCI) section titled "TDG.VP.VMCALL<Instruction.WRMSR>".
*/
return !__tdx_hypercall(&args, 0);
}
void tdx_get_ve_info(struct ve_info *ve)
{
struct tdx_module_output out;
/*
* Called during #VE handling to retrieve the #VE info from the
* TDX module.
*
* This has to be called early in #VE handling. A "nested" #VE which
* occurs before this will raise a #DF and is not recoverable.
*
* The call retrieves the #VE info from the TDX module, which also
* clears the "#VE valid" flag. This must be done before anything else
* because any #VE that occurs while the valid flag is set will lead to
* #DF.
*
* Note, the TDX module treats virtual NMIs as inhibited if the #VE
* valid flag is set. It means that NMI=>#VE will not result in a #DF.
*/
tdx_module_call(TDX_GET_VEINFO, 0, 0, 0, 0, &out);
/* Transfer the output parameters */
ve->exit_reason = out.rcx;
ve->exit_qual = out.rdx;
ve->gla = out.r8;
ve->gpa = out.r9;
ve->instr_len = lower_32_bits(out.r10);
ve->instr_info = upper_32_bits(out.r10);
}
/* Handle the kernel #VE */
static bool virt_exception_kernel(struct pt_regs *regs, struct ve_info *ve)
{
switch (ve->exit_reason) {
case EXIT_REASON_HLT:
return handle_halt();
case EXIT_REASON_MSR_READ:
return read_msr(regs);
case EXIT_REASON_MSR_WRITE:
return write_msr(regs);
default:
pr_warn("Unexpected #VE: %lld\n", ve->exit_reason);
return false;
}
}
bool tdx_handle_virt_exception(struct pt_regs *regs, struct ve_info *ve)
{
bool ret;
if (user_mode(regs))
ret = false;
else
ret = virt_exception_kernel(regs, ve);
/* After successful #VE handling, move the IP */
if (ret)
regs->ip += ve->instr_len;
return ret;
}
void __init tdx_early_init(void)
{
u64 cc_mask;
u32 eax, sig[3];
cpuid_count(TDX_CPUID_LEAF_ID, 0, &eax, &sig[0], &sig[2], &sig[1]);
if (memcmp(TDX_IDENT, sig, sizeof(sig)))
return;
setup_force_cpu_cap(X86_FEATURE_TDX_GUEST);
cc_set_vendor(CC_VENDOR_INTEL);
cc_mask = get_cc_mask();
cc_set_mask(cc_mask);
/*
* All bits above GPA width are reserved and kernel treats shared bit
* as flag, not as part of physical address.
*
* Adjust physical mask to only cover valid GPA bits.
*/
physical_mask &= cc_mask - 1;
pr_info("Guest detected\n");
}