linux/arch/x86/kvm/mmu/tdp_iter.h
Isaku Yamahata 3fc3f71851 KVM: x86/mmu: Support GFN direct bits
Teach the MMU to map guest GFNs at a massaged position on the TDP, to aid
in implementing TDX shared memory.

Like other CoCo technologies, TDX has the concept of private and shared
memory. For TDX the private and shared mappings are managed on separate
EPT roots. The private half is managed indirectly through calls into a
protected runtime environment called the TDX module, while the shared half
is managed within KVM in normal page tables.

For TDX, the shared half will be mapped in the higher alias, with a "shared
bit" set in the GPA. However, KVM will still manage it with the same
memslots as the private half. This means memslot lookups and zapping
operations will be provided with a GFN without the shared bit set.

So KVM will need to either apply or strip the shared bit before mapping or
zapping the shared EPT. If GFNs sometimes carried the shared bit and
sometimes did not, the code would become confusing.

So instead arrange the code such that GFNs never have the shared bit set.
Create a concept of "direct bits", which are stripped from the fault
address when setting fault->gfn, and applied within the TDP MMU iterator.
Calling code will behave as if it is operating on the PTE mapping the GFN
(without shared bits), but within the iterator the actual mappings will be
shifted using bits specific to the root. SPs will have the GFN set
without the shared bit. In the end the TDP MMU will behave as if it is
mapping things at the GFN without the shared bit, but with a strange page
table format where everything is offset by the shared bit.
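
To make the scheme concrete, a condensed sketch (not the literal diff; the
helper names follow the series, but the surrounding code is abbreviated):

  /* Fault path: strip the direct bits so callers never see them. */
  fault->gfn = gpa_to_gfn(fault->addr) & ~kvm_gfn_direct_bits(vcpu->kvm);

  /* Iterator: re-apply the root's bits when indexing into the page table. */
  iter->sptep = iter->pt_path[iter->level - 1] +
                SPTE_INDEX((iter->gfn << PAGE_SHIFT) | iter->gfn_bits, iter->level);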

Since TDX only needs to shift the mapping like this for the shared half,
which is mapped via the normal TDP root, add a "gfn_direct_bits" field to
the kvm_arch structure for each VM, with a default value of 0. TD-specific
initialization code will set the bit at the GFN position of the GPA shared
bit. Keep TDX-specific concepts out of the MMU code by not naming it
"shared".
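
A minimal sketch of the field and its users, assuming a TD-specific init path
(the "shared_bit" position below is a stand-in; the real TDX initialization
arrives in later patches):

  /* In struct kvm_arch: 0 for normal VMs. */
  gfn_t gfn_direct_bits;

  static inline gfn_t kvm_gfn_direct_bits(struct kvm *kvm)
  {
          return kvm->arch.gfn_direct_bits;
  }

  /* TD-specific init (illustrative): express the GPA shared bit as a GFN bit. */
  kvm->arch.gfn_direct_bits = gpa_to_gfn(BIT_ULL(shared_bit));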

Ranged TLB flushes (i.e. flush_remote_tlbs_range()) target specific GFN
ranges. In the convention established above, these would need to target the
shifted GFN range. It won't matter functionally, since the actual
implementation will always result in a full flush for the only planned
user (TDX). For correctness reasons, future changes can provide a TDX
x86_ops.flush_remote_tlbs_range implementation to return -EOPNOTSUPP and
force the full flush for TDs.
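
Such a hook could look roughly like this (a sketch only; the function name and
the is_td() check stand in for whatever the TDX code actually uses):

  static int vt_flush_remote_tlbs_range(struct kvm *kvm, gfn_t gfn,
                                        gfn_t nr_pages)
  {
          /*
           * A ranged flush would have to target the shifted GFN range for TDs.
           * Returning -EOPNOTSUPP makes the common code fall back to a full
           * remote TLB flush, which is always correct.
           */
          if (is_td(kvm))
                  return -EOPNOTSUPP;

          /* No ranged-flush support for non-TD VMs here either. */
          return -EOPNOTSUPP;
  }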

This leaves one problem. Some operations use a concept of max GFN (i.e.
kvm_mmu_max_gfn()) to iterate over the whole TDP range. If the direct mask
were applied to the start of such a range, the iterator would skip the part
of the range not covered by the direct mask bit. For safety,
make sure the __tdp_mmu_zap_root() operation iterates over the full GFN
range supported by the underlying TDP format. Add a new iterator helper,
for_each_tdp_pte_min_level_all(), that iterates the entire TDP GFN range,
regardless of root.
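
Condensed, the zap path then looks roughly like this (yield handling and the
actual SPTE writes are elided):

  static void __tdp_mmu_zap_root(struct kvm *kvm, struct kvm_mmu_page *root,
                                 bool shared, int zap_level)
  {
          struct tdp_iter iter;

          /* Walk the whole TDP GFN range, independent of the root's gfn_bits. */
          for_each_tdp_pte_min_level_all(iter, root, zap_level) {
                  /* ... tdp_mmu_iter_cond_resched() and SPTE zapping ... */
          }
  }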

Signed-off-by: Isaku Yamahata <isaku.yamahata@intel.com>
Co-developed-by: Yan Zhao <yan.y.zhao@intel.com>
Signed-off-by: Yan Zhao <yan.y.zhao@intel.com>
Co-developed-by: Rick Edgecombe <rick.p.edgecombe@intel.com>
Signed-off-by: Rick Edgecombe <rick.p.edgecombe@intel.com>
Message-ID: <20240718211230.1492011-9-rick.p.edgecombe@intel.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
2024-12-23 08:31:54 -05:00


// SPDX-License-Identifier: GPL-2.0

#ifndef __KVM_X86_MMU_TDP_ITER_H
#define __KVM_X86_MMU_TDP_ITER_H

#include <linux/kvm_host.h>

#include "mmu.h"
#include "spte.h"

/*
 * TDP MMU SPTEs are RCU protected to allow paging structures (non-leaf SPTEs)
 * to be zapped while holding mmu_lock for read, and to allow TLB flushes to be
 * batched without having to collect the list of zapped SPs. Flows that can
 * remove SPs must service pending TLB flushes prior to dropping RCU protection.
 */
static inline u64 kvm_tdp_mmu_read_spte(tdp_ptep_t sptep)
{
	return READ_ONCE(*rcu_dereference(sptep));
}

static inline u64 kvm_tdp_mmu_write_spte_atomic(tdp_ptep_t sptep, u64 new_spte)
{
	KVM_MMU_WARN_ON(is_ept_ve_possible(new_spte));
	return xchg(rcu_dereference(sptep), new_spte);
}

static inline void __kvm_tdp_mmu_write_spte(tdp_ptep_t sptep, u64 new_spte)
{
	KVM_MMU_WARN_ON(is_ept_ve_possible(new_spte));
	WRITE_ONCE(*rcu_dereference(sptep), new_spte);
}
/*
 * SPTEs must be modified atomically if they are shadow-present, leaf
 * SPTEs, and have volatile bits, i.e. has bits that can be set outside
 * of mmu_lock. The Writable bit can be set by KVM's fast page fault
 * handler, and Accessed and Dirty bits can be set by the CPU.
 *
 * Note, non-leaf SPTEs do have Accessed bits and those bits are
 * technically volatile, but KVM doesn't consume the Accessed bit of
 * non-leaf SPTEs, i.e. KVM doesn't care if it clobbers the bit. This
 * logic needs to be reassessed if KVM were to use non-leaf Accessed
 * bits, e.g. to skip stepping down into child SPTEs when aging SPTEs.
 */
static inline bool kvm_tdp_mmu_spte_need_atomic_write(u64 old_spte, int level)
{
	return is_shadow_present_pte(old_spte) &&
	       is_last_spte(old_spte, level) &&
	       spte_has_volatile_bits(old_spte);
}

static inline u64 kvm_tdp_mmu_write_spte(tdp_ptep_t sptep, u64 old_spte,
					 u64 new_spte, int level)
{
	if (kvm_tdp_mmu_spte_need_atomic_write(old_spte, level))
		return kvm_tdp_mmu_write_spte_atomic(sptep, new_spte);

	__kvm_tdp_mmu_write_spte(sptep, new_spte);
	return old_spte;
}

static inline u64 tdp_mmu_clear_spte_bits(tdp_ptep_t sptep, u64 old_spte,
					  u64 mask, int level)
{
	atomic64_t *sptep_atomic;

	if (kvm_tdp_mmu_spte_need_atomic_write(old_spte, level)) {
		sptep_atomic = (atomic64_t *)rcu_dereference(sptep);
		return (u64)atomic64_fetch_and(~mask, sptep_atomic);
	}

	__kvm_tdp_mmu_write_spte(sptep, old_spte & ~mask);
	return old_spte;
}
/*
 * A TDP iterator performs a pre-order walk over a TDP paging structure.
 */
struct tdp_iter {
	/*
	 * The iterator will traverse the paging structure towards the mapping
	 * for this GFN.
	 */
	gfn_t next_last_level_gfn;
	/*
	 * The next_last_level_gfn at the time when the thread last
	 * yielded. Only yielding when the next_last_level_gfn !=
	 * yielded_gfn helps ensure forward progress.
	 */
	gfn_t yielded_gfn;
	/* Pointers to the page tables traversed to reach the current SPTE */
	tdp_ptep_t pt_path[PT64_ROOT_MAX_LEVEL];
	/* A pointer to the current SPTE */
	tdp_ptep_t sptep;
	/* The lowest GFN (mask bits excluded) mapped by the current SPTE */
	gfn_t gfn;
	/* Mask applied to convert the GFN to the mapping GPA */
	gfn_t gfn_bits;
	/* The level of the root page given to the iterator */
	int root_level;
	/* The lowest level the iterator should traverse to */
	int min_level;
	/* The iterator's current level within the paging structure */
	int level;
	/* The address space ID, i.e. SMM vs. regular. */
	int as_id;
	/* A snapshot of the value at sptep */
	u64 old_spte;
	/*
	 * Whether the iterator has a valid state. This will be false if the
	 * iterator walks off the end of the paging structure.
	 */
	bool valid;
	/*
	 * True if KVM dropped mmu_lock and yielded in the middle of a walk, in
	 * which case tdp_iter_next() needs to restart the walk at the root
	 * level instead of advancing to the next entry.
	 */
	bool yielded;
};
/*
 * Iterates over every SPTE mapping the GFN range [start, end) in a
 * preorder traversal.
 */
#define for_each_tdp_pte_min_level(iter, kvm, root, min_level, start, end)		   \
	for (tdp_iter_start(&iter, root, min_level, start, kvm_gfn_root_bits(kvm, root)); \
	     iter.valid && iter.gfn < end;						   \
	     tdp_iter_next(&iter))

#define for_each_tdp_pte_min_level_all(iter, root, min_level)		\
	for (tdp_iter_start(&iter, root, min_level, 0, 0);		\
	     iter.valid && iter.gfn < tdp_mmu_max_gfn_exclusive();	\
	     tdp_iter_next(&iter))

#define for_each_tdp_pte(iter, kvm, root, start, end)			\
	for_each_tdp_pte_min_level(iter, kvm, root, PG_LEVEL_4K, start, end)

tdp_ptep_t spte_to_child_pt(u64 pte, int level);

void tdp_iter_start(struct tdp_iter *iter, struct kvm_mmu_page *root,
		    int min_level, gfn_t next_last_level_gfn, gfn_t gfn_bits);
void tdp_iter_next(struct tdp_iter *iter);
void tdp_iter_restart(struct tdp_iter *iter);

#endif /* __KVM_X86_MMU_TDP_ITER_H */