1
0
Fork 0
mirror of synced 2025-03-06 20:59:54 +01:00
linux/arch/x86/kvm/mmu/tdp_iter.c
Isaku Yamahata 3fc3f71851 KVM: x86/mmu: Support GFN direct bits
Teach the MMU to map guest GFNs at a massaged position on the TDP, to aid
in implementing TDX shared memory.

Like other Coco technologies, TDX has the concept of private and shared
memory. For TDX the private and shared mappings are managed on separate
EPT roots. The private half is managed indirectly through calls into a
protected runtime environment called the TDX module, where the shared half
is managed within KVM in normal page tables.

For TDX, the shared half will be mapped in the higher alias, with a "shared
bit" set in the GPA. However, KVM will still manage it with the same
memslots as the private half. This means memslot looks ups and zapping
operations will be provided with a GFN without the shared bit set.

So KVM will either need to apply or strip the shared bit before mapping or
zapping the shared EPT. Having GFNs sometimes have the shared bit and
sometimes not would make the code confusing.

So instead arrange the code such that GFNs never have shared bit set.
Create a concept of "direct bits", that is stripped from the fault
address when setting fault->gfn, and applied within the TDP MMU iterator.
Calling code will behave as if it is operating on the PTE mapping the GFN
(without shared bits) but within the iterator, the actual mappings will be
shifted using bits specific for the root. SPs will have the GFN set
without the shared bit. In the end the TDP MMU will behave like it is
mapping things at the GFN without the shared bit but with a strange page
table format where everything is offset by the shared bit.

Since TDX only needs to shift the mapping like this for the shared bit,
which is mapped as the normal TDP root, add a "gfn_direct_bits" field to
the kvm_arch structure for each VM with a default value of 0. It will
have the bit set at the position of the GPA shared bit in GFN through TD
specific initialization code. Keep TDX specific concepts out of the MMU
code by not naming it "shared".

Ranged TLB flushes (i.e. flush_remote_tlbs_range()) target specific GFN
ranges. In convention established above, these would need to target the
shifted GFN range. It won't matter functionally, since the actual
implementation will always result in a full flush for the only planned
user (TDX). For correctness reasons, future changes can provide a TDX
x86_ops.flush_remote_tlbs_range implementation to return -EOPNOTSUPP and
force the full flush for TDs.

This leaves one problem. Some operations use a concept of max GFN (i.e.
kvm_mmu_max_gfn()), to iterate over the whole TDP range. When applying the
direct mask to the start of the range, the iterator would end up skipping
iterating over the range not covered by the direct mask bit. For safety,
make sure the __tdp_mmu_zap_root() operation iterates over the full GFN
range supported by the underlying TDP format. Add a new iterator helper,
for_each_tdp_pte_min_level_all(), that iterates the entire TDP GFN range,
regardless of root.

Signed-off-by: Isaku Yamahata <isaku.yamahata@intel.com>
Co-developed-by: Yan Zhao <yan.y.zhao@intel.com>
Signed-off-by: Yan Zhao <yan.y.zhao@intel.com>
Co-developed-by: Rick Edgecombe <rick.p.edgecombe@intel.com>
Signed-off-by: Rick Edgecombe <rick.p.edgecombe@intel.com>
Message-ID: <20240718211230.1492011-9-rick.p.edgecombe@intel.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
2024-12-23 08:31:54 -05:00

179 lines
5 KiB
C

// SPDX-License-Identifier: GPL-2.0
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include "mmu_internal.h"
#include "tdp_iter.h"
#include "spte.h"
/*
* Recalculates the pointer to the SPTE for the current GFN and level and
* reread the SPTE.
*/
static void tdp_iter_refresh_sptep(struct tdp_iter *iter)
{
iter->sptep = iter->pt_path[iter->level - 1] +
SPTE_INDEX((iter->gfn | iter->gfn_bits) << PAGE_SHIFT, iter->level);
iter->old_spte = kvm_tdp_mmu_read_spte(iter->sptep);
}
/*
* Return the TDP iterator to the root PT and allow it to continue its
* traversal over the paging structure from there.
*/
void tdp_iter_restart(struct tdp_iter *iter)
{
iter->yielded = false;
iter->yielded_gfn = iter->next_last_level_gfn;
iter->level = iter->root_level;
iter->gfn = gfn_round_for_level(iter->next_last_level_gfn, iter->level);
tdp_iter_refresh_sptep(iter);
iter->valid = true;
}
/*
* Sets a TDP iterator to walk a pre-order traversal of the paging structure
* rooted at root_pt, starting with the walk to translate next_last_level_gfn.
*/
void tdp_iter_start(struct tdp_iter *iter, struct kvm_mmu_page *root,
int min_level, gfn_t next_last_level_gfn, gfn_t gfn_bits)
{
if (WARN_ON_ONCE(!root || (root->role.level < 1) ||
(root->role.level > PT64_ROOT_MAX_LEVEL) ||
(gfn_bits && next_last_level_gfn >= gfn_bits))) {
iter->valid = false;
return;
}
iter->next_last_level_gfn = next_last_level_gfn;
iter->gfn_bits = gfn_bits;
iter->root_level = root->role.level;
iter->min_level = min_level;
iter->pt_path[iter->root_level - 1] = (tdp_ptep_t)root->spt;
iter->as_id = kvm_mmu_page_as_id(root);
tdp_iter_restart(iter);
}
/*
* Given an SPTE and its level, returns a pointer containing the host virtual
* address of the child page table referenced by the SPTE. Returns null if
* there is no such entry.
*/
tdp_ptep_t spte_to_child_pt(u64 spte, int level)
{
/*
* There's no child entry if this entry isn't present or is a
* last-level entry.
*/
if (!is_shadow_present_pte(spte) || is_last_spte(spte, level))
return NULL;
return (tdp_ptep_t)__va(spte_to_pfn(spte) << PAGE_SHIFT);
}
/*
* Steps down one level in the paging structure towards the goal GFN. Returns
* true if the iterator was able to step down a level, false otherwise.
*/
static bool try_step_down(struct tdp_iter *iter)
{
tdp_ptep_t child_pt;
if (iter->level == iter->min_level)
return false;
/*
* Reread the SPTE before stepping down to avoid traversing into page
* tables that are no longer linked from this entry.
*/
iter->old_spte = kvm_tdp_mmu_read_spte(iter->sptep);
child_pt = spte_to_child_pt(iter->old_spte, iter->level);
if (!child_pt)
return false;
iter->level--;
iter->pt_path[iter->level - 1] = child_pt;
iter->gfn = gfn_round_for_level(iter->next_last_level_gfn, iter->level);
tdp_iter_refresh_sptep(iter);
return true;
}
/*
* Steps to the next entry in the current page table, at the current page table
* level. The next entry could point to a page backing guest memory or another
* page table, or it could be non-present. Returns true if the iterator was
* able to step to the next entry in the page table, false if the iterator was
* already at the end of the current page table.
*/
static bool try_step_side(struct tdp_iter *iter)
{
/*
* Check if the iterator is already at the end of the current page
* table.
*/
if (SPTE_INDEX((iter->gfn | iter->gfn_bits) << PAGE_SHIFT, iter->level) ==
(SPTE_ENT_PER_PAGE - 1))
return false;
iter->gfn += KVM_PAGES_PER_HPAGE(iter->level);
iter->next_last_level_gfn = iter->gfn;
iter->sptep++;
iter->old_spte = kvm_tdp_mmu_read_spte(iter->sptep);
return true;
}
/*
* Tries to traverse back up a level in the paging structure so that the walk
* can continue from the next entry in the parent page table. Returns true on a
* successful step up, false if already in the root page.
*/
static bool try_step_up(struct tdp_iter *iter)
{
if (iter->level == iter->root_level)
return false;
iter->level++;
iter->gfn = gfn_round_for_level(iter->gfn, iter->level);
tdp_iter_refresh_sptep(iter);
return true;
}
/*
* Step to the next SPTE in a pre-order traversal of the paging structure.
* To get to the next SPTE, the iterator either steps down towards the goal
* GFN, if at a present, non-last-level SPTE, or over to a SPTE mapping a
* higher GFN.
*
* The basic algorithm is as follows:
* 1. If the current SPTE is a non-last-level SPTE, step down into the page
* table it points to.
* 2. If the iterator cannot step down, it will try to step to the next SPTE
* in the current page of the paging structure.
* 3. If the iterator cannot step to the next entry in the current page, it will
* try to step up to the parent paging structure page. In this case, that
* SPTE will have already been visited, and so the iterator must also step
* to the side again.
*/
void tdp_iter_next(struct tdp_iter *iter)
{
if (iter->yielded) {
tdp_iter_restart(iter);
return;
}
if (try_step_down(iter))
return;
do {
if (try_step_side(iter))
return;
} while (try_step_up(iter));
iter->valid = false;
}