On Wed, Oct 21, 2020 at 11:08:52AM -0700, Ben Gardon wrote:
> On Wed, Oct 21, 2020 at 7:59 AM Yu Zhang <yu.c.zh...@linux.intel.com> wrote:
> >
> > On Wed, Oct 14, 2020 at 11:26:42AM -0700, Ben Gardon wrote:
> > > The TDP iterator implements a pre-order traversal of a TDP paging
> > > structure. This iterator will be used in future patches to create
> > > an efficient implementation of the KVM MMU for the TDP case.
> > >
> > > Tested by running kvm-unit-tests and KVM selftests on an Intel Haswell
> > > machine. This series introduced no new failures.
> > >
> > > This series can be viewed in Gerrit at:
> > >       https://linux-review.googlesource.com/c/virt/kvm/kvm/+/2538
> > >
> > > Signed-off-by: Ben Gardon <bgar...@google.com>
> > > ---
> > >  arch/x86/kvm/Makefile           |   3 +-
> > >  arch/x86/kvm/mmu/mmu.c          |  66 ------------
> > >  arch/x86/kvm/mmu/mmu_internal.h |  66 ++++++++++++
> > >  arch/x86/kvm/mmu/tdp_iter.c     | 176 ++++++++++++++++++++++++++++++++
> > >  arch/x86/kvm/mmu/tdp_iter.h     |  56 ++++++++++
> > >  5 files changed, 300 insertions(+), 67 deletions(-)
> > >  create mode 100644 arch/x86/kvm/mmu/tdp_iter.c
> > >  create mode 100644 arch/x86/kvm/mmu/tdp_iter.h
> > >
> > > diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
> > > index 7f86a14aed0e9..4525c1151bf99 100644
> > > --- a/arch/x86/kvm/Makefile
> > > +++ b/arch/x86/kvm/Makefile
> > > @@ -15,7 +15,8 @@ kvm-$(CONFIG_KVM_ASYNC_PF)  += $(KVM)/async_pf.o
> > >
> > >  kvm-y                        += x86.o emulate.o i8259.o irq.o lapic.o \
> > >                          i8254.o ioapic.o irq_comm.o cpuid.o pmu.o mtrr.o 
> > > \
> > > -                        hyperv.o debugfs.o mmu/mmu.o mmu/page_track.o
> > > +                        hyperv.o debugfs.o mmu/mmu.o mmu/page_track.o \
> > > +                        mmu/tdp_iter.o
> > >
> > >  kvm-intel-y          += vmx/vmx.o vmx/vmenter.o vmx/pmu_intel.o 
> > > vmx/vmcs12.o \
> > >                          vmx/evmcs.o vmx/nested.o vmx/posted_intr.o
> > > diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
> > > index 6c9db349600c8..6d82784ed5679 100644
> > > --- a/arch/x86/kvm/mmu/mmu.c
> > > +++ b/arch/x86/kvm/mmu/mmu.c
> > > @@ -121,28 +121,6 @@ module_param(dbg, bool, 0644);
> > >
> > >  #define PTE_PREFETCH_NUM             8
> > >
> > > -#define PT_FIRST_AVAIL_BITS_SHIFT 10
> > > -#define PT64_SECOND_AVAIL_BITS_SHIFT 54
> > > -
> > > -/*
> > > - * The mask used to denote special SPTEs, which can be either MMIO SPTEs 
> > > or
> > > - * Access Tracking SPTEs.
> > > - */
> > > -#define SPTE_SPECIAL_MASK (3ULL << 52)
> > > -#define SPTE_AD_ENABLED_MASK (0ULL << 52)
> > > -#define SPTE_AD_DISABLED_MASK (1ULL << 52)
> > > -#define SPTE_AD_WRPROT_ONLY_MASK (2ULL << 52)
> > > -#define SPTE_MMIO_MASK (3ULL << 52)
> > > -
> > > -#define PT64_LEVEL_BITS 9
> > > -
> > > -#define PT64_LEVEL_SHIFT(level) \
> > > -             (PAGE_SHIFT + (level - 1) * PT64_LEVEL_BITS)
> > > -
> > > -#define PT64_INDEX(address, level)\
> > > -     (((address) >> PT64_LEVEL_SHIFT(level)) & ((1 << PT64_LEVEL_BITS) - 
> > > 1))
> > > -
> > > -
> > >  #define PT32_LEVEL_BITS 10
> > >
> > >  #define PT32_LEVEL_SHIFT(level) \
> > > @@ -155,19 +133,6 @@ module_param(dbg, bool, 0644);
> > >  #define PT32_INDEX(address, level)\
> > >       (((address) >> PT32_LEVEL_SHIFT(level)) & ((1 << PT32_LEVEL_BITS) - 
> > > 1))
> > >
> > > -
> > > -#ifdef CONFIG_DYNAMIC_PHYSICAL_MASK
> > > -#define PT64_BASE_ADDR_MASK (physical_mask & ~(u64)(PAGE_SIZE-1))
> > > -#else
> > > -#define PT64_BASE_ADDR_MASK (((1ULL << 52) - 1) & ~(u64)(PAGE_SIZE-1))
> > > -#endif
> > > -#define PT64_LVL_ADDR_MASK(level) \
> > > -     (PT64_BASE_ADDR_MASK & ~((1ULL << (PAGE_SHIFT + (((level) - 1) \
> > > -                                             * PT64_LEVEL_BITS))) - 1))
> > > -#define PT64_LVL_OFFSET_MASK(level) \
> > > -     (PT64_BASE_ADDR_MASK & ((1ULL << (PAGE_SHIFT + (((level) - 1) \
> > > -                                             * PT64_LEVEL_BITS))) - 1))
> > > -
> > >  #define PT32_BASE_ADDR_MASK PAGE_MASK
> > >  #define PT32_DIR_BASE_ADDR_MASK \
> > >       (PAGE_MASK & ~((1ULL << (PAGE_SHIFT + PT32_LEVEL_BITS)) - 1))
> > > @@ -192,8 +157,6 @@ module_param(dbg, bool, 0644);
> > >  #define SPTE_HOST_WRITEABLE  (1ULL << PT_FIRST_AVAIL_BITS_SHIFT)
> > >  #define SPTE_MMU_WRITEABLE   (1ULL << (PT_FIRST_AVAIL_BITS_SHIFT + 1))
> > >
> > > -#define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level)
> > > -
> > >  /* make pte_list_desc fit well in cache line */
> > >  #define PTE_LIST_EXT 3
> > >
> > > @@ -349,11 +312,6 @@ void kvm_mmu_set_mmio_spte_mask(u64 mmio_value, u64 
> > > access_mask)
> > >  }
> > >  EXPORT_SYMBOL_GPL(kvm_mmu_set_mmio_spte_mask);
> > >
> > > -static bool is_mmio_spte(u64 spte)
> > > -{
> > > -     return (spte & SPTE_SPECIAL_MASK) == SPTE_MMIO_MASK;
> > > -}
> > > -
> > >  static inline bool sp_ad_disabled(struct kvm_mmu_page *sp)
> > >  {
> > >       return sp->role.ad_disabled;
> > > @@ -626,35 +584,11 @@ static int is_nx(struct kvm_vcpu *vcpu)
> > >       return vcpu->arch.efer & EFER_NX;
> > >  }
> > >
> > > -static int is_shadow_present_pte(u64 pte)
> > > -{
> > > -     return (pte != 0) && !is_mmio_spte(pte);
> > > -}
> > > -
> > > -static int is_large_pte(u64 pte)
> > > -{
> > > -     return pte & PT_PAGE_SIZE_MASK;
> > > -}
> > > -
> > > -static int is_last_spte(u64 pte, int level)
> > > -{
> > > -     if (level == PG_LEVEL_4K)
> > > -             return 1;
> > > -     if (is_large_pte(pte))
> > > -             return 1;
> > > -     return 0;
> > > -}
> > > -
> > >  static bool is_executable_pte(u64 spte)
> > >  {
> > >       return (spte & (shadow_x_mask | shadow_nx_mask)) == shadow_x_mask;
> > >  }
> > >
> > > -static kvm_pfn_t spte_to_pfn(u64 pte)
> > > -{
> > > -     return (pte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT;
> > > -}
> > > -
> > >  static gfn_t pse36_gfn_delta(u32 gpte)
> > >  {
> > >       int shift = 32 - PT32_DIR_PSE36_SHIFT - PAGE_SHIFT;
> > > diff --git a/arch/x86/kvm/mmu/mmu_internal.h 
> > > b/arch/x86/kvm/mmu/mmu_internal.h
> > > index 3acf3b8eb469d..74ccbf001a42e 100644
> > > --- a/arch/x86/kvm/mmu/mmu_internal.h
> > > +++ b/arch/x86/kvm/mmu/mmu_internal.h
> > > @@ -2,6 +2,8 @@
> > >  #ifndef __KVM_X86_MMU_INTERNAL_H
> > >  #define __KVM_X86_MMU_INTERNAL_H
> > >
> > > +#include "mmu.h"
> > > +
> > >  #include <linux/types.h>
> > >
> > >  #include <asm/kvm_host.h>
> > > @@ -60,4 +62,68 @@ void kvm_mmu_gfn_allow_lpage(struct kvm_memory_slot 
> > > *slot, gfn_t gfn);
> > >  bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm,
> > >                                   struct kvm_memory_slot *slot, u64 gfn);
> > >
> > > +#define PT64_LEVEL_BITS 9
> > > +
> > > +#define PT64_LEVEL_SHIFT(level) \
> > > +             (PAGE_SHIFT + (level - 1) * PT64_LEVEL_BITS)
> > > +
> > > +#define PT64_INDEX(address, level)\
> > > +     (((address) >> PT64_LEVEL_SHIFT(level)) & ((1 << PT64_LEVEL_BITS) - 
> > > 1))
> > > +#define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level)
> > > +
> > > +#define PT_FIRST_AVAIL_BITS_SHIFT 10
> > > +#define PT64_SECOND_AVAIL_BITS_SHIFT 54
> > > +
> > > +/*
> > > + * The mask used to denote special SPTEs, which can be either MMIO SPTEs 
> > > or
> > > + * Access Tracking SPTEs.
> > > + */
> > > +#define SPTE_SPECIAL_MASK (3ULL << 52)
> > > +#define SPTE_AD_ENABLED_MASK (0ULL << 52)
> > > +#define SPTE_AD_DISABLED_MASK (1ULL << 52)
> > > +#define SPTE_AD_WRPROT_ONLY_MASK (2ULL << 52)
> > > +#define SPTE_MMIO_MASK (3ULL << 52)
> > > +
> > > +#ifdef CONFIG_DYNAMIC_PHYSICAL_MASK
> > > +#define PT64_BASE_ADDR_MASK (physical_mask & ~(u64)(PAGE_SIZE-1))
> > > +#else
> > > +#define PT64_BASE_ADDR_MASK (((1ULL << 52) - 1) & ~(u64)(PAGE_SIZE-1))
> > > +#endif
> > > +#define PT64_LVL_ADDR_MASK(level) \
> > > +     (PT64_BASE_ADDR_MASK & ~((1ULL << (PAGE_SHIFT + (((level) - 1) \
> > > +                                             * PT64_LEVEL_BITS))) - 1))
> > > +#define PT64_LVL_OFFSET_MASK(level) \
> > > +     (PT64_BASE_ADDR_MASK & ((1ULL << (PAGE_SHIFT + (((level) - 1) \
> > > +                                             * PT64_LEVEL_BITS))) - 1))
> > > +
> > > +/* Functions for interpreting SPTEs */
> > > +static inline bool is_mmio_spte(u64 spte)
> > > +{
> > > +     return (spte & SPTE_SPECIAL_MASK) == SPTE_MMIO_MASK;
> > > +}
> > > +
> > > +static inline int is_shadow_present_pte(u64 pte)
> > > +{
> > > +     return (pte != 0) && !is_mmio_spte(pte);
> > > +}
> > > +
> > > +static inline int is_large_pte(u64 pte)
> > > +{
> > > +     return pte & PT_PAGE_SIZE_MASK;
> > > +}
> > > +
> > > +static inline int is_last_spte(u64 pte, int level)
> > > +{
> > > +     if (level == PG_LEVEL_4K)
> > > +             return 1;
> > > +     if (is_large_pte(pte))
> > > +             return 1;
> > > +     return 0;
> > > +}
> > > +
> > > +static inline kvm_pfn_t spte_to_pfn(u64 pte)
> > > +{
> > > +     return (pte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT;
> > > +}
> > > +
> > >  #endif /* __KVM_X86_MMU_INTERNAL_H */
> > > diff --git a/arch/x86/kvm/mmu/tdp_iter.c b/arch/x86/kvm/mmu/tdp_iter.c
> > > new file mode 100644
> > > index 0000000000000..b07e9f0c5d4aa
> > > --- /dev/null
> > > +++ b/arch/x86/kvm/mmu/tdp_iter.c
> > > @@ -0,0 +1,176 @@
> > > +// SPDX-License-Identifier: GPL-2.0
> > > +
> > > +#include "mmu_internal.h"
> > > +#include "tdp_iter.h"
> > > +
> > > +/*
> > > + * Recalculates the pointer to the SPTE for the current GFN and level and
> > > + * reread the SPTE.
> > > + */
> > > +static void tdp_iter_refresh_sptep(struct tdp_iter *iter)
> > > +{
> > > +     iter->sptep = iter->pt_path[iter->level - 1] +
> > > +             SHADOW_PT_INDEX(iter->gfn << PAGE_SHIFT, iter->level);
> > > +     iter->old_spte = READ_ONCE(*iter->sptep);
> > > +}
> > > +
> > > +static gfn_t round_gfn_for_level(gfn_t gfn, int level)
> > > +{
> > > +     return gfn - (gfn % KVM_PAGES_PER_HPAGE(level));
> >
> > Instead of the modulo operator, how about we use:
> >         return gfn & ~(KVM_PAGES_PER_HPAGE(level) - 1);
> > here? :)
> >
> > > +}
> > > +
> > > +/*
> > > + * Sets a TDP iterator to walk a pre-order traversal of the paging 
> > > structure
> > > + * rooted at root_pt, starting with the walk to translate goal_gfn.
> > > + */
> > > +void tdp_iter_start(struct tdp_iter *iter, u64 *root_pt, int root_level,
> > > +                 int min_level, gfn_t goal_gfn)
> > > +{
> > > +     WARN_ON(root_level < 1);
> > > +     WARN_ON(root_level > PT64_ROOT_MAX_LEVEL);
> > > +
> > > +     iter->goal_gfn = goal_gfn;
> > > +     iter->root_level = root_level;
> > > +     iter->min_level = min_level;
> > > +     iter->level = root_level;
> > > +     iter->pt_path[iter->level - 1] = root_pt;
> > > +
> > > +     iter->gfn = round_gfn_for_level(iter->goal_gfn, iter->level);
> > > +     tdp_iter_refresh_sptep(iter);
> > > +
> > > +     iter->valid = true;
> > > +}
> > > +
> > > +/*
> > > + * Given an SPTE and its level, returns a pointer containing the host 
> > > virtual
> > > + * address of the child page table referenced by the SPTE. Returns null 
> > > if
> > > + * there is no such entry.
> > > + */
> > > +u64 *spte_to_child_pt(u64 spte, int level)
> > > +{
> > > +     /*
> > > +      * There's no child entry if this entry isn't present or is a
> > > +      * last-level entry.
> > > +      */
> > > +     if (!is_shadow_present_pte(spte) || is_last_spte(spte, level))
> > > +             return NULL;
> > > +
> > > +     return __va(spte_to_pfn(spte) << PAGE_SHIFT);
> > > +}
> > > +
> > > +/*
> > > + * Steps down one level in the paging structure towards the goal GFN. 
> > > Returns
> > > + * true if the iterator was able to step down a level, false otherwise.
> > > + */
> > > +static bool try_step_down(struct tdp_iter *iter)
> > > +{
> > > +     u64 *child_pt;
> > > +
> > > +     if (iter->level == iter->min_level)
> > > +             return false;
> > > +
> > > +     /*
> > > +      * Reread the SPTE before stepping down to avoid traversing into 
> > > page
> > > +      * tables that are no longer linked from this entry.
> > > +      */
> > > +     iter->old_spte = READ_ONCE(*iter->sptep);
> > > +
> > > +     child_pt = spte_to_child_pt(iter->old_spte, iter->level);
> > > +     if (!child_pt)
> > > +             return false;
> > > +
> > > +     iter->level--;
> > > +     iter->pt_path[iter->level - 1] = child_pt;
> > > +     iter->gfn = round_gfn_for_level(iter->goal_gfn, iter->level);
> > > +     tdp_iter_refresh_sptep(iter);
> > > +
> > > +     return true;
> > > +}
> > > +
> > > +/*
> > > + * Steps to the next entry in the current page table, at the current 
> > > page table
> > > + * level. The next entry could point to a page backing guest memory or 
> > > another
> > > + * page table, or it could be non-present. Returns true if the iterator 
> > > was
> > > + * able to step to the next entry in the page table, false if the 
> > > iterator was
> > > + * already at the end of the current page table.
> > > + */
> > > +static bool try_step_side(struct tdp_iter *iter)
> > > +{
> > > +     /*
> > > +      * Check if the iterator is already at the end of the current page
> > > +      * table.
> > > +      */
> > > +     if (!((iter->gfn + KVM_PAGES_PER_HPAGE(iter->level)) %
> > > +           KVM_PAGES_PER_HPAGE(iter->level + 1)))
> > > +             return false;
> > > +
> >
> > And maybe:
> >         if (SHADOW_PT_INDEX(iter->gfn << PAGE_SHIFT, iter->level) ==
> >             (PT64_ENT_PER_PAGE - 1))
> > here?
> >
> > > +     iter->gfn += KVM_PAGES_PER_HPAGE(iter->level);
> > > +     iter->goal_gfn = iter->gfn;
> > > +     iter->sptep++;
> > > +     iter->old_spte = READ_ONCE(*iter->sptep);
> > > +
> > > +     return true;
> > > +}
> > > +
> > > +/*
> > > + * Tries to traverse back up a level in the paging structure so that the 
> > > walk
> > > + * can continue from the next entry in the parent page table. Returns 
> > > true on a
> > > + * successful step up, false if already in the root page.
> > > + */
> > > +static bool try_step_up(struct tdp_iter *iter)
> > > +{
> > > +     if (iter->level == iter->root_level)
> > > +             return false;
> > > +
> > > +     iter->level++;
> > > +     iter->gfn = round_gfn_for_level(iter->gfn, iter->level);
> > > +     tdp_iter_refresh_sptep(iter);
> > > +
> > > +     return true;
> > > +}
> > > +
> > > +/*
> > > + * Step to the next SPTE in a pre-order traversal of the paging 
> > > structure.
> > > + * To get to the next SPTE, the iterator either steps down towards the 
> > > goal
> > > + * GFN, if at a present, non-last-level SPTE, or over to a SPTE mapping a
> > > + * highter GFN.
> > > + *
> > > + * The basic algorithm is as follows:
> > > + * 1. If the current SPTE is a non-last-level SPTE, step down into the 
> > > page
> > > + *    table it points to.
> > > + * 2. If the iterator cannot step down, it will try to step to the next 
> > > SPTE
> > > + *    in the current page of the paging structure.
> > > + * 3. If the iterator cannot step to the next entry in the current page, 
> > > it will
> > > + *    try to step up to the parent paging structure page. In this case, 
> > > that
> > > + *    SPTE will have already been visited, and so the iterator must also 
> > > step
> > > + *    to the side again.
> > > + */
> > > +void tdp_iter_next(struct tdp_iter *iter)
> > > +{
> > > +     if (try_step_down(iter))
> > > +             return;
> > > +
> > > +     do {
> > > +             if (try_step_side(iter))
> > > +                     return;
> > > +     } while (try_step_up(iter));
> > > +     iter->valid = false;
> > > +}
> > > +
> > > +/*
> > > + * Restart the walk over the paging structure from the root, starting 
> > > from the
> > > + * highest gfn the iterator had previously reached. Assumes that the 
> > > entire
> > > + * paging structure, except the root page, may have been completely torn 
> > > down
> > > + * and rebuilt.
> > > + */
> > > +void tdp_iter_refresh_walk(struct tdp_iter *iter)
> > > +{
> > > +     gfn_t goal_gfn = iter->goal_gfn;
> > > +
> > > +     if (iter->gfn > goal_gfn)
> > > +             goal_gfn = iter->gfn;
> > > +
> > > +     tdp_iter_start(iter, iter->pt_path[iter->root_level - 1],
> > > +                    iter->root_level, iter->min_level, goal_gfn);
> > > +}
> > > +
> > > diff --git a/arch/x86/kvm/mmu/tdp_iter.h b/arch/x86/kvm/mmu/tdp_iter.h
> > > new file mode 100644
> > > index 0000000000000..d629a53e1b73f
> > > --- /dev/null
> > > +++ b/arch/x86/kvm/mmu/tdp_iter.h
> > > @@ -0,0 +1,56 @@
> > > +// SPDX-License-Identifier: GPL-2.0
> > > +
> > > +#ifndef __KVM_X86_MMU_TDP_ITER_H
> > > +#define __KVM_X86_MMU_TDP_ITER_H
> > > +
> > > +#include <linux/kvm_host.h>
> > > +
> > > +#include "mmu.h"
> > > +
> > > +/*
> > > + * A TDP iterator performs a pre-order walk over a TDP paging structure.
> > > + */
> > > +struct tdp_iter {
> > > +     /*
> > > +      * The iterator will traverse the paging structure towards the 
> > > mapping
> > > +      * for this GFN.
> > > +      */
> > > +     gfn_t goal_gfn;
> > > +     /* Pointers to the page tables traversed to reach the current SPTE 
> > > */
> > > +     u64 *pt_path[PT64_ROOT_MAX_LEVEL];
> > > +     /* A pointer to the current SPTE */
> > > +     u64 *sptep;
> > > +     /* The lowest GFN mapped by the current SPTE */
> > > +     gfn_t gfn;
> > > +     /* The level of the root page given to the iterator */
> > > +     int root_level;
> > > +     /* The lowest level the iterator should traverse to */
> > > +     int min_level;
> > > +     /* The iterator's current level within the paging structure */
> > > +     int level;
> > > +     /* A snapshot of the value at sptep */
> > > +     u64 old_spte;
> > > +     /*
> > > +      * Whether the iterator has a valid state. This will be false if the
> > > +      * iterator walks off the end of the paging structure.
> > > +      */
> > > +     bool valid;
> > > +};
> > > +
> > > +/*
> > > + * Iterates over every SPTE mapping the GFN range [start, end) in a
> > > + * preorder traversal.
> > > + */
> > > +#define for_each_tdp_pte(iter, root, root_level, start, end) \
> > > +     for (tdp_iter_start(&iter, root, root_level, PG_LEVEL_4K, start); \
> > > +          iter.valid && iter.gfn < end;                   \
> > > +          tdp_iter_next(&iter))
> > > +
> > > +u64 *spte_to_child_pt(u64 pte, int level);
> > > +
> > > +void tdp_iter_start(struct tdp_iter *iter, u64 *root_pt, int root_level,
> > > +                 int min_level, gfn_t goal_gfn);
> > > +void tdp_iter_next(struct tdp_iter *iter);
> > > +void tdp_iter_refresh_walk(struct tdp_iter *iter);
> > > +
> > > +#endif /* __KVM_X86_MMU_TDP_ITER_H */
> > > --
> > > 2.28.0.1011.ga647a8990f-goog
> > >
> >
> > I am just suggesting to replace the modulo operations with bit-shifts...
> > Also, it's very exciting to see such patch set. Thanks!
> 
> Those are great suggestions, thank you. I wonder if the compiler would
> have been smart enough to convert those to bit shifts. (I kind of
> doubt it)
> I'm glad you're excited about this patch set, thank you for taking a
> look and helping to review it!
> It doesn't sound like I'll be sending out a v3 of this set, but I'll
> be sure to amend the code with your suggestions if they don't make it
> into the version Paolo queues.

Great. And looking forward to the performance improvement series! :)

Yu

> 
> >
> > B.R.
> > Yu
> >
> 

Reply via email to