Re: [RFC PATCH v2 2/2] KVM: x86: Not wr-protect huge page with init_all_set dirty log

2021-04-20 Thread Ben Gardon
On Tue, Apr 20, 2021 at 12:49 AM Keqian Zhu  wrote:
>
> Hi Ben,
>
> On 2021/4/20 3:20, Ben Gardon wrote:
> > On Fri, Apr 16, 2021 at 1:25 AM Keqian Zhu  wrote:
> >>
> >> Currently, when starting dirty logging with init-all-set, we write
> >> protect huge pages and leave normal pages untouched, so that we can
> >> enable dirty logging for these pages lazily.
> >>
> >> Actually, enabling dirty logging lazily for huge pages is feasible
> >> too, which not only reduces the time to start dirty logging, but also
> >> greatly reduces the side effects on the guest when there is a high dirty rate.
> >>
> >> Signed-off-by: Keqian Zhu 
> >> ---
> >>  arch/x86/kvm/mmu/mmu.c | 48 ++
> >>  arch/x86/kvm/x86.c | 37 +---
> >>  2 files changed, 54 insertions(+), 31 deletions(-)
> >>
> >> diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
> >> index 2ce5bc2ea46d..98fa25172b9a 100644
> >> --- a/arch/x86/kvm/mmu/mmu.c
> >> +++ b/arch/x86/kvm/mmu/mmu.c
> >> @@ -1188,8 +1188,7 @@ static bool __rmap_clear_dirty(struct kvm *kvm, 
> >> struct kvm_rmap_head *rmap_head,
> >>   * @gfn_offset: start of the BITS_PER_LONG pages we care about
> >>   * @mask: indicates which pages we should protect
> >>   *
> >> - * Used when we do not need to care about huge page mappings: e.g. during 
> >> dirty
> >> - * logging we do not have any such mappings.
> >> + * Used when we do not need to care about huge page mappings.
> >>   */
> >>  static void kvm_mmu_write_protect_pt_masked(struct kvm *kvm,
> >>  struct kvm_memory_slot *slot,
> >> @@ -1246,13 +1245,54 @@ static void kvm_mmu_clear_dirty_pt_masked(struct 
> >> kvm *kvm,
> >>   * It calls kvm_mmu_write_protect_pt_masked to write protect selected 
> >> pages to
> >>   * enable dirty logging for them.
> >>   *
> >> - * Used when we do not need to care about huge page mappings: e.g. during 
> >> dirty
> >> - * logging we do not have any such mappings.
> >> + * We need to care about huge page mappings: e.g. during dirty logging we may
> >> + * have such mappings.
> >>   */
> >>  void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm,
> >> struct kvm_memory_slot *slot,
> >> gfn_t gfn_offset, unsigned long mask)
> >>  {
> >> +   gfn_t start, end;
> >> +
> >> +   /*
> >> +* Huge pages are NOT write protected when we start dirty log with
> >> +* init-all-set, so we must write protect them here.
> >> +*
> >> +* The gfn_offset is guaranteed to be aligned to 64, but the 
> >> base_gfn
> >> +* of memslot has no such restriction, so the range can cross two 
> >> large
> >> +* pages.
> >> +*/
> >> +   if (kvm_dirty_log_manual_protect_and_init_set(kvm)) {
> >> +   start = slot->base_gfn + gfn_offset + __ffs(mask);
> >> +   end = slot->base_gfn + gfn_offset + __fls(mask);
> >> +   kvm_mmu_slot_gfn_write_protect(kvm, slot, start, 
> >> PG_LEVEL_2M);
> >> +
> >> +   /* Cross two large pages? */
> >> +   if (ALIGN(start << PAGE_SHIFT, PMD_SIZE) !=
> >> +   ALIGN(end << PAGE_SHIFT, PMD_SIZE))
> >> +   kvm_mmu_slot_gfn_write_protect(kvm, slot, end,
> >> +  PG_LEVEL_2M);
> >> +   }
> >> +
> >> +   /*
> >> +* RFC:
> >> +*
> >> +* 1. I don't return early when kvm_mmu_slot_gfn_write_protect() 
> >> returns
> >> +* true, because I am not very clear about the relationship between
> >> +* legacy mmu and tdp mmu. AFAICS, the code logic is NOT an if/else
> >> +* manner.
> >> +*
> >> +* The kvm_mmu_slot_gfn_write_protect() returns true when we hit a
> >> +* writable large page mapping in legacy mmu mapping or tdp mmu 
> >> mapping.
> >> +* Do we still have normal mapping in that case? (e.g. We have 
> >> large
> >> +* mapping in legacy mmu and normal mapping in tdp mmu).
> >
Right, we can't

Re: [RFC PATCH v2 2/2] KVM: x86: Not wr-protect huge page with init_all_set dirty log

2021-04-19 Thread Ben Gardon
On Fri, Apr 16, 2021 at 1:25 AM Keqian Zhu  wrote:
>
> Currently, when starting dirty logging with init-all-set, we write
> protect huge pages and leave normal pages untouched, so that we can
> enable dirty logging for these pages lazily.
>
> Actually, enabling dirty logging lazily for huge pages is feasible
> too, which not only reduces the time to start dirty logging, but also
> greatly reduces the side effects on the guest when there is a high dirty rate.
>
> Signed-off-by: Keqian Zhu 
> ---
>  arch/x86/kvm/mmu/mmu.c | 48 ++
>  arch/x86/kvm/x86.c | 37 +---
>  2 files changed, 54 insertions(+), 31 deletions(-)
>
> diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
> index 2ce5bc2ea46d..98fa25172b9a 100644
> --- a/arch/x86/kvm/mmu/mmu.c
> +++ b/arch/x86/kvm/mmu/mmu.c
> @@ -1188,8 +1188,7 @@ static bool __rmap_clear_dirty(struct kvm *kvm, struct 
> kvm_rmap_head *rmap_head,
>   * @gfn_offset: start of the BITS_PER_LONG pages we care about
>   * @mask: indicates which pages we should protect
>   *
> - * Used when we do not need to care about huge page mappings: e.g. during 
> dirty
> - * logging we do not have any such mappings.
> + * Used when we do not need to care about huge page mappings.
>   */
>  static void kvm_mmu_write_protect_pt_masked(struct kvm *kvm,
>  struct kvm_memory_slot *slot,
> @@ -1246,13 +1245,54 @@ static void kvm_mmu_clear_dirty_pt_masked(struct kvm 
> *kvm,
>   * It calls kvm_mmu_write_protect_pt_masked to write protect selected pages 
> to
>   * enable dirty logging for them.
>   *
> - * Used when we do not need to care about huge page mappings: e.g. during 
> dirty
> - * logging we do not have any such mappings.
> + * We need to care about huge page mappings: e.g. during dirty logging we may
> + * have such mappings.
>   */
>  void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm,
> struct kvm_memory_slot *slot,
> gfn_t gfn_offset, unsigned long mask)
>  {
> +   gfn_t start, end;
> +
> +   /*
> +* Huge pages are NOT write protected when we start dirty log with
> +* init-all-set, so we must write protect them here.
> +*
> +* The gfn_offset is guaranteed to be aligned to 64, but the base_gfn
> +* of memslot has no such restriction, so the range can cross two 
> large
> +* pages.
> +*/
> +   if (kvm_dirty_log_manual_protect_and_init_set(kvm)) {
> +   start = slot->base_gfn + gfn_offset + __ffs(mask);
> +   end = slot->base_gfn + gfn_offset + __fls(mask);
> +   kvm_mmu_slot_gfn_write_protect(kvm, slot, start, PG_LEVEL_2M);
> +
> +   /* Cross two large pages? */
> +   if (ALIGN(start << PAGE_SHIFT, PMD_SIZE) !=
> +   ALIGN(end << PAGE_SHIFT, PMD_SIZE))
> +   kvm_mmu_slot_gfn_write_protect(kvm, slot, end,
> +  PG_LEVEL_2M);
> +   }
> +
> +   /*
> +* RFC:
> +*
> +* 1. I don't return early when kvm_mmu_slot_gfn_write_protect() 
> returns
> +* true, because I am not very clear about the relationship between
> +* legacy mmu and tdp mmu. AFAICS, the code logic is NOT an if/else
> +* manner.
> +*
> +* The kvm_mmu_slot_gfn_write_protect() returns true when we hit a
> +* writable large page mapping in legacy mmu mapping or tdp mmu 
> mapping.
> +* Do we still have normal mapping in that case? (e.g. We have large
> +* mapping in legacy mmu and normal mapping in tdp mmu).

Right, we can't return early because the two MMUs could map the page
in different ways, but each MMU could also map the page in multiple
ways independently.
For example, if the legacy MMU was being used and we were running a
nested VM, a page could be mapped 2M in EPT01 and 4K in EPT02, so we'd
still need kvm_mmu_slot_gfn_write_protect calls for both levels.
I don't think there's a case where we can return early here with the
information that the first calls to kvm_mmu_slot_gfn_write_protect
access.
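
As an aside, the range computation in the hunk above boils down to the
following standalone sketch (user-space C, with __ffs/__fls modeled by GCC
builtins; mask is assumed to be non-zero, which KVM guarantees for its
64-page dirty chunks):

#include <stdbool.h>
#include <stdint.h>

#define PAGE_SHIFT      12
#define PMD_SIZE        (1ULL << 21)    /* one 2M large page */
#define ALIGN_UP(x, a)  (((x) + (a) - 1) & ~((uint64_t)(a) - 1))

typedef uint64_t gfn_t;

/*
 * Mirror of the patch's check: take the first and last set bits of the
 * 64-page mask, translate them to gfns, and report whether the two gfns
 * land in different 2M-aligned regions, in which case both large pages
 * need to be write protected.
 */
static bool crosses_two_large_pages(gfn_t base_gfn, gfn_t gfn_offset,
                                    uint64_t mask)
{
        gfn_t start = base_gfn + gfn_offset + __builtin_ctzll(mask);        /* __ffs */
        gfn_t end   = base_gfn + gfn_offset + (63 - __builtin_clzll(mask)); /* __fls */

        return ALIGN_UP(start << PAGE_SHIFT, PMD_SIZE) !=
               ALIGN_UP(end << PAGE_SHIFT, PMD_SIZE);
}

For example, base_gfn = 0x1fe0 and gfn_offset = 0 with a full mask covers
gfns 0x1fe0-0x201f, which straddles the 2M boundary at gfn 0x2000, so both
kvm_mmu_slot_gfn_write_protect() calls are needed.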

> +*
> +* 2. kvm_mmu_slot_gfn_write_protect() doesn't tell us whether the large
> +* page mapping exists. If it exists but is clean, we can return early.
> +* However, we have to make an invasive change.

What do you mean by invasive change?

> +*/
> +
> +   /* Then we can handle the PT level pages */
> if (kvm_x86_ops.cpu_dirty_log_size)
> kvm_mmu_clear_dirty_pt_masked(kvm, slot, gfn_offset, mask);
> else
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index eca63625aee4..dfd676ffa7da 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -10888,36 +10888,19 @@ static void kvm_mmu_slot_apply_flags(struct kvm 
> *kvm,
>  */
>  

[PATCH 2/2] KVM: x86/mmu: Fix typo in for_each_tdp_mmu_root

2021-04-19 Thread Ben Gardon
There's a typo in for_each_tdp_mmu_root which breaks compilation with
certain configurations. Fix it.

Fixes: 078d47ee71d6 ("KVM: x86/mmu: Protect the tdp_mmu_roots list with RCU")

Reported-by: kernel test robot 
Signed-off-by: Ben Gardon 
---
 arch/x86/kvm/mmu/tdp_mmu.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index 8ce8d0916042..f0aef4969754 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -152,7 +152,7 @@ static struct kvm_mmu_page *tdp_mmu_next_root(struct kvm 
*kvm,
 #define for_each_tdp_mmu_root(_kvm, _root, _as_id) 
\
list_for_each_entry_rcu(_root, &_kvm->arch.tdp_mmu_roots, link, 
\
lockdep_is_held_type(&kvm->mmu_lock, 0) ||  
\
-   lockdep_is_help(&kvm->arch.tdp_mmu_pages_lock)) 
\
+   lockdep_is_held(&kvm->arch.tdp_mmu_pages_lock)) 
\
if (kvm_mmu_page_as_id(_root) != _as_id) {  \
} else
 
-- 
2.31.1.368.gbe11c130af-goog



[PATCH 1/2] KVM: x86/mmu: Wrap kvm_mmu_zap_all_fast TDP MMU code in ifdefs

2021-04-19 Thread Ben Gardon
The TDP MMU code in kvm_mmu_zap_all_fast is only needed with
CONFIG_X86_64 and creates a build error without that setting. Since the
TDP MMU can only be enabled with CONFIG_X86_64, wrap those code blocks
in ifdefs.

Fixes: 1336c692abad ("KVM: x86/mmu: Fast invalidation for TDP MMU")

Reported-by: kernel test robot 
Signed-off-by: Ben Gardon 
---
 arch/x86/kvm/mmu/mmu.c | 6 ++
 1 file changed, 6 insertions(+)

diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 0f311c9bf9c6..3ae59c8e129b 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -5407,7 +5407,9 @@ static void kvm_zap_obsolete_pages(struct kvm *kvm)
  */
 static void kvm_mmu_zap_all_fast(struct kvm *kvm)
 {
+#ifdef CONFIG_X86_64
struct kvm_mmu_page *root;
+#endif /* CONFIG_X86_64 */
 
lockdep_assert_held(&kvm->slots_lock);
 
@@ -5424,6 +5426,7 @@ static void kvm_mmu_zap_all_fast(struct kvm *kvm)
kvm->arch.mmu_valid_gen = kvm->arch.mmu_valid_gen ? 0 : 1;
 
 
+#ifdef CONFIG_X86_64
if (is_tdp_mmu_enabled(kvm)) {
/*
 * Mark each TDP MMU root as invalid so that other threads
@@ -5456,6 +5459,7 @@ static void kvm_mmu_zap_all_fast(struct kvm *kvm)
if (refcount_inc_not_zero(&root->tdp_mmu_root_count))
root->role.invalid = true;
}
+#endif /* CONFIG_X86_64 */
 
/*
 * Notify all vcpus to reload its shadow page table and flush TLB.
@@ -5471,11 +5475,13 @@ static void kvm_mmu_zap_all_fast(struct kvm *kvm)
 
write_unlock(&kvm->mmu_lock);
 
+#ifdef CONFIG_X86_64
if (is_tdp_mmu_enabled(kvm)) {
read_lock(&kvm->mmu_lock);
kvm_tdp_mmu_zap_invalidated_roots(kvm);
read_unlock(&kvm->mmu_lock);
}
+#endif /* CONFIG_X86_64 */
 }
 
 static bool kvm_has_zapped_obsolete_pages(struct kvm *kvm)
-- 
2.31.1.368.gbe11c130af-goog



Re: [RFC PATCH] KVM: x86: Support write protect huge pages lazily

2021-04-13 Thread Ben Gardon
On Tue, Apr 13, 2021 at 2:39 AM Keqian Zhu  wrote:
>
>
>
> On 2021/4/13 1:19, Ben Gardon wrote:
> > On Tue, Apr 6, 2021 at 4:42 PM Sean Christopherson  
> > wrote:
> >>
> >> +Ben
> >>
> >> On Tue, Apr 06, 2021, Keqian Zhu wrote:
> >>> Hi Paolo,
> >>>
> >>> I plan to rework this patch and do full test. What do you think about 
> >>> this idea
> >>> (enable dirty logging for huge pages lazily)?
> >>
> >> Ben, don't you also have something similar (or maybe the exact opposite?) 
> >> in the
> >> hopper?  This sounds very familiar, but I can't quite connect the dots 
> >> that are
> >> floating around my head...
> >
> > Sorry for the late response, I was out of office last week.
> Never mind, Sean has told me. :)
>
> >
> > Yes, we have two relevant features I'd like to reconcile somehow:
> > 1.) Large page shattering - Instead of clearing a large TDP mapping,
> > flushing the TLBs, then replacing it with an empty TDP page table, go
> > straight from the large mapping to a fully pre-populated table. This
> > is slightly slower because the table needs to be pre-populated, but it
> > saves many vCPU page faults.
> > 2.) Eager page splitting - split all large mappings down to 4k when
> > enabling dirty logging, using large page shattering. This makes
> > enabling dirty logging much slower, but speeds up the first round (and
> > later rounds) of gathering / clearing the dirty log and reduces the
> > number of vCPU page faults. We've preferred to do this when enabling
> > dirty logging because it's a little less perf-sensitive than the later
> > passes where latency and convergence are critical.
> OK, I see. I think the locking is an important part, so one question is
> whether the shattering process is designed to run under the lock (i.e.,
> protecting the mapping) or lock-less?
>
> If it's locked, a vCPU thread may be blocked for a long time (for arm, there
> is one mmu_lock per VM). If it's lock-less, how can we ensure the mapping
> stays synchronized?

The TDP MMU for x86 could do it under the MMU read lock, but the
legacy / shadow x86 MMU and other architectures would need the whole
MMU lock.
While we do increase the time required to address a large SPTE, we can
completely avoid the vCPU needing the MMU lock on an access to that
SPTE as the translation goes straight from a large, writable SPTE to
a 4k spte with either the d bit cleared or write protected. If it's
write protected, the fault can (at least on x86) be resolved without
the MMU lock.
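
To make the shattering step concrete, here is a rough, non-KVM sketch (the
SPTE encoding, table layout and helpers below are invented for illustration
only): build the fully populated child table first, then install it with a
single atomic exchange.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdlib.h>

/* Toy SPTE encoding, for illustration only. */
#define SPTE_PRESENT    (1ULL << 0)
#define SPTE_WRITABLE   (1ULL << 1)
#define SPTE_LARGE      (1ULL << 7)
#define PTES_PER_TABLE  512

struct child_table { _Atomic uint64_t sptes[PTES_PER_TABLE]; };

/* Pre-populate a child table whose 4k entries start write protected. */
static struct child_table *build_child(uint64_t base_pfn)
{
        struct child_table *t = calloc(1, sizeof(*t));

        for (int i = 0; i < PTES_PER_TABLE; i++)
                atomic_store(&t->sptes[i],
                             ((base_pfn + i) << 12) | SPTE_PRESENT);
        return t;
}

/*
 * Swap a large, writable SPTE for a pointer to the pre-populated table in
 * one atomic step: a concurrent faulting thread sees either the old 2M
 * mapping or the complete 4k table, never an empty one.
 */
static bool shatter_large_spte(_Atomic uint64_t *large_sptep)
{
        uint64_t old = atomic_load(large_sptep);

        if (!(old & SPTE_LARGE) || !(old & SPTE_WRITABLE))
                return false;

        struct child_table *child = build_child((old >> 12) & ~0x1ffULL);
        uint64_t new_spte = (uint64_t)(uintptr_t)child | SPTE_PRESENT;

        if (!atomic_compare_exchange_strong(large_sptep, &old, new_spte)) {
                free(child);    /* lost the race with another thread */
                return false;
        }
        return true;
}

The real series would of course have to go through the TDP MMU's
tdp_mmu_set_spte_atomic() machinery and the dirty-log bookkeeping; the point
here is only the pre-populate-then-cmpxchg ordering.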

When I'm able to put together a large page shattering series, I'll do
some performance analysis and see how it changes things, but that part
is sort of orthogonal to this change. The more I think about it, the
better the init-all-set approach for large pages sounds, compared to
eager splitting. I'm definitely in support of this patch and am happy
to help review when you send out the v2 with TDP MMU support and such.

>
> >
> > Large page shattering can happen in the NPT page fault handler or the
> > thread enabling dirty logging / clearing the dirty log, so it's
> > more-or-less orthogonal to this patch.
> >
> > Eager page splitting on the other hand takes the opposite approach to
> > this patch, frontloading as much of the work to enable dirty logging
> > as possible. Which approach is better is going to depend a lot on the
> > guest workload, your live migration constraints, and how the
> > user-space hypervisor makes use of KVM's growing number of dirty
> > logging options. In our case, the time to migrate a VM is usually less
> > of a concern than the performance degradation the guest experiences,
> > so we want to do everything we can to minimize vCPU exits and exit
> > latency.
> Yes, make sense to me.
>
> >
> > I think this is a reasonable change in principle if we're not write
> > protecting 4k pages already, but it's hard to really validate all the
> > performance implications. With this change we'd move pretty much all
> > the work to the first pass of clearing the dirty log, which is
> > probably an improvement since it's much more granular. The downside is
> Yes, at least splitting large pages lazily is better than the current logic.
>
> > that we do more work when we'd really like to be converging the dirty
> > set as opposed to earlier when we know all pages are dirty anyway.
> I think the dirty collection procedure is not affected; am I missing something?

Oh yeah, good point. Since the splitting of large SPTEs is happening
in the vCPU t

Re: [PATCH v2 09/13] KVM: x86/mmu: Allow zap gfn range to operate under the mmu read lock

2021-04-12 Thread Ben Gardon
On Fri, Apr 2, 2021 at 12:53 AM Paolo Bonzini  wrote:
>
> On 02/04/21 01:37, Ben Gardon wrote:
> > +void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root,
> > +   bool shared)
> >   {
> >   gfn_t max_gfn = 1ULL << (shadow_phys_bits - PAGE_SHIFT);
> >
> > - lockdep_assert_held_write(&kvm->mmu_lock);
> > + kvm_lockdep_assert_mmu_lock_held(kvm, shared);
> >
> >   if (!refcount_dec_and_test(&root->tdp_mmu_root_count))
> >   return;
> > @@ -81,7 +92,7 @@ void kvm_tdp_mmu_put_root(struct kvm *kvm, struct 
> > kvm_mmu_page *root)
> >   list_del_rcu(&root->link);
> >   spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
> >
> > - zap_gfn_range(kvm, root, 0, max_gfn, false, false);
> > + zap_gfn_range(kvm, root, 0, max_gfn, false, false, shared);
> >
> >   call_rcu(&root->rcu_head, tdp_mmu_free_sp_rcu_callback);
>
> Instead of patch 13, would it make sense to delay the zap_gfn_range and
> call_rcu to a work item (either unconditionally, or only if
> shared==false)?  Then the zap_gfn_range would be able to yield and take
> the mmu_lock for read, similar to kvm_tdp_mmu_zap_invalidated_roots.
>
> If done unconditionally, this would also allow removing the "shared"
> argument to kvm_tdp_mmu_put_root, tdp_mmu_next_root and
> for_each_tdp_mmu_root_yield_safe, so I would place that change before
> this patch.
>
> Paolo
>

I tried that and it created problems. I believe the issue was that on
VM teardown memslots would be freed and the memory reallocated before
the root was torn down, resulting in a use-after-free from
mark_pfn_dirty. Perhaps this could be resolved by forcing memslot
changes to wait until that work item was processed before returning. I
can look into it but I suspect there will be a lot of "gotchas"
involved.
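
For what it's worth, the work-item variant might look roughly like the
sketch below (with the "shared" argument dropped, as suggested). It assumes
two hypothetical fields in struct kvm_mmu_page, tdp_mmu_zap_work and a
tdp_mmu_zap_kvm back pointer, and, as noted above, does nothing by itself to
solve the memslot-lifetime race on VM teardown:

/* Hypothetical: defer root teardown to a workqueue item. */
static void tdp_mmu_zap_root_work(struct work_struct *work)
{
	struct kvm_mmu_page *root = container_of(work, struct kvm_mmu_page,
						 tdp_mmu_zap_work);
	struct kvm *kvm = root->tdp_mmu_zap_kvm;	/* hypothetical back pointer */
	gfn_t max_gfn = 1ULL << (shadow_phys_bits - PAGE_SHIFT);

	/* Zapping under the read lock lets the worker yield to vCPUs. */
	read_lock(&kvm->mmu_lock);
	zap_gfn_range(kvm, root, 0, max_gfn, true, false, true);
	read_unlock(&kvm->mmu_lock);

	call_rcu(&root->rcu_head, tdp_mmu_free_sp_rcu_callback);
}

void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root)
{
	if (!refcount_dec_and_test(&root->tdp_mmu_root_count))
		return;

	spin_lock(&kvm->arch.tdp_mmu_pages_lock);
	list_del_rcu(&root->link);
	spin_unlock(&kvm->arch.tdp_mmu_pages_lock);

	root->tdp_mmu_zap_kvm = kvm;
	INIT_WORK(&root->tdp_mmu_zap_work, tdp_mmu_zap_root_work);
	schedule_work(&root->tdp_mmu_zap_work);
}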


Re: [RFC PATCH] KVM: x86: Support write protect huge pages lazily

2021-04-12 Thread Ben Gardon
On Tue, Apr 6, 2021 at 4:42 PM Sean Christopherson  wrote:
>
> +Ben
>
> On Tue, Apr 06, 2021, Keqian Zhu wrote:
> > Hi Paolo,
> >
> > I plan to rework this patch and do full test. What do you think about this 
> > idea
> > (enable dirty logging for huge pages lazily)?
>
> Ben, don't you also have something similar (or maybe the exact opposite?) in 
> the
> hopper?  This sounds very familiar, but I can't quite connect the dots that 
> are
> floating around my head...

Sorry for the late response, I was out of office last week.

Yes, we have two relevant features I'd like to reconcile somehow:
1.) Large page shattering - Instead of clearing a large TDP mapping,
flushing the TLBs, then replacing it with an empty TDP page table, go
straight from the large mapping to a fully pre-populated table. This
is slightly slower because the table needs to be pre-populated, but it
saves many vCPU page faults.
2.) Eager page splitting - split all large mappings down to 4k when
enabling dirty logging, using large page shattering. This makes
enabling dirty logging much slower, but speeds up the first round (and
later rounds) of gathering / clearing the dirty log and reduces the
number of vCPU page faults. We've preferred to do this when enabling
dirty logging because it's a little less perf-sensitive than the later
passes where latency and convergence are critical.

Large page shattering can happen in the NPT page fault handler or the
thread enabling dirty logging / clearing the dirty log, so it's
more-or-less orthogonal to this patch.

Eager page splitting on the other hand takes the opposite approach to
this patch, frontloading as much of the work to enable dirty logging
as possible. Which approach is better is going to depend a lot on the
guest workload, your live migration constraints, and how the
user-space hypervisor makes use of KVM's growing number of dirty
logging options. In our case, the time to migrate a VM is usually less
of a concern than the performance degradation the guest experiences,
so we want to do everything we can to minimize vCPU exits and exit
latency.

I think this is a reasonable change in principle if we're not write
protecting 4k pages already, but it's hard to really validate all the
performance implications. With this change we'd move pretty much all
the work to the first pass of clearing the dirty log, which is
probably an improvement since it's much more granular. The downside is
that we do more work when we'd really like to be converging the dirty
set as opposed to earlier when we know all pages are dirty anyway.
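
For context, the init-all-set behavior being discussed here is driven from
userspace roughly as follows (a minimal sketch of the documented
KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2 interface; error handling and memslot
setup are omitted, and first_page must be 64-aligned):

#include <linux/kvm.h>
#include <sys/ioctl.h>

static void enable_init_all_set(int vm_fd)
{
	struct kvm_enable_cap cap = {
		.cap = KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2,
		.args[0] = KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE |
			   KVM_DIRTY_LOG_INITIALLY_SET,
	};

	/*
	 * With INITIALLY_SET, all dirty bits start set and pages are only
	 * write protected lazily, when userspace clears their bits below.
	 */
	ioctl(vm_fd, KVM_ENABLE_CAP, &cap);
}

static void clear_dirty_chunk(int vm_fd, __u32 slot, __u64 first_page,
			      __u32 num_pages, void *bitmap)
{
	struct kvm_clear_dirty_log clear = {
		.slot = slot,
		.num_pages = num_pages,
		.first_page = first_page,
		.dirty_bitmap = bitmap,
	};

	/*
	 * This ioctl is what eventually reaches
	 * kvm_arch_mmu_enable_log_dirty_pt_masked(), where the RFC adds
	 * the lazy 2M write protection.
	 */
	ioctl(vm_fd, KVM_CLEAR_DIRTY_LOG, &clear);
}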

>
> > PS: As dirty log of TDP MMU has been supported, I should add more code.
> >
> > On 2020/8/28 16:11, Keqian Zhu wrote:
> > > Currently, when enabling dirty logging with init-all-set, we just
> > > write protect huge pages and leave normal pages untouched, so that
> > > we can enable dirty logging for these pages lazily.
> > >
> > > It seems that enabling dirty logging lazily for huge pages is feasible
> > > too, which not only reduces the time to start dirty logging, but also
> > > greatly reduces the side effects on the guest when there is a high dirty rate.

The side effect on the guest would also be greatly reduced with large
page shattering above.

> > >
> > > (These codes are not tested, for RFC purpose :-) ).
> > >
> > > Signed-off-by: Keqian Zhu 
> > > ---
> > >  arch/x86/include/asm/kvm_host.h |  3 +-
> > >  arch/x86/kvm/mmu/mmu.c  | 65 ++---
> > >  arch/x86/kvm/vmx/vmx.c  |  3 +-
> > >  arch/x86/kvm/x86.c  | 22 +--
> > >  4 files changed, 62 insertions(+), 31 deletions(-)
> > >
> > > diff --git a/arch/x86/include/asm/kvm_host.h 
> > > b/arch/x86/include/asm/kvm_host.h
> > > index 5303dbc5c9bc..201a068cf43d 100644
> > > --- a/arch/x86/include/asm/kvm_host.h
> > > +++ b/arch/x86/include/asm/kvm_host.h
> > > @@ -1296,8 +1296,7 @@ void kvm_mmu_set_mask_ptes(u64 user_mask, u64 
> > > accessed_mask,
> > >
> > >  void kvm_mmu_reset_context(struct kvm_vcpu *vcpu);
> > >  void kvm_mmu_slot_remove_write_access(struct kvm *kvm,
> > > - struct kvm_memory_slot *memslot,
> > > - int start_level);
> > > + struct kvm_memory_slot *memslot);
> > >  void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm,
> > >const struct kvm_memory_slot *memslot);
> > >  void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm,
> > > diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
> > > index 43fdb0c12a5d..4b7d577de6cd 100644
> > > --- a/arch/x86/kvm/mmu/mmu.c
> > > +++ b/arch/x86/kvm/mmu/mmu.c
> > > @@ -1625,14 +1625,45 @@ static bool __rmap_set_dirty(struct kvm *kvm, 
> > > struct kvm_rmap_head *rmap_head)
> > >  }
> > >
> > >  /**
> > > - * kvm_mmu_write_protect_pt_masked - write protect selected PT level 
> > > pages
> > > + * kvm_mmu_write_protect_largepage_masked - write protect selected 
> > > largepages
> > >  

[PATCH v2 12/13] KVM: x86/mmu: Fast invalidation for TDP MMU

2021-04-01 Thread Ben Gardon
Provide a real mechanism for fast invalidation by marking roots as
invalid so that their reference count will quickly fall to zero
and they will be torn down.

One negative side effect of this approach is that a vCPU thread will
likely drop the last reference to a root and be saddled with the work of
tearing down an entire paging structure. This issue will be resolved in
a later commit.

Signed-off-by: Ben Gardon 
---

Changelog
v2:
--  open code root invalidation

 arch/x86/kvm/mmu/mmu.c | 26 +++---
 arch/x86/kvm/mmu/tdp_mmu.h |  3 +++
 2 files changed, 26 insertions(+), 3 deletions(-)

diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index a3837f8ad4ed..ba0c65076200 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -5418,6 +5418,8 @@ static void kvm_zap_obsolete_pages(struct kvm *kvm)
  */
 static void kvm_mmu_zap_all_fast(struct kvm *kvm)
 {
+   struct kvm_mmu_page *root;
+
lockdep_assert_held(&kvm->slots_lock);
 
write_lock(&kvm->mmu_lock);
@@ -5432,6 +5434,27 @@ static void kvm_mmu_zap_all_fast(struct kvm *kvm)
 */
kvm->arch.mmu_valid_gen = kvm->arch.mmu_valid_gen ? 0 : 1;
 
+
+   if (is_tdp_mmu_enabled(kvm)) {
+   /*
+* Mark each TDP MMU root as invalid so that other threads
+* will drop their references and allow the root count to
+* go to 0.
+*
+* This has essentially the same effect for the TDP MMU
+* as updating mmu_valid_gen above does for the shadow
+* MMU.
+*
+* In order to ensure all threads see this change when
+* handling the MMU reload signal, this must happen in the
+* same critical section as kvm_reload_remote_mmus, and
+* before kvm_zap_obsolete_pages as kvm_zap_obsolete_pages
+* could drop the MMU lock and yield.
+*/
+   list_for_each_entry(root, &kvm->arch.tdp_mmu_roots, link)
+   root->role.invalid = true;
+   }
+
/*
 * Notify all vcpus to reload its shadow page table and flush TLB.
 * Then all vcpus will switch to new shadow page table with the new
@@ -5444,9 +5467,6 @@ static void kvm_mmu_zap_all_fast(struct kvm *kvm)
 
kvm_zap_obsolete_pages(kvm);
 
-   if (is_tdp_mmu_enabled(kvm))
-   kvm_tdp_mmu_zap_all(kvm);
-
write_unlock(&kvm->mmu_lock);
 }
 
diff --git a/arch/x86/kvm/mmu/tdp_mmu.h b/arch/x86/kvm/mmu/tdp_mmu.h
index d703c6d6024a..8fa3e7421a93 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.h
+++ b/arch/x86/kvm/mmu/tdp_mmu.h
@@ -10,6 +10,9 @@ hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu);
 __must_check static inline bool kvm_tdp_mmu_get_root(struct kvm *kvm,
 struct kvm_mmu_page *root)
 {
+   if (root->role.invalid)
+   return false;
+
return refcount_inc_not_zero(&root->tdp_mmu_root_count);
 }
 
-- 
2.31.0.208.g409f899ff0-goog



[PATCH v2 13/13] KVM: x86/mmu: Tear down roots in fast invalidation thread

2021-04-01 Thread Ben Gardon
To avoid saddling a vCPU thread with the work of tearing down an entire
paging structure, take a reference on each root before they become
obsolete, so that the thread initiating the fast invalidation can tear
down the paging structure and (most likely) release the last reference.
As a bonus, this teardown can happen under the MMU lock in read mode so
as not to block the progress of vCPU threads.

Signed-off-by: Ben Gardon 
---

Changelog
v2:
--  rename kvm_tdp_mmu_zap_all_fast to
kvm_tdp_mmu_zap_invalidated_roots

 arch/x86/kvm/mmu/mmu.c | 21 +++-
 arch/x86/kvm/mmu/tdp_mmu.c | 68 ++
 arch/x86/kvm/mmu/tdp_mmu.h |  1 +
 3 files changed, 89 insertions(+), 1 deletion(-)

diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index ba0c65076200..5f2064ee7220 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -5441,6 +5441,18 @@ static void kvm_mmu_zap_all_fast(struct kvm *kvm)
 * will drop their references and allow the root count to
 * go to 0.
 *
+* Also take a reference on all roots so that this thread
+* can do the bulk of the work required to free the roots
+* once they are invalidated. Without this reference, a
+* vCPU thread might drop the last reference to a root and
+* get stuck with tearing down the entire paging structure.
+*
+* Roots which have a zero refcount should be skipped as
+* they're already being torn down.
+* Already invalid roots should be referenced again so that
+* they aren't freed before kvm_tdp_mmu_zap_all_fast is
+* done with them.
+*
 * This has essentially the same effect for the TDP MMU
 * as updating mmu_valid_gen above does for the shadow
 * MMU.
@@ -5452,7 +5464,8 @@ static void kvm_mmu_zap_all_fast(struct kvm *kvm)
 * could drop the MMU lock and yield.
 */
list_for_each_entry(root, &kvm->arch.tdp_mmu_roots, link)
-   root->role.invalid = true;
+   if (refcount_inc_not_zero(&root->tdp_mmu_root_count))
+   root->role.invalid = true;
}
 
/*
@@ -5468,6 +5481,12 @@ static void kvm_mmu_zap_all_fast(struct kvm *kvm)
kvm_zap_obsolete_pages(kvm);
 
write_unlock(&kvm->mmu_lock);
+
+   if (is_tdp_mmu_enabled(kvm)) {
+   read_lock(&kvm->mmu_lock);
+   kvm_tdp_mmu_zap_invalidated_roots(kvm);
+   read_unlock(&kvm->mmu_lock);
+   }
 }
 
 static bool kvm_has_zapped_obsolete_pages(struct kvm *kvm)
diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index 501722a524a7..0adcfa5750f6 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -798,6 +798,74 @@ void kvm_tdp_mmu_zap_all(struct kvm *kvm)
kvm_flush_remote_tlbs(kvm);
 }
 
+static struct kvm_mmu_page *next_invalidated_root(struct kvm *kvm,
+ struct kvm_mmu_page 
*prev_root)
+{
+   struct kvm_mmu_page *next_root;
+
+   if (prev_root)
+   next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots,
+ &prev_root->link,
+ typeof(*prev_root), link);
+   else
+   next_root = list_first_or_null_rcu(&kvm->arch.tdp_mmu_roots,
+  typeof(*next_root), link);
+
+   while (next_root && !(next_root->role.invalid &&
+ refcount_read(&next_root->tdp_mmu_root_count)))
+   next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots,
+ &next_root->link,
+ typeof(*next_root), link);
+
+   return next_root;
+}
+
+/*
+ * Since kvm_mmu_zap_all_fast has acquired a reference to each
+ * invalidated root, they will not be freed until this function drops the
+ * reference. Before dropping that reference, tear down the paging
+ * structure so that whichever thread does drop the last reference
+ * only has to do a trivial amount of work. Since the roots are invalid,
+ * no new SPTEs should be created under them.
+ */
+void kvm_tdp_mmu_zap_invalidated_roots(struct kvm *kvm)
+{
+   gfn_t max_gfn = 1ULL << (shadow_phys_bits - PAGE_SHIFT);
+   struct kvm_mmu_page *next_root;
+   struct kvm_mmu_page *root;
+   bool flush = false;
+
+   lockdep_assert_held_read(&kvm->mmu_lock);
+
+   rcu_read_lock();
+
+   root = next_invalidated_root(kvm, NU

[PATCH v2 11/13] KVM: x86/mmu: Allow enabling / disabling dirty logging under MMU read lock

2021-04-01 Thread Ben Gardon
To reduce lock contention and interference with page fault handlers,
allow the TDP MMU functions which enable and disable dirty logging
to operate under the MMU read lock.

Signed-off-by: Ben Gardon 
---
 arch/x86/kvm/mmu/mmu.c | 16 +++---
 arch/x86/kvm/mmu/tdp_mmu.c | 62 ++
 2 files changed, 61 insertions(+), 17 deletions(-)

diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 5939813e3043..a3837f8ad4ed 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -5543,10 +5543,14 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm,
write_lock(&kvm->mmu_lock);
flush = slot_handle_level(kvm, memslot, slot_rmap_write_protect,
start_level, KVM_MAX_HUGEPAGE_LEVEL, false);
-   if (is_tdp_mmu_enabled(kvm))
-   flush |= kvm_tdp_mmu_wrprot_slot(kvm, memslot, PG_LEVEL_4K);
write_unlock(&kvm->mmu_lock);
 
+   if (is_tdp_mmu_enabled(kvm)) {
+   read_lock(&kvm->mmu_lock);
+   flush |= kvm_tdp_mmu_wrprot_slot(kvm, memslot, PG_LEVEL_4K);
+   read_unlock(&kvm->mmu_lock);
+   }
+
/*
 * We can flush all the TLBs out of the mmu lock without TLB
 * corruption since we just change the spte from writable to
@@ -5649,10 +5653,14 @@ void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm,
 
write_lock(&kvm->mmu_lock);
flush = slot_handle_leaf(kvm, memslot, __rmap_clear_dirty, false);
-   if (is_tdp_mmu_enabled(kvm))
-   flush |= kvm_tdp_mmu_clear_dirty_slot(kvm, memslot);
write_unlock(&kvm->mmu_lock);
 
+   if (is_tdp_mmu_enabled(kvm)) {
+   read_lock(&kvm->mmu_lock);
+   flush |= kvm_tdp_mmu_clear_dirty_slot(kvm, memslot);
+   read_unlock(&kvm->mmu_lock);
+   }
+
/*
 * It's also safe to flush TLBs out of mmu lock here as currently this
 * function is only used for dirty logging, in which case flushing TLB
diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index 0e6ffa04e5e1..501722a524a7 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -496,8 +496,9 @@ static void handle_changed_spte(struct kvm *kvm, int as_id, 
gfn_t gfn,
 }
 
 /*
- * tdp_mmu_set_spte_atomic - Set a TDP MMU SPTE atomically and handle the
- * associated bookkeeping
+ * tdp_mmu_set_spte_atomic_no_dirty_log - Set a TDP MMU SPTE atomically
+ * and handle the associated bookkeeping, but do not mark the page dirty
+ * in KVM's dirty bitmaps.
  *
  * @kvm: kvm instance
  * @iter: a tdp_iter instance currently on the SPTE that should be set
@@ -505,9 +506,9 @@ static void handle_changed_spte(struct kvm *kvm, int as_id, 
gfn_t gfn,
  * Returns: true if the SPTE was set, false if it was not. If false is 
returned,
  * this function will have no side-effects.
  */
-static inline bool tdp_mmu_set_spte_atomic(struct kvm *kvm,
-  struct tdp_iter *iter,
-  u64 new_spte)
+static inline bool tdp_mmu_set_spte_atomic_no_dirty_log(struct kvm *kvm,
+   struct tdp_iter *iter,
+   u64 new_spte)
 {
lockdep_assert_held_read(&kvm->mmu_lock);
 
@@ -522,12 +523,25 @@ static inline bool tdp_mmu_set_spte_atomic(struct kvm 
*kvm,
  new_spte) != iter->old_spte)
return false;
 
-   handle_changed_spte(kvm, iter->as_id, iter->gfn, iter->old_spte,
-   new_spte, iter->level, true);
+   __handle_changed_spte(kvm, iter->as_id, iter->gfn, iter->old_spte,
+ new_spte, iter->level, true);
+   handle_changed_spte_acc_track(iter->old_spte, new_spte, iter->level);
 
return true;
 }
 
+static inline bool tdp_mmu_set_spte_atomic(struct kvm *kvm,
+  struct tdp_iter *iter,
+  u64 new_spte)
+{
+   if (!tdp_mmu_set_spte_atomic_no_dirty_log(kvm, iter, new_spte))
+   return false;
+
+   handle_changed_spte_dirty_log(kvm, iter->as_id, iter->gfn,
+ iter->old_spte, new_spte, iter->level);
+   return true;
+}
+
 static inline bool tdp_mmu_zap_spte_atomic(struct kvm *kvm,
   struct tdp_iter *iter)
 {
@@ -1148,7 +1162,8 @@ static bool wrprot_gfn_range(struct kvm *kvm, struct 
kvm_mmu_page *root,
 
for_each_tdp_pte_min_level(iter, root->spt, root->role.level,
   min_level, start, end) {
-   if (tdp_mmu_iter_cond_resched(kvm, &iter, false, false))
+retry:
+   if (tdp_mmu_iter_cond_resched(kvm, 

[PATCH v2 09/13] KVM: x86/mmu: Allow zap gfn range to operate under the mmu read lock

2021-04-01 Thread Ben Gardon
To reduce lock contention and interference with page fault handlers,
allow the TDP MMU function to zap a GFN range to operate under the MMU
read lock.

Signed-off-by: Ben Gardon 
---
 arch/x86/kvm/mmu/mmu.c |  22 +---
 arch/x86/kvm/mmu/tdp_mmu.c | 111 ++---
 arch/x86/kvm/mmu/tdp_mmu.h |  14 +++--
 3 files changed, 102 insertions(+), 45 deletions(-)

diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 47d996a8074f..d03a7a8b7ea2 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -3154,7 +3154,7 @@ static void mmu_free_root_page(struct kvm *kvm, hpa_t 
*root_hpa,
sp = to_shadow_page(*root_hpa & PT64_BASE_ADDR_MASK);
 
if (is_tdp_mmu_page(sp))
-   kvm_tdp_mmu_put_root(kvm, sp);
+   kvm_tdp_mmu_put_root(kvm, sp, false);
else if (!--sp->root_count && sp->role.invalid)
kvm_mmu_prepare_zap_page(kvm, sp, invalid_list);
 
@@ -5507,16 +5507,24 @@ void kvm_zap_gfn_range(struct kvm *kvm, gfn_t 
gfn_start, gfn_t gfn_end)
}
}
 
-   if (is_tdp_mmu_enabled(kvm)) {
-   for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++)
-   flush = kvm_tdp_mmu_zap_gfn_range(kvm, i, gfn_start,
- gfn_end, flush);
-   }
-
if (flush)
kvm_flush_remote_tlbs_with_address(kvm, gfn_start, gfn_end);
 
write_unlock(&kvm->mmu_lock);
+
+   if (is_tdp_mmu_enabled(kvm)) {
+   flush = false;
+
+   read_lock(&kvm->mmu_lock);
+   for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++)
+   flush = kvm_tdp_mmu_zap_gfn_range(kvm, i, gfn_start,
+ gfn_end, flush, true);
+   if (flush)
+   kvm_flush_remote_tlbs_with_address(kvm, gfn_start,
+  gfn_end);
+
+   read_unlock(&kvm->mmu_lock);
+   }
 }
 
 static bool slot_rmap_write_protect(struct kvm *kvm,
diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index c1d7f6b86870..6917403484ce 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -27,6 +27,15 @@ void kvm_mmu_init_tdp_mmu(struct kvm *kvm)
INIT_LIST_HEAD(&kvm->arch.tdp_mmu_pages);
 }
 
+static __always_inline void kvm_lockdep_assert_mmu_lock_held(struct kvm *kvm,
+bool shared)
+{
+   if (shared)
+   lockdep_assert_held_read(&kvm->mmu_lock);
+   else
+   lockdep_assert_held_write(&kvm->mmu_lock);
+}
+
 void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm)
 {
if (!kvm->arch.tdp_mmu_enabled)
@@ -42,7 +51,8 @@ void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm)
 }
 
 static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
- gfn_t start, gfn_t end, bool can_yield, bool flush);
+ gfn_t start, gfn_t end, bool can_yield, bool flush,
+ bool shared);
 
 static void tdp_mmu_free_sp(struct kvm_mmu_page *sp)
 {
@@ -66,11 +76,12 @@ static void tdp_mmu_free_sp_rcu_callback(struct rcu_head 
*head)
tdp_mmu_free_sp(sp);
 }
 
-void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root)
+void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root,
+ bool shared)
 {
gfn_t max_gfn = 1ULL << (shadow_phys_bits - PAGE_SHIFT);
 
-   lockdep_assert_held_write(&kvm->mmu_lock);
+   kvm_lockdep_assert_mmu_lock_held(kvm, shared);
 
if (!refcount_dec_and_test(&root->tdp_mmu_root_count))
return;
@@ -81,7 +92,7 @@ void kvm_tdp_mmu_put_root(struct kvm *kvm, struct 
kvm_mmu_page *root)
list_del_rcu(&root->link);
spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
 
-   zap_gfn_range(kvm, root, 0, max_gfn, false, false);
+   zap_gfn_range(kvm, root, 0, max_gfn, false, false, shared);
 
call_rcu(&root->rcu_head, tdp_mmu_free_sp_rcu_callback);
 }
@@ -94,11 +105,11 @@ void kvm_tdp_mmu_put_root(struct kvm *kvm, struct 
kvm_mmu_page *root)
  * function will return NULL.
  */
 static struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm,
- struct kvm_mmu_page *prev_root)
+ struct kvm_mmu_page *prev_root,
+ bool shared)
 {
struct kvm_mmu_page *next_root;
 
-   lockdep_assert_held_write(&kvm->mmu_lock);
 
rcu_read_lock();
 
@@ -117,7 +128,7 @@ static struct kvm_mmu_page *tdp_mmu_next_root(struct kvm 
*kvm,
rcu_read_unlock();
 
if (prev_root)
-   kvm_tdp_mmu_put_root(kvm, prev_root);
+  

[PATCH v2 10/13] KVM: x86/mmu: Allow zapping collapsible SPTEs to use MMU read lock

2021-04-01 Thread Ben Gardon
To speed the process of disabling dirty logging, change the TDP MMU
function which zaps collapsible SPTEs to run under the MMU read lock.

Signed-off-by: Ben Gardon 
---
 arch/x86/kvm/mmu/mmu.c | 14 ++
 arch/x86/kvm/mmu/tdp_mmu.c | 17 +
 2 files changed, 23 insertions(+), 8 deletions(-)

diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index d03a7a8b7ea2..5939813e3043 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -5612,13 +5612,19 @@ void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm,
write_lock(&kvm->mmu_lock);
flush = slot_handle_leaf(kvm, slot, kvm_mmu_zap_collapsible_spte, true);
 
-   if (is_tdp_mmu_enabled(kvm))
-   flush = kvm_tdp_mmu_zap_collapsible_sptes(kvm, slot, flush);
-
if (flush)
kvm_arch_flush_remote_tlbs_memslot(kvm, slot);
-
write_unlock(&kvm->mmu_lock);
+
+   if (is_tdp_mmu_enabled(kvm)) {
+   flush = false;
+
+   read_lock(&kvm->mmu_lock);
+   flush = kvm_tdp_mmu_zap_collapsible_sptes(kvm, slot, flush);
+   if (flush)
+   kvm_arch_flush_remote_tlbs_memslot(kvm, slot);
+   read_unlock(&kvm->mmu_lock);
+   }
 }
 
 void kvm_arch_flush_remote_tlbs_memslot(struct kvm *kvm,
diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index 6917403484ce..0e6ffa04e5e1 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -1323,7 +1323,8 @@ static bool zap_collapsible_spte_range(struct kvm *kvm,
rcu_read_lock();
 
tdp_root_for_each_pte(iter, root, start, end) {
-   if (tdp_mmu_iter_cond_resched(kvm, &iter, flush, false)) {
+retry:
+   if (tdp_mmu_iter_cond_resched(kvm, &iter, flush, true)) {
flush = false;
continue;
}
@@ -1338,8 +1339,14 @@ static bool zap_collapsible_spte_range(struct kvm *kvm,
pfn, PG_LEVEL_NUM))
continue;
 
-   tdp_mmu_set_spte(kvm, &iter, 0);
-
+   if (!tdp_mmu_zap_spte_atomic(kvm, &iter)) {
+   /*
+* The iter must explicitly re-read the SPTE because
+* the atomic cmpxchg failed.
+*/
+   iter.old_spte = READ_ONCE(*rcu_dereference(iter.sptep));
+   goto retry;
+   }
flush = true;
}
 
@@ -1358,7 +1365,9 @@ bool kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm,
 {
struct kvm_mmu_page *root;
 
-   for_each_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, false)
+   lockdep_assert_held_read(&kvm->mmu_lock);
+
+   for_each_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true)
flush = zap_collapsible_spte_range(kvm, root, slot, flush);
 
return flush;
-- 
2.31.0.208.g409f899ff0-goog



[PATCH v2 07/13] KVM: x86/mmu: handle cmpxchg failure in kvm_tdp_mmu_get_root

2021-04-01 Thread Ben Gardon
To reduce dependence on the MMU write lock, don't rely on the assumption
that the atomic operation in kvm_tdp_mmu_get_root will always succeed.
By not relying on that assumption, threads do not need to hold the MMU
lock in write mode in order to take a reference on a TDP MMU root.

In the root iterator, this change means that some roots might have to be
skipped if they are found to have a zero refcount. This will still never
happen as of this patch, but a future patch will need that flexibility to
make the root iterator safe under the MMU read lock.

Signed-off-by: Ben Gardon 
---
 arch/x86/kvm/mmu/tdp_mmu.c | 11 ++-
 arch/x86/kvm/mmu/tdp_mmu.h | 13 +++--
 2 files changed, 9 insertions(+), 15 deletions(-)

diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index 697ea882a3e4..886bc170f2a5 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -88,10 +88,12 @@ static struct kvm_mmu_page *tdp_mmu_next_root(struct kvm 
*kvm,
next_root = list_first_entry(&kvm->arch.tdp_mmu_roots,
 typeof(*next_root), link);
 
+   while (!list_entry_is_head(next_root, &kvm->arch.tdp_mmu_roots, link) &&
+  !kvm_tdp_mmu_get_root(kvm, next_root))
+   next_root = list_next_entry(next_root, link);
+
if (list_entry_is_head(next_root, &kvm->arch.tdp_mmu_roots, link))
next_root = NULL;
-   else
-   kvm_tdp_mmu_get_root(kvm, next_root);
 
if (prev_root)
kvm_tdp_mmu_put_root(kvm, prev_root);
@@ -161,10 +163,9 @@ hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu)
 
/* Check for an existing root before allocating a new one. */
for_each_tdp_mmu_root(kvm, root, kvm_mmu_role_as_id(role)) {
-   if (root->role.word == role.word) {
-   kvm_tdp_mmu_get_root(kvm, root);
+   if (root->role.word == role.word &&
+   kvm_tdp_mmu_get_root(kvm, root))
goto out;
-   }
}
 
root = alloc_tdp_mmu_page(vcpu, 0, vcpu->arch.mmu->shadow_root_level);
diff --git a/arch/x86/kvm/mmu/tdp_mmu.h b/arch/x86/kvm/mmu/tdp_mmu.h
index 1ec7914ecff9..f0a26214e999 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.h
+++ b/arch/x86/kvm/mmu/tdp_mmu.h
@@ -7,17 +7,10 @@
 
 hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu);
 
-static inline void kvm_tdp_mmu_get_root(struct kvm *kvm,
-   struct kvm_mmu_page *root)
+__must_check static inline bool kvm_tdp_mmu_get_root(struct kvm *kvm,
+struct kvm_mmu_page *root)
 {
-   lockdep_assert_held_write(&kvm->mmu_lock);
-
-   /*
-* This should never fail since roots are removed from the roots
-* list under the MMU write lock when their reference count falls
-* to zero.
-*/
-   refcount_inc_not_zero(&root->tdp_mmu_root_count);
+   return refcount_inc_not_zero(&root->tdp_mmu_root_count);
 }
 
 void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root);
-- 
2.31.0.208.g409f899ff0-goog



[PATCH v2 08/13] KVM: x86/mmu: Protect the tdp_mmu_roots list with RCU

2021-04-01 Thread Ben Gardon
Protect the contents of the TDP MMU roots list with RCU in preparation
for a future patch which will allow the iterator macro to be used under
the MMU lock in read mode.

Signed-off-by: Ben Gardon 
---

Changelog
v2:
--  add lockdep condition for tdp_mmu_pages_lock to for_each_tdp_mmu_root
--  fix problem with unexported lockdep function
--  updated comments in kvm_host.h

 arch/x86/include/asm/kvm_host.h | 21 +++---
 arch/x86/kvm/mmu/tdp_mmu.c  | 69 +++--
 2 files changed, 55 insertions(+), 35 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 99778ac51243..e02e8b8a875b 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1050,25 +1050,36 @@ struct kvm_arch {
bool tdp_mmu_enabled;
 
/*
-* List of struct kvmp_mmu_pages being used as roots.
+* List of struct kvm_mmu_pages being used as roots.
 * All struct kvm_mmu_pages in the list should have
 * tdp_mmu_page set.
-* All struct kvm_mmu_pages in the list should have a positive
-* root_count except when a thread holds the MMU lock and is removing
-* an entry from the list.
+*
+* For reads, this list is protected by:
+*  the MMU lock in read mode + RCU or
+*  the MMU lock in write mode
+*
+* For writes, this list is protected by:
+*  the MMU lock in read mode + the tdp_mmu_pages_lock or
+*  the MMU lock in write mode
+*
+* Roots will remain in the list until their tdp_mmu_root_count
+* drops to zero, at which point the thread that decremented the
+*  count to zero should remove the root from the list and clean
+* it up, freeing the root after an RCU grace period.
 */
struct list_head tdp_mmu_roots;
 
/*
 * List of struct kvmp_mmu_pages not being used as roots.
 * All struct kvm_mmu_pages in the list should have
-* tdp_mmu_page set and a root_count of 0.
+* tdp_mmu_page set and a tdp_mmu_root_count of 0.
 */
struct list_head tdp_mmu_pages;
 
/*
 * Protects accesses to the following fields when the MMU lock
 * is held in read mode:
+*  - tdp_mmu_roots (above)
 *  - tdp_mmu_pages (above)
 *  - the link field of struct kvm_mmu_pages used by the TDP MMU
 *  - lpage_disallowed_mmu_pages
diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index 886bc170f2a5..c1d7f6b86870 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -50,6 +50,22 @@ static void tdp_mmu_free_sp(struct kvm_mmu_page *sp)
kmem_cache_free(mmu_page_header_cache, sp);
 }
 
+/*
+ * This is called through call_rcu in order to free TDP page table memory
+ * safely with respect to other kernel threads that may be operating on
+ * the memory.
+ * By only accessing TDP MMU page table memory in an RCU read critical
+ * section, and freeing it after a grace period, lockless access to that
+ * memory won't use it after it is freed.
+ */
+static void tdp_mmu_free_sp_rcu_callback(struct rcu_head *head)
+{
+   struct kvm_mmu_page *sp = container_of(head, struct kvm_mmu_page,
+  rcu_head);
+
+   tdp_mmu_free_sp(sp);
+}
+
 void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root)
 {
gfn_t max_gfn = 1ULL << (shadow_phys_bits - PAGE_SHIFT);
@@ -61,11 +77,13 @@ void kvm_tdp_mmu_put_root(struct kvm *kvm, struct 
kvm_mmu_page *root)
 
WARN_ON(!root->tdp_mmu_page);
 
-   list_del(&root->link);
+   spin_lock(&kvm->arch.tdp_mmu_pages_lock);
+   list_del_rcu(&root->link);
+   spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
 
zap_gfn_range(kvm, root, 0, max_gfn, false, false);
 
-   tdp_mmu_free_sp(root);
+   call_rcu(&root->rcu_head, tdp_mmu_free_sp_rcu_callback);
 }
 
 /*
@@ -82,18 +100,21 @@ static struct kvm_mmu_page *tdp_mmu_next_root(struct kvm 
*kvm,
 
lockdep_assert_held_write(&kvm->mmu_lock);
 
+   rcu_read_lock();
+
if (prev_root)
-   next_root = list_next_entry(prev_root, link);
+   next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots,
+ &prev_root->link,
+ typeof(*prev_root), link);
else
-   next_root = list_first_entry(&kvm->arch.tdp_mmu_roots,
-typeof(*next_root), link);
+   next_root = list_first_or_null_rcu(&kvm->arch.tdp_mmu_roots,
+  typeof(*next_root), link);
 
-   while (!list_entry_is_head(next_root, &kvm->arch.tdp_mmu_roots, link) &&
-   

[PATCH v2 04/13] KVM: x86/mmu: Merge TDP MMU put and free root

2021-04-01 Thread Ben Gardon
kvm_tdp_mmu_put_root and kvm_tdp_mmu_free_root are always called
together, so merge the functions to simplify TDP MMU root refcounting /
freeing.

Signed-off-by: Ben Gardon 
---
 arch/x86/kvm/mmu/mmu.c |  4 +--
 arch/x86/kvm/mmu/tdp_mmu.c | 54 ++
 arch/x86/kvm/mmu/tdp_mmu.h | 10 +--
 3 files changed, 28 insertions(+), 40 deletions(-)

diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 9c7ef7ca8bf6..47d996a8074f 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -3153,8 +3153,8 @@ static void mmu_free_root_page(struct kvm *kvm, hpa_t 
*root_hpa,
 
sp = to_shadow_page(*root_hpa & PT64_BASE_ADDR_MASK);
 
-   if (is_tdp_mmu_page(sp) && kvm_tdp_mmu_put_root(kvm, sp))
-   kvm_tdp_mmu_free_root(kvm, sp);
+   if (is_tdp_mmu_page(sp))
+   kvm_tdp_mmu_put_root(kvm, sp);
else if (!--sp->root_count && sp->role.invalid)
kvm_mmu_prepare_zap_page(kvm, sp, invalid_list);
 
diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index 320cc4454737..279a725061f7 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -41,10 +41,31 @@ void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm)
rcu_barrier();
 }
 
-static void tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root)
+static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
+ gfn_t start, gfn_t end, bool can_yield, bool flush);
+
+static void tdp_mmu_free_sp(struct kvm_mmu_page *sp)
 {
-   if (kvm_tdp_mmu_put_root(kvm, root))
-   kvm_tdp_mmu_free_root(kvm, root);
+   free_page((unsigned long)sp->spt);
+   kmem_cache_free(mmu_page_header_cache, sp);
+}
+
+void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root)
+{
+   gfn_t max_gfn = 1ULL << (shadow_phys_bits - PAGE_SHIFT);
+
+   lockdep_assert_held_write(&kvm->mmu_lock);
+
+   if (--root->root_count)
+   return;
+
+   WARN_ON(!root->tdp_mmu_page);
+
+   list_del(&root->link);
+
+   zap_gfn_range(kvm, root, 0, max_gfn, false, false);
+
+   tdp_mmu_free_sp(root);
 }
 
 static inline bool tdp_mmu_next_root_valid(struct kvm *kvm,
@@ -66,7 +87,7 @@ static inline struct kvm_mmu_page *tdp_mmu_next_root(struct 
kvm *kvm,
struct kvm_mmu_page *next_root;
 
next_root = list_next_entry(root, link);
-   tdp_mmu_put_root(kvm, root);
+   kvm_tdp_mmu_put_root(kvm, root);
return next_root;
 }
 
@@ -89,31 +110,6 @@ static inline struct kvm_mmu_page *tdp_mmu_next_root(struct 
kvm *kvm,
if (kvm_mmu_page_as_id(_root) != _as_id) {  \
} else
 
-static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
- gfn_t start, gfn_t end, bool can_yield, bool flush);
-
-static void tdp_mmu_free_sp(struct kvm_mmu_page *sp)
-{
-   free_page((unsigned long)sp->spt);
-   kmem_cache_free(mmu_page_header_cache, sp);
-}
-
-void kvm_tdp_mmu_free_root(struct kvm *kvm, struct kvm_mmu_page *root)
-{
-   gfn_t max_gfn = 1ULL << (shadow_phys_bits - PAGE_SHIFT);
-
-   lockdep_assert_held_write(&kvm->mmu_lock);
-
-   WARN_ON(root->root_count);
-   WARN_ON(!root->tdp_mmu_page);
-
-   list_del(&root->link);
-
-   zap_gfn_range(kvm, root, 0, max_gfn, false, false);
-
-   tdp_mmu_free_sp(root);
-}
-
 static union kvm_mmu_page_role page_role_for_level(struct kvm_vcpu *vcpu,
   int level)
 {
diff --git a/arch/x86/kvm/mmu/tdp_mmu.h b/arch/x86/kvm/mmu/tdp_mmu.h
index c9a081c786a5..d4e32ac5f4c9 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.h
+++ b/arch/x86/kvm/mmu/tdp_mmu.h
@@ -6,7 +6,6 @@
 #include 
 
 hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu);
-void kvm_tdp_mmu_free_root(struct kvm *kvm, struct kvm_mmu_page *root);
 
 static inline void kvm_tdp_mmu_get_root(struct kvm *kvm,
struct kvm_mmu_page *root)
@@ -17,14 +16,7 @@ static inline void kvm_tdp_mmu_get_root(struct kvm *kvm,
++root->root_count;
 }
 
-static inline bool kvm_tdp_mmu_put_root(struct kvm *kvm,
-   struct kvm_mmu_page *root)
-{
-   lockdep_assert_held(&kvm->mmu_lock);
-   --root->root_count;
-
-   return !root->root_count;
-}
+void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root);
 
 bool __kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, int as_id, gfn_t start,
 gfn_t end, bool can_yield, bool flush);
-- 
2.31.0.208.g409f899ff0-goog



[PATCH v2 06/13] KVM: x86/mmu: Make TDP MMU root refcount atomic

2021-04-01 Thread Ben Gardon
In order to parallelize more operations for the TDP MMU, make the
refcount on TDP MMU roots atomic, so that a future patch can allow
multiple threads to take a reference on the root concurrently, while
holding the MMU lock in read mode.

Signed-off-by: Ben Gardon 
---

Changelog
v2:
--  Split failure handling for kvm_tdp_mmu_get_root out into a
separate commit.

 arch/x86/kvm/mmu/mmu_internal.h |  6 +-
 arch/x86/kvm/mmu/tdp_mmu.c  |  4 ++--
 arch/x86/kvm/mmu/tdp_mmu.h  | 10 +++---
 3 files changed, 14 insertions(+), 6 deletions(-)

diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h
index 9347d73996b5..f63d0fdb8567 100644
--- a/arch/x86/kvm/mmu/mmu_internal.h
+++ b/arch/x86/kvm/mmu/mmu_internal.h
@@ -50,7 +50,11 @@ struct kvm_mmu_page {
u64 *spt;
/* hold the gfn of each spte inside spt */
gfn_t *gfns;
-   int root_count;  /* Currently serving as active root */
+   /* Currently serving as active root */
+   union {
+   int root_count;
+   refcount_t tdp_mmu_root_count;
+   };
unsigned int unsync_children;
struct kvm_rmap_head parent_ptes; /* rmap pointers to parent sptes */
DECLARE_BITMAP(unsync_child_bitmap, 512);
diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index 670c5e3ad80e..697ea882a3e4 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -56,7 +56,7 @@ void kvm_tdp_mmu_put_root(struct kvm *kvm, struct 
kvm_mmu_page *root)
 
lockdep_assert_held_write(&kvm->mmu_lock);
 
-   if (--root->root_count)
+   if (!refcount_dec_and_test(&root->tdp_mmu_root_count))
return;
 
WARN_ON(!root->tdp_mmu_page);
@@ -168,7 +168,7 @@ hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu)
}
 
root = alloc_tdp_mmu_page(vcpu, 0, vcpu->arch.mmu->shadow_root_level);
-   root->root_count = 1;
+   refcount_set(&root->tdp_mmu_root_count, 1);
 
list_add(&root->link, &kvm->arch.tdp_mmu_roots);
 
diff --git a/arch/x86/kvm/mmu/tdp_mmu.h b/arch/x86/kvm/mmu/tdp_mmu.h
index d4e32ac5f4c9..1ec7914ecff9 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.h
+++ b/arch/x86/kvm/mmu/tdp_mmu.h
@@ -10,10 +10,14 @@ hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu);
 static inline void kvm_tdp_mmu_get_root(struct kvm *kvm,
struct kvm_mmu_page *root)
 {
-   BUG_ON(!root->root_count);
-   lockdep_assert_held(&kvm->mmu_lock);
+   lockdep_assert_held_write(&kvm->mmu_lock);
 
-   ++root->root_count;
+   /*
+* This should never fail since roots are removed from the roots
+* list under the MMU write lock when their reference count falls
+* to zero.
+*/
+   refcount_inc_not_zero(&root->tdp_mmu_root_count);
 }
 
 void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root);
-- 
2.31.0.208.g409f899ff0-goog



[PATCH v2 05/13] KVM: x86/mmu: Refactor yield safe root iterator

2021-04-01 Thread Ben Gardon
Refactor the yield safe TDP MMU root iterator to be more amenable to
changes in future commits which will allow it to be used under the MMU
lock in read mode. Currently the iterator requires a complicated dance
between the helper functions and different parts of the for loop which
makes it hard to reason about. Moving all the logic into a single function
simplifies the iterator substantially.

Signed-off-by: Ben Gardon 
---
 arch/x86/kvm/mmu/tdp_mmu.c | 45 ++
 1 file changed, 26 insertions(+), 19 deletions(-)

diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index 279a725061f7..670c5e3ad80e 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -68,26 +68,34 @@ void kvm_tdp_mmu_put_root(struct kvm *kvm, struct 
kvm_mmu_page *root)
tdp_mmu_free_sp(root);
 }
 
-static inline bool tdp_mmu_next_root_valid(struct kvm *kvm,
-  struct kvm_mmu_page *root)
+/*
+ * Finds the next valid root after root (or the first valid root if root
+ * is NULL), takes a reference on it, and returns that next root. If root
+ * is not NULL, this thread should have already taken a reference on it, and
+ * that reference will be dropped. If no valid root is found, this
+ * function will return NULL.
+ */
+static struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm,
+ struct kvm_mmu_page *prev_root)
 {
-   lockdep_assert_held_write(&kvm->mmu_lock);
+   struct kvm_mmu_page *next_root;
 
-   if (list_entry_is_head(root, &kvm->arch.tdp_mmu_roots, link))
-   return false;
+   lockdep_assert_held_write(&kvm->mmu_lock);
 
-   kvm_tdp_mmu_get_root(kvm, root);
-   return true;
+   if (prev_root)
+   next_root = list_next_entry(prev_root, link);
+   else
+   next_root = list_first_entry(&kvm->arch.tdp_mmu_roots,
+typeof(*next_root), link);
 
-}
+   if (list_entry_is_head(next_root, &kvm->arch.tdp_mmu_roots, link))
+   next_root = NULL;
+   else
+   kvm_tdp_mmu_get_root(kvm, next_root);
 
-static inline struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm,
-struct kvm_mmu_page *root)
-{
-   struct kvm_mmu_page *next_root;
+   if (prev_root)
+   kvm_tdp_mmu_put_root(kvm, prev_root);
 
-   next_root = list_next_entry(root, link);
-   kvm_tdp_mmu_put_root(kvm, root);
return next_root;
 }
 
@@ -98,11 +106,10 @@ static inline struct kvm_mmu_page 
*tdp_mmu_next_root(struct kvm *kvm,
  * recent root. (Unless keeping a live reference is desirable.)
  */
 #define for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id)  \
-   for (_root = list_first_entry(&_kvm->arch.tdp_mmu_roots,\
- typeof(*_root), link);\
-tdp_mmu_next_root_valid(_kvm, _root);  \
-_root = tdp_mmu_next_root(_kvm, _root))\
-   if (kvm_mmu_page_as_id(_root) != _as_id) {  \
+   for (_root = tdp_mmu_next_root(_kvm, NULL); \
+_root; \
+_root = tdp_mmu_next_root(_kvm, _root))\
+   if (kvm_mmu_page_as_id(_root) != _as_id) {  \
} else
 
 #define for_each_tdp_mmu_root(_kvm, _root, _as_id) \
-- 
2.31.0.208.g409f899ff0-goog



[PATCH v2 03/13] KVM: x86/mmu: use tdp_mmu_free_sp to free roots

2021-04-01 Thread Ben Gardon
Minor cleanup to deduplicate the code used to free a struct kvm_mmu_page
in the TDP MMU.

No functional change intended.

Signed-off-by: Ben Gardon 
---
 arch/x86/kvm/mmu/tdp_mmu.c | 15 +++
 1 file changed, 7 insertions(+), 8 deletions(-)

diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index 6f612ac755a0..320cc4454737 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -92,6 +92,12 @@ static inline struct kvm_mmu_page *tdp_mmu_next_root(struct 
kvm *kvm,
 static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
  gfn_t start, gfn_t end, bool can_yield, bool flush);
 
+static void tdp_mmu_free_sp(struct kvm_mmu_page *sp)
+{
+   free_page((unsigned long)sp->spt);
+   kmem_cache_free(mmu_page_header_cache, sp);
+}
+
 void kvm_tdp_mmu_free_root(struct kvm *kvm, struct kvm_mmu_page *root)
 {
gfn_t max_gfn = 1ULL << (shadow_phys_bits - PAGE_SHIFT);
@@ -105,8 +111,7 @@ void kvm_tdp_mmu_free_root(struct kvm *kvm, struct 
kvm_mmu_page *root)
 
zap_gfn_range(kvm, root, 0, max_gfn, false, false);
 
-   free_page((unsigned long)root->spt);
-   kmem_cache_free(mmu_page_header_cache, root);
+   tdp_mmu_free_sp(root);
 }
 
 static union kvm_mmu_page_role page_role_for_level(struct kvm_vcpu *vcpu,
@@ -168,12 +173,6 @@ hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu)
return __pa(root->spt);
 }
 
-static void tdp_mmu_free_sp(struct kvm_mmu_page *sp)
-{
-   free_page((unsigned long)sp->spt);
-   kmem_cache_free(mmu_page_header_cache, sp);
-}
-
 /*
  * This is called through call_rcu in order to free TDP page table memory
  * safely with respect to other kernel threads that may be operating on
-- 
2.31.0.208.g409f899ff0-goog



[PATCH v2 01/13] KVM: x86/mmu: Re-add const qualifier in kvm_tdp_mmu_zap_collapsible_sptes

2021-04-01 Thread Ben Gardon
kvm_tdp_mmu_zap_collapsible_sptes unnecessarily removes the const
qualifier from its memslot argument, leading to a compiler warning. Add
the const annotation and pass it to subsequent functions.

Signed-off-by: Ben Gardon 
---
 arch/x86/kvm/mmu/mmu.c  | 10 +-
 arch/x86/kvm/mmu/mmu_internal.h |  5 +++--
 arch/x86/kvm/mmu/tdp_mmu.c  |  5 +++--
 arch/x86/kvm/mmu/tdp_mmu.h  |  3 ++-
 include/linux/kvm_host.h|  2 +-
 5 files changed, 14 insertions(+), 11 deletions(-)

diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index efb41f31e80a..617809529987 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -715,8 +715,7 @@ static void kvm_mmu_page_set_gfn(struct kvm_mmu_page *sp, 
int index, gfn_t gfn)
  * handling slots that are not large page aligned.
  */
 static struct kvm_lpage_info *lpage_info_slot(gfn_t gfn,
- struct kvm_memory_slot *slot,
- int level)
+   const struct kvm_memory_slot *slot, int level)
 {
unsigned long idx;
 
@@ -2735,7 +2734,7 @@ static void direct_pte_prefetch(struct kvm_vcpu *vcpu, 
u64 *sptep)
 }
 
 static int host_pfn_mapping_level(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn,
- struct kvm_memory_slot *slot)
+ const struct kvm_memory_slot *slot)
 {
unsigned long hva;
pte_t *pte;
@@ -2761,8 +2760,9 @@ static int host_pfn_mapping_level(struct kvm *kvm, gfn_t 
gfn, kvm_pfn_t pfn,
return level;
 }
 
-int kvm_mmu_max_mapping_level(struct kvm *kvm, struct kvm_memory_slot *slot,
- gfn_t gfn, kvm_pfn_t pfn, int max_level)
+int kvm_mmu_max_mapping_level(struct kvm *kvm,
+ const struct kvm_memory_slot *slot, gfn_t gfn,
+ kvm_pfn_t pfn, int max_level)
 {
struct kvm_lpage_info *linfo;
 
diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h
index cead1d81e663..d44fe8a43a19 100644
--- a/arch/x86/kvm/mmu/mmu_internal.h
+++ b/arch/x86/kvm/mmu/mmu_internal.h
@@ -161,8 +161,9 @@ enum {
 #define SET_SPTE_NEED_REMOTE_TLB_FLUSH BIT(1)
 #define SET_SPTE_SPURIOUS  BIT(2)
 
-int kvm_mmu_max_mapping_level(struct kvm *kvm, struct kvm_memory_slot *slot,
- gfn_t gfn, kvm_pfn_t pfn, int max_level);
+int kvm_mmu_max_mapping_level(struct kvm *kvm,
+ const struct kvm_memory_slot *slot, gfn_t gfn,
+ kvm_pfn_t pfn, int max_level);
 int kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, gfn_t gfn,
int max_level, kvm_pfn_t *pfnp,
bool huge_page_disallowed, int *req_level);
diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index ccf0d774a181..d5210a212c59 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -1255,7 +1255,7 @@ void kvm_tdp_mmu_clear_dirty_pt_masked(struct kvm *kvm,
  */
 static bool zap_collapsible_spte_range(struct kvm *kvm,
   struct kvm_mmu_page *root,
-  struct kvm_memory_slot *slot,
+  const struct kvm_memory_slot *slot,
   bool flush)
 {
gfn_t start = slot->base_gfn;
@@ -1296,7 +1296,8 @@ static bool zap_collapsible_spte_range(struct kvm *kvm,
  * be replaced by large mappings, for GFNs within the slot.
  */
 bool kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm,
-  struct kvm_memory_slot *slot, bool flush)
+  const struct kvm_memory_slot *slot,
+  bool flush)
 {
struct kvm_mmu_page *root;
 
diff --git a/arch/x86/kvm/mmu/tdp_mmu.h b/arch/x86/kvm/mmu/tdp_mmu.h
index bf3ce169122e..d7007480b3d2 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.h
+++ b/arch/x86/kvm/mmu/tdp_mmu.h
@@ -57,7 +57,8 @@ void kvm_tdp_mmu_clear_dirty_pt_masked(struct kvm *kvm,
   gfn_t gfn, unsigned long mask,
   bool wrprot);
 bool kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm,
-  struct kvm_memory_slot *slot, bool 
flush);
+  const struct kvm_memory_slot *slot,
+  bool flush);
 
 bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm,
   struct kvm_memory_slot *slot, gfn_t gfn);
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index e6d77353025c..5e0d17b1ac2b 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -1124,7 +1124,7 @@ __gfn_to_memslot(struct kvm_memslots *slots, gfn_t gfn)
 }
 
 static inline unsigned long
-__gfn_to_hva_memslot(struct kvm_memory_slot *slot, gfn_t 

[PATCH v2 02/13] KVM: x86/mmu: Move kvm_mmu_(get|put)_root to TDP MMU

2021-04-01 Thread Ben Gardon
The TDP MMU is almost the only user of kvm_mmu_get_root and
kvm_mmu_put_root. There is only one use of put_root in mmu.c for the
legacy / shadow MMU. Open code that one use and move the get / put
functions to the TDP MMU so they can be extended in future commits.

No functional change intended.

Signed-off-by: Ben Gardon 
---
 arch/x86/kvm/mmu/mmu.c  | 10 --
 arch/x86/kvm/mmu/mmu_internal.h | 16 
 arch/x86/kvm/mmu/tdp_mmu.c  |  6 +++---
 arch/x86/kvm/mmu/tdp_mmu.h  | 18 ++
 4 files changed, 25 insertions(+), 25 deletions(-)

diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 617809529987..9c7ef7ca8bf6 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -3153,12 +3153,10 @@ static void mmu_free_root_page(struct kvm *kvm, hpa_t 
*root_hpa,
 
sp = to_shadow_page(*root_hpa & PT64_BASE_ADDR_MASK);
 
-   if (kvm_mmu_put_root(kvm, sp)) {
-   if (is_tdp_mmu_page(sp))
-   kvm_tdp_mmu_free_root(kvm, sp);
-   else if (sp->role.invalid)
-   kvm_mmu_prepare_zap_page(kvm, sp, invalid_list);
-   }
+   if (is_tdp_mmu_page(sp) && kvm_tdp_mmu_put_root(kvm, sp))
+   kvm_tdp_mmu_free_root(kvm, sp);
+   else if (!--sp->root_count && sp->role.invalid)
+   kvm_mmu_prepare_zap_page(kvm, sp, invalid_list);
 
*root_hpa = INVALID_PAGE;
 }
diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h
index d44fe8a43a19..9347d73996b5 100644
--- a/arch/x86/kvm/mmu/mmu_internal.h
+++ b/arch/x86/kvm/mmu/mmu_internal.h
@@ -113,22 +113,6 @@ bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm,
 void kvm_flush_remote_tlbs_with_address(struct kvm *kvm,
u64 start_gfn, u64 pages);
 
-static inline void kvm_mmu_get_root(struct kvm *kvm, struct kvm_mmu_page *sp)
-{
-   BUG_ON(!sp->root_count);
-   lockdep_assert_held(&kvm->mmu_lock);
-
-   ++sp->root_count;
-}
-
-static inline bool kvm_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *sp)
-{
-   lockdep_assert_held(&kvm->mmu_lock);
-   --sp->root_count;
-
-   return !sp->root_count;
-}
-
 static inline int kvm_mmu_role_as_id(union kvm_mmu_page_role role)
 {
return role.smm ? 1 : 0;
diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index d5210a212c59..6f612ac755a0 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -43,7 +43,7 @@ void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm)
 
 static void tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root)
 {
-   if (kvm_mmu_put_root(kvm, root))
+   if (kvm_tdp_mmu_put_root(kvm, root))
kvm_tdp_mmu_free_root(kvm, root);
 }
 
@@ -55,7 +55,7 @@ static inline bool tdp_mmu_next_root_valid(struct kvm *kvm,
if (list_entry_is_head(root, &kvm->arch.tdp_mmu_roots, link))
return false;
 
-   kvm_mmu_get_root(kvm, root);
+   kvm_tdp_mmu_get_root(kvm, root);
return true;
 
 }
@@ -154,7 +154,7 @@ hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu)
/* Check for an existing root before allocating a new one. */
for_each_tdp_mmu_root(kvm, root, kvm_mmu_role_as_id(role)) {
if (root->role.word == role.word) {
-   kvm_mmu_get_root(kvm, root);
+   kvm_tdp_mmu_get_root(kvm, root);
goto out;
}
}
diff --git a/arch/x86/kvm/mmu/tdp_mmu.h b/arch/x86/kvm/mmu/tdp_mmu.h
index d7007480b3d2..c9a081c786a5 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.h
+++ b/arch/x86/kvm/mmu/tdp_mmu.h
@@ -8,6 +8,24 @@
 hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu);
 void kvm_tdp_mmu_free_root(struct kvm *kvm, struct kvm_mmu_page *root);
 
+static inline void kvm_tdp_mmu_get_root(struct kvm *kvm,
+   struct kvm_mmu_page *root)
+{
+   BUG_ON(!root->root_count);
+   lockdep_assert_held(&kvm->mmu_lock);
+
+   ++root->root_count;
+}
+
+static inline bool kvm_tdp_mmu_put_root(struct kvm *kvm,
+   struct kvm_mmu_page *root)
+{
+   lockdep_assert_held(&kvm->mmu_lock);
+   --root->root_count;
+
+   return !root->root_count;
+}
+
 bool __kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, int as_id, gfn_t start,
 gfn_t end, bool can_yield, bool flush);
 static inline bool kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, int as_id,
-- 
2.31.0.208.g409f899ff0-goog



[PATCH v2 00/13] More parallel operations for the TDP MMU

2021-04-01 Thread Ben Gardon
Now that the TDP MMU is able to handle page faults in parallel, it's a
relatively small change to expand to other operations. This series allows
zapping a range of GFNs, reclaiming collapsible SPTEs (when disabling
dirty logging), and enabling dirty logging to all happen under the MMU
lock in read mode.

This is partly a cleanup + rewrite of the last few patches of the parallel
page faults series. I've incorporated feedback from Sean and Paolo, but
the patches have changed so much that I'm sending this as a separate
series.

Ran kvm-unit-tests + selftests on an SMP kernel + Intel Skylake, with the
TDP MMU enabled and disabled. This series introduces no new failures or
warnings.

I know this will conflict horribly with the patches from Sean's series
which were just queued, and I'll send a v2 to fix those conflicts +
address any feedback on this v1.

Changelog
v2:
--  Rebased patches on top of kvm/queue to incorporate Sean's recent
TLB flushing changes
--  Dropped patch 5: "KVM: x86/mmu: comment for_each_tdp_mmu_root
requires MMU write lock" as the following patch to protect the roots
list with RCU adds lockdep which makes the comment somewhat redundant.

Ben Gardon (13):
  KVM: x86/mmu: Re-add const qualifier in
kvm_tdp_mmu_zap_collapsible_sptes
  KVM: x86/mmu: Move kvm_mmu_(get|put)_root to TDP MMU
  KVM: x86/mmu: use tdp_mmu_free_sp to free roots
  KVM: x86/mmu: Merge TDP MMU put and free root
  KVM: x86/mmu: Refactor yield safe root iterator
  KVM: x86/mmu: Make TDP MMU root refcount atomic
  KVM: x86/mmu: handle cmpxchg failure in kvm_tdp_mmu_get_root
  KVM: x86/mmu: Protect the tdp_mmu_roots list with RCU
  KVM: x86/mmu: Allow zap gfn range to operate under the mmu read lock
  KVM: x86/mmu: Allow zapping collapsible SPTEs to use MMU read lock
  KVM: x86/mmu: Allow enabling / disabling dirty logging under MMU read
lock
  KVM: x86/mmu: Fast invalidation for TDP MMU
  KVM: x86/mmu: Tear down roots in fast invalidation thread

 arch/x86/include/asm/kvm_host.h |  21 +-
 arch/x86/kvm/mmu/mmu.c  | 115 +++---
 arch/x86/kvm/mmu/mmu_internal.h |  27 +--
 arch/x86/kvm/mmu/tdp_mmu.c  | 375 +++-
 arch/x86/kvm/mmu/tdp_mmu.h  |  28 ++-
 include/linux/kvm_host.h|   2 +-
 6 files changed, 407 insertions(+), 161 deletions(-)

-- 
2.31.0.208.g409f899ff0-goog



Re: [PATCH 12/13] KVM: x86/mmu: Fast invalidation for TDP MMU

2021-04-01 Thread Ben Gardon
On Thu, Apr 1, 2021 at 3:36 AM Paolo Bonzini  wrote:
>
> On 31/03/21 23:08, Ben Gardon wrote:
> >
> > + if (is_tdp_mmu_enabled(kvm))
> > + kvm_tdp_mmu_invalidate_roots(kvm);
> > +
> >   /*
> >* Toggle mmu_valid_gen between '0' and '1'.  Because slots_lock is
> >* held for the entire duration of zapping obsolete pages, it's
> > @@ -5451,9 +5454,6 @@ static void kvm_mmu_zap_all_fast(struct kvm *kvm)
> >
> >   kvm_zap_obsolete_pages(kvm);
> >
> > - if (is_tdp_mmu_enabled(kvm))
> > - kvm_tdp_mmu_zap_all(kvm);
> > -
>
> This is just cosmetic, but I'd prefer to keep the call to
> kvm_tdp_mmu_invalidate_roots at the original place, so that it's clear
> in the next patch that it's two separate parts because of the different
> locking requirements.

I'm not sure exactly what you mean, and I could certainly do a better
job explaining it in the commit description, but it's actually quite
important that kvm_tdp_mmu_invalidate_roots precede
kvm_zap_obsolete_pages, because kvm_zap_obsolete_pages drops the lock
and yields. If kvm_tdp_mmu_invalidate_roots doesn't go first, vCPUs
could wind up dropping their reference on an old root and then picking
it up again before the last reference to that root is dropped.

Explaining in the description that kvm_tdp_mmu_zap_all is being
dropped because it is no longer necessary (as opposed to being moved)
might help make that cleaner.

Alternatively I could just leave kvm_tdp_mmu_zap_all and replace it in
the next patch.
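
For reference, a minimal sketch (not the posted patch) of the ordering
constraint described above, using the function names from this series:

/*
 * Illustration only: TDP MMU roots must be marked invalid before
 * kvm_zap_obsolete_pages() runs, because the latter may drop the MMU
 * lock and yield, giving vCPUs a window in which to drop and then
 * re-take a reference on a root that should be going away.
 */
static void zap_all_fast_ordering_sketch(struct kvm *kvm)
{
	write_lock(&kvm->mmu_lock);

	/* 1. Mark every existing TDP MMU root invalid first... */
	if (is_tdp_mmu_enabled(kvm))
		kvm_tdp_mmu_invalidate_roots(kvm);

	/*
	 * 2. ...then zap obsolete shadow pages. This step may yield the
	 * MMU lock; if the roots were not already invalid, a vCPU could
	 * drop its reference to an old root here and immediately pick
	 * that same root back up, keeping it alive.
	 */
	kvm_zap_obsolete_pages(kvm);

	write_unlock(&kvm->mmu_lock);
}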

>
> Paolo
>


Re: [PATCH v2 20/28] KVM: x86/mmu: Use atomic ops to set SPTEs in TDP MMU map

2021-04-01 Thread Ben Gardon
On Thu, Apr 1, 2021 at 3:32 AM Paolo Bonzini  wrote:
>
> On 02/02/21 19:57, Ben Gardon wrote:
> > @@ -720,7 +790,8 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, gpa_t gpa, 
> > u32 error_code,
> >*/
> >   if (is_shadow_present_pte(iter.old_spte) &&
> >   is_large_pte(iter.old_spte)) {
> > - tdp_mmu_set_spte(vcpu->kvm, &iter, 0);
> > + if (!tdp_mmu_set_spte_atomic(vcpu->kvm, &iter, 0))
> > + break;
> >
> >   kvm_flush_remote_tlbs_with_address(vcpu->kvm, 
> > iter.gfn,
> >   KVM_PAGES_PER_HPAGE(iter.level));
> >
> >   /*
> >* The iter must explicitly re-read the spte here
> >* because the new value informs the !present
> >  * path below.
> >  */
> > iter.old_spte = 
> > READ_ONCE(*rcu_dereference(iter.sptep));
> > }
> >
> > if (!is_shadow_present_pte(iter.old_spte)) {
>
> Would it be easier to reason about this code by making it retry, like:
>
> retry:
>  if (is_shadow_present_pte(iter.old_spte)) {
> if (is_large_pte(iter.old_spte)) {
> if (!tdp_mmu_zap_spte_atomic(vcpu->kvm, 
> &iter))
> break;
>
> /*
>  * The iter must explicitly re-read the SPTE 
> because
>  * the atomic cmpxchg failed.
>  */
> iter.old_spte = 
> READ_ONCE(*rcu_dereference(iter.sptep));
> goto retry;
> }
>  } else {
> ...
> }
>
> ?

To be honest, that feels less readable to me. For me retry implies
that we failed to make progress and need to repeat an operation, but
the reality is that we did make progress and there are just multiple
steps to replace the large SPTE with a child PT.
Another option which could improve readability and performance would
be to use the retry to repeat failed cmpxchgs instead of breaking out
of the loop. Then we could avoid retrying the page fault each time a
cmpxchg failed, which may happen a lot as vCPUs allocate intermediate
page tables on boot. (Probably less common for leaf entries, but
possibly useful there too.)
Yet another option would be to remove this two-part process by
eagerly splitting large page mappings in a single step. This would
substantially reduce the number of page faults incurred for NX
splitting / dirty logging splitting. It's been on our list of features
to send upstream for a while and I hope we'll be able to get it into
shape and send it out reasonably soon.
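
As a rough illustration of the first alternative (retrying the failed
cmpxchg in place rather than bailing out of the walk), something along
these lines could work. This is a hypothetical helper, not code from
the series; it reuses tdp_mmu_set_spte_atomic() and the iterator fields
from the existing TDP MMU code:

/*
 * Hypothetical sketch: keep retrying the atomic SPTE update instead of
 * returning RET_PF_RETRY and replaying the whole page fault. If another
 * vCPU wins the race and installs a present SPTE, give up and let the
 * caller continue down the paging structure that vCPU installed.
 */
static bool tdp_mmu_set_spte_atomic_or_reuse(struct kvm *kvm,
					     struct tdp_iter *iter,
					     u64 new_spte)
{
	while (!tdp_mmu_set_spte_atomic(kvm, iter, new_spte)) {
		/* Refresh the snapshot the failed cmpxchg compared against. */
		iter->old_spte = READ_ONCE(*rcu_dereference(iter->sptep));

		/* Someone else installed a mapping; use theirs instead. */
		if (is_shadow_present_pte(iter->old_spte))
			return false;
	}

	return true;
}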

>
> Paolo
>


Re: [PATCH 07/13] KVM: x86/mmu: Make TDP MMU root refcount atomic

2021-04-01 Thread Ben Gardon
On Wed, Mar 31, 2021 at 3:22 PM Sean Christopherson  wrote:
>
> On Wed, Mar 31, 2021, Ben Gardon wrote:
> > In order to parallelize more operations for the TDP MMU, make the
> > refcount on TDP MMU roots atomic, so that a future patch can allow
> > multiple threads to take a reference on the root concurrently, while
> > holding the MMU lock in read mode.
> >
> > Signed-off-by: Ben Gardon 
> > ---
>
> ...
>
> > @@ -88,10 +88,12 @@ static struct kvm_mmu_page *tdp_mmu_next_root(struct 
> > kvm *kvm,
> >   next_root = list_first_entry(&kvm->arch.tdp_mmu_roots,
> >typeof(*next_root), link);
> >
> > + while (!list_entry_is_head(next_root, &kvm->arch.tdp_mmu_roots, link) 
> > &&
> > +!kvm_tdp_mmu_get_root(kvm, next_root))
> > + next_root = list_next_entry(next_root, link);
> > +
> >   if (list_entry_is_head(next_root, &kvm->arch.tdp_mmu_roots, link))
> >   next_root = NULL;
> > - else
> > - kvm_tdp_mmu_get_root(kvm, next_root);
> >
> >   if (prev_root)
> >   kvm_tdp_mmu_put_root(kvm, prev_root);
> > @@ -158,14 +160,13 @@ hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu 
> > *vcpu)
> >
> >   /* Check for an existing root before allocating a new one. */
> >   for_each_tdp_mmu_root(kvm, root) {
> > - if (root->role.word == role.word) {
> > - kvm_tdp_mmu_get_root(kvm, root);
> > + if (root->role.word == role.word &&
> > + kvm_tdp_mmu_get_root(kvm, root))
>
> I'm not opposed to changing this logic while making the refcount atomic, but 
> it
> needs to be explained in the changelog.  As is, the changelog makes it sound
> like the patch is a pure refactoring of the type.

Thanks for pointing that out. I'll add a note in the description in
v2. Those felt like natural changes since the introduction of the
atomic requires additional failure handling. I don't think there's any
way to add it as a separate commit without just introducing dead code,
but that would certainly be preferable.


Re: [PATCH 08/13] KVM: x86/mmu: Protect the tdp_mmu_roots list with RCU

2021-04-01 Thread Ben Gardon
On Thu, Apr 1, 2021 at 2:37 AM Paolo Bonzini  wrote:
>
> On 31/03/21 23:08, Ben Gardon wrote:
> > Protect the contents of the TDP MMU roots list with RCU in preparation
> > for a future patch which will allow the iterator macro to be used under
> > the MMU lock in read mode.
> >
> > Signed-off-by: Ben Gardon
> > ---
> >   arch/x86/kvm/mmu/tdp_mmu.c | 64 +-
> >   1 file changed, 36 insertions(+), 28 deletions(-)
> >
> > diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
> > + spin_lock(&kvm->arch.tdp_mmu_pages_lock);
> > + list_del_rcu(&root->link);
> > + spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
>
>
> Please update the comment above tdp_mmu_pages_lock in
> arch/x86/include/asm/kvm_host.h as well.

Ah yes, thank you for catching that. Will do.

>
> >  /* Only safe under the MMU lock in write mode, without yielding. */
> >  #define for_each_tdp_mmu_root(_kvm, _root)   \
> > - list_for_each_entry(_root, &_kvm->arch.tdp_mmu_roots, link)
> > + list_for_each_entry_rcu(_root, &_kvm->arch.tdp_mmu_roots, link, \
> > + lockdep_is_held_write(&kvm->mmu_lock))
>
> This should also add "... ||
> lockdep_is_held(&kvm->arch.tdp_mmu_pages_lock)", if only for
> documentation purposes.

Good idea. I hope we never have a function try to protect its loop
over the roots with that lock, but it would be correct.
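
Concretely, the macro from the patch would then carry both conditions
in its lockdep expression. A sketch of what Paolo is suggesting, for
documentation purposes only (not a tested change, and subject to the
lockdep_is_held_write() availability issue the test robot reported):

#define for_each_tdp_mmu_root(_kvm, _root)				\
	list_for_each_entry_rcu(_root, &_kvm->arch.tdp_mmu_roots, link, \
			lockdep_is_held_write(&_kvm->mmu_lock) ||	\
			lockdep_is_held(&_kvm->arch.tdp_mmu_pages_lock))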

>
> Paolo
>


Re: [PATCH 12/13] KVM: x86/mmu: Fast invalidation for TDP MMU

2021-04-01 Thread Ben Gardon
On Wed, Mar 31, 2021 at 3:27 PM Sean Christopherson  wrote:
>
> On Wed, Mar 31, 2021, Ben Gardon wrote:
> > Provide a real mechanism for fast invalidation by marking roots as
> > invalid so that their reference count will quickly fall to zero
> > and they will be torn down.
> >
> > One negative side effect of this approach is that a vCPU thread will
> > likely drop the last reference to a root and be saddled with the work of
> > tearing down an entire paging structure. This issue will be resolved in
> > a later commit.
> >
> > Signed-off-by: Ben Gardon 
> > ---
>
> ...
>
> > +/*
> > + * This function depends on running in the same MMU lock critical section as
> > + * kvm_reload_remote_mmus. Since this is in the same critical section, no 
> > new
> > + * roots will be created between this function and the MMU reload signals
> > + * being sent.
>
> Eww.  That goes beyond just adding a lockdep assertion here.  I know you want 
> to
> isolate the TDP MMU as much as possible, but this really feels like it should 
> be
> open coded in kvm_mmu_zap_all_fast().  And assuming this lands after as_id is
> added to for_each_tdp_mmu_root(), it's probably easier to open code anyways, 
> e.g.
> use list_for_each_entry() directly instead of bouncing through an iterator.

Yeah, that's fair. I'll open-code it here. I agree that it will remove
confusion from the function, though it would be nice to be able to use
for_each_tdp_mmu_root for the lockdep and rcu annotations.


>
> > + */
> > +void kvm_tdp_mmu_invalidate_roots(struct kvm *kvm)
> > +{
> > + struct kvm_mmu_page *root;
> > +
> > + for_each_tdp_mmu_root(kvm, root)
> > + root->role.invalid = true;
> > +}


Re: [PATCH 08/13] KVM: x86/mmu: Protect the tdp_mmu_roots list with RCU

2021-04-01 Thread Ben Gardon
On Thu, Apr 1, 2021 at 6:17 AM kernel test robot  wrote:
>
> Hi Ben,
>
> Thank you for the patch! Yet something to improve:
>
> [auto build test ERROR on next-20210331]
> [cannot apply to kvm/queue tip/master linux/master linus/master v5.12-rc5 
> v5.12-rc4 v5.12-rc3 v5.12-rc5]
> [If your patch is applied to the wrong git tree, kindly drop us a note.
> And when submitting patch, we suggest to use '--base' as documented in
> https://git-scm.com/docs/git-format-patch]
>
> url:    
> https://github.com/0day-ci/linux/commits/Ben-Gardon/More-parallel-operations-for-the-TDP-MMU/20210401-051137
> base:7a43c78d0573e00456b033e2b9a895b89464
> config: x86_64-allyesconfig (attached as .config)
> compiler: gcc-9 (Debian 9.3.0-22) 9.3.0
> reproduce (this is a W=1 build):
> # 
> https://github.com/0day-ci/linux/commit/2b2c6d3bdc35269df5f9293a02da5b71c74095f3
> git remote add linux-review https://github.com/0day-ci/linux
> git fetch --no-tags linux-review 
> Ben-Gardon/More-parallel-operations-for-the-TDP-MMU/20210401-051137
> git checkout 2b2c6d3bdc35269df5f9293a02da5b71c74095f3
> # save the attached .config to linux build tree
> make W=1 ARCH=x86_64
>
> If you fix the issue, kindly add following tag as appropriate
> Reported-by: kernel test robot 
>
> All errors (new ones prefixed by >>):
>
>In file included from include/linux/rculist.h:11,
> from include/linux/pid.h:5,
> from include/linux/sched.h:14,
> from include/linux/hardirq.h:9,
> from include/linux/kvm_host.h:7,
> from arch/x86/kvm/mmu.h:5,
> from arch/x86/kvm/mmu/tdp_mmu.c:3:
>arch/x86/kvm/mmu/tdp_mmu.c: In function 'kvm_tdp_mmu_get_vcpu_root_hpa':
> >> arch/x86/kvm/mmu/tdp_mmu.c:139:5: error: implicit declaration of function 
> >> 'lockdep_is_held_write'; did you mean 'lockdep_is_held_type'? 
> >> [-Werror=implicit-function-declaration]
>  139 | lockdep_is_held_write(&kvm->mmu_lock))

Huh, I wonder why this isn't exported in some configuration. I'll fix
this in v2 as well.

>  | ^
>include/linux/rcupdate.h:318:52: note: in definition of macro 
> 'RCU_LOCKDEP_WARN'
>  318 |   if (debug_lockdep_rcu_enabled() && !__warned && (c)) { \
>  |^
>include/linux/rculist.h:391:7: note: in expansion of macro 
> '__list_check_rcu'
>  391 |  for (__list_check_rcu(dummy, ## cond, 0),   \
>  |   ^~~~
>arch/x86/kvm/mmu/tdp_mmu.c:138:2: note: in expansion of macro 
> 'list_for_each_entry_rcu'
>  138 |  list_for_each_entry_rcu(_root, &_kvm->arch.tdp_mmu_roots, link, \
>  |  ^~~
>arch/x86/kvm/mmu/tdp_mmu.c:184:2: note: in expansion of macro 
> 'for_each_tdp_mmu_root'
>  184 |  for_each_tdp_mmu_root(kvm, root) {
>  |  ^
>cc1: some warnings being treated as errors
>
>
> vim +139 arch/x86/kvm/mmu/tdp_mmu.c
>
>  2
>> 3  #include "mmu.h"
>  4  #include "mmu_internal.h"
>  5  #include "mmutrace.h"
>  6  #include "tdp_iter.h"
>  7  #include "tdp_mmu.h"
>  8  #include "spte.h"
>  9
> 10  #include 
> 11  #include 
> 12
> 13  static bool __read_mostly tdp_mmu_enabled = false;
> 14  module_param_named(tdp_mmu, tdp_mmu_enabled, bool, 0644);
> 15
> 16  /* Initializes the TDP MMU for the VM, if enabled. */
> 17  void kvm_mmu_init_tdp_mmu(struct kvm *kvm)
> 18  {
> 19  if (!tdp_enabled || !READ_ONCE(tdp_mmu_enabled))
> 20  return;
> 21
> 22  /* This should not be changed for the lifetime of the VM. */
> 23  kvm->arch.tdp_mmu_enabled = true;
> 24
> 25  INIT_LIST_HEAD(&kvm->arch.tdp_mmu_roots);
> 26  spin_lock_init(&kvm->arch.tdp_mmu_pages_lock);
> 27  INIT_LIST_HEAD(&kvm->arch.tdp_mmu_pages);
> 28  }
> 29
> 30  void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm)
> 31  {
> 32  if (!kvm->arch.tdp_mmu_enabled)
> 33  return;
> 34
> 35  WARN_ON(!list_empty(&kvm->arch.tdp_mmu_roots));
> 36
> 37  /*
> 38   * Ensure that all the outstanding RCU callbacks to free 
> shadow pages
> 39   * can run before t

Re: [PATCH 09/13] KVM: x86/mmu: Allow zap gfn range to operate under the mmu read lock

2021-04-01 Thread Ben Gardon
On Thu, Apr 1, 2021 at 2:58 AM Paolo Bonzini  wrote:
>
> On 31/03/21 23:08, Ben Gardon wrote:
> > To reduce lock contention and interference with page fault handlers,
> > allow the TDP MMU function to zap a GFN range to operate under the MMU
> > read lock.
> >
> > Signed-off-by: Ben Gardon 
> > ---
> >   arch/x86/kvm/mmu/mmu.c |  15 --
> >   arch/x86/kvm/mmu/tdp_mmu.c | 102 ++---
> >   arch/x86/kvm/mmu/tdp_mmu.h |   6 ++-
> >   3 files changed, 87 insertions(+), 36 deletions(-)
> >
> > diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
> > index 667d64daa82c..dcbfc784cf2f 100644
> > --- a/arch/x86/kvm/mmu/mmu.c
> > +++ b/arch/x86/kvm/mmu/mmu.c
> > @@ -3155,7 +3155,7 @@ static void mmu_free_root_page(struct kvm *kvm, hpa_t 
> > *root_hpa,
> >   sp = to_shadow_page(*root_hpa & PT64_BASE_ADDR_MASK);
> >
> >   if (is_tdp_mmu_page(sp))
> > - kvm_tdp_mmu_put_root(kvm, sp);
> > + kvm_tdp_mmu_put_root(kvm, sp, false);
> >   else if (!--sp->root_count && sp->role.invalid)
> >   kvm_mmu_prepare_zap_page(kvm, sp, invalid_list);
> >
> > @@ -5514,13 +5514,17 @@ void kvm_zap_gfn_range(struct kvm *kvm, gfn_t 
> > gfn_start, gfn_t gfn_end)
> >   }
> >   }
> >
> > + write_unlock(&kvm->mmu_lock);
> > +
> >   if (is_tdp_mmu_enabled(kvm)) {
> > - flush = kvm_tdp_mmu_zap_gfn_range(kvm, gfn_start, gfn_end);
> > + read_lock(&kvm->mmu_lock);
> > + flush = kvm_tdp_mmu_zap_gfn_range(kvm, gfn_start, gfn_end,
> > +   true);
> >   if (flush)
> >   kvm_flush_remote_tlbs(kvm);
> > - }
> >
> > - write_unlock(&kvm->mmu_lock);
> > + read_unlock(&kvm->mmu_lock);
> > + }
> >   }
>
> This will conflict with Sean's MMU notifier series patches:
>
> KVM: x86/mmu: Pass address space ID to __kvm_tdp_mmu_zap_gfn_range()
>
> What I can do for now is change the mmu.c part of that patch to
>
> diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
> index e6e02360ef67..9882bbd9b742 100644
> --- a/arch/x86/kvm/mmu/mmu.c
> +++ b/arch/x86/kvm/mmu/mmu.c
> @@ -5510,15 +5510,15 @@ void kvm_zap_gfn_range(struct kvm *kvm, gfn_t 
> gfn_start, gfn_t gfn_end)
> }
> }
>
> -   if (flush)
> -   kvm_flush_remote_tlbs_with_address(kvm, gfn_start, gfn_end);
> -
> if (is_tdp_mmu_enabled(kvm)) {
> -   flush = kvm_tdp_mmu_zap_gfn_range(kvm, gfn_start, gfn_end);
> -   if (flush)
> -   kvm_flush_remote_tlbs(kvm);
> +   for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++)
> +   flush = kvm_tdp_mmu_zap_gfn_range(kvm, i, gfn_start,
> + gfn_end, flush);
> }
>
> +   if (flush)
> +   kvm_flush_remote_tlbs_with_address(kvm, gfn_start, gfn_end);
> +
> write_unlock(&kvm->mmu_lock);
>   }
>
>
> but you will have to add a separate "if (flush)" when moving the write_unlock
> earlier, since there's no downgrade function for rwlocks.  In practice it's
> not a huge deal since unless running nested there will be only one active MMU.
>
> Paolo

Thank you for doing that. I also figured that the extra flushes when
running nested would probably be worth it to get the parallelism
gains. I don't mind working out those conflicts in v2.
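
To make the rwlock point concrete: since there is no write-to-read
downgrade, the flush accumulated while holding the lock for write has
to be issued before the write_unlock(), and the TDP MMU pass then does
its own flush under the read lock. A rough sketch, assuming the
pre-rebase signature of kvm_tdp_mmu_zap_gfn_range() used in this series
and a hypothetical kvm_zap_shadow_mmu_range() standing in for the rmap
walk:

static void zap_gfn_range_sketch(struct kvm *kvm, gfn_t gfn_start,
				 gfn_t gfn_end)
{
	bool flush;

	write_lock(&kvm->mmu_lock);
	/* Hypothetical helper for the legacy/shadow MMU portion. */
	flush = kvm_zap_shadow_mmu_range(kvm, gfn_start, gfn_end);
	/* Flush what the write-locked section owes before dropping it. */
	if (flush)
		kvm_flush_remote_tlbs_with_address(kvm, gfn_start, gfn_end);
	write_unlock(&kvm->mmu_lock);

	if (is_tdp_mmu_enabled(kvm)) {
		read_lock(&kvm->mmu_lock);
		flush = kvm_tdp_mmu_zap_gfn_range(kvm, gfn_start, gfn_end,
						  true);
		if (flush)
			kvm_flush_remote_tlbs(kvm);
		read_unlock(&kvm->mmu_lock);
	}
}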


>
> >   static bool slot_rmap_write_protect(struct kvm *kvm,
> > @@ -5959,7 +5963,8 @@ static void kvm_recover_nx_lpages(struct kvm *kvm)
> >   WARN_ON_ONCE(!sp->lpage_disallowed);
> >   if (is_tdp_mmu_page(sp)) {
> >   kvm_tdp_mmu_zap_gfn_range(kvm, sp->gfn,
> > - sp->gfn + 
> > KVM_PAGES_PER_HPAGE(sp->role.level));
> > + sp->gfn + KVM_PAGES_PER_HPAGE(sp->role.level),
> > + false);
> >   } else {
> >   kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list);
> >   WARN_ON_ONCE(sp->lpage_disallowed);
> > diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
> > index d255125059c4..0e99e4675dd4 100644
> > --- a/arch/x86/kvm/mmu/tdp_mmu.c
> > +++ b/arch/x86/kvm/mmu/tdp_mmu.c
> > @@ -27,6 +27,15 @@ voi

[PATCH 12/13] KVM: x86/mmu: Fast invalidation for TDP MMU

2021-03-31 Thread Ben Gardon
Provide a real mechanism for fast invalidation by marking roots as
invalid so that their reference count will quickly fall to zero
and they will be torn down.

One negative side effect of this approach is that a vCPU thread will
likely drop the last reference to a root and be saddled with the work of
tearing down an entire paging structure. This issue will be resolved in
a later commit.

Signed-off-by: Ben Gardon 
---
 arch/x86/kvm/mmu/mmu.c |  6 +++---
 arch/x86/kvm/mmu/tdp_mmu.c | 14 ++
 arch/x86/kvm/mmu/tdp_mmu.h |  5 +
 3 files changed, 22 insertions(+), 3 deletions(-)

diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index bf535c9f7ff2..49b7097fb55b 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -5430,6 +5430,9 @@ static void kvm_mmu_zap_all_fast(struct kvm *kvm)
write_lock(&kvm->mmu_lock);
trace_kvm_mmu_zap_all_fast(kvm);
 
+   if (is_tdp_mmu_enabled(kvm))
+   kvm_tdp_mmu_invalidate_roots(kvm);
+
/*
 * Toggle mmu_valid_gen between '0' and '1'.  Because slots_lock is
 * held for the entire duration of zapping obsolete pages, it's
@@ -5451,9 +5454,6 @@ static void kvm_mmu_zap_all_fast(struct kvm *kvm)
 
kvm_zap_obsolete_pages(kvm);
 
-   if (is_tdp_mmu_enabled(kvm))
-   kvm_tdp_mmu_zap_all(kvm);
-
write_unlock(&kvm->mmu_lock);
 }
 
diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index 0c90dc034819..428ff6778426 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -789,6 +789,20 @@ void kvm_tdp_mmu_zap_all(struct kvm *kvm)
kvm_flush_remote_tlbs(kvm);
 }
 
+/*
+ * This function depends on running in the same MMU lock critical section as
+ * kvm_reload_remote_mmus. Since this is in the same critical section, no new
+ * roots will be created between this function and the MMU reload signals
+ * being sent.
+ */
+void kvm_tdp_mmu_invalidate_roots(struct kvm *kvm)
+{
+   struct kvm_mmu_page *root;
+
+   for_each_tdp_mmu_root(kvm, root)
+   root->role.invalid = true;
+}
+
 /*
  * Installs a last-level SPTE to handle a TDP page fault.
  * (NPT/EPT violation/misconfiguration)
diff --git a/arch/x86/kvm/mmu/tdp_mmu.h b/arch/x86/kvm/mmu/tdp_mmu.h
index 855e58856815..ff4978817fb8 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.h
+++ b/arch/x86/kvm/mmu/tdp_mmu.h
@@ -10,6 +10,9 @@ hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu);
 __must_check static inline bool kvm_tdp_mmu_get_root(struct kvm *kvm,
 struct kvm_mmu_page *root)
 {
+   if (root->role.invalid)
+   return false;
+
return refcount_inc_not_zero(&root->tdp_mmu_root_count);
 }
 
@@ -20,6 +23,8 @@ bool kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, gfn_t start, 
gfn_t end,
   bool shared);
 void kvm_tdp_mmu_zap_all(struct kvm *kvm);
 
+void kvm_tdp_mmu_invalidate_roots(struct kvm *kvm);
+
 int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
int map_writable, int max_level, kvm_pfn_t pfn,
bool prefault);
-- 
2.31.0.291.g576ba9dcdaf-goog



[PATCH 13/13] KVM: x86/mmu: Tear down roots in fast invalidation thread

2021-03-31 Thread Ben Gardon
To avoid saddling a vCPU thread with the work of tearing down an entire
paging structure, take a reference on each root before it becomes
obsolete, so that the thread initiating the fast invalidation can tear
down the paging structure and (most likely) release the last reference.
As a bonus, this teardown can happen under the MMU lock in read mode so
as not to block the progress of vCPU threads.

Signed-off-by: Ben Gardon 
---
 arch/x86/kvm/mmu/mmu.c |  6 
 arch/x86/kvm/mmu/tdp_mmu.c | 74 +-
 arch/x86/kvm/mmu/tdp_mmu.h |  1 +
 3 files changed, 80 insertions(+), 1 deletion(-)

diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 49b7097fb55b..22742619698d 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -5455,6 +5455,12 @@ static void kvm_mmu_zap_all_fast(struct kvm *kvm)
kvm_zap_obsolete_pages(kvm);
 
write_unlock(&kvm->mmu_lock);
+
+   if (is_tdp_mmu_enabled(kvm)) {
+   read_lock(&kvm->mmu_lock);
+   kvm_tdp_mmu_zap_all_fast(kvm);
+   read_unlock(&kvm->mmu_lock);
+   }
 }
 
 static bool kvm_has_zapped_obsolete_pages(struct kvm *kvm)
diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index 428ff6778426..5498df7e2e1f 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -794,13 +794,85 @@ void kvm_tdp_mmu_zap_all(struct kvm *kvm)
  * kvm_reload_remote_mmus. Since this is in the same critical section, no new
  * roots will be created between this function and the MMU reload signals
  * being sent.
+ * Take a reference on all roots so that this thread can do the bulk of
+ * the work required to free the roots once they are invalidated. Without
+ * this reference, a vCPU thread might drop the last reference to a root
+ * and get stuck with tearing down the entire paging structure.
  */
 void kvm_tdp_mmu_invalidate_roots(struct kvm *kvm)
 {
struct kvm_mmu_page *root;
 
for_each_tdp_mmu_root(kvm, root)
-   root->role.invalid = true;
+   if (refcount_inc_not_zero(&root->tdp_mmu_root_count))
+   root->role.invalid = true;
+}
+
+static struct kvm_mmu_page *next_invalidated_root(struct kvm *kvm,
+ struct kvm_mmu_page 
*prev_root)
+{
+   struct kvm_mmu_page *next_root;
+
+   if (prev_root)
+   next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots,
+ &prev_root->link,
+ typeof(*prev_root), link);
+   else
+   next_root = list_first_or_null_rcu(&kvm->arch.tdp_mmu_roots,
+  typeof(*next_root), link);
+
+   while (next_root && !(next_root->role.invalid &&
+ refcount_read(&next_root->tdp_mmu_root_count)))
+   next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots,
+ &next_root->link,
+ typeof(*next_root), link);
+
+   return next_root;
+}
+
+/*
+ * Since kvm_tdp_mmu_invalidate_roots has acquired a reference to each
+ * invalidated root, they will not be freed until this function drops the
+ * reference. Before dropping that reference, tear down the paging
+ * structure so that whichever thread does drop the last reference
+ * only has to do a trivial amount of work. Since the roots are invalid,
+ * no new SPTEs should be created under them.
+ */
+void kvm_tdp_mmu_zap_all_fast(struct kvm *kvm)
+{
+   gfn_t max_gfn = 1ULL << (shadow_phys_bits - PAGE_SHIFT);
+   struct kvm_mmu_page *next_root;
+   struct kvm_mmu_page *root;
+   bool flush = false;
+
+   lockdep_assert_held_read(&kvm->mmu_lock);
+
+   rcu_read_lock();
+
+   root = next_invalidated_root(kvm, NULL);
+
+   while (root) {
+   next_root = next_invalidated_root(kvm, root);
+
+   rcu_read_unlock();
+
+   flush |= zap_gfn_range(kvm, root, 0, max_gfn, true, true);
+
+   /*
+* Put the reference acquired in
+* kvm_tdp_mmu_invalidate_roots
+*/
+   kvm_tdp_mmu_put_root(kvm, root, true);
+
+   root = next_root;
+
+   rcu_read_lock();
+   }
+
+   rcu_read_unlock();
+
+   if (flush)
+   kvm_flush_remote_tlbs(kvm);
 }
 
 /*
diff --git a/arch/x86/kvm/mmu/tdp_mmu.h b/arch/x86/kvm/mmu/tdp_mmu.h
index ff4978817fb8..d6d98f9047cd 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.h
+++ b/arch/x86/kvm/mmu/tdp_mmu.h
@@ -24,6 +24,7 @@ bool kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, gfn_t start, 
gfn_t end,
 void kvm_tdp_mmu_zap_all(struct kvm *kvm);
 
 void kvm_tdp_mmu_invalidate_roots(struct kvm

[PATCH 10/13] KVM: x86/mmu: Allow zapping collapsible SPTEs to use MMU read lock

2021-03-31 Thread Ben Gardon
To speed the process of disabling dirty logging, change the TDP MMU
function which zaps collapsible SPTEs to run under the MMU read lock.

Signed-off-by: Ben Gardon 
---
 arch/x86/kvm/mmu/mmu.c |  9 ++---
 arch/x86/kvm/mmu/tdp_mmu.c | 17 +
 2 files changed, 19 insertions(+), 7 deletions(-)

diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index dcbfc784cf2f..81967b4e7d76 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -5610,10 +5610,13 @@ void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm,
 
write_lock(&kvm->mmu_lock);
slot_handle_leaf(kvm, slot, kvm_mmu_zap_collapsible_spte, true);
-
-   if (is_tdp_mmu_enabled(kvm))
-   kvm_tdp_mmu_zap_collapsible_sptes(kvm, slot);
write_unlock(&kvm->mmu_lock);
+
+   if (is_tdp_mmu_enabled(kvm)) {
+   read_lock(&kvm->mmu_lock);
+   kvm_tdp_mmu_zap_collapsible_sptes(kvm, memslot);
+   read_unlock(&kvm->mmu_lock);
+   }
 }
 
 void kvm_arch_flush_remote_tlbs_memslot(struct kvm *kvm,
diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index 0e99e4675dd4..862acb868abd 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -1335,7 +1335,8 @@ static void zap_collapsible_spte_range(struct kvm *kvm,
rcu_read_lock();
 
tdp_root_for_each_pte(iter, root, start, end) {
-   if (tdp_mmu_iter_cond_resched(kvm, &iter, spte_set, false)) {
+retry:
+   if (tdp_mmu_iter_cond_resched(kvm, &iter, spte_set, true)) {
spte_set = false;
continue;
}
@@ -1350,8 +1351,14 @@ static void zap_collapsible_spte_range(struct kvm *kvm,
pfn, PG_LEVEL_NUM))
continue;
 
-   tdp_mmu_set_spte(kvm, &iter, 0);
-
+   if (!tdp_mmu_zap_spte_atomic(kvm, &iter)) {
+   /*
+* The iter must explicitly re-read the SPTE because
+* the atomic cmpxchg failed.
+*/
+   iter.old_spte = READ_ONCE(*rcu_dereference(iter.sptep));
+   goto retry;
+   }
spte_set = true;
}
 
@@ -1370,7 +1377,9 @@ void kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm,
struct kvm_mmu_page *root;
int root_as_id;
 
-   for_each_tdp_mmu_root_yield_safe(kvm, root, false) {
+   lockdep_assert_held_read(&kvm->mmu_lock);
+
+   for_each_tdp_mmu_root_yield_safe(kvm, root, true) {
root_as_id = kvm_mmu_page_as_id(root);
if (root_as_id != slot->as_id)
continue;
-- 
2.31.0.291.g576ba9dcdaf-goog



[PATCH 11/13] KVM: x86/mmu: Allow enabling / disabling dirty logging under MMU read lock

2021-03-31 Thread Ben Gardon
To reduce lock contention and interference with page fault handlers,
allow the TDP MMU functions which enable and disable dirty logging
to operate under the MMU read lock.

Signed-off-by: Ben Gardon 
---
 arch/x86/kvm/mmu/mmu.c | 16 +++---
 arch/x86/kvm/mmu/tdp_mmu.c | 62 ++
 2 files changed, 61 insertions(+), 17 deletions(-)

diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 81967b4e7d76..bf535c9f7ff2 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -5543,10 +5543,14 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm,
write_lock(&kvm->mmu_lock);
flush = slot_handle_level(kvm, memslot, slot_rmap_write_protect,
start_level, KVM_MAX_HUGEPAGE_LEVEL, false);
-   if (is_tdp_mmu_enabled(kvm))
-   flush |= kvm_tdp_mmu_wrprot_slot(kvm, memslot, PG_LEVEL_4K);
write_unlock(&kvm->mmu_lock);
 
+   if (is_tdp_mmu_enabled(kvm)) {
+   read_lock(&kvm->mmu_lock);
+   flush |= kvm_tdp_mmu_wrprot_slot(kvm, memslot, PG_LEVEL_4K);
+   read_unlock(&kvm->mmu_lock);
+   }
+
/*
 * We can flush all the TLBs out of the mmu lock without TLB
 * corruption since we just change the spte from writable to
@@ -5641,10 +5645,14 @@ void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm,
 
write_lock(&kvm->mmu_lock);
flush = slot_handle_leaf(kvm, memslot, __rmap_clear_dirty, false);
-   if (is_tdp_mmu_enabled(kvm))
-   flush |= kvm_tdp_mmu_clear_dirty_slot(kvm, memslot);
write_unlock(&kvm->mmu_lock);
 
+   if (is_tdp_mmu_enabled(kvm)) {
+   read_lock(&kvm->mmu_lock);
+   flush |= kvm_tdp_mmu_clear_dirty_slot(kvm, memslot);
+   read_unlock(&kvm->mmu_lock);
+   }
+
/*
 * It's also safe to flush TLBs out of mmu lock here as currently this
 * function is only used for dirty logging, in which case flushing TLB
diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index 862acb868abd..0c90dc034819 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -491,8 +491,9 @@ static void handle_changed_spte(struct kvm *kvm, int as_id, 
gfn_t gfn,
 }
 
 /*
- * tdp_mmu_set_spte_atomic - Set a TDP MMU SPTE atomically and handle the
- * associated bookkeeping
+ * tdp_mmu_set_spte_atomic_no_dirty_log - Set a TDP MMU SPTE atomically
+ * and handle the associated bookkeeping, but do not mark the page dirty
+ * in KVM's dirty bitmaps.
  *
  * @kvm: kvm instance
  * @iter: a tdp_iter instance currently on the SPTE that should be set
@@ -500,9 +501,9 @@ static void handle_changed_spte(struct kvm *kvm, int as_id, 
gfn_t gfn,
  * Returns: true if the SPTE was set, false if it was not. If false is 
returned,
  * this function will have no side-effects.
  */
-static inline bool tdp_mmu_set_spte_atomic(struct kvm *kvm,
-  struct tdp_iter *iter,
-  u64 new_spte)
+static inline bool tdp_mmu_set_spte_atomic_no_dirty_log(struct kvm *kvm,
+   struct tdp_iter *iter,
+   u64 new_spte)
 {
lockdep_assert_held_read(&kvm->mmu_lock);
 
@@ -517,9 +518,22 @@ static inline bool tdp_mmu_set_spte_atomic(struct kvm *kvm,
  new_spte) != iter->old_spte)
return false;
 
-   handle_changed_spte(kvm, iter->as_id, iter->gfn, iter->old_spte,
-   new_spte, iter->level, true);
+   __handle_changed_spte(kvm, iter->as_id, iter->gfn, iter->old_spte,
+ new_spte, iter->level, true);
+   handle_changed_spte_acc_track(iter->old_spte, new_spte, iter->level);
+
+   return true;
+}
+
+static inline bool tdp_mmu_set_spte_atomic(struct kvm *kvm,
+  struct tdp_iter *iter,
+  u64 new_spte)
+{
+   if (!tdp_mmu_set_spte_atomic_no_dirty_log(kvm, iter, new_spte))
+   return false;
 
+   handle_changed_spte_dirty_log(kvm, iter->as_id, iter->gfn,
+ iter->old_spte, new_spte, iter->level);
return true;
 }
 
@@ -1142,7 +1156,8 @@ static bool wrprot_gfn_range(struct kvm *kvm, struct 
kvm_mmu_page *root,
 
for_each_tdp_pte_min_level(iter, root->spt, root->role.level,
   min_level, start, end) {
-   if (tdp_mmu_iter_cond_resched(kvm, &iter, false, false))
+retry:
+   if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true))
continue;
 
if (!is_shadow_present_pte(iter.old_spte) ||
@@ -115

[PATCH 08/13] KVM: x86/mmu: Protect the tdp_mmu_roots list with RCU

2021-03-31 Thread Ben Gardon
Protect the contents of the TDP MMU roots list with RCU in preparation
for a future patch which will allow the iterator macro to be used under
the MMU lock in read mode.

Signed-off-by: Ben Gardon 
---
 arch/x86/kvm/mmu/tdp_mmu.c | 64 +-
 1 file changed, 36 insertions(+), 28 deletions(-)

diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index 1f0b2d6124a2..d255125059c4 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -50,6 +50,22 @@ static void tdp_mmu_free_sp(struct kvm_mmu_page *sp)
kmem_cache_free(mmu_page_header_cache, sp);
 }
 
+/*
+ * This is called through call_rcu in order to free TDP page table memory
+ * safely with respect to other kernel threads that may be operating on
+ * the memory.
+ * By only accessing TDP MMU page table memory in an RCU read critical
+ * section, and freeing it after a grace period, lockless access to that
+ * memory won't use it after it is freed.
+ */
+static void tdp_mmu_free_sp_rcu_callback(struct rcu_head *head)
+{
+   struct kvm_mmu_page *sp = container_of(head, struct kvm_mmu_page,
+  rcu_head);
+
+   tdp_mmu_free_sp(sp);
+}
+
 void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root)
 {
gfn_t max_gfn = 1ULL << (shadow_phys_bits - PAGE_SHIFT);
@@ -61,11 +77,13 @@ void kvm_tdp_mmu_put_root(struct kvm *kvm, struct 
kvm_mmu_page *root)
 
WARN_ON(!root->tdp_mmu_page);
 
-   list_del(&root->link);
+   spin_lock(&kvm->arch.tdp_mmu_pages_lock);
+   list_del_rcu(&root->link);
+   spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
 
zap_gfn_range(kvm, root, 0, max_gfn, false);
 
-   tdp_mmu_free_sp(root);
+   call_rcu(&root->rcu_head, tdp_mmu_free_sp_rcu_callback);
 }
 
 /*
@@ -82,18 +100,21 @@ static struct kvm_mmu_page *tdp_mmu_next_root(struct kvm 
*kvm,
 
lockdep_assert_held_write(&kvm->mmu_lock);
 
+   rcu_read_lock();
+
if (prev_root)
-   next_root = list_next_entry(prev_root, link);
+   next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots,
+ &prev_root->link,
+ typeof(*prev_root), link);
else
-   next_root = list_first_entry(&kvm->arch.tdp_mmu_roots,
-typeof(*next_root), link);
+   next_root = list_first_or_null_rcu(&kvm->arch.tdp_mmu_roots,
+  typeof(*next_root), link);
 
-   while (!list_entry_is_head(next_root, &kvm->arch.tdp_mmu_roots, link) &&
-  !kvm_tdp_mmu_get_root(kvm, next_root))
-   next_root = list_next_entry(next_root, link);
+   while (next_root && !kvm_tdp_mmu_get_root(kvm, next_root))
+   next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots,
+   &next_root->link, typeof(*next_root), link);
 
-   if (list_entry_is_head(next_root, &kvm->arch.tdp_mmu_roots, link))
-   next_root = NULL;
+   rcu_read_unlock();
 
if (prev_root)
kvm_tdp_mmu_put_root(kvm, prev_root);
@@ -114,7 +135,8 @@ static struct kvm_mmu_page *tdp_mmu_next_root(struct kvm 
*kvm,
 
 /* Only safe under the MMU lock in write mode, without yielding. */
 #define for_each_tdp_mmu_root(_kvm, _root) \
-   list_for_each_entry(_root, &_kvm->arch.tdp_mmu_roots, link)
+   list_for_each_entry_rcu(_root, &_kvm->arch.tdp_mmu_roots, link, \
+   lockdep_is_held_write(&kvm->mmu_lock))
 
 static union kvm_mmu_page_role page_role_for_level(struct kvm_vcpu *vcpu,
   int level)
@@ -168,28 +190,14 @@ hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu)
root = alloc_tdp_mmu_page(vcpu, 0, vcpu->arch.mmu->shadow_root_level);
refcount_set(&root->tdp_mmu_root_count, 1);
 
-   list_add(&root->link, &kvm->arch.tdp_mmu_roots);
+   spin_lock(&kvm->arch.tdp_mmu_pages_lock);
+   list_add_rcu(&root->link, &kvm->arch.tdp_mmu_roots);
+   spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
 
 out:
return __pa(root->spt);
 }
 
-/*
- * This is called through call_rcu in order to free TDP page table memory
- * safely with respect to other kernel threads that may be operating on
- * the memory.
- * By only accessing TDP MMU page table memory in an RCU read critical
- * section, and freeing it after a grace period, lockless access to that
- * memory won't use it after it is freed.
- */
-static void tdp_mmu_free_sp_rcu_callback(struct rcu_head *head)
-{
-   struct kvm_mmu_pag

[PATCH 09/13] KVM: x86/mmu: Allow zap gfn range to operate under the mmu read lock

2021-03-31 Thread Ben Gardon
To reduce lock contention and interference with page fault handlers,
allow the TDP MMU function to zap a GFN range to operate under the MMU
read lock.

Signed-off-by: Ben Gardon 
---
 arch/x86/kvm/mmu/mmu.c |  15 --
 arch/x86/kvm/mmu/tdp_mmu.c | 102 ++---
 arch/x86/kvm/mmu/tdp_mmu.h |   6 ++-
 3 files changed, 87 insertions(+), 36 deletions(-)

diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 667d64daa82c..dcbfc784cf2f 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -3155,7 +3155,7 @@ static void mmu_free_root_page(struct kvm *kvm, hpa_t 
*root_hpa,
sp = to_shadow_page(*root_hpa & PT64_BASE_ADDR_MASK);
 
if (is_tdp_mmu_page(sp))
-   kvm_tdp_mmu_put_root(kvm, sp);
+   kvm_tdp_mmu_put_root(kvm, sp, false);
else if (!--sp->root_count && sp->role.invalid)
kvm_mmu_prepare_zap_page(kvm, sp, invalid_list);
 
@@ -5514,13 +5514,17 @@ void kvm_zap_gfn_range(struct kvm *kvm, gfn_t 
gfn_start, gfn_t gfn_end)
}
}
 
+   write_unlock(&kvm->mmu_lock);
+
if (is_tdp_mmu_enabled(kvm)) {
-   flush = kvm_tdp_mmu_zap_gfn_range(kvm, gfn_start, gfn_end);
+   read_lock(&kvm->mmu_lock);
+   flush = kvm_tdp_mmu_zap_gfn_range(kvm, gfn_start, gfn_end,
+ true);
if (flush)
kvm_flush_remote_tlbs(kvm);
-   }
 
-   write_unlock(&kvm->mmu_lock);
+   read_unlock(&kvm->mmu_lock);
+   }
 }
 
 static bool slot_rmap_write_protect(struct kvm *kvm,
@@ -5959,7 +5963,8 @@ static void kvm_recover_nx_lpages(struct kvm *kvm)
WARN_ON_ONCE(!sp->lpage_disallowed);
if (is_tdp_mmu_page(sp)) {
kvm_tdp_mmu_zap_gfn_range(kvm, sp->gfn,
-   sp->gfn + KVM_PAGES_PER_HPAGE(sp->role.level));
+   sp->gfn + KVM_PAGES_PER_HPAGE(sp->role.level),
+   false);
} else {
kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list);
WARN_ON_ONCE(sp->lpage_disallowed);
diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index d255125059c4..0e99e4675dd4 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -27,6 +27,15 @@ void kvm_mmu_init_tdp_mmu(struct kvm *kvm)
INIT_LIST_HEAD(&kvm->arch.tdp_mmu_pages);
 }
 
+static __always_inline void kvm_lockdep_assert_mmu_lock_held(struct kvm *kvm,
+bool shared)
+{
+   if (shared)
+   lockdep_assert_held_read(&kvm->mmu_lock);
+   else
+   lockdep_assert_held_write(&kvm->mmu_lock);
+}
+
 void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm)
 {
if (!kvm->arch.tdp_mmu_enabled)
@@ -42,7 +51,7 @@ void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm)
 }
 
 static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
- gfn_t start, gfn_t end, bool can_yield);
+ gfn_t start, gfn_t end, bool can_yield, bool shared);
 
 static void tdp_mmu_free_sp(struct kvm_mmu_page *sp)
 {
@@ -66,11 +75,12 @@ static void tdp_mmu_free_sp_rcu_callback(struct rcu_head 
*head)
tdp_mmu_free_sp(sp);
 }
 
-void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root)
+void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root,
+ bool shared)
 {
gfn_t max_gfn = 1ULL << (shadow_phys_bits - PAGE_SHIFT);
 
-   lockdep_assert_held_write(&kvm->mmu_lock);
+   kvm_lockdep_assert_mmu_lock_held(kvm, shared);
 
if (!refcount_dec_and_test(&root->tdp_mmu_root_count))
return;
@@ -81,7 +91,7 @@ void kvm_tdp_mmu_put_root(struct kvm *kvm, struct 
kvm_mmu_page *root)
list_del_rcu(&root->link);
spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
 
-   zap_gfn_range(kvm, root, 0, max_gfn, false);
+   zap_gfn_range(kvm, root, 0, max_gfn, false, shared);
 
call_rcu(&root->rcu_head, tdp_mmu_free_sp_rcu_callback);
 }
@@ -94,11 +104,11 @@ void kvm_tdp_mmu_put_root(struct kvm *kvm, struct 
kvm_mmu_page *root)
  * function will return NULL.
  */
 static struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm,
- struct kvm_mmu_page *prev_root)
+ struct kvm_mmu_page *prev_root,
+ bool shared)
 {
struct kvm_mmu_page *next_root;
 
-   lockdep_assert_held_write(&kvm->mmu_lock);
 
rcu_read_lock();
 
@@ -117,7 +127,7 @@ static struct kvm_mmu_page *tdp_mmu_next_root(struct kvm 
*kvm,
rcu_read_unloc

[PATCH 07/13] KVM: x86/mmu: Make TDP MMU root refcount atomic

2021-03-31 Thread Ben Gardon
In order to parallelize more operations for the TDP MMU, make the
refcount on TDP MMU roots atomic, so that a future patch can allow
multiple threads to take a reference on the root concurrently, while
holding the MMU lock in read mode.

Signed-off-by: Ben Gardon 
---
 arch/x86/kvm/mmu/mmu_internal.h |  6 +-
 arch/x86/kvm/mmu/tdp_mmu.c  | 15 ---
 arch/x86/kvm/mmu/tdp_mmu.h  |  9 +++--
 3 files changed, 16 insertions(+), 14 deletions(-)

diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h
index 788dcf77c957..0a040d6a4f35 100644
--- a/arch/x86/kvm/mmu/mmu_internal.h
+++ b/arch/x86/kvm/mmu/mmu_internal.h
@@ -50,7 +50,11 @@ struct kvm_mmu_page {
u64 *spt;
/* hold the gfn of each spte inside spt */
gfn_t *gfns;
-   int root_count;  /* Currently serving as active root */
+   /* Currently serving as active root */
+   union {
+   int root_count;
+   refcount_t tdp_mmu_root_count;
+   };
unsigned int unsync_children;
struct kvm_rmap_head parent_ptes; /* rmap pointers to parent sptes */
DECLARE_BITMAP(unsync_child_bitmap, 512);
diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index ab1d26b40164..1f0b2d6124a2 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -56,7 +56,7 @@ void kvm_tdp_mmu_put_root(struct kvm *kvm, struct 
kvm_mmu_page *root)
 
lockdep_assert_held_write(&kvm->mmu_lock);
 
-   if (--root->root_count)
+   if (!refcount_dec_and_test(&root->tdp_mmu_root_count))
return;
 
WARN_ON(!root->tdp_mmu_page);
@@ -88,10 +88,12 @@ static struct kvm_mmu_page *tdp_mmu_next_root(struct kvm 
*kvm,
next_root = list_first_entry(&kvm->arch.tdp_mmu_roots,
 typeof(*next_root), link);
 
+   while (!list_entry_is_head(next_root, &kvm->arch.tdp_mmu_roots, link) &&
+  !kvm_tdp_mmu_get_root(kvm, next_root))
+   next_root = list_next_entry(next_root, link);
+
if (list_entry_is_head(next_root, &kvm->arch.tdp_mmu_roots, link))
next_root = NULL;
-   else
-   kvm_tdp_mmu_get_root(kvm, next_root);
 
if (prev_root)
kvm_tdp_mmu_put_root(kvm, prev_root);
@@ -158,14 +160,13 @@ hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu)
 
/* Check for an existing root before allocating a new one. */
for_each_tdp_mmu_root(kvm, root) {
-   if (root->role.word == role.word) {
-   kvm_tdp_mmu_get_root(kvm, root);
+   if (root->role.word == role.word &&
+   kvm_tdp_mmu_get_root(kvm, root))
goto out;
-   }
}
 
root = alloc_tdp_mmu_page(vcpu, 0, vcpu->arch.mmu->shadow_root_level);
-   root->root_count = 1;
+   refcount_set(&root->tdp_mmu_root_count, 1);
 
list_add(&root->link, &kvm->arch.tdp_mmu_roots);
 
diff --git a/arch/x86/kvm/mmu/tdp_mmu.h b/arch/x86/kvm/mmu/tdp_mmu.h
index 5d950e987fc7..9961df505067 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.h
+++ b/arch/x86/kvm/mmu/tdp_mmu.h
@@ -7,13 +7,10 @@
 
 hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu);
 
-static inline void kvm_tdp_mmu_get_root(struct kvm *kvm,
-   struct kvm_mmu_page *root)
+__must_check static inline bool kvm_tdp_mmu_get_root(struct kvm *kvm,
+struct kvm_mmu_page *root)
 {
-   BUG_ON(!root->root_count);
-   lockdep_assert_held(&kvm->mmu_lock);
-
-   ++root->root_count;
+   return refcount_inc_not_zero(&root->tdp_mmu_root_count);
 }
 
 void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root);
-- 
2.31.0.291.g576ba9dcdaf-goog



[PATCH 00/13] More parallel operations for the TDP MMU

2021-03-31 Thread Ben Gardon
Now that the TDP MMU is able to handle page faults in parallel, it's a
relatively small change to expand to other operations. This series allows
zapping a range of GFNs, reclaiming collapsible SPTEs (when disabling
dirty logging), and enabling dirty logging to all happen under the MMU
lock in read mode.

This is partly a cleanup + rewrite of the last few patches of the parallel
page faults series. I've incorporated feedback from Sean and Paolo, but
the patches have changed so much that I'm sending this as a separate
series.

Ran kvm-unit-tests + selftests on an SMP kernel + Intel Skylake, with the
TDP MMU enabled and disabled. This series introduces no new failures or
warnings.

I know this will conflict horribly with the patches from Sean's series
which were just queued, and I'll send a v2 to fix those conflicts +
address any feedback on this v1.

Ben Gardon (13):
  KVM: x86/mmu: Re-add const qualifier in
kvm_tdp_mmu_zap_collapsible_sptes
  KVM: x86/mmu: Move kvm_mmu_(get|put)_root to TDP MMU
  KVM: x86/mmu: use tdp_mmu_free_sp to free roots
  KVM: x86/mmu: Merge TDP MMU put and free root
  KVM: x86/mmu: comment for_each_tdp_mmu_root requires MMU write lock
  KVM: x86/mmu: Refactor yield safe root iterator
  KVM: x86/mmu: Make TDP MMU root refcount atomic
  KVM: x86/mmu: Protect the tdp_mmu_roots list with RCU
  KVM: x86/mmu: Allow zap gfn range to operate under the mmu read lock
  KVM: x86/mmu: Allow zapping collapsible SPTEs to use MMU read lock
  KVM: x86/mmu: Allow enabling / disabling dirty logging under MMU read
lock
  KVM: x86/mmu: Fast invalidation for TDP MMU
  KVM: x86/mmu: Tear down roots in fast invalidation thread

 arch/x86/kvm/mmu/mmu.c  |  70 +++---
 arch/x86/kvm/mmu/mmu_internal.h |  27 +--
 arch/x86/kvm/mmu/tdp_mmu.c  | 383 
 arch/x86/kvm/mmu/tdp_mmu.h  |  21 +-
 include/linux/kvm_host.h|   2 +-
 5 files changed, 357 insertions(+), 146 deletions(-)

-- 
2.31.0.291.g576ba9dcdaf-goog



[PATCH 01/13] KVM: x86/mmu: Re-add const qualifier in kvm_tdp_mmu_zap_collapsible_sptes

2021-03-31 Thread Ben Gardon
kvm_tdp_mmu_zap_collapsible_sptes unnecessarily removes the const
qualifier from its memslot argument, leading to a compiler warning. Add
the const annotation and pass it to subsequent functions.
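
As a minimal sketch of the warning class being fixed (hypothetical code,
not the actual diagnostic from this build): passing a const-qualified
slot pointer to a callee whose prototype lacks the qualifier discards
const.

/* Hypothetical example: the callee's prototype is missing const, so the
 * call from a const context discards the qualifier and the compiler
 * warns.
 */
static void walk_slot(struct kvm_memory_slot *slot)
{
}

static void example(const struct kvm_memory_slot *slot)
{
	walk_slot(slot);	/* warning: discards 'const' qualifier */
}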

Signed-off-by: Ben Gardon 
---
 arch/x86/kvm/mmu/mmu.c  | 10 +-
 arch/x86/kvm/mmu/mmu_internal.h |  5 +++--
 arch/x86/kvm/mmu/tdp_mmu.c  |  4 ++--
 arch/x86/kvm/mmu/tdp_mmu.h  |  2 +-
 include/linux/kvm_host.h|  2 +-
 5 files changed, 12 insertions(+), 11 deletions(-)

diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index c6ed633594a2..f75cbb0fcc9c 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -715,8 +715,7 @@ static void kvm_mmu_page_set_gfn(struct kvm_mmu_page *sp, 
int index, gfn_t gfn)
  * handling slots that are not large page aligned.
  */
 static struct kvm_lpage_info *lpage_info_slot(gfn_t gfn,
- struct kvm_memory_slot *slot,
- int level)
+   const struct kvm_memory_slot *slot, int level)
 {
unsigned long idx;
 
@@ -2736,7 +2735,7 @@ static void direct_pte_prefetch(struct kvm_vcpu *vcpu, 
u64 *sptep)
 }
 
 static int host_pfn_mapping_level(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn,
- struct kvm_memory_slot *slot)
+ const struct kvm_memory_slot *slot)
 {
unsigned long hva;
pte_t *pte;
@@ -2762,8 +2761,9 @@ static int host_pfn_mapping_level(struct kvm *kvm, gfn_t 
gfn, kvm_pfn_t pfn,
return level;
 }
 
-int kvm_mmu_max_mapping_level(struct kvm *kvm, struct kvm_memory_slot *slot,
- gfn_t gfn, kvm_pfn_t pfn, int max_level)
+int kvm_mmu_max_mapping_level(struct kvm *kvm,
+ const struct kvm_memory_slot *slot, gfn_t gfn,
+ kvm_pfn_t pfn, int max_level)
 {
struct kvm_lpage_info *linfo;
 
diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h
index e03267e93459..fc88f62d7bd9 100644
--- a/arch/x86/kvm/mmu/mmu_internal.h
+++ b/arch/x86/kvm/mmu/mmu_internal.h
@@ -156,8 +156,9 @@ enum {
 #define SET_SPTE_NEED_REMOTE_TLB_FLUSH BIT(1)
 #define SET_SPTE_SPURIOUS  BIT(2)
 
-int kvm_mmu_max_mapping_level(struct kvm *kvm, struct kvm_memory_slot *slot,
- gfn_t gfn, kvm_pfn_t pfn, int max_level);
+int kvm_mmu_max_mapping_level(struct kvm *kvm,
+ const struct kvm_memory_slot *slot, gfn_t gfn,
+ kvm_pfn_t pfn, int max_level);
 int kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, gfn_t gfn,
int max_level, kvm_pfn_t *pfnp,
bool huge_page_disallowed, int *req_level);
diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index f2c335854afb..6d4f4e305163 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -1268,7 +1268,7 @@ void kvm_tdp_mmu_clear_dirty_pt_masked(struct kvm *kvm,
  */
 static void zap_collapsible_spte_range(struct kvm *kvm,
   struct kvm_mmu_page *root,
-  struct kvm_memory_slot *slot)
+  const struct kvm_memory_slot *slot)
 {
gfn_t start = slot->base_gfn;
gfn_t end = start + slot->npages;
@@ -1309,7 +1309,7 @@ static void zap_collapsible_spte_range(struct kvm *kvm,
  * be replaced by large mappings, for GFNs within the slot.
  */
 void kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm,
-  struct kvm_memory_slot *slot)
+  const struct kvm_memory_slot *slot)
 {
struct kvm_mmu_page *root;
int root_as_id;
diff --git a/arch/x86/kvm/mmu/tdp_mmu.h b/arch/x86/kvm/mmu/tdp_mmu.h
index 3b761c111bff..683d1d69c8c8 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.h
+++ b/arch/x86/kvm/mmu/tdp_mmu.h
@@ -34,7 +34,7 @@ void kvm_tdp_mmu_clear_dirty_pt_masked(struct kvm *kvm,
   gfn_t gfn, unsigned long mask,
   bool wrprot);
 void kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm,
-  struct kvm_memory_slot *slot);
+  const struct kvm_memory_slot *slot);
 
 bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm,
   struct kvm_memory_slot *slot, gfn_t gfn);
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 1b65e7204344..74e56e8673a6 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -1116,7 +1116,7 @@ __gfn_to_memslot(struct kvm_memslots *slots, gfn_t gfn)
 }
 
 static inline unsigned long
-__gfn_to_hva_memslot(struct kvm_memory_slot *slot, gfn_t gfn)
+__gfn_to_hva_memslot(const struct kvm_memory_slot *slot, gfn_t gfn)
 {
return slot->userspace_addr + (g

[PATCH 04/13] KVM: x86/mmu: Merge TDP MMU put and free root

2021-03-31 Thread Ben Gardon
kvm_tdp_mmu_put_root and kvm_tdp_mmu_free_root are always called
together, so merge the functions to simplify TDP MMU root refcounting /
freeing.

Signed-off-by: Ben Gardon 
---
 arch/x86/kvm/mmu/mmu.c |  4 +--
 arch/x86/kvm/mmu/tdp_mmu.c | 54 ++
 arch/x86/kvm/mmu/tdp_mmu.h | 10 +--
 3 files changed, 28 insertions(+), 40 deletions(-)

diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 618cc011f446..667d64daa82c 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -3154,8 +3154,8 @@ static void mmu_free_root_page(struct kvm *kvm, hpa_t 
*root_hpa,
 
sp = to_shadow_page(*root_hpa & PT64_BASE_ADDR_MASK);
 
-   if (is_tdp_mmu_page(sp) && kvm_tdp_mmu_put_root(kvm, sp))
-   kvm_tdp_mmu_free_root(kvm, sp);
+   if (is_tdp_mmu_page(sp))
+   kvm_tdp_mmu_put_root(kvm, sp);
else if (!--sp->root_count && sp->role.invalid)
kvm_mmu_prepare_zap_page(kvm, sp, invalid_list);
 
diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index 5a2698d64957..368091adab09 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -41,10 +41,31 @@ void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm)
rcu_barrier();
 }
 
-static void tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root)
+static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
+ gfn_t start, gfn_t end, bool can_yield);
+
+static void tdp_mmu_free_sp(struct kvm_mmu_page *sp)
 {
-   if (kvm_tdp_mmu_put_root(kvm, root))
-   kvm_tdp_mmu_free_root(kvm, root);
+   free_page((unsigned long)sp->spt);
+   kmem_cache_free(mmu_page_header_cache, sp);
+}
+
+void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root)
+{
+   gfn_t max_gfn = 1ULL << (shadow_phys_bits - PAGE_SHIFT);
+
+   lockdep_assert_held_write(&kvm->mmu_lock);
+
+   if (--root->root_count)
+   return;
+
+   WARN_ON(!root->tdp_mmu_page);
+
+   list_del(&root->link);
+
+   zap_gfn_range(kvm, root, 0, max_gfn, false);
+
+   tdp_mmu_free_sp(root);
 }
 
 static inline bool tdp_mmu_next_root_valid(struct kvm *kvm,
@@ -66,7 +87,7 @@ static inline struct kvm_mmu_page *tdp_mmu_next_root(struct 
kvm *kvm,
struct kvm_mmu_page *next_root;
 
next_root = list_next_entry(root, link);
-   tdp_mmu_put_root(kvm, root);
+   kvm_tdp_mmu_put_root(kvm, root);
return next_root;
 }
 
@@ -85,31 +106,6 @@ static inline struct kvm_mmu_page *tdp_mmu_next_root(struct 
kvm *kvm,
 #define for_each_tdp_mmu_root(_kvm, _root) \
list_for_each_entry(_root, &_kvm->arch.tdp_mmu_roots, link)
 
-static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
- gfn_t start, gfn_t end, bool can_yield);
-
-static void tdp_mmu_free_sp(struct kvm_mmu_page *sp)
-{
-   free_page((unsigned long)sp->spt);
-   kmem_cache_free(mmu_page_header_cache, sp);
-}
-
-void kvm_tdp_mmu_free_root(struct kvm *kvm, struct kvm_mmu_page *root)
-{
-   gfn_t max_gfn = 1ULL << (shadow_phys_bits - PAGE_SHIFT);
-
-   lockdep_assert_held_write(&kvm->mmu_lock);
-
-   WARN_ON(root->root_count);
-   WARN_ON(!root->tdp_mmu_page);
-
-   list_del(&root->link);
-
-   zap_gfn_range(kvm, root, 0, max_gfn, false);
-
-   tdp_mmu_free_sp(root);
-}
-
 static union kvm_mmu_page_role page_role_for_level(struct kvm_vcpu *vcpu,
   int level)
 {
diff --git a/arch/x86/kvm/mmu/tdp_mmu.h b/arch/x86/kvm/mmu/tdp_mmu.h
index 2dc3b3ba48fb..5d950e987fc7 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.h
+++ b/arch/x86/kvm/mmu/tdp_mmu.h
@@ -6,7 +6,6 @@
 #include 
 
 hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu);
-void kvm_tdp_mmu_free_root(struct kvm *kvm, struct kvm_mmu_page *root);
 
 static inline void kvm_tdp_mmu_get_root(struct kvm *kvm,
struct kvm_mmu_page *root)
@@ -17,14 +16,7 @@ static inline void kvm_tdp_mmu_get_root(struct kvm *kvm,
++root->root_count;
 }
 
-static inline bool kvm_tdp_mmu_put_root(struct kvm *kvm,
-   struct kvm_mmu_page *root)
-{
-   lockdep_assert_held(&kvm->mmu_lock);
-   --root->root_count;
-
-   return !root->root_count;
-}
+void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root);
 
 bool kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, gfn_t start, gfn_t end);
 void kvm_tdp_mmu_zap_all(struct kvm *kvm);
-- 
2.31.0.291.g576ba9dcdaf-goog



[PATCH 06/13] KVM: x86/mmu: Refactor yield safe root iterator

2021-03-31 Thread Ben Gardon
Refactor the yield safe TDP MMU root iterator to be more amenable to
changes in future commits which will allow it to be used under the MMU
lock in read mode. Currently the iterator requires a complicated dance
between the helper functions and different parts of the for loop which
makes it hard to reason about. Moving all the logic into a single function
simplifies the iterator substantially.
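
For reference, the caller-visible contract is unchanged; a typical use
(shaped after the zap path elsewhere in this thread and shown purely as
an illustration, with kvm, start, and end coming from the surrounding
function) looks like:

	struct kvm_mmu_page *root;
	bool flush = false;

	/*
	 * Each iteration holds a reference on 'root'; tdp_mmu_next_root()
	 * drops the reference on the previous root, so breaking out of the
	 * loop early leaves the caller holding the most recent reference.
	 */
	for_each_tdp_mmu_root_yield_safe(kvm, root)
		flush = zap_gfn_range(kvm, root, start, end, true, flush);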

Signed-off-by: Ben Gardon 
---
 arch/x86/kvm/mmu/tdp_mmu.c | 43 ++
 1 file changed, 25 insertions(+), 18 deletions(-)

diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index 365fa9f2f856..ab1d26b40164 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -68,26 +68,34 @@ void kvm_tdp_mmu_put_root(struct kvm *kvm, struct 
kvm_mmu_page *root)
tdp_mmu_free_sp(root);
 }
 
-static inline bool tdp_mmu_next_root_valid(struct kvm *kvm,
-  struct kvm_mmu_page *root)
+/*
+ * Finds the next valid root after root (or the first valid root if root
+ * is NULL), takes a reference on it, and returns that next root. If root
+ * is not NULL, this thread should have already taken a reference on it, and
+ * that reference will be dropped. If no valid root is found, this
+ * function will return NULL.
+ */
+static struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm,
+ struct kvm_mmu_page *prev_root)
 {
-   lockdep_assert_held_write(&kvm->mmu_lock);
+   struct kvm_mmu_page *next_root;
 
-   if (list_entry_is_head(root, &kvm->arch.tdp_mmu_roots, link))
-   return false;
+   lockdep_assert_held_write(&kvm->mmu_lock);
 
-   kvm_tdp_mmu_get_root(kvm, root);
-   return true;
+   if (prev_root)
+   next_root = list_next_entry(prev_root, link);
+   else
+   next_root = list_first_entry(&kvm->arch.tdp_mmu_roots,
+typeof(*next_root), link);
 
-}
+   if (list_entry_is_head(next_root, &kvm->arch.tdp_mmu_roots, link))
+   next_root = NULL;
+   else
+   kvm_tdp_mmu_get_root(kvm, next_root);
 
-static inline struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm,
-struct kvm_mmu_page *root)
-{
-   struct kvm_mmu_page *next_root;
+   if (prev_root)
+   kvm_tdp_mmu_put_root(kvm, prev_root);
 
-   next_root = list_next_entry(root, link);
-   kvm_tdp_mmu_put_root(kvm, root);
return next_root;
 }
 
@@ -97,10 +105,9 @@ static inline struct kvm_mmu_page *tdp_mmu_next_root(struct 
kvm *kvm,
  * if exiting the loop early, the caller must drop the reference to the most
  * recent root. (Unless keeping a live reference is desirable.)
  */
-#define for_each_tdp_mmu_root_yield_safe(_kvm, _root)  
\
-   for (_root = list_first_entry(&_kvm->arch.tdp_mmu_roots,\
- typeof(*_root), link);\
-tdp_mmu_next_root_valid(_kvm, _root);  \
+#define for_each_tdp_mmu_root_yield_safe(_kvm, _root)  \
+   for (_root = tdp_mmu_next_root(_kvm, NULL); \
+_root; \
 _root = tdp_mmu_next_root(_kvm, _root))
 
 /* Only safe under the MMU lock in write mode, without yielding. */
-- 
2.31.0.291.g576ba9dcdaf-goog



[PATCH 05/13] KVM: x86/mmu: comment for_each_tdp_mmu_root requires MMU write lock

2021-03-31 Thread Ben Gardon
Currently, iterating over the list of TDP MMU roots can only be done
under the MMU write lock, but that will change in future commits. Add a
defensive comment to for_each_tdp_mmu_root noting that it must only be
used under the MMU lock in write mode. That function will not be
modified to work under the lock in read mode.

Signed-off-by: Ben Gardon 
---
 arch/x86/kvm/mmu/tdp_mmu.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index 368091adab09..365fa9f2f856 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -103,6 +103,7 @@ static inline struct kvm_mmu_page *tdp_mmu_next_root(struct 
kvm *kvm,
 tdp_mmu_next_root_valid(_kvm, _root);  \
 _root = tdp_mmu_next_root(_kvm, _root))
 
+/* Only safe under the MMU lock in write mode, without yielding. */
 #define for_each_tdp_mmu_root(_kvm, _root) \
list_for_each_entry(_root, &_kvm->arch.tdp_mmu_roots, link)
 
-- 
2.31.0.291.g576ba9dcdaf-goog



[PATCH 03/13] KVM: x86/mmu: use tdp_mmu_free_sp to free roots

2021-03-31 Thread Ben Gardon
Minor cleanup to deduplicate the code used to free a struct kvm_mmu_page
in the TDP MMU.

No functional change intended.

Signed-off-by: Ben Gardon 
---
 arch/x86/kvm/mmu/tdp_mmu.c | 15 +++
 1 file changed, 7 insertions(+), 8 deletions(-)

diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index 1929cc7a42ac..5a2698d64957 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -88,6 +88,12 @@ static inline struct kvm_mmu_page *tdp_mmu_next_root(struct 
kvm *kvm,
 static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
  gfn_t start, gfn_t end, bool can_yield);
 
+static void tdp_mmu_free_sp(struct kvm_mmu_page *sp)
+{
+   free_page((unsigned long)sp->spt);
+   kmem_cache_free(mmu_page_header_cache, sp);
+}
+
 void kvm_tdp_mmu_free_root(struct kvm *kvm, struct kvm_mmu_page *root)
 {
gfn_t max_gfn = 1ULL << (shadow_phys_bits - PAGE_SHIFT);
@@ -101,8 +107,7 @@ void kvm_tdp_mmu_free_root(struct kvm *kvm, struct 
kvm_mmu_page *root)
 
zap_gfn_range(kvm, root, 0, max_gfn, false);
 
-   free_page((unsigned long)root->spt);
-   kmem_cache_free(mmu_page_header_cache, root);
+   tdp_mmu_free_sp(root);
 }
 
 static union kvm_mmu_page_role page_role_for_level(struct kvm_vcpu *vcpu,
@@ -164,12 +169,6 @@ hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu)
return __pa(root->spt);
 }
 
-static void tdp_mmu_free_sp(struct kvm_mmu_page *sp)
-{
-   free_page((unsigned long)sp->spt);
-   kmem_cache_free(mmu_page_header_cache, sp);
-}
-
 /*
  * This is called through call_rcu in order to free TDP page table memory
  * safely with respect to other kernel threads that may be operating on
-- 
2.31.0.291.g576ba9dcdaf-goog



[PATCH 02/13] KVM: x86/mmu: Move kvm_mmu_(get|put)_root to TDP MMU

2021-03-31 Thread Ben Gardon
The TDP MMU is almost the only user of kvm_mmu_get_root and
kvm_mmu_put_root. There is only one use of put_root in mmu.c for the
legacy / shadow MMU. Open code that one use and move the get / put
functions to the TDP MMU so they can be extended in future commits.

No functional change intended.

Signed-off-by: Ben Gardon 
---
 arch/x86/kvm/mmu/mmu.c  | 10 --
 arch/x86/kvm/mmu/mmu_internal.h | 16 
 arch/x86/kvm/mmu/tdp_mmu.c  |  6 +++---
 arch/x86/kvm/mmu/tdp_mmu.h  | 18 ++
 4 files changed, 25 insertions(+), 25 deletions(-)

diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index f75cbb0fcc9c..618cc011f446 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -3154,12 +3154,10 @@ static void mmu_free_root_page(struct kvm *kvm, hpa_t 
*root_hpa,
 
sp = to_shadow_page(*root_hpa & PT64_BASE_ADDR_MASK);
 
-   if (kvm_mmu_put_root(kvm, sp)) {
-   if (is_tdp_mmu_page(sp))
-   kvm_tdp_mmu_free_root(kvm, sp);
-   else if (sp->role.invalid)
-   kvm_mmu_prepare_zap_page(kvm, sp, invalid_list);
-   }
+   if (is_tdp_mmu_page(sp) && kvm_tdp_mmu_put_root(kvm, sp))
+   kvm_tdp_mmu_free_root(kvm, sp);
+   else if (!--sp->root_count && sp->role.invalid)
+   kvm_mmu_prepare_zap_page(kvm, sp, invalid_list);
 
*root_hpa = INVALID_PAGE;
 }
diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h
index fc88f62d7bd9..788dcf77c957 100644
--- a/arch/x86/kvm/mmu/mmu_internal.h
+++ b/arch/x86/kvm/mmu/mmu_internal.h
@@ -118,22 +118,6 @@ bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm,
 void kvm_flush_remote_tlbs_with_address(struct kvm *kvm,
u64 start_gfn, u64 pages);
 
-static inline void kvm_mmu_get_root(struct kvm *kvm, struct kvm_mmu_page *sp)
-{
-   BUG_ON(!sp->root_count);
-   lockdep_assert_held(&kvm->mmu_lock);
-
-   ++sp->root_count;
-}
-
-static inline bool kvm_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *sp)
-{
-   lockdep_assert_held(&kvm->mmu_lock);
-   --sp->root_count;
-
-   return !sp->root_count;
-}
-
 /*
  * Return values of handle_mmio_page_fault, mmu.page_fault, and 
fast_page_fault().
  *
diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index 6d4f4e305163..1929cc7a42ac 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -43,7 +43,7 @@ void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm)
 
 static void tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root)
 {
-   if (kvm_mmu_put_root(kvm, root))
+   if (kvm_tdp_mmu_put_root(kvm, root))
kvm_tdp_mmu_free_root(kvm, root);
 }
 
@@ -55,7 +55,7 @@ static inline bool tdp_mmu_next_root_valid(struct kvm *kvm,
if (list_entry_is_head(root, &kvm->arch.tdp_mmu_roots, link))
return false;
 
-   kvm_mmu_get_root(kvm, root);
+   kvm_tdp_mmu_get_root(kvm, root);
return true;
 
 }
@@ -150,7 +150,7 @@ hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu)
/* Check for an existing root before allocating a new one. */
for_each_tdp_mmu_root(kvm, root) {
if (root->role.word == role.word) {
-   kvm_mmu_get_root(kvm, root);
+   kvm_tdp_mmu_get_root(kvm, root);
goto out;
}
}
diff --git a/arch/x86/kvm/mmu/tdp_mmu.h b/arch/x86/kvm/mmu/tdp_mmu.h
index 683d1d69c8c8..2dc3b3ba48fb 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.h
+++ b/arch/x86/kvm/mmu/tdp_mmu.h
@@ -8,6 +8,24 @@
 hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu);
 void kvm_tdp_mmu_free_root(struct kvm *kvm, struct kvm_mmu_page *root);
 
+static inline void kvm_tdp_mmu_get_root(struct kvm *kvm,
+   struct kvm_mmu_page *root)
+{
+   BUG_ON(!root->root_count);
+   lockdep_assert_held(&kvm->mmu_lock);
+
+   ++root->root_count;
+}
+
+static inline bool kvm_tdp_mmu_put_root(struct kvm *kvm,
+   struct kvm_mmu_page *root)
+{
+   lockdep_assert_held(&kvm->mmu_lock);
+   --root->root_count;
+
+   return !root->root_count;
+}
+
 bool kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, gfn_t start, gfn_t end);
 void kvm_tdp_mmu_zap_all(struct kvm *kvm);
 
-- 
2.31.0.291.g576ba9dcdaf-goog



Re: [PATCH 00/18] KVM: Consolidate and optimize MMU notifiers

2021-03-30 Thread Ben Gardon
On Thu, Mar 25, 2021 at 7:20 PM Sean Christopherson  wrote:
>
> The end goal of this series is to optimize the MMU notifiers to take
> mmu_lock if and only if the notification is relevant to KVM, i.e. the hva
> range overlaps a memslot.   Large VMs (hundreds of vCPUs) are very
> sensitive to mmu_lock being taken for write at inopportune times, and
> such VMs also tend to be "static", e.g. backed by HugeTLB with minimal
> page shenanigans.  The vast majority of notifications for these VMs will
> be spurious (for KVM), and eliding mmu_lock for spurious notifications
> avoids an otherwise unacceptable disruption to the guest.
>
> To get there without potentially degrading performance, e.g. due to
> multiple memslot lookups, especially on non-x86 where the use cases are
> largely unknown (from my perspective), first consolidate the MMU notifier
> logic by moving the hva->gfn lookups into common KVM.
>
> Applies on my TDP MMU TLB flushing bug fixes[*], which conflict horribly
> with the TDP MMU changes in this series.  That code applies on kvm/queue
> (commit 4a98623d5d90, "KVM: x86/mmu: Mark the PAE roots as decrypted for
> shadow paging").
>
> Speaking of conflicts, Ben will soon be posting a series to convert a
> bunch of TDP MMU flows to take mmu_lock only for read.  Presumably there
> will be an absurd number of conflicts; Ben and I will sort out the
> conflicts in whichever series loses the race.
>
> Well tested on Intel and AMD.  Compile tested for arm64, MIPS, PPC,
> PPC e500, and s390.  Absolutely needs to be tested for real on non-x86,
> I give it even odds that I introduced an off-by-one bug somewhere.
>
> [*] https://lkml.kernel.org/r/20210325200119.1359384-1-sea...@google.com
>
>
> Patches 1-7 are x86 specific prep patches to play nice with moving
> the hva->gfn memslot lookups into common code.  There ended up being waaay
> more of these than I expected/wanted, but I had a hell of a time getting
> the flushing logic right when shuffling the memslot and address space
> loops.  In the end, I was more confident I got things correct by batching
> the flushes.
>
> Patch 8 moves the existing API prototypes into common code.  It could
> technically be dropped since the old APIs are gone in the end, but I
> thought the switch to the new APIs would suck a bit less this way.

Patches 1-8 look good to me. Feel free to add my Reviewed-by tag to those.
I appreciate the care you took to make all those changes tiny and reviewable.

>
> Patch 9 moves arm64's MMU notifier tracepoints into common code so that
> they are not lost when arm64 is converted to the new APIs, and so that all
> architectures can benefit.
>
> Patch 10 moves x86's memslot walkers into common KVM.  I chose x86 purely
> because I could actually test it.  All architectures use nearly identical
> code, so I don't think it actually matters in the end.

I'm still reviewing 10 and 14-18. 10 is a huge change and the diff is
pretty hard to parse.

>
> Patches 11-13 move arm64, MIPS, and PPC to the new APIs.
>
> Patch 14 yanks out the old APIs.
>
> Patch 15 adds the mmu_lock elision, but only for unpaired notifications.

Reading through all this code and considering the changes I'm
preparing for the TDP MMU have me wondering if it might help to have a
more general purpose MMU lock context struct which could be embedded
in the structs added in this patch. I'm thinking something like:
enum kvm_mmu_lock_mode {
KVM_MMU_LOCK_NONE,
KVM_MMU_LOCK_READ,
KVM_MMU_LOCK_WRITE,
};

struct kvm_mmu_lock_context {
enum kvm_mmu_lock_mode lock_mode;
bool can_block;
bool can_yield;
bool flush;
};

This could yield some grossly long lines, but it would also have
potential to unify a bunch of ad-hoc handling.
The above struct could also fit into a single byte, so it'd be pretty
easy to pass it around.
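
Purely to illustrate the single-byte point (a hypothetical sketch, not a
proposal): the two-bit lock mode and the three booleans pack into one
byte with bitfields, e.g.:

struct kvm_mmu_lock_context {
	u8 lock_mode : 2;	/* enum kvm_mmu_lock_mode needs only 2 bits */
	u8 can_block : 1;
	u8 can_yield : 1;
	u8 flush     : 1;
};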

>
> Patch 16 adds mmu_lock elision for paired .invalidate_range_{start,end}().
> This is quite nasty and no small part of me thinks the patch should be
> burned with fire (I won't spoil it any further), but it's also the most
> problematic scenario for our particular use case.  :-/
>
> Patches 17-18 are additional x86 cleanups.
>
> Sean Christopherson (18):
>   KVM: x86/mmu: Coalesce TDP MMU TLB flushes when zapping collapsible
> SPTEs
>   KVM: x86/mmu: Move flushing for "slot" handlers to caller for legacy
> MMU
>   KVM: x86/mmu: Coalesce TLB flushes when zapping collapsible SPTEs
>   KVM: x86/mmu: Coalesce TLB flushes across address spaces for gfn range
> zap
>   KVM: x86/mmu: Pass address space ID to __kvm_tdp_mmu_zap_gfn_range()
>   KVM: x86/mmu: Pass address space ID to TDP MMU root walkers
>   KVM: x86/mmu: Use leaf-only loop for walking TDP SPTEs when changing
> SPTE
>   KVM: Move prototypes for MMU notifier callbacks to generic code
>   KVM: Move arm64's MMU notifier trace events to generic code
>   KVM: Move x86's MMU notifier memslot walkers to generic code
>   KVM: arm64: Convert to the gfn-based MMU notifier callbacks
>

Re: [PATCH v2 3/3] KVM: x86/mmu: Don't allow TDP MMU to yield when recovering NX pages

2021-03-25 Thread Ben Gardon
On Thu, Mar 25, 2021 at 3:25 PM Sean Christopherson  wrote:
>
> On Thu, Mar 25, 2021, Ben Gardon wrote:
> > On Thu, Mar 25, 2021 at 1:01 PM Sean Christopherson  
> > wrote:
> > > +static inline bool kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, gfn_t 
> > > start,
> > > +gfn_t end)
> > > +{
> > > +   return __kvm_tdp_mmu_zap_gfn_range(kvm, start, end, true);
> > > +}
> > > +static inline bool kvm_tdp_mmu_zap_sp(struct kvm *kvm, struct 
> > > kvm_mmu_page *sp)
> >
> > I'm a little leary of adding an interface which takes a non-root
> > struct kvm_mmu_page as an argument to the TDP MMU.
> > In the TDP MMU, the struct kvm_mmu_pages are protected rather subtly.
> > I agree this is safe because we hold the MMU lock in write mode here,
> > but if we ever wanted to convert to holding it in read mode things
> > could get complicated fast.
> > Maybe this is more of a concern if the function started to be used
> > elsewhere since NX recovery is already so dependent on the write lock.
>
> Agreed.  Even writing the comment below felt a bit awkward when thinking about
> additional users holding mmu_lock for read.  Actually, I should remove that
> specific blurb since zapping currently requires holding mmu_lock for write.
>
> > Ideally though, NX reclaim could use MMU read lock +
> > tdp_mmu_pages_lock to protect the list and do reclaim in parallel with
> > everything else.
>
> Yar, processing all legacy MMU pages, and then all TDP MMU pages to avoid some
> of these dependencies crossed my mind.  But, it's hard to justify effectively
> walking the list twice.  And maintaining two lists might lead to balancing
> issues, e.g. the legacy MMU and thus nested VMs get zapped more often than the
> TDP MMU, or vice versa.

I think in an earlier version of the TDP MMU series that I sent out, NX
reclaim ran as a separate thread for each of the two MMUs, sidestepping
the balance issue.
I think the TDP MMU also had a separate NX reclaim list.
That would also make it easier to do something under the read lock.

>
> > The nice thing about drawing the TDP MMU interface in terms of GFNs
> > and address space IDs instead of SPs is that it doesn't put
> > constraints on the implementation of the TDP MMU because those GFNs
> > are always going to be valid / don't require any shared memory.
> > This is kind of innocuous because it's immediately converted into that
> > gfn interface, so I don't know how much it really matters.
> >
> > In any case this change looks correct and I don't want to hold up
> > progress with bikeshedding.
> > WDYT?
>
> I think we're kind of hosed either way.  Either we add a helper in the TDP MMU
> that takes a SP, or we bleed a lot of information about the details of TDP MMU
> into the common MMU.  E.g. the function could be open-coded verbatim, but the
> whole comment below, and the motivation for not feeding in flush is very
> dependent on the internal details of TDP MMU.
>
> I don't have a super strong preference.  One thought would be to assert that
> mmu_lock is held for write, and then it largely come future person's problem 
> :-)

Yeah, I agree and I'm happy to kick this proverbial can down the road
until we actually add an NX reclaim implementation that uses the MMU
read lock.

>
> > > +{
> > > +   gfn_t end = sp->gfn + KVM_PAGES_PER_HPAGE(sp->role.level);
> > > +
> > > +   /*
> > > +* Don't allow yielding, as the caller may have a flush pending.  
> > > Note,
> > > +* if mmu_lock is held for write, zapping will never yield in 
> > > this case,
> > > +* but explicitly disallow it for safety.  The TDP MMU does not 
> > > yield
> > > +* until it has made forward progress (steps sideways), and when 
> > > zapping
> > > +* a single shadow page that it's guaranteed to see (thus the 
> > > mmu_lock
> > > +* requirement), its "step sideways" will always step beyond the 
> > > bounds
> > > +* of the shadow page's gfn range and stop iterating before 
> > > yielding.
> > > +*/
> > > +   return __kvm_tdp_mmu_zap_gfn_range(kvm, sp->gfn, end, false);
> > > +}
> > >  void kvm_tdp_mmu_zap_all(struct kvm *kvm);
> > >
> > >  int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
> > > --
> > > 2.31.0.291.g576ba9dcdaf-goog
> > >


Re: [PATCH v2 2/3] KVM: x86/mmu: Ensure TLBs are flushed for TDP MMU during NX zapping

2021-03-25 Thread Ben Gardon
On Thu, Mar 25, 2021 at 1:01 PM Sean Christopherson  wrote:
>
> Honor the "flush needed" return from kvm_tdp_mmu_zap_gfn_range(), which
> does the flush itself if and only if it yields (which it will never do in
> this particular scenario), and otherwise expects the caller to do the
> flush.  If pages are zapped from the TDP MMU but not the legacy MMU, then
> no flush will occur.
>
> Fixes: 29cf0f5007a2 ("kvm: x86/mmu: NX largepage recovery for TDP MMU")
> Cc: sta...@vger.kernel.org
> Cc: Ben Gardon 
> Signed-off-by: Sean Christopherson 

Reviewed-by: Ben Gardon 

> ---
>  arch/x86/kvm/mmu/mmu.c | 11 +++
>  1 file changed, 7 insertions(+), 4 deletions(-)
>
> diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
> index c6ed633594a2..5a53743b37bc 100644
> --- a/arch/x86/kvm/mmu/mmu.c
> +++ b/arch/x86/kvm/mmu/mmu.c
> @@ -5939,6 +5939,8 @@ static void kvm_recover_nx_lpages(struct kvm *kvm)
> struct kvm_mmu_page *sp;
> unsigned int ratio;
> LIST_HEAD(invalid_list);
> +   bool flush = false;
> +   gfn_t gfn_end;
> ulong to_zap;
>
> rcu_idx = srcu_read_lock(&kvm->srcu);
> @@ -5960,19 +5962,20 @@ static void kvm_recover_nx_lpages(struct kvm *kvm)
>   lpage_disallowed_link);
> WARN_ON_ONCE(!sp->lpage_disallowed);
> if (is_tdp_mmu_page(sp)) {
> -   kvm_tdp_mmu_zap_gfn_range(kvm, sp->gfn,
> -   sp->gfn + 
> KVM_PAGES_PER_HPAGE(sp->role.level));
> +   gfn_end = sp->gfn + 
> KVM_PAGES_PER_HPAGE(sp->role.level);
> +   flush = kvm_tdp_mmu_zap_gfn_range(kvm, sp->gfn, 
> gfn_end);
> } else {
> kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list);
> WARN_ON_ONCE(sp->lpage_disallowed);
> }
>
> if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) {
> -   kvm_mmu_commit_zap_page(kvm, &invalid_list);
> +   kvm_mmu_remote_flush_or_zap(kvm, &invalid_list, 
> flush);
> cond_resched_rwlock_write(&kvm->mmu_lock);
> +   flush = false;
> }
> }
> -   kvm_mmu_commit_zap_page(kvm, &invalid_list);
> +   kvm_mmu_remote_flush_or_zap(kvm, &invalid_list, flush);
>
> write_unlock(&kvm->mmu_lock);
> srcu_read_unlock(&kvm->srcu, rcu_idx);
> --
> 2.31.0.291.g576ba9dcdaf-goog
>


Re: [PATCH v2 3/3] KVM: x86/mmu: Don't allow TDP MMU to yield when recovering NX pages

2021-03-25 Thread Ben Gardon
On Thu, Mar 25, 2021 at 1:01 PM Sean Christopherson  wrote:
>
> Prevent the TDP MMU from yielding when zapping a gfn range during NX
> page recovery.  If a flush is pending from a previous invocation of the
> zapping helper, either in the TDP MMU or the legacy MMU, but the TDP MMU
> has not accumulated a flush for the current invocation, then yielding
> will release mmu_lock with stale TLB entriesr

Extra r here.

>
> That being said, this isn't technically a bug fix in the current code, as
> the TDP MMU will never yield in this case.  tdp_mmu_iter_cond_resched()
> will yield if and only if it has made forward progress, as defined by the
> current gfn vs. the last yielded (or starting) gfn.  Because zapping a
> single shadow page is guaranteed to (a) find that page and (b) step
> sideways at the level of the shadow page, the TDP iter will break its loop
> before getting a chance to yield.
>
> But that is all very, very subtle, and will break at the slightest sneeze,
> e.g. zapping while holding mmu_lock for read would break as the TDP MMU
> wouldn't be guaranteed to see the present shadow page, and thus could step
> sideways at a lower level.
>
> Cc: Ben Gardon 
> Signed-off-by: Sean Christopherson 
> ---
>  arch/x86/kvm/mmu/mmu.c |  4 +---
>  arch/x86/kvm/mmu/tdp_mmu.c |  5 +++--
>  arch/x86/kvm/mmu/tdp_mmu.h | 23 ++-
>  3 files changed, 26 insertions(+), 6 deletions(-)
>
> diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
> index 5a53743b37bc..7a99e59c8c1c 100644
> --- a/arch/x86/kvm/mmu/mmu.c
> +++ b/arch/x86/kvm/mmu/mmu.c
> @@ -5940,7 +5940,6 @@ static void kvm_recover_nx_lpages(struct kvm *kvm)
> unsigned int ratio;
> LIST_HEAD(invalid_list);
> bool flush = false;
> -   gfn_t gfn_end;
> ulong to_zap;
>
> rcu_idx = srcu_read_lock(&kvm->srcu);
> @@ -5962,8 +5961,7 @@ static void kvm_recover_nx_lpages(struct kvm *kvm)
>   lpage_disallowed_link);
> WARN_ON_ONCE(!sp->lpage_disallowed);
> if (is_tdp_mmu_page(sp)) {
> -   gfn_end = sp->gfn + 
> KVM_PAGES_PER_HPAGE(sp->role.level);
> -   flush = kvm_tdp_mmu_zap_gfn_range(kvm, sp->gfn, 
> gfn_end);
> +   flush = kvm_tdp_mmu_zap_sp(kvm, sp);
> } else {
> kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list);
> WARN_ON_ONCE(sp->lpage_disallowed);
> diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
> index 6cf08c3c537f..08667e3cf091 100644
> --- a/arch/x86/kvm/mmu/tdp_mmu.c
> +++ b/arch/x86/kvm/mmu/tdp_mmu.c
> @@ -709,13 +709,14 @@ static bool zap_gfn_range(struct kvm *kvm, struct 
> kvm_mmu_page *root,
>   * SPTEs have been cleared and a TLB flush is needed before releasing the
>   * MMU lock.
>   */
> -bool kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, gfn_t start, gfn_t end)
> +bool __kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, gfn_t start, gfn_t end,
> +bool can_yield)
>  {
> struct kvm_mmu_page *root;
> bool flush = false;
>
> for_each_tdp_mmu_root_yield_safe(kvm, root)
> -   flush = zap_gfn_range(kvm, root, start, end, true, flush);
> +   flush = zap_gfn_range(kvm, root, start, end, can_yield, 
> flush);
>
> return flush;
>  }
> diff --git a/arch/x86/kvm/mmu/tdp_mmu.h b/arch/x86/kvm/mmu/tdp_mmu.h
> index 3b761c111bff..715aa4e0196d 100644
> --- a/arch/x86/kvm/mmu/tdp_mmu.h
> +++ b/arch/x86/kvm/mmu/tdp_mmu.h
> @@ -8,7 +8,28 @@
>  hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu);
>  void kvm_tdp_mmu_free_root(struct kvm *kvm, struct kvm_mmu_page *root);
>
> -bool kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, gfn_t start, gfn_t end);
> +bool __kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, gfn_t start, gfn_t end,
> +bool can_yield);
> +static inline bool kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, gfn_t start,
> +gfn_t end)
> +{
> +   return __kvm_tdp_mmu_zap_gfn_range(kvm, start, end, true);
> +}
> +static inline bool kvm_tdp_mmu_zap_sp(struct kvm *kvm, struct kvm_mmu_page 
> *sp)

I'm a little leary of adding an interface which takes a non-root
struct kvm_mmu_page as an argument to the TDP MMU.
In the TDP MMU, the struct kvm_mmu_pages are protected rather subtly.
I agree this is safe because we hold the MMU lock in write mode here,
but if we ever wanted to convert to holding it in read mode things
could get complicated fast.
Maybe this is more of a concern if the function started to be used
elsewhere since NX recovery is already so dependent on the write lock.

Re: [PATCH 2/2] KVM: x86/mmu: Ensure TLBs are flushed when yielding during NX zapping

2021-03-23 Thread Ben Gardon
On Tue, Mar 23, 2021 at 11:58 AM Sean Christopherson  wrote:
>
> On Tue, Mar 23, 2021, Ben Gardon wrote:
> > On Mon, Mar 22, 2021 at 5:15 PM Sean Christopherson  
> > wrote:
> > >
> > > On Mon, Mar 22, 2021, Ben Gardon wrote:
> > > > It could be fixed by forbidding kvm_tdp_mmu_zap_gfn_range from
> > > > yielding. Since we should only need to zap one SPTE, the yield should
> > > > not be needed within the kvm_tdp_mmu_zap_gfn_range call. To ensure
> > > > that only one SPTE is zapped we would have to specify the root though.
> > > > Otherwise we could end up zapping all the entries for the same GFN
> > > > range under an unrelated root.
> > >
> > > Hmm, I originally did exactly that, but changed my mind because this zaps 
> > > far
> > > more than 1 SPTE.  This is zapping a SP that could be huge, but is not, 
> > > which
> > > means it's guaranteed to have a non-zero number of child SPTEs.  The 
> > > worst case
> > > scenario is that SP is a PUD (potential 1gb page) and the leafs are 4k 
> > > SPTEs.
> >
> > It's true that there are potentially 512^2 child sptes, but the code
> > to clear those after the single PUD spte is cleared doesn't yield
> > anyway. If the TDP MMU is only  operating with one root (as we would
> > expect in most cases), there should only be one chance for it to
> > yield.
>
> Ah, right, I was thinking all the iterative flows yielded.  Disallowing
> kvm_tdp_mmu_zap_gfn_range() from yielding in this case does seem like the best
> fix.  Any objection to me sending v2 with that?

That sounds good to me.

>
> > I've considered how we could allow the recursive changed spte handlers
> > to yield, but it gets complicated quite fast because the caller needs
> > to know if it yielded and reset the TDP iterator to the root, and
> > there are some cases (mmu notifiers + vCPU path) where yielding is not
> > desirable.
>
> Urgh, yeah, seems like we'd quickly end up with a mess resembling the legacy 
> MMU
> iterators.
>
> > >
> > > But, I didn't consider the interplay between invalid_list and the TDP MMU
> > > yielding.  Hrm.


Re: [PATCH 2/2] KVM: x86/mmu: Ensure TLBs are flushed when yielding during NX zapping

2021-03-23 Thread Ben Gardon
On Mon, Mar 22, 2021 at 5:15 PM Sean Christopherson  wrote:
>
> On Mon, Mar 22, 2021, Ben Gardon wrote:
> > On Fri, Mar 19, 2021 at 4:20 PM Sean Christopherson  
> > wrote:
> > > @@ -5960,19 +5963,21 @@ static void kvm_recover_nx_lpages(struct kvm *kvm)
> > >   lpage_disallowed_link);
> > > WARN_ON_ONCE(!sp->lpage_disallowed);
> > > if (is_tdp_mmu_page(sp)) {
> > > -   kvm_tdp_mmu_zap_gfn_range(kvm, sp->gfn,
> > > -   sp->gfn + 
> > > KVM_PAGES_PER_HPAGE(sp->role.level));
> > > +   gfn_end = sp->gfn + 
> > > KVM_PAGES_PER_HPAGE(sp->role.level);
> > > +   flush = kvm_tdp_mmu_zap_gfn_range(kvm, sp->gfn, 
> > > gfn_end,
> > > + flush || 
> > > !list_empty(&invalid_list));
> > > } else {
> > > kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list);
> > > WARN_ON_ONCE(sp->lpage_disallowed);
> > > }
> > >
> > > if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) {
> > > -   kvm_mmu_commit_zap_page(kvm, &invalid_list);
> > > +   kvm_mmu_remote_flush_or_zap(kvm, &invalid_list, 
> > > flush);
> >
> > This pattern of waiting until a yield is needed or lock contention is
> > detected has always been a little suspect to me because
> > kvm_mmu_commit_zap_page does work proportional to the work done before
> > the yield was needed. That seems like more work than we should like to
> > be doing at that point.
> >
> > The yield in kvm_tdp_mmu_zap_gfn_range makes that phenomenon even
> > worse. Because we can satisfy the need to yield without clearing out
> > the invalid list, we can potentially queue many more pages which will
> > then all need to have their zaps committed at once. This is an
> > admittedly contrived case which could only be hit in a high load
> > nested scenario.
> >
> > It could be fixed by forbidding kvm_tdp_mmu_zap_gfn_range from
> > yielding. Since we should only need to zap one SPTE, the yield should
> > not be needed within the kvm_tdp_mmu_zap_gfn_range call. To ensure
> > that only one SPTE is zapped we would have to specify the root though.
> > Otherwise we could end up zapping all the entries for the same GFN
> > range under an unrelated root.
>
> Hmm, I originally did exactly that, but changed my mind because this zaps far
> more than 1 SPTE.  This is zapping a SP that could be huge, but is not, which
> means it's guaranteed to have a non-zero number of child SPTEs.  The worst 
> case
> scenario is that SP is a PUD (potential 1gb page) and the leafs are 4k SPTEs.

It's true that there are potentially 512^2 child sptes, but the code
to clear those after the single PUD spte is cleared doesn't yield
anyway. If the TDP MMU is only  operating with one root (as we would
expect in most cases), there should only be one chance for it to
yield.

I've considered how we could allow the recursive changed spte handlers
to yield, but it gets complicated quite fast because the caller needs
to know if it yielded and reset the TDP iterator to the root, and
there are some cases (mmu notifiers + vCPU path) where yielding is not
desirable.

>
> But, I didn't consider the interplay between invalid_list and the TDP MMU
> yielding.  Hrm.


Re: [PATCH 1/2] KVM: x86/mmu: Ensure TLBs are flushed when yielding during GFN range zap

2021-03-22 Thread Ben Gardon
On Fri, Mar 19, 2021 at 4:20 PM Sean Christopherson  wrote:
>
> When flushing a range of GFNs across multiple roots, ensure any pending
> flush from a previous root is honored before yielding while walking the
> tables of the current root.
>
> Note, kvm_tdp_mmu_zap_gfn_range() now intentionally overwrites it local
> "flush" with the result to avoid redundant flushes.  zap_gfn_range()
> preserves and return the incoming "flush", unless of course the flush was
> performed prior to yielding and no new flush was triggered.
>
> Fixes: 1af4a96025b3 ("KVM: x86/mmu: Yield in TDU MMU iter even if no SPTES 
> changed")
> Cc: sta...@vger.kernel.org
> Cc: Ben Gardon 
> Signed-off-by: Sean Christopherson 

Reviewed-By: Ben Gardon 

> ---
>  arch/x86/kvm/mmu/tdp_mmu.c | 23 ---
>  1 file changed, 12 insertions(+), 11 deletions(-)
>
> diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
> index f0c99fa04ef2..6cf08c3c537f 100644
> --- a/arch/x86/kvm/mmu/tdp_mmu.c
> +++ b/arch/x86/kvm/mmu/tdp_mmu.c
> @@ -86,7 +86,7 @@ static inline struct kvm_mmu_page *tdp_mmu_next_root(struct 
> kvm *kvm,
> list_for_each_entry(_root, &_kvm->arch.tdp_mmu_roots, link)
>
>  static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
> - gfn_t start, gfn_t end, bool can_yield);
> + gfn_t start, gfn_t end, bool can_yield, bool flush);

This function is going to acquire so many arguments. Don't need to do
anything about it here, but this is going to need some kind of cleanup
at some point.
I'll have to add another "shared" type arg for running this function
under the read lock in a series I'm prepping.
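
One hypothetical shape such a cleanup could take (a sketch only, not
something being proposed in this thread): bundle the flags into a small
struct so that new behaviour bits stop widening every call site.

/* Hypothetical: gather the zap parameters, so adding e.g. a 'shared'
 * (read-lock) flag only touches the struct, not every caller.
 */
struct tdp_zap_args {
	gfn_t start;
	gfn_t end;
	bool can_yield;	/* may the walk reschedule? */
	bool flush;	/* TLB flush already pending from the caller? */
	bool shared;	/* assumed future flag: running under mmu_lock for read */
};

/* e.g. zap_gfn_range(kvm, root, &args) instead of a growing argument list */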


>
>  void kvm_tdp_mmu_free_root(struct kvm *kvm, struct kvm_mmu_page *root)
>  {
> @@ -99,7 +99,7 @@ void kvm_tdp_mmu_free_root(struct kvm *kvm, struct 
> kvm_mmu_page *root)
>
> list_del(&root->link);
>
> -   zap_gfn_range(kvm, root, 0, max_gfn, false);
> +   zap_gfn_range(kvm, root, 0, max_gfn, false, false);
>
> free_page((unsigned long)root->spt);
> kmem_cache_free(mmu_page_header_cache, root);
> @@ -664,20 +664,21 @@ static inline bool tdp_mmu_iter_cond_resched(struct kvm 
> *kvm,
>   * scheduler needs the CPU or there is contention on the MMU lock. If this
>   * function cannot yield, it will not release the MMU lock or reschedule and
>   * the caller must ensure it does not supply too large a GFN range, or the
> - * operation can cause a soft lockup.
> + * operation can cause a soft lockup.  Note, in some use cases a flush may be
> + * required by prior actions.  Ensure the pending flush is performed prior to
> + * yielding.
>   */
>  static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
> - gfn_t start, gfn_t end, bool can_yield)
> + gfn_t start, gfn_t end, bool can_yield, bool flush)
>  {
> struct tdp_iter iter;
> -   bool flush_needed = false;
>
> rcu_read_lock();
>
> tdp_root_for_each_pte(iter, root, start, end) {
> if (can_yield &&
> -   tdp_mmu_iter_cond_resched(kvm, &iter, flush_needed)) {
> -   flush_needed = false;
> +   tdp_mmu_iter_cond_resched(kvm, &iter, flush)) {
> +   flush = false;
> continue;
> }
>
> @@ -695,11 +696,11 @@ static bool zap_gfn_range(struct kvm *kvm, struct 
> kvm_mmu_page *root,
> continue;
>
> tdp_mmu_set_spte(kvm, &iter, 0);
> -   flush_needed = true;
> +   flush = true;
> }
>
> rcu_read_unlock();
> -   return flush_needed;
> +   return flush;
>  }
>
>  /*
> @@ -714,7 +715,7 @@ bool kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, gfn_t 
> start, gfn_t end)
> bool flush = false;
>
> for_each_tdp_mmu_root_yield_safe(kvm, root)
> -   flush |= zap_gfn_range(kvm, root, start, end, true);
> +   flush = zap_gfn_range(kvm, root, start, end, true, flush);
>
> return flush;
>  }
> @@ -931,7 +932,7 @@ static int zap_gfn_range_hva_wrapper(struct kvm *kvm,
>  struct kvm_mmu_page *root, gfn_t start,
>  gfn_t end, unsigned long unused)
>  {
> -   return zap_gfn_range(kvm, root, start, end, false);
> +   return zap_gfn_range(kvm, root, start, end, false, false);
>  }
>
>  int kvm_tdp_mmu_zap_hva_range(struct kvm *kvm, unsigned long start,
> --
> 2.31.0.rc2.261.g7f71774620-goog
>


Re: [PATCH 2/2] KVM: x86/mmu: Ensure TLBs are flushed when yielding during NX zapping

2021-03-22 Thread Ben Gardon
On Fri, Mar 19, 2021 at 4:20 PM Sean Christopherson  wrote:
>
> Fix two intertwined bugs in the NX huge page zapping that were introduced
> by the incorporation of the TDP MMU.  Because there is a unified list of
> NX huge pages, zapping can encounter both TDP MMU and legacy MMU pages,
> and the two MMUs have different tracking for TLB flushing.  If one flavor
> needs a flush, but the code for the other flavor yields, KVM will fail to
> flush before yielding.
>
> First, honor the "flush needed" return from kvm_tdp_mmu_zap_gfn_range(),
> which does the flush itself if and only if it yields, and otherwise
> expects the caller to do the flush.  This requires feeding the result
> into kvm_mmu_remote_flush_or_zap(), and so also fixes the case where the
> TDP MMU needs a flush, the legacy MMU does not, and the main loop yields.
>
> Second, tell the TDP MMU a flush is pending if the list of zapped pages
> from legacy MMUs is not empty, i.e. the legacy MMU needs a flush.  This
> fixes the case where the TDP MMU yields, but it iteslf does not require a
> flush.
>
> Fixes: 29cf0f5007a2 ("kvm: x86/mmu: NX largepage recovery for TDP MMU")
> Cc: sta...@vger.kernel.org
> Cc: Ben Gardon 
> Signed-off-by: Sean Christopherson 

Reviewed-By: Ben Gardon 

This preserves an extremely unlikely degenerate case, which could
cause an unexpected delay.
The scenario is described below, but I don't think this change needs
to be blocked on it.

> ---
>  arch/x86/kvm/mmu/mmu.c | 15 ++-
>  arch/x86/kvm/mmu/tdp_mmu.c |  6 +++---
>  arch/x86/kvm/mmu/tdp_mmu.h |  3 ++-
>  3 files changed, 15 insertions(+), 9 deletions(-)
>
> diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
> index c6ed633594a2..413d6259340e 100644
> --- a/arch/x86/kvm/mmu/mmu.c
> +++ b/arch/x86/kvm/mmu/mmu.c
> @@ -5517,7 +5517,8 @@ void kvm_zap_gfn_range(struct kvm *kvm, gfn_t 
> gfn_start, gfn_t gfn_end)
> }
>
> if (is_tdp_mmu_enabled(kvm)) {
> -   flush = kvm_tdp_mmu_zap_gfn_range(kvm, gfn_start, gfn_end);
> +   flush = kvm_tdp_mmu_zap_gfn_range(kvm, gfn_start, gfn_end,
> + false);
> if (flush)
> kvm_flush_remote_tlbs(kvm);
> }
> @@ -5939,6 +5940,8 @@ static void kvm_recover_nx_lpages(struct kvm *kvm)
> struct kvm_mmu_page *sp;
> unsigned int ratio;
> LIST_HEAD(invalid_list);
> +   bool flush = false;
> +   gfn_t gfn_end;
> ulong to_zap;
>
> rcu_idx = srcu_read_lock(&kvm->srcu);
> @@ -5960,19 +5963,21 @@ static void kvm_recover_nx_lpages(struct kvm *kvm)
>   lpage_disallowed_link);
> WARN_ON_ONCE(!sp->lpage_disallowed);
> if (is_tdp_mmu_page(sp)) {
> -   kvm_tdp_mmu_zap_gfn_range(kvm, sp->gfn,
> -   sp->gfn + 
> KVM_PAGES_PER_HPAGE(sp->role.level));
> +   gfn_end = sp->gfn + 
> KVM_PAGES_PER_HPAGE(sp->role.level);
> +   flush = kvm_tdp_mmu_zap_gfn_range(kvm, sp->gfn, 
> gfn_end,
> + flush || 
> !list_empty(&invalid_list));
> } else {
> kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list);
> WARN_ON_ONCE(sp->lpage_disallowed);
> }
>
> if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) {
> -   kvm_mmu_commit_zap_page(kvm, &invalid_list);
> +   kvm_mmu_remote_flush_or_zap(kvm, &invalid_list, 
> flush);

This pattern of waiting until a yield is needed or lock contention is
detected has always been a little suspect to me because
kvm_mmu_commit_zap_page does work proportional to the work done before
the yield was needed. That seems like more work than we should like to
be doing at that point.

The yield in kvm_tdp_mmu_zap_gfn_range makes that phenomenon even
worse. Because we can satisfy the need to yield without clearing out
the invalid list, we can potentially queue many more pages which will
then all need to have their zaps committed at once. This is an
admittedly contrived case which could only be hit in a high load
nested scenario.

It could be fixed by forbidding kvm_tdp_mmu_zap_gfn_range from
yielding. Since we should only need to zap one SPTE, the yield should
not be needed within the kvm_tdp_mmu_zap_gfn_range call. To ensure
that only one SPTE is zapped we would have to specify the root though.
Otherwise we could end up zapping all the entries for the same GFN
range under an unrelated root.

An

[PATCH v3 4/4] KVM: x86/mmu: Store the address space ID in the TDP iterator

2021-03-15 Thread Ben Gardon
From: Sean Christopherson 

Store the address space ID in the TDP iterator so that it can be
retrieved without having to bounce through the root shadow page.  This
streamlines the code and fixes a Sparse warning about not properly using
rcu_dereference() when grabbing the ID from the root on the fly.

Reported-by: kernel test robot 
Signed-off-by: Sean Christopherson 
Signed-off-by: Ben Gardon 
---
 arch/x86/kvm/mmu/mmu_internal.h |  5 +
 arch/x86/kvm/mmu/tdp_iter.c |  6 +-
 arch/x86/kvm/mmu/tdp_iter.h |  3 ++-
 arch/x86/kvm/mmu/tdp_mmu.c  | 23 +--
 4 files changed, 13 insertions(+), 24 deletions(-)

diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h
index ec4fc28b325a..1f6f98c76bdf 100644
--- a/arch/x86/kvm/mmu/mmu_internal.h
+++ b/arch/x86/kvm/mmu/mmu_internal.h
@@ -78,6 +78,11 @@ static inline struct kvm_mmu_page *sptep_to_sp(u64 *sptep)
return to_shadow_page(__pa(sptep));
 }
 
+static inline int kvm_mmu_page_as_id(struct kvm_mmu_page *sp)
+{
+   return sp->role.smm ? 1 : 0;
+}
+
 static inline bool kvm_vcpu_ad_need_write_protect(struct kvm_vcpu *vcpu)
 {
/*
diff --git a/arch/x86/kvm/mmu/tdp_iter.c b/arch/x86/kvm/mmu/tdp_iter.c
index f7f94ea65243..b3ed302c1a35 100644
--- a/arch/x86/kvm/mmu/tdp_iter.c
+++ b/arch/x86/kvm/mmu/tdp_iter.c
@@ -49,6 +49,7 @@ void tdp_iter_start(struct tdp_iter *iter, u64 *root_pt, int 
root_level,
iter->root_level = root_level;
iter->min_level = min_level;
iter->pt_path[iter->root_level - 1] = (tdp_ptep_t)root_pt;
+   iter->as_id = kvm_mmu_page_as_id(sptep_to_sp(root_pt));
 
tdp_iter_restart(iter);
 }
@@ -169,8 +170,3 @@ void tdp_iter_next(struct tdp_iter *iter)
iter->valid = false;
 }
 
-tdp_ptep_t tdp_iter_root_pt(struct tdp_iter *iter)
-{
-   return iter->pt_path[iter->root_level - 1];
-}
-
diff --git a/arch/x86/kvm/mmu/tdp_iter.h b/arch/x86/kvm/mmu/tdp_iter.h
index 8eb424d17c91..b1748b988d3a 100644
--- a/arch/x86/kvm/mmu/tdp_iter.h
+++ b/arch/x86/kvm/mmu/tdp_iter.h
@@ -36,6 +36,8 @@ struct tdp_iter {
int min_level;
/* The iterator's current level within the paging structure */
int level;
+   /* The address space ID, i.e. SMM vs. regular. */
+   int as_id;
/* A snapshot of the value at sptep */
u64 old_spte;
/*
@@ -62,7 +64,6 @@ tdp_ptep_t spte_to_child_pt(u64 pte, int level);
 void tdp_iter_start(struct tdp_iter *iter, u64 *root_pt, int root_level,
int min_level, gfn_t next_last_level_gfn);
 void tdp_iter_next(struct tdp_iter *iter);
-tdp_ptep_t tdp_iter_root_pt(struct tdp_iter *iter);
 void tdp_iter_restart(struct tdp_iter *iter);
 
 #endif /* __KVM_X86_MMU_TDP_ITER_H */
diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index 38b6b6936171..462b1f71c77f 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -203,11 +203,6 @@ static void handle_changed_spte(struct kvm *kvm, int 
as_id, gfn_t gfn,
u64 old_spte, u64 new_spte, int level,
bool shared);
 
-static int kvm_mmu_page_as_id(struct kvm_mmu_page *sp)
-{
-   return sp->role.smm ? 1 : 0;
-}
-
 static void handle_changed_spte_acc_track(u64 old_spte, u64 new_spte, int 
level)
 {
bool pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte);
@@ -497,10 +492,6 @@ static inline bool tdp_mmu_set_spte_atomic(struct kvm *kvm,
   struct tdp_iter *iter,
   u64 new_spte)
 {
-   u64 *root_pt = tdp_iter_root_pt(iter);
-   struct kvm_mmu_page *root = sptep_to_sp(root_pt);
-   int as_id = kvm_mmu_page_as_id(root);
-
lockdep_assert_held_read(&kvm->mmu_lock);
 
/*
@@ -514,8 +505,8 @@ static inline bool tdp_mmu_set_spte_atomic(struct kvm *kvm,
  new_spte) != iter->old_spte)
return false;
 
-   handle_changed_spte(kvm, as_id, iter->gfn, iter->old_spte, new_spte,
-   iter->level, true);
+   handle_changed_spte(kvm, iter->as_id, iter->gfn, iter->old_spte,
+   new_spte, iter->level, true);
 
return true;
 }
@@ -569,10 +560,6 @@ static inline void __tdp_mmu_set_spte(struct kvm *kvm, 
struct tdp_iter *iter,
  u64 new_spte, bool record_acc_track,
  bool record_dirty_log)
 {
-   tdp_ptep_t root_pt = tdp_iter_root_pt(iter);
-   struct kvm_mmu_page *root = sptep_to_sp(root_pt);
-   int as_id = kvm_mmu_page_as_id(root);
-
lockdep_assert_held_write(&kvm->mmu_lock);
 
/*
@@ -586,13 +573,13 @@ static inline void __tdp_mmu_set_spte(struct kvm *kvm, 
struct tdp_iter *iter,
 
WRITE_ONCE(*rcu_dereference(iter->sptep), new_spte);
 
-

[PATCH v3 1/4] KVM: x86/mmu: Fix RCU usage in handle_removed_tdp_mmu_page

2021-03-15 Thread Ben Gardon
The pt passed into handle_removed_tdp_mmu_page does not need RCU
protection, as it is not at any risk of being freed by another thread at
that point. However, the implicit cast from tdp_ptep_t to u64 * dropped
the __rcu annotation without a proper rcu_dereference. Fix this by
passing the pt as a tdp_ptep_t and then rcu_dereferencing it in
the function.
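
A minimal sketch of the sparse complaint being addressed (an assumed
example, not the robot's exact warning text): assigning an
__rcu-annotated pointer to a plain pointer without rcu_dereference()
trips sparse's address-space check.

/* Hypothetical illustration; tdp_ptep_t carries the __rcu annotation. */
typedef u64 __rcu *tdp_ptep_t;

static void example(tdp_ptep_t pt)
{
	u64 *plain;

	plain = pt;			/* sparse: incorrect type (different address spaces) */
	plain = rcu_dereference(pt);	/* OK: explicitly strips the __rcu annotation */
}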

Suggested-by: Sean Christopherson 
Reported-by: kernel test robot 
Signed-off-by: Ben Gardon 
---
 arch/x86/kvm/mmu/tdp_mmu.c | 11 ---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index d78915019b08..db2936cca4bf 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -301,11 +301,16 @@ static void tdp_mmu_unlink_page(struct kvm *kvm, struct 
kvm_mmu_page *sp,
  *
  * Given a page table that has been removed from the TDP paging structure,
  * iterates through the page table to clear SPTEs and free child page tables.
+ *
+ * Note that pt is passed in as a tdp_ptep_t, but it does not need RCU
+ * protection. Since this thread removed it from the paging structure,
+ * this thread will be responsible for ensuring the page is freed. Hence the
+ * early rcu_dereferences in the function.
  */
-static void handle_removed_tdp_mmu_page(struct kvm *kvm, u64 *pt,
+static void handle_removed_tdp_mmu_page(struct kvm *kvm, tdp_ptep_t pt,
bool shared)
 {
-   struct kvm_mmu_page *sp = sptep_to_sp(pt);
+   struct kvm_mmu_page *sp = sptep_to_sp(rcu_dereference(pt));
int level = sp->role.level;
gfn_t base_gfn = sp->gfn;
u64 old_child_spte;
@@ -318,7 +323,7 @@ static void handle_removed_tdp_mmu_page(struct kvm *kvm, 
u64 *pt,
tdp_mmu_unlink_page(kvm, sp, shared);
 
for (i = 0; i < PT64_ENT_PER_PAGE; i++) {
-   sptep = pt + i;
+   sptep = rcu_dereference(pt) + i;
gfn = base_gfn + (i * KVM_PAGES_PER_HPAGE(level - 1));
 
if (shared) {
-- 
2.31.0.rc2.261.g7f71774620-goog



[PATCH v3 3/4] KVM: x86/mmu: Factor out tdp_iter_return_to_root

2021-03-15 Thread Ben Gardon
In tdp_mmu_iter_cond_resched there is a call to tdp_iter_start which
causes the iterator to continue its walk over the paging structure from
the root. This is needed after a yield, as the paging structure could have
been freed in the interim.

The tdp_iter_start call is not very clear and something of a hack. It
requires exposing tdp_iter fields not used elsewhere in tdp_mmu.c and
the effect is not obvious from the function name. Factor a more aptly
named function out of tdp_iter_start and call it from
tdp_mmu_iter_cond_resched and tdp_iter_start.

No functional change intended.

Signed-off-by: Ben Gardon 
---
 arch/x86/kvm/mmu/tdp_iter.c | 24 +---
 arch/x86/kvm/mmu/tdp_iter.h |  1 +
 arch/x86/kvm/mmu/tdp_mmu.c  |  4 +---
 3 files changed, 19 insertions(+), 10 deletions(-)

diff --git a/arch/x86/kvm/mmu/tdp_iter.c b/arch/x86/kvm/mmu/tdp_iter.c
index e5f148106e20..f7f94ea65243 100644
--- a/arch/x86/kvm/mmu/tdp_iter.c
+++ b/arch/x86/kvm/mmu/tdp_iter.c
@@ -20,6 +20,21 @@ static gfn_t round_gfn_for_level(gfn_t gfn, int level)
return gfn & -KVM_PAGES_PER_HPAGE(level);
 }
 
+/*
+ * Return the TDP iterator to the root PT and allow it to continue its
+ * traversal over the paging structure from there.
+ */
+void tdp_iter_restart(struct tdp_iter *iter)
+{
+   iter->yielded_gfn = iter->next_last_level_gfn;
+   iter->level = iter->root_level;
+
+   iter->gfn = round_gfn_for_level(iter->next_last_level_gfn, iter->level);
+   tdp_iter_refresh_sptep(iter);
+
+   iter->valid = true;
+}
+
 /*
  * Sets a TDP iterator to walk a pre-order traversal of the paging structure
  * rooted at root_pt, starting with the walk to translate next_last_level_gfn.
@@ -31,16 +46,11 @@ void tdp_iter_start(struct tdp_iter *iter, u64 *root_pt, 
int root_level,
WARN_ON(root_level > PT64_ROOT_MAX_LEVEL);
 
iter->next_last_level_gfn = next_last_level_gfn;
-   iter->yielded_gfn = iter->next_last_level_gfn;
iter->root_level = root_level;
iter->min_level = min_level;
-   iter->level = root_level;
-   iter->pt_path[iter->level - 1] = (tdp_ptep_t)root_pt;
+   iter->pt_path[iter->root_level - 1] = (tdp_ptep_t)root_pt;
 
-   iter->gfn = round_gfn_for_level(iter->next_last_level_gfn, iter->level);
-   tdp_iter_refresh_sptep(iter);
-
-   iter->valid = true;
+   tdp_iter_restart(iter);
 }
 
 /*
diff --git a/arch/x86/kvm/mmu/tdp_iter.h b/arch/x86/kvm/mmu/tdp_iter.h
index 4cc177d75c4a..8eb424d17c91 100644
--- a/arch/x86/kvm/mmu/tdp_iter.h
+++ b/arch/x86/kvm/mmu/tdp_iter.h
@@ -63,5 +63,6 @@ void tdp_iter_start(struct tdp_iter *iter, u64 *root_pt, int 
root_level,
int min_level, gfn_t next_last_level_gfn);
 void tdp_iter_next(struct tdp_iter *iter);
 tdp_ptep_t tdp_iter_root_pt(struct tdp_iter *iter);
+void tdp_iter_restart(struct tdp_iter *iter);
 
 #endif /* __KVM_X86_MMU_TDP_ITER_H */
diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index 946da74e069c..38b6b6936171 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -664,9 +664,7 @@ static inline bool tdp_mmu_iter_cond_resched(struct kvm 
*kvm,
 
WARN_ON(iter->gfn > iter->next_last_level_gfn);
 
-   tdp_iter_start(iter, iter->pt_path[iter->root_level - 1],
-  iter->root_level, iter->min_level,
-  iter->next_last_level_gfn);
+   tdp_iter_restart(iter);
 
return true;
}
-- 
2.31.0.rc2.261.g7f71774620-goog



[PATCH v3 2/4] KVM: x86/mmu: Fix RCU usage when atomically zapping SPTEs

2021-03-15 Thread Ben Gardon
Fix a missing rcu_dereference in tdp_mmu_zap_spte_atomic.

Reported-by: kernel test robot 
Signed-off-by: Ben Gardon 
---
 arch/x86/kvm/mmu/tdp_mmu.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index db2936cca4bf..946da74e069c 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -543,7 +543,7 @@ static inline bool tdp_mmu_zap_spte_atomic(struct kvm *kvm,
 * here since the SPTE is going from non-present
 * to non-present.
 */
-   WRITE_ONCE(*iter->sptep, 0);
+   WRITE_ONCE(*rcu_dereference(iter->sptep), 0);
 
return true;
 }
-- 
2.31.0.rc2.261.g7f71774620-goog



[PATCH v3 0/4] Fix RCU warnings in TDP MMU

2021-03-15 Thread Ben Gardon
The Linux Test Robot found a few RCU warnings in the TDP MMU:
https://www.spinics.net/lists/kernel/msg3845500.html
https://www.spinics.net/lists/kernel/msg3845521.html

Fix these warnings and clean up a hack in tdp_mmu_iter_cond_resched.

Tested by compiling as suggested in the test robot report and confirmed
that the warnings go away with this series applied. Also ran
kvm-unit-tests on an Intel Skylake machine with the TDP MMU enabled and
confirmed that the series introduced no new failures.

Ben Gardon (3):
  KVM: x86/mmu: Fix RCU usage in handle_removed_tdp_mmu_page
  KVM: x86/mmu: Fix RCU usage when atomically zapping SPTEs
  KVM: x86/mmu: Factor out tdp_iter_return_to_root

Sean Christopherson (1):
  KVM: x86/mmu: Store the address space ID in the TDP iterator

 arch/x86/kvm/mmu/mmu_internal.h |  5 +
 arch/x86/kvm/mmu/tdp_iter.c | 30 +++--
 arch/x86/kvm/mmu/tdp_iter.h |  4 +++-
 arch/x86/kvm/mmu/tdp_mmu.c  | 40 +
 4 files changed, 41 insertions(+), 38 deletions(-)

-- 
2.31.0.rc2.261.g7f71774620-goog



[PATCH v2 3/4] KVM: x86/mmu: Factor out tdp_iter_return_to_root

2021-03-15 Thread Ben Gardon
In tdp_mmu_iter_cond_resched there is a call to tdp_iter_start which
causes the iterator to continue its walk over the paging structure from
the root. This is needed after a yield, as the paging structure could
have been freed in the interim.

The tdp_iter_start call is not very clear and is something of a hack. It
requires exposing tdp_iter fields not used elsewhere in tdp_mmu.c, and
its effect is not obvious from the function name. Factor a more aptly
named function out of tdp_iter_start and call it from
tdp_mmu_iter_cond_resched and tdp_iter_start.

No functional change intended.

Signed-off-by: Ben Gardon 
---
 arch/x86/kvm/mmu/tdp_iter.c | 24 +---
 arch/x86/kvm/mmu/tdp_iter.h |  1 +
 arch/x86/kvm/mmu/tdp_mmu.c  |  4 +---
 3 files changed, 19 insertions(+), 10 deletions(-)

diff --git a/arch/x86/kvm/mmu/tdp_iter.c b/arch/x86/kvm/mmu/tdp_iter.c
index e5f148106e20..f7f94ea65243 100644
--- a/arch/x86/kvm/mmu/tdp_iter.c
+++ b/arch/x86/kvm/mmu/tdp_iter.c
@@ -20,6 +20,21 @@ static gfn_t round_gfn_for_level(gfn_t gfn, int level)
return gfn & -KVM_PAGES_PER_HPAGE(level);
 }
 
+/*
+ * Return the TDP iterator to the root PT and allow it to continue its
+ * traversal over the paging structure from there.
+ */
+void tdp_iter_restart(struct tdp_iter *iter)
+{
+   iter->yielded_gfn = iter->next_last_level_gfn;
+   iter->level = iter->root_level;
+
+   iter->gfn = round_gfn_for_level(iter->next_last_level_gfn, iter->level);
+   tdp_iter_refresh_sptep(iter);
+
+   iter->valid = true;
+}
+
 /*
  * Sets a TDP iterator to walk a pre-order traversal of the paging structure
  * rooted at root_pt, starting with the walk to translate next_last_level_gfn.
@@ -31,16 +46,11 @@ void tdp_iter_start(struct tdp_iter *iter, u64 *root_pt, 
int root_level,
WARN_ON(root_level > PT64_ROOT_MAX_LEVEL);
 
iter->next_last_level_gfn = next_last_level_gfn;
-   iter->yielded_gfn = iter->next_last_level_gfn;
iter->root_level = root_level;
iter->min_level = min_level;
-   iter->level = root_level;
-   iter->pt_path[iter->level - 1] = (tdp_ptep_t)root_pt;
+   iter->pt_path[iter->root_level - 1] = (tdp_ptep_t)root_pt;
 
-   iter->gfn = round_gfn_for_level(iter->next_last_level_gfn, iter->level);
-   tdp_iter_refresh_sptep(iter);
-
-   iter->valid = true;
+   tdp_iter_restart(iter);
 }
 
 /*
diff --git a/arch/x86/kvm/mmu/tdp_iter.h b/arch/x86/kvm/mmu/tdp_iter.h
index 4cc177d75c4a..8eb424d17c91 100644
--- a/arch/x86/kvm/mmu/tdp_iter.h
+++ b/arch/x86/kvm/mmu/tdp_iter.h
@@ -63,5 +63,6 @@ void tdp_iter_start(struct tdp_iter *iter, u64 *root_pt, int 
root_level,
int min_level, gfn_t next_last_level_gfn);
 void tdp_iter_next(struct tdp_iter *iter);
 tdp_ptep_t tdp_iter_root_pt(struct tdp_iter *iter);
+void tdp_iter_restart(struct tdp_iter *iter);
 
 #endif /* __KVM_X86_MMU_TDP_ITER_H */
diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index 946da74e069c..38b6b6936171 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -664,9 +664,7 @@ static inline bool tdp_mmu_iter_cond_resched(struct kvm 
*kvm,
 
WARN_ON(iter->gfn > iter->next_last_level_gfn);
 
-   tdp_iter_start(iter, iter->pt_path[iter->root_level - 1],
-  iter->root_level, iter->min_level,
-  iter->next_last_level_gfn);
+   tdp_iter_restart(iter);
 
return true;
}
-- 
2.31.0.rc2.261.g7f71774620-goog



[PATCH v2 2/4] KVM: x86/mmu: Fix RCU usage when atomically zapping SPTEs

2021-03-15 Thread Ben Gardon
Fix a missing rcu_dereference in tdp_mmu_zap_spte_atomic.

Reported-by: kernel test robot 
Signed-off-by: Ben Gardon 
---
 arch/x86/kvm/mmu/tdp_mmu.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index db2936cca4bf..946da74e069c 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -543,7 +543,7 @@ static inline bool tdp_mmu_zap_spte_atomic(struct kvm *kvm,
 * here since the SPTE is going from non-present
 * to non-present.
 */
-   WRITE_ONCE(*iter->sptep, 0);
+   WRITE_ONCE(*rcu_dereference(iter->sptep), 0);
 
return true;
 }
-- 
2.31.0.rc2.261.g7f71774620-goog



[PATCH v2 4/4] KVM: x86/mmu: Store the address space ID in the TDP iterator

2021-03-15 Thread Ben Gardon
Store the address space ID in the TDP iterator so that it can be
retrieved without having to bounce through the root shadow page.  This
streamlines the code and fixes a Sparse warning about not properly using
rcu_dereference() when grabbing the ID from the root on the fly.

Reported-by: kernel test robot 
Signed-off-by: Sean Christopherson 
Signed-off-by: Ben Gardon 
---
 arch/x86/kvm/mmu/mmu_internal.h |  5 +
 arch/x86/kvm/mmu/tdp_iter.c |  6 +-
 arch/x86/kvm/mmu/tdp_iter.h |  3 ++-
 arch/x86/kvm/mmu/tdp_mmu.c  | 23 +--
 4 files changed, 13 insertions(+), 24 deletions(-)

diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h
index ec4fc28b325a..1f6f98c76bdf 100644
--- a/arch/x86/kvm/mmu/mmu_internal.h
+++ b/arch/x86/kvm/mmu/mmu_internal.h
@@ -78,6 +78,11 @@ static inline struct kvm_mmu_page *sptep_to_sp(u64 *sptep)
return to_shadow_page(__pa(sptep));
 }
 
+static inline int kvm_mmu_page_as_id(struct kvm_mmu_page *sp)
+{
+   return sp->role.smm ? 1 : 0;
+}
+
 static inline bool kvm_vcpu_ad_need_write_protect(struct kvm_vcpu *vcpu)
 {
/*
diff --git a/arch/x86/kvm/mmu/tdp_iter.c b/arch/x86/kvm/mmu/tdp_iter.c
index f7f94ea65243..b3ed302c1a35 100644
--- a/arch/x86/kvm/mmu/tdp_iter.c
+++ b/arch/x86/kvm/mmu/tdp_iter.c
@@ -49,6 +49,7 @@ void tdp_iter_start(struct tdp_iter *iter, u64 *root_pt, int 
root_level,
iter->root_level = root_level;
iter->min_level = min_level;
iter->pt_path[iter->root_level - 1] = (tdp_ptep_t)root_pt;
+   iter->as_id = kvm_mmu_page_as_id(sptep_to_sp(root_pt));
 
tdp_iter_restart(iter);
 }
@@ -169,8 +170,3 @@ void tdp_iter_next(struct tdp_iter *iter)
iter->valid = false;
 }
 
-tdp_ptep_t tdp_iter_root_pt(struct tdp_iter *iter)
-{
-   return iter->pt_path[iter->root_level - 1];
-}
-
diff --git a/arch/x86/kvm/mmu/tdp_iter.h b/arch/x86/kvm/mmu/tdp_iter.h
index 8eb424d17c91..b1748b988d3a 100644
--- a/arch/x86/kvm/mmu/tdp_iter.h
+++ b/arch/x86/kvm/mmu/tdp_iter.h
@@ -36,6 +36,8 @@ struct tdp_iter {
int min_level;
/* The iterator's current level within the paging structure */
int level;
+   /* The address space ID, i.e. SMM vs. regular. */
+   int as_id;
/* A snapshot of the value at sptep */
u64 old_spte;
/*
@@ -62,7 +64,6 @@ tdp_ptep_t spte_to_child_pt(u64 pte, int level);
 void tdp_iter_start(struct tdp_iter *iter, u64 *root_pt, int root_level,
int min_level, gfn_t next_last_level_gfn);
 void tdp_iter_next(struct tdp_iter *iter);
-tdp_ptep_t tdp_iter_root_pt(struct tdp_iter *iter);
 void tdp_iter_restart(struct tdp_iter *iter);
 
 #endif /* __KVM_X86_MMU_TDP_ITER_H */
diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index 38b6b6936171..462b1f71c77f 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -203,11 +203,6 @@ static void handle_changed_spte(struct kvm *kvm, int 
as_id, gfn_t gfn,
u64 old_spte, u64 new_spte, int level,
bool shared);
 
-static int kvm_mmu_page_as_id(struct kvm_mmu_page *sp)
-{
-   return sp->role.smm ? 1 : 0;
-}
-
 static void handle_changed_spte_acc_track(u64 old_spte, u64 new_spte, int 
level)
 {
bool pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte);
@@ -497,10 +492,6 @@ static inline bool tdp_mmu_set_spte_atomic(struct kvm *kvm,
   struct tdp_iter *iter,
   u64 new_spte)
 {
-   u64 *root_pt = tdp_iter_root_pt(iter);
-   struct kvm_mmu_page *root = sptep_to_sp(root_pt);
-   int as_id = kvm_mmu_page_as_id(root);
-
lockdep_assert_held_read(&kvm->mmu_lock);
 
/*
@@ -514,8 +505,8 @@ static inline bool tdp_mmu_set_spte_atomic(struct kvm *kvm,
  new_spte) != iter->old_spte)
return false;
 
-   handle_changed_spte(kvm, as_id, iter->gfn, iter->old_spte, new_spte,
-   iter->level, true);
+   handle_changed_spte(kvm, iter->as_id, iter->gfn, iter->old_spte,
+   new_spte, iter->level, true);
 
return true;
 }
@@ -569,10 +560,6 @@ static inline void __tdp_mmu_set_spte(struct kvm *kvm, 
struct tdp_iter *iter,
  u64 new_spte, bool record_acc_track,
  bool record_dirty_log)
 {
-   tdp_ptep_t root_pt = tdp_iter_root_pt(iter);
-   struct kvm_mmu_page *root = sptep_to_sp(root_pt);
-   int as_id = kvm_mmu_page_as_id(root);
-
lockdep_assert_held_write(&kvm->mmu_lock);
 
/*
@@ -586,13 +573,13 @@ static inline void __tdp_mmu_set_spte(struct kvm *kvm, 
struct tdp_iter *iter,
 
WRITE_ONCE(*rcu_dereference(iter->sptep), new_spte);
 
-   __handle_changed_spte(kvm, as

[PATCH v2 1/4] KVM: x86/mmu: Fix RCU usage in handle_removed_tdp_mmu_page

2021-03-15 Thread Ben Gardon
The pt passed into handle_removed_tdp_mmu_page does not need RCU
protection, as it is not at any risk of being freed by another thread at
that point. However, the implicit cast from tdp_ptep_t to u64 * dropped
the __rcu annotation without a proper rcu_dereference. Fix this by
passing the pt as a tdp_ptep_t and then rcu_dereferencing it in
the function.

Suggested-by: Sean Christopherson 
Reported-by: kernel test robot 
Signed-off-by: Ben Gardon 
---
 arch/x86/kvm/mmu/tdp_mmu.c | 11 ---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index d78915019b08..db2936cca4bf 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -301,11 +301,16 @@ static void tdp_mmu_unlink_page(struct kvm *kvm, struct 
kvm_mmu_page *sp,
  *
  * Given a page table that has been removed from the TDP paging structure,
  * iterates through the page table to clear SPTEs and free child page tables.
+ *
+ * Note that pt is passed in as a tdp_ptep_t, but it does not need RCU
+ * protection. Since this thread removed it from the paging structure,
+ * this thread will be responsible for ensuring the page is freed. Hence the
+ * early rcu_dereferences in the function.
  */
-static void handle_removed_tdp_mmu_page(struct kvm *kvm, u64 *pt,
+static void handle_removed_tdp_mmu_page(struct kvm *kvm, tdp_ptep_t pt,
bool shared)
 {
-   struct kvm_mmu_page *sp = sptep_to_sp(pt);
+   struct kvm_mmu_page *sp = sptep_to_sp(rcu_dereference(pt));
int level = sp->role.level;
gfn_t base_gfn = sp->gfn;
u64 old_child_spte;
@@ -318,7 +323,7 @@ static void handle_removed_tdp_mmu_page(struct kvm *kvm, 
u64 *pt,
tdp_mmu_unlink_page(kvm, sp, shared);
 
for (i = 0; i < PT64_ENT_PER_PAGE; i++) {
-   sptep = pt + i;
+   sptep = rcu_dereference(pt) + i;
gfn = base_gfn + (i * KVM_PAGES_PER_HPAGE(level - 1));
 
if (shared) {
-- 
2.31.0.rc2.261.g7f71774620-goog



[PATCH v2 0/4] Fix RCU warnings in TDP MMU

2021-03-15 Thread Ben Gardon
The Linux Test Robot found a few RCU warnings in the TDP MMU:
https://www.spinics.net/lists/kernel/msg3845500.html
https://www.spinics.net/lists/kernel/msg3845521.html

Fix these warnings and clean up a hack in tdp_mmu_iter_cond_resched.

Tested by compiling as suggested in the test robot report and confirmed
that the warnings go away with this series applied. Also ran
kvm-unit-tests on an Intel Skylake machine with the TDP MMU enabled and
confirmed that the series introduced no new failures.

Ben Gardon (4):
  KVM: x86/mmu: Fix RCU usage in handle_removed_tdp_mmu_page
  KVM: x86/mmu: Fix RCU usage when atomically zapping SPTEs
  KVM: x86/mmu: Factor out tdp_iter_return_to_root
  KVM: x86/mmu: Store the address space ID in the TDP iterator

 arch/x86/kvm/mmu/mmu_internal.h |  5 +
 arch/x86/kvm/mmu/tdp_iter.c | 30 +++--
 arch/x86/kvm/mmu/tdp_iter.h |  4 +++-
 arch/x86/kvm/mmu/tdp_mmu.c  | 40 +
 4 files changed, 41 insertions(+), 38 deletions(-)

-- 
2.31.0.rc2.261.g7f71774620-goog



Re: [PATCH 2/4] KVM: x86/mmu: Fix RCU usage for tdp_iter_root_pt

2021-03-15 Thread Ben Gardon
On Fri, Mar 12, 2021 at 8:22 AM Sean Christopherson  wrote:
>
> On Thu, Mar 11, 2021, Ben Gardon wrote:
> > The root page table in the TDP MMU paging structure is not protected
> > with RCU, but rather by the root_count in the associated SP. As a result
> > it is safe for tdp_iter_root_pt to simply return a u64 *. This sidesteps
> > the complexities associated with propagating the __rcu annotation
> > around.
> >
> > Reported-by: kernel test robot 
> > Signed-off-by: Ben Gardon 
> > ---
> >  arch/x86/kvm/mmu/tdp_iter.c | 10 --
> >  arch/x86/kvm/mmu/tdp_iter.h |  2 +-
> >  arch/x86/kvm/mmu/tdp_mmu.c  |  4 ++--
> >  3 files changed, 11 insertions(+), 5 deletions(-)
> >
> > diff --git a/arch/x86/kvm/mmu/tdp_iter.c b/arch/x86/kvm/mmu/tdp_iter.c
> > index e5f148106e20..8e2c053533b6 100644
> > --- a/arch/x86/kvm/mmu/tdp_iter.c
> > +++ b/arch/x86/kvm/mmu/tdp_iter.c
> > @@ -159,8 +159,14 @@ void tdp_iter_next(struct tdp_iter *iter)
> >   iter->valid = false;
> >  }
> >
> > -tdp_ptep_t tdp_iter_root_pt(struct tdp_iter *iter)
> > +u64 *tdp_iter_root_pt(struct tdp_iter *iter)
> >  {
> > - return iter->pt_path[iter->root_level - 1];
> > + /*
> > +  * Though it is stored in an array of tdp_ptep_t for convenience,
> > +  * the root PT is not actually protected by RCU, but by the root
> > +  * count on the associated struct kvm_mmu_page. As a result it's
> > +  * safe to rcu_dereference and return the value here.
>
> I'm not a big fan of this comment.  It implies that calling tdp_iter_root_pt()
> without RCU protection is completely ok, but that's not true, as
> rcu_dereference() will complain when CONFIG_PROVE_RCU=1.
>
> There's also a good opportunity to streamline the helper here, since both
> callers use the root only to get to the associated shadow page, and that's
> only done to get the as_id.  If we provide tdp_iter_as_id() then the need for
> a comment goes away and we shave a few lines of code.

This is a good suggestion. I have a change to do this in another
series I was preparing to send out, but your suggestion below is even
better, so I'll add that to this series.

>
> That being said, an even better option would be to store as_id in the TDP 
> iter.
> The cost on the stack is negligible, and while the early sptep->as_id lookup
> will be unnecessary in some cases, it will be a net win when setting multiple
> sptes, e.g. in mmu_notifier callbacks.
>
> Compile tested only...
>
> From 02fb9cd2aa52d0afd318e93661d0212ccdb54218 Mon Sep 17 00:00:00 2001
> From: Sean Christopherson 
> Date: Fri, 12 Mar 2021 08:12:21 -0800
> Subject: [PATCH] KVM: x86/mmu: Store the address space ID in the TDP iterator
>
> Store the address space ID in the TDP iterator so that it can be
> retrieved without having to bounce through the root shadow page.  This
> streamlines the code and fixes a Sparse warning about not properly using
> rcu_dereference() when grabbing the ID from the root on the fly.
>
> Reported-by: kernel test robot 
> Cc: Ben Gardon 
> Signed-off-by: Sean Christopherson 
> ---
>  arch/x86/kvm/mmu/mmu_internal.h |  5 +
>  arch/x86/kvm/mmu/tdp_iter.c |  7 +--
>  arch/x86/kvm/mmu/tdp_iter.h |  3 ++-
>  arch/x86/kvm/mmu/tdp_mmu.c  | 23 +--
>  4 files changed, 13 insertions(+), 25 deletions(-)
>
> diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h
> index ec4fc28b325a..e844078d2374 100644
> --- a/arch/x86/kvm/mmu/mmu_internal.h
> +++ b/arch/x86/kvm/mmu/mmu_internal.h
> @@ -119,6 +119,11 @@ static inline bool kvm_mmu_put_root(struct kvm *kvm, 
> struct kvm_mmu_page *sp)
> return !sp->root_count;
>  }
>
> +static inline int kvm_mmu_page_as_id(struct kvm_mmu_page *sp)
> +{
> +   return sp->role.smm ? 1 : 0;
> +}
> +
>  /*
>   * Return values of handle_mmio_page_fault, mmu.page_fault, and 
> fast_page_fault().
>   *
> diff --git a/arch/x86/kvm/mmu/tdp_iter.c b/arch/x86/kvm/mmu/tdp_iter.c
> index e5f148106e20..55d0ce2185a5 100644
> --- a/arch/x86/kvm/mmu/tdp_iter.c
> +++ b/arch/x86/kvm/mmu/tdp_iter.c
> @@ -40,6 +40,7 @@ void tdp_iter_start(struct tdp_iter *iter, u64 *root_pt, 
> int root_level,
> iter->gfn = round_gfn_for_level(iter->next_last_level_gfn, 
> iter->level);
> tdp_iter_refresh_sptep(iter);
>
> +   iter->as_id = kvm_mmu_page_as_id(sptep_to_sp(root_pt));
> iter->valid = true;
>  }
>
> @@ -158,9 +159,3 @@ void tdp_iter_next(struct tdp_iter *iter)
> } while (try_step_up(iter));
>

Re: [PATCH 1/4] KVM: x86/mmu: Fix RCU usage in handle_removed_tdp_mmu_page

2021-03-15 Thread Ben Gardon
On Fri, Mar 12, 2021 at 7:43 AM Paolo Bonzini  wrote:
>
> On 12/03/21 16:37, Sean Christopherson wrote:
> > On Thu, Mar 11, 2021, Ben Gardon wrote:
> >> The pt passed into handle_removed_tdp_mmu_page does not need RCU
> >> protection, as it is not at any risk of being freed by another thread at
> >> that point. However, the implicit cast from tdp_sptep_t to u64 * dropped
> >> the __rcu annotation without a proper rcu_derefrence. Fix this by
> >> passing the pt as a tdp_ptep_t and then rcu_dereferencing it in
> >> the function.
> >>
> >> Suggested-by: Sean Christopherson 
> >> Reported-by: kernel test robot 
> >
> > Should be .  Looks like you've been taking pointers from 
> > Paolo :-)

I'll update that in v2. I was a little confused because I was looking
at the report archived on Spinics, where all the domains are redacted.
Didn't notice that all the emails had been redacted like that.


>
> The day someone starts confusing employers in CCs you should tell them
> "I see you have constructed a new email sending alias.  Your skills are
> now complete".
>
> Paolo
>
> > https://lkml.org/lkml/2019/6/17/1210
> >
> > Other than that,
> >
> > Reviewed-by: Sean Christopherson 
> >
> >> Signed-off-by: Ben Gardon 
> >
>


[PATCH 4/4] KVM: x86/mmu: Factor out tdp_iter_return_to_root

2021-03-11 Thread Ben Gardon
In tdp_mmu_iter_cond_resched there is a call to tdp_iter_start which
causes the iterator to continue its walk over the paging structure from
the root. This is needed after a yield, as the paging structure could
have been freed in the interim.

The tdp_iter_start call is not very clear and is something of a hack. It
requires exposing tdp_iter fields not used elsewhere in tdp_mmu.c, and
its effect is not obvious from the function name. Factor a more aptly
named function out of tdp_iter_start and call it from
tdp_mmu_iter_cond_resched and tdp_iter_start.

No functional change intended.

Signed-off-by: Ben Gardon 
---
 arch/x86/kvm/mmu/tdp_iter.c | 24 +---
 arch/x86/kvm/mmu/tdp_iter.h |  1 +
 arch/x86/kvm/mmu/tdp_mmu.c  |  4 +---
 3 files changed, 19 insertions(+), 10 deletions(-)

diff --git a/arch/x86/kvm/mmu/tdp_iter.c b/arch/x86/kvm/mmu/tdp_iter.c
index 8e2c053533b6..bbf53b98cc65 100644
--- a/arch/x86/kvm/mmu/tdp_iter.c
+++ b/arch/x86/kvm/mmu/tdp_iter.c
@@ -20,6 +20,21 @@ static gfn_t round_gfn_for_level(gfn_t gfn, int level)
return gfn & -KVM_PAGES_PER_HPAGE(level);
 }
 
+/*
+ * Return the TDP iterator to the root PT and allow it to continue its
+ * traversal over the paging structure from there.
+ */
+void tdp_iter_return_to_root(struct tdp_iter *iter)
+{
+   iter->yielded_gfn = iter->next_last_level_gfn;
+   iter->level = iter->root_level;
+
+   iter->gfn = round_gfn_for_level(iter->next_last_level_gfn, iter->level);
+   tdp_iter_refresh_sptep(iter);
+
+   iter->valid = true;
+}
+
 /*
  * Sets a TDP iterator to walk a pre-order traversal of the paging structure
  * rooted at root_pt, starting with the walk to translate next_last_level_gfn.
@@ -31,16 +46,11 @@ void tdp_iter_start(struct tdp_iter *iter, u64 *root_pt, 
int root_level,
WARN_ON(root_level > PT64_ROOT_MAX_LEVEL);
 
iter->next_last_level_gfn = next_last_level_gfn;
-   iter->yielded_gfn = iter->next_last_level_gfn;
iter->root_level = root_level;
iter->min_level = min_level;
-   iter->level = root_level;
-   iter->pt_path[iter->level - 1] = (tdp_ptep_t)root_pt;
+   iter->pt_path[iter->root_level - 1] = (tdp_ptep_t)root_pt;
 
-   iter->gfn = round_gfn_for_level(iter->next_last_level_gfn, iter->level);
-   tdp_iter_refresh_sptep(iter);
-
-   iter->valid = true;
+   tdp_iter_return_to_root(iter);
 }
 
 /*
diff --git a/arch/x86/kvm/mmu/tdp_iter.h b/arch/x86/kvm/mmu/tdp_iter.h
index 5a47c57810ab..2ecc48e78526 100644
--- a/arch/x86/kvm/mmu/tdp_iter.h
+++ b/arch/x86/kvm/mmu/tdp_iter.h
@@ -63,5 +63,6 @@ void tdp_iter_start(struct tdp_iter *iter, u64 *root_pt, int 
root_level,
int min_level, gfn_t next_last_level_gfn);
 void tdp_iter_next(struct tdp_iter *iter);
 u64 *tdp_iter_root_pt(struct tdp_iter *iter);
+void tdp_iter_return_to_root(struct tdp_iter *iter);
 
 #endif /* __KVM_X86_MMU_TDP_ITER_H */
diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index a8fdccf4fd06..941e9d11c7ed 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -653,9 +653,7 @@ static inline bool tdp_mmu_iter_cond_resched(struct kvm 
*kvm,
 
WARN_ON(iter->gfn > iter->next_last_level_gfn);
 
-   tdp_iter_start(iter, tdp_iter_root_pt(iter),
-  iter->root_level, iter->min_level,
-  iter->next_last_level_gfn);
+   tdp_iter_return_to_root(iter);
 
return true;
}
-- 
2.31.0.rc2.261.g7f71774620-goog



[PATCH 3/4] KVM: x86/mmu: Fix RCU usage when atomically zapping SPTEs

2021-03-11 Thread Ben Gardon
Fix a missing rcu_dereference in tdp_mmu_zap_spte_atomic.

Reported-by: kernel test robot 
Signed-off-by: Ben Gardon 
---
 arch/x86/kvm/mmu/tdp_mmu.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index 6c8824bcc2f2..a8fdccf4fd06 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -532,7 +532,7 @@ static inline bool tdp_mmu_zap_spte_atomic(struct kvm *kvm,
 * here since the SPTE is going from non-present
 * to non-present.
 */
-   WRITE_ONCE(*iter->sptep, 0);
+   WRITE_ONCE(*rcu_dereference(iter->sptep), 0);
 
return true;
 }
-- 
2.31.0.rc2.261.g7f71774620-goog



[PATCH 2/4] KVM: x86/mmu: Fix RCU usage for tdp_iter_root_pt

2021-03-11 Thread Ben Gardon
The root page table in the TDP MMU paging structure is not protected
with RCU, but rather by the root_count in the associated SP. As a result
it is safe for tdp_iter_root_pt to simply return a u64 *. This sidesteps
the complexities associated with propagating the __rcu annotation
around.

Reported-by: kernel test robot 
Signed-off-by: Ben Gardon 
---
 arch/x86/kvm/mmu/tdp_iter.c | 10 --
 arch/x86/kvm/mmu/tdp_iter.h |  2 +-
 arch/x86/kvm/mmu/tdp_mmu.c  |  4 ++--
 3 files changed, 11 insertions(+), 5 deletions(-)

diff --git a/arch/x86/kvm/mmu/tdp_iter.c b/arch/x86/kvm/mmu/tdp_iter.c
index e5f148106e20..8e2c053533b6 100644
--- a/arch/x86/kvm/mmu/tdp_iter.c
+++ b/arch/x86/kvm/mmu/tdp_iter.c
@@ -159,8 +159,14 @@ void tdp_iter_next(struct tdp_iter *iter)
iter->valid = false;
 }
 
-tdp_ptep_t tdp_iter_root_pt(struct tdp_iter *iter)
+u64 *tdp_iter_root_pt(struct tdp_iter *iter)
 {
-   return iter->pt_path[iter->root_level - 1];
+   /*
+* Though it is stored in an array of tdp_ptep_t for convenience,
+* the root PT is not actually protected by RCU, but by the root
+* count on the associated struct kvm_mmu_page. As a result it's
+* safe to rcu_dereference and return the value here.
+*/
+   return rcu_dereference(iter->pt_path[iter->root_level - 1]);
 }
 
diff --git a/arch/x86/kvm/mmu/tdp_iter.h b/arch/x86/kvm/mmu/tdp_iter.h
index 4cc177d75c4a..5a47c57810ab 100644
--- a/arch/x86/kvm/mmu/tdp_iter.h
+++ b/arch/x86/kvm/mmu/tdp_iter.h
@@ -62,6 +62,6 @@ tdp_ptep_t spte_to_child_pt(u64 pte, int level);
 void tdp_iter_start(struct tdp_iter *iter, u64 *root_pt, int root_level,
int min_level, gfn_t next_last_level_gfn);
 void tdp_iter_next(struct tdp_iter *iter);
-tdp_ptep_t tdp_iter_root_pt(struct tdp_iter *iter);
+u64 *tdp_iter_root_pt(struct tdp_iter *iter);
 
 #endif /* __KVM_X86_MMU_TDP_ITER_H */
diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index 5387ac040f66..6c8824bcc2f2 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -558,7 +558,7 @@ static inline void __tdp_mmu_set_spte(struct kvm *kvm, 
struct tdp_iter *iter,
  u64 new_spte, bool record_acc_track,
  bool record_dirty_log)
 {
-   tdp_ptep_t root_pt = tdp_iter_root_pt(iter);
+   u64 *root_pt = tdp_iter_root_pt(iter);
struct kvm_mmu_page *root = sptep_to_sp(root_pt);
int as_id = kvm_mmu_page_as_id(root);
 
@@ -653,7 +653,7 @@ static inline bool tdp_mmu_iter_cond_resched(struct kvm 
*kvm,
 
WARN_ON(iter->gfn > iter->next_last_level_gfn);
 
-   tdp_iter_start(iter, iter->pt_path[iter->root_level - 1],
+   tdp_iter_start(iter, tdp_iter_root_pt(iter),
   iter->root_level, iter->min_level,
   iter->next_last_level_gfn);
 
-- 
2.31.0.rc2.261.g7f71774620-goog



[PATCH 1/4] KVM: x86/mmu: Fix RCU usage in handle_removed_tdp_mmu_page

2021-03-11 Thread Ben Gardon
The pt passed into handle_removed_tdp_mmu_page does not need RCU
protection, as it is not at any risk of being freed by another thread at
that point. However, the implicit cast from tdp_ptep_t to u64 * dropped
the __rcu annotation without a proper rcu_dereference. Fix this by
passing the pt as a tdp_ptep_t and then rcu_dereferencing it in
the function.

Suggested-by: Sean Christopherson 
Reported-by: kernel test robot 
Signed-off-by: Ben Gardon 
---
 arch/x86/kvm/mmu/tdp_mmu.c | 11 ---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index c926c6b899a1..5387ac040f66 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -301,11 +301,16 @@ static void tdp_mmu_unlink_page(struct kvm *kvm, struct 
kvm_mmu_page *sp,
  *
  * Given a page table that has been removed from the TDP paging structure,
  * iterates through the page table to clear SPTEs and free child page tables.
+ *
+ * Note that pt is passed in as a tdp_ptep_t, but it does not need RCU
+ * protection. Since this thread removed it from the paging structure,
+ * this thread will be responsible for ensuring the page is freed. Hence the
+ * early rcu_dereferences in the function.
  */
-static void handle_removed_tdp_mmu_page(struct kvm *kvm, u64 *pt,
+static void handle_removed_tdp_mmu_page(struct kvm *kvm, tdp_ptep_t pt,
bool shared)
 {
-   struct kvm_mmu_page *sp = sptep_to_sp(pt);
+   struct kvm_mmu_page *sp = sptep_to_sp(rcu_dereference(pt));
int level = sp->role.level;
gfn_t base_gfn = sp->gfn;
u64 old_child_spte;
@@ -318,7 +323,7 @@ static void handle_removed_tdp_mmu_page(struct kvm *kvm, 
u64 *pt,
tdp_mmu_unlink_page(kvm, sp, shared);
 
for (i = 0; i < PT64_ENT_PER_PAGE; i++) {
-   sptep = pt + i;
+   sptep = rcu_dereference(pt) + i;
gfn = base_gfn + (i * KVM_PAGES_PER_HPAGE(level - 1));
 
if (shared) {
-- 
2.31.0.rc2.261.g7f71774620-goog



[PATCH 0/4] Fix RCU warnings in TDP MMU

2021-03-11 Thread Ben Gardon
The Linux Test Robot found a few RCU warnings in the TDP MMU:
https://www.spinics.net/lists/kernel/msg3845500.html
https://www.spinics.net/lists/kernel/msg3845521.html

Fix these warnings and clean up a hack in tdp_mmu_iter_cond_resched.

Tested by compiling as suggested in the test robot report and confirmed that
the warnings go away with this series applied. Also ran kvm-unit-tests on an
Intel Skylake machine with the TDP MMU enabled and confirmed that the series
introduced no new failures.

Ben Gardon (4):
  KVM: x86/mmu: Fix RCU usage in handle_removed_tdp_mmu_page
  KVM: x86/mmu: Fix RCU usage for tdp_iter_root_pt
  KVM: x86/mmu: Fix RCU usage when atomically zapping SPTEs
  KVM: x86/mmu: Factor out tdp_iter_return_to_root

 arch/x86/kvm/mmu/tdp_iter.c | 34 +-
 arch/x86/kvm/mmu/tdp_iter.h |  3 ++-
 arch/x86/kvm/mmu/tdp_mmu.c  | 19 +++
 3 files changed, 38 insertions(+), 18 deletions(-)

-- 
2.31.0.rc2.261.g7f71774620-goog



Re: [PATCH] KVM: x86/mmu: Skip !MMU-present SPTEs when removing SP in exclusive mode

2021-03-10 Thread Ben Gardon
On Wed, Mar 10, 2021 at 1:14 PM Sean Christopherson  wrote:
>
> On Wed, Mar 10, 2021, Paolo Bonzini wrote:
> > On 10/03/21 01:30, Sean Christopherson wrote:
> > > diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
> > > index 50ef757c5586..f0c99fa04ef2 100644
> > > --- a/arch/x86/kvm/mmu/tdp_mmu.c
> > > +++ b/arch/x86/kvm/mmu/tdp_mmu.c
> > > @@ -323,7 +323,18 @@ static void handle_removed_tdp_mmu_page(struct kvm 
> > > *kvm, u64 *pt,
> > > cpu_relax();
> > > }
> > > } else {
> > > +   /*
> > > +* If the SPTE is not MMU-present, there is no backing
> > > +* page associated with the SPTE and so no side 
> > > effects
> > > +* that need to be recorded, and exclusive ownership 
> > > of
> > > +* mmu_lock ensures the SPTE can't be made present.
> > > +* Note, zapping MMIO SPTEs is also unnecessary as 
> > > they
> > > +* are guarded by the memslots generation, not by 
> > > being
> > > +* unreachable.
> > > +*/
> > > old_child_spte = READ_ONCE(*sptep);
> > > +   if (!is_shadow_present_pte(old_child_spte))
> > > +   continue;
> > > /*
> > >  * Marking the SPTE as a removed SPTE is not
> >
> > Ben, do you plan to make this path take mmu_lock for read?  If so, this
> > wouldn't be too useful IIUC.
>
> I can see kvm_mmu_zap_all_fast()->kvm_tdp_mmu_zap_all() moving to a 
> shared-mode
> flow, but I don't think we'll ever want to move away from exclusive-mode 
> zapping
> for kvm_arch_flush_shadow_all()->kvm_mmu_zap_all()->kvm_tdp_mmu_zap_all().  In
> that case, the VM is dead or dying; freeing memory should be done as quickly 
> as
> possible.

Yeah, as Sean said, zapping under the MMU lock in write mode probably
shouldn't go away, even if we find we're able to do it in read mode in
some flows.

This optimization also makes me think we could skip the
__handle_changed_spte call in the read-mode case if the SPTE change
was !PRESENT -> REMOVED.
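
Roughly what I'm picturing (just a sketch from me, not part of any
posted patch; it reuses the names from the existing
handle_removed_tdp_mmu_page() loop and elides the details of the
REMOVED_SPTE installation):

		if (shared) {
			/*
			 * Existing logic: atomically replace the child SPTE
			 * with REMOVED_SPTE, saving the prior value in
			 * old_child_spte.
			 */
			...

			/*
			 * A !PRESENT -> REMOVED transition has no backing
			 * page and no side effects to record, so the
			 * handle_changed_spte() call at the bottom of the
			 * loop could be skipped.
			 */
			if (!is_shadow_present_pte(old_child_spte))
				continue;
		}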


Re: [PATCH 02/15] KVM: x86/mmu: Alloc page for PDPTEs when shadowing 32-bit NPT with 64-bit

2021-03-03 Thread Ben Gardon
On Tue, Mar 2, 2021 at 10:45 AM Sean Christopherson  wrote:
>
> Allocate the so called pae_root page on-demand, along with the lm_root
> page, when shadowing 32-bit NPT with 64-bit NPT, i.e. when running a
> 32-bit L1.  KVM currently only allocates the page when NPT is disabled,
> or when L0 is 32-bit (using PAE paging).
>
> Note, there is an existing memory leak involving the MMU roots, as KVM
> fails to free the PAE roots on failure.  This will be addressed in a
> future commit.
>
> Fixes: ee6268ba3a68 ("KVM: x86: Skip pae_root shadow allocation if tdp 
> enabled")
> Fixes: b6b80c78af83 ("KVM: x86/mmu: Allocate PAE root array when using SVM's 
> 32-bit NPT")
> Cc: sta...@vger.kernel.org
> Signed-off-by: Sean Christopherson 

Reviewed-by: Ben Gardon 

> ---
>  arch/x86/kvm/mmu/mmu.c | 44 --
>  1 file changed, 29 insertions(+), 15 deletions(-)
>
> diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
> index 0987cc1d53eb..2ed3fac1244e 100644
> --- a/arch/x86/kvm/mmu/mmu.c
> +++ b/arch/x86/kvm/mmu/mmu.c
> @@ -3187,14 +3187,14 @@ void kvm_mmu_free_roots(struct kvm_vcpu *vcpu, struct 
> kvm_mmu *mmu,
> if (mmu->shadow_root_level >= PT64_ROOT_4LEVEL &&
> (mmu->root_level >= PT64_ROOT_4LEVEL || mmu->direct_map)) 
> {
> mmu_free_root_page(kvm, &mmu->root_hpa, 
> &invalid_list);
> -   } else {
> +   } else if (mmu->pae_root) {
> for (i = 0; i < 4; ++i)
> if (mmu->pae_root[i] != 0)

I was about to comment on how weird this check is since pae_root can
also be INVALID_PAGE but that case is handled in mmu_free_root_page...
but then I realized that you're already addressing that problem in
patch 7.

> mmu_free_root_page(kvm,
>&mmu->pae_root[i],
>&invalid_list);
> -   mmu->root_hpa = INVALID_PAGE;
> }
> +   mmu->root_hpa = INVALID_PAGE;
> mmu->root_pgd = 0;
> }
>
> @@ -3306,9 +3306,23 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu 
> *vcpu)
>  * the shadow page table may be a PAE or a long mode page table.
>  */
> pm_mask = PT_PRESENT_MASK;
> -   if (vcpu->arch.mmu->shadow_root_level == PT64_ROOT_4LEVEL)
> +   if (vcpu->arch.mmu->shadow_root_level == PT64_ROOT_4LEVEL) {
> pm_mask |= PT_ACCESSED_MASK | PT_WRITABLE_MASK | PT_USER_MASK;
>
> +   /*
> +* Allocate the page for the PDPTEs when shadowing 32-bit NPT
> +* with 64-bit only when needed.  Unlike 32-bit NPT, it 
> doesn't
> +* need to be in low mem.  See also lm_root below.
> +*/
> +   if (!vcpu->arch.mmu->pae_root) {
> +   WARN_ON_ONCE(!tdp_enabled);
> +
> +   vcpu->arch.mmu->pae_root = (void 
> *)get_zeroed_page(GFP_KERNEL_ACCOUNT);
> +   if (!vcpu->arch.mmu->pae_root)
> +   return -ENOMEM;
> +   }
> +   }
> +
> for (i = 0; i < 4; ++i) {
> MMU_WARN_ON(VALID_PAGE(vcpu->arch.mmu->pae_root[i]));
> if (vcpu->arch.mmu->root_level == PT32E_ROOT_LEVEL) {
> @@ -3331,21 +3345,19 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu 
> *vcpu)
> vcpu->arch.mmu->root_hpa = __pa(vcpu->arch.mmu->pae_root);
>
> /*
> -* If we shadow a 32 bit page table with a long mode page
> -* table we enter this path.
> +* When shadowing 32-bit or PAE NPT with 64-bit NPT, the PML4 and PDP
> +* tables are allocated and initialized at MMU creation as there is no
> +* equivalent level in the guest's NPT to shadow.  Allocate the tables
> +* on demand, as running a 32-bit L1 VMM is very rare.  The PDP is
> +* handled above (to share logic with PAE), deal with the PML4 here.
>  */
> if (vcpu->arch.mmu->shadow_root_level == PT64_ROOT_4LEVEL) {
> if (vcpu->arch.mmu->lm_root == NULL) {
> -   /*
> -* The additional page necessary for this is only
> -* allocated on demand.
> -*/
> -
> u64 *lm_root;
>
> lm_root = (void*)ge

Re: [PATCH 03/15] KVM: x86/mmu: Ensure MMU pages are available when allocating roots

2021-03-03 Thread Ben Gardon
On Tue, Mar 2, 2021 at 10:46 AM Sean Christopherson  wrote:
>
> Hold the mmu_lock for write for the entire duration of allocating and
> initializing an MMU's roots.  This ensures there are MMU pages available
> and thus prevents root allocations from failing.  That in turn fixes a
> bug where KVM would fail to free valid PAE roots if a one of the later
> roots failed to allocate.
>
> Note, KVM still leaks the PAE roots if the lm_root allocation fails.
> This will be addressed in a future commit.
>
> Cc: Ben Gardon 
> Signed-off-by: Sean Christopherson 

Reviewed-by: Ben Gardon 

Very tidy cleanup!

> ---
>  arch/x86/kvm/mmu/mmu.c | 41 --
>  arch/x86/kvm/mmu/tdp_mmu.c | 23 +
>  2 files changed, 18 insertions(+), 46 deletions(-)
>
> diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
> index 2ed3fac1244e..1f129001a30c 100644
> --- a/arch/x86/kvm/mmu/mmu.c
> +++ b/arch/x86/kvm/mmu/mmu.c
> @@ -2398,6 +2398,9 @@ static int make_mmu_pages_available(struct kvm_vcpu 
> *vcpu)
>  {
> unsigned long avail = kvm_mmu_available_pages(vcpu->kvm);
>
> +   /* Ensure all four PAE roots can be allocated in a single pass. */
> +   BUILD_BUG_ON(KVM_MIN_FREE_MMU_PAGES < 4);
> +

For a second I thought that this should be 5 since a page is needed to
hold the 4 PAE roots, but that page is allocated at vCPU creation and
reused, so no need to check for it here.

> if (likely(avail >= KVM_MIN_FREE_MMU_PAGES))
> return 0;
>
> @@ -3220,16 +3223,9 @@ static hpa_t mmu_alloc_root(struct kvm_vcpu *vcpu, 
> gfn_t gfn, gva_t gva,
>  {
> struct kvm_mmu_page *sp;
>
> -   write_lock(&vcpu->kvm->mmu_lock);
> -
> -   if (make_mmu_pages_available(vcpu)) {
> -   write_unlock(&vcpu->kvm->mmu_lock);
> -   return INVALID_PAGE;
> -   }
> sp = kvm_mmu_get_page(vcpu, gfn, gva, level, direct, ACC_ALL);
> ++sp->root_count;
>
> -   write_unlock(&vcpu->kvm->mmu_lock);
> return __pa(sp->spt);
>  }
>
> @@ -3241,16 +3237,10 @@ static int mmu_alloc_direct_roots(struct kvm_vcpu 
> *vcpu)
>
> if (is_tdp_mmu_enabled(vcpu->kvm)) {
> root = kvm_tdp_mmu_get_vcpu_root_hpa(vcpu);
> -
> -   if (!VALID_PAGE(root))
> -   return -ENOSPC;
> vcpu->arch.mmu->root_hpa = root;
> } else if (shadow_root_level >= PT64_ROOT_4LEVEL) {
> root = mmu_alloc_root(vcpu, 0, 0, shadow_root_level,
>   true);
> -
> -   if (!VALID_PAGE(root))
> -   return -ENOSPC;

There's so much going on in mmu_alloc_root that removing this check
makes me nervous, but I think it should be safe.
I checked through the function because I was worried it might yield
somewhere in there, which could result in the page cache being emptied
and the allocation failing, but I don't think mmu_alloc_root will
yield.

> vcpu->arch.mmu->root_hpa = root;
> } else if (shadow_root_level == PT32E_ROOT_LEVEL) {
> for (i = 0; i < 4; ++i) {
> @@ -3258,8 +3248,6 @@ static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu)
>
> root = mmu_alloc_root(vcpu, i << (30 - PAGE_SHIFT),
>   i << 30, PT32_ROOT_LEVEL, true);
> -   if (!VALID_PAGE(root))
> -   return -ENOSPC;
> vcpu->arch.mmu->pae_root[i] = root | PT_PRESENT_MASK;
> }
> vcpu->arch.mmu->root_hpa = __pa(vcpu->arch.mmu->pae_root);
> @@ -3294,8 +3282,6 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
>
> root = mmu_alloc_root(vcpu, root_gfn, 0,
>   vcpu->arch.mmu->shadow_root_level, 
> false);
> -   if (!VALID_PAGE(root))
> -   return -ENOSPC;
> vcpu->arch.mmu->root_hpa = root;
> goto set_root_pgd;
> }
> @@ -3325,6 +3311,7 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
>
> for (i = 0; i < 4; ++i) {
> MMU_WARN_ON(VALID_PAGE(vcpu->arch.mmu->pae_root[i]));
> +
> if (vcpu->arch.mmu->root_level == PT32E_ROOT_LEVEL) {
> pdptr = vcpu->arch.mmu->get_pdptr(vcpu, i);
> if (!(pdptr & PT_PRESENT_MASK)) {
> @@ -3338,8 +3325,6 @@ static int mmu_alloc_shadow_roots(struct kvm_v

Re: arch/x86/kvm/mmu/tdp_mmu.c:533:9: sparse: sparse: cast removes address space '__rcu' of expression

2021-03-01 Thread Ben Gardon
It looks like the __rcu tags aren't being propagated around to all the
different references to PT memory. I'll look into what it would take
to fix this. I don't believe these kernel test warnings indicate a
correctness issue in the kernel, but propagating the __rcu annotations
will be helpful for developers in the future.
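
For anyone unfamiliar with what sparse is flagging here, a minimal
illustration (mine, not taken from the kernel tree; tdp_ptep_t is
declared along the lines of "typedef u64 __rcu *tdp_ptep_t;"):

	static u64 read_spte_ok(tdp_ptep_t sptep)
	{
		/* rcu_dereference() preserves the __rcu address space. */
		return READ_ONCE(*rcu_dereference(sptep));
	}

	static u64 read_spte_warns(tdp_ptep_t sptep)
	{
		/* A plain cast strips __rcu: "cast removes address space". */
		return READ_ONCE(*(u64 *)sptep);
	}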

On Sat, Feb 27, 2021 at 3:48 PM kernel test robot  wrote:
>
> tree:   https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git 
> master
> head:   5695e51619745d4fe3ec2506a2f0cd982c5e27a4
> commit: 08f07c800e9d35b59d0c8346333f189160bd67d4 KVM: x86/mmu: Flush TLBs 
> after zap in TDP MMU PF handler
> date:   3 weeks ago
> config: x86_64-randconfig-s022-20210228 (attached as .config)
> compiler: gcc-9 (Debian 9.3.0-15) 9.3.0
> reproduce:
> # apt-get install sparse
> # sparse version: v0.6.3-241-geaceeafa-dirty
> # 
> https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=08f07c800e9d35b59d0c8346333f189160bd67d4
> git remote add linus 
> https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
> git fetch --no-tags linus master
> git checkout 08f07c800e9d35b59d0c8346333f189160bd67d4
> # save the attached .config to linux build tree
> make W=1 C=1 CF='-fdiagnostic-prefix -D__CHECK_ENDIAN__' ARCH=x86_64
>
> If you fix the issue, kindly add following tag as appropriate
> Reported-by: kernel test robot 
>
>
> "sparse warnings: (new ones prefixed by >>)"
>arch/x86/kvm/mmu/tdp_mmu.c:459:49: sparse: sparse: incorrect type in 
> argument 2 (different address spaces) @@ expected unsigned long long 
> [usertype] *pt @@ got unsigned long long [noderef] [usertype] __rcu * @@
>arch/x86/kvm/mmu/tdp_mmu.c:459:49: sparse: expected unsigned long long 
> [usertype] *pt
>arch/x86/kvm/mmu/tdp_mmu.c:459:49: sparse: got unsigned long long 
> [noderef] [usertype] __rcu *
>arch/x86/kvm/mmu/tdp_mmu.c:291:9: sparse: sparse: context imbalance in 
> 'tdp_mmu_link_page' - different lock contexts for basic block
>arch/x86/kvm/mmu/tdp_mmu.c:316:9: sparse: sparse: context imbalance in 
> 'tdp_mmu_unlink_page' - different lock contexts for basic block
>arch/x86/kvm/mmu/tdp_mmu.c:654:51: sparse: sparse: incorrect type in 
> argument 2 (different address spaces) @@ expected unsigned long long 
> [usertype] *root_pt @@ got unsigned long long [noderef] [usertype] __rcu 
> * @@
>arch/x86/kvm/mmu/tdp_mmu.c:654:51: sparse: expected unsigned long long 
> [usertype] *root_pt
>arch/x86/kvm/mmu/tdp_mmu.c:654:51: sparse: got unsigned long long 
> [noderef] [usertype] __rcu *
>arch/x86/kvm/mmu/tdp_mmu.c:560:49: sparse: sparse: incorrect type in 
> argument 1 (different address spaces) @@ expected unsigned long long 
> [usertype] *sptep @@ got unsigned long long [noderef] [usertype] __rcu 
> *[usertype] root_pt @@
>arch/x86/kvm/mmu/tdp_mmu.c:560:49: sparse: expected unsigned long long 
> [usertype] *sptep
>arch/x86/kvm/mmu/tdp_mmu.c:560:49: sparse: got unsigned long long 
> [noderef] [usertype] __rcu *[usertype] root_pt
>arch/x86/kvm/mmu/tdp_mmu.c:487:40: sparse: sparse: incorrect type in 
> initializer (different address spaces) @@ expected unsigned long long 
> [usertype] *root_pt @@ got unsigned long long [noderef] [usertype] __rcu 
> * @@
>arch/x86/kvm/mmu/tdp_mmu.c:487:40: sparse: expected unsigned long long 
> [usertype] *root_pt
>arch/x86/kvm/mmu/tdp_mmu.c:487:40: sparse: got unsigned long long 
> [noderef] [usertype] __rcu *
> >> arch/x86/kvm/mmu/tdp_mmu.c:533:9: sparse: sparse: cast removes address 
> >> space '__rcu' of expression
>arch/x86/kvm/mmu/tdp_mmu.c:487:40: sparse: sparse: incorrect type in 
> initializer (different address spaces) @@ expected unsigned long long 
> [usertype] *root_pt @@ got unsigned long long [noderef] [usertype] __rcu 
> * @@
>arch/x86/kvm/mmu/tdp_mmu.c:487:40: sparse: expected unsigned long long 
> [usertype] *root_pt
>arch/x86/kvm/mmu/tdp_mmu.c:487:40: sparse: got unsigned long long 
> [noderef] [usertype] __rcu *
>arch/x86/kvm/mmu/tdp_mmu.c:487:40: sparse: sparse: incorrect type in 
> initializer (different address spaces) @@ expected unsigned long long 
> [usertype] *root_pt @@ got unsigned long long [noderef] [usertype] __rcu 
> * @@
>arch/x86/kvm/mmu/tdp_mmu.c:487:40: sparse: expected unsigned long long 
> [usertype] *root_pt
>arch/x86/kvm/mmu/tdp_mmu.c:487:40: sparse: got unsigned long long 
> [noderef] [usertype] __rcu *
>arch/x86/kvm/mmu/tdp_mmu.c:560:49: sparse: sparse: incorrect type in 
> argument 1 (different address spaces) @@ expected unsigned long long 
> [usertype] *sptep @@ got unsigned long long [noderef] [usertype] __rcu 
> *[usertype] root_pt @@
>arch/x86/kvm/mmu/tdp_mmu.c:560:49: sparse: expected unsigned long long 
> [usertype] *sptep
>arch/x86/kvm/mmu/tdp_mmu.c:560:49: spars

Re: [RFC PATCH v3 6/8] KVM: selftests: List all hugetlb src types specified with page sizes

2021-03-01 Thread Ben Gardon
On Sun, Feb 28, 2021 at 11:00 PM Yanan Wang  wrote:
>
> With VM_MEM_SRC_ANONYMOUS_HUGETLB, we currently can only use system
> default hugetlb pages to back the testing guest memory. In order to
> add flexibility, now list all the known hugetlb backing src types with
> different page sizes, so that we can specify use of hugetlb pages of the
> exact granularity that we want. And as all the known hugetlb page sizes
> are listed, it's appropriate for all architectures.
>
> Besides, the helper get_backing_src_pagesz() is added to get the
> granularity of different backing src types (anonymous, thp, hugetlb).
>
> Suggested-by: Ben Gardon 
> Signed-off-by: Yanan Wang 
> ---
>  .../testing/selftests/kvm/include/test_util.h | 19 ++-
>  tools/testing/selftests/kvm/lib/kvm_util.c|  2 +-
>  tools/testing/selftests/kvm/lib/test_util.c   | 56 +++
>  3 files changed, 63 insertions(+), 14 deletions(-)
>
> diff --git a/tools/testing/selftests/kvm/include/test_util.h 
> b/tools/testing/selftests/kvm/include/test_util.h
> index ef24c76ba89a..be5d08bcdca7 100644
> --- a/tools/testing/selftests/kvm/include/test_util.h
> +++ b/tools/testing/selftests/kvm/include/test_util.h
> @@ -70,16 +70,31 @@ struct timespec timespec_div(struct timespec ts, int 
> divisor);
>  enum vm_mem_backing_src_type {
> VM_MEM_SRC_ANONYMOUS,
> VM_MEM_SRC_ANONYMOUS_THP,
> -   VM_MEM_SRC_ANONYMOUS_HUGETLB,

I apologize I didn't catch this in v2, but it looks like this patch
removes a default hugetlb size option. I could see this being
intentional if we want to force developers to think about there being
multiple page sizes, but it might also be nice for folks to have an
option to use the system default hugepage size.
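
Something like the following is what I have in mind (illustrative only;
the entry shape assumes the name/flag aliases this patch introduces, and
it relies on MAP_HUGETLB without a size encoding selecting the system
default hugepage size):

	/* in enum vm_mem_backing_src_type */
	VM_MEM_SRC_ANONYMOUS_HUGETLB,	/* system default hugepage size */

	/* in the backing src alias table */
	{"anonymous_hugetlb", MAP_HUGETLB},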

Otherwise, this series looks good to me. Please feel free to add
Reviewed-by: Ben Gardon .

> +   VM_MEM_SRC_ANONYMOUS_HUGETLB_16KB,
> +   VM_MEM_SRC_ANONYMOUS_HUGETLB_64KB,
> +   VM_MEM_SRC_ANONYMOUS_HUGETLB_512KB,
> +   VM_MEM_SRC_ANONYMOUS_HUGETLB_1MB,
> +   VM_MEM_SRC_ANONYMOUS_HUGETLB_2MB,
> +   VM_MEM_SRC_ANONYMOUS_HUGETLB_8MB,
> +   VM_MEM_SRC_ANONYMOUS_HUGETLB_16MB,
> +   VM_MEM_SRC_ANONYMOUS_HUGETLB_32MB,
> +   VM_MEM_SRC_ANONYMOUS_HUGETLB_256MB,
> +   VM_MEM_SRC_ANONYMOUS_HUGETLB_512MB,
> +   VM_MEM_SRC_ANONYMOUS_HUGETLB_1GB,
> +   VM_MEM_SRC_ANONYMOUS_HUGETLB_2GB,
> +   VM_MEM_SRC_ANONYMOUS_HUGETLB_16GB,
> +   NUM_SRC_TYPES,
>  };
>
>  struct vm_mem_backing_src_alias {
> const char *name;
> -   enum vm_mem_backing_src_type type;
> +   uint32_t flag;
>  };
>
>  bool thp_configured(void);
>  size_t get_trans_hugepagesz(void);
> +const struct vm_mem_backing_src_alias *vm_mem_backing_src_alias(uint32_t i);
> +size_t get_backing_src_pagesz(uint32_t i);
>  void backing_src_help(void);
>  enum vm_mem_backing_src_type parse_backing_src_type(const char *type_name);
>
> diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c 
> b/tools/testing/selftests/kvm/lib/kvm_util.c
> index cc22c4ab7d67..b91c8e3a7ee1 100644
> --- a/tools/testing/selftests/kvm/lib/kvm_util.c
> +++ b/tools/testing/selftests/kvm/lib/kvm_util.c
> @@ -757,7 +757,7 @@ void vm_userspace_mem_region_add(struct kvm_vm *vm,
> region->mmap_start = mmap(NULL, region->mmap_size,
>   PROT_READ | PROT_WRITE,
>   MAP_PRIVATE | MAP_ANONYMOUS
> - | (src_type == VM_MEM_SRC_ANONYMOUS_HUGETLB 
> ? MAP_HUGETLB : 0),
> + | vm_mem_backing_src_alias(src_type)->flag,
>   -1, 0);
> TEST_ASSERT(region->mmap_start != MAP_FAILED,
> "test_malloc failed, mmap_start: %p errno: %i",
> diff --git a/tools/testing/selftests/kvm/lib/test_util.c 
> b/tools/testing/selftests/kvm/lib/test_util.c
> index f2d133f76c67..1f5e7241c80e 100644
> --- a/tools/testing/selftests/kvm/lib/test_util.c
> +++ b/tools/testing/selftests/kvm/lib/test_util.c
> @@ -11,6 +11,7 @@
>  #include 
>  #include 
>  #include 
> +#include 
>  #include "linux/kernel.h"
>
>  #include "test_util.h"
> @@ -112,12 +113,6 @@ void print_skip(const char *fmt, ...)
> puts(", skipping test");
>  }
>
> -const struct vm_mem_backing_src_alias backing_src_aliases[] = {
> -   {"anonymous", VM_MEM_SRC_ANONYMOUS,},
> -   {"anonymous_thp", VM_MEM_SRC_ANONYMOUS_THP,},
> -   {"anonymous_hugetlb", VM_MEM_SRC_ANONYMOUS_HUGETLB,},
> -};
> -
>  bool thp_configured(void)
>  {
> int ret;
> @@ -153,22 +148,61 @@ size_t get_trans_hugepa

Re: [RFC PATCH v2 0/7] Some improvement and a new test for kvm page table

2021-02-25 Thread Ben Gardon
On Wed, Feb 24, 2021 at 9:59 PM Yanan Wang  wrote:
>
> Hi,
> This v2 series can mainly include two parts.
> Based on kvm queue branch: 
> https://git.kernel.org/pub/scm/virt/kvm/kvm.git/log/?h=queue
> Links of v1: 
> https://lore.kernel.org/lkml/20210208090841.333724-1-wangyana...@huawei.com/
>
> In the first part, all the known hugetlb backing src types specified
> with different hugepage sizes are listed, so that we can specify use
> of hugetlb source of the exact granularity that we want, instead of
> the system default ones. And as all the known hugetlb page sizes are
> listed, it's appropriate for all architectures. Besides, a helper that
> can get the granularity of different backing src types (anonymous/thp/hugetlb)
> is added, so that we can use the accurate backing src granularity for
> kinds of alignment or guest memory accessing of vcpus.
>
> In the second part, a new test is added:
> This test is added to serve as a performance tester and a bug reproducer
> for kvm page table code (GPA->HPA mappings), it gives guidance for the
> people trying to make some improvement for kvm. And the following explains
> what we can exactly do through this test.
>
> The function guest_code() can cover the conditions where a single vcpu or
> multiple vcpus access guest pages within the same memory region, in three
> VM stages(before dirty logging, during dirty logging, after dirty logging).
> Besides, the backing src memory type(ANONYMOUS/THP/HUGETLB) of the tested
> memory region can be specified by users, which means normal page mappings
> or block mappings can be chosen by users to be created in the test.
>
> If ANONYMOUS memory is specified, kvm will create normal page mappings
> for the tested memory region before dirty logging, and update attributes
> of the page mappings from RO to RW during dirty logging. If THP/HUGETLB
> memory is specified, kvm will create block mappings for the tested memory
> region before dirty logging, and split the block mappings into normal page
> mappings during dirty logging, and coalesce the page mappings back into
> block mappings after dirty logging is stopped.
>
> So in summary, as a performance tester, this test can present the
> performance of kvm creating/updating normal page mappings, or the
> performance of kvm creating/splitting/recovering block mappings,
> through execution time.
>
> When we need to coalesce the page mappings back to block mappings after
> dirty logging is stopped, we have to firstly invalidate *all* the TLB
> entries for the page mappings right before installation of the block entry,
> because a TLB conflict abort error could occur if we can't invalidate the
> TLB entries fully. We have hit this TLB conflict twice on the aarch64 software
> implementation and fixed it. As this test can simulate the process from dirty
> logging enabled to dirty logging stopped for a VM with block mappings,
> it can also reproduce this TLB conflict abort due to inadequate TLB
> invalidation when coalescing tables.
>
> Links about the TLB conflict abort:
> https://lore.kernel.org/lkml/20201201201034.116760-3-wangyana...@huawei.com/

Besides a few style / readability comments, this series looks good to
me. Thanks for generalizing the way these selftests handle different
hugeTLB sizes!


>
> Yanan Wang (7):
>   tools include: sync head files of mmap flag encodings about hugetlb
>   KVM: selftests: Use flag CLOCK_MONOTONIC_RAW for timing
>   KVM: selftests: Make a generic helper to get vm guest mode strings
>   KVM: selftests: Add a helper to get system configured THP page size
>   KVM: selftests: List all hugetlb src types specified with page sizes
>   KVM: selftests: Adapt vm_userspace_mem_region_add to new helpers
>   KVM: selftests: Add a test for kvm page table code
>
>  tools/include/asm-generic/hugetlb_encode.h|   3 +
>  tools/testing/selftests/kvm/Makefile  |   3 +
>  .../selftests/kvm/demand_paging_test.c|   8 +-
>  .../selftests/kvm/dirty_log_perf_test.c   |  14 +-
>  .../testing/selftests/kvm/include/kvm_util.h  |   4 +-
>  .../testing/selftests/kvm/include/test_util.h |  21 +-
>  .../selftests/kvm/kvm_page_table_test.c   | 476 ++
>  tools/testing/selftests/kvm/lib/kvm_util.c|  58 +--
>  tools/testing/selftests/kvm/lib/test_util.c   |  92 +++-
>  tools/testing/selftests/kvm/steal_time.c  |   4 +-
>  10 files changed, 623 insertions(+), 60 deletions(-)
>  create mode 100644 tools/testing/selftests/kvm/kvm_page_table_test.c
>
> --
> 2.19.1
>


Re: [RFC PATCH v2 6/7] KVM: selftests: Adapt vm_userspace_mem_region_add to new helpers

2021-02-25 Thread Ben Gardon
On Wed, Feb 24, 2021 at 10:03 PM Yanan Wang  wrote:
>
> With VM_MEM_SRC_ANONYMOUS_THP specified in vm_userspace_mem_region_add(),
> we have to get the transparent hugepage size for HVA alignment. With the
> new helpers, we can use get_backing_src_pagesz() to check whether THP is
> configured and then get the exact configured hugepage size.
>
> As different architectures may have different THP page sizes configured,
> this can get the accurate THP page sizes on any platform.
>
> Signed-off-by: Yanan Wang 
> ---
>  tools/testing/selftests/kvm/lib/kvm_util.c | 27 +++---
>  1 file changed, 8 insertions(+), 19 deletions(-)
>
> diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c 
> b/tools/testing/selftests/kvm/lib/kvm_util.c
> index b91c8e3a7ee1..0105fbfed036 100644
> --- a/tools/testing/selftests/kvm/lib/kvm_util.c
> +++ b/tools/testing/selftests/kvm/lib/kvm_util.c
> @@ -18,7 +18,6 @@
>  #include 
>  #include 
>
> -#define KVM_UTIL_PGS_PER_HUGEPG 512
>  #define KVM_UTIL_MIN_PFN   2
>
>  /* Aligns x up to the next multiple of size. Size must be a power of 2. */
> @@ -686,7 +685,7 @@ void vm_userspace_mem_region_add(struct kvm_vm *vm,
>  {
> int ret;
> struct userspace_mem_region *region;
> -   size_t huge_page_size = KVM_UTIL_PGS_PER_HUGEPG * vm->page_size;
> +   size_t backing_src_pagesz = get_backing_src_pagesz(src_type);
> size_t alignment;
>
> TEST_ASSERT(vm_adjust_num_guest_pages(vm->mode, npages) == npages,
> @@ -748,7 +747,7 @@ void vm_userspace_mem_region_add(struct kvm_vm *vm,
>  #endif
>
> if (src_type == VM_MEM_SRC_ANONYMOUS_THP)
> -   alignment = max(huge_page_size, alignment);
> +   alignment = max(backing_src_pagesz, alignment);
>
> /* Add enough memory to align up if necessary */
> if (alignment > 1)
> @@ -767,22 +766,12 @@ void vm_userspace_mem_region_add(struct kvm_vm *vm,
> region->host_mem = align(region->mmap_start, alignment);
>
> /* As needed perform madvise */
> -   if (src_type == VM_MEM_SRC_ANONYMOUS || src_type == 
> VM_MEM_SRC_ANONYMOUS_THP) {
> -   struct stat statbuf;
> -
> -   ret = stat("/sys/kernel/mm/transparent_hugepage", &statbuf);
> -   TEST_ASSERT(ret == 0 || (ret == -1 && errno == ENOENT),
> -   "stat /sys/kernel/mm/transparent_hugepage");
> -
> -   TEST_ASSERT(ret == 0 || src_type != VM_MEM_SRC_ANONYMOUS_THP,
> -   "VM_MEM_SRC_ANONYMOUS_THP requires THP to be 
> configured in the host kernel");
> -
> -   if (ret == 0) {
> -   ret = madvise(region->host_mem, npages * 
> vm->page_size,
> - src_type == VM_MEM_SRC_ANONYMOUS ? 
> MADV_NOHUGEPAGE : MADV_HUGEPAGE);
> -   TEST_ASSERT(ret == 0, "madvise failed, addr: %p 
> length: 0x%lx src_type: %x",
> -   region->host_mem, npages * vm->page_size, 
> src_type);
> -   }
> +   if (src_type <= VM_MEM_SRC_ANONYMOUS_THP && thp_configured()) {

This check relies on an unstated property of the backing src type
enums where VM_MEM_SRC_ANONYMOUS and VM_MEM_SRC_ANONYMOUS_THP are
declared first.
It would probably be more readable for folks if the check were explicit:

	if ((src_type == VM_MEM_SRC_ANONYMOUS ||
	     src_type == VM_MEM_SRC_ANONYMOUS_THP) && thp_configured()) {
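
Or, if the range check is kept for brevity, a compile-time note documenting
the ordering assumption would at least keep it from breaking silently, e.g.
(rough sketch, untested):

	/* The madvise() filter assumes the anonymous src types come first. */
	_Static_assert(VM_MEM_SRC_ANONYMOUS == 0 && VM_MEM_SRC_ANONYMOUS_THP == 1,
		       "anonymous backing src types must be declared first");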


> +   ret = madvise(region->host_mem, npages * vm->page_size,
> + src_type == VM_MEM_SRC_ANONYMOUS ? 
> MADV_NOHUGEPAGE : MADV_HUGEPAGE);
> +   TEST_ASSERT(ret == 0, "madvise failed, addr: %p length: 0x%lx 
> src_type: %s",
> +   region->host_mem, npages * vm->page_size,
> +   vm_mem_backing_src_alias(src_type)->name);
> }
>
> region->unused_phy_pages = sparsebit_alloc();
> --
> 2.19.1
>


Re: [RFC PATCH v2 5/7] KVM: selftests: List all hugetlb src types specified with page sizes

2021-02-25 Thread Ben Gardon
On Wed, Feb 24, 2021 at 10:03 PM Yanan Wang  wrote:
>
> With VM_MEM_SRC_ANONYMOUS_HUGETLB, we currently can only use system
> default hugetlb pages to back the testing guest memory. In order to
> add flexibility, now list all the known hugetlb backing src types with
> different page sizes, so that we can specify use of hugetlb pages of the
> exact granularity that we want. And as all the known hugetlb page sizes
> are listed, it's appropriate for all architectures.
>
> Besides, the helper get_backing_src_pagesz() is added to get the
> granularity of different backing src types (anonymous, THP, hugetlb).
>
> Signed-off-by: Yanan Wang 
> ---
>  .../testing/selftests/kvm/include/test_util.h | 19 ++-
>  tools/testing/selftests/kvm/lib/kvm_util.c|  2 +-
>  tools/testing/selftests/kvm/lib/test_util.c   | 56 +++
>  3 files changed, 63 insertions(+), 14 deletions(-)
>
> diff --git a/tools/testing/selftests/kvm/include/test_util.h 
> b/tools/testing/selftests/kvm/include/test_util.h
> index ef24c76ba89a..be5d08bcdca7 100644
> --- a/tools/testing/selftests/kvm/include/test_util.h
> +++ b/tools/testing/selftests/kvm/include/test_util.h
> @@ -70,16 +70,31 @@ struct timespec timespec_div(struct timespec ts, int 
> divisor);
>  enum vm_mem_backing_src_type {
> VM_MEM_SRC_ANONYMOUS,
> VM_MEM_SRC_ANONYMOUS_THP,
> -   VM_MEM_SRC_ANONYMOUS_HUGETLB,
> +   VM_MEM_SRC_ANONYMOUS_HUGETLB_16KB,
> +   VM_MEM_SRC_ANONYMOUS_HUGETLB_64KB,
> +   VM_MEM_SRC_ANONYMOUS_HUGETLB_512KB,
> +   VM_MEM_SRC_ANONYMOUS_HUGETLB_1MB,
> +   VM_MEM_SRC_ANONYMOUS_HUGETLB_2MB,
> +   VM_MEM_SRC_ANONYMOUS_HUGETLB_8MB,
> +   VM_MEM_SRC_ANONYMOUS_HUGETLB_16MB,
> +   VM_MEM_SRC_ANONYMOUS_HUGETLB_32MB,
> +   VM_MEM_SRC_ANONYMOUS_HUGETLB_256MB,
> +   VM_MEM_SRC_ANONYMOUS_HUGETLB_512MB,
> +   VM_MEM_SRC_ANONYMOUS_HUGETLB_1GB,
> +   VM_MEM_SRC_ANONYMOUS_HUGETLB_2GB,
> +   VM_MEM_SRC_ANONYMOUS_HUGETLB_16GB,
> +   NUM_SRC_TYPES,
>  };
>
>  struct vm_mem_backing_src_alias {
> const char *name;
> -   enum vm_mem_backing_src_type type;
> +   uint32_t flag;
>  };
>
>  bool thp_configured(void);
>  size_t get_trans_hugepagesz(void);
> +const struct vm_mem_backing_src_alias *vm_mem_backing_src_alias(uint32_t i);
> +size_t get_backing_src_pagesz(uint32_t i);
>  void backing_src_help(void);
>  enum vm_mem_backing_src_type parse_backing_src_type(const char *type_name);
>
> diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c 
> b/tools/testing/selftests/kvm/lib/kvm_util.c
> index cc22c4ab7d67..b91c8e3a7ee1 100644
> --- a/tools/testing/selftests/kvm/lib/kvm_util.c
> +++ b/tools/testing/selftests/kvm/lib/kvm_util.c
> @@ -757,7 +757,7 @@ void vm_userspace_mem_region_add(struct kvm_vm *vm,
> region->mmap_start = mmap(NULL, region->mmap_size,
>   PROT_READ | PROT_WRITE,
>   MAP_PRIVATE | MAP_ANONYMOUS
> - | (src_type == VM_MEM_SRC_ANONYMOUS_HUGETLB 
> ? MAP_HUGETLB : 0),
> + | vm_mem_backing_src_alias(src_type)->flag,
>   -1, 0);
> TEST_ASSERT(region->mmap_start != MAP_FAILED,
> "test_malloc failed, mmap_start: %p errno: %i",
> diff --git a/tools/testing/selftests/kvm/lib/test_util.c 
> b/tools/testing/selftests/kvm/lib/test_util.c
> index f2d133f76c67..6780aa058f35 100644
> --- a/tools/testing/selftests/kvm/lib/test_util.c
> +++ b/tools/testing/selftests/kvm/lib/test_util.c
> @@ -11,6 +11,7 @@
>  #include 
>  #include 
>  #include 
> +#include 
>  #include "linux/kernel.h"
>
>  #include "test_util.h"
> @@ -112,12 +113,6 @@ void print_skip(const char *fmt, ...)
> puts(", skipping test");
>  }
>
> -const struct vm_mem_backing_src_alias backing_src_aliases[] = {
> -   {"anonymous", VM_MEM_SRC_ANONYMOUS,},
> -   {"anonymous_thp", VM_MEM_SRC_ANONYMOUS_THP,},
> -   {"anonymous_hugetlb", VM_MEM_SRC_ANONYMOUS_HUGETLB,},
> -};
> -
>  bool thp_configured(void)
>  {
> int ret;
> @@ -153,22 +148,61 @@ size_t get_trans_hugepagesz(void)
> return size;
>  }
>
> +const struct vm_mem_backing_src_alias *vm_mem_backing_src_alias(uint32_t i)
> +{
> +   static const struct vm_mem_backing_src_alias aliases[] = {
> +   { "anonymous",   0},
> +   { "anonymous_thp",   0},
> +   { "anonymous_hugetlb_16kb",  MAP_HUGETLB | MAP_HUGE_16KB  },
> +   { "anonymous_hugetlb_64kb",  MAP_HUGETLB | MAP_HUGE_64KB  },
> +   { "anonymous_hugetlb_512kb", MAP_HUGETLB | MAP_HUGE_512KB },
> +   { "anonymous_hugetlb_1mb",   MAP_HUGETLB | MAP_HUGE_1MB   },
> +   { "anonymous_hugetlb_2mb",   MAP_HUGETLB | MAP_HUGE_2MB   },
> +   { "anonymous_hugetlb_8mb",   MAP_HUGETLB | MAP_HUGE_8MB   },
> +   

Re: [RFC PATCH v2 4/7] KVM: selftests: Add a helper to get system configured THP page size

2021-02-25 Thread Ben Gardon
On Wed, Feb 24, 2021 at 10:00 PM Yanan Wang  wrote:
>
> If we want to have some tests about transparent hugepages, the system
> configured THP hugepage size should be known by the tests, since it can
> be used for various kinds of alignment and for guest memory accesses by vcpus.
> So it makes sense to add a helper to get the transparent hugepage size.
>
> With VM_MEM_SRC_ANONYMOUS_THP specified in vm_userspace_mem_region_add(),
> we now stat /sys/kernel/mm/transparent_hugepage to check whether THP is
> configured in the host kernel before madvise(). Based on this, we can also
> read file /sys/kernel/mm/transparent_hugepage/hpage_pmd_size to get THP
> hugepage size.
>
> Signed-off-by: Yanan Wang 

Reviewed-by: Ben Gardon 

> ---
>  .../testing/selftests/kvm/include/test_util.h |  2 ++
>  tools/testing/selftests/kvm/lib/test_util.c   | 36 +++
>  2 files changed, 38 insertions(+)
>
> diff --git a/tools/testing/selftests/kvm/include/test_util.h 
> b/tools/testing/selftests/kvm/include/test_util.h
> index b7f41399f22c..ef24c76ba89a 100644
> --- a/tools/testing/selftests/kvm/include/test_util.h
> +++ b/tools/testing/selftests/kvm/include/test_util.h
> @@ -78,6 +78,8 @@ struct vm_mem_backing_src_alias {
> enum vm_mem_backing_src_type type;
>  };
>
> +bool thp_configured(void);
> +size_t get_trans_hugepagesz(void);
>  void backing_src_help(void);
>  enum vm_mem_backing_src_type parse_backing_src_type(const char *type_name);
>
> diff --git a/tools/testing/selftests/kvm/lib/test_util.c 
> b/tools/testing/selftests/kvm/lib/test_util.c
> index c7c0627c6842..f2d133f76c67 100644
> --- a/tools/testing/selftests/kvm/lib/test_util.c
> +++ b/tools/testing/selftests/kvm/lib/test_util.c
> @@ -10,6 +10,7 @@
>  #include 
>  #include 
>  #include 
> +#include 
>  #include "linux/kernel.h"
>
>  #include "test_util.h"
> @@ -117,6 +118,41 @@ const struct vm_mem_backing_src_alias 
> backing_src_aliases[] = {
> {"anonymous_hugetlb", VM_MEM_SRC_ANONYMOUS_HUGETLB,},
>  };
>
> +bool thp_configured(void)
> +{
> +   int ret;
> +   struct stat statbuf;
> +
> +   ret = stat("/sys/kernel/mm/transparent_hugepage", &statbuf);
> +   TEST_ASSERT(ret == 0 || (ret == -1 && errno == ENOENT),
> +   "Error in stating /sys/kernel/mm/transparent_hugepage: 
> %d",
> +   errno);
> +
> +   return ret == 0;
> +}
> +
> +size_t get_trans_hugepagesz(void)
> +{
> +   size_t size;
> +   char buf[16];
> +   FILE *f;
> +
> +   TEST_ASSERT(thp_configured(), "THP is not configured in host kernel");
> +
> +   f = fopen("/sys/kernel/mm/transparent_hugepage/hpage_pmd_size", "r");
> +   TEST_ASSERT(f != NULL,
> +   "Error in opening transparent_hugepage/hpage_pmd_size: 
> %d",
> +   errno);
> +
> +   if (fread(buf, sizeof(char), sizeof(buf), f) == 0) {
> +   fclose(f);
> +   TEST_FAIL("Unable to read 
> transparent_hugepage/hpage_pmd_size");
> +   }
> +
> +   size = strtoull(buf, NULL, 10);
> +   return size;
> +}
> +
>  void backing_src_help(void)
>  {
> int i;
> --
> 2.19.1
>


Re: [PATCH 09/15] KVM: selftests: Move per-VM GPA into perf_test_args

2021-02-11 Thread Ben Gardon
On Thu, Feb 11, 2021 at 7:58 AM Sean Christopherson  wrote:
>
> On Thu, Feb 11, 2021, Paolo Bonzini wrote:
> > On 11/02/21 02:56, Sean Christopherson wrote:
> > > > > +   pta->gpa = (vm_get_max_gfn(vm) - guest_num_pages) * 
> > > > > pta->guest_page_size;
> > > > > +   pta->gpa &= ~(pta->host_page_size - 1);
> > > > Also not related to this patch, but another case for align.
> > > >
> > > > >  if (backing_src == VM_MEM_SRC_ANONYMOUS_THP ||
> > > > >  backing_src == VM_MEM_SRC_ANONYMOUS_HUGETLB)
> > > > > -   guest_test_phys_mem &= ~(KVM_UTIL_HUGEPAGE_ALIGNMENT 
> > > > > - 1);
> > > > > -
> > > > > +   pta->gpa &= ~(KVM_UTIL_HUGEPAGE_ALIGNMENT - 1);
> > > > also align
> > > >
> > > > >   #ifdef __s390x__
> > > > >  /* Align to 1M (segment size) */
> > > > > -   guest_test_phys_mem &= ~((1 << 20) - 1);
> > > > > +   pta->gpa &= ~((1 << 20) - 1);
> > > > And here again (oof)
> > >
> > > Yep, I'll fix all these and the align() comment in v2.
> >
> > This is not exactly align in fact; it is x & ~y rather than (x + y) & ~y.
> > Are you going to introduce a round-down macro or is it a bug?  (I am
> > lazy...).
>
> Good question.  I, too, was lazy.  I didn't look at the guts of align() when I
> moved it, and I didn't look closely at Ben's suggestion.  I'll take a closer
> look today and make sure everything is doing what it's supposed to do.

Ooh, great point Paolo, that helper is indeed rounding up. My comment
in patch #2 was totally wrong. I forgot anyone would ever want to
round up. :/
My misunderstanding and the above use cases are probably good evidence
that it would be helpful to have both align_up and align_down helpers.
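
For reference, something like this is what I have in mind (untested
sketch, names are just placeholders):

	/* Round x up to the next multiple of size. Size must be a power of 2. */
	static inline uint64_t align_up(uint64_t x, uint64_t size)
	{
		uint64_t mask = size - 1;

		TEST_ASSERT(size != 0 && !(size & mask),
			    "size not a power of 2: %lu", size);
		return (x + mask) & ~mask;
	}

	/* Round x down to the previous multiple of size. Size must be a power of 2. */
	static inline uint64_t align_down(uint64_t x, uint64_t size)
	{
		uint64_t mask = size - 1;

		TEST_ASSERT(size != 0 && !(size & mask),
			    "size not a power of 2: %lu", size);
		return x & ~mask;
	}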


Re: [PATCH 11/15] KVM: selftests: Create VM with adjusted number of guest pages for perf tests

2021-02-10 Thread Ben Gardon
On Wed, Feb 10, 2021 at 3:06 PM Sean Christopherson  wrote:
>
> Use the already computed guest_num_pages when creating the so called
> extra VM pages for a perf test, and add a comment explaining why the
> pages are allocated as extra pages.
>
> Signed-off-by: Sean Christopherson 

Reviewed-by: Ben Gardon 

> ---
>  tools/testing/selftests/kvm/lib/perf_test_util.c | 9 ++---
>  1 file changed, 6 insertions(+), 3 deletions(-)
>
> diff --git a/tools/testing/selftests/kvm/lib/perf_test_util.c 
> b/tools/testing/selftests/kvm/lib/perf_test_util.c
> index 982a86c8eeaa..9b0cfdf10772 100644
> --- a/tools/testing/selftests/kvm/lib/perf_test_util.c
> +++ b/tools/testing/selftests/kvm/lib/perf_test_util.c
> @@ -71,9 +71,12 @@ struct kvm_vm *perf_test_create_vm(enum vm_guest_mode 
> mode, int vcpus,
> TEST_ASSERT(vcpu_memory_bytes % pta->guest_page_size == 0,
> "Guest memory size is not guest page size aligned.");
>
> -   vm = vm_create_with_vcpus(mode, vcpus,
> - (vcpus * vcpu_memory_bytes) / 
> pta->guest_page_size,
> - 0, guest_code, NULL);
> +   /*
> +* Pass guest_num_pages to populate the page tables for test memory.
> +* The memory is also added to memslot 0, but that's a benign side
> +* effect as KVM allows aliasing HVAs in memslots.
> +*/
> +   vm = vm_create_with_vcpus(mode, vcpus, 0, guest_num_pages, 
> guest_code, NULL);
> pta->vm = vm;
>
> /*
> --
> 2.30.0.478.g8a0d178c01-goog
>


Re: [PATCH 10/15] KVM: selftests: Remove perf_test_args.host_page_size

2021-02-10 Thread Ben Gardon
On Wed, Feb 10, 2021 at 3:06 PM Sean Christopherson  wrote:
>
> Remove perf_test_args.host_page_size and instead use getpagesize() so
> that it's somewhat obvious that, for tests that care about the host page
> size, they care about the system page size, not the hardware page size,
> e.g. that the logic is unchanged if hugepages are in play.
>
> No functional change intended.
>
> Signed-off-by: Sean Christopherson 

Reviewed-by: Ben Gardon 

> ---
>  tools/testing/selftests/kvm/demand_paging_test.c  | 8 
>  tools/testing/selftests/kvm/include/perf_test_util.h  | 1 -
>  tools/testing/selftests/kvm/lib/perf_test_util.c  | 6 ++
>  .../selftests/kvm/memslot_modification_stress_test.c  | 2 +-
>  4 files changed, 7 insertions(+), 10 deletions(-)
>
> diff --git a/tools/testing/selftests/kvm/demand_paging_test.c 
> b/tools/testing/selftests/kvm/demand_paging_test.c
> index 0cbf111e6c21..b937a65b0e6d 100644
> --- a/tools/testing/selftests/kvm/demand_paging_test.c
> +++ b/tools/testing/selftests/kvm/demand_paging_test.c
> @@ -83,7 +83,7 @@ static int handle_uffd_page_request(int uffd, uint64_t addr)
>
> copy.src = (uint64_t)guest_data_prototype;
> copy.dst = addr;
> -   copy.len = perf_test_args.host_page_size;
> +   copy.len = getpagesize();
> copy.mode = 0;
>
> clock_gettime(CLOCK_MONOTONIC, &start);
> @@ -100,7 +100,7 @@ static int handle_uffd_page_request(int uffd, uint64_t 
> addr)
> PER_PAGE_DEBUG("UFFDIO_COPY %d \t%ld ns\n", tid,
>timespec_to_ns(ts_diff));
> PER_PAGE_DEBUG("Paged in %ld bytes at 0x%lx from thread %d\n",
> -  perf_test_args.host_page_size, addr, tid);
> +  getpagesize(), addr, tid);
>
> return 0;
>  }
> @@ -271,10 +271,10 @@ static void run_test(enum vm_guest_mode mode, void *arg)
>
> perf_test_args.wr_fract = 1;
>
> -   guest_data_prototype = malloc(perf_test_args.host_page_size);
> +   guest_data_prototype = malloc(getpagesize());
> TEST_ASSERT(guest_data_prototype,
> "Failed to allocate buffer for guest data pattern");
> -   memset(guest_data_prototype, 0xAB, perf_test_args.host_page_size);
> +   memset(guest_data_prototype, 0xAB, getpagesize());
>
> vcpu_threads = malloc(nr_vcpus * sizeof(*vcpu_threads));
> TEST_ASSERT(vcpu_threads, "Memory allocation failed");
> diff --git a/tools/testing/selftests/kvm/include/perf_test_util.h 
> b/tools/testing/selftests/kvm/include/perf_test_util.h
> index cccf1c44bddb..223fe6b79a04 100644
> --- a/tools/testing/selftests/kvm/include/perf_test_util.h
> +++ b/tools/testing/selftests/kvm/include/perf_test_util.h
> @@ -28,7 +28,6 @@ struct perf_test_vcpu_args {
>
>  struct perf_test_args {
> struct kvm_vm *vm;
> -   uint64_t host_page_size;
> uint64_t gpa;
> uint64_t guest_page_size;
> int wr_fract;
> diff --git a/tools/testing/selftests/kvm/lib/perf_test_util.c 
> b/tools/testing/selftests/kvm/lib/perf_test_util.c
> index 03f125236021..982a86c8eeaa 100644
> --- a/tools/testing/selftests/kvm/lib/perf_test_util.c
> +++ b/tools/testing/selftests/kvm/lib/perf_test_util.c
> @@ -57,8 +57,6 @@ struct kvm_vm *perf_test_create_vm(enum vm_guest_mode mode, 
> int vcpus,
>
> pr_info("Testing guest mode: %s\n", vm_guest_mode_string(mode));
>
> -   pta->host_page_size = getpagesize();
> -
> /*
>  * Snapshot the non-huge page size.  This is used by the guest code to
>  * access/dirty pages at the logging granularity.
> @@ -68,7 +66,7 @@ struct kvm_vm *perf_test_create_vm(enum vm_guest_mode mode, 
> int vcpus,
> guest_num_pages = vm_adjust_num_guest_pages(mode,
> (vcpus * vcpu_memory_bytes) / 
> pta->guest_page_size);
>
> -   TEST_ASSERT(vcpu_memory_bytes % pta->host_page_size == 0,
> +   TEST_ASSERT(vcpu_memory_bytes % getpagesize() == 0,
> "Guest memory size is not host page size aligned.");
> TEST_ASSERT(vcpu_memory_bytes % pta->guest_page_size == 0,
> "Guest memory size is not guest page size aligned.");
> @@ -88,7 +86,7 @@ struct kvm_vm *perf_test_create_vm(enum vm_guest_mode mode, 
> int vcpus,
> guest_num_pages, vm_get_max_gfn(vm), vcpus, 
> vcpu_memory_bytes);
>
> pta->gpa = (vm_get_max_gfn(vm) - guest_num_pages) * 
> pta->guest_page_size;
> -   pta->gpa &= ~(pta->host_page_size - 1);
> +   pta->gpa &= ~(getpagesize() - 1);

Re: [PATCH 07/15] KVM: selftests: Capture per-vCPU GPA in perf_test_vcpu_args

2021-02-10 Thread Ben Gardon
On Wed, Feb 10, 2021 at 3:06 PM Sean Christopherson  wrote:
>
> Capture the per-vCPU GPA in perf_test_vcpu_args so that tests can get
> the GPA without having to calculate the GPA on their own.
>
> No functional change intended.
>
> Signed-off-by: Sean Christopherson 

Reviewed-by: Ben Gardon 

> ---
>  tools/testing/selftests/kvm/include/perf_test_util.h | 1 +
>  tools/testing/selftests/kvm/lib/perf_test_util.c | 9 -
>  2 files changed, 5 insertions(+), 5 deletions(-)
>
> diff --git a/tools/testing/selftests/kvm/include/perf_test_util.h 
> b/tools/testing/selftests/kvm/include/perf_test_util.h
> index 005f2143adeb..4d53238b139f 100644
> --- a/tools/testing/selftests/kvm/include/perf_test_util.h
> +++ b/tools/testing/selftests/kvm/include/perf_test_util.h
> @@ -18,6 +18,7 @@
>  #define PERF_TEST_MEM_SLOT_INDEX   1
>
>  struct perf_test_vcpu_args {
> +   uint64_t gpa;
> uint64_t gva;
> uint64_t pages;
>
> diff --git a/tools/testing/selftests/kvm/lib/perf_test_util.c 
> b/tools/testing/selftests/kvm/lib/perf_test_util.c
> index 73b0fccc28b9..f22ce1836547 100644
> --- a/tools/testing/selftests/kvm/lib/perf_test_util.c
> +++ b/tools/testing/selftests/kvm/lib/perf_test_util.c
> @@ -127,7 +127,6 @@ void perf_test_setup_vcpus(struct kvm_vm *vm, int vcpus,
>bool partition_vcpu_memory_access)
>  {
> struct perf_test_args *pta = &perf_test_args;
> -   vm_paddr_t vcpu_gpa;
> struct perf_test_vcpu_args *vcpu_args;
> int vcpu_id;
>
> @@ -140,17 +139,17 @@ void perf_test_setup_vcpus(struct kvm_vm *vm, int vcpus,
>  (vcpu_id * vcpu_memory_bytes);
> vcpu_args->pages = vcpu_memory_bytes /
>pta->guest_page_size;
> -   vcpu_gpa = guest_test_phys_mem +
> -  (vcpu_id * vcpu_memory_bytes);
> +   vcpu_args->gpa = guest_test_phys_mem +
> +(vcpu_id * vcpu_memory_bytes);
> } else {
> vcpu_args->gva = guest_test_virt_mem;
> vcpu_args->pages = (vcpus * vcpu_memory_bytes) /
>pta->guest_page_size;
> -   vcpu_gpa = guest_test_phys_mem;
> +   vcpu_args->gpa = guest_test_phys_mem;
> }
>
> pr_debug("Added VCPU %d with test mem gpa [%lx, %lx)\n",
> -vcpu_id, vcpu_gpa, vcpu_gpa +
> +vcpu_id, vcpu_args->gpa, vcpu_args->gpa +
>  (vcpu_args->pages * pta->guest_page_size));
> }
>  }
> --
> 2.30.0.478.g8a0d178c01-goog
>


Re: [PATCH 08/15] KVM: selftests: Use perf util's per-vCPU GPA/pages in demand paging test

2021-02-10 Thread Ben Gardon
On Wed, Feb 10, 2021 at 3:06 PM Sean Christopherson  wrote:
>
> Grab the per-vCPU GPA and number of pages from perf_util in the demand
> paging test instead of duplicating perf_util's calculations.
>
> Note, this may or may not result in a functional change.  It's not clear
> that the test's calculations are guaranteed to yield the same value as
> perf_util, e.g. if guest_percpu_mem_size != vcpu_args->pages.
>
> Signed-off-by: Sean Christopherson 

Reviewed-by: Ben Gardon 

> ---
>  .../selftests/kvm/demand_paging_test.c| 20 +--
>  1 file changed, 5 insertions(+), 15 deletions(-)
>
> diff --git a/tools/testing/selftests/kvm/demand_paging_test.c 
> b/tools/testing/selftests/kvm/demand_paging_test.c
> index 5f7a229c3af1..0cbf111e6c21 100644
> --- a/tools/testing/selftests/kvm/demand_paging_test.c
> +++ b/tools/testing/selftests/kvm/demand_paging_test.c
> @@ -294,24 +294,13 @@ static void run_test(enum vm_guest_mode mode, void *arg)
> TEST_ASSERT(pipefds, "Unable to allocate memory for pipefd");
>
> for (vcpu_id = 0; vcpu_id < nr_vcpus; vcpu_id++) {
> -   vm_paddr_t vcpu_gpa;
> +   struct perf_test_vcpu_args *vcpu_args;
> void *vcpu_hva;
> -   uint64_t vcpu_mem_size;
>
> -
> -   if (p->partition_vcpu_memory_access) {
> -   vcpu_gpa = guest_test_phys_mem +
> -  (vcpu_id * guest_percpu_mem_size);
> -   vcpu_mem_size = guest_percpu_mem_size;
> -   } else {
> -   vcpu_gpa = guest_test_phys_mem;
> -   vcpu_mem_size = guest_percpu_mem_size * 
> nr_vcpus;
> -   }
> -   PER_VCPU_DEBUG("Added VCPU %d with test mem gpa [%lx, 
> %lx)\n",
> -  vcpu_id, vcpu_gpa, vcpu_gpa + 
> vcpu_mem_size);
> +   vcpu_args = &perf_test_args.vcpu_args[vcpu_id];
>
> /* Cache the HVA pointer of the region */
> -   vcpu_hva = addr_gpa2hva(vm, vcpu_gpa);
> +   vcpu_hva = addr_gpa2hva(vm, vcpu_args->gpa);
>
> /*
>  * Set up user fault fd to handle demand paging
> @@ -325,7 +314,8 @@ static void run_test(enum vm_guest_mode mode, void *arg)
> 
> &uffd_handler_threads[vcpu_id],
> pipefds[vcpu_id * 2],
> p->uffd_delay, 
> &uffd_args[vcpu_id],
> -   vcpu_hva, vcpu_mem_size);
> +   vcpu_hva,
> +   vcpu_args->pages * 
> perf_test_args.guest_page_size);
> if (r < 0)
> exit(-r);
> }
> --
> 2.30.0.478.g8a0d178c01-goog
>


Re: [PATCH 09/15] KVM: selftests: Move per-VM GPA into perf_test_args

2021-02-10 Thread Ben Gardon
On Wed, Feb 10, 2021 at 3:06 PM Sean Christopherson  wrote:
>
> Move the per-VM GPA into perf_test_args instead of storing it as a
> separate global variable.  It's not obvious that guest_test_phys_mem
> holds a GPA, nor that it's connected/coupled with per_vcpu->gpa.
>
> No functional change intended.
>
> Signed-off-by: Sean Christopherson 

Reviewed-by: Ben Gardon 

> ---
>  .../selftests/kvm/include/perf_test_util.h|  8 +-
>  .../selftests/kvm/lib/perf_test_util.c| 28 ---
>  .../kvm/memslot_modification_stress_test.c|  2 +-
>  3 files changed, 13 insertions(+), 25 deletions(-)
>
> diff --git a/tools/testing/selftests/kvm/include/perf_test_util.h 
> b/tools/testing/selftests/kvm/include/perf_test_util.h
> index 4d53238b139f..cccf1c44bddb 100644
> --- a/tools/testing/selftests/kvm/include/perf_test_util.h
> +++ b/tools/testing/selftests/kvm/include/perf_test_util.h
> @@ -29,6 +29,7 @@ struct perf_test_vcpu_args {
>  struct perf_test_args {
> struct kvm_vm *vm;
> uint64_t host_page_size;
> +   uint64_t gpa;
> uint64_t guest_page_size;
> int wr_fract;
>
> @@ -37,13 +38,6 @@ struct perf_test_args {
>
>  extern struct perf_test_args perf_test_args;
>
> -/*
> - * Guest physical memory offset of the testing memory slot.
> - * This will be set to the topmost valid physical address minus
> - * the test memory size.
> - */
> -extern uint64_t guest_test_phys_mem;
> -
>  struct kvm_vm *perf_test_create_vm(enum vm_guest_mode mode, int vcpus,
>uint64_t vcpu_memory_bytes,
>enum vm_mem_backing_src_type backing_src);
> diff --git a/tools/testing/selftests/kvm/lib/perf_test_util.c 
> b/tools/testing/selftests/kvm/lib/perf_test_util.c
> index f22ce1836547..03f125236021 100644
> --- a/tools/testing/selftests/kvm/lib/perf_test_util.c
> +++ b/tools/testing/selftests/kvm/lib/perf_test_util.c
> @@ -9,8 +9,6 @@
>
>  struct perf_test_args perf_test_args;
>
> -uint64_t guest_test_phys_mem;
> -
>  /*
>   * Guest virtual memory offset of the testing memory slot.
>   * Must not conflict with identity mapped test code.
> @@ -87,29 +85,25 @@ struct kvm_vm *perf_test_create_vm(enum vm_guest_mode 
> mode, int vcpus,
> TEST_ASSERT(guest_num_pages < vm_get_max_gfn(vm),
> "Requested more guest memory than address space allows.\n"
> "guest pages: %lx max gfn: %x vcpus: %d wss: %lx]\n",
> -   guest_num_pages, vm_get_max_gfn(vm), vcpus,
> -   vcpu_memory_bytes);
> +   guest_num_pages, vm_get_max_gfn(vm), vcpus, 
> vcpu_memory_bytes);
>
> -   guest_test_phys_mem = (vm_get_max_gfn(vm) - guest_num_pages) *
> - pta->guest_page_size;
> -   guest_test_phys_mem &= ~(pta->host_page_size - 1);
> +   pta->gpa = (vm_get_max_gfn(vm) - guest_num_pages) * 
> pta->guest_page_size;
> +   pta->gpa &= ~(pta->host_page_size - 1);

Also not related to this patch, but another case for align.

> if (backing_src == VM_MEM_SRC_ANONYMOUS_THP ||
> backing_src == VM_MEM_SRC_ANONYMOUS_HUGETLB)
> -   guest_test_phys_mem &= ~(KVM_UTIL_HUGEPAGE_ALIGNMENT - 1);
> -
> +   pta->gpa &= ~(KVM_UTIL_HUGEPAGE_ALIGNMENT - 1);

also align

>  #ifdef __s390x__
> /* Align to 1M (segment size) */
> -   guest_test_phys_mem &= ~((1 << 20) - 1);
> +   pta->gpa &= ~((1 << 20) - 1);

And here again (oof)

>  #endif
> -   pr_info("guest physical test memory offset: 0x%lx\n", 
> guest_test_phys_mem);
> +   pr_info("guest physical test memory offset: 0x%lx\n", pta->gpa);
>
> /* Add an extra memory slot for testing */
> -   vm_userspace_mem_region_add(vm, backing_src, guest_test_phys_mem,
> -   PERF_TEST_MEM_SLOT_INDEX,
> -   guest_num_pages, 0);
> +   vm_userspace_mem_region_add(vm, backing_src, pta->gpa,
> +   PERF_TEST_MEM_SLOT_INDEX, 
> guest_num_pages, 0);
>
> /* Do mapping for the demand paging memory slot */
> -   virt_map(vm, guest_test_virt_mem, guest_test_phys_mem, 
> guest_num_pages, 0);
> +   virt_map(vm, guest_test_virt_mem, pta->gpa, guest_num_pages, 0);
>
> ucall_init(vm, NULL);
>
> @@ -139,13 +133,13 @@ void perf_test_setup_vcpus(struct kvm_vm *vm, int vcpus,
>  (vcpu_id * vcpu_memory_bytes);
> vcpu_ar

Re: [PATCH 06/15] KVM: selftests: Use shorthand local var to access struct perf_tests_args

2021-02-10 Thread Ben Gardon
On Wed, Feb 10, 2021 at 3:06 PM Sean Christopherson  wrote:
>
> Use 'pta' as a local pointer to the global perf_tests_args in order to
> shorten line lengths and make the code borderline readable.
>
> No functional change intended.
>
> Signed-off-by: Sean Christopherson 

Reviewed-by: Ben Gardon 

> ---
>  .../selftests/kvm/lib/perf_test_util.c| 36 ++-
>  1 file changed, 19 insertions(+), 17 deletions(-)
>
> diff --git a/tools/testing/selftests/kvm/lib/perf_test_util.c 
> b/tools/testing/selftests/kvm/lib/perf_test_util.c
> index f187b86f2e14..73b0fccc28b9 100644
> --- a/tools/testing/selftests/kvm/lib/perf_test_util.c
> +++ b/tools/testing/selftests/kvm/lib/perf_test_util.c
> @@ -23,7 +23,8 @@ static uint64_t guest_test_virt_mem = 
> DEFAULT_GUEST_TEST_MEM;
>   */
>  static void guest_code(uint32_t vcpu_id)
>  {
> -   struct perf_test_vcpu_args *vcpu_args = 
> &perf_test_args.vcpu_args[vcpu_id];
> +   struct perf_test_args *pta = &perf_test_args;
> +   struct perf_test_vcpu_args *vcpu_args = &pta->vcpu_args[vcpu_id];
> uint64_t gva;
> uint64_t pages;
> int i;
> @@ -36,9 +37,9 @@ static void guest_code(uint32_t vcpu_id)
>
> while (true) {
> for (i = 0; i < pages; i++) {
> -   uint64_t addr = gva + (i * 
> perf_test_args.guest_page_size);
> +   uint64_t addr = gva + (i * pta->guest_page_size);
>
> -   if (i % perf_test_args.wr_fract == 0)
> +   if (i % pta->wr_fract == 0)
> *(uint64_t *)addr = 0x0123456789ABCDEF;
> else
> READ_ONCE(*(uint64_t *)addr);
> @@ -52,32 +53,32 @@ struct kvm_vm *perf_test_create_vm(enum vm_guest_mode 
> mode, int vcpus,
>uint64_t vcpu_memory_bytes,
>enum vm_mem_backing_src_type backing_src)
>  {
> +   struct perf_test_args *pta = &perf_test_args;
> struct kvm_vm *vm;
> uint64_t guest_num_pages;
>
> pr_info("Testing guest mode: %s\n", vm_guest_mode_string(mode));
>
> -   perf_test_args.host_page_size = getpagesize();
> +   pta->host_page_size = getpagesize();
>
> /*
>  * Snapshot the non-huge page size.  This is used by the guest code to
>  * access/dirty pages at the logging granularity.
>  */
> -   perf_test_args.guest_page_size = vm_guest_mode_params[mode].page_size;
> +   pta->guest_page_size = vm_guest_mode_params[mode].page_size;
>
> guest_num_pages = vm_adjust_num_guest_pages(mode,
> -   (vcpus * vcpu_memory_bytes) / 
> perf_test_args.guest_page_size);
> +   (vcpus * vcpu_memory_bytes) / 
> pta->guest_page_size);
>
> -   TEST_ASSERT(vcpu_memory_bytes % perf_test_args.host_page_size == 0,
> +   TEST_ASSERT(vcpu_memory_bytes % pta->host_page_size == 0,
> "Guest memory size is not host page size aligned.");
> -   TEST_ASSERT(vcpu_memory_bytes % perf_test_args.guest_page_size == 0,
> +   TEST_ASSERT(vcpu_memory_bytes % pta->guest_page_size == 0,
> "Guest memory size is not guest page size aligned.");
>
> vm = vm_create_with_vcpus(mode, vcpus,
> - (vcpus * vcpu_memory_bytes) / 
> perf_test_args.guest_page_size,
> + (vcpus * vcpu_memory_bytes) / 
> pta->guest_page_size,
>   0, guest_code, NULL);
> -
> -   perf_test_args.vm = vm;
> +   pta->vm = vm;
>
> /*
>  * If there should be more memory in the guest test region than there
> @@ -90,8 +91,8 @@ struct kvm_vm *perf_test_create_vm(enum vm_guest_mode mode, 
> int vcpus,
> vcpu_memory_bytes);
>
> guest_test_phys_mem = (vm_get_max_gfn(vm) - guest_num_pages) *
> - perf_test_args.guest_page_size;
> -   guest_test_phys_mem &= ~(perf_test_args.host_page_size - 1);
> + pta->guest_page_size;
> +   guest_test_phys_mem &= ~(pta->host_page_size - 1);

Not really germane to this patch, but the align macro could be used
here as well.

> if (backing_src == VM_MEM_SRC_ANONYMOUS_THP ||
> backing_src == VM_MEM_SRC_ANONYMOUS_HUGETLB)
> guest_test_phys_mem &= ~(KVM_UTIL_HUGEPAGE_ALIGNMENT - 1);
> @@ -125,30 +126,31 @@ void perf_test_setup_vcpus(struct kvm_vm *vm, int vcpus,
&

Re: [PATCH 05/15] KVM: selftests: Require GPA to be aligned when backed by hugepages

2021-02-10 Thread Ben Gardon
On Wed, Feb 10, 2021 at 3:06 PM Sean Christopherson  wrote:
>
> Assert that the GPA for a memslot backed by a hugepage is 1gb aligned,
> and fix perf_test_util accordingly.  Lack of GPA alignment prevents KVM
> from backing the guest with hugepages, e.g. x86's write-protection of
> hugepages when dirty logging is activated is otherwise not exercised.
>
> Add a comment explaining that guest_page_size is for non-huge pages to
> try and avoid confusion about what it actually tracks.
>
> Cc: Ben Gardon 
> Cc: Yanan Wang 
> Cc: Andrew Jones 
> Cc: Peter Xu 
> Cc: Aaron Lewis 
> Signed-off-by: Sean Christopherson 
> ---
>  tools/testing/selftests/kvm/lib/kvm_util.c   | 2 ++
>  tools/testing/selftests/kvm/lib/perf_test_util.c | 9 +
>  2 files changed, 11 insertions(+)
>
> diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c 
> b/tools/testing/selftests/kvm/lib/kvm_util.c
> index 2e497fbab6ae..855d20784ba7 100644
> --- a/tools/testing/selftests/kvm/lib/kvm_util.c
> +++ b/tools/testing/selftests/kvm/lib/kvm_util.c
> @@ -735,6 +735,8 @@ void vm_userspace_mem_region_add(struct kvm_vm *vm,
> else
> ASSERT_EQ(src_type, VM_MEM_SRC_ANONYMOUS);
>
> +   ASSERT_EQ(guest_paddr, align(guest_paddr, alignment));
> +
> /* Add enough memory to align up if necessary */
> if (alignment > 1)
> region->mmap_size += alignment;
> diff --git a/tools/testing/selftests/kvm/lib/perf_test_util.c 
> b/tools/testing/selftests/kvm/lib/perf_test_util.c
> index 81490b9b4e32..f187b86f2e14 100644
> --- a/tools/testing/selftests/kvm/lib/perf_test_util.c
> +++ b/tools/testing/selftests/kvm/lib/perf_test_util.c
> @@ -58,6 +58,11 @@ struct kvm_vm *perf_test_create_vm(enum vm_guest_mode 
> mode, int vcpus,
> pr_info("Testing guest mode: %s\n", vm_guest_mode_string(mode));
>
> perf_test_args.host_page_size = getpagesize();
> +
> +   /*
> +* Snapshot the non-huge page size.  This is used by the guest code to
> +* access/dirty pages at the logging granularity.
> +*/
> perf_test_args.guest_page_size = vm_guest_mode_params[mode].page_size;
>
> guest_num_pages = vm_adjust_num_guest_pages(mode,
> @@ -87,6 +92,10 @@ struct kvm_vm *perf_test_create_vm(enum vm_guest_mode 
> mode, int vcpus,
> guest_test_phys_mem = (vm_get_max_gfn(vm) - guest_num_pages) *
>   perf_test_args.guest_page_size;
> guest_test_phys_mem &= ~(perf_test_args.host_page_size - 1);
> +   if (backing_src == VM_MEM_SRC_ANONYMOUS_THP ||
> +   backing_src == VM_MEM_SRC_ANONYMOUS_HUGETLB)
> +   guest_test_phys_mem &= ~(KVM_UTIL_HUGEPAGE_ALIGNMENT - 1);

You could use the align helper here as well. That would make this a
little easier for me to read.

> +
>  #ifdef __s390x__
> /* Align to 1M (segment size) */
> guest_test_phys_mem &= ~((1 << 20) - 1);
> --
> 2.30.0.478.g8a0d178c01-goog
>


Re: [PATCH 03/15] KVM: selftests: Align HVA for HugeTLB-backed memslots

2021-02-10 Thread Ben Gardon
On Wed, Feb 10, 2021 at 3:06 PM Sean Christopherson  wrote:
>
> Align the HVA for HugeTLB memslots, not just THP memslots.  Add an
> assert so any future backing types are forced to assess whether or not
> they need to be aligned.
>
> Cc: Ben Gardon 
> Cc: Yanan Wang 
> Cc: Andrew Jones 
> Cc: Peter Xu 
> Cc: Aaron Lewis 
> Signed-off-by: Sean Christopherson 

Reviewed-by: Ben Gardon 

> ---
>  tools/testing/selftests/kvm/lib/kvm_util.c | 5 -
>  1 file changed, 4 insertions(+), 1 deletion(-)
>
> diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c 
> b/tools/testing/selftests/kvm/lib/kvm_util.c
> index 584167c6dbc7..deaeb47b5a6d 100644
> --- a/tools/testing/selftests/kvm/lib/kvm_util.c
> +++ b/tools/testing/selftests/kvm/lib/kvm_util.c
> @@ -731,8 +731,11 @@ void vm_userspace_mem_region_add(struct kvm_vm *vm,
> alignment = 1;
>  #endif
>
> -   if (src_type == VM_MEM_SRC_ANONYMOUS_THP)
> +   if (src_type == VM_MEM_SRC_ANONYMOUS_THP ||
> +   src_type == VM_MEM_SRC_ANONYMOUS_HUGETLB)
> alignment = max(huge_page_size, alignment);
> +   else
> +   ASSERT_EQ(src_type, VM_MEM_SRC_ANONYMOUS);
>
> /* Add enough memory to align up if necessary */
> if (alignment > 1)
> --
> 2.30.0.478.g8a0d178c01-goog
>


Re: [PATCH 01/15] KVM: selftests: Explicitly state indicies for vm_guest_mode_params array

2021-02-10 Thread Ben Gardon
On Wed, Feb 10, 2021 at 3:06 PM Sean Christopherson  wrote:
>
> Explicitly state the indices when populating vm_guest_mode_params to
> make it marginally easier to visualize what's going on.
>
> No functional change intended.
>
> Signed-off-by: Sean Christopherson 

Reviewed-by: Ben Gardon 

> ---
>  tools/testing/selftests/kvm/lib/kvm_util.c | 14 +++---
>  1 file changed, 7 insertions(+), 7 deletions(-)
>
> diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c 
> b/tools/testing/selftests/kvm/lib/kvm_util.c
> index d787cb802b4a..960f4c5129ff 100644
> --- a/tools/testing/selftests/kvm/lib/kvm_util.c
> +++ b/tools/testing/selftests/kvm/lib/kvm_util.c
> @@ -154,13 +154,13 @@ _Static_assert(sizeof(vm_guest_mode_string)/sizeof(char 
> *) == NUM_VM_MODES,
>"Missing new mode strings?");
>
>  const struct vm_guest_mode_params vm_guest_mode_params[] = {
> -   { 52, 48,  0x1000, 12 },
> -   { 52, 48, 0x1, 16 },
> -   { 48, 48,  0x1000, 12 },
> -   { 48, 48, 0x1, 16 },
> -   { 40, 48,  0x1000, 12 },
> -   { 40, 48, 0x1, 16 },
> -   {  0,  0,  0x1000, 12 },
> +   [VM_MODE_P52V48_4K] = { 52, 48,  0x1000, 12 },
> +   [VM_MODE_P52V48_64K]= { 52, 48, 0x1, 16 },
> +   [VM_MODE_P48V48_4K] = { 48, 48,  0x1000, 12 },
> +   [VM_MODE_P48V48_64K]= { 48, 48, 0x1, 16 },
> +   [VM_MODE_P40V48_4K] = { 40, 48,  0x1000, 12 },
> +   [VM_MODE_P40V48_64K]= { 40, 48, 0x1, 16 },
> +   [VM_MODE_PXXV48_4K] = {  0,  0,  0x1000, 12 },
>  };
>  _Static_assert(sizeof(vm_guest_mode_params)/sizeof(struct 
> vm_guest_mode_params) == NUM_VM_MODES,
>"Missing new mode params?");
> --
> 2.30.0.478.g8a0d178c01-goog
>


Re: [PATCH 02/15] KVM: selftests: Expose align() helpers to tests

2021-02-10 Thread Ben Gardon
On Wed, Feb 10, 2021 at 3:06 PM Sean Christopherson  wrote:
>
> Refactor align() to work with non-pointers, add align_ptr() for use with
> pointers, and expose both helpers so that they can be used by tests
> and/or other utilities.  The align() helper in particular will be used
> to ensure gpa alignment for hugepages.
>
> No functional change intended.
>
> Signed-off-by: Sean Christopherson 

Reviewed-by: Ben Gardon 

> ---
>  tools/testing/selftests/kvm/include/kvm_util.h | 15 +++
>  tools/testing/selftests/kvm/lib/kvm_util.c | 11 +--
>  2 files changed, 16 insertions(+), 10 deletions(-)
>
> diff --git a/tools/testing/selftests/kvm/include/kvm_util.h 
> b/tools/testing/selftests/kvm/include/kvm_util.h
> index 2d7eb6989e83..4b5d2362a68a 100644
> --- a/tools/testing/selftests/kvm/include/kvm_util.h
> +++ b/tools/testing/selftests/kvm/include/kvm_util.h
> @@ -79,6 +79,21 @@ struct vm_guest_mode_params {
>  };
>  extern const struct vm_guest_mode_params vm_guest_mode_params[];
>
> +/* Aligns x up to the next multiple of size. Size must be a power of 2. */

It might also be worth updating this comment to clarify that the
function rounds down, not up.

> +static inline uint64_t align(uint64_t x, uint64_t size)
> +{
> +   uint64_t mask = size - 1;
> +
> +   TEST_ASSERT(size != 0 && !(size & (size - 1)),
> +   "size not a power of 2: %lu", size);
> +   return ((x + mask) & ~mask);
> +}
> +
> +static inline void *align_ptr(void *x, size_t size)
> +{
> +   return (void *)align((unsigned long)x, size);
> +}
> +
>  int kvm_check_cap(long cap);
>  int vm_enable_cap(struct kvm_vm *vm, struct kvm_enable_cap *cap);
>  int vcpu_enable_cap(struct kvm_vm *vm, uint32_t vcpu_id,
> diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c 
> b/tools/testing/selftests/kvm/lib/kvm_util.c
> index 960f4c5129ff..584167c6dbc7 100644
> --- a/tools/testing/selftests/kvm/lib/kvm_util.c
> +++ b/tools/testing/selftests/kvm/lib/kvm_util.c
> @@ -21,15 +21,6 @@
>  #define KVM_UTIL_PGS_PER_HUGEPG 512
>  #define KVM_UTIL_MIN_PFN   2
>
> -/* Aligns x up to the next multiple of size. Size must be a power of 2. */
> -static void *align(void *x, size_t size)
> -{
> -   size_t mask = size - 1;
> -   TEST_ASSERT(size != 0 && !(size & (size - 1)),
> -   "size not a power of 2: %lu", size);
> -   return (void *) (((size_t) x + mask) & ~mask);
> -}
> -
>  /*
>   * Capability
>   *
> @@ -757,7 +748,7 @@ void vm_userspace_mem_region_add(struct kvm_vm *vm,
> region->mmap_start, errno);
>
> /* Align host address */
> -   region->host_mem = align(region->mmap_start, alignment);
> +   region->host_mem = align_ptr(region->mmap_start, alignment);
>
> /* As needed perform madvise */
> if (src_type == VM_MEM_SRC_ANONYMOUS || src_type == 
> VM_MEM_SRC_ANONYMOUS_THP) {
> --
> 2.30.0.478.g8a0d178c01-goog
>


Re: [PATCH] locking/arch: Move qrwlock.h include after qspinlock.h

2021-02-10 Thread Ben Gardon
On Wed, Feb 10, 2021 at 7:54 AM Waiman Long  wrote:
>
> On 2/10/21 10:05 AM, Guenter Roeck wrote:
> > On 2/10/21 6:45 AM, Waiman Long wrote:
> >> The queued rwlock code has a dependency on the current spinlock
> >> implementation (likely to be qspinlock), but not vice versa. Including
> >> qrwlock.h before qspinlock.h can be problematic when expanding qrwlock
> >> functionality.
> >>
> >> If both qspinlock.h and qrwlock.h are to be included, the qrwlock.h
> >> include should always be after qspinlock.h. Update the current set of
> >> asm/spinlock.h files to enforce that.
> >>
> >> Signed-off-by: Waiman Long 
> > There should be a Fixes: tag here. If the SHA of the offending commit is not
> > stable, there should be a better reference than "The queued rwlock code".
> I originally had a Fixes tag when I was modifying the mips'
> asm/spinlock.h file. After I realized that there were more files to
> modify, I took that out. Anyway, the problem was exposed by Ben's
> qrwlock patch. So existing stable releases should still be fine without
> this patch.
> >
> > This patch fixes the build problem I had observed on mips. I also tested
> > xtensa:defconfig and arm64:defconfig with no problems observed.
> >
> > Tested-by: Guenter Roeck 
>
> Thanks for the testing as I don't have a build environment to verify that.
>
> Cheers,
> Longman
>

Thanks Longman and Guenter for developing and testing this fix! I
don't have the environment to test this either, but the patch looks
good to me.
Reviewed-by: Ben Gardon 


Re: [RFC PATCH 2/2] KVM: selftests: Add a test for kvm page table code

2021-02-09 Thread Ben Gardon
On Tue, Feb 9, 2021 at 1:43 AM wangyanan (Y)  wrote:
>
>
> On 2021/2/9 4:29, Ben Gardon wrote:
> > On Mon, Feb 8, 2021 at 1:08 AM Yanan Wang  wrote:
> >> This test serves as a performance tester and a bug reproducer for
> >> kvm page table code (GPA->HPA mappings), so it gives guidance for
> >> people trying to make some improvement for kvm.
> >>
> >> The function guest_code() is designed to cover conditions where a single 
> >> vcpu
> >> or multiple vcpus access guest pages within the same memory range, in three
> >> VM stages(before dirty-logging, during dirty-logging, after dirty-logging).
> >> Besides, the backing source memory type(ANONYMOUS/THP/HUGETLB) of the 
> >> tested
> >> memory region can be specified by users, which means normal page mappings 
> >> or
> >> block mappings can be chosen by users to be created in the test.
> >>
> >> If use of ANONYMOUS memory is specified, kvm will create page mappings for 
> >> the
> >> tested memory region before dirty-logging, and update attributes of the 
> >> page
> >> mappings from RO to RW during dirty-logging. If use of THP/HUGETLB memory 
> >> is
> >> specified, kvm will create block mappings for the tested memory region 
> >> before
> >> dirty-logging, and split the block mappings into page mappings during
> >> dirty-logging, and coalesce the page mappings back into block mappings 
> >> after
> >> dirty-logging is stopped.
> >>
> >> So in summary, as a performance tester, this test can present the 
> >> performance
> >> of kvm creating/updating normal page mappings, or the performance of kvm
> >> creating/splitting/recovering block mappings, through execution time.
> >>
> >> When we need to coalesce the page mappings back to block mappings after 
> >> dirty
> >> logging is stopped, we have to firstly invalidate *all* the TLB entries 
> >> for the
> >> page mappings right before installation of the block entry, because a TLB 
> >> conflict
> >> abort error could occur if we can't invalidate the TLB entries fully. We 
> >> have
> >> hit this TLB conflict twice on aarch64 software implementation and fixed 
> >> it.
> >> As this test can simulate the process from dirty-logging enabled to 
> >> dirty-logging
> >> stopped of a VM with block mappings, so it can also reproduce this TLB 
> >> conflict
> >> abort due to inadequate TLB invalidation when coalescing tables.
> >>
> >> Signed-off-by: Yanan Wang 
> > Thanks for sending this! Happy to see more tests for weird TLB
> > flushing edge cases and races.
> >
> > Just out of curiosity, were you unable to replicate the bug with the
> > dirty_log_perf_test and setting the wr_fract option?
> > With "KVM: selftests: Disable dirty logging with vCPUs running"
> > (https://lkml.org/lkml/2021/2/2/1431), the dirty_log_perf_test has
> > most of the same features as this one.
> > Please correct me if I'm wrong, but it seems like the major difference
> > here is a more careful pattern of which pages are dirtied when.
> >
> > Within Google we have a system for pre-specifying sets of arguments to
> > e.g. the dirty_log_perf_test. I wonder if something similar, even as
> > simple as a script that just runs dirty_log_perf_test several times
> > would be helpful for cases where different arguments are needed for
> > the test to cover different specific cases. Even with this test, for
> > example, I assume the test doesn't work very well with just 1 vCPU,
> > but it's still a good default in the test, so having some kind of
> > configuration (lite) file would be useful.
> >
> >> ---
> >>   tools/testing/selftests/kvm/Makefile  |   3 +
> >>   .../selftests/kvm/kvm_page_table_test.c   | 518 ++
> >>   2 files changed, 521 insertions(+)
> >>   create mode 100644 tools/testing/selftests/kvm/kvm_page_table_test.c
> >>
> >> diff --git a/tools/testing/selftests/kvm/Makefile 
> >> b/tools/testing/selftests/kvm/Makefile
> >> index fe41c6a0fa67..697318019bd4 100644
> >> --- a/tools/testing/selftests/kvm/Makefile
> >> +++ b/tools/testing/selftests/kvm/Makefile
> >> @@ -62,6 +62,7 @@ TEST_GEN_PROGS_x86_64 += x86_64/tsc_msrs_test
> >>   TEST_GEN_PROGS_x86_64 += demand_paging_test
> >>   TEST_GEN_PROGS_x86_64 += dirty_log_test
> >>   TEST_GEN_PROGS_x86_64 += dirty_log_perf_test

Re: [RFC PATCH 2/2] KVM: selftests: Add a test for kvm page table code

2021-02-09 Thread Ben Gardon
On Mon, Feb 8, 2021 at 11:22 PM wangyanan (Y)  wrote:
>
> Hi Ben,
>
> On 2021/2/9 4:29, Ben Gardon wrote:
> > On Mon, Feb 8, 2021 at 1:08 AM Yanan Wang  wrote:
> >> This test serves as a performance tester and a bug reproducer for
> >> kvm page table code (GPA->HPA mappings), so it gives guidance for
> >> people trying to make some improvement for kvm.
> >>
> >> The function guest_code() is designed to cover conditions where a single 
> >> vcpu
> >> or multiple vcpus access guest pages within the same memory range, in three
> >> VM stages(before dirty-logging, during dirty-logging, after dirty-logging).
> >> Besides, the backing source memory type(ANONYMOUS/THP/HUGETLB) of the 
> >> tested
> >> memory region can be specified by users, which means normal page mappings 
> >> or
> >> block mappings can be chosen by users to be created in the test.
> >>
> >> If use of ANONYMOUS memory is specified, kvm will create page mappings for 
> >> the
> >> tested memory region before dirty-logging, and update attributes of the 
> >> page
> >> mappings from RO to RW during dirty-logging. If use of THP/HUGETLB memory 
> >> is
> >> specified, kvm will create block mappings for the tested memory region 
> >> before
> >> dirty-logging, and split the block mappings into page mappings during
> >> dirty-logging, and coalesce the page mappings back into block mappings 
> >> after
> >> dirty-logging is stopped.
> >>
> >> So in summary, as a performance tester, this test can present the 
> >> performance
> >> of kvm creating/updating normal page mappings, or the performance of kvm
> >> creating/splitting/recovering block mappings, through execution time.
> >>
> >> When we need to coalesce the page mappings back to block mappings after 
> >> dirty
> >> logging is stopped, we have to firstly invalidate *all* the TLB entries 
> >> for the
> >> page mappings right before installation of the block entry, because a TLB 
> >> conflict
> >> abort error could occur if we can't invalidate the TLB entries fully. We 
> >> have
> >> hit this TLB conflict twice on aarch64 software implementation and fixed 
> >> it.
> >> As this test can simulate the process from dirty-logging enabled to 
> >> dirty-logging
> >> stopped of a VM with block mappings, so it can also reproduce this TLB 
> >> conflict
> >> abort due to inadequate TLB invalidation when coalescing tables.
> >>
> >> Signed-off-by: Yanan Wang 
> > Thanks for sending this! Happy to see more tests for weird TLB
> > flushing edge cases and races.
> >
> > Just out of curiosity, were you unable to replicate the bug with the
> > dirty_log_perf_test and setting the wr_fract option?
> > With "KVM: selftests: Disable dirty logging with vCPUs running"
> > (https://lkml.org/lkml/2021/2/2/1431), the dirty_log_perf_test has
> > most of the same features as this one.
> > Please correct me if I'm wrong, but it seems like the major difference
> > here is a more careful pattern of which pages are dirtied when.
> Actually the procedures in the KVM_UPDATE_MAPPINGS stage are specially
> designed to reproduce the TLB conflict bug. The following explains why.
> In x86 implementation, the related page mappings will be all destroyed
> in advance when
> stopping dirty logging while vcpus are still running. So after dirty
> logging is successfully
> stopped, there will certainly be page faults when accessing memory, and
> KVM will handle
> the faults and create block mappings once again. (Is this right?)
> So in this case, dirty_log_perf_test can replicate the bug theoretically.
>
> But there is a difference in the ARM implementation. The related page mappings
> will not be
> destroyed immediately when stopping dirty logging and will  be kept
> instead. And after
> dirty logging, KVM will destroy these mappings together with creation of
> block mappings
> when handling a guest fault (page fault or permission fault).  So based
> on guest_code() in
> dirty_log_perf_test, there will not be any page faults after dirty
> logging because all the
> page mappings have been created and KVM has no chance to recover block
> mappings
> at all. So this is why I left half of the pages clean and another half
> dirtied.

Ah okay, I'm sorry. I shouldn't have assumed that ARM does the same
thing as x86 when disabling dirty logging. It makes sense then why
your guest code is so carefully structured. Does that mean that if

Re: [RFC PATCH 1/2] KVM: selftests: Add a macro to get string of vm_mem_backing_src_type

2021-02-09 Thread Ben Gardon
On Tue, Feb 9, 2021 at 3:21 AM wangyanan (Y)  wrote:
>
>
> On 2021/2/9 2:13, Ben Gardon wrote:
> > On Mon, Feb 8, 2021 at 1:08 AM Yanan Wang  wrote:
> >> Add a macro to get string of the backing source memory type, so that
> >> application can add choices for source types in the help() function,
> >> and users can specify which type to use for testing.
> > Coincidentally, I sent out a change last week to do the same thing:
> > "KVM: selftests: Add backing src parameter to dirty_log_perf_test"
> > (https://lkml.org/lkml/2021/2/2/1430)
> > Whichever way this ends up being implemented, I'm happy to see others
> > interested in testing different backing source types too.
>
> Thanks Ben! I have a little question here.
>
> Can we just present three IDs (0/1/2) instead of strings for users to
> choose which backing_src_type to use, like the way guest modes are handled,

That would be fine with me. The string names are easier for me to read
than an ID number (especially if you were to add additional options
e.g. 1G hugetlb or file backed  / shared memory) but it's mostly an
aesthetic preference, so I don't have strong feelings either way.
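
For what it's worth, accepting either form wouldn't take much code. A
rough, untested sketch of how parse_backing_src_type() could do it
(reusing the strings/IDs from this patch; isdigit()/atoi() come from
ctype.h/stdlib.h):

	enum vm_mem_backing_src_type parse_backing_src_type(const char *type_name)
	{
		int i;

		/* Sketch only: accept a numeric ID as well as a type string. */
		if (isdigit(type_name[0])) {
			i = atoi(type_name);
			TEST_ASSERT(i >= 0 && i < NUM_VM_BACKING_SRC_TYPES,
				    "Unknown backing src type ID: %d", i);
			return i;
		}

		for (i = 0; i < NUM_VM_BACKING_SRC_TYPES; i++) {
			if (!strcmp(type_name, vm_mem_backing_src_type_string(i)))
				return i;
		}

		backing_src_help();
		TEST_FAIL("Unknown backing src type: %s", type_name);
		return -1;
	}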

>
> which I think can make cmdlines more concise and easier to print. And is
> it better to make a universal API to get backing_src_strings
>
> like Sean has suggested, so that the API can be used elsewhere?

Definitely. This should be as easy as possible to incorporate into all
selftests.

>
> >> Signed-off-by: Yanan Wang 
> >> ---
> >>   tools/testing/selftests/kvm/include/kvm_util.h | 3 +++
> >>   tools/testing/selftests/kvm/lib/kvm_util.c | 8 
> >>   2 files changed, 11 insertions(+)
> >>
> >> diff --git a/tools/testing/selftests/kvm/include/kvm_util.h 
> >> b/tools/testing/selftests/kvm/include/kvm_util.h
> >> index 5cbb861525ed..f5fc29dc9ee6 100644
> >> --- a/tools/testing/selftests/kvm/include/kvm_util.h
> >> +++ b/tools/testing/selftests/kvm/include/kvm_util.h
> >> @@ -69,7 +69,9 @@ enum vm_guest_mode {
> >>   #define PTES_PER_MIN_PAGE  ptes_per_page(MIN_PAGE_SIZE)
> >>
> >>   #define vm_guest_mode_string(m) vm_guest_mode_string[m]
> >> +#define vm_mem_backing_src_type_string(s) 
> >> vm_mem_backing_src_type_string[s]
> >>   extern const char * const vm_guest_mode_string[];
> >> +extern const char * const vm_mem_backing_src_type_string[];
> >>
> >>   struct vm_guest_mode_params {
> >>  unsigned int pa_bits;
> >> @@ -83,6 +85,7 @@ enum vm_mem_backing_src_type {
> >>  VM_MEM_SRC_ANONYMOUS,
> >>  VM_MEM_SRC_ANONYMOUS_THP,
> >>  VM_MEM_SRC_ANONYMOUS_HUGETLB,
> >> +   NUM_VM_BACKING_SRC_TYPES,
> >>   };
> >>
> >>   int kvm_check_cap(long cap);
> >> diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c 
> >> b/tools/testing/selftests/kvm/lib/kvm_util.c
> >> index fa5a90e6c6f0..a9b651c7f866 100644
> >> --- a/tools/testing/selftests/kvm/lib/kvm_util.c
> >> +++ b/tools/testing/selftests/kvm/lib/kvm_util.c
> >> @@ -165,6 +165,14 @@ const struct vm_guest_mode_params 
> >> vm_guest_mode_params[] = {
> >>   _Static_assert(sizeof(vm_guest_mode_params)/sizeof(struct 
> >> vm_guest_mode_params) == NUM_VM_MODES,
> >> "Missing new mode params?");
> >>
> >> +const char * const vm_mem_backing_src_type_string[] = {
> >> +   "VM_MEM_SRC_ANONYMOUS",
> >> +   "VM_MEM_SRC_ANONYMOUS_THP",
> >> +   "VM_MEM_SRC_ANONYMOUS_HUGETLB",
> >> +};
> >> +_Static_assert(sizeof(vm_mem_backing_src_type_string)/sizeof(char *) == 
> >> NUM_VM_BACKING_SRC_TYPES,
> >> +  "Missing new source type strings?");
> >> +
> >>   /*
> >>* VM Create
> >>*
> >> --
> >> 2.23.0
> >>
> > .


Re: [RFC PATCH 2/2] KVM: selftests: Add a test for kvm page table code

2021-02-08 Thread Ben Gardon
On Mon, Feb 8, 2021 at 1:08 AM Yanan Wang  wrote:
>
> This test serves as a performance tester and a bug reproducer for
> kvm page table code (GPA->HPA mappings), so it gives guidance for
> people trying to make some improvement for kvm.
>
> The function guest_code() is designed to cover conditions where a single vcpu
> or multiple vcpus access guest pages within the same memory range, in three
> VM stages(before dirty-logging, during dirty-logging, after dirty-logging).
> Besides, the backing source memory type(ANONYMOUS/THP/HUGETLB) of the tested
> memory region can be specified by users, which means normal page mappings or
> block mappings can be chosen by users to be created in the test.
>
> If use of ANONYMOUS memory is specified, kvm will create page mappings for the
> tested memory region before dirty-logging, and update attributes of the page
> mappings from RO to RW during dirty-logging. If use of THP/HUGETLB memory is
> specified, kvm will create block mappings for the tested memory region before
> dirty-logging, and split the block mappings into page mappings during
> dirty-logging, and coalesce the page mappings back into block mappings after
> dirty-logging is stopped.
>
> So in summary, as a performance tester, this test can present the performance
> of kvm creating/updating normal page mappings, or the performance of kvm
> creating/splitting/recovering block mappings, through execution time.
>
> When we need to coalesce the page mappings back to block mappings after dirty
> logging is stopped, we have to firstly invalidate *all* the TLB entries for 
> the
> page mappings right before installation of the block entry, because a TLB 
> conflict
> abort error could occur if we can't invalidate the TLB entries fully. We have
> hit this TLB conflict twice on aarch64 software implementation and fixed it.
> As this test can simulate the process from dirty-logging enabled to dirty-logging
> stopped of a VM with block mappings, so it can also reproduce this TLB 
> conflict
> abort due to inadequate TLB invalidation when coalescing tables.
>
> Signed-off-by: Yanan Wang 

Thanks for sending this! Happy to see more tests for weird TLB
flushing edge cases and races.

Just out of curiosity, were you unable to replicate the bug with the
dirty_log_perf_test and setting the wr_fract option?
With "KVM: selftests: Disable dirty logging with vCPUs running"
(https://lkml.org/lkml/2021/2/2/1431), the dirty_log_perf_test has
most of the same features as this one.
Please correct me if I'm wrong, but it seems like the major difference
here is a more careful pattern of which pages are dirtied when.

Within Google we have a system for pre-specifying sets of arguments to
e.g. the dirty_log_perf_test. I wonder if something similar, even as
simple as a script that just runs dirty_log_perf_test several times
would be helpful for cases where different arguments are needed for
the test to cover different specific cases. Even with this test, for
example, I assume the test doesn't work very well with just 1 vCPU,
but it's still a good default in the test, so having some kind of
configuration (lite) file would be useful.

> ---
>  tools/testing/selftests/kvm/Makefile  |   3 +
>  .../selftests/kvm/kvm_page_table_test.c   | 518 ++
>  2 files changed, 521 insertions(+)
>  create mode 100644 tools/testing/selftests/kvm/kvm_page_table_test.c
>
> diff --git a/tools/testing/selftests/kvm/Makefile 
> b/tools/testing/selftests/kvm/Makefile
> index fe41c6a0fa67..697318019bd4 100644
> --- a/tools/testing/selftests/kvm/Makefile
> +++ b/tools/testing/selftests/kvm/Makefile
> @@ -62,6 +62,7 @@ TEST_GEN_PROGS_x86_64 += x86_64/tsc_msrs_test
>  TEST_GEN_PROGS_x86_64 += demand_paging_test
>  TEST_GEN_PROGS_x86_64 += dirty_log_test
>  TEST_GEN_PROGS_x86_64 += dirty_log_perf_test
> +TEST_GEN_PROGS_x86_64 += kvm_page_table_test
>  TEST_GEN_PROGS_x86_64 += kvm_create_max_vcpus
>  TEST_GEN_PROGS_x86_64 += set_memory_region_test
>  TEST_GEN_PROGS_x86_64 += steal_time
> @@ -71,6 +72,7 @@ TEST_GEN_PROGS_aarch64 += aarch64/get-reg-list-sve
>  TEST_GEN_PROGS_aarch64 += demand_paging_test
>  TEST_GEN_PROGS_aarch64 += dirty_log_test
>  TEST_GEN_PROGS_aarch64 += dirty_log_perf_test
> +TEST_GEN_PROGS_aarch64 += kvm_page_table_test
>  TEST_GEN_PROGS_aarch64 += kvm_create_max_vcpus
>  TEST_GEN_PROGS_aarch64 += set_memory_region_test
>  TEST_GEN_PROGS_aarch64 += steal_time
> @@ -80,6 +82,7 @@ TEST_GEN_PROGS_s390x += s390x/resets
>  TEST_GEN_PROGS_s390x += s390x/sync_regs_test
>  TEST_GEN_PROGS_s390x += demand_paging_test
>  TEST_GEN_PROGS_s390x += dirty_log_test
> +TEST_GEN_PROGS_s390x += kvm_page_table_test
>  TEST_GEN_PROGS_s390x += kvm_create_max_vcpus
>  TEST_GEN_PROGS_s390x += set_memory_region_test
>
> diff --git a/tools/testing/selftests/kvm/kvm_page_table_test.c 
> b/tools/testing/selftests/kvm/kvm_page_table_test.c
> new file mode 100644
> index ..b09c05288937
> --- /dev/null
> +++ b/tools/testing/selftests/kvm/kvm_page_table_test.c

Re: [RFC PATCH 1/2] KVM: selftests: Add a macro to get string of vm_mem_backing_src_type

2021-02-08 Thread Ben Gardon
On Mon, Feb 8, 2021 at 1:08 AM Yanan Wang  wrote:
>
> Add a macro to get string of the backing source memory type, so that
> application can add choices for source types in the help() function,
> and users can specify which type to use for testing.

Coincidentally, I sent out a change last week to do the same thing:
"KVM: selftests: Add backing src parameter to dirty_log_perf_test"
(https://lkml.org/lkml/2021/2/2/1430)
Whichever way this ends up being implemented, I'm happy to see others
interested in testing different backing source types too.

>
> Signed-off-by: Yanan Wang 
> ---
>  tools/testing/selftests/kvm/include/kvm_util.h | 3 +++
>  tools/testing/selftests/kvm/lib/kvm_util.c | 8 
>  2 files changed, 11 insertions(+)
>
> diff --git a/tools/testing/selftests/kvm/include/kvm_util.h 
> b/tools/testing/selftests/kvm/include/kvm_util.h
> index 5cbb861525ed..f5fc29dc9ee6 100644
> --- a/tools/testing/selftests/kvm/include/kvm_util.h
> +++ b/tools/testing/selftests/kvm/include/kvm_util.h
> @@ -69,7 +69,9 @@ enum vm_guest_mode {
>  #define PTES_PER_MIN_PAGE  ptes_per_page(MIN_PAGE_SIZE)
>
>  #define vm_guest_mode_string(m) vm_guest_mode_string[m]
> +#define vm_mem_backing_src_type_string(s) vm_mem_backing_src_type_string[s]
>  extern const char * const vm_guest_mode_string[];
> +extern const char * const vm_mem_backing_src_type_string[];
>
>  struct vm_guest_mode_params {
> unsigned int pa_bits;
> @@ -83,6 +85,7 @@ enum vm_mem_backing_src_type {
> VM_MEM_SRC_ANONYMOUS,
> VM_MEM_SRC_ANONYMOUS_THP,
> VM_MEM_SRC_ANONYMOUS_HUGETLB,
> +   NUM_VM_BACKING_SRC_TYPES,
>  };
>
>  int kvm_check_cap(long cap);
> diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c 
> b/tools/testing/selftests/kvm/lib/kvm_util.c
> index fa5a90e6c6f0..a9b651c7f866 100644
> --- a/tools/testing/selftests/kvm/lib/kvm_util.c
> +++ b/tools/testing/selftests/kvm/lib/kvm_util.c
> @@ -165,6 +165,14 @@ const struct vm_guest_mode_params vm_guest_mode_params[] 
> = {
>  _Static_assert(sizeof(vm_guest_mode_params)/sizeof(struct 
> vm_guest_mode_params) == NUM_VM_MODES,
>"Missing new mode params?");
>
> +const char * const vm_mem_backing_src_type_string[] = {
> +   "VM_MEM_SRC_ANONYMOUS",
> +   "VM_MEM_SRC_ANONYMOUS_THP",
> +   "VM_MEM_SRC_ANONYMOUS_HUGETLB",
> +};
> +_Static_assert(sizeof(vm_mem_backing_src_type_string)/sizeof(char *) == 
> NUM_VM_BACKING_SRC_TYPES,
> +  "Missing new source type strings?");
> +
>  /*
>   * VM Create
>   *
> --
> 2.23.0
>
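
For what it's worth, a consumer of the string table in a test's help()
output might look roughly like the sketch below (the function name is
illustrative, not code from either series):

static void print_backing_src_types(void)
{
	int i;

	/* List every backing source type the harness knows about. */
	printf("Supported backing source types:\n");
	for (i = 0; i < NUM_VM_BACKING_SRC_TYPES; i++)
		printf("    %d: %s\n", i, vm_mem_backing_src_type_string(i));
}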


Re: [PATCH] KVM: VMX: Optimize flushing the PML buffer

2021-02-04 Thread Ben Gardon
On Thu, Feb 4, 2021 at 2:51 PM Peter Xu  wrote:
>
> Hi, Ben,
>
> On Thu, Feb 04, 2021 at 02:19:59PM -0800, Ben Gardon wrote:
> > The average time for each run demonstrated a strange bimodal distribution,
> > with clusters around 2 seconds and 2.5 seconds. This may have been a
> > result of vCPU migration between NUMA nodes.
>
> Have you thought about using numactl or similar technique to verify your idea
> (force both vcpu threads binding, and memory allocations)?
>
> From the numbers it already shows improvements indeed, but just curious since
> you raised this up. :)

Frustratingly, the test machines I have don't have numactl installed,
but I've been meaning to add CPU pinning to the selftests perf tests
anyway, so maybe this is a good reason to do it.
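
The kind of pinning I have in mind is nothing fancier than the sketch
below (hypothetical helper, not from any posted series), called once per
vCPU thread with a CPU chosen from the local NUMA node:

#define _GNU_SOURCE
#include <pthread.h>
#include <sched.h>

/* Pin the calling (vCPU) thread to one CPU to avoid migration noise. */
static int pin_self_to_cpu(int cpu)
{
	cpu_set_t set;

	CPU_ZERO(&set);
	CPU_SET(cpu, &set);
	return pthread_setaffinity_np(pthread_self(), sizeof(set), &set);
}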

>
> > @@ -5707,13 +5708,18 @@ static void vmx_flush_pml_buffer(struct kvm_vcpu 
> > *vcpu)
> >   else
> >   pml_idx++;
> >
> > + memslots = kvm_vcpu_memslots(vcpu);
> > +
> >   pml_buf = page_address(vmx->pml_pg);
> >   for (; pml_idx < PML_ENTITY_NUM; pml_idx++) {
> > + struct kvm_memory_slot *memslot;
> >   u64 gpa;
> >
> >   gpa = pml_buf[pml_idx];
> >   WARN_ON(gpa & (PAGE_SIZE - 1));
> > - kvm_vcpu_mark_page_dirty(vcpu, gpa >> PAGE_SHIFT);
> > +
> > + memslot = __gfn_to_memslot(memslots, gpa >> PAGE_SHIFT);
> > + mark_page_dirty_in_slot(vcpu->kvm, memslot, gpa >> 
> > PAGE_SHIFT);
>
> Since at it: make "gpa >> PAGE_SHIFT" a temp var too?

That's a good idea, I'll try it.
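
Probably something like the following on top of the diff above (just a
sketch, untested):

	pml_buf = page_address(vmx->pml_pg);
	for (; pml_idx < PML_ENTITY_NUM; pml_idx++) {
		struct kvm_memory_slot *memslot;
		u64 gpa = pml_buf[pml_idx];
		gfn_t gfn = gpa >> PAGE_SHIFT;

		WARN_ON(gpa & (PAGE_SIZE - 1));

		memslot = __gfn_to_memslot(memslots, gfn);
		mark_page_dirty_in_slot(vcpu->kvm, memslot, gfn);
	}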

>
> Thanks,
>
> --
> Peter Xu
>


[PATCH] KVM: VMX: Optimize flushing the PML buffer

2021-02-04 Thread Ben Gardon
vmx_flush_pml_buffer repeatedly calls kvm_vcpu_mark_page_dirty, which
SRCU-dereferences kvm->memslots. In order to give the compiler more
freedom to optimize the function, SRCU-dereference the pointer
kvm->memslots only once.

Reviewed-by: Makarand Sonare 
Signed-off-by: Ben Gardon 

---

Tested by running the dirty_log_perf_test selftest on a dual socket Intel
Skylake machine:
./dirty_log_perf_test -v 4 -b 30G -i 5

The test was run 5 times with and without this patch and the dirty
memory time for iterations 2-5 was averaged across the 5 runs.
Iteration 1 was discarded for this analysis because it is still dominated
by the time spent populating memory.

The average time for each run demonstrated a strange bimodal distribution,
with clusters around 2 seconds and 2.5 seconds. This may have been a
result of vCPU migration between NUMA nodes.

In any case, the get dirty times with this patch averaged 2.07
seconds, a 7% saving over the 2.22 second average without this patch.

While these savings may be partly a result of the patched runs having
one more run clustered around 2 seconds, the patched runs in the higher cluster
were also 7-8% shorter than those in the unpatched case.

Below is the raw data for anyone interested in visualizing the results
with a graph:
Iteration   Baseline        Patched
2           2.038562907     2.045226614
3           2.037363248     2.045033709
4           2.037176331     1.999783966
5           1.999891981     2.007849104
2           2.569526298     2.001252504
3           2.579110209     2.008541897
4           2.585883731     2.005317983
5           2.588692727     2.007100987
2           2.01191437      2.006953735
3           2.012972236     2.04540153
4           1.968836017     2.005035246
5           1.967915154     2.003859551
2           2.037533296     1.991275846
3           2.501480125     2.391886691
4           2.454382587     2.391904789
5           2.461046772     2.398767963
2           2.036991484     2.011331436
3           2.002954418     2.002635687
4           2.053342717     2.006769959
5           2.522539759     2.006470059
Average     2.223405818     2.069119963

 arch/x86/kvm/vmx/vmx.c | 8 +++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index cc60b1fc3ee7..46c54802dfdb 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -5692,6 +5692,7 @@ static void vmx_destroy_pml_buffer(struct vcpu_vmx *vmx)
 static void vmx_flush_pml_buffer(struct kvm_vcpu *vcpu)
 {
struct vcpu_vmx *vmx = to_vmx(vcpu);
+   struct kvm_memslots *memslots;
u64 *pml_buf;
u16 pml_idx;
 
@@ -5707,13 +5708,18 @@ static void vmx_flush_pml_buffer(struct kvm_vcpu *vcpu)
else
pml_idx++;
 
+   memslots = kvm_vcpu_memslots(vcpu);
+
pml_buf = page_address(vmx->pml_pg);
for (; pml_idx < PML_ENTITY_NUM; pml_idx++) {
+   struct kvm_memory_slot *memslot;
u64 gpa;
 
gpa = pml_buf[pml_idx];
WARN_ON(gpa & (PAGE_SIZE - 1));
-   kvm_vcpu_mark_page_dirty(vcpu, gpa >> PAGE_SHIFT);
+
+   memslot = __gfn_to_memslot(memslots, gpa >> PAGE_SHIFT);
+   mark_page_dirty_in_slot(vcpu->kvm, memslot, gpa >> PAGE_SHIFT);
}
 
/* reset PML index */
-- 
2.30.0.365.g02bc693789-goog



Re: [PATCH v2 25/28] KVM: x86/mmu: Allow zapping collapsible SPTEs to use MMU read lock

2021-02-03 Thread Ben Gardon
On Wed, Feb 3, 2021 at 3:34 AM Paolo Bonzini  wrote:
>
> On 02/02/21 19:57, Ben Gardon wrote:
> > @@ -1485,7 +1489,9 @@ void kvm_tdp_mmu_zap_collapsible_sptes(struct kvm 
> > *kvm,
> >   struct kvm_mmu_page *root;
> >   int root_as_id;
> >
> > - for_each_tdp_mmu_root_yield_safe(kvm, root, false) {
> > + read_lock(&kvm->mmu_lock);
> > +
> > + for_each_tdp_mmu_root_yield_safe(kvm, root, true) {
> >   root_as_id = kvm_mmu_page_as_id(root);
> >   if (root_as_id != slot->as_id)
> >   continue;
> > @@ -1493,6 +1499,8 @@ void kvm_tdp_mmu_zap_collapsible_sptes(struct kvm 
> > *kvm,
> >   zap_collapsible_spte_range(kvm, root, slot->base_gfn,
> >  slot->base_gfn + slot->npages);
> >   }
> > +
> > + read_unlock(&kvm->mmu_lock);
> >  }
>
>
> I'd prefer the functions to be consistent about who takes the lock,
> either mmu.c or tdp_mmu.c.  Since everywhere else you're doing it in
> mmu.c, that would be:
>
> diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
> index 0554d9c5c5d4..386ee4b703d9 100644
> --- a/arch/x86/kvm/mmu/mmu.c
> +++ b/arch/x86/kvm/mmu/mmu.c
> @@ -5567,10 +5567,13 @@ void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm,
> write_lock(&kvm->mmu_lock);
> slot_handle_leaf(kvm, (struct kvm_memory_slot *)memslot,
>  kvm_mmu_zap_collapsible_spte, true);
> +   write_unlock(&kvm->mmu_lock);
>
> -   if (kvm->arch.tdp_mmu_enabled)
> +   if (kvm->arch.tdp_mmu_enabled) {
> +   read_lock(&kvm->mmu_lock);
> kvm_tdp_mmu_zap_collapsible_sptes(kvm, memslot);
> -   write_unlock(&kvm->mmu_lock);
> +   read_unlock(&kvm->mmu_lock);
> +   }
>   }
>
>   void kvm_arch_flush_remote_tlbs_memslot(struct kvm *kvm,
>
> and just lockdep_assert_held_read here.

That makes sense to me, I agree keeping it consistent is probably a good idea.
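
With your diff applied, kvm_tdp_mmu_zap_collapsible_sptes() would then
look roughly like this (sketch):

void kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm,
				       struct kvm_memory_slot *slot)
{
	struct kvm_mmu_page *root;
	int root_as_id;

	/* The caller in mmu.c now takes mmu_lock in read mode. */
	lockdep_assert_held_read(&kvm->mmu_lock);

	for_each_tdp_mmu_root_yield_safe(kvm, root, true) {
		root_as_id = kvm_mmu_page_as_id(root);
		if (root_as_id != slot->as_id)
			continue;

		zap_collapsible_spte_range(kvm, root, slot->base_gfn,
					   slot->base_gfn + slot->npages);
	}
}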

>
> > - tdp_mmu_set_spte(kvm, &iter, 0);
> > -
> > - spte_set = true;
>
> Is it correct to remove this assignment?

No, it was not correct to remove it. Thank you for catching that.

>
> Paolo
>


Re: [PATCH v2 24/28] KVM: x86/mmu: Allow zap gfn range to operate under the mmu read lock

2021-02-03 Thread Ben Gardon
On Wed, Feb 3, 2021 at 3:26 AM Paolo Bonzini  wrote:
>
> On 02/02/21 19:57, Ben Gardon wrote:
> > +#ifdef CONFIG_LOCKDEP
> > + if (shared)
> > + lockdep_assert_held_read(&kvm->mmu_lock);
> > + else
> > + lockdep_assert_held_write(&kvm->mmu_lock);
> > +#endif /* CONFIG_LOCKDEP */
>
> Also, there's no need for the #ifdef here.

I agree, I must have misinterpreted some feedback on a previous commit
and gone overboard with it.


> Do we want a helper
> kvm_lockdep_assert_mmu_lock_held(struct kvm *kvm, bool shared)?

There are only two places that try to assert both ways as far as I can
see on a cursory check, but it couldn't hurt.
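
Presumably something along these lines (just a sketch, using the name
you suggested):

static void kvm_lockdep_assert_mmu_lock_held(struct kvm *kvm, bool shared)
{
	if (shared)
		lockdep_assert_held_read(&kvm->mmu_lock);
	else
		lockdep_assert_held_write(&kvm->mmu_lock);
}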

>
> Paolo
>


Re: [PATCH v2 23/28] KVM: x86/mmu: Allow parallel page faults for the TDP MMU

2021-02-03 Thread Ben Gardon
On Wed, Feb 3, 2021 at 4:40 AM Paolo Bonzini  wrote:
>
> On 02/02/21 19:57, Ben Gardon wrote:
> >
> > - write_lock(&vcpu->kvm->mmu_lock);
> > +
> > + if (is_tdp_mmu_root(vcpu->kvm, vcpu->arch.mmu->root_hpa))
> > + read_lock(&vcpu->kvm->mmu_lock);
> > + else
> > + write_lock(&vcpu->kvm->mmu_lock);
> > +
>
> I'd like to make this into two helper functions, but I'm not sure about
> the naming:
>
> - kvm_mmu_read_lock_for_root/kvm_mmu_read_unlock_for_root: not precise
> because it's really write-locked for shadow MMU roots
>
> - kvm_mmu_lock_for_root/kvm_mmu_unlock_for_root: not clear that TDP MMU
> operations will need to operate in shared-lock mode
>
> I prefer the first because at least it's the conservative option, but
> I'm open to other opinions and suggestions.
>
> Paolo
>

Of the above two options, I like the second one, though I'd be happy
with either. I agree the first is more conservative, in that it makes
clear the MMU lock could be shared. It feels a little misleading,
though, to have "read" in the name of the function but then acquire
the write lock, especially since there's code below that which expects
the write lock. I don't know of a good way to abstract this into a
helper without some comments to make it clear what's going on, but
maybe there's a slightly more open-coded compromise:
if (!kvm_mmu_read_lock_for_root(vcpu->kvm, vcpu->arch.mmu->root_hpa))
	write_lock(&vcpu->kvm->mmu_lock);

or

enum kvm_mmu_lock_mode lock_mode =
	get_mmu_lock_mode_for_root(vcpu->kvm, vcpu->arch.mmu->root_hpa);

kvm_mmu_lock_for_mode(lock_mode);

Not sure if either of those is actually clearer, but the latter trends
in the direction the RFC took, having an enum to capture read/write
and whether or not to yield in a lock mode parameter.
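
For concreteness, the first compromise could look something like the
sketch below (names illustrative, not from the series):

static bool kvm_mmu_read_lock_for_root(struct kvm *kvm, hpa_t root_hpa)
{
	/* Shadow MMU roots still need the write lock; caller falls back. */
	if (!is_tdp_mmu_root(kvm, root_hpa))
		return false;

	read_lock(&kvm->mmu_lock);
	return true;
}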


[PATCH v2 08/28] sched: Add cond_resched_rwlock

2021-02-02 Thread Ben Gardon
Safely rescheduling while holding a spin lock is essential for keeping
long running kernel operations running smoothly. Add the facility to
cond_resched rwlocks.

CC: Ingo Molnar 
CC: Will Deacon 
Acked-by: Peter Zijlstra 
Acked-by: Davidlohr Bueso 
Acked-by: Waiman Long 
Acked-by: Paolo Bonzini 
Signed-off-by: Ben Gardon 
---
 include/linux/sched.h | 12 
 kernel/sched/core.c   | 40 
 2 files changed, 52 insertions(+)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 5d1378e5a040..3052d16da3cf 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1883,12 +1883,24 @@ static inline int _cond_resched(void) { return 0; }
 })
 
 extern int __cond_resched_lock(spinlock_t *lock);
+extern int __cond_resched_rwlock_read(rwlock_t *lock);
+extern int __cond_resched_rwlock_write(rwlock_t *lock);
 
 #define cond_resched_lock(lock) ({ \
___might_sleep(__FILE__, __LINE__, PREEMPT_LOCK_OFFSET);\
__cond_resched_lock(lock);  \
 })
 
+#define cond_resched_rwlock_read(lock) ({  \
+   __might_sleep(__FILE__, __LINE__, PREEMPT_LOCK_OFFSET); \
+   __cond_resched_rwlock_read(lock);   \
+})
+
+#define cond_resched_rwlock_write(lock) ({ \
+   __might_sleep(__FILE__, __LINE__, PREEMPT_LOCK_OFFSET); \
+   __cond_resched_rwlock_write(lock);  \
+})
+
 static inline void cond_resched_rcu(void)
 {
 #if defined(CONFIG_DEBUG_ATOMIC_SLEEP) || !defined(CONFIG_PREEMPT_RCU)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index ff74fca39ed2..efed1bf202d1 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -6709,6 +6709,46 @@ int __cond_resched_lock(spinlock_t *lock)
 }
 EXPORT_SYMBOL(__cond_resched_lock);
 
+int __cond_resched_rwlock_read(rwlock_t *lock)
+{
+   int resched = should_resched(PREEMPT_LOCK_OFFSET);
+   int ret = 0;
+
+   lockdep_assert_held_read(lock);
+
+   if (rwlock_needbreak(lock) || resched) {
+   read_unlock(lock);
+   if (resched)
+   preempt_schedule_common();
+   else
+   cpu_relax();
+   ret = 1;
+   read_lock(lock);
+   }
+   return ret;
+}
+EXPORT_SYMBOL(__cond_resched_rwlock_read);
+
+int __cond_resched_rwlock_write(rwlock_t *lock)
+{
+   int resched = should_resched(PREEMPT_LOCK_OFFSET);
+   int ret = 0;
+
+   lockdep_assert_held_write(lock);
+
+   if (rwlock_needbreak(lock) || resched) {
+   write_unlock(lock);
+   if (resched)
+   preempt_schedule_common();
+   else
+   cpu_relax();
+   ret = 1;
+   write_lock(lock);
+   }
+   return ret;
+}
+EXPORT_SYMBOL(__cond_resched_rwlock_write);
+
 /**
  * yield - yield the current processor to other threads.
  *
-- 
2.30.0.365.g02bc693789-goog
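
For context, a typical caller of the new read-side helper would look
something like the sketch below (scan_table() and inspect() are
hypothetical, not part of the patch):

static void scan_table(rwlock_t *lock, struct item *table, int nr)
{
	int i;

	read_lock(lock);
	for (i = 0; i < nr; i++) {
		inspect(&table[i]);
		/*
		 * May briefly drop and re-take the lock if a writer is
		 * waiting or a reschedule is due; table[] must remain
		 * valid across that window.
		 */
		cond_resched_rwlock_read(lock);
	}
	read_unlock(lock);
}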



[PATCH v2 10/28] KVM: x86/mmu: Fix TDP MMU zap collapsible SPTEs

2021-02-02 Thread Ben Gardon
There is a bug in the TDP MMU function that zaps SPTEs which could be
replaced with a larger mapping: the bug prevents the function from
doing anything. Fix this by correctly zapping the last level SPTEs.

Fixes: 14881998566d ("kvm: x86/mmu: Support disabling dirty logging for the tdp 
MMU")
Signed-off-by: Ben Gardon 
---
 arch/x86/kvm/mmu/tdp_mmu.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index c3075fb568eb..e3066d08c1dc 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -1098,8 +1098,8 @@ bool kvm_tdp_mmu_slot_set_dirty(struct kvm *kvm, struct 
kvm_memory_slot *slot)
 }
 
 /*
- * Clear non-leaf entries (and free associated page tables) which could
- * be replaced by large mappings, for GFNs within the slot.
+ * Clear leaf entries which could be replaced by large mappings, for
+ * GFNs within the slot.
  */
 static void zap_collapsible_spte_range(struct kvm *kvm,
   struct kvm_mmu_page *root,
@@ -,7 +,7 @@ static void zap_collapsible_spte_range(struct kvm *kvm,
 
tdp_root_for_each_pte(iter, root, start, end) {
if (!is_shadow_present_pte(iter.old_spte) ||
-   is_last_spte(iter.old_spte, iter.level))
+   !is_last_spte(iter.old_spte, iter.level))
continue;
 
pfn = spte_to_pfn(iter.old_spte);
-- 
2.30.0.365.g02bc693789-goog


