On 2024-04-01 11:29 PM, James Houghton wrote:
> Add kvm_arch_prepare_bitmap_age() for architectures to indicate that
> they support bitmap-based aging in kvm_mmu_notifier_test_clear_young()
> and that they do not need KVM to grab the MMU lock for writing. This
> function allows architectures to do other locking or other preparatory
> work that they need.

There's a lot going on here. I know it's extra work, but I think the
series would be easier to understand and simplify if you introduced the
KVM support for lockless test/clear_young() first, and then introduced
support for the bitmap-based look-around.

Specifically:

 1. Make all test/clear_young() notifiers lockless. i.e. Move the
    mmu_lock into the architecture-specific code (kvm_age_gfn() and
    kvm_test_age_gfn()).

 2. Convert KVM/x86's kvm_{test,}_age_gfn() to be lockless for the TDP
    MMU.

 3. Convert KVM/arm64's kvm_{test,}_age_gfn() to hold the mmu_lock in
    read-mode.

 4. Add bitmap-based look-around support to KVM/x86 and KVM/arm64
    (probably 2-3 patches).

> 
> If an architecture does not implement kvm_arch_prepare_bitmap_age() or
> is unable to do bitmap-based aging at runtime (and marks the bitmap as
> unreliable):
>  1. If a bitmap was provided, we inform the caller that the bitmap is
>     unreliable (MMU_NOTIFIER_YOUNG_BITMAP_UNRELIABLE).
>  2. If a bitmap was not provided, fall back to the old logic.
> 
> Also add logic for architectures to easily use the provided bitmap if
> they are able. The expectation is that the architecture's implementation
> of kvm_test_age_gfn() will use kvm_gfn_record_young(), and
> kvm_age_gfn() will use kvm_gfn_should_age().
> 
> Suggested-by: Yu Zhao <yuz...@google.com>
> Signed-off-by: James Houghton <jthough...@google.com>
> ---
>  include/linux/kvm_host.h | 60 ++++++++++++++++++++++++++
>  virt/kvm/kvm_main.c      | 92 +++++++++++++++++++++++++++++-----------
>  2 files changed, 127 insertions(+), 25 deletions(-)
> 
> diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> index 1800d03a06a9..5862fd7b5f9b 100644
> --- a/include/linux/kvm_host.h
> +++ b/include/linux/kvm_host.h
> @@ -1992,6 +1992,26 @@ extern const struct _kvm_stats_desc 
> kvm_vm_stats_desc[];
>  extern const struct kvm_stats_header kvm_vcpu_stats_header;
>  extern const struct _kvm_stats_desc kvm_vcpu_stats_desc[];
>  
> +/*
> + * Architectures that support using bitmaps for kvm_age_gfn() and
> + * kvm_test_age_gfn should return true for kvm_arch_prepare_bitmap_age()
> + * and do any work they need to prepare. The subsequent walk will not
> + * automatically grab the KVM MMU lock, so some architectures may opt
> + * to grab it.
> + *
> + * If true is returned, a subsequent call to kvm_arch_finish_bitmap_age() is
> + * guaranteed.
> + */
> +#ifndef kvm_arch_prepare_bitmap_age
> +static inline bool kvm_arch_prepare_bitmap_age(struct mmu_notifier *mn)

I find the name of these architecture callbacks misleading/confusing.
The lockless path is used even when a bitmap is not provided. i.e.
bitmap can be NULL in between kvm_arch_prepare/finish_bitmap_age().

> +{
> +     return false;
> +}
> +#endif
> +#ifndef kvm_arch_finish_bitmap_age
> +static inline void kvm_arch_finish_bitmap_age(struct mmu_notifier *mn) {}
> +#endif

kvm_arch_finish_bitmap_age() seems unnecessary. I think the KVM/arm64
code could acquire/release the mmu_lock in read-mode in
kvm_test_age_gfn() and kvm_age_gfn(), right?

> +
>  #ifdef CONFIG_KVM_GENERIC_MMU_NOTIFIER
>  static inline struct kvm *mmu_notifier_to_kvm(struct mmu_notifier *mn)
>  {
> @@ -2076,9 +2096,16 @@ static inline bool 
> mmu_invalidate_retry_gfn_unsafe(struct kvm *kvm,
>       return READ_ONCE(kvm->mmu_invalidate_seq) != mmu_seq;
>  }
>  
> +struct test_clear_young_metadata {
> +     unsigned long *bitmap;
> +     unsigned long bitmap_offset_end;

bitmap_offset_end is unused.

> +     unsigned long end;
> +     bool unreliable;
> +};
>  union kvm_mmu_notifier_arg {
>       pte_t pte;
>       unsigned long attributes;
> +     struct test_clear_young_metadata *metadata;

nit: Maybe s/metadata/test_clear_young/ ?

>  };
>  
>  struct kvm_gfn_range {
> @@ -2087,11 +2114,44 @@ struct kvm_gfn_range {
>       gfn_t end;
>       union kvm_mmu_notifier_arg arg;
>       bool may_block;
> +     bool lockless;

Please document this as it's somewhat subtle. A reader might think this
implies the entire operation runs without taking the mmu_lock.

>  };
>  bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range);
>  bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range);
>  bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range);
>  bool kvm_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range);
> +
> +static inline void kvm_age_set_unreliable(struct kvm_gfn_range *range)
> +{
> +     struct test_clear_young_metadata *args = range->arg.metadata;
> +
> +     args->unreliable = true;
> +}
> +static inline unsigned long kvm_young_bitmap_offset(struct kvm_gfn_range 
> *range,
> +                                                 gfn_t gfn)
> +{
> +     struct test_clear_young_metadata *args = range->arg.metadata;
> +
> +     return hva_to_gfn_memslot(args->end - 1, range->slot) - gfn;
> +}
> +static inline void kvm_gfn_record_young(struct kvm_gfn_range *range, gfn_t 
> gfn)
> +{
> +     struct test_clear_young_metadata *args = range->arg.metadata;
> +
> +     WARN_ON_ONCE(gfn < range->start || gfn >= range->end);
> +     if (args->bitmap)
> +             __set_bit(kvm_young_bitmap_offset(range, gfn), args->bitmap);
> +}
> +static inline bool kvm_gfn_should_age(struct kvm_gfn_range *range, gfn_t gfn)
> +{
> +     struct test_clear_young_metadata *args = range->arg.metadata;
> +
> +     WARN_ON_ONCE(gfn < range->start || gfn >= range->end);
> +     if (args->bitmap)
> +             return test_bit(kvm_young_bitmap_offset(range, gfn),
> +                             args->bitmap);
> +     return true;
> +}
>  #endif
>  
>  #ifdef CONFIG_HAVE_KVM_IRQ_ROUTING
> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> index d0545d88c802..7d80321e2ece 100644
> --- a/virt/kvm/kvm_main.c
> +++ b/virt/kvm/kvm_main.c
> @@ -550,6 +550,7 @@ struct kvm_mmu_notifier_range {
>       on_lock_fn_t on_lock;
>       bool flush_on_ret;
>       bool may_block;
> +     bool lockless;
>  };
>  
>  /*
> @@ -598,6 +599,8 @@ static __always_inline kvm_mn_ret_t 
> __kvm_handle_hva_range(struct kvm *kvm,
>       struct kvm_memslots *slots;
>       int i, idx;
>  
> +     BUILD_BUG_ON(sizeof(gfn_range.arg) != sizeof(gfn_range.arg.pte));
> +
>       if (WARN_ON_ONCE(range->end <= range->start))
>               return r;
>  
> @@ -637,15 +640,18 @@ static __always_inline kvm_mn_ret_t 
> __kvm_handle_hva_range(struct kvm *kvm,
>                       gfn_range.start = hva_to_gfn_memslot(hva_start, slot);
>                       gfn_range.end = hva_to_gfn_memslot(hva_end + PAGE_SIZE 
> - 1, slot);
>                       gfn_range.slot = slot;
> +                     gfn_range.lockless = range->lockless;
>  
>                       if (!r.found_memslot) {
>                               r.found_memslot = true;
> -                             KVM_MMU_LOCK(kvm);
> -                             if (!IS_KVM_NULL_FN(range->on_lock))
> -                                     range->on_lock(kvm);
> -
> -                             if (IS_KVM_NULL_FN(range->handler))
> -                                     break;
> +                             if (!range->lockless) {
> +                                     KVM_MMU_LOCK(kvm);
> +                                     if (!IS_KVM_NULL_FN(range->on_lock))
> +                                             range->on_lock(kvm);
> +
> +                                     if (IS_KVM_NULL_FN(range->handler))
> +                                             break;
> +                             }
>                       }
>                       r.ret |= range->handler(kvm, &gfn_range);
>               }
> @@ -654,7 +660,7 @@ static __always_inline kvm_mn_ret_t 
> __kvm_handle_hva_range(struct kvm *kvm,
>       if (range->flush_on_ret && r.ret)
>               kvm_flush_remote_tlbs(kvm);
>  
> -     if (r.found_memslot)
> +     if (r.found_memslot && !range->lockless)
>               KVM_MMU_UNLOCK(kvm);
>  
>       srcu_read_unlock(&kvm->srcu, idx);
> @@ -682,19 +688,24 @@ static __always_inline int kvm_handle_hva_range(struct 
> mmu_notifier *mn,
>       return __kvm_handle_hva_range(kvm, &range).ret;
>  }
>  
> -static __always_inline int kvm_handle_hva_range_no_flush(struct mmu_notifier 
> *mn,
> -                                                      unsigned long start,
> -                                                      unsigned long end,
> -                                                      gfn_handler_t handler)
> +static __always_inline int kvm_handle_hva_range_no_flush(
> +             struct mmu_notifier *mn,
> +             unsigned long start,
> +             unsigned long end,
> +             gfn_handler_t handler,
> +             union kvm_mmu_notifier_arg arg,
> +             bool lockless)
>  {
>       struct kvm *kvm = mmu_notifier_to_kvm(mn);
>       const struct kvm_mmu_notifier_range range = {
>               .start          = start,
>               .end            = end,
>               .handler        = handler,
> +             .arg            = arg,
>               .on_lock        = (void *)kvm_null_fn,
>               .flush_on_ret   = false,
>               .may_block      = false,
> +             .lockless       = lockless,
>       };
>  
>       return __kvm_handle_hva_range(kvm, &range).ret;
> @@ -909,15 +920,36 @@ static int kvm_mmu_notifier_clear_flush_young(struct 
> mmu_notifier *mn,
>                                   kvm_age_gfn);
>  }
>  
> -static int kvm_mmu_notifier_clear_young(struct mmu_notifier *mn,
> -                                     struct mm_struct *mm,
> -                                     unsigned long start,
> -                                     unsigned long end,
> -                                     unsigned long *bitmap)
> +static int kvm_mmu_notifier_test_clear_young(struct mmu_notifier *mn,
> +                                          struct mm_struct *mm,
> +                                          unsigned long start,
> +                                          unsigned long end,
> +                                          unsigned long *bitmap,
> +                                          bool clear)

Perhaps pass in the callback (kvm_test_age_gfn/kvm_age_gfn) instead of
true/false to avoid the naked booleans at the callsites?

>  {
> -     trace_kvm_age_hva(start, end);
> +     if (kvm_arch_prepare_bitmap_age(mn)) {
> +             struct test_clear_young_metadata args = {
> +                     .bitmap         = bitmap,
> +                     .end            = end,
> +                     .unreliable     = false,
> +             };
> +             union kvm_mmu_notifier_arg arg = {
> +                     .metadata = &args
> +             };
> +             bool young;
> +
> +             young = kvm_handle_hva_range_no_flush(
> +                                     mn, start, end,
> +                                     clear ? kvm_age_gfn : kvm_test_age_gfn,
> +                                     arg, true);

I suspect the end result will be cleaner if we make all architectures
lockless. i.e. Move the mmu_lock acquire/release into the
architecture-specific code.

This could result in more acquire/release calls (one per memslot that
overlaps the provided range) but that should be a single memslot in the
majority of cases I think?

Then unconditionally pass in the metadata structure.

Then you don't need any special casing for the fast path / bitmap path.
The only thing needed is to figure out whether to return
MMU_NOTIFIER_YOUNG vs MMU_NOTIFIER_YOUNG_LOOK_AROUND and that can be
plumbed via test_clear_young_metadata or by changing gfn_handler_t to
return an int instead of a bool.

> +
> +             kvm_arch_finish_bitmap_age(mn);
>  
> -     /* We don't support bitmaps. Don't test or clear anything. */
> +             if (!args.unreliable)
> +                     return young ? MMU_NOTIFIER_YOUNG_FAST : 0;
> +     }
> +
> +     /* A bitmap was passed but the architecture doesn't support bitmaps */
>       if (bitmap)
>               return MMU_NOTIFIER_YOUNG_BITMAP_UNRELIABLE;
>  
> @@ -934,7 +966,21 @@ static int kvm_mmu_notifier_clear_young(struct 
> mmu_notifier *mn,
>        * cadence. If we find this inaccurate, we might come up with a
>        * more sophisticated heuristic later.
>        */
> -     return kvm_handle_hva_range_no_flush(mn, start, end, kvm_age_gfn);
> +     return kvm_handle_hva_range_no_flush(
> +                     mn, start, end, clear ? kvm_age_gfn : kvm_test_age_gfn,
> +                     KVM_MMU_NOTIFIER_NO_ARG, false);

Should this return MMU_NOTIFIER_YOUNG explicitly? This code is assuming
MMU_NOTIFIER_YOUNG == (int)true.

> +}
> +
> +static int kvm_mmu_notifier_clear_young(struct mmu_notifier *mn,
> +                                     struct mm_struct *mm,
> +                                     unsigned long start,
> +                                     unsigned long end,
> +                                     unsigned long *bitmap)
> +{
> +     trace_kvm_age_hva(start, end);
> +
> +     return kvm_mmu_notifier_test_clear_young(mn, mm, start, end, bitmap,
> +                                              true);
>  }
>  
>  static int kvm_mmu_notifier_test_young(struct mmu_notifier *mn,
> @@ -945,12 +991,8 @@ static int kvm_mmu_notifier_test_young(struct 
> mmu_notifier *mn,
>  {
>       trace_kvm_test_age_hva(start, end);
>  
> -     /* We don't support bitmaps. Don't test or clear anything. */
> -     if (bitmap)
> -             return MMU_NOTIFIER_YOUNG_BITMAP_UNRELIABLE;
> -
> -     return kvm_handle_hva_range_no_flush(mn, start, end,
> -                                          kvm_test_age_gfn);
> +     return kvm_mmu_notifier_test_clear_young(mn, mm, start, end, bitmap,
> +                                              false);
>  }
>  
>  static void kvm_mmu_notifier_release(struct mmu_notifier *mn,
> -- 
> 2.44.0.478.gd926399ef9-goog
> 

Reply via email to