On Wed, Sep 26, 2018 at 07:02:58PM +0200, Vitaly Kuznetsov wrote:
> VP inedx almost always matches VCPU and when it does it's faster to walk
> the sparse set instead of all vcpus.
> 
> Signed-off-by: Vitaly Kuznetsov <vkuzn...@redhat.com>
> ---
>  arch/x86/kvm/hyperv.c | 96 +++++++++++++++++++++++--------------------
>  1 file changed, 52 insertions(+), 44 deletions(-)
> 
> diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c
> index eeb12eacd525..cc0535a078f7 100644
> --- a/arch/x86/kvm/hyperv.c
> +++ b/arch/x86/kvm/hyperv.c
> @@ -1277,32 +1277,37 @@ int kvm_hv_get_msr_common(struct kvm_vcpu *vcpu, u32 
> msr, u64 *pdata, bool host)
>               return kvm_hv_get_msr(vcpu, msr, pdata, host);
>  }
>  
> -static __always_inline int get_sparse_bank_no(u64 valid_bank_mask, int 
> bank_no)
> +static __always_inline bool hv_vcpu_in_sparse_set(struct kvm_vcpu_hv 
> *hv_vcpu,
> +                                               u64 sparse_banks[],
> +                                               u64 valid_bank_mask)
>  {
> -     int i = 0, j;
> +     int bank = hv_vcpu->vp_index / 64, sbank;
>  
> -     if (!(valid_bank_mask & BIT_ULL(bank_no)))
> -             return -1;
> +     if (bank >= 64)
> +             return false;
>  
> -     for (j = 0; j < bank_no; j++)
> -             if (valid_bank_mask & BIT_ULL(j))
> -                     i++;
> +     if (!(valid_bank_mask & BIT_ULL(bank)))
> +             return false;
>  
> -     return i;
> +     /* Sparse bank number equals to the number of set bits before it */
> +     sbank = bitmap_weight((unsigned long *)&valid_bank_mask, bank);
> +
> +     return !!(sparse_banks[sbank] & BIT_ULL(hv_vcpu->vp_index % 64));
>  }
>  
>  static u64 kvm_hv_flush_tlb(struct kvm_vcpu *current_vcpu, u64 ingpa,
>                           u16 rep_cnt, bool ex)
>  {
>       struct kvm *kvm = current_vcpu->kvm;
> -     struct kvm_vcpu_hv *hv_current = &current_vcpu->arch.hyperv;
> +     struct kvm_hv *hv = &kvm->arch.hyperv;
> +     struct kvm_vcpu_hv *hv_vcpu = &current_vcpu->arch.hyperv;
>       struct hv_tlb_flush_ex flush_ex;
>       struct hv_tlb_flush flush;
>       struct kvm_vcpu *vcpu;
>       unsigned long vcpu_bitmap[BITS_TO_LONGS(KVM_MAX_VCPUS)] = {0};
> -     u64 valid_bank_mask = 0;
> +     u64 valid_bank_mask;
>       u64 sparse_banks[64];
> -     int sparse_banks_len, i;
> +     int sparse_banks_len, i, bank, sbank;
>       bool all_cpus;
>  
>       if (!ex) {
> @@ -1312,6 +1317,7 @@ static u64 kvm_hv_flush_tlb(struct kvm_vcpu 
> *current_vcpu, u64 ingpa,
>               trace_kvm_hv_flush_tlb(flush.processor_mask,
>                                      flush.address_space, flush.flags);
>  
> +             valid_bank_mask = BIT_ULL(0);
>               sparse_banks[0] = flush.processor_mask;
>               all_cpus = flush.flags & HV_FLUSH_ALL_PROCESSORS;
>       } else {
> @@ -1344,52 +1350,54 @@ static u64 kvm_hv_flush_tlb(struct kvm_vcpu 
> *current_vcpu, u64 ingpa,
>                       return HV_STATUS_INVALID_HYPERCALL_INPUT;
>       }
>  
> -     cpumask_clear(&hv_current->tlb_lush);
> +     /*
> +      * vcpu->arch.cr3 may not be up-to-date for running vCPUs so we can't
> +      * analyze it here, flush TLB regardless of the specified address space.
> +      */
> +     cpumask_clear(&hv_vcpu->tlb_lush);

Maybe squash this hv_current -> hv_vcpu renaming into patch 3?
(And yes this "lush" is funny, too ;)

>  
>       if (all_cpus) {
>               kvm_make_vcpus_request_mask(kvm,
>                                   KVM_REQ_TLB_FLUSH | KVM_REQUEST_NO_WAKEUP,
> -                                 NULL, &hv_current->tlb_lush);
> +                                 NULL, &hv_vcpu->tlb_lush);
>               goto ret_success;
>       }
>  
> -     kvm_for_each_vcpu(i, vcpu, kvm) {
> -             struct kvm_vcpu_hv *hv = &vcpu->arch.hyperv;
> -             int bank = hv->vp_index / 64, sbank = 0;
> -
> -             /* Banks >64 can't be represented */
> -             if (bank >= 64)
> -                     continue;
> -
> -             /* Non-ex hypercalls can only address first 64 vCPUs */
> -             if (!ex && bank)
> -                     continue;
> -
> -             if (ex) {
> -                     /*
> -                      * Check is the bank of this vCPU is in sparse
> -                      * set and get the sparse bank number.
> -                      */
> -                     sbank = get_sparse_bank_no(valid_bank_mask, bank);
> -
> -                     if (sbank < 0)
> -                             continue;
> +     if (atomic_read(&hv->num_mismatched_vp_indexes)) {
> +             kvm_for_each_vcpu(i, vcpu, kvm) {
> +                     if (hv_vcpu_in_sparse_set(&vcpu->arch.hyperv,
> +                                               sparse_banks,
> +                                               valid_bank_mask))
> +                             __set_bit(i, vcpu_bitmap);
>               }
> +             goto flush_request;
> +     }
>  
> -             if (!(sparse_banks[sbank] & BIT_ULL(hv->vp_index % 64)))
> -                     continue;
> -
> -             /*
> -              * vcpu->arch.cr3 may not be up-to-date for running vCPUs so we
> -              * can't analyze it here, flush TLB regardless of the specified
> -              * address space.
> -              */
> -             __set_bit(i, vcpu_bitmap);
> +     /*
> +      * num_mismatched_vp_indexes is zero so every vcpu has
> +      * vp_index == vcpu_idx.
> +      */
> +     sbank = 0;
> +     for_each_set_bit(bank, (unsigned long *)&valid_bank_mask,
> +                      BITS_PER_LONG) {

s/BITS_PER_LONG/64/

> +             for_each_set_bit(i,
> +                              (unsigned long *)&sparse_banks[sbank],
> +                              BITS_PER_LONG) {

ditto

> +                     u32 vp_index = bank * 64 + i;
> +
> +                     /* A non-existent vCPU was specified */
> +                     if (vp_index >= KVM_MAX_VCPUS)
> +                             return HV_STATUS_INVALID_HYPERCALL_INPUT;
> +
> +                     __set_bit(vp_index, vcpu_bitmap);
> +             }
> +             sbank++;
>       }

I wonder if copying the bank as a whole would make it easier to follow
(and somewhat more efficient):

        sbank = 0;
        for_each_set_bit(bank, (unsigned long *)&valid_bank_mask, 64)
                ((u64)vcpu_bitmap)[bank] = sparse_banks[sbank++];

Also it seems equally efficient but slightly easier to read if
vcpu_bitmap is filled first regardless of ->num_mismatched_vp_indexes,
and then either passed directly to kvm_make_vcpus_request_mask if
num_mismatched_vp_indexes == 0, or converted into the real vcpu mask
using the regular test_bit otherwise.

So eventually it all would look like

        ...
        u64 vp_bitmap[KVM_MAX_VCPUS / 64] = {0};
        DECLARE_BITMAP(vcpu_bitmap, KVM_MAX_VCPUS) = {0};
        unsigned long *vcpu_mask;
        ...
        if (all_cpus) {
                vcpu_mask = NULL;
                goto flush_request;
        }

        sbank = 0;
        for_each_set_bit(bank, (unsigned long *)&valid_bank_mask, 64)
                vp_bitmap[bank] = sparse_banks[sbank++];

        if (likely(!atomic_read(&hv->num_mismatched_vp_indexes)) {
                /* for all vcpus vp_index == vcpu_idx */
                vcpu_mask = vp_bitmap;
                goto flush_request;
        }

        kvm_for_each_vcpu(i, vcpu, kvm)
                if (test_bit(vcpu_to_hv_vcpu(vcpu)->vp_index, vp_bitmap))
                        __set_bit(i, vcpu_bitmap);
        vcpu_mask = vcpu_bitmap;
        ...

>  
> +flush_request:
>       kvm_make_vcpus_request_mask(kvm,
>                                   KVM_REQ_TLB_FLUSH | KVM_REQUEST_NO_WAKEUP,
> -                                 vcpu_bitmap, &hv_current->tlb_lush);
> +                                 vcpu_bitmap, &hv_vcpu->tlb_lush);
>  
>  ret_success:
>       /* We always do full TLB flush, set rep_done = rep_cnt. */

Roman.

Reply via email to