On 28/04/14 18:39, Paolo Bonzini wrote:
> From: Christian Borntraeger <borntrae...@de.ibm.com>

Given all your work, What about From: Paolo Bonzini <pbonz...@redhat.com>
plus
"Based on an inital patch from Christian Borntraeger"
> 
> When starting lots of dataplane devices the bootup takes very long on
> Christian's s390 with irqfd patches. With larger setups he is even
> able to trigger some timeouts in some components.  Turns out that the
> KVM_SET_GSI_ROUTING ioctl takes very long (strace claims up to 0.1 sec)
> when having multiple CPUs.  This is caused by the  synchronize_rcu and
> the HZ=100 of s390.  By changing the code to use a private srcu we can
> speed things up.  This patch reduces the boot time till mounting root
> from 8 to 2 seconds on my s390 guest with 100 disks.
> 
> Uses of hlist_for_each_entry_rcu, hlist_add_head_rcu, hlist_del_init_rcu
> are fine because they do not have lockdep checks (hlist_for_each_entry_rcu
> uses rcu_dereference_raw rather than rcu_dereference, and write-sides
> do not do rcu lockdep at all).
> 
> Note that we're hardly relying on the "sleepable" part of srcu.  We just
> want SRCU's faster detection of grace periods.
> 
> Testing was done by Andrew Theurer using NETPERF.  The difference between
> results "before" and "after" the patch has mean -0.2% and standard deviation
> 0.6%.  Using a paired t-test on the data points says that there is a 2.5%
> probability that the patch is the cause of the performance difference
> (rather than a random fluctuation).
> 
> Cc: Marcelo Tosatti <mtosa...@redhat.com>
> Cc: Michael S. Tsirkin <m...@redhat.com>
> Signed-off-by: Christian Borntraeger <borntrae...@de.ibm.com>
> Signed-off-by: Paolo Bonzini <pbonz...@redhat.com>

Some questions regarding expedided vs. non expedited and a comment without a 
necessary action.

Otherwise
Reviewed-by: Christian Borntraeger <borntrae...@de.ibm.com>
Tested-by: Christian Borntraeger <borntrae...@de.ibm.com> # on s390

> ---
>  include/linux/kvm_host.h |  1 +
>  virt/kvm/eventfd.c       | 25 +++++++++++++++----------
>  virt/kvm/irq_comm.c      | 17 +++++++++--------
>  virt/kvm/irqchip.c       | 31 ++++++++++++++++---------------
>  virt/kvm/kvm_main.c      | 16 ++++++++++------
>  5 files changed, 51 insertions(+), 39 deletions(-)
> 
> diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> index 820fc2e1d9df..cd0df9a9352d 100644
> --- a/include/linux/kvm_host.h
> +++ b/include/linux/kvm_host.h
> @@ -368,6 +368,7 @@ struct kvm {
>       struct mm_struct *mm; /* userspace tied to this vm */
>       struct kvm_memslots *memslots;
>       struct srcu_struct srcu;
> +     struct srcu_struct irq_srcu;
>  #ifdef CONFIG_KVM_APIC_ARCHITECTURE
>       u32 bsp_vcpu_id;
>  #endif
> diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c
> index 912ec5a95e2c..20c3af7692c5 100644
> --- a/virt/kvm/eventfd.c
> +++ b/virt/kvm/eventfd.c
> @@ -31,6 +31,7 @@
>  #include <linux/list.h>
>  #include <linux/eventfd.h>
>  #include <linux/kernel.h>
> +#include <linux/srcu.h>
>  #include <linux/slab.h>
> 
>  #include "iodev.h"
> @@ -118,19 +119,22 @@ static void
>  irqfd_resampler_ack(struct kvm_irq_ack_notifier *kian)
>  {
>       struct _irqfd_resampler *resampler;
> +     struct kvm *kvm;
>       struct _irqfd *irqfd;
> +     int idx;
> 
>       resampler = container_of(kian, struct _irqfd_resampler, notifier);
> +     kvm = resampler->kvm;
> 
> -     kvm_set_irq(resampler->kvm, KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID,
> +     kvm_set_irq(kvm, KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID,
>                   resampler->notifier.gsi, 0, false);
> 
> -     rcu_read_lock();
> +     idx = srcu_read_lock(&kvm->irq_srcu);
> 
>       list_for_each_entry_rcu(irqfd, &resampler->list, resampler_link)
>               eventfd_signal(irqfd->resamplefd, 1);
> 
> -     rcu_read_unlock();
> +     srcu_read_unlock(&kvm->irq_srcu, idx);
>  }
> 
>  static void
> @@ -142,7 +146,7 @@ irqfd_resampler_shutdown(struct _irqfd *irqfd)
>       mutex_lock(&kvm->irqfds.resampler_lock);
> 
>       list_del_rcu(&irqfd->resampler_link);
> -     synchronize_rcu();
> +     synchronize_srcu(&kvm->irq_srcu);
> 
>       if (list_empty(&resampler->list)) {
>               list_del(&resampler->link);
> @@ -221,17 +225,18 @@ irqfd_wakeup(wait_queue_t *wait, unsigned mode, int 
> sync, void *key)
>       unsigned long flags = (unsigned long)key;
>       struct kvm_kernel_irq_routing_entry *irq;
>       struct kvm *kvm = irqfd->kvm;
> +     int idx;
> 
>       if (flags & POLLIN) {
> -             rcu_read_lock();
> -             irq = rcu_dereference(irqfd->irq_entry);
> +             idx = srcu_read_lock(&kvm->irq_srcu);
> +             irq = srcu_dereference(irqfd->irq_entry, &kvm->irq_srcu);
>               /* An event has been signaled, inject an interrupt */
>               if (irq)
>                       kvm_set_msi(irq, kvm, KVM_USERSPACE_IRQ_SOURCE_ID, 1,
>                                       false);
>               else
>                       schedule_work(&irqfd->inject);
> -             rcu_read_unlock();
> +             srcu_read_unlock(&kvm->irq_srcu, idx);
>       }
> 
>       if (flags & POLLHUP) {
> @@ -363,7 +368,7 @@ kvm_irqfd_assign(struct kvm *kvm, struct kvm_irqfd *args)
>               }
> 
>               list_add_rcu(&irqfd->resampler_link, &irqfd->resampler->list);
> -             synchronize_rcu();
> +             synchronize_srcu(&kvm->irq_srcu);

No idea what resampler is, can this become time critical as well - iow do we 
need expedited here?


> 
>               mutex_unlock(&kvm->irqfds.resampler_lock);
>       }
> @@ -465,7 +470,7 @@ kvm_irqfd_deassign(struct kvm *kvm, struct kvm_irqfd 
> *args)
>                        * another thread calls kvm_irq_routing_update before
>                        * we flush workqueue below (we synchronize with
>                        * kvm_irq_routing_update using irqfds.lock).
> -                      * It is paired with synchronize_rcu done by caller
> +                      * It is paired with synchronize_srcu done by caller
>                        * of that function.
>                        */
>                       rcu_assign_pointer(irqfd->irq_entry, NULL);
> @@ -524,7 +529,7 @@ kvm_irqfd_release(struct kvm *kvm)
> 
>  /*
>   * Change irq_routing and irqfd.
> - * Caller must invoke synchronize_rcu afterwards.
> + * Caller must invoke synchronize_srcu(&kvm->irq_srcu) afterwards.
>   */
>  void kvm_irq_routing_update(struct kvm *kvm,
>                           struct kvm_irq_routing_table *irq_rt)
> diff --git a/virt/kvm/irq_comm.c b/virt/kvm/irq_comm.c
> index e2e6b4473a96..ced4a542a031 100644
> --- a/virt/kvm/irq_comm.c
> +++ b/virt/kvm/irq_comm.c
> @@ -163,6 +163,7 @@ int kvm_set_irq_inatomic(struct kvm *kvm, int 
> irq_source_id, u32 irq, int level)
>       struct kvm_kernel_irq_routing_entry *e;
>       int ret = -EINVAL;
>       struct kvm_irq_routing_table *irq_rt;
> +     int idx;
> 
>       trace_kvm_set_irq(irq, level, irq_source_id);
> 
> @@ -174,8 +175,8 @@ int kvm_set_irq_inatomic(struct kvm *kvm, int 
> irq_source_id, u32 irq, int level)
>        * Since there's no easy way to do this, we only support injecting MSI
>        * which is limited to 1:1 GSI mapping.
>        */
> -     rcu_read_lock();
> -     irq_rt = rcu_dereference(kvm->irq_routing);
> +     idx = srcu_read_lock(&kvm->irq_srcu);
> +     irq_rt = srcu_dereference(kvm->irq_routing, &kvm->irq_srcu);
>       if (irq < irq_rt->nr_rt_entries)
>               hlist_for_each_entry(e, &irq_rt->map[irq], link) {
>                       if (likely(e->type == KVM_IRQ_ROUTING_MSI))
> @@ -184,7 +185,7 @@ int kvm_set_irq_inatomic(struct kvm *kvm, int 
> irq_source_id, u32 irq, int level)
>                               ret = -EWOULDBLOCK;
>                       break;
>               }
> -     rcu_read_unlock();
> +     srcu_read_unlock(&kvm->irq_srcu, idx);
>       return ret;
>  }
> 
> @@ -253,22 +254,22 @@ void kvm_unregister_irq_mask_notifier(struct kvm *kvm, 
> int irq,
>       mutex_lock(&kvm->irq_lock);
>       hlist_del_rcu(&kimn->link);
>       mutex_unlock(&kvm->irq_lock);
> -     synchronize_rcu();
> +     synchronize_srcu(&kvm->irq_srcu);
>  }
> 
>  void kvm_fire_mask_notifiers(struct kvm *kvm, unsigned irqchip, unsigned pin,
>                            bool mask)
>  {
>       struct kvm_irq_mask_notifier *kimn;
> -     int gsi;
> +     int idx, gsi;
> 
> -     rcu_read_lock();
> -     gsi = rcu_dereference(kvm->irq_routing)->chip[irqchip][pin];
> +     idx = srcu_read_lock(&kvm->irq_srcu);
> +     gsi = srcu_dereference(kvm->irq_routing, 
> &kvm->irq_srcu)->chip[irqchip][pin];
>       if (gsi != -1)
>               hlist_for_each_entry_rcu(kimn, &kvm->mask_notifier_list, link)
>                       if (kimn->irq == gsi)
>                               kimn->func(kimn, mask);
> -     rcu_read_unlock();
> +     srcu_read_unlock(&kvm->irq_srcu, idx);
>  }
> 
>  int kvm_set_routing_entry(struct kvm_irq_routing_table *rt,
> diff --git a/virt/kvm/irqchip.c b/virt/kvm/irqchip.c
> index 20dc9e4a8f6c..58bf5ba1aab1 100644
> --- a/virt/kvm/irqchip.c
> +++ b/virt/kvm/irqchip.c
> @@ -26,6 +26,7 @@
> 
>  #include <linux/kvm_host.h>
>  #include <linux/slab.h>
> +#include <linux/srcu.h>
>  #include <linux/export.h>
>  #include <trace/events/kvm.h>
>  #include "irq.h"
> @@ -33,19 +34,19 @@
>  bool kvm_irq_has_notifier(struct kvm *kvm, unsigned irqchip, unsigned pin)
>  {
>       struct kvm_irq_ack_notifier *kian;
> -     int gsi;
> +     int gsi, idx;
> 
> -     rcu_read_lock();
> -     gsi = rcu_dereference(kvm->irq_routing)->chip[irqchip][pin];
> +     idx = srcu_read_lock(&kvm->irq_srcu);
> +     gsi = srcu_dereference(kvm->irq_routing, 
> &kvm->irq_srcu)->chip[irqchip][pin];
>       if (gsi != -1)
>               hlist_for_each_entry_rcu(kian, &kvm->irq_ack_notifier_list,
>                                        link)
>                       if (kian->gsi == gsi) {
> -                             rcu_read_unlock();
> +                             srcu_read_unlock(&kvm->irq_srcu, idx);
>                               return true;
>                       }
> 
> -     rcu_read_unlock();
> +     srcu_read_unlock(&kvm->irq_srcu, idx);
> 
>       return false;
>  }
> @@ -54,18 +55,18 @@ EXPORT_SYMBOL_GPL(kvm_irq_has_notifier);
>  void kvm_notify_acked_irq(struct kvm *kvm, unsigned irqchip, unsigned pin)
>  {
>       struct kvm_irq_ack_notifier *kian;
> -     int gsi;
> +     int gsi, idx;
> 
>       trace_kvm_ack_irq(irqchip, pin);
> 
> -     rcu_read_lock();
> -     gsi = rcu_dereference(kvm->irq_routing)->chip[irqchip][pin];
> +     idx = srcu_read_lock(&kvm->irq_srcu);
> +     gsi = srcu_dereference(kvm->irq_routing, 
> &kvm->irq_srcu)->chip[irqchip][pin];
>       if (gsi != -1)
>               hlist_for_each_entry_rcu(kian, &kvm->irq_ack_notifier_list,
>                                        link)
>                       if (kian->gsi == gsi)
>                               kian->irq_acked(kian);
> -     rcu_read_unlock();
> +     srcu_read_unlock(&kvm->irq_srcu, idx);
>  }
> 
>  void kvm_register_irq_ack_notifier(struct kvm *kvm,
> @@ -85,7 +86,7 @@ void kvm_unregister_irq_ack_notifier(struct kvm *kvm,
>       mutex_lock(&kvm->irq_lock);
>       hlist_del_init_rcu(&kian->link);
>       mutex_unlock(&kvm->irq_lock);
> -     synchronize_rcu();
> +     synchronize_srcu_expedited(&kvm->irq_srcu);

Hmm, looks like all callers are slow path (shutdown, deregister assigned dev). 
Couldnt 
we use the non expedited variant?


>  #ifdef __KVM_HAVE_IOAPIC
>       kvm_vcpu_request_scan_ioapic(kvm);
>  #endif
> @@ -115,7 +116,7 @@ int kvm_set_irq(struct kvm *kvm, int irq_source_id, u32 
> irq, int level,
>               bool line_status)
>  {
>       struct kvm_kernel_irq_routing_entry *e, irq_set[KVM_NR_IRQCHIPS];
> -     int ret = -1, i = 0;
> +     int ret = -1, i = 0, idx;
>       struct kvm_irq_routing_table *irq_rt;
> 
>       trace_kvm_set_irq(irq, level, irq_source_id);
> @@ -124,12 +125,12 @@ int kvm_set_irq(struct kvm *kvm, int irq_source_id, u32 
> irq, int level,
>        * IOAPIC.  So set the bit in both. The guest will ignore
>        * writes to the unused one.
>        */
> -     rcu_read_lock();
> -     irq_rt = rcu_dereference(kvm->irq_routing);
> +     idx = srcu_read_lock(&kvm->irq_srcu);
> +     irq_rt = srcu_dereference(kvm->irq_routing, &kvm->irq_srcu);
>       if (irq < irq_rt->nr_rt_entries)
>               hlist_for_each_entry(e, &irq_rt->map[irq], link)
>                       irq_set[i++] = *e;
> -     rcu_read_unlock();
> +     srcu_read_unlock(&kvm->irq_srcu, idx);
> 
>       while(i--) {
>               int r;
> @@ -226,7 +227,7 @@ int kvm_set_irq_routing(struct kvm *kvm,
>       kvm_irq_routing_update(kvm, new);
>       mutex_unlock(&kvm->irq_lock);
> 
> -     synchronize_rcu();
> +     synchronize_srcu_expedited(&kvm->irq_srcu);
> 
>       new = old;
>       r = 0;
> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> index fa70c6e642b4..95b4c2b3906a 100644
> --- a/virt/kvm/kvm_main.c
> +++ b/virt/kvm/kvm_main.c
> @@ -457,11 +457,11 @@ static struct kvm *kvm_create_vm(unsigned long type)
> 
>       r = kvm_arch_init_vm(kvm, type);
>       if (r)
> -             goto out_err_nodisable;
> +             goto out_err_no_disable;
> 
>       r = hardware_enable_all();
>       if (r)
> -             goto out_err_nodisable;
> +             goto out_err_no_disable;
> 
>  #ifdef CONFIG_HAVE_KVM_IRQCHIP
>       INIT_HLIST_HEAD(&kvm->mask_notifier_list);
> @@ -473,10 +473,12 @@ static struct kvm *kvm_create_vm(unsigned long type)
>       r = -ENOMEM;
>       kvm->memslots = kzalloc(sizeof(struct kvm_memslots), GFP_KERNEL);
>       if (!kvm->memslots)
> -             goto out_err_nosrcu;
> +             goto out_err_no_srcu;
>       kvm_init_memslots_id(kvm);
>       if (init_srcu_struct(&kvm->srcu))
> -             goto out_err_nosrcu;
> +             goto out_err_no_srcu;
> +     if (init_srcu_struct(&kvm->irq_srcu))
> +             goto out_err_no_irq_srcu;
>       for (i = 0; i < KVM_NR_BUSES; i++) {
>               kvm->buses[i] = kzalloc(sizeof(struct kvm_io_bus),
>                                       GFP_KERNEL);
> @@ -505,10 +507,12 @@ static struct kvm *kvm_create_vm(unsigned long type)
>       return kvm;
> 
>  out_err:
> +     cleanup_srcu_struct(&kvm->irq_srcu);
> +out_err_no_irq_srcu:
>       cleanup_srcu_struct(&kvm->srcu);
> -out_err_nosrcu:
> +out_err_no_srcu:
>       hardware_disable_all();
> -out_err_nodisable:
> +out_err_no_disable:


the patch would be smaller without this change, but it makes the naming more 
consistent, so ok.


>       for (i = 0; i < KVM_NR_BUSES; i++)
>               kfree(kvm->buses[i]);
>       kfree(kvm->memslots);
> 

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Reply via email to