Re: [PATCHv2] KVM: optimize apic interrupt delivery

2012-09-11 Thread Avi Kivity
On 09/11/2012 04:02 PM, Gleb Natapov wrote:
> Most interrupt are delivered to only one vcpu. Use pre-build tables to
> find interrupt destination instead of looping through all vcpus. In case
> of logical mode loop only through vcpus in a logical cluster irq is sent
> to.
> 
>* fix rcu issues pointed to by MST. All but one. Still use
>  call_rcu(). Do not think this is serious issue. If it is should be
>  solved by RCU subsystem.

Agree.

Patch looks good but some minor comments follow.

>  struct kvm_arch {
>   unsigned int n_used_mmu_pages;
>   unsigned int n_requested_mmu_pages;
> @@ -528,6 +536,8 @@ struct kvm_arch {
>   struct kvm_ioapic *vioapic;
>   struct kvm_pit *vpit;
>   int vapics_in_nmi_mode;
> + struct kvm_apic_map *apic_map;
> + struct mutex apic_map_lock;

Reversing the order will make it clearer what the lock protects.

>  
> +static void kvm_apic_get_logical_id(u32 ldr, bool flat, u8 ldr_bits,
> + u16 *cid, u16 *lid)
> +{
> + if (ldr_bits == 32) {
> + *cid = ldr >> 16;
> + *lid = ldr & 0xffff;
> + } else {
> + ldr = GET_APIC_LOGICAL_ID(ldr);
> +
> + if (flat) {
> + *cid = 0;
> + *lid = ldr;
> + } else {
> + *cid = ldr >> 4;
> + *lid = ldr & 0xf;
> + }
> + }
> +}

You could precalculate lid_shift/lid_mask/cid_shift/cid_mask and have
just one version here.  In fact you could drop the function.

> +
> +static inline void recalculate_apic_map(struct kvm *kvm)
> +{
> + struct kvm_apic_map *new, *old = NULL;
> + struct kvm_vcpu *vcpu;
> + int i;
> +
> + new = kzalloc(sizeof(struct kvm_apic_map), GFP_KERNEL);
> +
> + mutex_lock(&kvm->arch.apic_map_lock);
> +
> + if (!new)
> + goto out;
> +
> + new->ldr_bits = 8;
> + new->flat = true;
> + kvm_for_each_vcpu(i, vcpu, kvm) {
> + u16 cid, lid;
> + struct kvm_lapic *apic = vcpu->arch.apic;
> +
> + if (!kvm_apic_present(vcpu))
> + continue;
> +
> + if (apic_x2apic_mode(apic)) {
> + new->ldr_bits = 32;
> + new->flat = false;
> + } else if (kvm_apic_sw_enabled(apic) && new->flat &&
> + kvm_apic_get_reg(apic, APIC_DFR) == 
> APIC_DFR_CLUSTER)
> + new->flat = false;

While a vcpu is being hotplugged in it will be in flat mode.  The code
correctly gives precedence to x2apic and cluster modes over flat mode,
so it is correct in that respect, but the comment describing this is too
short.

> +
> + new->phys_map[kvm_apic_id(apic)] = apic;
> + kvm_apic_get_logical_id(kvm_apic_get_reg(apic, APIC_LDR),
> + new->flat, new->ldr_bits, &cid, &lid);
> +
> + if (lid)
> + new->logical_map[cid][ffs(lid) - 1] = apic;
> + }
> +out:
> + old = kvm->arch.apic_map;

rcu_dereference(), just for kicks.

> + rcu_assign_pointer(kvm->arch.apic_map, new);
> + mutex_unlock(&kvm->arch.apic_map_lock);
> +
> + if (old)
> + kfree_rcu(old, rcu);

Nice, removes the need for rcu_barrier().

> +}
> +
>  
> +bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src,
> + struct kvm_lapic_irq *irq, int *r)
> +{
> + struct kvm_apic_map *map;
> + unsigned long bitmap = 1;
> + struct kvm_lapic **dst;
> + int i;
> +
> + *r = -1;
> +
> + if (irq->shorthand == APIC_DEST_SELF) {
> + *r = kvm_apic_set_irq(src->vcpu, irq);
> + return true;
> + }
> +
> + if (irq->shorthand)
> + return false;
> +
> + rcu_read_lock();
> + map = rcu_dereference(kvm->arch.apic_map);
> +
> + if (!map) {
> + rcu_read_unlock();
> + return false;
> + }
> +
> + if (irq->dest_mode == 0) { /* physical mode */
> + if (irq->delivery_mode == APIC_DM_LOWEST ||
> + irq->dest_id == 0xff) {
> + rcu_read_unlock();
> + return false;
> + }

Two error paths with rcu_read_unlock().  Cleaner to have a bool ret =
false; in the beginning and 'goto out_unlock' here, IMO.


> + dst = &map->phys_map[irq->dest_id & 0xff];
> + } else {
> + u16 cid, lid;
> + u32 mda = irq->dest_id;
> +
> + if (map->ldr_bits == 8)
> + mda <<= 24;

mda <<= 32 - map->ldr_bits;

> +
> + kvm_apic_get_logical_id(mda, map->flat, map->ldr_bits,
> + &cid, &lid);
> + dst = map->logical_map[cid];
> +
> + bitmap = lid;
> + if (irq->delivery_mode == APIC_DM_LOWEST &&
> + hweight_long(bitmap) > 1) {
> + int l = -1;
> +

Re: [PATCHv2] KVM: optimize apic interrupt delivery

2012-09-11 Thread Michael S. Tsirkin
On Tue, Sep 11, 2012 at 04:26:17PM +0300, Avi Kivity wrote:
> On 09/11/2012 04:02 PM, Gleb Natapov wrote:
> > Most interrupt are delivered to only one vcpu. Use pre-build tables to
> > find interrupt destination instead of looping through all vcpus. In case
> > of logical mode loop only through vcpus in a logical cluster irq is sent
> > to.
> > 
> >* fix rcu issues pointed to by MST. All but one. Still use
> >  call_rcu(). Do not think this is serious issue. If it is should be
> >  solved by RCU subsystem.
> 
> Agree.
> 
> Patch looks good but some minor comments follow.
> 
> >  struct kvm_arch {
> > unsigned int n_used_mmu_pages;
> > unsigned int n_requested_mmu_pages;
> > @@ -528,6 +536,8 @@ struct kvm_arch {
> > struct kvm_ioapic *vioapic;
> > struct kvm_pit *vpit;
> > int vapics_in_nmi_mode;
> > +   struct kvm_apic_map *apic_map;
> > +   struct mutex apic_map_lock;
> 
> Reversing the order will make it clearer what the lock protects.
> 
> >  
> > +static void kvm_apic_get_logical_id(u32 ldr, bool flat, u8 ldr_bits,
> > +   u16 *cid, u16 *lid)
> > +{
> > +   if (ldr_bits == 32) {
> > +   *cid = ldr >> 16;
> > +   *lid = ldr & 0xffff;
> > +   } else {
> > +   ldr = GET_APIC_LOGICAL_ID(ldr);
> > +
> > +   if (flat) {
> > +   *cid = 0;
> > +   *lid = ldr;
> > +   } else {
> > +   *cid = ldr >> 4;
> > +   *lid = ldr & 0xf;
> > +   }
> > +   }
> > +}
> 
> You could precalculate lid_shift/lid_mask/cid_shift/cid_mask and have
> just one version here.  In fact you could drop the function.
> 
> > +
> > +static inline void recalculate_apic_map(struct kvm *kvm)
> > +{
> > +   struct kvm_apic_map *new, *old = NULL;
> > +   struct kvm_vcpu *vcpu;
> > +   int i;
> > +
> > +   new = kzalloc(sizeof(struct kvm_apic_map), GFP_KERNEL);
> > +
> > +   mutex_lock(&kvm->arch.apic_map_lock);
> > +
> > +   if (!new)
> > +   goto out;
> > +
> > +   new->ldr_bits = 8;
> > +   new->flat = true;
> > +   kvm_for_each_vcpu(i, vcpu, kvm) {
> > +   u16 cid, lid;
> > +   struct kvm_lapic *apic = vcpu->arch.apic;
> > +
> > +   if (!kvm_apic_present(vcpu))
> > +   continue;
> > +
> > +   if (apic_x2apic_mode(apic)) {
> > +   new->ldr_bits = 32;
> > +   new->flat = false;
> > +   } else if (kvm_apic_sw_enabled(apic) && new->flat &&
> > +   kvm_apic_get_reg(apic, APIC_DFR) == 
> > APIC_DFR_CLUSTER)
> > +   new->flat = false;
> 
> While a vcpu is being hotplugged in it will be in flat mode.  The code
> correctly gives precedence to x2apic and cluster modes over flat mode,
> so it is correct in that respect, but the comment describing this is too
> short.
> 
> > +
> > +   new->phys_map[kvm_apic_id(apic)] = apic;
> > +   kvm_apic_get_logical_id(kvm_apic_get_reg(apic, APIC_LDR),
> > +   new->flat, new->ldr_bits, &cid, &lid);
> > +
> > +   if (lid)
> > +   new->logical_map[cid][ffs(lid) - 1] = apic;
> > +   }
> > +out:
> > +   old = kvm->arch.apic_map;
> 
> rcu_dereference(), just for kicks.

rcu_dereference will give warnings with lockdep.
Should be rcu_dereference_protected.

> > +   rcu_assign_pointer(kvm->arch.apic_map, new);
> > +   mutex_unlock(&kvm->arch.apic_map_lock);
> > +
> > +   if (old)
> > +   kfree_rcu(old, rcu);
> 
> Nice, removes the need for rcu_barrier().
> 
> > +}
> > +
> >  
> > +bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src,
> > +   struct kvm_lapic_irq *irq, int *r)
> > +{
> > +   struct kvm_apic_map *map;
> > +   unsigned long bitmap = 1;
> > +   struct kvm_lapic **dst;
> > +   int i;
> > +
> > +   *r = -1;
> > +
> > +   if (irq->shorthand == APIC_DEST_SELF) {
> > +   *r = kvm_apic_set_irq(src->vcpu, irq);
> > +   return true;
> > +   }
> > +
> > +   if (irq->shorthand)
> > +   return false;
> > +
> > +   rcu_read_lock();
> > +   map = rcu_dereference(kvm->arch.apic_map);
> > +
> > +   if (!map) {
> > +   rcu_read_unlock();
> > +   return false;
> > +   }
> > +
> > +   if (irq->dest_mode == 0) { /* physical mode */
> > +   if (irq->delivery_mode == APIC_DM_LOWEST ||
> > +   irq->dest_id == 0xff) {
> > +   rcu_read_unlock();
> > +   return false;
> > +   }
> 
> Two error paths with rcu_read_unlock().  Cleaner to have a bool ret =
> false; in the beginning and 'goto out_unlock' here, IMO.

Nod.

> 
> > +   dst = &map->phys_map[irq->dest_id & 0xff];
> > +   } else {
> > +   u16 cid, lid;
> > +   u32 mda = irq->dest_id;
> > +
> > +   if (map->ldr_bits == 8)
> > +   mda <<= 24;
> 
> mda <<= 32 - map->ldr_bits;
> 
> > +
> > +   kvm_apic_get_logical_id(mda, map->flat, map->ld

Re: [PATCHv2] KVM: optimize apic interrupt delivery

2012-09-11 Thread Michael S. Tsirkin
On Tue, Sep 11, 2012 at 04:02:25PM +0300, Gleb Natapov wrote:
> Most interrupt are delivered to only one vcpu. Use pre-build tables to
> find interrupt destination instead of looping through all vcpus. In case
> of logical mode loop only through vcpus in a logical cluster irq is sent
> to.
> 
> Signed-off-by: Gleb Natapov 

Added Paul just to make sure we are using RCU correctly.
Paul could you pls answer the question below?

> ---
>  Changelog:
> 
>   - v1->v2
>* fix race Avi noticed
>* rcu_read_lock() out of the block as per Avi
>* fix rcu issues pointed to by MST. All but one. Still use
>  call_rcu(). Do not think this is serious issue. If it is should be
>  solved by RCU subsystem.
>* Fix phys_map overflow pointed to by MST
>* recalculate_apic_map() does not return error any more.
>* add optimization for low prio logical mode with one cpu as dst (it
>  happens)
> 
> I did not rewrote kvm_irq_delivery_to_apic_fast() MST way since my way
> looks cleaner to me.
> 
> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index 64adb61..3ba8951 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -511,6 +511,14 @@ struct kvm_arch_memory_slot {
>   struct kvm_lpage_info *lpage_info[KVM_NR_PAGE_SIZES - 1];
>  };
>  
> +struct kvm_apic_map {
> + struct rcu_head rcu;
> + bool flat;
> + u8 ldr_bits;
> + struct kvm_lapic *phys_map[256];
> + struct kvm_lapic *logical_map[16][16];
> +};
> +
>  struct kvm_arch {
>   unsigned int n_used_mmu_pages;
>   unsigned int n_requested_mmu_pages;
> @@ -528,6 +536,8 @@ struct kvm_arch {
>   struct kvm_ioapic *vioapic;
>   struct kvm_pit *vpit;
>   int vapics_in_nmi_mode;
> + struct kvm_apic_map *apic_map;
> + struct mutex apic_map_lock;
>  
>   unsigned int tss_addr;
>   struct page *apic_access_page;
> diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
> index 07ad628..06672e8 100644
> --- a/arch/x86/kvm/lapic.c
> +++ b/arch/x86/kvm/lapic.c
> @@ -139,11 +139,92 @@ static inline int apic_enabled(struct kvm_lapic *apic)
>   (LVT_MASK | APIC_MODE_MASK | APIC_INPUT_POLARITY | \
>APIC_LVT_REMOTE_IRR | APIC_LVT_LEVEL_TRIGGER)
>  
> +static inline int apic_x2apic_mode(struct kvm_lapic *apic)
> +{
> + return apic->vcpu->arch.apic_base & X2APIC_ENABLE;
> +}
> +
>  static inline int kvm_apic_id(struct kvm_lapic *apic)
>  {
>   return (kvm_apic_get_reg(apic, APIC_ID) >> 24) & 0xff;
>  }
>  
> +static void kvm_apic_get_logical_id(u32 ldr, bool flat, u8 ldr_bits,
> + u16 *cid, u16 *lid)
> +{
> + if (ldr_bits == 32) {
> + *cid = ldr >> 16;
> + *lid = ldr & 0xffff;
> + } else {
> + ldr = GET_APIC_LOGICAL_ID(ldr);
> +
> + if (flat) {
> + *cid = 0;
> + *lid = ldr;
> + } else {
> + *cid = ldr >> 4;
> + *lid = ldr & 0xf;
> + }
> + }
> +}
> +
> +static inline void recalculate_apic_map(struct kvm *kvm)
> +{
> + struct kvm_apic_map *new, *old = NULL;
> + struct kvm_vcpu *vcpu;
> + int i;
> +
> + new = kzalloc(sizeof(struct kvm_apic_map), GFP_KERNEL);
> +
> + mutex_lock(&kvm->arch.apic_map_lock);
> +
> + if (!new)
> + goto out;
> +
> + new->ldr_bits = 8;
> + new->flat = true;
> + kvm_for_each_vcpu(i, vcpu, kvm) {
> + u16 cid, lid;
> + struct kvm_lapic *apic = vcpu->arch.apic;
> +
> + if (!kvm_apic_present(vcpu))
> + continue;
> +
> + if (apic_x2apic_mode(apic)) {
> + new->ldr_bits = 32;
> + new->flat = false;
> + } else if (kvm_apic_sw_enabled(apic) && new->flat &&
> + kvm_apic_get_reg(apic, APIC_DFR) == 
> APIC_DFR_CLUSTER)
> + new->flat = false;
> +
> + new->phys_map[kvm_apic_id(apic)] = apic;
> + kvm_apic_get_logical_id(kvm_apic_get_reg(apic, APIC_LDR),
> + new->flat, new->ldr_bits, &cid, &lid);
> +
> + if (lid)
> + new->logical_map[cid][ffs(lid) - 1] = apic;
> + }
> +out:
> + old = kvm->arch.apic_map;
> + rcu_assign_pointer(kvm->arch.apic_map, new);
> + mutex_unlock(&kvm->arch.apic_map_lock);
> +
> + if (old)
> + kfree_rcu(old, rcu);
> +}

Paul, I'd like to check something with you here:
this function can be triggered by userspace,
any number of times; we allocate
a 2K chunk of memory that is later freed by
kfree_rcu.

Is there a risk of DOS if RCU is delayed while
lots of memory is queued up in this way?
If yes is this a generic problem with kfree_rcu
that should be addressed in core kernel?

Thanks!

> +
> +static inline void kvm_apic_set_id(struct kvm_lapic *apic, u8 id)
> +{
> + apic_set_reg(apic, APIC_ID,

Re: [PATCHv2] KVM: optimize apic interrupt delivery

2012-09-11 Thread Gleb Natapov
On Tue, Sep 11, 2012 at 04:26:17PM +0300, Avi Kivity wrote:
> On 09/11/2012 04:02 PM, Gleb Natapov wrote:
> > Most interrupt are delivered to only one vcpu. Use pre-build tables to
> > find interrupt destination instead of looping through all vcpus. In case
> > of logical mode loop only through vcpus in a logical cluster irq is sent
> > to.
> > 
> >* fix rcu issues pointed to by MST. All but one. Still use
> >  call_rcu(). Do not think this is serious issue. If it is should be
> >  solved by RCU subsystem.
> 
> Agree.
> 
> Patch looks good but some minor comments follow.
> 
> >  struct kvm_arch {
> > unsigned int n_used_mmu_pages;
> > unsigned int n_requested_mmu_pages;
> > @@ -528,6 +536,8 @@ struct kvm_arch {
> > struct kvm_ioapic *vioapic;
> > struct kvm_pit *vpit;
> > int vapics_in_nmi_mode;
> > +   struct kvm_apic_map *apic_map;
> > +   struct mutex apic_map_lock;
> 
> Reversing the order will make it clearer what the lock protects.
> 
Hmm, OK. I thought names make it clear.

> >  
> > +static void kvm_apic_get_logical_id(u32 ldr, bool flat, u8 ldr_bits,
> > +   u16 *cid, u16 *lid)
> > +{
> > +   if (ldr_bits == 32) {
> > +   *cid = ldr >> 16;
> > +   *lid = ldr & 0xffff;
> > +   } else {
> > +   ldr = GET_APIC_LOGICAL_ID(ldr);
> > +
> > +   if (flat) {
> > +   *cid = 0;
> > +   *lid = ldr;
> > +   } else {
> > +   *cid = ldr >> 4;
> > +   *lid = ldr & 0xf;
> > +   }
> > +   }
> > +}
> 
> You could precalculate lid_shift/lid_mask/cid_shift/cid_mask and have
> just one version here.  In fact you could drop the function.
> 
You mean precalculate them in recalculate_apic_map() and store in kvm_apic_map?
 
> > +
> > +static inline void recalculate_apic_map(struct kvm *kvm)
> > +{
> > +   struct kvm_apic_map *new, *old = NULL;
> > +   struct kvm_vcpu *vcpu;
> > +   int i;
> > +
> > +   new = kzalloc(sizeof(struct kvm_apic_map), GFP_KERNEL);
> > +
> > +   mutex_lock(&kvm->arch.apic_map_lock);
> > +
> > +   if (!new)
> > +   goto out;
> > +
> > +   new->ldr_bits = 8;
> > +   new->flat = true;
> > +   kvm_for_each_vcpu(i, vcpu, kvm) {
> > +   u16 cid, lid;
> > +   struct kvm_lapic *apic = vcpu->arch.apic;
> > +
> > +   if (!kvm_apic_present(vcpu))
> > +   continue;
> > +
> > +   if (apic_x2apic_mode(apic)) {
> > +   new->ldr_bits = 32;
> > +   new->flat = false;
> > +   } else if (kvm_apic_sw_enabled(apic) && new->flat &&
> > +   kvm_apic_get_reg(apic, APIC_DFR) == 
> > APIC_DFR_CLUSTER)
> > +   new->flat = false;
> 
> While a vcpu is being hotplugged in it will be in flat mode.  The code
> correctly gives precedence to x2apic and cluster modes over flat mode,
> so it is correct in that respect, but the comment describing this is too
> short.
> 
Almost non existent.

> > +
> > +   new->phys_map[kvm_apic_id(apic)] = apic;
> > +   kvm_apic_get_logical_id(kvm_apic_get_reg(apic, APIC_LDR),
> > +   new->flat, new->ldr_bits, &cid, &lid);
> > +
> > +   if (lid)
> > +   new->logical_map[cid][ffs(lid) - 1] = apic;
> > +   }
> > +out:
> > +   old = kvm->arch.apic_map;
> 
> rcu_dereference(), just for kicks.
> 
MST says rcu_dereference_protected() but honestly I look at it and
rcu_dereference_check(, 1) and condition they check are so obviously
correct in the code that using them is just a clutter. In more complex
cases, when dereference happens far away from locking, it has its point.
If you insist on it here should we add it too irq routing code too?

> > +   rcu_assign_pointer(kvm->arch.apic_map, new);
> > +   mutex_unlock(&kvm->arch.apic_map_lock);
> > +
> > +   if (old)
> > +   kfree_rcu(old, rcu);
> 
> Nice, removes the need for rcu_barrier().
> 
> > +}
> > +
> >  
> > +bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src,
> > +   struct kvm_lapic_irq *irq, int *r)
> > +{
> > +   struct kvm_apic_map *map;
> > +   unsigned long bitmap = 1;
> > +   struct kvm_lapic **dst;
> > +   int i;
> > +
> > +   *r = -1;
> > +
> > +   if (irq->shorthand == APIC_DEST_SELF) {
> > +   *r = kvm_apic_set_irq(src->vcpu, irq);
> > +   return true;
> > +   }
> > +
> > +   if (irq->shorthand)
> > +   return false;
> > +
> > +   rcu_read_lock();
> > +   map = rcu_dereference(kvm->arch.apic_map);
> > +
> > +   if (!map) {
> > +   rcu_read_unlock();
> > +   return false;
> > +   }
> > +
> > +   if (irq->dest_mode == 0) { /* physical mode */
> > +   if (irq->delivery_mode == APIC_DM_LOWEST ||
> > +   irq->dest_id == 0xff) {
> > +   rcu_read_unlock();
> > +   return false;
> > +   }
> 
> Two error paths with rcu_read_unlock().  Cleaner to have a 

Re: [PATCHv2] KVM: optimize apic interrupt delivery

2012-09-11 Thread Avi Kivity
On 09/11/2012 05:46 PM, Gleb Natapov wrote:
> On Tue, Sep 11, 2012 at 04:26:17PM +0300, Avi Kivity wrote:
>> On 09/11/2012 04:02 PM, Gleb Natapov wrote:
>> > Most interrupt are delivered to only one vcpu. Use pre-build tables to
>> > find interrupt destination instead of looping through all vcpus. In case
>> > of logical mode loop only through vcpus in a logical cluster irq is sent
>> > to.
>> > 
>> >* fix rcu issues pointed to by MST. All but one. Still use
>> >  call_rcu(). Do not think this is serious issue. If it is should be
>> >  solved by RCU subsystem.
>> 
>> Agree.
>> 
>> Patch looks good but some minor comments follow.
>> 
>> >  struct kvm_arch {
>> >unsigned int n_used_mmu_pages;
>> >unsigned int n_requested_mmu_pages;
>> > @@ -528,6 +536,8 @@ struct kvm_arch {
>> >struct kvm_ioapic *vioapic;
>> >struct kvm_pit *vpit;
>> >int vapics_in_nmi_mode;
>> > +  struct kvm_apic_map *apic_map;
>> > +  struct mutex apic_map_lock;
>> 
>> Reversing the order will make it clearer what the lock protects.
>> 
> Hmm, OK. I thought names make it clear.

They do, but it is conventional and good practice to put the lock in front.

> 
>> >  
>> > +static void kvm_apic_get_logical_id(u32 ldr, bool flat, u8 ldr_bits,
>> > +  u16 *cid, u16 *lid)
>> > +{
>> > +  if (ldr_bits == 32) {
>> > +  *cid = ldr >> 16;
>> > +  *lid = ldr & 0xffff;
>> > +  } else {
>> > +  ldr = GET_APIC_LOGICAL_ID(ldr);
>> > +
>> > +  if (flat) {
>> > +  *cid = 0;
>> > +  *lid = ldr;
>> > +  } else {
>> > +  *cid = ldr >> 4;
>> > +  *lid = ldr & 0xf;
>> > +  }
>> > +  }
>> > +}
>> 
>> You could precalculate lid_shift/lid_mask/cid_shift/cid_mask and have
>> just one version here.  In fact you could drop the function.
>> 
> You mean precalculate them in recalculate_apic_map() and store in 
> kvm_apic_map?

Yes.

> 
>> > +
>> > +  new->phys_map[kvm_apic_id(apic)] = apic;
>> > +  kvm_apic_get_logical_id(kvm_apic_get_reg(apic, APIC_LDR),
>> > +  new->flat, new->ldr_bits, &cid, &lid);
>> > +
>> > +  if (lid)
>> > +  new->logical_map[cid][ffs(lid) - 1] = apic;
>> > +  }
>> > +out:
>> > +  old = kvm->arch.apic_map;
>> 
>> rcu_dereference(), just for kicks.
>> 
> MST says rcu_dereference_protected() but honestly I look at it and
> rcu_dereference_check(, 1) and condition they check are so obviously
> correct in the code that using them is just a clutter. 

They say to the reader, or to a static checker "this case was considered
and that is the conclusion we arrived at".  Using the variable directly
says nothing.

> In more complex
> cases, when dereference happens far away from locking it have its point.
> If you insist on it here should we add it too irq routing code too?

Thanks.

>> >  /*
>> > @@ -6319,6 +6320,7 @@ void kvm_arch_destroy_vm(struct kvm *kvm)
>> >put_page(kvm->arch.apic_access_page);
>> >if (kvm->arch.ept_identity_pagetable)
>> >put_page(kvm->arch.ept_identity_pagetable);
>> > +  kfree(kvm->arch.apic_map);
>> 
>> rcu_dereference(), even though it cannot be needed here, to shut down
>> static code checkers.
>> 
> How to run those code checkers? Do they complain about irq routing code?
> Just curious.

No idea.  Julia Lawall sends patches from time to time with this kind of
things, so people do run them.


-- 
error compiling committee.c: too many arguments to function
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCHv2] KVM: optimize apic interrupt delivery

2012-09-11 Thread Paul E. McKenney
On Tue, Sep 11, 2012 at 05:10:23PM +0300, Michael S. Tsirkin wrote:
> On Tue, Sep 11, 2012 at 04:02:25PM +0300, Gleb Natapov wrote:
> > Most interrupt are delivered to only one vcpu. Use pre-build tables to
> > find interrupt destination instead of looping through all vcpus. In case
> > of logical mode loop only through vcpus in a logical cluster irq is sent
> > to.
> > 
> > Signed-off-by: Gleb Natapov 
> 
> Added Paul just to make sure we are using RCU correctly.
> Paul could you pls answer the question below?
> 
> > ---
> >  Changelog:
> > 
> >   - v1->v2
> >* fix race Avi noticed
> >* rcu_read_lock() out of the block as per Avi
> >* fix rcu issues pointed to by MST. All but one. Still use
> >  call_rcu(). Do not think this is serious issue. If it is should be
> >  solved by RCU subsystem.
> >* Fix phys_map overflow pointed to by MST
> >* recalculate_apic_map() does not return error any more.
> >* add optimization for low prio logical mode with one cpu as dst (it
> >  happens)
> > 
> > I did not rewrote kvm_irq_delivery_to_apic_fast() MST way since my way
> > looks cleaner to me.
> > 
> > diff --git a/arch/x86/include/asm/kvm_host.h 
> > b/arch/x86/include/asm/kvm_host.h
> > index 64adb61..3ba8951 100644
> > --- a/arch/x86/include/asm/kvm_host.h
> > +++ b/arch/x86/include/asm/kvm_host.h
> > @@ -511,6 +511,14 @@ struct kvm_arch_memory_slot {
> > struct kvm_lpage_info *lpage_info[KVM_NR_PAGE_SIZES - 1];
> >  };
> >  
> > +struct kvm_apic_map {
> > +   struct rcu_head rcu;
> > +   bool flat;
> > +   u8 ldr_bits;
> > +   struct kvm_lapic *phys_map[256];
> > +   struct kvm_lapic *logical_map[16][16];
> > +};
> > +
> >  struct kvm_arch {
> > unsigned int n_used_mmu_pages;
> > unsigned int n_requested_mmu_pages;
> > @@ -528,6 +536,8 @@ struct kvm_arch {
> > struct kvm_ioapic *vioapic;
> > struct kvm_pit *vpit;
> > int vapics_in_nmi_mode;
> > +   struct kvm_apic_map *apic_map;
> > +   struct mutex apic_map_lock;
> >  
> > unsigned int tss_addr;
> > struct page *apic_access_page;
> > diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
> > index 07ad628..06672e8 100644
> > --- a/arch/x86/kvm/lapic.c
> > +++ b/arch/x86/kvm/lapic.c
> > @@ -139,11 +139,92 @@ static inline int apic_enabled(struct kvm_lapic *apic)
> > (LVT_MASK | APIC_MODE_MASK | APIC_INPUT_POLARITY | \
> >  APIC_LVT_REMOTE_IRR | APIC_LVT_LEVEL_TRIGGER)
> >  
> > +static inline int apic_x2apic_mode(struct kvm_lapic *apic)
> > +{
> > +   return apic->vcpu->arch.apic_base & X2APIC_ENABLE;
> > +}
> > +
> >  static inline int kvm_apic_id(struct kvm_lapic *apic)
> >  {
> > return (kvm_apic_get_reg(apic, APIC_ID) >> 24) & 0xff;
> >  }
> >  
> > +static void kvm_apic_get_logical_id(u32 ldr, bool flat, u8 ldr_bits,
> > +   u16 *cid, u16 *lid)
> > +{
> > +   if (ldr_bits == 32) {
> > +   *cid = ldr >> 16;
> > +   *lid = ldr & 0xffff;
> > +   } else {
> > +   ldr = GET_APIC_LOGICAL_ID(ldr);
> > +
> > +   if (flat) {
> > +   *cid = 0;
> > +   *lid = ldr;
> > +   } else {
> > +   *cid = ldr >> 4;
> > +   *lid = ldr & 0xf;
> > +   }
> > +   }
> > +}
> > +
> > +static inline void recalculate_apic_map(struct kvm *kvm)
> > +{
> > +   struct kvm_apic_map *new, *old = NULL;
> > +   struct kvm_vcpu *vcpu;
> > +   int i;
> > +
> > +   new = kzalloc(sizeof(struct kvm_apic_map), GFP_KERNEL);
> > +
> > +   mutex_lock(&kvm->arch.apic_map_lock);
> > +
> > +   if (!new)
> > +   goto out;
> > +
> > +   new->ldr_bits = 8;
> > +   new->flat = true;
> > +   kvm_for_each_vcpu(i, vcpu, kvm) {
> > +   u16 cid, lid;
> > +   struct kvm_lapic *apic = vcpu->arch.apic;
> > +
> > +   if (!kvm_apic_present(vcpu))
> > +   continue;
> > +
> > +   if (apic_x2apic_mode(apic)) {
> > +   new->ldr_bits = 32;
> > +   new->flat = false;
> > +   } else if (kvm_apic_sw_enabled(apic) && new->flat &&
> > +   kvm_apic_get_reg(apic, APIC_DFR) == 
> > APIC_DFR_CLUSTER)
> > +   new->flat = false;
> > +
> > +   new->phys_map[kvm_apic_id(apic)] = apic;
> > +   kvm_apic_get_logical_id(kvm_apic_get_reg(apic, APIC_LDR),
> > +   new->flat, new->ldr_bits, &cid, &lid);
> > +
> > +   if (lid)
> > +   new->logical_map[cid][ffs(lid) - 1] = apic;
> > +   }
> > +out:
> > +   old = kvm->arch.apic_map;
> > +   rcu_assign_pointer(kvm->arch.apic_map, new);
> > +   mutex_unlock(&kvm->arch.apic_map_lock);
> > +
> > +   if (old)
> > +   kfree_rcu(old, rcu);
> > +}
> 
> Paul, I'd like to check something with you here:
> this function can be triggered by userspace,
> any number of times; we allocate
> a 2K chunk of memory that is later freed by
> kfree_rcu.
> 
> Is there a risk of DOS if RCU is delayed while
> lots of memory

Re: [PATCHv2] KVM: optimize apic interrupt delivery

2012-09-11 Thread Avi Kivity
On 09/11/2012 08:13 PM, Paul E. McKenney wrote:
> > Is there a risk of DOS if RCU is delayed while
> > lots of memory is queued up in this way?
> > If yes is this a generic problem with kfree_rcu
> > that should be addressed in core kernel?
>
> There is indeed a risk.  The kfree_rcu() implementation cannot really
> decide what to do here, especially given that it is callable with irqs
> disabled.
>
> The usual approach is to keep a per-CPU counter and count it down from
> some number for each kfree_rcu().  When it reaches zero, invoke
> synchronize_rcu() as well as kfree_rcu(), and then reset it to the
> "some number" mentioned above.
>
> In theory, I could create an API that did this.  In practice, I have no
> idea how to choose the number -- much depends on the size of the object
> being freed, for example.

Perhaps approach it from the other direction?  If we are under memory
pressure, start synchronize_rcu()ing, much like the shrinker operates.



-- 
I have a truly marvellous patch that fixes the bug which this
signature is too narrow to contain.

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCHv2] KVM: optimize apic interrupt delivery

2012-09-11 Thread Michael S. Tsirkin
On Tue, Sep 11, 2012 at 10:13:00AM -0700, Paul E. McKenney wrote:
> On Tue, Sep 11, 2012 at 05:10:23PM +0300, Michael S. Tsirkin wrote:
> > On Tue, Sep 11, 2012 at 04:02:25PM +0300, Gleb Natapov wrote:
> > > Most interrupt are delivered to only one vcpu. Use pre-build tables to
> > > find interrupt destination instead of looping through all vcpus. In case
> > > of logical mode loop only through vcpus in a logical cluster irq is sent
> > > to.
> > > 
> > > Signed-off-by: Gleb Natapov 
> > 
> > Added Paul just to make sure we are using RCU correctly.
> > Paul could you pls answer the question below?
> > 
> > > ---
> > >  Changelog:
> > > 
> > >   - v1->v2
> > >* fix race Avi noticed
> > >* rcu_read_lock() out of the block as per Avi
> > >* fix rcu issues pointed to by MST. All but one. Still use
> > >  call_rcu(). Do not think this is serious issue. If it is should be
> > >  solved by RCU subsystem.
> > >* Fix phys_map overflow pointed to by MST
> > >* recalculate_apic_map() does not return error any more.
> > >* add optimization for low prio logical mode with one cpu as dst (it
> > >  happens)
> > > 
> > > I did not rewrote kvm_irq_delivery_to_apic_fast() MST way since my way
> > > looks cleaner to me.
> > > 
> > > diff --git a/arch/x86/include/asm/kvm_host.h 
> > > b/arch/x86/include/asm/kvm_host.h
> > > index 64adb61..3ba8951 100644
> > > --- a/arch/x86/include/asm/kvm_host.h
> > > +++ b/arch/x86/include/asm/kvm_host.h
> > > @@ -511,6 +511,14 @@ struct kvm_arch_memory_slot {
> > >   struct kvm_lpage_info *lpage_info[KVM_NR_PAGE_SIZES - 1];
> > >  };
> > >  
> > > +struct kvm_apic_map {
> > > + struct rcu_head rcu;
> > > + bool flat;
> > > + u8 ldr_bits;
> > > + struct kvm_lapic *phys_map[256];
> > > + struct kvm_lapic *logical_map[16][16];
> > > +};
> > > +
> > >  struct kvm_arch {
> > >   unsigned int n_used_mmu_pages;
> > >   unsigned int n_requested_mmu_pages;
> > > @@ -528,6 +536,8 @@ struct kvm_arch {
> > >   struct kvm_ioapic *vioapic;
> > >   struct kvm_pit *vpit;
> > >   int vapics_in_nmi_mode;
> > > + struct kvm_apic_map *apic_map;
> > > + struct mutex apic_map_lock;
> > >  
> > >   unsigned int tss_addr;
> > >   struct page *apic_access_page;
> > > diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
> > > index 07ad628..06672e8 100644
> > > --- a/arch/x86/kvm/lapic.c
> > > +++ b/arch/x86/kvm/lapic.c
> > > @@ -139,11 +139,92 @@ static inline int apic_enabled(struct kvm_lapic 
> > > *apic)
> > >   (LVT_MASK | APIC_MODE_MASK | APIC_INPUT_POLARITY | \
> > >APIC_LVT_REMOTE_IRR | APIC_LVT_LEVEL_TRIGGER)
> > >  
> > > +static inline int apic_x2apic_mode(struct kvm_lapic *apic)
> > > +{
> > > + return apic->vcpu->arch.apic_base & X2APIC_ENABLE;
> > > +}
> > > +
> > >  static inline int kvm_apic_id(struct kvm_lapic *apic)
> > >  {
> > >   return (kvm_apic_get_reg(apic, APIC_ID) >> 24) & 0xff;
> > >  }
> > >  
> > > +static void kvm_apic_get_logical_id(u32 ldr, bool flat, u8 ldr_bits,
> > > + u16 *cid, u16 *lid)
> > > +{
> > > + if (ldr_bits == 32) {
> > > + *cid = ldr >> 16;
> > > + *lid = ldr & 0xffff;
> > > + } else {
> > > + ldr = GET_APIC_LOGICAL_ID(ldr);
> > > +
> > > + if (flat) {
> > > + *cid = 0;
> > > + *lid = ldr;
> > > + } else {
> > > + *cid = ldr >> 4;
> > > + *lid = ldr & 0xf;
> > > + }
> > > + }
> > > +}
> > > +
> > > +static inline void recalculate_apic_map(struct kvm *kvm)
> > > +{
> > > + struct kvm_apic_map *new, *old = NULL;
> > > + struct kvm_vcpu *vcpu;
> > > + int i;
> > > +
> > > + new = kzalloc(sizeof(struct kvm_apic_map), GFP_KERNEL);
> > > +
> > > + mutex_lock(&kvm->arch.apic_map_lock);
> > > +
> > > + if (!new)
> > > + goto out;
> > > +
> > > + new->ldr_bits = 8;
> > > + new->flat = true;
> > > + kvm_for_each_vcpu(i, vcpu, kvm) {
> > > + u16 cid, lid;
> > > + struct kvm_lapic *apic = vcpu->arch.apic;
> > > +
> > > + if (!kvm_apic_present(vcpu))
> > > + continue;
> > > +
> > > + if (apic_x2apic_mode(apic)) {
> > > + new->ldr_bits = 32;
> > > + new->flat = false;
> > > + } else if (kvm_apic_sw_enabled(apic) && new->flat &&
> > > + kvm_apic_get_reg(apic, APIC_DFR) == 
> > > APIC_DFR_CLUSTER)
> > > + new->flat = false;
> > > +
> > > + new->phys_map[kvm_apic_id(apic)] = apic;
> > > + kvm_apic_get_logical_id(kvm_apic_get_reg(apic, APIC_LDR),
> > > + new->flat, new->ldr_bits, &cid, &lid);
> > > +
> > > + if (lid)
> > > + new->logical_map[cid][ffs(lid) - 1] = apic;
> > > + }
> > > +out:
> > > + old = kvm->arch.apic_map;
> > > + rcu_assign_pointer(kvm->arch.apic_map, new);
> > > + mutex_unlock(&kvm->arch.apic_map_lock);
> > > +
> > > + if (old)
> > > + kfree_rcu(old, rcu);
> > > +}
> > 
> > Paul, I'd like to check somet

Re: [PATCHv2] KVM: optimize apic interrupt delivery

2012-09-11 Thread Michael S. Tsirkin
On Tue, Sep 11, 2012 at 11:04:59PM +0300, Avi Kivity wrote:
> On 09/11/2012 08:13 PM, Paul E. McKenney wrote:
> > > Is there a risk of DOS if RCU is delayed while
> > > lots of memory is queued up in this way?
> > > If yes is this a generic problem with kfree_rcu
> > > that should be addressed in core kernel?
> >
> > There is indeed a risk.  The kfree_rcu() implementation cannot really
> > decide what to do here, especially given that it is callable with irqs
> > disabled.
> >
> > The usual approach is to keep a per-CPU counter and count it down from
> > some number for each kfree_rcu().  When it reaches zero, invoke
> > synchronize_rcu() as well as kfree_rcu(), and then reset it to the
> > "some number" mentioned above.
> >
> > In theory, I could create an API that did this.  In practice, I have no
> > idea how to choose the number -- much depends on the size of the object
> > being freed, for example.
> 
> Perhaps approach it from the other direction?  If we are under memory
> pressure, start synchronize_rcu()ing, much like the shrinker operates.
> 

Tricky ...

For now, how about we call synchronize_rcu_expedited in kvm and call it a day?
Also has an advantage that apic map is guaranteed to be in sync
with guest - while it seems that it's already correct as is,
synchronous operation is way simpler.

We can add a tracepoint so that we can detect it if this starts
happening a lot for some guest.

> 
> -- 
> I have a truly marvellous patch that fixes the bug which this
> signature is too narrow to contain.
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCHv2] KVM: optimize apic interrupt delivery

2012-09-11 Thread Paul E. McKenney
On Wed, Sep 12, 2012 at 01:33:37AM +0300, Michael S. Tsirkin wrote:
> On Tue, Sep 11, 2012 at 10:13:00AM -0700, Paul E. McKenney wrote:
> > On Tue, Sep 11, 2012 at 05:10:23PM +0300, Michael S. Tsirkin wrote:
> > > On Tue, Sep 11, 2012 at 04:02:25PM +0300, Gleb Natapov wrote:
> > > > Most interrupt are delivered to only one vcpu. Use pre-build tables to
> > > > find interrupt destination instead of looping through all vcpus. In case
> > > > of logical mode loop only through vcpus in a logical cluster irq is sent
> > > > to.
> > > > 
> > > > Signed-off-by: Gleb Natapov 
> > > 
> > > Added Paul just to make sure we are using RCU correctly.
> > > Paul could you pls answer the question below?
> > > 
> > > > ---
> > > >  Changelog:
> > > > 
> > > >   - v1->v2
> > > >* fix race Avi noticed
> > > >* rcu_read_lock() out of the block as per Avi
> > > >* fix rcu issues pointed to by MST. All but one. Still use
> > > >  call_rcu(). Do not think this is serious issue. If it is should be
> > > >  solved by RCU subsystem.
> > > >* Fix phys_map overflow pointed to by MST
> > > >* recalculate_apic_map() does not return error any more.
> > > >* add optimization for low prio logical mode with one cpu as dst (it
> > > >  happens)
> > > > 
> > > > I did not rewrote kvm_irq_delivery_to_apic_fast() MST way since my way
> > > > looks cleaner to me.
> > > > 
> > > > diff --git a/arch/x86/include/asm/kvm_host.h 
> > > > b/arch/x86/include/asm/kvm_host.h
> > > > index 64adb61..3ba8951 100644
> > > > --- a/arch/x86/include/asm/kvm_host.h
> > > > +++ b/arch/x86/include/asm/kvm_host.h
> > > > @@ -511,6 +511,14 @@ struct kvm_arch_memory_slot {
> > > > struct kvm_lpage_info *lpage_info[KVM_NR_PAGE_SIZES - 1];
> > > >  };
> > > >  
> > > > +struct kvm_apic_map {
> > > > +   struct rcu_head rcu;
> > > > +   bool flat;
> > > > +   u8 ldr_bits;
> > > > +   struct kvm_lapic *phys_map[256];
> > > > +   struct kvm_lapic *logical_map[16][16];
> > > > +};
> > > > +
> > > >  struct kvm_arch {
> > > > unsigned int n_used_mmu_pages;
> > > > unsigned int n_requested_mmu_pages;
> > > > @@ -528,6 +536,8 @@ struct kvm_arch {
> > > > struct kvm_ioapic *vioapic;
> > > > struct kvm_pit *vpit;
> > > > int vapics_in_nmi_mode;
> > > > +   struct kvm_apic_map *apic_map;
> > > > +   struct mutex apic_map_lock;
> > > >  
> > > > unsigned int tss_addr;
> > > > struct page *apic_access_page;
> > > > diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
> > > > index 07ad628..06672e8 100644
> > > > --- a/arch/x86/kvm/lapic.c
> > > > +++ b/arch/x86/kvm/lapic.c
> > > > @@ -139,11 +139,92 @@ static inline int apic_enabled(struct kvm_lapic 
> > > > *apic)
> > > > (LVT_MASK | APIC_MODE_MASK | APIC_INPUT_POLARITY | \
> > > >  APIC_LVT_REMOTE_IRR | APIC_LVT_LEVEL_TRIGGER)
> > > >  
> > > > +static inline int apic_x2apic_mode(struct kvm_lapic *apic)
> > > > +{
> > > > +   return apic->vcpu->arch.apic_base & X2APIC_ENABLE;
> > > > +}
> > > > +
> > > >  static inline int kvm_apic_id(struct kvm_lapic *apic)
> > > >  {
> > > > return (kvm_apic_get_reg(apic, APIC_ID) >> 24) & 0xff;
> > > >  }
> > > >  
> > > > +static void kvm_apic_get_logical_id(u32 ldr, bool flat, u8 ldr_bits,
> > > > +   u16 *cid, u16 *lid)
> > > > +{
> > > > +   if (ldr_bits == 32) {
> > > > +   *cid = ldr >> 16;
> > > > +   *lid = ldr & 0xffff;
> > > > +   } else {
> > > > +   ldr = GET_APIC_LOGICAL_ID(ldr);
> > > > +
> > > > +   if (flat) {
> > > > +   *cid = 0;
> > > > +   *lid = ldr;
> > > > +   } else {
> > > > +   *cid = ldr >> 4;
> > > > +   *lid = ldr & 0xf;
> > > > +   }
> > > > +   }
> > > > +}
> > > > +
> > > > +static inline void recalculate_apic_map(struct kvm *kvm)
> > > > +{
> > > > +   struct kvm_apic_map *new, *old = NULL;
> > > > +   struct kvm_vcpu *vcpu;
> > > > +   int i;
> > > > +
> > > > +   new = kzalloc(sizeof(struct kvm_apic_map), GFP_KERNEL);
> > > > +
> > > > +   mutex_lock(&kvm->arch.apic_map_lock);
> > > > +
> > > > +   if (!new)
> > > > +   goto out;
> > > > +
> > > > +   new->ldr_bits = 8;
> > > > +   new->flat = true;
> > > > +   kvm_for_each_vcpu(i, vcpu, kvm) {
> > > > +   u16 cid, lid;
> > > > +   struct kvm_lapic *apic = vcpu->arch.apic;
> > > > +
> > > > +   if (!kvm_apic_present(vcpu))
> > > > +   continue;
> > > > +
> > > > +   if (apic_x2apic_mode(apic)) {
> > > > +   new->ldr_bits = 32;
> > > > +   new->flat = false;
> > > > +   } else if (kvm_apic_sw_enabled(apic) && new->flat &&
> > > > +   kvm_apic_get_reg(apic, APIC_DFR) == 
> > > > APIC_DFR

Re: [PATCHv2] KVM: optimize apic interrupt delivery

2012-09-12 Thread Avi Kivity
On 09/12/2012 01:39 AM, Michael S. Tsirkin wrote:
> On Tue, Sep 11, 2012 at 11:04:59PM +0300, Avi Kivity wrote:
>> On 09/11/2012 08:13 PM, Paul E. McKenney wrote:
>> > > Is there a risk of DOS if RCU is delayed while
>> > > lots of memory is queued up in this way?
>> > > If yes is this a generic problem with kfree_rcu
>> > > that should be addressed in core kernel?
>> >
>> > There is indeed a risk.  The kfree_rcu() implementation cannot really
>> > decide what to do here, especially given that it is callable with irqs
>> > disabled.
>> >
>> > The usual approach is to keep a per-CPU counter and count it down from
>> > some number for each kfree_rcu().  When it reaches zero, invoke
>> > synchronize_rcu() as well as kfree_rcu(), and then reset it to the
>> > "some number" mentioned above.
>> >
>> > In theory, I could create an API that did this.  In practice, I have no
>> > idea how to choose the number -- much depends on the size of the object
>> > being freed, for example.
>> 
>> Perhaps approach it from the other direction?  If we are under memory
>> pressure, start synchronize_rcu()ing, much like the shrinker operates.
>> 
> 
> Tricky ...
> 
> For now, how about we call synchronize_rcu_expedited in kvm and call it a day?

I prefer to let the rcu people fix it.

> Also has an advantage that apic map is guaranteed to be in sync
> with guest - while it seems that it's already correct as is,
> synchronous operation is way simpler.

It works exactly the same way.  Interrupts started in parallel with an
ID update will use either map.  Interrupts started afterwards will use
the new map.

> 
> We can add a tracepoint so that we can detect it if this starts
> happening a lot for some guest.
>

No point, guests don't update their APIC ID (or related) after booting.

-- 
error compiling committee.c: too many arguments to function
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCHv2] KVM: optimize apic interrupt delivery

2012-09-12 Thread Avi Kivity
On 09/12/2012 04:03 AM, Paul E. McKenney wrote:
>> > > Paul, I'd like to check something with you here:
>> > > this function can be triggered by userspace,
>> > > any number of times; we allocate
>> > > a 2K chunk of memory that is later freed by
>> > > kfree_rcu.
>> > > 
>> > > Is there a risk of DOS if RCU is delayed while
>> > > lots of memory is queued up in this way?
>> > > If yes is this a generic problem with kfree_rcu
>> > > that should be addressed in core kernel?
>> > 
>> > There is indeed a risk.
>> 
>> In our case it's a 2K object. Is it a practical risk?
> 
> How many kfree_rcu()s per second can a given user cause to happen?

Not much more than a few hundred thousand per second per process (normal
operation is zero).

> 
>> > The kfree_rcu() implementation cannot really
>> > decide what to do here, especially given that it is callable with irqs
>> > disabled.
>> > 
>> > The usual approach is to keep a per-CPU counter and count it down from
>> > some number for each kfree_rcu().  When it reaches zero, invoke
>> > synchronize_rcu() as well as kfree_rcu(), and then reset it to the
>> > "some number" mentioned above.
>> 
>> It is a bit of a concern for me that this will hurt worst-case latency
>> for realtime guests.  In our case, we return error and this will
>> fall back on not allocating memory and using slow all-CPU scan.
>> One possible scheme that relies on this is:
>>  - increment an atomic counter, per vcpu. If above threshold ->
>>  return with error
>>  - call_rcu (+ barrier vcpu destruct)
>>  - within callback decrement an atomic counter
> 
> That certainly is a possibility, but...
> 
>> > In theory, I could create an API that did this.  In practice, I have no
>> > idea how to choose the number -- much depends on the size of the object
>> > being freed, for example.
>> 
>> We could pass an object size, no problem :)
> 
> ... before putting too much additional effort into possible solutions,
> why not force the problem to occur and see what actually happens?  We
> would then be in a much better position to work out what should be done.

Good idea.  Michael, it should be easy to modify kvm-unit-tests to write
to the APIC ID register in a loop.

-- 
error compiling committee.c: too many arguments to function
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCHv2] KVM: optimize apic interrupt delivery

2012-09-12 Thread Gleb Natapov
On Tue, Sep 11, 2012 at 10:13:00AM -0700, Paul E. McKenney wrote:
> On Tue, Sep 11, 2012 at 05:10:23PM +0300, Michael S. Tsirkin wrote:
> > On Tue, Sep 11, 2012 at 04:02:25PM +0300, Gleb Natapov wrote:
> > > Most interrupt are delivered to only one vcpu. Use pre-build tables to
> > > find interrupt destination instead of looping through all vcpus. In case
> > > of logical mode loop only through vcpus in a logical cluster irq is sent
> > > to.
> > > 
> > > Signed-off-by: Gleb Natapov 
> > 
> > Added Paul just to make sure we are using RCU correctly.
> > Paul could you pls answer the question below?
> > 
> > > ---
> > >  Changelog:
> > > 
> > >   - v1->v2
> > >* fix race Avi noticed
> > >* rcu_read_lock() out of the block as per Avi
> > >* fix rcu issues pointed to by MST. All but one. Still use
> > >  call_rcu(). Do not think this is serious issue. If it is should be
> > >  solved by RCU subsystem.
> > >* Fix phys_map overflow pointed to by MST
> > >* recalculate_apic_map() does not return error any more.
> > >* add optimization for low prio logical mode with one cpu as dst (it
> > >  happens)
> > > 
> > > I did not rewrote kvm_irq_delivery_to_apic_fast() MST way since my way
> > > looks cleaner to me.
> > > 
> > > diff --git a/arch/x86/include/asm/kvm_host.h 
> > > b/arch/x86/include/asm/kvm_host.h
> > > index 64adb61..3ba8951 100644
> > > --- a/arch/x86/include/asm/kvm_host.h
> > > +++ b/arch/x86/include/asm/kvm_host.h
> > > @@ -511,6 +511,14 @@ struct kvm_arch_memory_slot {
> > >   struct kvm_lpage_info *lpage_info[KVM_NR_PAGE_SIZES - 1];
> > >  };
> > >  
> > > +struct kvm_apic_map {
> > > + struct rcu_head rcu;
> > > + bool flat;
> > > + u8 ldr_bits;
> > > + struct kvm_lapic *phys_map[256];
> > > + struct kvm_lapic *logical_map[16][16];
> > > +};
> > > +
> > >  struct kvm_arch {
> > >   unsigned int n_used_mmu_pages;
> > >   unsigned int n_requested_mmu_pages;
> > > @@ -528,6 +536,8 @@ struct kvm_arch {
> > >   struct kvm_ioapic *vioapic;
> > >   struct kvm_pit *vpit;
> > >   int vapics_in_nmi_mode;
> > > + struct kvm_apic_map *apic_map;
> > > + struct mutex apic_map_lock;
> > >  
> > >   unsigned int tss_addr;
> > >   struct page *apic_access_page;
> > > diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
> > > index 07ad628..06672e8 100644
> > > --- a/arch/x86/kvm/lapic.c
> > > +++ b/arch/x86/kvm/lapic.c
> > > @@ -139,11 +139,92 @@ static inline int apic_enabled(struct kvm_lapic 
> > > *apic)
> > >   (LVT_MASK | APIC_MODE_MASK | APIC_INPUT_POLARITY | \
> > >APIC_LVT_REMOTE_IRR | APIC_LVT_LEVEL_TRIGGER)
> > >  
> > > +static inline int apic_x2apic_mode(struct kvm_lapic *apic)
> > > +{
> > > + return apic->vcpu->arch.apic_base & X2APIC_ENABLE;
> > > +}
> > > +
> > >  static inline int kvm_apic_id(struct kvm_lapic *apic)
> > >  {
> > >   return (kvm_apic_get_reg(apic, APIC_ID) >> 24) & 0xff;
> > >  }
> > >  
> > > +static void kvm_apic_get_logical_id(u32 ldr, bool flat, u8 ldr_bits,
> > > + u16 *cid, u16 *lid)
> > > +{
> > > + if (ldr_bits == 32) {
> > > + *cid = ldr >> 16;
> > > + *lid = ldr & 0xffff;
> > > + } else {
> > > + ldr = GET_APIC_LOGICAL_ID(ldr);
> > > +
> > > + if (flat) {
> > > + *cid = 0;
> > > + *lid = ldr;
> > > + } else {
> > > + *cid = ldr >> 4;
> > > + *lid = ldr & 0xf;
> > > + }
> > > + }
> > > +}
> > > +
> > > +static inline void recalculate_apic_map(struct kvm *kvm)
> > > +{
> > > + struct kvm_apic_map *new, *old = NULL;
> > > + struct kvm_vcpu *vcpu;
> > > + int i;
> > > +
> > > + new = kzalloc(sizeof(struct kvm_apic_map), GFP_KERNEL);
> > > +
> > > + mutex_lock(&kvm->arch.apic_map_lock);
> > > +
> > > + if (!new)
> > > + goto out;
> > > +
> > > + new->ldr_bits = 8;
> > > + new->flat = true;
> > > + kvm_for_each_vcpu(i, vcpu, kvm) {
> > > + u16 cid, lid;
> > > + struct kvm_lapic *apic = vcpu->arch.apic;
> > > +
> > > + if (!kvm_apic_present(vcpu))
> > > + continue;
> > > +
> > > + if (apic_x2apic_mode(apic)) {
> > > + new->ldr_bits = 32;
> > > + new->flat = false;
> > > + } else if (kvm_apic_sw_enabled(apic) && new->flat &&
> > > + kvm_apic_get_reg(apic, APIC_DFR) == 
> > > APIC_DFR_CLUSTER)
> > > + new->flat = false;
> > > +
> > > + new->phys_map[kvm_apic_id(apic)] = apic;
> > > + kvm_apic_get_logical_id(kvm_apic_get_reg(apic, APIC_LDR),
> > > + new->flat, new->ldr_bits, &cid, &lid);
> > > +
> > > + if (lid)
> > > + new->logical_map[cid][ffs(lid) - 1] = apic;
> > > + }
> > > +out:
> > > + old = kvm->arch.apic_map;
> > > + rcu_assign_pointer(kvm->arch.apic_map, new);
> > > + mutex_unlock(&kvm->arch.apic_map_lock);
> > > +
> > > + if (old)
> > > + kfree_rcu(old, rcu);
> > > +}
> > 
> > Paul, I'd like to check somet

Re: [PATCHv2] KVM: optimize apic interrupt delivery

2012-09-12 Thread Gleb Natapov
On Wed, Sep 12, 2012 at 10:45:22AM +0300, Avi Kivity wrote:
> On 09/12/2012 04:03 AM, Paul E. McKenney wrote:
> >> > > Paul, I'd like to check something with you here:
> >> > > this function can be triggered by userspace,
> >> > > any number of times; we allocate
> >> > > a 2K chunk of memory that is later freed by
> >> > > kfree_rcu.
> >> > > 
> >> > > Is there a risk of DOS if RCU is delayed while
> >> > > lots of memory is queued up in this way?
> >> > > If yes is this a generic problem with kfree_rcu
> >> > > that should be addressed in core kernel?
> >> > 
> >> > There is indeed a risk.
> >> 
> >> In our case it's a 2K object. Is it a practical risk?
> > 
> > How many kfree_rcu()s per second can a given user cause to happen?
> 
> Not much more than a few hundred thousand per second per process (normal
> operation is zero).
> 
I managed to do 21466 per second.

> > 
> >> > The kfree_rcu() implementation cannot really
> >> > decide what to do here, especially given that it is callable with irqs
> >> > disabled.
> >> > 
> >> > The usual approach is to keep a per-CPU counter and count it down from
> >> > some number for each kfree_rcu().  When it reaches zero, invoke
> >> > synchronize_rcu() as well as kfree_rcu(), and then reset it to the
> >> > "some number" mentioned above.
> >> 
> >> It is a bit of a concern for me that this will hurt worst-case latency
> >> for realtime guests.  In our case, we return error and this will
> >> fall back on not allocating memory and using slow all-CPU scan.
> >> One possible scheme that relies on this is:
> >>- increment an atomic counter, per vcpu. If above threshold ->
> >>return with error
> >>- call_rcu (+ barrier vcpu destruct)
> >>- within callback decrement an atomic counter
> > 
> > That certainly is a possibility, but...
> > 
> >> > In theory, I could create an API that did this.  In practice, I have no
> >> > idea how to choose the number -- much depends on the size of the object
> >> > being freed, for example.
> >> 
> >> We could pass an object size, no problem :)
> > 
> > ... before putting too much additional effort into possible solutions,
> > why not force the problem to occur and see what actually happens?  We
> > would then be in a much better position to work out what should be done.
> 
> Good idea.  Michael, is should be easy to modify kvm-unit-tests to write
> to the APIC ID register in a loop.
> 
I did. Memory consumption does not grow on otherwise idle host.

--
Gleb.
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCHv2] KVM: optimize apic interrupt delivery

2012-09-12 Thread Gleb Natapov
On Wed, Sep 12, 2012 at 03:36:57PM +0300, Avi Kivity wrote:
> On 09/12/2012 03:34 PM, Gleb Natapov wrote:
> > On Wed, Sep 12, 2012 at 10:45:22AM +0300, Avi Kivity wrote:
> >> On 09/12/2012 04:03 AM, Paul E. McKenney wrote:
> >> >> > > Paul, I'd like to check something with you here:
> >> >> > > this function can be triggered by userspace,
> >> >> > > any number of times; we allocate
> >> >> > > a 2K chunk of memory that is later freed by
> >> >> > > kfree_rcu.
> >> >> > > 
> >> >> > > Is there a risk of DOS if RCU is delayed while
> >> >> > > lots of memory is queued up in this way?
> >> >> > > If yes is this a generic problem with kfree_rcu
> >> >> > > that should be addressed in core kernel?
> >> >> > 
> >> >> > There is indeed a risk.
> >> >> 
> >> >> In our case it's a 2K object. Is it a practical risk?
> >> > 
> >> > How many kfree_rcu()s per second can a given user cause to happen?
> >> 
> >> Not much more than a few hundred thousand per second per process (normal
> >> operation is zero).
> >> 
> > I managed to do 21466 per second.
> 
> Strange, why so slow?
> 
Because ftrace buffer overflows :) With bigger buffer I get 169940.

> >> Good idea.  Michael, is should be easy to modify kvm-unit-tests to write
> >> to the APIC ID register in a loop.
> >> 
> > I did. Memory consumption does not grow on otherwise idle host.
> 
> Ok, thanks.
> 
> 
> -- 
> error compiling committee.c: too many arguments to function

--
Gleb.
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCHv2] KVM: optimize apic interrupt delivery

2012-09-12 Thread Paul E. McKenney
On Wed, Sep 12, 2012 at 03:44:26PM +0300, Gleb Natapov wrote:
> On Wed, Sep 12, 2012 at 03:36:57PM +0300, Avi Kivity wrote:
> > On 09/12/2012 03:34 PM, Gleb Natapov wrote:
> > > On Wed, Sep 12, 2012 at 10:45:22AM +0300, Avi Kivity wrote:
> > >> On 09/12/2012 04:03 AM, Paul E. McKenney wrote:
> > >> >> > > Paul, I'd like to check something with you here:
> > >> >> > > this function can be triggered by userspace,
> > >> >> > > any number of times; we allocate
> > >> >> > > a 2K chunk of memory that is later freed by
> > >> >> > > kfree_rcu.
> > >> >> > > 
> > >> >> > > Is there a risk of DOS if RCU is delayed while
> > >> >> > > lots of memory is queued up in this way?
> > >> >> > > If yes is this a generic problem with kfree_rcu
> > >> >> > > that should be addressed in core kernel?
> > >> >> > 
> > >> >> > There is indeed a risk.
> > >> >> 
> > >> >> In our case it's a 2K object. Is it a practical risk?
> > >> > 
> > >> > How many kfree_rcu()s per second can a given user cause to happen?
> > >> 
> > >> Not much more than a few hundred thousand per second per process (normal
> > >> operation is zero).
> > >> 
> > > I managed to do 21466 per second.
> > 
> > Strange, why so slow?
> > 
> Because ftrace buffer overflows :) With bigger buffer I get 169940.

Ah, good, should not be a problem.  In contrast, if you ran kfree_rcu() in
a tight loop, you could probably do in excess of 100M per CPU per second.
Now -that- might be a problem.

Well, it -might- be a problem if you somehow figured out how to allocate
memory that quickly in a steady-state manner.  ;-)

> > >> Good idea.  Michael, is should be easy to modify kvm-unit-tests to write
> > >> to the APIC ID register in a loop.
> > >> 
> > > I did. Memory consumption does not grow on otherwise idle host.

Very good -- the checks in __call_rcu(), which is common code invoked by
kfree_rcu(), seem to be doing their job, then.  These do keep a per-CPU
counter, which can be adjusted via rcutree.blimit, which defaults
to taking evasive action if more than 10K callbacks are waiting on a
given CPU.

My concern was that you might be overrunning that limit in way less
than a grace period (as in about a hundred microseconds).  My concern
was of course unfounded -- you take several grace periods to push 10K
callbacks through.

Thanx, Paul

> > Ok, thanks.
> > 
> > 
> > -- 
> > error compiling committee.c: too many arguments to function
> 
> --
>   Gleb.
> 

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCHv2] KVM: optimize apic interrupt delivery

2013-11-26 Thread Michael S. Tsirkin
On Wed, Sep 12, 2012 at 08:13:54AM -0700, Paul E. McKenney wrote:
> On Wed, Sep 12, 2012 at 03:44:26PM +0300, Gleb Natapov wrote:
> > On Wed, Sep 12, 2012 at 03:36:57PM +0300, Avi Kivity wrote:
> > > On 09/12/2012 03:34 PM, Gleb Natapov wrote:
> > > > On Wed, Sep 12, 2012 at 10:45:22AM +0300, Avi Kivity wrote:
> > > >> On 09/12/2012 04:03 AM, Paul E. McKenney wrote:
> > > >> >> > > Paul, I'd like to check something with you here:
> > > >> >> > > this function can be triggered by userspace,
> > > >> >> > > any number of times; we allocate
> > > >> >> > > a 2K chunk of memory that is later freed by
> > > >> >> > > kfree_rcu.
> > > >> >> > > 
> > > >> >> > > Is there a risk of DOS if RCU is delayed while
> > > >> >> > > lots of memory is queued up in this way?
> > > >> >> > > If yes is this a generic problem with kfree_rcu
> > > >> >> > > that should be addressed in core kernel?
> > > >> >> > 
> > > >> >> > There is indeed a risk.
> > > >> >> 
> > > >> >> In our case it's a 2K object. Is it a practical risk?
> > > >> > 
> > > >> > How many kfree_rcu()s per second can a given user cause to happen?
> > > >> 
> > > >> Not much more than a few hundred thousand per second per process 
> > > >> (normal
> > > >> operation is zero).
> > > >> 
> > > > I managed to do 21466 per second.
> > > 
> > > Strange, why so slow?
> > > 
> > Because ftrace buffer overflows :) With bigger buffer I get 169940.
> 
> Ah, good, should not be a problem.  In contrast, if you ran kfree_rcu() in
> a tight loop, you could probably do in excess of 100M per CPU per second.
> Now -that- might be a problem.
> 
> Well, it -might- be a problem if you somehow figured out how to allocate
> memory that quickly in a steady-state manner.  ;-)
> 
> > > >> Good idea.  Michael, is should be easy to modify kvm-unit-tests to 
> > > >> write
> > > >> to the APIC ID register in a loop.
> > > >> 
> > > > I did. Memory consumption does not grow on otherwise idle host.
> 
> Very good -- the checks in __call_rcu(), which is common code invoked by
> kfree_rcu(), seem to be doing their job, then.  These do keep a per-CPU
> counter, which can be adjusted via rcutree.blimit, which defaults
> to taking evasive action if more than 10K callbacks are waiting on a
> given CPU.
> 
> My concern was that you might be overrunning that limit in way less
> than a grace period (as in about a hundred microseconds.  My concern
> was of course unfounded -- you take several grace periods in push 10K
> callbacks through.
> 
>   Thanx, Paul

Gleb noted that Documentation/RCU/checklist.txt has this text:

An especially important property of the synchronize_rcu()
primitive is that it automatically self-limits: if grace periods
are delayed for whatever reason, then the synchronize_rcu()
primitive will correspondingly delay updates.  In contrast,
code using call_rcu() should explicitly limit update rate in
cases where grace periods are delayed, as failing to do so can
result in excessive realtime latencies or even OOM conditions.

If call_rcu is self-limiting maybe this should be documented ...

> > > Ok, thanks.
> > > 
> > > 
> > > -- 
> > > error compiling committee.c: too many arguments to function
> > 
> > --
> > Gleb.
> > 
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCHv2] KVM: optimize apic interrupt delivery

2013-11-26 Thread Paul E. McKenney
On Tue, Nov 26, 2013 at 06:24:13PM +0200, Michael S. Tsirkin wrote:
> On Wed, Sep 12, 2012 at 08:13:54AM -0700, Paul E. McKenney wrote:
> > On Wed, Sep 12, 2012 at 03:44:26PM +0300, Gleb Natapov wrote:
> > > On Wed, Sep 12, 2012 at 03:36:57PM +0300, Avi Kivity wrote:
> > > > On 09/12/2012 03:34 PM, Gleb Natapov wrote:
> > > > > On Wed, Sep 12, 2012 at 10:45:22AM +0300, Avi Kivity wrote:
> > > > >> On 09/12/2012 04:03 AM, Paul E. McKenney wrote:
> > > > >> >> > > Paul, I'd like to check something with you here:
> > > > >> >> > > this function can be triggered by userspace,
> > > > >> >> > > any number of times; we allocate
> > > > >> >> > > a 2K chunk of memory that is later freed by
> > > > >> >> > > kfree_rcu.
> > > > >> >> > > 
> > > > >> >> > > Is there a risk of DOS if RCU is delayed while
> > > > >> >> > > lots of memory is queued up in this way?
> > > > >> >> > > If yes is this a generic problem with kfree_rcu
> > > > >> >> > > that should be addressed in core kernel?
> > > > >> >> > 
> > > > >> >> > There is indeed a risk.
> > > > >> >> 
> > > > >> >> In our case it's a 2K object. Is it a practical risk?
> > > > >> > 
> > > > >> > How many kfree_rcu()s per second can a given user cause to happen?
> > > > >> 
> > > > >> Not much more than a few hundred thousand per second per process 
> > > > >> (normal
> > > > >> operation is zero).
> > > > >> 
> > > > > I managed to do 21466 per second.
> > > > 
> > > > Strange, why so slow?
> > > > 
> > > Because ftrace buffer overflows :) With bigger buffer I get 169940.
> > 
> > Ah, good, should not be a problem.  In contrast, if you ran kfree_rcu() in
> > a tight loop, you could probably do in excess of 100M per CPU per second.
> > Now -that- might be a problem.
> > 
> > Well, it -might- be a problem if you somehow figured out how to allocate
> > memory that quickly in a steady-state manner.  ;-)
> > 
> > > > >> Good idea.  Michael, is should be easy to modify kvm-unit-tests to 
> > > > >> write
> > > > >> to the APIC ID register in a loop.
> > > > >> 
> > > > > I did. Memory consumption does not grow on otherwise idle host.
> > 
> > Very good -- the checks in __call_rcu(), which is common code invoked by
> > kfree_rcu(), seem to be doing their job, then.  These do keep a per-CPU
> > counter, which can be adjusted via rcutree.blimit, which defaults
> > to taking evasive action if more than 10K callbacks are waiting on a
> > given CPU.
> > 
> > My concern was that you might be overrunning that limit in way less
> > than a grace period (as in about a hundred microseconds.  My concern
> > was of course unfounded -- you take several grace periods in push 10K
> > callbacks through.
> > 
> > Thanx, Paul
> 
> Gleb noted that Documentation/RCU/checklist.txt has this text:
> 
> An especially important property of the synchronize_rcu()
> primitive is that it automatically self-limits: if grace periods
> are delayed for whatever reason, then the synchronize_rcu()
> primitive will correspondingly delay updates.  In contrast,
> code using call_rcu() should explicitly limit update rate in
> cases where grace periods are delayed, as failing to do so can
> result in excessive realtime latencies or even OOM conditions.
> 
> If call_rcu is self-limiting maybe this should be documented ...

It would be more accurate to say that it takes some measures to limit
the damage -- you can overwhelm these measures if you try hard enough.

And I guess I could say something to that effect.  ;-)

Thanx, Paul

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCHv2] KVM: optimize apic interrupt delivery

2013-11-26 Thread Marcelo Tosatti
On Tue, Nov 26, 2013 at 06:24:13PM +0200, Michael S. Tsirkin wrote:
> On Wed, Sep 12, 2012 at 08:13:54AM -0700, Paul E. McKenney wrote:
> > On Wed, Sep 12, 2012 at 03:44:26PM +0300, Gleb Natapov wrote:
> > > On Wed, Sep 12, 2012 at 03:36:57PM +0300, Avi Kivity wrote:
> > > > On 09/12/2012 03:34 PM, Gleb Natapov wrote:
> > > > > On Wed, Sep 12, 2012 at 10:45:22AM +0300, Avi Kivity wrote:
> > > > >> On 09/12/2012 04:03 AM, Paul E. McKenney wrote:
> > > > >> >> > > Paul, I'd like to check something with you here:
> > > > >> >> > > this function can be triggered by userspace,
> > > > >> >> > > any number of times; we allocate
> > > > >> >> > > a 2K chunk of memory that is later freed by
> > > > >> >> > > kfree_rcu.
> > > > >> >> > > 
> > > > >> >> > > Is there a risk of DOS if RCU is delayed while
> > > > >> >> > > lots of memory is queued up in this way?
> > > > >> >> > > If yes is this a generic problem with kfree_rcu
> > > > >> >> > > that should be addressed in core kernel?
> > > > >> >> > 
> > > > >> >> > There is indeed a risk.
> > > > >> >> 
> > > > >> >> In our case it's a 2K object. Is it a practical risk?
> > > > >> > 
> > > > >> > How many kfree_rcu()s per second can a given user cause to happen?
> > > > >> 
> > > > >> Not much more than a few hundred thousand per second per process 
> > > > >> (normal
> > > > >> operation is zero).
> > > > >> 
> > > > > I managed to do 21466 per second.
> > > > 
> > > > Strange, why so slow?
> > > > 
> > > Because ftrace buffer overflows :) With bigger buffer I get 169940.
> > 
> > Ah, good, should not be a problem.  In contrast, if you ran kfree_rcu() in
> > a tight loop, you could probably do in excess of 100M per CPU per second.
> > Now -that- might be a problem.
> > 
> > Well, it -might- be a problem if you somehow figured out how to allocate
> > memory that quickly in a steady-state manner.  ;-)
> > 
> > > > >> Good idea.  Michael, it should be easy to modify kvm-unit-tests to 
> > > > >> write
> > > > >> to the APIC ID register in a loop.
> > > > >> 
> > > > > I did. Memory consumption does not grow on otherwise idle host.
> > 
> > Very good -- the checks in __call_rcu(), which is common code invoked by
> > kfree_rcu(), seem to be doing their job, then.  These do keep a per-CPU
> > counter, which can be adjusted via rcutree.blimit, which defaults
> > to taking evasive action if more than 10K callbacks are waiting on a
> > given CPU.
> > 
> > My concern was that you might be overrunning that limit in way less
> > than a grace period (as in about a hundred microseconds).  My concern
> > was of course unfounded -- you take several grace periods to push 10K
> > callbacks through.
> > 
> > Thanx, Paul
> 
> Gleb noted that Documentation/RCU/checklist.txt has this text:
> 
> An especially important property of the synchronize_rcu()
> primitive is that it automatically self-limits: if grace periods
> are delayed for whatever reason, then the synchronize_rcu()
> primitive will correspondingly delay updates.  In contrast,
> code using call_rcu() should explicitly limit update rate in
> cases where grace periods are delayed, as failing to do so can
> result in excessive realtime latencies or even OOM conditions.
> 
> If call_rcu is self-limiting maybe this should be documented ...

The documentation should be fixed, rather, to not mention that
call_rcu() must be rate-limited by the user.

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCHv2] KVM: optimize apic interrupt delivery

2013-11-27 Thread Gleb Natapov
On Tue, Nov 26, 2013 at 11:35:06AM -0800, Paul E. McKenney wrote:
> On Tue, Nov 26, 2013 at 06:24:13PM +0200, Michael S. Tsirkin wrote:
> > On Wed, Sep 12, 2012 at 08:13:54AM -0700, Paul E. McKenney wrote:
> > > On Wed, Sep 12, 2012 at 03:44:26PM +0300, Gleb Natapov wrote:
> > > > On Wed, Sep 12, 2012 at 03:36:57PM +0300, Avi Kivity wrote:
> > > > > On 09/12/2012 03:34 PM, Gleb Natapov wrote:
> > > > > > On Wed, Sep 12, 2012 at 10:45:22AM +0300, Avi Kivity wrote:
> > > > > >> On 09/12/2012 04:03 AM, Paul E. McKenney wrote:
> > > > > >> >> > > Paul, I'd like to check something with you here:
> > > > > >> >> > > this function can be triggered by userspace,
> > > > > >> >> > > any number of times; we allocate
> > > > > >> >> > > a 2K chunk of memory that is later freed by
> > > > > >> >> > > kfree_rcu.
> > > > > >> >> > > 
> > > > > >> >> > > Is there a risk of DOS if RCU is delayed while
> > > > > >> >> > > lots of memory is queued up in this way?
> > > > > >> >> > > If yes is this a generic problem with kfree_rcu
> > > > > >> >> > > that should be addressed in core kernel?
> > > > > >> >> > 
> > > > > >> >> > There is indeed a risk.
> > > > > >> >> 
> > > > > >> >> In our case it's a 2K object. Is it a practical risk?
> > > > > >> > 
> > > > > >> > How many kfree_rcu()s per second can a given user cause to 
> > > > > >> > happen?
> > > > > >> 
> > > > > >> Not much more than a few hundred thousand per second per process 
> > > > > >> (normal
> > > > > >> operation is zero).
> > > > > >> 
> > > > > > I managed to do 21466 per second.
> > > > > 
> > > > > Strange, why so slow?
> > > > > 
> > > > Because ftrace buffer overflows :) With bigger buffer I get 169940.
> > > 
> > > Ah, good, should not be a problem.  In contrast, if you ran kfree_rcu() in
> > > a tight loop, you could probably do in excess of 100M per CPU per second.
> > > Now -that- might be a problem.
> > > 
> > > Well, it -might- be a problem if you somehow figured out how to allocate
> > > memory that quickly in a steady-state manner.  ;-)
> > > 
> > > > > >> Good idea.  Michael, is should be easy to modify kvm-unit-tests to 
> > > > > >> write
> > > > > >> to the APIC ID register in a loop.
> > > > > >> 
> > > > > > I did. Memory consumption does not grow on otherwise idle host.
> > > 
> > > Very good -- the checks in __call_rcu(), which is common code invoked by
> > > kfree_rcu(), seem to be doing their job, then.  These do keep a per-CPU
> > > counter, which can be adjusted via rcutree.blimit, which defaults
> > > to taking evasive action if more than 10K callbacks are waiting on a
> > > given CPU.
> > > 
> > > My concern was that you might be overrunning that limit in way less
> > > than a grace period (as in about a hundred microseconds.  My concern
> > > was of course unfounded -- you take several grace periods in push 10K
> > > callbacks through.
> > > 
> > >   Thanx, Paul
> > 
> > Gleb noted that Documentation/RCU/checklist.txt has this text:
> > 
> > An especially important property of the synchronize_rcu()
> > primitive is that it automatically self-limits: if grace periods
> > are delayed for whatever reason, then the synchronize_rcu()
> > primitive will correspondingly delay updates.  In contrast,
> > code using call_rcu() should explicitly limit update rate in
> > cases where grace periods are delayed, as failing to do so can
> > result in excessive realtime latencies or even OOM conditions.
> > 
> > If call_rcu is self-limiting maybe this should be documented ...
> 
> It would be more accurate to say that it takes some measures to limit
> the damage -- you can overwhelm these measures if you try hard enough.
> 
The question is: Is it safe to have a call_rcu() without any additional rate 
limiting
on a user-triggerable code path?

> And I guess I could say something to that effect.  ;-)
> 
>   Thanx, Paul

--
Gleb.
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCHv2] KVM: optimize apic interrupt delivery

2013-11-27 Thread Paul E. McKenney
On Wed, Nov 27, 2013 at 10:00:09AM +0200, Gleb Natapov wrote:
> On Tue, Nov 26, 2013 at 11:35:06AM -0800, Paul E. McKenney wrote:
> > On Tue, Nov 26, 2013 at 06:24:13PM +0200, Michael S. Tsirkin wrote:
> > > On Wed, Sep 12, 2012 at 08:13:54AM -0700, Paul E. McKenney wrote:
> > > > On Wed, Sep 12, 2012 at 03:44:26PM +0300, Gleb Natapov wrote:
> > > > > On Wed, Sep 12, 2012 at 03:36:57PM +0300, Avi Kivity wrote:
> > > > > > On 09/12/2012 03:34 PM, Gleb Natapov wrote:
> > > > > > > On Wed, Sep 12, 2012 at 10:45:22AM +0300, Avi Kivity wrote:
> > > > > > >> On 09/12/2012 04:03 AM, Paul E. McKenney wrote:
> > > > > > >> >> > > Paul, I'd like to check something with you here:
> > > > > > >> >> > > this function can be triggered by userspace,
> > > > > > >> >> > > any number of times; we allocate
> > > > > > >> >> > > a 2K chunk of memory that is later freed by
> > > > > > >> >> > > kfree_rcu.
> > > > > > >> >> > > 
> > > > > > >> >> > > Is there a risk of DOS if RCU is delayed while
> > > > > > >> >> > > lots of memory is queued up in this way?
> > > > > > >> >> > > If yes is this a generic problem with kfree_rcu
> > > > > > >> >> > > that should be addressed in core kernel?
> > > > > > >> >> > 
> > > > > > >> >> > There is indeed a risk.
> > > > > > >> >> 
> > > > > > >> >> In our case it's a 2K object. Is it a practical risk?
> > > > > > >> > 
> > > > > > >> > How many kfree_rcu()s per second can a given user cause to 
> > > > > > >> > happen?
> > > > > > >> 
> > > > > > >> Not much more than a few hundred thousand per second per process 
> > > > > > >> (normal
> > > > > > >> operation is zero).
> > > > > > >> 
> > > > > > > I managed to do 21466 per second.
> > > > > > 
> > > > > > Strange, why so slow?
> > > > > > 
> > > > > Because ftrace buffer overflows :) With bigger buffer I get 169940.
> > > > 
> > > > Ah, good, should not be a problem.  In contrast, if you ran kfree_rcu() 
> > > > in
> > > > a tight loop, you could probably do in excess of 100M per CPU per 
> > > > second.
> > > > Now -that- might be a problem.
> > > > 
> > > > Well, it -might- be a problem if you somehow figured out how to allocate
> > > > memory that quickly in a steady-state manner.  ;-)
> > > > 
> > > > > > >> Good idea.  Michael, is should be easy to modify kvm-unit-tests 
> > > > > > >> to write
> > > > > > >> to the APIC ID register in a loop.
> > > > > > >> 
> > > > > > > I did. Memory consumption does not grow on otherwise idle host.
> > > > 
> > > > Very good -- the checks in __call_rcu(), which is common code invoked by
> > > > kfree_rcu(), seem to be doing their job, then.  These do keep a per-CPU
> > > > counter, which can be adjusted via rcutree.blimit, which defaults
> > > > to taking evasive action if more than 10K callbacks are waiting on a
> > > > given CPU.
> > > > 
> > > > My concern was that you might be overrunning that limit in way less
> > > > than a grace period (as in about a hundred microseconds.  My concern
> > > > was of course unfounded -- you take several grace periods in push 10K
> > > > callbacks through.
> > > > 
> > > > Thanx, Paul
> > > 
> > > Gleb noted that Documentation/RCU/checklist.txt has this text:
> > > 
> > > An especially important property of the synchronize_rcu()
> > > primitive is that it automatically self-limits: if grace periods
> > > are delayed for whatever reason, then the synchronize_rcu()
> > > primitive will correspondingly delay updates.  In contrast,
> > > code using call_rcu() should explicitly limit update rate in
> > > cases where grace periods are delayed, as failing to do so can
> > > result in excessive realtime latencies or even OOM conditions.
> > > 
> > > If call_rcu is self-limiting maybe this should be documented ...
> > 
> > It would be more accurate to say that takes has some measures to limit
> > the damage -- you can overwhelm these measures if you try hard enough.
> > 
> The question is: Is it safe to have a call_rcu() without any additional rate 
> limiting
> on user triggerable code path?

That would be a good way to allow users to run your system out of memory,
especially on systems with limited memory.  (If you have several GB of
free space, you might be OK.)

Thanx, Paul

> > And I guess I could say something to that effect.  ;-)
> > 
> > Thanx, Paul
> 
> --
>   Gleb.
> 

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCHv2] KVM: optimize apic interrupt delivery

2013-11-28 Thread Gleb Natapov
On Wed, Nov 27, 2013 at 09:06:36AM -0800, Paul E. McKenney wrote:
> On Wed, Nov 27, 2013 at 10:00:09AM +0200, Gleb Natapov wrote:
> > On Tue, Nov 26, 2013 at 11:35:06AM -0800, Paul E. McKenney wrote:
> > > On Tue, Nov 26, 2013 at 06:24:13PM +0200, Michael S. Tsirkin wrote:
> > > > On Wed, Sep 12, 2012 at 08:13:54AM -0700, Paul E. McKenney wrote:
> > > > > On Wed, Sep 12, 2012 at 03:44:26PM +0300, Gleb Natapov wrote:
> > > > > > On Wed, Sep 12, 2012 at 03:36:57PM +0300, Avi Kivity wrote:
> > > > > > > On 09/12/2012 03:34 PM, Gleb Natapov wrote:
> > > > > > > > On Wed, Sep 12, 2012 at 10:45:22AM +0300, Avi Kivity wrote:
> > > > > > > >> On 09/12/2012 04:03 AM, Paul E. McKenney wrote:
> > > > > > > >> >> > > Paul, I'd like to check something with you here:
> > > > > > > >> >> > > this function can be triggered by userspace,
> > > > > > > >> >> > > any number of times; we allocate
> > > > > > > >> >> > > a 2K chunk of memory that is later freed by
> > > > > > > >> >> > > kfree_rcu.
> > > > > > > >> >> > > 
> > > > > > > >> >> > > Is there a risk of DOS if RCU is delayed while
> > > > > > > >> >> > > lots of memory is queued up in this way?
> > > > > > > >> >> > > If yes is this a generic problem with kfree_rcu
> > > > > > > >> >> > > that should be addressed in core kernel?
> > > > > > > >> >> > 
> > > > > > > >> >> > There is indeed a risk.
> > > > > > > >> >> 
> > > > > > > >> >> In our case it's a 2K object. Is it a practical risk?
> > > > > > > >> > 
> > > > > > > >> > How many kfree_rcu()s per second can a given user cause to 
> > > > > > > >> > happen?
> > > > > > > >> 
> > > > > > > >> Not much more than a few hundred thousand per second per 
> > > > > > > >> process (normal
> > > > > > > >> operation is zero).
> > > > > > > >> 
> > > > > > > > I managed to do 21466 per second.
> > > > > > > 
> > > > > > > Strange, why so slow?
> > > > > > > 
> > > > > > Because ftrace buffer overflows :) With bigger buffer I get 169940.
> > > > > 
> > > > > Ah, good, should not be a problem.  In contrast, if you ran 
> > > > > kfree_rcu() in
> > > > > a tight loop, you could probably do in excess of 100M per CPU per 
> > > > > second.
> > > > > Now -that- might be a problem.
> > > > > 
> > > > > Well, it -might- be a problem if you somehow figured out how to 
> > > > > allocate
> > > > > memory that quickly in a steady-state manner.  ;-)
> > > > > 
> > > > > > > >> Good idea.  Michael, is should be easy to modify 
> > > > > > > >> kvm-unit-tests to write
> > > > > > > >> to the APIC ID register in a loop.
> > > > > > > >> 
> > > > > > > > I did. Memory consumption does not grow on otherwise idle host.
> > > > > 
> > > > > Very good -- the checks in __call_rcu(), which is common code invoked 
> > > > > by
> > > > > kfree_rcu(), seem to be doing their job, then.  These do keep a 
> > > > > per-CPU
> > > > > counter, which can be adjusted via rcutree.blimit, which defaults
> > > > > to taking evasive action if more than 10K callbacks are waiting on a
> > > > > given CPU.
> > > > > 
> > > > > My concern was that you might be overrunning that limit in way less
> > > > > than a grace period (as in about a hundred microseconds.  My concern
> > > > > was of course unfounded -- you take several grace periods in push 10K
> > > > > callbacks through.
> > > > > 
> > > > >   Thanx, Paul
> > > > 
> > > > Gleb noted that Documentation/RCU/checklist.txt has this text:
> > > > 
> > > > An especially important property of the synchronize_rcu()
> > > > primitive is that it automatically self-limits: if grace periods
> > > > are delayed for whatever reason, then the synchronize_rcu()
> > > > primitive will correspondingly delay updates.  In contrast,
> > > > code using call_rcu() should explicitly limit update rate in
> > > > cases where grace periods are delayed, as failing to do so can
> > > > result in excessive realtime latencies or even OOM conditions.
> > > > 
> > > > If call_rcu is self-limiting maybe this should be documented ...
> > > 
> > > It would be more accurate to say that takes has some measures to limit
> > > the damage -- you can overwhelm these measures if you try hard enough.
> > > 
> > The question is: Is it safe to have a call_rcu() without any additional 
> > rate limiting
> > on user triggerable code path?
> 
> That would be a good way to allow users to run your system out of memory,
> especially on systems with limited memory.  (If you have several GB of
> free space, you might be OK.)
> 
Thanks! Got it.

--
Gleb.
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCHv2] KVM: optimize apic interrupt delivery

2013-12-05 Thread Paul E. McKenney
On Thu, Nov 28, 2013 at 10:55:06AM +0200, Gleb Natapov wrote:
> On Wed, Nov 27, 2013 at 09:06:36AM -0800, Paul E. McKenney wrote:
> > On Wed, Nov 27, 2013 at 10:00:09AM +0200, Gleb Natapov wrote:
> > > On Tue, Nov 26, 2013 at 11:35:06AM -0800, Paul E. McKenney wrote:
> > > > On Tue, Nov 26, 2013 at 06:24:13PM +0200, Michael S. Tsirkin wrote:
> > > > > On Wed, Sep 12, 2012 at 08:13:54AM -0700, Paul E. McKenney wrote:
> > > > > > On Wed, Sep 12, 2012 at 03:44:26PM +0300, Gleb Natapov wrote:
> > > > > > > On Wed, Sep 12, 2012 at 03:36:57PM +0300, Avi Kivity wrote:
> > > > > > > > On 09/12/2012 03:34 PM, Gleb Natapov wrote:
> > > > > > > > > On Wed, Sep 12, 2012 at 10:45:22AM +0300, Avi Kivity wrote:
> > > > > > > > >> On 09/12/2012 04:03 AM, Paul E. McKenney wrote:
> > > > > > > > >> >> > > Paul, I'd like to check something with you here:
> > > > > > > > >> >> > > this function can be triggered by userspace,
> > > > > > > > >> >> > > any number of times; we allocate
> > > > > > > > >> >> > > a 2K chunk of memory that is later freed by
> > > > > > > > >> >> > > kfree_rcu.
> > > > > > > > >> >> > > 
> > > > > > > > >> >> > > Is there a risk of DOS if RCU is delayed while
> > > > > > > > >> >> > > lots of memory is queued up in this way?
> > > > > > > > >> >> > > If yes is this a generic problem with kfree_rcu
> > > > > > > > >> >> > > that should be addressed in core kernel?
> > > > > > > > >> >> > 
> > > > > > > > >> >> > There is indeed a risk.
> > > > > > > > >> >> 
> > > > > > > > >> >> In our case it's a 2K object. Is it a practical risk?
> > > > > > > > >> > 
> > > > > > > > >> > How many kfree_rcu()s per second can a given user cause to 
> > > > > > > > >> > happen?
> > > > > > > > >> 
> > > > > > > > >> Not much more than a few hundred thousand per second per 
> > > > > > > > >> process (normal
> > > > > > > > >> operation is zero).
> > > > > > > > >> 
> > > > > > > > > I managed to do 21466 per second.
> > > > > > > > 
> > > > > > > > Strange, why so slow?
> > > > > > > > 
> > > > > > > Because ftrace buffer overflows :) With bigger buffer I get 
> > > > > > > 169940.
> > > > > > 
> > > > > > Ah, good, should not be a problem.  In contrast, if you ran 
> > > > > > kfree_rcu() in
> > > > > > a tight loop, you could probably do in excess of 100M per CPU per 
> > > > > > second.
> > > > > > Now -that- might be a problem.
> > > > > > 
> > > > > > Well, it -might- be a problem if you somehow figured out how to 
> > > > > > allocate
> > > > > > memory that quickly in a steady-state manner.  ;-)
> > > > > > 
> > > > > > > > >> Good idea.  Michael, is should be easy to modify 
> > > > > > > > >> kvm-unit-tests to write
> > > > > > > > >> to the APIC ID register in a loop.
> > > > > > > > >> 
> > > > > > > > > I did. Memory consumption does not grow on otherwise idle 
> > > > > > > > > host.
> > > > > > 
> > > > > > Very good -- the checks in __call_rcu(), which is common code 
> > > > > > invoked by
> > > > > > kfree_rcu(), seem to be doing their job, then.  These do keep a 
> > > > > > per-CPU
> > > > > > counter, which can be adjusted via rcutree.blimit, which defaults
> > > > > > to taking evasive action if more than 10K callbacks are waiting on a
> > > > > > given CPU.
> > > > > > 
> > > > > > My concern was that you might be overrunning that limit in way less
> > > > > > than a grace period (as in about a hundred microseconds.  My concern
> > > > > > was of course unfounded -- you take several grace periods in push 
> > > > > > 10K
> > > > > > callbacks through.
> > > > > > 
> > > > > > Thanx, Paul
> > > > > 
> > > > > Gleb noted that Documentation/RCU/checklist.txt has this text:
> > > > > 
> > > > > An especially important property of the synchronize_rcu()
> > > > > primitive is that it automatically self-limits: if grace 
> > > > > periods
> > > > > are delayed for whatever reason, then the synchronize_rcu()
> > > > > primitive will correspondingly delay updates.  In contrast,
> > > > > code using call_rcu() should explicitly limit update rate in
> > > > > cases where grace periods are delayed, as failing to do so can
> > > > > result in excessive realtime latencies or even OOM conditions.
> > > > > 
> > > > > If call_rcu is self-limiting maybe this should be documented ...
> > > > 
> > > > It would be more accurate to say that takes has some measures to limit
> > > > the damage -- you can overwhelm these measures if you try hard enough.
> > > > 
> > > The question is: Is it safe to have a call_rcu() without any additional 
> > > rate limiting
> > > on user triggerable code path?
> > 
> > That would be a good way to allow users to run your system out of memory,
> > especially on systems with limited memory.  (If you have several GB of
> > free space, you might be OK.)
> > 
> Thanks! Got it.

Does the following help?

Thanx, Paul

---

Re: [PATCHv2] KVM: optimize apic interrupt delivery

2013-12-06 Thread Gleb Natapov
On Thu, Dec 05, 2013 at 03:00:33PM -0800, Paul E. McKenney wrote:
> > > > The question is: Is it safe to have a call_rcu() without any additional 
> > > > rate limiting
> > > > on user triggerable code path?
> > > 
> > > That would be a good way to allow users to run your system out of memory,
> > > especially on systems with limited memory.  (If you have several GB of
> > > free space, you might be OK.)
> > > 
> > Thanks! Got it.
> 
> Does the following help?
> 
Looks good to me.

>   Thanx, Paul
> 
> 
> 
> rcu: Document call_rcu() safety mechanisms and limitations
> 
> The call_rcu() family of primitives will take action to accelerate
> grace periods when the number of callbacks pending on a given CPU
> becomes excessive.  Although this safety mechanism can be useful,
> it is no substitute for users of call_rcu() having rate-limit controls
> in place.  This commit adds this nuance to the documentation.
> 
> Reported-by: "Michael S. Tsirkin" 
> Reported-by: Gleb Natapov 
> Signed-off-by: Paul E. McKenney 
> 
> diff --git a/Documentation/RCU/checklist.txt b/Documentation/RCU/checklist.txt
> index 91266193b8f4..5733e31836b5 100644
> --- a/Documentation/RCU/checklist.txt
> +++ b/Documentation/RCU/checklist.txt
> @@ -256,10 +256,11 @@ over a rather long period of time, but improvements are 
> always welcome!
>   variations on this theme.
>  
>   b.  Limiting update rate.  For example, if updates occur only
> - once per hour, then no explicit rate limiting is required,
> - unless your system is already badly broken.  The dcache
> - subsystem takes this approach -- updates are guarded
> - by a global lock, limiting their rate.
> + once per hour, then no explicit rate limiting is
> + required, unless your system is already badly broken.
> + Older versions of the dcache subsystem takes this
> + approach -- updates were guarded by a global lock,
> + limiting their rate.
>  
>   c.  Trusted update -- if updates can only be done manually by
>   superuser or some other trusted user, then it might not
> @@ -268,7 +269,8 @@ over a rather long period of time, but improvements are 
> always welcome!
>   the machine.
>  
>   d.  Use call_rcu_bh() rather than call_rcu(), in order to take
> - advantage of call_rcu_bh()'s faster grace periods.
> + advantage of call_rcu_bh()'s faster grace periods.  (This
> + is only a partial solution, though.)
>  
>   e.  Periodically invoke synchronize_rcu(), permitting a limited
>   number of updates per grace period.
> @@ -276,6 +278,13 @@ over a rather long period of time, but improvements are 
> always welcome!
>   The same cautions apply to call_rcu_bh(), call_rcu_sched(),
>   call_srcu(), and kfree_rcu().
>  
> + Note that although these primitives do take action to avoid memory
> + exhaustion when any given CPU has too many callbacks, a determined
> + user could still exhaust memory.  This is especially the case
> + if a system with a large number of CPUs has been configured to
> + offload all of its RCU callbacks onto a single CPU, or if the
> + system has relatively little free memory.
> +
>  9.   All RCU list-traversal primitives, which include
>   rcu_dereference(), list_for_each_entry_rcu(), and
>   list_for_each_safe_rcu(), must be either within an RCU read-side
> 
> --
> To unsubscribe from this list: send the line "unsubscribe kvm" in
> the body of a message to majord...@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

--
Gleb.
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html