Re: [PATCH v2 04/10] KVM: arm/arm64: vgic: Allow HW irq to be encoded in LR

2015-07-21 Thread Marc Zyngier
On 17/07/15 20:50, Christoffer Dall wrote:
 On Wed, Jul 08, 2015 at 06:56:36PM +0100, Marc Zyngier wrote:
 Now that struct vgic_lr supports the LR_HW bit and carries a hwirq
 field, we can encode that information into the list registers.

 This patch provides implementations for both GICv2 and GICv3.

 Signed-off-by: Marc Zyngier marc.zyng...@arm.com
 ---
  include/linux/irqchip/arm-gic-v3.h |  3 +++
  include/linux/irqchip/arm-gic.h|  3 ++-
  virt/kvm/arm/vgic-v2.c | 16 +++-
  virt/kvm/arm/vgic-v3.c | 21 ++---
  4 files changed, 38 insertions(+), 5 deletions(-)

 diff --git a/include/linux/irqchip/arm-gic-v3.h 
 b/include/linux/irqchip/arm-gic-v3.h
 index ffbc034..cf637d6 100644
 --- a/include/linux/irqchip/arm-gic-v3.h
 +++ b/include/linux/irqchip/arm-gic-v3.h
 @@ -268,9 +268,12 @@
  
  #define ICH_LR_EOI  (1UL  41)
  #define ICH_LR_GROUP(1UL  60)
 +#define ICH_LR_HW   (1UL  61)
  #define ICH_LR_STATE(3UL  62)
  #define ICH_LR_PENDING_BIT  (1UL  62)
  #define ICH_LR_ACTIVE_BIT   (1UL  63)
 +#define ICH_LR_PHYS_ID_SHIFT32
 +#define ICH_LR_PHYS_ID_MASK (0x3ffUL  ICH_LR_PHYS_ID_SHIFT)
  
  #define ICH_MISR_EOI(1  0)
  #define ICH_MISR_U  (1  1)
 diff --git a/include/linux/irqchip/arm-gic.h 
 b/include/linux/irqchip/arm-gic.h
 index 9de976b..ca88dad 100644
 --- a/include/linux/irqchip/arm-gic.h
 +++ b/include/linux/irqchip/arm-gic.h
 @@ -71,11 +71,12 @@
  
  #define GICH_LR_VIRTUALID   (0x3ff  0)
  #define GICH_LR_PHYSID_CPUID_SHIFT  (10)
 -#define GICH_LR_PHYSID_CPUID(7  
 GICH_LR_PHYSID_CPUID_SHIFT)
 +#define GICH_LR_PHYSID_CPUID(0x3ff  
 GICH_LR_PHYSID_CPUID_SHIFT)
  #define GICH_LR_STATE   (3  28)
  #define GICH_LR_PENDING_BIT (1  28)
  #define GICH_LR_ACTIVE_BIT  (1  29)
  #define GICH_LR_EOI (1  19)
 +#define GICH_LR_HW  (1  31)
  
  #define GICH_VMCR_CTRL_SHIFT0
  #define GICH_VMCR_CTRL_MASK (0x21f  GICH_VMCR_CTRL_SHIFT)
 diff --git a/virt/kvm/arm/vgic-v2.c b/virt/kvm/arm/vgic-v2.c
 index f9b9c7c..8d7b04d 100644
 --- a/virt/kvm/arm/vgic-v2.c
 +++ b/virt/kvm/arm/vgic-v2.c
 @@ -48,6 +48,10 @@ static struct vgic_lr vgic_v2_get_lr(const struct 
 kvm_vcpu *vcpu, int lr)
  lr_desc.state |= LR_STATE_ACTIVE;
  if (val  GICH_LR_EOI)
  lr_desc.state |= LR_EOI_INT;
 +if (val  GICH_LR_HW) {
 +lr_desc.state |= LR_HW;
 +lr_desc.hwirq = (val  GICH_LR_PHYSID_CPUID)  
 GICH_LR_PHYSID_CPUID_SHIFT;
 +}
  
  return lr_desc;
  }
 @@ -55,7 +59,9 @@ static struct vgic_lr vgic_v2_get_lr(const struct kvm_vcpu 
 *vcpu, int lr)
  static void vgic_v2_set_lr(struct kvm_vcpu *vcpu, int lr,
 struct vgic_lr lr_desc)
  {
 -u32 lr_val = (lr_desc.source  GICH_LR_PHYSID_CPUID_SHIFT) | 
 lr_desc.irq;
 +u32 lr_val;
 +
 +lr_val = lr_desc.irq;
  
  if (lr_desc.state  LR_STATE_PENDING)
  lr_val |= GICH_LR_PENDING_BIT;
 @@ -64,6 +70,14 @@ static void vgic_v2_set_lr(struct kvm_vcpu *vcpu, int lr,
  if (lr_desc.state  LR_EOI_INT)
  lr_val |= GICH_LR_EOI;
  
 +if (lr_desc.state  LR_HW) {
 +lr_val |= GICH_LR_HW;
 +lr_val |= (u32)lr_desc.hwirq  GICH_LR_PHYSID_CPUID_SHIFT;
 +}
 +
 +if (lr_desc.irq  VGIC_NR_SGIS)
 +lr_val |= (lr_desc.source  GICH_LR_PHYSID_CPUID_SHIFT);
 +
  vcpu-arch.vgic_cpu.vgic_v2.vgic_lr[lr] = lr_val;
  }
  
 diff --git a/virt/kvm/arm/vgic-v3.c b/virt/kvm/arm/vgic-v3.c
 index dff0602..afbf925 100644
 --- a/virt/kvm/arm/vgic-v3.c
 +++ b/virt/kvm/arm/vgic-v3.c
 @@ -67,6 +67,10 @@ static struct vgic_lr vgic_v3_get_lr(const struct 
 kvm_vcpu *vcpu, int lr)
  lr_desc.state |= LR_STATE_ACTIVE;
  if (val  ICH_LR_EOI)
  lr_desc.state |= LR_EOI_INT;
 +if (val  ICH_LR_HW) {
 +lr_desc.state |= LR_HW;
 +lr_desc.hwirq = (val  ICH_LR_PHYS_ID_SHIFT)  GENMASK(9, 0);
 +}
  
  return lr_desc;
  }
 @@ -84,10 +88,17 @@ static void vgic_v3_set_lr(struct kvm_vcpu *vcpu, int lr,
   * Eventually we want to make this configurable, so we may revisit
   * this in the future.
   */
 -if (vcpu-kvm-arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3)
 +switch (vcpu-kvm-arch.vgic.vgic_model) {
 +case KVM_DEV_TYPE_ARM_VGIC_V3:
  lr_val |= ICH_LR_GROUP;
 -else
 -lr_val |= (u32)lr_desc.source  GICH_LR_PHYSID_CPUID_SHIFT;
 +break;
 +case  KVM_DEV_TYPE_ARM_VGIC_V2:
 +if (lr_desc.irq  VGIC_NR_SGIS)
 +lr_val |= (u32)lr_desc.source  
 GICH_LR_PHYSID_CPUID_SHIFT;
 
 I forget how this works: Why are we mixing GICH_LR_ stuff with ICH_LR_
 in the same function?  Aren't we always accessing 

Re: [PATCH v2 07/10] KVM: arm/arm64: vgic: Allow HW interrupts to be queued to a guest

2015-07-21 Thread Marc Zyngier
On 17/07/15 22:56, Christoffer Dall wrote:
 On Wed, Jul 08, 2015 at 06:56:39PM +0100, Marc Zyngier wrote:
 To allow a HW interrupt to be injected into a guest, we lookup the
 guest virtual interrupt in the irq_phys_map rbtree, and if we have
 
 s/rbtree/list/
 
 a match, encode both interrupts in the LR.

 We also mark the interrupt as active at the host distributor level.

 On guest EOI on the virtual interrupt, the host interrupt will be
 deactivated.

 Signed-off-by: Marc Zyngier marc.zyng...@arm.com
 ---
  virt/kvm/arm/vgic.c | 72 
 ++---
  1 file changed, 69 insertions(+), 3 deletions(-)

 diff --git a/virt/kvm/arm/vgic.c b/virt/kvm/arm/vgic.c
 index 3424329..f8582d7 100644
 --- a/virt/kvm/arm/vgic.c
 +++ b/virt/kvm/arm/vgic.c
 @@ -1118,6 +1118,26 @@ static void vgic_queue_irq_to_lr(struct kvm_vcpu 
 *vcpu, int irq,
  if (!vgic_irq_is_edge(vcpu, irq))
  vlr.state |= LR_EOI_INT;
  
 +if (vlr.irq = VGIC_NR_SGIS) {
 +struct irq_phys_map *map;
 +map = vgic_irq_map_search(vcpu, irq);
 +
 +if (map) {
 +int ret;
 +
 +BUG_ON(!map-active);
 +vlr.hwirq = map-phys_irq;
 +vlr.state |= LR_HW;
 +vlr.state = ~LR_EOI_INT;
 +
 +ret = irq_set_irqchip_state(map-irq,
 +IRQCHIP_STATE_ACTIVE,
 +true);
 
 So if we have a mapping, and the virtual interrupt is being injected,
 then we must set the state to active in the physical world, because
 otherwise the guest will just exit before processing the virtual
 interrupt, yes?

Exactly. This is what will prevent the interrupt from firing on the host
(we going back to the state the GIC would be in if we were handling this
interrupt on the host).

 I think this deserves a comment here.

Sure.

 +vgic_irq_set_queued(vcpu, irq);
 
 And we set it queued only to avoid sampling it again and setting
 PENDING+ACTIVE, because the hardware doesn't support setting that state
 when the HW bit is set, yes?

Indeed.

 I think a comment here is warranted too.

Fair enough.

 +WARN_ON(ret);
 +}
 +}
 +
  vgic_set_lr(vcpu, lr_nr, vlr);
  vgic_sync_lr_elrsr(vcpu, lr_nr, vlr);
  }
 @@ -1342,6 +1362,35 @@ static bool vgic_process_maintenance(struct kvm_vcpu 
 *vcpu)
  return level_pending;
  }
  
 +/* Return 1 if HW interrupt went from active to inactive, and 0 otherwise */
 
 ... also clear the active state on the physical distributor so that
 shared devices can be used in a different context. (did I get this
 right?)

You're getting good at that! ;-)

 +static int vgic_sync_hwirq(struct kvm_vcpu *vcpu, struct vgic_lr vlr)
 +{
 +struct irq_phys_map *map;
 +int ret;
 +
 +if (!(vlr.state  LR_HW))
 +return 0;
 +
 +map = vgic_irq_map_search(vcpu, vlr.irq);
 +BUG_ON(!map || !map-active);
 +
 +ret = irq_get_irqchip_state(map-irq,
 +IRQCHIP_STATE_ACTIVE,
 +map-active);
 +
 +WARN_ON(ret);
 +
 +if (map-active) {
 +ret = irq_set_irqchip_state(map-irq,
 +IRQCHIP_STATE_ACTIVE,
 +false);
 +WARN_ON(ret);
 +return 0;
 +}
 +
 +return 1;
 +}
 +
  /* Sync back the VGIC state after a guest run */
  static void __kvm_vgic_sync_hwstate(struct kvm_vcpu *vcpu)
  {
 @@ -1356,14 +1405,31 @@ static void __kvm_vgic_sync_hwstate(struct kvm_vcpu 
 *vcpu)
  elrsr = vgic_get_elrsr(vcpu);
  elrsr_ptr = u64_to_bitmask(elrsr);
  
 -/* Clear mappings for empty LRs */
 -for_each_set_bit(lr, elrsr_ptr, vgic-nr_lr) {
 +/* Deal with HW interrupts, and clear mappings for empty LRs */
 +for (lr = 0; lr  vgic-nr_lr; lr++) {
  struct vgic_lr vlr;
  
 -if (!test_and_clear_bit(lr, vgic_cpu-lr_used))
 +if (!test_bit(lr, vgic_cpu-lr_used))
  continue;
  
  vlr = vgic_get_lr(vcpu, lr);
 +if (vgic_sync_hwirq(vcpu, vlr)) {
 +/*
 + * So this is a HW interrupt that the guest
 + * EOI-ed. Clean the LR state and allow the
 + * interrupt to be queued again.
 
 s/queued/sampled/ ?

Indeed.

 + */
 +vlr.state = 0;
 +vlr.hwirq = 0;
 +vgic_set_lr(vcpu, lr, vlr);
 +vgic_irq_clear_queued(vcpu, vlr.irq);
 +set_bit(lr, elrsr_ptr);
 +}
 +
 +if (!test_bit(lr, elrsr_ptr))
 +continue;
 +
 +clear_bit(lr, vgic_cpu-lr_used);
  
  BUG_ON(vlr.irq = dist-nr_irqs);
  

Re: [PATCH v2 03/10] KVM: arm/arm64: vgic: Convert struct vgic_lr to use bitfields

2015-07-21 Thread Marc Zyngier
On 17/07/15 23:15, Christoffer Dall wrote:
 On Wed, Jul 08, 2015 at 06:56:35PM +0100, Marc Zyngier wrote:
 As we're about to cram more information in the vgic_lr structure
 (HW interrupt number and additional state information), we switch
 to a layout similar to the HW's:

 - use bitfields to save space (we don't need more than 10 bits
   to represent the irq numbers)
 - source CPU and HW interrupt can share the same field, as
   a SGI doesn't have a physical line.

 Signed-off-by: Marc Zyngier marc.zyng...@arm.com
 Reviewed-by: Alex Bennée alex.ben...@linaro.org
 ---
  include/kvm/arm_vgic.h | 10 +++---
  1 file changed, 7 insertions(+), 3 deletions(-)

 diff --git a/include/kvm/arm_vgic.h b/include/kvm/arm_vgic.h
 index 133ea00..4f9fa1d 100644
 --- a/include/kvm/arm_vgic.h
 +++ b/include/kvm/arm_vgic.h
 @@ -95,11 +95,15 @@ enum vgic_type {
  #define LR_STATE_ACTIVE (1  1)
  #define LR_STATE_MASK   (3  0)
  #define LR_EOI_INT  (1  2)
 +#define LR_HW   (1  3)
  
  struct vgic_lr {
 -u16 irq;
 -u8  source;
 -u8  state;
 +unsigned irq:10;
 +union {
 +unsigned hwirq:10;
 +unsigned source:8;
 
 why 8?  why not 3?

Because I can't count :-).

 +};
 +unsigned state:4;
  };
  
  struct vgic_vmcr {
 -- 
 2.1.4

 
 otherwise:
 
 Reviewed-by: Christoffer Dall christoffer.d...@linaro.org
 

Thanks,

M.
-- 
Jazz is not dead. It just smells funny...
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v2 06/10] KVM: arm/arm64: vgic: Allow dynamic mapping of physical/virtual interrupts

2015-07-21 Thread Marc Zyngier
On 17/07/15 22:11, Christoffer Dall wrote:
 On Wed, Jul 08, 2015 at 06:56:38PM +0100, Marc Zyngier wrote:
 In order to be able to feed physical interrupts to a guest, we need
 to be able to establish the virtual-physical mapping between the two
 worlds.

 The mappings are kept in a set of RCU lists, indexed by virtual interrupts.

 Signed-off-by: Marc Zyngier marc.zyng...@arm.com
 ---
  arch/arm/kvm/arm.c |   2 +
  include/kvm/arm_vgic.h |  25 +
  virt/kvm/arm/vgic.c| 144 
 -
  3 files changed, 170 insertions(+), 1 deletion(-)

 diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c
 index 1583a34..d5ce5cc 100644
 --- a/arch/arm/kvm/arm.c
 +++ b/arch/arm/kvm/arm.c
 @@ -125,6 +125,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
   if (ret)
   goto out_free_stage2_pgd;

 + kvm_vgic_init(kvm);
   kvm_timer_init(kvm);

   /* Mark the initial VMID generation invalid */
 @@ -249,6 +250,7 @@ out:

  void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu)
  {
 + kvm_vgic_vcpu_postcreate(vcpu);
  }

  void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu)
 diff --git a/include/kvm/arm_vgic.h b/include/kvm/arm_vgic.h
 index 4f9fa1d..32e57d2 100644
 --- a/include/kvm/arm_vgic.h
 +++ b/include/kvm/arm_vgic.h
 @@ -159,6 +159,19 @@ struct vgic_io_device {
   struct kvm_io_device dev;
  };

 +struct irq_phys_map {
 + u32 virt_irq;
 + u32 phys_irq;
 + u32 irq;
 + boolactive;
 +};
 +
 +struct irq_phys_map_entry {
 + struct list_headentry;
 + struct rcu_head rcu;
 + struct irq_phys_map map;
 +};
 +
  struct vgic_dist {
   spinlock_t  lock;
   boolin_kernel;
 @@ -256,6 +269,10 @@ struct vgic_dist {
   struct vgic_vm_ops  vm_ops;
   struct vgic_io_device   dist_iodev;
   struct vgic_io_device   *redist_iodevs;
 +
 + /* Virtual irq to hwirq mapping */
 + spinlock_t  irq_phys_map_lock;
 + struct list_headirq_phys_map_list;
  };

  struct vgic_v2_cpu_if {
 @@ -307,6 +324,9 @@ struct vgic_cpu {
   struct vgic_v2_cpu_if   vgic_v2;
   struct vgic_v3_cpu_if   vgic_v3;
   };
 +
 + /* Protected by the distributor's irq_phys_map_lock */
 + struct list_headirq_phys_map_list;
  };

  #define LR_EMPTY 0xff
 @@ -321,8 +341,10 @@ int kvm_vgic_addr(struct kvm *kvm, unsigned long type, 
 u64 *addr, bool write);
  int kvm_vgic_hyp_init(void);
  int kvm_vgic_map_resources(struct kvm *kvm);
  int kvm_vgic_get_max_vcpus(void);
 +void kvm_vgic_init(struct kvm *kvm);
  int kvm_vgic_create(struct kvm *kvm, u32 type);
  void kvm_vgic_destroy(struct kvm *kvm);
 +void kvm_vgic_vcpu_postcreate(struct kvm_vcpu *vcpu);
  void kvm_vgic_vcpu_destroy(struct kvm_vcpu *vcpu);
  void kvm_vgic_flush_hwstate(struct kvm_vcpu *vcpu);
  void kvm_vgic_sync_hwstate(struct kvm_vcpu *vcpu);
 @@ -331,6 +353,9 @@ int kvm_vgic_inject_irq(struct kvm *kvm, int cpuid, 
 unsigned int irq_num,
  void vgic_v3_dispatch_sgi(struct kvm_vcpu *vcpu, u64 reg);
  int kvm_vgic_vcpu_pending_irq(struct kvm_vcpu *vcpu);
  int kvm_vgic_vcpu_active_irq(struct kvm_vcpu *vcpu);
 +struct irq_phys_map *vgic_map_phys_irq(struct kvm_vcpu *vcpu,
 +int virt_irq, int irq);
 +int vgic_unmap_phys_irq(struct kvm_vcpu *vcpu, struct irq_phys_map *map);
 
 should these functions not have a kvm_ prefix?

Guess that'd be a good idea - VFIO will probably have to use them somehow.

  #define irqchip_in_kernel(k) (!!((k)-arch.vgic.in_kernel))
  #define vgic_initialized(k)  (!!((k)-arch.vgic.nr_cpus))
 diff --git a/virt/kvm/arm/vgic.c b/virt/kvm/arm/vgic.c
 index 5bd1695..3424329 100644
 --- a/virt/kvm/arm/vgic.c
 +++ b/virt/kvm/arm/vgic.c
 @@ -24,6 +24,7 @@
  #include linux/of.h
  #include linux/of_address.h
  #include linux/of_irq.h
 +#include linux/rculist.h
  #include linux/uaccess.h

  #include asm/kvm_emulate.h
 @@ -82,6 +83,8 @@ static void vgic_retire_disabled_irqs(struct kvm_vcpu 
 *vcpu);
  static void vgic_retire_lr(int lr_nr, int irq, struct kvm_vcpu *vcpu);
  static struct vgic_lr vgic_get_lr(const struct kvm_vcpu *vcpu, int lr);
  static void vgic_set_lr(struct kvm_vcpu *vcpu, int lr, struct vgic_lr 
 lr_desc);
 +static struct irq_phys_map *vgic_irq_map_search(struct kvm_vcpu *vcpu,
 + int virt_irq);

  static const struct vgic_ops *vgic_ops;
  static const struct vgic_params *vgic;
 @@ -1583,6 +1586,131 @@ static irqreturn_t vgic_maintenance_handler(int irq, 
 void *data)
   return IRQ_HANDLED;
  }

 +static struct list_head *vgic_get_irq_phys_map(struct kvm_vcpu *vcpu,
 +int virt_irq)
 +{
 + if (virt_irq  VGIC_NR_PRIVATE_IRQS)
 + return vcpu-arch.vgic_cpu.irq_phys_map_list;
 + else
 +

[RFC PATCH 0/1] vfio-pci/iommu: Detach iommu group on remove path

2015-07-21 Thread Gerald Schaefer
Hi,

during IOMMU API function testing on s390 I hit the following scenario:

After binding a device to vfio-pci, the user completes the VFIO_SET_IOMMU
ioctl and stops, see the sample C program below. Now the device is manually
removed via echo 1  /sys/bus/pci/devices/.../remove, which completes
instantly because the device is not considered in use in vfio_del_group_dev()
and ops-request will be skipped (probably because there was no
VFIO_GROUP_GET_DEVICE_FD ioctl so far, only the SET_IOMMU which only
triggered an attach iommu group).

Although the SET_IOMMU ioctl triggered the attach_dev callback in the
underlying IOMMU API, removing the device in this way won't trigger the
detach_dev callback, neither during remove nor when the user program
continues with closing group/container.

On s390 this eventually leads to a kernel panic when binding the device
again to its non-vfio PCI driver, because of the missing arch-specific
cleanup in detach_dev. On x86 I couldn't trigger the panic but I could
verify that detach_dev also won't get called in this scenario, which
probably means at least some kind of memory leak there.

I think I found a way to fix this in vfio code by calling
vfio_group_try_dissolve_container() from within vfio_del_group_dev(),
but I'm not really familiar with this code and so there may be better
ways to fix it. Any thoughts?

Regards,
Gerald


Here is the sample C program to trigger the ioctl:

#include stdio.h
#include fcntl.h
#include linux/vfio.h

int main(void)
{
int container, group, rc;

container = open(/dev/vfio/vfio, O_RDWR);
if (container  0) {
perror(open /dev/vfio/vfio\n);
return -1;
}

group = open(/dev/vfio/0, O_RDWR);
if (group  0) {
perror(open /dev/vfio/0\n);
return -1;
}

rc = ioctl(group, VFIO_GROUP_SET_CONTAINER, container);
if (rc) {
perror(ioctl VFIO_GROUP_SET_CONTAINER\n);
return -1;
}

rc = ioctl(container, VFIO_SET_IOMMU, VFIO_TYPE1_IOMMU);
if (rc) {
perror(ioctl VFIO_SET_IOMMU\n);
return -1;
}

printf(Try device remove...\n);
getchar();

close(group);
close(container);
return 0;
}


Gerald Schaefer (1):
  vfio-pci/iommu: Detach iommu group on remove path

 drivers/vfio/vfio.c | 3 +++
 1 file changed, 3 insertions(+)

-- 
2.3.8

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[RFC PATCH 1/1] vfio-pci/iommu: Detach iommu group on remove path

2015-07-21 Thread Gerald Schaefer
When a user completes the VFIO_SET_IOMMU ioctl and the vfio-pci device is
removed thereafter (before any other ioctl like VFIO_GROUP_GET_DEVICE_FD),
then the detach_dev callback of the underlying IOMMU API is never called.

This patch adds a call to vfio_group_try_dissolve_container() to the remove
path, which will trigger the missing detach_dev callback in this scenario.

Signed-off-by: Gerald Schaefer gerald.schae...@de.ibm.com
---
 drivers/vfio/vfio.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/drivers/vfio/vfio.c b/drivers/vfio/vfio.c
index 2fb29df..9c5c784 100644
--- a/drivers/vfio/vfio.c
+++ b/drivers/vfio/vfio.c
@@ -711,6 +711,8 @@ static bool vfio_dev_present(struct vfio_group *group, 
struct device *dev)
return true;
 }
 
+static void vfio_group_try_dissolve_container(struct vfio_group *group);
+
 /*
  * Decrement the device reference count and wait for the device to be
  * removed.  Open file descriptors for the device... */
@@ -785,6 +787,7 @@ void *vfio_del_group_dev(struct device *dev)
}
} while (ret = 0);
 
+   vfio_group_try_dissolve_container(group);
vfio_group_put(group);
 
return device_data;
-- 
2.3.8

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v2 10/10] KVM: arm/arm64: vgic: Allow non-shared device HW interrupts

2015-07-21 Thread Marc Zyngier
On 17/07/15 23:15, Christoffer Dall wrote:
 On Wed, Jul 08, 2015 at 06:56:42PM +0100, Marc Zyngier wrote:
 So far, the only use of the HW interrupt facility is the timer,
 implying that the active state is context-switched for each vcpu,
 as the device is is shared across all vcpus.

 This does not work for a device that has been assigned to a VM,
 as the guest is entierely in control of that device (the HW is
 not shared). In that case, it makes sense to bypass the whole
 active state switchint, and only track the deactivation of the
 
 switching
 
 interrupt.

 Signed-off-by: Marc Zyngier marc.zyng...@arm.com
 ---
  include/kvm/arm_vgic.h|  5 ++--
  virt/kvm/arm/arch_timer.c |  2 +-
  virt/kvm/arm/vgic.c   | 62 
 ---
  3 files changed, 52 insertions(+), 17 deletions(-)

 diff --git a/include/kvm/arm_vgic.h b/include/kvm/arm_vgic.h
 index 9fd4023..31c987a 100644
 --- a/include/kvm/arm_vgic.h
 +++ b/include/kvm/arm_vgic.h
 @@ -163,7 +163,8 @@ struct irq_phys_map {
  u32 virt_irq;
  u32 phys_irq;
  u32 irq;
 -boolactive;
 +boolshared;
 +boolactive; /* Only valid if shared */
  };
  
  struct irq_phys_map_entry {
 @@ -354,7 +355,7 @@ void vgic_v3_dispatch_sgi(struct kvm_vcpu *vcpu, u64 
 reg);
  int kvm_vgic_vcpu_pending_irq(struct kvm_vcpu *vcpu);
  int kvm_vgic_vcpu_active_irq(struct kvm_vcpu *vcpu);
  struct irq_phys_map *vgic_map_phys_irq(struct kvm_vcpu *vcpu,
 -   int virt_irq, int irq);
 +   int virt_irq, int irq, bool shared);
  int vgic_unmap_phys_irq(struct kvm_vcpu *vcpu, struct irq_phys_map *map);
  bool vgic_get_phys_irq_active(struct irq_phys_map *map);
  void vgic_set_phys_irq_active(struct irq_phys_map *map, bool active);
 diff --git a/virt/kvm/arm/arch_timer.c b/virt/kvm/arm/arch_timer.c
 index b9fff78..9544d79 100644
 --- a/virt/kvm/arm/arch_timer.c
 +++ b/virt/kvm/arm/arch_timer.c
 @@ -202,7 +202,7 @@ void kvm_timer_vcpu_reset(struct kvm_vcpu *vcpu,
   * Tell the VGIC that the virtual interrupt is tied to a
   * physical interrupt. We do that once per VCPU.
   */
 -timer-map = vgic_map_phys_irq(vcpu, irq-irq, host_vtimer_irq);
 +timer-map = vgic_map_phys_irq(vcpu, irq-irq, host_vtimer_irq, true);
  WARN_ON(!timer-map);
  }
  
 diff --git a/virt/kvm/arm/vgic.c b/virt/kvm/arm/vgic.c
 index 39f9479..3585bb0 100644
 --- a/virt/kvm/arm/vgic.c
 +++ b/virt/kvm/arm/vgic.c
 @@ -1123,18 +1123,21 @@ static void vgic_queue_irq_to_lr(struct kvm_vcpu 
 *vcpu, int irq,
  map = vgic_irq_map_search(vcpu, irq);
  
  if (map) {
 -int ret;
 -
 -BUG_ON(!map-active);
  vlr.hwirq = map-phys_irq;
  vlr.state |= LR_HW;
  vlr.state = ~LR_EOI_INT;
  
 -ret = irq_set_irqchip_state(map-irq,
 -IRQCHIP_STATE_ACTIVE,
 -true);
  vgic_irq_set_queued(vcpu, irq);
 -WARN_ON(ret);
 +
 +if (map-shared) {
 +int ret;
 +
 +BUG_ON(!map-active);
 +ret = irq_set_irqchip_state(map-irq,
 +
 IRQCHIP_STATE_ACTIVE,
 +true);
 +WARN_ON(ret);
 
 do we have any other example of a shared device or is this really simply
 because the timer hardware is fscking strangely tied to the gic and is a
 total one-off?

I don't think of it as a one-off. PMUs could very well be in the same
category.

 In the latter case, would it be cleaner to drop this notion of a shared
 device entirely and put all this logic in the arch timer code instead?

My initial implementation did exactly that (hence the cut-n-paste crap
you spotted in patch #6). It wasn't bad, but it did put a lot of GIC
knowledge inside the timer code, and a few callbacks too many between
the two subsystems.

 +}
  }
  }
  
 @@ -1366,21 +1369,37 @@ static bool vgic_process_maintenance(struct kvm_vcpu 
 *vcpu)
  static int vgic_sync_hwirq(struct kvm_vcpu *vcpu, struct vgic_lr vlr)
  {
  struct irq_phys_map *map;
 +bool active;
  int ret;
  
  if (!(vlr.state  LR_HW))
  return 0;
  
  map = vgic_irq_map_search(vcpu, vlr.irq);
 -BUG_ON(!map || !map-active);
 +BUG_ON(!map);
 +BUG_ON(map-shared  !map-active);
  
  ret = irq_get_irqchip_state(map-irq,
  IRQCHIP_STATE_ACTIVE,
 -map-active);
 +active);
  
  WARN_ON(ret);
  
 -

Re: [PATCH] mm: rename and document alloc_pages_exact_node

2015-07-21 Thread Michael Ellerman
On Tue, 2015-07-21 at 15:55 +0200, Vlastimil Babka wrote:
 The function alloc_pages_exact_node() was introduced in 6484eb3e2a81 (page
 allocator: do not check NUMA node ID when the caller knows the node is valid)
 as an optimized variant of alloc_pages_node(), that doesn't allow the node id
 to be -1. Unfortunately the name of the function can easily suggest that the
 allocation is restricted to the given node. In truth, the node is only
 preferred, unless __GFP_THISNODE is among the gfp flags.
 
 The misleading name has lead to mistakes in the past, see 5265047ac301 (mm,
 thp: really limit transparent hugepage allocation to local node) and
 b360edb43f8e (mm, mempolicy: migrate_to_node should only migrate to node).
 
 To prevent further mistakes, this patch renames the function to
 alloc_pages_prefer_node() and documents it together with alloc_pages_node().
 
 Signed-off-by: Vlastimil Babka vba...@suse.cz
 
  I'm CC'ing also maintainers of the callsites so they can verify that the
  callsites that don't pass __GFP_THISNODE are really not intended to restrict
  allocation to the given node. I went through them myself and each looked like
  it's better off if it can successfully allocate on a fallback node rather
  than fail. DavidR checked them also I think, but it's better if maintainers
  can verify that. I'm not completely sure about all the usages in sl*b due to
  multiple layers through which gfp flags are being passed.


 diff --git a/arch/powerpc/platforms/cell/ras.c 
 b/arch/powerpc/platforms/cell/ras.c
 index e865d74..646a310 100644
 --- a/arch/powerpc/platforms/cell/ras.c
 +++ b/arch/powerpc/platforms/cell/ras.c
 @@ -123,7 +123,7 @@ static int __init cbe_ptcal_enable_on_node(int nid, int 
 order)
  
   area-nid = nid;
   area-order = order;
 - area-pages = alloc_pages_exact_node(area-nid,
 + area-pages = alloc_pages_prefer_node(area-nid,
   GFP_KERNEL|__GFP_THISNODE,
   area-order);

Yeah that looks right to me.

This code enables some firmware memory calibration so I think it really does
want to get memory on the specified node, or else fail.

Acked-by: Michael Ellerman m...@ellerman.id.au

cheers


--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] mm: rename and document alloc_pages_exact_node

2015-07-21 Thread David Rientjes
On Tue, 21 Jul 2015, Vlastimil Babka wrote:

 The function alloc_pages_exact_node() was introduced in 6484eb3e2a81 (page
 allocator: do not check NUMA node ID when the caller knows the node is valid)
 as an optimized variant of alloc_pages_node(), that doesn't allow the node id
 to be -1. Unfortunately the name of the function can easily suggest that the
 allocation is restricted to the given node. In truth, the node is only
 preferred, unless __GFP_THISNODE is among the gfp flags.
 
 The misleading name has lead to mistakes in the past, see 5265047ac301 (mm,
 thp: really limit transparent hugepage allocation to local node) and
 b360edb43f8e (mm, mempolicy: migrate_to_node should only migrate to node).
 
 To prevent further mistakes, this patch renames the function to
 alloc_pages_prefer_node() and documents it together with alloc_pages_node().
 

alloc_pages_exact_node(), as you said, connotates that the allocation will 
take place on that node or will fail.  So why not go beyond this patch and 
actually make alloc_pages_exact_node() set __GFP_THISNODE and then call 
into a new alloc_pages_prefer_node(), which would be the current 
alloc_pages_exact_node() implementation, and then fix up the callers?
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCHv3 1/2] kvmtool: Introduce downscript option for virtio-net

2015-07-21 Thread Fan Du
To detach tap device automatically from bridge when exiting,
just like what the reverse of script does.

Signed-off-by: Fan Du fan...@intel.com
---
 include/kvm/virtio-net.h |  1 +
 virtio/net.c | 49 
 2 files changed, 38 insertions(+), 12 deletions(-)

diff --git a/include/kvm/virtio-net.h b/include/kvm/virtio-net.h
index f435cc3..d136a09 100644
--- a/include/kvm/virtio-net.h
+++ b/include/kvm/virtio-net.h
@@ -9,6 +9,7 @@ struct virtio_net_params {
const char *guest_ip;
const char *host_ip;
const char *script;
+   const char *downscript;
const char *trans;
const char *tapif;
char guest_mac[6];
diff --git a/virtio/net.c b/virtio/net.c
index 4a6a855..d343615 100644
--- a/virtio/net.c
+++ b/virtio/net.c
@@ -294,10 +294,29 @@ static int virtio_net_request_tap(struct net_dev *ndev, 
struct ifreq *ifr,
return ret;
 }
 
+static int virtio_net_exec_script(const char* script, char *tap_name)
+{
+   int pid;
+   int status;
+
+   pid = fork();
+   if (pid == 0) {
+   execl(script, script, tap_name, NULL);
+   _exit(1);
+   } else {
+   waitpid(pid, status, 0);
+   if (WIFEXITED(status)  WEXITSTATUS(status) != 0) {
+   pr_warning(Fail to setup tap by %s, script);
+   return -1;
+   }
+   }
+   return 0;
+}
+
 static bool virtio_net__tap_init(struct net_dev *ndev)
 {
int sock = socket(AF_INET, SOCK_STREAM, 0);
-   int pid, status, offload, hdr_len;
+   int offload, hdr_len;
struct sockaddr_in sin = {0};
struct ifreq ifr;
const struct virtio_net_params *params = ndev-params;
@@ -339,17 +358,8 @@ static bool virtio_net__tap_init(struct net_dev *ndev)
}
 
if (strcmp(params-script, none)) {
-   pid = fork();
-   if (pid == 0) {
-   execl(params-script, params-script, ndev-tap_name, 
NULL);
-   _exit(1);
-   } else {
-   waitpid(pid, status, 0);
-   if (WIFEXITED(status)  WEXITSTATUS(status) != 0) {
-   pr_warning(Fail to setup tap by %s, 
params-script);
-   goto fail;
-   }
-   }
+   if(virtio_net_exec_script(params-script, ndev-tap_name)  0)
+   goto fail;
} else if (!skipconf) {
memset(ifr, 0, sizeof(ifr));
strncpy(ifr.ifr_name, ndev-tap_name, sizeof(ndev-tap_name));
@@ -702,6 +712,8 @@ static int set_net_param(struct kvm *kvm, struct 
virtio_net_params *p,
die(Unknown network mode %s, please use user, tap or 
none, kvm-cfg.network);
} else if (strcmp(param, script) == 0) {
p-script = strdup(val);
+   } else if (strcmp(param, downscript) == 0) {
+   p-downscript = strdup(val);
} else if (strcmp(param, guest_ip) == 0) {
p-guest_ip = strdup(val);
} else if (strcmp(param, host_ip) == 0) {
@@ -877,6 +889,19 @@ virtio_dev_init(virtio_net__init);
 
 int virtio_net__exit(struct kvm *kvm)
 {
+   struct virtio_net_params *params;
+   struct net_dev *ndev;
+   struct list_head *ptr;
+
+   list_for_each(ptr, ndevs) {
+   ndev = list_entry(ptr, struct net_dev, list);
+   params = ndev-params;
+   /* Cleanup any tap device which attached to bridge */
+   if (ndev-mode == NET_MODE_TAP 
+   strcmp(params-downscript, none)) {
+   virtio_net_exec_script(params-downscript, 
ndev-tap_name);
+   }
+   }
return 0;
 }
 virtio_dev_exit(virtio_net__exit);
-- 
1.8.3.1

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCHv3 0/2] Fixes for kvmtool virtio-net part

2015-07-21 Thread Fan Du
Changelog:
v2:
  - Rebase with Will kvmtool.git tree[1]
  - Drop patch2 from v1, as commit f83dc816a9c7 has already fix it
v3:
  - Remove -Wunused-variable warnings

Fan Du (2):
  kvmtool: Introduce downscript option for virtio-net
  kvmtool: Restrict virtio queue number to 1 when vhost on

 include/kvm/virtio-net.h |  1 +
 virtio/net.c | 53 +---
 2 files changed, 42 insertions(+), 12 deletions(-)

-- 
1.8.3.1

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCHv3 2/2] kvmtool: Restrict virtio queue number to 1 when vhost on

2015-07-21 Thread Fan Du
vhost kernel driver does not support mutiple queue yet,
Tweak queue number will fail with --net mode=tap,vhost=1,mq=2
as below when lkvm trying to set ring kick fd for queue 2:

VHOST_SET_VRING_KICK failed: No buffer space available

Error on this scenario, and overide with the default one queue
configuration.

Signed-off-by: Fan Du fan...@intel.com
---
 virtio/net.c | 4 
 1 file changed, 4 insertions(+)

diff --git a/virtio/net.c b/virtio/net.c
index d343615..21a80f3 100644
--- a/virtio/net.c
+++ b/virtio/net.c
@@ -730,6 +730,10 @@ static int set_net_param(struct kvm *kvm, struct 
virtio_net_params *p,
p-mq = atoi(val);
} else
die(Unknown network parameter %s, param);
+   if (p-vhost  p-mq  1) {
+   p-mq = 1;
+   pr_err(vhost does not support mq yet, overide mq to 1.);
+   }
 
return 0;
 }
-- 
1.8.3.1

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v2 0/2] kvmtool: remaining musl-libc and clang fixes

2015-07-21 Thread Andre Przywara
Hi Will,

thanks for applying the fixes.
Here are the remaining patches which we talked about.
I changed the PAGE_SIZE definition according to Szabolcs' comments
(thanks for that!) and just added the -Wno-format-nonliteral option
unconditionally to our list of warning options to appease clang.

Please check if that makes sense.

Cheers,
Andre.

Andre Przywara (2):
  Makefile: avoid non-literal printf format string warnings
  avoid redefining PAGE_SIZE

 Makefile  | 1 +
 include/kvm/kvm.h | 3 +++
 2 files changed, 4 insertions(+)

-- 
2.3.5

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v2 2/2] avoid redefining PAGE_SIZE

2015-07-21 Thread Andre Przywara
PAGE_SIZE may have been defined by the C libary (musl-libc does that).
So avoid redefining it here unconditionally, instead only use our
definition if none has been provided by the libc.

Signed-off-by: Andre Przywara andre.przyw...@arm.com
---
 include/kvm/kvm.h | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/include/kvm/kvm.h b/include/kvm/kvm.h
index 754e029..37155db 100644
--- a/include/kvm/kvm.h
+++ b/include/kvm/kvm.h
@@ -11,6 +11,7 @@
 #include time.h
 #include signal.h
 #include sys/prctl.h
+#include limits.h
 
 #define SIGKVMEXIT (SIGRTMIN + 0)
 #define SIGKVMPAUSE(SIGRTMIN + 1)
@@ -19,7 +20,9 @@
 #define HOME_DIR   getenv(HOME)
 #define KVM_BINARY_NAMElkvm
 
+#ifndef PAGE_SIZE
 #define PAGE_SIZE (sysconf(_SC_PAGE_SIZE))
+#endif
 
 #define DEFINE_KVM_EXT(ext)\
.name = #ext,   \
-- 
2.3.5

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v2 1/2] Makefile: avoid non-literal printf format string warnings

2015-07-21 Thread Andre Przywara
The clang compiler by default dislikes non-literal format strings
in *printf functions, so it complains about kvm__set_dir() in kvm.c
and about the error reporting functions.
Since a fix is not easy and the code itself is fine (just seems that
the compiler is not smart enough to see that), let's just disable
the warning. Since GCC knows about this option as well (it just
doesn't have it enabled with -Wall), we can unconditionally add this
to the warning options.

Signed-off-by: Andre Przywara andre.przyw...@arm.com
---
 Makefile | 1 +
 1 file changed, 1 insertion(+)

diff --git a/Makefile b/Makefile
index 285c482..1534e6f 100644
--- a/Makefile
+++ b/Makefile
@@ -335,6 +335,7 @@ WARNINGS += -Wstrict-prototypes
 WARNINGS += -Wundef
 WARNINGS += -Wvolatile-register-var
 WARNINGS += -Wwrite-strings
+WARNINGS += -Wno-format-nonliteral
 
 CFLAGS += $(WARNINGS)
 
-- 
2.3.5

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCHv3 2/2] kvmtool: Restrict virtio queue number to 1 when vhost on

2015-07-21 Thread Andre Przywara
Hi,

On 21/07/15 07:18, Fan Du wrote:
 vhost kernel driver does not support mutiple queue yet,
 Tweak queue number will fail with --net mode=tap,vhost=1,mq=2
 as below when lkvm trying to set ring kick fd for queue 2:
 
 VHOST_SET_VRING_KICK failed: No buffer space available
 
 Error on this scenario, and overide with the default one queue
 configuration.

I don't like the idea of overriding an explicitly given command line
parameter (mq=2).
So why do you provide mq=2 in the first place if you know that the
kernel does not support it?
I'd rather see the error message to be more descriptive in that case.
That would help the user to understand what's going on, also it would
still work should the kernel ever support multiple queues in the future.

Cheers,
Andre.

 
 Signed-off-by: Fan Du fan...@intel.com
 ---
  virtio/net.c | 4 
  1 file changed, 4 insertions(+)
 
 diff --git a/virtio/net.c b/virtio/net.c
 index d343615..21a80f3 100644
 --- a/virtio/net.c
 +++ b/virtio/net.c
 @@ -730,6 +730,10 @@ static int set_net_param(struct kvm *kvm, struct 
 virtio_net_params *p,
   p-mq = atoi(val);
   } else
   die(Unknown network parameter %s, param);
 + if (p-vhost  p-mq  1) {
 + p-mq = 1;
 + pr_err(vhost does not support mq yet, overide mq to 1.);
 + }
  
   return 0;
  }
 
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCHv3 1/2] kvmtool: Introduce downscript option for virtio-net

2015-07-21 Thread Andre Przywara
Hi,

thanks for the rework, that looks good to me, some minor nits below.
Not sure if that requires a new version, maybe Will can fix that up when
he applies it.

On 21/07/15 07:18, Fan Du wrote:
 To detach tap device automatically from bridge when exiting,
 just like what the reverse of script does.
 
 Signed-off-by: Fan Du fan...@intel.com
 ---
  include/kvm/virtio-net.h |  1 +
  virtio/net.c | 49 
 
  2 files changed, 38 insertions(+), 12 deletions(-)
 
 diff --git a/include/kvm/virtio-net.h b/include/kvm/virtio-net.h
 index f435cc3..d136a09 100644
 --- a/include/kvm/virtio-net.h
 +++ b/include/kvm/virtio-net.h
 @@ -9,6 +9,7 @@ struct virtio_net_params {
   const char *guest_ip;
   const char *host_ip;
   const char *script;
 + const char *downscript;
   const char *trans;
   const char *tapif;
   char guest_mac[6];
 diff --git a/virtio/net.c b/virtio/net.c
 index 4a6a855..d343615 100644
 --- a/virtio/net.c
 +++ b/virtio/net.c
 @@ -294,10 +294,29 @@ static int virtio_net_request_tap(struct net_dev *ndev, 
 struct ifreq *ifr,
   return ret;
  }
  
 +static int virtio_net_exec_script(const char* script, char *tap_name)

can you make tap_name a const char * as well?

 +{
 + int pid;

fork() returns a value of type pid_t, so pid should be of that type
too. I know it was int before, but let's fix it when we rewrite it anyway.

 + int status;
 +
 + pid = fork();
 + if (pid == 0) {
 + execl(script, script, tap_name, NULL);
 + _exit(1);
 + } else {
 + waitpid(pid, status, 0);
 + if (WIFEXITED(status)  WEXITSTATUS(status) != 0) {
 + pr_warning(Fail to setup tap by %s, script);
 + return -1;
 + }
 + }
 + return 0;
 +}
 +
  static bool virtio_net__tap_init(struct net_dev *ndev)
  {
   int sock = socket(AF_INET, SOCK_STREAM, 0);
 - int pid, status, offload, hdr_len;
 + int offload, hdr_len;
   struct sockaddr_in sin = {0};
   struct ifreq ifr;
   const struct virtio_net_params *params = ndev-params;
 @@ -339,17 +358,8 @@ static bool virtio_net__tap_init(struct net_dev *ndev)
   }
  
   if (strcmp(params-script, none)) {
 - pid = fork();
 - if (pid == 0) {
 - execl(params-script, params-script, ndev-tap_name, 
 NULL);
 - _exit(1);
 - } else {
 - waitpid(pid, status, 0);
 - if (WIFEXITED(status)  WEXITSTATUS(status) != 0) {
 - pr_warning(Fail to setup tap by %s, 
 params-script);
 - goto fail;
 - }
 - }
 + if(virtio_net_exec_script(params-script, ndev-tap_name)  0)
 + goto fail;
   } else if (!skipconf) {
   memset(ifr, 0, sizeof(ifr));
   strncpy(ifr.ifr_name, ndev-tap_name, sizeof(ndev-tap_name));
 @@ -702,6 +712,8 @@ static int set_net_param(struct kvm *kvm, struct 
 virtio_net_params *p,
   die(Unknown network mode %s, please use user, tap or 
 none, kvm-cfg.network);
   } else if (strcmp(param, script) == 0) {
   p-script = strdup(val);
 + } else if (strcmp(param, downscript) == 0) {
 + p-downscript = strdup(val);
   } else if (strcmp(param, guest_ip) == 0) {
   p-guest_ip = strdup(val);
   } else if (strcmp(param, host_ip) == 0) {
 @@ -877,6 +889,19 @@ virtio_dev_init(virtio_net__init);
  
  int virtio_net__exit(struct kvm *kvm)
  {
 + struct virtio_net_params *params;
 + struct net_dev *ndev;
 + struct list_head *ptr;
 +
 + list_for_each(ptr, ndevs) {
 + ndev = list_entry(ptr, struct net_dev, list);
 + params = ndev-params;
 + /* Cleanup any tap device which attached to bridge */
 + if (ndev-mode == NET_MODE_TAP 
 + strcmp(params-downscript, none)) {
 + virtio_net_exec_script(params-downscript, 
 ndev-tap_name);
 + }

You don't need the braces here since the condition is only a one-liner.

Cheers,
Andre.

 + }
   return 0;
  }
  virtio_dev_exit(virtio_net__exit);
 
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH] mm: rename and document alloc_pages_exact_node

2015-07-21 Thread Vlastimil Babka
The function alloc_pages_exact_node() was introduced in 6484eb3e2a81 (page
allocator: do not check NUMA node ID when the caller knows the node is valid)
as an optimized variant of alloc_pages_node(), that doesn't allow the node id
to be -1. Unfortunately the name of the function can easily suggest that the
allocation is restricted to the given node. In truth, the node is only
preferred, unless __GFP_THISNODE is among the gfp flags.

The misleading name has lead to mistakes in the past, see 5265047ac301 (mm,
thp: really limit transparent hugepage allocation to local node) and
b360edb43f8e (mm, mempolicy: migrate_to_node should only migrate to node).

To prevent further mistakes, this patch renames the function to
alloc_pages_prefer_node() and documents it together with alloc_pages_node().

Signed-off-by: Vlastimil Babka vba...@suse.cz
Cc: Mel Gorman mgor...@suse.de
Cc: David Rientjes rient...@google.com
Cc: Greg Thelen gthe...@google.com
Cc: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com
Cc: Christoph Lameter c...@linux.com
Cc: Pekka Enberg penb...@kernel.org
Cc: Joonsoo Kim iamjoonsoo@lge.com
Cc: Naoya Horiguchi n-horigu...@ah.jp.nec.com
Cc: Tony Luck tony.l...@intel.com
Cc: Fenghua Yu fenghua...@intel.com
Cc: Arnd Bergmann a...@arndb.de
Cc: Benjamin Herrenschmidt b...@kernel.crashing.org
Cc: Paul Mackerras pau...@samba.org
Cc: Michael Ellerman m...@ellerman.id.au
Cc: Gleb Natapov g...@kernel.org
Cc: Paolo Bonzini pbonz...@redhat.com
Cc: Thomas Gleixner t...@linutronix.de
Cc: Ingo Molnar mi...@redhat.com
Cc: H. Peter Anvin h...@zytor.com
Cc: Cliff Whickman c...@sgi.com
Cc: Robin Holt robinmh...@gmail.com
---
 I hope the new name will be OK. Although it doesn't fully convey the
 difference from alloc_pages_node(), renaming the latter would be a much
 larger patch (some 60 callsites) and hopefully the new comments are enough to
 prevent further confusion.
 
 I'm CC'ing also maintainers of the callsites so they can verify that the
 callsites that don't pass __GFP_THISNODE are really not intended to restrict
 allocation to the given node. I went through them myself and each looked like
 it's better off if it can successfully allocate on a fallback node rather
 than fail. DavidR checked them also I think, but it's better if maintainers
 can verify that. I'm not completely sure about all the usages in sl*b due to
 multiple layers through which gfp flags are being passed.

 arch/ia64/hp/common/sba_iommu.c   |  2 +-
 arch/ia64/kernel/uncached.c   |  2 +-
 arch/ia64/sn/pci/pci_dma.c|  2 +-
 arch/powerpc/platforms/cell/ras.c |  2 +-
 arch/x86/kvm/vmx.c|  2 +-
 drivers/misc/sgi-xp/xpc_uv.c  |  2 +-
 include/linux/gfp.h   | 14 --
 kernel/profile.c  |  8 
 mm/filemap.c  |  2 +-
 mm/huge_memory.c  |  2 +-
 mm/hugetlb.c  |  4 ++--
 mm/memory-failure.c   |  2 +-
 mm/mempolicy.c|  4 ++--
 mm/migrate.c  |  4 ++--
 mm/page_alloc.c   |  2 --
 mm/slab.c |  2 +-
 mm/slob.c |  4 ++--
 mm/slub.c |  2 +-
 18 files changed, 35 insertions(+), 27 deletions(-)

diff --git a/arch/ia64/hp/common/sba_iommu.c b/arch/ia64/hp/common/sba_iommu.c
index 344387a..17762af 100644
--- a/arch/ia64/hp/common/sba_iommu.c
+++ b/arch/ia64/hp/common/sba_iommu.c
@@ -1146,7 +1146,7 @@ sba_alloc_coherent(struct device *dev, size_t size, 
dma_addr_t *dma_handle,
if (node == NUMA_NO_NODE)
node = numa_node_id();
 
-   page = alloc_pages_exact_node(node, flags, get_order(size));
+   page = alloc_pages_prefer_node(node, flags, get_order(size));
if (unlikely(!page))
return NULL;
 
diff --git a/arch/ia64/kernel/uncached.c b/arch/ia64/kernel/uncached.c
index 20e8a9b..1451961 100644
--- a/arch/ia64/kernel/uncached.c
+++ b/arch/ia64/kernel/uncached.c
@@ -97,7 +97,7 @@ static int uncached_add_chunk(struct uncached_pool *uc_pool, 
int nid)
 
/* attempt to allocate a granule's worth of cached memory pages */
 
-   page = alloc_pages_exact_node(nid,
+   page = alloc_pages_prefer_node(nid,
GFP_KERNEL | __GFP_ZERO | __GFP_THISNODE,
IA64_GRANULE_SHIFT-PAGE_SHIFT);
if (!page) {
diff --git a/arch/ia64/sn/pci/pci_dma.c b/arch/ia64/sn/pci/pci_dma.c
index d0853e8..dcab82f 100644
--- a/arch/ia64/sn/pci/pci_dma.c
+++ b/arch/ia64/sn/pci/pci_dma.c
@@ -92,7 +92,7 @@ static void *sn_dma_alloc_coherent(struct device *dev, size_t 
size,
 */
node = pcibus_to_node(pdev-bus);
if (likely(node =0)) {
-   struct page *p = alloc_pages_exact_node(node,
+   struct page *p = alloc_pages_prefer_node(node,
flags, get_order(size));
 
 

Re: [PATCH] mm: rename and document alloc_pages_exact_node

2015-07-21 Thread Christoph Lameter
On Tue, 21 Jul 2015, Vlastimil Babka wrote:

 The function alloc_pages_exact_node() was introduced in 6484eb3e2a81 (page
 allocator: do not check NUMA node ID when the caller knows the node is valid)
 as an optimized variant of alloc_pages_node(), that doesn't allow the node id
 to be -1. Unfortunately the name of the function can easily suggest that the
 allocation is restricted to the given node. In truth, the node is only
 preferred, unless __GFP_THISNODE is among the gfp flags.

Yup. I complained about this when this was introduced. Glad to see this
fixed. Initially this was alloc_pages_node() which just means that a node
is specified. The exact behavior of the allocation is determined by flags
such as GFP_THISNODE. I'd rather have that restored because otherwise we
get into weird code like the one below. And such an arrangement also
leaves the way open to add more flags in the future that may change the
allocation behavior.


   area-nid = nid;
   area-order = order;
 - area-pages = alloc_pages_exact_node(area-nid,
 + area-pages = alloc_pages_prefer_node(area-nid,
   GFP_KERNEL|__GFP_THISNODE,
   area-order);

This is not preferring a node but requiring alloction on that node.
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: KVM call for agenda for 2015-07-21

2015-07-21 Thread Juan Quintela
Juan Quintela quint...@redhat.com wrote:
 Hi

 Please, send any topic that you are interested in covering.

As there are no topics on the agenda, call is cancelled.



  Call details:

 By popular demand, a google calendar public entry with it

   
 https://www.google.com/calendar/embed?src=dG9iMXRqcXAzN3Y4ZXZwNzRoMHE4a3BqcXNAZ3JvdXAuY2FsZW5kYXIuZ29vZ2xlLmNvbQ

 (Let me know if you have any problems with the calendar entry.  I just
 gave up about getting right at the same time CEST, CET, EDT and DST).

 If you need phone number details,  contact me privately

 Thanks, Juan.
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html