[PATCH][kvm-unit-test] Keep gui off when running test cases

2013-06-26 Thread Jan Kiszka
From: Jan Kiszka jan.kis...@siemens.com

Signed-off-by: Jan Kiszka jan.kis...@siemens.com
---
 x86-run |2 +-
 1 files changed, 1 insertions(+), 1 deletions(-)

diff --git a/x86-run b/x86-run
index 14ff331..646c577 100755
--- a/x86-run
+++ b/x86-run
@@ -33,7 +33,7 @@ else
	pc_testdev="-device testdev,chardev=testlog -chardev file,id=testlog,path=msr.out"
 fi
 
-command="${qemu} -enable-kvm $pc_testdev -serial stdio $pci_testdev -kernel"
+command="${qemu} -enable-kvm $pc_testdev -display none -serial stdio $pci_testdev -kernel"
 echo ${command} "$@"
 ${command} "$@"
 ret=$?
-- 
1.7.3.4


Re: [PATCH] KVM: Fix RTC interrupt coalescing tracking

2013-06-26 Thread Gleb Natapov
On Wed, Jun 26, 2013 at 07:49:37AM +0200, Jan Kiszka wrote:
 On 2013-06-24 14:19, Gleb Natapov wrote:
  This reverts most of commit f1ed0450a5fac7067590317cbf027f566b6ccbca. After
  the commit kvm_apic_set_irq() no longer returns accurate information
  about interrupt injection status if injection is done into disabled
  APIC. RTC interrupt coalescing tracking relies on the information to be
  accurate and cannot recover if it is not.
  
  Signed-off-by: Gleb Natapov g...@redhat.com
  diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
  index 9d75193..9f4bea8 100644
  --- a/arch/x86/kvm/lapic.c
  +++ b/arch/x86/kvm/lapic.c
  @@ -405,17 +405,17 @@ int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu)
  return highest_irr;
   }
   
  -static void __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
  - int vector, int level, int trig_mode,
  - unsigned long *dest_map);
  +static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
  +int vector, int level, int trig_mode,
  +unsigned long *dest_map);
 
 I still think __apic_accept_irq should unconditionally inject, and the
 test for apic_enabled belongs into kvm_apic_set_irq. Why should
 __apic_accept_irq accept non-APIC_DM_FIXED messages if the APIC is off?
 See below for another reason to refactor this part of the interface.
 
10.4.7.2 Local APIC State After It Has Been Software Disabled

The local APIC will respond normally to INIT, NMI, SMI, and SIPI
messages.
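
To make the distinction concrete, here is a small standalone model of the
point under discussion (an illustration only, not the kernel code): per the
SDM quote, only fixed-mode interrupts may be gated on the APIC's
software-enable bit, and the caller needs the 0/1 result to keep the RTC
coalescing tracking accurate.

#include <stdio.h>

enum { DM_FIXED, DM_NMI };                 /* stand-ins for APIC_DM_* */

struct model_apic { int sw_enabled; unsigned irr; };

/* Returns 1 if the interrupt was accepted, 0 if it was discarded. */
static int accept_irq(struct model_apic *a, int delivery_mode, int vector)
{
	switch (delivery_mode) {
	case DM_FIXED:
		if (!a->sw_enabled)
			return 0;          /* discarded: caller must know */
		a->irr |= 1u << vector;    /* mark pending */
		return 1;
	default:                           /* NMI/INIT/SMI/SIPI: always taken */
		return 1;
	}
}

int main(void)
{
	struct model_apic a = { .sw_enabled = 0, .irr = 0 };
	printf("fixed -> %d, nmi -> %d\n",
	       accept_irq(&a, DM_FIXED, 3), accept_irq(&a, DM_NMI, 2));
	return 0;
}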

   
  -void kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq,
  - unsigned long *dest_map)
  +int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq,
  +   unsigned long *dest_map)
   {
  	struct kvm_lapic *apic = vcpu->arch.apic;
   
  -	__apic_accept_irq(apic, irq->delivery_mode, irq->vector,
  -			irq->level, irq->trig_mode, dest_map);
  +	return __apic_accept_irq(apic, irq->delivery_mode, irq->vector,
  +			irq->level, irq->trig_mode, dest_map);
   }
   
   static int pv_eoi_put_user(struct kvm_vcpu *vcpu, u8 val)
  @@ -608,8 +608,7 @@ bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src,
  	*r = -1;
   
  	if (irq->shorthand == APIC_DEST_SELF) {
  -		kvm_apic_set_irq(src->vcpu, irq, dest_map);
  -		*r = 1;
  +		*r = kvm_apic_set_irq(src->vcpu, irq, dest_map);
  return true;
  }
   
  @@ -654,8 +653,7 @@ bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src,
  		continue;
  		if (*r < 0)
  			*r = 0;
  -		kvm_apic_set_irq(dst[i]->vcpu, irq, dest_map);
  -		*r += 1;
  +		*r += kvm_apic_set_irq(dst[i]->vcpu, irq, dest_map);
  }
   
  ret = true;
  @@ -664,11 +662,15 @@ out:
  return ret;
   }
   
  -/* Set an IRQ pending in the lapic. */
  -static void __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
  - int vector, int level, int trig_mode,
  - unsigned long *dest_map)
  +/*
  + * Add a pending IRQ into lapic.
  + * Return 1 if successfully added and 0 if discarded.
  + */
  +static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
  +int vector, int level, int trig_mode,
  +unsigned long *dest_map)
   {
  +   int result = 0;
  	struct kvm_vcpu *vcpu = apic->vcpu;
   
  switch (delivery_mode) {
  @@ -682,10 +684,13 @@ static void __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
  		if (dest_map)
  			__set_bit(vcpu->vcpu_id, dest_map);
   
  -		if (kvm_x86_ops->deliver_posted_interrupt)
  +		if (kvm_x86_ops->deliver_posted_interrupt) {
  +			result = 1;
  			kvm_x86_ops->deliver_posted_interrupt(vcpu, vector);
  -		else {
  -			if (apic_test_and_set_irr(vector, apic)) {
  +		} else {
  +			result = !apic_test_and_set_irr(vector, apic);
 
 This part of the revert makes no sense. If deliver_posted_interrupt is
 on, we don't have this feedback anymore, thus we decided to remove it, no?
 
Agree, but I wanted to do a clean revert and fix on top.

 Jan
 
  +
  +   if (!result) {
  if (trig_mode)
  				apic_debug("level trig mode repeatedly for vector %d", vector);
  @@ -697,7 +702,7 @@ static void __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
  	}
   out:
  	trace_kvm_apic_accept_irq(vcpu->vcpu_id, delivery_mode,
  -				  trig_mode, vector, false);
  +				  trig_mode, vector, !result);
  break;
   
  case APIC_DM_REMRD:
  @@ -709,12 

Re: [PATCH] KVM: Fix RTC interrupt coalescing tracking

2013-06-26 Thread Jan Kiszka
On 2013-06-26 08:15, Gleb Natapov wrote:
 On Wed, Jun 26, 2013 at 07:49:37AM +0200, Jan Kiszka wrote:
 On 2013-06-24 14:19, Gleb Natapov wrote:
 This reverts most of commit f1ed0450a5fac7067590317cbf027f566b6ccbca. After
 the commit kvm_apic_set_irq() no longer returns accurate information
 about interrupt injection status if injection is done into disabled
 APIC. RTC interrupt coalescing tracking relies on the information to be
 accurate and cannot recover if it is not.

 Signed-off-by: Gleb Natapov g...@redhat.com
 diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
 index 9d75193..9f4bea8 100644
 --- a/arch/x86/kvm/lapic.c
 +++ b/arch/x86/kvm/lapic.c
 @@ -405,17 +405,17 @@ int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu)
 return highest_irr;
  }
  
 -static void __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
 - int vector, int level, int trig_mode,
 - unsigned long *dest_map);
 +static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
 +int vector, int level, int trig_mode,
 +unsigned long *dest_map);

 I still think __apic_accept_irq should unconditionally inject, and the
 test for apic_enabled belongs into kvm_apic_set_irq. Why should
 __apic_accept_irq accept non-APIC_DM_FIXED messages if the APIC is off?
 See below for another reason to refactor this part of the interface.

 10.4.7.2 Local APIC State After It Has Been Software Disabled
 
 The local APIC will respond normally to INIT, NMI, SMI, and SIPI
 messages.

OK, I see.

 
  
 -void kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq,
 - unsigned long *dest_map)
 +int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq,
 +   unsigned long *dest_map)
  {
 	struct kvm_lapic *apic = vcpu->arch.apic;
  
 -	__apic_accept_irq(apic, irq->delivery_mode, irq->vector,
 -			irq->level, irq->trig_mode, dest_map);
 +	return __apic_accept_irq(apic, irq->delivery_mode, irq->vector,
 +			irq->level, irq->trig_mode, dest_map);
  }
  
  static int pv_eoi_put_user(struct kvm_vcpu *vcpu, u8 val)
 @@ -608,8 +608,7 @@ bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src,
 	*r = -1;
  
 	if (irq->shorthand == APIC_DEST_SELF) {
 -		kvm_apic_set_irq(src->vcpu, irq, dest_map);
 -		*r = 1;
 +		*r = kvm_apic_set_irq(src->vcpu, irq, dest_map);
 return true;
 }
  
 @@ -654,8 +653,7 @@ bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src,
 		continue;
 		if (*r < 0)
 			*r = 0;
 -		kvm_apic_set_irq(dst[i]->vcpu, irq, dest_map);
 -		*r += 1;
 +		*r += kvm_apic_set_irq(dst[i]->vcpu, irq, dest_map);
 }
  
 ret = true;
 @@ -664,11 +662,15 @@ out:
 return ret;
  }
  
 -/* Set an IRQ pending in the lapic. */
 -static void __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
 - int vector, int level, int trig_mode,
 - unsigned long *dest_map)
 +/*
 + * Add a pending IRQ into lapic.
 + * Return 1 if successfully added and 0 if discarded.
 + */
 +static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
 +int vector, int level, int trig_mode,
 +unsigned long *dest_map)
  {
 +   int result = 0;
 	struct kvm_vcpu *vcpu = apic->vcpu;
  
 switch (delivery_mode) {
 @@ -682,10 +684,13 @@ static void __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
 		if (dest_map)
 			__set_bit(vcpu->vcpu_id, dest_map);
  
 -		if (kvm_x86_ops->deliver_posted_interrupt)
 +		if (kvm_x86_ops->deliver_posted_interrupt) {
 +			result = 1;
 			kvm_x86_ops->deliver_posted_interrupt(vcpu, vector);
 -		else {
 -			if (apic_test_and_set_irr(vector, apic)) {
 +		} else {
 +			result = !apic_test_and_set_irr(vector, apic);

 This part of the revert makes no sense. If deliver_posted_interrupt is
 on, we don't have this feedback anymore, thus we decided to remove it, no?

 Agree, but I wanted to do a clean revert and fix on top.

Fine with me, let's write a separate fix.

Jan






Re: [nVMX w/ Haswell] KVM unit-tests in L1 - eventinj test fails trying to send NMI

2013-06-26 Thread Jan Kiszka
On 2013-06-05 11:06, Kashyap Chamarthy wrote:
 Adding Jan, Jun, to see if they have any inputs here.

Thanks for the note, it's very helpful! This test actually fails on
older CPUs as well, and I can finally reproduce the issue that Jay also
reported. I'm not able to cure it by going back to 3b656cf764^; that just
alters the error report. Anyway, it's a start. Now I just need time to debug it...

Jan

 
 /kashyap
 
 On Tue, Jun 4, 2013 at 6:14 PM, Kashyap Chamarthy kashyap...@gmail.com 
 wrote:
 Heya,

 So, I invoked this in L1 with:
 ===
 [test@foo kvm-unit-tests]$ time qemu-system-x86_64 -enable-kvm -device
 pc-testdev -serial stdio -nographic -no-user-config -nodefaults
 -device
 isa-debug-exit,iobase=0xf4,iosize=0x4 -kernel ./x86/eventinj.flat |
 tee /var/tmp/eventinj-test.txt
 enabling apic
 paging enabled
 cr0 = 80010011
 cr3 = 7fff000
 cr4 = 20
 Try to divide by 0
 DE isr running divider is 0
 Result is 150
 DE exception: PASS
 Try int 3
 BP isr running
 After int 3
 BP exception: PASS
 Try send vec 33 to itself
 irq1 running
 After vec 33 to itself
 vec 33: PASS
 Try int $33
 irq1 running
 After int $33
 int $33: PASS
 Try send vec 32 and 33 to itself
 irq1 running
 irq0 running
 After vec 32 and 33 to itself
 vec 32/33: PASS
 Try send vec 32 and int $33
 irq1 running
 irq0 running
 After vec 32 and int $33
 vec 32/int $33: PASS
 Try send vec 33 and 62 and mask one with TPR
 irq1 running
 After 33/62 TPR test
 TPR: PASS
 irq0 running
 Try send NMI to itself
 After NMI to itself
 NMI: FAIL
 Try int 33 with shadowed stack
 irq1 running
 After int 33 with shadowed stack
 int 33 with shadowed stack: PASS

 summary: 9 tests, 1 failures

 real    0m0.647s
 user    0m0.164s
 sys     0m0.146s
 [test@foo kvm-unit-tests]$
 ===

 Any hints on further debugging this ?


 Other info:
 --

 - L1's qemu-kvm CLI
 ===
 # ps -ef | grep -i qemu
 qemu  5455     1 94 Jun02 ?        1-07:14:29
 /usr/bin/qemu-system-x86_64 -machine accel=kvm -name regular-guest -S
 -machine pc-i440fx-1.4,accel=kvm,usb=off -cpu Haswell,+vmx -m 10240
 -smp 4,sockets=4,cores=1,threads=1 -uuid
 4ed9ac0b-7f72-dfcf-68b3-e6fe2ac588b2 -nographic -no-user-config
 -nodefaults -chardev
 socket,id=charmonitor,path=/var/lib/libvirt/qemu/regular-guest.monitor,server,nowait
 -mon chardev=charmonitor,id=monitor,mode=control -rtc base=utc
 -no-shutdown -device piix3-usb-uhci,id=usb,bus=pci.0,addr=0x1.0x2
 -drive 
 file=/home/test/vmimages/regular-guest.qcow2,if=none,id=drive-virtio-disk0,format=qcow2,cache=none
 -device 
 virtio-blk-pci,scsi=off,bus=pci.0,addr=0x4,drive=drive-virtio-disk0,id=virtio-disk0,bootindex=1
 -netdev tap,fd=23,id=hostnet0,vhost=on,vhostfd=24 -device
 virtio-net-pci,netdev=hostnet0,id=net0,mac=52:54:00:80:c1:34,bus=pci.0,addr=0x3
 -chardev pty,id=charserial0 -device
 isa-serial,chardev=charserial0,id=serial0 -device usb-tablet,id=input0
 -device virtio-balloon-pci,id=balloon0,bus=pci.0,addr=0x5
 root 12255  5419  0 08:41 pts/2    00:00:00 grep --color=auto -i qemu
 ===

 - Setup details --
 https://github.com/kashyapc/nvmx-haswell/blob/master/SETUP-nVMX.rst

 /kashyap





Re: [PATCH 4/6 v5] KVM: PPC: exit to user space on ehpriv instruction

2013-06-26 Thread tiejun.chen

On 06/26/2013 01:42 PM, Bharat Bhushan wrote:

The ehpriv instruction is used for setting software breakpoints
by user space. This patch adds support to exit to user space
with run->debug holding the relevant information.

As this is the first point where we use run->debug, also define
the run->debug structure.

Signed-off-by: Bharat Bhushan bharat.bhus...@freescale.com
---
  arch/powerpc/include/asm/disassemble.h |4 
  arch/powerpc/include/uapi/asm/kvm.h|   21 +
  arch/powerpc/kvm/e500_emulate.c|   27 +++
  3 files changed, 48 insertions(+), 4 deletions(-)

diff --git a/arch/powerpc/include/asm/disassemble.h b/arch/powerpc/include/asm/disassemble.h
index 9b198d1..856f8de 100644
--- a/arch/powerpc/include/asm/disassemble.h
+++ b/arch/powerpc/include/asm/disassemble.h
@@ -77,4 +77,8 @@ static inline unsigned int get_d(u32 inst)
	return inst & 0xffff;
  }

+static inline unsigned int get_oc(u32 inst)
+{
+	return (inst >> 11) & 0x7fff;
+}
  #endif /* __ASM_PPC_DISASSEMBLE_H__ */
diff --git a/arch/powerpc/include/uapi/asm/kvm.h b/arch/powerpc/include/uapi/asm/kvm.h
index 0fb1a6e..ded0607 100644
--- a/arch/powerpc/include/uapi/asm/kvm.h
+++ b/arch/powerpc/include/uapi/asm/kvm.h
@@ -269,7 +269,24 @@ struct kvm_fpu {
__u64 fpr[32];
  };

+/*
+ * Defines for h/w breakpoint, watchpoint (read, write or both) and
+ * software breakpoint.
+ * These are used as type in KVM_SET_GUEST_DEBUG ioctl and status
+ * for KVM_DEBUG_EXIT.
+ */
+#define KVMPPC_DEBUG_NONE		0x0
+#define KVMPPC_DEBUG_BREAKPOINT		(1UL << 1)
+#define KVMPPC_DEBUG_WATCH_WRITE	(1UL << 2)
+#define KVMPPC_DEBUG_WATCH_READ		(1UL << 3)
  struct kvm_debug_exit_arch {
+   __u64 address;
+   /*
+* exiting to userspace because of h/w breakpoint, watchpoint
+* (read, write or both) and software breakpoint.
+*/
+   __u32 status;
+   __u32 reserved;
  };

  /* for KVM_SET_GUEST_DEBUG */
@@ -281,10 +298,6 @@ struct kvm_guest_debug_arch {
 * Type denotes h/w breakpoint, read watchpoint, write
 * watchpoint or watchpoint (both read and write).
 */
-#define KVMPPC_DEBUG_NONE		0x0
-#define KVMPPC_DEBUG_BREAKPOINT		(1UL << 1)
-#define KVMPPC_DEBUG_WATCH_WRITE	(1UL << 2)
-#define KVMPPC_DEBUG_WATCH_READ		(1UL << 3)
__u32 type;
__u32 reserved;
} bp[16];
diff --git a/arch/powerpc/kvm/e500_emulate.c b/arch/powerpc/kvm/e500_emulate.c
index b10a012..dab9d07 100644
--- a/arch/powerpc/kvm/e500_emulate.c
+++ b/arch/powerpc/kvm/e500_emulate.c
@@ -26,6 +26,8 @@
  #define XOP_TLBRE   946
  #define XOP_TLBWE   978
  #define XOP_TLBILX  18
+#define XOP_EHPRIV  270
+#define EHPRIV_OC_DEBUG 0


I think the OC = 0 case is a bit special since, IIRC, if the OC
operand is omitted it equals 0 by default. So I think we should start the OC
values from 1 or some other magic number.


And if possible, we'd better add a comment describing this, to make the OC
definitions readable. A sketch of what that could look like is below.
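
A sketch of that suggestion (the names and values here are illustrative,
not from the patch):

/*
 * ehpriv OC values used to exit to user space.  A bare "ehpriv"
 * encodes OC = 0, so treat 0 as reserved and start real exit
 * reasons at 1, as suggested above.
 */
#define EHPRIV_OC_RESERVED  0   /* plain "ehpriv": reserved              */
#define EHPRIV_OC_DEBUG     1   /* software breakpoint -> KVM_EXIT_DEBUG */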


Tiejun



  #ifdef CONFIG_KVM_E500MC
  static int dbell2prio(ulong param)
@@ -82,6 +84,26 @@ static int kvmppc_e500_emul_msgsnd(struct kvm_vcpu *vcpu, int rb)
  }
  #endif

+static int kvmppc_e500_emul_ehpriv(struct kvm_run *run, struct kvm_vcpu *vcpu,
+  unsigned int inst, int *advance)
+{
+   int emulated = EMULATE_DONE;
+
+   switch (get_oc(inst)) {
+   case EHPRIV_OC_DEBUG:
+		run->exit_reason = KVM_EXIT_DEBUG;
+		run->debug.arch.address = vcpu->arch.pc;
+		run->debug.arch.status = 0;
+		kvmppc_account_exit(vcpu, DEBUG_EXITS);
+   emulated = EMULATE_EXIT_USER;
+   *advance = 0;
+   break;
+   default:
+   emulated = EMULATE_FAIL;
+   }
+   return emulated;
+}
+
  int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu,
 unsigned int inst, int *advance)
  {
@@ -130,6 +152,11 @@ int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu,
emulated = kvmppc_e500_emul_tlbivax(vcpu, ea);
break;

+   case XOP_EHPRIV:
+   emulated = kvmppc_e500_emul_ehpriv(run, vcpu, inst,
+  advance);
+   break;
+
default:
emulated = EMULATE_FAIL;
}





Re: [PATCH][kvm-unit-test] Keep gui off when running test cases

2013-06-26 Thread Paolo Bonzini
Il 26/06/2013 08:06, Jan Kiszka ha scritto:
 From: Jan Kiszka jan.kis...@siemens.com
 
 Signed-off-by: Jan Kiszka jan.kis...@siemens.com
 ---
  x86-run |2 +-
  1 files changed, 1 insertions(+), 1 deletions(-)
 
 diff --git a/x86-run b/x86-run
 index 14ff331..646c577 100755
 --- a/x86-run
 +++ b/x86-run
 @@ -33,7 +33,7 @@ else
   pc_testdev="-device testdev,chardev=testlog -chardev file,id=testlog,path=msr.out"
  fi
  
 -command="${qemu} -enable-kvm $pc_testdev -serial stdio $pci_testdev -kernel"
 +command="${qemu} -enable-kvm $pc_testdev -display none -serial stdio $pci_testdev -kernel"
  echo ${command} "$@"
  ${command} "$@"
  ret=$?
 

Reviewed-by: Paolo Bonzini pbonz...@redhat.com



Re: [PATCH v2 29/45] kvm/vmx: Use get/put_online_cpus_atomic() to prevent CPU offline

2013-06-26 Thread Paolo Bonzini
Il 25/06/2013 22:30, Srivatsa S. Bhat ha scritto:
 -	cpu = get_cpu();
 +	cpu = get_online_cpus_atomic();
 	vmx_vcpu_load(&vmx->vcpu, cpu);
 	vmx->vcpu.cpu = cpu;
 	err = vmx_vcpu_setup(vmx);
 	vmx_vcpu_put(&vmx->vcpu);
 -	put_cpu();
 +	put_online_cpus_atomic();

The new API has a weird name.  Why are you adding new functions instead
of just modifying get/put_cpu?

Paolo


Re: [nVMX w/ Haswell] KVM unit-tests in L1 - eventinj test fails trying to send NMI

2013-06-26 Thread Kashyap Chamarthy
 Thanks for the note, it's very helpful! This test actually fails on
 older CPUs as well, and I can finally reproduce the issue that Jay also
 reported. I'm not able to cure it by going back to 3b656cf764^,

Ok, you tried w/o this commit..

commit 3b656cf764cbc43d3efb9bf5f45c618d4cf0989f
Author: Jan Kiszka jan.kis...@siemens.com
Date:   Sun Apr 14 12:12:45 2013 +0200

KVM: nVMX: Fix injection of PENDING_INTERRUPT and NMI_WINDOW exits to L1

Check if the interrupt or NMI window exit is for L1 by testing if it has
the corresponding controls enabled. This is required when we allow
direct injection from L0 to L2


  just
 alter the error report. Anyway, a start. Now I just need time to debug it...

Great, would you prefer a bug to track this? Or will that be ignored? :)

Don't hesitate to let me know if you need any further testing help or
want me to try something specific.

Thanks.


 Jan


 /kashyap

 On Tue, Jun 4, 2013 at 6:14 PM, Kashyap Chamarthy kashyap...@gmail.com 
 wrote:
 Heya,

 So, I invoked this in L1 with:
 ===
 [test@foo kvm-unit-tests]$ time qemu-system-x86_64 -enable-kvm -device
 pc-testdev -serial stdio -nographic -no-user-config -nodefaults
 -device
 isa-debug-exit,iobase=0xf4,iosize=0x4 -kernel ./x86/eventinj.flat |
 tee /var/tmp/eventinj-test.txt
 enabling apic
 paging enabled
 cr0 = 80010011
 cr3 = 7fff000
 cr4 = 20
 Try to divide by 0
 DE isr running divider is 0
 Result is 150
 DE exception: PASS
 Try int 3
 BP isr running
 After int 3
 BP exception: PASS
 Try send vec 33 to itself
 irq1 running
 After vec 33 to itself
 vec 33: PASS
 Try int $33
 irq1 running
 After int $33
 int $33: PASS
 Try send vec 32 and 33 to itself
 irq1 running
 irq0 running
 After vec 32 and 33 to itself
 vec 32/33: PASS
 Try send vec 32 and int $33
 irq1 running
 irq0 running
 After vec 32 and int $33
 vec 32/int $33: PASS
 Try send vec 33 and 62 and mask one with TPR
 irq1 running
 After 33/62 TPR test
 TPR: PASS
 irq0 running
 Try send NMI to itself
 After NMI to itself
 NMI: FAIL
 Try int 33 with shadowed stack
 irq1 running
 After int 33 with shadowed stack
 int 33 with shadowed stack: PASS

 summary: 9 tests, 1 failures

 real    0m0.647s
 user    0m0.164s
 sys     0m0.146s
 [test@foo kvm-unit-tests]$
 ===

 Any hints on further debugging this ?


 Other info:
 --

 - L1's qemu-kvm CLI
 ===
 # ps -ef | grep -i qemu
 qemu  5455     1 94 Jun02 ?        1-07:14:29
 /usr/bin/qemu-system-x86_64 -machine accel=kvm -name regular-guest -S
 -machine pc-i440fx-1.4,accel=kvm,usb=off -cpu Haswell,+vmx -m 10240
 -smp 4,sockets=4,cores=1,threads=1 -uuid
 4ed9ac0b-7f72-dfcf-68b3-e6fe2ac588b2 -nographic -no-user-config
 -nodefaults -chardev
 socket,id=charmonitor,path=/var/lib/libvirt/qemu/regular-guest.monitor,server,nowait
 -mon chardev=charmonitor,id=monitor,mode=control -rtc base=utc
 -no-shutdown -device piix3-usb-uhci,id=usb,bus=pci.0,addr=0x1.0x2
 -drive 
 file=/home/test/vmimages/regular-guest.qcow2,if=none,id=drive-virtio-disk0,format=qcow2,cache=none
 -device 
 virtio-blk-pci,scsi=off,bus=pci.0,addr=0x4,drive=drive-virtio-disk0,id=virtio-disk0,bootindex=1
 -netdev tap,fd=23,id=hostnet0,vhost=on,vhostfd=24 -device
 virtio-net-pci,netdev=hostnet0,id=net0,mac=52:54:00:80:c1:34,bus=pci.0,addr=0x3
 -chardev pty,id=charserial0 -device
 isa-serial,chardev=charserial0,id=serial0 -device usb-tablet,id=input0
 -device virtio-balloon-pci,id=balloon0,bus=pci.0,addr=0x5
 root 12255  5419  0 08:41 pts/2    00:00:00 grep --color=auto -i qemu
 ===

 - Setup details --
 https://github.com/kashyapc/nvmx-haswell/blob/master/SETUP-nVMX.rst

 /kashyap



Google Summer of Code 2013 has started

2013-06-26 Thread Stefan Hajnoczi
It is a pleasure to welcome the following GSoC 2013 students to the
QEMU, KVM, and libvirt communities:

Libvirt Wireshark Dissector - Yuto KAWAMURA (kawamuray)
http://qemu-project.org/Features/LibvirtWiresharkDissector

Libvirt Introduce API to query IP addresses for given domain - Nehal
J. Wani (nehaljwani)
http://www.google-melange.com/gsoc/project/google/gsoc2013/nehaljwani/51001

Libvirt More Intelligent virsh auto-completion - Tomas Meszaros
http://www.google-melange.com/gsoc/project/google/gsoc2013/examon/13001

QEMU Integrated Copy-Paste - Ozan Çağlayan and Pallav Agrawal (pallav)
http://qemu-project.org/Features/IntegratedCopyPaste

QEMU Continuation Passing C - Charlie Shepherd (cs648)
http://qemu-project.org/Features/Continuation-Passing_C

QEMU Kconfig - Ákos Kovács
http://qemu-project.org/Features/Kconfig

QEMU USB Media Transfer Protocol emulation - a|mond
http://www.google-melange.com/gsoc/project/google/gsoc2013/almond/1001

KVM Nested Virtualization Testsuite - Arthur Chunqi Li (xelatex)
http://www.google-melange.com/gsoc/project/google/gsoc2013/xelatex/19001

Coding started on Monday, 17th of June and ends Monday, 23rd of September.

Feel free to follow these projects - feature pages are being created
with git repo and blog links.

Stefan


Re: [PATCH v2 29/45] kvm/vmx: Use get/put_online_cpus_atomic() to prevent CPU offline

2013-06-26 Thread Srivatsa S. Bhat
On 06/26/2013 01:16 PM, Paolo Bonzini wrote:
 Il 25/06/2013 22:30, Srivatsa S. Bhat ha scritto:
 -	cpu = get_cpu();
 +	cpu = get_online_cpus_atomic();
 	vmx_vcpu_load(&vmx->vcpu, cpu);
 	vmx->vcpu.cpu = cpu;
 	err = vmx_vcpu_setup(vmx);
 	vmx_vcpu_put(&vmx->vcpu);
 -	put_cpu();
 +	put_online_cpus_atomic();
 
 The new API has a weird name.  Why are you adding new functions instead
 of just modifying get/put_cpu?
 

Because the purpose of those two functions are distinctly different
from each other.

get/put_cpu() is used to disable preemption on the local CPU. (Which
also disables offlining the local CPU during that critical section).

What this patchset deals with is synchronizing with offline of *any*
CPU. Typically, we use get_online_cpus()/put_online_cpus() for that
purpose. But they can't be used in atomic context, because they take
mutex locks and hence can sleep.

So the code that executes in atomic context and which wants to prevent
*any* CPU from going offline, used to disable preemption around its
critical section. Disabling preemption prevents stop_machine(), and
CPU offline (of *any* CPU) was done via stop_machine(). So disabling
preemption disabled any CPU from going offline, as a *side-effect*.

And this patchset prepares the ground for getting rid of stop_machine()
in the CPU offline path. Which means, disabling preemption only prevents
the *local* CPU from going offline. So if code in atomic context wants
to prevent any CPU from going offline, we need a new set of APIs, like
get/put_online_cpus(), but which can be invoked from atomic context.
That's why I named it as get/put_online_cpus_atomic().

One of the key points here is that we want to preserve get/put_cpu()
as it is, since its purpose is different - disable preemption and
offline of the local CPU. There is no reason to change that API; it's
useful as it is.
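
As a sketch of the two usage patterns (written against the APIs proposed
by this series; illustrative, not code from the patchset):

#include <linux/cpu.h>
#include <linux/smp.h>

/* Pattern A: only the local CPU matters. */
static void local_cpu_work(void)
{
	int cpu = get_cpu();    /* disables preemption, pins us to 'cpu' */
	/* ... work that must not migrate off 'cpu' ... */
	put_cpu();
}

/* Pattern B: atomic context that must see a stable online mask. */
static void stable_online_mask_work(void)
{
	int cpu;

	get_online_cpus_atomic();       /* no CPU can go offline here */
	for_each_online_cpu(cpu) {
		/* e.g. send an IPI; 'cpu' stays online until the put */
	}
	put_online_cpus_atomic();
}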

Regards,
Srivatsa S. Bhat



Re: [PATCH-next] kvm: don't try to take mmu_lock while holding the main raw kvm_lock

2013-06-26 Thread Paolo Bonzini
Il 26/06/2013 00:34, Paul Gortmaker ha scritto:
 In commit e935b8372cf8 (KVM: Convert kvm_lock to raw_spinlock),
 the kvm_lock was made a raw lock.  However, the kvm mmu_shrink()
 function tries to grab the (non-raw) mmu_lock within the scope of
 the raw locked kvm_lock being held.  This leads to the following:
 
 BUG: sleeping function called from invalid context at kernel/rtmutex.c:659
 in_atomic(): 1, irqs_disabled(): 0, pid: 55, name: kswapd0
 Preemption disabled at:[a0376eac] mmu_shrink+0x5c/0x1b0 [kvm]
 
 Pid: 55, comm: kswapd0 Not tainted 3.4.34_preempt-rt
 Call Trace:
  [8106f2ad] __might_sleep+0xfd/0x160
  [817d8d64] rt_spin_lock+0x24/0x50
  [a0376f3c] mmu_shrink+0xec/0x1b0 [kvm]
  [8111455d] shrink_slab+0x17d/0x3a0
  [81151f00] ? mem_cgroup_iter+0x130/0x260
  [8111824a] balance_pgdat+0x54a/0x730
  [8111fe47] ? set_pgdat_percpu_threshold+0xa7/0xd0
  [811185bf] kswapd+0x18f/0x490
  [81070961] ? get_parent_ip+0x11/0x50
  [81061970] ? __init_waitqueue_head+0x50/0x50
  [81118430] ? balance_pgdat+0x730/0x730
  [81060d2b] kthread+0xdb/0xe0
  [8106e122] ? finish_task_switch+0x52/0x100
  [817e1e94] kernel_thread_helper+0x4/0x10
  [81060c50] ? __init_kthread_worker+0x
 
 Since we only use the lock for protecting the vm_list, once we've
 found the instance we want, we can shuffle it to the end of the
 list and then drop the kvm_lock before taking the mmu_lock.  We
 can do this because after the mmu operations are completed, we
 break -- i.e. we don't continue list processing, so it doesn't
 matter if the list changed around us.
 
 Signed-off-by: Paul Gortmaker paul.gortma...@windriver.com

Since the shrinker code is asynchronous with respect to KVM, I think
that the kvm_lock here is also protecting against kvm_destroy_vm running
at the same time.

So the patch is almost okay; all that is missing is a
kvm_get_kvm/kvm_put_kvm pair, where the reference is added just before
releasing the kvm_lock.
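
A sketch of the suggested flow (untested; has_freeable_pages() is a
hypothetical stand-in for the real check):

static int mmu_shrink_sketch(void)
{
	struct kvm *kvm;

	raw_spin_lock(&kvm_lock);
	list_for_each_entry(kvm, &vm_list, vm_list) {
		if (!has_freeable_pages(kvm))   /* hypothetical predicate */
			continue;

		kvm_get_kvm(kvm);       /* reference taken under kvm_lock */
		list_move_tail(&kvm->vm_list, &vm_list);
		raw_spin_unlock(&kvm_lock);

		spin_lock(&kvm->mmu_lock);  /* sleeping lock on -rt: safe now */
		/* ... zap some shadow pages ... */
		spin_unlock(&kvm->mmu_lock);

		kvm_put_kvm(kvm);       /* may trigger kvm_destroy_vm() */
		return 1;               /* we break after one VM anyway */
	}
	raw_spin_unlock(&kvm_lock);
	return 0;
}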

Paolo


Re: [PATCH v2 28/45] KVM: Use get/put_online_cpus_atomic() to prevent CPU offline

2013-06-26 Thread Paolo Bonzini
Il 25/06/2013 22:30, Srivatsa S. Bhat ha scritto:
 Once stop_machine() is gone from the CPU offline path, we won't be able
 to depend on disabling preemption to prevent CPUs from going offline
 from under us.
 
 Use the get/put_online_cpus_atomic() APIs to prevent CPUs from going
 offline, while invoking from atomic context.
 
 Cc: Gleb Natapov g...@redhat.com
 Cc: Paolo Bonzini pbonz...@redhat.com
 Cc: kvm@vger.kernel.org
 Signed-off-by: Srivatsa S. Bhat srivatsa.b...@linux.vnet.ibm.com
 ---
 
  virt/kvm/kvm_main.c |8 
  1 file changed, 4 insertions(+), 4 deletions(-)
 
 diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
 index 302681c..5bbfa30 100644
 --- a/virt/kvm/kvm_main.c
 +++ b/virt/kvm/kvm_main.c
 @@ -174,7 +174,7 @@ static bool make_all_cpus_request(struct kvm *kvm, unsigned int req)
  
 	zalloc_cpumask_var(&cpus, GFP_ATOMIC);
  
 -	me = get_cpu();
 +	me = get_online_cpus_atomic();
 	kvm_for_each_vcpu(i, vcpu, kvm) {
 		kvm_make_request(req, vcpu);
 		cpu = vcpu->cpu;
 @@ -192,7 +192,7 @@ static bool make_all_cpus_request(struct kvm *kvm, unsigned int req)
 		smp_call_function_many(cpus, ack_flush, NULL, 1);
 	else
 		called = false;
 -	put_cpu();
 +	put_online_cpus_atomic();
   free_cpumask_var(cpus);
   return called;
  }
 @@ -1707,11 +1707,11 @@ void kvm_vcpu_kick(struct kvm_vcpu *vcpu)
 		++vcpu->stat.halt_wakeup;
 	}
  
 -	me = get_cpu();
 +	me = get_online_cpus_atomic();
 	if (cpu != me && (unsigned)cpu < nr_cpu_ids && cpu_online(cpu))
 		if (kvm_arch_vcpu_should_kick(vcpu))
 			smp_send_reschedule(cpu);
 -	put_cpu();
 +	put_online_cpus_atomic();
  }
  EXPORT_SYMBOL_GPL(kvm_vcpu_kick);
  #endif /* !CONFIG_S390 */
 
 

Acked-by: Paolo Bonzini pbonz...@redhat.com



Re: [PATCH v2 29/45] kvm/vmx: Use get/put_online_cpus_atomic() to prevent CPU offline

2013-06-26 Thread Paolo Bonzini
Il 26/06/2013 10:06, Srivatsa S. Bhat ha scritto:
 On 06/26/2013 01:16 PM, Paolo Bonzini wrote:
 Il 25/06/2013 22:30, Srivatsa S. Bhat ha scritto:
 -	cpu = get_cpu();
 +	cpu = get_online_cpus_atomic();
 	vmx_vcpu_load(&vmx->vcpu, cpu);
 	vmx->vcpu.cpu = cpu;
 	err = vmx_vcpu_setup(vmx);
 	vmx_vcpu_put(&vmx->vcpu);
 -	put_cpu();
 +	put_online_cpus_atomic();

 The new API has a weird name.  Why are you adding new functions instead
 of just modifying get/put_cpu?

 
 Because the purpose of those two functions are distinctly different
 from each other.
 
 get/put_cpu() is used to disable preemption on the local CPU. (Which
 also disables offlining the local CPU during that critical section).

Ok, then I understood correctly... and I acked the other KVM patch.

However, keeping the code on the local CPU is exactly the point of this
particular use of get_cpu()/put_cpu().  Why does it need to synchronize
with offlining of other CPUs?

Paolo

 What this patchset deals with is synchronizing with offline of *any*
 CPU. Typically, we use get_online_cpus()/put_online_cpus() for that
 purpose. But they can't be used in atomic context, because they take
 mutex locks and hence can sleep.
 
 So the code that executes in atomic context and which wants to prevent
 *any* CPU from going offline, used to disable preemption around its
 critical section. Disabling preemption prevents stop_machine(), and
 CPU offline (of *any* CPU) was done via stop_machine(). So disabling
 preemption disabled any CPU from going offline, as a *side-effect*.
 
 And this patchset prepares the ground for getting rid of stop_machine()
 in the CPU offline path. Which means, disabling preemption only prevents
 the *local* CPU from going offline. So if code in atomic context wants
 to prevent any CPU from going offline, we need a new set of APIs, like
 get/put_online_cpus(), but which can be invoked from atomic context.
 That's why I named it as get/put_online_cpus_atomic().
 
 One of the key points here is that we want to preserve get/put_cpu()
 as it is, since its purpose is different - disable preemption and
 offline of the local CPU. There is no reason to change that API, its
 useful as it is.




Re: [PATCH RFC V10 0/18] Paravirtualized ticket spinlocks

2013-06-26 Thread Raghavendra K T

On 06/24/2013 06:47 PM, Andrew Jones wrote:

On Mon, Jun 24, 2013 at 06:10:14PM +0530, Raghavendra K T wrote:


Results:
===
base = 3.10-rc2 kernel
patched = base + this series

The test was on a 32-core machine (model: Intel(R) Xeon(R) CPU X7560), HT
disabled, with a 32-vcpu KVM guest with 8GB RAM.


Have you ever tried to get results with HT enabled?



+-----------------------------------------------------------------+
              ebizzy (records/sec) higher is better
+-----------------------------------------------------------------+
         base        stdev      patched      stdev   %improvement
+-----------------------------------------------------------------+
1x   5574.9000    237.4997      5618.       94.0366        0.77311
2x   2741.5000    561.3090      3332.      102.4738       21.53930
3x   2146.2500    216.7718      2302.       76.3870        7.27237
4x   1663.        141.9235      1753.7500    83.5220       5.45701
+-----------------------------------------------------------------+


This looks good. Are your ebizzy results consistent run to run
though?


+-----------------------------------------------------------------+
               dbench (Throughput) higher is better
+-----------------------------------------------------------------+
         base        stdev      patched      stdev   %improvement
+-----------------------------------------------------------------+
1x  14111.5600    754.4525    14645.9900   114.3087       3.78718
2x   2481.6270     71.2665     2667.1280    73.8193       7.47498
3x   1510.2483     31.8634     1503.8792    36.0777      -0.42173
4x   1029.4875     16.9166     1039.7069    43.8840       0.99267
+-----------------------------------------------------------------+


Hmm, I wonder what 2.5x looks like. Also, the 3% improvement with
no overcommit is interesting. What's happening there? It makes
me wonder what < 1x looks like.



Hi Andrew,

I tried a 2.5x case of sorts, where I used 3 guests with 27 vcpus each on a
32-core (HT disabled) machine, and here is the output: almost no gain there.

             throughput avg       stdev
base:        1768.7458 MB/sec     54.044221
patched:     1772.5617 MB/sec     41.227689
gain %:      0.226

I am yet to try HT enabled cases that would give 0.5x to 2x performance
results.



Re: [PATCH RFC V9 0/19] Paravirtualized ticket spinlocks

2013-06-26 Thread Raghavendra K T

On 06/25/2013 08:20 PM, Andrew Theurer wrote:

On Sun, 2013-06-02 at 00:51 +0530, Raghavendra K T wrote:

This series replaces the existing paravirtualized spinlock mechanism
with a paravirtualized ticketlock mechanism. The series provides
implementation for both Xen and KVM.

Changes in V9:
- Changed spin_threshold to 32k to avoid excess halt exits that are
causing undercommit degradation (after PLE handler improvement).
- Added  kvm_irq_delivery_to_apic (suggested by Gleb)
- Optimized halt exit path to use PLE handler

V8 of PVspinlock was posted last year. After Avi's suggestions to look
at PLE handler's improvements, various optimizations in PLE handling
have been tried.


Sorry for not posting this sooner.  I have tested the v9 pv-ticketlock
patches in 1x and 2x over-commit with 10-vcpu and 20-vcpu VMs.  I have
tested these patches with and without PLE, as PLE is still not scalable
with large VMs.



Hi Andrew,

Thanks for testing.


System: x3850X5, 40 cores, 80 threads


1x over-commit with 10-vCPU VMs (8 VMs) all running dbench:
--
                               Total
Configuration           Throughput(MB/s)   Notes

3.10-default-ple_on     22945              5% CPU in host kernel, 2% spin_lock in guests
3.10-default-ple_off    23184              5% CPU in host kernel, 2% spin_lock in guests
3.10-pvticket-ple_on    22895              5% CPU in host kernel, 2% spin_lock in guests
3.10-pvticket-ple_off   23051              5% CPU in host kernel, 2% spin_lock in guests
[all 1x results look good here]


Yes. The 1x results look too close




2x over-commit with 10-vCPU VMs (16 VMs) all running dbench:
---
                               Total
Configuration           Throughput   Notes

3.10-default-ple_on      6287        55% CPU in host kernel, 17% spin_lock in guests
3.10-default-ple_off     1849        2% CPU in host kernel, 95% spin_lock in guests
3.10-pvticket-ple_on     6691        50% CPU in host kernel, 15% spin_lock in guests
3.10-pvticket-ple_off   16464        8% CPU in host kernel, 33% spin_lock in guests


I see a 6.426% improvement with ple_on and a 161.87% improvement with
ple_off. I think this is a very good sign for the patches.


[PLE hinders pv-ticket improvements, but even with PLE off,
 we are still off from ideal throughput (somewhere > 2x)]



Okay, the ideal throughput you are referring to is getting at least
around 80% of the 1x throughput for over-commit. Yes, we are still far
away from there.



1x over-commit with 20-vCPU VMs (4 VMs) all running dbench:
--
                               Total
Configuration           Throughput   Notes

3.10-default-ple_on     22736        6% CPU in host kernel, 3% spin_lock in guests
3.10-default-ple_off    23377        5% CPU in host kernel, 3% spin_lock in guests
3.10-pvticket-ple_on    22471        6% CPU in host kernel, 3% spin_lock in guests
3.10-pvticket-ple_off   23445        5% CPU in host kernel, 3% spin_lock in guests
[1x looking fine here]



I see ple_off is a little better here.



2x over-commit with 20-vCPU VMs (8 VMs) all running dbench:
--
                               Total
Configuration           Throughput   Notes

3.10-default-ple_on      1965        70% CPU in host kernel, 34% spin_lock in guests
3.10-default-ple_off      226        2% CPU in host kernel, 94% spin_lock in guests
3.10-pvticket-ple_on     1942        70% CPU in host kernel, 35% spin_lock in guests
3.10-pvticket-ple_off    8003        11% CPU in host kernel, 70% spin_lock in guests
[quite bad all around, but pv-tickets with PLE off the best so far.
  Still quite a bit off from ideal throughput]


This is again a remarkable improvement (307%).
This motivates me to add a patch to disable PLE when pvspinlock is on.
Probably we can add a hypercall that disables PLE in the KVM init path,
but the only problem I see is what if the guests are mixed

 (i.e. one guest has pvspinlock support but another does not, while the
host supports pv).

/me thinks
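
One hypothetical shape of such a scheme (every name below is invented for
illustration; nothing like this exists in the series): the guest announces
pv-spinlock support once via a hypercall, the host records it per VM, and
PLE is simply not programmed for that VM's vcpus - which would also handle
the mixed-guest case, since the decision becomes per VM:

	/* In the hypercall handler; KVM_HC_PV_LOCK_INIT is an invented number: */
	case KVM_HC_PV_LOCK_INIT:
		vcpu->kvm->arch.pv_spinlocks_active = true;  /* invented field */
		ret = 0;
		break;

	/* Where vmx sets up the PLE window, skip it for such guests: */
	if (ple_gap && !vmx->vcpu.kvm->arch.pv_spinlocks_active) {
		vmcs_write32(PLE_GAP, ple_gap);
		vmcs_write32(PLE_WINDOW, ple_window);
	}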



In summary, I would state that the pv-ticket is an overall win, but the
current PLE handler tends to get in the way on these larger guests.

-Andrew




Re: [PATCH v2 29/45] kvm/vmx: Use get/put_online_cpus_atomic() to prevent CPU offline

2013-06-26 Thread Srivatsa S. Bhat
On 06/26/2013 01:53 PM, Paolo Bonzini wrote:
 Il 26/06/2013 10:06, Srivatsa S. Bhat ha scritto:
 On 06/26/2013 01:16 PM, Paolo Bonzini wrote:
 Il 25/06/2013 22:30, Srivatsa S. Bhat ha scritto:
 -	cpu = get_cpu();
 +	cpu = get_online_cpus_atomic();
 	vmx_vcpu_load(&vmx->vcpu, cpu);
 	vmx->vcpu.cpu = cpu;
 	err = vmx_vcpu_setup(vmx);
 	vmx_vcpu_put(&vmx->vcpu);
 -	put_cpu();
 +	put_online_cpus_atomic();

 The new API has a weird name.  Why are you adding new functions instead
 of just modifying get/put_cpu?


 Because the purpose of those two functions are distinctly different
 from each other.

 get/put_cpu() is used to disable preemption on the local CPU. (Which
 also disables offlining the local CPU during that critical section).
 
 Ok, then I understood correctly... and I acked the other KVM patch.


Thank you!
 
 However, keeping the code on the local CPU is exactly the point of this
 particular use of get_cpu()/put_cpu().  Why does it need to synchronize
 with offlining of other CPUs?
 

Now that I looked at it again, I think you are right, get/put_cpu() is
good enough here.

But let me explain why I initially thought we needed full synchronization
with CPU offline. In short, I wanted to synchronize the calls to
__loaded_vmcs_clear(). We have the scenario shown below:

CPU offline:
  CPU_DYING:
    hardware_disable();
    -> vmclear_local_loaded_vmcss();
       -> __loaded_vmcs_clear(v);



And vmx_vcpu_load() (among others) can do:
   vmx_vcpu_load();
   -> loaded_vmcs_clear();
      -> __loaded_vmcs_clear();


So I wanted to avoid this race-condition and hence wrapped the code with
get/put_online_cpus_atomic().

But the point I missed earlier is that loaded_vmcs_clear() calls
__loaded_vmcs_clear() using smp_call_function_single(), which itself
synchronizes properly with CPU hotplug. So there is no need to add full
hotplug synchronization in the vmx code, as you noted above.
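
For reference, loaded_vmcs_clear() looks roughly like this (quoted from
memory, so treat it as a sketch):

static void loaded_vmcs_clear(struct loaded_vmcs *loaded_vmcs)
{
	int cpu = loaded_vmcs->cpu;

	if (cpu != -1)
		smp_call_function_single(cpu,
			 __loaded_vmcs_clear, loaded_vmcs, 1);
}

smp_call_function_single() itself takes care of the hotplug
synchronization, which is why the extra get/put_online_cpus_atomic()
pair is unnecessary here.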

So, please ignore this patch, and sorry for the noise!

Regards,
Srivatsa S. Bhat



Re: [PATCH][kvm-unit-test] Keep gui off when running test cases

2013-06-26 Thread Gleb Natapov
On Wed, Jun 26, 2013 at 09:08:12AM +0200, Paolo Bonzini wrote:
 Il 26/06/2013 08:06, Jan Kiszka ha scritto:
  From: Jan Kiszka jan.kis...@siemens.com
  
  Signed-off-by: Jan Kiszka jan.kis...@siemens.com
Applied, thanks.

  ---
   x86-run |2 +-
   1 files changed, 1 insertions(+), 1 deletions(-)
  
  diff --git a/x86-run b/x86-run
  index 14ff331..646c577 100755
  --- a/x86-run
  +++ b/x86-run
  @@ -33,7 +33,7 @@ else
  	pc_testdev="-device testdev,chardev=testlog -chardev file,id=testlog,path=msr.out"
   fi
   
  -command="${qemu} -enable-kvm $pc_testdev -serial stdio $pci_testdev -kernel"
  +command="${qemu} -enable-kvm $pc_testdev -display none -serial stdio $pci_testdev -kernel"
   echo ${command} "$@"
   ${command} "$@"
   ret=$?
  
 
 Reviewed-by: Paolo Bonzini pbonz...@redhat.com

--
Gleb.


Re: [PATCH v2 29/45] kvm/vmx: Use get/put_online_cpus_atomic() to prevent CPU offline

2013-06-26 Thread Paolo Bonzini
Il 26/06/2013 10:41, Srivatsa S. Bhat ha scritto:
 On 06/26/2013 01:53 PM, Paolo Bonzini wrote:
 Il 26/06/2013 10:06, Srivatsa S. Bhat ha scritto:
 On 06/26/2013 01:16 PM, Paolo Bonzini wrote:
 Il 25/06/2013 22:30, Srivatsa S. Bhat ha scritto:
 -	cpu = get_cpu();
 +	cpu = get_online_cpus_atomic();
 	vmx_vcpu_load(&vmx->vcpu, cpu);
 	vmx->vcpu.cpu = cpu;
 	err = vmx_vcpu_setup(vmx);
 	vmx_vcpu_put(&vmx->vcpu);
 -	put_cpu();
 +	put_online_cpus_atomic();

 The new API has a weird name.  Why are you adding new functions instead
 of just modifying get/put_cpu?


 Because the purpose of those two functions are distinctly different
 from each other.

 get/put_cpu() is used to disable preemption on the local CPU. (Which
 also disables offlining the local CPU during that critical section).

 Ok, then I understood correctly... and I acked the other KVM patch.

 
 Thank you!
  
 However, keeping the code on the local CPU is exactly the point of this
 particular use of get_cpu()/put_cpu().  Why does it need to synchronize
 with offlining of other CPUs?
 
 Now that I looked at it again, I think you are right, get/put_cpu() is
 good enough here.
 
 But let me explain why I initially thought we needed full synchronization
 with CPU offline. In short, I wanted to synchronize the calls to
 __loaded_vmcs_clear(). We have the scenario shown below:
 
 CPU offline:
   CPU_DYING:
     hardware_disable();
     -> vmclear_local_loaded_vmcss();
        -> __loaded_vmcs_clear(v);
 
 
 
 And vmx_vcpu_load() (among others) can do:
    vmx_vcpu_load();
    -> loaded_vmcs_clear();
       -> __loaded_vmcs_clear();
 
 
 So I wanted to avoid this race-condition and hence wrapped the code with
 get/put_online_cpus_atomic().
 
 But the point I missed earlier is that loaded_vmcs_clear() calls
 __loaded_vmcs_clear() using smp_call_function_single(), which itself
 synchronizes properly with CPU hotplug. So there is no need to add full
 hotplug synchronization in the vmx code, as you noted above.

Makes sense, and I see now that it's patch 9 in this series.

In general, I would rather add an extra get_online_cpus_atomic pair
where it is actually needed (i.e. closer to where cpu_online is actually
used), and leave get_cpu/put_cpu as is in the caller... which is exactly
what happens in this case, since where it is actually needed is in
smp_call_function_single().

 So, please ignore this patch, and sorry for the noise!

No problem, thanks for the heads-up.

Paolo



Re: [PATCH 4/6 v5] KVM: PPC: exit to user space on ehpriv instruction

2013-06-26 Thread tiejun.chen

On 06/26/2013 04:44 PM, Bhushan Bharat-R65777 wrote:




-Original Message-
From: tiejun.chen [mailto:tiejun.c...@windriver.com]
Sent: Wednesday, June 26, 2013 12:25 PM
To: Bhushan Bharat-R65777
Cc: kvm-...@vger.kernel.org; kvm@vger.kernel.org; ag...@suse.de; Wood Scott-
B07421; b...@kernel.crashing.org; linuxppc-...@lists.ozlabs.org; linux-
ker...@vger.kernel.org; mi...@neuling.org; Bhushan Bharat-R65777
Subject: Re: [PATCH 4/6 v5] KVM: PPC: exit to user space on ehpriv instruction

On 06/26/2013 01:42 PM, Bharat Bhushan wrote:

The ehpriv instruction is used for setting software breakpoints
by user space. This patch adds support to exit to user space
with run->debug holding the relevant information.

As this is the first point where we use run->debug, also define
the run->debug structure.

Signed-off-by: Bharat Bhushan bharat.bhus...@freescale.com
---
   arch/powerpc/include/asm/disassemble.h |4 
   arch/powerpc/include/uapi/asm/kvm.h|   21 +
   arch/powerpc/kvm/e500_emulate.c|   27 +++
   3 files changed, 48 insertions(+), 4 deletions(-)

diff --git a/arch/powerpc/include/asm/disassemble.h b/arch/powerpc/include/asm/disassemble.h

index 9b198d1..856f8de 100644
--- a/arch/powerpc/include/asm/disassemble.h
+++ b/arch/powerpc/include/asm/disassemble.h
@@ -77,4 +77,8 @@ static inline unsigned int get_d(u32 inst)
	return inst & 0xffff;
   }

+static inline unsigned int get_oc(u32 inst)
+{
+	return (inst >> 11) & 0x7fff;
+}
   #endif /* __ASM_PPC_DISASSEMBLE_H__ */
diff --git a/arch/powerpc/include/uapi/asm/kvm.h b/arch/powerpc/include/uapi/asm/kvm.h

index 0fb1a6e..ded0607 100644
--- a/arch/powerpc/include/uapi/asm/kvm.h
+++ b/arch/powerpc/include/uapi/asm/kvm.h
@@ -269,7 +269,24 @@ struct kvm_fpu {
__u64 fpr[32];
   };

+/*
+ * Defines for h/w breakpoint, watchpoint (read, write or both) and
+ * software breakpoint.
+ * These are used as type in KVM_SET_GUEST_DEBUG ioctl and status
+ * for KVM_DEBUG_EXIT.
+ */
+#define KVMPPC_DEBUG_NONE		0x0
+#define KVMPPC_DEBUG_BREAKPOINT		(1UL << 1)
+#define KVMPPC_DEBUG_WATCH_WRITE	(1UL << 2)
+#define KVMPPC_DEBUG_WATCH_READ		(1UL << 3)
   struct kvm_debug_exit_arch {
+   __u64 address;
+   /*
+* exiting to userspace because of h/w breakpoint, watchpoint
+* (read, write or both) and software breakpoint.
+*/
+   __u32 status;
+   __u32 reserved;
   };

   /* for KVM_SET_GUEST_DEBUG */
@@ -281,10 +298,6 @@ struct kvm_guest_debug_arch {
 * Type denotes h/w breakpoint, read watchpoint, write
 * watchpoint or watchpoint (both read and write).
 */
-#define KVMPPC_DEBUG_NONE		0x0
-#define KVMPPC_DEBUG_BREAKPOINT		(1UL << 1)
-#define KVMPPC_DEBUG_WATCH_WRITE	(1UL << 2)
-#define KVMPPC_DEBUG_WATCH_READ		(1UL << 3)
__u32 type;
__u32 reserved;
} bp[16];
diff --git a/arch/powerpc/kvm/e500_emulate.c b/arch/powerpc/kvm/e500_emulate.c
index b10a012..dab9d07 100644
--- a/arch/powerpc/kvm/e500_emulate.c
+++ b/arch/powerpc/kvm/e500_emulate.c
@@ -26,6 +26,8 @@
   #define XOP_TLBRE   946
   #define XOP_TLBWE   978
   #define XOP_TLBILX  18
+#define XOP_EHPRIV  270
+#define EHPRIV_OC_DEBUG 0


I think the OC = 0 case is a bit special since, IIRC, if the OC
operand is omitted it equals 0 by default. So I think we should start this OC
value from 1 or some other magic number.


The ehpriv instruction is defined to be used as:
    ehpriv OC   // where OC can be 0, 1, ... n
and in its extended form it can be used as:
    ehpriv      // with no OC, which assumes OC = 0
So OC = 0 is not special; a bare ehpriv is the same as ehpriv 0.


Yes, this is just what I mean.



I cannot think of any special reason to reserve ehpriv and ehpriv 0.


So I still prefer to reserve the bare 'ehpriv' (without an OC operand) as a
simple way to test or develop something for KVM quickly, because it is really
convenient to trap into the hypervisor with a single 'ehpriv' instruction.


But I have no further objection if you guys are fine with this ;-)

Tiejun



Thanks
-Bharat



And if possible, we'd better add some comments to describe this to make the OC
definition readable.

Tiejun



   #ifdef CONFIG_KVM_E500MC
   static int dbell2prio(ulong param)
@@ -82,6 +84,26 @@ static int kvmppc_e500_emul_msgsnd(struct kvm_vcpu *vcpu, int rb)

   }
   #endif

+static int kvmppc_e500_emul_ehpriv(struct kvm_run *run, struct kvm_vcpu *vcpu,

+  unsigned int inst, int *advance)
+{
+   int emulated = EMULATE_DONE;
+
+   switch (get_oc(inst)) {
+   case EHPRIV_OC_DEBUG:
+		run->exit_reason = KVM_EXIT_DEBUG;
+		run->debug.arch.address = vcpu->arch.pc;
+		run->debug.arch.status = 0;
+   kvmppc_account_exit(vcpu, DEBUG_EXITS);
+   emulated = 

Re: Bug#707257: linux-image-3.8-1-686-pae: KVM crashes with entry failed, hardware error 0x80000021

2013-06-26 Thread Gleb Natapov
On Mon, Jun 24, 2013 at 10:42:57PM +0200, Stefan Pietsch wrote:
 On 24.06.2013 14:30, Gleb Natapov wrote:
  On Mon, Jun 24, 2013 at 01:59:34PM +0200, Stefan Pietsch wrote:
  As soon as I remove kvmvapic.bin the virtual machine boots with
  qemu-kvm 1.5.0. I just verified this with Linux kernel 3.10.0-rc5.
  emulate_invalid_guest_state=0 or emulate_invalid_guest_state=1 make
  no difference.
 
  Please send your patches.
  Here it is, run with it and kvmvapic.bin present. See what is printed in
  dmesg after the failure.
  
  
  diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
  index f4a5b3f..65488a4 100644
  --- a/arch/x86/kvm/vmx.c
  +++ b/arch/x86/kvm/vmx.c
  @@ -3385,6 +3385,7 @@ static void vmx_get_segment(struct kvm_vcpu *vcpu,
   {
  struct vcpu_vmx *vmx = to_vmx(vcpu);
  u32 ar;
  +   unsigned long rip;
   
  	if (vmx->rmode.vm86_active && seg != VCPU_SREG_LDTR) {
  		*var = vmx->rmode.segs[seg];
  @@ -3408,6 +3409,9 @@ static void vmx_get_segment(struct kvm_vcpu *vcpu,
  	var->db = (ar >> 14) & 1;
  	var->g = (ar >> 15) & 1;
  	var->unusable = (ar >> 16) & 1;
  +	rip = kvm_rip_read(vcpu);
  +	if ((rip == 0xc101611c || rip == 0xc101611a) && seg == VCPU_SREG_FS)
  +		printk("base=%p limit=%p selector=%x ar=%x\n", var->base, var->limit, var->selector, ar);
   }
   
   static u64 vmx_get_segment_base(struct kvm_vcpu *vcpu, int seg)
 
 
 Booting kernel Linux 3.10-rc5 with your patch applied produces these
 messages in dmesg when starting a virtual machine:
 
 emulate_invalid_guest_state=0
 [  118.732151] base= limit=  (null) selector=ffff ar=0
 [  118.732341] base= limit=  (null) selector=ffff ar=0
 
I've butchered printk format, but it gives me the idea of what is going
on anyway. Can you try the patch below with
emulate_invalid_guest_state=0|1?


diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index f4a5b3f..eb062ce 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -3395,19 +3395,20 @@ static void vmx_get_segment(struct kvm_vcpu *vcpu,
 	var->selector = vmx_read_guest_seg_selector(vmx, seg);
 	return;
 	}
+
 	var->base = vmx_read_guest_seg_base(vmx, seg);
 	var->limit = vmx_read_guest_seg_limit(vmx, seg);
 	var->selector = vmx_read_guest_seg_selector(vmx, seg);
 	ar = vmx_read_guest_seg_ar(vmx, seg);
+	var->unusable = (ar >> 16) & 1;
 	var->type = ar & 15;
 	var->s = (ar >> 4) & 1;
 	var->dpl = (ar >> 5) & 3;
-	var->present = (ar >> 7) & 1;
+	var->present = !var->unusable;
 	var->avl = (ar >> 12) & 1;
 	var->l = (ar >> 13) & 1;
 	var->db = (ar >> 14) & 1;
 	var->g = (ar >> 15) & 1;
-	var->unusable = (ar >> 16) & 1;
 }
 
 static u64 vmx_get_segment_base(struct kvm_vcpu *vcpu, int seg)
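
To make the decode concrete, here is a small standalone model of the
access-rights unpacking in the patch above (a userspace illustration, not
kernel code); it shows how deriving 'present' from 'unusable' keeps an
unusable segment from ever being reported as present:

#include <stdio.h>

struct seg { unsigned type, s, dpl, present, avl, l, db, g, unusable; };

static void decode_ar(unsigned ar, struct seg *v)
{
	v->unusable = (ar >> 16) & 1;
	v->type     = ar & 15;
	v->s        = (ar >> 4) & 1;
	v->dpl      = (ar >> 5) & 3;
	/* The patch derives 'present' from 'unusable' instead of bit 7. */
	v->present  = !v->unusable;
	v->avl      = (ar >> 12) & 1;
	v->l        = (ar >> 13) & 1;
	v->db       = (ar >> 14) & 1;
	v->g        = (ar >> 15) & 1;
}

int main(void)
{
	struct seg s;
	decode_ar(1u << 16, &s);        /* an "unusable" AR value */
	printf("unusable=%u present=%u\n", s.unusable, s.present);
	return 0;
}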

--
Gleb.


Re: [PATCH RFC V9 0/19] Paravirtualized ticket spinlocks

2013-06-26 Thread Andrew Jones
On Wed, Jun 26, 2013 at 02:15:26PM +0530, Raghavendra K T wrote:
 On 06/25/2013 08:20 PM, Andrew Theurer wrote:
 On Sun, 2013-06-02 at 00:51 +0530, Raghavendra K T wrote:
 This series replaces the existing paravirtualized spinlock mechanism
 with a paravirtualized ticketlock mechanism. The series provides
 implementation for both Xen and KVM.
 
 Changes in V9:
 - Changed spin_threshold to 32k to avoid excess halt exits that are
 causing undercommit degradation (after PLE handler improvement).
 - Added  kvm_irq_delivery_to_apic (suggested by Gleb)
 - Optimized halt exit path to use PLE handler
 
 V8 of PVspinlock was posted last year. After Avi's suggestions to look
 at PLE handler's improvements, various optimizations in PLE handling
 have been tried.
 
 Sorry for not posting this sooner.  I have tested the v9 pv-ticketlock
 patches in 1x and 2x over-commit with 10-vcpu and 20-vcpu VMs.  I have
 tested these patches with and without PLE, as PLE is still not scalable
 with large VMs.
 
 
 Hi Andrew,
 
 Thanks for testing.
 
 System: x3850X5, 40 cores, 80 threads
 
 
 1x over-commit with 10-vCPU VMs (8 VMs) all running dbench:
 --
                                Total
 Configuration          Throughput(MB/s)   Notes
 
 3.10-default-ple_on    22945              5% CPU in host kernel, 2% spin_lock in guests
 3.10-default-ple_off   23184              5% CPU in host kernel, 2% spin_lock in guests
 3.10-pvticket-ple_on   22895              5% CPU in host kernel, 2% spin_lock in guests
 3.10-pvticket-ple_off  23051              5% CPU in host kernel, 2% spin_lock in guests
 [all 1x results look good here]
 
 Yes. The 1x results look too close
 
 
 
 2x over-commit with 10-vCPU VMs (16 VMs) all running dbench:
 ---
                                Total
 Configuration          Throughput   Notes
 
 3.10-default-ple_on     6287        55% CPU in host kernel, 17% spin_lock in guests
 3.10-default-ple_off    1849        2% CPU in host kernel, 95% spin_lock in guests
 3.10-pvticket-ple_on    6691        50% CPU in host kernel, 15% spin_lock in guests
 3.10-pvticket-ple_off  16464        8% CPU in host kernel, 33% spin_lock in guests
 
 I see a 6.426% improvement with ple_on and a 161.87% improvement with
 ple_off. I think this is a very good sign for the patches.
 
 [PLE hinders pv-ticket improvements, but even with PLE off,
  we are still off from ideal throughput (somewhere > 2x)]
 
 
 Okay, the ideal throughput you are referring to is getting at least
 around 80% of the 1x throughput for over-commit. Yes, we are still far
 away from there.
 
 
 1x over-commit with 20-vCPU VMs (4 VMs) all running dbench:
 --
                                Total
 Configuration          Throughput   Notes
 
 3.10-default-ple_on    22736        6% CPU in host kernel, 3% spin_lock in guests
 3.10-default-ple_off   23377        5% CPU in host kernel, 3% spin_lock in guests
 3.10-pvticket-ple_on   22471        6% CPU in host kernel, 3% spin_lock in guests
 3.10-pvticket-ple_off  23445        5% CPU in host kernel, 3% spin_lock in guests
 [1x looking fine here]
 
 
 I see ple_off is a little better here.
 
 
 2x over-commit with 20-vCPU VMs (8 VMs) all running dbench:
 --
                                Total
 Configuration          Throughput   Notes
 
 3.10-default-ple_on     1965        70% CPU in host kernel, 34% spin_lock in guests
 3.10-default-ple_off     226        2% CPU in host kernel, 94% spin_lock in guests
 3.10-pvticket-ple_on    1942        70% CPU in host kernel, 35% spin_lock in guests
 3.10-pvticket-ple_off   8003        11% CPU in host kernel, 70% spin_lock in guests
 [quite bad all around, but pv-tickets with PLE off the best so far.
   Still quite a bit off from ideal throughput]
 
 This is again a remarkable improvement (307%).
 This motivates me to add a patch to disable PLE when pvspinlock is on.
 Probably we can add a hypercall that disables PLE in the KVM init path,
 but the only problem I see is what if the guests are mixed
 
  (i.e. one guest has pvspinlock support but another does not, while the
 host supports pv).

How about reintroducing the idea of creating per-kvm ple_gap,ple_window
state? We were headed 

Re: [PATCH RFC V9 0/19] Paravirtualized ticket spinlocks

2013-06-26 Thread Gleb Natapov
On Wed, Jun 26, 2013 at 01:37:45PM +0200, Andrew Jones wrote:
 On Wed, Jun 26, 2013 at 02:15:26PM +0530, Raghavendra K T wrote:
  On 06/25/2013 08:20 PM, Andrew Theurer wrote:
  On Sun, 2013-06-02 at 00:51 +0530, Raghavendra K T wrote:
  This series replaces the existing paravirtualized spinlock mechanism
  with a paravirtualized ticketlock mechanism. The series provides
  implementation for both Xen and KVM.
  
  Changes in V9:
  - Changed spin_threshold to 32k to avoid excess halt exits that are
  causing undercommit degradation (after PLE handler improvement).
  - Added  kvm_irq_delivery_to_apic (suggested by Gleb)
  - Optimized halt exit path to use PLE handler
  
  V8 of PVspinlock was posted last year. After Avi's suggestions to look
  at PLE handler's improvements, various optimizations in PLE handling
  have been tried.
  
  Sorry for not posting this sooner.  I have tested the v9 pv-ticketlock
  patches in 1x and 2x over-commit with 10-vcpu and 20-vcpu VMs.  I have
  tested these patches with and without PLE, as PLE is still not scalable
  with large VMs.
  
  
  [...]

Re: [PATCH RFC V9 0/19] Paravirtualized ticket spinlocks

2013-06-26 Thread Raghavendra K T

On 06/26/2013 06:22 PM, Gleb Natapov wrote:

On Wed, Jun 26, 2013 at 01:37:45PM +0200, Andrew Jones wrote:

On Wed, Jun 26, 2013 at 02:15:26PM +0530, Raghavendra K T wrote:

On 06/25/2013 08:20 PM, Andrew Theurer wrote:

On Sun, 2013-06-02 at 00:51 +0530, Raghavendra K T wrote:

[...]

i/o threads

2013-06-26 Thread folkert
Hi,

I noticed that on my server running 3 VMs there are 10-20 threads
doing I/O. As the VMs are running on HDDs and not SSDs, I think that is
counterproductive: won't these threads make the HDDs seek back and forth
constantly?


Folkert van Heusden

-- 
Always wondered what the latency of your webserver is? Or how much more
latency you get when you go through a proxy server/tor? The numbers
tell the tale and with HTTPing you know them!
 http://www.vanheusden.com/httping/
---
Phone: +31-6-41278122, PGP-key: 1F28D8AE, www.vanheusden.com


Re: [PATCH 3/4] kvm, emulator: Rename VendorSpecific flag

2013-06-26 Thread Borislav Petkov
On Tue, Jun 25, 2013 at 02:10:20PM +0300, Gleb Natapov wrote:
  -   if (!(ctxt->d & VendorSpecific) && ctxt->only_vendor_specific_insn)
  +   if (!(ctxt->d & EmulateOnUD) && ctxt->only_vendor_specific_insn)
 Let's rename only_vendor_specific_insn to something like ->ud too.

So this thing is set only when either svm or vmx encounter an #UD and go
and emulate the instruction.

I guess this is for the case where we actually do want to inject the #UD
into the guest and not emulate the instruction.

Btw, it is only checked in x86_decode_insn, so we could just as well hand
down the emulation_type from the caller x86_emulate_instruction and kill
->only_vendor_specific_insn completely, like so:

	if (!(ctxt->d & EmulateOnUD) && (emul_type & EMULTYPE_TRAP_UD))
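
A self-contained toy of the suggested shape -- the stand-in types and
flag values below are invented for illustration and are not the kernel's:

#include <stdio.h>

/* Stand-ins for the decode flag and the caller's emulation type. */
#define EmulateOnUD      (1u << 0)  /* opcode may be emulated on #UD  */
#define EMULTYPE_TRAP_UD (1u << 1)  /* caller is emulating due to #UD */
#define EMULATION_OK     0
#define EMULATION_FAILED 1

struct x86_emulate_ctxt {
    unsigned int d;                 /* per-opcode decode flags */
};

/* The emulation type is handed down by the caller instead of being
 * cached in ctxt->only_vendor_specific_insn. */
static int x86_decode_insn_toy(struct x86_emulate_ctxt *ctxt, int emul_type)
{
    if (!(ctxt->d & EmulateOnUD) && (emul_type & EMULTYPE_TRAP_UD))
        return EMULATION_FAILED;    /* let the guest take the #UD */
    return EMULATION_OK;
}

int main(void)
{
    struct x86_emulate_ctxt ctxt = { .d = 0 };
    printf("%d\n", x86_decode_insn_toy(&ctxt, EMULTYPE_TRAP_UD)); /* 1: fail */
    ctxt.d = EmulateOnUD;
    printf("%d\n", x86_decode_insn_toy(&ctxt, EMULTYPE_TRAP_UD)); /* 0: emulate */
    return 0;
}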

-- 
Regards/Gruss,
Boris.

Sent from a fat crate under my desk. Formatting is fine.


Re: [PATCH RFC V9 0/19] Paravirtualized ticket spinlocks

2013-06-26 Thread Konrad Rzeszutek Wilk
On Wed, Jun 26, 2013 at 03:52:40PM +0300, Gleb Natapov wrote:
 On Wed, Jun 26, 2013 at 01:37:45PM +0200, Andrew Jones wrote:
  On Wed, Jun 26, 2013 at 02:15:26PM +0530, Raghavendra K T wrote:
   On 06/25/2013 08:20 PM, Andrew Theurer wrote:
   On Sun, 2013-06-02 at 00:51 +0530, Raghavendra K T wrote:
   [...]

Re: [PATCH 3/4] kvm, emulator: Rename VendorSpecific flag

2013-06-26 Thread Gleb Natapov
On Wed, Jun 26, 2013 at 04:11:59PM +0200, Borislav Petkov wrote:
 On Tue, Jun 25, 2013 at 02:10:20PM +0300, Gleb Natapov wrote:
   - if (!(ctxt->d & VendorSpecific) && ctxt->only_vendor_specific_insn)
   + if (!(ctxt->d & EmulateOnUD) && ctxt->only_vendor_specific_insn)
  Let's rename only_vendor_specific_insn to something like ->ud too.
 
 So this thing is set only when either svm or vmx encounter an #UD and go
 and emulate the instruction.
 
 I guess this is for the case where we actually do want to inject the #UD
 into the guest and not emulate the instruction.
 
 Btw, it is only checked in x86_decode_insn, so we could just as well hand
 down the emulation_type from the caller x86_emulate_instruction and kill
 ->only_vendor_specific_insn completely, like so:
 
   if (!(ctxt->d & EmulateOnUD) && (emul_type & EMULTYPE_TRAP_UD))
 
EMULTYPE_ values are external to emulator.c and control how x86.c
invokes the emulator. I prefer not to change the kvm<->emulator interface
just to get rid of one ctxt field.

--
Gleb.


Re: [PATCH RFC V9 0/19] Paravirtualized ticket spinlocks

2013-06-26 Thread Raghavendra K T

On 06/26/2013 08:09 PM, Chegu Vinod wrote:

On 6/26/2013 6:40 AM, Raghavendra K T wrote:

On 06/26/2013 06:22 PM, Gleb Natapov wrote:

On Wed, Jun 26, 2013 at 01:37:45PM +0200, Andrew Jones wrote:

On Wed, Jun 26, 2013 at 02:15:26PM +0530, Raghavendra K T wrote:

On 06/25/2013 08:20 PM, Andrew Theurer wrote:

On Sun, 2013-06-02 at 00:51 +0530, Raghavendra K T wrote:

[...]


How about reintroducing the idea to create per-kvm ple_gap,ple_window
state? We were headed down that road when considering a dynamic window
at one point. Then you can just set a single guest's ple_gap to zero,
which would lead to PLE being disabled for that guest. We could also
revisit the dynamic window then.
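
To illustrate, a self-contained toy of what per-VM PLE state could look
like. The struct and plumbing are invented here; 128/4096 merely echo
the usual ple_gap/ple_window module parameter defaults:

#include <stdio.h>

/* Hypothetical per-VM PLE state instead of the global module params. */
struct kvm_toy {
    unsigned int ple_gap;      /* 0 means PLE exiting disabled for this VM */
    unsigned int ple_window;
};

/* What per-vCPU setup might do with such state. */
static void setup_ple(const struct kvm_toy *vm)
{
    if (!vm->ple_gap) {
        printf("PLE exiting disabled for this VM\n");
        return;
    }
    printf("PLE enabled: gap=%u window=%u\n", vm->ple_gap, vm->ple_window);
}

int main(void)
{
    struct kvm_toy pv_guest  = { .ple_gap = 0 };                  /* pv-ticketlocks */
    struct kvm_toy old_guest = { .ple_gap = 128, .ple_window = 4096 };
    setup_ple(&pv_guest);
    setup_ple(&old_guest);
    return 0;
}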


Can be done, but let's understand why ple on is such a big problem. Is it
possible that ple gap 

Re: [PATCH RFC] pci: ACS quirk for AMD southbridge

2013-06-26 Thread Andreas Hartmann
Bjorn Helgaas wrote:
 [fix Joerg's email address]
 
 On Tue, Jun 25, 2013 at 10:15 PM, Bjorn Helgaas bhelg...@google.com wrote:
 On Wed, Jul 11, 2012 at 11:18 PM, Alex Williamson
 alex.william...@redhat.com wrote:
 We've confirmed that peer-to-peer between these devices is
 not possible.  We can therefore claim that they support a
 subset of ACS.

 Signed-off-by: Alex Williamson alex.william...@redhat.com
 Cc: Joerg Roedel joerg.roe...@amd.com
 ---

 Two things about this patch make me a little nervous.  The
 first is that I'd really like to have a pci_is_pcie() test
 in pci_mf_no_p2p_acs_enabled(), but these devices don't
 have a PCIe capability.  That means that if there was a
 topology where these devices sat on a legacy PCI bus,
 we would incorrectly return that we're ACS safe here.  That leads
 to my second problem: pciids seems to suggest that some of
 these functions have been around for a while.  Is it just
 this package that's peer-to-peer safe, or is it safe to
 assume that any previous assembly of these functions is
 also p2p safe?  Maybe we need to factor in device revs if
 that uniquely identifies this package?

 Looks like another useful device to potentially quirk
 would be:

 00:15.0 PCI bridge: Advanced Micro Devices [AMD] nee ATI SB700/SB800/SB900 PCI to PCI bridge (PCIE port 0)
 00:15.1 PCI bridge: Advanced Micro Devices [AMD] nee ATI SB700/SB800/SB900 PCI to PCI bridge (PCIE port 1)
 00:15.2 PCI bridge: Advanced Micro Devices [AMD] nee ATI SB900 PCI to PCI bridge (PCIE port 2)
 00:15.3 PCI bridge: Advanced Micro Devices [AMD] nee ATI SB900 PCI to PCI bridge (PCIE port 3)

 00:15.0 0604: 1002:43a0
 00:15.1 0604: 1002:43a1
 00:15.2 0604: 1002:43a2
 00:15.3 0604: 1002:43a3

  drivers/pci/quirks.c |   29 +
  1 file changed, 29 insertions(+)

 diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c
 index 4ebc865..2c84961 100644
 --- a/drivers/pci/quirks.c
 +++ b/drivers/pci/quirks.c
  @@ -3271,11 +3271,40 @@ struct pci_dev *pci_get_dma_source(struct pci_dev *dev)
 return pci_dev_get(dev);
  }

 +/*
 + * Multifunction devices that do not support peer-to-peer between
 + * functions can claim to support a subset of ACS.  Such devices
 + * effectively enable request redirect (RR) and completion redirect (CR)
 + * since all transactions are redirected to the upstream root complex.
 + */
 +static int pci_mf_no_p2p_acs_enabled(struct pci_dev *dev, u16 acs_flags)
 +{
  +   if (!dev->multifunction)
 +   return -ENODEV;
 +
 +   /* Filter out flags not applicable to multifunction */
  +   acs_flags &= (PCI_ACS_RR | PCI_ACS_CR | PCI_ACS_EC | PCI_ACS_DT);
 +
  +   return acs_flags & ~(PCI_ACS_RR | PCI_ACS_CR) ? 0 : 1;
 +}
 +
  static const struct pci_dev_acs_enabled {
 u16 vendor;
 u16 device;
 int (*acs_enabled)(struct pci_dev *dev, u16 acs_flags);
  } pci_dev_acs_enabled[] = {
 +   /*
 +* AMD/ATI multifunction southbridge devices.  AMD has confirmed
 +* that peer-to-peer between these devices is not possible, so
 +* they do support a subset of ACS even though the capability is
 +* not exposed in config space.
 +*/
 +   { PCI_VENDOR_ID_ATI, 0x4385, pci_mf_no_p2p_acs_enabled },
 +   { PCI_VENDOR_ID_ATI, 0x439c, pci_mf_no_p2p_acs_enabled },
 +   { PCI_VENDOR_ID_ATI, 0x4383, pci_mf_no_p2p_acs_enabled },
 +   { PCI_VENDOR_ID_ATI, 0x439d, pci_mf_no_p2p_acs_enabled },
 +   { PCI_VENDOR_ID_ATI, 0x4384, pci_mf_no_p2p_acs_enabled },
 +   { PCI_VENDOR_ID_ATI, 0x4399, pci_mf_no_p2p_acs_enabled },
 { 0 }
  };
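
For context, a self-contained toy of how such a quirk table is meant to
be consulted. The walker below is only a sketch and its name is
invented; the in-kernel helper that does this may differ:

#include <stdio.h>

/* Toy stand-ins so the lookup logic can be exercised standalone. */
typedef unsigned short u16;
struct pci_dev { u16 vendor, device; };

struct pci_dev_acs_enabled_toy {
    u16 vendor;
    u16 device;
    int (*acs_enabled)(struct pci_dev *dev, u16 acs_flags);
};

static int always_on(struct pci_dev *dev, u16 acs_flags) { return 1; }

static const struct pci_dev_acs_enabled_toy table[] = {
    { 0x1002, 0x4385, always_on },   /* one of the quirked SBx00 IDs */
    { 0 }
};

/* Walk the table; on a vendor/device match, defer to the per-device
 * callback. No match means no quirk -- fall back to the ACS
 * capability in config space, if the device has one. */
static int dev_specific_acs(struct pci_dev *dev, u16 acs_flags)
{
    const struct pci_dev_acs_enabled_toy *i;

    for (i = table; i->acs_enabled; i++)
        if (i->vendor == dev->vendor && i->device == dev->device)
            return i->acs_enabled(dev, acs_flags);
    return -1;
}

int main(void)
{
    struct pci_dev smbus = { 0x1002, 0x4385 };
    printf("quirk says: %d\n", dev_specific_acs(&smbus, 0x0c));  /* 1 */
    return 0;
}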



 I was looking for something else and found this old email.  This patch
 hasn't been applied and I haven't seen any discussion about it.  Is it
 still of interest?  It seems relevant to the current ACS discussion
 [1].

It is absolutely relevant. I always have to patch my kernel to get
passing my PCI device through to a VM working. Meanwhile I'm doing it for
kernel 3.9. I would be very glad to get these patches into the kernel, as
they don't do anything bad!

My multifunction devices are the devices defined in the patch. My
current PCI device passed through is an Intel ethernet device:

-[0000:00]-+-00.0  Advanced Micro Devices [AMD] nee ATI RD890 PCI to PCI bridge (external gfx0 port B)
           +-00.2  Advanced Micro Devices [AMD] nee ATI RD990 I/O Memory Management Unit (IOMMU)
           +-02.0-[01]--+-00.0  Advanced Micro Devices [AMD] nee ATI Turks [Radeon HD 6570]
           |            \-00.1  Advanced Micro Devices [AMD] nee ATI Turks HDMI Audio [Radeon HD 6000 Series]
           +-04.0-[02]----00.0  Etron Technology, Inc. EJ168 USB 3.0 Host Controller
           +-05.0-[03]----00.0  Atheros Communications Inc. AR9300 Wireless LAN adaptor
           +-09.0-[04]----00.0  Realtek Semiconductor Co., Ltd. RTL8111/8168B PCI Express Gigabit Ethernet controller
           +-0a.0-[05]----00.0  Etron Technology, Inc. EJ168 USB 3.0 Host Controller
 

Re: [PATCH RFC V9 0/19] Paravirtualized ticket spinlocks

2013-06-26 Thread Andrew Theurer
On Wed, 2013-06-26 at 15:52 +0300, Gleb Natapov wrote:
 On Wed, Jun 26, 2013 at 01:37:45PM +0200, Andrew Jones wrote:
  On Wed, Jun 26, 2013 at 02:15:26PM +0530, Raghavendra K T wrote:
   On 06/25/2013 08:20 PM, Andrew Theurer wrote:
   On Sun, 2013-06-02 at 00:51 +0530, Raghavendra K T wrote:
   This series replaces the existing paravirtualized spinlock mechanism
   with a paravirtualized ticketlock mechanism. The series provides
   implementation for both Xen and KVM.
   
   Changes in V9:
   - Changed spin_threshold to 32k to avoid excess halt exits that are
   causing undercommit degradation (after PLE handler improvement).
   - Added  kvm_irq_delivery_to_apic (suggested by Gleb)
   - Optimized halt exit path to use PLE handler
   
   V8 of PVspinlock was posted last year. After Avi's suggestions to look
   at PLE handler's improvements, various optimizations in PLE handling
   have been tried.
   
   Sorry for not posting this sooner.  I have tested the v9 pv-ticketlock
   patches in 1x and 2x over-commit with 10-vcpu and 20-vcpu VMs.  I have
   tested these patches with and without PLE, as PLE is still not scalable
   with large VMs.
   
   
   Hi Andrew,
   
   Thanks for testing.
   
   System: x3850X5, 40 cores, 80 threads
   
   
   1x over-commit with 10-vCPU VMs (8 VMs) all running dbench:
   --
Total
   ConfigurationThroughput(MB/s)Notes
   
   3.10-default-ple_on  22945   5% CPU 
   in host kernel, 2% spin_lock in guests
   3.10-default-ple_off 23184   5% CPU 
   in host kernel, 2% spin_lock in guests
   3.10-pvticket-ple_on 22895   5% CPU 
   in host kernel, 2% spin_lock in guests
   3.10-pvticket-ple_off23051   5% CPU 
   in host kernel, 2% spin_lock in guests
   [all 1x results look good here]
   
   Yes. The 1x results look too close
   
   
   
   2x over-commit with 10-vCPU VMs (16 VMs) all running dbench:
   ---
Total
   ConfigurationThroughput  Notes
   
   3.10-default-ple_on   6287   55% CPU 
host kernel, 17% spin_lock in guests
   3.10-default-ple_off  1849   2% CPU 
   in host kernel, 95% spin_lock in guests
   3.10-pvticket-ple_on  6691   50% CPU 
   in host kernel, 15% spin_lock in guests
   3.10-pvticket-ple_off16464   8% CPU 
   in host kernel, 33% spin_lock in guests
   
   I see 6.426% improvement with ple_on
   and 161.87% improvement with ple_off. I think this is a very good sign
for the patches
   
   [PLE hinders pv-ticket improvements, but even with PLE off,
 we still off from ideal throughput (somewhere 2)]
   
   
   Okay, The ideal throughput you are referring is getting around atleast
   80% of 1x throughput for over-commit. Yes we are still far away from
   there.
   
   
   1x over-commit with 20-vCPU VMs (4 VMs) all running dbench:
   --
Total
   ConfigurationThroughput  Notes
   
   3.10-default-ple_on  22736   6% CPU 
   in host kernel, 3% spin_lock in guests
   3.10-default-ple_off 23377   5% CPU 
   in host kernel, 3% spin_lock in guests
   3.10-pvticket-ple_on 22471   6% CPU 
   in host kernel, 3% spin_lock in guests
   3.10-pvticket-ple_off23445   5% CPU 
   in host kernel, 3% spin_lock in guests
   [1x looking fine here]
   
   
   I see ple_off is little better here.
   
   
   2x over-commit with 20-vCPU VMs (8 VMs) all running dbench:
   --
Total
   ConfigurationThroughput  Notes
   
   3.10-default-ple_on   1965   70% CPU 
   in host kernel, 34% spin_lock in guests 
   3.10-default-ple_off   226   2% CPU 
   in host kernel, 94% spin_lock in guests
   3.10-pvticket-ple_on  1942   70% CPU 
   in host kernel, 35% spin_lock in guests
   3.10-pvticket-ple_off 8003   11% CPU 
   in host kernel, 70% spin_lock in guests
   [quite bad all around, but pv-tickets with PLE off the best so far.
 Still quite a bit off from ideal throughput]
   
   This is again a remarkable improvement (307%).
   This motivates me to add a 

Re: [PATCH RFC] pci: ACS quirk for AMD southbridge

2013-06-26 Thread Alex Williamson
On Wed, 2013-06-26 at 17:14 +0200, Andreas Hartmann wrote:
 Bjorn Helgaas wrote:
  [...]
 
 It is absolutely relevant. I always have to patch my kernel to get it
 working to put my pci device to VM. Meanwhile I'm doing it for
 kernel 3.9. I would be very glad to get these patches to the kernel as
 they don't do anything bad!

I'd still like to see this get in too.  IIRC, where we left off was that
Joerg had confirmed with the hardware folks that there is no
peer-to-peer between these devices, but we still had questions about
whether that was true for any instance of these vendor/device IDs.
These devices are re-used in several packages and I'm not sure if we
need to somehow figure out what package (i.e. which chipset generation)
we're looking at to know if p2p is used.  Thanks,

Alex
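
One way the earlier "factor in device revs" idea could look -- a
self-contained toy with an invented revision cutoff (0x40 is made up and
would have to come from AMD's documentation); the flag values echo the
PCI ACS bits but are only stand-ins here:

#include <stdio.h>

#define PCI_ACS_RR 0x04
#define PCI_ACS_CR 0x08
#define PCI_ACS_EC 0x20
#define PCI_ACS_DT 0x40

struct pci_dev {
    unsigned int multifunction;
    unsigned char revision;
};

/* Same logic as pci_mf_no_p2p_acs_enabled() in the patch, additionally
 * gated on a (hypothetical) minimum revision. */
static int mf_no_p2p_acs_enabled_rev(struct pci_dev *dev, unsigned short acs_flags)
{
    if (!dev->multifunction || dev->revision < 0x40)
        return -1;                                    /* -ENODEV in the kernel */

    /* Only RR/CR/EC/DT are meaningful for a multifunction device. */
    acs_flags &= (PCI_ACS_RR | PCI_ACS_CR | PCI_ACS_EC | PCI_ACS_DT);

    /* Claim ACS only if nothing beyond RR/CR was requested. */
    return acs_flags & ~(PCI_ACS_RR | PCI_ACS_CR) ? 0 : 1;
}

int main(void)
{
    struct pci_dev dev = { .multifunction = 1, .revision = 0x42 };
    printf("RR|CR -> %d\n", mf_no_p2p_acs_enabled_rev(&dev, PCI_ACS_RR | PCI_ACS_CR)); /* 1 */
    printf("DT    -> %d\n", mf_no_p2p_acs_enabled_rev(&dev, PCI_ACS_DT));              /* 0 */
    return 0;
}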




Re: [PATCH RFC V9 0/19] Paravirtualized ticket spinlocks

2013-06-26 Thread Gleb Natapov
On Wed, Jun 26, 2013 at 07:10:21PM +0530, Raghavendra K T wrote:
 On 06/26/2013 06:22 PM, Gleb Natapov wrote:
 On Wed, Jun 26, 2013 at 01:37:45PM +0200, Andrew Jones wrote:
 On Wed, Jun 26, 2013 at 02:15:26PM +0530, Raghavendra K T wrote:
 On 06/25/2013 08:20 PM, Andrew Theurer wrote:
 On Sun, 2013-06-02 at 00:51 +0530, Raghavendra K T wrote:
 [...]

Re: [PATCH RFC] pci: ACS quirk for AMD southbridge

2013-06-26 Thread Andreas Hartmann
Alex Williamson wrote:
 On Wed, 2013-06-26 at 17:14 +0200, Andreas Hartmann wrote:
 Bjorn Helgaas wrote:
  [...]
 
  I'd still like to see this get in too.  IIRC, where we left off was that
  Joerg had confirmed with the hardware folks that there is no
  peer-to-peer between these devices, but we still had questions about
  whether that was true for any instance of these vendor/device IDs.
  These devices are re-used in several packages and I'm not sure if we
  need to somehow figure out what package (i.e. which chipset generation)
  we're looking at to know if p2p is used. 

Does this statement cover your question?
http://article.gmane.org/gmane.comp.emulators.kvm.devel/99402

Andreas


Re: [PATCH RFC] pci: ACS quirk for AMD southbridge

2013-06-26 Thread Alex Williamson
On Wed, 2013-06-26 at 18:24 +0200, Andreas Hartmann wrote:
 Alex Williamson wrote:
  On Wed, 2013-06-26 at 17:14 +0200, Andreas Hartmann wrote:
  Bjorn Helgaas wrote:
   [...]
 
 Does this statement cover your question?
 http://article.gmane.org/gmane.comp.emulators.kvm.devel/99402

Yeah, perhaps it does.  I initially disregarded it because it's easy to

Re: [nVMX w/ Haswell] KVM unit-tests in L1 - eventinj test fails trying to send NMI

2013-06-26 Thread Jan Kiszka
On 2013-06-26 10:03, Kashyap Chamarthy wrote:
 Thanks for the note, it's very helpful! This test actually fails on
 older CPUs as well, and I can finally reproduce the issue that Jay also
 reported. I'm not able to cure it by going back to 3b656cf764^,
 
 OK, you tried without this commit:
 
 commit 3b656cf764cbc43d3efb9bf5f45c618d4cf0989f
 Author: Jan Kiszka jan.kis...@siemens.com
 Date:   Sun Apr 14 12:12:45 2013 +0200
 
 KVM: nVMX: Fix injection of PENDING_INTERRUPT and NMI_WINDOW exits to L1
 
 Check if the interrupt or NMI window exit is for L1 by testing if it has
 the corresponding controls enabled. This is required when we allow
 direct injection from L0 to L2
 

I first tried reverting to the commit before this one, just like Jay
reported for https://bugzilla.kernel.org/show_bug.cgi?id=58941. But that
just varied the error (kvm reports an internal error); it didn't solve the
issue. Now I simply reverted the commit on top of next, but without any
effect. Looks like those problems are not directly related.

Kashyap, you could do us a favor and try to find out if there was a commit
in the recent history (roughly before I started to hack on nVMX this
year) where this test case succeeded.

TIA,
Jan






[PATCH qom-cpu v3 01/14] kvm: Free current_cpu identifier

2013-06-26 Thread Andreas Färber
Since CPU loops are done as last step in kvm_{insert,remove}_breakpoint()
and kvm_remove_all_breakpoints(), we do not need to distinguish between
invoking CPU and iterated CPUs and can thereby free the identifier for
use as a global variable.

Acked-by: Paolo Bonzini pbonz...@redhat.com
Signed-off-by: Andreas Färber afaer...@suse.de
---
 include/sysemu/kvm.h | 10 +-
 kvm-all.c| 39 +--
 kvm-stub.c   |  6 +++---
 3 files changed, 25 insertions(+), 30 deletions(-)

diff --git a/include/sysemu/kvm.h b/include/sysemu/kvm.h
index fe8bc40..c88aee9 100644
--- a/include/sysemu/kvm.h
+++ b/include/sysemu/kvm.h
@@ -159,11 +159,11 @@ void *kvm_arch_ram_alloc(ram_addr_t size);
 void kvm_setup_guest_memory(void *start, size_t size);
 void kvm_flush_coalesced_mmio_buffer(void);
 
-int kvm_insert_breakpoint(CPUArchState *current_env, target_ulong addr,
+int kvm_insert_breakpoint(CPUArchState *env, target_ulong addr,
   target_ulong len, int type);
-int kvm_remove_breakpoint(CPUArchState *current_env, target_ulong addr,
+int kvm_remove_breakpoint(CPUArchState *env, target_ulong addr,
   target_ulong len, int type);
-void kvm_remove_all_breakpoints(CPUArchState *current_env);
+void kvm_remove_all_breakpoints(CPUArchState *env);
 int kvm_update_guest_debug(CPUArchState *env, unsigned long reinject_trap);
 #ifndef _WIN32
 int kvm_set_signal_mask(CPUState *cpu, const sigset_t *sigset);
@@ -241,9 +241,9 @@ struct kvm_sw_breakpoint *kvm_find_sw_breakpoint(CPUState *cpu,
 
 int kvm_sw_breakpoints_active(CPUState *cpu);
 
-int kvm_arch_insert_sw_breakpoint(CPUState *current_cpu,
+int kvm_arch_insert_sw_breakpoint(CPUState *cpu,
   struct kvm_sw_breakpoint *bp);
-int kvm_arch_remove_sw_breakpoint(CPUState *current_cpu,
+int kvm_arch_remove_sw_breakpoint(CPUState *cpu,
   struct kvm_sw_breakpoint *bp);
 int kvm_arch_insert_hw_breakpoint(target_ulong addr,
   target_ulong len, int type);
diff --git a/kvm-all.c b/kvm-all.c
index 7a1684e..d074597 100644
--- a/kvm-all.c
+++ b/kvm-all.c
@@ -1896,16 +1896,15 @@ int kvm_update_guest_debug(CPUArchState *env, unsigned long reinject_trap)
 return data.err;
 }
 
-int kvm_insert_breakpoint(CPUArchState *current_env, target_ulong addr,
+int kvm_insert_breakpoint(CPUArchState *env, target_ulong addr,
   target_ulong len, int type)
 {
-CPUState *current_cpu = ENV_GET_CPU(current_env);
+CPUState *cpu = ENV_GET_CPU(env);
 struct kvm_sw_breakpoint *bp;
-CPUArchState *env;
 int err;
 
 if (type == GDB_BREAKPOINT_SW) {
-bp = kvm_find_sw_breakpoint(current_cpu, addr);
+bp = kvm_find_sw_breakpoint(cpu, addr);
 if (bp) {
 bp->use_count++;
 return 0;
@@ -1918,14 +1917,13 @@ int kvm_insert_breakpoint(CPUArchState *current_env, target_ulong addr,
 
 bp->pc = addr;
 bp->use_count = 1;
-err = kvm_arch_insert_sw_breakpoint(current_cpu, bp);
+err = kvm_arch_insert_sw_breakpoint(cpu, bp);
 if (err) {
 g_free(bp);
 return err;
 }
 
-QTAILQ_INSERT_HEAD(current_cpu->kvm_state->kvm_sw_breakpoints,
-                   bp, entry);
+QTAILQ_INSERT_HEAD(cpu->kvm_state->kvm_sw_breakpoints, bp, entry);
 } else {
 err = kvm_arch_insert_hw_breakpoint(addr, len, type);
 if (err) {
@@ -1942,16 +1940,15 @@ int kvm_insert_breakpoint(CPUArchState *current_env, target_ulong addr,
 return 0;
 }
 
-int kvm_remove_breakpoint(CPUArchState *current_env, target_ulong addr,
+int kvm_remove_breakpoint(CPUArchState *env, target_ulong addr,
   target_ulong len, int type)
 {
-CPUState *current_cpu = ENV_GET_CPU(current_env);
+CPUState *cpu = ENV_GET_CPU(env);
 struct kvm_sw_breakpoint *bp;
-CPUArchState *env;
 int err;
 
 if (type == GDB_BREAKPOINT_SW) {
-bp = kvm_find_sw_breakpoint(current_cpu, addr);
+bp = kvm_find_sw_breakpoint(cpu, addr);
 if (!bp) {
 return -ENOENT;
 }
@@ -1961,12 +1958,12 @@ int kvm_remove_breakpoint(CPUArchState *current_env, target_ulong addr,
 return 0;
 }
 
-err = kvm_arch_remove_sw_breakpoint(current_cpu, bp);
+err = kvm_arch_remove_sw_breakpoint(cpu, bp);
 if (err) {
 return err;
 }
 
-QTAILQ_REMOVE(current_cpu->kvm_state->kvm_sw_breakpoints, bp, entry);
+QTAILQ_REMOVE(cpu->kvm_state->kvm_sw_breakpoints, bp, entry);
 g_free(bp);
 } else {
 err = kvm_arch_remove_hw_breakpoint(addr, len, type);
@@ -1984,16 +1981,14 @@ int kvm_remove_breakpoint(CPUArchState *current_env, target_ulong addr,
 return 0;
 }
 
-void kvm_remove_all_breakpoints(CPUArchState *current_env)
+void kvm_remove_all_breakpoints(CPUArchState 

[PATCH qom-cpu v3 12/14] target-s390x: Don't overuse ENV_GET_CPU()

2013-06-26 Thread Andreas Färber
Commit 3474b679486caa8f6448bae974e131370f360c13 (Utilize selective
runtime reg sync for hot code paths) introduced two uses of
ENV_GET_CPU() inside target-s390x/ KVM code. In one case we can use a
direct CPU() cast instead.

Cc: Jason J. Herne jjhe...@us.ibm.com
Signed-off-by: Andreas Färber afaer...@suse.de
---
 target-s390x/kvm.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/target-s390x/kvm.c b/target-s390x/kvm.c
index b524c35..4660074 100644
--- a/target-s390x/kvm.c
+++ b/target-s390x/kvm.c
@@ -469,7 +469,7 @@ static int kvm_handle_css_inst(S390CPU *cpu, struct kvm_run *run,
 int r = 0;
 int no_cc = 0;
 CPUS390XState *env = &cpu->env;
-CPUState *cs = ENV_GET_CPU(env);
+CPUState *cs = CPU(cpu);
 
 if (ipa0 != 0xb2) {
 /* Not handled for now. */
-- 
1.8.1.4
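
For readers wondering why the direct cast is safe: a self-contained toy
of the QOM layout (heavily simplified -- the real structs differ).
CPU(cpu) just views the embedded parent object, while ENV_GET_CPU(env)
must walk back from the embedded env:

#include <stdio.h>
#include <stddef.h>

/* Toy model: CPUState is embedded at the start of S390CPU, with the
 * arch env after it. */
typedef struct CPUState { int kvm_vcpu_dirty; } CPUState;
typedef struct CPUS390XState { unsigned long psw_addr; } CPUS390XState;

typedef struct S390CPU {
    CPUState parent_obj;        /* CPU(cpu) is effectively &cpu->parent_obj */
    CPUS390XState env;
} S390CPU;

/* ENV_GET_CPU() has to subtract the env offset to recover the
 * containing object; with the S390CPU pointer already in hand, the
 * direct cast is both cheaper and clearer. */
#define ENV_GET_CPU(e) \
    ((CPUState *)((char *)(e) - offsetof(S390CPU, env)))
#define CPU(obj) ((CPUState *)(obj))

int main(void)
{
    S390CPU cpu = {0};
    printf("same? %d\n", CPU(&cpu) == ENV_GET_CPU(&cpu.env));   /* 1 */
    return 0;
}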



[PATCH qom-cpu v3 13/14] target-s390x: Change handle_{hypercall,diag}() argument to S390CPU

2013-06-26 Thread Andreas Färber
This allows to get rid of the last remaining ENV_GET_CPU() in
target-s390x/ by using CPU() cast directly on the argument.

Cc: Jason J. Herne jjhe...@us.ibm.com
Signed-off-by: Andreas Färber afaer...@suse.de
---
 target-s390x/kvm.c | 12 ++--
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/target-s390x/kvm.c b/target-s390x/kvm.c
index 4660074..33ca7a7 100644
--- a/target-s390x/kvm.c
+++ b/target-s390x/kvm.c
@@ -607,9 +607,10 @@ static int handle_priv(S390CPU *cpu, struct kvm_run *run,
 return r;
 }
 
-static int handle_hypercall(CPUS390XState *env, struct kvm_run *run)
+static int handle_hypercall(S390CPU *cpu, struct kvm_run *run)
 {
-CPUState *cs = ENV_GET_CPU(env);
+CPUState *cs = CPU(cpu);
+CPUS390XState *env = &cpu->env;
 
 kvm_s390_get_registers_partial(cs);
 cs-kvm_vcpu_dirty = true;
@@ -618,13 +619,13 @@ static int handle_hypercall(CPUS390XState *env, struct kvm_run *run)
 return 0;
 }
 
-static int handle_diag(CPUS390XState *env, struct kvm_run *run, int ipb_code)
+static int handle_diag(S390CPU *cpu, struct kvm_run *run, int ipb_code)
 {
 int r = 0;
 
 switch (ipb_code) {
 case DIAG_KVM_HYPERCALL:
-r = handle_hypercall(env, run);
+r = handle_hypercall(cpu, run);
 break;
 case DIAG_KVM_BREAKPOINT:
 sleep(10);
@@ -735,7 +736,6 @@ out:
 
 static int handle_instruction(S390CPU *cpu, struct kvm_run *run)
 {
-CPUS390XState *env = cpu-env;
 unsigned int ipa0 = (run->s390_sieic.ipa & 0xff00);
 uint8_t ipa1 = run->s390_sieic.ipa & 0x00ff;
 int ipb_code = (run->s390_sieic.ipb & 0x0fff0000) >> 16;
@@ -749,7 +749,7 @@ static int handle_instruction(S390CPU *cpu, struct kvm_run *run)
 r = handle_priv(cpu, run, ipa0 >> 8, ipa1);
 break;
 case IPA0_DIAG:
-r = handle_diag(env, run, ipb_code);
+r = handle_diag(cpu, run, ipb_code);
 break;
 case IPA0_SIGP:
 r = handle_sigp(cpu, run, ipa1);
-- 
1.8.1.4



[PATCH qom-cpu v3 03/14] kvm: Change kvm_remove_all_breakpoints() argument to CPUState

2013-06-26 Thread Andreas Färber
Acked-by: Paolo Bonzini pbonz...@redhat.com
Reviewed-by: Richard Henderson r...@twiddle.net
Signed-off-by: Andreas Färber afaer...@suse.de
---
 gdbstub.c| 2 +-
 include/sysemu/kvm.h | 2 +-
 kvm-all.c| 6 +++---
 kvm-stub.c   | 2 +-
 4 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/gdbstub.c b/gdbstub.c
index 3101a43..9e7f7a1 100644
--- a/gdbstub.c
+++ b/gdbstub.c
@@ -2019,7 +2019,7 @@ static void gdb_breakpoint_remove_all(void)
 CPUArchState *env;
 
 if (kvm_enabled()) {
-kvm_remove_all_breakpoints(gdbserver_state->c_cpu);
+kvm_remove_all_breakpoints(ENV_GET_CPU(gdbserver_state->c_cpu));
 return;
 }
 
diff --git a/include/sysemu/kvm.h b/include/sysemu/kvm.h
index c88aee9..9460d5a 100644
--- a/include/sysemu/kvm.h
+++ b/include/sysemu/kvm.h
@@ -163,7 +163,7 @@ int kvm_insert_breakpoint(CPUArchState *env, target_ulong addr,
   target_ulong len, int type);
 int kvm_remove_breakpoint(CPUArchState *env, target_ulong addr,
   target_ulong len, int type);
-void kvm_remove_all_breakpoints(CPUArchState *env);
+void kvm_remove_all_breakpoints(CPUState *cpu);
 int kvm_update_guest_debug(CPUArchState *env, unsigned long reinject_trap);
 #ifndef _WIN32
 int kvm_set_signal_mask(CPUState *cpu, const sigset_t *sigset);
diff --git a/kvm-all.c b/kvm-all.c
index d074597..ee0ee02 100644
--- a/kvm-all.c
+++ b/kvm-all.c
@@ -1981,11 +1981,11 @@ int kvm_remove_breakpoint(CPUArchState *env, target_ulong addr,
 return 0;
 }
 
-void kvm_remove_all_breakpoints(CPUArchState *env)
+void kvm_remove_all_breakpoints(CPUState *cpu)
 {
-CPUState *cpu = ENV_GET_CPU(env);
 struct kvm_sw_breakpoint *bp, *next;
 KVMState *s = cpu->kvm_state;
+CPUArchState *env;
 
 QTAILQ_FOREACH_SAFE(bp, s->kvm_sw_breakpoints, entry, next) {
 if (kvm_arch_remove_sw_breakpoint(cpu, bp) != 0) {
@@ -2026,7 +2026,7 @@ int kvm_remove_breakpoint(CPUArchState *env, target_ulong addr,
 return -EINVAL;
 }
 
-void kvm_remove_all_breakpoints(CPUArchState *env)
+void kvm_remove_all_breakpoints(CPUState *cpu)
 {
 }
 #endif /* !KVM_CAP_SET_GUEST_DEBUG */
diff --git a/kvm-stub.c b/kvm-stub.c
index 76da61e..a6c2b01 100644
--- a/kvm-stub.c
+++ b/kvm-stub.c
@@ -95,7 +95,7 @@ int kvm_remove_breakpoint(CPUArchState *env, target_ulong 
addr,
 return -EINVAL;
 }
 
-void kvm_remove_all_breakpoints(CPUArchState *env)
+void kvm_remove_all_breakpoints(CPUState *cpu)
 {
 }
 
-- 
1.8.1.4



Re: [PATCH RFC V9 0/19] Paravirtualized ticket spinlocks

2013-06-26 Thread Raghavendra K T

On 06/26/2013 09:41 PM, Gleb Natapov wrote:

On Wed, Jun 26, 2013 at 07:10:21PM +0530, Raghavendra K T wrote:

On 06/26/2013 06:22 PM, Gleb Natapov wrote:

On Wed, Jun 26, 2013 at 01:37:45PM +0200, Andrew Jones wrote:

On Wed, Jun 26, 2013 at 02:15:26PM +0530, Raghavendra K T wrote:

On 06/25/2013 08:20 PM, Andrew Theurer wrote:

On Sun, 2013-06-02 at 00:51 +0530, Raghavendra K T wrote:

This series replaces the existing paravirtualized spinlock mechanism
with a paravirtualized ticketlock mechanism. The series provides
implementation for both Xen and KVM.

Changes in V9:
- Changed spin_threshold to 32k to avoid excess halt exits that are
causing undercommit degradation (after PLE handler improvement).
- Added  kvm_irq_delivery_to_apic (suggested by Gleb)
- Optimized halt exit path to use PLE handler

V8 of PVspinlock was posted last year. After Avi's suggestions to look
at PLE handler's improvements, various optimizations in PLE handling
have been tried.


Sorry for not posting this sooner.  I have tested the v9 pv-ticketlock
patches in 1x and 2x over-commit with 10-vcpu and 20-vcpu VMs.  I have
tested these patches with and without PLE, as PLE is still not scalable
with large VMs.



Hi Andrew,

Thanks for testing.


System: x3850X5, 40 cores, 80 threads


1x over-commit with 10-vCPU VMs (8 VMs) all running dbench:
-----------------------------------------------------------
Configuration           Total Throughput (MB/s)   Notes

3.10-default-ple_on     22945   5% CPU in host kernel, 2% spin_lock in guests
3.10-default-ple_off    23184   5% CPU in host kernel, 2% spin_lock in guests
3.10-pvticket-ple_on    22895   5% CPU in host kernel, 2% spin_lock in guests
3.10-pvticket-ple_off   23051   5% CPU in host kernel, 2% spin_lock in guests
[all 1x results look good here]


Yes. The 1x results look too close




2x over-commit with 10-vCPU VMs (16 VMs) all running dbench:
------------------------------------------------------------
Configuration           Total Throughput (MB/s)   Notes

3.10-default-ple_on      6287   55% CPU in host kernel, 17% spin_lock in guests
3.10-default-ple_off     1849   2% CPU in host kernel, 95% spin_lock in guests
3.10-pvticket-ple_on     6691   50% CPU in host kernel, 15% spin_lock in guests
3.10-pvticket-ple_off   16464   8% CPU in host kernel, 33% spin_lock in guests


I see a 6.426% improvement with ple_on
and a 161.87% improvement with ple_off. I think this is a very good sign
for the patches.


[PLE hinders pv-ticket improvements, but even with PLE off,
 we are still off from ideal throughput (somewhere >2x)]



Okay. The ideal throughput you are referring to is getting at least around
80% of 1x throughput for over-commit. Yes, we are still far away from
there.



1x over-commit with 20-vCPU VMs (4 VMs) all running dbench:
-----------------------------------------------------------
Configuration           Total Throughput (MB/s)   Notes

3.10-default-ple_on     22736   6% CPU in host kernel, 3% spin_lock in guests
3.10-default-ple_off    23377   5% CPU in host kernel, 3% spin_lock in guests
3.10-pvticket-ple_on    22471   6% CPU in host kernel, 3% spin_lock in guests
3.10-pvticket-ple_off   23445   5% CPU in host kernel, 3% spin_lock in guests
[1x looking fine here]



I see ple_off is a little better here.



2x over-commit with 20-vCPU VMs (8 VMs) all running dbench:
-----------------------------------------------------------
Configuration           Total Throughput (MB/s)   Notes

3.10-default-ple_on      1965   70% CPU in host kernel, 34% spin_lock in guests
3.10-default-ple_off      226   2% CPU in host kernel, 94% spin_lock in guests
3.10-pvticket-ple_on     1942   70% CPU in host kernel, 35% spin_lock in guests
3.10-pvticket-ple_off    8003   11% CPU in host kernel, 70% spin_lock in guests
[quite bad all around, but pv-tickets with PLE off the best so far.
 Still quite a bit off from ideal throughput]


This is again a remarkable improvement (307%).
This motivates me to add a patch to disable PLE when pvspinlock is on.
Probably we can add a hypercall that disables PLE in the KVM init path.
But the only problem I see is what if the guests are mixed

  (i.e. one guest has pvspinlock support 

[PATCH-next v2] kvm: don't try to take mmu_lock while holding the main raw kvm_lock

2013-06-26 Thread Paul Gortmaker
In commit e935b8372cf8 ("KVM: Convert kvm_lock to raw_spinlock"),
the kvm_lock was made a raw lock.  However, the kvm mmu_shrink()
function tries to grab the (non-raw) mmu_lock within the scope of
the raw locked kvm_lock being held.  This leads to the following:

BUG: sleeping function called from invalid context at kernel/rtmutex.c:659
in_atomic(): 1, irqs_disabled(): 0, pid: 55, name: kswapd0
Preemption disabled at:[a0376eac] mmu_shrink+0x5c/0x1b0 [kvm]

Pid: 55, comm: kswapd0 Not tainted 3.4.34_preempt-rt
Call Trace:
 [8106f2ad] __might_sleep+0xfd/0x160
 [817d8d64] rt_spin_lock+0x24/0x50
 [a0376f3c] mmu_shrink+0xec/0x1b0 [kvm]
 [8111455d] shrink_slab+0x17d/0x3a0
 [81151f00] ? mem_cgroup_iter+0x130/0x260
 [8111824a] balance_pgdat+0x54a/0x730
 [8111fe47] ? set_pgdat_percpu_threshold+0xa7/0xd0
 [811185bf] kswapd+0x18f/0x490
 [81070961] ? get_parent_ip+0x11/0x50
 [81061970] ? __init_waitqueue_head+0x50/0x50
 [81118430] ? balance_pgdat+0x730/0x730
 [81060d2b] kthread+0xdb/0xe0
 [8106e122] ? finish_task_switch+0x52/0x100
 [817e1e94] kernel_thread_helper+0x4/0x10
 [81060c50] ? __init_kthread_worker+0x

Note that the above was seen on an earlier 3.4 preempt-rt kernel, where
the lock distinction (raw vs. non-raw) actually matters.

Since we only use the lock for protecting the vm_list, once we've found
the instance we want, we can shuffle it to the end of the list and then
drop the kvm_lock before taking the mmu_lock.  We can do this because
after the mmu operations are completed, we break -- i.e. we don't continue
list processing, so it doesn't matter if the list changed around us.

Since the shrinker code runs asynchronously with respect to KVM, we do
need to still protect against the users_count going to zero and then
kvm_destroy_vm() being called, so we use kvm_get_kvm/kvm_put_kvm, as
suggested by Paolo.

Cc: Paolo Bonzini pbonz...@redhat.com
Cc: Gleb Natapov g...@redhat.com
Signed-off-by: Paul Gortmaker paul.gortma...@windriver.com
---

[v2: add the kvm_get_kvm, update comments and log appropriately]

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 748e0d8..662b679 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -4322,6 +4322,7 @@ mmu_shrink_scan(struct shrinker *shrink, struct 
shrink_control *sc)
 {
struct kvm *kvm;
int nr_to_scan = sc->nr_to_scan;
+   int found = 0;
unsigned long freed = 0;
 
raw_spin_lock(&kvm_lock);
@@ -4349,6 +4350,18 @@ mmu_shrink_scan(struct shrinker *shrink, struct 
shrink_control *sc)
continue;
 
idx = srcu_read_lock(&kvm->srcu);
+
+   list_move_tail(&kvm->vm_list, &vm_list);
+   found = 1;
+   /*
+* We are done with the list, so drop kvm_lock, as we can't be
+* holding a raw lock and take the non-raw mmu_lock.  But we
+* don't want to be unprotected from kvm_destroy_vm either,
+* so we bump users_count.
+*/
+   kvm_get_kvm(kvm);
+   raw_spin_unlock(&kvm_lock);
+
spin_lock(&kvm->mmu_lock);
 
if (kvm_has_zapped_obsolete_pages(kvm)) {
@@ -4363,6 +4376,7 @@ mmu_shrink_scan(struct shrinker *shrink, struct 
shrink_control *sc)
 
 unlock:
spin_unlock(&kvm->mmu_lock);
+   kvm_put_kvm(kvm);
srcu_read_unlock(&kvm->srcu, idx);
 
/*
@@ -4370,11 +4384,12 @@ unlock:
 * per-vm shrinkers cry out
 * sadness comes quickly
 */
-   list_move_tail(&kvm->vm_list, &vm_list);
break;
}
 
-   raw_spin_unlock(&kvm_lock);
+   if (!found)
+   raw_spin_unlock(&kvm_lock);
+
return freed;
 
 }
-- 
1.8.1.2



[PATCH 0/3] KVM: small type cleanups

2013-06-26 Thread Mathias Krause
Hi,

this small series contains a few type and style cleanups. It has no
impact on the generated code but removes a few small nits from the
code.

Please apply!

Thanks,

Mathias Krause (3):
  KVM: VMX: Use proper types to access const arrays
  KVM: VMX: Use size_t to store sizeof() values
  KVM: x86: Drop useless cast

 arch/x86/kvm/vmx.c |   19 +--
 arch/x86/kvm/x86.c |2 +-
 2 files changed, 10 insertions(+), 11 deletions(-)

-- 
1.7.10.4



[PATCH 2/3] KVM: VMX: Use size_t to store sizeof() values

2013-06-26 Thread Mathias Krause
The type for storing values of the sizeof operator should be size_t.
No semantical changes, only type correctness.

Signed-off-by: Mathias Krause mini...@googlemail.com
---
 arch/x86/kvm/vmx.c |4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 7393164..cd9090f 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -3909,7 +3909,7 @@ static void free_vpid(struct vcpu_vmx *vmx)
 static void __vmx_disable_intercept_for_msr(unsigned long *msr_bitmap,
u32 msr, int type)
 {
-   int f = sizeof(unsigned long);
+   const size_t f = sizeof(unsigned long);
 
if (!cpu_has_vmx_msr_bitmap())
return;
@@ -3944,7 +3944,7 @@ static void __vmx_disable_intercept_for_msr(unsigned long 
*msr_bitmap,
 static void __vmx_enable_intercept_for_msr(unsigned long *msr_bitmap,
u32 msr, int type)
 {
-   int f = sizeof(unsigned long);
+   const size_t f = sizeof(unsigned long);
 
if (!cpu_has_vmx_msr_bitmap())
return;
-- 
1.7.10.4



[PATCH 3/3] KVM: x86: Drop useless cast

2013-06-26 Thread Mathias Krause
Void pointers don't need no casting, drop it.

Signed-off-by: Mathias Krause mini...@googlemail.com
---
 arch/x86/kvm/x86.c |2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index e8ba99c..472350c 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -5300,7 +5300,7 @@ static struct notifier_block pvclock_gtod_notifier = {
 int kvm_arch_init(void *opaque)
 {
int r;
-   struct kvm_x86_ops *ops = (struct kvm_x86_ops *)opaque;
+   struct kvm_x86_ops *ops = opaque;
 
if (kvm_x86_ops) {
printk(KERN_ERR "kvm: already loaded the other module\n");
-- 
1.7.10.4



[PATCH 1/3] KVM: VMX: Use proper types to access const arrays

2013-06-26 Thread Mathias Krause
Use a const pointer type instead of casting away the const qualifier
from const arrays. Keep the pointer array on the stack, nonetheless.
Making it static just increases the object size.

Signed-off-by: Mathias Krause mini...@googlemail.com
---
 arch/x86/kvm/vmx.c |   15 +++
 1 file changed, 7 insertions(+), 8 deletions(-)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 260a919..7393164 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -5956,8 +5956,8 @@ static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx)
unsigned long field;
u64 field_value;
struct vmcs *shadow_vmcs = vmx->nested.current_shadow_vmcs;
-   unsigned long *fields = (unsigned long *)shadow_read_write_fields;
-   int num_fields = max_shadow_read_write_fields;
+   const unsigned long *fields = shadow_read_write_fields;
+   const int num_fields = max_shadow_read_write_fields;
 
vmcs_load(shadow_vmcs);
 
@@ -5986,12 +5986,11 @@ static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx)
 
 static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx)
 {
-   unsigned long *fields[] = {
-   (unsigned long *)shadow_read_write_fields,
-   (unsigned long *)shadow_read_only_fields
+   const unsigned long *fields[] = {
+   shadow_read_write_fields,
+   shadow_read_only_fields
};
-   int num_lists =  ARRAY_SIZE(fields);
-   int max_fields[] = {
+   const int max_fields[] = {
max_shadow_read_write_fields,
max_shadow_read_only_fields
};
@@ -6002,7 +6001,7 @@ static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx)
 
vmcs_load(shadow_vmcs);
 
-   for (q = 0; q < num_lists; q++) {
+   for (q = 0; q < ARRAY_SIZE(fields); q++) {
for (i = 0; i < max_fields[q]; i++) {
field = fields[q][i];
vmcs12_read_any(&vmx->vcpu, field, &field_value);
-- 
1.7.10.4



Rebaseing kvm-arm-next

2013-06-26 Thread Christoffer Dall
Hi all,

I messed up my workflow earlier on, so I had to rebase kvm-arm-next onto
kvm/next.  I will do everything in my power to avoid this in the
future.

Sorry for any troubles.

-Christoffer


[GIT PULL] KVM/ARM queue for 3.11

2013-06-26 Thread Christoffer Dall
Hi Gleb and Paolo,

The following changes since commit 87d41fb4da6467622b7a87fd6afe8071abab6dae:

  KVM: s390: Fixed priority of execution in STSI (2013-06-20 23:33:01 +0200)

are available in the git repository at:

  git://git.linaro.org/people/cdall/linux-kvm-arm.git tags/kvm-arm-3.11

for you to fetch changes up to 8bd4ffd6b3a98f00267051dc095076ea2ff06ea8:

  ARM: kvm: don't include drivers/virtio/Kconfig (2013-06-26 10:50:06 -0700)


Thanks,
-Christoffer


Anup Patel (1):
  ARM: KVM: Allow host virt timer irq to be different from guest timer virt 
irq

Arnd Bergmann (1):
  ARM: kvm: don't include drivers/virtio/Kconfig

Christoffer Dall (1):
  Update MAINTAINERS: KVM/ARM work now funded by Linaro

Dave P Martin (1):
  ARM: KVM: Don't handle PSCI calls via SMC

Geoff Levand (1):
  arm/kvm: Cleanup KVM_ARM_MAX_VCPUS logic

Marc Zyngier (7):
  ARM: KVM: remove dead prototype for __kvm_tlb_flush_vmid
  ARM: KVM: use phys_addr_t instead of unsigned long long for HYP PGDs
  ARM: KVM: don't special case PC when doing an MMIO
  ARM: KVM: get rid of S2_PGD_SIZE
  ARM: KVM: perform save/restore of PAR
  ARM: KVM: add missing dsb before invalidating Stage-2 TLBs
  ARM: KVM: clear exclusive monitor on all exception returns

 MAINTAINERS|  4 ++--
 arch/arm/include/asm/kvm_arm.h |  1 -
 arch/arm/include/asm/kvm_asm.h | 24 
 arch/arm/include/asm/kvm_emulate.h |  5 -
 arch/arm/include/asm/kvm_host.h|  9 +++--
 arch/arm/kvm/Kconfig   |  8 +++-
 arch/arm/kvm/arm.c |  8 
 arch/arm/kvm/coproc.c  |  4 
 arch/arm/kvm/handle_exit.c |  3 ---
 arch/arm/kvm/interrupts.S  | 16 +++-
 arch/arm/kvm/interrupts_head.S | 10 --
 arch/arm/kvm/mmio.c|  6 --
 arch/arm/kvm/mmu.c |  3 ---
 arch/arm/kvm/psci.c|  2 +-
 arch/arm/kvm/reset.c   | 12 
 include/kvm/arm_arch_timer.h   |  4 
 virt/kvm/arm/arch_timer.c  | 29 -
 17 files changed, 92 insertions(+), 56 deletions(-)


Re: [PATCH] pci: Enable overrides for missing ACS capabilities

2013-06-26 Thread Alex Williamson
On Mon, 2013-06-24 at 11:43 -0600, Bjorn Helgaas wrote:
 On Wed, Jun 19, 2013 at 6:43 AM, Don Dutile ddut...@redhat.com wrote:
  On 06/18/2013 10:52 PM, Bjorn Helgaas wrote:
 
  On Tue, Jun 18, 2013 at 5:03 PM, Don Dutile ddut...@redhat.com wrote:
 
  On 06/18/2013 06:22 PM, Alex Williamson wrote:
 
 
  On Tue, 2013-06-18 at 15:31 -0600, Bjorn Helgaas wrote:
 
 
  On Tue, Jun 18, 2013 at 12:20 PM, Alex Williamson
  alex.william...@redhat.com   wrote:
 
 
  On Tue, 2013-06-18 at 11:28 -0600, Bjorn Helgaas wrote:
 
 
  On Thu, May 30, 2013 at 12:40:19PM -0600, Alex Williamson wrote:
 
  ...
 
  Who do you expect to decide whether to use this option?  I think it
  requires intimate knowledge of how the device works.
 
  I think the benefit of using the option is that it makes assignment
  of
  devices to guests more flexible, which will make it attractive to
  users.
  But most users have no way of knowing whether it's actually *safe* to
  use this.  So I worry that you're adding an easy way to pretend
  isolation
  exists when there's no good way of being confident that it actually
  does.
 
 
  ...
 
 
  I wonder if we should taint the kernel if this option is used (but not
  for specific devices added to pci_dev_acs_enabled[]).  It would also
  be nice if pci_dev_specific_acs_enabled() gave some indication in
  dmesg for the specific devices you're hoping to add to
  pci_dev_acs_enabled[].  It's not an enumeration-time quirk right now,
  so I'm not sure how we'd limit it to one message per device.
 
 
  Right, setup vs use and getting single prints is a lot of extra code.
  Tainting is troublesome for support, Don had some objections when I
  suggested the same to him.
 
  For RH GSS (Global Support Services), a 'taint' in the kernel printk
  means
  RH doesn't support that system.  The 'non-support' due to 'taint' being
  printed
  out in this case may be incorrect -- RH may support that use, at least
  until
  a more sufficient patched kernel is provided.
  Thus my dissension that 'taint' be output.  WARN is ok. 'driver beware',
  'unleashed dog afoot' sure...
 
 
  So ...  that's really a RH-specific support issue, and easily worked
  around by RH adding a patch that turns off tainting.
 
  sure. what's another patch to the thousands... :-/
 
  It still sounds like a good idea to me for upstream, where use of this
  option can very possibly lead to corruption or information leakage
  between devices the user claimed were isolated, but in fact were not.
 
  Did I miss something?  This patch provides a user-level/chosen override;
  like all other overrides, (pci=realloc, etc.), it can lead to a failing
  system.
  IMO, this patch is no different.  If you want to tag this patch with taint,
  then let's audit all the (PCI) overrides and taint them appropriately.
  Taint should be reserved to changes to the kernel that were done outside
  the development of the kernel, or with the explicit intent to circumvent
  the normal operation of the kernel.  This patch provides a way to enable
  ACS checking to succeed when the devices have not provided sufficiently
  complete
  ACS information.  i.e., it's a growth path for PCIe-ACS and its need for
  proper support.
 
 We're telling the kernel to assume something (the hardware provides
 protection) that may not be true.  If that assumption turns out to be
 false, the result is that a VM can be crashed or comprised by another
 VM.
 
 One difference I see is that this override can lead to a crash that
 looks like random memory corruption and has no apparent connection to
 the actual cause.  Most other overrides won't cause run-time crashes
 (I think they're more likely to cause boot or device configuration
 failures), and the dmesg log will probably have good clues as to the
 reason.
 
 But the possibility of compromise is probably even more serious,
 because there would be no crash at all, and we'd have no indication
 that VM A read or corrupted data in VM B.  I'm very concerned about
 that, enough so that it's not clear to me that an override belongs in
 the upstream kernel at all.
 
 Yes, that would mean some hardware is not suitable for device
 assignment.  That just sounds like if hardware manufacturers do their
 homework and support ACS properly, their hardware is more useful for
 virtualization than other hardware.  I don't see the problem with
 that.

That's easy to say for someone that doesn't get caught trying to explain
this to users over and over.  In many cases devices don't do
peer-to-peer and missing ACS is an oversight.  I imagine that quite a
few vendors also see the ACS capability as a means to allow control of
ACS and therefore see it as a much larger investment than just providing
an empty ACS structure in config space to indicate the lack of
peer-to-peer.

Even if we taint the kernel when this is enabled and add extremely
verbose warnings in kernel-parameters.txt, I think there's value to
providing an on-the-spot workaround to users.  In many 

Re: [PATCH-next v2] kvm: don't try to take mmu_lock while holding the main raw kvm_lock

2013-06-26 Thread Paolo Bonzini
Il 26/06/2013 20:11, Paul Gortmaker ha scritto:
   spin_unlock(&kvm->mmu_lock);
 + kvm_put_kvm(kvm);
   srcu_read_unlock(&kvm->srcu, idx);
  

kvm_put_kvm needs to go last.  I can fix when applying, but I'll wait
for Gleb to take a look too.

Paolo


[RFC PATCH] uio: uio_pci_generic: Add support for MSI interrupts

2013-06-26 Thread Guenter Roeck
Enable support for MSI interrupts if the device supports it.
Since MSI interrupts are edge triggered, it is no longer necessary to
disable interrupts in the kernel and re-enable them from user-space.
Instead, clearing the interrupt condition in the user space application
automatically re-enables the interrupt.

Signed-off-by: Guenter Roeck li...@roeck-us.net
---
An open question is if we can just do this unconditionally
or if there should be some flag to enable it. A module parameter, maybe ?
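
For reference, a minimal sketch of the user-space side this enables,
assuming MSI is in use.  The register offset and its clear semantics are
hypothetical; only the /dev/uioX read() and the sysfs resource0 mapping
are standard UIO/PCI interfaces:

/*
 * Hypothetical user-space consumer of uio_pci_generic with MSI.
 * IRQ_STATUS_REG and its write-to-clear behaviour are made up.
 */
#include <fcntl.h>
#include <stdint.h>
#include <sys/mman.h>
#include <unistd.h>

#define IRQ_STATUS_REG  0x10    /* assumed device-specific register */

int main(void)
{
    int uio_fd = open("/dev/uio0", O_RDWR);
    int res_fd = open("/sys/class/uio/uio0/device/resource0", O_RDWR);
    volatile uint32_t *bar0 = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
                                   MAP_SHARED, res_fd, 0);
    uint32_t count;

    if (uio_fd < 0 || res_fd < 0 || bar0 == MAP_FAILED)
        return 1;

    for (;;) {
        /* blocks until the next interrupt; returns the event count */
        if (read(uio_fd, &count, sizeof(count)) != sizeof(count))
            break;
        /*
         * MSI is edge triggered: clearing the device's own interrupt
         * condition is enough to get the next interrupt; no INTx
         * re-enable step is needed.
         */
        bar0[IRQ_STATUS_REG / 4] = 1;
    }
    return 0;
}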

 Documentation/DocBook/uio-howto.tmpl |   23 ---
 drivers/uio/uio_pci_generic.c|   15 ---
 2 files changed, 32 insertions(+), 6 deletions(-)

diff --git a/Documentation/DocBook/uio-howto.tmpl 
b/Documentation/DocBook/uio-howto.tmpl
index 9561815..69b54e0 100644
--- a/Documentation/DocBook/uio-howto.tmpl
+++ b/Documentation/DocBook/uio-howto.tmpl
@@ -46,6 +46,12 @@ GPL version 2.
 
 revhistory
revision
+   revnumber0.10/revnumber
+   date2013-06-26/date
+   authorinitialsgr/authorinitials
+   revremarkAdded MSI support to uio_pci_generic./revremark
+   /revision
+   revision
revnumber0.9/revnumber
date2009-07-16/date
authorinitialsmst/authorinitials
@@ -935,15 +941,26 @@ and look in the output for failure reasons
 sect1 id=uio_pci_generic_internals
 titleThings to know about uio_pci_generic/title
para
-Interrupts are handled using the Interrupt Disable bit in the PCI command
+Interrupts are handled either as MSI interrupts (if the device supports it) or
+as legacy INTx interrupts.
+   /para
+   para
+uio_pci_generic automatically configures a device to use MSI interrupts
+if the device supports it. If an MSI interrupt is received, the user space
+driver is notified. Since MSI interrupts are edge sensitive, the user space
+driver needs to clear the interrupt condition in the device before blocking
+and waiting for more interrupts.
+   /para
+   para
+Legacy interrupts are handled using the Interrupt Disable bit in the PCI 
command
 register and Interrupt Status bit in the PCI status register.  All devices
 compliant to PCI 2.3 (circa 2002) and all compliant PCI Express devices should
 support these bits.  uio_pci_generic detects this support, and won't bind to
 devices which do not support the Interrupt Disable Bit in the command register.
/para
para
-On each interrupt, uio_pci_generic sets the Interrupt Disable bit.
-This prevents the device from generating further interrupts
+If legacy interrupts are used, uio_pci_generic sets the Interrupt Disable bit 
on
+each interrupt. This prevents the device from generating further interrupts
 until the bit is cleared. The userspace driver should clear this
 bit before blocking and waiting for more interrupts.
/para
diff --git a/drivers/uio/uio_pci_generic.c b/drivers/uio/uio_pci_generic.c
index 14aa10c..3366fdb 100644
--- a/drivers/uio/uio_pci_generic.c
+++ b/drivers/uio/uio_pci_generic.c
@@ -32,6 +32,7 @@
 struct uio_pci_generic_dev {
struct uio_info info;
struct pci_dev *pdev;
+   bool have_msi;
 };
 
 static inline struct uio_pci_generic_dev *
@@ -46,7 +47,7 @@ static irqreturn_t irqhandler(int irq, struct uio_info *info)
 {
struct uio_pci_generic_dev *gdev = to_uio_pci_generic_dev(info);
 
-   if (!pci_check_and_mask_intx(gdev->pdev))
+   if (!gdev->have_msi && !pci_check_and_mask_intx(gdev->pdev))
return IRQ_NONE;
 
/* UIO core will signal the user process. */
@@ -58,6 +59,7 @@ static int probe(struct pci_dev *pdev,
 {
struct uio_pci_generic_dev *gdev;
int err;
+   bool have_msi = false;
 
err = pci_enable_device(pdev);
if (err) {
@@ -73,7 +75,9 @@ static int probe(struct pci_dev *pdev,
return -ENODEV;
}
 
-   if (!pci_intx_mask_supported(pdev)) {
+   if (!pci_enable_msi(pdev)) {
+   have_msi = true;
+   } else if (!pci_intx_mask_supported(pdev)) {
err = -ENODEV;
goto err_verify;
}
@@ -84,10 +88,11 @@ static int probe(struct pci_dev *pdev,
goto err_alloc;
}
 
+   gdev->have_msi = have_msi;
gdev->info.name = "uio_pci_generic";
gdev->info.version = DRIVER_VERSION;
gdev->info.irq = pdev->irq;
-   gdev->info.irq_flags = IRQF_SHARED;
+   gdev->info.irq_flags = have_msi ? 0 : IRQF_SHARED;
gdev->info.handler = irqhandler;
gdev->pdev = pdev;
 
@@ -99,6 +104,8 @@ static int probe(struct pci_dev *pdev,
 err_register:
kfree(gdev);
 err_alloc:
+   if (have_msi)
+   pci_disable_msi(pdev);
 err_verify:
pci_disable_device(pdev);
return err;
@@ -109,6 +116,8 @@ static void remove(struct pci_dev *pdev)
struct uio_pci_generic_dev *gdev = pci_get_drvdata(pdev);
 
uio_unregister_device(&gdev->info);
+   if (gdev->have_msi)
+   

Migration route from Parallels on Mac for Windows images?

2013-06-26 Thread Ken Roberts
Sorry for the user query but I'm not finding expertise on the Linux mailing 
lists I belong to.  The web site says one-off user questions are OK.

I have a few VM images on Parallels 8 for Mac. I want them to be on KVM/Linux.

Some of the images are Linux, but the critical ones are a few types of Windows. 
 I don't want to trash my licenses.

I noticed that kvm-img has a parallels format option, and it seems to work 
while the conversion is going on.  I've tried kvm-img to convert to qcow2 and 
to raw; in both cases the image converts but the disk is not bootable.  The only 
file that kvm-img doesn't immediately fail on is the one that contains the data.

The best answer to my problem is to find out how to make the disk bootable.

The next best answer is to find out if there is a reliable migration path, even 
if it means going to VMware first.

Also, if VMware is a necessary intermediate point, it would help to know which 
VMware format to use for best results.

I'm not a KVM expert, I've made some VMs on LVM and installed Linux on them 
with bridged networking, that's about the extent of it.  For the record that 
was insanely simple.

Thanks.

--
Ken Roberts
k...@9ci.com
ken.roberts163 @ skype
605-222-5758 @ cell




Re: [PATCH-next v2] kvm: don't try to take mmu_lock while holding the main raw kvm_lock

2013-06-26 Thread Paul Gortmaker
[Re: [PATCH-next v2] kvm: don't try to take mmu_lock while holding the main raw 
kvm_lock] On 26/06/2013 (Wed 23:59) Paolo Bonzini wrote:

 Il 26/06/2013 20:11, Paul Gortmaker ha scritto:
  spin_unlock(kvm-mmu_lock);
  +   kvm_put_kvm(kvm);
  srcu_read_unlock(kvm-srcu, idx);
   
 
 kvm_put_kvm needs to go last.  I can fix when applying, but I'll wait
 for Gleb to take a look too.

I'm curious why you would say that -- since the way I sent it has the
lock teardown symmetrical and opposite to the build-up, e.g.

idx = srcu_read_lock(&kvm->srcu);

[...]

+   kvm_get_kvm(kvm);

[...]
spin_lock(&kvm->mmu_lock);
 
[...]

 unlock:
spin_unlock(&kvm->mmu_lock);
+   kvm_put_kvm(kvm);
srcu_read_unlock(&kvm->srcu, idx);
 
You'd originally said to put the kvm_get_kvm where it currently is;
perhaps instead we want the get/put to encompass the whole 
srcu_read locked section?

P.
--

 
 Paolo


Re: [PATCH 3/6 v5] powerpc: export debug registers save function for KVM

2013-06-26 Thread Stephen Rothwell
Hi,

On Wed, 26 Jun 2013 11:12:23 +0530 Bharat Bhushan r65...@freescale.com wrote:

 diff --git a/arch/powerpc/include/asm/switch_to.h 
 b/arch/powerpc/include/asm/switch_to.h
 index 200d763..50b357f 100644
 --- a/arch/powerpc/include/asm/switch_to.h
 +++ b/arch/powerpc/include/asm/switch_to.h
 @@ -30,6 +30,10 @@ extern void enable_kernel_spe(void);
  extern void giveup_spe(struct task_struct *);
  extern void load_up_spe(struct task_struct *);
  
 +#ifdef CONFIG_PPC_ADV_DEBUG_REGS
 +extern void switch_booke_debug_regs(struct thread_struct *new_thread);
 +#endif

We usually don't bother guarding function declarations.

-- 
Cheers,
Stephen Rothwells...@canb.auug.org.au


pgp_yJYfcoXUd.pgp
Description: PGP signature


[PATCH 0/8 v4] KVM: PPC: IOMMU in-kernel handling

2013-06-26 Thread Alexey Kardashevskiy
The changes are:
1. rebased on v3.10-rc7
2. removed spinlocks from real mode
3. added security checks between KVM and VFIO

More details are in the individual patch comments.


Alexey Kardashevskiy (8):
  KVM: PPC: reserve a capability number for multitce support
  KVM: PPC: reserve a capability and ioctl numbers for realmode VFIO
  vfio: add external user support
  hashtable: add hash_for_each_possible_rcu_notrace()
  powerpc: Prepare to support kernel handling of IOMMU map/unmap
  KVM: PPC: Add support for multiple-TCE hcalls
  KVM: PPC: Add support for IOMMU in-kernel handling
  KVM: PPC: Add hugepage support for IOMMU in-kernel handling

 Documentation/virtual/kvm/api.txt|   51 +++
 arch/powerpc/include/asm/kvm_host.h  |   31 ++
 arch/powerpc/include/asm/kvm_ppc.h   |   18 +-
 arch/powerpc/include/asm/pgtable-ppc64.h |4 +
 arch/powerpc/include/uapi/asm/kvm.h  |8 +
 arch/powerpc/kvm/book3s_64_vio.c |  506 +-
 arch/powerpc/kvm/book3s_64_vio_hv.c  |  439 --
 arch/powerpc/kvm/book3s_hv.c |   41 ++-
 arch/powerpc/kvm/book3s_hv_rmhandlers.S  |6 +
 arch/powerpc/kvm/book3s_pr_papr.c|   37 ++-
 arch/powerpc/kvm/powerpc.c   |   15 +
 arch/powerpc/mm/init_64.c|   78 -
 drivers/vfio/vfio.c  |   53 
 include/linux/hashtable.h|   15 +
 include/linux/page-flags.h   |4 +-
 include/uapi/linux/kvm.h |3 +
 16 files changed, 1279 insertions(+), 30 deletions(-)

-- 
1.7.10.4



[PATCH 5/8] powerpc: Prepare to support kernel handling of IOMMU map/unmap

2013-06-26 Thread Alexey Kardashevskiy
The current VFIO-on-POWER implementation supports only user mode
driven mapping, i.e. QEMU is sending requests to map/unmap pages.
However this approach is really slow, so we want to move that to KVM.
Since H_PUT_TCE can be extremely performance sensitive (especially with
network adapters where each packet needs to be mapped/unmapped) we chose
to implement that as a fast hypercall directly in real
mode (processor still in the guest context but MMU off).

To be able to do that, we need to provide some facilities to
access the struct page count within that real mode environment as things
like the sparsemem vmemmap mappings aren't accessible.

This adds an API to increment/decrement page counter as
get_user_pages API used for user mode mapping does not work
in the real mode.

CONFIG_SPARSEMEM_VMEMMAP and CONFIG_FLATMEM are supported.
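
To illustrate how these primitives are meant to be consumed, a minimal
sketch of a real-mode caller follows; the wrapper function is
hypothetical, only the realmode_* helpers come from this patch:

/*
 * Hypothetical real-mode caller: take a reference on the page backing
 * @pfn, or bail out so the hypercall is retried in virtual mode.
 */
static long kvmppc_rm_try_ref_page(unsigned long pfn)
{
    struct page *page = realmode_pfn_to_page(pfn);

    if (!page)
        return H_TOO_HARD;  /* split page struct: redo in virtual mode */

    if (realmode_get_page(page))
        return H_TOO_HARD;  /* huge or zero-refcount page: redo in virtual mode */

    /* ... install the translation; realmode_put_page() on undo ... */
    return H_SUCCESS;
}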

Reviewed-by: Paul Mackerras pau...@samba.org
Cc: David Gibson da...@gibson.dropbear.id.au
Signed-off-by: Paul Mackerras pau...@samba.org
Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru

---

Changes:
2013/06/27:
* realmode_get_page() fixed to use get_page_unless_zero(). If failed,
the call will be passed from real to virtual mode and safely handled.
* added comment to PageCompound() in include/linux/page-flags.h.

2013/05/20:
* PageTail() is replaced by PageCompound() in order to have the same checks
for whether the page is huge in realmode_get_page() and realmode_put_page()
---
 arch/powerpc/include/asm/pgtable-ppc64.h |4 ++
 arch/powerpc/mm/init_64.c|   78 +-
 include/linux/page-flags.h   |4 +-
 3 files changed, 84 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/include/asm/pgtable-ppc64.h 
b/arch/powerpc/include/asm/pgtable-ppc64.h
index e3d55f6f..7b46e5f 100644
--- a/arch/powerpc/include/asm/pgtable-ppc64.h
+++ b/arch/powerpc/include/asm/pgtable-ppc64.h
@@ -376,6 +376,10 @@ static inline pte_t *find_linux_pte_or_hugepte(pgd_t 
*pgdir, unsigned long ea,
 }
 #endif /* !CONFIG_HUGETLB_PAGE */
 
+struct page *realmode_pfn_to_page(unsigned long pfn);
+int realmode_get_page(struct page *page);
+int realmode_put_page(struct page *page);
+
 #endif /* __ASSEMBLY__ */
 
 #endif /* _ASM_POWERPC_PGTABLE_PPC64_H_ */
diff --git a/arch/powerpc/mm/init_64.c b/arch/powerpc/mm/init_64.c
index a90b9c4..7031be3 100644
--- a/arch/powerpc/mm/init_64.c
+++ b/arch/powerpc/mm/init_64.c
@@ -297,5 +297,81 @@ void vmemmap_free(unsigned long start, unsigned long end)
 {
 }
 
-#endif /* CONFIG_SPARSEMEM_VMEMMAP */
+/*
+ * We do not have access to the sparsemem vmemmap, so we fallback to
+ * walking the list of sparsemem blocks which we already maintain for
+ * the sake of crashdump. In the long run, we might want to maintain
+ * a tree if performance of that linear walk becomes a problem.
+ *
+ * Any of the realmode_ functions can fail due to:
+ * 1) As real sparsemem blocks do not lie in RAM contiguously (they
+ * are in virtual address space which is not available in the real mode),
+ * the requested page struct can be split between blocks so get_page/put_page
+ * may fail.
+ * 2) When huge pages are used, the get_page/put_page API will fail
+ * in real mode as the linked addresses in the page struct are virtual
+ * too.
+ * When 1) or 2) takes place, the API returns an error code to cause
+ * an exit to kernel virtual mode where the operation will be completed.
+ */
+struct page *realmode_pfn_to_page(unsigned long pfn)
+{
+   struct vmemmap_backing *vmem_back;
+   struct page *page;
+   unsigned long page_size = 1 << mmu_psize_defs[mmu_vmemmap_psize].shift;
+   unsigned long pg_va = (unsigned long) pfn_to_page(pfn);
+
+   for (vmem_back = vmemmap_list; vmem_back; vmem_back = vmem_back->list) {
+   if (pg_va < vmem_back->virt_addr)
+   continue;
 
+   /* Check that page struct is not split between real pages */
+   if ((pg_va + sizeof(struct page)) >
+   (vmem_back->virt_addr + page_size))
+   return NULL;
+
+   page = (struct page *) (vmem_back->phys + pg_va -
+   vmem_back->virt_addr);
+   return page;
+   }
+
+   return NULL;
+}
+EXPORT_SYMBOL_GPL(realmode_pfn_to_page);
+
+#elif defined(CONFIG_FLATMEM)
+
+struct page *realmode_pfn_to_page(unsigned long pfn)
+{
+   struct page *page = pfn_to_page(pfn);
+   return page;
+}
+EXPORT_SYMBOL_GPL(realmode_pfn_to_page);
+
+#endif /* CONFIG_SPARSEMEM_VMEMMAP/CONFIG_FLATMEM */
+
+#if defined(CONFIG_SPARSEMEM_VMEMMAP) || defined(CONFIG_FLATMEM)
+int realmode_get_page(struct page *page)
+{
+   if (PageCompound(page))
+   return -EAGAIN;
+
+   if (!get_page_unless_zero(page))
+   return -EAGAIN;
+
+   return 0;
+}
+EXPORT_SYMBOL_GPL(realmode_get_page);
+
+int realmode_put_page(struct page *page)
+{
+   if (PageCompound(page))
+   return -EAGAIN;
+
+   

[PATCH 7/8] KVM: PPC: Add support for IOMMU in-kernel handling

2013-06-26 Thread Alexey Kardashevskiy
This allows the host kernel to handle H_PUT_TCE, H_PUT_TCE_INDIRECT
and H_STUFF_TCE requests without passing them to QEMU, which saves time
on switching to QEMU and back.

Both real and virtual modes are supported. First the kernel tries to
handle a TCE request in real mode; if that fails, it passes the request
to virtual mode to complete the operation. If the virtual mode handler
also fails, the request is passed to user mode.

This adds a new KVM_CREATE_SPAPR_TCE_IOMMU ioctl (advertised by the
KVM_CAP_SPAPR_TCE_IOMMU capability) to associate a virtual PCI bus ID
(LIOBN) with an IOMMU group, which enables in-kernel handling of IOMMU
map/unmap. The external user API support in VFIO is required.

Tests show that this patch increases transmission speed from 220MB/s
to 750..1020MB/s on 10Gb network (Chelsea CXGB3 10Gb ethernet card).
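
As a sketch of how user space would wire a LIOBN to a VFIO group with
the new ioctl; vm_fd, liobn and group_id are placeholders, the struct
and ioctl number come from this patch:

/*
 * Hypothetical QEMU-side use of the new ioctl; the values would come
 * from the machine model and from VFIO respectively.
 */
#include <stdint.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

static int link_liobn_to_iommu_group(int vm_fd, uint64_t liobn,
                                     uint32_t group_id)
{
    struct kvm_create_spapr_tce_iommu args = {
        .liobn = liobn,
        .iommu_id = group_id,
        .flags = 0,     /* no flags are defined yet */
    };

    return ioctl(vm_fd, KVM_CREATE_SPAPR_TCE_IOMMU, &args);
}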

Signed-off-by: Paul Mackerras pau...@samba.org
Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru

---

Changes:
2013/06/27:
* tce_list page is referenced now in order to protect it from accident
invalidation during H_PUT_TCE_INDIRECT execution
* added use of the external user VFIO API

2013/06/05:
* changed capability number
* changed ioctl number
* update the doc article number

2013/05/20:
* removed get_user() from real mode handlers
* kvm_vcpu_arch::tce_tmp usage extended. Now real mode handler puts there
translated TCEs, tries realmode_get_page() on those and if it fails, it
passes control over the virtual mode handler which tries to finish
the request handling
* kvmppc_lookup_pte() now does realmode_get_page() protected by BUSY bit
on a page
* The only reason to pass the request to user mode now is when the user mode
did not register TCE table in the kernel, in all other cases the virtual mode
handler is expected to do the job
---
 Documentation/virtual/kvm/api.txt   |   26 
 arch/powerpc/include/asm/kvm_host.h |4 +
 arch/powerpc/include/asm/kvm_ppc.h  |2 +
 arch/powerpc/include/uapi/asm/kvm.h |8 +
 arch/powerpc/kvm/book3s_64_vio.c|  294 ++-
 arch/powerpc/kvm/book3s_64_vio_hv.c |  165 
 arch/powerpc/kvm/powerpc.c  |   12 ++
 7 files changed, 509 insertions(+), 2 deletions(-)

diff --git a/Documentation/virtual/kvm/api.txt 
b/Documentation/virtual/kvm/api.txt
index 762c703..01b0dc2 100644
--- a/Documentation/virtual/kvm/api.txt
+++ b/Documentation/virtual/kvm/api.txt
@@ -2387,6 +2387,32 @@ slows operations a lot.
 Unlike other capabilities of this section, this one is always enabled.
 
 
+4.87 KVM_CREATE_SPAPR_TCE_IOMMU
+
+Capability: KVM_CAP_SPAPR_TCE_IOMMU
+Architectures: powerpc
+Type: vm ioctl
+Parameters: struct kvm_create_spapr_tce_iommu (in)
+Returns: 0 on success, -1 on error
+
+struct kvm_create_spapr_tce_iommu {
+   __u64 liobn;
+   __u32 iommu_id;
+   __u32 flags;
+};
+
+This creates a link between IOMMU group and a hardware TCE (translation
+control entry) table. This link lets the host kernel know what IOMMU
+group (i.e. TCE table) to use for the LIOBN number passed with
+H_PUT_TCE, H_PUT_TCE_INDIRECT, H_STUFF_TCE hypercalls.
+
+In response to a TCE hypercall, the kernel looks for a TCE table descriptor
+in the list and handles the hypercall in real or virtual modes if
+the descriptor is found. Otherwise the hypercall is passed to the user mode.
+
+No flag is supported at the moment.
+
+
 5. The kvm_run structure
 
 
diff --git a/arch/powerpc/include/asm/kvm_host.h 
b/arch/powerpc/include/asm/kvm_host.h
index 3bf407b..716ab18 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -180,6 +180,8 @@ struct kvmppc_spapr_tce_table {
struct kvm *kvm;
u64 liobn;
u32 window_size;
+   struct iommu_group *grp;/* used for IOMMU groups */
+   struct file *vfio_filp; /* used for IOMMU groups */
struct page *pages[0];
 };
 
@@ -611,6 +613,8 @@ struct kvm_vcpu_arch {
u64 busy_preempt;
 
unsigned long *tce_tmp;/* TCE cache for TCE_PUT_INDIRECT hcall */
+   unsigned long tce_tmp_num; /* Number of handled TCEs in the cache */
+   unsigned long tce_reason;  /* The reason of switching to the virtmode */
 #endif
 };
 
diff --git a/arch/powerpc/include/asm/kvm_ppc.h 
b/arch/powerpc/include/asm/kvm_ppc.h
index e852921b..934e01d 100644
--- a/arch/powerpc/include/asm/kvm_ppc.h
+++ b/arch/powerpc/include/asm/kvm_ppc.h
@@ -133,6 +133,8 @@ extern int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu);
 
 extern long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm,
struct kvm_create_spapr_tce *args);
+extern long kvm_vm_ioctl_create_spapr_tce_iommu(struct kvm *kvm,
+   struct kvm_create_spapr_tce_iommu *args);
 extern struct kvmppc_spapr_tce_table *kvmppc_find_tce_table(
struct kvm_vcpu *vcpu, unsigned long liobn);
 extern long kvmppc_emulated_validate_tce(unsigned long tce);
diff --git a/arch/powerpc/include/uapi/asm/kvm.h 

[PATCH 8/8] KVM: PPC: Add hugepage support for IOMMU in-kernel handling

2013-06-26 Thread Alexey Kardashevskiy
This adds special support for huge pages (16MB).  The reference
counting cannot be easily done for such pages in real mode (when
MMU is off) so we added a list of huge pages.  It is populated in
virtual mode and get_page is called just once per huge page.
Real mode handlers check if the requested page is huge and in the list,
then no reference counting is done, otherwise an exit to virtual mode
happens.  The list is released at KVM exit.  At the moment the fastest
card available for tests uses up to 9 huge pages so walking through this
list is not very expensive.  However this can change and we may want
to optimize this.
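
For clarity, a sketch of what the real-mode lookup against this
hashtable could look like; the function name and the miss handling are
hypothetical, while KVMPPC_HUGEPAGE_HASH, the hashtable and
hash_for_each_possible_rcu_notrace() come from this series:

/*
 * Hypothetical real-mode lookup: translate a guest physical address to
 * a host physical address if the backing 16MB page is already cached.
 */
static unsigned long kvmppc_rm_hugepage_gpa_to_hpa(
        struct kvmppc_spapr_tce_table *tt, unsigned long gpa)
{
    struct kvmppc_iommu_hugepage *hp;
    unsigned key = KVMPPC_HUGEPAGE_HASH(gpa);

    hash_for_each_possible_rcu_notrace(tt->hash_tab, hp, hash_node, key) {
        if ((gpa >= hp->gpa) && (gpa < hp->gpa + hp->size))
            /* no get_page in real mode: the hashtable holds the ref */
            return hp->hpa + (gpa - hp->gpa);
    }

    return -1UL;    /* not cached yet: exit to virtual mode to add it */
}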

Signed-off-by: Paul Mackerras pau...@samba.org
Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru

---

Changes:
2013/06/27:
* list of huge pages replaces with hashtable for better performance
* spinlock removed from real mode and only protects insertion of new
huge page descriptors into the hashtable

2013/06/05:
* fixed compile error when CONFIG_IOMMU_API=n

2013/05/20:
* the real mode handler now searches for a huge page by gpa (used to be pte)
* the virtual mode handler prints warning if it is called twice for the same
huge page as the real mode handler is expected to fail just once - when a huge
page is not in the list yet.
* the huge page is refcounted twice - when added to the hugepage list and
when used in the virtual mode hcall handler (can be optimized but it will
make the patch less nice).
---
 arch/powerpc/include/asm/kvm_host.h |   25 +
 arch/powerpc/kvm/book3s_64_vio.c|   95 +--
 arch/powerpc/kvm/book3s_64_vio_hv.c |   24 +++--
 3 files changed, 138 insertions(+), 6 deletions(-)

diff --git a/arch/powerpc/include/asm/kvm_host.h 
b/arch/powerpc/include/asm/kvm_host.h
index 716ab18..0ad6189 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -30,6 +30,7 @@
 #include linux/kvm_para.h
 #include linux/list.h
 #include linux/atomic.h
+#include linux/hashtable.h
 #include asm/kvm_asm.h
 #include asm/processor.h
 #include asm/page.h
@@ -182,9 +183,33 @@ struct kvmppc_spapr_tce_table {
u32 window_size;
struct iommu_group *grp;/* used for IOMMU groups */
struct file *vfio_filp; /* used for IOMMU groups */
+   DECLARE_HASHTABLE(hash_tab, ilog2(64)); /* used for IOMMU groups */
+   spinlock_t hugepages_write_lock;/* used for IOMMU groups */
struct page *pages[0];
 };
 
+/*
+ * The KVM guest can be backed with 16MB pages.
+ * In this case, we cannot do page counting from the real mode
+ * as the compound pages are used - they are linked in a list
+ * with pointers as virtual addresses which are inaccessible
+ * in real mode.
+ *
+ * The code below keeps a 16MB pages list and uses page struct
+ * in real mode if it is already locked in RAM and inserted into
+ * the list or switches to the virtual mode where it can be
+ * handled in a usual manner.
+ */
+#define KVMPPC_HUGEPAGE_HASH(gpa)  hash_32(gpa >> 24, 32)
+
+struct kvmppc_iommu_hugepage {
+   struct hlist_node hash_node;
+   unsigned long gpa;  /* Guest physical address */
+   unsigned long hpa;  /* Host physical address */
+   struct page *page;  /* page struct of the very first subpage */
+   unsigned long size; /* Huge page size (always 16MB at the moment) */
+};
+
 struct kvmppc_linear_info {
void*base_virt;
unsigned longbase_pfn;
diff --git a/arch/powerpc/kvm/book3s_64_vio.c b/arch/powerpc/kvm/book3s_64_vio.c
index a5d0195..6cedfe9 100644
--- a/arch/powerpc/kvm/book3s_64_vio.c
+++ b/arch/powerpc/kvm/book3s_64_vio.c
@@ -47,6 +47,78 @@
 #define TCES_PER_PAGE  (PAGE_SIZE / sizeof(u64))
 #define ERROR_ADDR  ((void *)~(unsigned long)0x0)
 
+#ifdef CONFIG_IOMMU_API
+/* Adds a new huge page descriptor to the hashtable */
+static long kvmppc_iommu_hugepage_try_add(
+   struct kvmppc_spapr_tce_table *tt,
+   pte_t pte, unsigned long hva, unsigned long gpa,
+   unsigned long pg_size)
+{
+   long ret = 0;
+   struct kvmppc_iommu_hugepage *hp;
+   struct page *pg;
+   unsigned key = KVMPPC_HUGEPAGE_HASH(gpa);
+
+   spin_lock(&tt->hugepages_write_lock);
+   hash_for_each_possible_rcu(tt->hash_tab, hp, hash_node, key) {
+   if (KVMPPC_HUGEPAGE_HASH(hp->gpa) != key)
+   continue;
+   if ((gpa < hp->gpa) || (gpa >= hp->gpa + hp->size))
+   continue;
+   goto unlock_exit;
+   }
+
+   hva = hva & ~(pg_size - 1);
+   ret = get_user_pages_fast(hva, 1, true/*write*/, &pg);
+   if ((ret != 1) || !pg) {
+   ret = -EFAULT;
+   goto unlock_exit;
+   }
+   ret = 0;
+
+   hp = kzalloc(sizeof(*hp), GFP_KERNEL);
+   if (!hp) {
+   ret = -ENOMEM;
+   goto unlock_exit;
+   }
+
+   hp->page = pg;
+   

[PATCH 6/8] KVM: PPC: Add support for multiple-TCE hcalls

2013-06-26 Thread Alexey Kardashevskiy
This adds real mode handlers for the H_PUT_TCE_INDIRECT and
H_STUFF_TCE hypercalls for QEMU emulated devices such as IBMVIO
devices or emulated PCI.  These calls allow adding multiple entries
(up to 512) into the TCE table in one call which saves time on
transition to/from real mode.

This adds a tce_tmp cache to kvm_vcpu_arch to save valid TCEs
(copied from user and verified) before writing the whole list into
the TCE table. This cache will be utilized more in the upcoming
VFIO/IOMMU support to continue TCE list processing in the virtual
mode in the case if the real mode handler failed for some reason.

This adds a guest physical to host real address converter
and calls the existing H_PUT_TCE handler. The converting function
is going to be fully utilized by upcoming VFIO supporting patches.

This also implements the KVM_CAP_PPC_MULTITCE capability,
so in order to support the functionality of this patch, QEMU
needs to query for this capability and set the "hcall-multi-tce"
hypertas property only if the capability is present, otherwise
there will be serious performance degradation.
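
A hedged sketch of the QEMU-side check described above;
kvm_check_extension() and kvm_state are existing QEMU interfaces, the
wrapper name is made up:

/*
 * Illustrative QEMU-side capability check: only advertise
 * "hcall-multi-tce" in ibm,hypertas-functions when the kernel
 * really handles these hypercalls.
 */
static bool kvmppc_has_cap_spapr_multitce(void)
{
    return kvm_check_extension(kvm_state, KVM_CAP_SPAPR_MULTITCE) > 0;
}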

Signed-off-by: Paul Mackerras pau...@samba.org
Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru

---
Changelog:
2013/06/27:
* fixed clear of BUSY bit in kvmppc_lookup_pte()
* H_PUT_TCE_INDIRECT does realmode_get_page() now
* KVM_CAP_SPAPR_MULTITCE now depends on CONFIG_PPC_BOOK3S_64
* updated doc

2013/06/05:
* fixed mistype about IBMVIO in the commit message
* updated doc and moved it to another section
* changed capability number

2013/05/21:
* added kvm_vcpu_arch::tce_tmp
* removed cleanup if put_indirect failed, instead we do not even start
writing to TCE table if we cannot get TCEs from the user and they are
invalid
* kvmppc_emulated_h_put_tce is split to kvmppc_emulated_put_tce
and kvmppc_emulated_validate_tce (for the previous item)
* fixed bug with failthrough for H_IPI
* removed all get_user() from real mode handlers
* kvmppc_lookup_pte() added (instead of making lookup_linux_pte public)
---
 Documentation/virtual/kvm/api.txt   |   25 +++
 arch/powerpc/include/asm/kvm_host.h |2 +
 arch/powerpc/include/asm/kvm_ppc.h  |   16 +-
 arch/powerpc/kvm/book3s_64_vio.c|  123 ++
 arch/powerpc/kvm/book3s_64_vio_hv.c |  270 +++
 arch/powerpc/kvm/book3s_hv.c|   41 -
 arch/powerpc/kvm/book3s_hv_rmhandlers.S |6 +
 arch/powerpc/kvm/book3s_pr_papr.c   |   37 -
 arch/powerpc/kvm/powerpc.c  |3 +
 9 files changed, 490 insertions(+), 33 deletions(-)

diff --git a/Documentation/virtual/kvm/api.txt 
b/Documentation/virtual/kvm/api.txt
index 6365fef..762c703 100644
--- a/Documentation/virtual/kvm/api.txt
+++ b/Documentation/virtual/kvm/api.txt
@@ -2362,6 +2362,31 @@ calls by the guest for that service will be passed to 
userspace to be
 handled.
 
 
+4.86 KVM_CAP_PPC_MULTITCE
+
+Capability: KVM_CAP_PPC_MULTITCE
+Architectures: ppc
+Type: vm
+
+This capability means the kernel is capable of handling hypercalls
+H_PUT_TCE_INDIRECT and H_STUFF_TCE without passing those into the user
+space. This significantly accelerates DMA operations for PPC KVM guests.
+The user space should expect that its handlers for these hypercalls
+are not going to be called.
+
+In order to enable H_PUT_TCE_INDIRECT and H_STUFF_TCE use in the guest,
+the user space might have to advertise it for the guest. For example,
+IBM pSeries guest starts using them if "hcall-multi-tce" is present in
+the "ibm,hypertas-functions" device-tree property.
+
+Without this capability, only H_PUT_TCE is handled by the kernel and
+therefore the use of H_PUT_TCE_INDIRECT and H_STUFF_TCE is not recommended
+unless the capability is present as passing hypercalls to the userspace
+slows operations a lot.
+
+Unlike other capabilities of this section, this one is always enabled.
+
+
 5. The kvm_run structure
 
 
diff --git a/arch/powerpc/include/asm/kvm_host.h 
b/arch/powerpc/include/asm/kvm_host.h
index af326cd..3bf407b 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -609,6 +609,8 @@ struct kvm_vcpu_arch {
spinlock_t tbacct_lock;
u64 busy_stolen;
u64 busy_preempt;
+
+   unsigned long *tce_tmp;/* TCE cache for TCE_PUT_INDIRECT hcall */
 #endif
 };
 
diff --git a/arch/powerpc/include/asm/kvm_ppc.h 
b/arch/powerpc/include/asm/kvm_ppc.h
index a5287fe..e852921b 100644
--- a/arch/powerpc/include/asm/kvm_ppc.h
+++ b/arch/powerpc/include/asm/kvm_ppc.h
@@ -133,8 +133,20 @@ extern int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu);
 
 extern long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm,
struct kvm_create_spapr_tce *args);
-extern long kvmppc_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn,
-unsigned long ioba, unsigned long tce);
+extern struct kvmppc_spapr_tce_table *kvmppc_find_tce_table(
+   struct kvm_vcpu *vcpu, 

[PATCH 4/8] hashtable: add hash_for_each_possible_rcu_notrace()

2013-06-26 Thread Alexey Kardashevskiy
This adds hash_for_each_possible_rcu_notrace() which is basically
a notrace clone of hash_for_each_possible_rcu() which cannot be
used in real mode due to its tracing/debugging capability.

Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru
---
 include/linux/hashtable.h |   15 +++
 1 file changed, 15 insertions(+)

diff --git a/include/linux/hashtable.h b/include/linux/hashtable.h
index a9df51f..af8b169 100644
--- a/include/linux/hashtable.h
+++ b/include/linux/hashtable.h
@@ -174,6 +174,21 @@ static inline void hash_del_rcu(struct hlist_node *node)
member)
 
 /**
+ * hash_for_each_possible_rcu_notrace - iterate over all possible objects
+ * hashing to the same bucket in an rcu enabled hashtable
+ * @name: hashtable to iterate
+ * @obj: the type * to use as a loop cursor for each entry
+ * @member: the name of the hlist_node within the struct
+ * @key: the key of the objects to iterate over
+ *
+ * This is the same as hash_for_each_possible_rcu() except that it does
+ * not do any RCU debugging or tracing.
+ */
+#define hash_for_each_possible_rcu_notrace(name, obj, member, key) \
+   hlist_for_each_entry_rcu_notrace(obj, name[hash_min(key, 
HASH_BITS(name))],\
+   member)
+
+/**
  * hash_for_each_possible_safe - iterate over all possible objects hashing to 
the
  * same bucket safe against removals
  * @name: hashtable to iterate
-- 
1.7.10.4



[PATCH 3/8] vfio: add external user support

2013-06-26 Thread Alexey Kardashevskiy
VFIO is designed to be used via ioctls on file descriptors
returned by VFIO.

However in some situations support for an external user is required.
The first user is KVM on PPC64 (SPAPR TCE protocol) which is going to
use the existing VFIO groups for exclusive access in real/virtual mode
in the host kernel to avoid passing map/unmap requests to user
space, which would make things pretty slow.

The proposed protocol includes:

1. do normal VFIO init stuff such as opening a new container, attaching
group(s) to it, setting an IOMMU driver for a container. When IOMMU is
set for a container, all groups in it are considered ready to use by
an external user.

2. pass a fd of the group we want to accelerate to KVM. KVM calls
vfio_group_iommu_id_from_file() to verify if the group is initialized
and IOMMU is set for it. The current TCE IOMMU driver marks the whole
IOMMU table as busy when IOMMU is set for a container; this prevents
other DMA users from allocating from it, so it is safe to pass the group
to the user space.

3. KVM increases the container users counter via
vfio_group_add_external_user(). This prevents the VFIO group from
being disposed prior to exiting KVM.

4. When KVM is finished and doing cleanup, it releases the group file
and decrements the container users counter. Everything gets released.

5. KVM also keeps the group file as otherwise its fd might have been
closed at the moment of KVM finish so vfio_group_del_external_user()
call will not be possible.

The "vfio: Limit group opens" patch is also required for consistency.
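
Put together, a sketch of the consumer (KVM) side of the protocol above;
the surrounding function is hypothetical, the three vfio_group_* calls
are the ones exported below:

/*
 * Hypothetical KVM-side consumer of the external user API,
 * following steps 2-5 of the protocol.
 */
static int example_attach_vfio_group(int group_fd, int *iommu_id)
{
    struct file *filp = fget(group_fd);
    int ret;

    if (!filp)
        return -EBADF;

    /* step 2: check the group is viable and learn its IOMMU ID */
    ret = vfio_group_iommu_id_from_file(filp);
    if (ret < 0)
        goto err_fput;
    *iommu_id = ret;

    /* step 3: pin the container for the lifetime of this user */
    ret = vfio_group_add_external_user(filp);
    if (ret)
        goto err_fput;

    /*
     * steps 4-5: keep filp until teardown, then call
     * vfio_group_del_external_user(filp) followed by fput(filp).
     */
    return 0;

err_fput:
    fput(filp);
    return ret;
}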

Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru
---
 drivers/vfio/vfio.c |   53 +++
 1 file changed, 53 insertions(+)

diff --git a/drivers/vfio/vfio.c b/drivers/vfio/vfio.c
index c488da5..54192b2 100644
--- a/drivers/vfio/vfio.c
+++ b/drivers/vfio/vfio.c
@@ -1370,6 +1370,59 @@ static const struct file_operations vfio_device_fops = {
 };
 
 /**
+ * External user API, exported by symbols to be linked dynamically.
+ */
+
+/* Allows an external user (for example, KVM) to lock an IOMMU group */
+static int vfio_group_add_external_user(struct file *filep)
+{
+   struct vfio_group *group = filep->private_data;
+
+   if (filep->f_op != &vfio_group_fops)
+   return -EINVAL;
+
+   if (!atomic_inc_not_zero(&group->container_users))
+   return -EINVAL;
+
+   return 0;
+}
+EXPORT_SYMBOL_GPL(vfio_group_add_external_user);
+
+/* Allows an external user (for example, KVM) to unlock an IOMMU group */
+static void vfio_group_del_external_user(struct file *filep)
+{
+   struct vfio_group *group = filep->private_data;
+
+   BUG_ON(filep->f_op != &vfio_group_fops);
+
+   vfio_group_try_dissolve_container(group);
+}
+EXPORT_SYMBOL_GPL(vfio_group_del_external_user);
+
+/*
+ * Checks if a group for the specified file can be used by
+ * an external user and returns the IOMMU ID if external use is possible.
+ */
+static int vfio_group_iommu_id_from_file(struct file *filep)
+{
+   int ret;
+   struct vfio_group *group = filep->private_data;
+
+   if (WARN_ON(filep->f_op != &vfio_group_fops))
+   return -EINVAL;
+
+   if (0 == atomic_read(&group->container_users) ||
+   !group->container->iommu_driver ||
+   !vfio_group_viable(group))
+   return -EINVAL;
+
+   ret = iommu_group_id(group->iommu_group);
+
+   return ret;
+}
+EXPORT_SYMBOL_GPL(vfio_group_iommu_id_from_file);
+
+/**
  * Module/class support
  */
 static char *vfio_devnode(struct device *dev, umode_t *mode)
-- 
1.7.10.4



[PATCH 1/8] KVM: PPC: reserve a capability number for multitce support

2013-06-26 Thread Alexey Kardashevskiy
Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru
---
 include/uapi/linux/kvm.h |1 +
 1 file changed, 1 insertion(+)

diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index d88c8ee..970b1f5 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -666,6 +666,7 @@ struct kvm_ppc_smmu_info {
 #define KVM_CAP_IRQ_MPIC 90
 #define KVM_CAP_PPC_RTAS 91
 #define KVM_CAP_IRQ_XICS 92
+#define KVM_CAP_SPAPR_MULTITCE 93
 
 #ifdef KVM_CAP_IRQ_ROUTING
 
-- 
1.7.10.4



[PATCH 2/8] KVM: PPC: reserve a capability and ioctl numbers for realmode VFIO

2013-06-26 Thread Alexey Kardashevskiy
Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru
---
 include/uapi/linux/kvm.h |2 ++
 1 file changed, 2 insertions(+)

diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index 970b1f5..0865c01 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -667,6 +667,7 @@ struct kvm_ppc_smmu_info {
 #define KVM_CAP_PPC_RTAS 91
 #define KVM_CAP_IRQ_XICS 92
 #define KVM_CAP_SPAPR_MULTITCE 93
+#define KVM_CAP_SPAPR_TCE_IOMMU 94
 
 #ifdef KVM_CAP_IRQ_ROUTING
 
@@ -923,6 +924,7 @@ struct kvm_s390_ucas_mapping {
 /* Available with KVM_CAP_PPC_ALLOC_HTAB */
 #define KVM_PPC_ALLOCATE_HTAB _IOWR(KVMIO, 0xa7, __u32)
 #define KVM_CREATE_SPAPR_TCE _IOW(KVMIO,  0xa8, struct 
kvm_create_spapr_tce)
+#define KVM_CREATE_SPAPR_TCE_IOMMU _IOW(KVMIO,  0xaf, struct 
kvm_create_spapr_tce_iommu)
 /* Available with KVM_CAP_RMA */
 #define KVM_ALLOCATE_RMA _IOR(KVMIO,  0xa9, struct kvm_allocate_rma)
 /* Available with KVM_CAP_PPC_HTAB_FD */
-- 
1.7.10.4



Re: [PATCH 4/6 v5] KVM: PPC: exit to user space on ehpriv instruction

2013-06-26 Thread tiejun.chen

On 06/26/2013 01:42 PM, Bharat Bhushan wrote:

The ehpriv instruction is used by user space for setting software
breakpoints. This patch adds support for exiting to user space
with run->debug holding the relevant information.

As this is the first point at which we use run->debug, this also defines
the run->debug structure.

Signed-off-by: Bharat Bhushan bharat.bhus...@freescale.com
---
  arch/powerpc/include/asm/disassemble.h |4 
  arch/powerpc/include/uapi/asm/kvm.h|   21 +
  arch/powerpc/kvm/e500_emulate.c|   27 +++
  3 files changed, 48 insertions(+), 4 deletions(-)

diff --git a/arch/powerpc/include/asm/disassemble.h 
b/arch/powerpc/include/asm/disassemble.h
index 9b198d1..856f8de 100644
--- a/arch/powerpc/include/asm/disassemble.h
+++ b/arch/powerpc/include/asm/disassemble.h
@@ -77,4 +77,8 @@ static inline unsigned int get_d(u32 inst)
return inst & 0xffff;
  }

+static inline unsigned int get_oc(u32 inst)
+{
+   return (inst >> 11) & 0x7fff;
+}
  #endif /* __ASM_PPC_DISASSEMBLE_H__ */
diff --git a/arch/powerpc/include/uapi/asm/kvm.h 
b/arch/powerpc/include/uapi/asm/kvm.h
index 0fb1a6e..ded0607 100644
--- a/arch/powerpc/include/uapi/asm/kvm.h
+++ b/arch/powerpc/include/uapi/asm/kvm.h
@@ -269,7 +269,24 @@ struct kvm_fpu {
__u64 fpr[32];
  };

+/*
+ * Defines for h/w breakpoint, watchpoint (read, write or both) and
+ * software breakpoint.
+ * These are used as type in KVM_SET_GUEST_DEBUG ioctl and status
+ * for KVM_DEBUG_EXIT.
+ */
+#define KVMPPC_DEBUG_NONE  0x0
+#define KVMPPC_DEBUG_BREAKPOINT    (1UL << 1)
+#define KVMPPC_DEBUG_WATCH_WRITE   (1UL << 2)
+#define KVMPPC_DEBUG_WATCH_READ    (1UL << 3)
  struct kvm_debug_exit_arch {
+   __u64 address;
+   /*
+* exiting to userspace because of h/w breakpoint, watchpoint
+* (read, write or both) and software breakpoint.
+*/
+   __u32 status;
+   __u32 reserved;
  };

  /* for KVM_SET_GUEST_DEBUG */
@@ -281,10 +298,6 @@ struct kvm_guest_debug_arch {
 * Type denotes h/w breakpoint, read watchpoint, write
 * watchpoint or watchpoint (both read and write).
 */
-#define KVMPPC_DEBUG_NONE  0x0
-#define KVMPPC_DEBUG_BREAKPOINT    (1UL << 1)
-#define KVMPPC_DEBUG_WATCH_WRITE   (1UL << 2)
-#define KVMPPC_DEBUG_WATCH_READ    (1UL << 3)
__u32 type;
__u32 reserved;
} bp[16];
diff --git a/arch/powerpc/kvm/e500_emulate.c b/arch/powerpc/kvm/e500_emulate.c
index b10a012..dab9d07 100644
--- a/arch/powerpc/kvm/e500_emulate.c
+++ b/arch/powerpc/kvm/e500_emulate.c
@@ -26,6 +26,8 @@
  #define XOP_TLBRE   946
  #define XOP_TLBWE   978
  #define XOP_TLBILX  18
+#define XOP_EHPRIV  270
+#define EHPRIV_OC_DEBUG 0


I think the OC = 0 case is a bit special since, IIRC, if the OC
operand is omitted it equals 0 by default. So I think we should start the OC
values from 1 or some other magic number.


And if possible, we should add some comments describing this to make the OC
definition readable.


Tiejun



  #ifdef CONFIG_KVM_E500MC
  static int dbell2prio(ulong param)
@@ -82,6 +84,26 @@ static int kvmppc_e500_emul_msgsnd(struct kvm_vcpu *vcpu, 
int rb)
  }
  #endif

+static int kvmppc_e500_emul_ehpriv(struct kvm_run *run, struct kvm_vcpu *vcpu,
+  unsigned int inst, int *advance)
+{
+   int emulated = EMULATE_DONE;
+
+   switch (get_oc(inst)) {
+   case EHPRIV_OC_DEBUG:
+   run->exit_reason = KVM_EXIT_DEBUG;
+   run->debug.arch.address = vcpu->arch.pc;
+   run->debug.arch.status = 0;
+   kvmppc_account_exit(vcpu, DEBUG_EXITS);
+   emulated = EMULATE_EXIT_USER;
+   *advance = 0;
+   break;
+   default:
+   emulated = EMULATE_FAIL;
+   }
+   return emulated;
+}
+
  int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu,
 unsigned int inst, int *advance)
  {
@@ -130,6 +152,11 @@ int kvmppc_core_emulate_op(struct kvm_run *run, struct 
kvm_vcpu *vcpu,
emulated = kvmppc_e500_emul_tlbivax(vcpu, ea);
break;

+   case XOP_EHPRIV:
+   emulated = kvmppc_e500_emul_ehpriv(run, vcpu, inst,
+  advance);
+   break;
+
default:
emulated = EMULATE_FAIL;
}
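
To make the OC encoding discussed above concrete, here is a small
self-contained demonstration of how get_oc() recovers the OC operand. The
encoder assumes the e500mc layout (primary opcode 31, a 15-bit OC field,
XO 270); it is illustrative, not taken from the patch.

#include <stdio.h>
#include <stdint.h>

/* Mirrors get_oc() from the patch */
static unsigned int get_oc(uint32_t inst)
{
    return (inst >> 11) & 0x7fff;
}

/* Illustrative encoder under the assumed field layout */
static uint32_t encode_ehpriv(unsigned int oc)
{
    return (31u << 26) | ((oc & 0x7fff) << 11) | (270u << 1);
}

int main(void)
{
    printf("oc = %u\n", get_oc(encode_ehpriv(1))); /* prints: oc = 1 */
    return 0;
}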





Re: [PATCH 4/6 v5] KVM: PPC: exit to user space on ehpriv instruction

2013-06-26 Thread tiejun.chen

On 06/26/2013 04:44 PM, Bhushan Bharat-R65777 wrote:




-Original Message-
From: tiejun.chen [mailto:tiejun.c...@windriver.com]
Sent: Wednesday, June 26, 2013 12:25 PM
To: Bhushan Bharat-R65777
Cc: kvm-ppc@vger.kernel.org; k...@vger.kernel.org; ag...@suse.de; Wood Scott-
B07421; b...@kernel.crashing.org; linuxppc-...@lists.ozlabs.org; linux-
ker...@vger.kernel.org; mi...@neuling.org; Bhushan Bharat-R65777
Subject: Re: [PATCH 4/6 v5] KVM: PPC: exit to user space on ehpriv instruction

On 06/26/2013 01:42 PM, Bharat Bhushan wrote:

The ehpriv instruction is used by user space for setting software
breakpoints. This patch adds support for exiting to user space
with run->debug holding the relevant information.

As this is the first point at which we use run->debug, this also defines
the run->debug structure.

Signed-off-by: Bharat Bhushan bharat.bhus...@freescale.com
---
   arch/powerpc/include/asm/disassemble.h |4 
   arch/powerpc/include/uapi/asm/kvm.h|   21 +
   arch/powerpc/kvm/e500_emulate.c|   27 +++
   3 files changed, 48 insertions(+), 4 deletions(-)

diff --git a/arch/powerpc/include/asm/disassemble.h

b/arch/powerpc/include/asm/disassemble.h

index 9b198d1..856f8de 100644
--- a/arch/powerpc/include/asm/disassemble.h
+++ b/arch/powerpc/include/asm/disassemble.h
@@ -77,4 +77,8 @@ static inline unsigned int get_d(u32 inst)
return inst & 0xffff;
   }

+static inline unsigned int get_oc(u32 inst)
+{
+   return (inst >> 11) & 0x7fff;
+}
   #endif /* __ASM_PPC_DISASSEMBLE_H__ */
diff --git a/arch/powerpc/include/uapi/asm/kvm.h

b/arch/powerpc/include/uapi/asm/kvm.h

index 0fb1a6e..ded0607 100644
--- a/arch/powerpc/include/uapi/asm/kvm.h
+++ b/arch/powerpc/include/uapi/asm/kvm.h
@@ -269,7 +269,24 @@ struct kvm_fpu {
__u64 fpr[32];
   };

+/*
+ * Defines for h/w breakpoint, watchpoint (read, write or both) and
+ * software breakpoint.
+ * These are used as type in KVM_SET_GUEST_DEBUG ioctl and status
+ * for KVM_DEBUG_EXIT.
+ */
+#define KVMPPC_DEBUG_NONE  0x0
+#define KVMPPC_DEBUG_BREAKPOINT    (1UL << 1)
+#define KVMPPC_DEBUG_WATCH_WRITE   (1UL << 2)
+#define KVMPPC_DEBUG_WATCH_READ    (1UL << 3)
   struct kvm_debug_exit_arch {
+   __u64 address;
+   /*
+* exiting to userspace because of h/w breakpoint, watchpoint
+* (read, write or both) and software breakpoint.
+*/
+   __u32 status;
+   __u32 reserved;
   };

   /* for KVM_SET_GUEST_DEBUG */
@@ -281,10 +298,6 @@ struct kvm_guest_debug_arch {
 * Type denotes h/w breakpoint, read watchpoint, write
 * watchpoint or watchpoint (both read and write).
 */
-#define KVMPPC_DEBUG_NONE  0x0
-#define KVMPPC_DEBUG_BREAKPOINT    (1UL << 1)
-#define KVMPPC_DEBUG_WATCH_WRITE   (1UL << 2)
-#define KVMPPC_DEBUG_WATCH_READ    (1UL << 3)
__u32 type;
__u32 reserved;
} bp[16];
diff --git a/arch/powerpc/kvm/e500_emulate.c b/arch/powerpc/kvm/e500_emulate.c
index b10a012..dab9d07 100644
--- a/arch/powerpc/kvm/e500_emulate.c
+++ b/arch/powerpc/kvm/e500_emulate.c
@@ -26,6 +26,8 @@
   #define XOP_TLBRE   946
   #define XOP_TLBWE   978
   #define XOP_TLBILX  18
+#define XOP_EHPRIV  270
+#define EHPRIV_OC_DEBUG 0


I think the OC = 0 case is a bit special since, IIRC, if the OC
operand is omitted it equals 0 by default. So I think we should start the OC
values from 1 or some other magic number.


The ehpriv instruction is defined to be used as:
ehpriv OC // where OC can be 0, 1, ... n
and in its extended form it can be used as:
ehpriv // with no OC, which assumes OC = 0
So OC = 0 is not special; "ehpriv" is the same as "ehpriv 0".


Yes, this is just what I mean.



I cannot think of any special reason to reserve "ehpriv" and "ehpriv 0".


So I would still prefer to reserve 'ehpriv' without the OC operand as a simple
way to test or develop something for KVM quickly, because it is really
convenient to trap into the hypervisor with a single 'ehpriv' instruction.


But I have no further objection if you guys are fine with this ;-)

Tiejun



Thanks
-Bharat



And if possible, we should add some comments describing this to make the OC
definition readable.

Tiejun



   #ifdef CONFIG_KVM_E500MC
   static int dbell2prio(ulong param)
@@ -82,6 +84,26 @@ static int kvmppc_e500_emul_msgsnd(struct kvm_vcpu *vcpu,

int rb)

   }
   #endif

+static int kvmppc_e500_emul_ehpriv(struct kvm_run *run, struct kvm_vcpu

*vcpu,

+  unsigned int inst, int *advance)
+{
+   int emulated = EMULATE_DONE;
+
+   switch (get_oc(inst)) {
+   case EHPRIV_OC_DEBUG:
+   run->exit_reason = KVM_EXIT_DEBUG;
+   run->debug.arch.address = vcpu->arch.pc;
+   run->debug.arch.status = 0;
+   kvmppc_account_exit(vcpu, DEBUG_EXITS);
+   emulated = 

Re: [PATCH 3/6 v5] powerpc: export debug registers save function for KVM

2013-06-26 Thread Stephen Rothwell
Hi,

On Wed, 26 Jun 2013 11:12:23 +0530 Bharat Bhushan r65...@freescale.com wrote:

 diff --git a/arch/powerpc/include/asm/switch_to.h 
 b/arch/powerpc/include/asm/switch_to.h
 index 200d763..50b357f 100644
 --- a/arch/powerpc/include/asm/switch_to.h
 +++ b/arch/powerpc/include/asm/switch_to.h
 @@ -30,6 +30,10 @@ extern void enable_kernel_spe(void);
  extern void giveup_spe(struct task_struct *);
  extern void load_up_spe(struct task_struct *);
  
 +#ifdef CONFIG_PPC_ADV_DEBUG_REGS
 +extern void switch_booke_debug_regs(struct thread_struct *new_thread);
 +#endif

We usually don't bother guarding function declarations.

-- 
Cheers,
Stephen Rothwell  s...@canb.auug.org.au




[PATCH 0/8 v4] KVM: PPC: IOMMU in-kernel handling

2013-06-26 Thread Alexey Kardashevskiy
The changes are:
1. rebased on v3.10-rc7
2. removed spinlocks from real mode
3. added security checks between KVM and VFIO

More details are in the individual patch comments.


Alexey Kardashevskiy (8):
  KVM: PPC: reserve a capability number for multitce support
  KVM: PPC: reserve a capability and ioctl numbers for realmode VFIO
  vfio: add external user support
  hashtable: add hash_for_each_possible_rcu_notrace()
  powerpc: Prepare to support kernel handling of IOMMU map/unmap
  KVM: PPC: Add support for multiple-TCE hcalls
  KVM: PPC: Add support for IOMMU in-kernel handling
  KVM: PPC: Add hugepage support for IOMMU in-kernel handling

 Documentation/virtual/kvm/api.txt|   51 +++
 arch/powerpc/include/asm/kvm_host.h  |   31 ++
 arch/powerpc/include/asm/kvm_ppc.h   |   18 +-
 arch/powerpc/include/asm/pgtable-ppc64.h |4 +
 arch/powerpc/include/uapi/asm/kvm.h  |8 +
 arch/powerpc/kvm/book3s_64_vio.c |  506 +-
 arch/powerpc/kvm/book3s_64_vio_hv.c  |  439 --
 arch/powerpc/kvm/book3s_hv.c |   41 ++-
 arch/powerpc/kvm/book3s_hv_rmhandlers.S  |6 +
 arch/powerpc/kvm/book3s_pr_papr.c|   37 ++-
 arch/powerpc/kvm/powerpc.c   |   15 +
 arch/powerpc/mm/init_64.c|   78 -
 drivers/vfio/vfio.c  |   53 
 include/linux/hashtable.h|   15 +
 include/linux/page-flags.h   |4 +-
 include/uapi/linux/kvm.h |3 +
 16 files changed, 1279 insertions(+), 30 deletions(-)

-- 
1.7.10.4



[PATCH 7/8] KVM: PPC: Add support for IOMMU in-kernel handling

2013-06-26 Thread Alexey Kardashevskiy
This allows the host kernel to handle H_PUT_TCE, H_PUT_TCE_INDIRECT
and H_STUFF_TCE requests without passing them to QEMU, which saves time
on switching to QEMU and back.

Both real and virtual modes are supported. The kernel first tries to
handle a TCE request in real mode; if that fails, it passes the request to
virtual mode to complete the operation. If the virtual mode
handler also fails, the request is passed to user mode.

This adds a new KVM_CAP_SPAPR_TCE_IOMMU ioctl to associate
a virtual PCI bus ID (LIOBN) with an IOMMU group which enables
in-kernel handling of IOMMU map/unmap. The external user API support
in VFIO is required.

Tests show that this patch increases transmission speed from 220MB/s
to 750..1020MB/s on a 10Gb network (Chelsio CXGB3 10Gb Ethernet card).
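
From the userspace side, using the new ioctl might look like the hypothetical
snippet below (vm_fd is the KVM VM file descriptor and iommu_id comes from
VFIO; the headers must come from a kernel carrying this patch).

#include <stdio.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

/* Hypothetical helper: link a guest LIOBN to a VFIO IOMMU group */
static int link_liobn_to_group(int vm_fd, __u64 liobn, __u32 iommu_id)
{
    struct kvm_create_spapr_tce_iommu args = {
        .liobn    = liobn,
        .iommu_id = iommu_id,
        .flags    = 0,              /* no flags are defined yet */
    };

    if (ioctl(vm_fd, KVM_CREATE_SPAPR_TCE_IOMMU, &args) < 0) {
        perror("KVM_CREATE_SPAPR_TCE_IOMMU");
        return -1;
    }
    return 0;
}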

Signed-off-by: Paul Mackerras pau...@samba.org
Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru

---

Changes:
2013/06/27:
* the tce_list page is now referenced in order to protect it from accidental
invalidation during H_PUT_TCE_INDIRECT execution
* added use of the external user VFIO API

2013/06/05:
* changed capability number
* changed ioctl number
* update the doc article number

2013/05/20:
* removed get_user() from real mode handlers
* kvm_vcpu_arch::tce_tmp usage extended. The real mode handler now puts
translated TCEs there, tries realmode_get_page() on them and, if that fails,
passes control to the virtual mode handler which tries to finish
the request handling
* kvmppc_lookup_pte() now does realmode_get_page() protected by BUSY bit
on a page
* The only reason to pass the request to user mode now is when the user mode
did not register TCE table in the kernel, in all other cases the virtual mode
handler is expected to do the job
---
 Documentation/virtual/kvm/api.txt   |   26 
 arch/powerpc/include/asm/kvm_host.h |4 +
 arch/powerpc/include/asm/kvm_ppc.h  |2 +
 arch/powerpc/include/uapi/asm/kvm.h |8 +
 arch/powerpc/kvm/book3s_64_vio.c|  294 ++-
 arch/powerpc/kvm/book3s_64_vio_hv.c |  165 
 arch/powerpc/kvm/powerpc.c  |   12 ++
 7 files changed, 509 insertions(+), 2 deletions(-)

diff --git a/Documentation/virtual/kvm/api.txt 
b/Documentation/virtual/kvm/api.txt
index 762c703..01b0dc2 100644
--- a/Documentation/virtual/kvm/api.txt
+++ b/Documentation/virtual/kvm/api.txt
@@ -2387,6 +2387,32 @@ slows operations a lot.
 Unlike other capabilities of this section, this one is always enabled.
 
 
+4.87 KVM_CREATE_SPAPR_TCE_IOMMU
+
+Capability: KVM_CAP_SPAPR_TCE_IOMMU
+Architectures: powerpc
+Type: vm ioctl
+Parameters: struct kvm_create_spapr_tce_iommu (in)
+Returns: 0 on success, -1 on error
+
+struct kvm_create_spapr_tce_iommu {
+   __u64 liobn;
+   __u32 iommu_id;
+   __u32 flags;
+};
+
+This creates a link between an IOMMU group and a hardware TCE (translation
+control entry) table. This link lets the host kernel know what IOMMU
+group (i.e. TCE table) to use for the LIOBN number passed with
+H_PUT_TCE, H_PUT_TCE_INDIRECT, H_STUFF_TCE hypercalls.
+
+In response to a TCE hypercall, the kernel looks for a TCE table descriptor
+in the list and handles the hypercall in real or virtual modes if
+the descriptor is found. Otherwise the hypercall is passed to the user mode.
+
+No flag is supported at the moment.
+
+
 5. The kvm_run structure
 
 
diff --git a/arch/powerpc/include/asm/kvm_host.h 
b/arch/powerpc/include/asm/kvm_host.h
index 3bf407b..716ab18 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -180,6 +180,8 @@ struct kvmppc_spapr_tce_table {
struct kvm *kvm;
u64 liobn;
u32 window_size;
+   struct iommu_group *grp;/* used for IOMMU groups */
+   struct file *vfio_filp; /* used for IOMMU groups */
struct page *pages[0];
 };
 
@@ -611,6 +613,8 @@ struct kvm_vcpu_arch {
u64 busy_preempt;
 
unsigned long *tce_tmp;/* TCE cache for TCE_PUT_INDIRECT hcall */
+   unsigned long tce_tmp_num; /* Number of handled TCEs in the cache */
+   unsigned long tce_reason;  /* The reason of switching to the virtmode */
 #endif
 };
 
diff --git a/arch/powerpc/include/asm/kvm_ppc.h 
b/arch/powerpc/include/asm/kvm_ppc.h
index e852921b..934e01d 100644
--- a/arch/powerpc/include/asm/kvm_ppc.h
+++ b/arch/powerpc/include/asm/kvm_ppc.h
@@ -133,6 +133,8 @@ extern int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu);
 
 extern long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm,
struct kvm_create_spapr_tce *args);
+extern long kvm_vm_ioctl_create_spapr_tce_iommu(struct kvm *kvm,
+   struct kvm_create_spapr_tce_iommu *args);
 extern struct kvmppc_spapr_tce_table *kvmppc_find_tce_table(
struct kvm_vcpu *vcpu, unsigned long liobn);
 extern long kvmppc_emulated_validate_tce(unsigned long tce);
diff --git a/arch/powerpc/include/uapi/asm/kvm.h 

[PATCH 8/8] KVM: PPC: Add hugepage support for IOMMU in-kernel handling

2013-06-26 Thread Alexey Kardashevskiy
This adds special support for huge pages (16MB).  The reference
counting cannot be easily done for such pages in real mode (when
MMU is off) so we added a list of huge pages.  It is populated in
virtual mode and get_page is called just once per huge page.
Real mode handlers check if the requested page is huge and in the list,
then no reference counting is done, otherwise an exit to virtual mode
happens.  The list is released at KVM exit.  At the moment the fastest
card available for tests uses up to 9 huge pages so walking through this
list is not very expensive.  However this can change and we may want
to optimize this.
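
Schematically, the real mode lookup described above can be pictured like
this. It is a sketch built on the structures added below (the function
hugepage_gpa_to_hpa() itself is made up), not a copy of the real handler.

/* Sketch: translate gpa through the pinned huge page hashtable, or
 * fail so the caller exits to virtual mode. Illustrative only. */
static unsigned long hugepage_gpa_to_hpa(struct kvmppc_spapr_tce_table *tt,
                                         unsigned long gpa)
{
    struct kvmppc_iommu_hugepage *hp;
    unsigned key = KVMPPC_HUGEPAGE_HASH(gpa);

    hash_for_each_possible_rcu_notrace(tt->hash_tab, hp, hash_node, key) {
        if ((gpa < hp->gpa) || (gpa >= hp->gpa + hp->size))
            continue;
        /* already refcounted once in virtual mode, no get_page here */
        return hp->hpa + (gpa & (hp->size - 1));
    }
    return -1UL;    /* not in the hashtable: exit to virtual mode */
}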

Signed-off-by: Paul Mackerras pau...@samba.org
Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru

---

Changes:
2013/06/27:
* the list of huge pages was replaced with a hashtable for better performance
* the spinlock was removed from real mode and now only protects insertion of new
huge page descriptors into the hashtable

2013/06/05:
* fixed compile error when CONFIG_IOMMU_API=n

2013/05/20:
* the real mode handler now searches for a huge page by gpa (used to be pte)
* the virtual mode handler prints a warning if it is called twice for the same
huge page as the real mode handler is expected to fail just once - when a huge
page is not in the list yet.
* the huge page is refcounted twice - when added to the hugepage list and
when used in the virtual mode hcall handler (can be optimized but it will
make the patch less nice).
---
 arch/powerpc/include/asm/kvm_host.h |   25 +
 arch/powerpc/kvm/book3s_64_vio.c|   95 +--
 arch/powerpc/kvm/book3s_64_vio_hv.c |   24 +++--
 3 files changed, 138 insertions(+), 6 deletions(-)

diff --git a/arch/powerpc/include/asm/kvm_host.h 
b/arch/powerpc/include/asm/kvm_host.h
index 716ab18..0ad6189 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -30,6 +30,7 @@
 #include <linux/kvm_para.h>
 #include <linux/list.h>
 #include <linux/atomic.h>
+#include <linux/hashtable.h>
 #include <asm/kvm_asm.h>
 #include <asm/processor.h>
 #include <asm/page.h>
@@ -182,9 +183,33 @@ struct kvmppc_spapr_tce_table {
u32 window_size;
struct iommu_group *grp;/* used for IOMMU groups */
struct file *vfio_filp; /* used for IOMMU groups */
+   DECLARE_HASHTABLE(hash_tab, ilog2(64)); /* used for IOMMU groups */
+   spinlock_t hugepages_write_lock;/* used for IOMMU groups */
struct page *pages[0];
 };
 
+/*
+ * The KVM guest can be backed with 16MB pages.
+ * In this case, we cannot do page counting from the real mode
+ * as the compound pages are used - they are linked in a list
+ * with pointers as virtual addresses which are inaccessible
+ * in real mode.
+ *
+ * The code below keeps a 16MB pages list and uses page struct
+ * in real mode if it is already locked in RAM and inserted into
+ * the list or switches to the virtual mode where it can be
+ * handled in a usual manner.
+ */
+#define KVMPPC_HUGEPAGE_HASH(gpa)  hash_32(gpa >> 24, 32)
+
+struct kvmppc_iommu_hugepage {
+   struct hlist_node hash_node;
+   unsigned long gpa;  /* Guest physical address */
+   unsigned long hpa;  /* Host physical address */
+   struct page *page;  /* page struct of the very first subpage */
+   unsigned long size; /* Huge page size (always 16MB at the moment) */
+};
+
 struct kvmppc_linear_info {
void*base_virt;
unsigned longbase_pfn;
diff --git a/arch/powerpc/kvm/book3s_64_vio.c b/arch/powerpc/kvm/book3s_64_vio.c
index a5d0195..6cedfe9 100644
--- a/arch/powerpc/kvm/book3s_64_vio.c
+++ b/arch/powerpc/kvm/book3s_64_vio.c
@@ -47,6 +47,78 @@
 #define TCES_PER_PAGE  (PAGE_SIZE / sizeof(u64))
 #define ERROR_ADDR  ((void *)~(unsigned long)0x0)
 
+#ifdef CONFIG_IOMMU_API
+/* Adds a new huge page descriptor to the hashtable */
+static long kvmppc_iommu_hugepage_try_add(
+   struct kvmppc_spapr_tce_table *tt,
+   pte_t pte, unsigned long hva, unsigned long gpa,
+   unsigned long pg_size)
+{
+   long ret = 0;
+   struct kvmppc_iommu_hugepage *hp;
+   struct page *pg;
+   unsigned key = KVMPPC_HUGEPAGE_HASH(gpa);
+
+   spin_lock(&tt->hugepages_write_lock);
+   hash_for_each_possible_rcu(tt->hash_tab, hp, hash_node, key) {
+   if (KVMPPC_HUGEPAGE_HASH(hp->gpa) != key)
+   continue;
+   if ((gpa < hp->gpa) || (gpa >= hp->gpa + hp->size))
+   continue;
+   goto unlock_exit;
+   }
+
+   hva = hva  ~(pg_size - 1);
+   ret = get_user_pages_fast(hva, 1, true/*write*/, &pg);
+   if ((ret != 1) || !pg) {
+   ret = -EFAULT;
+   goto unlock_exit;
+   }
+   ret = 0;
+
+   hp = kzalloc(sizeof(*hp), GFP_KERNEL);
+   if (!hp) {
+   ret = -ENOMEM;
+   goto unlock_exit;
+   }
+
+   hp->page = pg;
+   

[PATCH 5/8] powerpc: Prepare to support kernel handling of IOMMU map/unmap

2013-06-26 Thread Alexey Kardashevskiy
The current VFIO-on-POWER implementation supports only user mode
driven mapping, i.e. QEMU is sending requests to map/unmap pages.
However this approach is really slow, so we want to move that to KVM.
Since H_PUT_TCE can be extremely performance sensitive (especially with
network adapters where each packet needs to be mapped/unmapped) we chose
to implement that as a fast hypercall directly in real
mode (processor still in the guest context but MMU off).

To be able to do that, we need to provide some facilities to
access the struct page count within that real mode environment as things
like the sparsemem vmemmap mappings aren't accessible.

This adds an API to increment/decrement the page counter, as
the get_user_pages API used for user mode mapping does not work
in real mode.

CONFIG_SPARSEMEM_VMEMMAP and CONFIG_FLATMEM are supported.
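
As a rough usage illustration, a real mode caller would grab a reference
along these lines, bailing out to virtual mode whenever the API returns an
error (realmode_ref_page() is a made-up name for this sketch).

/* Sketch: take a reference in real mode; any failure means the
 * hcall is retried in virtual mode where get_user_pages() works. */
static long realmode_ref_page(unsigned long pfn)
{
    struct page *page = realmode_pfn_to_page(pfn);

    if (!page)
        return -EAGAIN;         /* page struct spans sparsemem blocks */

    if (realmode_get_page(page))
        return -EAGAIN;         /* compound page or zero refcount */

    /* ... use the page, release it later with realmode_put_page() ... */
    return 0;
}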

Reviewed-by: Paul Mackerras pau...@samba.org
Cc: David Gibson da...@gibson.dropbear.id.au
Signed-off-by: Paul Mackerras pau...@samba.org
Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru

---

Changes:
2013/06/27:
* realmode_get_page() fixed to use get_page_unless_zero(). If that fails,
the call will be passed from real to virtual mode and safely handled.
* added comment to PageCompound() in include/linux/page-flags.h.

2013/05/20:
* PageTail() is replaced by PageCompound() in order to have the same checks
for whether the page is huge in realmode_get_page() and realmode_put_page()
---
 arch/powerpc/include/asm/pgtable-ppc64.h |4 ++
 arch/powerpc/mm/init_64.c|   78 +-
 include/linux/page-flags.h   |4 +-
 3 files changed, 84 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/include/asm/pgtable-ppc64.h 
b/arch/powerpc/include/asm/pgtable-ppc64.h
index e3d55f6f..7b46e5f 100644
--- a/arch/powerpc/include/asm/pgtable-ppc64.h
+++ b/arch/powerpc/include/asm/pgtable-ppc64.h
@@ -376,6 +376,10 @@ static inline pte_t *find_linux_pte_or_hugepte(pgd_t 
*pgdir, unsigned long ea,
 }
 #endif /* !CONFIG_HUGETLB_PAGE */
 
+struct page *realmode_pfn_to_page(unsigned long pfn);
+int realmode_get_page(struct page *page);
+int realmode_put_page(struct page *page);
+
 #endif /* __ASSEMBLY__ */
 
 #endif /* _ASM_POWERPC_PGTABLE_PPC64_H_ */
diff --git a/arch/powerpc/mm/init_64.c b/arch/powerpc/mm/init_64.c
index a90b9c4..7031be3 100644
--- a/arch/powerpc/mm/init_64.c
+++ b/arch/powerpc/mm/init_64.c
@@ -297,5 +297,81 @@ void vmemmap_free(unsigned long start, unsigned long end)
 {
 }
 
-#endif /* CONFIG_SPARSEMEM_VMEMMAP */
+/*
+ * We do not have access to the sparsemem vmemmap, so we fall back to
+ * walking the list of sparsemem blocks which we already maintain for
+ * the sake of crashdump. In the long run, we might want to maintain
+ * a tree if performance of that linear walk becomes a problem.
+ *
+ * Any of the realmode_ functions can fail due to:
+ * 1) As real sparsemem blocks do not lie in RAM contiguously (they
+ * are in virtual address space which is not available in real mode),
+ * the requested page struct can be split between blocks so get_page/put_page
+ * may fail.
+ * 2) When huge pages are used, the get_page/put_page API will fail
+ * in real mode as the linked addresses in the page struct are virtual
+ * too.
+ * When 1) or 2) takes place, the API returns an error code to cause
+ * an exit to kernel virtual mode where the operation will be completed.
+ */
+struct page *realmode_pfn_to_page(unsigned long pfn)
+{
+   struct vmemmap_backing *vmem_back;
+   struct page *page;
+   unsigned long page_size = 1 << mmu_psize_defs[mmu_vmemmap_psize].shift;
+   unsigned long pg_va = (unsigned long) pfn_to_page(pfn);
+
+   for (vmem_back = vmemmap_list; vmem_back; vmem_back = vmem_back->list) {
+   if (pg_va < vmem_back->virt_addr)
+   continue;
+
+   /* Check that page struct is not split between real pages */
+   if ((pg_va + sizeof(struct page)) >
+   (vmem_back->virt_addr + page_size))
+   return NULL;
+
+   page = (struct page *) (vmem_back->phys + pg_va -
+   vmem_back->virt_addr);
+   return page;
+   }
+
+   return NULL;
+}
+EXPORT_SYMBOL_GPL(realmode_pfn_to_page);
+
+#elif defined(CONFIG_FLATMEM)
+
+struct page *realmode_pfn_to_page(unsigned long pfn)
+{
+   struct page *page = pfn_to_page(pfn);
+   return page;
+}
+EXPORT_SYMBOL_GPL(realmode_pfn_to_page);
+
+#endif /* CONFIG_SPARSEMEM_VMEMMAP/CONFIG_FLATMEM */
+
+#if defined(CONFIG_SPARSEMEM_VMEMMAP) || defined(CONFIG_FLATMEM)
+int realmode_get_page(struct page *page)
+{
+   if (PageCompound(page))
+   return -EAGAIN;
+
+   if (!get_page_unless_zero(page))
+   return -EAGAIN;
+
+   return 0;
+}
+EXPORT_SYMBOL_GPL(realmode_get_page);
+
+int realmode_put_page(struct page *page)
+{
+   if (PageCompound(page))
+   return -EAGAIN;
+
+   

[PATCH 6/8] KVM: PPC: Add support for multiple-TCE hcalls

2013-06-26 Thread Alexey Kardashevskiy
This adds real mode handlers for the H_PUT_TCE_INDIRECT and
H_STUFF_TCE hypercalls for QEMU emulated devices such as IBMVIO
devices or emulated PCI.  These calls allow adding multiple entries
(up to 512) into the TCE table in one call which saves time on
transition to/from real mode.

This adds a tce_tmp cache to kvm_vcpu_arch to save valid TCEs
(copied from user and verified) before writing the whole list into
the TCE table. This cache will be utilized more in the upcoming
VFIO/IOMMU support to continue TCE list processing in the virtual
mode in the case if the real mode handler failed for some reason.

This adds a guest physical to host real address converter
and calls the existing H_PUT_TCE handler. The converting function
is going to be fully utilized by upcoming VFIO supporting patches.
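
The validate-then-commit use of the tce_tmp cache can be sketched as
follows. The helper names match the split described in the changelog below,
but the signatures and the surrounding loop are illustrative, not the real
handler.

/* Sketch of H_PUT_TCE_INDIRECT handling with the tce_tmp cache */
static long h_put_tce_indirect_sketch(struct kvm_vcpu *vcpu,
        struct kvmppc_spapr_tce_table *tt,
        unsigned long ioba, unsigned long *tces, long npages)
{
    long i;

    for (i = 0; i < npages; i++) {
        if (kvmppc_emulated_validate_tce(tces[i]))
            return H_PARAMETER;             /* nothing written yet */
        vcpu->arch.tce_tmp[i] = tces[i];    /* cache the valid TCE */
    }
    for (i = 0; i < npages; i++)            /* commit only after all pass */
        kvmppc_emulated_put_tce(tt, ioba + (i << IOMMU_PAGE_SHIFT),
                                vcpu->arch.tce_tmp[i]);
    return H_SUCCESS;
}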

This also implements the KVM_CAP_PPC_MULTITCE capability,
so in order to support the functionality of this patch, QEMU
needs to query for this capability and set the hcall-multi-tce
hypertas property only if the capability is present, otherwise
there will be serious performance degradation.
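
The query QEMU needs to make is an ordinary extension check on /dev/kvm,
roughly as in this illustrative snippet:

#include <sys/ioctl.h>
#include <linux/kvm.h>

/* Returns nonzero if the kernel handles multi-TCE hcalls; only then
 * should userspace advertise "hcall-multi-tce" to the guest. */
static int have_multitce(int kvm_fd)
{
    return ioctl(kvm_fd, KVM_CHECK_EXTENSION, KVM_CAP_SPAPR_MULTITCE) > 0;
}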

Signed-off-by: Paul Mackerras pau...@samba.org
Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru

---
Changelog:
2013/06/27:
* fixed clear of BUSY bit in kvmppc_lookup_pte()
* H_PUT_TCE_INDIRECT does realmode_get_page() now
* KVM_CAP_SPAPR_MULTITCE now depends on CONFIG_PPC_BOOK3S_64
* updated doc

2013/06/05:
* fixed mistype about IBMVIO in the commit message
* updated doc and moved it to another section
* changed capability number

2013/05/21:
* added kvm_vcpu_arch::tce_tmp
* removed cleanup if put_indirect failed; instead we do not even start
writing to the TCE table if we cannot get TCEs from the user or if they are
invalid
* kvmppc_emulated_h_put_tce is split to kvmppc_emulated_put_tce
and kvmppc_emulated_validate_tce (for the previous item)
* fixed bug with fallthrough for H_IPI
* removed all get_user() from real mode handlers
* kvmppc_lookup_pte() added (instead of making lookup_linux_pte public)
---
 Documentation/virtual/kvm/api.txt   |   25 +++
 arch/powerpc/include/asm/kvm_host.h |2 +
 arch/powerpc/include/asm/kvm_ppc.h  |   16 +-
 arch/powerpc/kvm/book3s_64_vio.c|  123 ++
 arch/powerpc/kvm/book3s_64_vio_hv.c |  270 +++
 arch/powerpc/kvm/book3s_hv.c|   41 -
 arch/powerpc/kvm/book3s_hv_rmhandlers.S |6 +
 arch/powerpc/kvm/book3s_pr_papr.c   |   37 -
 arch/powerpc/kvm/powerpc.c  |3 +
 9 files changed, 490 insertions(+), 33 deletions(-)

diff --git a/Documentation/virtual/kvm/api.txt 
b/Documentation/virtual/kvm/api.txt
index 6365fef..762c703 100644
--- a/Documentation/virtual/kvm/api.txt
+++ b/Documentation/virtual/kvm/api.txt
@@ -2362,6 +2362,31 @@ calls by the guest for that service will be passed to 
userspace to be
 handled.
 
 
+4.86 KVM_CAP_PPC_MULTITCE
+
+Capability: KVM_CAP_PPC_MULTITCE
+Architectures: ppc
+Type: vm
+
+This capability means the kernel is capable of handling hypercalls
+H_PUT_TCE_INDIRECT and H_STUFF_TCE without passing those into the user
+space. This significantly accelerates DMA operations for PPC KVM guests.
+The user space should expect that its handlers for these hypercalls
+are not going to be called.
+
+In order to enable H_PUT_TCE_INDIRECT and H_STUFF_TCE use in the guest,
+the user space might have to advertise it for the guest. For example,
+an IBM pSeries guest starts using them if "hcall-multi-tce" is present in
+the ibm,hypertas-functions device-tree property.
+
+Without this capability, only H_PUT_TCE is handled by the kernel and
+therefore the use of H_PUT_TCE_INDIRECT and H_STUFF_TCE is not recommended
+unless the capability is present as passing hypercalls to the userspace
+slows operations a lot.
+
+Unlike other capabilities of this section, this one is always enabled.
+
+
 5. The kvm_run structure
 
 
diff --git a/arch/powerpc/include/asm/kvm_host.h 
b/arch/powerpc/include/asm/kvm_host.h
index af326cd..3bf407b 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -609,6 +609,8 @@ struct kvm_vcpu_arch {
spinlock_t tbacct_lock;
u64 busy_stolen;
u64 busy_preempt;
+
+   unsigned long *tce_tmp;/* TCE cache for TCE_PUT_INDIRECT hcall */
 #endif
 };
 
diff --git a/arch/powerpc/include/asm/kvm_ppc.h 
b/arch/powerpc/include/asm/kvm_ppc.h
index a5287fe..e852921b 100644
--- a/arch/powerpc/include/asm/kvm_ppc.h
+++ b/arch/powerpc/include/asm/kvm_ppc.h
@@ -133,8 +133,20 @@ extern int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu);
 
 extern long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm,
struct kvm_create_spapr_tce *args);
-extern long kvmppc_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn,
-unsigned long ioba, unsigned long tce);
+extern struct kvmppc_spapr_tce_table *kvmppc_find_tce_table(
+   struct kvm_vcpu *vcpu, 

[PATCH 4/8] hashtable: add hash_for_each_possible_rcu_notrace()

2013-06-26 Thread Alexey Kardashevskiy
This adds hash_for_each_possible_rcu_notrace() which is basically
a notrace clone of hash_for_each_possible_rcu() which cannot be
used in real mode due to its tracing/debugging capability.
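
Usage is identical to hash_for_each_possible_rcu(); a minimal sketch with a
made-up structure and table follows.

/* Illustrative only: the struct and table exist just for the example;
 * the caller is expected to be in an RCU read-side section. */
struct obj {
    struct hlist_node node;
    unsigned long key;
};

static DEFINE_HASHTABLE(tbl, 6);    /* 64 buckets */

static struct obj *obj_lookup(unsigned long key)
{
    struct obj *o;

    hash_for_each_possible_rcu_notrace(tbl, o, node, key)
        if (o->key == key)
            return o;
    return NULL;
}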

Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru
---
 include/linux/hashtable.h |   15 +++
 1 file changed, 15 insertions(+)

diff --git a/include/linux/hashtable.h b/include/linux/hashtable.h
index a9df51f..af8b169 100644
--- a/include/linux/hashtable.h
+++ b/include/linux/hashtable.h
@@ -174,6 +174,21 @@ static inline void hash_del_rcu(struct hlist_node *node)
member)
 
 /**
+ * hash_for_each_possible_rcu_notrace - iterate over all possible objects hashing
+ * to the same bucket in an rcu enabled hashtable
+ * @name: hashtable to iterate
+ * @obj: the type * to use as a loop cursor for each entry
+ * @member: the name of the hlist_node within the struct
+ * @key: the key of the objects to iterate over
+ *
+ * This is the same as hash_for_each_possible_rcu() except that it does
+ * not do any RCU debugging or tracing.
+ */
+#define hash_for_each_possible_rcu_notrace(name, obj, member, key) \
+   hlist_for_each_entry_rcu_notrace(obj, &name[hash_min(key, HASH_BITS(name))],\
+   member)
+
+/**
  * hash_for_each_possible_safe - iterate over all possible objects hashing to 
the
  * same bucket safe against removals
  * @name: hashtable to iterate
-- 
1.7.10.4



[PATCH 1/8] KVM: PPC: reserve a capability number for multitce support

2013-06-26 Thread Alexey Kardashevskiy
Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru
---
 include/uapi/linux/kvm.h |1 +
 1 file changed, 1 insertion(+)

diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index d88c8ee..970b1f5 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -666,6 +666,7 @@ struct kvm_ppc_smmu_info {
 #define KVM_CAP_IRQ_MPIC 90
 #define KVM_CAP_PPC_RTAS 91
 #define KVM_CAP_IRQ_XICS 92
+#define KVM_CAP_SPAPR_MULTITCE 93
 
 #ifdef KVM_CAP_IRQ_ROUTING
 
-- 
1.7.10.4



[PATCH 3/8] vfio: add external user support

2013-06-26 Thread Alexey Kardashevskiy
VFIO is designed to be used via ioctls on file descriptors
returned by VFIO.

However in some situations support for an external user is required.
The first user is KVM on PPC64 (SPAPR TCE protocol) which is going to
use the existing VFIO groups for exclusive access in real/virtual mode
in the host kernel to avoid passing map/unmap requests to the user
space, which would make things pretty slow.

The proposed protocol includes:

1. do normal VFIO init stuff such as opening a new container, attaching
group(s) to it, setting an IOMMU driver for a container. When IOMMU is
set for a container, all groups in it are considered ready to use by
an external user.

2. pass a fd of the group we want to accelerate to KVM. KVM calls
vfio_group_iommu_id_from_file() to verify if the group is initialized
and IOMMU is set for it. The current TCE IOMMU driver marks the whole
IOMMU table as busy when IOMMU is set for a container, which prevents
other DMA users from allocating from it, so it is safe to pass the group
to the user space.

3. KVM increases the container users counter via
vfio_group_add_external_user(). This prevents the VFIO group from
being disposed prior to exiting KVM.

4. When KVM is finished and doing cleanup, it releases the group file
and decrements the container users counter. Everything gets released.

5. KVM also keeps a reference to the group file as otherwise its fd might
be closed by the time KVM finishes, making the vfio_group_del_external_user()
call impossible.

The "vfio: Limit group opens" patch is also required for consistency.

Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru
---
 drivers/vfio/vfio.c |   53 +++
 1 file changed, 53 insertions(+)

diff --git a/drivers/vfio/vfio.c b/drivers/vfio/vfio.c
index c488da5..54192b2 100644
--- a/drivers/vfio/vfio.c
+++ b/drivers/vfio/vfio.c
@@ -1370,6 +1370,59 @@ static const struct file_operations vfio_device_fops = {
 };
 
 /**
+ * External user API, exported by symbols to be linked dynamically.
+ */
+
+/* Allows an external user (for example, KVM) to lock an IOMMU group */
+static int vfio_group_add_external_user(struct file *filep)
+{
+   struct vfio_group *group = filep->private_data;
+
+   if (filep->f_op != &vfio_group_fops)
+   return -EINVAL;
+
+   if (!atomic_inc_not_zero(&group->container_users))
+   return -EINVAL;
+
+   return 0;
+}
+EXPORT_SYMBOL_GPL(vfio_group_add_external_user);
+
+/* Allows an external user (for example, KVM) to unlock an IOMMU group */
+static void vfio_group_del_external_user(struct file *filep)
+{
+   struct vfio_group *group = filep->private_data;
+
+   BUG_ON(filep->f_op != &vfio_group_fops);
+
+   vfio_group_try_dissolve_container(group);
+}
+EXPORT_SYMBOL_GPL(vfio_group_del_external_user);
+
+/*
+ * Checks if a group for the specified file can be used by
+ * an external user and returns the IOMMU ID if external use is possible.
+ */
+static int vfio_group_iommu_id_from_file(struct file *filep)
+{
+   int ret;
+   struct vfio_group *group = filep->private_data;
+
+   if (WARN_ON(filep->f_op != &vfio_group_fops))
+   return -EINVAL;
+
+   if (0 == atomic_read(&group->container_users) ||
+   !group->container->iommu_driver ||
+   !vfio_group_viable(group))
+   return -EINVAL;
+
+   ret = iommu_group_id(group->iommu_group);
+
+   return ret;
+}
+EXPORT_SYMBOL_GPL(vfio_group_iommu_id_from_file);
+
+/**
  * Module/class support
  */
 static char *vfio_devnode(struct device *dev, umode_t *mode)
-- 
1.7.10.4
