Re: [PATCH v3 1/4] KVM: nVMX: Rework interception of IRQs and NMIs

2014-03-09 Thread Jan Kiszka
On 2014-03-09 08:33, Paolo Bonzini wrote:
 On 08/03/2014 10:21, Jan Kiszka wrote:
 On 2014-03-07 20:48, Paolo Bonzini wrote:
 On 07/03/2014 20:03, Jan Kiszka wrote:
 @@ -4631,22 +4631,8 @@ static void vmx_set_nmi_mask(struct kvm_vcpu
 *vcpu, bool masked)

  static int vmx_nmi_allowed(struct kvm_vcpu *vcpu)
  {
 -	if (is_guest_mode(vcpu)) {
 -		if (to_vmx(vcpu)->nested.nested_run_pending)
 -			return 0;
 -		if (nested_exit_on_nmi(vcpu)) {
 -			nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI,
 -					  NMI_VECTOR | INTR_TYPE_NMI_INTR |
 -					  INTR_INFO_VALID_MASK, 0);
 -			/*
 -			 * The NMI-triggered VM exit counts as injection:
 -			 * clear this one and block further NMIs.
 -			 */
 -			vcpu->arch.nmi_pending = 0;
 -			vmx_set_nmi_mask(vcpu, true);
 -			return 0;
 -		}
 -	}
 +	if (to_vmx(vcpu)->nested.nested_run_pending)
 +		return 0;

  	if (!cpu_has_virtual_nmis() && to_vmx(vcpu)->soft_vnmi_blocked)
  		return 0;
 @@ -4658,19 +4644,8 @@ static int vmx_nmi_allowed(struct kvm_vcpu
 *vcpu)

  static int vmx_interrupt_allowed(struct kvm_vcpu *vcpu)
  {
 -	if (is_guest_mode(vcpu)) {
 -		if (to_vmx(vcpu)->nested.nested_run_pending)
 -			return 0;
 -		if (nested_exit_on_intr(vcpu)) {
 -			nested_vmx_vmexit(vcpu, EXIT_REASON_EXTERNAL_INTERRUPT,
 -					  0, 0);
 -			/*
 -			 * fall through to normal code, but now in L1, not L2
 -			 */
 -		}
 -	}
 -
 -	return (vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) &&
 +	return (!to_vmx(vcpu)->nested.nested_run_pending &&
 +		vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) &&
  		!(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
  		  (GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS));

 The checks on nested_run_pending are not needed anymore and can be
 replaced with a WARN_ON.  Otherwise,

 Nope, that won't be correct: If we have a pending interrupt that L1 does
 not intercept, we still trigger this condition legally.
 
 Right, this is the case of !nested_exit_on_intr(vcpu) or
 !nested_exit_on_nmi(vcpu).
 
 Why don't we need to request an immediate exit in that case, in order to
 inject the interrupt into L2?

We enable the hardware interrupt/NMI window request for L2 instead.
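A minimal reference sketch of what such a window request looks like on VMX,
modeled on the existing enable_irq_window()/enable_nmi_window() helpers in
vmx.c (illustration only, not the patched code):

	u32 ctl = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);

	/* IRQ window; the NMI window uses CPU_BASED_VIRTUAL_NMI_PENDING. */
	ctl |= CPU_BASED_VIRTUAL_INTR_PENDING;
	vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, ctl);

The CPU then forces a vmexit as soon as L2 can accept the pending event,
and the normal injection path runs from there.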

Jan






Re: [PATCH v3 1/4] KVM: nVMX: Rework interception of IRQs and NMIs

2014-03-09 Thread Paolo Bonzini

On 09/03/2014 08:33, Paolo Bonzini wrote:

On 08/03/2014 10:21, Jan Kiszka wrote:

On 2014-03-07 20:48, Paolo Bonzini wrote:

On 07/03/2014 20:03, Jan Kiszka wrote:

@@ -4631,22 +4631,8 @@ static void vmx_set_nmi_mask(struct kvm_vcpu
*vcpu, bool masked)

 static int vmx_nmi_allowed(struct kvm_vcpu *vcpu)
 {
-	if (is_guest_mode(vcpu)) {
-		if (to_vmx(vcpu)->nested.nested_run_pending)
-			return 0;
-		if (nested_exit_on_nmi(vcpu)) {
-			nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI,
-					  NMI_VECTOR | INTR_TYPE_NMI_INTR |
-					  INTR_INFO_VALID_MASK, 0);
-			/*
-			 * The NMI-triggered VM exit counts as injection:
-			 * clear this one and block further NMIs.
-			 */
-			vcpu->arch.nmi_pending = 0;
-			vmx_set_nmi_mask(vcpu, true);
-			return 0;
-		}
-	}
+	if (to_vmx(vcpu)->nested.nested_run_pending)
+		return 0;

 	if (!cpu_has_virtual_nmis() && to_vmx(vcpu)->soft_vnmi_blocked)
 		return 0;
@@ -4658,19 +4644,8 @@ static int vmx_nmi_allowed(struct kvm_vcpu
*vcpu)

 static int vmx_interrupt_allowed(struct kvm_vcpu *vcpu)
 {
-	if (is_guest_mode(vcpu)) {
-		if (to_vmx(vcpu)->nested.nested_run_pending)
-			return 0;
-		if (nested_exit_on_intr(vcpu)) {
-			nested_vmx_vmexit(vcpu, EXIT_REASON_EXTERNAL_INTERRUPT,
-					  0, 0);
-			/*
-			 * fall through to normal code, but now in L1, not L2
-			 */
-		}
-	}
-
-	return (vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) &&
+	return (!to_vmx(vcpu)->nested.nested_run_pending &&
+		vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) &&
 		!(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
 		  (GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS));


The checks on nested_run_pending are not needed anymore and can be
replaced with a WARN_ON.  Otherwise,


Nope, that won't be correct: If we have a pending interrupt that L1 does
not intercept, we still trigger this condition legally.


Right, this is the case of !nested_exit_on_intr(vcpu) or
!nested_exit_on_nmi(vcpu).

Why don't we need to request an immediate exit in that case, in order to
inject the interrupt into L2?


Nevermind, this makes no sense.  I was confusing *_allowed with 
enable_*_window.


Applying v3 to kvm/queue, thanks!

Paolo



Re: [PATCH v3 1/4] KVM: nVMX: Rework interception of IRQs and NMIs

2014-03-09 Thread Jan Kiszka
On 2014-03-09 09:03, Paolo Bonzini wrote:
 On 09/03/2014 08:33, Paolo Bonzini wrote:
 On 08/03/2014 10:21, Jan Kiszka wrote:
 On 2014-03-07 20:48, Paolo Bonzini wrote:
 On 07/03/2014 20:03, Jan Kiszka wrote:
 @@ -4631,22 +4631,8 @@ static void vmx_set_nmi_mask(struct kvm_vcpu
 *vcpu, bool masked)

  static int vmx_nmi_allowed(struct kvm_vcpu *vcpu)
  {
 -	if (is_guest_mode(vcpu)) {
 -		if (to_vmx(vcpu)->nested.nested_run_pending)
 -			return 0;
 -		if (nested_exit_on_nmi(vcpu)) {
 -			nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI,
 -					  NMI_VECTOR | INTR_TYPE_NMI_INTR |
 -					  INTR_INFO_VALID_MASK, 0);
 -			/*
 -			 * The NMI-triggered VM exit counts as injection:
 -			 * clear this one and block further NMIs.
 -			 */
 -			vcpu->arch.nmi_pending = 0;
 -			vmx_set_nmi_mask(vcpu, true);
 -			return 0;
 -		}
 -	}
 +	if (to_vmx(vcpu)->nested.nested_run_pending)
 +		return 0;

  	if (!cpu_has_virtual_nmis() && to_vmx(vcpu)->soft_vnmi_blocked)
  		return 0;
 @@ -4658,19 +4644,8 @@ static int vmx_nmi_allowed(struct kvm_vcpu
 *vcpu)

  static int vmx_interrupt_allowed(struct kvm_vcpu *vcpu)
  {
 -	if (is_guest_mode(vcpu)) {
 -		if (to_vmx(vcpu)->nested.nested_run_pending)
 -			return 0;
 -		if (nested_exit_on_intr(vcpu)) {
 -			nested_vmx_vmexit(vcpu, EXIT_REASON_EXTERNAL_INTERRUPT,
 -					  0, 0);
 -			/*
 -			 * fall through to normal code, but now in L1, not L2
 -			 */
 -		}
 -	}
 -
 -	return (vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) &&
 +	return (!to_vmx(vcpu)->nested.nested_run_pending &&
 +		vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) &&
  		!(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
  		  (GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS));

 The checks on nested_run_pending are not needed anymore and can be
 replaced with a WARN_ON.  Otherwise,

 Nope, that won't be correct: If we have a pending interrupt that L1 does
 not intercept, we still trigger this condition legally.

 Right, this is the case of !nested_exit_on_intr(vcpu) or
 !nested_exit_on_nmi(vcpu).

 Why don't we need to request an immediate exit in that case, in order to
 inject the interrupt into L2?
 
 Nevermind, this makes no sense.  I was confusing *_allowed with
 enable_*_window.

This code is mind-blowing and probably still not perfect. I wouldn't be
surprised if we keep finding bugs in it until we retire. ;)

 
 Applying v3 to kvm/queue, thanks!
 

Great, thank you!
Jan






Re: [PATCH 0/7] KVM: x86: Let the guest write to multiple debug registers with one vmexit

2014-03-09 Thread Jan Kiszka
On 2014-03-07 12:42, Paolo Bonzini wrote:
 Alex Williamson reported that a Windows game does something weird that
 makes the guest save and restore debug registers on each context switch.
 This causes several hundred thousand vmexits per second, and basically
 cuts performance in half when running under KVM.
 
 However, when not running in guest-debug mode, the guest controls the
 debug registers and having to take an exit for each DR access is a waste
 of time.  We just need one vmexit to load any stale values of DR0-DR6,
 and then we can let the guest run freely.  On the next vmexit (whatever
 the reason) we will read out whatever changes the guest made to the
 debug registers.
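
A sketch of the flow just described, using the flag name the series
introduces (illustrative comment only):

	/*
	 * 1. Guest touches a debug register     -> one MOV DR vmexit.
	 * 2. The handler drops the MOV DR intercepts and sets
	 *    KVM_DEBUGREG_WONT_EXIT in vcpu->arch.switch_db_regs.
	 * 3. Guest reads/writes DR0-DR7 freely, with no further exits.
	 * 4. On the next vmexit, whatever the reason, vcpu_enter_guest()
	 *    syncs DR0-DR6 and DR7 back and re-enables the intercepts.
	 */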
 
 Tested with x86/debug.flat on both Intel and AMD, both direct and
 nested virtualization.
 
 Changes from RFC: changed get_dr7 callback to sync_dirty_debug_regs,
   new patches 5-7.

This now looks good to me from the KVM perspective. I was just wondering
how the case is handled where the host uses debug registers on the thread
that runs a VCPU. What if I set a hw breakpoint on its userspace path,
e.g.? What if I debug the kernel side with kgdb?

Jan

 
 Paolo Bonzini (7):
   KVM: vmx: we do rely on loading DR7 on entry
    KVM: x86: change vcpu->arch.switch_db_regs to a bit mask
   KVM: x86: Allow the guest to run with dirty debug registers
   KVM: vmx: Allow the guest to run with dirty debug registers
   KVM: nVMX: Allow nested guests to run with dirty debug registers
   KVM: svm: set/clear all DR intercepts in one swoop
   KVM: svm: Allow the guest to run with dirty debug registers
 
  arch/x86/include/asm/kvm_host.h |  8 ++++++-
  arch/x86/kvm/svm.c              | 68 ++++++++++++++++++++++++-----------
  arch/x86/kvm/vmx.c              | 43 ++++++++++++++++++------
  arch/x86/kvm/x86.c              | 20 ++++++++++-
  4 files changed, 114 insertions(+), 25 deletions(-)
 






Re: [PATCH 0/7] KVM: x86: Let the guest write to multiple debug registers with one vmexit

2014-03-09 Thread Jan Kiszka
On 2014-03-09 09:11, Jan Kiszka wrote:
 On 2014-03-07 12:42, Paolo Bonzini wrote:
 Alex Williamson reported that a Windows game does something weird that
 makes the guest save and restore debug registers on each context switch.
 This causes several hundred thousand vmexits per second, and basically
 cuts performance in half when running under KVM.

 However, when not running in guest-debug mode, the guest controls the
 debug registers and having to take an exit for each DR access is a waste
 of time.  We just need one vmexit to load any stale values of DR0-DR6,
 and then we can let the guest run freely.  On the next vmexit (whatever
 the reason) we will read out whatever changes the guest made to the
 debug registers.

 Tested with x86/debug.flat on both Intel and AMD, both direct and
 nested virtualization.

 Changes from RFC: changed get_dr7 callback to sync_dirty_debug_regs,
  new patches 5-7.
 
 This now looks good to me from the KVM perspective. I was just wondering
 how the case is handled where the host uses debug registers on the thread
 that runs a VCPU. What if I set a hw breakpoint on its userspace path,
 e.g.? What if I debug the kernel side with kgdb?

Ah, this part didn't change, we still switch between host and guest
state on vmentries/exits. All fine!

Jan






Re: [PATCH 0/7] KVM: x86: Let the guest write to multiple debug registers with one vmexit

2014-03-09 Thread Paolo Bonzini

On 09/03/2014 09:15, Jan Kiszka wrote:

 This now looks good to me from the KVM perspective. I was just wondering
 how the case is handled where the host uses debug registers on the thread
 that runs a VCPU. What if I set a hw breakpoint on its userspace path,
 e.g.? What if I debug the kernel side with kgdb?

Ah, this part didn't change, we still switch between host and guest
state on vmentries/exits. All fine!


Yes, that's why sync_dirty_debug_regs is added so early.  Thanks for the 
review.


Paolo


Re: [PATCH 4/7] KVM: vmx: Allow the guest to run with dirty debug registers

2014-03-09 Thread Radim Krčmář
2014-03-07 12:42+0100, Paolo Bonzini:
 When not running in guest-debug mode (i.e. the guest controls the debug
 registers), having to take an exit for each DR access is a waste of time.
 If the guest gets into a state where each context switch causes DR to be
 saved and restored, this can take away as much as 40% of the execution
 time from the guest.
 
 If the guest is running with vcpu->arch.db == vcpu->arch.eff_db, we
 can let it write freely to the debug registers and reload them on the
 next exit.  We still need to exit on the first access, so that the
 KVM_DEBUGREG_WONT_EXIT flag is set in switch_db_regs; after that, further
 accesses to the debug registers will not cause a vmexit.
 
 Signed-off-by: Paolo Bonzini pbonz...@redhat.com
 ---
  arch/x86/kvm/vmx.c | 37 ++++++++++++++++++++++++++++++++++++-
  1 file changed, 36 insertions(+), 1 deletion(-)
 
 diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
 index 06e4ec877a1c..e377522408b1 100644
 --- a/arch/x86/kvm/vmx.c
 +++ b/arch/x86/kvm/vmx.c
 @@ -2843,7 +2843,7 @@ static __init int setup_vmcs_config(struct vmcs_config 
 *vmcs_conf)
 vmx_capability.ept, vmx_capability.vpid);
   }
  
 - min = 0;
 + min = VM_EXIT_SAVE_DEBUG_CONTROLS;
  #ifdef CONFIG_X86_64
   min |= VM_EXIT_HOST_ADDR_SPACE_SIZE;
  #endif
 @@ -5113,6 +5113,22 @@ static int handle_dr(struct kvm_vcpu *vcpu)
   }
   }
  
 +	if (vcpu->guest_debug == 0) {
 +		u32 cpu_based_vm_exec_control;
 +
 +		cpu_based_vm_exec_control =
 +			vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
 +		cpu_based_vm_exec_control &= ~CPU_BASED_MOV_DR_EXITING;
 +		vmcs_write32(CPU_BASED_VM_EXEC_CONTROL,
 +			     cpu_based_vm_exec_control);

vmcs_clear_bits() covers exactly this use-case.
(Barring the explicit bit-width.)
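
Roughly, assuming the helper's usual (field, mask) form:

	/* equivalent to the three-line read-modify-write above */
	vmcs_clear_bits(CPU_BASED_VM_EXEC_CONTROL, CPU_BASED_MOV_DR_EXITING);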

 +
 +		/*
 +		 * No more DR vmexits; force a reload of the debug registers
 +		 * and reenter on this instruction.  The next vmexit will
 +		 * retrieve the full state of the debug registers.
 +		 */
 +		vcpu->arch.switch_db_regs |= KVM_DEBUGREG_WONT_EXIT;
 +		return 1;
 +	}
 +

We could make the code slightly uglier and move the functional part of
this block before the previous one, so it would do both things in one
exit.  (Exception handler will likely access DR too.)

   exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
  	dr = exit_qualification & DEBUG_REG_ACCESS_NUM;
   reg = DEBUG_REG_ACCESS_REG(exit_qualification);
 @@ -5139,6 +5155,24 @@ static void vmx_set_dr6(struct kvm_vcpu *vcpu, 
 unsigned long val)
  {
  }
  
 +static void vmx_sync_dirty_debug_regs(struct kvm_vcpu *vcpu)
 +{
 +	u32 cpu_based_vm_exec_control;
 +
 +	get_debugreg(vcpu->arch.db[0], 0);
 +	get_debugreg(vcpu->arch.db[1], 1);
 +	get_debugreg(vcpu->arch.db[2], 2);
 +	get_debugreg(vcpu->arch.db[3], 3);
 +	get_debugreg(vcpu->arch.dr6, 6);
 +	vcpu->arch.dr7 = vmcs_readl(GUEST_DR7);
 +
 +	vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_WONT_EXIT;
 +
 +	cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
 +	cpu_based_vm_exec_control |= CPU_BASED_MOV_DR_EXITING;
 +	vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);

(I'll be calling this sneaky later on.)

 +}
 +
  static void vmx_set_dr7(struct kvm_vcpu *vcpu, unsigned long val)
  {
   vmcs_writel(GUEST_DR7, val);
 @@ -8590,6 +8624,7 @@ static struct kvm_x86_ops vmx_x86_ops = {
   .get_dr6 = vmx_get_dr6,
   .set_dr6 = vmx_set_dr6,
   .set_dr7 = vmx_set_dr7,
 + .sync_dirty_debug_regs = vmx_sync_dirty_debug_regs,
   .cache_reg = vmx_cache_reg,
   .get_rflags = vmx_get_rflags,
   .set_rflags = vmx_set_rflags,
 -- 
 1.8.3.1
 
 


Re: [PATCH 3/7] KVM: x86: Allow the guest to run with dirty debug registers

2014-03-09 Thread Radim Krčmář
2014-03-07 12:42+0100, Paolo Bonzini:
 When not running in guest-debug mode, the guest controls the debug
 registers and having to take an exit for each DR access is a waste
 of time.  If the guest gets into a state where each context switch
 causes DR to be saved and restored, this can take away as much as 40%
 of the execution time from the guest.
 
 After this patch, VMX- and SVM-specific code can set a flag in
 switch_db_regs, telling vcpu_enter_guest that on the next exit the debug
 registers might be dirty and need to be reloaded (syncing will be taken
 care of by a new callback in kvm_x86_ops).  This flag can be set on the
 first access to a debug register, so that multiple accesses to the
 debug registers only cause one vmexit.
 
 Note that since the guest will be able to read debug registers and
 enable breakpoints in DR7, we need to ensure that they are synchronized
 on entry to the guest---including DR6 that was not synced before.
 
 Signed-off-by: Paolo Bonzini pbonz...@redhat.com
 ---
  arch/x86/include/asm/kvm_host.h |  2 ++
  arch/x86/kvm/x86.c              | 16 ++++++++++++++++
  2 files changed, 18 insertions(+)
 
 diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
 index 5ef59d3b6c63..74eb361eaa8f 100644
 --- a/arch/x86/include/asm/kvm_host.h
 +++ b/arch/x86/include/asm/kvm_host.h
 @@ -339,6 +339,7 @@ struct kvm_pmu {
  
  enum {
   KVM_DEBUGREG_BP_ENABLED = 1,
 + KVM_DEBUGREG_WONT_EXIT = 2,
  };
  
  struct kvm_vcpu_arch {
 @@ -707,6 +708,7 @@ struct kvm_x86_ops {
   void (*set_gdt)(struct kvm_vcpu *vcpu, struct desc_ptr *dt);
   u64 (*get_dr6)(struct kvm_vcpu *vcpu);
   void (*set_dr6)(struct kvm_vcpu *vcpu, unsigned long value);
 + void (*sync_dirty_debug_regs)(struct kvm_vcpu *vcpu);
   void (*set_dr7)(struct kvm_vcpu *vcpu, unsigned long value);
   void (*cache_reg)(struct kvm_vcpu *vcpu, enum kvm_reg reg);
   unsigned long (*get_rflags)(struct kvm_vcpu *vcpu);
 diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
 index 252b47e85c69..c48818aa04c0 100644
 --- a/arch/x86/kvm/x86.c
 +++ b/arch/x86/kvm/x86.c
 @@ -6033,12 +6033,28 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
  	set_debugreg(vcpu->arch.eff_db[1], 1);
  	set_debugreg(vcpu->arch.eff_db[2], 2);
  	set_debugreg(vcpu->arch.eff_db[3], 3);
 +	set_debugreg(vcpu->arch.dr6, 6);
   }
  
  	trace_kvm_entry(vcpu->vcpu_id);
  	kvm_x86_ops->run(vcpu);
  
   /*
 +  * Do this here before restoring debug registers on the host.  And
 +  * since we do this before handling the vmexit, a DR access vmexit
 +  * can (a) read the correct value of the debug registers, (b) set
 +  * KVM_DEBUGREG_WONT_EXIT again.

We re-enable intercepts on the next exit for the sake of simplicity?
(Batched accesses make perfect sense, but it seems we don't have to care
 about DRs at all, without guest-debug.)

 +  */
 +	if (unlikely(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT)) {
 +		int i;
 +
 +		WARN_ON(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP);

Is this possible? (I presumed that we vmexit before handling ioctls.)

 +		kvm_x86_ops->sync_dirty_debug_regs(vcpu);

Sneaky functionality ... it would be nicer to split 'enable DR
intercepts' into a new kvm op.
I think we want to disable them whenever we are not in guest-debug mode
anyway, so it would be a pair.
And we don't have to modify DR intercepts then, which is probably the
main reason why sync_dirty_debug_regs() does two things.
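
Something like the following, with hypothetical op names (a sketch of the
suggestion, not what the series adds):

	/* in struct kvm_x86_ops */
	void (*enable_dr_intercepts)(struct kvm_vcpu *vcpu);
	void (*disable_dr_intercepts)(struct kvm_vcpu *vcpu);

	/* the VMX side would reduce to flipping one exec-control bit */
	static void vmx_enable_dr_intercepts(struct kvm_vcpu *vcpu)
	{
		vmcs_set_bits(CPU_BASED_VM_EXEC_CONTROL,
			      CPU_BASED_MOV_DR_EXITING);
	}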

 +		for (i = 0; i < KVM_NR_DB_REGS; i++)
 +			vcpu->arch.eff_db[i] = vcpu->arch.db[i];
 +	}
 +
 + /*
* If the guest has used debug registers, at least dr7
* will be disabled while returning to the host.
* If we don't have active breakpoints in the host, we don't
 -- 
 1.8.3.1
 
 


Re: [PATCH 3/7] KVM: x86: Allow the guest to run with dirty debug registers

2014-03-09 Thread Paolo Bonzini

On 09/03/2014 19:28, Radim Krčmář wrote:

/*
 +   * Do this here before restoring debug registers on the host.  And
 +   * since we do this before handling the vmexit, a DR access vmexit
 +   * can (a) read the correct value of the debug registers, (b) set
 +   * KVM_DEBUGREG_WONT_EXIT again.

We re-enable intercepts on the next exit for the sake of simplicity?
(Batched accesses make perfect sense, but it seems we don't have to care
 about DRs at all, without guest-debug.)


It's not just for the sake of simplicity.  Synchronizing debug registers 
on entry does have some cost, and it's required for running without 
debug register intercepts.  You would incur this cost always, since 
no-guest-debug is actually the common case.


We're well into diminishing returns; Alex timed this patch vs. one that 
completely ignores MOV DR exits and (to the surprise of both of us) this 
patch completely restored performance despite still having a few tens of 
thousands of MOV DR exits/second.


In the problematic case, each context switch will do a save and restore 
of debug registers; that's in total 13 debug register accesses (read 6 
registers, write DR7 to disable all registers, write 6 registers 
including DR7 which enables breakpoints), all very close in time.  It's 
quite likely that the final DR7 write is very expensive (e.g. it might 
have to flush the pipeline).  Also, this test case must be very heavy on 
context switches, and each context switch already incurs a few exits 
(for example to inject the interrupt that causes it, to read the current 
time).
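
A hypothetical sketch of that guest-side pattern, using Linux-style
get_debugreg()/set_debugreg() accessors and made-up prev/next task
structures, 13 DR accesses in total:

	/* save the outgoing task's state: 6 reads */
	get_debugreg(prev->db[0], 0); get_debugreg(prev->db[1], 1);
	get_debugreg(prev->db[2], 2); get_debugreg(prev->db[3], 3);
	get_debugreg(prev->dr6, 6);   get_debugreg(prev->dr7, 7);

	/* disable all breakpoints while switching: 1 write */
	set_debugreg(0x400UL, 7);

	/* restore the incoming task: 6 writes; the final DR7 write
	 * re-arms the breakpoints and is the likely expensive one */
	set_debugreg(next->db[0], 0); set_debugreg(next->db[1], 1);
	set_debugreg(next->db[2], 2); set_debugreg(next->db[3], 3);
	set_debugreg(next->dr6, 6);   set_debugreg(next->dr7, 7);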


A different optimization could be to skip the LOAD_DEBUG_CONTROLS 
vm-entry control if DR7 == host DR7 == 0x400 && MOV DR exits are 
enabled.  Not sure it's worthwhile though, and there's also the DEBUGCTL 
MSR to take into account.  Better do these kinds of tweaks if they 
actually show up in the profile: Alex's testcase shows that when they 
do, the impact is massive.
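
The tweak would look roughly like this, where host_dr7 is a hypothetical
cached host value (again, not part of the applied series):

	if (vmcs_readl(GUEST_DR7) == 0x400 && host_dr7 == 0x400 &&
	    (vmcs_read32(CPU_BASED_VM_EXEC_CONTROL) &
	     CPU_BASED_MOV_DR_EXITING))
		vmcs_clear_bits(VM_ENTRY_CONTROLS,
				VM_ENTRY_LOAD_DEBUG_CONTROLS);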



+	kvm_x86_ops->sync_dirty_debug_regs(vcpu);


Sneaky functionality ... it would be nicer to split 'enable DR
intercepts' into a new kvm op.


Why?  Disabling DR intercepts is already folded into the handler for MOV 
DR exits.



And we don't have to modify DR intercepts then, which is probably the
main reason why sync_dirty_debug_regs() does two things.


Another is that indirect calls are relatively expensive and add 
complexity; in this case they would always be used back-to-back.


Paolo


Re: [PATCH 4/7] KVM: vmx: Allow the guest to run with dirty debug registers

2014-03-09 Thread Paolo Bonzini

On 09/03/2014 19:26, Radim Krčmář wrote:

 +
 +  cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
 +  cpu_based_vm_exec_control &= ~CPU_BASED_MOV_DR_EXITING;
 +  vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);

vmcs_clear_bits() covers exactly this use-case.
(Barring the explicit bit-width.)


Good idea.


 +
 +  /*
 +   * No more DR vmexits; force a reload of the debug registers
 +   * and reenter on this instruction.  The next vmexit will
 +   * retrieve the full state of the debug registers.
 +   */
 +  vcpu->arch.switch_db_regs |= KVM_DEBUGREG_WONT_EXIT;
 +  return 1;
 +  }
 +

We could make the code slightly uglier and move the functional part of
this block before the previous one, so it would do both things in one
exit.


I considered this, but decided that it's unlikely for emulation to be 
faster than hardware---especially on those AMD CPUs that lack decode 
assists (and it's good for VMX and SVM code to look as similar as possible).



(Exception handler will likely access DR too.)


Which exception handler?

Paolo


[PATCH qom-cpu v2 18/40] cpu: Move watchpoint fields from CPU_COMMON to CPUState

2014-03-09 Thread Andreas Färber
Signed-off-by: Andreas Färber afaer...@suse.de
---
 cpu-exec.c              |  5 +++--
 exec.c                  | 33 ++++++++++++++++++++-------------
 gdbstub.c               |  8 ++++----
 include/exec/cpu-defs.h | 10 ----------
 include/qom/cpu.h       | 10 ++++++++++
 linux-user/main.c       |  5 +++--
 target-i386/cpu.h       |  2 +-
 target-i386/helper.c    |  7 ++++---
 target-i386/kvm.c       |  8 ++++----
 target-lm32/cpu.h       |  2 +-
 target-lm32/helper.c    |  7 ++++---
 target-xtensa/cpu.h     |  2 +-
 target-xtensa/helper.c  |  8 +++++---
 13 files changed, 60 insertions(+), 47 deletions(-)

diff --git a/cpu-exec.c b/cpu-exec.c
index 798dc08..d7c21d3 100644
--- a/cpu-exec.c
+++ b/cpu-exec.c
@@ -200,10 +200,11 @@ void cpu_set_debug_excp_handler(CPUDebugExcpHandler 
*handler)
 
 static void cpu_handle_debug_exception(CPUArchState *env)
 {
+    CPUState *cpu = ENV_GET_CPU(env);
     CPUWatchpoint *wp;
 
-    if (!env->watchpoint_hit) {
-        QTAILQ_FOREACH(wp, &env->watchpoints, entry) {
+    if (!cpu->watchpoint_hit) {
+        QTAILQ_FOREACH(wp, &cpu->watchpoints, entry) {
             wp->flags &= ~BP_WATCHPOINT_HIT;
         }
 }
diff --git a/exec.c b/exec.c
index 9b02c6a..cf4a0ef 100644
--- a/exec.c
+++ b/exec.c
@@ -485,7 +485,7 @@ void cpu_exec_init(CPUArchState *env)
     cpu->cpu_index = cpu_index;
     cpu->numa_node = 0;
     QTAILQ_INIT(&env->breakpoints);
-    QTAILQ_INIT(&env->watchpoints);
+    QTAILQ_INIT(&cpu->watchpoints);
 #ifndef CONFIG_USER_ONLY
     cpu->as = &address_space_memory;
     cpu->thread_id = qemu_get_thread_id();
@@ -542,6 +542,7 @@ int cpu_watchpoint_insert(CPUArchState *env, target_ulong 
addr, target_ulong len
 int cpu_watchpoint_insert(CPUArchState *env, target_ulong addr, target_ulong 
len,
   int flags, CPUWatchpoint **watchpoint)
 {
+    CPUState *cpu = ENV_GET_CPU(env);
 target_ulong len_mask = ~(len - 1);
 CPUWatchpoint *wp;
 
@@ -559,10 +560,11 @@ int cpu_watchpoint_insert(CPUArchState *env, target_ulong 
addr, target_ulong len
     wp->flags = flags;
 
 /* keep all GDB-injected watchpoints in front */
-    if (flags & BP_GDB)
-        QTAILQ_INSERT_HEAD(&env->watchpoints, wp, entry);
-    else
-        QTAILQ_INSERT_TAIL(&env->watchpoints, wp, entry);
+    if (flags & BP_GDB) {
+        QTAILQ_INSERT_HEAD(&cpu->watchpoints, wp, entry);
+    } else {
+        QTAILQ_INSERT_TAIL(&cpu->watchpoints, wp, entry);
+    }
 
 tlb_flush_page(env, addr);
 
@@ -575,10 +577,11 @@ int cpu_watchpoint_insert(CPUArchState *env, target_ulong 
addr, target_ulong len
 int cpu_watchpoint_remove(CPUArchState *env, target_ulong addr, target_ulong 
len,
   int flags)
 {
+    CPUState *cpu = ENV_GET_CPU(env);
     target_ulong len_mask = ~(len - 1);
     CPUWatchpoint *wp;
 
-    QTAILQ_FOREACH(wp, &env->watchpoints, entry) {
+    QTAILQ_FOREACH(wp, &cpu->watchpoints, entry) {
         if (addr == wp->vaddr && len_mask == wp->len_mask
             && flags == (wp->flags & ~BP_WATCHPOINT_HIT)) {
 cpu_watchpoint_remove_by_ref(env, wp);
@@ -591,7 +594,9 @@ int cpu_watchpoint_remove(CPUArchState *env, target_ulong 
addr, target_ulong len
 /* Remove a specific watchpoint by reference.  */
 void cpu_watchpoint_remove_by_ref(CPUArchState *env, CPUWatchpoint *watchpoint)
 {
-    QTAILQ_REMOVE(&env->watchpoints, watchpoint, entry);
+    CPUState *cpu = ENV_GET_CPU(env);
+
+    QTAILQ_REMOVE(&cpu->watchpoints, watchpoint, entry);
 
     tlb_flush_page(env, watchpoint->vaddr);
 
@@ -601,9 +606,10 @@ void cpu_watchpoint_remove_by_ref(CPUArchState *env, 
CPUWatchpoint *watchpoint)
 /* Remove all matching watchpoints.  */
 void cpu_watchpoint_remove_all(CPUArchState *env, int mask)
 {
+    CPUState *cpu = ENV_GET_CPU(env);
     CPUWatchpoint *wp, *next;
 
-    QTAILQ_FOREACH_SAFE(wp, &env->watchpoints, entry, next) {
+    QTAILQ_FOREACH_SAFE(wp, &cpu->watchpoints, entry, next) {
         if (wp->flags & mask)
             cpu_watchpoint_remove_by_ref(env, wp);
 }
@@ -799,6 +805,7 @@ hwaddr memory_region_section_get_iotlb(CPUArchState *env,
int prot,
target_ulong *address)
 {
+    CPUState *cpu = ENV_GET_CPU(env);
 hwaddr iotlb;
 CPUWatchpoint *wp;
 
@@ -818,7 +825,7 @@ hwaddr memory_region_section_get_iotlb(CPUArchState *env,
 
 /* Make accesses to pages with watchpoints go via the
watchpoint trap routines.  */
-    QTAILQ_FOREACH(wp, &env->watchpoints, entry) {
+    QTAILQ_FOREACH(wp, &cpu->watchpoints, entry) {
         if (vaddr == (wp->vaddr & TARGET_PAGE_MASK)) {
             /* Avoid trapping reads of pages with a write breakpoint. */
             if ((prot & PAGE_WRITE) || (wp->flags & BP_MEM_READ)) {
@@ -1573,7 +1580,7 @@ static void check_watchpoint(int offset, int len_mask, 
int flags)
 CPUWatchpoint *wp;
 int cpu_flags;
 
-    if (env->watchpoint_hit) {
+    if (cpu->watchpoint_hit) {
         /* We re-entered the check after replacing the TB.
 /* We re-entered the check after replacing the TB. 

Re: [PATCH net V2] vhost: net: switch to use data copy if pending DMAs exceed the limit

2014-03-09 Thread Qin Chuanyu

On 2014/3/7 13:28, Jason Wang wrote:

We used to stop the handling of tx when the number of pending DMAs
exceeds VHOST_MAX_PEND. This is used to reduce the memory occupation
of both host and guest. But it was too aggressive in some cases, since
any delay or blocking of a single packet may delay or block the guest
transmission. Consider the following setup:

 +-----+              +-----+
 | VM1 |              | VM2 |
 +--+--+              +--+--+
    |                    |
 +--+--+              +--+--+
 | tap0|              | tap1|
 +--+--+              +--+--+
    |                    |
 pfifo_fast          htb(10Mbit/s)
    |                    |
 +--+--------------------+--+
 |          bridge          |
 +--+-----------------------+
    |
 pfifo_fast
    |
 +-----+
 | eth0| (100Mbit/s)
 +-----+

- start two VMs and connect them to a bridge
- add a physical card (100Mbit/s) to that bridge
- setup htb on tap1 and limit its throughput to 10Mbit/s
- run two netperfs at the same time, one from VM1 to VM2, another
   from VM1 to an external host through eth0
- the result shows that not only was the VM1 to VM2 traffic throttled but
   the VM1 to external host traffic through eth0 was also throttled somehow

This is because the delay added by htb may delay the completion of the
DMAs and cause the pending DMAs for tap0 to exceed the limit
(VHOST_MAX_PEND). In this case vhost stops handling tx requests until
htb sends some packets. The problem here is that all packet
transmission is blocked, even for packets that do not go to VM2.

We can solve this issue by relaxing it a little bit: switching to use
data copy instead of stopping tx when the number of pending DMAs
exceeds half of the vq size. This is safe because:

- The number of pending DMAs is still limited (half of the vq size)
- The out of order completion during mode switch can make sure that
   most of the tx buffers are freed in time in the guest.

So even if about 50% of packets are delayed in the zero-copy case, vhost
can continue to do the transmission through data copy.
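
As a standalone illustration of the relaxed limit (assuming, as in the
patch below, that the ring indices live in [0, UIO_MAXIOV)):

	/* in-flight zerocopy buffers between done_idx and upend_idx */
	static int tx_pending(int upend_idx, int done_idx)
	{
		return upend_idx >= done_idx ?
		       upend_idx - done_idx :
		       upend_idx + UIO_MAXIOV - done_idx;
	}

Zerocopy stays in use only while this count is at most half of vq->num;
beyond that, handle_tx() falls back to data copy instead of stopping.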

Test result:

Before this patch:
VM1 to VM2 throughput is 9.3Mbit/s
VM1 to External throughput is 40Mbit/s
CPU utilization is 7%

After this patch:
VM1 to VM2 throughput is 9.3Mbit/s
VM1 to External throughput is 93Mbit/s
CPU utilization is 16%

A complete performance test on 40GbE shows no obvious changes in either
throughput or cpu utilization with this patch.

The patch only solves this issue for unlimited sndbuf. We still need a
solution for limited sndbuf.

Cc: Michael S. Tsirkin m...@redhat.com
Cc: Qin Chuanyu qinchua...@huawei.com
Signed-off-by: Jason Wang jasow...@redhat.com
---
Changes from V1:
- Remove VHOST_MAX_PEND and switch to use half of the vq size as the limit
- Add cpu utilization in commit log
---
  drivers/vhost/net.c | 19 +++
  1 file changed, 7 insertions(+), 12 deletions(-)

diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index a0fa5de..2925e9a 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
 @@ -38,8 +38,6 @@ MODULE_PARM_DESC(experimental_zcopytx, "Enable Zero Copy TX;
   * Using this limit prevents one virtqueue from starving others. */
  #define VHOST_NET_WEIGHT 0x80000

-/* MAX number of TX used buffers for outstanding zerocopy */
-#define VHOST_MAX_PEND 128
  #define VHOST_GOODCOPY_LEN 256

  /*
@@ -345,7 +343,7 @@ static void handle_tx(struct vhost_net *net)
.msg_flags = MSG_DONTWAIT,
};
size_t len, total_len = 0;
-   int err;
+   int err, num_pends;
size_t hdr_size;
struct socket *sock;
struct vhost_net_ubuf_ref *uninitialized_var(ubufs);
@@ -366,13 +364,6 @@ static void handle_tx(struct vhost_net *net)
if (zcopy)
vhost_zerocopy_signal_used(net, vq);

-	/* If more outstanding DMAs, queue the work.
-	 * Handle upend_idx wrap around
-	 */
-	if (unlikely((nvq->upend_idx + vq->num - VHOST_MAX_PEND)
-		     % UIO_MAXIOV == nvq->done_idx))
-		break;
-
 		head = vhost_get_vq_desc(net->dev, vq, vq->iov,
 					 ARRAY_SIZE(vq->iov),
 					 &out, &in,
@@ -405,9 +396,13 @@ static void handle_tx(struct vhost_net *net)
break;
}

+		num_pends = likely(nvq->upend_idx >= nvq->done_idx) ?
+			    (nvq->upend_idx - nvq->done_idx) :
+			    (nvq->upend_idx + UIO_MAXIOV -
+			     nvq->done_idx);
+
 		zcopy_used = zcopy && len >= VHOST_GOODCOPY_LEN
-			     && (nvq->upend_idx + 1) % UIO_MAXIOV !=
-				nvq->done_idx
+			     && num_pends <= vq->num >> 1
 			     && vhost_net_tx_select_zcopy(net);

/* use msg_control to pass vhost zerocopy ubuf info 

Re: [PATCH net V2] vhost: net: switch to use data copy if pending DMAs exceed the limit

2014-03-09 Thread Jason Wang
On 03/08/2014 05:39 AM, David Miller wrote:
 From: Jason Wang jasow...@redhat.com
 Date: Fri,  7 Mar 2014 13:28:27 +0800

 This is because the delay added by htb may delay the completion of the
 DMAs and cause the pending DMAs for tap0 to exceed the limit
 (VHOST_MAX_PEND). In this case vhost stops handling tx requests until
 htb sends some packets. The problem here is that all packet
 transmission is blocked, even for packets that do not go to VM2.
 Isn't this essentially head of line blocking?

Yes it is.
 We can solve this issue by relaxing it a little bit: switching to use
 data copy instead of stopping tx when the number of pending DMAs
 exceeds half of the vq size. This is safe because:

 - The number of pending DMAs is still limited (half of the vq size)
 - The out of order completion during mode switch can make sure that
   most of the tx buffers are freed in time in the guest.

 So even if about 50% of packets are delayed in the zero-copy case, vhost
 can continue to do the transmission through data copy.

 Test result:

 Before this patch:
 VM1 to VM2 throughput is 9.3Mbit/s
 VM1 to External throughput is 40Mbit/s
 CPU utilization is 7%

 After this patch:
 VM1 to VM2 throughput is 9.3Mbit/s
 VM1 to External throughput is 93Mbit/s
 CPU utilization is 16%

 A complete performance test on 40GbE shows no obvious changes in either
 throughput or cpu utilization with this patch.

 The patch only solves this issue for unlimited sndbuf. We still need a
 solution for limited sndbuf.

 Cc: Michael S. Tsirkin m...@redhat.com
 Cc: Qin Chuanyu qinchua...@huawei.com
 Signed-off-by: Jason Wang jasow...@redhat.com
 I'd like some vhost experts reviewing this before I apply it.

Sure.


[PATCH 2/3] drivers/vfio/pci: Fix wrong MSI interrupt count

2014-03-09 Thread Gavin Shan
According to the PCI Local Bus Specification, the Message Control
register for MSI (offset: 2, length: 2) has bit#0 to enable or disable
the MSI logic, and that bit shouldn't contribute to the calculation of
the MSI interrupt count. The patch fixes the issue.
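
A worked example, assuming PCI_MSI_FLAGS_QMASK covers the Multiple
Message Capable field in bits 3:1 of Message Control: a device
advertising 2 vectors encodes MMC = 001 in bits 3:1, so
flags & PCI_MSI_FLAGS_QMASK == 0x02:

	1 << (flags & PCI_MSI_FLAGS_QMASK)        /* old: 1 << 2 == 4, wrong */
	1 << ((flags & PCI_MSI_FLAGS_QMASK) >> 1) /* new: 1 << 1 == 2, right */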

Signed-off-by: Gavin Shan sha...@linux.vnet.ibm.com
---
 drivers/vfio/pci/vfio_pci.c |3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c
index 7ba0424..6b8cd07 100644
--- a/drivers/vfio/pci/vfio_pci.c
+++ b/drivers/vfio/pci/vfio_pci.c
@@ -196,8 +196,7 @@ static int vfio_pci_get_irq_count(struct vfio_pci_device 
*vdev, int irq_type)
if (pos) {
 			pci_read_config_word(vdev->pdev,
 					     pos + PCI_MSI_FLAGS, &flags);
-
-		return 1 << (flags & PCI_MSI_FLAGS_QMASK);
+		return 1 << ((flags & PCI_MSI_FLAGS_QMASK) >> 1);
}
} else if (irq_type == VFIO_PCI_MSIX_IRQ_INDEX) {
u8 pos;
-- 
1.7.10.4



[PATCH 1/3] drivers/vfio: Rework offsetofend()

2014-03-09 Thread Gavin Shan
The offsetofend() macro introduces an unnecessary temporary variable
tmp. The patch avoids that and saves a bit of stack memory.
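
For illustration, the macro's typical use in vfio is validating that a
user-supplied argsz covers a given field, roughly:

	struct vfio_irq_set hdr;
	unsigned long minsz = offsetofend(struct vfio_irq_set, count);

	if (copy_from_user(&hdr, (void __user *)arg, minsz))
		return -EFAULT;
	if (hdr.argsz < minsz)	/* structure too short to contain 'count' */
		return -EINVAL;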

Signed-off-by: Gavin Shan sha...@linux.vnet.ibm.com
---
 include/linux/vfio.h |5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/include/linux/vfio.h b/include/linux/vfio.h
index 24579a0..43f6bf4 100644
--- a/include/linux/vfio.h
+++ b/include/linux/vfio.h
@@ -86,9 +86,8 @@ extern void vfio_unregister_iommu_driver(
  * from user space.  This allows us to easily determine if the provided
  * structure is sized to include various fields.
  */
-#define offsetofend(TYPE, MEMBER) ({   \
-   TYPE tmp;   \
-   offsetof(TYPE, MEMBER) + sizeof(tmp.MEMBER); }) \
+#define offsetofend(TYPE, MEMBER) \
+	(offsetof(TYPE, MEMBER) + sizeof(((TYPE *)0)->MEMBER))
 
 /*
  * External user API
-- 
1.7.10.4



[PATCH 0/3] VFIO Bug Fixes

2014-03-09 Thread Gavin Shan
v1 -> v2:
* Don't change the name of variable flags in [PATCH 2/3].
* Comment and commit log cleanup in [PATCH 3/3].

Gavin Shan (3):
  drivers/vfio: Rework offsetofend()
  drivers/vfio/pci: Fix wrong MSI interrupt count
  drivers/vfio/pci: Fix MSIx message lost

---

 drivers/vfio/pci/vfio_pci.c   |3 +--
 drivers/vfio/pci/vfio_pci_intrs.c |   11 +++
 include/linux/vfio.h  |5 ++---
 3 files changed, 14 insertions(+), 5 deletions(-)

Thanks,
Gavin

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 3/3] drivers/vfio/pci: Fix MSIx message lost

2014-03-09 Thread Gavin Shan
The problem is specific to the case of a BIST reset issued to an IPR
adapter on the guest side. The IPR driver calls pci_save_state(),
issues a BIST reset and then pci_restore_state(). The issued BIST
cleans out the MSIx table and pci_restore_state() doesn't restore
the MSIx table as expected. Eventually, MSIx messages with all
zeros are sent and cause an EEH error on the Power platform.

The patch fixes it by writing the MSIx message to the MSIx table before
re-enabling the individual interrupts in the following path:

qemu/hw/pci/msix.c::msix_table_mmio_write
  msix_handle_mask_update
    msix_fire_vector_notifier
      qemu/hw/misc/vfio.c::vfio_msix_vector_use
        vfio_msix_vector_do_use

IOCTL command VFIO_DEVICE_SET_IRQS to VFIO-PCI
  vfio/pci/vfio_pci.c::vfio_pci_ioctl
    vfio/pci/vfio_pci_intrs.c::vfio_pci_set_irqs_ioctl
      vfio_pci_set_msi_trigger
        vfio_msi_set_block
          vfio_msi_set_vector_signal

Reported-by: Wen Xiong wenxi...@us.ibm.com
Signed-off-by: Gavin Shan sha...@linux.vnet.ibm.com
---
 drivers/vfio/pci/vfio_pci_intrs.c |   11 +++
 1 file changed, 11 insertions(+)

diff --git a/drivers/vfio/pci/vfio_pci_intrs.c 
b/drivers/vfio/pci/vfio_pci_intrs.c
index 2103576..83e0638 100644
--- a/drivers/vfio/pci/vfio_pci_intrs.c
+++ b/drivers/vfio/pci/vfio_pci_intrs.c
@@ -17,6 +17,7 @@
 #include <linux/interrupt.h>
 #include <linux/eventfd.h>
 #include <linux/pci.h>
+#include <linux/msi.h>
 #include <linux/file.h>
 #include <linux/poll.h>
 #include <linux/vfio.h>
@@ -517,6 +518,7 @@ static int vfio_msi_set_vector_signal(struct 
vfio_pci_device *vdev,
 	struct pci_dev *pdev = vdev->pdev;
 	int irq = msix ? vdev->msix[vector].vector : pdev->irq + vector;
 	char *name = msix ? "vfio-msix" : "vfio-msi";
+   struct msi_msg msg;
struct eventfd_ctx *trigger;
int ret;
 
@@ -544,6 +546,15 @@ static int vfio_msi_set_vector_signal(struct 
vfio_pci_device *vdev,
return PTR_ERR(trigger);
}
 
+	/* We possibly lose the MSI/MSIx message in some cases, one
+	 * of which is pci_save_state(), BIST reset and pci_restore_state()
+	 * for the IPR adapter. The BIST reset cleans out the MSIx table and
+	 * we don't have a chance to restore it. Here, we have the trick of
+	 * restoring it before enabling the individual interrupts. For MSI
+	 * messages, it's harmless to write them back.
+	 */
+	get_cached_msi_msg(irq, &msg);
+	write_msi_msg(irq, &msg);
ret = request_irq(irq, vfio_msihandler, 0,
 			  vdev->ctx[vector].name, trigger);
if (ret) {
-- 
1.7.10.4
