Re: [PATCH v3 1/4] KVM: nVMX: Rework interception of IRQs and NMIs
On 2014-03-09 08:33, Paolo Bonzini wrote: Il 08/03/2014 10:21, Jan Kiszka ha scritto: On 2014-03-07 20:48, Paolo Bonzini wrote: Il 07/03/2014 20:03, Jan Kiszka ha scritto: @@ -4631,22 +4631,8 @@ static void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked) static int vmx_nmi_allowed(struct kvm_vcpu *vcpu) { -if (is_guest_mode(vcpu)) { -if (to_vmx(vcpu)-nested.nested_run_pending) -return 0; -if (nested_exit_on_nmi(vcpu)) { -nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI, - NMI_VECTOR | INTR_TYPE_NMI_INTR | - INTR_INFO_VALID_MASK, 0); -/* - * The NMI-triggered VM exit counts as injection: - * clear this one and block further NMIs. - */ -vcpu-arch.nmi_pending = 0; -vmx_set_nmi_mask(vcpu, true); -return 0; -} -} +if (to_vmx(vcpu)-nested.nested_run_pending) +return 0; if (!cpu_has_virtual_nmis() to_vmx(vcpu)-soft_vnmi_blocked) return 0; @@ -4658,19 +4644,8 @@ static int vmx_nmi_allowed(struct kvm_vcpu *vcpu) static int vmx_interrupt_allowed(struct kvm_vcpu *vcpu) { -if (is_guest_mode(vcpu)) { -if (to_vmx(vcpu)-nested.nested_run_pending) -return 0; -if (nested_exit_on_intr(vcpu)) { -nested_vmx_vmexit(vcpu, EXIT_REASON_EXTERNAL_INTERRUPT, - 0, 0); -/* - * fall through to normal code, but now in L1, not L2 - */ -} -} - -return (vmcs_readl(GUEST_RFLAGS) X86_EFLAGS_IF) +return (!to_vmx(vcpu)-nested.nested_run_pending +vmcs_readl(GUEST_RFLAGS) X86_EFLAGS_IF) !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) (GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS)); The checks on nested_run_pending are not needed anymore and can be replaced with a WARN_ON. Otherwise, Nope, that won't be correct: If we have a pending interrupt that L1 does not intercept, we still trigger this condition legally. Right, this is the case of !nested_exit_on_intr(vcpu) or !nested_exit_on_nmi(vcpu). Why don't we need to request an immediate exit in that case, in order to inject the interrupt into L2? We enable the hardware interrupt/NMI window request for L2 instead. 
Jan signature.asc Description: OpenPGP digital signature
Re: [PATCH v3 1/4] KVM: nVMX: Rework interception of IRQs and NMIs
Il 09/03/2014 08:33, Paolo Bonzini ha scritto: Il 08/03/2014 10:21, Jan Kiszka ha scritto: On 2014-03-07 20:48, Paolo Bonzini wrote: Il 07/03/2014 20:03, Jan Kiszka ha scritto: @@ -4631,22 +4631,8 @@ static void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked) static int vmx_nmi_allowed(struct kvm_vcpu *vcpu) { -if (is_guest_mode(vcpu)) { -if (to_vmx(vcpu)-nested.nested_run_pending) -return 0; -if (nested_exit_on_nmi(vcpu)) { -nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI, - NMI_VECTOR | INTR_TYPE_NMI_INTR | - INTR_INFO_VALID_MASK, 0); -/* - * The NMI-triggered VM exit counts as injection: - * clear this one and block further NMIs. - */ -vcpu-arch.nmi_pending = 0; -vmx_set_nmi_mask(vcpu, true); -return 0; -} -} +if (to_vmx(vcpu)-nested.nested_run_pending) +return 0; if (!cpu_has_virtual_nmis() to_vmx(vcpu)-soft_vnmi_blocked) return 0; @@ -4658,19 +4644,8 @@ static int vmx_nmi_allowed(struct kvm_vcpu *vcpu) static int vmx_interrupt_allowed(struct kvm_vcpu *vcpu) { -if (is_guest_mode(vcpu)) { -if (to_vmx(vcpu)-nested.nested_run_pending) -return 0; -if (nested_exit_on_intr(vcpu)) { -nested_vmx_vmexit(vcpu, EXIT_REASON_EXTERNAL_INTERRUPT, - 0, 0); -/* - * fall through to normal code, but now in L1, not L2 - */ -} -} - -return (vmcs_readl(GUEST_RFLAGS) X86_EFLAGS_IF) +return (!to_vmx(vcpu)-nested.nested_run_pending +vmcs_readl(GUEST_RFLAGS) X86_EFLAGS_IF) !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) (GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS)); The checks on nested_run_pending are not needed anymore and can be replaced with a WARN_ON. Otherwise, Nope, that won't be correct: If we have a pending interrupt that L1 does not intercept, we still trigger this condition legally. Right, this is the case of !nested_exit_on_intr(vcpu) or !nested_exit_on_nmi(vcpu). Why don't we need to request an immediate exit in that case, in order to inject the interrupt into L2? Nevermind, this makes no sense. I was confusing *_allowed with enable_*_window. 
Applying v3 to kvm/queue, thanks! Paolo -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH v3 1/4] KVM: nVMX: Rework interception of IRQs and NMIs
On 2014-03-09 09:03, Paolo Bonzini wrote: Il 09/03/2014 08:33, Paolo Bonzini ha scritto: Il 08/03/2014 10:21, Jan Kiszka ha scritto: On 2014-03-07 20:48, Paolo Bonzini wrote: Il 07/03/2014 20:03, Jan Kiszka ha scritto: @@ -4631,22 +4631,8 @@ static void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked) static int vmx_nmi_allowed(struct kvm_vcpu *vcpu) { -if (is_guest_mode(vcpu)) { -if (to_vmx(vcpu)-nested.nested_run_pending) -return 0; -if (nested_exit_on_nmi(vcpu)) { -nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI, - NMI_VECTOR | INTR_TYPE_NMI_INTR | - INTR_INFO_VALID_MASK, 0); -/* - * The NMI-triggered VM exit counts as injection: - * clear this one and block further NMIs. - */ -vcpu-arch.nmi_pending = 0; -vmx_set_nmi_mask(vcpu, true); -return 0; -} -} +if (to_vmx(vcpu)-nested.nested_run_pending) +return 0; if (!cpu_has_virtual_nmis() to_vmx(vcpu)-soft_vnmi_blocked) return 0; @@ -4658,19 +4644,8 @@ static int vmx_nmi_allowed(struct kvm_vcpu *vcpu) static int vmx_interrupt_allowed(struct kvm_vcpu *vcpu) { -if (is_guest_mode(vcpu)) { -if (to_vmx(vcpu)-nested.nested_run_pending) -return 0; -if (nested_exit_on_intr(vcpu)) { -nested_vmx_vmexit(vcpu, EXIT_REASON_EXTERNAL_INTERRUPT, - 0, 0); -/* - * fall through to normal code, but now in L1, not L2 - */ -} -} - -return (vmcs_readl(GUEST_RFLAGS) X86_EFLAGS_IF) +return (!to_vmx(vcpu)-nested.nested_run_pending +vmcs_readl(GUEST_RFLAGS) X86_EFLAGS_IF) !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) (GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS)); The checks on nested_run_pending are not needed anymore and can be replaced with a WARN_ON. Otherwise, Nope, that won't be correct: If we have a pending interrupt that L1 does not intercept, we still trigger this condition legally. Right, this is the case of !nested_exit_on_intr(vcpu) or !nested_exit_on_nmi(vcpu). Why don't we need to request an immediate exit in that case, in order to inject the interrupt into L2? Nevermind, this makes no sense. 
I was confusing *_allowed with enable_*_window. This code is mind-blowing and probably still not perfect. I wouldn't be surprised if we are going to find bugs there until we retire. ;) Applying v3 to kvm/queue, thanks! Great, thank you! Jan signature.asc Description: OpenPGP digital signature
Re: [PATCH 0/7] KVM: x86: Let the guest write to multiple debug registers with one vmexit
On 2014-03-07 12:42, Paolo Bonzini wrote: Alex Williamson reported that a Windows game does something weird that makes the guest save and restore debug registers on each context switch. This cause several hundred thousands vmexits per second, and basically cuts performance in half when running under KVM. However, when not running in guest-debug mode, the guest controls the debug registers and having to take an exit for each DR access is a waste of time. We just need one vmexit to load any stale values of DR0-DR6, and then we can let the guest run freely. On the next vmexit (whatever the reason) we will read out whatever changes the guest made to the debug registers. Tested with x86/debug.flat on both Intel and AMD, both direct and nested virtualization. Changes from RFC: changed get_dr7 callback to sync_dirty_debug_regs, new patches 5-7. This looks good now to me from KVM perspective. I was just wondering how the case is handled that the host used debug registers on the thread the runs a VCPU? What if I set a hw breakpoint on its userspace path e.g.? What if I debug the kernel side with kgdb? Jan Paolo Bonzini (7): KVM: vmx: we do rely on loading DR7 on entry KVM: x86: change vcpu-arch.switch_db_regs to a bit mask KVM: x86: Allow the guest to run with dirty debug registers KVM: vmx: Allow the guest to run with dirty debug registers KVM: nVMX: Allow nested guests to run with dirty debug registers KVM: svm: set/clear all DR intercepts in one swoop KVM: svm: Allow the guest to run with dirty debug registers arch/x86/include/asm/kvm_host.h | 8 - arch/x86/kvm/svm.c | 68 - arch/x86/kvm/vmx.c | 43 -- arch/x86/kvm/x86.c | 20 +++- 4 files changed, 114 insertions(+), 25 deletions(-) signature.asc Description: OpenPGP digital signature
Re: [PATCH 0/7] KVM: x86: Let the guest write to multiple debug registers with one vmexit
On 2014-03-09 09:11, Jan Kiszka wrote: On 2014-03-07 12:42, Paolo Bonzini wrote: Alex Williamson reported that a Windows game does something weird that makes the guest save and restore debug registers on each context switch. This cause several hundred thousands vmexits per second, and basically cuts performance in half when running under KVM. However, when not running in guest-debug mode, the guest controls the debug registers and having to take an exit for each DR access is a waste of time. We just need one vmexit to load any stale values of DR0-DR6, and then we can let the guest run freely. On the next vmexit (whatever the reason) we will read out whatever changes the guest made to the debug registers. Tested with x86/debug.flat on both Intel and AMD, both direct and nested virtualization. Changes from RFC: changed get_dr7 callback to sync_dirty_debug_regs, new patches 5-7. This looks good now to me from KVM perspective. I was just wondering how the case is handled that the host used debug registers on the thread the runs a VCPU? What if I set a hw breakpoint on its userspace path e.g.? What if I debug the kernel side with kgdb? Ah, this part didn't change, we still switch between host and guest state on vmentries/exits. All fine! Jan signature.asc Description: OpenPGP digital signature
Re: [PATCH 0/7] KVM: x86: Let the guest write to multiple debug registers with one vmexit
Il 09/03/2014 09:15, Jan Kiszka ha scritto: This looks good now to me from KVM perspective. I was just wondering how the case is handled that the host used debug registers on the thread the runs a VCPU? What if I set a hw breakpoint on its userspace path e.g.? What if I debug the kernel side with kgdb? Ah, this part didn't change, we still switch between host and guest state on vmentries/exits. All fine! Yes, that's why sync_dirty_debug_regs is added so early. Thanks for the review. Paolo -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 4/7] KVM: vmx: Allow the guest to run with dirty debug registers
2014-03-07 12:42+0100, Paolo Bonzini: When not running in guest-debug mode (i.e. the guest controls the debug registers, having to take an exit for each DR access is a waste of time. If the guest gets into a state where each context switch causes DR to be saved and restored, this can take away as much as 40% of the execution time from the guest. If the guest is running with vcpu-arch.db == vcpu-arch.eff_db, we can let it write freely to the debug registers and reload them on the next exit. We still need to exit on the first access, so that the KVM_DEBUGREG_WONT_EXIT flag is set in switch_db_regs; after that, further accesses to the debug registers will not cause a vmexit. Signed-off-by: Paolo Bonzini pbonz...@redhat.com --- arch/x86/kvm/vmx.c | 37 - 1 file changed, 36 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 06e4ec877a1c..e377522408b1 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -2843,7 +2843,7 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) vmx_capability.ept, vmx_capability.vpid); } - min = 0; + min = VM_EXIT_SAVE_DEBUG_CONTROLS; #ifdef CONFIG_X86_64 min |= VM_EXIT_HOST_ADDR_SPACE_SIZE; #endif @@ -5113,6 +5113,22 @@ static int handle_dr(struct kvm_vcpu *vcpu) } } + if (vcpu-guest_debug == 0) { + u32 cpu_based_vm_exec_control; + + cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); + cpu_based_vm_exec_control = ~CPU_BASED_MOV_DR_EXITING; + vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control); vmcs_clear_bits() covers exactly this use-case. (Barring the explicit bit-width.) + + /* + * No more DR vmexits; force a reload of the debug registers + * and reenter on this instruction. The next vmexit will + * retrieve the full state of the debug registers. 
+ */ + vcpu-arch.switch_db_regs |= KVM_DEBUGREG_WONT_EXIT; + return 1; + } + We could make the code slighly uglier and move the functional part of this block before the previous one, so it would do both things in one exit. (Exception handler will likely access DR too.) exit_qualification = vmcs_readl(EXIT_QUALIFICATION); dr = exit_qualification DEBUG_REG_ACCESS_NUM; reg = DEBUG_REG_ACCESS_REG(exit_qualification); @@ -5139,6 +5155,24 @@ static void vmx_set_dr6(struct kvm_vcpu *vcpu, unsigned long val) { } +static void vmx_sync_dirty_debug_regs(struct kvm_vcpu *vcpu) +{ + u32 cpu_based_vm_exec_control; + + get_debugreg(vcpu-arch.db[0], 0); + get_debugreg(vcpu-arch.db[1], 1); + get_debugreg(vcpu-arch.db[2], 2); + get_debugreg(vcpu-arch.db[3], 3); + get_debugreg(vcpu-arch.dr6, 6); + vcpu-arch.dr7 = vmcs_readl(GUEST_DR7); + + vcpu-arch.switch_db_regs = ~KVM_DEBUGREG_WONT_EXIT; + + cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); + cpu_based_vm_exec_control |= CPU_BASED_MOV_DR_EXITING; + vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control); (I'll be calling this sneaky later on.) +} + static void vmx_set_dr7(struct kvm_vcpu *vcpu, unsigned long val) { vmcs_writel(GUEST_DR7, val); @@ -8590,6 +8624,7 @@ static struct kvm_x86_ops vmx_x86_ops = { .get_dr6 = vmx_get_dr6, .set_dr6 = vmx_set_dr6, .set_dr7 = vmx_set_dr7, + .sync_dirty_debug_regs = vmx_sync_dirty_debug_regs, .cache_reg = vmx_cache_reg, .get_rflags = vmx_get_rflags, .set_rflags = vmx_set_rflags, -- 1.8.3.1 -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/ -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 3/7] KVM: x86: Allow the guest to run with dirty debug registers
2014-03-07 12:42+0100, Paolo Bonzini: When not running in guest-debug mode, the guest controls the debug registers and having to take an exit for each DR access is a waste of time. If the guest gets into a state where each context switch causes DR to be saved and restored, this can take away as much as 40% of the execution time from the guest. After this patch, VMX- and SVM-specific code can set a flag in switch_db_regs, telling vcpu_enter_guest that on the next exit the debug registers might be dirty and need to be reloaded (syncing will be taken care of by a new callback in kvm_x86_ops). This flag can be set on the first access to a debug registers, so that multiple accesses to the debug registers only cause one vmexit. Note that since the guest will be able to read debug registers and enable breakpoints in DR7, we need to ensure that they are synchronized on entry to the guest---including DR6 that was not synced before. Signed-off-by: Paolo Bonzini pbonz...@redhat.com --- arch/x86/include/asm/kvm_host.h | 2 ++ arch/x86/kvm/x86.c | 16 2 files changed, 18 insertions(+) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 5ef59d3b6c63..74eb361eaa8f 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -339,6 +339,7 @@ struct kvm_pmu { enum { KVM_DEBUGREG_BP_ENABLED = 1, + KVM_DEBUGREG_WONT_EXIT = 2, }; struct kvm_vcpu_arch { @@ -707,6 +708,7 @@ struct kvm_x86_ops { void (*set_gdt)(struct kvm_vcpu *vcpu, struct desc_ptr *dt); u64 (*get_dr6)(struct kvm_vcpu *vcpu); void (*set_dr6)(struct kvm_vcpu *vcpu, unsigned long value); + void (*sync_dirty_debug_regs)(struct kvm_vcpu *vcpu); void (*set_dr7)(struct kvm_vcpu *vcpu, unsigned long value); void (*cache_reg)(struct kvm_vcpu *vcpu, enum kvm_reg reg); unsigned long (*get_rflags)(struct kvm_vcpu *vcpu); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 252b47e85c69..c48818aa04c0 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -6033,12 
+6033,28 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) set_debugreg(vcpu-arch.eff_db[1], 1); set_debugreg(vcpu-arch.eff_db[2], 2); set_debugreg(vcpu-arch.eff_db[3], 3); + set_debugreg(vcpu-arch.dr6, 6); } trace_kvm_entry(vcpu-vcpu_id); kvm_x86_ops-run(vcpu); /* + * Do this here before restoring debug registers on the host. And + * since we do this before handling the vmexit, a DR access vmexit + * can (a) read the correct value of the debug registers, (b) set + * KVM_DEBUGREG_WONT_EXIT again. We re-enable intercepts on the next exit for the sake of simplicity? (Batched accesses make perfect sense, but it seems we don't have to care about DRs at all, without guest-debug.) + */ + if (unlikely(vcpu-arch.switch_db_regs KVM_DEBUGREG_WONT_EXIT)) { + int i; + + WARN_ON(vcpu-guest_debug KVM_GUESTDBG_USE_HW_BP); Is this possible? (I presumed that we vmexit before handling ioctls.) + kvm_x86_ops-sync_dirty_debug_regs(vcpu); Sneaky functionality ... it would be nicer to split 'enable DR intercepts' into a new kvm op. I think we want to disable them whenever we are not in guest-debug mode anyway, so it would be a pair. And we don't have to modify DR intercepts then, which is probably the main reason why sync_dirty_debug_regs() does two things. + for (i = 0; i KVM_NR_DB_REGS; i++) + vcpu-arch.eff_db[i] = vcpu-arch.db[i]; + } + + /* * If the guest has used debug registers, at least dr7 * will be disabled while returning to the host. * If we don't have active breakpoints in the host, we don't -- 1.8.3.1 -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/ -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 3/7] KVM: x86: Allow the guest to run with dirty debug registers
Il 09/03/2014 19:28, Radim Krčmář ha scritto: /* + * Do this here before restoring debug registers on the host. And + * since we do this before handling the vmexit, a DR access vmexit + * can (a) read the correct value of the debug registers, (b) set + * KVM_DEBUGREG_WONT_EXIT again. We re-enable intercepts on the next exit for the sake of simplicity? (Batched accesses make perfect sense, but it seems we don't have to care about DRs at all, without guest-debug.) It's not just for the sake of simplicity. Synchronizing debug registers on entry does have some cost, and it's required for running without debug register intercepts. You would incur this cost always, since no-guest-debug is actually the common case. We're well into diminishing returns; Alex timed this patch vs. one that completely ignores MOV DR exits and (to the surprise of both of us) this patch completely restored performance despite still having a few tens of thousands MOV DR exits/second. In the problematic case, each context switch will do a save and restore of debug registers; that's in total 13 debug register accesses (read 6 registers, write DR7 to disable all registers, write 6 registers including DR7 which enables breakpoints), all very close in time. It's quite likely that the final DR7 write is very expensive (e.g. it might have to flush the pipeline). Also, this test case must be very heavy on context switches, and each context switch already incurs a few exits (for example to inject the interrupt that causes it, to read the current time). A different optimization could be to skip the LOAD_DEBUG_CONTROLS vm-entry control if DR7 == host DR7 == 0x400 MOV DR exits are enabled. Not sure it's worthwhile though, and there's also the DEBUGCTL MSR to take into account. Better do these kinds of tweaks if they actually show up in the profile: Alex's testcase shows that when they do, the impact is massive. + kvm_x86_ops-sync_dirty_debug_regs(vcpu); Sneaky functionality ... 
it would be nicer to split 'enable DR intercepts' into a new kvm op. Why? Disable DR intercepts is already folded into the handler for MOV DR exits. And we don't have to modify DR intercepts then, which is probably the main reason why sync_dirty_debug_regs() does two things. Another is that indirect calls are relatively expensive and add complexity; in this case they would always be used back-to-back. Paolo -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 4/7] KVM: vmx: Allow the guest to run with dirty debug registers
Il 09/03/2014 19:26, Radim Krčmář ha scritto: + + cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); + cpu_based_vm_exec_control = ~CPU_BASED_MOV_DR_EXITING; + vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control); vmcs_clear_bits() covers exactly this use-case. (Barring the explicit bit-width.) Good idea. + + /* + * No more DR vmexits; force a reload of the debug registers + * and reenter on this instruction. The next vmexit will + * retrieve the full state of the debug registers. + */ + vcpu-arch.switch_db_regs |= KVM_DEBUGREG_WONT_EXIT; + return 1; + } + We could make the code slighly uglier and move the functional part of this block before the previous one, so it would do both things in one exit. I considered this, but decided that it's unlikely for emulation to be faster than hardware---especially on those AMD CPUs that lack decode assists (and it's good for VMX and SVM code to look as similar as possible). (Exception handler will likely access DR too.) Which exception handler? Paolo -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH qom-cpu v2 18/40] cpu: Move watchpoint fields from CPU_COMMON to CPUState
Signed-off-by: Andreas Färber afaer...@suse.de --- cpu-exec.c | 5 +++-- exec.c | 33 - gdbstub.c | 8 include/exec/cpu-defs.h | 10 -- include/qom/cpu.h | 10 ++ linux-user/main.c | 5 +++-- target-i386/cpu.h | 2 +- target-i386/helper.c| 7 --- target-i386/kvm.c | 8 target-lm32/cpu.h | 2 +- target-lm32/helper.c| 7 --- target-xtensa/cpu.h | 2 +- target-xtensa/helper.c | 8 +--- 13 files changed, 60 insertions(+), 47 deletions(-) diff --git a/cpu-exec.c b/cpu-exec.c index 798dc08..d7c21d3 100644 --- a/cpu-exec.c +++ b/cpu-exec.c @@ -200,10 +200,11 @@ void cpu_set_debug_excp_handler(CPUDebugExcpHandler *handler) static void cpu_handle_debug_exception(CPUArchState *env) { +CPUState *cpu = ENV_GET_CPU(env); CPUWatchpoint *wp; -if (!env-watchpoint_hit) { -QTAILQ_FOREACH(wp, env-watchpoints, entry) { +if (!cpu-watchpoint_hit) { +QTAILQ_FOREACH(wp, cpu-watchpoints, entry) { wp-flags = ~BP_WATCHPOINT_HIT; } } diff --git a/exec.c b/exec.c index 9b02c6a..cf4a0ef 100644 --- a/exec.c +++ b/exec.c @@ -485,7 +485,7 @@ void cpu_exec_init(CPUArchState *env) cpu-cpu_index = cpu_index; cpu-numa_node = 0; QTAILQ_INIT(env-breakpoints); -QTAILQ_INIT(env-watchpoints); +QTAILQ_INIT(cpu-watchpoints); #ifndef CONFIG_USER_ONLY cpu-as = address_space_memory; cpu-thread_id = qemu_get_thread_id(); @@ -542,6 +542,7 @@ int cpu_watchpoint_insert(CPUArchState *env, target_ulong addr, target_ulong len int cpu_watchpoint_insert(CPUArchState *env, target_ulong addr, target_ulong len, int flags, CPUWatchpoint **watchpoint) { +CPUState *cpu = ENV_GET_CPU(env); target_ulong len_mask = ~(len - 1); CPUWatchpoint *wp; @@ -559,10 +560,11 @@ int cpu_watchpoint_insert(CPUArchState *env, target_ulong addr, target_ulong len wp-flags = flags; /* keep all GDB-injected watchpoints in front */ -if (flags BP_GDB) -QTAILQ_INSERT_HEAD(env-watchpoints, wp, entry); -else -QTAILQ_INSERT_TAIL(env-watchpoints, wp, entry); +if (flags BP_GDB) { +QTAILQ_INSERT_HEAD(cpu-watchpoints, wp, entry); +} else { 
+QTAILQ_INSERT_TAIL(cpu-watchpoints, wp, entry); +} tlb_flush_page(env, addr); @@ -575,10 +577,11 @@ int cpu_watchpoint_insert(CPUArchState *env, target_ulong addr, target_ulong len int cpu_watchpoint_remove(CPUArchState *env, target_ulong addr, target_ulong len, int flags) { +CPUState *cpu = ENV_GET_CPU(env); target_ulong len_mask = ~(len - 1); CPUWatchpoint *wp; -QTAILQ_FOREACH(wp, env-watchpoints, entry) { +QTAILQ_FOREACH(wp, cpu-watchpoints, entry) { if (addr == wp-vaddr len_mask == wp-len_mask flags == (wp-flags ~BP_WATCHPOINT_HIT)) { cpu_watchpoint_remove_by_ref(env, wp); @@ -591,7 +594,9 @@ int cpu_watchpoint_remove(CPUArchState *env, target_ulong addr, target_ulong len /* Remove a specific watchpoint by reference. */ void cpu_watchpoint_remove_by_ref(CPUArchState *env, CPUWatchpoint *watchpoint) { -QTAILQ_REMOVE(env-watchpoints, watchpoint, entry); +CPUState *cpu = ENV_GET_CPU(env); + +QTAILQ_REMOVE(cpu-watchpoints, watchpoint, entry); tlb_flush_page(env, watchpoint-vaddr); @@ -601,9 +606,10 @@ void cpu_watchpoint_remove_by_ref(CPUArchState *env, CPUWatchpoint *watchpoint) /* Remove all matching watchpoints. */ void cpu_watchpoint_remove_all(CPUArchState *env, int mask) { +CPUState *cpu = ENV_GET_CPU(env); CPUWatchpoint *wp, *next; -QTAILQ_FOREACH_SAFE(wp, env-watchpoints, entry, next) { +QTAILQ_FOREACH_SAFE(wp, cpu-watchpoints, entry, next) { if (wp-flags mask) cpu_watchpoint_remove_by_ref(env, wp); } @@ -799,6 +805,7 @@ hwaddr memory_region_section_get_iotlb(CPUArchState *env, int prot, target_ulong *address) { +CPUState *cpu = ENV_GET_CPU(env); hwaddr iotlb; CPUWatchpoint *wp; @@ -818,7 +825,7 @@ hwaddr memory_region_section_get_iotlb(CPUArchState *env, /* Make accesses to pages with watchpoints go via the watchpoint trap routines. */ -QTAILQ_FOREACH(wp, env-watchpoints, entry) { +QTAILQ_FOREACH(wp, cpu-watchpoints, entry) { if (vaddr == (wp-vaddr TARGET_PAGE_MASK)) { /* Avoid trapping reads of pages with a write breakpoint. 
*/ if ((prot PAGE_WRITE) || (wp-flags BP_MEM_READ)) { @@ -1573,7 +1580,7 @@ static void check_watchpoint(int offset, int len_mask, int flags) CPUWatchpoint *wp; int cpu_flags; -if (env-watchpoint_hit) { +if (cpu-watchpoint_hit) { /* We re-entered the check after replacing the TB.
Re: [PATCH net V2] vhost: net: switch to use data copy if pending DMAs exceed the limit
On 2014/3/7 13:28, Jason Wang wrote: We used to stop the handling of tx when the number of pending DMAs exceeds VHOST_MAX_PEND. This is used to reduce the memory occupation of both host and guest. But it was too aggressive in some cases, since any delay or blocking of a single packet may delay or block the guest transmission. Consider the following setup: +-++-+ | VM1 || VM2 | +--+--++--+--+ | | +--+--++--+--+ | tap0|| tap1| +--+--++--+--+ | | pfifo_fast htb(10Mbit/s) | | +--+--+---+ | bridge | +--+--+ | pfifo_fast | +-+ | eth0|(100Mbit/s) +-+ - start two VMs and connect them to a bridge - add an physical card (100Mbit/s) to that bridge - setup htb on tap1 and limit its throughput to 10Mbit/s - run two netperfs in the same time, one is from VM1 to VM2. Another is from VM1 to an external host through eth0. - result shows that not only the VM1 to VM2 traffic were throttled but also the VM1 to external host through eth0 is also throttled somehow. This is because the delay added by htb may lead the delay the finish of DMAs and cause the pending DMAs for tap0 exceeds the limit (VHOST_MAX_PEND). In this case vhost stop handling tx request until htb send some packets. The problem here is all of the packets transmission were blocked even if it does not go to VM2. We can solve this issue by relaxing it a little bit: switching to use data copy instead of stopping tx when the number of pending DMAs exceed half of the vq size. This is safe because: - The number of pending DMAs were still limited (half of the vq size) - The out of order completion during mode switch can make sure that most of the tx buffers were freed in time in guest. So even if about 50% packets were delayed in zero-copy case, vhost could continue to do the transmission through data copy in this case. 
Test result: Before this patch: VM1 to VM2 throughput is 9.3Mbit/s VM1 to External throughput is 40Mbit/s CPU utilization is 7% After this patch: VM1 to VM2 throughput is 9.3Mbit/s Vm1 to External throughput is 93Mbit/s CPU utilization is 16% Completed performance test on 40gbe shows no obvious changes in both throughput and cpu utilization with this patch. The patch only solve this issue when unlimited sndbuf. We still need a solution for limited sndbuf. Cc: Michael S. Tsirkin m...@redhat.com Cc: Qin Chuanyu qinchua...@huawei.com Signed-off-by: Jason Wang jasow...@redhat.com --- Changes from V1: - Remove VHOST_MAX_PEND and switch to use half of the vq size as the limit - Add cpu utilization in commit log --- drivers/vhost/net.c | 19 +++ 1 file changed, 7 insertions(+), 12 deletions(-) diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c index a0fa5de..2925e9a 100644 --- a/drivers/vhost/net.c +++ b/drivers/vhost/net.c @@ -38,8 +38,6 @@ MODULE_PARM_DESC(experimental_zcopytx, Enable Zero Copy TX; * Using this limit prevents one virtqueue from starving others. */ #define VHOST_NET_WEIGHT 0x8 -/* MAX number of TX used buffers for outstanding zerocopy */ -#define VHOST_MAX_PEND 128 #define VHOST_GOODCOPY_LEN 256 /* @@ -345,7 +343,7 @@ static void handle_tx(struct vhost_net *net) .msg_flags = MSG_DONTWAIT, }; size_t len, total_len = 0; - int err; + int err, num_pends; size_t hdr_size; struct socket *sock; struct vhost_net_ubuf_ref *uninitialized_var(ubufs); @@ -366,13 +364,6 @@ static void handle_tx(struct vhost_net *net) if (zcopy) vhost_zerocopy_signal_used(net, vq); - /* If more outstanding DMAs, queue the work. -* Handle upend_idx wrap around -*/ - if (unlikely((nvq-upend_idx + vq-num - VHOST_MAX_PEND) - % UIO_MAXIOV == nvq-done_idx)) - break; - head = vhost_get_vq_desc(net-dev, vq, vq-iov, ARRAY_SIZE(vq-iov), out, in, @@ -405,9 +396,13 @@ static void handle_tx(struct vhost_net *net) break; } + num_pends = likely(nvq-upend_idx = nvq-done_idx) ? 
+ (nvq-upend_idx - nvq-done_idx) : + (nvq-upend_idx + UIO_MAXIOV - +nvq-done_idx); + zcopy_used = zcopy len = VHOST_GOODCOPY_LEN - (nvq-upend_idx + 1) % UIO_MAXIOV != - nvq-done_idx + num_pends = vq-num 1 vhost_net_tx_select_zcopy(net); /* use msg_control to pass vhost zerocopy ubuf info
Re: [PATCH net V2] vhost: net: switch to use data copy if pending DMAs exceed the limit
On 03/08/2014 05:39 AM, David Miller wrote: From: Jason Wang jasow...@redhat.com Date: Fri, 7 Mar 2014 13:28:27 +0800 This is because the delay added by htb may lead the delay the finish of DMAs and cause the pending DMAs for tap0 exceeds the limit (VHOST_MAX_PEND). In this case vhost stop handling tx request until htb send some packets. The problem here is all of the packets transmission were blocked even if it does not go to VM2. Isn't this essentially head of line blocking? Yes it is. We can solve this issue by relaxing it a little bit: switching to use data copy instead of stopping tx when the number of pending DMAs exceed half of the vq size. This is safe because: - The number of pending DMAs were still limited (half of the vq size) - The out of order completion during mode switch can make sure that most of the tx buffers were freed in time in guest. So even if about 50% packets were delayed in zero-copy case, vhost could continue to do the transmission through data copy in this case. Test result: Before this patch: VM1 to VM2 throughput is 9.3Mbit/s VM1 to External throughput is 40Mbit/s CPU utilization is 7% After this patch: VM1 to VM2 throughput is 9.3Mbit/s Vm1 to External throughput is 93Mbit/s CPU utilization is 16% Completed performance test on 40gbe shows no obvious changes in both throughput and cpu utilization with this patch. The patch only solve this issue when unlimited sndbuf. We still need a solution for limited sndbuf. Cc: Michael S. Tsirkin m...@redhat.com Cc: Qin Chuanyu qinchua...@huawei.com Signed-off-by: Jason Wang jasow...@redhat.com I'd like some vhost experts reviewing this before I apply it. Sure. -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 2/3] drivers/vfio/pci: Fix wrong MSI interrupt count
According to the PCI local bus specification, the Message Control register for MSI (offset: 2, length: 2) has bit#0 to enable or disable MSI logic, and that bit shouldn't contribute to the calculation of the MSI interrupt count. The patch fixes the issue. Signed-off-by: Gavin Shan sha...@linux.vnet.ibm.com --- drivers/vfio/pci/vfio_pci.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c index 7ba0424..6b8cd07 100644 --- a/drivers/vfio/pci/vfio_pci.c +++ b/drivers/vfio/pci/vfio_pci.c @@ -196,8 +196,7 @@ static int vfio_pci_get_irq_count(struct vfio_pci_device *vdev, int irq_type) if (pos) { pci_read_config_word(vdev->pdev, pos + PCI_MSI_FLAGS, &flags); - - return 1 << (flags & PCI_MSI_FLAGS_QMASK); + return 1 << ((flags & PCI_MSI_FLAGS_QMASK) >> 1); } } else if (irq_type == VFIO_PCI_MSIX_IRQ_INDEX) { u8 pos; -- 1.7.10.4 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 1/3] drivers/vfio: Rework offsetofend()
The macro offsetofend() introduces an unnecessary temporary variable tmp. The patch avoids that and saves a bit of stack memory. Signed-off-by: Gavin Shan sha...@linux.vnet.ibm.com --- include/linux/vfio.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/include/linux/vfio.h b/include/linux/vfio.h index 24579a0..43f6bf4 100644 --- a/include/linux/vfio.h +++ b/include/linux/vfio.h @@ -86,9 +86,8 @@ extern void vfio_unregister_iommu_driver( * from user space. This allows us to easily determine if the provided * structure is sized to include various fields. */ -#define offsetofend(TYPE, MEMBER) ({ \ - TYPE tmp; \ - offsetof(TYPE, MEMBER) + sizeof(tmp.MEMBER); }) \ +#define offsetofend(TYPE, MEMBER) \ + (offsetof(TYPE, MEMBER) + sizeof(((TYPE *)0)->MEMBER)) /* * External user API -- 1.7.10.4 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 0/3] VFIO Bug Fixes
v1 - v2: * Don't change the name of variable flags in [PATCH 2/3]. * Comment and commit log cleanup in [PATCH 3/3]. Gavin Shan (3): drivers/vfio: Rework offsetofend() drivers/vfio/pci: Fix wrong MSI interrupt count drivers/vfio/pci: Fix MSIx message lost --- drivers/vfio/pci/vfio_pci.c |3 +-- drivers/vfio/pci/vfio_pci_intrs.c | 11 +++ include/linux/vfio.h |5 ++--- 3 files changed, 14 insertions(+), 5 deletions(-) Thanks, Gavin -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 3/3] drivers/vfio/pci: Fix MSIx message lost
The problem is specific to the case of a BIST reset issued to an IPR adapter on the guest side. The IPR driver calls pci_save_state(), issues a BIST reset and then pci_restore_state(). The issued BIST cleans out the MSIx table and pci_restore_state() doesn't restore the MSIx table as expected. Eventually, MSIx messages with all zeros are sent and cause an EEH error on the Power platform. The patch fixes it by writing the MSIx message to the MSIx table before reenabling the individual interrupts in the following path: qemu/hw/pci/msix.c::msix_table_mmio_write msix_handle_mask_update msix_fire_vector_notifier qemu/hw/misc/vfio.c::vfio_msix_vector_use vfio_msix_vector_do_use IOCTL Command VFIO_DEVICE_SET_IRQS to VFIO-PCI vfio/pci/vfio_pci.c::vfio_pci_ioctl vfio/pci/vfio_pci_intrs.c::vfio_pci_set_irqs_ioctl vfio_pci_set_msi_trigger vfio_msi_set_block vfio_msi_set_vector_signal Reported-by: Wen Xiong wenxi...@us.ibm.com Signed-off-by: Gavin Shan sha...@linux.vnet.ibm.com --- drivers/vfio/pci/vfio_pci_intrs.c | 11 +++ 1 file changed, 11 insertions(+) diff --git a/drivers/vfio/pci/vfio_pci_intrs.c b/drivers/vfio/pci/vfio_pci_intrs.c index 2103576..83e0638 100644 --- a/drivers/vfio/pci/vfio_pci_intrs.c +++ b/drivers/vfio/pci/vfio_pci_intrs.c @@ -17,6 +17,7 @@ #include <linux/interrupt.h> #include <linux/eventfd.h> #include <linux/pci.h> +#include <linux/msi.h> #include <linux/file.h> #include <linux/poll.h> #include <linux/vfio.h> @@ -517,6 +518,7 @@ static int vfio_msi_set_vector_signal(struct vfio_pci_device *vdev, struct pci_dev *pdev = vdev->pdev; int irq = msix ? vdev->msix[vector].vector : pdev->irq + vector; char *name = msix ? "vfio-msix" : "vfio-msi"; + struct msi_msg msg; struct eventfd_ctx *trigger; int ret; @@ -544,6 +546,15 @@ static int vfio_msi_set_vector_signal(struct vfio_pci_device *vdev, return PTR_ERR(trigger); } + /* We possibly lose the MSI/MSIx message in some cases, one +* of which is pci_save_state(), BIST reset and pci_restore_state() +* for IPR adapter. 
The BIST reset cleans out the MSIx table and we +* don't have a chance to restore it. Here, we have the trick to +* restore it before enabling individual interrupts. For MSI messages, +* it's harmless to write them back. +*/ + get_cached_msi_msg(irq, &msg); + write_msi_msg(irq, &msg); ret = request_irq(irq, vfio_msihandler, 0, vdev->ctx[vector].name, trigger); if (ret) { -- 1.7.10.4 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html