Older VMX supporting CPUs do not provide the "Virtual NMI" feature for
tracking the NMI-blocked state after injecting such events. For now
KVM is unable to inject NMIs on those CPUs.

Derived from Sheng Yang's suggestion to use the IRQ window notification
for detecting the end of NMI handlers, this patch implements virtual
NMI support without impact on the host's ability to receive real NMIs.
The downside is that the given approach requires some heuristics that
can cause NMI nesting in very rare corner cases.

The approach works as follows:
 - check if the guest will receive the next NMI via an interrupt gate
   (i.e. handler will have interrupts disabled), reject injection if not
 - inject NMI and set a software-based NMI-blocked flag
 - arm the IRQ window start notification whenever an NMI window is
   requested
 - if the guest exits due to an opening IRQ window, clear the emulated
   NMI-blocked flag
 - if the guest net execution time with NMI-blocked but without an IRQ
   window exceeds 1 second, force NMI-blocked reset and inject anyway

This approach covers most practical scenarios:
 - succeeding NMIs are separated by at least one open IRQ window
 - the guest may spin with IRQs disabled (e.g. due to a bug), but
   leaving the NMI handler takes much less time than one second
 - the guest does not rely on strict ordering or timing of NMIs
   (would be problematic in virtualized environments anyway)

Successfully tested with the 'nmi n' monitor command, the kgdbts
testsuite on smp guests (additional patches required to add debug
register support to kvm), the kernel's nmi_watchdog=1, and a Siemens-
specific board emulation (+ guest) that comes with its own NMI
watchdog mechanism.

Signed-off-by: Jan Kiszka <[EMAIL PROTECTED]>
---
 arch/x86/kvm/vmx.c |  173 ++++++++++++++++++++++++++++++++++++-----------------
 1 file changed, 120 insertions(+), 53 deletions(-)

Index: b/arch/x86/kvm/vmx.c
===================================================================
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -90,6 +90,11 @@ struct vcpu_vmx {
        } rmode;
        int vpid;
        bool emulation_required;
+
+       /* Support for vnmi-less CPUs */
+       int soft_vnmi_blocked;
+       ktime_t entry_time;
+       s64 vnmi_blocked_time;
 };
 
 static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu)
@@ -2331,6 +2336,29 @@ out:
        return ret;
 }
 
+static void enable_irq_window(struct kvm_vcpu *vcpu)
+{
+       u32 cpu_based_vm_exec_control;
+
+       cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
+       cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_INTR_PENDING;
+       vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
+}
+
+static void enable_nmi_window(struct kvm_vcpu *vcpu)
+{
+       u32 cpu_based_vm_exec_control;
+
+       if (!cpu_has_virtual_nmis()) {
+               enable_irq_window(vcpu);
+               return;
+       }
+
+       cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
+       cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_NMI_PENDING;
+       vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
+}
+
 static void vmx_inject_irq(struct kvm_vcpu *vcpu, int irq)
 {
        struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -2356,6 +2384,29 @@ static void vmx_inject_nmi(struct kvm_vc
 {
        struct vcpu_vmx *vmx = to_vmx(vcpu);
 
+       if (!cpu_has_virtual_nmis()) {
+               int desc_size = is_long_mode(vcpu) ? 16 : 8;
+               struct descriptor_table dt;
+               gpa_t gpa;
+               u64 desc;
+
+               /*
+                * Deny delivery if the NMI will not be handled by an
+                * interrupt gate (workaround depends on IRQ masking).
+                */
+               vmx_get_idt(vcpu, &dt);
+               if (!vcpu->arch.rmode.active && dt.limit
+                   >= desc_size * (NMI_VECTOR + 1) - 1) {
+                       gpa = vcpu->arch.mmu.gva_to_gpa(vcpu,
+                                       dt.base + desc_size * NMI_VECTOR);
+                       if (kvm_read_guest(vcpu->kvm, gpa, &desc, 8) == 0
+                           && ((desc >> 40) & 0x7) != 0x6)
+                               return;
+               }
+               vmx->soft_vnmi_blocked = 1;
+               vmx->vnmi_blocked_time = 0;
+       }
+
        ++vcpu->stat.nmi_injections;
        if (vcpu->arch.rmode.active) {
                vmx->rmode.irq.pending = true;
@@ -2374,6 +2425,7 @@ static void vmx_inject_nmi(struct kvm_vc
 
 static void vmx_update_window_states(struct kvm_vcpu *vcpu)
 {
+       struct vcpu_vmx *vmx = to_vmx(vcpu);
        u32 guest_intr = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
 
        vcpu->arch.nmi_window_open =
@@ -2385,6 +2437,13 @@ static void vmx_update_window_states(str
                ((vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) &&
                 !(guest_intr & (GUEST_INTR_STATE_STI |
                                 GUEST_INTR_STATE_MOV_SS)));
+
+       if (!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked) {
+               if (vcpu->arch.interrupt_window_open)
+                       vmx->soft_vnmi_blocked = 0;
+               else
+                       vcpu->arch.nmi_window_open = 0;
+       }
 }
 
 static void kvm_do_inject_irq(struct kvm_vcpu *vcpu)
@@ -2399,51 +2458,28 @@ static void kvm_do_inject_irq(struct kvm
        kvm_queue_interrupt(vcpu, irq);
 }
 
-static void enable_irq_window(struct kvm_vcpu *vcpu)
-{
-       u32 cpu_based_vm_exec_control;
-
-       cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
-       cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_INTR_PENDING;
-       vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
-}
-
-static void enable_nmi_window(struct kvm_vcpu *vcpu)
-{
-       u32 cpu_based_vm_exec_control;
-
-       if (!cpu_has_virtual_nmis())
-               return;
-
-       cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
-       cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_NMI_PENDING;
-       vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
-}
-
 static void do_interrupt_requests(struct kvm_vcpu *vcpu,
                                       struct kvm_run *kvm_run)
 {
        vmx_update_window_states(vcpu);
 
-       if (cpu_has_virtual_nmis()) {
-               if (vcpu->arch.nmi_pending && !vcpu->arch.nmi_injected) {
-                       if (vcpu->arch.nmi_window_open) {
-                               vcpu->arch.nmi_pending = false;
-                               vcpu->arch.nmi_injected = true;
-                       } else {
-                               enable_nmi_window(vcpu);
-                               return;
-                       }
-               }
-               if (vcpu->arch.nmi_injected) {
-                       vmx_inject_nmi(vcpu);
-                       if (vcpu->arch.nmi_pending)
-                               enable_nmi_window(vcpu);
-                       else if (vcpu->arch.irq_summary)
-                               enable_irq_window(vcpu);
+       if (vcpu->arch.nmi_pending && !vcpu->arch.nmi_injected) {
+               if (vcpu->arch.nmi_window_open) {
+                       vcpu->arch.nmi_pending = false;
+                       vcpu->arch.nmi_injected = true;
+               } else {
+                       enable_nmi_window(vcpu);
                        return;
                }
        }
+       if (vcpu->arch.nmi_injected) {
+               vmx_inject_nmi(vcpu);
+               if (vcpu->arch.nmi_pending)
+                       enable_nmi_window(vcpu);
+               else if (vcpu->arch.irq_summary)
+                       enable_irq_window(vcpu);
+               return;
+       }
 
        if (vcpu->arch.interrupt_window_open) {
                if (vcpu->arch.irq_summary && !vcpu->arch.interrupt.pending)
@@ -2813,6 +2849,7 @@ static int handle_tpr_below_threshold(st
 static int handle_interrupt_window(struct kvm_vcpu *vcpu,
                                   struct kvm_run *kvm_run)
 {
+       struct vcpu_vmx *vmx = to_vmx(vcpu);
        u32 cpu_based_vm_exec_control;
 
        /* clear pending irq */
@@ -2823,6 +2860,19 @@ static int handle_interrupt_window(struc
        KVMTRACE_0D(PEND_INTR, vcpu, handler);
        ++vcpu->stat.irq_window_exits;
 
+       if (!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked) {
+               vmx->soft_vnmi_blocked = 0;
+
+               /*
+                * If the user space waits to inject an NMI, exit ASAP
+                */
+               if (kvm_run->request_nmi_window && !vcpu->arch.nmi_pending) {
+                       kvm_run->exit_reason = KVM_EXIT_NMI_WINDOW_OPEN;
+                       ++vcpu->stat.nmi_window_exits;
+                       return 0;
+               }
+       }
+
        /*
         * If the user space waits to inject interrupts, exit as soon as
         * possible
@@ -3116,6 +3166,21 @@ static void vmx_complete_interrupts(stru
                if (unblock_nmi && vector != DF_VECTOR)
                        vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
                                      GUEST_INTR_STATE_NMI);
+       } else if (unlikely(vmx->soft_vnmi_blocked)) {
+               vmx->vnmi_blocked_time +=
+                       ktime_to_ns(ktime_sub(ktime_get(), vmx->entry_time));
+               if (vmx->vnmi_blocked_time > 1000000000LL) {
+                       /*
+                        * This CPU doesn't support us in finding the end of an
+                        * NMI-blocked window if the guest runs with IRQs
+                        * disabled. So we pull the trigger after 1 s of
+                        * futile waiting, but inform the user about this.
+                        */
+                       vmx->soft_vnmi_blocked = 0;
+                       vmx->vcpu.arch.nmi_window_open = 1;
+                       printk(KERN_WARNING "%s: Breaking out of NMI-blocked "
+                              "state after 1 s timeout\n", __func__);
+               }
        }
 
        idt_vectoring_info = vmx->idt_vectoring_info;
@@ -3156,25 +3221,23 @@ static void vmx_intr_assist(struct kvm_v
 
        vmx_update_window_states(vcpu);
 
-       if (cpu_has_virtual_nmis()) {
-               if (vcpu->arch.nmi_pending && !vcpu->arch.nmi_injected) {
-                       if (vcpu->arch.nmi_window_open) {
-                               vcpu->arch.nmi_pending = false;
-                               vcpu->arch.nmi_injected = true;
-                       } else {
-                               enable_nmi_window(vcpu);
-                               return;
-                       }
-               }
-               if (vcpu->arch.nmi_injected) {
-                       vmx_inject_nmi(vcpu);
-                       if (vcpu->arch.nmi_pending)
-                               enable_nmi_window(vcpu);
-                       else if (kvm_cpu_has_interrupt(vcpu))
-                               enable_irq_window(vcpu);
+       if (vcpu->arch.nmi_pending && !vcpu->arch.nmi_injected) {
+               if (vcpu->arch.nmi_window_open) {
+                       vcpu->arch.nmi_pending = false;
+                       vcpu->arch.nmi_injected = true;
+               } else {
+                       enable_nmi_window(vcpu);
                        return;
                }
        }
+       if (vcpu->arch.nmi_injected) {
+               vmx_inject_nmi(vcpu);
+               if (vcpu->arch.nmi_pending)
+                       enable_nmi_window(vcpu);
+               else if (kvm_cpu_has_interrupt(vcpu))
+                       enable_irq_window(vcpu);
+               return;
+       }
        if (!vcpu->arch.interrupt.pending && kvm_cpu_has_interrupt(vcpu)) {
                if (vcpu->arch.interrupt_window_open)
                        kvm_queue_interrupt(vcpu, kvm_cpu_get_interrupt(vcpu));
@@ -3223,6 +3286,10 @@ static void vmx_vcpu_run(struct kvm_vcpu
        struct vcpu_vmx *vmx = to_vmx(vcpu);
        u32 intr_info;
 
+       /* Record the guest's net vcpu time for enforced NMI injections. */
+       if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked))
+               vmx->entry_time = ktime_get();
+
        /* Handle invalid guest state instead of entering VMX */
        if (vmx->emulation_required && emulate_invalid_guest_state) {
                handle_invalid_guest_state(vcpu, kvm_run);

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Reply via email to