KVM's "Lazy FPU loading" means that sometimes L0 needs to set CR0.TS, even
if a guest didn't set it. Moreover, L0 must also trap CR0.TS changes and
NM exceptions, even if the guest hypervisor (L1) did not ask for these
traps. Conversely, if L1 wants to trap these events, we must let it,
even if L0 is not interested in them.
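
To illustrate the merging rule, here is a minimal standalone sketch (not
the actual KVM code; the helper name and parameters are hypothetical):
an exception must trap if either L0 or L1 wants it, so the bitmap
programmed into the hardware VMCS is the bitwise OR of the two.

	#include <stdint.h>

	#define NM_VECTOR 7	/* #NM, device-not-available (lazy FPU) */

	/* Hypothetical helper: the exception bitmap to program into the
	 * hardware VMCS while L2 runs. An exception traps if either L0
	 * or L1 asked for it, so the result is the union of the two
	 * bitmaps. If L0's FPU state is loaded, L0 itself no longer
	 * needs #NM, but L1's request for #NM (if any) still applies.
	 */
	static uint32_t merged_exception_bitmap(uint32_t l0_eb,
						uint32_t l1_eb,
						int l0_fpu_active)
	{
		if (l0_fpu_active)
			l0_eb &= ~(1u << NM_VECTOR);
		return l0_eb | l1_eb;
	}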

This patch fixes some existing KVM code (in update_exception_bitmap(),
vmx_fpu_activate(), vmx_fpu_deactivate(), handle_cr()) to do the correct
merging of L0's and L1's needs. Note that new code introduced in previous
patches already handles CR0 correctly (see prepare_vmcs_02(),
prepare_vmcs_12(), and nested_vmx_vmexit()).
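
The CR0 side of the merge works the other way around. Again a hedged
sketch with hypothetical names, mirroring the cr0_guest_owned_bits
update in vmx_fpu_activate() in the diff below: a CR0 bit may be
guest-owned (changed by L2 without an exit) only if neither L0 nor L1
wants to intercept it, so the owned bits are ANDed, not ORed.

	/* Hypothetical helper: which CR0 bits L2 may change freely.
	 * A bit is guest-owned only if L0 does not intercept it
	 * (l0_owned) and L1 did not set it in its cr0_guest_host_mask;
	 * CR0_GUEST_HOST_MASK is then written as the complement of
	 * the result.
	 */
	static unsigned long
	merged_cr0_guest_owned(unsigned long l0_owned,
			       unsigned long l1_guest_host_mask)
	{
		return l0_owned & ~l1_guest_host_mask;
	}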

Signed-off-by: Nadav Har'El <n...@il.ibm.com>
---
--- .before/arch/x86/kvm/vmx.c  2010-06-13 15:01:30.000000000 +0300
+++ .after/arch/x86/kvm/vmx.c   2010-06-13 15:01:30.000000000 +0300
@@ -1144,6 +1144,27 @@ static void update_exception_bitmap(stru
                eb &= ~(1u << PF_VECTOR); /* bypass_guest_pf = 0 */
        if (vcpu->fpu_active)
                eb &= ~(1u << NM_VECTOR);
+
+       /* When we are running a nested L2 guest and L1 specified an
+        * exception bitmap for it, we must trap the same exceptions and
+        * forward them to L1. When running L2, we only handle the
+        * exceptions listed above if L1 did not want them.
+        */
+       if (to_vmx(vcpu)->nested.nested_mode) {
+               u32 nested_eb;
+               if (to_vmx(vcpu)->nested.current_l2_page)
+                       nested_eb = get_shadow_vmcs(vcpu)->exception_bitmap;
+               else {
+                       if (!nested_map_current(vcpu)) {
+                               to_vmx(vcpu)->fail = 1;
+                               return;
+                       }
+                       nested_eb = get_shadow_vmcs(vcpu)->exception_bitmap;
+                       nested_unmap_current(vcpu);
+               }
+               eb |= nested_eb;
+       }
+
        vmcs_write32(EXCEPTION_BITMAP, eb);
 }
 
@@ -1488,8 +1509,25 @@ static void vmx_fpu_activate(struct kvm_
        cr0 &= ~(X86_CR0_TS | X86_CR0_MP);
        cr0 |= kvm_read_cr0_bits(vcpu, X86_CR0_TS | X86_CR0_MP);
        vmcs_writel(GUEST_CR0, cr0);
-       update_exception_bitmap(vcpu);
        vcpu->arch.cr0_guest_owned_bits = X86_CR0_TS;
+       if (to_vmx(vcpu)->nested.nested_mode) {
+               /* While we (L0) no longer care about NM exceptions or cr0.TS
+                * changes, our guest hypervisor (L1) might care, in which case
+                * we must trap them for it.
+                */
+               u32 eb = vmcs_read32(EXCEPTION_BITMAP) & ~(1u << NM_VECTOR);
+               struct shadow_vmcs *vmcs12;
+               if (!nested_map_current(vcpu)) {
+                       to_vmx(vcpu)->fail = 1;
+                       return;
+               }
+               vmcs12 = get_shadow_vmcs(vcpu);
+               eb |= vmcs12->exception_bitmap;
+               vcpu->arch.cr0_guest_owned_bits &= ~vmcs12->cr0_guest_host_mask;
+               nested_unmap_current(vcpu);
+               vmcs_write32(EXCEPTION_BITMAP, eb);
+       } else
+               update_exception_bitmap(vcpu);
        vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits);
 }
 
@@ -1497,12 +1535,24 @@ static void vmx_decache_cr0_guest_bits(s
 
 static void vmx_fpu_deactivate(struct kvm_vcpu *vcpu)
 {
+       /* Note that there is no vcpu->fpu_active = 0 here. The caller must
+        * set this *before* calling this function.
+        */
        vmx_decache_cr0_guest_bits(vcpu);
        vmcs_set_bits(GUEST_CR0, X86_CR0_TS | X86_CR0_MP);
-       update_exception_bitmap(vcpu);
+       vmcs_write32(EXCEPTION_BITMAP,
+               vmcs_read32(EXCEPTION_BITMAP) | (1u << NM_VECTOR));
        vcpu->arch.cr0_guest_owned_bits = 0;
        vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits);
-       vmcs_writel(CR0_READ_SHADOW, vcpu->arch.cr0);
+       if (to_vmx(vcpu)->nested.nested_mode)
+               /* Unfortunately in nested mode we play with arch.cr0's PG
+                * bit, so we mustn't copy it all, just the relevant TS bit
+                */
+               vmcs_writel(CR0_READ_SHADOW,
+                       (vmcs_readl(CR0_READ_SHADOW) & ~X86_CR0_TS) |
+                       (vcpu->arch.cr0 & X86_CR0_TS));
+       else
+               vmcs_writel(CR0_READ_SHADOW, vcpu->arch.cr0);
 }
 
 static unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu)
@@ -3998,6 +4048,53 @@ vmx_patch_hypercall(struct kvm_vcpu *vcp
        hypercall[2] = 0xc1;
 }
 
+/* called to set cr0 as appropriate for a mov-to-cr0 exit. */
+static void handle_set_cr0(struct kvm_vcpu *vcpu, unsigned long val)
+{
+       if (to_vmx(vcpu)->nested.nested_mode) {
+               /* When running L2, we usually do what L1 wants: it decides
+                * which cr0 bits to intercept, and we forward cr0-change events
+                * (see nested_vmx_exit_handled()). We only get here when a cr0
+                * bit was changed that L1 did not ask to intercept, but L0
+                * nevertheless did. Currently this can only happen with the TS
+                * bit (see CR0_GUEST_HOST_MASK in prepare_vmcs_02()).
+                * We must change only this bit in GUEST_CR0 and CR0_READ_SHADOW
+                * and not call kvm_set_cr0 because it enforces a relationship
+                * between the two that is specific to KVM (i.e., only the TS
+                * bit might differ) and with which L1 might not agree.
+                */
+               unsigned long new_cr0 = vmcs_readl(GUEST_CR0);
+               unsigned long new_cr0_rs = vmcs_readl(CR0_READ_SHADOW);
+               if (val & X86_CR0_TS) {
+                       new_cr0 |= X86_CR0_TS;
+                       new_cr0_rs |= X86_CR0_TS;
+                       vcpu->arch.cr0 |= X86_CR0_TS;
+               } else {
+                       new_cr0 &= ~X86_CR0_TS;
+                       new_cr0_rs &= ~X86_CR0_TS;
+                       vcpu->arch.cr0 &= ~X86_CR0_TS;
+               }
+               vmcs_writel(GUEST_CR0, new_cr0);
+               vmcs_writel(CR0_READ_SHADOW, new_cr0_rs);
+               to_vmx(vcpu)->nested.nested_run_pending = 1;
+       } else
+               kvm_set_cr0(vcpu, val);
+}
+
+/* called to set cr0 as appropriate for a clts instruction exit. */
+static void handle_clts(struct kvm_vcpu *vcpu)
+{
+       if (to_vmx(vcpu)->nested.nested_mode) {
+               /* As in handle_set_cr0(), we can't call vmx_set_cr0 here */
+               vmcs_writel(GUEST_CR0, vmcs_readl(GUEST_CR0) & ~X86_CR0_TS);
+               vmcs_writel(CR0_READ_SHADOW,
+                               vmcs_readl(CR0_READ_SHADOW) & ~X86_CR0_TS);
+               vcpu->arch.cr0 &= ~X86_CR0_TS;
+               to_vmx(vcpu)->nested.nested_run_pending = 1;
+       } else
+               vmx_set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~X86_CR0_TS));
+}
+
 static int handle_cr(struct kvm_vcpu *vcpu)
 {
        unsigned long exit_qualification, val;
@@ -4013,7 +4110,7 @@ static int handle_cr(struct kvm_vcpu *vc
                trace_kvm_cr_write(cr, val);
                switch (cr) {
                case 0:
-                       kvm_set_cr0(vcpu, val);
+                       handle_set_cr0(vcpu, val);
                        skip_emulated_instruction(vcpu);
                        return 1;
                case 3:
@@ -4039,7 +4136,7 @@ static int handle_cr(struct kvm_vcpu *vc
                };
                break;
        case 2: /* clts */
-               vmx_set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~X86_CR0_TS));
+               handle_clts(vcpu);
                trace_kvm_cr_write(0, kvm_read_cr0(vcpu));
                skip_emulated_instruction(vcpu);
                vmx_fpu_activate(vcpu);