From: Orit Wasserman <[email protected]>
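
Add initial nested VMX support: handle the VMXON and VMXOFF instructions
executed by an L1 guest, report a filtered set of VMX capability MSRs to
the guest, allow CR4.VMXE to be set when nesting is enabled, and move the
'nested' module parameter from svm.c to x86.c so it is shared by SVM and
VMX.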

---
 arch/x86/kvm/svm.c |    3 -
 arch/x86/kvm/vmx.c |  265 +++++++++++++++++++++++++++++++++++++++++++++++++++-
 arch/x86/kvm/x86.c |   11 ++-
 arch/x86/kvm/x86.h |    2 +
 4 files changed, 274 insertions(+), 7 deletions(-)

diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 3de0b37..3f63cdd 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -121,9 +121,6 @@ static int npt = 1;
 
 module_param(npt, int, S_IRUGO);
 
-static int nested = 1;
-module_param(nested, int, S_IRUGO);
-
 static void svm_flush_tlb(struct kvm_vcpu *vcpu);
 static void svm_complete_interrupts(struct vcpu_svm *svm);
 
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 9a0a2cf..2726a6c 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -92,6 +92,16 @@ struct shared_msr_entry {
        u64 mask;
 };
 
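+/* Saved VMX state for one guest level (currently empty). */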
+struct __attribute__ ((__packed__)) level_state {
+};
+
+struct nested_vmx {
+       /* Has the L1 guest executed VMXON? */
+       bool vmxon;
+       /* Level 1 state for switching to level 2 and back */
+       struct level_state *l1_state;
+};
+
 struct vcpu_vmx {
        struct kvm_vcpu       vcpu;
        struct list_head      local_vcpus_link;
@@ -136,6 +146,9 @@ struct vcpu_vmx {
        ktime_t entry_time;
        s64 vnmi_blocked_time;
        u32 exit_reason;
+
+       /* Nested vmx */
+       struct nested_vmx nested;
 };
 
 static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu)
@@ -201,6 +214,7 @@ static struct kvm_vmx_segment_field {
 static u64 host_efer;
 
 static void ept_save_pdptrs(struct kvm_vcpu *vcpu);
+static int create_l1_state(struct kvm_vcpu *vcpu);
 
 /*
  * Keep MSR_K6_STAR at the end, as setup_msrs() will try to optimize it
@@ -961,6 +975,95 @@ static void guest_write_tsc(u64 guest_tsc, u64 host_tsc)
 }
 
 /*
+ * Handles MSR reads for nested virtualization.
+ */
+static int nested_vmx_get_msr(struct kvm_vcpu *vcpu, u32 msr_index,
+                             u64 *pdata)
+{
+       u64 vmx_msr = 0;
+
+       switch (msr_index) {
+       case MSR_IA32_FEATURE_CONTROL:
+               *pdata = 0;
+               break;
+       case MSR_IA32_VMX_BASIC:
+               *pdata = 0;
+               rdmsrl(MSR_IA32_VMX_BASIC, vmx_msr);
+               *pdata = (vmx_msr & 0x00ffffcfffffffff);
+               break;
+       case MSR_IA32_VMX_PINBASED_CTLS:
+               rdmsrl(MSR_IA32_VMX_PINBASED_CTLS, vmx_msr);
+               *pdata = (PIN_BASED_EXT_INTR_MASK & vmcs_config.pin_based_exec_ctrl) |
+                       (PIN_BASED_NMI_EXITING & vmcs_config.pin_based_exec_ctrl) |
+                       (PIN_BASED_VIRTUAL_NMIS & vmcs_config.pin_based_exec_ctrl);
+               break;
+       case MSR_IA32_VMX_PROCBASED_CTLS:
+       {
+               u32 vmx_msr_high, vmx_msr_low;
+               u32 control = CPU_BASED_HLT_EXITING |
+#ifdef CONFIG_X86_64
+                       CPU_BASED_CR8_LOAD_EXITING |
+                       CPU_BASED_CR8_STORE_EXITING |
+#endif
+                       CPU_BASED_CR3_LOAD_EXITING |
+                       CPU_BASED_CR3_STORE_EXITING |
+                       CPU_BASED_USE_IO_BITMAPS |
+                       CPU_BASED_MOV_DR_EXITING |
+                       CPU_BASED_USE_TSC_OFFSETING |
+                       CPU_BASED_INVLPG_EXITING |
+                       CPU_BASED_TPR_SHADOW |
+                       CPU_BASED_USE_MSR_BITMAPS |
+                       CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
+
+               rdmsr(MSR_IA32_VMX_PROCBASED_CTLS, vmx_msr_low, vmx_msr_high);
+
+               control &= vmx_msr_high; /* bit == 0 in high word ==> must be zero */
+               control |= vmx_msr_low;  /* bit == 1 in low word  ==> must be one  */
+
+               *pdata = (CPU_BASED_HLT_EXITING & control) |
+#ifdef CONFIG_X86_64
+                       (CPU_BASED_CR8_LOAD_EXITING & control) |
+                       (CPU_BASED_CR8_STORE_EXITING & control) |
+#endif
+                       (CPU_BASED_CR3_LOAD_EXITING & control) |
+                       (CPU_BASED_CR3_STORE_EXITING & control) |
+                       (CPU_BASED_USE_IO_BITMAPS & control) |
+                       (CPU_BASED_MOV_DR_EXITING & control) |
+                       (CPU_BASED_USE_TSC_OFFSETING & control) |
+                       (CPU_BASED_INVLPG_EXITING & control);
+
+               if (cpu_has_secondary_exec_ctrls())
+                       *pdata |= CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
+
+               if (vm_need_tpr_shadow(vcpu->kvm))
+                       *pdata |= CPU_BASED_TPR_SHADOW;
+               break;
+       }
+       case MSR_IA32_VMX_EXIT_CTLS:
+               *pdata = 0;
+#ifdef CONFIG_X86_64
+               *pdata |= VM_EXIT_HOST_ADDR_SPACE_SIZE;
+#endif
+               break;
+       case MSR_IA32_VMX_ENTRY_CTLS:
+               *pdata = 0;
+               break;
+       case MSR_IA32_VMX_PROCBASED_CTLS2:
+               *pdata = 0;
+               if (vm_need_virtualize_apic_accesses(vcpu->kvm))
+                       *pdata |= SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
+               break;
+       case MSR_IA32_VMX_EPT_VPID_CAP:
+               *pdata = 0;
+               break;
+       default:
+               return 1;
+       }
+
+       return 0;
+}
+
+/*
  * Reads an msr value (of 'msr_index') into 'pdata'.
  * Returns 0 on success, non-0 otherwise.
  * Assumes vcpu_load() was already called.
@@ -1004,6 +1107,9 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
                break;
        default:
                vmx_load_host_state(to_vmx(vcpu));
+               if (nested &&
+                   !nested_vmx_get_msr(vcpu, msr_index, &data))
+                       break;
                msr = find_msr_entry(to_vmx(vcpu), msr_index);
                if (msr) {
                        vmx_load_host_state(to_vmx(vcpu));
@@ -1018,6 +1124,27 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
 }
 
 /*
+ * Handles MSR writes for nested virtualization.
+ * Returns 0 on success, non-0 otherwise.
+ */
+static int nested_vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
+{
+       switch (msr_index) {
+       case MSR_IA32_FEATURE_CONTROL:
+               if ((data & (FEATURE_CONTROL_LOCKED |
+                            FEATURE_CONTROL_VMXON_ENABLED))
+                   != (FEATURE_CONTROL_LOCKED |
+                       FEATURE_CONTROL_VMXON_ENABLED))
+                       return 1;
+               break;
+       default:
+               return 1;
+       }
+
+       return 0;
+}
+
+/*
  * Writes msr value into into the appropriate "register".
  * Returns 0 on success, non-0 otherwise.
  * Assumes vcpu_load() was already called.
@@ -1067,6 +1194,9 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
                }
                /* Otherwise falls through to kvm_set_msr_common */
        default:
+               if (nested &&
+                   !nested_vmx_set_msr(vcpu, msr_index, data))
+                       break;
                msr = find_msr_entry(vmx, msr_index);
                if (msr) {
                        vmx_load_host_state(vmx);
@@ -1163,6 +1293,31 @@ static void vmclear_local_vcpus(void)
                __vcpu_clear(vmx);
 }
 
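+/* Allocate a zero-initialized level_state structure. */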
+static struct level_state *create_state(void)
+{
+       struct level_state *state = NULL;
+
+       state = kzalloc(sizeof(struct level_state), GFP_KERNEL);
+       if (!state) {
+               printk(KERN_INFO "Error creating level state\n");
+               return NULL;
+       }
+       return state;
+}
+
+static int create_l1_state(struct kvm_vcpu *vcpu)
+{
+       struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+       if (!vmx->nested.l1_state) {
+               vmx->nested.l1_state = create_state();
+               if (!vmx->nested.l1_state)
+                       return -ENOMEM;
+       }
+
+       return 0;
+}
 
 /* Just like cpu_vmxoff(), but with the __kvm_handle_fault_on_reboot()
  * tricks.
@@ -1333,6 +1488,18 @@ static void free_vmcs(struct vmcs *vmcs)
        free_pages((unsigned long)vmcs, vmcs_config.order);
 }
 
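+/* Free the saved L1 state, if it was allocated. */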
+static void free_l1_state(struct kvm_vcpu *vcpu)
+{
+       struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+       if (!vmx->nested.l1_state)
+               return;
+
+       kfree(vmx->nested.l1_state);
+       vmx->nested.l1_state = NULL;
+}
+
 static void free_kvm_area(void)
 {
        int cpu;
@@ -3146,12 +3313,105 @@ static int handle_vmcall(struct kvm_vcpu *vcpu)
        return 1;
 }
 
+/*
+ * Check whether the vcpu can execute a VMX instruction,
+ * injecting the corresponding exception if not.
+ */
+static int nested_vmx_check_permission(struct kvm_vcpu *vcpu)
+{
+       struct kvm_segment cs;
+       struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+       vmx_get_segment(vcpu, &cs, VCPU_SREG_CS);
+
+       if (!vmx->nested.vmxon) {
+               pr_debug("%s: vmx not on\n", __func__);
+               kvm_queue_exception(vcpu, UD_VECTOR);
+               return 0;
+       }
+
+       if ((vmx_get_rflags(vcpu) & X86_EFLAGS_VM) ||
+           (is_long_mode(vcpu) && !cs.l)) {
+               pr_debug("%s: invalid mode cs.l %d is_long mode %d\n",
+                        __func__, cs.l, is_long_mode(vcpu));
+               kvm_queue_exception(vcpu, UD_VECTOR);
+               return 0;
+       }
+
+       if (vmx_get_cpl(vcpu)) {
+               kvm_inject_gp(vcpu, 0);
+               return 0;
+       }
+
+       return 1;
+}
+
 static int handle_vmx_insn(struct kvm_vcpu *vcpu)
 {
        kvm_queue_exception(vcpu, UD_VECTOR);
        return 1;
 }
 
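+/*
+ * Emulate VMXOFF for the L1 guest: clear the vmxon flag and release the
+ * saved L1 state.
+ */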
+static int handle_vmoff(struct kvm_vcpu *vcpu)
+{
+       struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+       if (!nested_vmx_check_permission(vcpu))
+               return 1;
+
+       vmx->nested.vmxon = 0;
+
+       free_l1_state(vcpu);
+
+       skip_emulated_instruction(vcpu);
+       return 1;
+}
+
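+/*
+ * Emulate VMXON for the L1 guest: check that nested VMX is enabled and that
+ * the guest's mode and privilege level permit the instruction, then allocate
+ * the L1 state and mark nested VMX as active.
+ */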
+static int handle_vmon(struct kvm_vcpu *vcpu)
+{
+       struct kvm_segment cs;
+       struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+       if (!nested) {
+               pr_debug("%s: nested vmx not enabled\n", __func__);
+               kvm_queue_exception(vcpu, UD_VECTOR);
+               return 1;
+       }
+
+       vmx_get_segment(vcpu, &cs, VCPU_SREG_CS);
+
+       if (!(vcpu->arch.cr4 & X86_CR4_VMXE) ||
+           !(vcpu->arch.cr0 & X86_CR0_PE) ||
+           (vmx_get_rflags(vcpu) & X86_EFLAGS_VM)) {
+               kvm_queue_exception(vcpu, UD_VECTOR);
+               printk(KERN_INFO "%s invalid register state\n", __func__);
+               return 1;
+       }
+
+       if (is_long_mode(vcpu) && !cs.l) {
+               kvm_queue_exception(vcpu, UD_VECTOR);
+               printk(KERN_INFO "%s invalid register state\n", __func__);
+               return 1;
+       }
+
+       if (vmx_get_cpl(vcpu)) {
+               printk(KERN_INFO "%s no permission\n", __func__);
+               kvm_inject_gp(vcpu, 0);
+               return 1;
+       }
+
+       if (create_l1_state(vcpu)) {
+               printk(KERN_ERR "%s create_l1_state failed\n", __func__);
+               kvm_queue_exception(vcpu, UD_VECTOR);
+               return 1;
+       }
+
+       vmx->nested.vmxon = 1;
+
+       skip_emulated_instruction(vcpu);
+       return 1;
+}
+
 static int handle_invlpg(struct kvm_vcpu *vcpu)
 {
        unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
@@ -3442,8 +3702,8 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
        [EXIT_REASON_VMREAD]                  = handle_vmx_insn,
        [EXIT_REASON_VMRESUME]                = handle_vmx_insn,
        [EXIT_REASON_VMWRITE]                 = handle_vmx_insn,
-       [EXIT_REASON_VMOFF]                   = handle_vmx_insn,
-       [EXIT_REASON_VMON]                    = handle_vmx_insn,
+       [EXIT_REASON_VMOFF]                   = handle_vmoff,
+       [EXIT_REASON_VMON]                    = handle_vmon,
        [EXIT_REASON_TPR_BELOW_THRESHOLD]     = handle_tpr_below_threshold,
        [EXIT_REASON_APIC_ACCESS]             = handle_apic_access,
        [EXIT_REASON_WBINVD]                  = handle_wbinvd,
@@ -3823,6 +4083,7 @@ static void vmx_free_vcpu(struct kvm_vcpu *vcpu)
        if (vmx->vpid != 0)
                __clear_bit(vmx->vpid, vmx_vpid_bitmap);
        spin_unlock(&vmx_vpid_lock);
+       free_l1_state(vcpu);
        vmx_free_vmcs(vcpu);
        kfree(vmx->guest_msrs);
        kvm_vcpu_uninit(vcpu);
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index dd15d7a..b698952 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -88,6 +88,10 @@ EXPORT_SYMBOL_GPL(kvm_x86_ops);
 int ignore_msrs = 0;
 module_param_named(ignore_msrs, ignore_msrs, bool, S_IRUGO | S_IWUSR);
 
+int nested = 1;
+EXPORT_SYMBOL_GPL(nested);
+module_param(nested, int, S_IRUGO);
+
 #define KVM_NR_SHARED_MSRS 16
 
 struct kvm_shared_msrs_global {
@@ -505,7 +509,7 @@ void kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
                return;
        }
 
-       if (cr4 & X86_CR4_VMXE) {
+       if (cr4 & X86_CR4_VMXE && !nested) {
                printk(KERN_DEBUG "set_cr4: #GP, setting VMXE\n");
                kvm_inject_gp(vcpu, 0);
                return;
@@ -615,7 +619,10 @@ static u32 msrs_to_save[] = {
 #ifdef CONFIG_X86_64
        MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR,
 #endif
-       MSR_IA32_TSC, MSR_IA32_PERF_STATUS, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA
+       MSR_IA32_TSC, MSR_IA32_PERF_STATUS, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA,
+       MSR_IA32_FEATURE_CONTROL, MSR_IA32_VMX_BASIC, MSR_IA32_VMX_PINBASED_CTLS,
+       MSR_IA32_VMX_PROCBASED_CTLS, MSR_IA32_VMX_EXIT_CTLS, MSR_IA32_VMX_ENTRY_CTLS,
+       MSR_IA32_VMX_PROCBASED_CTLS2, MSR_IA32_VMX_EPT_VPID_CAP
 };
 
 static unsigned num_msrs_to_save;
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index 5eadea5..57204cb 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -35,4 +35,6 @@ static inline bool kvm_exception_is_soft(unsigned int nr)
 struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu,
                                              u32 function, u32 index);
 
+extern int nested;
+
 #endif
-- 
1.6.0.4
