[PATCH v2] KVM: VMX: disable SMEP feature when guest is in non-paging mode
Changes from v1 to v2:
- Modify commit message and comments according to Gleb's suggestions.

SMEP is disabled if the CPU is in non-paging mode in hardware. However,
KVM always uses paging mode to emulate guest non-paging mode with TDP.
To emulate this behavior, SMEP needs to be manually disabled when the
guest switches to non-paging mode.

We met an issue where an SMP Linux guest with a recent kernel (one with
SMEP support, for example 3.5.3) would crash with a triple fault if
unrestricted_guest=0. This is because KVM uses an identity-mapping page
table to emulate non-paging mode, and that page table is set with the
USER flag. If SMEP is still enabled in this case, the guest hits an
unhandlable page fault and crashes.

Signed-off-by: Dongxiao Xu <dongxiao...@intel.com>
Signed-off-by: Xiantao Zhang <xiantao.zh...@intel.com>
---
 arch/x86/kvm/vmx.c |    8 ++++++++
 1 files changed, 8 insertions(+), 0 deletions(-)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 9120ae1..70393e2 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -3155,6 +3155,14 @@ static int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
 	if (!is_paging(vcpu)) {
 		hw_cr4 &= ~X86_CR4_PAE;
 		hw_cr4 |= X86_CR4_PSE;
+		/*
+		 * SMEP is disabled if CPU is in non-paging mode in
+		 * hardware. However KVM always uses paging mode to
+		 * emulate guest non-paging mode with TDP.
+		 * To emulate this behavior, SMEP needs to be manually
+		 * disabled when guest switches to non-paging mode.
+		 */
+		hw_cr4 &= ~X86_CR4_SMEP;
 	} else if (!(cr4 & X86_CR4_PAE)) {
 		hw_cr4 &= ~X86_CR4_PAE;
 	}
-- 
1.7.1
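For readers skimming the diff, the CR4 handling can be restated as a small standalone function. This is only an illustrative sketch, not kernel code: compute_hw_cr4() and its paging flag are hypothetical names, and the bit masks are written out from the architectural CR4 bit positions (PSE = bit 4, PAE = bit 5, SMEP = bit 20).

#include <stdbool.h>

#define X86_CR4_PSE  (1UL << 4)   /* CR4.PSE,  bit 4 */
#define X86_CR4_PAE  (1UL << 5)   /* CR4.PAE,  bit 5 */
#define X86_CR4_SMEP (1UL << 20)  /* CR4.SMEP, bit 20 */

/* Hypothetical restatement of the vmx_set_cr4() branch patched above. */
static unsigned long compute_hw_cr4(unsigned long cr4, bool guest_paging)
{
        unsigned long hw_cr4 = cr4;

        if (!guest_paging) {
                /* Hardware paging stays on under TDP, so force PSE and
                 * drop PAE; also drop SMEP, since the USER-flagged
                 * identity map must stay fetchable from CPL 0. */
                hw_cr4 &= ~X86_CR4_PAE;
                hw_cr4 |= X86_CR4_PSE;
                hw_cr4 &= ~X86_CR4_SMEP;
        } else if (!(cr4 & X86_CR4_PAE)) {
                hw_cr4 &= ~X86_CR4_PAE;
        }
        return hw_cr4;
}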
[PATCH] KVM: VMX: disable SMEP feature when guest is in non-paging mode
SMEP is disabled if the CPU is in non-paging mode in hardware. However,
KVM always uses paging mode to emulate guest non-paging mode with HAP.
To emulate this behavior, SMEP needs to be manually disabled when the
guest switches to non-paging mode.

We met an issue where an SMP Linux guest with a recent kernel (one with
SMEP support, for example 3.5.3) would crash with a triple fault if
unrestricted_guest=0. This is because KVM uses an identity-mapping page
table to emulate non-paging mode, and that page table is set with the
USER flag. If SMEP is still enabled in this case, the guest hits an
unhandlable page fault and crashes.

Signed-off-by: Dongxiao Xu <dongxiao...@intel.com>
Signed-off-by: Xiantao Zhang <xiantao.zh...@intel.com>
---
 arch/x86/kvm/vmx.c |    8 ++++++++
 1 files changed, 8 insertions(+), 0 deletions(-)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 9120ae1..e82f20d 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -3155,6 +3155,14 @@ static int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
 	if (!is_paging(vcpu)) {
 		hw_cr4 &= ~X86_CR4_PAE;
 		hw_cr4 |= X86_CR4_PSE;
+		/*
+		 * SMEP is disabled if CPU is in non-paging mode in
+		 * hardware. However KVM always uses paging mode to
+		 * emulate guest non-paging mode with HAP.
+		 * To emulate this behavior, SMEP needs to be manually
+		 * disabled when guest switches to non-paging mode.
+		 */
+		hw_cr4 &= ~X86_CR4_SMEP;
 	} else if (!(cr4 & X86_CR4_PAE)) {
 		hw_cr4 &= ~X86_CR4_PAE;
 	}
-- 
1.7.1
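The failure mode described above follows directly from the SMEP rule: with CR4.SMEP set, an instruction fetch at CPL < 3 from a user-accessible page takes a page fault. A toy predicate (purely illustrative, not kernel code) makes the scenario concrete:

#include <stdbool.h>

/* True when SMEP would fault the fetch: supervisor-mode code (CPL < 3)
 * fetching from a page whose walk is marked user-accessible. */
static bool smep_blocks_fetch(bool cr4_smep, bool page_user, int cpl)
{
        return cr4_smep && page_user && cpl < 3;
}

With the identity-mapping table marked USER and the guest executing at CPL 0, smep_blocks_fetch(true, true, 0) holds on every fetch, so the guest can make no forward progress; hence the patch clears SMEP in hw_cr4 for this case.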
[PATCH v2 4/4] nested vmx: use a list to store the launched vmcs12 for L1 VMM
The launch state is not a member of the VMCS area; use a separate
variable (a list) to store it instead.

Signed-off-by: Dongxiao Xu <dongxiao...@intel.com>
---
 arch/x86/kvm/vmx.c |   86 +++++++++++++++++++++++++++++++++++++++++++++++---
 1 files changed, 81 insertions(+), 5 deletions(-)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 20de88b..3be9265 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -177,8 +177,7 @@ struct __packed vmcs12 {
 	u32 revision_id;
 	u32 abort;
 
-	u32 launch_state; /* set to 0 by VMCLEAR, to 1 by VMLAUNCH */
-	u32 padding[7]; /* room for future expansion */
+	u32 padding[8]; /* room for future expansion */
 
 	u64 io_bitmap_a;
 	u64 io_bitmap_b;
@@ -339,6 +338,11 @@ struct vmcs02_list {
 	struct loaded_vmcs vmcs02;
 };
 
+struct vmcs12_list {
+	unsigned long vmcs12_pa;
+	struct list_head node;
+};
+
 /*
  * The nested_vmx structure is part of vcpu_vmx, and holds information we need
  * for correct emulation of VMX (i.e., nested VMX) on this vcpu.
@@ -364,6 +368,8 @@ struct nested_vmx {
 	 * we must keep them pinned while L2 runs.
 	 */
 	struct page *apic_access_page;
+	/* vmcs12_pool contains the launched vmcs12. */
+	struct list_head vmcs12_pool;
 };
 
 struct vcpu_vmx {
@@ -619,6 +625,58 @@ static void nested_release_page_clean(struct page *page)
 	kvm_release_page_clean(page);
 }
 
+static int vmcs12_launched(struct list_head *vmcs12_pool,
+			   unsigned long vmcs12_pa)
+{
+	struct vmcs12_list *iter;
+	struct list_head *pos;
+	int launched = 0;
+
+	list_for_each(pos, vmcs12_pool) {
+		iter = list_entry(pos, struct vmcs12_list, node);
+		if (vmcs12_pa == iter->vmcs12_pa) {
+			launched = 1;
+			break;
+		}
+	}
+
+	return launched;
+}
+
+static int set_vmcs12_launched(struct list_head *vmcs12_pool,
+			       unsigned long vmcs12_pa)
+{
+	struct vmcs12_list *vmcs12;
+
+	if (vmcs12_launched(vmcs12_pool, vmcs12_pa))
+		return 0;
+
+	vmcs12 = kzalloc(sizeof(struct vmcs12_list), GFP_KERNEL);
+	if (!vmcs12)
+		return -ENOMEM;
+
+	vmcs12->vmcs12_pa = vmcs12_pa;
+	list_add(&vmcs12->node, vmcs12_pool);
+
+	return 0;
+}
+
+static void clear_vmcs12_launched(struct list_head *vmcs12_pool,
+				  unsigned long vmcs12_pa)
+{
+	struct vmcs12_list *iter;
+	struct list_head *pos;
+
+	list_for_each(pos, vmcs12_pool) {
+		iter = list_entry(pos, struct vmcs12_list, node);
+		if (vmcs12_pa == iter->vmcs12_pa) {
+			list_del(&iter->node);
+			kfree(iter);
+			break;
+		}
+	}
+}
+
 static u64 construct_eptp(unsigned long root_hpa);
 static void kvm_cpu_vmxon(u64 addr);
 static void kvm_cpu_vmxoff(void);
@@ -5116,6 +5174,18 @@ static void nested_free_all_saved_vmcss(struct vcpu_vmx *vmx)
 }
 
 /*
+ * Free the vmcs12 list.
+ */
+static void nested_free_vmcs12_list(struct vcpu_vmx *vmx)
+{
+	struct vmcs12_list *item, *n;
+	list_for_each_entry_safe(item, n, &vmx->nested.vmcs12_pool, node) {
+		list_del(&item->node);
+		kfree(item);
+	}
+}
+
+/*
  * Emulate the VMXON instruction.
  * Currently, we just remember that VMX is active, and do not save or even
  * inspect the argument to VMXON (the so-called VMXON pointer) because we
@@ -5212,6 +5282,7 @@ static void free_nested(struct vcpu_vmx *vmx)
 	}
 
 	nested_free_all_saved_vmcss(vmx);
+	nested_free_vmcs12_list(vmx);
 }
 
 /* Emulate the VMXOFF instruction */
@@ -5364,7 +5435,7 @@ static int handle_vmclear(struct kvm_vcpu *vcpu)
 		return 1;
 	}
 	vmcs12 = kmap(page);
-	vmcs12->launch_state = 0;
+	clear_vmcs12_launched(&vmx->nested.vmcs12_pool, __pa(vmcs12));
 	kunmap(page);
 	nested_release_page(page);
 
@@ -6460,6 +6531,7 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
 
 	vmx->nested.current_vmptr = -1ull;
 	vmx->nested.current_vmcs12 = NULL;
+	INIT_LIST_HEAD(&vmx->nested.vmcs12_pool);
 
 	return &vmx->vcpu;
 
@@ -6839,6 +6911,7 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
 	int cpu;
 	struct loaded_vmcs *vmcs02;
+	int is_launched;
 
 	if (!nested_vmx_check_permission(vcpu) ||
 	    !nested_vmx_check_vmcs12(vcpu))
@@ -6857,7 +6930,9 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
 	 * for misconfigurations which will anyway be caught by the processor
 	 * when using the merged vmcs02.
 	 */
-	if (vmcs12->launch_state == launch) {
+	is_launched =
+		vmcs12_launched(&vmx
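The diff above is truncated inside nested_vmx_run(). For orientation, here is a hedged sketch of how the launch-state check plausibly continues, modeled on the pre-patch upstream logic (the exact continuation, including which physical address serves as the lookup key, is an assumption; the VMXERR_* constants and helpers are the kernel's existing names):

        /* Sketch: VMLAUNCH requires a clear vmcs12, VMRESUME a launched
         * one; otherwise fail with the matching VMX error number. */
        is_launched = vmcs12_launched(&vmx->nested.vmcs12_pool,
                                      vmx->nested.current_vmptr);
        if (is_launched == launch) {
                nested_vmx_failValid(vcpu, launch
                        ? VMXERR_VMLAUNCH_NONCLEAR_VMCS
                        : VMXERR_VMRESUME_NONLAUNCHED_VMCS);
                skip_emulated_instruction(vcpu);
                return 1;
        }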
[PATCH v2 1/4] nested vmx: clean up for vmcs12 read and write
Abstract the vmcs12_read() and vmcs12_write() functions to perform the
vmcs12 read/write operations.

Signed-off-by: Dongxiao Xu <dongxiao...@intel.com>
---
 arch/x86/kvm/vmx.c |   85 ++++++++++++++++++++++------------------------------
 1 files changed, 41 insertions(+), 44 deletions(-)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index f858159..2f8344f 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -583,10 +583,15 @@ static const unsigned short vmcs_field_to_offset_table[] = {
 };
 static const int max_vmcs_field = ARRAY_SIZE(vmcs_field_to_offset_table);
 
-static inline short vmcs_field_to_offset(unsigned long field)
+static inline bool vmcs_field_valid(unsigned long field)
 {
 	if (field >= max_vmcs_field || vmcs_field_to_offset_table[field] == 0)
-		return -1;
+		return 0;
+	return 1;
+}
+
+static inline short vmcs_field_to_offset(unsigned long field)
+{
 	return vmcs_field_to_offset_table[field];
 }
 
@@ -5407,32 +5412,45 @@ static inline int vmcs_field_readonly(unsigned long field)
  * some of the bits we return here (e.g., on 32-bit guests, only 32 bits of
  * 64-bit fields are to be returned).
  */
-static inline bool vmcs12_read_any(struct kvm_vcpu *vcpu,
-				   unsigned long field, u64 *ret)
+static inline u64 vmcs12_read(struct kvm_vcpu *vcpu, unsigned long field)
 {
-	short offset = vmcs_field_to_offset(field);
-	char *p;
+	char *p = ((char *)(get_vmcs12(vcpu))) + vmcs_field_to_offset(field);
 
-	if (offset < 0)
-		return 0;
+	switch (vmcs_field_type(field)) {
+	case VMCS_FIELD_TYPE_NATURAL_WIDTH:
+		return *((natural_width *)p);
+	case VMCS_FIELD_TYPE_U16:
+		return *((u16 *)p);
+	case VMCS_FIELD_TYPE_U32:
+		return *((u32 *)p);
+	case VMCS_FIELD_TYPE_U64:
+		return *((u64 *)p);
+	default:
+		return 0; /* can never happen. */
+	}
+}
 
-	p = ((char *)(get_vmcs12(vcpu))) + offset;
+static inline void vmcs12_write(struct kvm_vcpu *vcpu,
+				unsigned long field,
+				u64 value)
+{
+	char *p = ((char *)(get_vmcs12(vcpu))) + vmcs_field_to_offset(field);
 
 	switch (vmcs_field_type(field)) {
 	case VMCS_FIELD_TYPE_NATURAL_WIDTH:
-		*ret = *((natural_width *)p);
-		return 1;
+		*(natural_width *)p = value;
+		break;
 	case VMCS_FIELD_TYPE_U16:
-		*ret = *((u16 *)p);
-		return 1;
+		*(u16 *)p = value;
+		break;
 	case VMCS_FIELD_TYPE_U32:
-		*ret = *((u32 *)p);
-		return 1;
+		*(u32 *)p = value;
+		break;
 	case VMCS_FIELD_TYPE_U64:
-		*ret = *((u64 *)p);
-		return 1;
+		*(u64 *)p = value;
+		break;
 	default:
-		return 0; /* can never happen. */
+		break; /* can never happen. */
 	}
 }
 
@@ -5465,12 +5483,13 @@ static int handle_vmread(struct kvm_vcpu *vcpu)
 
 	/* Decode instruction info and find the field to read */
 	field = kvm_register_read(vcpu, (((vmx_instruction_info) >> 28) & 0xf));
-	/* Read the field, zero-extended to a u64 field_value */
-	if (!vmcs12_read_any(vcpu, field, &field_value)) {
+	if (!vmcs_field_valid(field)) {
 		nested_vmx_failValid(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT);
 		skip_emulated_instruction(vcpu);
 		return 1;
 	}
+	/* Read the field, zero-extended to a u64 field_value */
+	field_value = vmcs12_read(vcpu, field);
 	/*
 	 * Now copy part of this value to register or memory, as requested.
 	 * Note that the number of bits actually copied is 32 or 64 depending
@@ -5500,8 +5519,6 @@ static int handle_vmwrite(struct kvm_vcpu *vcpu)
 	gva_t gva;
 	unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
 	u32 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
-	char *p;
-	short offset;
 	/* The value to write might be 32 or 64 bits, depending on L1's long
 	 * mode, and eventually we need to write that into a field of several
 	 * possible lengths. The code below first zero-extends the value to 64
@@ -5537,33 +5554,13 @@ static int handle_vmwrite(struct kvm_vcpu *vcpu)
 		skip_emulated_instruction(vcpu);
 		return 1;
 	}
-
-	offset = vmcs_field_to_offset(field);
-	if (offset < 0) {
+	if (!vmcs_field_valid(field)) {
 		nested_vmx_failValid(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT);
 		skip_emulated_instruction(vcpu);
 		return 1;
 	}
-	p = ((char *) get_vmcs12(vcpu)) + offset;
-
-	switch (vmcs_field_type(field)) {
-	case
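The pattern this patch converges on — an offset table plus a width-dispatched switch — can be shown standalone. Everything below is illustrative (toy_vmcs, field_table, toy_read, toy_write are not kernel names):

#include <stddef.h>
#include <stdint.h>

struct toy_vmcs {
        uint16_t guest_cs_selector;
        uint32_t exception_bitmap;
        uint64_t io_bitmap_a;
};

enum toy_field { F_GUEST_CS, F_EXC_BITMAP, F_IO_BITMAP_A, F_MAX };

static const struct { size_t offset; int width; } field_table[F_MAX] = {
        [F_GUEST_CS]    = { offsetof(struct toy_vmcs, guest_cs_selector), 16 },
        [F_EXC_BITMAP]  = { offsetof(struct toy_vmcs, exception_bitmap),  32 },
        [F_IO_BITMAP_A] = { offsetof(struct toy_vmcs, io_bitmap_a),       64 },
};

/* Read any field, zero-extended to u64, via table-driven offset + width. */
static uint64_t toy_read(const struct toy_vmcs *v, enum toy_field f)
{
        const char *p = (const char *)v + field_table[f].offset;

        switch (field_table[f].width) {
        case 16: return *(const uint16_t *)p;
        case 32: return *(const uint32_t *)p;
        default: return *(const uint64_t *)p;
        }
}

/* Write is the mirror image of the read dispatch. */
static void toy_write(struct toy_vmcs *v, enum toy_field f, uint64_t val)
{
        char *p = (char *)v + field_table[f].offset;

        switch (field_table[f].width) {
        case 16: *(uint16_t *)p = (uint16_t)val; break;
        case 32: *(uint32_t *)p = (uint32_t)val; break;
        default: *(uint64_t *)p = val; break;
        }
}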
[PATCH v2 3/4] nested vmx: use vmcs12_read/write() to operate VMCS fields
When referencing vmcs12 fields, the current approach uses the
struct.field style. This commit replaces all such references with calls
to the vmcs12_read() and vmcs12_write() functions.

Signed-off-by: Dongxiao Xu <dongxiao...@intel.com>
---
 arch/x86/kvm/vmx.c |  591 ++++++++++++++++++++++++++++------------------------
 1 files changed, 317 insertions(+), 274 deletions(-)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 639cad0..20de88b 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -629,6 +629,11 @@ static void vmx_set_segment(struct kvm_vcpu *vcpu,
 static void vmx_get_segment(struct kvm_vcpu *vcpu,
 			    struct kvm_segment *var, int seg);
 
+static inline u64 vmcs12_read(struct kvm_vcpu *vcpu, unsigned long field);
+static inline void vmcs12_write(struct kvm_vcpu *vcpu,
+				unsigned long field,
+				u64 value);
+
 static DEFINE_PER_CPU(struct vmcs *, vmxarea);
 static DEFINE_PER_CPU(struct vmcs *, current_vmcs);
 /*
@@ -891,19 +896,19 @@ static inline bool report_flexpriority(void)
 
 static inline bool nested_cpu_has(struct kvm_vcpu *vcpu, u32 bit)
 {
-	return get_vmcs12(vcpu)->cpu_based_vm_exec_control & bit;
+	return vmcs12_read(vcpu, CPU_BASED_VM_EXEC_CONTROL) & bit;
 }
 
 static inline bool nested_cpu_has2(struct kvm_vcpu *vcpu, u32 bit)
 {
-	return (get_vmcs12(vcpu)->cpu_based_vm_exec_control &
+	return (vmcs12_read(vcpu, CPU_BASED_VM_EXEC_CONTROL) &
 			CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) &&
-		(get_vmcs12(vcpu)->secondary_vm_exec_control & bit);
+		(vmcs12_read(vcpu, SECONDARY_VM_EXEC_CONTROL) & bit);
 }
 
 static inline bool nested_cpu_has_virtual_nmis(struct kvm_vcpu *vcpu)
 {
-	return get_vmcs12(vcpu)->pin_based_vm_exec_control &
+	return vmcs12_read(vcpu, PIN_BASED_VM_EXEC_CONTROL) &
 		PIN_BASED_VIRTUAL_NMIS;
 }
 
@@ -915,7 +920,6 @@ static inline bool is_exception(u32 intr_info)
 
 static void nested_vmx_vmexit(struct kvm_vcpu *vcpu);
 static void nested_vmx_entry_failure(struct kvm_vcpu *vcpu,
-			struct vmcs12 *vmcs12,
 			u32 reason, unsigned long qualification);
 
 static int __find_msr_index(struct vcpu_vmx *vmx, u32 msr)
@@ -1220,7 +1224,7 @@ static void update_exception_bitmap(struct kvm_vcpu *vcpu)
 	 * specified above if L1 did not want them.
 	 */
 	if (is_guest_mode(vcpu))
-		eb |= get_vmcs12(vcpu)->exception_bitmap;
+		eb |= vmcs12_read(vcpu, EXCEPTION_BITMAP);
 
 	vmcs_write32(EXCEPTION_BITMAP, eb);
 }
@@ -1582,7 +1586,7 @@ static void vmx_fpu_activate(struct kvm_vcpu *vcpu)
 	vcpu->arch.cr0_guest_owned_bits = X86_CR0_TS;
 	if (is_guest_mode(vcpu))
 		vcpu->arch.cr0_guest_owned_bits &=
-			~get_vmcs12(vcpu)->cr0_guest_host_mask;
+			~vmcs12_read(vcpu, CR0_GUEST_HOST_MASK);
 	vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits);
 }
 
@@ -1593,15 +1597,19 @@ static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu);
 * of the real cr0 used to run the guest (guest_cr0), and the bits shadowed by
 * its hypervisor (cr0_read_shadow).
 */
-static inline unsigned long nested_read_cr0(struct vmcs12 *fields)
+static inline unsigned long nested_read_cr0(struct kvm_vcpu *vcpu)
 {
-	return (fields->guest_cr0 & ~fields->cr0_guest_host_mask) |
-		(fields->cr0_read_shadow & fields->cr0_guest_host_mask);
+	return (vmcs12_read(vcpu, GUEST_CR0) &
+		~vmcs12_read(vcpu, CR0_GUEST_HOST_MASK)) |
+		(vmcs12_read(vcpu, CR0_READ_SHADOW) &
+		vmcs12_read(vcpu, CR0_GUEST_HOST_MASK));
 }
-static inline unsigned long nested_read_cr4(struct vmcs12 *fields)
+static inline unsigned long nested_read_cr4(struct kvm_vcpu *vcpu)
 {
-	return (fields->guest_cr4 & ~fields->cr4_guest_host_mask) |
-		(fields->cr4_read_shadow & fields->cr4_guest_host_mask);
+	return (vmcs12_read(vcpu, GUEST_CR4) &
+		~vmcs12_read(vcpu, CR4_GUEST_HOST_MASK)) |
+		(vmcs12_read(vcpu, CR4_READ_SHADOW) &
+		vmcs12_read(vcpu, CR4_GUEST_HOST_MASK));
 }
 
 static void vmx_fpu_deactivate(struct kvm_vcpu *vcpu)
@@ -1623,10 +1631,10 @@ static void vmx_fpu_deactivate(struct kvm_vcpu *vcpu)
 		 * up-to-date here because we just decached cr0.TS (and we'll
 		 * only update vmcs12->guest_cr0 on nested exit).
 		 */
-		struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
-		vmcs12->guest_cr0 = (vmcs12->guest_cr0 & ~X86_CR0_TS) |
+		u64 guest_cr0 = (vmcs12_read(vcpu, GUEST_CR0) & ~X86_CR0_TS) |
 			(vcpu->arch.cr0 & X86_CR0_TS);
-		vmcs_writel(CR0_READ_SHADOW, nested_read_cr0(vmcs12));
+		vmcs12_write(vcpu, GUEST_CR0, guest_cr0
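The nested_read_cr0()/nested_read_cr4() rewrite keeps the usual guest/host-mask combination: bits that L1 claims (mask bit set) are taken from the read shadow, all others from the live guest value. As a standalone helper (the name effective_cr is illustrative):

static inline unsigned long effective_cr(unsigned long guest_cr,
                                         unsigned long read_shadow,
                                         unsigned long guest_host_mask)
{
        return (guest_cr & ~guest_host_mask) |
               (read_shadow & guest_host_mask);
}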
[PATCH v2 2/4] nested vmx: clean up for nested_cpu_has_xxx functions
This is a preparation for the later change, which uses vmcs12_read() and
vmcs12_write() to access vmcs12 fields. Since those functions take
'vcpu' as a parameter, we also use 'vcpu' as the parameter in the
nested_cpu_has_xxx() functions.

Signed-off-by: Dongxiao Xu <dongxiao...@intel.com>
---
 arch/x86/kvm/vmx.c |   57 ++++++++++++++++++++++-----------------------------
 1 files changed, 28 insertions(+), 29 deletions(-)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 2f8344f..639cad0 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -889,22 +889,22 @@ static inline bool report_flexpriority(void)
 	return flexpriority_enabled;
 }
 
-static inline bool nested_cpu_has(struct vmcs12 *vmcs12, u32 bit)
+static inline bool nested_cpu_has(struct kvm_vcpu *vcpu, u32 bit)
 {
-	return vmcs12->cpu_based_vm_exec_control & bit;
+	return get_vmcs12(vcpu)->cpu_based_vm_exec_control & bit;
 }
 
-static inline bool nested_cpu_has2(struct vmcs12 *vmcs12, u32 bit)
+static inline bool nested_cpu_has2(struct kvm_vcpu *vcpu, u32 bit)
 {
-	return (vmcs12->cpu_based_vm_exec_control &
+	return (get_vmcs12(vcpu)->cpu_based_vm_exec_control &
 			CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) &&
-		(vmcs12->secondary_vm_exec_control & bit);
+		(get_vmcs12(vcpu)->secondary_vm_exec_control & bit);
 }
 
-static inline bool nested_cpu_has_virtual_nmis(struct vmcs12 *vmcs12,
-		struct kvm_vcpu *vcpu)
+static inline bool nested_cpu_has_virtual_nmis(struct kvm_vcpu *vcpu)
 {
-	return vmcs12->pin_based_vm_exec_control & PIN_BASED_VIRTUAL_NMIS;
+	return get_vmcs12(vcpu)->pin_based_vm_exec_control &
+		PIN_BASED_VIRTUAL_NMIS;
 }
 
 static inline bool is_exception(u32 intr_info)
@@ -1888,7 +1888,7 @@ static void vmx_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
 		/* recalculate vmcs02.TSC_OFFSET: */
 		vmcs12 = get_vmcs12(vcpu);
 		vmcs_write64(TSC_OFFSET, offset +
-			(nested_cpu_has(vmcs12, CPU_BASED_USE_TSC_OFFSETING) ?
+			(nested_cpu_has(vcpu, CPU_BASED_USE_TSC_OFFSETING) ?
 			 vmcs12->tsc_offset : 0));
 	} else {
 		vmcs_write64(TSC_OFFSET, offset);
@@ -5712,7 +5712,7 @@ static bool nested_vmx_exit_handled_msr(struct kvm_vcpu *vcpu,
 	u32 msr_index = vcpu->arch.regs[VCPU_REGS_RCX];
 	gpa_t bitmap;
 
-	if (!nested_cpu_has(get_vmcs12(vcpu), CPU_BASED_USE_MSR_BITMAPS))
+	if (!nested_cpu_has(vcpu, CPU_BASED_USE_MSR_BITMAPS))
 		return 1;
 
 	/*
@@ -5768,7 +5768,7 @@ static bool nested_vmx_exit_handled_cr(struct kvm_vcpu *vcpu,
 		    (vmcs12->cr3_target_count >= 4 &&
 			vmcs12->cr3_target_value3 == val))
 			return 0;
-		if (nested_cpu_has(vmcs12, CPU_BASED_CR3_LOAD_EXITING))
+		if (nested_cpu_has(vcpu, CPU_BASED_CR3_LOAD_EXITING))
 			return 1;
 		break;
 	case 4:
@@ -5777,7 +5777,7 @@ static bool nested_vmx_exit_handled_cr(struct kvm_vcpu *vcpu,
 			return 1;
 		break;
 	case 8:
-		if (nested_cpu_has(vmcs12, CPU_BASED_CR8_LOAD_EXITING))
+		if (nested_cpu_has(vcpu, CPU_BASED_CR8_LOAD_EXITING))
 			return 1;
 		break;
 	}
@@ -5865,15 +5865,15 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu)
 	case EXIT_REASON_CPUID:
 		return 1;
 	case EXIT_REASON_HLT:
-		return nested_cpu_has(vmcs12, CPU_BASED_HLT_EXITING);
+		return nested_cpu_has(vcpu, CPU_BASED_HLT_EXITING);
 	case EXIT_REASON_INVD:
 		return 1;
 	case EXIT_REASON_INVLPG:
-		return nested_cpu_has(vmcs12, CPU_BASED_INVLPG_EXITING);
+		return nested_cpu_has(vcpu, CPU_BASED_INVLPG_EXITING);
 	case EXIT_REASON_RDPMC:
-		return nested_cpu_has(vmcs12, CPU_BASED_RDPMC_EXITING);
+		return nested_cpu_has(vcpu, CPU_BASED_RDPMC_EXITING);
 	case EXIT_REASON_RDTSC:
-		return nested_cpu_has(vmcs12, CPU_BASED_RDTSC_EXITING);
+		return nested_cpu_has(vcpu, CPU_BASED_RDTSC_EXITING);
 	case EXIT_REASON_VMCALL: case EXIT_REASON_VMCLEAR:
 	case EXIT_REASON_VMLAUNCH: case EXIT_REASON_VMPTRLD:
 	case EXIT_REASON_VMPTRST: case EXIT_REASON_VMREAD:
@@ -5887,7 +5887,7 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu)
 	case EXIT_REASON_CR_ACCESS:
 		return nested_vmx_exit_handled_cr(vcpu, vmcs12);
 	case EXIT_REASON_DR_ACCESS:
-		return nested_cpu_has(vmcs12, CPU_BASED_MOV_DR_EXITING);
+		return nested_cpu_has(vcpu
[PATCH v2 0/4] nested vmx code clean up and restructure
This patch series cleans up and restructures part of the nested vmx
code. The main purpose is to abstract the vmcs12_read() and
vmcs12_write() functions. With this change, we have a unified API to
get/set field values from/to vmcs12.

Changes from v1 to v2:
- Move the VMCS field valid check into the handle_vmread() and
  handle_vmwrite() functions.

Thanks,
Dongxiao

Dongxiao Xu (4):
  nested vmx: clean up for vmcs12 read and write
  nested vmx: clean up for nested_cpu_has_xxx functions
  nested vmx: use vmcs12_read/write() to operate VMCS fields
  nested vmx: use a list to store the launched vmcs12 for L1 VMM

 arch/x86/kvm/vmx.c |  811 ++++++++++++++++++++++++++++++----------------------
 1 files changed, 463 insertions(+), 348 deletions(-)
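As a usage sketch of the unified API (signatures per patch 1/4 above; EXCEPTION_BITMAP and PF_VECTOR are the kernel's existing names, the wrapper function itself is illustrative):

/* Illustrative only: enable #PF interception in the cached vmcs12. */
static void toy_trap_pf(struct kvm_vcpu *vcpu)
{
        u64 eb = vmcs12_read(vcpu, EXCEPTION_BITMAP);

        vmcs12_write(vcpu, EXCEPTION_BITMAP, eb | (1ull << PF_VECTOR));
}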
[PATCH 1/4] nested vmx: clean up for vmcs12 read and write
Abstract the vmcs12_read() and vmcs12_write() functions to perform the
vmcs12 read/write operations.

Signed-off-by: Dongxiao Xu <dongxiao...@intel.com>
---
 arch/x86/kvm/vmx.c |   86 +++++++++++++++++++++++++++++++++++-----------------
 1 files changed, 45 insertions(+), 41 deletions(-)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index f858159..d8670e4 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -5407,32 +5407,67 @@ static inline int vmcs_field_readonly(unsigned long field)
  * some of the bits we return here (e.g., on 32-bit guests, only 32 bits of
  * 64-bit fields are to be returned).
  */
-static inline bool vmcs12_read_any(struct kvm_vcpu *vcpu,
-				   unsigned long field, u64 *ret)
+static inline u64 vmcs12_read(struct kvm_vcpu *vcpu, unsigned long field)
 {
 	short offset = vmcs_field_to_offset(field);
 	char *p;
 
-	if (offset < 0)
+	if (offset < 0) {
+		nested_vmx_failValid(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT);
+		skip_emulated_instruction(vcpu);
 		return 0;
+	}
 
 	p = ((char *)(get_vmcs12(vcpu))) + offset;
 
 	switch (vmcs_field_type(field)) {
 	case VMCS_FIELD_TYPE_NATURAL_WIDTH:
-		*ret = *((natural_width *)p);
+		return *((natural_width *)p);
+	case VMCS_FIELD_TYPE_U16:
+		return *((u16 *)p);
+	case VMCS_FIELD_TYPE_U32:
+		return *((u32 *)p);
+	case VMCS_FIELD_TYPE_U64:
+		return *((u64 *)p);
+	default:
+		nested_vmx_failValid(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT);
+		skip_emulated_instruction(vcpu);
+		return 0; /* can never happen. */
+	}
+}
+
+static inline int vmcs12_write(struct kvm_vcpu *vcpu,
+			       unsigned long field,
+			       u64 value)
+{
+	short offset = vmcs_field_to_offset(field);
+	char *p;
+
+	if (offset < 0) {
+		nested_vmx_failValid(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT);
+		skip_emulated_instruction(vcpu);
+		return 0;
+	}
+
+	p = ((char *)(get_vmcs12(vcpu))) + offset;
+
+	switch (vmcs_field_type(field)) {
+	case VMCS_FIELD_TYPE_NATURAL_WIDTH:
+		*(natural_width *)p = value;
 		return 1;
 	case VMCS_FIELD_TYPE_U16:
-		*ret = *((u16 *)p);
+		*(u16 *)p = value;
 		return 1;
 	case VMCS_FIELD_TYPE_U32:
-		*ret = *((u32 *)p);
+		*(u32 *)p = value;
 		return 1;
 	case VMCS_FIELD_TYPE_U64:
-		*ret = *((u64 *)p);
+		*(u64 *)p = value;
 		return 1;
 	default:
-		return 0; /* can never happen. */
+		nested_vmx_failValid(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT);
+		skip_emulated_instruction(vcpu);
+		return 0;
 	}
 }
 
@@ -5466,11 +5501,7 @@ static int handle_vmread(struct kvm_vcpu *vcpu)
 	/* Decode instruction info and find the field to read */
 	field = kvm_register_read(vcpu, (((vmx_instruction_info) >> 28) & 0xf));
 	/* Read the field, zero-extended to a u64 field_value */
-	if (!vmcs12_read_any(vcpu, field, &field_value)) {
-		nested_vmx_failValid(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT);
-		skip_emulated_instruction(vcpu);
-		return 1;
-	}
+	field_value = vmcs12_read(vcpu, field);
 	/*
 	 * Now copy part of this value to register or memory, as requested.
 	 * Note that the number of bits actually copied is 32 or 64 depending
@@ -5500,8 +5531,6 @@ static int handle_vmwrite(struct kvm_vcpu *vcpu)
 	gva_t gva;
 	unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
 	u32 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
-	char *p;
-	short offset;
 	/* The value to write might be 32 or 64 bits, depending on L1's long
 	 * mode, and eventually we need to write that into a field of several
 	 * possible lengths. The code below first zero-extends the value to 64
@@ -5537,33 +5566,8 @@ static int handle_vmwrite(struct kvm_vcpu *vcpu)
 		skip_emulated_instruction(vcpu);
 		return 1;
 	}
-
-	offset = vmcs_field_to_offset(field);
-	if (offset < 0) {
-		nested_vmx_failValid(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT);
-		skip_emulated_instruction(vcpu);
-		return 1;
-	}
-	p = ((char *) get_vmcs12(vcpu)) + offset;
-
-	switch (vmcs_field_type(field)) {
-	case VMCS_FIELD_TYPE_U16:
-		*(u16 *)p = field_value;
-		break;
-	case VMCS_FIELD_TYPE_U32:
-		*(u32 *)p = field_value;
-		break;
-	case VMCS_FIELD_TYPE_U64:
-		*(u64 *)p = field_value;
-		break
[PATCH 2/4] nested vmx: clean up for nested_cpu_has_xxx functions
This is a preparation for the later change, which uses vmcs12_read() and
vmcs12_write() to access vmcs12 fields. Since those functions take
'vcpu' as a parameter, we also use 'vcpu' as the parameter in the
nested_cpu_has_xxx() functions.

Signed-off-by: Dongxiao Xu <dongxiao...@intel.com>
---
 arch/x86/kvm/vmx.c |   57 ++++++++++++++++++++++-----------------------------
 1 files changed, 28 insertions(+), 29 deletions(-)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index d8670e4..b036f9c 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -884,22 +884,22 @@ static inline bool report_flexpriority(void)
 	return flexpriority_enabled;
 }
 
-static inline bool nested_cpu_has(struct vmcs12 *vmcs12, u32 bit)
+static inline bool nested_cpu_has(struct kvm_vcpu *vcpu, u32 bit)
 {
-	return vmcs12->cpu_based_vm_exec_control & bit;
+	return get_vmcs12(vcpu)->cpu_based_vm_exec_control & bit;
 }
 
-static inline bool nested_cpu_has2(struct vmcs12 *vmcs12, u32 bit)
+static inline bool nested_cpu_has2(struct kvm_vcpu *vcpu, u32 bit)
 {
-	return (vmcs12->cpu_based_vm_exec_control &
+	return (get_vmcs12(vcpu)->cpu_based_vm_exec_control &
 			CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) &&
-		(vmcs12->secondary_vm_exec_control & bit);
+		(get_vmcs12(vcpu)->secondary_vm_exec_control & bit);
 }
 
-static inline bool nested_cpu_has_virtual_nmis(struct vmcs12 *vmcs12,
-		struct kvm_vcpu *vcpu)
+static inline bool nested_cpu_has_virtual_nmis(struct kvm_vcpu *vcpu)
 {
-	return vmcs12->pin_based_vm_exec_control & PIN_BASED_VIRTUAL_NMIS;
+	return get_vmcs12(vcpu)->pin_based_vm_exec_control &
+		PIN_BASED_VIRTUAL_NMIS;
 }
 
 static inline bool is_exception(u32 intr_info)
@@ -1883,7 +1883,7 @@ static void vmx_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
 		/* recalculate vmcs02.TSC_OFFSET: */
 		vmcs12 = get_vmcs12(vcpu);
 		vmcs_write64(TSC_OFFSET, offset +
-			(nested_cpu_has(vmcs12, CPU_BASED_USE_TSC_OFFSETING) ?
+			(nested_cpu_has(vcpu, CPU_BASED_USE_TSC_OFFSETING) ?
 			 vmcs12->tsc_offset : 0));
 	} else {
 		vmcs_write64(TSC_OFFSET, offset);
@@ -5719,7 +5719,7 @@ static bool nested_vmx_exit_handled_msr(struct kvm_vcpu *vcpu,
 	u32 msr_index = vcpu->arch.regs[VCPU_REGS_RCX];
 	gpa_t bitmap;
 
-	if (!nested_cpu_has(get_vmcs12(vcpu), CPU_BASED_USE_MSR_BITMAPS))
+	if (!nested_cpu_has(vcpu, CPU_BASED_USE_MSR_BITMAPS))
 		return 1;
 
 	/*
@@ -5775,7 +5775,7 @@ static bool nested_vmx_exit_handled_cr(struct kvm_vcpu *vcpu,
 		    (vmcs12->cr3_target_count >= 4 &&
 			vmcs12->cr3_target_value3 == val))
 			return 0;
-		if (nested_cpu_has(vmcs12, CPU_BASED_CR3_LOAD_EXITING))
+		if (nested_cpu_has(vcpu, CPU_BASED_CR3_LOAD_EXITING))
 			return 1;
 		break;
 	case 4:
@@ -5784,7 +5784,7 @@ static bool nested_vmx_exit_handled_cr(struct kvm_vcpu *vcpu,
 			return 1;
 		break;
 	case 8:
-		if (nested_cpu_has(vmcs12, CPU_BASED_CR8_LOAD_EXITING))
+		if (nested_cpu_has(vcpu, CPU_BASED_CR8_LOAD_EXITING))
 			return 1;
 		break;
 	}
@@ -5872,15 +5872,15 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu)
 	case EXIT_REASON_CPUID:
 		return 1;
 	case EXIT_REASON_HLT:
-		return nested_cpu_has(vmcs12, CPU_BASED_HLT_EXITING);
+		return nested_cpu_has(vcpu, CPU_BASED_HLT_EXITING);
 	case EXIT_REASON_INVD:
 		return 1;
 	case EXIT_REASON_INVLPG:
-		return nested_cpu_has(vmcs12, CPU_BASED_INVLPG_EXITING);
+		return nested_cpu_has(vcpu, CPU_BASED_INVLPG_EXITING);
 	case EXIT_REASON_RDPMC:
-		return nested_cpu_has(vmcs12, CPU_BASED_RDPMC_EXITING);
+		return nested_cpu_has(vcpu, CPU_BASED_RDPMC_EXITING);
 	case EXIT_REASON_RDTSC:
-		return nested_cpu_has(vmcs12, CPU_BASED_RDTSC_EXITING);
+		return nested_cpu_has(vcpu, CPU_BASED_RDTSC_EXITING);
 	case EXIT_REASON_VMCALL: case EXIT_REASON_VMCLEAR:
 	case EXIT_REASON_VMLAUNCH: case EXIT_REASON_VMPTRLD:
 	case EXIT_REASON_VMPTRST: case EXIT_REASON_VMREAD:
@@ -5894,7 +5894,7 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu)
 	case EXIT_REASON_CR_ACCESS:
 		return nested_vmx_exit_handled_cr(vcpu, vmcs12);
 	case EXIT_REASON_DR_ACCESS:
-		return nested_cpu_has(vmcs12, CPU_BASED_MOV_DR_EXITING);
+		return nested_cpu_has(vcpu
[PATCH 3/4] nested vmx: use vmcs12_read/write() to operate VMCS fields
When referencing vmcs12 fields, the current approach uses the
struct.field style. This commit replaces all such references with calls
to the vmcs12_read() and vmcs12_write() functions.

Signed-off-by: Dongxiao Xu <dongxiao...@intel.com>
---
 arch/x86/kvm/vmx.c |  591 ++++++++++++++++++++++++++++------------------------
 1 files changed, 317 insertions(+), 274 deletions(-)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index b036f9c..6687fb6 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -624,6 +624,11 @@ static void vmx_set_segment(struct kvm_vcpu *vcpu,
 static void vmx_get_segment(struct kvm_vcpu *vcpu,
 			    struct kvm_segment *var, int seg);
 
+static inline u64 vmcs12_read(struct kvm_vcpu *vcpu, unsigned long field);
+static inline int vmcs12_write(struct kvm_vcpu *vcpu,
+			       unsigned long field,
+			       u64 value);
+
 static DEFINE_PER_CPU(struct vmcs *, vmxarea);
 static DEFINE_PER_CPU(struct vmcs *, current_vmcs);
 /*
@@ -886,19 +891,19 @@ static inline bool report_flexpriority(void)
 
 static inline bool nested_cpu_has(struct kvm_vcpu *vcpu, u32 bit)
 {
-	return get_vmcs12(vcpu)->cpu_based_vm_exec_control & bit;
+	return vmcs12_read(vcpu, CPU_BASED_VM_EXEC_CONTROL) & bit;
 }
 
 static inline bool nested_cpu_has2(struct kvm_vcpu *vcpu, u32 bit)
 {
-	return (get_vmcs12(vcpu)->cpu_based_vm_exec_control &
+	return (vmcs12_read(vcpu, CPU_BASED_VM_EXEC_CONTROL) &
 			CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) &&
-		(get_vmcs12(vcpu)->secondary_vm_exec_control & bit);
+		(vmcs12_read(vcpu, SECONDARY_VM_EXEC_CONTROL) & bit);
 }
 
 static inline bool nested_cpu_has_virtual_nmis(struct kvm_vcpu *vcpu)
 {
-	return get_vmcs12(vcpu)->pin_based_vm_exec_control &
+	return vmcs12_read(vcpu, PIN_BASED_VM_EXEC_CONTROL) &
 		PIN_BASED_VIRTUAL_NMIS;
 }
 
@@ -910,7 +915,6 @@ static inline bool is_exception(u32 intr_info)
 
 static void nested_vmx_vmexit(struct kvm_vcpu *vcpu);
 static void nested_vmx_entry_failure(struct kvm_vcpu *vcpu,
-			struct vmcs12 *vmcs12,
 			u32 reason, unsigned long qualification);
 
 static int __find_msr_index(struct vcpu_vmx *vmx, u32 msr)
@@ -1215,7 +1219,7 @@ static void update_exception_bitmap(struct kvm_vcpu *vcpu)
 	 * specified above if L1 did not want them.
 	 */
 	if (is_guest_mode(vcpu))
-		eb |= get_vmcs12(vcpu)->exception_bitmap;
+		eb |= vmcs12_read(vcpu, EXCEPTION_BITMAP);
 
 	vmcs_write32(EXCEPTION_BITMAP, eb);
 }
@@ -1577,7 +1581,7 @@ static void vmx_fpu_activate(struct kvm_vcpu *vcpu)
 	vcpu->arch.cr0_guest_owned_bits = X86_CR0_TS;
 	if (is_guest_mode(vcpu))
 		vcpu->arch.cr0_guest_owned_bits &=
-			~get_vmcs12(vcpu)->cr0_guest_host_mask;
+			~vmcs12_read(vcpu, CR0_GUEST_HOST_MASK);
 	vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits);
 }
 
@@ -1588,15 +1592,19 @@ static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu);
 * of the real cr0 used to run the guest (guest_cr0), and the bits shadowed by
 * its hypervisor (cr0_read_shadow).
 */
-static inline unsigned long nested_read_cr0(struct vmcs12 *fields)
+static inline unsigned long nested_read_cr0(struct kvm_vcpu *vcpu)
 {
-	return (fields->guest_cr0 & ~fields->cr0_guest_host_mask) |
-		(fields->cr0_read_shadow & fields->cr0_guest_host_mask);
+	return (vmcs12_read(vcpu, GUEST_CR0) &
+		~vmcs12_read(vcpu, CR0_GUEST_HOST_MASK)) |
+		(vmcs12_read(vcpu, CR0_READ_SHADOW) &
+		vmcs12_read(vcpu, CR0_GUEST_HOST_MASK));
 }
-static inline unsigned long nested_read_cr4(struct vmcs12 *fields)
+static inline unsigned long nested_read_cr4(struct kvm_vcpu *vcpu)
 {
-	return (fields->guest_cr4 & ~fields->cr4_guest_host_mask) |
-		(fields->cr4_read_shadow & fields->cr4_guest_host_mask);
+	return (vmcs12_read(vcpu, GUEST_CR4) &
+		~vmcs12_read(vcpu, CR4_GUEST_HOST_MASK)) |
+		(vmcs12_read(vcpu, CR4_READ_SHADOW) &
+		vmcs12_read(vcpu, CR4_GUEST_HOST_MASK));
 }
 
 static void vmx_fpu_deactivate(struct kvm_vcpu *vcpu)
@@ -1618,10 +1626,10 @@ static void vmx_fpu_deactivate(struct kvm_vcpu *vcpu)
 		 * up-to-date here because we just decached cr0.TS (and we'll
 		 * only update vmcs12->guest_cr0 on nested exit).
 		 */
-		struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
-		vmcs12->guest_cr0 = (vmcs12->guest_cr0 & ~X86_CR0_TS) |
+		u64 guest_cr0 = (vmcs12_read(vcpu, GUEST_CR0) & ~X86_CR0_TS) |
 			(vcpu->arch.cr0 & X86_CR0_TS);
-		vmcs_writel(CR0_READ_SHADOW, nested_read_cr0(vmcs12));
+		vmcs12_write(vcpu, GUEST_CR0, guest_cr0);
+		vmcs_writel
[PATCH 4/4] nested vmx: use a list to store the launched vmcs12 for L1 VMM
The launch state is not a member of the VMCS area; use a separate
variable (a list) to store it instead.

Signed-off-by: Dongxiao Xu <dongxiao...@intel.com>
---
 arch/x86/kvm/vmx.c |   86 +++++++++++++++++++++++++++++++++++++++++++++++---
 1 files changed, 81 insertions(+), 5 deletions(-)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 6687fb6..d03ab4e 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -177,8 +177,7 @@ struct __packed vmcs12 {
 	u32 revision_id;
 	u32 abort;
 
-	u32 launch_state; /* set to 0 by VMCLEAR, to 1 by VMLAUNCH */
-	u32 padding[7]; /* room for future expansion */
+	u32 padding[8]; /* room for future expansion */
 
 	u64 io_bitmap_a;
 	u64 io_bitmap_b;
@@ -339,6 +338,11 @@ struct vmcs02_list {
 	struct loaded_vmcs vmcs02;
 };
 
+struct vmcs12_list {
+	unsigned long vmcs12_pa;
+	struct list_head node;
+};
+
 /*
  * The nested_vmx structure is part of vcpu_vmx, and holds information we need
  * for correct emulation of VMX (i.e., nested VMX) on this vcpu.
@@ -364,6 +368,8 @@ struct nested_vmx {
 	 * we must keep them pinned while L2 runs.
 	 */
 	struct page *apic_access_page;
+	/* vmcs12_pool contains the launched vmcs12. */
+	struct list_head vmcs12_pool;
 };
 
 struct vcpu_vmx {
@@ -614,6 +620,58 @@ static void nested_release_page_clean(struct page *page)
 	kvm_release_page_clean(page);
 }
 
+static int vmcs12_launched(struct list_head *vmcs12_pool,
+			   unsigned long vmcs12_pa)
+{
+	struct vmcs12_list *iter;
+	struct list_head *pos;
+	int launched = 0;
+
+	list_for_each(pos, vmcs12_pool) {
+		iter = list_entry(pos, struct vmcs12_list, node);
+		if (vmcs12_pa == iter->vmcs12_pa) {
+			launched = 1;
+			break;
+		}
+	}
+
+	return launched;
+}
+
+static int set_vmcs12_launched(struct list_head *vmcs12_pool,
+			       unsigned long vmcs12_pa)
+{
+	struct vmcs12_list *vmcs12;
+
+	if (vmcs12_launched(vmcs12_pool, vmcs12_pa))
+		return 0;
+
+	vmcs12 = kzalloc(sizeof(struct vmcs12_list), GFP_KERNEL);
+	if (!vmcs12)
+		return -ENOMEM;
+
+	vmcs12->vmcs12_pa = vmcs12_pa;
+	list_add(&vmcs12->node, vmcs12_pool);
+
+	return 0;
+}
+
+static void clear_vmcs12_launched(struct list_head *vmcs12_pool,
+				  unsigned long vmcs12_pa)
+{
+	struct vmcs12_list *iter;
+	struct list_head *pos;
+
+	list_for_each(pos, vmcs12_pool) {
+		iter = list_entry(pos, struct vmcs12_list, node);
+		if (vmcs12_pa == iter->vmcs12_pa) {
+			list_del(&iter->node);
+			kfree(iter);
+			break;
+		}
+	}
+}
+
 static u64 construct_eptp(unsigned long root_hpa);
 static void kvm_cpu_vmxon(u64 addr);
 static void kvm_cpu_vmxoff(void);
@@ -5111,6 +5169,18 @@ static void nested_free_all_saved_vmcss(struct vcpu_vmx *vmx)
 }
 
 /*
+ * Free the vmcs12 list.
+ */
+static void nested_free_vmcs12_list(struct vcpu_vmx *vmx)
+{
+	struct vmcs12_list *item, *n;
+	list_for_each_entry_safe(item, n, &vmx->nested.vmcs12_pool, node) {
+		list_del(&item->node);
+		kfree(item);
+	}
+}
+
+/*
  * Emulate the VMXON instruction.
 * Currently, we just remember that VMX is active, and do not save or even
 * inspect the argument to VMXON (the so-called VMXON pointer) because we
@@ -5207,6 +5277,7 @@ static void free_nested(struct vcpu_vmx *vmx)
 	}
 
 	nested_free_all_saved_vmcss(vmx);
+	nested_free_vmcs12_list(vmx);
 }
 
 /* Emulate the VMXOFF instruction */
@@ -5359,7 +5430,7 @@ static int handle_vmclear(struct kvm_vcpu *vcpu)
 		return 1;
 	}
 	vmcs12 = kmap(page);
-	vmcs12->launch_state = 0;
+	clear_vmcs12_launched(&vmx->nested.vmcs12_pool, __pa(vmcs12));
 	kunmap(page);
 	nested_release_page(page);
 
@@ -6467,6 +6538,7 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
 
 	vmx->nested.current_vmptr = -1ull;
 	vmx->nested.current_vmcs12 = NULL;
+	INIT_LIST_HEAD(&vmx->nested.vmcs12_pool);
 
 	return &vmx->vcpu;
 
@@ -6846,6 +6918,7 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
 	int cpu;
 	struct loaded_vmcs *vmcs02;
+	int is_launched;
 
 	if (!nested_vmx_check_permission(vcpu) ||
 	    !nested_vmx_check_vmcs12(vcpu))
@@ -6864,7 +6937,9 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
 	 * for misconfigurations which will anyway be caught by the processor
 	 * when using the merged vmcs02.
 	 */
-	if (vmcs12->launch_state == launch) {
+	is_launched =
+		vmcs12_launched(&vmx
[PATCH 0/4] nested vmx code clean up and restructure
This patch series cleans up and restructures part of the nested vmx
code. The main purpose is to abstract the vmcs12_read() and
vmcs12_write() functions. With this change, we have a unified API to
get/set field values from/to vmcs12.

Thanks,
Dongxiao

Dongxiao Xu (4):
  nested vmx: clean up for vmcs12 read and write
  nested vmx: clean up for nested_cpu_has_xxx functions
  nested vmx: use vmcs12_read/write() to operate VMCS fields
  nested vmx: use a list to store the launched vmcs12 for L1 VMM

 arch/x86/kvm/vmx.c |  812 ++++++++++++++++++++++++++++++----------------------
 1 files changed, 467 insertions(+), 345 deletions(-)