RE: [PATCH 14/24] Prepare vmcs02 from vmcs01 and vmcs12
Nadav Har'El wrote: > This patch contains code to prepare the VMCS which can be used to > actually run the L2 guest, vmcs02. prepare_vmcs02 appropriately > merges the information in shadow_vmcs that L1 built for L2 (vmcs12), > and that in the VMCS that we built for L1 (vmcs01). > > VMREAD/WRITE can only access one VMCS at a time (the "current" VMCS), > which makes it difficult for us to read from vmcs01 while writing to > vmcs12. This is why we first make a copy of vmcs01 in memory > (l1_shadow_vmcs) and then read that memory copy while writing to > vmcs12. > > Signed-off-by: Nadav Har'El > --- > --- .before/arch/x86/kvm/vmx.c2010-06-13 15:01:29.0 +0300 > +++ .after/arch/x86/kvm/vmx.c 2010-06-13 15:01:29.0 +0300 > @@ -849,6 +849,36 @@ static inline bool report_flexpriority(v > return flexpriority_enabled; > } > > +static inline bool nested_cpu_has_vmx_tpr_shadow(struct kvm_vcpu > *vcpu) +{ > + return cpu_has_vmx_tpr_shadow() && > + get_shadow_vmcs(vcpu)->cpu_based_vm_exec_control & > + CPU_BASED_TPR_SHADOW; > +} > + > +static inline bool nested_cpu_has_secondary_exec_ctrls(struct > kvm_vcpu *vcpu) +{ > + return cpu_has_secondary_exec_ctrls() && > + get_shadow_vmcs(vcpu)->cpu_based_vm_exec_control & > + CPU_BASED_ACTIVATE_SECONDARY_CONTROLS; > +} > + > +static inline bool nested_vm_need_virtualize_apic_accesses(struct > kvm_vcpu + *vcpu) > +{ > + return nested_cpu_has_secondary_exec_ctrls(vcpu) && > + (get_shadow_vmcs(vcpu)->secondary_vm_exec_control & > + SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES); > +} > + > +static inline bool nested_cpu_has_vmx_ept(struct kvm_vcpu *vcpu) > +{ > + return nested_cpu_has_secondary_exec_ctrls(vcpu) && > + (get_shadow_vmcs(vcpu)->secondary_vm_exec_control & > + SECONDARY_EXEC_ENABLE_EPT); > +} > + > + > static int __find_msr_index(struct vcpu_vmx *vmx, u32 msr) > { > int i; > @@ -1292,6 +1322,39 @@ static void vmx_load_host_state(struct v > preempt_enable(); > } > > +int load_vmcs_host_state(struct shadow_vmcs *src) > +{ > + vmcs_write16(HOST_ES_SELECTOR, src->host_es_selector); > + vmcs_write16(HOST_CS_SELECTOR, src->host_cs_selector); > + vmcs_write16(HOST_SS_SELECTOR, src->host_ss_selector); > + vmcs_write16(HOST_DS_SELECTOR, src->host_ds_selector); > + vmcs_write16(HOST_FS_SELECTOR, src->host_fs_selector); > + vmcs_write16(HOST_GS_SELECTOR, src->host_gs_selector); > + vmcs_write16(HOST_TR_SELECTOR, src->host_tr_selector); > + > + vmcs_write64(TSC_OFFSET, src->tsc_offset); > + > + if (vmcs_config.vmexit_ctrl & VM_EXIT_LOAD_IA32_PAT) > + vmcs_write64(HOST_IA32_PAT, src->host_ia32_pat); > + > + vmcs_write32(HOST_IA32_SYSENTER_CS, src->host_ia32_sysenter_cs); > + > + vmcs_writel(HOST_CR0, src->host_cr0); > + vmcs_writel(HOST_CR3, src->host_cr3); > + vmcs_writel(HOST_CR4, src->host_cr4); > + vmcs_writel(HOST_FS_BASE, src->host_fs_base); > + vmcs_writel(HOST_GS_BASE, src->host_gs_base); > + vmcs_writel(HOST_TR_BASE, src->host_tr_base); > + vmcs_writel(HOST_GDTR_BASE, src->host_gdtr_base); > + vmcs_writel(HOST_IDTR_BASE, src->host_idtr_base); > + vmcs_writel(HOST_RSP, src->host_rsp); > + vmcs_writel(HOST_RIP, src->host_rip); > + vmcs_writel(HOST_IA32_SYSENTER_ESP, src->host_ia32_sysenter_esp); > + vmcs_writel(HOST_IA32_SYSENTER_EIP, src->host_ia32_sysenter_eip); > + > + return 0; > +} > + > /* > * Switches to specified vcpu, until a matching vcpu_put(), but > assumes > * vcpu mutex is already taken. > @@ -1922,6 +1985,71 @@ static void vmclear_local_vcpus(void) > __vcpu_clear(vmx); > } > > +int load_vmcs_common(struct shadow_vmcs *src) > +{ > + vmcs_write16(GUEST_ES_SELECTOR, src->guest_es_selector); > + vmcs_write16(GUEST_CS_SELECTOR, src->guest_cs_selector); > + vmcs_write16(GUEST_SS_SELECTOR, src->guest_ss_selector); > + vmcs_write16(GUEST_DS_SELECTOR, src->guest_ds_selector); > + vmcs_write16(GUEST_FS_SELECTOR, src->guest_fs_selector); > + vmcs_write16(GUEST_GS_SELECTOR, src->guest_gs_selector); > + vmcs_write16(GUEST_LDTR_SELECTOR, src->guest_ldtr_selector); > + vmcs_write16(GUEST_TR_SELECTOR, src->guest_tr_selector); > + > + vmcs_write64(GUEST_IA32_DEBUGCTL, src->guest_ia32_debugctl); > + > + if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) > + vmcs_write64(GUEST_IA32_PAT, src->guest_ia32_pat); > + > + vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, > src->vm_entry_intr_info_field); > + vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, + > src->vm_entry_exception_error_code); > + vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, > src->vm_entry_instruction_len); + + vmcs_write32(GUEST_ES_LIMIT, > src->guest_es_limit); + vmcs_write32(GUEST_CS_LIMIT, > src->guest_cs_limit); + vmcs_write32
Re: [PATCH 14/24] Prepare vmcs02 from vmcs01 and vmcs12
On Sun, Jun 13, 2010 at 03:29:44PM +0300, Nadav Har'El wrote: > This patch contains code to prepare the VMCS which can be used to actually > run the L2 guest, vmcs02. prepare_vmcs02 appropriately merges the information > in shadow_vmcs that L1 built for L2 (vmcs12), and that in the VMCS that we > built for L1 (vmcs01). > > VMREAD/WRITE can only access one VMCS at a time (the "current" VMCS), which > makes it difficult for us to read from vmcs01 while writing to vmcs12. This > is why we first make a copy of vmcs01 in memory (l1_shadow_vmcs) and then > read that memory copy while writing to vmcs12. > Did you mean vmcs02 instead of vmcs12 in above paragraph? > Signed-off-by: Nadav Har'El > --- > --- .before/arch/x86/kvm/vmx.c2010-06-13 15:01:29.0 +0300 > +++ .after/arch/x86/kvm/vmx.c 2010-06-13 15:01:29.0 +0300 > @@ -849,6 +849,36 @@ static inline bool report_flexpriority(v > return flexpriority_enabled; > } > > +static inline bool nested_cpu_has_vmx_tpr_shadow(struct kvm_vcpu *vcpu) > +{ > + return cpu_has_vmx_tpr_shadow() && > + get_shadow_vmcs(vcpu)->cpu_based_vm_exec_control & > + CPU_BASED_TPR_SHADOW; > +} > + > +static inline bool nested_cpu_has_secondary_exec_ctrls(struct kvm_vcpu *vcpu) > +{ > + return cpu_has_secondary_exec_ctrls() && > + get_shadow_vmcs(vcpu)->cpu_based_vm_exec_control & > + CPU_BASED_ACTIVATE_SECONDARY_CONTROLS; > +} > + > +static inline bool nested_vm_need_virtualize_apic_accesses(struct kvm_vcpu > +*vcpu) > +{ > + return nested_cpu_has_secondary_exec_ctrls(vcpu) && > + (get_shadow_vmcs(vcpu)->secondary_vm_exec_control & > + SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES); > +} > + > +static inline bool nested_cpu_has_vmx_ept(struct kvm_vcpu *vcpu) > +{ > + return nested_cpu_has_secondary_exec_ctrls(vcpu) && > + (get_shadow_vmcs(vcpu)->secondary_vm_exec_control & > + SECONDARY_EXEC_ENABLE_EPT); > +} > + > + > static int __find_msr_index(struct vcpu_vmx *vmx, u32 msr) > { > int i; > @@ -1292,6 +1322,39 @@ static void vmx_load_host_state(struct v > preempt_enable(); > } > > +int load_vmcs_host_state(struct shadow_vmcs *src) > +{ > + vmcs_write16(HOST_ES_SELECTOR, src->host_es_selector); > + vmcs_write16(HOST_CS_SELECTOR, src->host_cs_selector); > + vmcs_write16(HOST_SS_SELECTOR, src->host_ss_selector); > + vmcs_write16(HOST_DS_SELECTOR, src->host_ds_selector); > + vmcs_write16(HOST_FS_SELECTOR, src->host_fs_selector); > + vmcs_write16(HOST_GS_SELECTOR, src->host_gs_selector); > + vmcs_write16(HOST_TR_SELECTOR, src->host_tr_selector); > + > + vmcs_write64(TSC_OFFSET, src->tsc_offset); > + > + if (vmcs_config.vmexit_ctrl & VM_EXIT_LOAD_IA32_PAT) > + vmcs_write64(HOST_IA32_PAT, src->host_ia32_pat); > + > + vmcs_write32(HOST_IA32_SYSENTER_CS, src->host_ia32_sysenter_cs); > + > + vmcs_writel(HOST_CR0, src->host_cr0); > + vmcs_writel(HOST_CR3, src->host_cr3); > + vmcs_writel(HOST_CR4, src->host_cr4); > + vmcs_writel(HOST_FS_BASE, src->host_fs_base); > + vmcs_writel(HOST_GS_BASE, src->host_gs_base); > + vmcs_writel(HOST_TR_BASE, src->host_tr_base); > + vmcs_writel(HOST_GDTR_BASE, src->host_gdtr_base); > + vmcs_writel(HOST_IDTR_BASE, src->host_idtr_base); > + vmcs_writel(HOST_RSP, src->host_rsp); > + vmcs_writel(HOST_RIP, src->host_rip); > + vmcs_writel(HOST_IA32_SYSENTER_ESP, src->host_ia32_sysenter_esp); > + vmcs_writel(HOST_IA32_SYSENTER_EIP, src->host_ia32_sysenter_eip); > + > + return 0; > +} > + > /* > * Switches to specified vcpu, until a matching vcpu_put(), but assumes > * vcpu mutex is already taken. > @@ -1922,6 +1985,71 @@ static void vmclear_local_vcpus(void) > __vcpu_clear(vmx); > } > > +int load_vmcs_common(struct shadow_vmcs *src) > +{ > + vmcs_write16(GUEST_ES_SELECTOR, src->guest_es_selector); > + vmcs_write16(GUEST_CS_SELECTOR, src->guest_cs_selector); > + vmcs_write16(GUEST_SS_SELECTOR, src->guest_ss_selector); > + vmcs_write16(GUEST_DS_SELECTOR, src->guest_ds_selector); > + vmcs_write16(GUEST_FS_SELECTOR, src->guest_fs_selector); > + vmcs_write16(GUEST_GS_SELECTOR, src->guest_gs_selector); > + vmcs_write16(GUEST_LDTR_SELECTOR, src->guest_ldtr_selector); > + vmcs_write16(GUEST_TR_SELECTOR, src->guest_tr_selector); > + > + vmcs_write64(GUEST_IA32_DEBUGCTL, src->guest_ia32_debugctl); > + > + if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) > + vmcs_write64(GUEST_IA32_PAT, src->guest_ia32_pat); > + > + vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, src->vm_entry_intr_info_field); > + vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, > + src->vm_entry_exception_error_code); > + vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, src->vm_entry_instruction_len); > + > + vmcs_writ
Re: [PATCH 14/24] Prepare vmcs02 from vmcs01 and vmcs12
On 06/13/2010 03:29 PM, Nadav Har'El wrote: This patch contains code to prepare the VMCS which can be used to actually run the L2 guest, vmcs02. prepare_vmcs02 appropriately merges the information in shadow_vmcs that L1 built for L2 (vmcs12), and that in the VMCS that we built for L1 (vmcs01). VMREAD/WRITE can only access one VMCS at a time (the "current" VMCS), which makes it difficult for us to read from vmcs01 while writing to vmcs12. This is why we first make a copy of vmcs01 in memory (l1_shadow_vmcs) and then read that memory copy while writing to vmcs12. Signed-off-by: Nadav Har'El --- --- .before/arch/x86/kvm/vmx.c 2010-06-13 15:01:29.0 +0300 +++ .after/arch/x86/kvm/vmx.c 2010-06-13 15:01:29.0 +0300 @@ -849,6 +849,36 @@ static inline bool report_flexpriority(v return flexpriority_enabled; } +static inline bool nested_cpu_has_vmx_tpr_shadow(struct kvm_vcpu *vcpu) +{ + return cpu_has_vmx_tpr_shadow()&& + get_shadow_vmcs(vcpu)->cpu_based_vm_exec_control& + CPU_BASED_TPR_SHADOW; Operator precedence is with you, but the line width limit is not. Use parentheses to improve readability. @@ -1292,6 +1322,39 @@ static void vmx_load_host_state(struct v preempt_enable(); } +int load_vmcs_host_state(struct shadow_vmcs *src) +{ + vmcs_write16(HOST_ES_SELECTOR, src->host_es_selector); + vmcs_write16(HOST_CS_SELECTOR, src->host_cs_selector); + vmcs_write16(HOST_SS_SELECTOR, src->host_ss_selector); + vmcs_write16(HOST_DS_SELECTOR, src->host_ds_selector); + vmcs_write16(HOST_FS_SELECTOR, src->host_fs_selector); + vmcs_write16(HOST_GS_SELECTOR, src->host_gs_selector); + vmcs_write16(HOST_TR_SELECTOR, src->host_tr_selector); Why do we need to go through a shadow_vmcs for host fields? Instead of cloning a vmcs, you can call a common init routing to initialize the host fields. + + vmcs_write64(TSC_OFFSET, src->tsc_offset); Don't you need to adjust for our TSC_OFFSET? +/* prepare_vmcs_02 is called in when the L1 guest hypervisor runs its nested + * L2 guest. L1 has a vmcs for L2 (vmcs12), and this function "merges" it + * with L0's wishes for its guest (vmsc01), so we can run the L2 guest in a + * way that will both be appropriate to L1's requests, and our needs. + */ +int prepare_vmcs_02(struct kvm_vcpu *vcpu, + struct shadow_vmcs *vmcs12, struct shadow_vmcs *vmcs01) +{ + u32 exec_control; + + load_vmcs_common(vmcs12); + + vmcs_write64(VMCS_LINK_POINTER, vmcs12->vmcs_link_pointer); Not sure about this. We don't use the vmcs link pointer, it's better to keep it at its default of -1ull. + vmcs_write64(IO_BITMAP_A, vmcs01->io_bitmap_a); + vmcs_write64(IO_BITMAP_B, vmcs01->io_bitmap_b); I guess merging the io bitmaps doesn't make much sense, at least at this stage. + if (cpu_has_vmx_msr_bitmap()) + vmcs_write64(MSR_BITMAP, vmcs01->msr_bitmap); However, merging the msr bitmaps is critical. Perhaps you do it in a later patch. + + if (vmcs12->vm_entry_msr_load_count> 0 || + vmcs12->vm_exit_msr_load_count> 0 || + vmcs12->vm_exit_msr_store_count> 0) { + printk(KERN_WARNING + "%s: VMCS MSR_{LOAD,STORE} unsupported\n", __func__); Unfortunate, since kvm has started to use this feature. For all unsupported mandatory features, we need reporting that is always enabled (i.e. no dprintk()), but to avoid flooding dmesg, use printk_ratelimit() or report just once. Also, it's better to kill the guest (KVM_REQ_TRIPLE_FAULT, or a VM instruction error) than to let it continue incorrectly. + } + + if (nested_cpu_has_vmx_tpr_shadow(vcpu)) { + struct page *page = + nested_get_page(vcpu, vmcs12->virtual_apic_page_addr); + if (!page) + return 1; ? + vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, page_to_phys(page)); + kvm_release_page_clean(page); Hm. The host can move this page around. If it happens not to be mapped into the guest, we won't get any notification. So we need to keep a reference to this page, or else force an exit from the mmu notifier callbacks if it is removed by the host. + } + + if (nested_vm_need_virtualize_apic_accesses(vcpu)) { + struct page *page = + nested_get_page(vcpu, vmcs12->apic_access_addr); + if (!page) + return 1; + vmcs_write64(APIC_ACCESS_ADDR, page_to_phys(page)); + kvm_release_page_clean(page); + } Ditto. + + vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, +(vmcs01->pin_based_vm_exec_control | + vmcs12->pin_based_vm_exec_control)); We don't really want the guest's pin-based contr
[PATCH 14/24] Prepare vmcs02 from vmcs01 and vmcs12
This patch contains code to prepare the VMCS which can be used to actually run the L2 guest, vmcs02. prepare_vmcs02 appropriately merges the information in shadow_vmcs that L1 built for L2 (vmcs12), and that in the VMCS that we built for L1 (vmcs01). VMREAD/WRITE can only access one VMCS at a time (the "current" VMCS), which makes it difficult for us to read from vmcs01 while writing to vmcs12. This is why we first make a copy of vmcs01 in memory (l1_shadow_vmcs) and then read that memory copy while writing to vmcs12. Signed-off-by: Nadav Har'El --- --- .before/arch/x86/kvm/vmx.c 2010-06-13 15:01:29.0 +0300 +++ .after/arch/x86/kvm/vmx.c 2010-06-13 15:01:29.0 +0300 @@ -849,6 +849,36 @@ static inline bool report_flexpriority(v return flexpriority_enabled; } +static inline bool nested_cpu_has_vmx_tpr_shadow(struct kvm_vcpu *vcpu) +{ + return cpu_has_vmx_tpr_shadow() && + get_shadow_vmcs(vcpu)->cpu_based_vm_exec_control & + CPU_BASED_TPR_SHADOW; +} + +static inline bool nested_cpu_has_secondary_exec_ctrls(struct kvm_vcpu *vcpu) +{ + return cpu_has_secondary_exec_ctrls() && + get_shadow_vmcs(vcpu)->cpu_based_vm_exec_control & + CPU_BASED_ACTIVATE_SECONDARY_CONTROLS; +} + +static inline bool nested_vm_need_virtualize_apic_accesses(struct kvm_vcpu + *vcpu) +{ + return nested_cpu_has_secondary_exec_ctrls(vcpu) && + (get_shadow_vmcs(vcpu)->secondary_vm_exec_control & + SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES); +} + +static inline bool nested_cpu_has_vmx_ept(struct kvm_vcpu *vcpu) +{ + return nested_cpu_has_secondary_exec_ctrls(vcpu) && + (get_shadow_vmcs(vcpu)->secondary_vm_exec_control & + SECONDARY_EXEC_ENABLE_EPT); +} + + static int __find_msr_index(struct vcpu_vmx *vmx, u32 msr) { int i; @@ -1292,6 +1322,39 @@ static void vmx_load_host_state(struct v preempt_enable(); } +int load_vmcs_host_state(struct shadow_vmcs *src) +{ + vmcs_write16(HOST_ES_SELECTOR, src->host_es_selector); + vmcs_write16(HOST_CS_SELECTOR, src->host_cs_selector); + vmcs_write16(HOST_SS_SELECTOR, src->host_ss_selector); + vmcs_write16(HOST_DS_SELECTOR, src->host_ds_selector); + vmcs_write16(HOST_FS_SELECTOR, src->host_fs_selector); + vmcs_write16(HOST_GS_SELECTOR, src->host_gs_selector); + vmcs_write16(HOST_TR_SELECTOR, src->host_tr_selector); + + vmcs_write64(TSC_OFFSET, src->tsc_offset); + + if (vmcs_config.vmexit_ctrl & VM_EXIT_LOAD_IA32_PAT) + vmcs_write64(HOST_IA32_PAT, src->host_ia32_pat); + + vmcs_write32(HOST_IA32_SYSENTER_CS, src->host_ia32_sysenter_cs); + + vmcs_writel(HOST_CR0, src->host_cr0); + vmcs_writel(HOST_CR3, src->host_cr3); + vmcs_writel(HOST_CR4, src->host_cr4); + vmcs_writel(HOST_FS_BASE, src->host_fs_base); + vmcs_writel(HOST_GS_BASE, src->host_gs_base); + vmcs_writel(HOST_TR_BASE, src->host_tr_base); + vmcs_writel(HOST_GDTR_BASE, src->host_gdtr_base); + vmcs_writel(HOST_IDTR_BASE, src->host_idtr_base); + vmcs_writel(HOST_RSP, src->host_rsp); + vmcs_writel(HOST_RIP, src->host_rip); + vmcs_writel(HOST_IA32_SYSENTER_ESP, src->host_ia32_sysenter_esp); + vmcs_writel(HOST_IA32_SYSENTER_EIP, src->host_ia32_sysenter_eip); + + return 0; +} + /* * Switches to specified vcpu, until a matching vcpu_put(), but assumes * vcpu mutex is already taken. @@ -1922,6 +1985,71 @@ static void vmclear_local_vcpus(void) __vcpu_clear(vmx); } +int load_vmcs_common(struct shadow_vmcs *src) +{ + vmcs_write16(GUEST_ES_SELECTOR, src->guest_es_selector); + vmcs_write16(GUEST_CS_SELECTOR, src->guest_cs_selector); + vmcs_write16(GUEST_SS_SELECTOR, src->guest_ss_selector); + vmcs_write16(GUEST_DS_SELECTOR, src->guest_ds_selector); + vmcs_write16(GUEST_FS_SELECTOR, src->guest_fs_selector); + vmcs_write16(GUEST_GS_SELECTOR, src->guest_gs_selector); + vmcs_write16(GUEST_LDTR_SELECTOR, src->guest_ldtr_selector); + vmcs_write16(GUEST_TR_SELECTOR, src->guest_tr_selector); + + vmcs_write64(GUEST_IA32_DEBUGCTL, src->guest_ia32_debugctl); + + if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) + vmcs_write64(GUEST_IA32_PAT, src->guest_ia32_pat); + + vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, src->vm_entry_intr_info_field); + vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, +src->vm_entry_exception_error_code); + vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, src->vm_entry_instruction_len); + + vmcs_write32(GUEST_ES_LIMIT, src->guest_es_limit); + vmcs_write32(GUEST_CS_LIMIT, src->guest_cs_limit); + vmcs_write32(GUEST_SS_LIMIT, src->guest_ss_limit); + vmcs_write32(GUEST_DS_LIMIT, src->guest_ds_limit); + vmcs_write32(GUEST