On Thu, Dec 10, 2009 at 08:38:24PM +0200, [email protected] wrote:
> From: Orit Wasserman <[email protected]>
>
> ---
> arch/x86/kvm/vmx.c | 235 +++++++++++++++++++++++++++++++++++++++++++++++++++-
> arch/x86/kvm/x86.c | 5 +-
> arch/x86/kvm/x86.h | 3 +
> 3 files changed, 240 insertions(+), 3 deletions(-)
>
> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
> index 2726a6c..a7ffd5e 100644
> --- a/arch/x86/kvm/vmx.c
> +++ b/arch/x86/kvm/vmx.c
> @@ -93,13 +93,39 @@ struct shared_msr_entry {
> };
>
> struct __attribute__ ((__packed__)) level_state {
> + /* Has the level1 guest done vmclear? */
> + bool vmclear;
> +};
> +
> +/*
> + * This structure is mapped to guest memory.
> + * It is packed in order to preserve the binary content
> + * after live migration.
> + * If there are changes in the content or layout, the revision_id
> + * must be updated.
> + */
> +struct __attribute__ ((__packed__)) nested_vmcs_page {
> + u32 revision_id;
> + u32 abort;
> + struct level_state l2_state;
> +};
> +
> +struct nested_vmcs_list {
> + struct list_head list;
> + gpa_t vmcs_addr;
> + struct vmcs *l2_vmcs;
> };
>
> struct nested_vmx {
> /* Has the level1 guest done vmxon? */
> bool vmxon;
> + /* Location of the current vmcs that l1 keeps for l2 */
> + gpa_t current_vmptr;
> /* Level 1 state for switching to level 2 and back */
> struct level_state *l1_state;
> + /* list of vmcs for each l2 guest created by l1 */
> + struct list_head l2_vmcs_list;
> + /* l2 page corresponding to the current vmcs set by l1 */
> + struct nested_vmcs_page *current_l2_page;
> };
>
> struct vcpu_vmx {
> @@ -156,6 +182,76 @@ static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu)
> return container_of(vcpu, struct vcpu_vmx, vcpu);
> }
>
> +static struct page *nested_get_page(struct kvm_vcpu *vcpu,
> + u64 vmcs_addr)
> +{
> + struct page *vmcs_page = NULL;
> +
> + down_read(&current->mm->mmap_sem);
> + vmcs_page = gfn_to_page(vcpu->kvm, vmcs_addr >> PAGE_SHIFT);
> + up_read(&current->mm->mmap_sem);
> +
> + if (is_error_page(vmcs_page)) {
> + printk(KERN_ERR "%s error allocating page 0x%llx\n",
> + __func__, vmcs_addr);
> + kvm_release_page_clean(vmcs_page);
> + return NULL;
> + }
> +
> + return vmcs_page;
> +}
> +
> +static int nested_map_current(struct kvm_vcpu *vcpu)
> +{
> + struct vcpu_vmx *vmx = to_vmx(vcpu);
> + struct page *vmcs_page =
> + nested_get_page(vcpu, vmx->nested.current_vmptr);
> + struct nested_vmcs_page *mapped_page;
> +
> + if (vmcs_page == NULL) {
> + printk(KERN_INFO "%s: failure in nested_get_page\n", __func__);
> + return 0;
> + }
> +
> + if (vmx->nested.current_l2_page) {
> + printk(KERN_INFO "%s: shadow vmcs already mapped\n", __func__);
> + WARN_ON(1);
> + return 0;
> + }
> +
> + mapped_page = kmap_atomic(vmcs_page, KM_USER0);
> +
> + if (!mapped_page) {
> + printk(KERN_INFO "%s: error in kmap_atomic\n", __func__);
> + return 0;
> + }
> +
> + vmx->nested.current_l2_page = mapped_page;
> +
> + return 1;
> +}
> +
> +static void nested_unmap_current(struct kvm_vcpu *vcpu)
> +{
> + struct page *page;
> + struct vcpu_vmx *vmx = to_vmx(vcpu);
> +
> + if (!vmx->nested.current_l2_page) {
> + printk(KERN_INFO "Shadow vmcs already unmapped\n");
> + WARN_ON(1);
> + return;
> + }
> +
> + page = kmap_atomic_to_page(vmx->nested.current_l2_page);
> +
> + kunmap_atomic(vmx->nested.current_l2_page, KM_USER0);
> +
> + kvm_release_page_dirty(page);
> +
> + vmx->nested.current_l2_page = NULL;
> +}
> +
> static int init_rmode(struct kvm *kvm);
> static u64 construct_eptp(unsigned long root_hpa);
>
> @@ -1144,6 +1240,35 @@ static int nested_vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
> return 0;
> }
>
> +static int read_guest_vmcs_gpa(struct kvm_vcpu *vcpu, gva_t gva, u64 *gentry)
> +{
> + int r = 0;
> + uint size;
> +
> + *gentry = 0;
> +
> + if (is_long_mode(vcpu))
> + size = sizeof(u64);
> + else
> + size = sizeof(u32);
> +
> + r = kvm_read_guest_virt(gva, gentry,
> + size, vcpu);
> + if (r) {
> + printk(KERN_ERR "%s cannot read guest vmcs addr %lx : %d\n",
> + __func__, gva, r);
> + return r;
> + }
> +
> + if (!IS_ALIGNED(*gentry, PAGE_SIZE)) {
> + printk(KERN_DEBUG "%s addr %llx not aligned\n",
> + __func__, *gentry);
> + return 1;
> + }
> +
> + return 0;
> +}
> +
> /*
> * Writes msr value into the appropriate "register".
> * Returns 0 on success, non-0 otherwise.
> @@ -1316,6 +1441,7 @@ static int create_l1_state(struct kvm_vcpu *vcpu)
> } else
> return 0;
>
> + INIT_LIST_HEAD(&(vmx->nested.l2_vmcs_list));
> return 0;
> }
>
> @@ -1488,15 +1614,35 @@ static void free_vmcs(struct vmcs *vmcs)
> free_pages((unsigned long)vmcs, vmcs_config.order);
> }
>
> +static void nested_free_current_vmcs(struct kvm_vcpu *vcpu)
> +{
> + struct vcpu_vmx *vmx = to_vmx(vcpu);
> + struct nested_vmcs_list *list_item, *n;
> +
> + list_for_each_entry_safe(list_item, n, &vmx->nested.l2_vmcs_list, list)
> + if (list_item->vmcs_addr == vmx->nested.current_vmptr) {
> + free_vmcs(list_item->l2_vmcs);
> + list_del(&(list_item->list));
> + return;
> + }
> +}
> +
> static void free_l1_state(struct kvm_vcpu *vcpu)
> {
> struct vcpu_vmx *vmx = to_vmx(vcpu);
> + struct nested_vmcs_list *list_item, *n;
>
> if (!vmx->nested.l1_state)
> return;
>
> kfree(vmx->nested.l1_state);
> vmx->nested.l1_state = NULL;
> +
> + list_for_each_entry_safe(list_item, n, &vmx->nested.l2_vmcs_list,
> + list) {
> + free_vmcs(list_item->l2_vmcs);
> + list_del(&(list_item->list));
> + }
> }
>
>
> @@ -3352,6 +3498,93 @@ static int handle_vmx_insn(struct kvm_vcpu *vcpu)
> return 1;
> }
>
> +static void clear_rflags_cf_zf(struct kvm_vcpu *vcpu)
> +{
> + unsigned long rflags;
> + rflags = vmx_get_rflags(vcpu);
> + rflags &= ~(X86_EFLAGS_CF | X86_EFLAGS_ZF);
> + vmx_set_rflags(vcpu, rflags);
> +}
> +
> +/*
> + * Decode the memory address (operand) of a vmx instruction according
> + * to Table 23-12/23-11.
> + * For additional information regarding offset calculation see 3.7.5.
> + */
> +static gva_t get_vmx_mem_address(struct kvm_vcpu *vcpu,
> + unsigned long exit_qualification,
> + u32 vmx_instruction_info)
> +{
> + int scaling = vmx_instruction_info & 3; /* bits 0:1 scaling */
> + int addr_size = (vmx_instruction_info >> 7) & 7; /* bits 7:9 address size, 0=16bit, 1=32bit, 2=64bit */
> + bool is_reg = vmx_instruction_info & (1u << 10); /* bit 10 1=register operand, 0=memory */
> + int seg_reg = (vmx_instruction_info >> 15) & 7; /* bits 15:17 segment register */
> + int index_reg = (vmx_instruction_info >> 18) & 0xf; /* bits 18:21 index register */
> + bool index_is_valid = !(vmx_instruction_info & (1u << 22)); /* bit 22 index register validity, 0=valid, 1=invalid */
> + int base_reg = (vmx_instruction_info >> 23) & 0xf; /* bits 23:26 base register */
> + bool base_is_valid = !(vmx_instruction_info & (1u << 27)); /* bit 27 base register validity, 0=valid, 1=invalid */
> + gva_t addr;
> +
> + if (is_reg)
> + return 0;
> +
> + switch (addr_size) {
> + case 1:
> + exit_qualification &= 0xffffffff; /* the high 32 bits are undefined according to the spec, page 23-7 */
> + break;
> + case 2:
> + break;
> + default:
> + return 0;
> + }
> +
> + /* Addr = segment_base + offset */
> + /* offset = Base + [Index * Scale] + Displacement, see Figure 3-11 */
> + addr = vmx_get_segment_base(vcpu, seg_reg);
> + if (base_is_valid)
> + addr += kvm_register_read(vcpu, base_reg);
> + if (index_is_valid)
> + addr += kvm_register_read(vcpu, index_reg) << scaling; /* scale factor is 2^scaling */
> + addr += exit_qualification; /* exit qualification holds the displacement, spec page 23-7 */
> +
> + return addr;
> +}
> +
> +static int handle_vmclear(struct kvm_vcpu *vcpu)
> +{
> + struct vcpu_vmx *vmx = to_vmx(vcpu);
> + struct level_state *l2_state;
> + gpa_t guest_vmcs_addr;
> + unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
> + u32 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
> + gva_t vmcs_gva;
> +
> + if (!nested_vmx_check_permission(vcpu))
> + return 1;
> +
> + vmcs_gva = get_vmx_mem_address(vcpu, exit_qualification,
> + vmx_instruction_info);
> +
> + if (read_guest_vmcs_gpa(vcpu, vmcs_gva, &guest_vmcs_addr))
> + return 1;
> +
Should check that the vmcs address is 4K aligned and that the given
address is not equal to the vmxon pointer.
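Something along these lines, perhaps (untested sketch; "vmxon_ptr" is
not in this patch, it would have to be a field in struct nested_vmx
recorded when handling vmxon):

	/* VMCLEAR must fail if its operand is not 4K aligned or if
	 * it points at the vmxon region, instead of silently
	 * succeeding. */
	if (!IS_ALIGNED(guest_vmcs_addr, PAGE_SIZE) ||
	    guest_vmcs_addr == vmx->nested.vmxon_ptr) {
		/* roughly VMfailInvalid: raise RFLAGS.CF for the
		 * level1 guest */
		vmx_set_rflags(vcpu, vmx_get_rflags(vcpu) | X86_EFLAGS_CF);
		skip_emulated_instruction(vcpu);
		return 1;
	}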
> + vmx->nested.current_vmptr = guest_vmcs_addr;
vmclear doesn't change the current vmcs pointer.
> + if (!nested_map_current(vcpu))
> + return 1;
> +
> + l2_state = &(to_vmx(vcpu)->nested.current_l2_page->l2_state);
> + l2_state->vmclear = 1;
> + nested_free_current_vmcs(vcpu);
> +
> + vmx->nested.current_vmptr = -1ull;
> +
vmclear resets the current vmcs pointer to -1 only if it was called
with the current vmcs pointer as an argument.
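I.e. the tail of handle_vmclear should look roughly like this (sketch
only): mark the vmcs at guest_vmcs_addr as cleared without touching
current_vmptr, and only invalidate the pointer on a match:

	/* the operand vmcs becomes inactive; the current-VMCS
	 * pointer is cleared only when the operand is the current
	 * vmcs */
	if (guest_vmcs_addr == vmx->nested.current_vmptr)
		vmx->nested.current_vmptr = -1ull;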
> + nested_unmap_current(vcpu);
> +
> + skip_emulated_instruction(vcpu);
> + clear_rflags_cf_zf(vcpu);
> +
> + return 1;
> +}
> +
> static int handle_vmoff(struct kvm_vcpu *vcpu)
> {
> struct vcpu_vmx *vmx = to_vmx(vcpu);
> @@ -3695,7 +3928,7 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
> [EXIT_REASON_HLT] = handle_halt,
> [EXIT_REASON_INVLPG] = handle_invlpg,
> [EXIT_REASON_VMCALL] = handle_vmcall,
> - [EXIT_REASON_VMCLEAR] = handle_vmx_insn,
> + [EXIT_REASON_VMCLEAR] = handle_vmclear,
> [EXIT_REASON_VMLAUNCH] = handle_vmx_insn,
> [EXIT_REASON_VMPTRLD] = handle_vmx_insn,
> [EXIT_REASON_VMPTRST] = handle_vmx_insn,
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index b698952..e5acf22 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -2773,8 +2773,8 @@ static int vcpu_mmio_read(struct kvm_vcpu *vcpu, gpa_t addr, int len, void *v)
> return kvm_io_bus_read(&vcpu->kvm->mmio_bus, addr, len, v);
> }
>
> -static int kvm_read_guest_virt(gva_t addr, void *val, unsigned int bytes,
> - struct kvm_vcpu *vcpu)
> +int kvm_read_guest_virt(gva_t addr, void *val, unsigned int bytes,
> + struct kvm_vcpu *vcpu)
> {
> void *data = val;
> int r = X86EMUL_CONTINUE;
> @@ -2802,6 +2802,7 @@ static int kvm_read_guest_virt(gva_t addr, void *val, unsigned int bytes,
> out:
> return r;
> }
> +EXPORT_SYMBOL_GPL(kvm_read_guest_virt);
>
> static int kvm_write_guest_virt(gva_t addr, void *val, unsigned int bytes,
> struct kvm_vcpu *vcpu)
> diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
> index 57204cb..2d7b2dc 100644
> --- a/arch/x86/kvm/x86.h
> +++ b/arch/x86/kvm/x86.h
> @@ -35,6 +35,9 @@ static inline bool kvm_exception_is_soft(unsigned int nr)
> struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu,
> u32 function, u32 index);
>
> +int kvm_read_guest_virt(gva_t addr, void *val, unsigned int bytes,
> + struct kvm_vcpu *vcpu);
> +
> extern int nested;
>
> #endif
> --
> 1.6.0.4
>
--
Gleb.