Re: [PATCH 1/2] KVM: nVMX: Implement EPTP switching for the L1 hypervisor

Bandan Das Fri, 30 Jun 2017 10:21:34 -0700

Hi Paolo,

Paolo Bonzini <pbonz...@redhat.com> writes:


> ----- Original Message -----
>> From: "Bandan Das" <b...@redhat.com>
>> To: k...@vger.kernel.org
>> Cc: pbonz...@redhat.com, linux-kernel@vger.kernel.org
>> Sent: Friday, June 30, 2017 1:29:55 AM
>> Subject: [PATCH 1/2] KVM: nVMX: Implement EPTP switching for the L1 
>> hypervisor
>> 
>> This is a mix of emulation/passthrough to implement EPTP
>> switching for the nested hypervisor.
>> 
>> If the shadow EPT are absent, a vmexit occurs with reason 59.
>> L0 can then create shadow structures based on the entry that the
>> guest calls with to obtain a new root_hpa that can be written to
>> the shadow list and subsequently, reload the mmu to resume L2.
>> On the next vmfunc(0, index) however, the processor will load the
>> entry without an exit.
>
> What happens if the root_hpa is dropped by the L0 MMU?  I'm not sure
> what removes it from the shadow EPT list.

That would result in a vmfunc vmexit, which will jump to handle_vmfunc
and then a call to mmu_alloc_shadow_roots() that will overwrite the shadow
eptp entry with the new one.

I believe part of your question is also that root_hpa is valid, just not
tracking the current l1 eptp and in that case, the processor can jump to
some other guest's page tables which is a problem. In that case, I think it 
should
be possible to invalidate that entry in the shadow eptp list.

> For a first version of the patch, I would prefer a less optimized
> version that always goes through L0 when L2 executes VMFUNC.
> To achieve this effect, you can copy "enable VM functions" secondary
> execution control from vmcs12 to vmcs02, but leave the VM-function
> controls to 0 in vmcs02.

Is the current approach prone to other undesired corner cases like the one
you pointed above ? :) I would be uncomfortable having this in if you feel
having the cpu jump to a new eptp feels like an interesting exploitable
interface; however, I think it would be nice to have l2 execute vmfunc without
exiting to L0. 

Thanks for the quick review!

> Paolo
>
>> Signed-off-by: Bandan Das <b...@redhat.com>
>> ---
>>  arch/x86/include/asm/vmx.h |   5 +++
>>  arch/x86/kvm/vmx.c         | 104
>>  +++++++++++++++++++++++++++++++++++++++++++++
>>  2 files changed, 109 insertions(+)
>> 
>> diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
>> index 35cd06f..e06783e 100644
>> --- a/arch/x86/include/asm/vmx.h
>> +++ b/arch/x86/include/asm/vmx.h
>> @@ -72,6 +72,7 @@
>>  #define SECONDARY_EXEC_PAUSE_LOOP_EXITING   0x00000400
>>  #define SECONDARY_EXEC_RDRAND                       0x00000800
>>  #define SECONDARY_EXEC_ENABLE_INVPCID               0x00001000
>> +#define SECONDARY_EXEC_ENABLE_VMFUNC            0x00002000
>>  #define SECONDARY_EXEC_SHADOW_VMCS              0x00004000
>>  #define SECONDARY_EXEC_RDSEED                       0x00010000
>>  #define SECONDARY_EXEC_ENABLE_PML               0x00020000
>> @@ -114,6 +115,10 @@
>>  #define VMX_MISC_SAVE_EFER_LMA                      0x00000020
>>  #define VMX_MISC_ACTIVITY_HLT                       0x00000040
>>  
>> +/* VMFUNC functions */
>> +#define VMX_VMFUNC_EPTP_SWITCHING               0x00000001
>> +#define VMFUNC_EPTP_ENTRIES  512
>> +
>>  static inline u32 vmx_basic_vmcs_revision_id(u64 vmx_basic)
>>  {
>>      return vmx_basic & GENMASK_ULL(30, 0);
>> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
>> index ca5d2b9..75049c0 100644
>> --- a/arch/x86/kvm/vmx.c
>> +++ b/arch/x86/kvm/vmx.c
>> @@ -240,11 +240,13 @@ struct __packed vmcs12 {
>>      u64 virtual_apic_page_addr;
>>      u64 apic_access_addr;
>>      u64 posted_intr_desc_addr;
>> +    u64 vm_function_control;
>>      u64 ept_pointer;
>>      u64 eoi_exit_bitmap0;
>>      u64 eoi_exit_bitmap1;
>>      u64 eoi_exit_bitmap2;
>>      u64 eoi_exit_bitmap3;
>> +    u64 eptp_list_address;
>>      u64 xss_exit_bitmap;
>>      u64 guest_physical_address;
>>      u64 vmcs_link_pointer;
>> @@ -441,6 +443,7 @@ struct nested_vmx {
>>      struct page *apic_access_page;
>>      struct page *virtual_apic_page;
>>      struct page *pi_desc_page;
>> +    struct page *shadow_eptp_list;
>>      struct pi_desc *pi_desc;
>>      bool pi_pending;
>>      u16 posted_intr_nv;
>> @@ -481,6 +484,7 @@ struct nested_vmx {
>>      u64 nested_vmx_cr4_fixed0;
>>      u64 nested_vmx_cr4_fixed1;
>>      u64 nested_vmx_vmcs_enum;
>> +    u64 nested_vmx_vmfunc_controls;
>>  };
>>  
>>  #define POSTED_INTR_ON  0
>> @@ -1314,6 +1318,22 @@ static inline bool cpu_has_vmx_tsc_scaling(void)
>>              SECONDARY_EXEC_TSC_SCALING;
>>  }
>>  
>> +static inline bool cpu_has_vmx_vmfunc(void)
>> +{
>> +    return vmcs_config.cpu_based_exec_ctrl &
>> +            SECONDARY_EXEC_ENABLE_VMFUNC;
>> +}
>> +
>> +static inline u64 vmx_vmfunc_controls(void)
>> +{
>> +    u64 controls = 0;
>> +
>> +    if (cpu_has_vmx_vmfunc())
>> +            rdmsrl(MSR_IA32_VMX_VMFUNC, controls);
>> +
>> +    return controls;
>> +}
>> +
>>  static inline bool report_flexpriority(void)
>>  {
>>      return flexpriority_enabled;
>> @@ -1388,6 +1408,18 @@ static inline bool nested_cpu_has_posted_intr(struct
>> vmcs12 *vmcs12)
>>      return vmcs12->pin_based_vm_exec_control & PIN_BASED_POSTED_INTR;
>>  }
>>  
>> +static inline bool nested_cpu_has_vmfunc(struct vmcs12 *vmcs12)
>> +{
>> +    return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_VMFUNC);
>> +}
>> +
>> +static inline bool nested_cpu_has_eptp_switching(struct vmcs12 *vmcs12)
>> +{
>> +    return nested_cpu_has_vmfunc(vmcs12) &&
>> +            (vmcs12->vm_function_control &
>> +             VMX_VMFUNC_EPTP_SWITCHING);
>> +}
>> +
>>  static inline bool is_nmi(u32 intr_info)
>>  {
>>      return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK))
>> @@ -3143,6 +3175,9 @@ static int vmx_get_vmx_msr(struct kvm_vcpu *vcpu, u32
>> msr_index, u64 *pdata)
>>              *pdata = vmx->nested.nested_vmx_ept_caps |
>>                      ((u64)vmx->nested.nested_vmx_vpid_caps << 32);
>>              break;
>> +    case MSR_IA32_VMX_VMFUNC:
>> +            *pdata = vmx->nested.nested_vmx_vmfunc_controls;
>> +            break;
>>      default:
>>              return 1;
>>      }
>> @@ -6959,6 +6994,14 @@ static int enter_vmx_operation(struct kvm_vcpu *vcpu)
>>              vmx->vmcs01.shadow_vmcs = shadow_vmcs;
>>      }
>>  
>> +    if (vmx_vmfunc_controls() & 1) {
>> +            struct page *page = alloc_page(GFP_KERNEL | __GFP_ZERO);
>> +
>> +            if (!page)
>> +                    goto out_shadow_vmcs;
>> +            vmx->nested.shadow_eptp_list = page;
>> +    }
>> +
>>      INIT_LIST_HEAD(&(vmx->nested.vmcs02_pool));
>>      vmx->nested.vmcs02_num = 0;
>>  
>> @@ -7128,6 +7171,11 @@ static void free_nested(struct vcpu_vmx *vmx)
>>              vmx->vmcs01.shadow_vmcs = NULL;
>>      }
>>      kfree(vmx->nested.cached_vmcs12);
>> +
>> +    if (vmx->nested.shadow_eptp_list) {
>> +            __free_page(vmx->nested.shadow_eptp_list);
>> +            vmx->nested.shadow_eptp_list = NULL;
>> +    }
>>      /* Unpin physical memory we referred to in current vmcs02 */
>>      if (vmx->nested.apic_access_page) {
>>              nested_release_page(vmx->nested.apic_access_page);
>> @@ -7740,6 +7788,61 @@ static int handle_preemption_timer(struct kvm_vcpu
>> *vcpu)
>>      return 1;
>>  }
>>  
>> +static int handle_vmfunc(struct kvm_vcpu *vcpu)
>> +{
>> +    struct vcpu_vmx *vmx = to_vmx(vcpu);
>> +    struct vmcs12 *vmcs12;
>> +    struct page *page = NULL,
>> +            *shadow_page = vmx->nested.shadow_eptp_list;
>> +    u64 *l1_eptp_list, *shadow_eptp_list;
>> +    u32 index = vcpu->arch.regs[VCPU_REGS_RCX];
>> +    u32 function = vcpu->arch.regs[VCPU_REGS_RAX];
>> +
>> +    if (is_guest_mode(vcpu)) {
>> +            vmcs12 = get_vmcs12(vcpu);
>> +            if (!nested_cpu_has_ept(vmcs12) ||
>> +                !nested_cpu_has_eptp_switching(vmcs12))
>> +                    goto fail;
>> +
>> +            /*
>> +             * Only function 0 is valid, everything upto 63 injects VMFUNC
>> +             * exit reason to L1, and #UD thereafter
>> +             */
>> +            if (function || !vmcs12->eptp_list_address ||
>> +                index >= VMFUNC_EPTP_ENTRIES)
>> +                    goto fail;
>> +
>> +            page = nested_get_page(vcpu, vmcs12->eptp_list_address);
>> +            if (!page)
>> +                    goto fail;
>> +
>> +            l1_eptp_list = kmap(page);
>> +            if (!l1_eptp_list[index])
>> +                    goto fail;
>> +
>> +            kvm_mmu_unload(vcpu);
>> +            /*
>> +             * TODO: Verify that guest ept satisfies vmentry prereqs
>> +             */
>> +            vmcs12->ept_pointer = l1_eptp_list[index];
>> +            shadow_eptp_list = phys_to_virt(page_to_phys(shadow_page));
>> +            kvm_mmu_reload(vcpu);
>> +            shadow_eptp_list[index] =
>> +                    construct_eptp(vcpu->arch.mmu.root_hpa);
>> +            kunmap(page);
>> +
>> +            return kvm_skip_emulated_instruction(vcpu);
>> +    }
>> +
>> +fail:
>> +    if (page)
>> +            kunmap(page);
>> +    nested_vmx_vmexit(vcpu, vmx->exit_reason,
>> +                      vmcs_read32(VM_EXIT_INTR_INFO),
>> +                      vmcs_readl(EXIT_QUALIFICATION));
>> +    return 1;
>> +}
>> +
>>  /*
>>   * The exit handlers return 1 if the exit was handled fully and guest
>>   execution
>>   * may resume.  Otherwise they set the kvm_run parameter to indicate what
>>   needs
>> @@ -7790,6 +7893,7 @@ static int (*const kvm_vmx_exit_handlers[])(struct
>> kvm_vcpu *vcpu) = {
>>      [EXIT_REASON_XSAVES]                  = handle_xsaves,
>>      [EXIT_REASON_XRSTORS]                 = handle_xrstors,
>>      [EXIT_REASON_PML_FULL]                = handle_pml_full,
>> +    [EXIT_REASON_VMFUNC]                  = handle_vmfunc,
>>      [EXIT_REASON_PREEMPTION_TIMER]        = handle_preemption_timer,
>>  };
>>  
>> --
>> 2.9.4
>> 
>>

Re: [PATCH 1/2] KVM: nVMX: Implement EPTP switching for the L1 hypervisor

Reply via email to