On Mon, 2018-12-03 at 14:59 +0100, KarimAllah Ahmed wrote:
> The "APIC-access address" is simply a token that the hypervisor puts into
> the PFN of a 4K EPTE (or PTE if using shadow paging) that triggers APIC
> virtualization whenever a page walk terminates with that PFN. This address
> has to be a legal address (i.e. within the physical address supported by
> the CPU), but it need not have WB memory behind it. In fact, it need not
> have anything at all behind it. When bit 31 ("activate secondary controls")
> of the primary processor-based VM-execution controls is set and bit 0
> ("virtualize APIC accesses") of the secondary processor-based VM-execution
> controls is set, the PFN recorded in the VMCS "APIC-access address" field
> will never be touched. (Instead, the access triggers APIC virtualization,
> which may access the PFN recorded in the "Virtual-APIC address" field of
> the VMCS.)
>
> So stop mapping the "APIC-access address" page into the kernel and even
> drop the requirements to have a valid page backing it. Instead, just use
> some token that:
>
> 1) Not one of the valid guest pages.
> 2) Within the physical address supported by the CPU.
>
> Suggested-by: Jim Mattson <jmatt...@google.com>
> Signed-off-by: KarimAllah Ahmed <karah...@amazon.de>
> ---
>
> Thanks Jim for the commit message :)
> ---
>  arch/x86/include/asm/kvm_host.h |  1 +
>  arch/x86/kvm/mmu.c              | 10 ++++++
>  arch/x86/kvm/vmx.c              | 71 ++++++++++++++++++-----------------
>  3 files changed, 42 insertions(+), 40 deletions(-)
>
> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index fbda5a9..7e50196 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -1077,6 +1077,7 @@ struct kvm_x86_ops {
>  	void (*load_eoi_exitmap)(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap);
>  	void (*set_virtual_apic_mode)(struct kvm_vcpu *vcpu);
>  	void (*set_apic_access_page_addr)(struct kvm_vcpu *vcpu, hpa_t hpa);
> +	bool (*nested_apic_access_addr)(struct kvm_vcpu *vcpu, gpa_t gpa, hpa_t *hpa);
>  	void (*deliver_posted_interrupt)(struct kvm_vcpu *vcpu, int vector);
>  	int (*sync_pir_to_irr)(struct kvm_vcpu *vcpu);
>  	int (*set_tss_addr)(struct kvm *kvm, unsigned int addr);
> diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
> index 7c03c0f..ae46a8d 100644
> --- a/arch/x86/kvm/mmu.c
> +++ b/arch/x86/kvm/mmu.c
> @@ -3962,9 +3962,19 @@ bool kvm_can_do_async_pf(struct kvm_vcpu *vcpu)
>  static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
>  			 gva_t gva, kvm_pfn_t *pfn, bool write, bool *writable)
>  {
> +	hpa_t hpa;
>  	struct kvm_memory_slot *slot;
>  	bool async;
>
> +	if (is_guest_mode(vcpu) &&
> +	    kvm_x86_ops->nested_apic_access_addr &&
> +	    kvm_x86_ops->nested_apic_access_addr(vcpu, gfn_to_gpa(gfn), &hpa)) {
> +		*pfn = hpa >> PAGE_SHIFT;
> +		if (writable)
> +			*writable = true;
> +		return false;
> +	}

Thinking about this further, I still need to validate that the L12 EPT (the
EPT that L1 built for L2) actually maps this gfn to the apic_access address,
so that I only fix up the fault when the L1 hypervisor points both the vmcs12
APIC_ACCESS_ADDR and the L12 EPT at the same address. Will fix and send v2;
a rough sketch of the extra check is at the bottom of this mail.

> +
>  	/*
>  	 * Don't expose private memslots to L2.
>  	 */
> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
> index 83a614f..340cf56 100644
> --- a/arch/x86/kvm/vmx.c
> +++ b/arch/x86/kvm/vmx.c
> @@ -864,7 +864,6 @@ struct nested_vmx {
>  	 * Guest pages referred to in the vmcs02 with host-physical
>  	 * pointers, so we must keep them pinned while L2 runs.
>  	 */
> -	struct page *apic_access_page;
>  	struct kvm_host_map virtual_apic_map;
>  	struct kvm_host_map pi_desc_map;
>  	struct kvm_host_map msr_bitmap_map;
> @@ -8512,10 +8511,6 @@ static void free_nested(struct kvm_vcpu *vcpu)
>  	kfree(vmx->nested.cached_vmcs12);
>  	kfree(vmx->nested.cached_shadow_vmcs12);
>  	/* Unpin physical memory we referred to in the vmcs02 */
> -	if (vmx->nested.apic_access_page) {
> -		kvm_release_page_dirty(vmx->nested.apic_access_page);
> -		vmx->nested.apic_access_page = NULL;
> -	}
>  	kvm_vcpu_unmap(&vmx->nested.virtual_apic_map);
>  	kvm_vcpu_unmap(&vmx->nested.pi_desc_map);
>  	vmx->nested.pi_desc = NULL;
> @@ -11901,41 +11896,27 @@ static void vmx_inject_page_fault_nested(struct kvm_vcpu *vcpu,
>  static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu,
>  						 struct vmcs12 *vmcs12);
>
> +static hpa_t vmx_apic_access_addr(void)
> +{
> +	/*
> +	 * The physical address choosen here has to:
> +	 * 1) Never be an address that could be assigned to a guest.
> +	 * 2) Within the maximum physical limits of the CPU.
> +	 *
> +	 * So our choice below is completely random, but at least it follows
> +	 * these two rules.
> +	 */
> +	return __pa_symbol(_text) & PAGE_MASK;
> +}
> +
>  static void nested_get_vmcs12_pages(struct kvm_vcpu *vcpu)
>  {
>  	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
>  	struct vcpu_vmx *vmx = to_vmx(vcpu);
>  	struct kvm_host_map *map;
> -	struct page *page;
> -	u64 hpa;
>
> -	if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) {
> -		/*
> -		 * Translate L1 physical address to host physical
> -		 * address for vmcs02. Keep the page pinned, so this
> -		 * physical address remains valid. We keep a reference
> -		 * to it so we can release it later.
> -		 */
> -		if (vmx->nested.apic_access_page) { /* shouldn't happen */
> -			kvm_release_page_dirty(vmx->nested.apic_access_page);
> -			vmx->nested.apic_access_page = NULL;
> -		}
> -		page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->apic_access_addr);
> -		/*
> -		 * If translation failed, no matter: This feature asks
> -		 * to exit when accessing the given address, and if it
> -		 * can never be accessed, this feature won't do
> -		 * anything anyway.
> -		 */
> -		if (!is_error_page(page)) {
> -			vmx->nested.apic_access_page = page;
> -			hpa = page_to_phys(vmx->nested.apic_access_page);
> -			vmcs_write64(APIC_ACCESS_ADDR, hpa);
> -		} else {
> -			vmcs_clear_bits(SECONDARY_VM_EXEC_CONTROL,
> -					SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES);
> -		}
> -	}
> +	if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES))
> +		vmcs_write64(APIC_ACCESS_ADDR, vmx_apic_access_addr());
>
>  	if (nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) {
>  		map = &vmx->nested.virtual_apic_map;
> @@ -14196,12 +14177,6 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
>  	/* This is needed for same reason as it was needed in prepare_vmcs02 */
>  	vmx->host_rsp = 0;
>
> -	/* Unpin physical memory we referred to in vmcs02 */
> -	if (vmx->nested.apic_access_page) {
> -		kvm_release_page_dirty(vmx->nested.apic_access_page);
> -		vmx->nested.apic_access_page = NULL;
> -	}
> -
>  	kvm_vcpu_unmap(&vmx->nested.virtual_apic_map);
>  	kvm_vcpu_unmap(&vmx->nested.pi_desc_map);
>  	vmx->nested.pi_desc = NULL;
> @@ -14949,6 +14924,21 @@ static int vmx_set_nested_state(struct kvm_vcpu *vcpu,
>  	return 0;
>  }
>
> +static bool vmx_nested_apic_access_addr(struct kvm_vcpu *vcpu,
> +					gpa_t gpa, hpa_t *hpa)
> +{
> +	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
> +
> +	if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES) &&
> +	    page_address_valid(vcpu, vmcs12->apic_access_addr) &&
> +	    (gpa_to_gfn(gpa) == gpa_to_gfn(vmcs12->apic_access_addr))) {
> +		*hpa = vmx_apic_access_addr();
> +		return true;
> +	}
> +
> +	return false;
> +}
> +
>  static struct kvm_x86_ops vmx_x86_ops __ro_after_init = {
>  	.cpu_has_kvm_support = cpu_has_kvm_support,
>  	.disabled_by_bios = vmx_disabled_by_bios,
> @@ -15022,6 +15012,7 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = {
>  	.update_cr8_intercept = update_cr8_intercept,
>  	.set_virtual_apic_mode = vmx_set_virtual_apic_mode,
>  	.set_apic_access_page_addr = vmx_set_apic_access_page_addr,
> +	.nested_apic_access_addr = vmx_nested_apic_access_addr,
>  	.get_enable_apicv = vmx_get_enable_apicv,
>  	.refresh_apicv_exec_ctrl = vmx_refresh_apicv_exec_ctrl,
>  	.load_eoi_exitmap = vmx_load_eoi_exitmap,
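
For v2, the vmx side of the check would look roughly like the sketch below.
This is untested and only meant to show the shape of what I have in mind: the
hook is assumed to grow an extra parameter for the faulting L2 guest-physical
address, and nested_ept_walk_addr() is a placeholder name for whatever we end
up using to walk the EPT tables that vmcs12->ept_pointer points to (no such
helper exists today).

/*
 * Untested sketch for v2.  Assumptions (not in the current patch):
 *  - the kvm_x86_ops hook also receives the faulting L2 guest-physical
 *    address (l2_gpa),
 *  - nested_ept_walk_addr() is a hypothetical helper that walks the EPT
 *    tables L1 built for L2 (vmcs12->ept_pointer) and returns the L1
 *    guest-physical address the access translates to.
 */
static bool vmx_nested_apic_access_addr(struct kvm_vcpu *vcpu, gpa_t l2_gpa,
					gpa_t l1_gpa, hpa_t *hpa)
{
	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
	gpa_t translated;

	if (!nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES) ||
	    !page_address_valid(vcpu, vmcs12->apic_access_addr))
		return false;

	/* The faulting L1 address has to be the APIC-access address ... */
	if (gpa_to_gfn(l1_gpa) != gpa_to_gfn(vmcs12->apic_access_addr))
		return false;

	/*
	 * ... and the L12 EPT has to map the faulting L2 address to that
	 * same address, otherwise leave the fault alone.
	 */
	if (!nested_ept_walk_addr(vcpu, l2_gpa, &translated) ||
	    gpa_to_gfn(translated) != gpa_to_gfn(vmcs12->apic_access_addr))
		return false;

	*hpa = vmx_apic_access_addr();
	return true;
}

try_async_pf() would then pass both the gva it already gets (which should be
the L2 guest-physical address when L1 uses EPT for L2, if I am reading the
paging_tmpl code right) and gfn_to_gpa(gfn) into the hook, so the fault is
only fixed up when both views agree on the APIC-access address.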