Re: [PATCH] KVM/nVMX: Stop mapping the "APIC-access address" page into the kernel

2018-12-03 Thread Raslan, KarimAllah
On Mon, 2018-12-03 at 14:59 +0100, KarimAllah Ahmed wrote:
> The "APIC-access address" is simply a token that the hypervisor puts into
> the PFN of a 4K EPTE (or PTE if using shadow paging) that triggers APIC
> virtualization whenever a page walk terminates with that PFN. This address
> has to be a legal address (i.e.  within the physical address supported by
> the CPU), but it need not have WB memory behind it. In fact, it need not
> have anything at all behind it. When bit 31 ("activate secondary controls")
> of the primary processor-based VM-execution controls is set and bit 0
> ("virtualize APIC accesses") of the secondary processor-based VM-execution
> controls is set, the PFN recorded in the VMCS "APIC-access address" field
> will never be touched. (Instead, the access triggers APIC virtualization,
> which may access the PFN recorded in the "Virtual-APIC address" field of
> the VMCS.)
> 
> So stop mapping the "APIC-access address" page into the kernel and even
> drop the requirements to have a valid page backing it. Instead, just use
> some token that:
> 
> 1) Not one of the valid guest pages.
> 2) Within the physical address supported by the CPU.
> 
> Suggested-by: Jim Mattson 
> Signed-off-by: KarimAllah Ahmed 
> ---
> 
> Thanks Jim for the commit message :)
> ---
>  arch/x86/include/asm/kvm_host.h |  1 +
>  arch/x86/kvm/mmu.c  | 10 ++
>  arch/x86/kvm/vmx.c  | 71 
> ++---
>  3 files changed, 42 insertions(+), 40 deletions(-)
> 
> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index fbda5a9..7e50196 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -1077,6 +1077,7 @@ struct kvm_x86_ops {
>   void (*load_eoi_exitmap)(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap);
>   void (*set_virtual_apic_mode)(struct kvm_vcpu *vcpu);
>   void (*set_apic_access_page_addr)(struct kvm_vcpu *vcpu, hpa_t hpa);
> + bool (*nested_apic_access_addr)(struct kvm_vcpu *vcpu, gpa_t gpa, hpa_t 
> *hpa);
>   void (*deliver_posted_interrupt)(struct kvm_vcpu *vcpu, int vector);
>   int (*sync_pir_to_irr)(struct kvm_vcpu *vcpu);
>   int (*set_tss_addr)(struct kvm *kvm, unsigned int addr);
> diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
> index 7c03c0f..ae46a8d 100644
> --- a/arch/x86/kvm/mmu.c
> +++ b/arch/x86/kvm/mmu.c
> @@ -3962,9 +3962,19 @@ bool kvm_can_do_async_pf(struct kvm_vcpu *vcpu)
>  static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
>gva_t gva, kvm_pfn_t *pfn, bool write, bool *writable)
>  {
> + hpa_t hpa;
>   struct kvm_memory_slot *slot;
>   bool async;
>  
> + if (is_guest_mode(vcpu) &&
> + kvm_x86_ops->nested_apic_access_addr &&
> + kvm_x86_ops->nested_apic_access_addr(vcpu, gfn_to_gpa(gfn), &hpa)) {
> + *pfn = hpa >> PAGE_SHIFT;
> + if (writable)
> + *writable = true;
> + return false;
> + }

Now thinking further about this, I actually still need to validate that the L12 
EPT for this gfn actually contains the apic_access address. To ensure that I 
only fixup the fault when the L1 hypervisor sets up both VMCS L12 APIC_ACCESS 
and L12 EPT to contain the same address.

Will fix and send v2.

> +
>   /*
>* Don't expose private memslots to L2.
>*/
> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
> index 83a614f..340cf56 100644
> --- a/arch/x86/kvm/vmx.c
> +++ b/arch/x86/kvm/vmx.c
> @@ -864,7 +864,6 @@ struct nested_vmx {
>* Guest pages referred to in the vmcs02 with host-physical
>* pointers, so we must keep them pinned while L2 runs.
>*/
> - struct page *apic_access_page;
>   struct kvm_host_map virtual_apic_map;
>   struct kvm_host_map pi_desc_map;
>   struct kvm_host_map msr_bitmap_map;
> @@ -8512,10 +8511,6 @@ static void free_nested(struct kvm_vcpu *vcpu)
>   kfree(vmx->nested.cached_vmcs12);
>   kfree(vmx->nested.cached_shadow_vmcs12);
>   /* Unpin physical memory we referred to in the vmcs02 */
> - if (vmx->nested.apic_access_page) {
> - kvm_release_page_dirty(vmx->nested.apic_access_page);
> - vmx->nested.apic_access_page = NULL;
> - }
>   kvm_vcpu_unmap(&vmx->nested.virtual_apic_map);
>   kvm_vcpu_unmap(&vmx->nested.pi_desc_map);
>   vmx->nested.pi_desc = NULL;
> @@ -11901,41 +11896,27 @@ static void vmx_inject_page_fault_nested(struct 
> kvm_vcpu *vcpu,
>  static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu,
>struct vmcs12 *vmcs12);
>  
> +static hpa_t vmx_apic_access_addr(void)
> +{
> + /*
> +  * The physical address choosen here has to:
> +  * 1) Never be an address that could be assigned to a guest.
> +  * 2) Within the maximum physical limits of the CPU.
> +  *
> +  * So our choice below 

[PATCH] KVM/nVMX: Stop mapping the "APIC-access address" page into the kernel

2018-12-03 Thread KarimAllah Ahmed
The "APIC-access address" is simply a token that the hypervisor puts into
the PFN of a 4K EPTE (or PTE if using shadow paging) that triggers APIC
virtualization whenever a page walk terminates with that PFN. This address
has to be a legal address (i.e.  within the physical address supported by
the CPU), but it need not have WB memory behind it. In fact, it need not
have anything at all behind it. When bit 31 ("activate secondary controls")
of the primary processor-based VM-execution controls is set and bit 0
("virtualize APIC accesses") of the secondary processor-based VM-execution
controls is set, the PFN recorded in the VMCS "APIC-access address" field
will never be touched. (Instead, the access triggers APIC virtualization,
which may access the PFN recorded in the "Virtual-APIC address" field of
the VMCS.)

So stop mapping the "APIC-access address" page into the kernel and even
drop the requirements to have a valid page backing it. Instead, just use
some token that:

1) Not one of the valid guest pages.
2) Within the physical address supported by the CPU.

Suggested-by: Jim Mattson 
Signed-off-by: KarimAllah Ahmed 
---

Thanks Jim for the commit message :)
---
 arch/x86/include/asm/kvm_host.h |  1 +
 arch/x86/kvm/mmu.c  | 10 ++
 arch/x86/kvm/vmx.c  | 71 ++---
 3 files changed, 42 insertions(+), 40 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index fbda5a9..7e50196 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1077,6 +1077,7 @@ struct kvm_x86_ops {
void (*load_eoi_exitmap)(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap);
void (*set_virtual_apic_mode)(struct kvm_vcpu *vcpu);
void (*set_apic_access_page_addr)(struct kvm_vcpu *vcpu, hpa_t hpa);
+   bool (*nested_apic_access_addr)(struct kvm_vcpu *vcpu, gpa_t gpa, hpa_t 
*hpa);
void (*deliver_posted_interrupt)(struct kvm_vcpu *vcpu, int vector);
int (*sync_pir_to_irr)(struct kvm_vcpu *vcpu);
int (*set_tss_addr)(struct kvm *kvm, unsigned int addr);
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 7c03c0f..ae46a8d 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -3962,9 +3962,19 @@ bool kvm_can_do_async_pf(struct kvm_vcpu *vcpu)
 static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
 gva_t gva, kvm_pfn_t *pfn, bool write, bool *writable)
 {
+   hpa_t hpa;
struct kvm_memory_slot *slot;
bool async;
 
+   if (is_guest_mode(vcpu) &&
+   kvm_x86_ops->nested_apic_access_addr &&
+   kvm_x86_ops->nested_apic_access_addr(vcpu, gfn_to_gpa(gfn), &hpa)) {
+   *pfn = hpa >> PAGE_SHIFT;
+   if (writable)
+   *writable = true;
+   return false;
+   }
+
/*
 * Don't expose private memslots to L2.
 */
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 83a614f..340cf56 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -864,7 +864,6 @@ struct nested_vmx {
 * Guest pages referred to in the vmcs02 with host-physical
 * pointers, so we must keep them pinned while L2 runs.
 */
-   struct page *apic_access_page;
struct kvm_host_map virtual_apic_map;
struct kvm_host_map pi_desc_map;
struct kvm_host_map msr_bitmap_map;
@@ -8512,10 +8511,6 @@ static void free_nested(struct kvm_vcpu *vcpu)
kfree(vmx->nested.cached_vmcs12);
kfree(vmx->nested.cached_shadow_vmcs12);
/* Unpin physical memory we referred to in the vmcs02 */
-   if (vmx->nested.apic_access_page) {
-   kvm_release_page_dirty(vmx->nested.apic_access_page);
-   vmx->nested.apic_access_page = NULL;
-   }
kvm_vcpu_unmap(&vmx->nested.virtual_apic_map);
kvm_vcpu_unmap(&vmx->nested.pi_desc_map);
vmx->nested.pi_desc = NULL;
@@ -11901,41 +11896,27 @@ static void vmx_inject_page_fault_nested(struct 
kvm_vcpu *vcpu,
 static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu,
 struct vmcs12 *vmcs12);
 
+static hpa_t vmx_apic_access_addr(void)
+{
+   /*
+* The physical address choosen here has to:
+* 1) Never be an address that could be assigned to a guest.
+* 2) Within the maximum physical limits of the CPU.
+*
+* So our choice below is completely random, but at least it follows
+* these two rules.
+*/
+   return __pa_symbol(_text) & PAGE_MASK;
+}
+
 static void nested_get_vmcs12_pages(struct kvm_vcpu *vcpu)
 {
struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
struct vcpu_vmx *vmx = to_vmx(vcpu);
struct kvm_host_map *map;
-   struct page *page;
-   u64 hpa;
 
-   if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) {
-   /*
-