[PATCH] mm: sparse: Skip no-map regions in memblocks_present

2019-07-12 Thread KarimAllah Ahmed
Do not mark regions that carry the nomap flag as present, otherwise these
memblocks cause unnecessary allocation of metadata.

Cc: Andrew Morton 
Cc: Pavel Tatashin 
Cc: Oscar Salvador 
Cc: Michal Hocko 
Cc: Mike Rapoport 
Cc: Baoquan He 
Cc: Qian Cai 
Cc: Wei Yang 
Cc: Logan Gunthorpe 
Cc: linux...@kvack.org
Cc: linux-kernel@vger.kernel.org
Signed-off-by: KarimAllah Ahmed 
---
 mm/sparse.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/mm/sparse.c b/mm/sparse.c
index fd13166..33810b6 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -256,6 +256,10 @@ void __init memblocks_present(void)
struct memblock_region *reg;
 
for_each_memblock(memory, reg) {
+
+   if (memblock_is_nomap(reg))
+   continue;
+
memory_present(memblock_get_region_node(reg),
   memblock_region_memory_base_pfn(reg),
   memblock_region_memory_end_pfn(reg));
-- 
2.7.4



[PATCH] fdt: Properly handle "no-map" field in the memory region

2019-07-12 Thread KarimAllah Ahmed
Mark the memory region with the NOMAP flag instead of completely removing it
from the memory blocks. This makes the FDT handling consistent with the EFI
memory map handling.

Cc: Rob Herring 
Cc: Frank Rowand 
Cc: devicet...@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
Signed-off-by: KarimAllah Ahmed 
---
 drivers/of/fdt.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/of/fdt.c b/drivers/of/fdt.c
index de893c9..77982ae 100644
--- a/drivers/of/fdt.c
+++ b/drivers/of/fdt.c
@@ -1175,7 +1175,7 @@ int __init __weak 
early_init_dt_reserve_memory_arch(phys_addr_t base,
phys_addr_t size, bool nomap)
 {
if (nomap)
-   return memblock_remove(base, size);
+   return memblock_mark_nomap(base, size);
return memblock_reserve(base, size);
 }
 
-- 
2.7.4



[PATCH] arm: Extend the check for RAM in /dev/mem

2019-07-11 Thread KarimAllah Ahmed
Some valid RAM can live outside kernel control (e.g. when it is excluded via
the mem= kernel command-line parameter). For such regions pfn_valid() returns
"false", causing system RAM to be mapped as uncached through /dev/mem. Use
memblock instead to identify RAM.

Cc: Russell King 
Cc: Catalin Marinas 
Cc: Will Deacon 
Cc: Mike Rapoport 
Cc: Andrew Morton 
Cc: Anders Roxell 
Cc: Enrico Weigelt 
Cc: Thomas Gleixner 
Cc: KarimAllah Ahmed 
Cc: Mark Rutland 
Cc: James Morse 
Cc: Anshuman Khandual 
Cc: Jun Yao 
Cc: Yu Zhao 
Cc: Robin Murphy 
Cc: Ard Biesheuvel 
Cc: linux-arm-ker...@lists.infradead.org
Cc: linux-kernel@vger.kernel.org
Signed-off-by: KarimAllah Ahmed 
---
 arch/arm/mm/mmu.c   | 2 +-
 arch/arm64/mm/mmu.c | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/arm/mm/mmu.c b/arch/arm/mm/mmu.c
index 1aa2586..492774b 100644
--- a/arch/arm/mm/mmu.c
+++ b/arch/arm/mm/mmu.c
@@ -705,7 +705,7 @@ static void __init build_mem_type_table(void)
 pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn,
  unsigned long size, pgprot_t vma_prot)
 {
-   if (!pfn_valid(pfn))
+   if (!memblock_is_memory(__pfn_to_phys(pfn)))
return pgprot_noncached(vma_prot);
else if (file->f_flags & O_SYNC)
return pgprot_writecombine(vma_prot);
diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
index 3645f29..cdc3e8e 100644
--- a/arch/arm64/mm/mmu.c
+++ b/arch/arm64/mm/mmu.c
@@ -78,7 +78,7 @@ void set_swapper_pgd(pgd_t *pgdp, pgd_t pgd)
 pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn,
  unsigned long size, pgprot_t vma_prot)
 {
-   if (!pfn_valid(pfn))
+   if (!memblock_is_memory(__pfn_to_phys(pfn)))
return pgprot_noncached(vma_prot);
else if (file->f_flags & O_SYNC)
return pgprot_writecombine(vma_prot);
-- 
2.7.4



[PATCH v6 03/14] X86/KVM: Handle PFNs outside of kernel reach when touching GPTEs

2019-01-31 Thread KarimAllah Ahmed
From: Filippo Sironi 

cmpxchg_gpte() calls get_user_pages_fast() to retrieve the number of
pages and the respective struct page to map in the kernel virtual
address space.
This doesn't work if get_user_pages_fast() is invoked with a userspace
virtual address that's backed by PFNs outside of kernel reach (e.g., when
limiting the kernel memory with mem= in the command line and using
/dev/mem to map memory).

If get_user_pages_fast() fails, look up the VMA that backs the userspace
virtual address, compute the PFN and the physical address, and map it into
the kernel virtual address space with memremap().

Signed-off-by: Filippo Sironi 
Signed-off-by: KarimAllah Ahmed 
---
 arch/x86/kvm/paging_tmpl.h | 38 +-
 1 file changed, 29 insertions(+), 9 deletions(-)

diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 6bdca39..c40af67 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -141,15 +141,35 @@ static int FNAME(cmpxchg_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
struct page *page;
 
npages = get_user_pages_fast((unsigned long)ptep_user, 1, 1, &page);
-   /* Check if the user is doing something meaningless. */
-   if (unlikely(npages != 1))
-   return -EFAULT;
-
-   table = kmap_atomic(page);
-   ret = CMPXCHG(&table[index], orig_pte, new_pte);
-   kunmap_atomic(table);
-
-   kvm_release_page_dirty(page);
+   if (likely(npages == 1)) {
+   table = kmap_atomic(page);
+   ret = CMPXCHG(&table[index], orig_pte, new_pte);
+   kunmap_atomic(table);
+
+   kvm_release_page_dirty(page);
+   } else {
+   struct vm_area_struct *vma;
+   unsigned long vaddr = (unsigned long)ptep_user & PAGE_MASK;
+   unsigned long pfn;
+   unsigned long paddr;
+
+   down_read(&current->mm->mmap_sem);
+   vma = find_vma_intersection(current->mm, vaddr, vaddr + PAGE_SIZE);
+   if (!vma || !(vma->vm_flags & VM_PFNMAP)) {
+   up_read(&current->mm->mmap_sem);
+   return -EFAULT;
+   }
+   pfn = ((vaddr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
+   paddr = pfn << PAGE_SHIFT;
+   table = memremap(paddr, PAGE_SIZE, MEMREMAP_WB);
+   if (!table) {
+   up_read(&current->mm->mmap_sem);
+   return -EFAULT;
+   }
+   ret = CMPXCHG(&table[index], orig_pte, new_pte);
+   memunmap(table);
+   up_read(&current->mm->mmap_sem);
+   }
 
return (ret != orig_pte);
 }
-- 
2.7.4



[PATCH v6 01/14] X86/nVMX: handle_vmon: Read 4 bytes from guest memory

2019-01-31 Thread KarimAllah Ahmed
Read the data directly from guest memory instead of the map->read->unmap
sequence. This also avoids using kvm_vcpu_gpa_to_page() and kmap(), which
assume that there is a "struct page" for guest memory.

Suggested-by: Jim Mattson 
Signed-off-by: KarimAllah Ahmed 
Reviewed-by: Jim Mattson 
Reviewed-by: David Hildenbrand 
Reviewed-by: Konrad Rzeszutek Wilk 

---
v1 -> v2:
- Massage commit message a bit.
---
 arch/x86/kvm/vmx/nested.c | 14 +++---
 1 file changed, 3 insertions(+), 11 deletions(-)

diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index 8ff2052..11b44a9 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -4192,7 +4192,7 @@ static int handle_vmon(struct kvm_vcpu *vcpu)
 {
int ret;
gpa_t vmptr;
-   struct page *page;
+   uint32_t revision;
struct vcpu_vmx *vmx = to_vmx(vcpu);
const u64 VMXON_NEEDED_FEATURES = FEATURE_CONTROL_LOCKED
| FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX;
@@ -4241,18 +4241,10 @@ static int handle_vmon(struct kvm_vcpu *vcpu)
if (!PAGE_ALIGNED(vmptr) || (vmptr >> cpuid_maxphyaddr(vcpu)))
return nested_vmx_failInvalid(vcpu);
 
-   page = kvm_vcpu_gpa_to_page(vcpu, vmptr);
-   if (is_error_page(page))
+   if (kvm_read_guest(vcpu->kvm, vmptr, &revision, sizeof(revision)) ||
+   revision != VMCS12_REVISION)
return nested_vmx_failInvalid(vcpu);
 
-   if (*(u32 *)kmap(page) != VMCS12_REVISION) {
-   kunmap(page);
-   kvm_release_page_clean(page);
-   return nested_vmx_failInvalid(vcpu);
-   }
-   kunmap(page);
-   kvm_release_page_clean(page);
-
vmx->nested.vmxon_ptr = vmptr;
ret = enter_vmx_operation(vcpu);
if (ret)
-- 
2.7.4



[PATCH v6 08/14] KVM/nVMX: Use kvm_vcpu_map when mapping the posted interrupt descriptor table

2019-01-31 Thread KarimAllah Ahmed
Use kvm_vcpu_map when mapping the posted interrupt descriptor table since
using kvm_vcpu_gpa_to_page() and kmap() will only work for guest memory
that has a "struct page".

One additional semantic change is that the virtual host mapping lifecycle
has changed a bit. It now has the same lifetime as the pinning of the
interrupt descriptor table page on the host side.

Signed-off-by: KarimAllah Ahmed 
Reviewed-by: Konrad Rzeszutek Wilk 
---
v4 -> v5:
- unmap with dirty flag

v1 -> v2:
- Do not change the lifecycle of the mapping (pbonzini)
---
 arch/x86/kvm/vmx/nested.c | 43 ---
 arch/x86/kvm/vmx/vmx.h|  2 +-
 2 files changed, 13 insertions(+), 32 deletions(-)

diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index 31b352c..53b1063 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -230,12 +230,8 @@ static void free_nested(struct kvm_vcpu *vcpu)
vmx->nested.apic_access_page = NULL;
}
kvm_vcpu_unmap(vcpu, &vmx->nested.virtual_apic_map, true);
-   if (vmx->nested.pi_desc_page) {
-   kunmap(vmx->nested.pi_desc_page);
-   kvm_release_page_dirty(vmx->nested.pi_desc_page);
-   vmx->nested.pi_desc_page = NULL;
-   vmx->nested.pi_desc = NULL;
-   }
+   kvm_vcpu_unmap(vcpu, &vmx->nested.pi_desc_map, true);
+   vmx->nested.pi_desc = NULL;
 
kvm_mmu_free_roots(vcpu, &vcpu->arch.guest_mmu, KVM_MMU_ROOTS_ALL);
 
@@ -2868,26 +2864,15 @@ static void nested_get_vmcs12_pages(struct kvm_vcpu 
*vcpu)
}
 
if (nested_cpu_has_posted_intr(vmcs12)) {
-   if (vmx->nested.pi_desc_page) { /* shouldn't happen */
-   kunmap(vmx->nested.pi_desc_page);
-   kvm_release_page_dirty(vmx->nested.pi_desc_page);
-   vmx->nested.pi_desc_page = NULL;
-   vmx->nested.pi_desc = NULL;
-   vmcs_write64(POSTED_INTR_DESC_ADDR, -1ull);
+   map = &vmx->nested.pi_desc_map;
+
+   if (!kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->posted_intr_desc_addr), map)) {
+   vmx->nested.pi_desc =
+   (struct pi_desc *)(((void *)map->hva) +
+   offset_in_page(vmcs12->posted_intr_desc_addr));
+   vmcs_write64(POSTED_INTR_DESC_ADDR,
+pfn_to_hpa(map->pfn) + offset_in_page(vmcs12->posted_intr_desc_addr));
}
-   page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->posted_intr_desc_addr);
-   if (is_error_page(page))
-   return;
-   vmx->nested.pi_desc_page = page;
-   vmx->nested.pi_desc = kmap(vmx->nested.pi_desc_page);
-   vmx->nested.pi_desc =
-   (struct pi_desc *)((void *)vmx->nested.pi_desc +
-   (unsigned long)(vmcs12->posted_intr_desc_addr &
-   (PAGE_SIZE - 1)));
-   vmcs_write64(POSTED_INTR_DESC_ADDR,
-   page_to_phys(vmx->nested.pi_desc_page) +
-   (unsigned long)(vmcs12->posted_intr_desc_addr &
-   (PAGE_SIZE - 1)));
}
if (nested_vmx_prepare_msr_bitmap(vcpu, vmcs12))
vmcs_set_bits(CPU_BASED_VM_EXEC_CONTROL,
@@ -3911,12 +3896,8 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 
exit_reason,
vmx->nested.apic_access_page = NULL;
}
kvm_vcpu_unmap(vcpu, &vmx->nested.virtual_apic_map, true);
-   if (vmx->nested.pi_desc_page) {
-   kunmap(vmx->nested.pi_desc_page);
-   kvm_release_page_dirty(vmx->nested.pi_desc_page);
-   vmx->nested.pi_desc_page = NULL;
-   vmx->nested.pi_desc = NULL;
-   }
+   kvm_vcpu_unmap(vcpu, &vmx->nested.pi_desc_map, true);
+   vmx->nested.pi_desc = NULL;
 
/*
 * We are now running in L2, mmu_notifier will force to reload the
diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h
index f618f52..bd04725 100644
--- a/arch/x86/kvm/vmx/vmx.h
+++ b/arch/x86/kvm/vmx/vmx.h
@@ -143,7 +143,7 @@ struct nested_vmx {
 */
struct page *apic_access_page;
struct kvm_host_map virtual_apic_map;
-   struct page *pi_desc_page;
+   struct kvm_host_map pi_desc_map;
 
struct kvm_host_map msr_bitmap_map;
 
-- 
2.7.4



[PATCH v6 04/14] KVM: Introduce a new guest mapping API

2019-01-31 Thread KarimAllah Ahmed
In KVM, especially for nested guests, there is a dominant pattern of:

=> map guest memory -> do_something -> unmap guest memory

Besides the unnecessary noise that this boilerplate adds to the code, most of
the time the mapping function does not properly handle memory that is not
backed by a "struct page". This new guest mapping API encapsulates most of
that boilerplate and also handles guest memory that is not backed by a
"struct page".

The current implementation of this API uses memremap for memory that is not
backed by a "struct page", which would lead to a huge slow-down if it were
used for high-frequency mapping operations. The API does not have any effect
on current setups where guest memory is backed by a "struct page". Further
patches are going to introduce a pfn-cache which will significantly improve
the performance of the memremap case.
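
For reference, the caller-side pattern with the new API looks roughly like
the sketch below (illustration only; 'vcpu' and 'gpa' stand for whatever the
caller already holds, and error handling is simplified):

    struct kvm_host_map map;
    char *hva;

    /* Works whether or not the guest frame has a "struct page" behind it. */
    if (kvm_vcpu_map(vcpu, gpa_to_gfn(gpa), &map))
            return -EFAULT;

    hva = (char *)map.hva + offset_in_page(gpa);
    /* ... read or write guest data through 'hva' ... */

    /* Pass 'true' as the dirty flag if the page was written. */
    kvm_vcpu_unmap(vcpu, &map, true);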

Signed-off-by: KarimAllah Ahmed 
Reviewed-by: Konrad Rzeszutek Wilk 

---
v5 -> v6:
- Added a helper function to check if the mapping is mapped or not.
- Added more comments on the struct.
- Setting ->page to NULL on unmap and to a poison ptr if unused during map.
- Checking for map ptr before using it.
- Change kvm_vcpu_unmap to also mark page dirty for LM. That requires
  passing the vCPU pointer again to this function.
v3 -> v4:
- Update the commit message.
v1 -> v2:
- Drop the caching optimization (pbonzini)
- Use 'hva' instead of 'kaddr' (pbonzini)
- Return 0/-EINVAL/-EFAULT instead of true/false. -EFAULT will be used for
  AMD patch (pbonzini)
- Introduce __kvm_map_gfn which accepts a memory slot and use it (pbonzini)
- Only clear map->hva instead of memsetting the whole structure.
- Drop kvm_vcpu_map_valid since it is no longer used.
- Fix EXPORT_MODULE naming.
---
 include/linux/kvm_host.h | 28 +
 virt/kvm/kvm_main.c  | 64 
 2 files changed, 92 insertions(+)

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index c38cc5e..15879ed 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -205,6 +205,32 @@ enum {
READING_SHADOW_PAGE_TABLES,
 };
 
+#define KVM_UNMAPPED_PAGE  ((void *) 0x500 + POISON_POINTER_DELTA)
+
+struct kvm_host_map {
+   /*
+* Only valid if the 'pfn' is managed by the host kernel (i.e. there is
+* a 'struct page' for it; when using the mem= kernel parameter some
+* memory can be used as guest memory but is not managed by the host
+* kernel).
+* If 'pfn' is not managed by the host kernel, this field is
+* initialized to KVM_UNMAPPED_PAGE.
+*/
+   struct page *page;
+   void *hva;
+   kvm_pfn_t pfn;
+   kvm_pfn_t gfn;
+};
+
+/*
+ * Used to check if the mapping is valid or not. Never use 'kvm_host_map'
+ * directly to check for that.
+ */
+static inline bool kvm_vcpu_mapped(struct kvm_host_map *map)
+{
+   return !!map->hva;
+}
+
 /*
  * Sometimes a large or cross-page mmio needs to be broken up into separate
  * exits for userspace servicing.
@@ -710,7 +736,9 @@ struct kvm_memslots *kvm_vcpu_memslots(struct kvm_vcpu 
*vcpu);
 struct kvm_memory_slot *kvm_vcpu_gfn_to_memslot(struct kvm_vcpu *vcpu, gfn_t 
gfn);
 kvm_pfn_t kvm_vcpu_gfn_to_pfn_atomic(struct kvm_vcpu *vcpu, gfn_t gfn);
 kvm_pfn_t kvm_vcpu_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn);
+int kvm_vcpu_map(struct kvm_vcpu *vcpu, gpa_t gpa, struct kvm_host_map *map);
 struct page *kvm_vcpu_gfn_to_page(struct kvm_vcpu *vcpu, gfn_t gfn);
+void kvm_vcpu_unmap(struct kvm_vcpu *vcpu, struct kvm_host_map *map, bool dirty);
 unsigned long kvm_vcpu_gfn_to_hva(struct kvm_vcpu *vcpu, gfn_t gfn);
 unsigned long kvm_vcpu_gfn_to_hva_prot(struct kvm_vcpu *vcpu, gfn_t gfn, bool 
*writable);
 int kvm_vcpu_read_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn, void *data, int 
offset,
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 5ecea81..da3a8fc 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -1738,6 +1738,70 @@ struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn)
 }
 EXPORT_SYMBOL_GPL(gfn_to_page);
 
+static int __kvm_map_gfn(struct kvm_memory_slot *slot, gfn_t gfn,
+struct kvm_host_map *map)
+{
+   kvm_pfn_t pfn;
+   void *hva = NULL;
+   struct page *page = KVM_UNMAPPED_PAGE;
+
+   if (!map)
+   return -EINVAL;
+
+   pfn = gfn_to_pfn_memslot(slot, gfn);
+   if (is_error_noslot_pfn(pfn))
+   return -EINVAL;
+
+   if (pfn_valid(pfn)) {
+   page = pfn_to_page(pfn);
+   hva = kmap(page);
+   } else {
+   hva = memremap(pfn_to_hpa(pfn), PAGE_SIZE, MEMREMAP_WB);
+   }
+
+   if (!hva)
+   return -EFAULT;
+
+   map->page = page;
+   map->hva = hva;
+   map->pfn = pfn;
+   

[PATCH v6 06/14] KVM/nVMX: Use kvm_vcpu_map when mapping the L1 MSR bitmap

2019-01-31 Thread KarimAllah Ahmed
Use kvm_vcpu_map when mapping the L1 MSR bitmap since using
kvm_vcpu_gpa_to_page() and kmap() will only work for guest memory that has
a "struct page".

Signed-off-by: KarimAllah Ahmed 
---
v4 -> v5:
- unmap with dirty flag

v1 -> v2:
- Do not change the lifecycle of the mapping (pbonzini)
---
 arch/x86/kvm/vmx/nested.c | 11 +--
 arch/x86/kvm/vmx/vmx.h|  3 +++
 2 files changed, 8 insertions(+), 6 deletions(-)

diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index 8fc327f..1813211 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -507,9 +507,10 @@ static inline bool nested_vmx_prepare_msr_bitmap(struct 
kvm_vcpu *vcpu,
 struct vmcs12 *vmcs12)
 {
int msr;
-   struct page *page;
unsigned long *msr_bitmap_l1;
unsigned long *msr_bitmap_l0 = to_vmx(vcpu)->nested.vmcs02.msr_bitmap;
+   struct kvm_host_map *map = &to_vmx(vcpu)->nested.msr_bitmap_map;
+
/*
 * pred_cmd & spec_ctrl are trying to verify two things:
 *
@@ -535,11 +536,10 @@ static inline bool nested_vmx_prepare_msr_bitmap(struct 
kvm_vcpu *vcpu,
!pred_cmd && !spec_ctrl)
return false;
 
-   page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->msr_bitmap);
-   if (is_error_page(page))
+   if (kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->msr_bitmap), map))
return false;
 
-   msr_bitmap_l1 = (unsigned long *)kmap(page);
+   msr_bitmap_l1 = (unsigned long *)map->hva;
if (nested_cpu_has_apic_reg_virt(vmcs12)) {
/*
 * L0 need not intercept reads for MSRs between 0x800 and 
0x8ff, it
@@ -587,8 +587,7 @@ static inline bool nested_vmx_prepare_msr_bitmap(struct 
kvm_vcpu *vcpu,
MSR_IA32_PRED_CMD,
MSR_TYPE_W);
 
-   kunmap(page);
-   kvm_release_page_clean(page);
+   kvm_vcpu_unmap(vcpu, &to_vmx(vcpu)->nested.msr_bitmap_map, false);
 
return true;
 }
diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h
index 9932895..6fb69d8 100644
--- a/arch/x86/kvm/vmx/vmx.h
+++ b/arch/x86/kvm/vmx/vmx.h
@@ -144,6 +144,9 @@ struct nested_vmx {
struct page *apic_access_page;
struct page *virtual_apic_page;
struct page *pi_desc_page;
+
+   struct kvm_host_map msr_bitmap_map;
+
struct pi_desc *pi_desc;
bool pi_pending;
u16 posted_intr_nv;
-- 
2.7.4



[PATCH v6 10/14] KVM/nSVM: Use the new mapping API for mapping guest memory

2019-01-31 Thread KarimAllah Ahmed
Use the new mapping API for mapping guest memory to avoid depending on
"struct page".

Signed-off-by: KarimAllah Ahmed 
Reviewed-by: Konrad Rzeszutek Wilk 
---
v4 -> v5:
- unmap with dirty flag
---
 arch/x86/kvm/svm.c | 97 +++---
 1 file changed, 49 insertions(+), 48 deletions(-)

diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index f13a3a2..d30a35b 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -3062,32 +3062,6 @@ static inline bool nested_svm_nmi(struct vcpu_svm *svm)
return false;
 }
 
-static void *nested_svm_map(struct vcpu_svm *svm, u64 gpa, struct page **_page)
-{
-   struct page *page;
-
-   might_sleep();
-
-   page = kvm_vcpu_gfn_to_page(&svm->vcpu, gpa >> PAGE_SHIFT);
-   if (is_error_page(page))
-   goto error;
-
-   *_page = page;
-
-   return kmap(page);
-
-error:
-   kvm_inject_gp(&svm->vcpu, 0);
-
-   return NULL;
-}
-
-static void nested_svm_unmap(struct page *page)
-{
-   kunmap(page);
-   kvm_release_page_dirty(page);
-}
-
 static int nested_svm_intercept_ioio(struct vcpu_svm *svm)
 {
unsigned port, size, iopm_len;
@@ -3290,10 +3264,11 @@ static inline void copy_vmcb_control_area(struct vmcb 
*dst_vmcb, struct vmcb *fr
 
 static int nested_svm_vmexit(struct vcpu_svm *svm)
 {
+   int rc;
struct vmcb *nested_vmcb;
struct vmcb *hsave = svm->nested.hsave;
struct vmcb *vmcb = svm->vmcb;
-   struct page *page;
+   struct kvm_host_map map;
 
trace_kvm_nested_vmexit_inject(vmcb->control.exit_code,
   vmcb->control.exit_info_1,
@@ -3302,9 +3277,14 @@ static int nested_svm_vmexit(struct vcpu_svm *svm)
   vmcb->control.exit_int_info_err,
   KVM_ISA_SVM);
 
-   nested_vmcb = nested_svm_map(svm, svm->nested.vmcb, &page);
-   if (!nested_vmcb)
+   rc = kvm_vcpu_map(&svm->vcpu, gfn_to_gpa(svm->nested.vmcb), &map);
+   if (rc) {
+   if (rc == -EINVAL)
+   kvm_inject_gp(&svm->vcpu, 0);
return 1;
+   }
+
+   nested_vmcb = map.hva;
 
/* Exit Guest-Mode */
leave_guest_mode(&svm->vcpu);
@@ -3408,7 +3388,7 @@ static int nested_svm_vmexit(struct vcpu_svm *svm)
 
mark_all_dirty(svm->vmcb);
 
-   nested_svm_unmap(page);
+   kvm_vcpu_unmap(&svm->vcpu, &map, true);
 
nested_svm_uninit_mmu_context(&svm->vcpu);
kvm_mmu_reset_context(&svm->vcpu);
@@ -3474,7 +3454,7 @@ static bool nested_vmcb_checks(struct vmcb *vmcb)
 }
 
 static void enter_svm_guest_mode(struct vcpu_svm *svm, u64 vmcb_gpa,
-struct vmcb *nested_vmcb, struct page *page)
+struct vmcb *nested_vmcb, struct kvm_host_map *map)
 {
if (kvm_get_rflags(&svm->vcpu) & X86_EFLAGS_IF)
svm->vcpu.arch.hflags |= HF_HIF_MASK;
@@ -3558,7 +3538,7 @@ static void enter_svm_guest_mode(struct vcpu_svm *svm, 
u64 vmcb_gpa,
svm->vmcb->control.pause_filter_thresh =
nested_vmcb->control.pause_filter_thresh;
 
-   nested_svm_unmap(page);
+   kvm_vcpu_unmap(&svm->vcpu, map, true);
 
/* Enter Guest-Mode */
enter_guest_mode(&svm->vcpu);
@@ -3578,17 +3558,23 @@ static void enter_svm_guest_mode(struct vcpu_svm *svm, 
u64 vmcb_gpa,
 
 static bool nested_svm_vmrun(struct vcpu_svm *svm)
 {
+   int rc;
struct vmcb *nested_vmcb;
struct vmcb *hsave = svm->nested.hsave;
struct vmcb *vmcb = svm->vmcb;
-   struct page *page;
+   struct kvm_host_map map;
u64 vmcb_gpa;
 
vmcb_gpa = svm->vmcb->save.rax;
 
-   nested_vmcb = nested_svm_map(svm, svm->vmcb->save.rax, &page);
-   if (!nested_vmcb)
+   rc = kvm_vcpu_map(&svm->vcpu, gfn_to_gpa(vmcb_gpa), &map);
+   if (rc) {
+   if (rc == -EINVAL)
+   kvm_inject_gp(&svm->vcpu, 0);
return false;
+   }
+
+   nested_vmcb = map.hva;
 
if (!nested_vmcb_checks(nested_vmcb)) {
nested_vmcb->control.exit_code= SVM_EXIT_ERR;
@@ -3596,7 +3582,7 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm)
nested_vmcb->control.exit_info_1  = 0;
nested_vmcb->control.exit_info_2  = 0;
 
-   nested_svm_unmap(page);
+   kvm_vcpu_unmap(&svm->vcpu, &map, true);
 
return false;
}
@@ -3640,7 +3626,7 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm)
 
copy_vmcb_control_area(hsave, vmcb);
 
-   enter_svm_guest_mode(svm, vmcb_gpa, nested_vmcb, page);
+   enter_svm_guest_mode(svm, vmcb_

[PATCH v6 11/14] KVM/nVMX: Use kvm_vcpu_map for accessing the shadow VMCS

2019-01-31 Thread KarimAllah Ahmed
Use kvm_vcpu_map for accessing the shadow VMCS since using
kvm_vcpu_gpa_to_page() and kmap() will only work for guest memory that has
a "struct page".

Signed-off-by: KarimAllah Ahmed 
Reviewed-by: Konrad Rzeszutek Wilk 
---
v4 -> v5:
- unmap with dirty flag
---
 arch/x86/kvm/vmx/nested.c | 25 -
 1 file changed, 12 insertions(+), 13 deletions(-)

diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index 53b1063..3c173b9 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -588,20 +588,20 @@ static inline bool nested_vmx_prepare_msr_bitmap(struct 
kvm_vcpu *vcpu,
 static void nested_cache_shadow_vmcs12(struct kvm_vcpu *vcpu,
   struct vmcs12 *vmcs12)
 {
+   struct kvm_host_map map;
struct vmcs12 *shadow;
-   struct page *page;
 
if (!nested_cpu_has_shadow_vmcs(vmcs12) ||
vmcs12->vmcs_link_pointer == -1ull)
return;
 
shadow = get_shadow_vmcs12(vcpu);
-   page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->vmcs_link_pointer);
 
-   memcpy(shadow, kmap(page), VMCS12_SIZE);
+   if (kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->vmcs_link_pointer), &map))
+   return;
 
-   kunmap(page);
-   kvm_release_page_clean(page);
+   memcpy(shadow, map.hva, VMCS12_SIZE);
+   kvm_vcpu_unmap(vcpu, &map, false);
 }
 
 static void nested_flush_cached_shadow_vmcs12(struct kvm_vcpu *vcpu,
@@ -2637,9 +2637,9 @@ static int nested_vmx_check_vmentry_prereqs(struct 
kvm_vcpu *vcpu,
 static int nested_vmx_check_vmcs_link_ptr(struct kvm_vcpu *vcpu,
  struct vmcs12 *vmcs12)
 {
-   int r;
-   struct page *page;
+   int r = 0;
struct vmcs12 *shadow;
+   struct kvm_host_map map;
 
if (vmcs12->vmcs_link_pointer == -1ull)
return 0;
@@ -2647,17 +2647,16 @@ static int nested_vmx_check_vmcs_link_ptr(struct 
kvm_vcpu *vcpu,
if (!page_address_valid(vcpu, vmcs12->vmcs_link_pointer))
return -EINVAL;
 
-   page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->vmcs_link_pointer);
-   if (is_error_page(page))
+   if (kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->vmcs_link_pointer), &map))
return -EINVAL;
 
-   r = 0;
-   shadow = kmap(page);
+   shadow = map.hva;
+
if (shadow->hdr.revision_id != VMCS12_REVISION ||
shadow->hdr.shadow_vmcs != nested_cpu_has_shadow_vmcs(vmcs12))
r = -EINVAL;
-   kunmap(page);
-   kvm_release_page_clean(page);
+
+   kvm_vcpu_unmap(vcpu, &map, false);
return r;
 }
 
-- 
2.7.4



[PATCH v6 13/14] KVM/nVMX: Use page_address_valid in a few more locations

2019-01-31 Thread KarimAllah Ahmed
Use page_address_valid() in a few more locations that are already checking for
a page-aligned address that does not exceed the maximum physical address.

Signed-off-by: KarimAllah Ahmed 
---
 arch/x86/kvm/vmx/nested.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index 60ba582..91e42f9 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -4203,7 +4203,7 @@ static int handle_vmon(struct kvm_vcpu *vcpu)
 * Note - IA32_VMX_BASIC[48] will never be 1 for the nested case;
 * which replaces physical address width with 32
 */
-   if (!PAGE_ALIGNED(vmptr) || (vmptr >> cpuid_maxphyaddr(vcpu)))
+   if (!page_address_valid(vcpu, vmptr))
return nested_vmx_failInvalid(vcpu);
 
if (kvm_read_guest(vcpu->kvm, vmptr, &revision, sizeof(revision)) ||
@@ -4266,7 +4266,7 @@ static int handle_vmclear(struct kvm_vcpu *vcpu)
if (nested_vmx_get_vmptr(vcpu, &vmptr))
return 1;
 
-   if (!PAGE_ALIGNED(vmptr) || (vmptr >> cpuid_maxphyaddr(vcpu)))
+   if (!page_address_valid(vcpu, vmptr))
return nested_vmx_failValid(vcpu,
VMXERR_VMCLEAR_INVALID_ADDRESS);
 
@@ -4473,7 +4473,7 @@ static int handle_vmptrld(struct kvm_vcpu *vcpu)
if (nested_vmx_get_vmptr(vcpu, &vmptr))
return 1;
 
-   if (!PAGE_ALIGNED(vmptr) || (vmptr >> cpuid_maxphyaddr(vcpu)))
+   if (!page_address_valid(vcpu, vmptr))
return nested_vmx_failValid(vcpu,
VMXERR_VMPTRLD_INVALID_ADDRESS);
 
-- 
2.7.4



[PATCH v6 14/14] kvm, x86: Properly check whether a pfn is an MMIO or not

2019-01-31 Thread KarimAllah Ahmed
The pfn_valid() check is not sufficient because it only checks whether a page
has a struct page or not; if "mem=" was passed to the kernel, some valid pages
will not have a struct page. This means that if a guest is assigned valid
memory that lies beyond the mem= boundary, it will be mapped uncached for the
guest no matter what the guest's caching attributes for this memory are.

Introduce a new function, e820__mapped_raw_any(), which is equivalent to
e820__mapped_any() but uses the original, unmodified e820 table, and use it to
identify real *RAM*.
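
As a usage sketch (mirroring the hunk below), a PFN is treated as MMIO only
when no part of its range is reported as RAM by the raw firmware table:

    /* Sketch: RAM according to the unmodified e820 table is never MMIO. */
    bool mmio = !e820__mapped_raw_any(pfn_to_hpa(pfn),
                                      pfn_to_hpa(pfn + 1) - 1,
                                      E820_TYPE_RAM);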

Signed-off-by: KarimAllah Ahmed 
---
v1 -> v2:
- Introduce e820__mapped_raw_any
---
 arch/x86/include/asm/e820/api.h |  1 +
 arch/x86/kernel/e820.c  | 18 +++---
 arch/x86/kvm/mmu.c  |  5 -
 3 files changed, 20 insertions(+), 4 deletions(-)

diff --git a/arch/x86/include/asm/e820/api.h b/arch/x86/include/asm/e820/api.h
index 4562062..b3f6e89 100644
--- a/arch/x86/include/asm/e820/api.h
+++ b/arch/x86/include/asm/e820/api.h
@@ -12,6 +12,7 @@ extern unsigned long e820_saved_max_low_pfn;
 
 extern unsigned long pci_mem_start;
 
+extern bool e820__mapped_raw_any(u64 start, u64 end, enum e820_type type);
 extern bool e820__mapped_any(u64 start, u64 end, enum e820_type type);
 extern bool e820__mapped_all(u64 start, u64 end, enum e820_type type);
 
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index ef914d0..3039659 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -75,12 +75,13 @@ EXPORT_SYMBOL(pci_mem_start);
  * This function checks if any part of the range  is mapped
  * with type.
  */
-bool e820__mapped_any(u64 start, u64 end, enum e820_type type)
+static bool _e820__mapped_any(struct e820_table *table,
+ u64 start, u64 end, enum e820_type type)
 {
int i;
 
-   for (i = 0; i < e820_table->nr_entries; i++) {
-   struct e820_entry *entry = &e820_table->entries[i];
+   for (i = 0; i < table->nr_entries; i++) {
+   struct e820_entry *entry = &table->entries[i];
 
if (type && entry->type != type)
continue;
@@ -90,6 +91,17 @@ bool e820__mapped_any(u64 start, u64 end, enum e820_type 
type)
}
return 0;
 }
+
+bool e820__mapped_raw_any(u64 start, u64 end, enum e820_type type)
+{
+   return _e820__mapped_any(e820_table_firmware, start, end, type);
+}
+EXPORT_SYMBOL_GPL(e820__mapped_raw_any);
+
+bool e820__mapped_any(u64 start, u64 end, enum e820_type type)
+{
+   return _e820__mapped_any(e820_table, start, end, type);
+}
 EXPORT_SYMBOL_GPL(e820__mapped_any);
 
 /*
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index da9c423..abf6a0d 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -44,6 +44,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -2875,7 +2876,9 @@ static bool kvm_is_mmio_pfn(kvm_pfn_t pfn)
 */
(!pat_enabled() || pat_pfn_immune_to_uc_mtrr(pfn));
 
-   return true;
+   return !e820__mapped_raw_any(pfn_to_hpa(pfn),
+pfn_to_hpa(pfn + 1) - 1,
+E820_TYPE_RAM);
 }
 
 /* Bits which may be returned by set_spte() */
-- 
2.7.4



[PATCH v6 09/14] KVM/X86: Use kvm_vcpu_map in emulator_cmpxchg_emulated

2019-01-31 Thread KarimAllah Ahmed
Use kvm_vcpu_map in emulator_cmpxchg_emulated since using
kvm_vcpu_gpa_to_page() and kmap() will only work for guest memory that has
a "struct page".

Signed-off-by: KarimAllah Ahmed 
Reviewed-by: Konrad Rzeszutek Wilk 
---
v5 -> v6:
- Remove explicit call to dirty the page. It is now done by kvm_vcpu_unmap

v4 -> v5:
- unmap with dirty flag

v1 -> v2:
- Update to match the new API return codes
---
 arch/x86/kvm/x86.c | 14 ++
 1 file changed, 6 insertions(+), 8 deletions(-)

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 3d27206..f6800e0 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -5494,9 +5494,9 @@ static int emulator_cmpxchg_emulated(struct 
x86_emulate_ctxt *ctxt,
 unsigned int bytes,
 struct x86_exception *exception)
 {
+   struct kvm_host_map map;
struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
gpa_t gpa;
-   struct page *page;
char *kaddr;
bool exchanged;
 
@@ -5513,12 +5513,11 @@ static int emulator_cmpxchg_emulated(struct 
x86_emulate_ctxt *ctxt,
if (((gpa + bytes - 1) & PAGE_MASK) != (gpa & PAGE_MASK))
goto emul_write;
 
-   page = kvm_vcpu_gfn_to_page(vcpu, gpa >> PAGE_SHIFT);
-   if (is_error_page(page))
+   if (kvm_vcpu_map(vcpu, gpa_to_gfn(gpa), &map))
goto emul_write;
 
-   kaddr = kmap_atomic(page);
-   kaddr += offset_in_page(gpa);
+   kaddr = map.hva + offset_in_page(gpa);
+
switch (bytes) {
case 1:
exchanged = CMPXCHG_TYPE(u8, kaddr, old, new);
@@ -5535,13 +5534,12 @@ static int emulator_cmpxchg_emulated(struct 
x86_emulate_ctxt *ctxt,
default:
BUG();
}
-   kunmap_atomic(kaddr);
-   kvm_release_page_dirty(page);
+
+   kvm_vcpu_unmap(vcpu, &map, true);
 
if (!exchanged)
return X86EMUL_CMPXCHG_FAILED;
 
-   kvm_vcpu_mark_page_dirty(vcpu, gpa >> PAGE_SHIFT);
kvm_page_track_write(vcpu, gpa, new, bytes);
 
return X86EMUL_CONTINUE;
-- 
2.7.4



[PATCH v6 12/14] KVM/nVMX: Use kvm_vcpu_map for accessing the enlightened VMCS

2019-01-31 Thread KarimAllah Ahmed
Use kvm_vcpu_map for accessing the enlightened VMCS since using
kvm_vcpu_gpa_to_page() and kmap() will only work for guest memory that has
a "struct page".

Signed-off-by: KarimAllah Ahmed 
Reviewed-by: Konrad Rzeszutek Wilk 
---
v4 -> v5:
- s/enhanced/enlightened
- unmap with dirty flag
---
 arch/x86/kvm/vmx/nested.c | 14 +-
 arch/x86/kvm/vmx/vmx.h|  2 +-
 2 files changed, 6 insertions(+), 10 deletions(-)

diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index 3c173b9..60ba582 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -193,10 +193,8 @@ static inline void nested_release_evmcs(struct kvm_vcpu 
*vcpu)
if (!vmx->nested.hv_evmcs)
return;
 
-   kunmap(vmx->nested.hv_evmcs_page);
-   kvm_release_page_dirty(vmx->nested.hv_evmcs_page);
+   kvm_vcpu_unmap(vcpu, &vmx->nested.hv_evmcs_map, true);
vmx->nested.hv_evmcs_vmptr = -1ull;
-   vmx->nested.hv_evmcs_page = NULL;
vmx->nested.hv_evmcs = NULL;
 }
 
@@ -1769,13 +1767,11 @@ static int nested_vmx_handle_enlightened_vmptrld(struct 
kvm_vcpu *vcpu,
 
nested_release_evmcs(vcpu);
 
-   vmx->nested.hv_evmcs_page = kvm_vcpu_gpa_to_page(
-   vcpu, assist_page.current_nested_vmcs);
-
-   if (unlikely(is_error_page(vmx->nested.hv_evmcs_page)))
+   if (kvm_vcpu_map(vcpu, gpa_to_gfn(assist_page.current_nested_vmcs),
+&vmx->nested.hv_evmcs_map))
return 0;
 
-   vmx->nested.hv_evmcs = kmap(vmx->nested.hv_evmcs_page);
+   vmx->nested.hv_evmcs = vmx->nested.hv_evmcs_map.hva;
 
/*
 * Currently, KVM only supports eVMCS version 1
@@ -4278,7 +4274,7 @@ static int handle_vmclear(struct kvm_vcpu *vcpu)
return nested_vmx_failValid(vcpu,
VMXERR_VMCLEAR_VMXON_POINTER);
 
-   if (vmx->nested.hv_evmcs_page) {
+   if (vmx->nested.hv_evmcs_map.hva) {
if (vmptr == vmx->nested.hv_evmcs_vmptr)
nested_release_evmcs(vcpu);
} else {
diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h
index bd04725..a130139 100644
--- a/arch/x86/kvm/vmx/vmx.h
+++ b/arch/x86/kvm/vmx/vmx.h
@@ -172,7 +172,7 @@ struct nested_vmx {
} smm;
 
gpa_t hv_evmcs_vmptr;
-   struct page *hv_evmcs_page;
+   struct kvm_host_map hv_evmcs_map;
struct hv_enlightened_vmcs *hv_evmcs;
 };
 
-- 
2.7.4



[PATCH v6 05/14] X86/nVMX: handle_vmptrld: Use kvm_vcpu_map when copying VMCS12 from guest memory

2019-01-31 Thread KarimAllah Ahmed
Use kvm_vcpu_map to map the VMCS12 from guest memory because
kvm_vcpu_gpa_to_page() and kmap() will only work for guest memory that has
a "struct page".

Signed-off-by: KarimAllah Ahmed 
Reviewed-by: Konrad Rzeszutek Wilk 
---
v4 -> v5:
- Switch to the new guest mapping API instead of reading directly from
  guest.
- unmap with dirty flag
v3 -> v4:
- Return VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID on failure (jmattson@)
v1 -> v2:
- Massage commit message a bit.
---
 arch/x86/kvm/vmx/nested.c | 15 +++
 1 file changed, 7 insertions(+), 8 deletions(-)

diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index 11b44a9..8fc327f 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -4521,11 +4521,10 @@ static int handle_vmptrld(struct kvm_vcpu *vcpu)
return 1;
 
if (vmx->nested.current_vmptr != vmptr) {
+   struct kvm_host_map map;
struct vmcs12 *new_vmcs12;
-   struct page *page;
 
-   page = kvm_vcpu_gpa_to_page(vcpu, vmptr);
-   if (is_error_page(page)) {
+   if (kvm_vcpu_map(vcpu, gpa_to_gfn(vmptr), &map)) {
/*
 * Reads from an unbacked page return all 1s,
 * which means that the 32 bits located at the
@@ -4535,12 +4534,13 @@ static int handle_vmptrld(struct kvm_vcpu *vcpu)
return nested_vmx_failValid(vcpu,
VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID);
}
-   new_vmcs12 = kmap(page);
+
+   new_vmcs12 = map.hva;
+
if (new_vmcs12->hdr.revision_id != VMCS12_REVISION ||
(new_vmcs12->hdr.shadow_vmcs &&
 !nested_cpu_has_vmx_shadow_vmcs(vcpu))) {
-   kunmap(page);
-   kvm_release_page_clean(page);
+   kvm_vcpu_unmap(vcpu, &map, false);
return nested_vmx_failValid(vcpu,
VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID);
}
@@ -4552,8 +4552,7 @@ static int handle_vmptrld(struct kvm_vcpu *vcpu)
 * cached.
 */
memcpy(vmx->nested.cached_vmcs12, new_vmcs12, VMCS12_SIZE);
-   kunmap(page);
-   kvm_release_page_clean(page);
+   kvm_vcpu_unmap(vcpu, &map, false);
 
set_current_vmptr(vmx, vmptr);
}
-- 
2.7.4



[PATCH v6 07/14] KVM/nVMX: Use kvm_vcpu_map when mapping the virtual APIC page

2019-01-31 Thread KarimAllah Ahmed
Use kvm_vcpu_map when mapping the virtual APIC page since using
kvm_vcpu_gpa_to_page() and kmap() will only work for guest memory that has
a "struct page".

One additional semantic change is that the virtual host mapping lifecycle
has changed a bit. It now has the same lifetime as the pinning of the
virtual APIC page on the host side.

Signed-off-by: KarimAllah Ahmed 
Reviewed-by: Konrad Rzeszutek Wilk 
---
v4 -> v5:
- unmap with dirty flag

v1 -> v2:
- Do not change the lifecycle of the mapping (pbonzini)
- Use pfn_to_hpa instead of gfn_to_gpa
---
 arch/x86/kvm/vmx/nested.c | 32 +++-
 arch/x86/kvm/vmx/vmx.c|  5 ++---
 arch/x86/kvm/vmx/vmx.h|  2 +-
 3 files changed, 14 insertions(+), 25 deletions(-)

diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index 1813211..31b352c 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -229,10 +229,7 @@ static void free_nested(struct kvm_vcpu *vcpu)
kvm_release_page_dirty(vmx->nested.apic_access_page);
vmx->nested.apic_access_page = NULL;
}
-   if (vmx->nested.virtual_apic_page) {
-   kvm_release_page_dirty(vmx->nested.virtual_apic_page);
-   vmx->nested.virtual_apic_page = NULL;
-   }
+   kvm_vcpu_unmap(vcpu, &vmx->nested.virtual_apic_map, true);
if (vmx->nested.pi_desc_page) {
kunmap(vmx->nested.pi_desc_page);
kvm_release_page_dirty(vmx->nested.pi_desc_page);
@@ -2817,6 +2814,7 @@ static void nested_get_vmcs12_pages(struct kvm_vcpu *vcpu)
 {
struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
struct vcpu_vmx *vmx = to_vmx(vcpu);
+   struct kvm_host_map *map;
struct page *page;
u64 hpa;
 
@@ -2849,11 +2847,7 @@ static void nested_get_vmcs12_pages(struct kvm_vcpu 
*vcpu)
}
 
if (nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) {
-   if (vmx->nested.virtual_apic_page) { /* shouldn't happen */
-   kvm_release_page_dirty(vmx->nested.virtual_apic_page);
-   vmx->nested.virtual_apic_page = NULL;
-   }
-   page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->virtual_apic_page_addr);
+   map = &vmx->nested.virtual_apic_map;
 
/*
 * If translation failed, VM entry will fail because
@@ -2868,11 +2862,9 @@ static void nested_get_vmcs12_pages(struct kvm_vcpu 
*vcpu)
 * control.  But such a configuration is useless, so
 * let's keep the code simple.
 */
-   if (!is_error_page(page)) {
-   vmx->nested.virtual_apic_page = page;
-   hpa = page_to_phys(vmx->nested.virtual_apic_page);
-   vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, hpa);
-   }
+   if (!kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->virtual_apic_page_addr), map))
+   vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, pfn_to_hpa(map->pfn));
+
}
 
if (nested_cpu_has_posted_intr(vmcs12)) {
@@ -3279,11 +3271,12 @@ static void vmx_complete_nested_posted_interrupt(struct 
kvm_vcpu *vcpu)
 
max_irr = find_last_bit((unsigned long *)vmx->nested.pi_desc->pir, 256);
if (max_irr != 256) {
-   vapic_page = kmap(vmx->nested.virtual_apic_page);
+   vapic_page = vmx->nested.virtual_apic_map.hva;
+   if (!vapic_page)
+   return;
+
__kvm_apic_update_irr(vmx->nested.pi_desc->pir,
vapic_page, &max_irr);
-   kunmap(vmx->nested.virtual_apic_page);
-
status = vmcs_read16(GUEST_INTR_STATUS);
if ((u8)max_irr > ((u8)status & 0xff)) {
status &= ~0xff;
@@ -3917,10 +3910,7 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 
exit_reason,
kvm_release_page_dirty(vmx->nested.apic_access_page);
vmx->nested.apic_access_page = NULL;
}
-   if (vmx->nested.virtual_apic_page) {
-   kvm_release_page_dirty(vmx->nested.virtual_apic_page);
-   vmx->nested.virtual_apic_page = NULL;
-   }
+   kvm_vcpu_unmap(vcpu, &vmx->nested.virtual_apic_map, true);
if (vmx->nested.pi_desc_page) {
kunmap(vmx->nested.pi_desc_page);
kvm_release_page_dirty(vmx->nested.pi_desc_page);
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index a3da75a..ea9cffc 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -3627,14 +3627,13 @@ static bool vmx_guest_apic_has_interrupt(struct 
kvm_vcpu *vcpu)
 
if (WARN_ON_ONCE(!is_guest_mode(vcpu)) ||
!nested_cpu_has_vid(get_vmcs12(vcpu)) ||
-   WARN_ON_ONCE(

[PATCH v6 00/14] KVM/X86: Introduce a new guest mapping interface

2019-01-31 Thread KarimAllah Ahmed
Guest memory can either be directly managed by the kernel (i.e. have a "struct
page") or it can simply live outside kernel control (i.e. not have a
"struct page"). KVM mostly supports these two modes, except in a few places
where the code seems to assume that guest memory must have a "struct page".

This patchset introduces a new mapping interface to map guest memory into host
kernel memory which also supports PFN-based memory (i.e. memory without a
'struct page'). It also converts all offending code to this interface or makes
it simply read/write directly from guest memory. Patch 2 additionally fixes an
incorrect page release and marks the page as dirty (as a side-effect of using
the helper function to write).

As far as I can see, all offending code is now fixed except the APIC-access
page, which I will handle in a separate series along with dropping
kvm_vcpu_gfn_to_page and kvm_vcpu_gpa_to_page from the internal KVM API.

The current implementation of the new API uses memremap to map memory that does
not have a "struct page". This proves to be very slow for high-frequency
mappings. Since this does not affect the normal use-case where a "struct page"
is available, the performance of this API will be addressed by a separate patch
series.

So the simple way to use memory outside kernel control is (a rough userspace sketch follows the list):

1- Pass 'mem=' in the kernel command-line to limit the amount of memory managed 
   by the kernel.
2- Map this physical memory you want to give to the guest with:
   mmap("/dev/mem", physical_address_offset, ..)
3- Use the user-space virtual address as the "userspace_addr" field in
   KVM_SET_USER_MEMORY_REGION ioctl.
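
For illustration only, a minimal userspace sketch of steps 2 and 3 (error
handling omitted; the helper name and the slot number are arbitrary
assumptions, and vm_fd is a VM file descriptor obtained via KVM_CREATE_VM):

    #include <fcntl.h>
    #include <stdint.h>
    #include <sys/ioctl.h>
    #include <sys/mman.h>
    #include <linux/kvm.h>

    /* Hypothetical helper: give the guest RAM that the kernel does not manage. */
    static void *assign_unmanaged_ram(int vm_fd, uint64_t phys_off,
                                      uint64_t size, uint64_t guest_phys)
    {
            int mem_fd = open("/dev/mem", O_RDWR | O_SYNC);
            void *hva = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED,
                             mem_fd, phys_off);                /* step 2 */

            struct kvm_userspace_memory_region region = {
                    .slot            = 0,                       /* any unused slot */
                    .guest_phys_addr = guest_phys,
                    .memory_size     = size,
                    .userspace_addr  = (uintptr_t)hva,          /* step 3 */
            };

            ioctl(vm_fd, KVM_SET_USER_MEMORY_REGION, &region);  /* step 3 */
            return hva;
    }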

v5 -> v6:
- Added one extra patch to ensure that support for this mem= case is complete
  for x86.
- Added a helper function to check if the mapping is mapped or not.
- Added more comments on the struct.
- Setting ->page to NULL on unmap and to a poison ptr if unused during map
- Checking for map ptr before using it.
- Change kvm_vcpu_unmap to also mark page dirty for LM. That requires
  passing the vCPU pointer again to this function.

v4 -> v5:
- Introduce a new parameter 'dirty' into kvm_vcpu_unmap
- A horrible rebase due to nested.c :)
- Dropped a couple of hyperv patches as the code was fixed already as a
  side-effect of another patch.
- Added a new trivial cleanup patch.

v3 -> v4:
- Rebase
- Add a new patch to also fix the newly introduced enlightned VMCS.

v2 -> v3:
- Rebase
- Add a new patch to also fix the newly introduced shadow VMCS.

Filippo Sironi (1):
  X86/KVM: Handle PFNs outside of kernel reach when touching GPTEs

KarimAllah Ahmed (13):
  X86/nVMX: handle_vmon: Read 4 bytes from guest memory
  X86/nVMX: Update the PML table without mapping and unmapping the page
  KVM: Introduce a new guest mapping API
  X86/nVMX: handle_vmptrld: Use kvm_vcpu_map when copying VMCS12 from
guest memory
  KVM/nVMX: Use kvm_vcpu_map when mapping the L1 MSR bitmap
  KVM/nVMX: Use kvm_vcpu_map when mapping the virtual APIC page
  KVM/nVMX: Use kvm_vcpu_map when mapping the posted interrupt
descriptor table
  KVM/X86: Use kvm_vcpu_map in emulator_cmpxchg_emulated
  KVM/nSVM: Use the new mapping API for mapping guest memory
  KVM/nVMX: Use kvm_vcpu_map for accessing the shadow VMCS
  KVM/nVMX: Use kvm_vcpu_map for accessing the enlightened VMCS
  KVM/nVMX: Use page_address_valid in a few more locations
  kvm, x86: Properly check whether a pfn is an MMIO or not

 arch/x86/include/asm/e820/api.h |   1 +
 arch/x86/kernel/e820.c  |  18 -
 arch/x86/kvm/mmu.c  |   5 +-
 arch/x86/kvm/paging_tmpl.h  |  38 +++---
 arch/x86/kvm/svm.c  |  97 
 arch/x86/kvm/vmx/nested.c   | 160 +++-
 arch/x86/kvm/vmx/vmx.c  |  19 ++---
 arch/x86/kvm/vmx/vmx.h  |   9 ++-
 arch/x86/kvm/x86.c  |  14 ++--
 include/linux/kvm_host.h|  28 +++
 virt/kvm/kvm_main.c |  64 
 11 files changed, 267 insertions(+), 186 deletions(-)

-- 
2.7.4



[PATCH v6 02/14] X86/nVMX: Update the PML table without mapping and unmapping the page

2019-01-31 Thread KarimAllah Ahmed
Update the PML table without mapping and unmapping the page. This also
avoids using kvm_vcpu_gpa_to_page(..) which assumes that there is a "struct
page" for guest memory.

As a side-effect of using kvm_write_guest_page the page is also properly
marked as dirty.

Signed-off-by: KarimAllah Ahmed 
Reviewed-by: David Hildenbrand 
Reviewed-by: Konrad Rzeszutek Wilk 

---
v1 -> v2:
- Use kvm_write_guest_page instead of kvm_write_guest (pbonzini)
- Do not use pointer arithmetic for pml_address (pbonzini)
---
 arch/x86/kvm/vmx/vmx.c | 14 +-
 1 file changed, 5 insertions(+), 9 deletions(-)

diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 4341175..a3da75a 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -7206,9 +7206,7 @@ static int vmx_write_pml_buffer(struct kvm_vcpu *vcpu)
 {
struct vmcs12 *vmcs12;
struct vcpu_vmx *vmx = to_vmx(vcpu);
-   gpa_t gpa;
-   struct page *page = NULL;
-   u64 *pml_address;
+   gpa_t gpa, dst;
 
if (is_guest_mode(vcpu)) {
WARN_ON_ONCE(vmx->nested.pml_full);
@@ -7228,15 +7226,13 @@ static int vmx_write_pml_buffer(struct kvm_vcpu *vcpu)
}
 
gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS) & ~0xFFFull;
+   dst = vmcs12->pml_address + sizeof(u64) * vmcs12->guest_pml_index;
 
-   page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->pml_address);
-   if (is_error_page(page))
+   if (kvm_write_guest_page(vcpu->kvm, gpa_to_gfn(dst), &gpa,
+offset_in_page(dst), sizeof(gpa)))
return 0;
 
-   pml_address = kmap(page);
-   pml_address[vmcs12->guest_pml_index--] = gpa;
-   kunmap(page);
-   kvm_release_page_clean(page);
+   vmcs12->guest_pml_index--;
}
 
return 0;
-- 
2.7.4



[PATCH v5 01/13] X86/nVMX: handle_vmon: Read 4 bytes from guest memory

2019-01-09 Thread KarimAllah Ahmed
Read the data directly from guest memory instead of the map->read->unmap
sequence. This also avoids using kvm_vcpu_gpa_to_page() and kmap(), which
assume that there is a "struct page" for guest memory.

Suggested-by: Jim Mattson 
Signed-off-by: KarimAllah Ahmed 
Reviewed-by: Jim Mattson 
Reviewed-by: David Hildenbrand 

---
v1 -> v2:
- Massage commit message a bit.
---
 arch/x86/kvm/vmx/nested.c | 14 +++---
 1 file changed, 3 insertions(+), 11 deletions(-)

diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index 3170e29..536468a 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -4192,7 +4192,7 @@ static int handle_vmon(struct kvm_vcpu *vcpu)
 {
int ret;
gpa_t vmptr;
-   struct page *page;
+   uint32_t revision;
struct vcpu_vmx *vmx = to_vmx(vcpu);
const u64 VMXON_NEEDED_FEATURES = FEATURE_CONTROL_LOCKED
| FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX;
@@ -4241,18 +4241,10 @@ static int handle_vmon(struct kvm_vcpu *vcpu)
if (!PAGE_ALIGNED(vmptr) || (vmptr >> cpuid_maxphyaddr(vcpu)))
return nested_vmx_failInvalid(vcpu);
 
-   page = kvm_vcpu_gpa_to_page(vcpu, vmptr);
-   if (is_error_page(page))
+   if (kvm_read_guest(vcpu->kvm, vmptr, &revision, sizeof(revision)) ||
+   revision != VMCS12_REVISION)
return nested_vmx_failInvalid(vcpu);
 
-   if (*(u32 *)kmap(page) != VMCS12_REVISION) {
-   kunmap(page);
-   kvm_release_page_clean(page);
-   return nested_vmx_failInvalid(vcpu);
-   }
-   kunmap(page);
-   kvm_release_page_clean(page);
-
vmx->nested.vmxon_ptr = vmptr;
ret = enter_vmx_operation(vcpu);
if (ret)
-- 
2.7.4



[PATCH v5 00/13] KVM/X86: Introduce a new guest mapping interface

2019-01-09 Thread KarimAllah Ahmed
Guest memory can either be directly managed by the kernel (i.e. have a "struct
page") or it can simply live outside kernel control (i.e. not have a
"struct page"). KVM mostly supports these two modes, except in a few places
where the code seems to assume that guest memory must have a "struct page".

This patchset introduces a new mapping interface to map guest memory into host
kernel memory which also supports PFN-based memory (i.e. memory without a
'struct page'). It also converts all offending code to this interface or makes
it simply read/write directly from guest memory. Patch 2 additionally fixes an
incorrect page release and marks the page as dirty (as a side-effect of using
the helper function to write).

As far as I can see, all offending code is now fixed except the APIC-access
page, which I will handle in a separate series along with dropping
kvm_vcpu_gfn_to_page and kvm_vcpu_gpa_to_page from the internal KVM API.

The current implementation of the new API uses memremap to map memory that does
not have a "struct page". This proves to be very slow for high-frequency
mappings. Since this does not affect the normal use-case where a "struct page"
is available, the performance of this API will be addressed by a separate patch
series.

v4 -> v5:
- Introduce a new parameter 'dirty' into kvm_vcpu_unmap
- A horrible rebase due to nested.c :)
- Dropped a couple of hyperv patches as the code was fixed already as a
  side-effect of another patch.
- Added a new trivial cleanup patch.

v3 -> v4:
- Rebase
- Add a new patch to also fix the newly introduced enlightned VMCS.

v2 -> v3:
- Rebase
- Add a new patch to also fix the newly introduced shadow VMCS.

Filippo Sironi (1):
  X86/KVM: Handle PFNs outside of kernel reach when touching GPTEs

KarimAllah Ahmed (12):
  X86/nVMX: handle_vmon: Read 4 bytes from guest memory
  X86/nVMX: Update the PML table without mapping and unmapping the page
  KVM: Introduce a new guest mapping API
  X86/nVMX: handle_vmptrld: Use kvm_vcpu_map when copying VMCS12 from
guest memory
  KVM/nVMX: Use kvm_vcpu_map when mapping the L1 MSR bitmap
  KVM/nVMX: Use kvm_vcpu_map when mapping the virtual APIC page
  KVM/nVMX: Use kvm_vcpu_map when mapping the posted interrupt
descriptor table
  KVM/X86: Use kvm_vcpu_map in emulator_cmpxchg_emulated
  KVM/nSVM: Use the new mapping API for mapping guest memory
  KVM/nVMX: Use kvm_vcpu_map for accessing the shadow VMCS
  KVM/nVMX: Use kvm_vcpu_map for accessing the enlightened VMCS
  KVM/nVMX: Use page_address_valid in a few more locations

 arch/x86/kvm/paging_tmpl.h |  38 ---
 arch/x86/kvm/svm.c |  97 +--
 arch/x86/kvm/vmx/nested.c  | 160 -
 arch/x86/kvm/vmx/vmx.c |  19 ++
 arch/x86/kvm/vmx/vmx.h |   9 ++-
 arch/x86/kvm/x86.c |  13 ++--
 include/linux/kvm_host.h   |   9 +++
 virt/kvm/kvm_main.c|  53 +++
 8 files changed, 217 insertions(+), 181 deletions(-)

-- 
2.7.4



[PATCH v5 12/13] KVM/nVMX: Use kvm_vcpu_map for accessing the enlightened VMCS

2019-01-09 Thread KarimAllah Ahmed
Use kvm_vcpu_map for accessing the enlightened VMCS since using
kvm_vcpu_gpa_to_page() and kmap() will only work for guest memory that has
a "struct page".

Signed-off-by: KarimAllah Ahmed 

---
v4 -> v5:
- s/enhanced/enlightened
- unmap with dirty flag
---
 arch/x86/kvm/vmx/nested.c | 14 +-
 arch/x86/kvm/vmx/vmx.h|  2 +-
 2 files changed, 6 insertions(+), 10 deletions(-)

diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index 04a8b43..ccb3b63 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -193,10 +193,8 @@ static inline void nested_release_evmcs(struct kvm_vcpu 
*vcpu)
if (!vmx->nested.hv_evmcs)
return;
 
-   kunmap(vmx->nested.hv_evmcs_page);
-   kvm_release_page_dirty(vmx->nested.hv_evmcs_page);
+   kvm_vcpu_unmap(&vmx->nested.hv_evmcs_map, true);
vmx->nested.hv_evmcs_vmptr = -1ull;
-   vmx->nested.hv_evmcs_page = NULL;
vmx->nested.hv_evmcs = NULL;
 }
 
@@ -1769,13 +1767,11 @@ static int nested_vmx_handle_enlightened_vmptrld(struct 
kvm_vcpu *vcpu,
 
nested_release_evmcs(vcpu);
 
-   vmx->nested.hv_evmcs_page = kvm_vcpu_gpa_to_page(
-   vcpu, assist_page.current_nested_vmcs);
-
-   if (unlikely(is_error_page(vmx->nested.hv_evmcs_page)))
+   if (kvm_vcpu_map(vcpu, gpa_to_gfn(assist_page.current_nested_vmcs),
+&vmx->nested.hv_evmcs_map))
return 0;
 
-   vmx->nested.hv_evmcs = kmap(vmx->nested.hv_evmcs_page);
+   vmx->nested.hv_evmcs = vmx->nested.hv_evmcs_map.hva;
 
/*
 * Currently, KVM only supports eVMCS version 1
@@ -4278,7 +4274,7 @@ static int handle_vmclear(struct kvm_vcpu *vcpu)
return nested_vmx_failValid(vcpu,
VMXERR_VMCLEAR_VMXON_POINTER);
 
-   if (vmx->nested.hv_evmcs_page) {
+   if (vmx->nested.hv_evmcs_map.hva) {
if (vmptr == vmx->nested.hv_evmcs_vmptr)
nested_release_evmcs(vcpu);
} else {
diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h
index bd04725..a130139 100644
--- a/arch/x86/kvm/vmx/vmx.h
+++ b/arch/x86/kvm/vmx/vmx.h
@@ -172,7 +172,7 @@ struct nested_vmx {
} smm;
 
gpa_t hv_evmcs_vmptr;
-   struct page *hv_evmcs_page;
+   struct kvm_host_map hv_evmcs_map;
struct hv_enlightened_vmcs *hv_evmcs;
 };
 
-- 
2.7.4



[PATCH v5 08/13] KVM/nVMX: Use kvm_vcpu_map when mapping the posted interrupt descriptor table

2019-01-09 Thread KarimAllah Ahmed
Use kvm_vcpu_map when mapping the posted interrupt descriptor table since
using kvm_vcpu_gpa_to_page() and kmap() will only work for guest memory
that has a "struct page".

One additional semantic change is that the virtual host mapping lifecycle
has changed a bit. It now has the same lifetime as the pinning of the
interrupt descriptor table page on the host side.

Signed-off-by: KarimAllah Ahmed 
---
v4 -> v5:
- unmap with dirty flag

v1 -> v2:
- Do not change the lifecycle of the mapping (pbonzini)
---
 arch/x86/kvm/vmx/nested.c | 43 ---
 arch/x86/kvm/vmx/vmx.h|  2 +-
 2 files changed, 13 insertions(+), 32 deletions(-)

diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index dcff99d..b4230ce 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -230,12 +230,8 @@ static void free_nested(struct kvm_vcpu *vcpu)
vmx->nested.apic_access_page = NULL;
}
kvm_vcpu_unmap(&vmx->nested.virtual_apic_map, true);
-   if (vmx->nested.pi_desc_page) {
-   kunmap(vmx->nested.pi_desc_page);
-   kvm_release_page_dirty(vmx->nested.pi_desc_page);
-   vmx->nested.pi_desc_page = NULL;
-   vmx->nested.pi_desc = NULL;
-   }
+   kvm_vcpu_unmap(&vmx->nested.pi_desc_map, true);
+   vmx->nested.pi_desc = NULL;
 
kvm_mmu_free_roots(vcpu, &vcpu->arch.guest_mmu, KVM_MMU_ROOTS_ALL);
 
@@ -2868,26 +2864,15 @@ static void nested_get_vmcs12_pages(struct kvm_vcpu 
*vcpu)
}
 
if (nested_cpu_has_posted_intr(vmcs12)) {
-   if (vmx->nested.pi_desc_page) { /* shouldn't happen */
-   kunmap(vmx->nested.pi_desc_page);
-   kvm_release_page_dirty(vmx->nested.pi_desc_page);
-   vmx->nested.pi_desc_page = NULL;
-   vmx->nested.pi_desc = NULL;
-   vmcs_write64(POSTED_INTR_DESC_ADDR, -1ull);
+   map = &vmx->nested.pi_desc_map;
+
+   if (!kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->posted_intr_desc_addr), map)) {
+   vmx->nested.pi_desc =
+   (struct pi_desc *)(((void *)map->hva) +
+   offset_in_page(vmcs12->posted_intr_desc_addr));
+   vmcs_write64(POSTED_INTR_DESC_ADDR,
+pfn_to_hpa(map->pfn) + offset_in_page(vmcs12->posted_intr_desc_addr));
}
-   page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->posted_intr_desc_addr);
-   if (is_error_page(page))
-   return;
-   vmx->nested.pi_desc_page = page;
-   vmx->nested.pi_desc = kmap(vmx->nested.pi_desc_page);
-   vmx->nested.pi_desc =
-   (struct pi_desc *)((void *)vmx->nested.pi_desc +
-   (unsigned long)(vmcs12->posted_intr_desc_addr &
-   (PAGE_SIZE - 1)));
-   vmcs_write64(POSTED_INTR_DESC_ADDR,
-   page_to_phys(vmx->nested.pi_desc_page) +
-   (unsigned long)(vmcs12->posted_intr_desc_addr &
-   (PAGE_SIZE - 1)));
}
if (nested_vmx_prepare_msr_bitmap(vcpu, vmcs12))
vmcs_set_bits(CPU_BASED_VM_EXEC_CONTROL,
@@ -3911,12 +3896,8 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 
exit_reason,
vmx->nested.apic_access_page = NULL;
}
kvm_vcpu_unmap(&vmx->nested.virtual_apic_map, true);
-   if (vmx->nested.pi_desc_page) {
-   kunmap(vmx->nested.pi_desc_page);
-   kvm_release_page_dirty(vmx->nested.pi_desc_page);
-   vmx->nested.pi_desc_page = NULL;
-   vmx->nested.pi_desc = NULL;
-   }
+   kvm_vcpu_unmap(&vmx->nested.pi_desc_map, true);
+   vmx->nested.pi_desc = NULL;
 
/*
 * We are now running in L2, mmu_notifier will force to reload the
diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h
index f618f52..bd04725 100644
--- a/arch/x86/kvm/vmx/vmx.h
+++ b/arch/x86/kvm/vmx/vmx.h
@@ -143,7 +143,7 @@ struct nested_vmx {
 */
struct page *apic_access_page;
struct kvm_host_map virtual_apic_map;
-   struct page *pi_desc_page;
+   struct kvm_host_map pi_desc_map;
 
struct kvm_host_map msr_bitmap_map;
 
-- 
2.7.4



[PATCH v5 09/13] KVM/X86: Use kvm_vcpu_map in emulator_cmpxchg_emulated

2019-01-09 Thread KarimAllah Ahmed
Use kvm_vcpu_map in emulator_cmpxchg_emulated since using
kvm_vcpu_gpa_to_page() and kmap() will only work for guest memory that has
a "struct page".

Signed-off-by: KarimAllah Ahmed 
---
v4 -> v5:
- unmap with dirty flag

v1 -> v2:
- Update to match the new API return codes
---
 arch/x86/kvm/x86.c | 13 ++---
 1 file changed, 6 insertions(+), 7 deletions(-)

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 02c8e09..0c35cfc 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -5492,9 +5492,9 @@ static int emulator_cmpxchg_emulated(struct 
x86_emulate_ctxt *ctxt,
 unsigned int bytes,
 struct x86_exception *exception)
 {
+   struct kvm_host_map map;
struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
gpa_t gpa;
-   struct page *page;
char *kaddr;
bool exchanged;
 
@@ -5511,12 +5511,11 @@ static int emulator_cmpxchg_emulated(struct 
x86_emulate_ctxt *ctxt,
if (((gpa + bytes - 1) & PAGE_MASK) != (gpa & PAGE_MASK))
goto emul_write;
 
-   page = kvm_vcpu_gfn_to_page(vcpu, gpa >> PAGE_SHIFT);
-   if (is_error_page(page))
+   if (kvm_vcpu_map(vcpu, gpa_to_gfn(gpa), &map))
goto emul_write;
 
-   kaddr = kmap_atomic(page);
-   kaddr += offset_in_page(gpa);
+   kaddr = map.hva + offset_in_page(gpa);
+
switch (bytes) {
case 1:
exchanged = CMPXCHG_TYPE(u8, kaddr, old, new);
@@ -5533,8 +5532,8 @@ static int emulator_cmpxchg_emulated(struct 
x86_emulate_ctxt *ctxt,
default:
BUG();
}
-   kunmap_atomic(kaddr);
-   kvm_release_page_dirty(page);
+
+   kvm_vcpu_unmap(&map, true);
 
if (!exchanged)
return X86EMUL_CMPXCHG_FAILED;
-- 
2.7.4



[PATCH v5 02/13] X86/nVMX: Update the PML table without mapping and unmapping the page

2019-01-09 Thread KarimAllah Ahmed
Update the PML table without mapping and unmapping the page. This also
avoids using kvm_vcpu_gpa_to_page(..) which assumes that there is a "struct
page" for guest memory.

As a side effect of using kvm_write_guest_page(), the page is also properly
marked as dirty.

Signed-off-by: KarimAllah Ahmed 
Reviewed-by: David Hildenbrand 
---
v1 -> v2:
- Use kvm_write_guest_page instead of kvm_write_guest (pbonzini)
- Do not use pointer arithmetic for pml_address (pbonzini)
---
 arch/x86/kvm/vmx/vmx.c | 14 +-
 1 file changed, 5 insertions(+), 9 deletions(-)

diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 4d39f73..71d88df 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -7199,9 +7199,7 @@ static int vmx_write_pml_buffer(struct kvm_vcpu *vcpu)
 {
struct vmcs12 *vmcs12;
struct vcpu_vmx *vmx = to_vmx(vcpu);
-   gpa_t gpa;
-   struct page *page = NULL;
-   u64 *pml_address;
+   gpa_t gpa, dst;
 
if (is_guest_mode(vcpu)) {
WARN_ON_ONCE(vmx->nested.pml_full);
@@ -7221,15 +7219,13 @@ static int vmx_write_pml_buffer(struct kvm_vcpu *vcpu)
}
 
gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS) & ~0xFFFull;
+   dst = vmcs12->pml_address + sizeof(u64) * 
vmcs12->guest_pml_index;
 
-   page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->pml_address);
-   if (is_error_page(page))
+   if (kvm_write_guest_page(vcpu->kvm, gpa_to_gfn(dst), &gpa,
+offset_in_page(dst), sizeof(gpa)))
return 0;
 
-   pml_address = kmap(page);
-   pml_address[vmcs12->guest_pml_index--] = gpa;
-   kunmap(page);
-   kvm_release_page_clean(page);
+   vmcs12->guest_pml_index--;
}
 
return 0;
-- 
2.7.4



[PATCH v5 07/13] KVM/nVMX: Use kvm_vcpu_map when mapping the virtual APIC page

2019-01-09 Thread KarimAllah Ahmed
Use kvm_vcpu_map when mapping the virtual APIC page since using
kvm_vcpu_gpa_to_page() and kmap() will only work for guest memory that has
a "struct page".

One additional semantic change is that the virtual host mapping lifecycle
has changed slightly: it now has the same lifetime as the pinning of the
virtual APIC page on the host side.

Signed-off-by: KarimAllah Ahmed 
---
v4 -> v5:
- unmap with dirty flag

v1 -> v2:
- Do not change the lifecycle of the mapping (pbonzini)
- Use pfn_to_hpa instead of gfn_to_gpa
---
 arch/x86/kvm/vmx/nested.c | 32 +++-
 arch/x86/kvm/vmx/vmx.c|  5 ++---
 arch/x86/kvm/vmx/vmx.h|  2 +-
 3 files changed, 14 insertions(+), 25 deletions(-)

diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index 4127ad9..dcff99d 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -229,10 +229,7 @@ static void free_nested(struct kvm_vcpu *vcpu)
kvm_release_page_dirty(vmx->nested.apic_access_page);
vmx->nested.apic_access_page = NULL;
}
-   if (vmx->nested.virtual_apic_page) {
-   kvm_release_page_dirty(vmx->nested.virtual_apic_page);
-   vmx->nested.virtual_apic_page = NULL;
-   }
+   kvm_vcpu_unmap(&vmx->nested.virtual_apic_map, true);
if (vmx->nested.pi_desc_page) {
kunmap(vmx->nested.pi_desc_page);
kvm_release_page_dirty(vmx->nested.pi_desc_page);
@@ -2817,6 +2814,7 @@ static void nested_get_vmcs12_pages(struct kvm_vcpu *vcpu)
 {
struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
struct vcpu_vmx *vmx = to_vmx(vcpu);
+   struct kvm_host_map *map;
struct page *page;
u64 hpa;
 
@@ -2849,11 +2847,7 @@ static void nested_get_vmcs12_pages(struct kvm_vcpu 
*vcpu)
}
 
if (nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) {
-   if (vmx->nested.virtual_apic_page) { /* shouldn't happen */
-   kvm_release_page_dirty(vmx->nested.virtual_apic_page);
-   vmx->nested.virtual_apic_page = NULL;
-   }
-   page = kvm_vcpu_gpa_to_page(vcpu, 
vmcs12->virtual_apic_page_addr);
+   map = &vmx->nested.virtual_apic_map;
 
/*
 * If translation failed, VM entry will fail because
@@ -2868,11 +2862,9 @@ static void nested_get_vmcs12_pages(struct kvm_vcpu 
*vcpu)
 * control.  But such a configuration is useless, so
 * let's keep the code simple.
 */
-   if (!is_error_page(page)) {
-   vmx->nested.virtual_apic_page = page;
-   hpa = page_to_phys(vmx->nested.virtual_apic_page);
-   vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, hpa);
-   }
+   if (!kvm_vcpu_map(vcpu, 
gpa_to_gfn(vmcs12->virtual_apic_page_addr), map))
+   vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, 
pfn_to_hpa(map->pfn));
+
}
 
if (nested_cpu_has_posted_intr(vmcs12)) {
@@ -3279,11 +3271,12 @@ static void vmx_complete_nested_posted_interrupt(struct 
kvm_vcpu *vcpu)
 
max_irr = find_last_bit((unsigned long *)vmx->nested.pi_desc->pir, 256);
if (max_irr != 256) {
-   vapic_page = kmap(vmx->nested.virtual_apic_page);
+   vapic_page = vmx->nested.virtual_apic_map.hva;
+   if (!vapic_page)
+   return;
+
__kvm_apic_update_irr(vmx->nested.pi_desc->pir,
vapic_page, &max_irr);
-   kunmap(vmx->nested.virtual_apic_page);
-
status = vmcs_read16(GUEST_INTR_STATUS);
if ((u8)max_irr > ((u8)status & 0xff)) {
status &= ~0xff;
@@ -3917,10 +3910,7 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 
exit_reason,
kvm_release_page_dirty(vmx->nested.apic_access_page);
vmx->nested.apic_access_page = NULL;
}
-   if (vmx->nested.virtual_apic_page) {
-   kvm_release_page_dirty(vmx->nested.virtual_apic_page);
-   vmx->nested.virtual_apic_page = NULL;
-   }
+   kvm_vcpu_unmap(&vmx->nested.virtual_apic_map, true);
if (vmx->nested.pi_desc_page) {
kunmap(vmx->nested.pi_desc_page);
kvm_release_page_dirty(vmx->nested.pi_desc_page);
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 71d88df..e13308e 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -3627,14 +3627,13 @@ static bool vmx_guest_apic_has_interrupt(struct 
kvm_vcpu *vcpu)
 
if (WARN_ON_ONCE(!is_guest_mode(vcpu)) ||
!nested_cpu_has_vid(get_vmcs12(vcpu)) ||
-   WARN_ON_ONCE(!vmx->nested.virtual_apic_page))

[PATCH v5 11/13] KVM/nVMX: Use kvm_vcpu_map for accessing the shadow VMCS

2019-01-09 Thread KarimAllah Ahmed
Use kvm_vcpu_map for accessing the shadow VMCS since using
kvm_vcpu_gpa_to_page() and kmap() will only work for guest memory that has
a "struct page".

Signed-off-by: KarimAllah Ahmed 
---
v4 -> v5:
- unmap with dirty flag
---
 arch/x86/kvm/vmx/nested.c | 25 -
 1 file changed, 12 insertions(+), 13 deletions(-)

diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index b4230ce..04a8b43 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -588,20 +588,20 @@ static inline bool nested_vmx_prepare_msr_bitmap(struct 
kvm_vcpu *vcpu,
 static void nested_cache_shadow_vmcs12(struct kvm_vcpu *vcpu,
   struct vmcs12 *vmcs12)
 {
+   struct kvm_host_map map;
struct vmcs12 *shadow;
-   struct page *page;
 
if (!nested_cpu_has_shadow_vmcs(vmcs12) ||
vmcs12->vmcs_link_pointer == -1ull)
return;
 
shadow = get_shadow_vmcs12(vcpu);
-   page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->vmcs_link_pointer);
 
-   memcpy(shadow, kmap(page), VMCS12_SIZE);
+   if (kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->vmcs_link_pointer), &map))
+   return;
 
-   kunmap(page);
-   kvm_release_page_clean(page);
+   memcpy(shadow, map.hva, VMCS12_SIZE);
+   kvm_vcpu_unmap(&map, false);
 }
 
 static void nested_flush_cached_shadow_vmcs12(struct kvm_vcpu *vcpu,
@@ -2637,9 +2637,9 @@ static int nested_vmx_check_vmentry_prereqs(struct 
kvm_vcpu *vcpu,
 static int nested_vmx_check_vmcs_link_ptr(struct kvm_vcpu *vcpu,
  struct vmcs12 *vmcs12)
 {
-   int r;
-   struct page *page;
+   int r = 0;
struct vmcs12 *shadow;
+   struct kvm_host_map map;
 
if (vmcs12->vmcs_link_pointer == -1ull)
return 0;
@@ -2647,17 +2647,16 @@ static int nested_vmx_check_vmcs_link_ptr(struct 
kvm_vcpu *vcpu,
if (!page_address_valid(vcpu, vmcs12->vmcs_link_pointer))
return -EINVAL;
 
-   page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->vmcs_link_pointer);
-   if (is_error_page(page))
+   if (kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->vmcs_link_pointer), &map))
return -EINVAL;
 
-   r = 0;
-   shadow = kmap(page);
+   shadow = map.hva;
+
if (shadow->hdr.revision_id != VMCS12_REVISION ||
shadow->hdr.shadow_vmcs != nested_cpu_has_shadow_vmcs(vmcs12))
r = -EINVAL;
-   kunmap(page);
-   kvm_release_page_clean(page);
+
+   kvm_vcpu_unmap(&map, false);
return r;
 }
 
-- 
2.7.4



[PATCH v5 10/13] KVM/nSVM: Use the new mapping API for mapping guest memory

2019-01-09 Thread KarimAllah Ahmed
Use the new mapping API for mapping guest memory to avoid depending on
"struct page".

Signed-off-by: KarimAllah Ahmed 
---
v4 -> v5:
- unmap with dirty flag
---
 arch/x86/kvm/svm.c | 97 +++---
 1 file changed, 49 insertions(+), 48 deletions(-)

diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 307e5bd..d886664 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -3062,32 +3062,6 @@ static inline bool nested_svm_nmi(struct vcpu_svm *svm)
return false;
 }
 
-static void *nested_svm_map(struct vcpu_svm *svm, u64 gpa, struct page **_page)
-{
-   struct page *page;
-
-   might_sleep();
-
-   page = kvm_vcpu_gfn_to_page(&svm->vcpu, gpa >> PAGE_SHIFT);
-   if (is_error_page(page))
-   goto error;
-
-   *_page = page;
-
-   return kmap(page);
-
-error:
-   kvm_inject_gp(&svm->vcpu, 0);
-
-   return NULL;
-}
-
-static void nested_svm_unmap(struct page *page)
-{
-   kunmap(page);
-   kvm_release_page_dirty(page);
-}
-
 static int nested_svm_intercept_ioio(struct vcpu_svm *svm)
 {
unsigned port, size, iopm_len;
@@ -3290,10 +3264,11 @@ static inline void copy_vmcb_control_area(struct vmcb 
*dst_vmcb, struct vmcb *fr
 
 static int nested_svm_vmexit(struct vcpu_svm *svm)
 {
+   int rc;
struct vmcb *nested_vmcb;
struct vmcb *hsave = svm->nested.hsave;
struct vmcb *vmcb = svm->vmcb;
-   struct page *page;
+   struct kvm_host_map map;
 
trace_kvm_nested_vmexit_inject(vmcb->control.exit_code,
   vmcb->control.exit_info_1,
@@ -3302,9 +3277,14 @@ static int nested_svm_vmexit(struct vcpu_svm *svm)
   vmcb->control.exit_int_info_err,
   KVM_ISA_SVM);
 
-   nested_vmcb = nested_svm_map(svm, svm->nested.vmcb, &page);
-   if (!nested_vmcb)
+   rc = kvm_vcpu_map(&svm->vcpu, gfn_to_gpa(svm->nested.vmcb), &map);
+   if (rc) {
+   if (rc == -EINVAL)
+   kvm_inject_gp(&svm->vcpu, 0);
return 1;
+   }
+
+   nested_vmcb = map.hva;
 
/* Exit Guest-Mode */
leave_guest_mode(&svm->vcpu);
@@ -3408,7 +3388,7 @@ static int nested_svm_vmexit(struct vcpu_svm *svm)
 
mark_all_dirty(svm->vmcb);
 
-   nested_svm_unmap(page);
+   kvm_vcpu_unmap(&map, true);
 
nested_svm_uninit_mmu_context(&svm->vcpu);
kvm_mmu_reset_context(&svm->vcpu);
@@ -3466,7 +3446,7 @@ static bool nested_vmcb_checks(struct vmcb *vmcb)
 }
 
 static void enter_svm_guest_mode(struct vcpu_svm *svm, u64 vmcb_gpa,
-struct vmcb *nested_vmcb, struct page *page)
+struct vmcb *nested_vmcb, struct kvm_host_map 
*map)
 {
if (kvm_get_rflags(&svm->vcpu) & X86_EFLAGS_IF)
svm->vcpu.arch.hflags |= HF_HIF_MASK;
@@ -3550,7 +3530,7 @@ static void enter_svm_guest_mode(struct vcpu_svm *svm, 
u64 vmcb_gpa,
svm->vmcb->control.pause_filter_thresh =
nested_vmcb->control.pause_filter_thresh;
 
-   nested_svm_unmap(page);
+   kvm_vcpu_unmap(map, true);
 
/* Enter Guest-Mode */
enter_guest_mode(&svm->vcpu);
@@ -3570,17 +3550,23 @@ static void enter_svm_guest_mode(struct vcpu_svm *svm, 
u64 vmcb_gpa,
 
 static bool nested_svm_vmrun(struct vcpu_svm *svm)
 {
+   int rc;
struct vmcb *nested_vmcb;
struct vmcb *hsave = svm->nested.hsave;
struct vmcb *vmcb = svm->vmcb;
-   struct page *page;
+   struct kvm_host_map map;
u64 vmcb_gpa;
 
vmcb_gpa = svm->vmcb->save.rax;
 
-   nested_vmcb = nested_svm_map(svm, svm->vmcb->save.rax, &page);
-   if (!nested_vmcb)
+   rc = kvm_vcpu_map(&svm->vcpu, gfn_to_gpa(vmcb_gpa), &map);
+   if (rc) {
+   if (rc == -EINVAL)
+   kvm_inject_gp(&svm->vcpu, 0);
return false;
+   }
+
+   nested_vmcb = map.hva;
 
if (!nested_vmcb_checks(nested_vmcb)) {
nested_vmcb->control.exit_code= SVM_EXIT_ERR;
@@ -3588,7 +3574,7 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm)
nested_vmcb->control.exit_info_1  = 0;
nested_vmcb->control.exit_info_2  = 0;
 
-   nested_svm_unmap(page);
+   kvm_vcpu_unmap(&map, true);
 
return false;
}
@@ -3632,7 +3618,7 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm)
 
copy_vmcb_control_area(hsave, vmcb);
 
-   enter_svm_guest_mode(svm, vmcb_gpa, nested_vmcb, page);
+   enter_svm_guest_mode(svm, vmcb_gpa, nested_vmcb, &map);
 
return true;
 }
@@ -3656,21 +3642,26 @@ static void nested_svm

[PATCH v5 06/13] KVM/nVMX: Use kvm_vcpu_map when mapping the L1 MSR bitmap

2019-01-09 Thread KarimAllah Ahmed
Use kvm_vcpu_map when mapping the L1 MSR bitmap since using
kvm_vcpu_gpa_to_page() and kmap() will only work for guest memory that has
a "struct page".

Signed-off-by: KarimAllah Ahmed 
---
v4 -> v5:
- unmap with dirty flag

v1 -> v2:
- Do not change the lifecycle of the mapping (pbonzini)
---
 arch/x86/kvm/vmx/nested.c | 11 +--
 arch/x86/kvm/vmx/vmx.h|  3 +++
 2 files changed, 8 insertions(+), 6 deletions(-)

diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index 5602b0c..4127ad9 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -507,9 +507,10 @@ static inline bool nested_vmx_prepare_msr_bitmap(struct 
kvm_vcpu *vcpu,
 struct vmcs12 *vmcs12)
 {
int msr;
-   struct page *page;
unsigned long *msr_bitmap_l1;
unsigned long *msr_bitmap_l0 = to_vmx(vcpu)->nested.vmcs02.msr_bitmap;
+   struct kvm_host_map *map = &to_vmx(vcpu)->nested.msr_bitmap_map;
+
/*
 * pred_cmd & spec_ctrl are trying to verify two things:
 *
@@ -535,11 +536,10 @@ static inline bool nested_vmx_prepare_msr_bitmap(struct 
kvm_vcpu *vcpu,
!pred_cmd && !spec_ctrl)
return false;
 
-   page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->msr_bitmap);
-   if (is_error_page(page))
+   if (kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->msr_bitmap), map))
return false;
 
-   msr_bitmap_l1 = (unsigned long *)kmap(page);
+   msr_bitmap_l1 = (unsigned long *)map->hva;
if (nested_cpu_has_apic_reg_virt(vmcs12)) {
/*
 * L0 need not intercept reads for MSRs between 0x800 and 
0x8ff, it
@@ -587,8 +587,7 @@ static inline bool nested_vmx_prepare_msr_bitmap(struct 
kvm_vcpu *vcpu,
MSR_IA32_PRED_CMD,
MSR_TYPE_W);
 
-   kunmap(page);
-   kvm_release_page_clean(page);
+   kvm_vcpu_unmap(&to_vmx(vcpu)->nested.msr_bitmap_map, false);
 
return true;
 }
diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h
index 9932895..6fb69d8 100644
--- a/arch/x86/kvm/vmx/vmx.h
+++ b/arch/x86/kvm/vmx/vmx.h
@@ -144,6 +144,9 @@ struct nested_vmx {
struct page *apic_access_page;
struct page *virtual_apic_page;
struct page *pi_desc_page;
+
+   struct kvm_host_map msr_bitmap_map;
+
struct pi_desc *pi_desc;
bool pi_pending;
u16 posted_intr_nv;
-- 
2.7.4



[PATCH v5 04/13] KVM: Introduce a new guest mapping API

2019-01-09 Thread KarimAllah Ahmed
In KVM, especially for nested guests, there is a dominant pattern of:

=> map guest memory -> do_something -> unmap guest memory

In addition to all the unnecessary boilerplate noise this adds to the code,
most of the time the mapping function does not properly handle memory that
is not backed by a "struct page". This new guest mapping API encapsulates
most of that boilerplate and also handles guest memory that is not backed
by a "struct page".

The current implementation of this API uses memremap for memory that is
not backed by a "struct page", which would lead to a huge slow-down if it
were used for high-frequency mapping operations. The API does not have any
effect on current setups where guest memory is backed by a "struct page".
Further patches will also introduce a pfn-cache which will significantly
improve the performance of the memremap case.
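
For illustration only (not part of the patch), a minimal usage sketch of the
new API; "vcpu" and "gpa" are assumed to be in scope, and the calls follow
the v5 signatures introduced below (where unmap takes a dirty flag):

	struct kvm_host_map map;
	u32 *val;

	/* Map one guest frame; works for page-backed and PFN-only memory. */
	if (kvm_vcpu_map(vcpu, gpa_to_gfn(gpa), &map))
		return -EFAULT;

	/* Access guest memory through the host virtual address. */
	val = (u32 *)((char *)map.hva + offset_in_page(gpa));
	*val = 0;

	/* Unmap and release the pfn; "true" marks the page dirty. */
	kvm_vcpu_unmap(&map, true);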

Signed-off-by: KarimAllah Ahmed 
---
v3 -> v4:
- Update the commit message.
v1 -> v2:
- Drop the caching optimization (pbonzini)
- Use 'hva' instead of 'kaddr' (pbonzini)
- Return 0/-EINVAL/-EFAULT instead of true/false. -EFAULT will be used for
  AMD patch (pbonzini)
- Introduce __kvm_map_gfn which accepts a memory slot and use it (pbonzini)
- Only clear map->hva instead of memsetting the whole structure.
- Drop kvm_vcpu_map_valid since it is no longer used.
- Fix EXPORT_MODULE naming.
---
 include/linux/kvm_host.h |  9 
 virt/kvm/kvm_main.c  | 53 
 2 files changed, 62 insertions(+)

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index c38cc5e..8a2f5fa 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -205,6 +205,13 @@ enum {
READING_SHADOW_PAGE_TABLES,
 };
 
+struct kvm_host_map {
+   struct page *page;
+   void *hva;
+   kvm_pfn_t pfn;
+   kvm_pfn_t gfn;
+};
+
 /*
  * Sometimes a large or cross-page mmio needs to be broken up into separate
  * exits for userspace servicing.
@@ -710,7 +717,9 @@ struct kvm_memslots *kvm_vcpu_memslots(struct kvm_vcpu 
*vcpu);
 struct kvm_memory_slot *kvm_vcpu_gfn_to_memslot(struct kvm_vcpu *vcpu, gfn_t 
gfn);
 kvm_pfn_t kvm_vcpu_gfn_to_pfn_atomic(struct kvm_vcpu *vcpu, gfn_t gfn);
 kvm_pfn_t kvm_vcpu_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn);
+int kvm_vcpu_map(struct kvm_vcpu *vcpu, gpa_t gpa, struct kvm_host_map *map);
 struct page *kvm_vcpu_gfn_to_page(struct kvm_vcpu *vcpu, gfn_t gfn);
+void kvm_vcpu_unmap(struct kvm_host_map *map, bool dirty);
 unsigned long kvm_vcpu_gfn_to_hva(struct kvm_vcpu *vcpu, gfn_t gfn);
 unsigned long kvm_vcpu_gfn_to_hva_prot(struct kvm_vcpu *vcpu, gfn_t gfn, bool 
*writable);
 int kvm_vcpu_read_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn, void *data, int 
offset,
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 1f888a1..4d8f2e3 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -1733,6 +1733,59 @@ struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn)
 }
 EXPORT_SYMBOL_GPL(gfn_to_page);
 
+static int __kvm_map_gfn(struct kvm_memory_slot *slot, gfn_t gfn,
+struct kvm_host_map *map)
+{
+   kvm_pfn_t pfn;
+   void *hva = NULL;
+   struct page *page = NULL;
+
+   pfn = gfn_to_pfn_memslot(slot, gfn);
+   if (is_error_noslot_pfn(pfn))
+   return -EINVAL;
+
+   if (pfn_valid(pfn)) {
+   page = pfn_to_page(pfn);
+   hva = kmap(page);
+   } else {
+   hva = memremap(pfn_to_hpa(pfn), PAGE_SIZE, MEMREMAP_WB);
+   }
+
+   if (!hva)
+   return -EFAULT;
+
+   map->page = page;
+   map->hva = hva;
+   map->pfn = pfn;
+   map->gfn = gfn;
+
+   return 0;
+}
+
+int kvm_vcpu_map(struct kvm_vcpu *vcpu, gfn_t gfn, struct kvm_host_map *map)
+{
+   return __kvm_map_gfn(kvm_vcpu_gfn_to_memslot(vcpu, gfn), gfn, map);
+}
+EXPORT_SYMBOL_GPL(kvm_vcpu_map);
+
+void kvm_vcpu_unmap(struct kvm_host_map *map, bool dirty)
+{
+   if (!map->hva)
+   return;
+
+   if (map->page)
+   kunmap(map->page);
+   else
+   memunmap(map->hva);
+
+   if (dirty)
+   kvm_release_pfn_dirty(map->pfn);
+   else
+   kvm_release_pfn_clean(map->pfn);
+   map->hva = NULL;
+}
+EXPORT_SYMBOL_GPL(kvm_vcpu_unmap);
+
 struct page *kvm_vcpu_gfn_to_page(struct kvm_vcpu *vcpu, gfn_t gfn)
 {
kvm_pfn_t pfn;
-- 
2.7.4



[PATCH v5 03/13] X86/KVM: Handle PFNs outside of kernel reach when touching GPTEs

2019-01-09 Thread KarimAllah Ahmed
From: Filippo Sironi 

cmpxchg_gpte() calls get_user_pages_fast() to retrieve the number of
pages and the respective struct page to map in the kernel virtual
address space.
This doesn't work if get_user_pages_fast() is invoked with a userspace
virtual address that's backed by PFNs outside of kernel reach (e.g., when
limiting the kernel memory with mem= in the command line and using
/dev/mem to map memory).

If get_user_pages_fast() fails, look up the VMA that backs the userspace
virtual address, compute the PFN and the physical address, and map it in
the kernel virtual address space with memremap().

Signed-off-by: Filippo Sironi 
Signed-off-by: KarimAllah Ahmed 
---
 arch/x86/kvm/paging_tmpl.h | 38 +-
 1 file changed, 29 insertions(+), 9 deletions(-)

diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 6bdca39..c40af67 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -141,15 +141,35 @@ static int FNAME(cmpxchg_gpte)(struct kvm_vcpu *vcpu, 
struct kvm_mmu *mmu,
struct page *page;
 
npages = get_user_pages_fast((unsigned long)ptep_user, 1, 1, &page);
-   /* Check if the user is doing something meaningless. */
-   if (unlikely(npages != 1))
-   return -EFAULT;
-
-   table = kmap_atomic(page);
-   ret = CMPXCHG(&table[index], orig_pte, new_pte);
-   kunmap_atomic(table);
-
-   kvm_release_page_dirty(page);
+   if (likely(npages == 1)) {
+   table = kmap_atomic(page);
+   ret = CMPXCHG(&table[index], orig_pte, new_pte);
+   kunmap_atomic(table);
+
+   kvm_release_page_dirty(page);
+   } else {
+   struct vm_area_struct *vma;
+   unsigned long vaddr = (unsigned long)ptep_user & PAGE_MASK;
+   unsigned long pfn;
+   unsigned long paddr;
+
+   down_read(&current->mm->mmap_sem);
+   vma = find_vma_intersection(current->mm, vaddr, vaddr + 
PAGE_SIZE);
+   if (!vma || !(vma->vm_flags & VM_PFNMAP)) {
+   up_read(&current->mm->mmap_sem);
+   return -EFAULT;
+   }
+   pfn = ((vaddr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
+   paddr = pfn << PAGE_SHIFT;
+   table = memremap(paddr, PAGE_SIZE, MEMREMAP_WB);
+   if (!table) {
+   up_read(&current->mm->mmap_sem);
+   return -EFAULT;
+   }
+   ret = CMPXCHG(&table[index], orig_pte, new_pte);
+   memunmap(table);
+   up_read(&current->mm->mmap_sem);
+   }
 
return (ret != orig_pte);
 }
-- 
2.7.4



[PATCH v5 13/13] KVM/nVMX: Use page_address_valid in a few more locations

2019-01-09 Thread KarimAllah Ahmed
Use page_address_valid in a few more locations that are already checking for
a page-aligned address that does not exceed the maximum physical address.

Signed-off-by: KarimAllah Ahmed 
---
 arch/x86/kvm/vmx/nested.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index ccb3b63..77aad46 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -4203,7 +4203,7 @@ static int handle_vmon(struct kvm_vcpu *vcpu)
 * Note - IA32_VMX_BASIC[48] will never be 1 for the nested case;
 * which replaces physical address width with 32
 */
-   if (!PAGE_ALIGNED(vmptr) || (vmptr >> cpuid_maxphyaddr(vcpu)))
+   if (!page_address_valid(vcpu, vmptr))
return nested_vmx_failInvalid(vcpu);
 
if (kvm_read_guest(vcpu->kvm, vmptr, &revision, sizeof(revision)) ||
@@ -4266,7 +4266,7 @@ static int handle_vmclear(struct kvm_vcpu *vcpu)
if (nested_vmx_get_vmptr(vcpu, &vmptr))
return 1;
 
-   if (!PAGE_ALIGNED(vmptr) || (vmptr >> cpuid_maxphyaddr(vcpu)))
+   if (!page_address_valid(vcpu, vmptr))
return nested_vmx_failValid(vcpu,
VMXERR_VMCLEAR_INVALID_ADDRESS);
 
@@ -4473,7 +4473,7 @@ static int handle_vmptrld(struct kvm_vcpu *vcpu)
if (nested_vmx_get_vmptr(vcpu, &vmptr))
return 1;
 
-   if (!PAGE_ALIGNED(vmptr) || (vmptr >> cpuid_maxphyaddr(vcpu)))
+   if (!page_address_valid(vcpu, vmptr))
return nested_vmx_failValid(vcpu,
VMXERR_VMPTRLD_INVALID_ADDRESS);
 
-- 
2.7.4



[PATCH v5 05/13] X86/nVMX: handle_vmptrld: Use kvm_vcpu_map when copying VMCS12 from guest memory

2019-01-09 Thread KarimAllah Ahmed
Use kvm_vcpu_map to map the VMCS12 from guest memory because
kvm_vcpu_gpa_to_page() and kmap() will only work for guest memory that has
a "struct page".

Signed-off-by: KarimAllah Ahmed 
---
v4 -> v5:
- Switch to the new guest mapping API instead of reading directly from
  guest.
- unmap with dirty flag
v3 -> v4:
- Return VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID on failure (jmattson@)
v1 -> v2:
- Massage commit message a bit.
---
 arch/x86/kvm/vmx/nested.c | 15 +++
 1 file changed, 7 insertions(+), 8 deletions(-)

diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index 536468a..5602b0c 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -4521,11 +4521,10 @@ static int handle_vmptrld(struct kvm_vcpu *vcpu)
return 1;
 
if (vmx->nested.current_vmptr != vmptr) {
+   struct kvm_host_map map;
struct vmcs12 *new_vmcs12;
-   struct page *page;
 
-   page = kvm_vcpu_gpa_to_page(vcpu, vmptr);
-   if (is_error_page(page)) {
+   if (kvm_vcpu_map(vcpu, gpa_to_gfn(vmptr), &map)) {
/*
 * Reads from an unbacked page return all 1s,
 * which means that the 32 bits located at the
@@ -4536,12 +4535,13 @@ static int handle_vmptrld(struct kvm_vcpu *vcpu)
VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID);
return kvm_skip_emulated_instruction(vcpu);
}
-   new_vmcs12 = kmap(page);
+
+   new_vmcs12 = map.hva;
+
if (new_vmcs12->hdr.revision_id != VMCS12_REVISION ||
(new_vmcs12->hdr.shadow_vmcs &&
 !nested_cpu_has_vmx_shadow_vmcs(vcpu))) {
-   kunmap(page);
-   kvm_release_page_clean(page);
+   kvm_vcpu_unmap(&map, false);
return nested_vmx_failValid(vcpu,
VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID);
}
@@ -4553,8 +4553,7 @@ static int handle_vmptrld(struct kvm_vcpu *vcpu)
 * cached.
 */
memcpy(vmx->nested.cached_vmcs12, new_vmcs12, VMCS12_SIZE);
-   kunmap(page);
-   kvm_release_page_clean(page);
+   kvm_vcpu_unmap(&map, false);
 
set_current_vmptr(vmx, vmptr);
}
-- 
2.7.4



[PATCH] KVM/nVMX: Stop mapping the "APIC-access address" page into the kernel

2018-12-03 Thread KarimAllah Ahmed
The "APIC-access address" is simply a token that the hypervisor puts into
the PFN of a 4K EPTE (or PTE if using shadow paging) that triggers APIC
virtualization whenever a page walk terminates with that PFN. This address
has to be a legal address (i.e. within the physical address range supported
by the CPU), but it need not have WB memory behind it. In fact, it need not
have anything at all behind it. When bit 31 ("activate secondary controls")
of the primary processor-based VM-execution controls is set and bit 0
("virtualize APIC accesses") of the secondary processor-based VM-execution
controls is set, the PFN recorded in the VMCS "APIC-access address" field
will never be touched. (Instead, the access triggers APIC virtualization,
which may access the PFN recorded in the "Virtual-APIC address" field of
the VMCS.)

So stop mapping the "APIC-access address" page into the kernel and even
drop the requirement to have a valid page backing it. Instead, just use
some token that:

1) Is not one of the valid guest pages.
2) Is within the physical address range supported by the CPU.

Suggested-by: Jim Mattson 
Signed-off-by: KarimAllah Ahmed 
---

Thanks Jim for the commit message :)
---
 arch/x86/include/asm/kvm_host.h |  1 +
 arch/x86/kvm/mmu.c  | 10 ++
 arch/x86/kvm/vmx.c  | 71 ++---
 3 files changed, 42 insertions(+), 40 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index fbda5a9..7e50196 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1077,6 +1077,7 @@ struct kvm_x86_ops {
void (*load_eoi_exitmap)(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap);
void (*set_virtual_apic_mode)(struct kvm_vcpu *vcpu);
void (*set_apic_access_page_addr)(struct kvm_vcpu *vcpu, hpa_t hpa);
+   bool (*nested_apic_access_addr)(struct kvm_vcpu *vcpu, gpa_t gpa, hpa_t 
*hpa);
void (*deliver_posted_interrupt)(struct kvm_vcpu *vcpu, int vector);
int (*sync_pir_to_irr)(struct kvm_vcpu *vcpu);
int (*set_tss_addr)(struct kvm *kvm, unsigned int addr);
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 7c03c0f..ae46a8d 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -3962,9 +3962,19 @@ bool kvm_can_do_async_pf(struct kvm_vcpu *vcpu)
 static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
 gva_t gva, kvm_pfn_t *pfn, bool write, bool *writable)
 {
+   hpa_t hpa;
struct kvm_memory_slot *slot;
bool async;
 
+   if (is_guest_mode(vcpu) &&
+   kvm_x86_ops->nested_apic_access_addr &&
+   kvm_x86_ops->nested_apic_access_addr(vcpu, gfn_to_gpa(gfn), &hpa)) {
+   *pfn = hpa >> PAGE_SHIFT;
+   if (writable)
+   *writable = true;
+   return false;
+   }
+
/*
 * Don't expose private memslots to L2.
 */
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 83a614f..340cf56 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -864,7 +864,6 @@ struct nested_vmx {
 * Guest pages referred to in the vmcs02 with host-physical
 * pointers, so we must keep them pinned while L2 runs.
 */
-   struct page *apic_access_page;
struct kvm_host_map virtual_apic_map;
struct kvm_host_map pi_desc_map;
struct kvm_host_map msr_bitmap_map;
@@ -8512,10 +8511,6 @@ static void free_nested(struct kvm_vcpu *vcpu)
kfree(vmx->nested.cached_vmcs12);
kfree(vmx->nested.cached_shadow_vmcs12);
/* Unpin physical memory we referred to in the vmcs02 */
-   if (vmx->nested.apic_access_page) {
-   kvm_release_page_dirty(vmx->nested.apic_access_page);
-   vmx->nested.apic_access_page = NULL;
-   }
kvm_vcpu_unmap(&vmx->nested.virtual_apic_map);
kvm_vcpu_unmap(&vmx->nested.pi_desc_map);
vmx->nested.pi_desc = NULL;
@@ -11901,41 +11896,27 @@ static void vmx_inject_page_fault_nested(struct 
kvm_vcpu *vcpu,
 static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu,
 struct vmcs12 *vmcs12);
 
+static hpa_t vmx_apic_access_addr(void)
+{
+   /*
+* The physical address chosen here has to:
+* 1) Never be an address that could be assigned to a guest.
+* 2) Within the maximum physical limits of the CPU.
+*
+* So our choice below is completely random, but at least it follows
+* these two rules.
+*/
+   return __pa_symbol(_text) & PAGE_MASK;
+}
+
 static void nested_get_vmcs12_pages(struct kvm_vcpu *vcpu)
 {
struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
struct vcpu_vmx *vmx = to_vmx(vcpu);
struct kvm_hos

[PATCH v4 05/14] KVM: Introduce a new guest mapping API

2018-12-03 Thread KarimAllah Ahmed
In KVM, especially for nested guests, there is a dominant pattern of:

=> map guest memory -> do_something -> unmap guest memory

In addition to all the unnecessary boilerplate noise this adds to the code,
most of the time the mapping function does not properly handle memory that
is not backed by a "struct page". This new guest mapping API encapsulates
most of that boilerplate and also handles guest memory that is not backed
by a "struct page".

The current implementation of this API uses memremap for memory that is
not backed by a "struct page", which would lead to a huge slow-down if it
were used for high-frequency mapping operations. The API does not have any
effect on current setups where guest memory is backed by a "struct page".
Further patches will also introduce a pfn-cache which will significantly
improve the performance of the memremap case.
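
(Illustration only, not part of the patch: a minimal usage sketch assuming
"vcpu" and "gpa" are in scope. Note that in this v4 the unmap helper takes
no dirty flag and always releases the pfn as dirty.)

	struct kvm_host_map map;

	if (kvm_vcpu_map(vcpu, gpa_to_gfn(gpa), &map))
		return -EFAULT;

	/* Access the guest frame through the host virtual address. */
	((u8 *)map.hva)[offset_in_page(gpa)] = 0;

	/* Unmap; this version always releases the pfn as dirty. */
	kvm_vcpu_unmap(&map);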

Signed-off-by: KarimAllah Ahmed 
---
v3 -> v4:
- Update the commit message.
v1 -> v2:
- Drop the caching optimization (pbonzini)
- Use 'hva' instead of 'kaddr' (pbonzini)
- Return 0/-EINVAL/-EFAULT instead of true/false. -EFAULT will be used for
  AMD patch (pbonzini)
- Introduce __kvm_map_gfn which accepts a memory slot and use it (pbonzini)
- Only clear map->hva instead of memsetting the whole structure.
- Drop kvm_vcpu_map_valid since it is no longer used.
- Fix EXPORT_MODULE naming.
---
 include/linux/kvm_host.h |  9 +
 virt/kvm/kvm_main.c  | 50 
 2 files changed, 59 insertions(+)

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index c926698..59e56b8 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -205,6 +205,13 @@ enum {
READING_SHADOW_PAGE_TABLES,
 };
 
+struct kvm_host_map {
+   struct page *page;
+   void *hva;
+   kvm_pfn_t pfn;
+   kvm_pfn_t gfn;
+};
+
 /*
  * Sometimes a large or cross-page mmio needs to be broken up into separate
  * exits for userspace servicing.
@@ -708,6 +715,8 @@ struct kvm_memslots *kvm_vcpu_memslots(struct kvm_vcpu 
*vcpu);
 struct kvm_memory_slot *kvm_vcpu_gfn_to_memslot(struct kvm_vcpu *vcpu, gfn_t 
gfn);
 kvm_pfn_t kvm_vcpu_gfn_to_pfn_atomic(struct kvm_vcpu *vcpu, gfn_t gfn);
 kvm_pfn_t kvm_vcpu_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn);
+int kvm_vcpu_map(struct kvm_vcpu *vcpu, gpa_t gpa, struct kvm_host_map *map);
+void kvm_vcpu_unmap(struct kvm_host_map *map);
 struct page *kvm_vcpu_gfn_to_page(struct kvm_vcpu *vcpu, gfn_t gfn);
 unsigned long kvm_vcpu_gfn_to_hva(struct kvm_vcpu *vcpu, gfn_t gfn);
 unsigned long kvm_vcpu_gfn_to_hva_prot(struct kvm_vcpu *vcpu, gfn_t gfn, bool 
*writable);
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 2679e47..ea7c82f 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -1658,6 +1658,56 @@ struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn)
 }
 EXPORT_SYMBOL_GPL(gfn_to_page);
 
+static int __kvm_map_gfn(struct kvm_memory_slot *slot, gfn_t gfn,
+struct kvm_host_map *map)
+{
+   kvm_pfn_t pfn;
+   void *hva = NULL;
+   struct page *page = NULL;
+
+   pfn = gfn_to_pfn_memslot(slot, gfn);
+   if (is_error_noslot_pfn(pfn))
+   return -EINVAL;
+
+   if (pfn_valid(pfn)) {
+   page = pfn_to_page(pfn);
+   hva = kmap(page);
+   } else {
+   hva = memremap(pfn_to_hpa(pfn), PAGE_SIZE, MEMREMAP_WB);
+   }
+
+   if (!hva)
+   return -EFAULT;
+
+   map->page = page;
+   map->hva = hva;
+   map->pfn = pfn;
+   map->gfn = gfn;
+
+   return 0;
+}
+
+int kvm_vcpu_map(struct kvm_vcpu *vcpu, gfn_t gfn, struct kvm_host_map *map)
+{
+   return __kvm_map_gfn(kvm_vcpu_gfn_to_memslot(vcpu, gfn), gfn, map);
+}
+EXPORT_SYMBOL_GPL(kvm_vcpu_map);
+
+void kvm_vcpu_unmap(struct kvm_host_map *map)
+{
+   if (!map->hva)
+   return;
+
+   if (map->page)
+   kunmap(map->page);
+   else
+   memunmap(map->hva);
+
+   kvm_release_pfn_dirty(map->pfn);
+   map->hva = NULL;
+}
+EXPORT_SYMBOL_GPL(kvm_vcpu_unmap);
+
 struct page *kvm_vcpu_gfn_to_page(struct kvm_vcpu *vcpu, gfn_t gfn)
 {
kvm_pfn_t pfn;
-- 
2.7.4



[PATCH v4 00/14] KVM/X86: Introduce a new guest mapping interface

2018-12-03 Thread KarimAllah Ahmed
Guest memory can either be directly managed by the kernel (i.e. have a "struct
page") or it can simply live outside kernel control (i.e. not have a
"struct page"). KVM mostly supports both modes, except in a few places
where the code seems to assume that guest memory must have a "struct page".

This patchset introduces a new mapping interface to map guest memory into host
kernel memory which also supports PFN-based memory (i.e. memory without 'struct
page'). It also converts all offending code either to this interface or to
reading/writing directly from guest memory.

As far as I can see, all offending code is now fixed except the APIC-access
page, which I will handle in a separate series along with dropping
kvm_vcpu_gfn_to_page and kvm_vcpu_gpa_to_page from the internal KVM API.

The current implementation of the new API uses memremap to map memory that does
not have a "struct page". This proves to be very slow for high-frequency
mappings. Since this does not affect the normal use-case where a "struct page"
is available, the performance of this API will be addressed by a separate patch
series.
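
(Illustration only: the typical before/after shape of the conversions in this
series; "vcpu", "gpa" and "ptr" are placeholders assumed to be in scope.)

Before:

	page = kvm_vcpu_gpa_to_page(vcpu, gpa);
	if (is_error_page(page))
		return -EFAULT;
	ptr = kmap(page);
	/* ... use ptr ... */
	kunmap(page);
	kvm_release_page_dirty(page);

After:

	if (kvm_vcpu_map(vcpu, gpa_to_gfn(gpa), &map))
		return -EFAULT;
	ptr = map.hva;
	/* ... use ptr ... */
	kvm_vcpu_unmap(&map);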

v3 -> v4:
- Rebase
- Add a new patch to also fix the newly introduced enhanced VMCS.

v2 -> v3:
- Rebase
- Add a new patch to also fix the newly introduced shadow VMCS.

Filippo Sironi (1):
  X86/KVM: Handle PFNs outside of kernel reach when touching GPTEs

KarimAllah Ahmed (13):
  X86/nVMX: handle_vmon: Read 4 bytes from guest memory
  X86/nVMX: handle_vmptrld: Copy the VMCS12 directly from guest memory
  X86/nVMX: Update the PML table without mapping and unmapping the page
  KVM: Introduce a new guest mapping API
  KVM/nVMX: Use kvm_vcpu_map when mapping the L1 MSR bitmap
  KVM/nVMX: Use kvm_vcpu_map when mapping the virtual APIC page
  KVM/nVMX: Use kvm_vcpu_map when mapping the posted interrupt
descriptor table
  KVM/X86: Use kvm_vcpu_map in emulator_cmpxchg_emulated
  KVM/X86: hyperv: Use kvm_vcpu_map in synic_clear_sint_msg_pending
  KVM/X86: hyperv: Use kvm_vcpu_map in synic_deliver_msg
  KVM/nSVM: Use the new mapping API for mapping guest memory
  KVM/nVMX: Use kvm_vcpu_map for accessing the shadow VMCS
  KVM/nVMX: Use kvm_vcpu_map for accessing the enhanced VMCS

 arch/x86/kvm/hyperv.c  |  28 +++
 arch/x86/kvm/paging_tmpl.h |  38 ++---
 arch/x86/kvm/svm.c |  97 +++
 arch/x86/kvm/vmx.c | 189 +
 arch/x86/kvm/x86.c |  13 ++--
 include/linux/kvm_host.h   |   9 +++
 virt/kvm/kvm_main.c|  50 
 7 files changed, 228 insertions(+), 196 deletions(-)

-- 
2.7.4



[PATCH v4 13/14] KVM/nVMX: Use kvm_vcpu_map for accessing the shadow VMCS

2018-12-03 Thread KarimAllah Ahmed
Use kvm_vcpu_map for accessing the shadow VMCS since using
kvm_vcpu_gpa_to_page() and kmap() will only work for guest memory that has
a "struct page".

Signed-off-by: KarimAllah Ahmed 
---
 arch/x86/kvm/vmx.c | 25 -
 1 file changed, 12 insertions(+), 13 deletions(-)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 390a971..a958700 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -12138,20 +12138,20 @@ static inline bool 
nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu,
 static void nested_cache_shadow_vmcs12(struct kvm_vcpu *vcpu,
   struct vmcs12 *vmcs12)
 {
+   struct kvm_host_map map;
struct vmcs12 *shadow;
-   struct page *page;
 
if (!nested_cpu_has_shadow_vmcs(vmcs12) ||
vmcs12->vmcs_link_pointer == -1ull)
return;
 
shadow = get_shadow_vmcs12(vcpu);
-   page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->vmcs_link_pointer);
 
-   memcpy(shadow, kmap(page), VMCS12_SIZE);
+   if (kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->vmcs_link_pointer), &map))
+   return;
 
-   kunmap(page);
-   kvm_release_page_clean(page);
+   memcpy(shadow, map.hva, VMCS12_SIZE);
+   kvm_vcpu_unmap(&map);
 }
 
 static void nested_flush_cached_shadow_vmcs12(struct kvm_vcpu *vcpu,
@@ -13133,9 +13133,9 @@ static int check_vmentry_prereqs(struct kvm_vcpu *vcpu, 
struct vmcs12 *vmcs12)
 static int nested_vmx_check_vmcs_link_ptr(struct kvm_vcpu *vcpu,
  struct vmcs12 *vmcs12)
 {
-   int r;
-   struct page *page;
+   int r = 0;
struct vmcs12 *shadow;
+   struct kvm_host_map map;
 
if (vmcs12->vmcs_link_pointer == -1ull)
return 0;
@@ -13143,17 +13143,16 @@ static int nested_vmx_check_vmcs_link_ptr(struct 
kvm_vcpu *vcpu,
if (!page_address_valid(vcpu, vmcs12->vmcs_link_pointer))
return -EINVAL;
 
-   page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->vmcs_link_pointer);
-   if (is_error_page(page))
+   if (kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->vmcs_link_pointer), &map))
return -EINVAL;
 
-   r = 0;
-   shadow = kmap(page);
+   shadow = map.hva;
+
if (shadow->hdr.revision_id != VMCS12_REVISION ||
shadow->hdr.shadow_vmcs != nested_cpu_has_shadow_vmcs(vmcs12))
r = -EINVAL;
-   kunmap(page);
-   kvm_release_page_clean(page);
+
+   kvm_vcpu_unmap(&map);
return r;
 }
 
-- 
2.7.4



[PATCH v4 07/14] KVM/nVMX: Use kvm_vcpu_map when mapping the virtual APIC page

2018-12-03 Thread KarimAllah Ahmed
Use kvm_vcpu_map when mapping the virtual APIC page since using
kvm_vcpu_gpa_to_page() and kmap() will only work for guest memory that has
a "struct page".

One additional semantic change is that the virtual host mapping lifecycle
has changed slightly: it now has the same lifetime as the pinning of the
virtual APIC page on the host side.

Signed-off-by: KarimAllah Ahmed 
---
v1 -> v2:
- Do not change the lifecycle of the mapping (pbonzini)
- Use pfn_to_hpa instead of gfn_to_gpa
---
 arch/x86/kvm/vmx.c | 39 +--
 1 file changed, 13 insertions(+), 26 deletions(-)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index cca3ba0..4a99289 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -865,9 +865,8 @@ struct nested_vmx {
 * pointers, so we must keep them pinned while L2 runs.
 */
struct page *apic_access_page;
-   struct page *virtual_apic_page;
+   struct kvm_host_map virtual_apic_map;
struct page *pi_desc_page;
-
struct kvm_host_map msr_bitmap_map;
 
struct pi_desc *pi_desc;
@@ -6183,11 +6182,12 @@ static void vmx_complete_nested_posted_interrupt(struct 
kvm_vcpu *vcpu)
 
max_irr = find_last_bit((unsigned long *)vmx->nested.pi_desc->pir, 256);
if (max_irr != 256) {
-   vapic_page = kmap(vmx->nested.virtual_apic_page);
+   vapic_page = vmx->nested.virtual_apic_map.hva;
+   if (!vapic_page)
+   return;
+
__kvm_apic_update_irr(vmx->nested.pi_desc->pir,
vapic_page, &max_irr);
-   kunmap(vmx->nested.virtual_apic_page);
-
status = vmcs_read16(GUEST_INTR_STATUS);
if ((u8)max_irr > ((u8)status & 0xff)) {
status &= ~0xff;
@@ -6213,14 +6213,13 @@ static bool vmx_guest_apic_has_interrupt(struct 
kvm_vcpu *vcpu)
 
if (WARN_ON_ONCE(!is_guest_mode(vcpu)) ||
!nested_cpu_has_vid(get_vmcs12(vcpu)) ||
-   WARN_ON_ONCE(!vmx->nested.virtual_apic_page))
+   WARN_ON_ONCE(!vmx->nested.virtual_apic_map.gfn))
return false;
 
rvi = vmx_get_rvi();
 
-   vapic_page = kmap(vmx->nested.virtual_apic_page);
+   vapic_page = vmx->nested.virtual_apic_map.hva;
vppr = *((u32 *)(vapic_page + APIC_PROCPRI));
-   kunmap(vmx->nested.virtual_apic_page);
 
return ((rvi & 0xf0) > (vppr & 0xf0));
 }
@@ -8519,10 +8518,7 @@ static void free_nested(struct kvm_vcpu *vcpu)
kvm_release_page_dirty(vmx->nested.apic_access_page);
vmx->nested.apic_access_page = NULL;
}
-   if (vmx->nested.virtual_apic_page) {
-   kvm_release_page_dirty(vmx->nested.virtual_apic_page);
-   vmx->nested.virtual_apic_page = NULL;
-   }
+   kvm_vcpu_unmap(&vmx->nested.virtual_apic_map);
if (vmx->nested.pi_desc_page) {
kunmap(vmx->nested.pi_desc_page);
kvm_release_page_dirty(vmx->nested.pi_desc_page);
@@ -11917,6 +11913,7 @@ static void nested_get_vmcs12_pages(struct kvm_vcpu 
*vcpu)
 {
struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
struct vcpu_vmx *vmx = to_vmx(vcpu);
+   struct kvm_host_map *map;
struct page *page;
u64 hpa;
 
@@ -11949,11 +11946,7 @@ static void nested_get_vmcs12_pages(struct kvm_vcpu 
*vcpu)
}
 
if (nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) {
-   if (vmx->nested.virtual_apic_page) { /* shouldn't happen */
-   kvm_release_page_dirty(vmx->nested.virtual_apic_page);
-   vmx->nested.virtual_apic_page = NULL;
-   }
-   page = kvm_vcpu_gpa_to_page(vcpu, 
vmcs12->virtual_apic_page_addr);
+   map = &vmx->nested.virtual_apic_map;
 
/*
 * If translation failed, VM entry will fail because
@@ -11968,11 +11961,8 @@ static void nested_get_vmcs12_pages(struct kvm_vcpu 
*vcpu)
 * control.  But such a configuration is useless, so
 * let's keep the code simple.
 */
-   if (!is_error_page(page)) {
-   vmx->nested.virtual_apic_page = page;
-   hpa = page_to_phys(vmx->nested.virtual_apic_page);
-   vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, hpa);
-   }
+   if (!kvm_vcpu_map(vcpu, 
gpa_to_gfn(vmcs12->virtual_apic_page_addr), map))
+   vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, 
pfn_to_hpa(map->pfn));
}
 
if (nested_cpu_has_posted_intr(vmcs12)) {
@@ -14228,10 +14218,7 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, 
u32 exit_reason,
kvm_release_page_dirty(vmx->nested.apic_access_page);

[PATCH v4 08/14] KVM/nVMX: Use kvm_vcpu_map when mapping the posted interrupt descriptor table

2018-12-03 Thread KarimAllah Ahmed
Use kvm_vcpu_map when mapping the posted interrupt descriptor table since
using kvm_vcpu_gpa_to_page() and kmap() will only work for guest memory
that has a "struct page".

One additional semantic change is that the virtual host mapping lifecycle
has changed slightly: it now has the same lifetime as the pinning of the
interrupt descriptor table page on the host side.

Signed-off-by: KarimAllah Ahmed 
---
v1 -> v2:
- Do not change the lifecycle of the mapping (pbonzini)
---
 arch/x86/kvm/vmx.c | 45 +++--
 1 file changed, 15 insertions(+), 30 deletions(-)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 4a99289..390a971 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -866,7 +866,7 @@ struct nested_vmx {
 */
struct page *apic_access_page;
struct kvm_host_map virtual_apic_map;
-   struct page *pi_desc_page;
+   struct kvm_host_map pi_desc_map;
struct kvm_host_map msr_bitmap_map;
 
struct pi_desc *pi_desc;
@@ -8519,12 +8519,8 @@ static void free_nested(struct kvm_vcpu *vcpu)
vmx->nested.apic_access_page = NULL;
}
kvm_vcpu_unmap(&vmx->nested.virtual_apic_map);
-   if (vmx->nested.pi_desc_page) {
-   kunmap(vmx->nested.pi_desc_page);
-   kvm_release_page_dirty(vmx->nested.pi_desc_page);
-   vmx->nested.pi_desc_page = NULL;
-   vmx->nested.pi_desc = NULL;
-   }
+   kvm_vcpu_unmap(&vmx->nested.pi_desc_map);
+   vmx->nested.pi_desc = NULL;
 
kvm_mmu_free_roots(vcpu, &vcpu->arch.guest_mmu, KVM_MMU_ROOTS_ALL);
 
@@ -11966,24 +11962,16 @@ static void nested_get_vmcs12_pages(struct kvm_vcpu 
*vcpu)
}
 
if (nested_cpu_has_posted_intr(vmcs12)) {
-   if (vmx->nested.pi_desc_page) { /* shouldn't happen */
-   kunmap(vmx->nested.pi_desc_page);
-   kvm_release_page_dirty(vmx->nested.pi_desc_page);
-   vmx->nested.pi_desc_page = NULL;
+   map = &vmx->nested.pi_desc_map;
+
+   if (!kvm_vcpu_map(vcpu, 
gpa_to_gfn(vmcs12->posted_intr_desc_addr), map)) {
+   vmx->nested.pi_desc =
+   (struct pi_desc *)(((void *)map->hva) +
+   offset_in_page(vmcs12->posted_intr_desc_addr));
+   vmcs_write64(POSTED_INTR_DESC_ADDR, 
pfn_to_hpa(map->pfn) +
+   
offset_in_page(vmcs12->posted_intr_desc_addr));
}
-   page = kvm_vcpu_gpa_to_page(vcpu, 
vmcs12->posted_intr_desc_addr);
-   if (is_error_page(page))
-   return;
-   vmx->nested.pi_desc_page = page;
-   vmx->nested.pi_desc = kmap(vmx->nested.pi_desc_page);
-   vmx->nested.pi_desc =
-   (struct pi_desc *)((void *)vmx->nested.pi_desc +
-   (unsigned long)(vmcs12->posted_intr_desc_addr &
-   (PAGE_SIZE - 1)));
-   vmcs_write64(POSTED_INTR_DESC_ADDR,
-   page_to_phys(vmx->nested.pi_desc_page) +
-   (unsigned long)(vmcs12->posted_intr_desc_addr &
-   (PAGE_SIZE - 1)));
+
}
if (nested_vmx_prepare_msr_bitmap(vcpu, vmcs12))
vmcs_set_bits(CPU_BASED_VM_EXEC_CONTROL,
@@ -14218,13 +14206,10 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, 
u32 exit_reason,
kvm_release_page_dirty(vmx->nested.apic_access_page);
vmx->nested.apic_access_page = NULL;
}
+
kvm_vcpu_unmap(&vmx->nested.virtual_apic_map);
-   if (vmx->nested.pi_desc_page) {
-   kunmap(vmx->nested.pi_desc_page);
-   kvm_release_page_dirty(vmx->nested.pi_desc_page);
-   vmx->nested.pi_desc_page = NULL;
-   vmx->nested.pi_desc = NULL;
-   }
+   kvm_vcpu_unmap(&vmx->nested.pi_desc_map);
+   vmx->nested.pi_desc = NULL;
 
/*
 * We are now running in L2, mmu_notifier will force to reload the
-- 
2.7.4



[PATCH v4 04/14] X86/KVM: Handle PFNs outside of kernel reach when touching GPTEs

2018-12-03 Thread KarimAllah Ahmed
From: Filippo Sironi 

cmpxchg_gpte() calls get_user_pages_fast() to retrieve the number of
pages and the respective struct page to map in the kernel virtual
address space.
This doesn't work if get_user_pages_fast() is invoked with a userspace
virtual address that's backed by PFNs outside of kernel reach (e.g., when
limiting the kernel memory with mem= in the command line and using
/dev/mem to map memory).

If get_user_pages_fast() fails, look up the VMA that backs the userspace
virtual address, compute the PFN and the physical address, and map it in
the kernel virtual address space with memremap().

Signed-off-by: Filippo Sironi 
Signed-off-by: KarimAllah Ahmed 
---
 arch/x86/kvm/paging_tmpl.h | 38 +-
 1 file changed, 29 insertions(+), 9 deletions(-)

diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 7cf2185..b953545 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -141,15 +141,35 @@ static int FNAME(cmpxchg_gpte)(struct kvm_vcpu *vcpu, 
struct kvm_mmu *mmu,
struct page *page;
 
npages = get_user_pages_fast((unsigned long)ptep_user, 1, 1, &page);
-   /* Check if the user is doing something meaningless. */
-   if (unlikely(npages != 1))
-   return -EFAULT;
-
-   table = kmap_atomic(page);
-   ret = CMPXCHG(&table[index], orig_pte, new_pte);
-   kunmap_atomic(table);
-
-   kvm_release_page_dirty(page);
+   if (likely(npages == 1)) {
+   table = kmap_atomic(page);
+   ret = CMPXCHG(&table[index], orig_pte, new_pte);
+   kunmap_atomic(table);
+
+   kvm_release_page_dirty(page);
+   } else {
+   struct vm_area_struct *vma;
+   unsigned long vaddr = (unsigned long)ptep_user & PAGE_MASK;
+   unsigned long pfn;
+   unsigned long paddr;
+
+   down_read(&current->mm->mmap_sem);
+   vma = find_vma_intersection(current->mm, vaddr, vaddr + 
PAGE_SIZE);
+   if (!vma || !(vma->vm_flags & VM_PFNMAP)) {
+   up_read(&current->mm->mmap_sem);
+   return -EFAULT;
+   }
+   pfn = ((vaddr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
+   paddr = pfn << PAGE_SHIFT;
+   table = memremap(paddr, PAGE_SIZE, MEMREMAP_WB);
+   if (!table) {
+   up_read(&current->mm->mmap_sem);
+   return -EFAULT;
+   }
+   ret = CMPXCHG(&table[index], orig_pte, new_pte);
+   memunmap(table);
+   up_read(&current->mm->mmap_sem);
+   }
 
return (ret != orig_pte);
 }
-- 
2.7.4



[PATCH v4 09/14] KVM/X86: Use kvm_vcpu_map in emulator_cmpxchg_emulated

2018-12-03 Thread KarimAllah Ahmed
Use kvm_vcpu_map in emulator_cmpxchg_emulated since using
kvm_vcpu_gpa_to_page() and kmap() will only work for guest memory that has
a "struct page".

Signed-off-by: KarimAllah Ahmed 
---
v1 -> v2:
- Update to match the new API return codes
---
 arch/x86/kvm/x86.c | 13 ++---
 1 file changed, 6 insertions(+), 7 deletions(-)

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index d029377..81d75af 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -5417,9 +5417,9 @@ static int emulator_cmpxchg_emulated(struct 
x86_emulate_ctxt *ctxt,
 unsigned int bytes,
 struct x86_exception *exception)
 {
+   struct kvm_host_map map;
struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
gpa_t gpa;
-   struct page *page;
char *kaddr;
bool exchanged;
 
@@ -5436,12 +5436,11 @@ static int emulator_cmpxchg_emulated(struct 
x86_emulate_ctxt *ctxt,
if (((gpa + bytes - 1) & PAGE_MASK) != (gpa & PAGE_MASK))
goto emul_write;
 
-   page = kvm_vcpu_gfn_to_page(vcpu, gpa >> PAGE_SHIFT);
-   if (is_error_page(page))
+   if (kvm_vcpu_map(vcpu, gpa_to_gfn(gpa), &map))
goto emul_write;
 
-   kaddr = kmap_atomic(page);
-   kaddr += offset_in_page(gpa);
+   kaddr = map.hva + offset_in_page(gpa);
+
switch (bytes) {
case 1:
exchanged = CMPXCHG_TYPE(u8, kaddr, old, new);
@@ -5458,8 +5457,8 @@ static int emulator_cmpxchg_emulated(struct 
x86_emulate_ctxt *ctxt,
default:
BUG();
}
-   kunmap_atomic(kaddr);
-   kvm_release_page_dirty(page);
+
+   kvm_vcpu_unmap(&map);
 
if (!exchanged)
return X86EMUL_CMPXCHG_FAILED;
-- 
2.7.4



[PATCH v4 10/14] KVM/X86: hyperv: Use kvm_vcpu_map in synic_clear_sint_msg_pending

2018-12-03 Thread KarimAllah Ahmed
Use kvm_vcpu_map in synic_clear_sint_msg_pending since using
kvm_vcpu_gpa_to_page() and kmap() will only work for guest memory that has
a "struct page".

Signed-off-by: KarimAllah Ahmed 
---
v1 -> v2:
- Update to match the new API return codes
---
 arch/x86/kvm/hyperv.c | 16 ++--
 1 file changed, 6 insertions(+), 10 deletions(-)

diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c
index 4e80080..63941ac 100644
--- a/arch/x86/kvm/hyperv.c
+++ b/arch/x86/kvm/hyperv.c
@@ -162,26 +162,22 @@ static void synic_clear_sint_msg_pending(struct 
kvm_vcpu_hv_synic *synic,
u32 sint)
 {
struct kvm_vcpu *vcpu = synic_to_vcpu(synic);
-   struct page *page;
-   gpa_t gpa;
+   struct kvm_host_map map;
struct hv_message *msg;
struct hv_message_page *msg_page;
 
-   gpa = synic->msg_page & PAGE_MASK;
-   page = kvm_vcpu_gfn_to_page(vcpu, gpa >> PAGE_SHIFT);
-   if (is_error_page(page)) {
+   if (kvm_vcpu_map(vcpu, gpa_to_gfn(synic->msg_page), &map)) {
vcpu_err(vcpu, "Hyper-V SynIC can't get msg page, gpa 0x%llx\n",
-gpa);
+synic->msg_page);
return;
}
-   msg_page = kmap_atomic(page);
 
+   msg_page = map.hva;
msg = &msg_page->sint_message[sint];
msg->header.message_flags.msg_pending = 0;
 
-   kunmap_atomic(msg_page);
-   kvm_release_page_dirty(page);
-   kvm_vcpu_mark_page_dirty(vcpu, gpa >> PAGE_SHIFT);
+   kvm_vcpu_unmap(&map);
+   kvm_vcpu_mark_page_dirty(vcpu, gpa_to_gfn(synic->msg_page));
 }
 
 static void kvm_hv_notify_acked_sint(struct kvm_vcpu *vcpu, u32 sint)
-- 
2.7.4



[PATCH v4 11/14] KVM/X86: hyperv: Use kvm_vcpu_map in synic_deliver_msg

2018-12-03 Thread KarimAllah Ahmed
Use kvm_vcpu_map in synic_deliver_msg since using kvm_vcpu_gpa_to_page()
and kmap() will only work for guest memory that has a "struct page".

Signed-off-by: KarimAllah Ahmed 
---
v1 -> v2:
- Update to match the new API return codes
---
 arch/x86/kvm/hyperv.c | 12 ++--
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c
index 63941ac..af6bd18 100644
--- a/arch/x86/kvm/hyperv.c
+++ b/arch/x86/kvm/hyperv.c
@@ -585,7 +585,7 @@ static int synic_deliver_msg(struct kvm_vcpu_hv_synic 
*synic, u32 sint,
 struct hv_message *src_msg)
 {
struct kvm_vcpu *vcpu = synic_to_vcpu(synic);
-   struct page *page;
+   struct kvm_host_map map;
gpa_t gpa;
struct hv_message *dst_msg;
int r;
@@ -595,11 +595,11 @@ static int synic_deliver_msg(struct kvm_vcpu_hv_synic 
*synic, u32 sint,
return -ENOENT;
 
gpa = synic->msg_page & PAGE_MASK;
-   page = kvm_vcpu_gfn_to_page(vcpu, gpa >> PAGE_SHIFT);
-   if (is_error_page(page))
+
+   if (kvm_vcpu_map(vcpu, gpa_to_gfn(gpa), &map))
return -EFAULT;
 
-   msg_page = kmap_atomic(page);
+   msg_page = map.hva;
dst_msg = &msg_page->sint_message[sint];
if (sync_cmpxchg(&dst_msg->header.message_type, HVMSG_NONE,
 src_msg->header.message_type) != HVMSG_NONE) {
@@ -616,8 +616,8 @@ static int synic_deliver_msg(struct kvm_vcpu_hv_synic 
*synic, u32 sint,
else if (r == 0)
r = -EFAULT;
}
-   kunmap_atomic(msg_page);
-   kvm_release_page_dirty(page);
+
+   kvm_vcpu_unmap(&map);
kvm_vcpu_mark_page_dirty(vcpu, gpa >> PAGE_SHIFT);
return r;
 }
-- 
2.7.4



[PATCH v4 14/14] KVM/nVMX: Use kvm_vcpu_map for accessing the enhanced VMCS

2018-12-03 Thread KarimAllah Ahmed
Use kvm_vcpu_map for accessing the enhanced VMCS since using
kvm_vcpu_gpa_to_page() and kmap() will only work for guest memory that has
a "struct page".

Signed-off-by: KarimAllah Ahmed 
---
 arch/x86/kvm/vmx.c | 16 ++--
 1 file changed, 6 insertions(+), 10 deletions(-)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index a958700..83a614f 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -894,7 +894,7 @@ struct nested_vmx {
} smm;
 
gpa_t hv_evmcs_vmptr;
-   struct page *hv_evmcs_page;
+   struct kvm_host_map hv_evmcs_map;
struct hv_enlightened_vmcs *hv_evmcs;
 };
 
@@ -8456,10 +8456,8 @@ static inline void nested_release_evmcs(struct kvm_vcpu 
*vcpu)
if (!vmx->nested.hv_evmcs)
return;
 
-   kunmap(vmx->nested.hv_evmcs_page);
-   kvm_release_page_dirty(vmx->nested.hv_evmcs_page);
+   kvm_vcpu_unmap(&vmx->nested.hv_evmcs_map);
vmx->nested.hv_evmcs_vmptr = -1ull;
-   vmx->nested.hv_evmcs_page = NULL;
vmx->nested.hv_evmcs = NULL;
 }
 
@@ -8559,7 +8557,7 @@ static int handle_vmclear(struct kvm_vcpu *vcpu)
return nested_vmx_failValid(vcpu,
VMXERR_VMCLEAR_VMXON_POINTER);
 
-   if (vmx->nested.hv_evmcs_page) {
+   if (vmx->nested.hv_evmcs_map.hva) {
if (vmptr == vmx->nested.hv_evmcs_vmptr)
nested_release_evmcs(vcpu);
} else {
@@ -9355,13 +9353,11 @@ static int nested_vmx_handle_enlightened_vmptrld(struct 
kvm_vcpu *vcpu,
 
nested_release_evmcs(vcpu);
 
-   vmx->nested.hv_evmcs_page = kvm_vcpu_gpa_to_page(
-   vcpu, assist_page.current_nested_vmcs);
-
-   if (unlikely(is_error_page(vmx->nested.hv_evmcs_page)))
+   if (kvm_vcpu_map(vcpu, 
gpa_to_gfn(assist_page.current_nested_vmcs),
+&vmx->nested.hv_evmcs_map))
return 0;
 
-   vmx->nested.hv_evmcs = kmap(vmx->nested.hv_evmcs_page);
+   vmx->nested.hv_evmcs = vmx->nested.hv_evmcs_map.hva;
 
/*
 * Currently, KVM only supports eVMCS version 1
-- 
2.7.4



[PATCH v4 12/14] KVM/nSVM: Use the new mapping API for mapping guest memory

2018-12-03 Thread KarimAllah Ahmed
Use the new mapping API for mapping guest memory to avoid depending on
"struct page".

Signed-off-by: KarimAllah Ahmed 
---
 arch/x86/kvm/svm.c | 97 +++---
 1 file changed, 49 insertions(+), 48 deletions(-)

diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index cc6467b..005cb2c 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -3053,32 +3053,6 @@ static inline bool nested_svm_nmi(struct vcpu_svm *svm)
return false;
 }
 
-static void *nested_svm_map(struct vcpu_svm *svm, u64 gpa, struct page **_page)
-{
-   struct page *page;
-
-   might_sleep();
-
-   page = kvm_vcpu_gfn_to_page(&svm->vcpu, gpa >> PAGE_SHIFT);
-   if (is_error_page(page))
-   goto error;
-
-   *_page = page;
-
-   return kmap(page);
-
-error:
-   kvm_inject_gp(&svm->vcpu, 0);
-
-   return NULL;
-}
-
-static void nested_svm_unmap(struct page *page)
-{
-   kunmap(page);
-   kvm_release_page_dirty(page);
-}
-
 static int nested_svm_intercept_ioio(struct vcpu_svm *svm)
 {
unsigned port, size, iopm_len;
@@ -3279,10 +3253,11 @@ static inline void copy_vmcb_control_area(struct vmcb 
*dst_vmcb, struct vmcb *fr
 
 static int nested_svm_vmexit(struct vcpu_svm *svm)
 {
+   int rc;
struct vmcb *nested_vmcb;
struct vmcb *hsave = svm->nested.hsave;
struct vmcb *vmcb = svm->vmcb;
-   struct page *page;
+   struct kvm_host_map map;
 
trace_kvm_nested_vmexit_inject(vmcb->control.exit_code,
   vmcb->control.exit_info_1,
@@ -3291,9 +3266,14 @@ static int nested_svm_vmexit(struct vcpu_svm *svm)
   vmcb->control.exit_int_info_err,
   KVM_ISA_SVM);
 
-   nested_vmcb = nested_svm_map(svm, svm->nested.vmcb, &page);
-   if (!nested_vmcb)
+   rc = kvm_vcpu_map(&svm->vcpu, gpa_to_gfn(svm->nested.vmcb), &map);
+   if (rc) {
+   if (rc == -EINVAL)
+   kvm_inject_gp(&svm->vcpu, 0);
return 1;
+   }
+
+   nested_vmcb = map.hva;
 
/* Exit Guest-Mode */
leave_guest_mode(&svm->vcpu);
@@ -3392,7 +3372,7 @@ static int nested_svm_vmexit(struct vcpu_svm *svm)
 
mark_all_dirty(svm->vmcb);
 
-   nested_svm_unmap(page);
+   kvm_vcpu_unmap(&map);
 
nested_svm_uninit_mmu_context(&svm->vcpu);
kvm_mmu_reset_context(&svm->vcpu);
@@ -3450,7 +3430,7 @@ static bool nested_vmcb_checks(struct vmcb *vmcb)
 }
 
 static void enter_svm_guest_mode(struct vcpu_svm *svm, u64 vmcb_gpa,
-struct vmcb *nested_vmcb, struct page *page)
+struct vmcb *nested_vmcb, struct kvm_host_map 
*map)
 {
if (kvm_get_rflags(&svm->vcpu) & X86_EFLAGS_IF)
svm->vcpu.arch.hflags |= HF_HIF_MASK;
@@ -3530,7 +3510,7 @@ static void enter_svm_guest_mode(struct vcpu_svm *svm, 
u64 vmcb_gpa,
svm->vmcb->control.event_inj = nested_vmcb->control.event_inj;
svm->vmcb->control.event_inj_err = nested_vmcb->control.event_inj_err;
 
-   nested_svm_unmap(page);
+   kvm_vcpu_unmap(map);
 
/* Enter Guest-Mode */
enter_guest_mode(&svm->vcpu);
@@ -3550,17 +3530,23 @@ static void enter_svm_guest_mode(struct vcpu_svm *svm, 
u64 vmcb_gpa,
 
 static bool nested_svm_vmrun(struct vcpu_svm *svm)
 {
+   int rc;
struct vmcb *nested_vmcb;
struct vmcb *hsave = svm->nested.hsave;
struct vmcb *vmcb = svm->vmcb;
-   struct page *page;
+   struct kvm_host_map map;
u64 vmcb_gpa;
 
vmcb_gpa = svm->vmcb->save.rax;
 
-   nested_vmcb = nested_svm_map(svm, svm->vmcb->save.rax, &page);
-   if (!nested_vmcb)
+   rc = kvm_vcpu_map(&svm->vcpu, gpa_to_gfn(vmcb_gpa), &map);
+   if (rc) {
+   if (rc == -EINVAL)
+   kvm_inject_gp(&svm->vcpu, 0);
return false;
+   }
+
+   nested_vmcb = map.hva;
 
if (!nested_vmcb_checks(nested_vmcb)) {
nested_vmcb->control.exit_code= SVM_EXIT_ERR;
@@ -3568,7 +3554,7 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm)
nested_vmcb->control.exit_info_1  = 0;
nested_vmcb->control.exit_info_2  = 0;
 
-   nested_svm_unmap(page);
+   kvm_vcpu_unmap(&map);
 
return false;
}
@@ -3612,7 +3598,7 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm)
 
copy_vmcb_control_area(hsave, vmcb);
 
-   enter_svm_guest_mode(svm, vmcb_gpa, nested_vmcb, page);
+   enter_svm_guest_mode(svm, vmcb_gpa, nested_vmcb, &map);
 
return true;
 }
@@ -3636,21 +3622,26 @@ static void nested_svm_vmloadsave

[PATCH v4 01/14] X86/nVMX: handle_vmon: Read 4 bytes from guest memory

2018-12-03 Thread KarimAllah Ahmed
Read the data directly from guest memory instead of the map->read->unmap
sequence. This also avoids using kvm_vcpu_gpa_to_page() and kmap(), which
assume that there is a "struct page" for guest memory.

Suggested-by: Jim Mattson 
Signed-off-by: KarimAllah Ahmed 
Reviewed-by: Jim Mattson 

---
v1 -> v2:
- Massage commit message a bit.
---
 arch/x86/kvm/vmx.c | 14 +++---
 1 file changed, 3 insertions(+), 11 deletions(-)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 02edd99..b84f230 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -8358,7 +8358,7 @@ static int handle_vmon(struct kvm_vcpu *vcpu)
 {
int ret;
gpa_t vmptr;
-   struct page *page;
+   uint32_t revision;
struct vcpu_vmx *vmx = to_vmx(vcpu);
const u64 VMXON_NEEDED_FEATURES = FEATURE_CONTROL_LOCKED
| FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX;
@@ -8407,18 +8407,10 @@ static int handle_vmon(struct kvm_vcpu *vcpu)
if (!PAGE_ALIGNED(vmptr) || (vmptr >> cpuid_maxphyaddr(vcpu)))
return nested_vmx_failInvalid(vcpu);
 
-   page = kvm_vcpu_gpa_to_page(vcpu, vmptr);
-   if (is_error_page(page))
+   if (kvm_read_guest(vcpu->kvm, vmptr, &revision, sizeof(revision)) ||
+   revision != VMCS12_REVISION)
return nested_vmx_failInvalid(vcpu);
 
-   if (*(u32 *)kmap(page) != VMCS12_REVISION) {
-   kunmap(page);
-   kvm_release_page_clean(page);
-   return nested_vmx_failInvalid(vcpu);
-   }
-   kunmap(page);
-   kvm_release_page_clean(page);
-
vmx->nested.vmxon_ptr = vmptr;
ret = enter_vmx_operation(vcpu);
if (ret)
-- 
2.7.4



[PATCH v4 06/14] KVM/nVMX: Use kvm_vcpu_map when mapping the L1 MSR bitmap

2018-12-03 Thread KarimAllah Ahmed
Use kvm_vcpu_map when mapping the L1 MSR bitmap since using
kvm_vcpu_gpa_to_page() and kmap() will only work for guest memory that has
a "struct page".

Signed-off-by: KarimAllah Ahmed 
---
v1 -> v2:
- Do not change the lifecycle of the mapping (pbonzini)
---
 arch/x86/kvm/vmx.c | 14 --
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 6d6dfa9..cca3ba0 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -867,6 +867,9 @@ struct nested_vmx {
struct page *apic_access_page;
struct page *virtual_apic_page;
struct page *pi_desc_page;
+
+   struct kvm_host_map msr_bitmap_map;
+
struct pi_desc *pi_desc;
bool pi_pending;
u16 posted_intr_nv;
@@ -12069,9 +12072,10 @@ static inline bool 
nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu,
 struct vmcs12 *vmcs12)
 {
int msr;
-   struct page *page;
unsigned long *msr_bitmap_l1;
unsigned long *msr_bitmap_l0 = to_vmx(vcpu)->nested.vmcs02.msr_bitmap;
+   struct kvm_host_map *map = &to_vmx(vcpu)->nested.msr_bitmap_map;
+
/*
 * pred_cmd & spec_ctrl are trying to verify two things:
 *
@@ -12097,11 +12101,10 @@ static inline bool 
nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu,
!pred_cmd && !spec_ctrl)
return false;
 
-   page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->msr_bitmap);
-   if (is_error_page(page))
+   if (kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->msr_bitmap), map))
return false;
 
-   msr_bitmap_l1 = (unsigned long *)kmap(page);
+   msr_bitmap_l1 = (unsigned long *)map->hva;
if (nested_cpu_has_apic_reg_virt(vmcs12)) {
/*
 * L0 need not intercept reads for MSRs between 0x800 and 
0x8ff, it
@@ -12149,8 +12152,7 @@ static inline bool nested_vmx_prepare_msr_bitmap(struct 
kvm_vcpu *vcpu,
MSR_IA32_PRED_CMD,
MSR_TYPE_W);
 
-   kunmap(page);
-   kvm_release_page_clean(page);
+   kvm_vcpu_unmap(&to_vmx(vcpu)->nested.msr_bitmap_map);
 
return true;
 }
-- 
2.7.4



[PATCH v4 02/14] X86/nVMX: handle_vmptrld: Copy the VMCS12 directly from guest memory

2018-12-03 Thread KarimAllah Ahmed
Copy the VMCS12 directly from guest memory instead of the map->copy->unmap
sequence. This also avoids using kvm_vcpu_gpa_to_page() and kmap(), which
assume that there is a "struct page" for guest memory.

Signed-off-by: KarimAllah Ahmed 
---
v3 -> v4:
- Return VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID on failure (jmattson@)
v1 -> v2:
- Massage commit message a bit.
---
 arch/x86/kvm/vmx.c | 24 
 1 file changed, 12 insertions(+), 12 deletions(-)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index b84f230..75817cb 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -9301,20 +9301,22 @@ static int handle_vmptrld(struct kvm_vcpu *vcpu)
return 1;
 
if (vmx->nested.current_vmptr != vmptr) {
-   struct vmcs12 *new_vmcs12;
-   struct page *page;
-   page = kvm_vcpu_gpa_to_page(vcpu, vmptr);
-   if (is_error_page(page))
-   return nested_vmx_failInvalid(vcpu);
+   struct vmcs12 *new_vmcs12 = (struct vmcs12 
*)__get_free_page(GFP_KERNEL);
+
+   if (!new_vmcs12 ||
+   kvm_read_guest(vcpu->kvm, vmptr, new_vmcs12,
+  sizeof(*new_vmcs12))) {
+   free_page((unsigned long)new_vmcs12);
+   return nested_vmx_failValid(vcpu,
+   
VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID);
+   }
 
-   new_vmcs12 = kmap(page);
if (new_vmcs12->hdr.revision_id != VMCS12_REVISION ||
(new_vmcs12->hdr.shadow_vmcs &&
 !nested_cpu_has_vmx_shadow_vmcs(vcpu))) {
-   kunmap(page);
-   kvm_release_page_clean(page);
+   free_page((unsigned long)new_vmcs12);
return nested_vmx_failValid(vcpu,
-   VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID);
+   
VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID);
}
 
nested_release_vmcs12(vcpu);
@@ -9324,9 +9326,7 @@ static int handle_vmptrld(struct kvm_vcpu *vcpu)
 * cached.
 */
memcpy(vmx->nested.cached_vmcs12, new_vmcs12, VMCS12_SIZE);
-   kunmap(page);
-   kvm_release_page_clean(page);
-
+   free_page((unsigned long)new_vmcs12);
set_current_vmptr(vmx, vmptr);
}
 
-- 
2.7.4



[PATCH v4 03/14] X86/nVMX: Update the PML table without mapping and unmapping the page

2018-12-03 Thread KarimAllah Ahmed
Update the PML table without mapping and unmapping the page. This also
avoids using kvm_vcpu_gpa_to_page(..) which assumes that there is a "struct
page" for guest memory.

Signed-off-by: KarimAllah Ahmed 
---
v1 -> v2:
- Use kvm_write_guest_page instead of kvm_write_guest (pbonzini)
- Do not use pointer arithmetic for pml_address (pbonzini)
---
 arch/x86/kvm/vmx.c | 14 +-
 1 file changed, 5 insertions(+), 9 deletions(-)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 75817cb..6d6dfa9 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -14427,9 +14427,7 @@ static int vmx_write_pml_buffer(struct kvm_vcpu *vcpu)
 {
struct vmcs12 *vmcs12;
struct vcpu_vmx *vmx = to_vmx(vcpu);
-   gpa_t gpa;
-   struct page *page = NULL;
-   u64 *pml_address;
+   gpa_t gpa, dst;
 
if (is_guest_mode(vcpu)) {
WARN_ON_ONCE(vmx->nested.pml_full);
@@ -14449,15 +14447,13 @@ static int vmx_write_pml_buffer(struct kvm_vcpu *vcpu)
}
 
gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS) & ~0xFFFull;
+   dst = vmcs12->pml_address + sizeof(u64) * 
vmcs12->guest_pml_index;
 
-   page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->pml_address);
-   if (is_error_page(page))
+   if (kvm_write_guest_page(vcpu->kvm, gpa_to_gfn(dst), &gpa,
+offset_in_page(dst), sizeof(gpa)))
return 0;
 
-   pml_address = kmap(page);
-   pml_address[vmcs12->guest_pml_index--] = gpa;
-   kunmap(page);
-   kvm_release_page_clean(page);
+   vmcs12->guest_pml_index--;
}
 
return 0;
-- 
2.7.4



[RESEND PATCH v3 07/13] KVM/nVMX: Use kvm_vcpu_map when mapping the virtual APIC page

2018-10-21 Thread KarimAllah Ahmed
Use kvm_vcpu_map when mapping the virtual APIC page since using
kvm_vcpu_gpa_to_page() and kmap() will only work for guest memory that has
a "struct page".

One additional semantic change is that the virtual host mapping lifecycle
has changed a bit. It now has the same lifetime as the pinning of the
virtual APIC page on the host side.

Signed-off-by: KarimAllah Ahmed 
---
v1 -> v2:
- Do not change the lifecycle of the mapping (pbonzini)
- Use pfn_to_hpa instead of gfn_to_gpa
---
 arch/x86/kvm/vmx.c | 39 +--
 1 file changed, 13 insertions(+), 26 deletions(-)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 5b15ca2..9683923 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -845,9 +845,8 @@ struct nested_vmx {
 * pointers, so we must keep them pinned while L2 runs.
 */
struct page *apic_access_page;
-   struct page *virtual_apic_page;
+   struct kvm_host_map virtual_apic_map;
struct page *pi_desc_page;
-
struct kvm_host_map msr_bitmap_map;
 
struct pi_desc *pi_desc;
@@ -6152,11 +6151,12 @@ static void vmx_complete_nested_posted_interrupt(struct 
kvm_vcpu *vcpu)
 
max_irr = find_last_bit((unsigned long *)vmx->nested.pi_desc->pir, 256);
if (max_irr != 256) {
-   vapic_page = kmap(vmx->nested.virtual_apic_page);
+   vapic_page = vmx->nested.virtual_apic_map.hva;
+   if (!vapic_page)
+   return;
+
__kvm_apic_update_irr(vmx->nested.pi_desc->pir,
vapic_page, &max_irr);
-   kunmap(vmx->nested.virtual_apic_page);
-
status = vmcs_read16(GUEST_INTR_STATUS);
if ((u8)max_irr > ((u8)status & 0xff)) {
status &= ~0xff;
@@ -6182,14 +6182,13 @@ static bool vmx_guest_apic_has_interrupt(struct 
kvm_vcpu *vcpu)
 
if (WARN_ON_ONCE(!is_guest_mode(vcpu)) ||
!nested_cpu_has_vid(get_vmcs12(vcpu)) ||
-   WARN_ON_ONCE(!vmx->nested.virtual_apic_page))
+   WARN_ON_ONCE(!vmx->nested.virtual_apic_map.gfn))
return false;
 
rvi = vmx_get_rvi();
 
-   vapic_page = kmap(vmx->nested.virtual_apic_page);
+   vapic_page = vmx->nested.virtual_apic_map.hva;
vppr = *((u32 *)(vapic_page + APIC_PROCPRI));
-   kunmap(vmx->nested.virtual_apic_page);
 
return ((rvi & 0xf0) > (vppr & 0xf0));
 }
@@ -8468,10 +8467,7 @@ static void free_nested(struct vcpu_vmx *vmx)
kvm_release_page_dirty(vmx->nested.apic_access_page);
vmx->nested.apic_access_page = NULL;
}
-   if (vmx->nested.virtual_apic_page) {
-   kvm_release_page_dirty(vmx->nested.virtual_apic_page);
-   vmx->nested.virtual_apic_page = NULL;
-   }
+   kvm_vcpu_unmap(&vmx->nested.virtual_apic_map);
if (vmx->nested.pi_desc_page) {
kunmap(vmx->nested.pi_desc_page);
kvm_release_page_dirty(vmx->nested.pi_desc_page);
@@ -11394,6 +11390,7 @@ static void nested_get_vmcs12_pages(struct kvm_vcpu 
*vcpu)
 {
struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
struct vcpu_vmx *vmx = to_vmx(vcpu);
+   struct kvm_host_map *map;
struct page *page;
u64 hpa;
 
@@ -11426,11 +11423,7 @@ static void nested_get_vmcs12_pages(struct kvm_vcpu 
*vcpu)
}
 
if (nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) {
-   if (vmx->nested.virtual_apic_page) { /* shouldn't happen */
-   kvm_release_page_dirty(vmx->nested.virtual_apic_page);
-   vmx->nested.virtual_apic_page = NULL;
-   }
-   page = kvm_vcpu_gpa_to_page(vcpu, 
vmcs12->virtual_apic_page_addr);
+   map = &vmx->nested.virtual_apic_map;
 
/*
 * If translation failed, VM entry will fail because
@@ -11445,11 +11438,8 @@ static void nested_get_vmcs12_pages(struct kvm_vcpu 
*vcpu)
 * control.  But such a configuration is useless, so
 * let's keep the code simple.
 */
-   if (!is_error_page(page)) {
-   vmx->nested.virtual_apic_page = page;
-   hpa = page_to_phys(vmx->nested.virtual_apic_page);
-   vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, hpa);
-   }
+   if (!kvm_vcpu_map(vcpu, 
gpa_to_gfn(vmcs12->virtual_apic_page_addr), map))
+   vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, 
pfn_to_hpa(map->pfn));
}
 
if (nested_cpu_has_posted_intr(vmcs12)) {
@@ -13353,10 +13343,7 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, 
u32 exit_reason,
kvm_release_page_dirty(vmx->nested.apic_access_page);

[PATCH v2] KVM/nVMX: Do not validate that posted_intr_desc_addr is page aligned

2018-10-20 Thread KarimAllah Ahmed
The spec only requires the posted interrupt descriptor address to be
64-byte aligned (i.e. bits[0:5] == 0). Using page_address_valid also
forces the address to be page aligned.

Only validate that the address does not cross the maximum physical address
without enforcing a page alignment.
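
For example, a descriptor address such as 0x12345040 is 64-byte aligned (low
6 bits clear) and thus architecturally valid, yet page_address_valid()
rejects it because it is not page aligned. A minimal sketch of the intended
check (the helper name is made up purely for illustration):

    /* Sketch: accept any 64-byte-aligned address below MAXPHYADDR. */
    static bool pi_desc_addr_ok(struct kvm_vcpu *vcpu, u64 addr)
    {
            if (addr & 0x3f)        /* bits[0:5] must be zero */
                    return false;
            /* must not exceed the guest's physical address width */
            return !(addr >> cpuid_maxphyaddr(vcpu));
    }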

v1 -> v2:
- Add a missing parenthesis (dropped while merging!)

Cc: Paolo Bonzini 
Cc: Radim Krčmář 
Cc: Thomas Gleixner 
Cc: Ingo Molnar 
Cc: Borislav Petkov 
Cc: H. Peter Anvin 
Cc: x...@kernel.org
Cc: k...@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
Fixes: 6de84e581c0 ("nVMX x86: check posted-interrupt descriptor addresss on 
vmentry of L2")
Signed-off-by: KarimAllah Ahmed 
---
 arch/x86/kvm/vmx.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 38f1a16..bb0fcdb 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -11667,7 +11667,7 @@ static int nested_vmx_check_apicv_controls(struct 
kvm_vcpu *vcpu,
!nested_exit_intr_ack_set(vcpu) ||
(vmcs12->posted_intr_nv & 0xff00) ||
(vmcs12->posted_intr_desc_addr & 0x3f) ||
-   (!page_address_valid(vcpu, vmcs12->posted_intr_desc_addr
+   (vmcs12->posted_intr_desc_addr >> cpuid_maxphyaddr(vcpu
return -EINVAL;
 
/* tpr shadow is needed by all apicv features. */
-- 
2.7.4



[PATCH v3 06/13] KVM/nVMX: Use kvm_vcpu_map when mapping the L1 MSR bitmap

2018-10-20 Thread KarimAllah Ahmed
Use kvm_vcpu_map when mapping the L1 MSR bitmap since using
kvm_vcpu_gpa_to_page() and kmap() will only work for guest memory that has
a "struct page".

Signed-off-by: KarimAllah Ahmed 
---
v1 -> v2:
- Do not change the lifecycle of the mapping (pbonzini)
---
 arch/x86/kvm/vmx.c | 14 --
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index d857401..5b15ca2 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -847,6 +847,9 @@ struct nested_vmx {
struct page *apic_access_page;
struct page *virtual_apic_page;
struct page *pi_desc_page;
+
+   struct kvm_host_map msr_bitmap_map;
+
struct pi_desc *pi_desc;
bool pi_pending;
u16 posted_intr_nv;
@@ -11546,9 +11549,10 @@ static inline bool 
nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu,
 struct vmcs12 *vmcs12)
 {
int msr;
-   struct page *page;
unsigned long *msr_bitmap_l1;
unsigned long *msr_bitmap_l0 = to_vmx(vcpu)->nested.vmcs02.msr_bitmap;
+   struct kvm_host_map *map = &to_vmx(vcpu)->nested.msr_bitmap_map;
+
/*
 * pred_cmd & spec_ctrl are trying to verify two things:
 *
@@ -11574,11 +11578,10 @@ static inline bool 
nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu,
!pred_cmd && !spec_ctrl)
return false;
 
-   page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->msr_bitmap);
-   if (is_error_page(page))
+   if (kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->msr_bitmap), map))
return false;
 
-   msr_bitmap_l1 = (unsigned long *)kmap(page);
+   msr_bitmap_l1 = (unsigned long *)map->hva;
if (nested_cpu_has_apic_reg_virt(vmcs12)) {
/*
 * L0 need not intercept reads for MSRs between 0x800 and 
0x8ff, it
@@ -11626,8 +11629,7 @@ static inline bool nested_vmx_prepare_msr_bitmap(struct 
kvm_vcpu *vcpu,
MSR_IA32_PRED_CMD,
MSR_TYPE_W);
 
-   kunmap(page);
-   kvm_release_page_clean(page);
+   kvm_vcpu_unmap(&to_vmx(vcpu)->nested.msr_bitmap_map);
 
return true;
 }
-- 
2.7.4



[PATCH v3 01/13] X86/nVMX: handle_vmon: Read 4 bytes from guest memory

2018-10-20 Thread KarimAllah Ahmed
Read the data directly from guest memory instead of the map->read->unmap
sequence. This also avoids using kvm_vcpu_gpa_to_page() and kmap(), which
assume that there is a "struct page" for guest memory.

Suggested-by: Jim Mattson 
Signed-off-by: KarimAllah Ahmed 
---
v1 -> v2:
- Massage commit message a bit.
---
 arch/x86/kvm/vmx.c | 14 +++---
 1 file changed, 3 insertions(+), 11 deletions(-)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 4f5d4bd..358759a 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -8321,7 +8321,7 @@ static int handle_vmon(struct kvm_vcpu *vcpu)
 {
int ret;
gpa_t vmptr;
-   struct page *page;
+   uint32_t revision;
struct vcpu_vmx *vmx = to_vmx(vcpu);
const u64 VMXON_NEEDED_FEATURES = FEATURE_CONTROL_LOCKED
| FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX;
@@ -8373,19 +8373,11 @@ static int handle_vmon(struct kvm_vcpu *vcpu)
return kvm_skip_emulated_instruction(vcpu);
}
 
-   page = kvm_vcpu_gpa_to_page(vcpu, vmptr);
-   if (is_error_page(page)) {
+   if (kvm_read_guest(vcpu->kvm, vmptr, &revision, sizeof(revision)) ||
+   revision != VMCS12_REVISION) {
nested_vmx_failInvalid(vcpu);
return kvm_skip_emulated_instruction(vcpu);
}
-   if (*(u32 *)kmap(page) != VMCS12_REVISION) {
-   kunmap(page);
-   kvm_release_page_clean(page);
-   nested_vmx_failInvalid(vcpu);
-   return kvm_skip_emulated_instruction(vcpu);
-   }
-   kunmap(page);
-   kvm_release_page_clean(page);
 
vmx->nested.vmxon_ptr = vmptr;
ret = enter_vmx_operation(vcpu);
-- 
2.7.4



[PATCH v3 05/13] KVM: Introduce a new guest mapping API

2018-10-20 Thread KarimAllah Ahmed
In KVM, especially for nested guests, there is a dominant pattern of:

=> map guest memory -> do_something -> unmap guest memory

In addition to all the boilerplate noise this adds to the code, most of the
time the mapping function does not properly handle memory that is not backed
by "struct page". This new guest mapping API encapsulates most of that
boilerplate and also handles guest memory that is not backed by
"struct page".

Keep in mind that memremap() is horribly slow, so this mapping API should not
be used for high-frequency mapping operations, but rather for low-frequency
mappings.
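
For callers, usage of the new API looks roughly like the sketch below (based
on the interface added in this patch; the gpa/offset handling is only an
illustration and the error code is simply propagated):

    struct kvm_host_map map;
    void *hva;

    /* Works for both struct-page backed and PFN-only guest memory. */
    if (kvm_vcpu_map(vcpu, gpa_to_gfn(gpa), &map))
            return -EFAULT;

    hva = map.hva + offset_in_page(gpa);
    /* ... read or modify the guest page through 'hva' ... */

    kvm_vcpu_unmap(&map);   /* kunmap()/memunmap() and mark the pfn dirty */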

Signed-off-by: KarimAllah Ahmed 
---
v1 -> v2:
- Drop the caching optimization (pbonzini)
- Use 'hva' instead of 'kaddr' (pbonzini)
- Return 0/-EINVAL/-EFAULT instead of true/false. -EFAULT will be used for
  AMD patch (pbonzini)
- Introduce __kvm_map_gfn which accepts a memory slot and use it (pbonzini)
- Only clear map->hva instead of memsetting the whole structure.
- Drop kvm_vcpu_map_valid since it is no longer used.
- Fix EXPORT_MODULE naming.
---
 include/linux/kvm_host.h |  9 +
 virt/kvm/kvm_main.c  | 50 
 2 files changed, 59 insertions(+)

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index c926698..59e56b8 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -205,6 +205,13 @@ enum {
READING_SHADOW_PAGE_TABLES,
 };
 
+struct kvm_host_map {
+   struct page *page;
+   void *hva;
+   kvm_pfn_t pfn;
+   kvm_pfn_t gfn;
+};
+
 /*
  * Sometimes a large or cross-page mmio needs to be broken up into separate
  * exits for userspace servicing.
@@ -708,6 +715,8 @@ struct kvm_memslots *kvm_vcpu_memslots(struct kvm_vcpu 
*vcpu);
 struct kvm_memory_slot *kvm_vcpu_gfn_to_memslot(struct kvm_vcpu *vcpu, gfn_t 
gfn);
 kvm_pfn_t kvm_vcpu_gfn_to_pfn_atomic(struct kvm_vcpu *vcpu, gfn_t gfn);
 kvm_pfn_t kvm_vcpu_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn);
+int kvm_vcpu_map(struct kvm_vcpu *vcpu, gpa_t gpa, struct kvm_host_map *map);
+void kvm_vcpu_unmap(struct kvm_host_map *map);
 struct page *kvm_vcpu_gfn_to_page(struct kvm_vcpu *vcpu, gfn_t gfn);
 unsigned long kvm_vcpu_gfn_to_hva(struct kvm_vcpu *vcpu, gfn_t gfn);
 unsigned long kvm_vcpu_gfn_to_hva_prot(struct kvm_vcpu *vcpu, gfn_t gfn, bool 
*writable);
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 94e931f..a79a3c4 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -1652,6 +1652,56 @@ struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn)
 }
 EXPORT_SYMBOL_GPL(gfn_to_page);
 
+static int __kvm_map_gfn(struct kvm_memory_slot *slot, gfn_t gfn,
+struct kvm_host_map *map)
+{
+   kvm_pfn_t pfn;
+   void *hva = NULL;
+   struct page *page = NULL;
+
+   pfn = gfn_to_pfn_memslot(slot, gfn);
+   if (is_error_noslot_pfn(pfn))
+   return -EINVAL;
+
+   if (pfn_valid(pfn)) {
+   page = pfn_to_page(pfn);
+   hva = kmap(page);
+   } else {
+   hva = memremap(pfn_to_hpa(pfn), PAGE_SIZE, MEMREMAP_WB);
+   }
+
+   if (!hva)
+   return -EFAULT;
+
+   map->page = page;
+   map->hva = hva;
+   map->pfn = pfn;
+   map->gfn = gfn;
+
+   return 0;
+}
+
+int kvm_vcpu_map(struct kvm_vcpu *vcpu, gfn_t gfn, struct kvm_host_map *map)
+{
+   return __kvm_map_gfn(kvm_vcpu_gfn_to_memslot(vcpu, gfn), gfn, map);
+}
+EXPORT_SYMBOL_GPL(kvm_vcpu_map);
+
+void kvm_vcpu_unmap(struct kvm_host_map *map)
+{
+   if (!map->hva)
+   return;
+
+   if (map->page)
+   kunmap(map->page);
+   else
+   memunmap(map->hva);
+
+   kvm_release_pfn_dirty(map->pfn);
+   map->hva = NULL;
+}
+EXPORT_SYMBOL_GPL(kvm_vcpu_unmap);
+
 struct page *kvm_vcpu_gfn_to_page(struct kvm_vcpu *vcpu, gfn_t gfn)
 {
kvm_pfn_t pfn;
-- 
2.7.4



[PATCH v3 03/13] X86/nVMX: Update the PML table without mapping and unmapping the page

2018-10-20 Thread KarimAllah Ahmed
Update the PML table without mapping and unmapping the page. This also
avoids using kvm_vcpu_gpa_to_page(..) which assumes that there is a "struct
page" for guest memory.

Signed-off-by: KarimAllah Ahmed 
---
v1 -> v2:
- Use kvm_write_guest_page instead of kvm_write_guest (pbonzini)
- Do not use pointer arithmetic for pml_address (pbonzini)
---
 arch/x86/kvm/vmx.c | 14 +-
 1 file changed, 5 insertions(+), 9 deletions(-)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index bc45347..d857401 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -13571,9 +13571,7 @@ static int vmx_write_pml_buffer(struct kvm_vcpu *vcpu)
 {
struct vmcs12 *vmcs12;
struct vcpu_vmx *vmx = to_vmx(vcpu);
-   gpa_t gpa;
-   struct page *page = NULL;
-   u64 *pml_address;
+   gpa_t gpa, dst;
 
if (is_guest_mode(vcpu)) {
WARN_ON_ONCE(vmx->nested.pml_full);
@@ -13593,15 +13591,13 @@ static int vmx_write_pml_buffer(struct kvm_vcpu *vcpu)
}
 
gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS) & ~0xFFFull;
+   dst = vmcs12->pml_address + sizeof(u64) * 
vmcs12->guest_pml_index;
 
-   page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->pml_address);
-   if (is_error_page(page))
+   if (kvm_write_guest_page(vcpu->kvm, gpa_to_gfn(dst), &gpa,
+offset_in_page(dst), sizeof(gpa)))
return 0;
 
-   pml_address = kmap(page);
-   pml_address[vmcs12->guest_pml_index--] = gpa;
-   kunmap(page);
-   kvm_release_page_clean(page);
+   vmcs12->guest_pml_index--;
}
 
return 0;
-- 
2.7.4



[PATCH v3 11/13] KVM/X86: hyperv: Use kvm_vcpu_map in synic_deliver_msg

2018-10-20 Thread KarimAllah Ahmed
Use kvm_vcpu_map in synic_deliver_msg since using kvm_vcpu_gpa_to_page()
and kmap() will only work for guest memory that has a "struct page".

Signed-off-by: KarimAllah Ahmed 
---
v1 -> v2:
- Update to match the new API return codes
---
 arch/x86/kvm/hyperv.c | 12 ++--
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c
index 8bdc78d..5310b8b 100644
--- a/arch/x86/kvm/hyperv.c
+++ b/arch/x86/kvm/hyperv.c
@@ -581,7 +581,7 @@ static int synic_deliver_msg(struct kvm_vcpu_hv_synic 
*synic, u32 sint,
 struct hv_message *src_msg)
 {
struct kvm_vcpu *vcpu = synic_to_vcpu(synic);
-   struct page *page;
+   struct kvm_host_map map;
gpa_t gpa;
struct hv_message *dst_msg;
int r;
@@ -591,11 +591,11 @@ static int synic_deliver_msg(struct kvm_vcpu_hv_synic 
*synic, u32 sint,
return -ENOENT;
 
gpa = synic->msg_page & PAGE_MASK;
-   page = kvm_vcpu_gfn_to_page(vcpu, gpa >> PAGE_SHIFT);
-   if (is_error_page(page))
+
+   if (kvm_vcpu_map(vcpu, gpa_to_gfn(gpa), &map))
return -EFAULT;
 
-   msg_page = kmap_atomic(page);
+   msg_page = map.hva;
dst_msg = &msg_page->sint_message[sint];
if (sync_cmpxchg(&dst_msg->header.message_type, HVMSG_NONE,
 src_msg->header.message_type) != HVMSG_NONE) {
@@ -612,8 +612,8 @@ static int synic_deliver_msg(struct kvm_vcpu_hv_synic 
*synic, u32 sint,
else if (r == 0)
r = -EFAULT;
}
-   kunmap_atomic(msg_page);
-   kvm_release_page_dirty(page);
+
+   kvm_vcpu_unmap(&map);
kvm_vcpu_mark_page_dirty(vcpu, gpa >> PAGE_SHIFT);
return r;
 }
-- 
2.7.4



[PATCH v3 10/13] KVM/X86: hyperv: Use kvm_vcpu_map in synic_clear_sint_msg_pending

2018-10-20 Thread KarimAllah Ahmed
Use kvm_vcpu_map in synic_clear_sint_msg_pending since using
kvm_vcpu_gpa_to_page() and kmap() will only work for guest memory that has
a "struct page".

Signed-off-by: KarimAllah Ahmed 
---
v1 -> v2:
- Update to match the new API return codes
---
 arch/x86/kvm/hyperv.c | 16 ++--
 1 file changed, 6 insertions(+), 10 deletions(-)

diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c
index 01d209a..8bdc78d 100644
--- a/arch/x86/kvm/hyperv.c
+++ b/arch/x86/kvm/hyperv.c
@@ -158,26 +158,22 @@ static void synic_clear_sint_msg_pending(struct 
kvm_vcpu_hv_synic *synic,
u32 sint)
 {
struct kvm_vcpu *vcpu = synic_to_vcpu(synic);
-   struct page *page;
-   gpa_t gpa;
+   struct kvm_host_map map;
struct hv_message *msg;
struct hv_message_page *msg_page;
 
-   gpa = synic->msg_page & PAGE_MASK;
-   page = kvm_vcpu_gfn_to_page(vcpu, gpa >> PAGE_SHIFT);
-   if (is_error_page(page)) {
+   if (kvm_vcpu_map(vcpu, gpa_to_gfn(synic->msg_page), &map)) {
vcpu_err(vcpu, "Hyper-V SynIC can't get msg page, gpa 0x%llx\n",
-gpa);
+synic->msg_page);
return;
}
-   msg_page = kmap_atomic(page);
 
+   msg_page = map.hva;
msg = &msg_page->sint_message[sint];
msg->header.message_flags.msg_pending = 0;
 
-   kunmap_atomic(msg_page);
-   kvm_release_page_dirty(page);
-   kvm_vcpu_mark_page_dirty(vcpu, gpa >> PAGE_SHIFT);
+   kvm_vcpu_unmap(&map);
+   kvm_vcpu_mark_page_dirty(vcpu, gpa_to_gfn(synic->msg_page));
 }
 
 static void kvm_hv_notify_acked_sint(struct kvm_vcpu *vcpu, u32 sint)
-- 
2.7.4



[PATCH v3 07/13] KVM/nVMX: Use kvm_vcpu_map when mapping the virtual APIC page

2018-10-20 Thread KarimAllah Ahmed
Use kvm_vcpu_map when mapping the virtual APIC page since using
kvm_vcpu_gpa_to_page() and kmap() will only work for guest memory that has
a "struct page".

One additional semantic change is that the virtual host mapping lifecycle
has changed a bit. It now has the same lifetime as the pinning of the
virtual APIC page on the host side.

Signed-off-by: KarimAllah Ahmed 
---
v1 -> v2:
- Do not change the lifecycle of the mapping (pbonzini)
- Use pfn_to_hpa instead of gfn_to_gpa
---
 arch/x86/kvm/vmx.c | 34 +++---
 1 file changed, 11 insertions(+), 23 deletions(-)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 5b15ca2..83a5e95 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -845,9 +845,8 @@ struct nested_vmx {
 * pointers, so we must keep them pinned while L2 runs.
 */
struct page *apic_access_page;
-   struct page *virtual_apic_page;
+   struct kvm_host_map virtual_apic_map;
struct page *pi_desc_page;
-
struct kvm_host_map msr_bitmap_map;
 
struct pi_desc *pi_desc;
@@ -6152,11 +6151,12 @@ static void vmx_complete_nested_posted_interrupt(struct 
kvm_vcpu *vcpu)
 
max_irr = find_last_bit((unsigned long *)vmx->nested.pi_desc->pir, 256);
if (max_irr != 256) {
-   vapic_page = kmap(vmx->nested.virtual_apic_page);
+   vapic_page = vmx->nested.virtual_apic_map.hva;
+   if (!vapic_page)
+   return;
+
__kvm_apic_update_irr(vmx->nested.pi_desc->pir,
vapic_page, &max_irr);
-   kunmap(vmx->nested.virtual_apic_page);
-
status = vmcs_read16(GUEST_INTR_STATUS);
if ((u8)max_irr > ((u8)status & 0xff)) {
status &= ~0xff;
@@ -8468,10 +8468,7 @@ static void free_nested(struct vcpu_vmx *vmx)
kvm_release_page_dirty(vmx->nested.apic_access_page);
vmx->nested.apic_access_page = NULL;
}
-   if (vmx->nested.virtual_apic_page) {
-   kvm_release_page_dirty(vmx->nested.virtual_apic_page);
-   vmx->nested.virtual_apic_page = NULL;
-   }
+   kvm_vcpu_unmap(&vmx->nested.virtual_apic_map);
if (vmx->nested.pi_desc_page) {
kunmap(vmx->nested.pi_desc_page);
kvm_release_page_dirty(vmx->nested.pi_desc_page);
@@ -11394,6 +11391,7 @@ static void nested_get_vmcs12_pages(struct kvm_vcpu 
*vcpu)
 {
struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
struct vcpu_vmx *vmx = to_vmx(vcpu);
+   struct kvm_host_map *map;
struct page *page;
u64 hpa;
 
@@ -11426,11 +11424,7 @@ static void nested_get_vmcs12_pages(struct kvm_vcpu 
*vcpu)
}
 
if (nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) {
-   if (vmx->nested.virtual_apic_page) { /* shouldn't happen */
-   kvm_release_page_dirty(vmx->nested.virtual_apic_page);
-   vmx->nested.virtual_apic_page = NULL;
-   }
-   page = kvm_vcpu_gpa_to_page(vcpu, 
vmcs12->virtual_apic_page_addr);
+   map = &vmx->nested.virtual_apic_map;
 
/*
 * If translation failed, VM entry will fail because
@@ -11445,11 +11439,8 @@ static void nested_get_vmcs12_pages(struct kvm_vcpu 
*vcpu)
 * control.  But such a configuration is useless, so
 * let's keep the code simple.
 */
-   if (!is_error_page(page)) {
-   vmx->nested.virtual_apic_page = page;
-   hpa = page_to_phys(vmx->nested.virtual_apic_page);
-   vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, hpa);
-   }
+   if (!kvm_vcpu_map(vcpu, 
gpa_to_gfn(vmcs12->virtual_apic_page_addr), map))
+   vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, 
pfn_to_hpa(map->pfn));
}
 
if (nested_cpu_has_posted_intr(vmcs12)) {
@@ -13353,10 +13344,7 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, 
u32 exit_reason,
kvm_release_page_dirty(vmx->nested.apic_access_page);
vmx->nested.apic_access_page = NULL;
}
-   if (vmx->nested.virtual_apic_page) {
-   kvm_release_page_dirty(vmx->nested.virtual_apic_page);
-   vmx->nested.virtual_apic_page = NULL;
-   }
+   kvm_vcpu_unmap(&vmx->nested.virtual_apic_map);
if (vmx->nested.pi_desc_page) {
kunmap(vmx->nested.pi_desc_page);
kvm_release_page_dirty(vmx->nested.pi_desc_page);
-- 
2.7.4



[PATCH v3 09/13] KVM/X86: Use kvm_vcpu_map in emulator_cmpxchg_emulated

2018-10-20 Thread KarimAllah Ahmed
Use kvm_vcpu_map in emulator_cmpxchg_emulated since using
kvm_vcpu_gpa_to_page() and kmap() will only work for guest memory that has
a "struct page".

Signed-off-by: KarimAllah Ahmed 
---
v1 -> v2:
- Update to match the new API return codes
---
 arch/x86/kvm/x86.c | 13 ++---
 1 file changed, 6 insertions(+), 7 deletions(-)

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index ca71773..f083e53 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -5280,9 +5280,9 @@ static int emulator_cmpxchg_emulated(struct 
x86_emulate_ctxt *ctxt,
 unsigned int bytes,
 struct x86_exception *exception)
 {
+   struct kvm_host_map map;
struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
gpa_t gpa;
-   struct page *page;
char *kaddr;
bool exchanged;
 
@@ -5299,12 +5299,11 @@ static int emulator_cmpxchg_emulated(struct 
x86_emulate_ctxt *ctxt,
if (((gpa + bytes - 1) & PAGE_MASK) != (gpa & PAGE_MASK))
goto emul_write;
 
-   page = kvm_vcpu_gfn_to_page(vcpu, gpa >> PAGE_SHIFT);
-   if (is_error_page(page))
+   if (kvm_vcpu_map(vcpu, gpa_to_gfn(gpa), &map))
goto emul_write;
 
-   kaddr = kmap_atomic(page);
-   kaddr += offset_in_page(gpa);
+   kaddr = map.hva + offset_in_page(gpa);
+
switch (bytes) {
case 1:
exchanged = CMPXCHG_TYPE(u8, kaddr, old, new);
@@ -5321,8 +5320,8 @@ static int emulator_cmpxchg_emulated(struct 
x86_emulate_ctxt *ctxt,
default:
BUG();
}
-   kunmap_atomic(kaddr);
-   kvm_release_page_dirty(page);
+
+   kvm_vcpu_unmap(&map);
 
if (!exchanged)
return X86EMUL_CMPXCHG_FAILED;
-- 
2.7.4



[PATCH v3 00/13] KVM/X86: Introduce a new guest mapping interface

2018-10-20 Thread KarimAllah Ahmed
Guest memory can either be directly managed by the kernel (i.e. have a "struct
page") or it can simply live outside kernel control (i.e. not have a "struct
page"). KVM mostly supports these two modes, except in a few places where the
code seems to assume that guest memory must have a "struct page".

This patchset introduces a new mapping interface to map guest memory into host
kernel memory which also supports PFN-based memory (i.e. memory without a
'struct page'). It also converts all offending code to this interface or to
simply read/write guest memory directly.

As far as I can see all offending code is now fixed except the APIC-access
page, which I will handle in a separate series along with dropping
kvm_vcpu_gfn_to_page and kvm_vcpu_gpa_to_page from the internal KVM API.

v2 -> v3:

- rebase
- Add a new patch to also fix the newly introduced shadow VMCS support for
  nested virtualization.

Filippo Sironi (1):
  X86/KVM: Handle PFNs outside of kernel reach when touching GPTEs

KarimAllah Ahmed (12):
  X86/nVMX: handle_vmon: Read 4 bytes from guest memory
  X86/nVMX: handle_vmptrld: Copy the VMCS12 directly from guest memory
  X86/nVMX: Update the PML table without mapping and unmapping the page
  KVM: Introduce a new guest mapping API
  KVM/nVMX: Use kvm_vcpu_map when mapping the L1 MSR bitmap
  KVM/nVMX: Use kvm_vcpu_map when mapping the virtual APIC page
  KVM/nVMX: Use kvm_vcpu_map when mapping the posted interrupt
descriptor table
  KVM/X86: Use kvm_vcpu_map in emulator_cmpxchg_emulated
  KVM/X86: hyperv: Use kvm_vcpu_map in synic_clear_sint_msg_pending
  KVM/X86: hyperv: Use kvm_vcpu_map in synic_deliver_msg
  KVM/nSVM: Use the new mapping API for mapping guest memory
  KVM/nVMX: Use kvm_vcpu_map for accessing the shadow VMCS

 arch/x86/kvm/hyperv.c  |  28 
 arch/x86/kvm/paging_tmpl.h |  38 ---
 arch/x86/kvm/svm.c |  97 +-
 arch/x86/kvm/vmx.c | 167 +
 arch/x86/kvm/x86.c |  13 ++--
 include/linux/kvm_host.h   |   9 +++
 virt/kvm/kvm_main.c|  50 ++
 7 files changed, 217 insertions(+), 185 deletions(-)

-- 
2.7.4



[PATCH v3 13/13] KVM/nVMX: Use kvm_vcpu_map for accessing the shadow VMCS

2018-10-20 Thread KarimAllah Ahmed
Use kvm_vcpu_map for accessing the shadow VMCS since using
kvm_vcpu_gpa_to_page() and kmap() will only work for guest memory that has
a "struct page".

Signed-off-by: KarimAllah Ahmed 
---
 arch/x86/kvm/vmx.c | 25 -
 1 file changed, 12 insertions(+), 13 deletions(-)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 0ae25c3..2892770 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -11616,20 +11616,20 @@ static inline bool 
nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu,
 static void nested_cache_shadow_vmcs12(struct kvm_vcpu *vcpu,
   struct vmcs12 *vmcs12)
 {
+   struct kvm_host_map map;
struct vmcs12 *shadow;
-   struct page *page;
 
if (!nested_cpu_has_shadow_vmcs(vmcs12) ||
vmcs12->vmcs_link_pointer == -1ull)
return;
 
shadow = get_shadow_vmcs12(vcpu);
-   page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->vmcs_link_pointer);
 
-   memcpy(shadow, kmap(page), VMCS12_SIZE);
+   if (kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->vmcs_link_pointer), &map))
+   return;
 
-   kunmap(page);
-   kvm_release_page_clean(page);
+   memcpy(shadow, map.hva, VMCS12_SIZE);
+   kvm_vcpu_unmap(&map);
 }
 
 static void nested_flush_cached_shadow_vmcs12(struct kvm_vcpu *vcpu,
@@ -12494,9 +12494,9 @@ static int check_vmentry_prereqs(struct kvm_vcpu *vcpu, 
struct vmcs12 *vmcs12)
 static int nested_vmx_check_vmcs_link_ptr(struct kvm_vcpu *vcpu,
  struct vmcs12 *vmcs12)
 {
-   int r;
-   struct page *page;
+   int r = 0;
struct vmcs12 *shadow;
+   struct kvm_host_map map;
 
if (vmcs12->vmcs_link_pointer == -1ull)
return 0;
@@ -12504,17 +12504,16 @@ static int nested_vmx_check_vmcs_link_ptr(struct 
kvm_vcpu *vcpu,
if (!page_address_valid(vcpu, vmcs12->vmcs_link_pointer))
return -EINVAL;
 
-   page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->vmcs_link_pointer);
-   if (is_error_page(page))
+   if (kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->vmcs_link_pointer), &map))
return -EINVAL;
 
-   r = 0;
-   shadow = kmap(page);
+   shadow = map.hva;
+
if (shadow->hdr.revision_id != VMCS12_REVISION ||
shadow->hdr.shadow_vmcs != nested_cpu_has_shadow_vmcs(vmcs12))
r = -EINVAL;
-   kunmap(page);
-   kvm_release_page_clean(page);
+
+   kvm_vcpu_unmap(&map);
return r;
 }
 
-- 
2.7.4



[PATCH v3 12/13] KVM/nSVM: Use the new mapping API for mapping guest memory

2018-10-20 Thread KarimAllah Ahmed
Use the new mapping API for mapping guest memory to avoid depending on
"struct page".

Signed-off-by: KarimAllah Ahmed 
---
 arch/x86/kvm/svm.c | 97 +++---
 1 file changed, 49 insertions(+), 48 deletions(-)

diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index d96092b..911d853 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -3036,32 +3036,6 @@ static inline bool nested_svm_nmi(struct vcpu_svm *svm)
return false;
 }
 
-static void *nested_svm_map(struct vcpu_svm *svm, u64 gpa, struct page **_page)
-{
-   struct page *page;
-
-   might_sleep();
-
-   page = kvm_vcpu_gfn_to_page(&svm->vcpu, gpa >> PAGE_SHIFT);
-   if (is_error_page(page))
-   goto error;
-
-   *_page = page;
-
-   return kmap(page);
-
-error:
-   kvm_inject_gp(&svm->vcpu, 0);
-
-   return NULL;
-}
-
-static void nested_svm_unmap(struct page *page)
-{
-   kunmap(page);
-   kvm_release_page_dirty(page);
-}
-
 static int nested_svm_intercept_ioio(struct vcpu_svm *svm)
 {
unsigned port, size, iopm_len;
@@ -3262,10 +3236,11 @@ static inline void copy_vmcb_control_area(struct vmcb 
*dst_vmcb, struct vmcb *fr
 
 static int nested_svm_vmexit(struct vcpu_svm *svm)
 {
+   int rc;
struct vmcb *nested_vmcb;
struct vmcb *hsave = svm->nested.hsave;
struct vmcb *vmcb = svm->vmcb;
-   struct page *page;
+   struct kvm_host_map map;
 
trace_kvm_nested_vmexit_inject(vmcb->control.exit_code,
   vmcb->control.exit_info_1,
@@ -3274,9 +3249,14 @@ static int nested_svm_vmexit(struct vcpu_svm *svm)
   vmcb->control.exit_int_info_err,
   KVM_ISA_SVM);
 
-   nested_vmcb = nested_svm_map(svm, svm->nested.vmcb, &page);
-   if (!nested_vmcb)
+   rc = kvm_vcpu_map(&svm->vcpu, gpa_to_gfn(svm->nested.vmcb), &map);
+   if (rc) {
+   if (rc == -EINVAL)
+   kvm_inject_gp(&svm->vcpu, 0);
return 1;
+   }
+
+   nested_vmcb = map.hva;
 
/* Exit Guest-Mode */
leave_guest_mode(&svm->vcpu);
@@ -3375,7 +3355,7 @@ static int nested_svm_vmexit(struct vcpu_svm *svm)
 
mark_all_dirty(svm->vmcb);
 
-   nested_svm_unmap(page);
+   kvm_vcpu_unmap(&map);
 
nested_svm_uninit_mmu_context(&svm->vcpu);
kvm_mmu_reset_context(&svm->vcpu);
@@ -3433,7 +3413,7 @@ static bool nested_vmcb_checks(struct vmcb *vmcb)
 }
 
 static void enter_svm_guest_mode(struct vcpu_svm *svm, u64 vmcb_gpa,
-struct vmcb *nested_vmcb, struct page *page)
+struct vmcb *nested_vmcb, struct kvm_host_map 
*map)
 {
if (kvm_get_rflags(&svm->vcpu) & X86_EFLAGS_IF)
svm->vcpu.arch.hflags |= HF_HIF_MASK;
@@ -3513,7 +3493,7 @@ static void enter_svm_guest_mode(struct vcpu_svm *svm, 
u64 vmcb_gpa,
svm->vmcb->control.event_inj = nested_vmcb->control.event_inj;
svm->vmcb->control.event_inj_err = nested_vmcb->control.event_inj_err;
 
-   nested_svm_unmap(page);
+   kvm_vcpu_unmap(map);
 
/* Enter Guest-Mode */
enter_guest_mode(&svm->vcpu);
@@ -3533,17 +3513,23 @@ static void enter_svm_guest_mode(struct vcpu_svm *svm, 
u64 vmcb_gpa,
 
 static bool nested_svm_vmrun(struct vcpu_svm *svm)
 {
+   int rc;
struct vmcb *nested_vmcb;
struct vmcb *hsave = svm->nested.hsave;
struct vmcb *vmcb = svm->vmcb;
-   struct page *page;
+   struct kvm_host_map map;
u64 vmcb_gpa;
 
vmcb_gpa = svm->vmcb->save.rax;
 
-   nested_vmcb = nested_svm_map(svm, svm->vmcb->save.rax, &page);
-   if (!nested_vmcb)
+   rc = kvm_vcpu_map(&svm->vcpu, gpa_to_gfn(vmcb_gpa), &map);
+   if (rc) {
+   if (rc == -EINVAL)
+   kvm_inject_gp(&svm->vcpu, 0);
return false;
+   }
+
+   nested_vmcb = map.hva;
 
if (!nested_vmcb_checks(nested_vmcb)) {
nested_vmcb->control.exit_code= SVM_EXIT_ERR;
@@ -3551,7 +3537,7 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm)
nested_vmcb->control.exit_info_1  = 0;
nested_vmcb->control.exit_info_2  = 0;
 
-   nested_svm_unmap(page);
+   kvm_vcpu_unmap(&map);
 
return false;
}
@@ -3595,7 +3581,7 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm)
 
copy_vmcb_control_area(hsave, vmcb);
 
-   enter_svm_guest_mode(svm, vmcb_gpa, nested_vmcb, page);
+   enter_svm_guest_mode(svm, vmcb_gpa, nested_vmcb, &map);
 
return true;
 }
@@ -3619,21 +3605,26 @@ static void nested_svm_vmloadsave

[PATCH v3 02/13] X86/nVMX: handle_vmptrld: Copy the VMCS12 directly from guest memory

2018-10-20 Thread KarimAllah Ahmed
Copy the VMCS12 directly from guest memory instead of the map->copy->unmap
sequence. This also avoids using kvm_vcpu_gpa_to_page() and kmap(), which
assume that there is a "struct page" for guest memory.

Signed-off-by: KarimAllah Ahmed 
---
v1 -> v2:
- Massage commit message a bit.
---
 arch/x86/kvm/vmx.c | 23 +--
 1 file changed, 9 insertions(+), 14 deletions(-)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 358759a..bc45347 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -8879,33 +8879,28 @@ static int handle_vmptrld(struct kvm_vcpu *vcpu)
}
 
if (vmx->nested.current_vmptr != vmptr) {
-   struct vmcs12 *new_vmcs12;
-   struct page *page;
-   page = kvm_vcpu_gpa_to_page(vcpu, vmptr);
-   if (is_error_page(page)) {
+   struct vmcs12 *new_vmcs12 = (struct vmcs12 
*)__get_free_page(GFP_KERNEL);
+
+   if (!new_vmcs12 ||
+   kvm_read_guest(vcpu->kvm, vmptr, new_vmcs12,
+  sizeof(*new_vmcs12))) {
+   free_page((unsigned long)new_vmcs12);
nested_vmx_failInvalid(vcpu);
return kvm_skip_emulated_instruction(vcpu);
}
-   new_vmcs12 = kmap(page);
+
if (new_vmcs12->hdr.revision_id != VMCS12_REVISION ||
(new_vmcs12->hdr.shadow_vmcs &&
 !nested_cpu_has_vmx_shadow_vmcs(vcpu))) {
-   kunmap(page);
-   kvm_release_page_clean(page);
+   free_page((unsigned long)new_vmcs12);
nested_vmx_failValid(vcpu,
VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID);
return kvm_skip_emulated_instruction(vcpu);
}
 
nested_release_vmcs12(vmx);
-   /*
-* Load VMCS12 from guest memory since it is not already
-* cached.
-*/
memcpy(vmx->nested.cached_vmcs12, new_vmcs12, VMCS12_SIZE);
-   kunmap(page);
-   kvm_release_page_clean(page);
-
+   free_page((unsigned long)new_vmcs12);
set_current_vmptr(vmx, vmptr);
}
 
-- 
2.7.4



[PATCH v3 08/13] KVM/nVMX: Use kvm_vcpu_map when mapping the posted interrupt descriptor table

2018-10-20 Thread KarimAllah Ahmed
Use kvm_vcpu_map when mapping the posted interrupt descriptor table since
using kvm_vcpu_gpa_to_page() and kmap() will only work for guest memory
that has a "struct page".

One additional semantic change is that the virtual host mapping lifecycle
has changed a bit. It now has the same lifetime as the pinning of the
interrupt descriptor table page on the host side.

Signed-off-by: KarimAllah Ahmed 
---
v1 -> v2:
- Do not change the lifecycle of the mapping (pbonzini)
---
 arch/x86/kvm/vmx.c | 45 +++--
 1 file changed, 15 insertions(+), 30 deletions(-)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 83a5e95..0ae25c3 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -846,7 +846,7 @@ struct nested_vmx {
 */
struct page *apic_access_page;
struct kvm_host_map virtual_apic_map;
-   struct page *pi_desc_page;
+   struct kvm_host_map pi_desc_map;
struct kvm_host_map msr_bitmap_map;
 
struct pi_desc *pi_desc;
@@ -8469,12 +8469,8 @@ static void free_nested(struct vcpu_vmx *vmx)
vmx->nested.apic_access_page = NULL;
}
kvm_vcpu_unmap(&vmx->nested.virtual_apic_map);
-   if (vmx->nested.pi_desc_page) {
-   kunmap(vmx->nested.pi_desc_page);
-   kvm_release_page_dirty(vmx->nested.pi_desc_page);
-   vmx->nested.pi_desc_page = NULL;
-   vmx->nested.pi_desc = NULL;
-   }
+   kvm_vcpu_unmap(&vmx->nested.pi_desc_map);
+   vmx->nested.pi_desc = NULL;
 
free_loaded_vmcs(&vmx->nested.vmcs02);
 }
@@ -11444,24 +11440,16 @@ static void nested_get_vmcs12_pages(struct kvm_vcpu 
*vcpu)
}
 
if (nested_cpu_has_posted_intr(vmcs12)) {
-   if (vmx->nested.pi_desc_page) { /* shouldn't happen */
-   kunmap(vmx->nested.pi_desc_page);
-   kvm_release_page_dirty(vmx->nested.pi_desc_page);
-   vmx->nested.pi_desc_page = NULL;
+   map = &vmx->nested.pi_desc_map;
+
+   if (!kvm_vcpu_map(vcpu, 
gpa_to_gfn(vmcs12->posted_intr_desc_addr), map)) {
+   vmx->nested.pi_desc =
+   (struct pi_desc *)(((void *)map->hva) +
+   offset_in_page(vmcs12->posted_intr_desc_addr));
+   vmcs_write64(POSTED_INTR_DESC_ADDR, 
pfn_to_hpa(map->pfn) +
+   
offset_in_page(vmcs12->posted_intr_desc_addr));
}
-   page = kvm_vcpu_gpa_to_page(vcpu, 
vmcs12->posted_intr_desc_addr);
-   if (is_error_page(page))
-   return;
-   vmx->nested.pi_desc_page = page;
-   vmx->nested.pi_desc = kmap(vmx->nested.pi_desc_page);
-   vmx->nested.pi_desc =
-   (struct pi_desc *)((void *)vmx->nested.pi_desc +
-   (unsigned long)(vmcs12->posted_intr_desc_addr &
-   (PAGE_SIZE - 1)));
-   vmcs_write64(POSTED_INTR_DESC_ADDR,
-   page_to_phys(vmx->nested.pi_desc_page) +
-   (unsigned long)(vmcs12->posted_intr_desc_addr &
-   (PAGE_SIZE - 1)));
+
}
if (nested_vmx_prepare_msr_bitmap(vcpu, vmcs12))
vmcs_set_bits(CPU_BASED_VM_EXEC_CONTROL,
@@ -13344,13 +13332,10 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, 
u32 exit_reason,
kvm_release_page_dirty(vmx->nested.apic_access_page);
vmx->nested.apic_access_page = NULL;
}
+
kvm_vcpu_unmap(&vmx->nested.virtual_apic_map);
-   if (vmx->nested.pi_desc_page) {
-   kunmap(vmx->nested.pi_desc_page);
-   kvm_release_page_dirty(vmx->nested.pi_desc_page);
-   vmx->nested.pi_desc_page = NULL;
-   vmx->nested.pi_desc = NULL;
-   }
+   kvm_vcpu_unmap(&vmx->nested.pi_desc_map);
+   vmx->nested.pi_desc = NULL;
 
/*
 * We are now running in L2, mmu_notifier will force to reload the
-- 
2.7.4



[PATCH v3 04/13] X86/KVM: Handle PFNs outside of kernel reach when touching GPTEs

2018-10-20 Thread KarimAllah Ahmed
From: Filippo Sironi 

cmpxchg_gpte() calls get_user_pages_fast() to retrieve the number of
pages and the respective struct page to map in the kernel virtual
address space.
This doesn't work if get_user_pages_fast() is invoked with a userspace
virtual address that's backed by PFNs outside of kernel reach (e.g., when
limiting the kernel memory with mem= in the command line and using
/dev/mem to map memory).

If get_user_pages_fast() fails, look up the VMA that backs the userspace
virtual address, compute the PFN and the physical address, and map it in
the kernel virtual address space with memremap().

Signed-off-by: Filippo Sironi 
Signed-off-by: KarimAllah Ahmed 
---
 arch/x86/kvm/paging_tmpl.h | 38 +-
 1 file changed, 29 insertions(+), 9 deletions(-)

diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 14ffd97..8f7bc8f 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -141,15 +141,35 @@ static int FNAME(cmpxchg_gpte)(struct kvm_vcpu *vcpu, 
struct kvm_mmu *mmu,
struct page *page;
 
npages = get_user_pages_fast((unsigned long)ptep_user, 1, 1, &page);
-   /* Check if the user is doing something meaningless. */
-   if (unlikely(npages != 1))
-   return -EFAULT;
-
-   table = kmap_atomic(page);
-   ret = CMPXCHG(&table[index], orig_pte, new_pte);
-   kunmap_atomic(table);
-
-   kvm_release_page_dirty(page);
+   if (likely(npages == 1)) {
+   table = kmap_atomic(page);
+   ret = CMPXCHG(&table[index], orig_pte, new_pte);
+   kunmap_atomic(table);
+
+   kvm_release_page_dirty(page);
+   } else {
+   struct vm_area_struct *vma;
+   unsigned long vaddr = (unsigned long)ptep_user & PAGE_MASK;
+   unsigned long pfn;
+   unsigned long paddr;
+
+   down_read(¤t->mm->mmap_sem);
+   vma = find_vma_intersection(current->mm, vaddr, vaddr + 
PAGE_SIZE);
+   if (!vma || !(vma->vm_flags & VM_PFNMAP)) {
+   up_read(¤t->mm->mmap_sem);
+   return -EFAULT;
+   }
+   pfn = ((vaddr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
+   paddr = pfn << PAGE_SHIFT;
+   table = memremap(paddr, PAGE_SIZE, MEMREMAP_WB);
+   if (!table) {
+   up_read(¤t->mm->mmap_sem);
+   return -EFAULT;
+   }
+   ret = CMPXCHG(&table[index], orig_pte, new_pte);
+   memunmap(table);
+   up_read(¤t->mm->mmap_sem);
+   }
 
return (ret != orig_pte);
 }
-- 
2.7.4



[PATCH] KVM/nVMX: Do not validate that posted_intr_desc_addr is page aligned

2018-10-20 Thread KarimAllah Ahmed
The spec only requires the posted interrupt descriptor address to be
64-byte aligned (i.e. bits[0:5] == 0). Using page_address_valid also
forces the address to be page aligned.

Only validate that the address does not cross the maximum physical address
without enforcing a page alignment.

Cc: Paolo Bonzini 
Cc: Radim Krčmář 
Cc: Thomas Gleixner 
Cc: Ingo Molnar 
Cc: Borislav Petkov 
Cc: H. Peter Anvin 
Cc: x...@kernel.org
Cc: k...@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
Fixes: 6de84e581c0 ("nVMX x86: check posted-interrupt descriptor addresss on 
vmentry of L2")
Signed-off-by: KarimAllah Ahmed 
---
 arch/x86/kvm/vmx.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 30bf860..47962f2 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -11668,7 +11668,7 @@ static int nested_vmx_check_apicv_controls(struct 
kvm_vcpu *vcpu,
!nested_exit_intr_ack_set(vcpu) ||
(vmcs12->posted_intr_nv & 0xff00) ||
(vmcs12->posted_intr_desc_addr & 0x3f) ||
-   (!page_address_valid(vcpu, vmcs12->posted_intr_desc_addr
+   (vmcs12->posted_intr_desc_addr >> cpuid_maxphyaddr(vcpu)))
return -EINVAL;
 
/* tpr shadow is needed by all apicv features. */
-- 
2.7.4



[PATCH] rcu: Benefit from expedited grace period in __wait_rcu_gp

2018-10-18 Thread KarimAllah Ahmed
When expedited grace periods are enabled, both synchronize_sched and
synchronize_rcu_bh can be optimized to have a significantly lower latency.

Improve wait_rcu_gp handling to also account for expedited grace periods.
The downside is that wait_rcu_gp will no longer wait for all RCU variants
concurrently when an expedited grace period is in effect; however, given the
improved latency this does not really matter.

Cc: Paul E. McKenney 
Cc: Josh Triplett 
Cc: Steven Rostedt 
Cc: Mathieu Desnoyers 
Cc: Lai Jiangshan 
Cc: linux-kernel@vger.kernel.org
Signed-off-by: KarimAllah Ahmed 
---
 kernel/rcu/update.c | 34 --
 1 file changed, 28 insertions(+), 6 deletions(-)

diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c
index 68fa19a..44b8817 100644
--- a/kernel/rcu/update.c
+++ b/kernel/rcu/update.c
@@ -392,13 +392,27 @@ void __wait_rcu_gp(bool checktiny, int n, call_rcu_func_t 
*crcu_array,
might_sleep();
continue;
}
-   init_rcu_head_on_stack(&rs_array[i].head);
-   init_completion(&rs_array[i].completion);
+
for (j = 0; j < i; j++)
if (crcu_array[j] == crcu_array[i])
break;
-   if (j == i)
-   (crcu_array[i])(&rs_array[i].head, wakeme_after_rcu);
+   if (j != i)
+   continue;
+
+   if ((crcu_array[i] == call_rcu_sched ||
+crcu_array[i] == call_rcu_bh)
+   && rcu_gp_is_expedited()) {
+   if (crcu_array[i] == call_rcu_sched)
+   synchronize_sched_expedited();
+   else
+   synchronize_rcu_bh_expedited();
+
+   continue;
+   }
+
+   init_rcu_head_on_stack(&rs_array[i].head);
+   init_completion(&rs_array[i].completion);
+   (crcu_array[i])(&rs_array[i].head, wakeme_after_rcu);
}
 
/* Wait for all callbacks to be invoked. */
@@ -407,11 +421,19 @@ void __wait_rcu_gp(bool checktiny, int n, call_rcu_func_t 
*crcu_array,
(crcu_array[i] == call_rcu ||
 crcu_array[i] == call_rcu_bh))
continue;
+
+   if ((crcu_array[i] == call_rcu_sched ||
+crcu_array[i] == call_rcu_bh)
+   && rcu_gp_is_expedited())
+   continue;
+
for (j = 0; j < i; j++)
if (crcu_array[j] == crcu_array[i])
break;
-   if (j == i)
-   wait_for_completion(&rs_array[i].completion);
+   if (j != i)
+   continue;
+
+   wait_for_completion(&rs_array[i].completion);
destroy_rcu_head_on_stack(&rs_array[i].head);
}
 }
-- 
2.7.4



[PATCH v2] PCI/IOV: Use VF0 cached config space size for other VFs

2018-10-10 Thread KarimAllah Ahmed
Cache the config space size from VF0 and use it for all other VFs instead
of reading it from the config space of each VF. We assume that it will be
the same across all associated VFs.

This is an optimization when enabling SR-IOV on a device with many VFs.

Cc: Bjorn Helgaas 
Cc: linux-...@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
Signed-off-by: KarimAllah Ahmed 

---
v1 -> v2:
- Drop the __pci_cfg_space_size (bhelgaas@)
- Extend pci_cfg_space_size to return the cached value for all VFs except
  VF0 (bhelgaas@)
---
 drivers/pci/iov.c   |  2 ++
 drivers/pci/pci.h   |  1 +
 drivers/pci/probe.c | 17 +
 3 files changed, 20 insertions(+)

diff --git a/drivers/pci/iov.c b/drivers/pci/iov.c
index c5f3cd4e..4238b53 100644
--- a/drivers/pci/iov.c
+++ b/drivers/pci/iov.c
@@ -133,6 +133,8 @@ static void pci_read_vf_config_common(struct pci_dev 
*virtfn)
 &physfn->sriov->subsystem_vendor);
pci_read_config_word(virtfn, PCI_SUBSYSTEM_ID,
 &physfn->sriov->subsystem_device);
+
+   physfn->sriov->cfg_size = pci_cfg_space_size(virtfn);
 }
 
 int pci_iov_add_virtfn(struct pci_dev *dev, int id)
diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h
index 6e0d152..2f14542 100644
--- a/drivers/pci/pci.h
+++ b/drivers/pci/pci.h
@@ -285,6 +285,7 @@ struct pci_sriov {
u16 driver_max_VFs; /* Max num VFs driver supports */
struct pci_dev  *dev;   /* Lowest numbered PF */
struct pci_dev  *self;  /* This PF */
+   u32 cfg_size;   /* VF config space size */
u32 class;  /* VF device */
u8  hdr_type;   /* VF header type */
u16 subsystem_vendor; /* VF subsystem vendor */
diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c
index 201f9e5..8c0f428 100644
--- a/drivers/pci/probe.c
+++ b/drivers/pci/probe.c
@@ -1438,12 +1438,29 @@ static int pci_cfg_space_size_ext(struct pci_dev *dev)
return PCI_CFG_SPACE_EXP_SIZE;
 }
 
+#ifdef CONFIG_PCI_ATS
+static bool is_vf0(struct pci_dev *dev)
+{
+   if (pci_iov_virtfn_devfn(dev->physfn, 0) == dev->devfn &&
+   pci_iov_virtfn_bus(dev->physfn, 0) == dev->bus->number)
+   return true;
+
+   return false;
+}
+#endif
+
 int pci_cfg_space_size(struct pci_dev *dev)
 {
int pos;
u32 status;
u16 class;
 
+#ifdef CONFIG_PCI_ATS
+   /* Read cached value for all VFs except for VF0 */
+   if (dev->is_virtfn && !is_vf0(dev))
+   return dev->physfn->sriov->cfg_size;
+#endif
+
if (dev->bus->bus_flags & PCI_BUS_FLAGS_NO_EXTCFG)
return PCI_CFG_SPACE_SIZE;
 
-- 
2.7.4



[PATCH v5 2/2] kvm: nVMX: Introduce KVM_CAP_NESTED_STATE

2018-07-10 Thread KarimAllah Ahmed
From: Jim Mattson 

For nested virtualization L0 KVM is managing a bit of state for L2 guests;
this state cannot be captured through the currently available IOCTLs. In
fact the state captured through all of these IOCTLs is usually a mix of L1
and L2 state. It is also dependent on whether the L2 guest was running at
the moment when the process was interrupted to save its state.

With this capability, there are two new vcpu ioctls: KVM_GET_NESTED_STATE
and KVM_SET_NESTED_STATE. These can be used for saving and restoring a VM
that is in VMX operation.
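
To make the intended flow concrete, a minimal userspace sketch follows (not
part of the patch; it assumes the uapi layout documented below, and the helper
names are purely illustrative):

#include <errno.h>
#include <stdlib.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

static struct kvm_nested_state *save_nested_state(int vcpu_fd)
{
	struct kvm_nested_state hdr = { .size = sizeof(hdr) };
	struct kvm_nested_state *state;

	/*
	 * An undersized buffer is expected to fail with E2BIG, with the
	 * required size written back into 'size'.
	 */
	if (!ioctl(vcpu_fd, KVM_GET_NESTED_STATE, &hdr))
		return NULL;	/* nothing beyond the header to save */
	if (errno != E2BIG)
		return NULL;

	state = calloc(1, hdr.size);
	if (!state)
		return NULL;
	state->size = hdr.size;

	if (ioctl(vcpu_fd, KVM_GET_NESTED_STATE, state)) {
		free(state);
		return NULL;
	}
	return state;
}

/* On the destination, after the vCPU has been created: */
static int restore_nested_state(int vcpu_fd, struct kvm_nested_state *state)
{
	return ioctl(vcpu_fd, KVM_SET_NESTED_STATE, state);
}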

Cc: Paolo Bonzini 
Cc: Radim Krčmář 
Cc: Thomas Gleixner 
Cc: Ingo Molnar 
Cc: H. Peter Anvin 
Cc: x...@kernel.org
Cc: k...@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Jim Mattson 
[karahmed@ - rename structs and functions and make them ready for AMD and
 address previous comments.
   - handle nested.smm state.
   - rebase & a bit of refactoring.
   - Merge 7/8 and 8/8 into one patch. ]
Signed-off-by: KarimAllah Ahmed 
---
v4 -> v5:
- Drop the update to KVM_REQUEST_ARCH_BASE in favor of a patch to switch to
  u64 instead.
- Fix commit message.
- Handle nested.smm state as well.
- rebase

v3 -> v4:
- Rename function to have _nested

v2 -> v3:
- Remove the forced VMExit from L2 after reading the kvm_state. The actual
  problem is solved.
- Rebase again!
- Set nested_run_pending during restore (not sure if it makes sense yet or
  not).
- Reduce KVM_REQUEST_ARCH_BASE to 7 instead of 8 (the other alternative is
  to switch everything to u64)

v1 -> v2:
- Rename structs and functions and make them ready for AMD and address
  previous comments.
- Rebase & a bit of refactoring.
- Merge 7/8 and 8/8 into one patch.
- Force a VMExit from L2 after reading the kvm_state to avoid mixed state
  between L1 and L2 on resurrecting the instance.
---
 Documentation/virtual/kvm/api.txt |  46 +
 arch/x86/include/asm/kvm_host.h   |   7 ++
 arch/x86/include/uapi/asm/kvm.h   |  45 +
 arch/x86/kvm/vmx.c| 202 +-
 arch/x86/kvm/x86.c|  21 
 include/uapi/linux/kvm.h  |   4 +
 6 files changed, 322 insertions(+), 3 deletions(-)

diff --git a/Documentation/virtual/kvm/api.txt 
b/Documentation/virtual/kvm/api.txt
index d10944e..925c509 100644
--- a/Documentation/virtual/kvm/api.txt
+++ b/Documentation/virtual/kvm/api.txt
@@ -3561,6 +3561,52 @@ Returns: 0 on success,
-ENOENT on deassign if the conn_id isn't registered
-EEXIST on assign if the conn_id is already registered
 
+4.114 KVM_GET_NESTED_STATE
+
+Capability: KVM_CAP_NESTED_STATE
+Architectures: x86
+Type: vcpu ioctl
+Parameters: struct kvm_nested_state (in/out)
+Returns: 0 on success, -1 on error
+Errors:
+  E2BIG: the data size exceeds the value of 'size' specified by
+ the user (the size required will be written into size).
+
+struct kvm_nested_state {
+   __u16 flags;
+   __u16 format;
+   __u32 size;
+   union {
+   struct kvm_vmx_nested_state vmx;
+   struct kvm_svm_nested_state svm;
+   __u8 pad[120];
+   };
+   __u8 data[0];
+};
+
+This ioctl copies the vcpu's kvm_nested_state struct from the kernel to 
userspace.
+
+4.115 KVM_SET_NESTED_STATE
+
+Capability: KVM_CAP_NESTED_STATE
+Architectures: x86
+Type: vcpu ioctl
+Parameters: struct kvm_nested_state (in)
+Returns: 0 on success, -1 on error
+
+struct kvm_nested_state {
+   __u16 flags;
+   __u16 format;
+   __u32 size;
+   union {
+   struct kvm_vmx_nested_state vmx;
+   struct kvm_svm_nested_state svm;
+   __u8 pad[120];
+   };
+   __u8 data[0];
+};
+
+This copies the vcpu's kvm_nested_state struct from userspace to the kernel.
 
 5. The kvm_run structure
 
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 0dab702..2e8eb08 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -75,6 +75,7 @@
 #define KVM_REQ_HV_EXITKVM_ARCH_REQ(21)
 #define KVM_REQ_HV_STIMER  KVM_ARCH_REQ(22)
 #define KVM_REQ_LOAD_EOI_EXITMAP   KVM_ARCH_REQ(23)
+#define KVM_REQ_GET_VMCS12_PAGES   KVM_ARCH_REQ(24)
 
 #define CR0_RESERVED_BITS   \
(~(unsigned long)(X86_CR0_PE | X86_CR0_MP | X86_CR0_EM | X86_CR0_TS \
@@ -1087,6 +1088,12 @@ struct kvm_x86_ops {
 
void (*setup_mce)(struct kvm_vcpu *vcpu);
 
+   int (*get_nested_state)(struct kvm_vcpu *vcpu,
+   struct kvm_nested_state __user 
*user_kvm_nested_state);
+   int (*set_nested_state)(struct kvm_vcpu *vcpu,
+   struct kvm_nested_state __user 
*user_kvm_nested_state);
+   void (*get_vmcs12_pages)(struct kvm_vcpu *vcpu);
+
int (*smi_allowed)(struct

[PATCH v5 1/2] KVM: Switch 'requests' to be 64-bit (explicitly)

2018-07-10 Thread KarimAllah Ahmed
Switch 'requests' to be explicitly 64-bit and update BUILD_BUG_ON check to
use the size of "requests" instead of the hard-coded '32'.

That gives us a bit more room again for arch-specific requests as we
already ran out of space for x86 due to the hard-coded check.

The only exception here is ARM32, as it is still 32-bit.
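
As a worked example of the extra room (illustrative only; the numbers follow
from the hunk below):

/*
 * With KVM_REQUEST_ARCH_BASE == 8, the old hard-coded check allowed arch
 * request numbers 0..23 (32 - 8 == 24 requests).  Sizing the check from
 * the now 64-bit 'requests' field allows 0..55 (64 - 8 == 56 requests):
 *
 *	BUILD_BUG_ON((unsigned)(nr) >=
 *		     (FIELD_SIZEOF(struct kvm_vcpu, requests) * 8) - KVM_REQUEST_ARCH_BASE);
 */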

Cc: Paolo Bonzini 
Cc: Radim Krčmář 
Cc: k...@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
Reviewed-by: Jim Mattson 
Signed-off-by: KarimAllah Ahmed 
---
v1 -> v2:
- Use FIELD_SIZEOF
---
 include/linux/kvm_host.h | 10 +-
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 4ee7bc5..64518a1 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -130,7 +130,7 @@ static inline bool is_error_page(struct page *page)
 #define KVM_REQUEST_ARCH_BASE 8
 
 #define KVM_ARCH_REQ_FLAGS(nr, flags) ({ \
-   BUILD_BUG_ON((unsigned)(nr) >= 32 - KVM_REQUEST_ARCH_BASE); \
+   BUILD_BUG_ON((unsigned)(nr) >= (FIELD_SIZEOF(struct kvm_vcpu, requests) 
* 8) - KVM_REQUEST_ARCH_BASE); \
(unsigned)(((nr) + KVM_REQUEST_ARCH_BASE) | (flags)); \
 })
 #define KVM_ARCH_REQ(nr)   KVM_ARCH_REQ_FLAGS(nr, 0)
@@ -224,7 +224,7 @@ struct kvm_vcpu {
int vcpu_id;
int srcu_idx;
int mode;
-   unsigned long requests;
+   u64 requests;
unsigned long guest_debug;
 
int pre_pcpu;
@@ -1124,7 +1124,7 @@ static inline void kvm_make_request(int req, struct 
kvm_vcpu *vcpu)
 * caller.  Paired with the smp_mb__after_atomic in kvm_check_request.
 */
smp_wmb();
-   set_bit(req & KVM_REQUEST_MASK, &vcpu->requests);
+   set_bit(req & KVM_REQUEST_MASK, (void *)&vcpu->requests);
 }
 
 static inline bool kvm_request_pending(struct kvm_vcpu *vcpu)
@@ -1134,12 +1134,12 @@ static inline bool kvm_request_pending(struct kvm_vcpu 
*vcpu)
 
 static inline bool kvm_test_request(int req, struct kvm_vcpu *vcpu)
 {
-   return test_bit(req & KVM_REQUEST_MASK, &vcpu->requests);
+   return test_bit(req & KVM_REQUEST_MASK, (void *)&vcpu->requests);
 }
 
 static inline void kvm_clear_request(int req, struct kvm_vcpu *vcpu)
 {
-   clear_bit(req & KVM_REQUEST_MASK, &vcpu->requests);
+   clear_bit(req & KVM_REQUEST_MASK, (void *)&vcpu->requests);
 }
 
 static inline bool kvm_check_request(int req, struct kvm_vcpu *vcpu)
-- 
2.7.4



[PATCH] x86/headers/UAPI: Move DISABLE_EXITS KVM capability bits to the UAPI

2018-04-16 Thread KarimAllah Ahmed
Move the DISABLE_EXITS KVM capability bits to the UAPI just like the rest of
the capabilities.
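
With the bits visible in the uapi header, a VMM can request the capability
without duplicating the definitions. A minimal sketch (assuming the existing
KVM_CAP_X86_DISABLE_EXITS capability, enabled VM-wide via KVM_ENABLE_CAP):

	struct kvm_enable_cap cap = {
		.cap = KVM_CAP_X86_DISABLE_EXITS,
		.args[0] = KVM_X86_DISABLE_EXITS_MWAIT | KVM_X86_DISABLE_EXITS_PAUSE,
	};

	if (ioctl(vm_fd, KVM_ENABLE_CAP, &cap))
		perror("KVM_ENABLE_CAP");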

Cc: Paolo Bonzini 
Cc: Radim Krčmář 
Cc: Thomas Gleixner 
Cc: Ingo Molnar 
Cc: H. Peter Anvin 
Cc: x...@kernel.org
Cc: k...@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
Signed-off-by: KarimAllah Ahmed 
---
 arch/x86/kvm/x86.h   | 7 ---
 include/uapi/linux/kvm.h | 7 +++
 2 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index 98d3503..acb1502 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -303,13 +303,6 @@ static inline u64 nsec_to_cycles(struct kvm_vcpu *vcpu, 
u64 nsec)
__rem;  \
 })
 
-#define KVM_X86_DISABLE_EXITS_MWAIT  (1 << 0)
-#define KVM_X86_DISABLE_EXITS_HTL(1 << 1)
-#define KVM_X86_DISABLE_EXITS_PAUSE  (1 << 2)
-#define KVM_X86_DISABLE_VALID_EXITS  (KVM_X86_DISABLE_EXITS_MWAIT | \
-  KVM_X86_DISABLE_EXITS_HTL | \
-  KVM_X86_DISABLE_EXITS_PAUSE)
-
 static inline bool kvm_mwait_in_guest(struct kvm *kvm)
 {
return kvm->arch.mwait_in_guest;
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index aefaf6c..077d16f 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -676,6 +676,13 @@ struct kvm_ioeventfd {
__u8  pad[36];
 };
 
+#define KVM_X86_DISABLE_EXITS_MWAIT  (1 << 0)
+#define KVM_X86_DISABLE_EXITS_HTL(1 << 1)
+#define KVM_X86_DISABLE_EXITS_PAUSE  (1 << 2)
+#define KVM_X86_DISABLE_VALID_EXITS  (KVM_X86_DISABLE_EXITS_MWAIT | \
+  KVM_X86_DISABLE_EXITS_HTL | \
+  KVM_X86_DISABLE_EXITS_PAUSE)
+
 /* for KVM_ENABLE_CAP */
 struct kvm_enable_cap {
/* in */
-- 
2.7.4



[PATCH v2 06/12] KVM/nVMX: Use kvm_vcpu_map when mapping the L1 MSR bitmap

2018-04-15 Thread KarimAllah Ahmed
Use kvm_vcpu_map when mapping the L1 MSR bitmap since using
kvm_vcpu_gpa_to_page() and kmap() will only work for guest memory that has
a "struct page".

Signed-off-by: KarimAllah Ahmed 
---
v1 -> v2:
- Do not change the lifecycle of the mapping (pbonzini)
---
 arch/x86/kvm/vmx.c | 14 --
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 179061d..6d335fc 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -494,6 +494,9 @@ struct nested_vmx {
struct page *apic_access_page;
struct page *virtual_apic_page;
struct page *pi_desc_page;
+
+   struct kvm_host_map msr_bitmap_map;
+
struct pi_desc *pi_desc;
bool pi_pending;
u16 posted_intr_nv;
@@ -10513,9 +10516,10 @@ static inline bool 
nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu,
 struct vmcs12 *vmcs12)
 {
int msr;
-   struct page *page;
unsigned long *msr_bitmap_l1;
unsigned long *msr_bitmap_l0 = to_vmx(vcpu)->nested.vmcs02.msr_bitmap;
+   struct kvm_host_map *map = &to_vmx(vcpu)->nested.msr_bitmap_map;
+
/*
 * pred_cmd & spec_ctrl are trying to verify two things:
 *
@@ -10541,11 +10545,10 @@ static inline bool 
nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu,
!pred_cmd && !spec_ctrl)
return false;
 
-   page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->msr_bitmap);
-   if (is_error_page(page))
+   if (kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->msr_bitmap), map))
return false;
 
-   msr_bitmap_l1 = (unsigned long *)kmap(page);
+   msr_bitmap_l1 = (unsigned long *)map->hva;
if (nested_cpu_has_apic_reg_virt(vmcs12)) {
/*
 * L0 need not intercept reads for MSRs between 0x800 and 
0x8ff, it
@@ -10593,8 +10596,7 @@ static inline bool nested_vmx_prepare_msr_bitmap(struct 
kvm_vcpu *vcpu,
MSR_IA32_PRED_CMD,
MSR_TYPE_W);
 
-   kunmap(page);
-   kvm_release_page_clean(page);
+   kvm_vcpu_unmap(&to_vmx(vcpu)->nested.msr_bitmap_map);
 
return true;
 }
-- 
2.7.4



[PATCH v2 07/12] KVM/nVMX: Use kvm_vcpu_map when mapping the virtual APIC page

2018-04-15 Thread KarimAllah Ahmed
Use kvm_vcpu_map when mapping the virtual APIC page since using
kvm_vcpu_gpa_to_page() and kmap() will only work for guest memory that has
a "struct page".

One additional semantic change is that the virtual host mapping lifecycle
has changed a bit. It now has the same lifetime as the pinning of the
virtual APIC page on the host side.

Signed-off-by: KarimAllah Ahmed 
---
v1 -> v2:
- Do not change the lifecycle of the mapping (pbonzini)
- Use pfn_to_hpa instead of gfn_to_gpa
---
 arch/x86/kvm/vmx.c | 34 +++---
 1 file changed, 11 insertions(+), 23 deletions(-)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 6d335fc..b55053a 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -492,9 +492,8 @@ struct nested_vmx {
 * pointers, so we must keep them pinned while L2 runs.
 */
struct page *apic_access_page;
-   struct page *virtual_apic_page;
+   struct kvm_host_map virtual_apic_map;
struct page *pi_desc_page;
-
struct kvm_host_map msr_bitmap_map;
 
struct pi_desc *pi_desc;
@@ -5572,11 +5571,12 @@ static void vmx_complete_nested_posted_interrupt(struct 
kvm_vcpu *vcpu)
 
max_irr = find_last_bit((unsigned long *)vmx->nested.pi_desc->pir, 256);
if (max_irr != 256) {
-   vapic_page = kmap(vmx->nested.virtual_apic_page);
+   vapic_page = vmx->nested.virtual_apic_map.hva;
+   if (!vapic_page)
+   return;
+
__kvm_apic_update_irr(vmx->nested.pi_desc->pir,
vapic_page, &max_irr);
-   kunmap(vmx->nested.virtual_apic_page);
-
status = vmcs_read16(GUEST_INTR_STATUS);
if ((u8)max_irr > ((u8)status & 0xff)) {
status &= ~0xff;
@@ -7806,10 +7806,7 @@ static void free_nested(struct vcpu_vmx *vmx)
kvm_release_page_dirty(vmx->nested.apic_access_page);
vmx->nested.apic_access_page = NULL;
}
-   if (vmx->nested.virtual_apic_page) {
-   kvm_release_page_dirty(vmx->nested.virtual_apic_page);
-   vmx->nested.virtual_apic_page = NULL;
-   }
+   kvm_vcpu_unmap(&vmx->nested.virtual_apic_map);
if (vmx->nested.pi_desc_page) {
kunmap(vmx->nested.pi_desc_page);
kvm_release_page_dirty(vmx->nested.pi_desc_page);
@@ -10358,6 +10355,7 @@ static void nested_get_vmcs12_pages(struct kvm_vcpu 
*vcpu)
 {
struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
struct vcpu_vmx *vmx = to_vmx(vcpu);
+   struct kvm_host_map *map;
struct page *page;
u64 hpa;
 
@@ -10395,11 +10393,7 @@ static void nested_get_vmcs12_pages(struct kvm_vcpu 
*vcpu)
}
 
if (nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) {
-   if (vmx->nested.virtual_apic_page) { /* shouldn't happen */
-   kvm_release_page_dirty(vmx->nested.virtual_apic_page);
-   vmx->nested.virtual_apic_page = NULL;
-   }
-   page = kvm_vcpu_gpa_to_page(vcpu, 
vmcs12->virtual_apic_page_addr);
+   map = &vmx->nested.virtual_apic_map;
 
/*
 * If translation failed, VM entry will fail because
@@ -10414,11 +10408,8 @@ static void nested_get_vmcs12_pages(struct kvm_vcpu 
*vcpu)
 * control.  But such a configuration is useless, so
 * let's keep the code simple.
 */
-   if (!is_error_page(page)) {
-   vmx->nested.virtual_apic_page = page;
-   hpa = page_to_phys(vmx->nested.virtual_apic_page);
-   vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, hpa);
-   }
+   if (!kvm_vcpu_map(vcpu, 
gpa_to_gfn(vmcs12->virtual_apic_page_addr), map))
+   vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, 
pfn_to_hpa(map->pfn));
}
 
if (nested_cpu_has_posted_intr(vmcs12)) {
@@ -12076,10 +12067,7 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, 
u32 exit_reason,
kvm_release_page_dirty(vmx->nested.apic_access_page);
vmx->nested.apic_access_page = NULL;
}
-   if (vmx->nested.virtual_apic_page) {
-   kvm_release_page_dirty(vmx->nested.virtual_apic_page);
-   vmx->nested.virtual_apic_page = NULL;
-   }
+   kvm_vcpu_unmap(&vmx->nested.virtual_apic_map);
if (vmx->nested.pi_desc_page) {
kunmap(vmx->nested.pi_desc_page);
kvm_release_page_dirty(vmx->nested.pi_desc_page);
-- 
2.7.4



[PATCH v2 02/12] X86/nVMX: handle_vmptrld: Copy the VMCS12 directly from guest memory

2018-04-15 Thread KarimAllah Ahmed
Copy the VMCS12 directly from guest memory instead of using the
map->copy->unmap sequence. This also avoids using kvm_vcpu_gpa_to_page() and
kmap(), which assume that there is a "struct page" for guest memory.

Signed-off-by: KarimAllah Ahmed 
---
v1 -> v2:
- Massage commit message a bit.
---
 arch/x86/kvm/vmx.c | 26 +++---
 1 file changed, 7 insertions(+), 19 deletions(-)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 77fc1ee..810ba7a 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -8156,30 +8156,18 @@ static int handle_vmptrld(struct kvm_vcpu *vcpu)
}
 
if (vmx->nested.current_vmptr != vmptr) {
-   struct vmcs12 *new_vmcs12;
-   struct page *page;
-   page = kvm_vcpu_gpa_to_page(vcpu, vmptr);
-   if (is_error_page(page)) {
-   nested_vmx_failInvalid(vcpu);
-   return kvm_skip_emulated_instruction(vcpu);
-   }
-   new_vmcs12 = kmap(page);
-   if (new_vmcs12->revision_id != VMCS12_REVISION) {
-   kunmap(page);
-   kvm_release_page_clean(page);
-   nested_vmx_failValid(vcpu,
-   VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID);
-   return kvm_skip_emulated_instruction(vcpu);
-   }
-
nested_release_vmcs12(vmx);
+
/*
 * Load VMCS12 from guest memory since it is not already
 * cached.
 */
-   memcpy(vmx->nested.cached_vmcs12, new_vmcs12, VMCS12_SIZE);
-   kunmap(page);
-   kvm_release_page_clean(page);
+   if (kvm_read_guest(vcpu->kvm, vmptr, vmx->nested.cached_vmcs12,
+  sizeof(*vmx->nested.cached_vmcs12)) ||
+   vmx->nested.cached_vmcs12->revision_id != VMCS12_REVISION) {
+   nested_vmx_failInvalid(vcpu);
+   return kvm_skip_emulated_instruction(vcpu);
+   }
 
set_current_vmptr(vmx, vmptr);
}
-- 
2.7.4



[PATCH v2 03/12] X86/nVMX: Update the PML table without mapping and unmapping the page

2018-04-15 Thread KarimAllah Ahmed
Update the PML table without mapping and unmapping the page. This also
avoids using kvm_vcpu_gpa_to_page(..) which assumes that there is a "struct
page" for guest memory.

Signed-off-by: KarimAllah Ahmed 
---
v1 -> v2:
- Use kvm_write_guest_page instead of kvm_write_guest (pbonzini)
- Do not use pointer arithmetic for pml_address (pbonzini)
---
 arch/x86/kvm/vmx.c | 14 +-
 1 file changed, 5 insertions(+), 9 deletions(-)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 810ba7a..179061d 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -12294,9 +12294,7 @@ static int vmx_write_pml_buffer(struct kvm_vcpu *vcpu)
 {
struct vmcs12 *vmcs12;
struct vcpu_vmx *vmx = to_vmx(vcpu);
-   gpa_t gpa;
-   struct page *page = NULL;
-   u64 *pml_address;
+   gpa_t gpa, dst;
 
if (is_guest_mode(vcpu)) {
WARN_ON_ONCE(vmx->nested.pml_full);
@@ -12316,15 +12314,13 @@ static int vmx_write_pml_buffer(struct kvm_vcpu *vcpu)
}
 
gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS) & ~0xFFFull;
+   dst = vmcs12->pml_address + sizeof(u64) * 
vmcs12->guest_pml_index;
 
-   page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->pml_address);
-   if (is_error_page(page))
+   if (kvm_write_guest_page(vcpu->kvm, gpa_to_gfn(dst), &gpa,
+offset_in_page(dst), sizeof(gpa)))
return 0;
 
-   pml_address = kmap(page);
-   pml_address[vmcs12->guest_pml_index--] = gpa;
-   kunmap(page);
-   kvm_release_page_clean(page);
+   vmcs12->guest_pml_index--;
}
 
return 0;
-- 
2.7.4



[PATCH v2 09/12] KVM/X86: Use kvm_vcpu_map in emulator_cmpxchg_emulated

2018-04-15 Thread KarimAllah Ahmed
Use kvm_vcpu_map in emulator_cmpxchg_emulated since using
kvm_vcpu_gpa_to_page() and kmap() will only work for guest memory that has
a "struct page".

Signed-off-by: KarimAllah Ahmed 
---
v1 -> v2:
- Update to match the new API return codes
---
 arch/x86/kvm/x86.c | 13 ++---
 1 file changed, 6 insertions(+), 7 deletions(-)

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 4633674..8d08c11 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -5162,9 +5162,9 @@ static int emulator_cmpxchg_emulated(struct 
x86_emulate_ctxt *ctxt,
 unsigned int bytes,
 struct x86_exception *exception)
 {
+   struct kvm_host_map map;
struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
gpa_t gpa;
-   struct page *page;
char *kaddr;
bool exchanged;
 
@@ -5181,12 +5181,11 @@ static int emulator_cmpxchg_emulated(struct 
x86_emulate_ctxt *ctxt,
if (((gpa + bytes - 1) & PAGE_MASK) != (gpa & PAGE_MASK))
goto emul_write;
 
-   page = kvm_vcpu_gfn_to_page(vcpu, gpa >> PAGE_SHIFT);
-   if (is_error_page(page))
+   if (kvm_vcpu_map(vcpu, gpa_to_gfn(gpa), &map))
goto emul_write;
 
-   kaddr = kmap_atomic(page);
-   kaddr += offset_in_page(gpa);
+   kaddr = map.hva + offset_in_page(gpa);
+
switch (bytes) {
case 1:
exchanged = CMPXCHG_TYPE(u8, kaddr, old, new);
@@ -5203,8 +5202,8 @@ static int emulator_cmpxchg_emulated(struct 
x86_emulate_ctxt *ctxt,
default:
BUG();
}
-   kunmap_atomic(kaddr);
-   kvm_release_page_dirty(page);
+
+   kvm_vcpu_unmap(&map);
 
if (!exchanged)
return X86EMUL_CMPXCHG_FAILED;
-- 
2.7.4



[PATCH v2 08/12] KVM/nVMX: Use kvm_vcpu_map when mapping the posted interrupt descriptor table

2018-04-15 Thread KarimAllah Ahmed
Use kvm_vcpu_map when mapping the posted interrupt descriptor table since
using kvm_vcpu_gpa_to_page() and kmap() will only work for guest memory
that has a "struct page".

One additional semantic change is that the virtual host mapping lifecycle
has changed a bit. It now has the same lifetime as the pinning of the
interrupt descriptor table page on the host side.

Signed-off-by: KarimAllah Ahmed 
---
v1 -> v2:
- Do not change the lifecycle of the mapping (pbonzini)
---
 arch/x86/kvm/vmx.c | 45 +++--
 1 file changed, 15 insertions(+), 30 deletions(-)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index b55053a..3dd8bb2 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -493,7 +493,7 @@ struct nested_vmx {
 */
struct page *apic_access_page;
struct kvm_host_map virtual_apic_map;
-   struct page *pi_desc_page;
+   struct kvm_host_map pi_desc_map;
struct kvm_host_map msr_bitmap_map;
 
struct pi_desc *pi_desc;
@@ -7807,12 +7807,8 @@ static void free_nested(struct vcpu_vmx *vmx)
vmx->nested.apic_access_page = NULL;
}
kvm_vcpu_unmap(&vmx->nested.virtual_apic_map);
-   if (vmx->nested.pi_desc_page) {
-   kunmap(vmx->nested.pi_desc_page);
-   kvm_release_page_dirty(vmx->nested.pi_desc_page);
-   vmx->nested.pi_desc_page = NULL;
-   vmx->nested.pi_desc = NULL;
-   }
+   kvm_vcpu_unmap(&vmx->nested.pi_desc_map);
+   vmx->nested.pi_desc = NULL;
 
free_loaded_vmcs(&vmx->nested.vmcs02);
 }
@@ -10413,24 +10409,16 @@ static void nested_get_vmcs12_pages(struct kvm_vcpu 
*vcpu)
}
 
if (nested_cpu_has_posted_intr(vmcs12)) {
-   if (vmx->nested.pi_desc_page) { /* shouldn't happen */
-   kunmap(vmx->nested.pi_desc_page);
-   kvm_release_page_dirty(vmx->nested.pi_desc_page);
-   vmx->nested.pi_desc_page = NULL;
+   map = &vmx->nested.pi_desc_map;
+
+   if (!kvm_vcpu_map(vcpu, 
gpa_to_gfn(vmcs12->posted_intr_desc_addr), map)) {
+   vmx->nested.pi_desc =
+   (struct pi_desc *)(((void *)map->hva) +
+   offset_in_page(vmcs12->posted_intr_desc_addr));
+   vmcs_write64(POSTED_INTR_DESC_ADDR, 
pfn_to_hpa(map->pfn) +
+   
offset_in_page(vmcs12->posted_intr_desc_addr));
}
-   page = kvm_vcpu_gpa_to_page(vcpu, 
vmcs12->posted_intr_desc_addr);
-   if (is_error_page(page))
-   return;
-   vmx->nested.pi_desc_page = page;
-   vmx->nested.pi_desc = kmap(vmx->nested.pi_desc_page);
-   vmx->nested.pi_desc =
-   (struct pi_desc *)((void *)vmx->nested.pi_desc +
-   (unsigned long)(vmcs12->posted_intr_desc_addr &
-   (PAGE_SIZE - 1)));
-   vmcs_write64(POSTED_INTR_DESC_ADDR,
-   page_to_phys(vmx->nested.pi_desc_page) +
-   (unsigned long)(vmcs12->posted_intr_desc_addr &
-   (PAGE_SIZE - 1)));
+
}
if (nested_vmx_prepare_msr_bitmap(vcpu, vmcs12))
vmcs_set_bits(CPU_BASED_VM_EXEC_CONTROL,
@@ -12067,13 +12055,10 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, 
u32 exit_reason,
kvm_release_page_dirty(vmx->nested.apic_access_page);
vmx->nested.apic_access_page = NULL;
}
+
kvm_vcpu_unmap(&vmx->nested.virtual_apic_map);
-   if (vmx->nested.pi_desc_page) {
-   kunmap(vmx->nested.pi_desc_page);
-   kvm_release_page_dirty(vmx->nested.pi_desc_page);
-   vmx->nested.pi_desc_page = NULL;
-   vmx->nested.pi_desc = NULL;
-   }
+   kvm_vcpu_unmap(&vmx->nested.pi_desc_map);
+   vmx->nested.pi_desc = NULL;
 
/*
 * We are now running in L2, mmu_notifier will force to reload the
-- 
2.7.4



[PATCH v2 05/12] KVM: Introduce a new guest mapping API

2018-04-15 Thread KarimAllah Ahmed
In KVM, especially for nested guests, there is a dominant pattern of:

=> map guest memory -> do_something -> unmap guest memory

In addition to all the noise this boilerplate adds to the code, most of the
time the mapping function does not properly handle memory that is not backed
by a "struct page". This new guest mapping API encapsulates most of that
boilerplate and also handles guest memory that is not backed by a
"struct page".

Keep in mind that memremap is horribly slow, so this mapping API should not
be used for high-frequency mapping operations, but rather for low-frequency
mappings.
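
To make the calling convention concrete, a minimal (hypothetical) caller
following the map -> do_something -> unmap pattern would look like this; it is
only a sketch assuming the interface introduced below:

static int example_write_guest_byte(struct kvm_vcpu *vcpu, gpa_t gpa, u8 val)
{
	struct kvm_host_map map;
	u8 *hva;

	/* Works for both struct-page backed and PFN-only guest memory. */
	if (kvm_vcpu_map(vcpu, gpa_to_gfn(gpa), &map))
		return -EFAULT;

	hva = (u8 *)map.hva + offset_in_page(gpa);
	*hva = val;

	/* kvm_vcpu_unmap() picks kunmap() or memunmap() based on map.page. */
	kvm_vcpu_unmap(&map);
	return 0;
}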

Signed-off-by: KarimAllah Ahmed 
---
v1 -> v2:
- Drop the caching optimization (pbonzini)
- Use 'hva' instead of 'kaddr' (pbonzini)
- Return 0/-EINVAL/-EFAULT instead of true/false. -EFAULT will be used for
  AMD patch (pbonzini)
- Introduce __kvm_map_gfn which accepts a memory slot and use it (pbonzini)
- Only clear map->hva instead of memsetting the whole structure.
- Drop kvm_vcpu_map_valid since it is no longer used.
- Fix EXPORT_MODULE naming.
---
 include/linux/kvm_host.h |  9 +
 virt/kvm/kvm_main.c  | 50 
 2 files changed, 59 insertions(+)

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index fe4f46b..15b9244 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -204,6 +204,13 @@ enum {
READING_SHADOW_PAGE_TABLES,
 };
 
+struct kvm_host_map {
+   struct page *page;
+   void *hva;
+   kvm_pfn_t pfn;
+   kvm_pfn_t gfn;
+};
+
 /*
  * Sometimes a large or cross-page mmio needs to be broken up into separate
  * exits for userspace servicing.
@@ -700,6 +707,8 @@ struct kvm_memslots *kvm_vcpu_memslots(struct kvm_vcpu 
*vcpu);
 struct kvm_memory_slot *kvm_vcpu_gfn_to_memslot(struct kvm_vcpu *vcpu, gfn_t 
gfn);
 kvm_pfn_t kvm_vcpu_gfn_to_pfn_atomic(struct kvm_vcpu *vcpu, gfn_t gfn);
 kvm_pfn_t kvm_vcpu_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn);
+int kvm_vcpu_map(struct kvm_vcpu *vcpu, gpa_t gpa, struct kvm_host_map *map);
+void kvm_vcpu_unmap(struct kvm_host_map *map);
 struct page *kvm_vcpu_gfn_to_page(struct kvm_vcpu *vcpu, gfn_t gfn);
 unsigned long kvm_vcpu_gfn_to_hva(struct kvm_vcpu *vcpu, gfn_t gfn);
 unsigned long kvm_vcpu_gfn_to_hva_prot(struct kvm_vcpu *vcpu, gfn_t gfn, bool 
*writable);
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index c7b2e92..70c3e56 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -1631,6 +1631,56 @@ struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn)
 }
 EXPORT_SYMBOL_GPL(gfn_to_page);
 
+static int __kvm_map_gfn(struct kvm_memory_slot *slot, gfn_t gfn,
+struct kvm_host_map *map)
+{
+   kvm_pfn_t pfn;
+   void *hva = NULL;
+   struct page *page = NULL;
+
+   pfn = gfn_to_pfn_memslot(slot, gfn);
+   if (is_error_noslot_pfn(pfn))
+   return -EINVAL;
+
+   if (pfn_valid(pfn)) {
+   page = pfn_to_page(pfn);
+   hva = kmap(page);
+   } else {
+   hva = memremap(pfn_to_hpa(pfn), PAGE_SIZE, MEMREMAP_WB);
+   }
+
+   if (!hva)
+   return -EFAULT;
+
+   map->page = page;
+   map->hva = hva;
+   map->pfn = pfn;
+   map->gfn = gfn;
+
+   return 0;
+}
+
+int kvm_vcpu_map(struct kvm_vcpu *vcpu, gfn_t gfn, struct kvm_host_map *map)
+{
+   return __kvm_map_gfn(kvm_vcpu_gfn_to_memslot(vcpu, gfn), gfn, map);
+}
+EXPORT_SYMBOL_GPL(kvm_vcpu_map);
+
+void kvm_vcpu_unmap(struct kvm_host_map *map)
+{
+   if (!map->hva)
+   return;
+
+   if (map->page)
+   kunmap(map->page);
+   else
+   memunmap(map->hva);
+
+   kvm_release_pfn_dirty(map->pfn);
+   map->hva = NULL;
+}
+EXPORT_SYMBOL_GPL(kvm_vcpu_unmap);
+
 struct page *kvm_vcpu_gfn_to_page(struct kvm_vcpu *vcpu, gfn_t gfn)
 {
kvm_pfn_t pfn;
-- 
2.7.4



[PATCH v2 11/12] KVM/X86: hyperv: Use kvm_vcpu_map in synic_deliver_msg

2018-04-15 Thread KarimAllah Ahmed
Use kvm_vcpu_map in synic_deliver_msg since using kvm_vcpu_gpa_to_page()
and kmap() will only work for guest memory that has a "struct page".

Signed-off-by: KarimAllah Ahmed 
---
v1 -> v2:
- Update to match the new API return codes
---
 arch/x86/kvm/hyperv.c | 12 ++--
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c
index fc33d8f..668dbd3 100644
--- a/arch/x86/kvm/hyperv.c
+++ b/arch/x86/kvm/hyperv.c
@@ -580,7 +580,7 @@ static int synic_deliver_msg(struct kvm_vcpu_hv_synic 
*synic, u32 sint,
 struct hv_message *src_msg)
 {
struct kvm_vcpu *vcpu = synic_to_vcpu(synic);
-   struct page *page;
+   struct kvm_host_map map;
gpa_t gpa;
struct hv_message *dst_msg;
int r;
@@ -590,11 +590,11 @@ static int synic_deliver_msg(struct kvm_vcpu_hv_synic 
*synic, u32 sint,
return -ENOENT;
 
gpa = synic->msg_page & PAGE_MASK;
-   page = kvm_vcpu_gfn_to_page(vcpu, gpa >> PAGE_SHIFT);
-   if (is_error_page(page))
+
+   if (kvm_vcpu_map(vcpu, gpa_to_gfn(gpa), &map))
return -EFAULT;
 
-   msg_page = kmap_atomic(page);
+   msg_page = map.hva;
dst_msg = &msg_page->sint_message[sint];
if (sync_cmpxchg(&dst_msg->header.message_type, HVMSG_NONE,
 src_msg->header.message_type) != HVMSG_NONE) {
@@ -611,8 +611,8 @@ static int synic_deliver_msg(struct kvm_vcpu_hv_synic 
*synic, u32 sint,
else if (r == 0)
r = -EFAULT;
}
-   kunmap_atomic(msg_page);
-   kvm_release_page_dirty(page);
+
+   kvm_vcpu_unmap(&map);
kvm_vcpu_mark_page_dirty(vcpu, gpa >> PAGE_SHIFT);
return r;
 }
-- 
2.7.4



[PATCH v2 01/12] X86/nVMX: handle_vmon: Read 4 bytes from guest memory

2018-04-15 Thread KarimAllah Ahmed
Read the data directly from guest memory instead of using the
map->read->unmap sequence. This also avoids using kvm_vcpu_gpa_to_page()
and kmap(), which assume that there is a "struct page" for guest memory.

Suggested-by: Jim Mattson 
Signed-off-by: KarimAllah Ahmed 
---
v1 -> v2:
- Massage commit message a bit.
---
 arch/x86/kvm/vmx.c | 14 +++---
 1 file changed, 3 insertions(+), 11 deletions(-)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 55ab0ca..77fc1ee 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -7672,7 +7672,7 @@ static int handle_vmon(struct kvm_vcpu *vcpu)
 {
int ret;
gpa_t vmptr;
-   struct page *page;
+   uint32_t revision;
struct vcpu_vmx *vmx = to_vmx(vcpu);
const u64 VMXON_NEEDED_FEATURES = FEATURE_CONTROL_LOCKED
| FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX;
@@ -7718,19 +7718,11 @@ static int handle_vmon(struct kvm_vcpu *vcpu)
return kvm_skip_emulated_instruction(vcpu);
}
 
-   page = kvm_vcpu_gpa_to_page(vcpu, vmptr);
-   if (is_error_page(page)) {
+   if (kvm_read_guest(vcpu->kvm, vmptr, &revision, sizeof(revision)) ||
+   revision != VMCS12_REVISION) {
nested_vmx_failInvalid(vcpu);
return kvm_skip_emulated_instruction(vcpu);
}
-   if (*(u32 *)kmap(page) != VMCS12_REVISION) {
-   kunmap(page);
-   kvm_release_page_clean(page);
-   nested_vmx_failInvalid(vcpu);
-   return kvm_skip_emulated_instruction(vcpu);
-   }
-   kunmap(page);
-   kvm_release_page_clean(page);
 
vmx->nested.vmxon_ptr = vmptr;
ret = enter_vmx_operation(vcpu);
-- 
2.7.4



[PATCH v2 10/12] KVM/X86: hyperv: Use kvm_vcpu_map in synic_clear_sint_msg_pending

2018-04-15 Thread KarimAllah Ahmed
Use kvm_vcpu_map in synic_clear_sint_msg_pending since using
kvm_vcpu_gpa_to_page() and kmap() will only work for guest memory that has
a "struct page".

Signed-off-by: KarimAllah Ahmed 
---
v1 -> v2:
- Update to match the new API return codes
---
 arch/x86/kvm/hyperv.c | 16 ++--
 1 file changed, 6 insertions(+), 10 deletions(-)

diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c
index 98618e3..fc33d8f 100644
--- a/arch/x86/kvm/hyperv.c
+++ b/arch/x86/kvm/hyperv.c
@@ -158,26 +158,22 @@ static void synic_clear_sint_msg_pending(struct 
kvm_vcpu_hv_synic *synic,
u32 sint)
 {
struct kvm_vcpu *vcpu = synic_to_vcpu(synic);
-   struct page *page;
-   gpa_t gpa;
+   struct kvm_host_map map;
struct hv_message *msg;
struct hv_message_page *msg_page;
 
-   gpa = synic->msg_page & PAGE_MASK;
-   page = kvm_vcpu_gfn_to_page(vcpu, gpa >> PAGE_SHIFT);
-   if (is_error_page(page)) {
+   if (kvm_vcpu_map(vcpu, gpa_to_gfn(synic->msg_page), &map)) {
vcpu_err(vcpu, "Hyper-V SynIC can't get msg page, gpa 0x%llx\n",
-gpa);
+synic->msg_page);
return;
}
-   msg_page = kmap_atomic(page);
 
+   msg_page = map.hva;
msg = &msg_page->sint_message[sint];
msg->header.message_flags.msg_pending = 0;
 
-   kunmap_atomic(msg_page);
-   kvm_release_page_dirty(page);
-   kvm_vcpu_mark_page_dirty(vcpu, gpa >> PAGE_SHIFT);
+   kvm_vcpu_unmap(&map);
+   kvm_vcpu_mark_page_dirty(vcpu, gpa_to_gfn(synic->msg_page));
 }
 
 static void kvm_hv_notify_acked_sint(struct kvm_vcpu *vcpu, u32 sint)
-- 
2.7.4



[PATCH v2 04/12] X86/KVM: Handle PFNs outside of kernel reach when touching GPTEs

2018-04-15 Thread KarimAllah Ahmed
From: Filippo Sironi 

cmpxchg_gpte() calls get_user_pages_fast() to retrieve the number of
pages and the respective struct page to map in the kernel virtual
address space.
This doesn't work if get_user_pages_fast() is invoked with a userspace
virtual address that's backed by PFNs outside of kernel reach (e.g., when
limiting the kernel memory with mem= in the command line and using
/dev/mem to map memory).

If get_user_pages_fast() fails, look up the VMA that backs the userspace
virtual address, compute the PFN and the physical address, and map it in
the kernel virtual address space with memremap().
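
For clarity, the fallback recovers the physical address as follows
(illustrative restatement of the hunk below):

/*
 * For a VM_PFNMAP vma (e.g. a /dev/mem mapping), the PFN backing a
 * userspace virtual address follows from the mapping offset:
 *
 *	pfn   = vma->vm_pgoff + ((vaddr - vma->vm_start) >> PAGE_SHIFT);
 *	paddr = pfn << PAGE_SHIFT;
 *
 * The PTE page is then mapped with memremap(paddr, PAGE_SIZE, MEMREMAP_WB)
 * instead of kmap(), since there is no struct page to kmap.
 */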

Signed-off-by: Filippo Sironi 
Signed-off-by: KarimAllah Ahmed 
---
 arch/x86/kvm/paging_tmpl.h | 38 +-
 1 file changed, 29 insertions(+), 9 deletions(-)

diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 6288e9d..31a37e4 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -141,15 +141,35 @@ static int FNAME(cmpxchg_gpte)(struct kvm_vcpu *vcpu, 
struct kvm_mmu *mmu,
struct page *page;
 
npages = get_user_pages_fast((unsigned long)ptep_user, 1, 1, &page);
-   /* Check if the user is doing something meaningless. */
-   if (unlikely(npages != 1))
-   return -EFAULT;
-
-   table = kmap_atomic(page);
-   ret = CMPXCHG(&table[index], orig_pte, new_pte);
-   kunmap_atomic(table);
-
-   kvm_release_page_dirty(page);
+   if (likely(npages == 1)) {
+   table = kmap_atomic(page);
+   ret = CMPXCHG(&table[index], orig_pte, new_pte);
+   kunmap_atomic(table);
+
+   kvm_release_page_dirty(page);
+   } else {
+   struct vm_area_struct *vma;
+   unsigned long vaddr = (unsigned long)ptep_user & PAGE_MASK;
+   unsigned long pfn;
+   unsigned long paddr;
+
+   down_read(¤t->mm->mmap_sem);
+   vma = find_vma_intersection(current->mm, vaddr, vaddr + 
PAGE_SIZE);
+   if (!vma || !(vma->vm_flags & VM_PFNMAP)) {
+   up_read(¤t->mm->mmap_sem);
+   return -EFAULT;
+   }
+   pfn = ((vaddr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
+   paddr = pfn << PAGE_SHIFT;
+   table = memremap(paddr, PAGE_SIZE, MEMREMAP_WB);
+   if (!table) {
+   up_read(¤t->mm->mmap_sem);
+   return -EFAULT;
+   }
+   ret = CMPXCHG(&table[index], orig_pte, new_pte);
+   memunmap(table);
+   up_read(¤t->mm->mmap_sem);
+   }
 
return (ret != orig_pte);
 }
-- 
2.7.4



[PATCH v2 12/12] KVM/nSVM: Use the new mapping API for mapping guest memory

2018-04-15 Thread KarimAllah Ahmed
Use the new mapping API for mapping guest memory to avoid depending on
"struct page".

Signed-off-by: KarimAllah Ahmed 
---
 arch/x86/kvm/svm.c | 97 +++---
 1 file changed, 49 insertions(+), 48 deletions(-)

diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index b4ade8d..1706eab 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -3026,32 +3026,6 @@ static inline bool nested_svm_nmi(struct vcpu_svm *svm)
return false;
 }
 
-static void *nested_svm_map(struct vcpu_svm *svm, u64 gpa, struct page **_page)
-{
-   struct page *page;
-
-   might_sleep();
-
-   page = kvm_vcpu_gfn_to_page(&svm->vcpu, gpa >> PAGE_SHIFT);
-   if (is_error_page(page))
-   goto error;
-
-   *_page = page;
-
-   return kmap(page);
-
-error:
-   kvm_inject_gp(&svm->vcpu, 0);
-
-   return NULL;
-}
-
-static void nested_svm_unmap(struct page *page)
-{
-   kunmap(page);
-   kvm_release_page_dirty(page);
-}
-
 static int nested_svm_intercept_ioio(struct vcpu_svm *svm)
 {
unsigned port, size, iopm_len;
@@ -3252,10 +3226,11 @@ static inline void copy_vmcb_control_area(struct vmcb 
*dst_vmcb, struct vmcb *fr
 
 static int nested_svm_vmexit(struct vcpu_svm *svm)
 {
+   int rc;
struct vmcb *nested_vmcb;
struct vmcb *hsave = svm->nested.hsave;
struct vmcb *vmcb = svm->vmcb;
-   struct page *page;
+   struct kvm_host_map map;
 
trace_kvm_nested_vmexit_inject(vmcb->control.exit_code,
   vmcb->control.exit_info_1,
@@ -3264,9 +3239,14 @@ static int nested_svm_vmexit(struct vcpu_svm *svm)
   vmcb->control.exit_int_info_err,
   KVM_ISA_SVM);
 
-   nested_vmcb = nested_svm_map(svm, svm->nested.vmcb, &page);
-   if (!nested_vmcb)
+   rc = kvm_vcpu_map(&svm->vcpu, gfn_to_gpa(svm->nested.vmcb), &map);
+   if (rc) {
+   if (rc == -EINVAL)
+   kvm_inject_gp(&svm->vcpu, 0);
return 1;
+   }
+
+   nested_vmcb = map.hva;
 
/* Exit Guest-Mode */
leave_guest_mode(&svm->vcpu);
@@ -3365,7 +3345,7 @@ static int nested_svm_vmexit(struct vcpu_svm *svm)
 
mark_all_dirty(svm->vmcb);
 
-   nested_svm_unmap(page);
+   kvm_vcpu_unmap(&map);
 
nested_svm_uninit_mmu_context(&svm->vcpu);
kvm_mmu_reset_context(&svm->vcpu);
@@ -3423,7 +3403,7 @@ static bool nested_vmcb_checks(struct vmcb *vmcb)
 }
 
 static void enter_svm_guest_mode(struct vcpu_svm *svm, u64 vmcb_gpa,
-struct vmcb *nested_vmcb, struct page *page)
+struct vmcb *nested_vmcb, struct kvm_host_map 
*map)
 {
if (kvm_get_rflags(&svm->vcpu) & X86_EFLAGS_IF)
svm->vcpu.arch.hflags |= HF_HIF_MASK;
@@ -3503,7 +3483,7 @@ static void enter_svm_guest_mode(struct vcpu_svm *svm, 
u64 vmcb_gpa,
svm->vmcb->control.event_inj = nested_vmcb->control.event_inj;
svm->vmcb->control.event_inj_err = nested_vmcb->control.event_inj_err;
 
-   nested_svm_unmap(page);
+   kvm_vcpu_unmap(map);
 
/* Enter Guest-Mode */
enter_guest_mode(&svm->vcpu);
@@ -3523,17 +3503,23 @@ static void enter_svm_guest_mode(struct vcpu_svm *svm, 
u64 vmcb_gpa,
 
 static bool nested_svm_vmrun(struct vcpu_svm *svm)
 {
+   int rc;
struct vmcb *nested_vmcb;
struct vmcb *hsave = svm->nested.hsave;
struct vmcb *vmcb = svm->vmcb;
-   struct page *page;
+   struct kvm_host_map map;
u64 vmcb_gpa;
 
vmcb_gpa = svm->vmcb->save.rax;
 
-   nested_vmcb = nested_svm_map(svm, svm->vmcb->save.rax, &page);
-   if (!nested_vmcb)
+   rc = kvm_vcpu_map(&svm->vcpu, gfn_to_gpa(vmcb_gpa), &map);
+   if (rc) {
+   if (rc == -EINVAL)
+   kvm_inject_gp(&svm->vcpu, 0);
return false;
+   }
+
+   nested_vmcb = map.hva;
 
if (!nested_vmcb_checks(nested_vmcb)) {
nested_vmcb->control.exit_code= SVM_EXIT_ERR;
@@ -3541,7 +3527,7 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm)
nested_vmcb->control.exit_info_1  = 0;
nested_vmcb->control.exit_info_2  = 0;
 
-   nested_svm_unmap(page);
+   kvm_vcpu_unmap(&map);
 
return false;
}
@@ -3585,7 +3571,7 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm)
 
copy_vmcb_control_area(hsave, vmcb);
 
-   enter_svm_guest_mode(svm, vmcb_gpa, nested_vmcb, page);
+   enter_svm_guest_mode(svm, vmcb_gpa, nested_vmcb, &map);
 
return true;
 }
@@ -3609,21 +3595,26 @@ static void nested_svm_vmloadsave

[PATCH v2 00/12] KVM/X86: Introduce a new guest mapping interface

2018-04-15 Thread KarimAllah Ahmed
Guest memory can either be directly managed by the kernel (i.e. have a "struct
page") or it can simply live outside kernel control (i.e. not have a
"struct page"). KVM mostly supports these two modes, except in a few places
where the code seems to assume that guest memory must have a "struct page".

This patchset introduces a new mapping interface to map guest memory into host
kernel memory, one that also supports PFN-based memory (i.e. memory without a
'struct page'). It also converts all offending code to this interface or makes
it read/write directly from guest memory.

As far as I can see, all offending code is now fixed except for the
APIC-access page, which I will handle in a separate patch.

Filippo Sironi (1):
  X86/KVM: Handle PFNs outside of kernel reach when touching GPTEs

KarimAllah Ahmed (11):
  X86/nVMX: handle_vmon: Read 4 bytes from guest memory
  X86/nVMX: handle_vmptrld: Copy the VMCS12 directly from guest memory
  X86/nVMX: Update the PML table without mapping and unmapping the page
  KVM: Introduce a new guest mapping API
  KVM/nVMX: Use kvm_vcpu_map when mapping the L1 MSR bitmap
  KVM/nVMX: Use kvm_vcpu_map when mapping the virtual APIC page
  KVM/nVMX: Use kvm_vcpu_map when mapping the posted interrupt
descriptor table
  KVM/X86: Use kvm_vcpu_map in emulator_cmpxchg_emulated
  KVM/X86: hyperv: Use kvm_vcpu_map in synic_clear_sint_msg_pending
  KVM/X86: hyperv: Use kvm_vcpu_map in synic_deliver_msg
  KVM/nSVM: Use the new mapping API for mapping guest memory

 arch/x86/kvm/hyperv.c  |  28 -
 arch/x86/kvm/paging_tmpl.h |  38 +---
 arch/x86/kvm/svm.c |  97 +++---
 arch/x86/kvm/vmx.c | 145 +++--
 arch/x86/kvm/x86.c |  13 ++--
 include/linux/kvm_host.h   |   9 +++
 virt/kvm/kvm_main.c|  50 
 7 files changed, 203 insertions(+), 177 deletions(-)

Cc: Paolo Bonzini 
Cc: Radim Krčmář 
Cc: k...@vger.kernel.org
Cc: linux-kernel@vger.kernel.org

-- 
2.7.4



[PATCH] KVM: Switch 'requests' to be 64-bit (explicitly)

2018-04-14 Thread KarimAllah Ahmed
Switch 'requests' to be explicitly 64-bit and update BUILD_BUG_ON check to
use the size of "requests" instead of the hard-coded '32'.

That gives us a bit more room again for arch-specific requests as we
already ran out of space for x86 due to the hard-coded check.

Cc: Paolo Bonzini 
Cc: Radim Krčmář 
Cc: k...@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
Signed-off-by: KarimAllah Ahmed 
---
 include/linux/kvm_host.h | 10 +-
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 6930c63..fe4f46b 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -129,7 +129,7 @@ static inline bool is_error_page(struct page *page)
 #define KVM_REQUEST_ARCH_BASE 8
 
 #define KVM_ARCH_REQ_FLAGS(nr, flags) ({ \
-   BUILD_BUG_ON((unsigned)(nr) >= 32 - KVM_REQUEST_ARCH_BASE); \
+   BUILD_BUG_ON((unsigned)(nr) >= (sizeof(((struct kvm_vcpu 
*)0)->requests) * 8) - KVM_REQUEST_ARCH_BASE); \
(unsigned)(((nr) + KVM_REQUEST_ARCH_BASE) | (flags)); \
 })
 #define KVM_ARCH_REQ(nr)   KVM_ARCH_REQ_FLAGS(nr, 0)
@@ -223,7 +223,7 @@ struct kvm_vcpu {
int vcpu_id;
int srcu_idx;
int mode;
-   unsigned long requests;
+   u64 requests;
unsigned long guest_debug;
 
int pre_pcpu;
@@ -1122,7 +1122,7 @@ static inline void kvm_make_request(int req, struct 
kvm_vcpu *vcpu)
 * caller.  Paired with the smp_mb__after_atomic in kvm_check_request.
 */
smp_wmb();
-   set_bit(req & KVM_REQUEST_MASK, &vcpu->requests);
+   set_bit(req & KVM_REQUEST_MASK, (void *)&vcpu->requests);
 }
 
 static inline bool kvm_request_pending(struct kvm_vcpu *vcpu)
@@ -1132,12 +1132,12 @@ static inline bool kvm_request_pending(struct kvm_vcpu 
*vcpu)
 
 static inline bool kvm_test_request(int req, struct kvm_vcpu *vcpu)
 {
-   return test_bit(req & KVM_REQUEST_MASK, &vcpu->requests);
+   return test_bit(req & KVM_REQUEST_MASK, (void *)&vcpu->requests);
 }
 
 static inline void kvm_clear_request(int req, struct kvm_vcpu *vcpu)
 {
-   clear_bit(req & KVM_REQUEST_MASK, &vcpu->requests);
+   clear_bit(req & KVM_REQUEST_MASK, (void *)&vcpu->requests);
 }
 
 static inline bool kvm_check_request(int req, struct kvm_vcpu *vcpu)
-- 
2.7.4



[PATCH v4] X86/KVM: Properly update 'tsc_offset' to represent the running guest

2018-04-13 Thread KarimAllah Ahmed
Update 'tsc_offset' on vmentry/vmexit of L2 guests to ensure that it always
captures the TSC_OFFSET of the running guest whether it is the L1 or L2
guest.
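
For reference, the invariant this maintains is roughly (illustrative, not part
of the diff):

/*
 * While L1 runs:  guest TSC = scaled host TSC + vcpu->arch.tsc_offset
 * While L2 runs:  the same, except vcpu->arch.tsc_offset has been bumped by
 *                 vmcs12->tsc_offset (VMX) or the nested VMCB's tsc_offset
 *                 (SVM) on nested vmentry, and is restored to the L1 value
 *                 on nested vmexit.
 *
 * read_l1_tsc_offset() undoes the L2 adjustment so that kvm_read_l1_tsc()
 * keeps returning L1's view of the TSC while L2 is running.
 */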

Cc: Jim Mattson 
Cc: Paolo Bonzini 
Cc: Radim Krčmář 
Cc: k...@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
Suggested-by: Paolo Bonzini 
Signed-off-by: KarimAllah Ahmed 
[AMD changes, fix update_ia32_tsc_adjust_msr. - Paolo]
Signed-off-by: Paolo Bonzini 

---
v3 -> v4:
- Restore L01 tsc_offset on enter_vmx_non_root_mode failures.
- Move tsc_offset update for L02 later in nested_vmx_run.

v2 -> v3:
- Add AMD bits as well.
- Fix update_ia32_tsc_adjust_msr.

v1 -> v2:
- Rewrote the patch to always update tsc_offset to represent the current
  guest (pbonzini@)
---
 arch/x86/include/asm/kvm_host.h |  1 +
 arch/x86/kvm/svm.c  | 17 -
 arch/x86/kvm/vmx.c  | 29 -
 arch/x86/kvm/x86.c  |  6 --
 4 files changed, 45 insertions(+), 8 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 7a200f6..a40a32e 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1016,6 +1016,7 @@ struct kvm_x86_ops {
 
bool (*has_wbinvd_exit)(void);
 
+   u64 (*read_l1_tsc_offset)(struct kvm_vcpu *vcpu);
void (*write_tsc_offset)(struct kvm_vcpu *vcpu, u64 offset);
 
void (*get_exit_info)(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2);
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index b58787d..1f00c18 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -1423,12 +1423,23 @@ static void init_sys_seg(struct vmcb_seg *seg, uint32_t 
type)
seg->base = 0;
 }
 
+static u64 svm_read_l1_tsc_offset(struct kvm_vcpu *vcpu)
+{
+   struct vcpu_svm *svm = to_svm(vcpu);
+
+   if (is_guest_mode(vcpu))
+   return svm->nested.hsave->control.tsc_offset;
+
+   return vcpu->arch.tsc_offset;
+}
+
 static void svm_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
 {
struct vcpu_svm *svm = to_svm(vcpu);
u64 g_tsc_offset = 0;
 
if (is_guest_mode(vcpu)) {
+   /* Write L1's TSC offset.  */
g_tsc_offset = svm->vmcb->control.tsc_offset -
   svm->nested.hsave->control.tsc_offset;
svm->nested.hsave->control.tsc_offset = offset;
@@ -3322,6 +,7 @@ static int nested_svm_vmexit(struct vcpu_svm *svm)
/* Restore the original control entries */
copy_vmcb_control_area(vmcb, hsave);
 
+   vcpu->arch.tsc_offset = svm->vmcb->control.tsc_offset;
kvm_clear_exception_queue(&svm->vcpu);
kvm_clear_interrupt_queue(&svm->vcpu);
 
@@ -3482,10 +3494,12 @@ static void enter_svm_guest_mode(struct vcpu_svm *svm, 
u64 vmcb_gpa,
/* We don't want to see VMMCALLs from a nested guest */
clr_intercept(svm, INTERCEPT_VMMCALL);
 
+   vcpu->arch.tsc_offset += nested_vmcb->control.tsc_offset;
+   svm->vmcb->control.tsc_offset = vcpu->arch.tsc_offset;
+
svm->vmcb->control.virt_ext = nested_vmcb->control.virt_ext;
svm->vmcb->control.int_vector = nested_vmcb->control.int_vector;
svm->vmcb->control.int_state = nested_vmcb->control.int_state;
-   svm->vmcb->control.tsc_offset += nested_vmcb->control.tsc_offset;
svm->vmcb->control.event_inj = nested_vmcb->control.event_inj;
svm->vmcb->control.event_inj_err = nested_vmcb->control.event_inj_err;
 
@@ -7102,6 +7116,7 @@ static struct kvm_x86_ops svm_x86_ops __ro_after_init = {
 
.has_wbinvd_exit = svm_has_wbinvd_exit,
 
+   .read_l1_tsc_offset = svm_read_l1_tsc_offset,
.write_tsc_offset = svm_write_tsc_offset,
 
.set_tdp_cr3 = set_tdp_cr3,
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index b6942de..05ba3c6 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -2885,6 +2885,17 @@ static void setup_msrs(struct vcpu_vmx *vmx)
vmx_update_msr_bitmap(&vmx->vcpu);
 }
 
+static u64 vmx_read_l1_tsc_offset(struct kvm_vcpu *vcpu)
+{
+   struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
+
+   if (is_guest_mode(vcpu) &&
+   (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING))
+   return vcpu->arch.tsc_offset - vmcs12->tsc_offset;
+
+   return vcpu->arch.tsc_offset;
+}
+
 /*
  * reads and returns guest's timestamp counter "register"
  * guest_tsc = (host_tsc * tsc multiplier) >> 48 + tsc_offset
@@ -2,11 +11123,8 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, 
struct vmcs12 *vmcs12,
vmcs_write64(GUEST_IA32_PAT, vmx->vcpu.arch.pat);
}
 
-   if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING)
-   vmcs_write64(

[PATCH v4 2/2] kvm: nVMX: Introduce KVM_CAP_STATE

2018-04-12 Thread KarimAllah Ahmed
From: Jim Mattson 

For nested virtualization, L0 KVM manages a fair amount of state for L2 guests
that cannot be captured through the currently available IOCTLs. In fact, the
state captured through all of these IOCTLs is usually a mix of L1 and L2
state. It is also dependent on whether the L2 guest was running at the moment
the process was interrupted to save its state.

With this capability, there are two new vcpu ioctls: KVM_GET_VMX_STATE and
KVM_SET_VMX_STATE. These can be used for saving and restoring a VM that is
in VMX operation.

Cc: Paolo Bonzini 
Cc: Radim Krčmář 
Cc: Thomas Gleixner 
Cc: Ingo Molnar 
Cc: H. Peter Anvin 
Cc: x...@kernel.org
Cc: k...@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Jim Mattson 
[karahmed@ - rename structs and functions and make them ready for AMD and
 address previous comments.
   - rebase & a bit of refactoring.
   - Merge 7/8 and 8/8 into one patch.
   - Force a VMExit from L2 after reading the kvm_state to avoid
 mixed state between L1 and L2 on resurrecting the instance. ]
Signed-off-by: KarimAllah Ahmed 
---
v3 -> v4:
- Rename function to have _nested

v2 -> v3:
- Remove the forced VMExit from L2 after reading the kvm_state. The actual
  problem is solved.
- Rebase again!
- Set nested_run_pending during restore (not sure if it makes sense yet or
  not).
- Reduce KVM_REQUEST_ARCH_BASE to 7 instead of 8 (the other alternative is
  to switch everything to u64)

v1 -> v2:
- Rename structs and functions and make them ready for AMD and address
  previous comments.
- Rebase & a bit of refactoring.
- Merge 7/8 and 8/8 into one patch.
- Force a VMExit from L2 after reading the kvm_state to avoid mixed state
  between L1 and L2 on resurrecting the instance.
---
 Documentation/virtual/kvm/api.txt |  46 ++
 arch/x86/include/asm/kvm_host.h   |   7 ++
 arch/x86/include/uapi/asm/kvm.h   |  38 
 arch/x86/kvm/vmx.c| 180 +-
 arch/x86/kvm/x86.c|  21 +
 include/linux/kvm_host.h  |   2 +-
 include/uapi/linux/kvm.h  |   4 +
 7 files changed, 293 insertions(+), 5 deletions(-)

diff --git a/Documentation/virtual/kvm/api.txt 
b/Documentation/virtual/kvm/api.txt
index 1c7958b..435e6cb 100644
--- a/Documentation/virtual/kvm/api.txt
+++ b/Documentation/virtual/kvm/api.txt
@@ -3548,6 +3548,52 @@ Returns: 0 on success,
-ENOENT on deassign if the conn_id isn't registered
-EEXIST on assign if the conn_id is already registered
 
+4.114 KVM_GET_NESTED_STATE
+
+Capability: KVM_CAP_NESTED_STATE
+Architectures: x86
+Type: vcpu ioctl
+Parameters: struct kvm_nested_state (in/out)
+Returns: 0 on success, -1 on error
+Errors:
+  E2BIG: the data size exceeds the value of 'size' specified by
+ the user (the size required will be written into size).
+
+struct kvm_nested_state {
+   __u16 flags;
+   __u16 format;
+   __u32 size;
+   union {
+   struct kvm_vmx_nested_state vmx;
+   struct kvm_svm_nested_state svm;
+   __u8 pad[120];
+   };
+   __u8 data[0];
+};
+
+This ioctl copies the vcpu's kvm_nested_state struct from the kernel to 
userspace.
+
+4.115 KVM_SET_NESTED_STATE
+
+Capability: KVM_CAP_NESTED_STATE
+Architectures: x86
+Type: vcpu ioctl
+Parameters: struct kvm_nested_state (in)
+Returns: 0 on success, -1 on error
+
+struct kvm_nested_state {
+   __u16 flags;
+   __u16 format;
+   __u32 size;
+   union {
+   struct kvm_vmx_nested_state vmx;
+   struct kvm_svm_nested_state svm;
+   __u8 pad[120];
+   };
+   __u8 data[0];
+};
+
+This copies the vcpu's kvm_nested_state struct from userspace to the kernel.
 
 5. The kvm_run structure
 
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 3bedfef..a40a32e 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -75,6 +75,7 @@
 #define KVM_REQ_HV_EXITKVM_ARCH_REQ(21)
 #define KVM_REQ_HV_STIMER  KVM_ARCH_REQ(22)
 #define KVM_REQ_LOAD_EOI_EXITMAP   KVM_ARCH_REQ(23)
+#define KVM_REQ_GET_VMCS12_PAGES   KVM_ARCH_REQ(24)
 
 #define CR0_RESERVED_BITS   \
(~(unsigned long)(X86_CR0_PE | X86_CR0_MP | X86_CR0_EM | X86_CR0_TS \
@@ -1085,6 +1086,12 @@ struct kvm_x86_ops {
 
void (*setup_mce)(struct kvm_vcpu *vcpu);
 
+   int (*get_nested_state)(struct kvm_vcpu *vcpu,
+   struct kvm_nested_state __user 
*user_kvm_nested_state);
+   int (*set_nested_state)(struct kvm_vcpu *vcpu,
+   struct kvm_nested_state __user 
*user_kvm_nested_state);
+   void (*get_vmcs12_pages)(struct kvm_vcpu *vcpu);
+
int (*smi_allowed)(struct kvm_vcpu *vcpu);
  

[PATCH v4 1/2] X86/KVM: Properly update 'tsc_offset' to represent the running guest

2018-04-12 Thread KarimAllah Ahmed
Update 'tsc_offset' on vmentry/vmexit of L2 guests to ensure that it always
captures the TSC_OFFSET of the running guest whether it is the L1 or L2
guest.

Cc: Jim Mattson 
Cc: Paolo Bonzini 
Cc: Radim Krčmář 
Cc: k...@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
Suggested-by: Paolo Bonzini 
Signed-off-by: KarimAllah Ahmed 
---
v1 -> v2:

- Rewrote the patch to always update tsc_offset to represent the current
  guest (pbonzini@)
---
 arch/x86/include/asm/kvm_host.h |  1 +
 arch/x86/kvm/vmx.c  | 25 -
 arch/x86/kvm/x86.c  |  9 -
 3 files changed, 29 insertions(+), 6 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 9fa4f57..3bedfef 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1015,6 +1015,7 @@ struct kvm_x86_ops {
 
bool (*has_wbinvd_exit)(void);
 
+   u64 (*read_l1_tsc_offset)(struct kvm_vcpu *vcpu);
void (*write_tsc_offset)(struct kvm_vcpu *vcpu, u64 offset);
 
void (*get_exit_info)(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2);
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index cff2f50..9e7dd39 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -2895,6 +2895,17 @@ static u64 guest_read_tsc(struct kvm_vcpu *vcpu)
return kvm_scale_tsc(vcpu, host_tsc) + tsc_offset;
 }
 
+static u64 vmx_read_l1_tsc_offset(struct kvm_vcpu *vcpu)
+{
+   struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
+
+   if (is_guest_mode(vcpu) &&
+   (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING))
+   return vcpu->arch.tsc_offset - vmcs12->tsc_offset;
+
+   return vcpu->arch.tsc_offset;
+}
+
 /*
  * writes 'offset' into guest's timestamp counter offset register
  */
@@ -11163,11 +11174,8 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, 
struct vmcs12 *vmcs12,
vmcs_write64(GUEST_IA32_PAT, vmx->vcpu.arch.pat);
}
 
-   if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING)
-   vmcs_write64(TSC_OFFSET,
-   vcpu->arch.tsc_offset + vmcs12->tsc_offset);
-   else
-   vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset);
+   vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset);
+
if (kvm_has_tsc_control)
decache_tsc_multiplier(vmx);
 
@@ -11469,6 +11477,9 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool 
launch)
if (enable_shadow_vmcs)
copy_shadow_to_vmcs12(vmx);
 
+   if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING)
+   vcpu->arch.tsc_offset += vmcs12->tsc_offset;
+
/*
 * The nested entry process starts with enforcing various prerequisites
 * on vmcs12 as required by the Intel SDM, and act appropriately when
@@ -12015,6 +12026,9 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, 
u32 exit_reason,
 
leave_guest_mode(vcpu);
 
+   if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING)
+   vcpu->arch.tsc_offset -= vmcs12->tsc_offset;
+
if (likely(!vmx->fail)) {
if (exit_reason == -1)
sync_vmcs12(vcpu, vmcs12);
@@ -12688,6 +12702,7 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = 
{
 
.has_wbinvd_exit = cpu_has_vmx_wbinvd_exit,
 
+   .read_l1_tsc_offset = vmx_read_l1_tsc_offset,
.write_tsc_offset = vmx_write_tsc_offset,
 
.set_tdp_cr3 = vmx_set_cr3,
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index ac42c85..3fb1353 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1532,7 +1532,14 @@ static u64 kvm_compute_tsc_offset(struct kvm_vcpu *vcpu, 
u64 target_tsc)
 
 u64 kvm_read_l1_tsc(struct kvm_vcpu *vcpu, u64 host_tsc)
 {
-   return vcpu->arch.tsc_offset + kvm_scale_tsc(vcpu, host_tsc);
+   u64 tsc_offset;
+
+   if (kvm_x86_ops->read_l1_tsc_offset)
+   tsc_offset = kvm_x86_ops->read_l1_tsc_offset(vcpu);
+   else
+   tsc_offset = vcpu->arch.tsc_offset;
+
+   return tsc_offset + kvm_scale_tsc(vcpu, host_tsc);
 }
 EXPORT_SYMBOL_GPL(kvm_read_l1_tsc);
 
-- 
2.7.4



[PATCH 2/2] kvm: nVMX: Introduce KVM_CAP_STATE

2018-04-12 Thread KarimAllah Ahmed
From: Jim Mattson 

For nested virtualization, L0 KVM manages a fair amount of state for L2 guests
that cannot be captured through the currently available IOCTLs. In fact, the
state captured through all of these IOCTLs is usually a mix of L1 and L2
state. It is also dependent on whether the L2 guest was running at the moment
the process was interrupted to save its state.

With this capability, there are two new vcpu ioctls: KVM_GET_VMX_STATE and
KVM_SET_VMX_STATE. These can be used for saving and restoring a VM that is
in VMX operation.

Cc: Paolo Bonzini 
Cc: Radim Krčmář 
Cc: Thomas Gleixner 
Cc: Ingo Molnar 
Cc: H. Peter Anvin 
Cc: x...@kernel.org
Cc: k...@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Jim Mattson 
[karahmed@ - rename structs and functions and make them ready for AMD and
 address previous comments.
   - rebase & a bit of refactoring.
   - Merge 7/8 and 8/8 into one patch.
   - Force a VMExit from L2 after reading the kvm_state to avoid
 mixed state between L1 and L2 on resurrecting the instance. ]
Signed-off-by: KarimAllah Ahmed 
---
v2 -> v3:
- Remove the forced VMExit from L2 after reading the kvm_state. The actual
  problem is solved.
- Rebase again!
- Set nested_run_pending during restore (not sure if it makes sense yet or
  not).
- Reduce KVM_REQUEST_ARCH_BASE to 7 instead of 8 (the other alternative is
  to switch everything to u64)

v1 -> v2:
- Rename structs and functions and make them ready for AMD and address
  previous comments.
- Rebase & a bit of refactoring.
- Merge 7/8 and 8/8 into one patch.
- Force a VMExit from L2 after reading the kvm_state to avoid mixed state
  between L1 and L2 on resurrecting the instance.
---
 Documentation/virtual/kvm/api.txt |  47 ++
 arch/x86/include/asm/kvm_host.h   |   7 ++
 arch/x86/include/uapi/asm/kvm.h   |  38 
 arch/x86/kvm/vmx.c| 177 +-
 arch/x86/kvm/x86.c|  21 +
 include/linux/kvm_host.h  |   2 +-
 include/uapi/linux/kvm.h  |   5 ++
 7 files changed, 292 insertions(+), 5 deletions(-)

diff --git a/Documentation/virtual/kvm/api.txt 
b/Documentation/virtual/kvm/api.txt
index 1c7958b..c51d5d3 100644
--- a/Documentation/virtual/kvm/api.txt
+++ b/Documentation/virtual/kvm/api.txt
@@ -3548,6 +3548,53 @@ Returns: 0 on success,
-ENOENT on deassign if the conn_id isn't registered
-EEXIST on assign if the conn_id is already registered
 
+4.114 KVM_GET_STATE
+
+Capability: KVM_CAP_STATE
+Architectures: x86
+Type: vcpu ioctl
+Parameters: struct kvm_state (in/out)
+Returns: 0 on success, -1 on error
+Errors:
+  E2BIG: the data size exceeds the value of 'size' specified by
+ the user (the size required will be written into size).
+
+struct kvm_state {
+   __u16 flags;
+   __u16 format;
+   __u32 size;
+   union {
+   struct kvm_vmx_state vmx;
+   struct kvm_svm_state svm;
+   __u8 pad[120];
+   };
+   __u8 data[0];
+};
+
+This ioctl copies the vcpu's kvm_state struct from the kernel to userspace.
+
+4.115 KVM_SET_STATE
+
+Capability: KVM_CAP_STATE
+Architectures: x86
+Type: vcpu ioctl
+Parameters: struct kvm_state (in)
+Returns: 0 on success, -1 on error
+
+struct kvm_state {
+   __u16 flags;
+   __u16 format;
+   __u32 size;
+   union {
+   struct kvm_vmx_state vmx;
+   struct kvm_svm_state svm;
+   __u8 pad[120];
+   };
+   __u8 data[0];
+};
+
+This copies the vcpu's kvm_state struct from userspace to the kernel.
+>>>>>>> 13a7c9e... kvm: nVMX: Introduce KVM_CAP_STATE
 
 5. The kvm_run structure
 
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 9fa4f57..ad2116a 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -75,6 +75,7 @@
 #define KVM_REQ_HV_EXITKVM_ARCH_REQ(21)
 #define KVM_REQ_HV_STIMER  KVM_ARCH_REQ(22)
 #define KVM_REQ_LOAD_EOI_EXITMAP   KVM_ARCH_REQ(23)
+#define KVM_REQ_GET_VMCS12_PAGES   KVM_ARCH_REQ(24)
 
 #define CR0_RESERVED_BITS   \
(~(unsigned long)(X86_CR0_PE | X86_CR0_MP | X86_CR0_EM | X86_CR0_TS \
@@ -1084,6 +1085,12 @@ struct kvm_x86_ops {
 
void (*setup_mce)(struct kvm_vcpu *vcpu);
 
+   int (*get_state)(struct kvm_vcpu *vcpu,
+struct kvm_state __user *user_kvm_state);
+   int (*set_state)(struct kvm_vcpu *vcpu,
+struct kvm_state __user *user_kvm_state);
+   void (*get_vmcs12_pages)(struct kvm_vcpu *vcpu);
+
int (*smi_allowed)(struct kvm_vcpu *vcpu);
int (*pre_enter_smm)(struct kvm_vcpu *vcpu, char *smstate);
	int (*pre_leave_smm)(struct kvm_vcpu *vcpu, u64 smbase);

[PATCH 1/2] X86/KVM: Properly restore 'tsc_offset' when running an L2 guest

2018-04-12 Thread KarimAllah Ahmed
When the TSC MSR is captured while an L2 guest is running and then restored,
the 'tsc_offset' ends up capturing the L02 TSC_OFFSET instead of the L01
TSC_OFFSET. So ensure that this is compensated for when storing the value.
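
As a made-up numeric example of the problem: suppose L0 computed an L01
TSC_OFFSET of 500 and L1 programmed vmcs12->tsc_offset = 100, so
vmcs02.TSC_OFFSET is 600 while L2 runs. If the save path captures 600 as
'tsc_offset' and it is later restored as if it were the L01 value, L1's TSC
jumps by 100 after the restore. Compensating by vmcs12->tsc_offset keeps the
stored value at the L01 offset of 500.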

Cc: Jim Mattson 
Cc: Paolo Bonzini 
Cc: Radim Krčmář 
Cc: k...@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
Signed-off-by: KarimAllah Ahmed 
---
 arch/x86/kvm/vmx.c | 12 +---
 arch/x86/kvm/x86.c |  1 -
 2 files changed, 9 insertions(+), 4 deletions(-)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index cff2f50..2f57571 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -2900,6 +2900,8 @@ static u64 guest_read_tsc(struct kvm_vcpu *vcpu)
  */
 static void vmx_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
 {
+   u64 l1_tsc_offset = 0;
+
if (is_guest_mode(vcpu)) {
/*
 * We're here if L1 chose not to trap WRMSR to TSC. According
@@ -2908,16 +2910,20 @@ static void vmx_write_tsc_offset(struct kvm_vcpu *vcpu, 
u64 offset)
 * to the newly set TSC to get L2's TSC.
 */
struct vmcs12 *vmcs12;
+
/* recalculate vmcs02.TSC_OFFSET: */
vmcs12 = get_vmcs12(vcpu);
-   vmcs_write64(TSC_OFFSET, offset +
-   (nested_cpu_has(vmcs12, CPU_BASED_USE_TSC_OFFSETING) ?
-vmcs12->tsc_offset : 0));
+
+   l1_tsc_offset = nested_cpu_has(vmcs12, 
CPU_BASED_USE_TSC_OFFSETING) ?
+   vmcs12->tsc_offset : 0;
+   vmcs_write64(TSC_OFFSET, offset + l1_tsc_offset);
} else {
trace_kvm_write_tsc_offset(vcpu->vcpu_id,
   vmcs_read64(TSC_OFFSET), offset);
vmcs_write64(TSC_OFFSET, offset);
}
+
+   vcpu->arch.tsc_offset = offset - l1_tsc_offset;
 }
 
 /*
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index ac42c85..1a2ed92 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1539,7 +1539,6 @@ EXPORT_SYMBOL_GPL(kvm_read_l1_tsc);
 static void kvm_vcpu_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
 {
kvm_x86_ops->write_tsc_offset(vcpu, offset);
-   vcpu->arch.tsc_offset = offset;
 }
 
 static inline bool kvm_check_tsc_unstable(void)
-- 
2.7.4



[PATCH] X86/KVM: Do not allow DISABLE_EXITS_MWAIT when LAPIC ARAT is not available

2018-04-11 Thread KarimAllah Ahmed
If the processor does not have an "Always Running APIC Timer" (aka ARAT),
we should not give guests direct access to MWAIT. The LAPIC timer would
stop ticking in deep C-states, so any host deadlines would not wake up the
host kernel.

The host kernel intel_idle driver handles this by switching to broadcast
mode when ARAT is not available and MWAIT is issued with a deep C-state
that would stop the LAPIC timer. When MWAIT is passed through, we can not
tell when MWAIT is issued.

So just disable this capability when LAPIC ARAT is not available. I am not
even sure whether there are any CPUs with VMX support but without LAPIC ARAT.
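
For reference, this is the check that userspace hits when it tries to let the
guest execute MWAIT natively; a minimal sketch of that side (assuming the
existing KVM_CAP_X86_DISABLE_EXITS / KVM_X86_DISABLE_EXITS_MWAIT uapi names,
which this patch only gates, and an already-created VM fd):

#include <string.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

static int allow_mwait_in_guest(int vm_fd)
{
	struct kvm_enable_cap cap;

	/* The extension check returns a mask of exits we may disable;
	 * with this patch, MWAIT is absent from it when there is no ARAT. */
	if (!(ioctl(vm_fd, KVM_CHECK_EXTENSION, KVM_CAP_X86_DISABLE_EXITS) &
	      KVM_X86_DISABLE_EXITS_MWAIT))
		return -1;

	memset(&cap, 0, sizeof(cap));
	cap.cap = KVM_CAP_X86_DISABLE_EXITS;
	cap.args[0] = KVM_X86_DISABLE_EXITS_MWAIT;
	return ioctl(vm_fd, KVM_ENABLE_CAP, &cap);
}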

Cc: Paolo Bonzini 
Cc: Radim Krčmář 
Cc: Thomas Gleixner 
Cc: Ingo Molnar 
Cc: H. Peter Anvin 
Cc: x...@kernel.org
Cc: k...@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
Reported-by: Wanpeng Li 
Signed-off-by: KarimAllah Ahmed 
---
 arch/x86/kvm/x86.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index b2ff74b..0334b25 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -2819,7 +2819,8 @@ static int msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs 
__user *user_msrs,
 static inline bool kvm_can_mwait_in_guest(void)
 {
return boot_cpu_has(X86_FEATURE_MWAIT) &&
-   !boot_cpu_has_bug(X86_BUG_MONITOR);
+   !boot_cpu_has_bug(X86_BUG_MONITOR) &&
+   boot_cpu_has(X86_FEATURE_ARAT);
 }
 
 int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
-- 
2.7.4



[PATCH v3] X86/VMX: Disable VMX preemption timer if MWAIT is not intercepted

2018-04-10 Thread KarimAllah Ahmed
The VMX-preemption timer is used by KVM as a way to set deadlines for the
guest (i.e. timer emulation). That was safe till very recently when
capability KVM_X86_DISABLE_EXITS_MWAIT to disable intercepting MWAIT was
introduced. According to Intel SDM 25.5.1:

"""
The VMX-preemption timer operates in the C-states C0, C1, and C2; it also
operates in the shutdown and wait-for-SIPI states. If the timer counts down
to zero in any state other than the wait-for SIPI state, the logical
processor transitions to the C0 C-state and causes a VM exit; the timer
does not cause a VM exit if it counts down to zero in the wait-for-SIPI
state. The timer is not decremented in C-states deeper than C2.
"""

Now, once the guest issues MWAIT with a C-state deeper than C2, the
preemption timer will never wake it up again since it has stopped ticking!
Usually this is compensated for by other activity in the system that wakes
the core from the deep C-state (and causes a VMExit), for example if the
host itself is ticking or receives interrupts.

So disable the VMX-preemption timer if MWAIT is exposed to the guest!
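
Returning -EOPNOTSUPP is enough because of how the LAPIC timer code already
picks a backend; paraphrasing the existing arch/x86/kvm/lapic.c logic (not
part of this patch):

static void restart_apic_timer(struct kvm_lapic *apic)
{
	preempt_disable();
	/* start_hv_timer() fails when ->set_hv_timer() errors out,
	 * so the guest timer falls back to the hrtimer emulation. */
	if (!start_hv_timer(apic))
		start_sw_timer(apic);
	preempt_enable();
}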

Cc: Paolo Bonzini 
Cc: Radim Krčmář 
Cc: Thomas Gleixner 
Cc: Ingo Molnar 
Cc: H. Peter Anvin 
Cc: x...@kernel.org
Cc: k...@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
Signed-off-by: KarimAllah Ahmed 
---
v2 -> v3:
- return -EOPNOTSUPP before any other operation in vmx_set_hv_timer

v1 -> v2:
- Drop everything .. just return -EOPNOTSUPP (pbonzini@) :D
---
 arch/x86/kvm/vmx.c | 14 ++
 1 file changed, 10 insertions(+), 4 deletions(-)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index d2e54e7..31a4204 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -11903,10 +11903,16 @@ static inline int u64_shl_div_u64(u64 a, unsigned int 
shift,
 
 static int vmx_set_hv_timer(struct kvm_vcpu *vcpu, u64 guest_deadline_tsc)
 {
-   struct vcpu_vmx *vmx = to_vmx(vcpu);
-   u64 tscl = rdtsc();
-   u64 guest_tscl = kvm_read_l1_tsc(vcpu, tscl);
-   u64 delta_tsc = max(guest_deadline_tsc, guest_tscl) - guest_tscl;
+   struct vcpu_vmx *vmx;
+   u64 tscl, guest_tscl, delta_tsc;
+
+   if (kvm_pause_in_guest(vcpu->kvm))
+   return -EOPNOTSUPP;
+
+   vmx = to_vmx(vcpu);
+   tscl = rdtsc();
+   guest_tscl = kvm_read_l1_tsc(vcpu, tscl);
+   delta_tsc = max(guest_deadline_tsc, guest_tscl) - guest_tscl;
 
/* Convert to host delta tsc if tsc scaling is enabled */
if (vcpu->arch.tsc_scaling_ratio != kvm_default_tsc_scaling_ratio &&
-- 
2.7.4



[PATCH v2] X86/VMX: Disable VMX preemption timer if MWAIT is not intercepted

2018-04-10 Thread KarimAllah Ahmed
The VMX-preemption timer is used by KVM as a way to set deadlines for the
guest (i.e. timer emulation). That was safe till very recently when
capability KVM_X86_DISABLE_EXITS_MWAIT to disable intercepting MWAIT was
introduced. According to Intel SDM 25.5.1:

"""
The VMX-preemption timer operates in the C-states C0, C1, and C2; it also
operates in the shutdown and wait-for-SIPI states. If the timer counts down
to zero in any state other than the wait-for SIPI state, the logical
processor transitions to the C0 C-state and causes a VM exit; the timer
does not cause a VM exit if it counts down to zero in the wait-for-SIPI
state. The timer is not decremented in C-states deeper than C2.
"""

Now, once the guest issues MWAIT with a C-state deeper than C2, the
preemption timer will never wake it up again since it has stopped ticking!
Usually this is compensated for by other activity in the system that wakes
the core from the deep C-state (and causes a VMExit), for example if the
host itself is ticking or receives interrupts.

So disable the VMX-preemption timer if MWAIT is exposed to the guest!

Cc: Paolo Bonzini 
Cc: Radim Krčmář 
Cc: Thomas Gleixner 
Cc: Ingo Molnar 
Cc: H. Peter Anvin 
Cc: x...@kernel.org
Cc: k...@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
Signed-off-by: KarimAllah Ahmed 
---
v1 -> v2:
- Drop everything .. just return -EOPNOTSUPP (pbonzini@) :D
---
 arch/x86/kvm/vmx.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index d2e54e7..d99008b 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -11908,6 +11908,9 @@ static int vmx_set_hv_timer(struct kvm_vcpu *vcpu, u64 
guest_deadline_tsc)
u64 guest_tscl = kvm_read_l1_tsc(vcpu, tscl);
u64 delta_tsc = max(guest_deadline_tsc, guest_tscl) - guest_tscl;
 
+   if (kvm_pause_in_guest(vcpu->kvm))
+   return -EOPNOTSUPP;
+
/* Convert to host delta tsc if tsc scaling is enabled */
if (vcpu->arch.tsc_scaling_ratio != kvm_default_tsc_scaling_ratio &&
u64_shl_div_u64(delta_tsc,
-- 
2.7.4



[PATCH] X86/VMX: Disable VMX preemption timer if MWAIT is not intercepted

2018-04-10 Thread KarimAllah Ahmed
The VMX-preemption timer is used by KVM as a way to set deadlines for the
guest (i.e. timer emulation). That was safe till very recently when
capability KVM_X86_DISABLE_EXITS_MWAIT to disable intercepting MWAIT was
introduced. According to Intel SDM 25.5.1:

"""
The VMX-preemption timer operates in the C-states C0, C1, and C2; it also
operates in the shutdown and wait-for-SIPI states. If the timer counts down
to zero in any state other than the wait-for SIPI state, the logical
processor transitions to the C0 C-state and causes a VM exit; the timer
does not cause a VM exit if it counts down to zero in the wait-for-SIPI
state. The timer is not decremented in C-states deeper than C2.
"""

Now, once the guest issues MWAIT with a C-state deeper than C2, the
preemption timer will never wake it up again since it has stopped ticking!
Usually this is compensated for by other activity in the system that wakes
the core from the deep C-state (and causes a VMExit), for example if the
host itself is ticking or receives interrupts.

So disable the VMX-preemption timer if MWAIT is exposed to the guest!

Cc: Paolo Bonzini 
Cc: Radim Krčmář 
Cc: Thomas Gleixner 
Cc: Ingo Molnar 
Cc: H. Peter Anvin 
Cc: x...@kernel.org
Cc: k...@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
Signed-off-by: KarimAllah Ahmed 
---
 arch/x86/include/asm/kvm_host.h |  1 +
 arch/x86/kvm/lapic.c|  3 ++-
 arch/x86/kvm/vmx.c  | 11 +--
 3 files changed, 12 insertions(+), 3 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 97448f1..5d9da9c 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1090,6 +1090,7 @@ struct kvm_x86_ops {
  uint32_t guest_irq, bool set);
void (*apicv_post_state_restore)(struct kvm_vcpu *vcpu);
 
+   bool (*has_hv_timer)(struct kvm_vcpu *vcpu);
int (*set_hv_timer)(struct kvm_vcpu *vcpu, u64 guest_deadline_tsc);
void (*cancel_hv_timer)(struct kvm_vcpu *vcpu);
 
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index a071dc1..9fb50e6 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -1561,7 +1561,8 @@ static bool start_hv_timer(struct kvm_lapic *apic)
int r;
 
WARN_ON(preemptible());
-   if (!kvm_x86_ops->set_hv_timer)
+   if (!kvm_x86_ops->has_hv_timer ||
+   !kvm_x86_ops->has_hv_timer(apic->vcpu))
return false;
 
if (!apic_lvtt_period(apic) && atomic_read(&ktimer->pending))
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index d2e54e7..d99a823 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -7112,7 +7112,7 @@ static __init int hardware_setup(void)
cpu_preemption_timer_multi =
 vmx_msr & VMX_MISC_PREEMPTION_TIMER_RATE_MASK;
} else {
-   kvm_x86_ops->set_hv_timer = NULL;
+   kvm_x86_ops->has_hv_timer = NULL;
kvm_x86_ops->cancel_hv_timer = NULL;
}
 
@@ -11901,6 +11901,11 @@ static inline int u64_shl_div_u64(u64 a, unsigned int 
shift,
return 0;
 }
 
+static bool vmx_has_hv_timer(struct kvm_vcpu *vcpu)
+{
+   return !kvm_pause_in_guest(vcpu->kvm);
+}
+
 static int vmx_set_hv_timer(struct kvm_vcpu *vcpu, u64 guest_deadline_tsc)
 {
struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -12136,7 +12141,8 @@ static void pi_post_block(struct kvm_vcpu *vcpu)
 
 static void vmx_post_block(struct kvm_vcpu *vcpu)
 {
-   if (kvm_x86_ops->set_hv_timer)
+   if (kvm_x86_ops->has_hv_timer &&
+   kvm_x86_ops->has_hv_timer(vcpu))
kvm_lapic_switch_to_hv_timer(vcpu);
 
pi_post_block(vcpu);
@@ -12592,6 +12598,7 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = 
{
.update_pi_irte = vmx_update_pi_irte,
 
 #ifdef CONFIG_X86_64
+   .has_hv_timer = vmx_has_hv_timer,
.set_hv_timer = vmx_set_hv_timer,
.cancel_hv_timer = vmx_cancel_hv_timer,
 #endif
-- 
2.7.4



[PATCH v2] kvm: nVMX: Introduce KVM_CAP_STATE

2018-04-09 Thread KarimAllah Ahmed
From: Jim Mattson 

For nested virtualization, L0 KVM manages a bit of state for L2 guests that
cannot be captured through the currently available IOCTLs. In fact, the state
captured through all of these IOCTLs is usually a mix of L1 and L2 state. It
is also dependent on whether the L2 guest was running at the moment the
process was interrupted to save its state.

With this capability, there are two new vcpu ioctls: KVM_GET_STATE and
KVM_SET_STATE. These can be used for saving and restoring a VM that is in
VMX operation.

Cc: Paolo Bonzini 
Cc: Radim Krčmář 
Cc: Thomas Gleixner 
Cc: Ingo Molnar 
Cc: H. Peter Anvin 
Cc: x...@kernel.org
Cc: k...@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Jim Mattson 
[karahmed@ - rename structs and functions and make them ready for AMD and
 address previous comments.
   - rebase & a bit of refactoring.
   - Merge 7/8 and 8/8 into one patch.
   - Force a VMExit from L2 after reading the kvm_state to avoid
 mixed state between L1 and L2 on resurrecting the instance. ]
Signed-off-by: KarimAllah Ahmed 
---
v1 -> v2:
- rename structs and functions and make them ready for AMD and address
  previous comments.
- rebase & a bit of refactoring.
- Merge 7/8 and 8/8 into one patch.
- Force a VMExit from L2 after reading the kvm_state to avoid mixed state
  between L1 and L2 on resurrecting the instance.
---
 Documentation/virtual/kvm/api.txt |  46 ++
 arch/x86/include/asm/kvm_host.h   |   7 ++
 arch/x86/include/uapi/asm/kvm.h   |  38 
 arch/x86/kvm/vmx.c| 189 +-
 arch/x86/kvm/x86.c|  21 +
 include/uapi/linux/kvm.h  |   5 +
 6 files changed, 302 insertions(+), 4 deletions(-)

diff --git a/Documentation/virtual/kvm/api.txt 
b/Documentation/virtual/kvm/api.txt
index d6b3ff5..3ed56df 100644
--- a/Documentation/virtual/kvm/api.txt
+++ b/Documentation/virtual/kvm/api.txt
@@ -3516,6 +3516,52 @@ Returns: 0 on success; -1 on error
 This ioctl can be used to unregister the guest memory region registered
 with KVM_MEMORY_ENCRYPT_REG_REGION ioctl above.
 
+4.112 KVM_GET_STATE
+
+Capability: KVM_CAP_STATE
+Architectures: x86
+Type: vcpu ioctl
+Parameters: struct kvm_state (in/out)
+Returns: 0 on success, -1 on error
+Errors:
+  E2BIG: the data size exceeds the value of 'size' specified by
+ the user (the size required will be written into size).
+
+struct kvm_state {
+   __u16 flags;
+   __u16 format;
+   __u32 size;
+   union {
+   struct kvm_vmx_state vmx;
+   struct kvm_svm_state svm;
+   __u8 pad[120];
+   };
+   __u8 data[0];
+};
+
+This ioctl copies the vcpu's kvm_state struct from the kernel to userspace.
+
+4.113 KVM_SET_STATE
+
+Capability: KVM_CAP_STATE
+Architectures: x86
+Type: vcpu ioctl
+Parameters: struct kvm_state (in)
+Returns: 0 on success, -1 on error
+
+struct kvm_state {
+   __u16 flags;
+   __u16 format;
+   __u32 size;
+   union {
+   struct kvm_vmx_state vmx;
+   struct kvm_svm_state svm;
+   __u8 pad[120];
+   };
+   __u8 data[0];
+};
+
+This copies the vcpu's kvm_state struct from userspace to the kernel.
 
 5. The kvm_run structure
 
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index fad4d46..902db9e 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -73,6 +73,7 @@
 #define KVM_REQ_HV_RESET   KVM_ARCH_REQ(20)
 #define KVM_REQ_HV_EXITKVM_ARCH_REQ(21)
 #define KVM_REQ_HV_STIMER  KVM_ARCH_REQ(22)
+#define KVM_REQ_GET_VMCS12_PAGES   KVM_ARCH_REQ(23)
 
 #define CR0_RESERVED_BITS   \
(~(unsigned long)(X86_CR0_PE | X86_CR0_MP | X86_CR0_EM | X86_CR0_TS \
@@ -1090,6 +1091,12 @@ struct kvm_x86_ops {
 
void (*setup_mce)(struct kvm_vcpu *vcpu);
 
+   int (*get_state)(struct kvm_vcpu *vcpu,
+struct kvm_state __user *user_kvm_state);
+   int (*set_state)(struct kvm_vcpu *vcpu,
+struct kvm_state __user *user_kvm_state);
+   void (*get_vmcs12_pages)(struct kvm_vcpu *vcpu);
+
int (*smi_allowed)(struct kvm_vcpu *vcpu);
int (*pre_enter_smm)(struct kvm_vcpu *vcpu, char *smstate);
int (*pre_leave_smm)(struct kvm_vcpu *vcpu, u64 smbase);
diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h
index f3a9604..1d1cd26 100644
--- a/arch/x86/include/uapi/asm/kvm.h
+++ b/arch/x86/include/uapi/asm/kvm.h
@@ -361,4 +361,42 @@ struct kvm_sync_regs {
 #define KVM_X86_QUIRK_LINT0_REENABLED  (1 << 0)
 #define KVM_X86_QUIRK_CD_NW_CLEARED(1 << 1)
 
+#define KVM_STATE_GUEST_MODE   0x0001
+#define KVM_STATE_RUN_PENDING  0x0002

[PATCH v4 1/2] PCI/IOV: Use VF0 cached config registers for other VFs

2018-03-19 Thread KarimAllah Ahmed
Cache some config data from VF0 and use it for all other VFs instead of
reading it from the config space of each VF.  We assume these items are the
same across all associated VFs:

   Revision ID
   Class Code
   Subsystem Vendor ID
   Subsystem ID

This is an optimization for enabling SR-IOV on a device with many VFs.
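
To make the effect concrete, the idea is that the probe path consumes the
cached VF0 values instead of issuing config reads for every VF; a purely
illustrative consumer (the real probe.c changes are in the hunks below)
might look like:

/* Illustrative sketch only: fill a VF from the values cached off VF0. */
static void pci_vf_use_cached_config(struct pci_dev *virtfn)
{
	struct pci_sriov *iov = virtfn->physfn->sriov;

	virtfn->class            = iov->class >> 8;	/* class code */
	virtfn->revision         = iov->class & 0xff;	/* revision ID */
	virtfn->hdr_type         = iov->hdr_type & 0x7f;
	virtfn->subsystem_vendor = iov->subsystem_vendor;
	virtfn->subsystem_device = iov->subsystem_device;
}

So for a device with N VFs, the four config reads in
pci_read_vf_config_common() happen once rather than N times.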

Cc: Bjorn Helgaas 
Cc: linux-...@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
Signed-off-by: KarimAllah Ahmed 
[bhelgaas: changelog, simplify comments, remove unused "device"]
Signed-off-by: Bjorn Helgaas 
---
v3->v4:
- Restructure the code to handle CONFIG_PCI_ATS

 drivers/pci/iov.c   | 42 +++---
 drivers/pci/pci.h   |  4 
 drivers/pci/probe.c | 47 ++-
 3 files changed, 81 insertions(+), 12 deletions(-)

diff --git a/drivers/pci/iov.c b/drivers/pci/iov.c
index 677924a..30bf8f7 100644
--- a/drivers/pci/iov.c
+++ b/drivers/pci/iov.c
@@ -114,6 +114,29 @@ resource_size_t pci_iov_resource_size(struct pci_dev *dev, 
int resno)
return dev->sriov->barsz[resno - PCI_IOV_RESOURCES];
 }
 
+static void pci_read_vf_config_common(struct pci_dev *virtfn)
+{
+   struct pci_dev *physfn = virtfn->physfn;
+
+   /*
+* Some config registers are the same across all associated VFs.
+* Read them once from VF0 so we can skip reading them from the
+* other VFs.
+*
+* PCIe r4.0, sec 9.3.4.1, technically doesn't require all VFs to
+* have the same Revision ID and Subsystem ID, but we assume they
+* do.
+*/
+   pci_read_config_dword(virtfn, PCI_CLASS_REVISION,
+ &physfn->sriov->class);
+   pci_read_config_byte(virtfn, PCI_HEADER_TYPE,
+&physfn->sriov->hdr_type);
+   pci_read_config_word(virtfn, PCI_SUBSYSTEM_VENDOR_ID,
+&physfn->sriov->subsystem_vendor);
+   pci_read_config_word(virtfn, PCI_SUBSYSTEM_ID,
+&physfn->sriov->subsystem_device);
+}
+
 int pci_iov_add_virtfn(struct pci_dev *dev, int id)
 {
int i;
@@ -136,13 +159,17 @@ int pci_iov_add_virtfn(struct pci_dev *dev, int id)
virtfn->devfn = pci_iov_virtfn_devfn(dev, id);
virtfn->vendor = dev->vendor;
virtfn->device = iov->vf_device;
+   virtfn->is_virtfn = 1;
+   virtfn->physfn = pci_dev_get(dev);
+
+   if (id == 0)
+   pci_read_vf_config_common(virtfn);
+
rc = pci_setup_device(virtfn);
if (rc)
-   goto failed0;
+   goto failed1;
 
virtfn->dev.parent = dev->dev.parent;
-   virtfn->physfn = pci_dev_get(dev);
-   virtfn->is_virtfn = 1;
virtfn->multifunction = 0;
 
for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) {
@@ -163,10 +190,10 @@ int pci_iov_add_virtfn(struct pci_dev *dev, int id)
sprintf(buf, "virtfn%u", id);
rc = sysfs_create_link(&dev->dev.kobj, &virtfn->dev.kobj, buf);
if (rc)
-   goto failed1;
+   goto failed2;
rc = sysfs_create_link(&virtfn->dev.kobj, &dev->dev.kobj, "physfn");
if (rc)
-   goto failed2;
+   goto failed3;
 
kobject_uevent(&virtfn->dev.kobj, KOBJ_CHANGE);
 
@@ -174,11 +201,12 @@ int pci_iov_add_virtfn(struct pci_dev *dev, int id)
 
return 0;
 
-failed2:
+failed3:
sysfs_remove_link(&dev->dev.kobj, buf);
+failed2:
+   pci_stop_and_remove_bus_device(virtfn);
 failed1:
pci_dev_put(dev);
-   pci_stop_and_remove_bus_device(virtfn);
 failed0:
virtfn_remove_bus(dev->bus, bus);
 failed:
diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h
index fcd8191..bdb4ba2 100644
--- a/drivers/pci/pci.h
+++ b/drivers/pci/pci.h
@@ -271,6 +271,10 @@ struct pci_sriov {
u16 driver_max_VFs; /* Max num VFs driver supports */
struct pci_dev  *dev;   /* Lowest numbered PF */
struct pci_dev  *self;  /* This PF */
+   u32 class;  /* VF device */
+   u8  hdr_type;   /* VF header type */
+   u16 subsystem_vendor; /* VF subsystem vendor */
+   u16 subsystem_device; /* VF subsystem device */
resource_size_t barsz[PCI_SRIOV_NUM_BARS];  /* VF BAR size */
booldrivers_autoprobe; /* Auto probing of VFs by driver */
 };
diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c
index ef53774..21ee1c3 100644
--- a/drivers/pci/probe.c
+++ b/drivers/pci/probe.c
@@ -1389,6 +1389,43 @@ int pci_cfg_space_size(struct pci_dev *dev)
return PCI_CFG_SPACE_SIZE;
 }
 
+static int pci_cfg_space_class(struct pci_dev *dev)
+{
+   int class;
+
+#ifdef CONFIG_PCI_ATS
+   if (dev->is_virtfn)
+   re

[PATCH v4 2/2] PCI/IOV: Use VF0 cached config space size for other VFs

2018-03-19 Thread KarimAllah Ahmed
Cache the config space size from VF0 and use it for all other VFs instead
of reading it from the config space of each VF.  We assume that it will be
the same across all associated VFs.

This is an optimization when enabling SR-IOV on a device with many VFs.

Cc: Bjorn Helgaas 
Cc: linux-...@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
Signed-off-by: KarimAllah Ahmed 
---
 drivers/pci/iov.c   |  3 +++
 drivers/pci/pci.h   |  1 +
 drivers/pci/probe.c | 11 ++-
 include/linux/pci.h |  1 +
 4 files changed, 15 insertions(+), 1 deletion(-)

diff --git a/drivers/pci/iov.c b/drivers/pci/iov.c
index 30bf8f7..046e0d3 100644
--- a/drivers/pci/iov.c
+++ b/drivers/pci/iov.c
@@ -135,6 +135,9 @@ static void pci_read_vf_config_common(struct pci_dev 
*virtfn)
 &physfn->sriov->subsystem_vendor);
pci_read_config_word(virtfn, PCI_SUBSYSTEM_ID,
 &physfn->sriov->subsystem_device);
+
+   virtfn->class = physfn->sriov->class;
+   physfn->sriov->cfg_size = __pci_cfg_space_size(virtfn);
 }
 
 int pci_iov_add_virtfn(struct pci_dev *dev, int id)
diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h
index bdb4ba2..69da57b 100644
--- a/drivers/pci/pci.h
+++ b/drivers/pci/pci.h
@@ -271,6 +271,7 @@ struct pci_sriov {
u16 driver_max_VFs; /* Max num VFs driver supports */
struct pci_dev  *dev;   /* Lowest numbered PF */
struct pci_dev  *self;  /* This PF */
+   u32 cfg_size;   /* VF config space size */
u32 class;  /* VF device */
u8  hdr_type;   /* VF header type */
u16 subsystem_vendor; /* VF subsystem vendor */
diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c
index 21ee1c3..fd21e2b 100644
--- a/drivers/pci/probe.c
+++ b/drivers/pci/probe.c
@@ -1365,7 +1365,7 @@ static int pci_cfg_space_size_ext(struct pci_dev *dev)
return PCI_CFG_SPACE_EXP_SIZE;
 }
 
-int pci_cfg_space_size(struct pci_dev *dev)
+int __pci_cfg_space_size(struct pci_dev *dev)
 {
int pos;
u32 status;
@@ -1389,6 +1389,15 @@ int pci_cfg_space_size(struct pci_dev *dev)
return PCI_CFG_SPACE_SIZE;
 }
 
+int pci_cfg_space_size(struct pci_dev *dev)
+{
+#ifdef CONFIG_PCI_ATS
+   if (dev->is_virtfn)
+   return dev->physfn->sriov->cfg_size;
+#endif
+   return __pci_cfg_space_size(dev);
+}
+
 static int pci_cfg_space_class(struct pci_dev *dev)
 {
int class;
diff --git a/include/linux/pci.h b/include/linux/pci.h
index 024a1be..fcd5d88 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -1290,6 +1290,7 @@ int pci_scan_bridge(struct pci_bus *bus, struct pci_dev 
*dev, int max,
 
 void pci_walk_bus(struct pci_bus *top, int (*cb)(struct pci_dev *, void *),
  void *userdata);
+int __pci_cfg_space_size(struct pci_dev *dev);
 int pci_cfg_space_size(struct pci_dev *dev);
 unsigned char pci_bus_max_busnr(struct pci_bus *bus);
 void pci_setup_bridge(struct pci_bus *bus);
-- 
2.7.4



[PATCH] nvmx: Check exit qualification RD/WR permission for MMIO accesses

2018-03-04 Thread KarimAllah Ahmed
Validate that a write MMIO access that follows a read MMIO access has the
correct access type captured in the exit qualification.

Cc: Paolo Bonzini 
Cc: Radim Krčmář 
Cc: k...@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
Signed-off-by: KarimAllah Ahmed 
Message-Id: <1519841208-23349-1-git-send-email-karah...@amazon.de>
---
 x86/vmx_tests.c | 52 
 1 file changed, 48 insertions(+), 4 deletions(-)

diff --git a/x86/vmx_tests.c b/x86/vmx_tests.c
index 598dd88..a72af1a 100644
--- a/x86/vmx_tests.c
+++ b/x86/vmx_tests.c
@@ -7,6 +7,7 @@
 #include "msr.h"
 #include "processor.h"
 #include "vm.h"
+#include "pci.h"
 #include "fwcfg.h"
 #include "isr.h"
 #include "desc.h"
@@ -28,6 +29,8 @@ unsigned long *pml4;
 u64 eptp;
 void *data_page1, *data_page2;
 
+phys_addr_t pci_physaddr;
+
 void *pml_log;
 #define PML_INDEX 512
 
@@ -1041,6 +1044,9 @@ static int apic_version;
 
 static int ept_init_common(bool have_ad)
 {
+   int ret;
+   struct pci_dev pcidev;
+
if (setup_ept(have_ad))
return VMX_TEST_EXIT;
data_page1 = alloc_page();
@@ -1053,6 +1059,13 @@ static int ept_init_common(bool have_ad)
EPT_RA | EPT_WA | EPT_EA);
 
apic_version = apic_read(APIC_LVR);
+
+   ret = pci_find_dev(PCI_VENDOR_ID_REDHAT, PCI_DEVICE_ID_REDHAT_TEST);
+   if (ret != PCIDEVADDR_INVALID) {
+   pci_dev_init(&pcidev, ret);
+   pci_physaddr = pcidev.resource[PCI_TESTDEV_BAR_MEM];
+   }
+
return VMX_TEST_START;
 }
 
@@ -1101,6 +1114,16 @@ t1:
vmcall();
*((u32 *)data_page1) = MAGIC_VAL_2;
report("EPT violation - paging structure", vmx_get_test_stage() == 5);
+
+   // MMIO Read/Write
+   vmx_set_test_stage(5);
+   vmcall();
+
+   *(u32 volatile *)pci_physaddr;
+   report("MMIO EPT violation - read", vmx_get_test_stage() == 6);
+
+   *(u32 volatile *)pci_physaddr = MAGIC_VAL_1;
+   report("MMIO EPT violation - write", vmx_get_test_stage() == 7);
 }
 
 static void ept_main()
@@ -1108,12 +1131,12 @@ static void ept_main()
ept_common();
 
// Test EPT access to L1 MMIO
-   vmx_set_test_stage(6);
+   vmx_set_test_stage(7);
report("EPT - MMIO access", *((u32 *)0xfee00030UL) == apic_version);
 
// Test invalid operand for INVEPT
vmcall();
-   report("EPT - unsupported INVEPT", vmx_get_test_stage() == 7);
+   report("EPT - unsupported INVEPT", vmx_get_test_stage() == 8);
 }
 
 bool invept_test(int type, u64 eptp)
@@ -1187,7 +1210,7 @@ static int ept_exit_handler_common(bool have_ad)
ulong reason;
u32 insn_len;
u32 exit_qual;
-   static unsigned long data_page1_pte, data_page1_pte_pte;
+   static unsigned long data_page1_pte, data_page1_pte_pte, memaddr_pte;
 
guest_rip = vmcs_read(GUEST_RIP);
guest_cr3 = vmcs_read(GUEST_CR3);
@@ -1249,7 +1272,12 @@ static int ept_exit_handler_common(bool have_ad)
data_page1_pte_pte & ~EPT_PRESENT);
ept_sync(INVEPT_SINGLE, eptp);
break;
-   case 6:
+   case 5:
+   install_ept(pml4, (unsigned long)pci_physaddr,
+   (unsigned long)pci_physaddr, 0);
+   ept_sync(INVEPT_SINGLE, eptp);
+   break;
+   case 7:
if (!invept_test(0, eptp))
vmx_inc_test_stage();
break;
@@ -1305,6 +1333,22 @@ static int ept_exit_handler_common(bool have_ad)
data_page1_pte_pte | (EPT_PRESENT));
ept_sync(INVEPT_SINGLE, eptp);
break;
+   case 5:
+   if (exit_qual & EPT_VLT_RD)
+   vmx_inc_test_stage();
+   TEST_ASSERT(get_ept_pte(pml4, (unsigned 
long)pci_physaddr,
+   1, &memaddr_pte));
+   set_ept_pte(pml4, memaddr_pte, 1, memaddr_pte | EPT_RA);
+   ept_sync(INVEPT_SINGLE, eptp);
+   break;
+   case 6:
+   if (exit_qual & EPT_VLT_WR)
+   vmx_inc_test_stage();
+   TEST_ASSERT(get_ept_pte(pml4, (unsigned 
long)pci_physaddr,
+   1, &memaddr_pte));
+   set_ept_pte(pml4, memaddr_pte, 1, memaddr_pte | EPT_RA 
| EPT_WA);
+   ept_sync(INVEPT_SINGLE, eptp);
+   break;
default:
// Should not reach here
report("ERROR : unexpected stage, %d", false,
-- 
2.7.4



[PATCH] PCI/IOV: Skip initializing the base addresses from the VF config space

2018-03-02 Thread KarimAllah Ahmed
Per PCIe r4.0, sec 9.3.4.1.11, the BAR registers in the VF config space
are all RO Zero for PCI VFs. So just skip reading them for VFs.

This is an optimization when enabling SR-IOV on a device with many VFs.
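
For readers wondering where VF BAR addresses come from instead: per the
SR-IOV spec (and not restated in this patch), the memory for VF number i
(1-based) behind VF BARn is located at

	VF BARn base (in the PF's SR-IOV capability) + (i - 1) * VF BARn size

so nothing is lost by skipping the reads of the VFs' own, always-zero BARs.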

Cc: Bjorn Helgaas 
Cc: linux-...@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
Suggested-by: Bjorn Helgaas 
Signed-off-by: KarimAllah Ahmed 
---
 drivers/pci/probe.c | 4 
 1 file changed, 4 insertions(+)

diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c
index a96837e..7204d46 100644
--- a/drivers/pci/probe.c
+++ b/drivers/pci/probe.c
@@ -329,6 +329,10 @@ static void pci_read_bases(struct pci_dev *dev, unsigned 
int howmany, int rom)
if (dev->non_compliant_bars)
return;
 
+   /* Per PCIe r4.0, sec 9.3.4.1.11, the VF BARs are all RO Zero */
+   if (dev->is_virtfn)
+   return;
+
for (pos = 0; pos < howmany; pos++) {
struct resource *res = &dev->resource[pos];
reg = PCI_BASE_ADDRESS_0 + (pos << 2);
-- 
2.7.4



[PATCH v3 2/2] PCI/IOV: Use the cached VF BARs size instead of re-reading them

2018-03-01 Thread KarimAllah Ahmed
Use the cached VF BAR sizes instead of re-reading them from the hardware.
That avoids doing unnecessary bus transactions, which is especially
noticeable when you have a PF with a large number of VFs.

Cc: Bjorn Helgaas 
Cc: linux-...@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
Signed-off-by: KarimAllah Ahmed 
---
 drivers/pci/probe.c | 24 ++--
 1 file changed, 18 insertions(+), 6 deletions(-)

diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c
index a96837e..aeaa10a 100644
--- a/drivers/pci/probe.c
+++ b/drivers/pci/probe.c
@@ -180,6 +180,7 @@ static inline unsigned long decode_bar(struct pci_dev *dev, 
u32 bar)
 int __pci_read_base(struct pci_dev *dev, enum pci_bar_type type,
struct resource *res, unsigned int pos)
 {
+   int bar = res - dev->resource;
u32 l = 0, sz = 0, mask;
u64 l64, sz64, mask64;
u16 orig_cmd;
@@ -199,9 +200,13 @@ int __pci_read_base(struct pci_dev *dev, enum pci_bar_type 
type,
res->name = pci_name(dev);
 
pci_read_config_dword(dev, pos, &l);
-   pci_write_config_dword(dev, pos, l | mask);
-   pci_read_config_dword(dev, pos, &sz);
-   pci_write_config_dword(dev, pos, l);
+   if (dev->is_virtfn) {
+   sz = dev->physfn->sriov->barsz[bar] & 0xffffffff;
+   } else {
+   pci_write_config_dword(dev, pos, l | mask);
+   pci_read_config_dword(dev, pos, &sz);
+   pci_write_config_dword(dev, pos, l);
+   }
 
/*
 * All bits set in sz means the device isn't working properly.
@@ -241,9 +246,14 @@ int __pci_read_base(struct pci_dev *dev, enum pci_bar_type 
type,
 
if (res->flags & IORESOURCE_MEM_64) {
pci_read_config_dword(dev, pos + 4, &l);
-   pci_write_config_dword(dev, pos + 4, ~0);
-   pci_read_config_dword(dev, pos + 4, &sz);
-   pci_write_config_dword(dev, pos + 4, l);
+
+   if (dev->is_virtfn) {
+   sz = (dev->physfn->sriov->barsz[bar] >> 32) & 0xffffffff;
+   } else {
+   pci_write_config_dword(dev, pos + 4, ~0);
+   pci_read_config_dword(dev, pos + 4, &sz);
+   pci_write_config_dword(dev, pos + 4, l);
+   }
 
l64 |= ((u64)l << 32);
sz64 |= ((u64)sz << 32);
@@ -332,6 +342,8 @@ static void pci_read_bases(struct pci_dev *dev, unsigned 
int howmany, int rom)
for (pos = 0; pos < howmany; pos++) {
struct resource *res = &dev->resource[pos];
reg = PCI_BASE_ADDRESS_0 + (pos << 2);
+   if (dev->is_virtfn && dev->physfn->sriov->barsz[pos] == 0)
+   continue;
pos += __pci_read_base(dev, pci_bar_unknown, res, reg);
}
 
-- 
2.7.4



[PATCH v3 1/2] PCI/IOV: Store more data about VFs into the SRIOV struct

2018-03-01 Thread KarimAllah Ahmed
Store more data about PCI VFs in the SRIOV struct to avoid reading it from
the config space of all the PCI VFs. This is an especially useful
optimization when bringing up thousands of VFs.

Cc: Bjorn Helgaas 
Cc: linux-...@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
Signed-off-by: KarimAllah Ahmed 
---
v2 -> v3:
 * Update changelog
 * Move the call to pci_read_vf_config_common a bit later and use standard
   pci_read_config*.
 * Update whitespace.
 * Move the using barsz into its own patch.
 * Added a comment about the usage of subsystem vendor id, subsystem id, and
   class revision.
 * Make sure virtfn->is_virtfn is set before calling into pci_setup_device.

v1 -> v2:
 * Rebase on latest + remove dependency on a non-upstream patch.

 drivers/pci/iov.c   | 48 +---
 drivers/pci/pci.h   |  5 +
 drivers/pci/probe.c | 18 ++
 3 files changed, 60 insertions(+), 11 deletions(-)

diff --git a/drivers/pci/iov.c b/drivers/pci/iov.c
index 677924a..10291a0 100644
--- a/drivers/pci/iov.c
+++ b/drivers/pci/iov.c
@@ -114,6 +114,36 @@ resource_size_t pci_iov_resource_size(struct pci_dev *dev, 
int resno)
return dev->sriov->barsz[resno - PCI_IOV_RESOURCES];
 }
 
+static void pci_read_vf_config_common(struct pci_dev *virtfn)
+{
+   struct pci_dev *physfn = virtfn->physfn;
+
+   BUG_ON(!virtfn->is_virtfn || physfn->is_virtfn);
+
+   /*
+* Per PCIe r4.0, sec 9.3.4.1.5, the value reported in the VF maybe
+* different than the value reported in the PF. We assume here that all
+* VFs would report the same revision ID.
+*/
+   pci_read_config_dword(virtfn, PCI_CLASS_REVISION,
+ &physfn->sriov->class);
+   /*
+* Per PCIe r4.0, sec 9.3.4.1.13, the field in the PF and the
+* associated VFs must return the same value.
+*/
+   pci_read_config_word(virtfn, PCI_SUBSYSTEM_VENDOR_ID,
+&physfn->sriov->subsystem_vendor);
+   /*
+* Per PCIe r4.0, sec 9.3.4.1.14, the value reported in the VF maybe
+* different than the value reported in the PF. We assume here that all
+* VFs would report the same subsystem ID.
+*/
+   pci_read_config_word(virtfn, PCI_SUBSYSTEM_ID,
+&physfn->sriov->subsystem_device);
+   pci_read_config_byte(virtfn, PCI_HEADER_TYPE,
+&physfn->sriov->hdr_type);
+}
+
 int pci_iov_add_virtfn(struct pci_dev *dev, int id)
 {
int i;
@@ -134,15 +164,18 @@ int pci_iov_add_virtfn(struct pci_dev *dev, int id)
goto failed0;
 
virtfn->devfn = pci_iov_virtfn_devfn(dev, id);
+   virtfn->is_virtfn = 1;
+   virtfn->physfn = pci_dev_get(dev);
+   if (id == 0)
+   /* virtfn->{devfn,bus,is_virtfn,physfn} have to be initialized 
*/
+   pci_read_vf_config_common(virtfn);
virtfn->vendor = dev->vendor;
virtfn->device = iov->vf_device;
rc = pci_setup_device(virtfn);
if (rc)
-   goto failed0;
+   goto failed1;
 
virtfn->dev.parent = dev->dev.parent;
-   virtfn->physfn = pci_dev_get(dev);
-   virtfn->is_virtfn = 1;
virtfn->multifunction = 0;
 
for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) {
@@ -163,10 +196,10 @@ int pci_iov_add_virtfn(struct pci_dev *dev, int id)
sprintf(buf, "virtfn%u", id);
rc = sysfs_create_link(&dev->dev.kobj, &virtfn->dev.kobj, buf);
if (rc)
-   goto failed1;
+   goto failed2;
rc = sysfs_create_link(&virtfn->dev.kobj, &dev->dev.kobj, "physfn");
if (rc)
-   goto failed2;
+   goto failed3;
 
kobject_uevent(&virtfn->dev.kobj, KOBJ_CHANGE);
 
@@ -174,11 +207,12 @@ int pci_iov_add_virtfn(struct pci_dev *dev, int id)
 
return 0;
 
-failed2:
+failed3:
sysfs_remove_link(&dev->dev.kobj, buf);
+failed2:
+   pci_stop_and_remove_bus_device(virtfn);
 failed1:
pci_dev_put(dev);
-   pci_stop_and_remove_bus_device(virtfn);
 failed0:
virtfn_remove_bus(dev->bus, bus);
 failed:
diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h
index fcd8191..17e6688 100644
--- a/drivers/pci/pci.h
+++ b/drivers/pci/pci.h
@@ -271,6 +271,11 @@ struct pci_sriov {
u16 driver_max_VFs; /* Max num VFs driver supports */
struct pci_dev  *dev;   /* Lowest numbered PF */
struct pci_dev  *self;  /* This PF */
+   u8  hdr_type;   /* VF header type */
+   u32 class;  /* VF device */
+   u16 device; /* VF device */
+   u16 subsystem_vendor; /* VF subsystem vendor */
+   u16  
