Re: [PATCH 2/2] KVM: PPC: Remove page table walk helpers

2015-03-29 Thread Benjamin Herrenschmidt
On Mon, 2015-03-30 at 10:39 +0530, Aneesh Kumar K.V wrote:
 This patch removes helpers that are used only once in the code.
 Limiting the page table walk variants helps ensure that we won't
 end up with code walking page tables with wrong assumptions.
 
 Signed-off-by: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com

Alex, it would be preferable to have this (and the previous one) in the
powerpc tree due to dependencies with further fixes to our page table
walking vs. THP, so if you're happy, just give us an ack.

Cheers,
Ben.

 ---
  arch/powerpc/include/asm/pgtable.h  | 21 -
  arch/powerpc/kvm/book3s_hv_rm_mmu.c | 62 -
  arch/powerpc/kvm/e500_mmu_host.c    |  2 +-
  3 files changed, 28 insertions(+), 57 deletions(-)
 
 diff --git a/arch/powerpc/include/asm/pgtable.h b/arch/powerpc/include/asm/pgtable.h
 index 9835ac4173b7..92fe01c355a9 100644
 --- a/arch/powerpc/include/asm/pgtable.h
 +++ b/arch/powerpc/include/asm/pgtable.h
 @@ -249,27 +249,6 @@ extern int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr,
  #endif
  pte_t *find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea,
unsigned *shift);
 -
 -static inline pte_t *lookup_linux_ptep(pgd_t *pgdir, unsigned long hva,
 -  unsigned long *pte_sizep)
 -{
 - pte_t *ptep;
 - unsigned long ps = *pte_sizep;
 - unsigned int shift;
 -
 - ptep = find_linux_pte_or_hugepte(pgdir, hva, &shift);
 - if (!ptep)
 - return NULL;
 - if (shift)
 - *pte_sizep = 1ul << shift;
 - else
 - *pte_sizep = PAGE_SIZE;
 -
 - if (ps > *pte_sizep)
 - return NULL;
 -
 - return ptep;
 -}
  #endif /* __ASSEMBLY__ */
  
  #endif /* __KERNEL__ */
 diff --git a/arch/powerpc/kvm/book3s_hv_rm_mmu.c b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
 index 625407e4d3b0..73e083cb9f7e 100644
 --- a/arch/powerpc/kvm/book3s_hv_rm_mmu.c
 +++ b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
 @@ -131,25 +131,6 @@ static void remove_revmap_chain(struct kvm *kvm, long pte_index,
   unlock_rmap(rmap);
  }
  
 -static pte_t lookup_linux_pte_and_update(pgd_t *pgdir, unsigned long hva,
 -   int writing, unsigned long *pte_sizep)
 -{
 - pte_t *ptep;
 - unsigned long ps = *pte_sizep;
 - unsigned int hugepage_shift;
 -
 - ptep = find_linux_pte_or_hugepte(pgdir, hva, &hugepage_shift);
 - if (!ptep)
 - return __pte(0);
 - if (hugepage_shift)
 - *pte_sizep = 1ul << hugepage_shift;
 - else
 - *pte_sizep = PAGE_SIZE;
 - if (ps > *pte_sizep)
 - return __pte(0);
 - return kvmppc_read_update_linux_pte(ptep, writing, hugepage_shift);
 -}
 -
  static inline void unlock_hpte(__be64 *hpte, unsigned long hpte_v)
  {
   asm volatile(PPC_RELEASE_BARRIER "" : : : "memory");
 @@ -166,10 +147,10 @@ long kvmppc_do_h_enter(struct kvm *kvm, unsigned long flags,
   struct revmap_entry *rev;
   unsigned long g_ptel;
   struct kvm_memory_slot *memslot;
 - unsigned long pte_size;
 + unsigned hpage_shift;
   unsigned long is_io;
   unsigned long *rmap;
 - pte_t pte;
 + pte_t *ptep;
   unsigned int writing;
   unsigned long mmu_seq;
   unsigned long rcbits;
 @@ -208,22 +189,33 @@ long kvmppc_do_h_enter(struct kvm *kvm, unsigned long flags,
  
   /* Translate to host virtual address */
   hva = __gfn_to_hva_memslot(memslot, gfn);
 + ptep = find_linux_pte_or_hugepte(pgdir, hva, &hpage_shift);
 + if (ptep) {
 + pte_t pte;
 + unsigned int host_pte_size;
  
 - /* Look up the Linux PTE for the backing page */
 - pte_size = psize;
 - pte = lookup_linux_pte_and_update(pgdir, hva, writing, &pte_size);
 - if (pte_present(pte) && !pte_protnone(pte)) {
 - if (writing && !pte_write(pte))
 - /* make the actual HPTE be read-only */
 - ptel = hpte_make_readonly(ptel);
 - is_io = hpte_cache_bits(pte_val(pte));
 - pa = pte_pfn(pte) << PAGE_SHIFT;
 - pa |= hva & (pte_size - 1);
 - pa |= gpa & ~PAGE_MASK;
 - }
 + if (hpage_shift)
 + host_pte_size = 1ul << hpage_shift;
 + else
 + host_pte_size = PAGE_SIZE;
 + /*
 +  * We should always find the guest page size
 +  * to <= host page size, if host is using hugepage
 +  */
 + if (host_pte_size < psize)
 + return H_PARAMETER;
  
 - if (pte_size < psize)
 - return H_PARAMETER;
 + pte = kvmppc_read_update_linux_pte(ptep, writing, hpage_shift);
 + if (pte_present(pte) && !pte_protnone(pte)) {
 + if (writing && !pte_write(pte))
 + /* make the actual HPTE be read-only */
 +  

[PATCH] kvm: mmu: lazy collapse small sptes into large sptes

2015-03-29 Thread Wanpeng Li
There are two scenarios that call for collapsing small sptes back into
large sptes.
- Dirty logging tracks sptes at 4k granularity, so large sptes get split.
  The large sptes will be reallocated on the destination machine, and the
  guest on the source machine will be destroyed once live migration
  completes successfully. However, if live migration fails for some reason,
  the guest on the source machine keeps running with the small sptes, which
  leads to bad performance.
- Our customers write tools that track the dirty rate of guests via the
  EPT D bit/PML in order to determine the most appropriate one to live
  migrate; however, the sptes still stay small after the dirty rate has
  been tracked.

This patch introduces lazy collapsing of small sptes into large sptes: the
memory region is scanned in the ioctl context when dirty logging is stopped,
the sptes which can be collapsed into large pages are dropped during the
scan, and the later #PF path is relied upon to reallocate all the large
sptes.
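
To make the trigger concrete, here is a condensed sketch (taken from the
x86.c hunk at the end of this patch, illustrative only) of when the new zap
helper is expected to run:

    /* Rescan the slot only when dirty logging has just been turned off
     * (and the slot is not being deleted), so the next fault can map
     * THP-backed memory with large sptes again. */
    if ((change != KVM_MR_DELETE) &&
        (old->flags & KVM_MEM_LOG_DIRTY_PAGES) &&
        !(new->flags & KVM_MEM_LOG_DIRTY_PAGES))
            kvm_mmu_zap_collapsible_sptes(kvm, new);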

Signed-off-by: Wanpeng Li wanpeng...@linux.intel.com
---
 arch/x86/include/asm/kvm_host.h |  2 ++
 arch/x86/kvm/mmu.c  | 66 +
 arch/x86/kvm/x86.c  |  5 
 3 files changed, 73 insertions(+)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index a236e39..73de5d3 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -859,6 +859,8 @@ void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
 void kvm_mmu_reset_context(struct kvm_vcpu *vcpu);
 void kvm_mmu_slot_remove_write_access(struct kvm *kvm,
  struct kvm_memory_slot *memslot);
+void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm,
+   struct kvm_memory_slot *memslot);
 void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm,
   struct kvm_memory_slot *memslot);
 void kvm_mmu_slot_largepage_remove_write_access(struct kvm *kvm,
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index cee7592..d25ced1 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -4465,6 +4465,72 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm,
kvm_flush_remote_tlbs(kvm);
 }
 
+static int kvm_mmu_zap_collapsible_spte(struct kvm *kvm,
+   unsigned long *rmapp)
+{
+   u64 *sptep;
+   struct rmap_iterator iter;
+   int need_tlb_flush = 0;
+   pfn_t pfn;
+   struct kvm_mmu_page *sp;
+
+   for (sptep = rmap_get_first(*rmapp, &iter); sptep;) {
+   BUG_ON(!(*sptep & PT_PRESENT_MASK));
+
+   sp = page_header(__pa(sptep));
+   pfn = spte_to_pfn(*sptep);
+   if (sp->role.direct &&
+   !kvm_is_reserved_pfn(pfn) &&
+   PageTransCompound(pfn_to_page(pfn))) {
+   drop_spte(kvm, sptep);
+   need_tlb_flush = 1;
+   }
+   sptep = rmap_get_next(&iter);
+   }
+
+   return need_tlb_flush;
+}
+
+void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm,
+   struct kvm_memory_slot *memslot)
+{
+   bool flush = false;
+   unsigned long *rmapp;
+   unsigned long last_index, index;
+   gfn_t gfn_start, gfn_end;
+
+   spin_lock(&kvm->mmu_lock);
+
+   gfn_start = memslot->base_gfn;
+   gfn_end = memslot->base_gfn + memslot->npages - 1;
+
+   if (gfn_start >= gfn_end)
+   goto out;
+
+   rmapp = memslot->arch.rmap[0];
+   last_index = gfn_to_index(gfn_end, memslot->base_gfn,
+   PT_PAGE_TABLE_LEVEL);
+
+   for (index = 0; index <= last_index; ++index, ++rmapp) {
+   if (*rmapp)
+   flush |= kvm_mmu_zap_collapsible_spte(kvm, rmapp);
+
+   if (need_resched() || spin_needbreak(&kvm->mmu_lock)) {
+   if (flush) {
+   kvm_flush_remote_tlbs(kvm);
+   flush = false;
+   }
+   cond_resched_lock(&kvm->mmu_lock);
+   }
+   }
+
+   if (flush)
+   kvm_flush_remote_tlbs(kvm);
+
+out:
+   spin_unlock(&kvm->mmu_lock);
+}
+
 void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm,
   struct kvm_memory_slot *memslot)
 {
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index c5f7e03..6037389 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -7618,6 +7618,11 @@ void kvm_arch_commit_memory_region(struct kvm *kvm,
/* It's OK to get 'new' slot here as it has already been installed */
new = id_to_memslot(kvm->memslots, mem->slot);
 
+   if ((change != KVM_MR_DELETE) &&
+   (old->flags & KVM_MEM_LOG_DIRTY_PAGES) &&
+   !(new->flags & KVM_MEM_LOG_DIRTY_PAGES))
+   kvm_mmu_zap_collapsible_sptes(kvm, new);
+
/*
 * Set up write 

[PATCH 1/2] KVM: PPC: Use READ_ONCE when dereferencing pte_t pointer

2015-03-29 Thread Aneesh Kumar K.V
The pte can get updated from other CPUs as part of multiple activities
such as THP split, huge page collapse, and unmap. We need to make sure we
don't reload the pte value again and again for different checks.
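
As a minimal illustration (not taken from the patch; pte_ok() is a made-up
helper name), the pattern this change enforces is to snapshot the pte once
and run every check against that snapshot:

    /* Without READ_ONCE() the compiler may dereference ptep separately
     * for each check, so concurrent updates (THP split/collapse, unmap)
     * could make the checks see different values. */
    static bool pte_ok(pte_t *ptep)
    {
            pte_t pte = READ_ONCE(*ptep);   /* single snapshot */

            return pte_present(pte) && pte_write(pte);
    }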

Signed-off-by: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com
---
Note:
This is posted previously as part of
http://article.gmane.org/gmane.linux.ports.ppc.embedded/79278

 arch/powerpc/include/asm/kvm_book3s_64.h |  5 -
 arch/powerpc/kvm/e500_mmu_host.c | 20 
 2 files changed, 16 insertions(+), 9 deletions(-)

diff --git a/arch/powerpc/include/asm/kvm_book3s_64.h b/arch/powerpc/include/asm/kvm_book3s_64.h
index cc073a7ac2b7..f06820c67175 100644
--- a/arch/powerpc/include/asm/kvm_book3s_64.h
+++ b/arch/powerpc/include/asm/kvm_book3s_64.h
@@ -290,7 +290,10 @@ static inline pte_t kvmppc_read_update_linux_pte(pte_t *ptep, int writing,
pte_t old_pte, new_pte = __pte(0);
 
while (1) {
-   old_pte = *ptep;
+   /*
+* Make sure we don't reload from ptep
+*/
+   old_pte = READ_ONCE(*ptep);
/*
 * wait until _PAGE_BUSY is clear then set it atomically
 */
diff --git a/arch/powerpc/kvm/e500_mmu_host.c b/arch/powerpc/kvm/e500_mmu_host.c
index cc536d4a75ef..5840d546aa03 100644
--- a/arch/powerpc/kvm/e500_mmu_host.c
+++ b/arch/powerpc/kvm/e500_mmu_host.c
@@ -469,14 +469,18 @@ static inline int kvmppc_e500_shadow_map(struct kvmppc_vcpu_e500 *vcpu_e500,
 
pgdir = vcpu_e500->vcpu.arch.pgdir;
ptep = lookup_linux_ptep(pgdir, hva, &tsize_pages);
-   if (pte_present(*ptep))
-   wimg = (*ptep >> PTE_WIMGE_SHIFT) & MAS2_WIMGE_MASK;
-   else {
-   if (printk_ratelimit())
-   pr_err("%s: pte not present: gfn %lx, pfn %lx\n",
-   __func__, (long)gfn, pfn);
-   ret = -EINVAL;
-   goto out;
+   if (ptep) {
+   pte_t pte = READ_ONCE(*ptep);
+
+   if (pte_present(pte))
+   wimg = (pte_val(pte) >> PTE_WIMGE_SHIFT) &
+   MAS2_WIMGE_MASK;
+   else {
+   pr_err_ratelimited("%s: pte not present: gfn %lx,pfn %lx\n",
+  __func__, (long)gfn, pfn);
+   ret = -EINVAL;
+   goto out;
+   }
}
kvmppc_e500_ref_setup(ref, gtlbe, pfn, wimg);
 
-- 
2.1.0

[PATCH 2/2] KVM: PPC: Remove page table walk helpers

2015-03-29 Thread Aneesh Kumar K.V
This patch removes helpers that are used only once in the code.
Limiting the page table walk variants helps ensure that we won't
end up with code walking page tables with wrong assumptions.
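
For reference, a minimal sketch (mirroring the kvmppc_do_h_enter() hunk
below, not a new API) of the open-coded walk callers are now expected to
use in place of the removed wrappers:

    ptep = find_linux_pte_or_hugepte(pgdir, hva, &hpage_shift);
    if (ptep) {
            /* derive the host mapping size from the returned shift */
            unsigned int host_pte_size = hpage_shift ?
                            (1ul << hpage_shift) : PAGE_SIZE;

            if (host_pte_size < psize)
                    return H_PARAMETER;
            pte = kvmppc_read_update_linux_pte(ptep, writing, hpage_shift);
            /* ... pte checks continue as before ... */
    }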

Signed-off-by: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com
---
 arch/powerpc/include/asm/pgtable.h  | 21 -
 arch/powerpc/kvm/book3s_hv_rm_mmu.c | 62 -
 arch/powerpc/kvm/e500_mmu_host.c|  2 +-
 3 files changed, 28 insertions(+), 57 deletions(-)

diff --git a/arch/powerpc/include/asm/pgtable.h b/arch/powerpc/include/asm/pgtable.h
index 9835ac4173b7..92fe01c355a9 100644
--- a/arch/powerpc/include/asm/pgtable.h
+++ b/arch/powerpc/include/asm/pgtable.h
@@ -249,27 +249,6 @@ extern int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr,
 #endif
 pte_t *find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea,
 unsigned *shift);
-
-static inline pte_t *lookup_linux_ptep(pgd_t *pgdir, unsigned long hva,
-unsigned long *pte_sizep)
-{
-   pte_t *ptep;
-   unsigned long ps = *pte_sizep;
-   unsigned int shift;
-
-   ptep = find_linux_pte_or_hugepte(pgdir, hva, &shift);
-   if (!ptep)
-   return NULL;
-   if (shift)
-   *pte_sizep = 1ul << shift;
-   else
-   *pte_sizep = PAGE_SIZE;
-
-   if (ps > *pte_sizep)
-   return NULL;
-
-   return ptep;
-}
 #endif /* __ASSEMBLY__ */
 
 #endif /* __KERNEL__ */
diff --git a/arch/powerpc/kvm/book3s_hv_rm_mmu.c b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
index 625407e4d3b0..73e083cb9f7e 100644
--- a/arch/powerpc/kvm/book3s_hv_rm_mmu.c
+++ b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
@@ -131,25 +131,6 @@ static void remove_revmap_chain(struct kvm *kvm, long pte_index,
unlock_rmap(rmap);
 }
 
-static pte_t lookup_linux_pte_and_update(pgd_t *pgdir, unsigned long hva,
- int writing, unsigned long *pte_sizep)
-{
-   pte_t *ptep;
-   unsigned long ps = *pte_sizep;
-   unsigned int hugepage_shift;
-
-   ptep = find_linux_pte_or_hugepte(pgdir, hva, &hugepage_shift);
-   if (!ptep)
-   return __pte(0);
-   if (hugepage_shift)
-   *pte_sizep = 1ul << hugepage_shift;
-   else
-   *pte_sizep = PAGE_SIZE;
-   if (ps > *pte_sizep)
-   return __pte(0);
-   return kvmppc_read_update_linux_pte(ptep, writing, hugepage_shift);
-}
-
 static inline void unlock_hpte(__be64 *hpte, unsigned long hpte_v)
 {
asm volatile(PPC_RELEASE_BARRIER "" : : : "memory");
@@ -166,10 +147,10 @@ long kvmppc_do_h_enter(struct kvm *kvm, unsigned long flags,
struct revmap_entry *rev;
unsigned long g_ptel;
struct kvm_memory_slot *memslot;
-   unsigned long pte_size;
+   unsigned hpage_shift;
unsigned long is_io;
unsigned long *rmap;
-   pte_t pte;
+   pte_t *ptep;
unsigned int writing;
unsigned long mmu_seq;
unsigned long rcbits;
@@ -208,22 +189,33 @@ long kvmppc_do_h_enter(struct kvm *kvm, unsigned long flags,
 
/* Translate to host virtual address */
hva = __gfn_to_hva_memslot(memslot, gfn);
+   ptep = find_linux_pte_or_hugepte(pgdir, hva, &hpage_shift);
+   if (ptep) {
+   pte_t pte;
+   unsigned int host_pte_size;
 
-   /* Look up the Linux PTE for the backing page */
-   pte_size = psize;
-   pte = lookup_linux_pte_and_update(pgdir, hva, writing, &pte_size);
-   if (pte_present(pte) && !pte_protnone(pte)) {
-   if (writing && !pte_write(pte))
-   /* make the actual HPTE be read-only */
-   ptel = hpte_make_readonly(ptel);
-   is_io = hpte_cache_bits(pte_val(pte));
-   pa = pte_pfn(pte) << PAGE_SHIFT;
-   pa |= hva & (pte_size - 1);
-   pa |= gpa & ~PAGE_MASK;
-   }
+   if (hpage_shift)
+   host_pte_size = 1ul << hpage_shift;
+   else
+   host_pte_size = PAGE_SIZE;
+   /*
+* We should always find the guest page size
+* to <= host page size, if host is using hugepage
+*/
+   if (host_pte_size < psize)
+   return H_PARAMETER;
 
-   if (pte_size < psize)
-   return H_PARAMETER;
+   pte = kvmppc_read_update_linux_pte(ptep, writing, hpage_shift);
+   if (pte_present(pte) && !pte_protnone(pte)) {
+   if (writing && !pte_write(pte))
+   /* make the actual HPTE be read-only */
+   ptel = hpte_make_readonly(ptel);
+   is_io = hpte_cache_bits(pte_val(pte));
+   pa = pte_pfn(pte) << PAGE_SHIFT;
+   pa |= hva & (host_pte_size - 1);
+   pa |= gpa  

RE: [v3 24/26] KVM: Update Posted-Interrupts Descriptor when vCPU is blocked

2015-03-29 Thread Wu, Feng


 -Original Message-
 From: Marcelo Tosatti [mailto:mtosa...@redhat.com]
 Sent: Saturday, March 28, 2015 3:30 AM
 To: Wu, Feng
 Cc: h...@zytor.com; t...@linutronix.de; mi...@redhat.com; x...@kernel.org;
 g...@kernel.org; pbonz...@redhat.com; dw...@infradead.org;
 j...@8bytes.org; alex.william...@redhat.com; jiang@linux.intel.com;
 eric.au...@linaro.org; linux-ker...@vger.kernel.org;
 io...@lists.linux-foundation.org; kvm@vger.kernel.org
 Subject: Re: [v3 24/26] KVM: Update Posted-Interrupts Descriptor when vCPU
 is blocked
 
 On Fri, Mar 27, 2015 at 06:34:14AM +, Wu, Feng wrote:
   Currently, the following code is executed before local_irq_disable() is
   called, so do you mean 1) moving local_irq_disable() to the place before
   it, or 2) after interrupt is disabled, set KVM_REQ_EVENT in case the ON
   bit is set?
  
   2) after interrupt is disabled, set KVM_REQ_EVENT in case the ON bit
   is set.
 
  Here is my understanding about your comments here:
  - Disable interrupts
  - Check 'ON'
  - Set KVM_REQ_EVENT if 'ON' is set
 
  Then we can put the above code inside
  "if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win)"
  just like it used to be. However, I still have some questions about this
 comment:
 
  1. Where should I set KVM_REQ_EVENT? In function vcpu_enter_guest(), or
 other places?
 
 See below:
 
  If in vcpu_enter_guest(), since currently local_irq_disable() is called
  after 'KVM_REQ_EVENT' is checked, is it helpful to set KVM_REQ_EVENT after
 local_irq_disable() is called?
 
 local_irq_disable();
 
   *** add code here ***

So we need to add code like the following here, right?

  if ('ON' is set)
  kvm_make_request(KVM_REQ_EVENT, vcpu);

 
 if (vcpu->mode == EXITING_GUEST_MODE || vcpu->requests
   ^^
 || need_resched() || signal_pending(current)) {
 vcpu->mode = OUTSIDE_GUEST_MODE;
 smp_wmb();
 local_irq_enable();
 preempt_enable();
 vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
 r = 1;
 goto cancel_injection;
 }
 
  2. 'ON' is set by VT-d hardware, it can be set even when interrupt is 
  disabled
 (the related bit in PIR is also set).
 
 Yes, we are checking if the HW has set an interrupt in PIR while
 outside VM (which requires PIR->VIRR transfer by software).
 
 If the interrupt it set by hardware after local_irq_disable(),
 VMX-entry will handle the interrupt and perform the PIR->VIRR
 transfer and reevaluate interrupts, injecting to guest
 if necessary, is that correct ?
 
  So does it make sense to check 'ON' and set KVM_REQ_EVENT accordingly
 after interrupt is disabled?
 
 To replace the costly
 
 +*/
 +   if (kvm_x86_ops->hwapic_irr_update)
 +   kvm_x86_ops->hwapic_irr_update(vcpu,
 +   kvm_lapic_find_highest_irr(vcpu));
 
 Yes, i think so.

After adding the check-ON-and-set-KVM_REQ_EVENT operations listed in my
comments above, do you mean we still need to keep the costly code above
inside "if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win) {}" in
function vcpu_enter_guest() as it used to be? If yes, my question is: what
is the exact purpose of the check-ON-and-set-KVM_REQ_EVENT operations? Here
is the code flow in vcpu_enter_guest():

1. Check KVM_REQ_EVENT; if it is set, sync pir->virr
2. Disable interrupts
3. Check ON and set KVM_REQ_EVENT -- Here we set KVM_REQ_EVENT, but it is
checked in step 1, which means we cannot get any benefit even if we set it
here, since the pir->virr sync operation was already done in step 1, and
between step 3 and VM-entry we don't synchronize the pir to virr. So even if
we set KVM_REQ_EVENT here, the interrupts remaining in PIR cannot be
delivered to the guest during this VM-entry, right?

Thanks,
Feng

 
  I might miss something in your comments, if so please point out. Thanks a
 lot!
 
  Thanks,
  Feng
 
  
   
if (kvm_x86_ops->hwapic_irr_update)
kvm_x86_ops->hwapic_irr_update(vcpu,
kvm_lapic_find_highest_irr(vcpu));
   
 kvm_lapic_find_highest_irr(vcpu) eats some cache
 (4 cachelines) versus 1 cacheline for reading ON bit.

 Please remove blocked and wakeup_cpu, they should not be
   necessary.
   
Why do you think wakeup_cpu is not needed, when vCPU is blocked,
wakeup_cpu saves the cpu which the vCPU is blocked on, after vCPU
is woken up, it can run on a different cpu, so we need wakeup_cpu to
find the right list to wake up the vCPU.
  
   If the vCPU was moved it should have updated IRTE destination field
   to the pCPU which it has moved to?
 
  Every time a vCPU is scheduled to a new pCPU, the IRTE destination field
  would be updated accordingly.
 
  When vCPU is blocked. To wake 

[PATCH 2/2] KVM: x86: Remove redundant definitions

2015-03-29 Thread Nadav Amit
Some constants are redefined in emulate.c. Avoid it.

s/SELECTOR_RPL_MASK/SEGMENT_RPL_MASK
s/SELECTOR_TI_MASK/SEGMENT_TI_MASK

No functional change.

Signed-off-by: Nadav Amit na...@cs.technion.ac.il
---
 arch/x86/include/asm/kvm_host.h |  3 ---
 arch/x86/kvm/emulate.c  |  6 +++---
 arch/x86/kvm/vmx.c  | 18 +-
 3 files changed, 12 insertions(+), 15 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 7ba3d9d..30b28dc 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -81,9 +81,6 @@ static inline gfn_t gfn_to_index(gfn_t gfn, gfn_t base_gfn, int level)
(base_gfn >> KVM_HPAGE_GFN_SHIFT(level));
 }
 
-#define SELECTOR_TI_MASK (1 << 2)
-#define SELECTOR_RPL_MASK 0x03
-
 #define KVM_PERMILLE_MMU_PAGES 20
 #define KVM_MIN_ALLOC_MMU_PAGES 64
 #define KVM_MMU_HASH_SHIFT 10
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 9a578a1..a48bcd7 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -2412,7 +2412,7 @@ static int em_sysenter(struct x86_emulate_ctxt *ctxt)
return emulate_gp(ctxt, 0);
 
ctxt->eflags &= ~(X86_EFLAGS_VM | X86_EFLAGS_IF);
-   cs_sel = (u16)msr_data & ~SELECTOR_RPL_MASK;
+   cs_sel = (u16)msr_data & ~SEGMENT_RPL_MASK;
ss_sel = cs_sel + 8;
if (efer & EFER_LMA) {
cs.d = 0;
@@ -2479,8 +2479,8 @@ static int em_sysexit(struct x86_emulate_ctxt *ctxt)
return emulate_gp(ctxt, 0);
break;
}
-   cs_sel |= SELECTOR_RPL_MASK;
-   ss_sel |= SELECTOR_RPL_MASK;
+   cs_sel |= SEGMENT_RPL_MASK;
+   ss_sel |= SEGMENT_RPL_MASK;
 
ops->set_segment(ctxt, cs_sel, &cs, 0, VCPU_SREG_CS);
ops->set_segment(ctxt, ss_sel, &ss, 0, VCPU_SREG_SS);
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index fdd9f8b..63ca692 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -3263,8 +3263,8 @@ static void fix_pmode_seg(struct kvm_vcpu *vcpu, int seg,
 * default value.
 */
if (seg == VCPU_SREG_CS || seg == VCPU_SREG_SS)
-   save->selector &= ~SELECTOR_RPL_MASK;
-   save->dpl = save->selector & SELECTOR_RPL_MASK;
+   save->selector &= ~SEGMENT_RPL_MASK;
+   save->dpl = save->selector & SEGMENT_RPL_MASK;
save->s = 1;
}
vmx_set_segment(vcpu, save, seg);
@@ -3837,7 +3837,7 @@ static bool code_segment_valid(struct kvm_vcpu *vcpu)
unsigned int cs_rpl;
 
vmx_get_segment(vcpu, &cs, VCPU_SREG_CS);
-   cs_rpl = cs.selector & SELECTOR_RPL_MASK;
+   cs_rpl = cs.selector & SEGMENT_RPL_MASK;
 
if (cs.unusable)
return false;
@@ -3865,7 +3865,7 @@ static bool stack_segment_valid(struct kvm_vcpu *vcpu)
unsigned int ss_rpl;
 
vmx_get_segment(vcpu, &ss, VCPU_SREG_SS);
-   ss_rpl = ss.selector & SELECTOR_RPL_MASK;
+   ss_rpl = ss.selector & SEGMENT_RPL_MASK;
 
if (ss.unusable)
return true;
@@ -3887,7 +3887,7 @@ static bool data_segment_valid(struct kvm_vcpu *vcpu, int seg)
unsigned int rpl;
 
vmx_get_segment(vcpu, &var, seg);
-   rpl = var.selector & SELECTOR_RPL_MASK;
+   rpl = var.selector & SEGMENT_RPL_MASK;
 
if (var.unusable)
return true;
@@ -3914,7 +3914,7 @@ static bool tr_valid(struct kvm_vcpu *vcpu)
 
if (tr.unusable)
return false;
-   if (tr.selector & SELECTOR_TI_MASK) /* TI = 1 */
+   if (tr.selector & SEGMENT_TI_MASK)  /* TI = 1 */
return false;
if (tr.type != 3 && tr.type != 11) /* TODO: Check if guest is in IA32e mode */
return false;
@@ -3932,7 +3932,7 @@ static bool ldtr_valid(struct kvm_vcpu *vcpu)
 
if (ldtr.unusable)
return true;
-   if (ldtr.selector & SELECTOR_TI_MASK)   /* TI = 1 */
+   if (ldtr.selector & SEGMENT_TI_MASK)/* TI = 1 */
return false;
if (ldtr.type != 2)
return false;
@@ -3949,8 +3949,8 @@ static bool cs_ss_rpl_check(struct kvm_vcpu *vcpu)
vmx_get_segment(vcpu, &cs, VCPU_SREG_CS);
vmx_get_segment(vcpu, &ss, VCPU_SREG_SS);
 
-   return ((cs.selector & SELECTOR_RPL_MASK) ==
-(ss.selector & SELECTOR_RPL_MASK));
+   return ((cs.selector & SEGMENT_RPL_MASK) ==
+(ss.selector & SEGMENT_RPL_MASK));
 }
 
 /*
-- 
1.9.1


[PATCH 1/2] KVM: x86: removing redundant eflags bits definitions

2015-03-29 Thread Nadav Amit
The eflags are redefined (using other defines) in emulate.c.
Use the definitions from processor-flags.h, as some mess has already started.
No functional change.

Signed-off-by: Nadav Amit na...@cs.technion.ac.il
---
 arch/x86/include/asm/kvm_host.h |   2 -
 arch/x86/kvm/emulate.c  | 105 ++--
 2 files changed, 46 insertions(+), 61 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index bf5a160..7ba3d9d 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -84,8 +84,6 @@ static inline gfn_t gfn_to_index(gfn_t gfn, gfn_t base_gfn, int level)
 #define SELECTOR_TI_MASK (1 << 2)
 #define SELECTOR_RPL_MASK 0x03
 
-#define IOPL_SHIFT 12
-
 #define KVM_PERMILLE_MMU_PAGES 20
 #define KVM_MIN_ALLOC_MMU_PAGES 64
 #define KVM_MMU_HASH_SHIFT 10
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index c941abe..9a578a1 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -248,27 +248,7 @@ struct mode_dual {
struct opcode mode64;
 };
 
-/* EFLAGS bit definitions. */
-#define EFLG_ID (1<<21)
-#define EFLG_VIP (1<<20)
-#define EFLG_VIF (1<<19)
-#define EFLG_AC (1<<18)
-#define EFLG_VM (1<<17)
-#define EFLG_RF (1<<16)
-#define EFLG_IOPL (3<<12)
-#define EFLG_NT (1<<14)
-#define EFLG_OF (1<<11)
-#define EFLG_DF (1<<10)
-#define EFLG_IF (1<<9)
-#define EFLG_TF (1<<8)
-#define EFLG_SF (1<<7)
-#define EFLG_ZF (1<<6)
-#define EFLG_AF (1<<4)
-#define EFLG_PF (1<<2)
-#define EFLG_CF (1<<0)
-
 #define EFLG_RESERVED_ZEROS_MASK 0xffc0802a
-#define EFLG_RESERVED_ONE_MASK 2
 
 enum x86_transfer_type {
X86_TRANSFER_NONE,
@@ -317,7 +297,8 @@ static void invalidate_registers(struct x86_emulate_ctxt *ctxt)
  * These EFLAGS bits are restored from saved value during emulation, and
  * any changes are written back to the saved value after emulation.
  */
-#define EFLAGS_MASK (EFLG_OF|EFLG_SF|EFLG_ZF|EFLG_AF|EFLG_PF|EFLG_CF)
+#define EFLAGS_MASK (X86_EFLAGS_OF|X86_EFLAGS_SF|X86_EFLAGS_ZF|X86_EFLAGS_AF|\
+X86_EFLAGS_PF|X86_EFLAGS_CF)
 
 #ifdef CONFIG_X86_64
 #define ON64(x) x
@@ -1399,7 +1380,7 @@ static int pio_in_emulated(struct x86_emulate_ctxt *ctxt,
unsigned int in_page, n;
unsigned int count = ctxt->rep_prefix ?
address_mask(ctxt, reg_read(ctxt, VCPU_REGS_RCX)) : 1;
-   in_page = (ctxt->eflags & EFLG_DF) ?
+   in_page = (ctxt->eflags & X86_EFLAGS_DF) ?
offset_in_page(reg_read(ctxt, VCPU_REGS_RDI)) :
PAGE_SIZE - offset_in_page(reg_read(ctxt, VCPU_REGS_RDI));
n = min3(in_page, (unsigned int)sizeof(rc->data) / size, count);
@@ -1412,7 +1393,7 @@ static int pio_in_emulated(struct x86_emulate_ctxt *ctxt,
}
 
if (ctxt->rep_prefix && (ctxt->d & String) &&
-   !(ctxt->eflags & EFLG_DF)) {
+   !(ctxt->eflags & X86_EFLAGS_DF)) {
ctxt->dst.data = rc->data + rc->pos;
ctxt->dst.type = OP_MEM_STR;
ctxt->dst.count = (rc->end - rc->pos) / size;
@@ -1792,32 +1773,34 @@ static int emulate_popf(struct x86_emulate_ctxt *ctxt,
 {
int rc;
unsigned long val, change_mask;
-   int iopl = (ctxt->eflags & X86_EFLAGS_IOPL) >> IOPL_SHIFT;
+   int iopl = (ctxt->eflags & X86_EFLAGS_IOPL) >> X86_EFLAGS_IOPL_BIT;
int cpl = ctxt->ops->cpl(ctxt);
 
rc = emulate_pop(ctxt, &val, len);
if (rc != X86EMUL_CONTINUE)
return rc;
 
-   change_mask = EFLG_CF | EFLG_PF | EFLG_AF | EFLG_ZF | EFLG_SF | EFLG_OF
-   | EFLG_TF | EFLG_DF | EFLG_NT | EFLG_AC | EFLG_ID;
+   change_mask = X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF |
+ X86_EFLAGS_ZF | X86_EFLAGS_SF | X86_EFLAGS_OF |
+ X86_EFLAGS_TF | X86_EFLAGS_DF | X86_EFLAGS_NT |
+ X86_EFLAGS_AC | X86_EFLAGS_ID;
 
switch(ctxt->mode) {
case X86EMUL_MODE_PROT64:
case X86EMUL_MODE_PROT32:
case X86EMUL_MODE_PROT16:
if (cpl == 0)
-   change_mask |= EFLG_IOPL;
+   change_mask |= X86_EFLAGS_IOPL;
if (cpl <= iopl)
-   change_mask |= EFLG_IF;
+   change_mask |= X86_EFLAGS_IF;
break;
case X86EMUL_MODE_VM86:
if (iopl < 3)
return emulate_gp(ctxt, 0);
-   change_mask |= EFLG_IF;
+   change_mask |= X86_EFLAGS_IF;
break;
default: /* real mode */
-   change_mask |= (EFLG_IOPL | EFLG_IF);
+   change_mask |= (X86_EFLAGS_IOPL | X86_EFLAGS_IF);
break;
}
 
@@ -1918,7 +1901,7 @@ static int em_pusha(struct x86_emulate_ctxt *ctxt)
 
 static int em_pushf(struct x86_emulate_ctxt *ctxt)
 {
-   ctxt-src.val = (unsigned long)ctxt-eflags  ~EFLG_VM;
+   ctxt-src.val = (unsigned long)ctxt-eflags  

[PATCH 0/2] KVM: x86: Removing redundant definitions from emulator

2015-03-29 Thread Nadav Amit
There are several redundant definitions in processor-flags.h and emulator.c.

Slowly, but surely they will get mixed, so removing those of emulator.c seems
like a reasonable move (unless I am missing something, e.g., kvm-kmod
consideration).

Nadav Amit (2):
  KVM: x86: removing redundant eflags bits definitions
  KVM: x86: Remove redundant definitions

 arch/x86/include/asm/kvm_host.h |   5 --
 arch/x86/kvm/emulate.c  | 111 ++--
 arch/x86/kvm/vmx.c  |  18 +++
 3 files changed, 58 insertions(+), 76 deletions(-)

-- 
1.9.1


[PATCH 2/3] KVM: nVMX: add missing checks for address bits beyond physical address width on VM-entry

2015-03-29 Thread Eugene Korenevsky
On each VM-entry, the CPU should check the following VMCS fields for zero
bits beyond the physical address width:
-  APIC-access address
-  virtual-APIC address
-  posted-interrupt descriptor address
This patch adds these checks, which are required by the Intel SDM.
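
All three checks use the same pattern; as an illustration only (the patch
open-codes the test rather than adding a helper), an address is malformed
when any bit at or above the guest's reported MAXPHYADDR is set:

    /* Illustrative helper, not part of the patch. */
    static inline bool addr_has_reserved_bits(u64 addr, int maxphyaddr)
    {
            return (addr >> maxphyaddr) != 0;
    }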

Signed-off-by: Eugene Korenevsky ekorenev...@gmail.com
---
 arch/x86/kvm/vmx.c | 13 +++--
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index f7b20b4..b3dcf73 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -8616,10 +8616,11 @@ static bool nested_get_vmcs12_pages(struct kvm_vcpu *vcpu,
struct vmcs12 *vmcs12)
 {
struct vcpu_vmx *vmx = to_vmx(vcpu);
+   int maxphyaddr = cpuid_maxphyaddr(vcpu);
 
if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) {
-   /* TODO: Also verify bits beyond physical address width are 0 */
-   if (!PAGE_ALIGNED(vmcs12->apic_access_addr))
+   if (!PAGE_ALIGNED(vmcs12->apic_access_addr) ||
+   vmcs12->apic_access_addr >> maxphyaddr)
return false;
 
/*
@@ -8635,8 +8636,8 @@ static bool nested_get_vmcs12_pages(struct kvm_vcpu *vcpu,
}
 
if (nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) {
-   /* TODO: Also verify bits beyond physical address width are 0 */
-   if (!PAGE_ALIGNED(vmcs12->virtual_apic_page_addr))
+   if (!PAGE_ALIGNED(vmcs12->virtual_apic_page_addr) ||
+   vmcs12->virtual_apic_page_addr >> maxphyaddr)
return false;
 
if (vmx->nested.virtual_apic_page) /* shouldn't happen */
@@ -8659,7 +8660,8 @@ static bool nested_get_vmcs12_pages(struct kvm_vcpu *vcpu,
}
 
if (nested_cpu_has_posted_intr(vmcs12)) {
-   if (!IS_ALIGNED(vmcs12->posted_intr_desc_addr, 64))
+   if (!IS_ALIGNED(vmcs12->posted_intr_desc_addr, 64) ||
+   vmcs12->posted_intr_desc_addr >> maxphyaddr)
return false;
 
if (vmx->nested.pi_desc_page) { /* shouldn't happen */
@@ -9379,7 +9381,6 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
}
 
if (!nested_get_vmcs12_pages(vcpu, vmcs12)) {
-   /*TODO: Also verify bits beyond physical address width are 0*/
nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
return 1;
}
-- 
2.0.5


[PATCH 1/3] KVM: x86: optimization: cache physical address width to avoid excessive enumerations of CPUID entries

2015-03-29 Thread Eugene Korenevsky
cpuid_maxphyaddr(), which performs a lot of memory accesses, is called
extensively across KVM, especially in the nVMX code.
This patch adds a cached maxphyaddr value to vcpu.arch to reduce the
pressure on the CPU cache and to simplify the code of cpuid_maxphyaddr()
callers. The cached value is initialized in kvm_arch_vcpu_init() and
reloaded every time CPUID is updated by usermode; these reloads obviously
occur infrequently.
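
In other words (a sketch using the names from the diff below, not a new
API), the expensive CPUID enumeration now runs only on the update path, and
readers just fetch the cached field:

    /* slow path: walks CPUID entries, runs from kvm_update_cpuid() */
    vcpu->arch.maxphyaddr = cpuid_query_maxphyaddr(vcpu);

    /* fast path: cpuid_maxphyaddr() is now a plain field read */
    int maxphyaddr = cpuid_maxphyaddr(vcpu);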

Signed-off-by: Eugene Korenevsky ekorenev...@gmail.com
---
 arch/x86/include/asm/kvm_host.h |  4 +++-
 arch/x86/kvm/cpuid.c| 33 ++---
 arch/x86/kvm/cpuid.h|  6 ++
 arch/x86/kvm/x86.c  |  2 ++
 4 files changed, 29 insertions(+), 16 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index a236e39..2362a60 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -431,6 +431,9 @@ struct kvm_vcpu_arch {
 
int cpuid_nent;
struct kvm_cpuid_entry2 cpuid_entries[KVM_MAX_CPUID_ENTRIES];
+
+   int maxphyaddr;
+
/* emulate context */
 
struct x86_emulate_ctxt emulate_ctxt;
@@ -1128,7 +1131,6 @@ int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end)
 int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end);
 int kvm_test_age_hva(struct kvm *kvm, unsigned long hva);
 void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte);
-int cpuid_maxphyaddr(struct kvm_vcpu *vcpu);
 int kvm_cpu_has_injectable_intr(struct kvm_vcpu *v);
 int kvm_cpu_has_interrupt(struct kvm_vcpu *vcpu);
 int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu);
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index 8a80737..59b69f6 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -104,6 +104,9 @@ int kvm_update_cpuid(struct kvm_vcpu *vcpu)
((best->eax & 0xff00) >> 8) != 0)
return -EINVAL;
 
+   /* Update physical-address width */
+   vcpu->arch.maxphyaddr = cpuid_query_maxphyaddr(vcpu);
+
kvm_pmu_cpuid_update(vcpu);
return 0;
 }
@@ -135,6 +138,21 @@ static void cpuid_fix_nx_cap(struct kvm_vcpu *vcpu)
}
 }
 
+int cpuid_query_maxphyaddr(struct kvm_vcpu *vcpu)
+{
+   struct kvm_cpuid_entry2 *best;
+
+   best = kvm_find_cpuid_entry(vcpu, 0x80000000, 0);
+   if (!best || best->eax < 0x80000008)
+   goto not_found;
+   best = kvm_find_cpuid_entry(vcpu, 0x80000008, 0);
+   if (best)
+   return best->eax & 0xff;
+not_found:
+   return 36;
+}
+EXPORT_SYMBOL_GPL(cpuid_query_maxphyaddr);
+
 /* when an old userspace process fills a new kernel module */
 int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu,
 struct kvm_cpuid *cpuid,
@@ -757,21 +775,6 @@ struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu,
 }
 EXPORT_SYMBOL_GPL(kvm_find_cpuid_entry);
 
-int cpuid_maxphyaddr(struct kvm_vcpu *vcpu)
-{
-   struct kvm_cpuid_entry2 *best;
-
-   best = kvm_find_cpuid_entry(vcpu, 0x80000000, 0);
-   if (!best || best->eax < 0x80000008)
-   goto not_found;
-   best = kvm_find_cpuid_entry(vcpu, 0x80000008, 0);
-   if (best)
-   return best->eax & 0xff;
-not_found:
-   return 36;
-}
-EXPORT_SYMBOL_GPL(cpuid_maxphyaddr);
-
 /*
  * If no match is found, check whether we exceed the vCPU's limit
  * and return the content of the highest valid _standard_ leaf instead.
diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h
index 4452eed..78b61b4 100644
--- a/arch/x86/kvm/cpuid.h
+++ b/arch/x86/kvm/cpuid.h
@@ -20,6 +20,12 @@ int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu,
  struct kvm_cpuid_entry2 __user *entries);
 void kvm_cpuid(struct kvm_vcpu *vcpu, u32 *eax, u32 *ebx, u32 *ecx, u32 *edx);
 
+int cpuid_query_maxphyaddr(struct kvm_vcpu *vcpu);
+
+static inline int cpuid_maxphyaddr(struct kvm_vcpu *vcpu)
+{
+   return vcpu->arch.maxphyaddr;
+}
 
 static inline bool guest_cpuid_has_xsave(struct kvm_vcpu *vcpu)
 {
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index bd7a70b..084e1d5 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -7289,6 +7289,8 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
vcpu->arch.guest_supported_xcr0 = 0;
vcpu->arch.guest_xstate_size = XSAVE_HDR_SIZE + XSAVE_HDR_OFFSET;
 
+   vcpu->arch.maxphyaddr = cpuid_query_maxphyaddr(vcpu);
+
kvm_async_pf_hash_reset(vcpu);
kvm_pmu_init(vcpu);
 
-- 
2.0.5


[PATCH 3/3] KVM: nVMX: nested_vmx_check_msr_switch_controls simplification: remove unnecessary caching of physical address width

2015-03-29 Thread Eugene Korenevsky
After the speed-up of cpuid_maxphyaddr() it can be called cheaply: instead
of a heavy enumeration of CPUID entries it now returns a cached,
pre-computed value, and it is inlined as well. So caching its result became
unnecessary and can be removed.

Signed-off-by: Eugene Korenevsky ekorenev...@gmail.com
---
 arch/x86/kvm/vmx.c | 14 ++
 1 file changed, 6 insertions(+), 8 deletions(-)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index b3dcf73..1cd2837 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -8860,9 +8860,9 @@ static int nested_vmx_check_apicv_controls(struct kvm_vcpu *vcpu,
 
 static int nested_vmx_check_msr_switch(struct kvm_vcpu *vcpu,
   unsigned long count_field,
-  unsigned long addr_field,
-  int maxphyaddr)
+  unsigned long addr_field)
 {
+   int maxphyaddr;
u64 count, addr;
 
if (vmcs12_read_any(vcpu, count_field, &count) ||
@@ -8872,6 +8872,7 @@ static int nested_vmx_check_msr_switch(struct kvm_vcpu *vcpu,
}
if (count == 0)
return 0;
+   maxphyaddr = cpuid_maxphyaddr(vcpu);
if (!IS_ALIGNED(addr, 16) || addr >> maxphyaddr ||
(addr + count * sizeof(struct vmx_msr_entry) - 1) >> maxphyaddr) {
pr_warn_ratelimited(
@@ -8885,19 +8886,16 @@ static int nested_vmx_check_msr_switch(struct kvm_vcpu *vcpu,
 static int nested_vmx_check_msr_switch_controls(struct kvm_vcpu *vcpu,
struct vmcs12 *vmcs12)
 {
-   int maxphyaddr;
-
if (vmcs12->vm_exit_msr_load_count == 0 &&
vmcs12->vm_exit_msr_store_count == 0 &&
vmcs12->vm_entry_msr_load_count == 0)
return 0; /* Fast path */
-   maxphyaddr = cpuid_maxphyaddr(vcpu);
if (nested_vmx_check_msr_switch(vcpu, VM_EXIT_MSR_LOAD_COUNT,
-   VM_EXIT_MSR_LOAD_ADDR, maxphyaddr) ||
+   VM_EXIT_MSR_LOAD_ADDR) ||
nested_vmx_check_msr_switch(vcpu, VM_EXIT_MSR_STORE_COUNT,
-   VM_EXIT_MSR_STORE_ADDR, maxphyaddr) ||
+   VM_EXIT_MSR_STORE_ADDR) ||
nested_vmx_check_msr_switch(vcpu, VM_ENTRY_MSR_LOAD_COUNT,
-   VM_ENTRY_MSR_LOAD_ADDR, maxphyaddr))
+   VM_ENTRY_MSR_LOAD_ADDR))
return -EINVAL;
return 0;
 }
-- 
2.0.5
