Re: [PATCH 2/2] KVM: PPC: Remove page table walk helpers
On Mon, 2015-03-30 at 10:39 +0530, Aneesh Kumar K.V wrote: This patch remove helpers which we had used only once in the code. Limiting page table walk variants help in ensuring that we won't end up with code walking page table with wrong assumptions. Signed-off-by: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com Alex, it would be preferable to have this (and the previous one) in the powerpc tree due to dependencies with further fixes to our page table walking vs. THP, so if you're happy, just give us an ack. Cheers, Ben. --- arch/powerpc/include/asm/pgtable.h | 21 - arch/powerpc/kvm/book3s_hv_rm_mmu.c | 62 - arch/powerpc/kvm/e500_mmu_host.c| 2 +- 3 files changed, 28 insertions(+), 57 deletions(-) diff --git a/arch/powerpc/include/asm/pgtable.h b/arch/powerpc/include/asm/pgtable.h index 9835ac4173b7..92fe01c355a9 100644 --- a/arch/powerpc/include/asm/pgtable.h +++ b/arch/powerpc/include/asm/pgtable.h @@ -249,27 +249,6 @@ extern int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr, #endif pte_t *find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea, unsigned *shift); - -static inline pte_t *lookup_linux_ptep(pgd_t *pgdir, unsigned long hva, - unsigned long *pte_sizep) -{ - pte_t *ptep; - unsigned long ps = *pte_sizep; - unsigned int shift; - - ptep = find_linux_pte_or_hugepte(pgdir, hva, shift); - if (!ptep) - return NULL; - if (shift) - *pte_sizep = 1ul shift; - else - *pte_sizep = PAGE_SIZE; - - if (ps *pte_sizep) - return NULL; - - return ptep; -} #endif /* __ASSEMBLY__ */ #endif /* __KERNEL__ */ diff --git a/arch/powerpc/kvm/book3s_hv_rm_mmu.c b/arch/powerpc/kvm/book3s_hv_rm_mmu.c index 625407e4d3b0..73e083cb9f7e 100644 --- a/arch/powerpc/kvm/book3s_hv_rm_mmu.c +++ b/arch/powerpc/kvm/book3s_hv_rm_mmu.c @@ -131,25 +131,6 @@ static void remove_revmap_chain(struct kvm *kvm, long pte_index, unlock_rmap(rmap); } -static pte_t lookup_linux_pte_and_update(pgd_t *pgdir, unsigned long hva, - int writing, unsigned long *pte_sizep) -{ - pte_t *ptep; - 
unsigned long ps = *pte_sizep; - unsigned int hugepage_shift; - - ptep = find_linux_pte_or_hugepte(pgdir, hva, hugepage_shift); - if (!ptep) - return __pte(0); - if (hugepage_shift) - *pte_sizep = 1ul hugepage_shift; - else - *pte_sizep = PAGE_SIZE; - if (ps *pte_sizep) - return __pte(0); - return kvmppc_read_update_linux_pte(ptep, writing, hugepage_shift); -} - static inline void unlock_hpte(__be64 *hpte, unsigned long hpte_v) { asm volatile(PPC_RELEASE_BARRIER : : : memory); @@ -166,10 +147,10 @@ long kvmppc_do_h_enter(struct kvm *kvm, unsigned long flags, struct revmap_entry *rev; unsigned long g_ptel; struct kvm_memory_slot *memslot; - unsigned long pte_size; + unsigned hpage_shift; unsigned long is_io; unsigned long *rmap; - pte_t pte; + pte_t *ptep; unsigned int writing; unsigned long mmu_seq; unsigned long rcbits; @@ -208,22 +189,33 @@ long kvmppc_do_h_enter(struct kvm *kvm, unsigned long flags, /* Translate to host virtual address */ hva = __gfn_to_hva_memslot(memslot, gfn); + ptep = find_linux_pte_or_hugepte(pgdir, hva, hpage_shift); + if (ptep) { + pte_t pte; + unsigned int host_pte_size; - /* Look up the Linux PTE for the backing page */ - pte_size = psize; - pte = lookup_linux_pte_and_update(pgdir, hva, writing, pte_size); - if (pte_present(pte) !pte_protnone(pte)) { - if (writing !pte_write(pte)) - /* make the actual HPTE be read-only */ - ptel = hpte_make_readonly(ptel); - is_io = hpte_cache_bits(pte_val(pte)); - pa = pte_pfn(pte) PAGE_SHIFT; - pa |= hva (pte_size - 1); - pa |= gpa ~PAGE_MASK; - } + if (hpage_shift) + host_pte_size = 1ul hpage_shift; + else + host_pte_size = PAGE_SIZE; + /* + * We should always find the guest page size + * to = host page size, if host is using hugepage + */ + if (host_pte_size psize) + return H_PARAMETER; - if (pte_size psize) - return H_PARAMETER; + pte = kvmppc_read_update_linux_pte(ptep, writing, hpage_shift); + if (pte_present(pte) !pte_protnone(pte)) { + if (writing !pte_write(pte)) + /* make the actual HPTE be 
read-only */ +
Re: [PATCH 2/2] KVM: PPC: Remove page table walk helpers
On Mon, 2015-03-30 at 10:39 +0530, Aneesh Kumar K.V wrote: This patch remove helpers which we had used only once in the code. Limiting page table walk variants help in ensuring that we won't end up with code walking page table with wrong assumptions. Signed-off-by: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com Alex, it would be preferable to have this (and the previous one) in the powerpc tree due to dependencies with further fixes to our page table walking vs. THP, so if you're happy, just give us an ack. Cheers, Ben. --- arch/powerpc/include/asm/pgtable.h | 21 - arch/powerpc/kvm/book3s_hv_rm_mmu.c | 62 - arch/powerpc/kvm/e500_mmu_host.c| 2 +- 3 files changed, 28 insertions(+), 57 deletions(-) diff --git a/arch/powerpc/include/asm/pgtable.h b/arch/powerpc/include/asm/pgtable.h index 9835ac4173b7..92fe01c355a9 100644 --- a/arch/powerpc/include/asm/pgtable.h +++ b/arch/powerpc/include/asm/pgtable.h @@ -249,27 +249,6 @@ extern int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr, #endif pte_t *find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea, unsigned *shift); - -static inline pte_t *lookup_linux_ptep(pgd_t *pgdir, unsigned long hva, - unsigned long *pte_sizep) -{ - pte_t *ptep; - unsigned long ps = *pte_sizep; - unsigned int shift; - - ptep = find_linux_pte_or_hugepte(pgdir, hva, shift); - if (!ptep) - return NULL; - if (shift) - *pte_sizep = 1ul shift; - else - *pte_sizep = PAGE_SIZE; - - if (ps *pte_sizep) - return NULL; - - return ptep; -} #endif /* __ASSEMBLY__ */ #endif /* __KERNEL__ */ diff --git a/arch/powerpc/kvm/book3s_hv_rm_mmu.c b/arch/powerpc/kvm/book3s_hv_rm_mmu.c index 625407e4d3b0..73e083cb9f7e 100644 --- a/arch/powerpc/kvm/book3s_hv_rm_mmu.c +++ b/arch/powerpc/kvm/book3s_hv_rm_mmu.c @@ -131,25 +131,6 @@ static void remove_revmap_chain(struct kvm *kvm, long pte_index, unlock_rmap(rmap); } -static pte_t lookup_linux_pte_and_update(pgd_t *pgdir, unsigned long hva, - int writing, unsigned long *pte_sizep) -{ - pte_t *ptep; - 
unsigned long ps = *pte_sizep; - unsigned int hugepage_shift; - - ptep = find_linux_pte_or_hugepte(pgdir, hva, hugepage_shift); - if (!ptep) - return __pte(0); - if (hugepage_shift) - *pte_sizep = 1ul hugepage_shift; - else - *pte_sizep = PAGE_SIZE; - if (ps *pte_sizep) - return __pte(0); - return kvmppc_read_update_linux_pte(ptep, writing, hugepage_shift); -} - static inline void unlock_hpte(__be64 *hpte, unsigned long hpte_v) { asm volatile(PPC_RELEASE_BARRIER : : : memory); @@ -166,10 +147,10 @@ long kvmppc_do_h_enter(struct kvm *kvm, unsigned long flags, struct revmap_entry *rev; unsigned long g_ptel; struct kvm_memory_slot *memslot; - unsigned long pte_size; + unsigned hpage_shift; unsigned long is_io; unsigned long *rmap; - pte_t pte; + pte_t *ptep; unsigned int writing; unsigned long mmu_seq; unsigned long rcbits; @@ -208,22 +189,33 @@ long kvmppc_do_h_enter(struct kvm *kvm, unsigned long flags, /* Translate to host virtual address */ hva = __gfn_to_hva_memslot(memslot, gfn); + ptep = find_linux_pte_or_hugepte(pgdir, hva, hpage_shift); + if (ptep) { + pte_t pte; + unsigned int host_pte_size; - /* Look up the Linux PTE for the backing page */ - pte_size = psize; - pte = lookup_linux_pte_and_update(pgdir, hva, writing, pte_size); - if (pte_present(pte) !pte_protnone(pte)) { - if (writing !pte_write(pte)) - /* make the actual HPTE be read-only */ - ptel = hpte_make_readonly(ptel); - is_io = hpte_cache_bits(pte_val(pte)); - pa = pte_pfn(pte) PAGE_SHIFT; - pa |= hva (pte_size - 1); - pa |= gpa ~PAGE_MASK; - } + if (hpage_shift) + host_pte_size = 1ul hpage_shift; + else + host_pte_size = PAGE_SIZE; + /* + * We should always find the guest page size + * to = host page size, if host is using hugepage + */ + if (host_pte_size psize) + return H_PARAMETER; - if (pte_size psize) - return H_PARAMETER; + pte = kvmppc_read_update_linux_pte(ptep, writing, hpage_shift); + if (pte_present(pte) !pte_protnone(pte)) { + if (writing !pte_write(pte)) + /* make the actual HPTE be 
read-only */ +
[PATCH] kvm: mmu: lazy collapse small sptes into large sptes
There are two scenarios for the requirement of collapsing small sptes into large sptes. - dirty logging tracks sptes in 4k granularity, so large sptes are splitted, the large sptes will be reallocated in the destination machine and the guest in the source machine will be destroyed when live migration successfully. However, the guest in the source machine will continue to run if live migration fail due to some reasons, the sptes still keep small which lead to bad performance. - our customers write tools to track the dirty speed of guests by EPT D bit/PML in order to determine the most appropriate one to be live migrated, however sptes will still keep small after tracking dirty speed. This patch introduce lazy collapse small sptes into large sptes, the memory region will be scanned on the ioctl context when dirty log is stopped, the ones which can be collapsed into large pages will be dropped during the scan, it depends the on later #PF to reallocate all large sptes. Signed-off-by: Wanpeng Li wanpeng...@linux.intel.com --- arch/x86/include/asm/kvm_host.h | 2 ++ arch/x86/kvm/mmu.c | 66 + arch/x86/kvm/x86.c | 5 3 files changed, 73 insertions(+) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index a236e39..73de5d3 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -859,6 +859,8 @@ void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask, void kvm_mmu_reset_context(struct kvm_vcpu *vcpu); void kvm_mmu_slot_remove_write_access(struct kvm *kvm, struct kvm_memory_slot *memslot); +void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm, + struct kvm_memory_slot *memslot); void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm, struct kvm_memory_slot *memslot); void kvm_mmu_slot_largepage_remove_write_access(struct kvm *kvm, diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index cee7592..d25ced1 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -4465,6 +4465,72 @@ void 
kvm_mmu_slot_remove_write_access(struct kvm *kvm, kvm_flush_remote_tlbs(kvm); } +static int kvm_mmu_zap_collapsible_spte(struct kvm *kvm, + unsigned long *rmapp) +{ + u64 *sptep; + struct rmap_iterator iter; + int need_tlb_flush = 0; + pfn_t pfn; + struct kvm_mmu_page *sp; + + for (sptep = rmap_get_first(*rmapp, iter); sptep;) { + BUG_ON(!(*sptep PT_PRESENT_MASK)); + + sp = page_header(__pa(sptep)); + pfn = spte_to_pfn(*sptep); + if (sp-role.direct + !kvm_is_reserved_pfn(pfn) + PageTransCompound(pfn_to_page(pfn))) { + drop_spte(kvm, sptep); + need_tlb_flush = 1; + } + sptep = rmap_get_next(iter); + } + + return need_tlb_flush; +} + +void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm, + struct kvm_memory_slot *memslot) +{ + bool flush = false; + unsigned long *rmapp; + unsigned long last_index, index; + gfn_t gfn_start, gfn_end; + + spin_lock(kvm-mmu_lock); + + gfn_start = memslot-base_gfn; + gfn_end = memslot-base_gfn + memslot-npages - 1; + + if (gfn_start = gfn_end) + goto out; + + rmapp = memslot-arch.rmap[0]; + last_index = gfn_to_index(gfn_end, memslot-base_gfn, + PT_PAGE_TABLE_LEVEL); + + for (index = 0; index = last_index; ++index, ++rmapp) { + if (*rmapp) + flush |= kvm_mmu_zap_collapsible_spte(kvm, rmapp); + + if (need_resched() || spin_needbreak(kvm-mmu_lock)) { + if (flush) { + kvm_flush_remote_tlbs(kvm); + flush = false; + } + cond_resched_lock(kvm-mmu_lock); + } + } + + if (flush) + kvm_flush_remote_tlbs(kvm); + +out: + spin_unlock(kvm-mmu_lock); +} + void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm, struct kvm_memory_slot *memslot) { diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index c5f7e03..6037389 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -7618,6 +7618,11 @@ void kvm_arch_commit_memory_region(struct kvm *kvm, /* It's OK to get 'new' slot here as it has already been installed */ new = id_to_memslot(kvm-memslots, mem-slot); + if ((change != KVM_MR_DELETE) + (old-flags KVM_MEM_LOG_DIRTY_PAGES) + !(new-flags 
KVM_MEM_LOG_DIRTY_PAGES)) + kvm_mmu_zap_collapsible_sptes(kvm, new); + /* * Set up write
[PATCH 1/2] KVM: PPC: Use READ_ONCE when dereferencing pte_t pointer
pte can get updated from other CPUs as part of multiple activities like THP split, huge page collapse, unmap. We need to make sure we don't reload the pte value again and again for different checks. Signed-off-by: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com --- Note: This is posted previously as part of http://article.gmane.org/gmane.linux.ports.ppc.embedded/79278 arch/powerpc/include/asm/kvm_book3s_64.h | 5 - arch/powerpc/kvm/e500_mmu_host.c | 20 2 files changed, 16 insertions(+), 9 deletions(-) diff --git a/arch/powerpc/include/asm/kvm_book3s_64.h b/arch/powerpc/include/asm/kvm_book3s_64.h index cc073a7ac2b7..f06820c67175 100644 --- a/arch/powerpc/include/asm/kvm_book3s_64.h +++ b/arch/powerpc/include/asm/kvm_book3s_64.h @@ -290,7 +290,10 @@ static inline pte_t kvmppc_read_update_linux_pte(pte_t *ptep, int writing, pte_t old_pte, new_pte = __pte(0); while (1) { - old_pte = *ptep; + /* +* Make sure we don't reload from ptep +*/ + old_pte = READ_ONCE(*ptep); /* * wait until _PAGE_BUSY is clear then set it atomically */ diff --git a/arch/powerpc/kvm/e500_mmu_host.c b/arch/powerpc/kvm/e500_mmu_host.c index cc536d4a75ef..5840d546aa03 100644 --- a/arch/powerpc/kvm/e500_mmu_host.c +++ b/arch/powerpc/kvm/e500_mmu_host.c @@ -469,14 +469,18 @@ static inline int kvmppc_e500_shadow_map(struct kvmppc_vcpu_e500 *vcpu_e500, pgdir = vcpu_e500-vcpu.arch.pgdir; ptep = lookup_linux_ptep(pgdir, hva, tsize_pages); - if (pte_present(*ptep)) - wimg = (*ptep PTE_WIMGE_SHIFT) MAS2_WIMGE_MASK; - else { - if (printk_ratelimit()) - pr_err(%s: pte not present: gfn %lx, pfn %lx\n, - __func__, (long)gfn, pfn); - ret = -EINVAL; - goto out; + if (ptep) { + pte_t pte = READ_ONCE(*ptep); + + if (pte_present(pte)) + wimg = (pte_val(pte) PTE_WIMGE_SHIFT) + MAS2_WIMGE_MASK; + else { + pr_err_ratelimited(%s: pte not present: gfn %lx,pfn %lx\n, + __func__, (long)gfn, pfn); + ret = -EINVAL; + goto out; + } } kvmppc_e500_ref_setup(ref, gtlbe, pfn, wimg); -- 2.1.0 -- To unsubscribe from this list: 
send the line "unsubscribe kvm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 1/2] KVM: PPC: Use READ_ONCE when dereferencing pte_t pointer
pte can get updated from other CPUs as part of multiple activities like THP split, huge page collapse, unmap. We need to make sure we don't reload the pte value again and again for different checks. Signed-off-by: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com --- Note: This is posted previously as part of http://article.gmane.org/gmane.linux.ports.ppc.embedded/79278 arch/powerpc/include/asm/kvm_book3s_64.h | 5 - arch/powerpc/kvm/e500_mmu_host.c | 20 2 files changed, 16 insertions(+), 9 deletions(-) diff --git a/arch/powerpc/include/asm/kvm_book3s_64.h b/arch/powerpc/include/asm/kvm_book3s_64.h index cc073a7ac2b7..f06820c67175 100644 --- a/arch/powerpc/include/asm/kvm_book3s_64.h +++ b/arch/powerpc/include/asm/kvm_book3s_64.h @@ -290,7 +290,10 @@ static inline pte_t kvmppc_read_update_linux_pte(pte_t *ptep, int writing, pte_t old_pte, new_pte = __pte(0); while (1) { - old_pte = *ptep; + /* +* Make sure we don't reload from ptep +*/ + old_pte = READ_ONCE(*ptep); /* * wait until _PAGE_BUSY is clear then set it atomically */ diff --git a/arch/powerpc/kvm/e500_mmu_host.c b/arch/powerpc/kvm/e500_mmu_host.c index cc536d4a75ef..5840d546aa03 100644 --- a/arch/powerpc/kvm/e500_mmu_host.c +++ b/arch/powerpc/kvm/e500_mmu_host.c @@ -469,14 +469,18 @@ static inline int kvmppc_e500_shadow_map(struct kvmppc_vcpu_e500 *vcpu_e500, pgdir = vcpu_e500-vcpu.arch.pgdir; ptep = lookup_linux_ptep(pgdir, hva, tsize_pages); - if (pte_present(*ptep)) - wimg = (*ptep PTE_WIMGE_SHIFT) MAS2_WIMGE_MASK; - else { - if (printk_ratelimit()) - pr_err(%s: pte not present: gfn %lx, pfn %lx\n, - __func__, (long)gfn, pfn); - ret = -EINVAL; - goto out; + if (ptep) { + pte_t pte = READ_ONCE(*ptep); + + if (pte_present(pte)) + wimg = (pte_val(pte) PTE_WIMGE_SHIFT) + MAS2_WIMGE_MASK; + else { + pr_err_ratelimited(%s: pte not present: gfn %lx,pfn %lx\n, + __func__, (long)gfn, pfn); + ret = -EINVAL; + goto out; + } } kvmppc_e500_ref_setup(ref, gtlbe, pfn, wimg); -- 2.1.0 -- To unsubscribe from this list: 
send the line "unsubscribe kvm-ppc" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 2/2] KVM: PPC: Remove page table walk helpers
This patch remove helpers which we had used only once in the code. Limiting page table walk variants help in ensuring that we won't end up with code walking page table with wrong assumptions. Signed-off-by: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com --- arch/powerpc/include/asm/pgtable.h | 21 - arch/powerpc/kvm/book3s_hv_rm_mmu.c | 62 - arch/powerpc/kvm/e500_mmu_host.c| 2 +- 3 files changed, 28 insertions(+), 57 deletions(-) diff --git a/arch/powerpc/include/asm/pgtable.h b/arch/powerpc/include/asm/pgtable.h index 9835ac4173b7..92fe01c355a9 100644 --- a/arch/powerpc/include/asm/pgtable.h +++ b/arch/powerpc/include/asm/pgtable.h @@ -249,27 +249,6 @@ extern int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr, #endif pte_t *find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea, unsigned *shift); - -static inline pte_t *lookup_linux_ptep(pgd_t *pgdir, unsigned long hva, -unsigned long *pte_sizep) -{ - pte_t *ptep; - unsigned long ps = *pte_sizep; - unsigned int shift; - - ptep = find_linux_pte_or_hugepte(pgdir, hva, shift); - if (!ptep) - return NULL; - if (shift) - *pte_sizep = 1ul shift; - else - *pte_sizep = PAGE_SIZE; - - if (ps *pte_sizep) - return NULL; - - return ptep; -} #endif /* __ASSEMBLY__ */ #endif /* __KERNEL__ */ diff --git a/arch/powerpc/kvm/book3s_hv_rm_mmu.c b/arch/powerpc/kvm/book3s_hv_rm_mmu.c index 625407e4d3b0..73e083cb9f7e 100644 --- a/arch/powerpc/kvm/book3s_hv_rm_mmu.c +++ b/arch/powerpc/kvm/book3s_hv_rm_mmu.c @@ -131,25 +131,6 @@ static void remove_revmap_chain(struct kvm *kvm, long pte_index, unlock_rmap(rmap); } -static pte_t lookup_linux_pte_and_update(pgd_t *pgdir, unsigned long hva, - int writing, unsigned long *pte_sizep) -{ - pte_t *ptep; - unsigned long ps = *pte_sizep; - unsigned int hugepage_shift; - - ptep = find_linux_pte_or_hugepte(pgdir, hva, hugepage_shift); - if (!ptep) - return __pte(0); - if (hugepage_shift) - *pte_sizep = 1ul hugepage_shift; - else - *pte_sizep = PAGE_SIZE; - if (ps *pte_sizep) - 
return __pte(0); - return kvmppc_read_update_linux_pte(ptep, writing, hugepage_shift); -} - static inline void unlock_hpte(__be64 *hpte, unsigned long hpte_v) { asm volatile(PPC_RELEASE_BARRIER : : : memory); @@ -166,10 +147,10 @@ long kvmppc_do_h_enter(struct kvm *kvm, unsigned long flags, struct revmap_entry *rev; unsigned long g_ptel; struct kvm_memory_slot *memslot; - unsigned long pte_size; + unsigned hpage_shift; unsigned long is_io; unsigned long *rmap; - pte_t pte; + pte_t *ptep; unsigned int writing; unsigned long mmu_seq; unsigned long rcbits; @@ -208,22 +189,33 @@ long kvmppc_do_h_enter(struct kvm *kvm, unsigned long flags, /* Translate to host virtual address */ hva = __gfn_to_hva_memslot(memslot, gfn); + ptep = find_linux_pte_or_hugepte(pgdir, hva, hpage_shift); + if (ptep) { + pte_t pte; + unsigned int host_pte_size; - /* Look up the Linux PTE for the backing page */ - pte_size = psize; - pte = lookup_linux_pte_and_update(pgdir, hva, writing, pte_size); - if (pte_present(pte) !pte_protnone(pte)) { - if (writing !pte_write(pte)) - /* make the actual HPTE be read-only */ - ptel = hpte_make_readonly(ptel); - is_io = hpte_cache_bits(pte_val(pte)); - pa = pte_pfn(pte) PAGE_SHIFT; - pa |= hva (pte_size - 1); - pa |= gpa ~PAGE_MASK; - } + if (hpage_shift) + host_pte_size = 1ul hpage_shift; + else + host_pte_size = PAGE_SIZE; + /* +* We should always find the guest page size +* to = host page size, if host is using hugepage +*/ + if (host_pte_size psize) + return H_PARAMETER; - if (pte_size psize) - return H_PARAMETER; + pte = kvmppc_read_update_linux_pte(ptep, writing, hpage_shift); + if (pte_present(pte) !pte_protnone(pte)) { + if (writing !pte_write(pte)) + /* make the actual HPTE be read-only */ + ptel = hpte_make_readonly(ptel); + is_io = hpte_cache_bits(pte_val(pte)); + pa = pte_pfn(pte) PAGE_SHIFT; + pa |= hva (host_pte_size - 1); + pa |= gpa
RE: [v3 24/26] KVM: Update Posted-Interrupts Descriptor when vCPU is blocked
-Original Message- From: Marcelo Tosatti [mailto:mtosa...@redhat.com] Sent: Saturday, March 28, 2015 3:30 AM To: Wu, Feng Cc: h...@zytor.com; t...@linutronix.de; mi...@redhat.com; x...@kernel.org; g...@kernel.org; pbonz...@redhat.com; dw...@infradead.org; j...@8bytes.org; alex.william...@redhat.com; jiang@linux.intel.com; eric.au...@linaro.org; linux-ker...@vger.kernel.org; io...@lists.linux-foundation.org; kvm@vger.kernel.org Subject: Re: [v3 24/26] KVM: Update Posted-Interrupts Descriptor when vCPU is blocked On Fri, Mar 27, 2015 at 06:34:14AM +, Wu, Feng wrote: Currently, the following code is executed before local_irq_disable() is called, so do you mean 1)moving local_irq_disable() to the place before it. 2) after interrupt is disabled, set KVM_REQ_EVENT in case the ON bit is set? 2) after interrupt is disabled, set KVM_REQ_EVENT in case the ON bit is set. Here is my understanding about your comments here: - Disable interrupts - Check 'ON' - Set KVM_REQ_EVENT if 'ON' is set Then we can put the above code inside if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win) just like it used to be. However, I still have some questions about this comment: 1. Where should I set KVM_REQ_EVENT? In function vcpu_enter_guest(), or other places? See below: If in vcpu_enter_guest(), since currently local_irq_disable() is called after 'KVM_REQ_EVENT' is checked, is it helpful to set KVM_REQ_EVENT after local_irq_disable() is called? local_irq_disable(); *** add code here *** So we need add code like the following here, right? if ('ON' is set) kvm_make_request(KVM_REQ_EVENT, vcpu); if (vcpu-mode == EXITING_GUEST_MODE || vcpu-requests ^^ || need_resched() || signal_pending(current)) { vcpu-mode = OUTSIDE_GUEST_MODE; smp_wmb(); local_irq_enable(); preempt_enable(); vcpu-srcu_idx = srcu_read_lock(vcpu-kvm-srcu); r = 1; goto cancel_injection; } 2. 'ON' is set by VT-d hardware, it can be set even when interrupt is disabled (the related bit in PIR is also set). 
Yes, we are checking if the HW has set an interrupt in PIR while outside VM (which requires PIR-VIRR transfer by software). If the interrupt it set by hardware after local_irq_disable(), VMX-entry will handle the interrupt and perform the PIR-VIRR transfer and reevaluate interrupts, injecting to guest if necessary, is that correct ? So does it make sense to check 'ON' and set KVM_REQ_EVENT accordingly after interrupt is disabled? To replace the costly +*/ + if (kvm_x86_ops-hwapic_irr_update) + kvm_x86_ops-hwapic_irr_update(vcpu, + kvm_lapic_find_highest_irr(vcpu)); Yes, i think so. After adding the checking ON and setting KVM_REQ_EVENT operations listed in my comments above, do you mean we still need to keep the costly code above inside if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win) {} in function vcpu_enter_guest() as it used to be? If yes, my question is what is the exact purpose of checking ON and setting KVM_REQ_EVENT operations? Here is the code flow in vcpu_enter_guest(): 1. Check KVM_REQ_EVENT, if it is set, sync pir-virr 2. Disable interrupts 3. Check ON and set KVM_REQ_EVENT -- Here, we set KVM_REQ_EVENT, but it is checked in the step 1, which means, we cannot get any benefits even we set it here, since the pir-virr sync operation was done in step 1, between step 3 and VM-Entry, we don't synchronize the pir to virr. So even we set KVM_REQ_EVENT here, the interrupts remaining in PIR cannot be delivered to guest during this VM-Entry, right? Thanks, Feng I might miss something in your comments, if so please point out. Thanks a lot! Thanks, Feng if (kvm_x86_ops-hwapic_irr_update) kvm_x86_ops-hwapic_irr_update(vcpu, kvm_lapic_find_highest_irr(vcpu)); kvm_lapic_find_highest_irr(vcpu) eats some cache (4 cachelines) versus 1 cacheline for reading ON bit. Please remove blocked and wakeup_cpu, they should not be necessary. 
Why do you think wakeup_cpu is not needed, when vCPU is blocked, wakeup_cpu saves the cpu which the vCPU is blocked on, after vCPU is woken up, it can run on a different cpu, so we need wakeup_cpu to find the right list to wake up the vCPU. If the vCPU was moved it should have updated IRTE destination field to the pCPU which it has moved to? Every time a vCPU is scheduled to a new pCPU, the IRTE destination filed would be updated accordingly. When vCPU is blocked. To wake
[PATCH 2/2] KVM: PPC: Remove page table walk helpers
This patch remove helpers which we had used only once in the code. Limiting page table walk variants help in ensuring that we won't end up with code walking page table with wrong assumptions. Signed-off-by: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com --- arch/powerpc/include/asm/pgtable.h | 21 - arch/powerpc/kvm/book3s_hv_rm_mmu.c | 62 - arch/powerpc/kvm/e500_mmu_host.c| 2 +- 3 files changed, 28 insertions(+), 57 deletions(-) diff --git a/arch/powerpc/include/asm/pgtable.h b/arch/powerpc/include/asm/pgtable.h index 9835ac4173b7..92fe01c355a9 100644 --- a/arch/powerpc/include/asm/pgtable.h +++ b/arch/powerpc/include/asm/pgtable.h @@ -249,27 +249,6 @@ extern int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr, #endif pte_t *find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea, unsigned *shift); - -static inline pte_t *lookup_linux_ptep(pgd_t *pgdir, unsigned long hva, -unsigned long *pte_sizep) -{ - pte_t *ptep; - unsigned long ps = *pte_sizep; - unsigned int shift; - - ptep = find_linux_pte_or_hugepte(pgdir, hva, shift); - if (!ptep) - return NULL; - if (shift) - *pte_sizep = 1ul shift; - else - *pte_sizep = PAGE_SIZE; - - if (ps *pte_sizep) - return NULL; - - return ptep; -} #endif /* __ASSEMBLY__ */ #endif /* __KERNEL__ */ diff --git a/arch/powerpc/kvm/book3s_hv_rm_mmu.c b/arch/powerpc/kvm/book3s_hv_rm_mmu.c index 625407e4d3b0..73e083cb9f7e 100644 --- a/arch/powerpc/kvm/book3s_hv_rm_mmu.c +++ b/arch/powerpc/kvm/book3s_hv_rm_mmu.c @@ -131,25 +131,6 @@ static void remove_revmap_chain(struct kvm *kvm, long pte_index, unlock_rmap(rmap); } -static pte_t lookup_linux_pte_and_update(pgd_t *pgdir, unsigned long hva, - int writing, unsigned long *pte_sizep) -{ - pte_t *ptep; - unsigned long ps = *pte_sizep; - unsigned int hugepage_shift; - - ptep = find_linux_pte_or_hugepte(pgdir, hva, hugepage_shift); - if (!ptep) - return __pte(0); - if (hugepage_shift) - *pte_sizep = 1ul hugepage_shift; - else - *pte_sizep = PAGE_SIZE; - if (ps *pte_sizep) - 
return __pte(0); - return kvmppc_read_update_linux_pte(ptep, writing, hugepage_shift); -} - static inline void unlock_hpte(__be64 *hpte, unsigned long hpte_v) { asm volatile(PPC_RELEASE_BARRIER : : : memory); @@ -166,10 +147,10 @@ long kvmppc_do_h_enter(struct kvm *kvm, unsigned long flags, struct revmap_entry *rev; unsigned long g_ptel; struct kvm_memory_slot *memslot; - unsigned long pte_size; + unsigned hpage_shift; unsigned long is_io; unsigned long *rmap; - pte_t pte; + pte_t *ptep; unsigned int writing; unsigned long mmu_seq; unsigned long rcbits; @@ -208,22 +189,33 @@ long kvmppc_do_h_enter(struct kvm *kvm, unsigned long flags, /* Translate to host virtual address */ hva = __gfn_to_hva_memslot(memslot, gfn); + ptep = find_linux_pte_or_hugepte(pgdir, hva, hpage_shift); + if (ptep) { + pte_t pte; + unsigned int host_pte_size; - /* Look up the Linux PTE for the backing page */ - pte_size = psize; - pte = lookup_linux_pte_and_update(pgdir, hva, writing, pte_size); - if (pte_present(pte) !pte_protnone(pte)) { - if (writing !pte_write(pte)) - /* make the actual HPTE be read-only */ - ptel = hpte_make_readonly(ptel); - is_io = hpte_cache_bits(pte_val(pte)); - pa = pte_pfn(pte) PAGE_SHIFT; - pa |= hva (pte_size - 1); - pa |= gpa ~PAGE_MASK; - } + if (hpage_shift) + host_pte_size = 1ul hpage_shift; + else + host_pte_size = PAGE_SIZE; + /* +* We should always find the guest page size +* to = host page size, if host is using hugepage +*/ + if (host_pte_size psize) + return H_PARAMETER; - if (pte_size psize) - return H_PARAMETER; + pte = kvmppc_read_update_linux_pte(ptep, writing, hpage_shift); + if (pte_present(pte) !pte_protnone(pte)) { + if (writing !pte_write(pte)) + /* make the actual HPTE be read-only */ + ptel = hpte_make_readonly(ptel); + is_io = hpte_cache_bits(pte_val(pte)); + pa = pte_pfn(pte) PAGE_SHIFT; + pa |= hva (host_pte_size - 1); + pa |= gpa
[PATCH 2/2] KVM: x86: Remove redundant definitions
Some constants are redfined in emulate.c. Avoid it. s/SELECTOR_RPL_MASK/SEGMENT_RPL_MASK s/SELECTOR_TI_MASK/SEGMENT_TI_MASK No functional change. Signed-off-by: Nadav Amit na...@cs.technion.ac.il --- arch/x86/include/asm/kvm_host.h | 3 --- arch/x86/kvm/emulate.c | 6 +++--- arch/x86/kvm/vmx.c | 18 +- 3 files changed, 12 insertions(+), 15 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 7ba3d9d..30b28dc 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -81,9 +81,6 @@ static inline gfn_t gfn_to_index(gfn_t gfn, gfn_t base_gfn, int level) (base_gfn KVM_HPAGE_GFN_SHIFT(level)); } -#define SELECTOR_TI_MASK (1 2) -#define SELECTOR_RPL_MASK 0x03 - #define KVM_PERMILLE_MMU_PAGES 20 #define KVM_MIN_ALLOC_MMU_PAGES 64 #define KVM_MMU_HASH_SHIFT 10 diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 9a578a1..a48bcd7 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -2412,7 +2412,7 @@ static int em_sysenter(struct x86_emulate_ctxt *ctxt) return emulate_gp(ctxt, 0); ctxt-eflags = ~(X86_EFLAGS_VM | X86_EFLAGS_IF); - cs_sel = (u16)msr_data ~SELECTOR_RPL_MASK; + cs_sel = (u16)msr_data ~SEGMENT_RPL_MASK; ss_sel = cs_sel + 8; if (efer EFER_LMA) { cs.d = 0; @@ -2479,8 +2479,8 @@ static int em_sysexit(struct x86_emulate_ctxt *ctxt) return emulate_gp(ctxt, 0); break; } - cs_sel |= SELECTOR_RPL_MASK; - ss_sel |= SELECTOR_RPL_MASK; + cs_sel |= SEGMENT_RPL_MASK; + ss_sel |= SEGMENT_RPL_MASK; ops-set_segment(ctxt, cs_sel, cs, 0, VCPU_SREG_CS); ops-set_segment(ctxt, ss_sel, ss, 0, VCPU_SREG_SS); diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index fdd9f8b..63ca692 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -3263,8 +3263,8 @@ static void fix_pmode_seg(struct kvm_vcpu *vcpu, int seg, * default value. 
*/ if (seg == VCPU_SREG_CS || seg == VCPU_SREG_SS) - save-selector = ~SELECTOR_RPL_MASK; - save-dpl = save-selector SELECTOR_RPL_MASK; + save-selector = ~SEGMENT_RPL_MASK; + save-dpl = save-selector SEGMENT_RPL_MASK; save-s = 1; } vmx_set_segment(vcpu, save, seg); @@ -3837,7 +3837,7 @@ static bool code_segment_valid(struct kvm_vcpu *vcpu) unsigned int cs_rpl; vmx_get_segment(vcpu, cs, VCPU_SREG_CS); - cs_rpl = cs.selector SELECTOR_RPL_MASK; + cs_rpl = cs.selector SEGMENT_RPL_MASK; if (cs.unusable) return false; @@ -3865,7 +3865,7 @@ static bool stack_segment_valid(struct kvm_vcpu *vcpu) unsigned int ss_rpl; vmx_get_segment(vcpu, ss, VCPU_SREG_SS); - ss_rpl = ss.selector SELECTOR_RPL_MASK; + ss_rpl = ss.selector SEGMENT_RPL_MASK; if (ss.unusable) return true; @@ -3887,7 +3887,7 @@ static bool data_segment_valid(struct kvm_vcpu *vcpu, int seg) unsigned int rpl; vmx_get_segment(vcpu, var, seg); - rpl = var.selector SELECTOR_RPL_MASK; + rpl = var.selector SEGMENT_RPL_MASK; if (var.unusable) return true; @@ -3914,7 +3914,7 @@ static bool tr_valid(struct kvm_vcpu *vcpu) if (tr.unusable) return false; - if (tr.selector SELECTOR_TI_MASK) /* TI = 1 */ + if (tr.selector SEGMENT_TI_MASK) /* TI = 1 */ return false; if (tr.type != 3 tr.type != 11) /* TODO: Check if guest is in IA32e mode */ return false; @@ -3932,7 +3932,7 @@ static bool ldtr_valid(struct kvm_vcpu *vcpu) if (ldtr.unusable) return true; - if (ldtr.selector SELECTOR_TI_MASK) /* TI = 1 */ + if (ldtr.selector SEGMENT_TI_MASK)/* TI = 1 */ return false; if (ldtr.type != 2) return false; @@ -3949,8 +3949,8 @@ static bool cs_ss_rpl_check(struct kvm_vcpu *vcpu) vmx_get_segment(vcpu, cs, VCPU_SREG_CS); vmx_get_segment(vcpu, ss, VCPU_SREG_SS); - return ((cs.selector SELECTOR_RPL_MASK) == -(ss.selector SELECTOR_RPL_MASK)); + return ((cs.selector SEGMENT_RPL_MASK) == +(ss.selector SEGMENT_RPL_MASK)); } /* -- 1.9.1 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to 
majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 1/2] KVM: x86: removing redundant eflags bits definitions
The eflags are redefined (using other defines) in emulate.c. Use the definition from processor-flags.h as some mess already started. No functional change. Signed-off-by: Nadav Amit na...@cs.technion.ac.il --- arch/x86/include/asm/kvm_host.h | 2 - arch/x86/kvm/emulate.c | 105 ++-- 2 files changed, 46 insertions(+), 61 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index bf5a160..7ba3d9d 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -84,8 +84,6 @@ static inline gfn_t gfn_to_index(gfn_t gfn, gfn_t base_gfn, int level) #define SELECTOR_TI_MASK (1 2) #define SELECTOR_RPL_MASK 0x03 -#define IOPL_SHIFT 12 - #define KVM_PERMILLE_MMU_PAGES 20 #define KVM_MIN_ALLOC_MMU_PAGES 64 #define KVM_MMU_HASH_SHIFT 10 diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index c941abe..9a578a1 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -248,27 +248,7 @@ struct mode_dual { struct opcode mode64; }; -/* EFLAGS bit definitions. */ -#define EFLG_ID (121) -#define EFLG_VIP (120) -#define EFLG_VIF (119) -#define EFLG_AC (118) -#define EFLG_VM (117) -#define EFLG_RF (116) -#define EFLG_IOPL (312) -#define EFLG_NT (114) -#define EFLG_OF (111) -#define EFLG_DF (110) -#define EFLG_IF (19) -#define EFLG_TF (18) -#define EFLG_SF (17) -#define EFLG_ZF (16) -#define EFLG_AF (14) -#define EFLG_PF (12) -#define EFLG_CF (10) - #define EFLG_RESERVED_ZEROS_MASK 0xffc0802a -#define EFLG_RESERVED_ONE_MASK 2 enum x86_transfer_type { X86_TRANSFER_NONE, @@ -317,7 +297,8 @@ static void invalidate_registers(struct x86_emulate_ctxt *ctxt) * These EFLAGS bits are restored from saved value during emulation, and * any changes are written back to the saved value after emulation. 
*/ -#define EFLAGS_MASK (EFLG_OF|EFLG_SF|EFLG_ZF|EFLG_AF|EFLG_PF|EFLG_CF) +#define EFLAGS_MASK (X86_EFLAGS_OF|X86_EFLAGS_SF|X86_EFLAGS_ZF|X86_EFLAGS_AF|\ +X86_EFLAGS_PF|X86_EFLAGS_CF) #ifdef CONFIG_X86_64 #define ON64(x) x @@ -1399,7 +1380,7 @@ static int pio_in_emulated(struct x86_emulate_ctxt *ctxt, unsigned int in_page, n; unsigned int count = ctxt-rep_prefix ? address_mask(ctxt, reg_read(ctxt, VCPU_REGS_RCX)) : 1; - in_page = (ctxt-eflags EFLG_DF) ? + in_page = (ctxt-eflags X86_EFLAGS_DF) ? offset_in_page(reg_read(ctxt, VCPU_REGS_RDI)) : PAGE_SIZE - offset_in_page(reg_read(ctxt, VCPU_REGS_RDI)); n = min3(in_page, (unsigned int)sizeof(rc-data) / size, count); @@ -1412,7 +1393,7 @@ static int pio_in_emulated(struct x86_emulate_ctxt *ctxt, } if (ctxt-rep_prefix (ctxt-d String) - !(ctxt-eflags EFLG_DF)) { + !(ctxt-eflags X86_EFLAGS_DF)) { ctxt-dst.data = rc-data + rc-pos; ctxt-dst.type = OP_MEM_STR; ctxt-dst.count = (rc-end - rc-pos) / size; @@ -1792,32 +1773,34 @@ static int emulate_popf(struct x86_emulate_ctxt *ctxt, { int rc; unsigned long val, change_mask; - int iopl = (ctxt-eflags X86_EFLAGS_IOPL) IOPL_SHIFT; + int iopl = (ctxt-eflags X86_EFLAGS_IOPL) X86_EFLAGS_IOPL_BIT; int cpl = ctxt-ops-cpl(ctxt); rc = emulate_pop(ctxt, val, len); if (rc != X86EMUL_CONTINUE) return rc; - change_mask = EFLG_CF | EFLG_PF | EFLG_AF | EFLG_ZF | EFLG_SF | EFLG_OF - | EFLG_TF | EFLG_DF | EFLG_NT | EFLG_AC | EFLG_ID; + change_mask = X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF | + X86_EFLAGS_ZF | X86_EFLAGS_SF | X86_EFLAGS_OF | + X86_EFLAGS_TF | X86_EFLAGS_DF | X86_EFLAGS_NT | + X86_EFLAGS_AC | X86_EFLAGS_ID; switch(ctxt-mode) { case X86EMUL_MODE_PROT64: case X86EMUL_MODE_PROT32: case X86EMUL_MODE_PROT16: if (cpl == 0) - change_mask |= EFLG_IOPL; + change_mask |= X86_EFLAGS_IOPL; if (cpl = iopl) - change_mask |= EFLG_IF; + change_mask |= X86_EFLAGS_IF; break; case X86EMUL_MODE_VM86: if (iopl 3) return emulate_gp(ctxt, 0); - change_mask |= EFLG_IF; + change_mask |= X86_EFLAGS_IF; 
break; default: /* real mode */ - change_mask |= (EFLG_IOPL | EFLG_IF); + change_mask |= (X86_EFLAGS_IOPL | X86_EFLAGS_IF); break; } @@ -1918,7 +1901,7 @@ static int em_pusha(struct x86_emulate_ctxt *ctxt) static int em_pushf(struct x86_emulate_ctxt *ctxt) { - ctxt-src.val = (unsigned long)ctxt-eflags ~EFLG_VM; + ctxt-src.val = (unsigned long)ctxt-eflags
[PATCH 0/2] KVM: x86: Removing redundant definitions from emulator
There are several redundant definitions in processor-flags.h and emulator.c. Slowly, but surely they will get mixed, so removing those of emulator.c seems like a reasonable move (unless I am missing something, e.g., kvm-kmod consideration). Nadav Amit (2): KVM: x86: removing redundant eflags bits definitions KVM: x86: Remove redundant definitions arch/x86/include/asm/kvm_host.h | 5 -- arch/x86/kvm/emulate.c | 111 ++-- arch/x86/kvm/vmx.c | 18 +++ 3 files changed, 58 insertions(+), 76 deletions(-) -- 1.9.1 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 2/3] KVM: nVMX: add missing checks for address bits beyond physical address width on VM-entry
On each VM-entry CPU should check the following VMCS fields for zero bits beyond physical address width: - APIC-access address - virtual-APIC address - posted-interrupt descriptor address This patch adds these checks required by Intel SDM. Signed-off-by: Eugene Korenevsky ekorenev...@gmail.com --- arch/x86/kvm/vmx.c | 13 +++-- 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index f7b20b4..b3dcf73 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -8616,10 +8616,11 @@ static bool nested_get_vmcs12_pages(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) { struct vcpu_vmx *vmx = to_vmx(vcpu); + int maxphyaddr = cpuid_maxphyaddr(vcpu); if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) { - /* TODO: Also verify bits beyond physical address width are 0 */ - if (!PAGE_ALIGNED(vmcs12-apic_access_addr)) + if (!PAGE_ALIGNED(vmcs12-apic_access_addr) || + vmcs12-apic_access_addr maxphyaddr) return false; /* @@ -8635,8 +8636,8 @@ static bool nested_get_vmcs12_pages(struct kvm_vcpu *vcpu, } if (nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) { - /* TODO: Also verify bits beyond physical address width are 0 */ - if (!PAGE_ALIGNED(vmcs12-virtual_apic_page_addr)) + if (!PAGE_ALIGNED(vmcs12-virtual_apic_page_addr) || + vmcs12-virtual_apic_page_addr maxphyaddr) return false; if (vmx-nested.virtual_apic_page) /* shouldn't happen */ @@ -8659,7 +8660,8 @@ static bool nested_get_vmcs12_pages(struct kvm_vcpu *vcpu, } if (nested_cpu_has_posted_intr(vmcs12)) { - if (!IS_ALIGNED(vmcs12-posted_intr_desc_addr, 64)) + if (!IS_ALIGNED(vmcs12-posted_intr_desc_addr, 64) || + vmcs12-posted_intr_desc_addr maxphyaddr) return false; if (vmx-nested.pi_desc_page) { /* shouldn't happen */ @@ -9379,7 +9381,6 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) } if (!nested_get_vmcs12_pages(vcpu, vmcs12)) { - /*TODO: Also verify bits beyond physical address width are 0*/ nested_vmx_failValid(vcpu, 
VMXERR_ENTRY_INVALID_CONTROL_FIELD); return 1; } -- 2.0.5 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 1/3] KVM: x86: optimization: cache physical address width to avoid excessive enumerations of CPUID entries
cpuid_maxphyaddr() which performs lot of memory accesses is called extensively across KVM, especially in nVMX code. This patch adds cached value of maxphyaddr to vcpu.arch to reduce the pressure onto CPU cache and simplify the code of cpuid_maxphyaddr() callers. The cached value is initialized in kvm_arch_vcpu_init() and reloaded every time CPUID is updated by usermode. It is obvious that these reloads occur infrequently. Signed-off-by: Eugene Korenevsky ekorenev...@gmail.com --- arch/x86/include/asm/kvm_host.h | 4 +++- arch/x86/kvm/cpuid.c| 33 ++--- arch/x86/kvm/cpuid.h| 6 ++ arch/x86/kvm/x86.c | 2 ++ 4 files changed, 29 insertions(+), 16 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index a236e39..2362a60 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -431,6 +431,9 @@ struct kvm_vcpu_arch { int cpuid_nent; struct kvm_cpuid_entry2 cpuid_entries[KVM_MAX_CPUID_ENTRIES]; + + int maxphyaddr; + /* emulate context */ struct x86_emulate_ctxt emulate_ctxt; @@ -1128,7 +1131,6 @@ int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end) int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end); int kvm_test_age_hva(struct kvm *kvm, unsigned long hva); void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte); -int cpuid_maxphyaddr(struct kvm_vcpu *vcpu); int kvm_cpu_has_injectable_intr(struct kvm_vcpu *v); int kvm_cpu_has_interrupt(struct kvm_vcpu *vcpu); int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu); diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 8a80737..59b69f6 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -104,6 +104,9 @@ int kvm_update_cpuid(struct kvm_vcpu *vcpu) ((best-eax 0xff00) 8) != 0) return -EINVAL; + /* Update physical-address width */ + vcpu-arch.maxphyaddr = cpuid_query_maxphyaddr(vcpu); + kvm_pmu_cpuid_update(vcpu); return 0; } @@ -135,6 +138,21 @@ static void cpuid_fix_nx_cap(struct 
kvm_vcpu *vcpu) } } +int cpuid_query_maxphyaddr(struct kvm_vcpu *vcpu) +{ + struct kvm_cpuid_entry2 *best; + + best = kvm_find_cpuid_entry(vcpu, 0x80000000, 0); + if (!best || best->eax < 0x80000008) + goto not_found; + best = kvm_find_cpuid_entry(vcpu, 0x80000008, 0); + if (best) + return best->eax & 0xff; +not_found: + return 36; +} +EXPORT_SYMBOL_GPL(cpuid_query_maxphyaddr); + /* when an old userspace process fills a new kernel module */ int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid *cpuid, @@ -757,21 +775,6 @@ struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu, } EXPORT_SYMBOL_GPL(kvm_find_cpuid_entry); -int cpuid_maxphyaddr(struct kvm_vcpu *vcpu) -{ - struct kvm_cpuid_entry2 *best; - - best = kvm_find_cpuid_entry(vcpu, 0x80000000, 0); - if (!best || best->eax < 0x80000008) - goto not_found; - best = kvm_find_cpuid_entry(vcpu, 0x80000008, 0); - if (best) - return best->eax & 0xff; -not_found: - return 36; -} -EXPORT_SYMBOL_GPL(cpuid_maxphyaddr); - /* * If no match is found, check whether we exceed the vCPU's limit * and return the content of the highest valid _standard_ leaf instead. 
diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h index 4452eed..78b61b4 100644 --- a/arch/x86/kvm/cpuid.h +++ b/arch/x86/kvm/cpuid.h @@ -20,6 +20,12 @@ int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu, struct kvm_cpuid_entry2 __user *entries); void kvm_cpuid(struct kvm_vcpu *vcpu, u32 *eax, u32 *ebx, u32 *ecx, u32 *edx); +int cpuid_query_maxphyaddr(struct kvm_vcpu *vcpu); + +static inline int cpuid_maxphyaddr(struct kvm_vcpu *vcpu) +{ + return vcpu-arch.maxphyaddr; +} static inline bool guest_cpuid_has_xsave(struct kvm_vcpu *vcpu) { diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index bd7a70b..084e1d5 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -7289,6 +7289,8 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) vcpu-arch.guest_supported_xcr0 = 0; vcpu-arch.guest_xstate_size = XSAVE_HDR_SIZE + XSAVE_HDR_OFFSET; + vcpu-arch.maxphyaddr = cpuid_query_maxphyaddr(vcpu); + kvm_async_pf_hash_reset(vcpu); kvm_pmu_init(vcpu); -- 2.0.5 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 3/3] KVM: nVMX: nested_vmx_check_msr_switch_controls simplification: remove unnecessary caching of physical address width
After speed-up of cpuid_maxphyaddr() it can be called easily. Now instead of heavy enumeration of CPUID entries it returns cached pre-computed value. It is also inlined now. So caching its result became unnecessary and can be removed. Signed-off-by: Eugene Korenevsky ekorenev...@gmail.com --- arch/x86/kvm/vmx.c | 14 ++ 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index b3dcf73..1cd2837 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -8860,9 +8860,9 @@ static int nested_vmx_check_apicv_controls(struct kvm_vcpu *vcpu, static int nested_vmx_check_msr_switch(struct kvm_vcpu *vcpu, unsigned long count_field, - unsigned long addr_field, - int maxphyaddr) + unsigned long addr_field) { + int maxphyaddr; u64 count, addr; if (vmcs12_read_any(vcpu, count_field, count) || @@ -8872,6 +8872,7 @@ static int nested_vmx_check_msr_switch(struct kvm_vcpu *vcpu, } if (count == 0) return 0; + maxphyaddr = cpuid_maxphyaddr(vcpu); if (!IS_ALIGNED(addr, 16) || addr maxphyaddr || (addr + count * sizeof(struct vmx_msr_entry) - 1) maxphyaddr) { pr_warn_ratelimited( @@ -8885,19 +8886,16 @@ static int nested_vmx_check_msr_switch(struct kvm_vcpu *vcpu, static int nested_vmx_check_msr_switch_controls(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) { - int maxphyaddr; - if (vmcs12-vm_exit_msr_load_count == 0 vmcs12-vm_exit_msr_store_count == 0 vmcs12-vm_entry_msr_load_count == 0) return 0; /* Fast path */ - maxphyaddr = cpuid_maxphyaddr(vcpu); if (nested_vmx_check_msr_switch(vcpu, VM_EXIT_MSR_LOAD_COUNT, - VM_EXIT_MSR_LOAD_ADDR, maxphyaddr) || + VM_EXIT_MSR_LOAD_ADDR) || nested_vmx_check_msr_switch(vcpu, VM_EXIT_MSR_STORE_COUNT, - VM_EXIT_MSR_STORE_ADDR, maxphyaddr) || + VM_EXIT_MSR_STORE_ADDR) || nested_vmx_check_msr_switch(vcpu, VM_ENTRY_MSR_LOAD_COUNT, - VM_ENTRY_MSR_LOAD_ADDR, maxphyaddr)) + VM_ENTRY_MSR_LOAD_ADDR)) return -EINVAL; return 0; } -- 2.0.5 -- To unsubscribe from this list: send the line unsubscribe kvm in the body 
of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html