[PATCH] KVM: PPC: BOOK3S: HV: Use base page size when comparing against slb value
With guest supporting Multiple page size per segment (MPSS), hpte_page_size returns actual page size used. Add a new function to return base page size and use that to compare against the the page size calculated from SLB Signed-off-by: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com --- arch/powerpc/include/asm/kvm_book3s_64.h | 19 +-- arch/powerpc/kvm/book3s_64_mmu_hv.c | 2 +- arch/powerpc/kvm/book3s_hv_rm_mmu.c | 2 +- 3 files changed, 19 insertions(+), 4 deletions(-) diff --git a/arch/powerpc/include/asm/kvm_book3s_64.h b/arch/powerpc/include/asm/kvm_book3s_64.h index 34422be566ce..3d0f3fb9c6b6 100644 --- a/arch/powerpc/include/asm/kvm_book3s_64.h +++ b/arch/powerpc/include/asm/kvm_book3s_64.h @@ -202,8 +202,10 @@ static inline unsigned long compute_tlbie_rb(unsigned long v, unsigned long r, return rb; } -static inline unsigned long hpte_page_size(unsigned long h, unsigned long l) +static inline unsigned long __hpte_page_size(unsigned long h, unsigned long l, +bool is_base_size) { + int size, a_psize; /* Look at the 8 bit LP value */ unsigned int lp = (l LP_SHIFT) ((1 LP_BITS) - 1); @@ -218,14 +220,27 @@ static inline unsigned long hpte_page_size(unsigned long h, unsigned long l) continue; a_psize = __hpte_actual_psize(lp, size); - if (a_psize != -1) + if (a_psize != -1) { + if (is_base_size) + return 1ul mmu_psize_defs[size].shift; return 1ul mmu_psize_defs[a_psize].shift; + } } } return 0; } +static inline unsigned long hpte_page_size(unsigned long h, unsigned long l) +{ + return __hpte_page_size(h, l, 0); +} + +static inline unsigned long hpte_base_page_size(unsigned long h, unsigned long l) +{ + return __hpte_page_size(h, l, 1); +} + static inline unsigned long hpte_rpn(unsigned long ptel, unsigned long psize) { return ((ptel HPTE_R_RPN) ~(psize - 1)) PAGE_SHIFT; diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c index f53cf2eae36a..7ff45ed27c65 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_hv.c +++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c @@ -1567,7 +1567,7 @@ static ssize_t kvm_htab_write(struct file *file, const char __user *buf, goto out; } if (!rma_setup is_vrma_hpte(v)) { - unsigned long psize = hpte_page_size(v, r); + unsigned long psize = hpte_base_page_size(v, r); unsigned long senc = slb_pgsize_encoding(psize); unsigned long lpcr; diff --git a/arch/powerpc/kvm/book3s_hv_rm_mmu.c b/arch/powerpc/kvm/book3s_hv_rm_mmu.c index 87624ab5ba82..c6aca75b8376 100644 --- a/arch/powerpc/kvm/book3s_hv_rm_mmu.c +++ b/arch/powerpc/kvm/book3s_hv_rm_mmu.c @@ -839,7 +839,7 @@ long kvmppc_hv_find_lock_hpte(struct kvm *kvm, gva_t eaddr, unsigned long slb_v, * to check against the actual page size. */ if ((v valid) (v mask) == val - hpte_page_size(v, r) == (1ul pshift)) + hpte_base_page_size(v, r) == (1ul pshift)) /* Return with the HPTE still locked */ return (hash 3) + (i 1); -- 1.9.1 -- To unsubscribe from this list: send the line unsubscribe kvm-ppc in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] KVM: PPC: BOOK3S: HV: Use base page size when comparing against slb value
On 13.06.14 09:23, Aneesh Kumar K.V wrote: With guest supporting Multiple page size per segment (MPSS), hpte_page_size returns actual page size used. Add a new function to return base page size and use that to compare against the the page size calculated from SLB Why? What does this fix? Is this a bug fix, an enhancement? Don't describe only what you do, but also why you do it. Alex Signed-off-by: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com --- arch/powerpc/include/asm/kvm_book3s_64.h | 19 +-- arch/powerpc/kvm/book3s_64_mmu_hv.c | 2 +- arch/powerpc/kvm/book3s_hv_rm_mmu.c | 2 +- 3 files changed, 19 insertions(+), 4 deletions(-) diff --git a/arch/powerpc/include/asm/kvm_book3s_64.h b/arch/powerpc/include/asm/kvm_book3s_64.h index 34422be566ce..3d0f3fb9c6b6 100644 --- a/arch/powerpc/include/asm/kvm_book3s_64.h +++ b/arch/powerpc/include/asm/kvm_book3s_64.h @@ -202,8 +202,10 @@ static inline unsigned long compute_tlbie_rb(unsigned long v, unsigned long r, return rb; } -static inline unsigned long hpte_page_size(unsigned long h, unsigned long l) +static inline unsigned long __hpte_page_size(unsigned long h, unsigned long l, +bool is_base_size) { + int size, a_psize; /* Look at the 8 bit LP value */ unsigned int lp = (l LP_SHIFT) ((1 LP_BITS) - 1); @@ -218,14 +220,27 @@ static inline unsigned long hpte_page_size(unsigned long h, unsigned long l) continue; a_psize = __hpte_actual_psize(lp, size); - if (a_psize != -1) + if (a_psize != -1) { + if (is_base_size) + return 1ul mmu_psize_defs[size].shift; return 1ul mmu_psize_defs[a_psize].shift; + } } } return 0; } +static inline unsigned long hpte_page_size(unsigned long h, unsigned long l) +{ + return __hpte_page_size(h, l, 0); +} + +static inline unsigned long hpte_base_page_size(unsigned long h, unsigned long l) +{ + return __hpte_page_size(h, l, 1); +} + static inline unsigned long hpte_rpn(unsigned long ptel, unsigned long psize) { return ((ptel HPTE_R_RPN) ~(psize - 1)) PAGE_SHIFT; diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c index f53cf2eae36a..7ff45ed27c65 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_hv.c +++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c @@ -1567,7 +1567,7 @@ static ssize_t kvm_htab_write(struct file *file, const char __user *buf, goto out; } if (!rma_setup is_vrma_hpte(v)) { - unsigned long psize = hpte_page_size(v, r); + unsigned long psize = hpte_base_page_size(v, r); unsigned long senc = slb_pgsize_encoding(psize); unsigned long lpcr; diff --git a/arch/powerpc/kvm/book3s_hv_rm_mmu.c b/arch/powerpc/kvm/book3s_hv_rm_mmu.c index 87624ab5ba82..c6aca75b8376 100644 --- a/arch/powerpc/kvm/book3s_hv_rm_mmu.c +++ b/arch/powerpc/kvm/book3s_hv_rm_mmu.c @@ -839,7 +839,7 @@ long kvmppc_hv_find_lock_hpte(struct kvm *kvm, gva_t eaddr, unsigned long slb_v, * to check against the actual page size. */ if ((v valid) (v mask) == val - hpte_page_size(v, r) == (1ul pshift)) + hpte_base_page_size(v, r) == (1ul pshift)) /* Return with the HPTE still locked */ return (hash 3) + (i 1); -- To unsubscribe from this list: send the line unsubscribe kvm-ppc in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] KVM: PPC: BOOK3S: HV: Use base page size when comparing against slb value
Alexander Graf ag...@suse.de writes: On 13.06.14 09:23, Aneesh Kumar K.V wrote: With guest supporting Multiple page size per segment (MPSS), hpte_page_size returns actual page size used. Add a new function to return base page size and use that to compare against the the page size calculated from SLB Why? What does this fix? Is this a bug fix, an enhancement? Don't describe only what you do, but also why you do it. This could result in page fault failures (unhandled page fault) because even though we have a valid hpte entry mapping a 16MB page, since we were comparing actual page size against page size calculated from SLB bits kvmppc_hv_find_lock_hpte will fail and return -1. I did not observe a failure in real and the bug was found during code audit. That could be because with THP we have guest ram backed by hugetlbfs and we always find the page in the host linux page table. The will result in do_h_enter always inserting HPTE_V_VALID entry and hence we might not really end up calling kvmppc_hv_find_lock_hpte. -aneesh -- To unsubscribe from this list: send the line unsubscribe kvm-ppc in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
RE: [PATCH] KVM: PPC: e500mc: Relax tlb invalidation condition on vcpu schedule
-Original Message- From: Alexander Graf [mailto:ag...@suse.de] Sent: Thursday, June 12, 2014 8:05 PM To: Caraman Mihai Claudiu-B02008 Cc: kvm-ppc@vger.kernel.org; k...@vger.kernel.org; linuxppc- d...@lists.ozlabs.org; Wood Scott-B07421 Subject: Re: [PATCH] KVM: PPC: e500mc: Relax tlb invalidation condition on vcpu schedule On 06/12/2014 04:00 PM, Mihai Caraman wrote: On vcpu schedule, the condition checked for tlb pollution is too tight. The tlb entries of one vcpu are polluted when a different vcpu from the same partition runs in-between. Relax the current tlb invalidation condition taking into account the lpid. Signed-off-by: Mihai Caraman mihai.caraman at freescale.com Your mailer is broken? :) This really should be an @. I think this should work. Scott, please ack. Alex, you were right. I screwed up the patch description by inverting relax and tight terms :) It should have been more like this: KVM: PPC: e500mc: Enhance tlb invalidation condition on vcpu schedule On vcpu schedule, the condition checked for tlb pollution is too loose. The tlb entries of a vcpu are polluted (vs stale) only when a different vcpu within the same logical partition runs in-between. Optimize the tlb invalidation condition taking into account the lpid. -Mike -- To unsubscribe from this list: send the line unsubscribe kvm-ppc in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] KVM: PPC: BOOK3S: HV: Use base page size when comparing against slb value
On 13.06.14 16:28, Aneesh Kumar K.V wrote: Alexander Graf ag...@suse.de writes: On 13.06.14 09:23, Aneesh Kumar K.V wrote: With guest supporting Multiple page size per segment (MPSS), hpte_page_size returns actual page size used. Add a new function to return base page size and use that to compare against the the page size calculated from SLB Why? What does this fix? Is this a bug fix, an enhancement? Don't describe only what you do, but also why you do it. This could result in page fault failures (unhandled page fault) because even though we have a valid hpte entry mapping a 16MB page, since we were comparing actual page size against page size calculated from SLB bits kvmppc_hv_find_lock_hpte will fail and return -1. I did not observe a failure in real and the bug was found during code audit. That could be because with THP we have guest ram backed by hugetlbfs and we always find the page in the host linux page table. The will result in do_h_enter always inserting HPTE_V_VALID entry and hence we might not really end up calling kvmppc_hv_find_lock_hpte. So why do we need to override to base page size for the VRMA region? Also I think you want to change the comment above the line in find_lock_hpte you're changing. Alex -- To unsubscribe from this list: send the line unsubscribe kvm-ppc in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] KVM: PPC: e500mc: Relax tlb invalidation condition on vcpu schedule
On 13.06.14 16:43, mihai.cara...@freescale.com wrote: -Original Message- From: Alexander Graf [mailto:ag...@suse.de] Sent: Thursday, June 12, 2014 8:05 PM To: Caraman Mihai Claudiu-B02008 Cc: kvm-ppc@vger.kernel.org; k...@vger.kernel.org; linuxppc- d...@lists.ozlabs.org; Wood Scott-B07421 Subject: Re: [PATCH] KVM: PPC: e500mc: Relax tlb invalidation condition on vcpu schedule On 06/12/2014 04:00 PM, Mihai Caraman wrote: On vcpu schedule, the condition checked for tlb pollution is too tight. The tlb entries of one vcpu are polluted when a different vcpu from the same partition runs in-between. Relax the current tlb invalidation condition taking into account the lpid. Signed-off-by: Mihai Caraman mihai.caraman at freescale.com Your mailer is broken? :) This really should be an @. I think this should work. Scott, please ack. Alex, you were right. I screwed up the patch description by inverting relax and tight terms :) It should have been more like this: KVM: PPC: e500mc: Enhance tlb invalidation condition on vcpu schedule On vcpu schedule, the condition checked for tlb pollution is too loose. The tlb entries of a vcpu are polluted (vs stale) only when a different vcpu within the same logical partition runs in-between. Optimize the tlb invalidation condition taking into account the lpid. Can't we give every vcpu its own lpid? Or don't we trap on global invalidates? Alex -- To unsubscribe from this list: send the line unsubscribe kvm-ppc in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] KVM: PPC: BOOK3S: HV: Use base page size when comparing against slb value
Alexander Graf ag...@suse.de writes: On 13.06.14 16:28, Aneesh Kumar K.V wrote: Alexander Graf ag...@suse.de writes: On 13.06.14 09:23, Aneesh Kumar K.V wrote: With guest supporting Multiple page size per segment (MPSS), hpte_page_size returns actual page size used. Add a new function to return base page size and use that to compare against the the page size calculated from SLB Why? What does this fix? Is this a bug fix, an enhancement? Don't describe only what you do, but also why you do it. This could result in page fault failures (unhandled page fault) because even though we have a valid hpte entry mapping a 16MB page, since we were comparing actual page size against page size calculated from SLB bits kvmppc_hv_find_lock_hpte will fail and return -1. I did not observe a failure in real and the bug was found during code audit. That could be because with THP we have guest ram backed by hugetlbfs and we always find the page in the host linux page table. The will result in do_h_enter always inserting HPTE_V_VALID entry and hence we might not really end up calling kvmppc_hv_find_lock_hpte. So why do we need to override to base page size for the VRMA region? slb encoding should be derived based on base page size. Also I think you want to change the comment above the line in find_lock_hpte you're changing. Will do that. -aneesh -- To unsubscribe from this list: send the line unsubscribe kvm-ppc in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] KVM: PPC: e500mc: Relax tlb invalidation condition on vcpu schedule
On Fri, 2014-06-13 at 16:55 +0200, Alexander Graf wrote: On 13.06.14 16:43, mihai.cara...@freescale.com wrote: -Original Message- From: Alexander Graf [mailto:ag...@suse.de] Sent: Thursday, June 12, 2014 8:05 PM To: Caraman Mihai Claudiu-B02008 Cc: kvm-ppc@vger.kernel.org; k...@vger.kernel.org; linuxppc- d...@lists.ozlabs.org; Wood Scott-B07421 Subject: Re: [PATCH] KVM: PPC: e500mc: Relax tlb invalidation condition on vcpu schedule On 06/12/2014 04:00 PM, Mihai Caraman wrote: On vcpu schedule, the condition checked for tlb pollution is too tight. The tlb entries of one vcpu are polluted when a different vcpu from the same partition runs in-between. Relax the current tlb invalidation condition taking into account the lpid. Can you quantify the performance improvement from this? We've had bugs in this area before, so let's make sure it's worth it before making this more complicated. Signed-off-by: Mihai Caraman mihai.caraman at freescale.com Your mailer is broken? :) This really should be an @. I think this should work. Scott, please ack. Alex, you were right. I screwed up the patch description by inverting relax and tight terms :) It should have been more like this: KVM: PPC: e500mc: Enhance tlb invalidation condition on vcpu schedule On vcpu schedule, the condition checked for tlb pollution is too loose. The tlb entries of a vcpu are polluted (vs stale) only when a different vcpu within the same logical partition runs in-between. Optimize the tlb invalidation condition taking into account the lpid. Can't we give every vcpu its own lpid? Or don't we trap on global invalidates? That would significantly increase the odds of exhausting LPIDs, especially on large chips like t4240 with similarly large VMs. If we were to do that, the LPIDs would need to be dynamically assigned (like PIDs), and should probably be a separate numberspace per physical core. -Scott -- To unsubscribe from this list: send the line unsubscribe kvm-ppc in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 4/4 v3] KVM: PPC: Bookehv: Get vcpu's last instruction for emulation
On Thu, 2014-06-12 at 18:04 +0200, Alexander Graf wrote: On 06/02/2014 05:50 PM, Mihai Caraman wrote: On book3e, KVM uses load external pid (lwepx) dedicated instruction to read guest last instruction on the exit path. lwepx exceptions (DTLB_MISS, DSI and LRAT), generated by loading a guest address, needs to be handled by KVM. These exceptions are generated in a substituted guest translation context (EPLC[EGS] = 1) from host context (MSR[GS] = 0). Currently, KVM hooks only interrupts generated from guest context (MSR[GS] = 1), doing minimal checks on the fast path to avoid host performance degradation. lwepx exceptions originate from host state (MSR[GS] = 0) which implies additional checks in DO_KVM macro (beside the current MSR[GS] = 1) by looking at the Exception Syndrome Register (ESR[EPID]) and the External PID Load Context Register (EPLC[EGS]). Doing this on each Data TLB miss exception is obvious too intrusive for the host. Read guest last instruction from kvmppc_load_last_inst() by searching for the physical address and kmap it. This address the TODO for TLB eviction and execute-but-not-read entries, and allow us to get rid of lwepx until we are able to handle failures. A simple stress benchmark shows a 1% sys performance degradation compared with previous approach (lwepx without failure handling): time for i in `seq 1 1`; do /bin/echo /dev/null; done real0m 8.85s user0m 4.34s sys 0m 4.48s vs real0m 8.84s user0m 4.36s sys 0m 4.44s An alternative solution, to handle lwepx exceptions in KVM, is to temporary highjack the interrupt vector from host. Some cores share host IVOR registers between hardware threads, which is the case of FSL e6500, which impose additional synchronization logic for this solution to work. This optimized solution can be developed later on top of this patch. Signed-off-by: Mihai Caraman mihai.cara...@freescale.com --- v3: - reworked patch description - use unaltered kmap addr for kunmap - get last instruction before beeing preempted v2: - reworked patch description - used pr_* functions - addressed cosmetic feedback arch/powerpc/kvm/booke.c | 32 arch/powerpc/kvm/bookehv_interrupts.S | 37 -- arch/powerpc/kvm/e500_mmu_host.c | 93 +++ 3 files changed, 134 insertions(+), 28 deletions(-) diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c index 34a42b9..4ef52a8 100644 --- a/arch/powerpc/kvm/booke.c +++ b/arch/powerpc/kvm/booke.c @@ -880,6 +880,8 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu, int r = RESUME_HOST; int s; int idx; + u32 last_inst = KVM_INST_FETCH_FAILED; + enum emulation_result emulated = EMULATE_DONE; /* update before a new last_exit_type is rewritten */ kvmppc_update_timing_stats(vcpu); @@ -887,6 +889,15 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu, /* restart interrupts if they were meant for the host */ kvmppc_restart_interrupt(vcpu, exit_nr); + /* +* get last instruction before beeing preempted +* TODO: for e6500 check also BOOKE_INTERRUPT_LRAT_ERROR ESR_DATA +*/ + if (exit_nr == BOOKE_INTERRUPT_DATA_STORAGE || + exit_nr == BOOKE_INTERRUPT_DTLB_MISS || + exit_nr == BOOKE_INTERRUPT_HV_PRIV) Please make this a switch() - that's easier to read. + emulated = kvmppc_get_last_inst(vcpu, false, last_inst); + local_irq_enable(); trace_kvm_exit(exit_nr, vcpu); @@ -895,6 +906,26 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu, run-exit_reason = KVM_EXIT_UNKNOWN; run-ready_for_interrupt_injection = 1; + switch (emulated) { + case EMULATE_AGAIN: + r = RESUME_GUEST; + goto out; + + case EMULATE_FAIL: + pr_debug(%s: emulation at %lx failed (%08x)\n, + __func__, vcpu-arch.pc, last_inst); + /* For debugging, encode the failing instruction and +* report it to userspace. */ + run-hw.hardware_exit_reason = ~0ULL 32; + run-hw.hardware_exit_reason |= last_inst; + kvmppc_core_queue_program(vcpu, ESR_PIL); + r = RESUME_HOST; + goto out; + + default: + break; + } I think you can just put this into a function. Scott, I think the patch overall looks quite good. Can you please check as well and if you agree give it your reviewed-by? Mike, when Scott gives you a reviewed-by, please include it for the next version. Alex + switch (exit_nr) { case BOOKE_INTERRUPT_MACHINE_CHECK: printk(MACHINE CHECK: %lx\n, mfspr(SPRN_MCSR)); @@ -1184,6 +1215,7 @@ int