[PATCH] KVM: PPC: BOOK3S: HV: Use base page size when comparing against slb value

2014-06-13 Thread Aneesh Kumar K.V
With guests supporting Multiple page size per segment (MPSS),
hpte_page_size() returns the actual page size used by the mapping. Add a
new function that returns the base page size, and use that when comparing
against the page size calculated from the SLB.

Signed-off-by: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com
---
 arch/powerpc/include/asm/kvm_book3s_64.h | 19 +--
 arch/powerpc/kvm/book3s_64_mmu_hv.c      |  2 +-
 arch/powerpc/kvm/book3s_hv_rm_mmu.c      |  2 +-
 3 files changed, 19 insertions(+), 4 deletions(-)

diff --git a/arch/powerpc/include/asm/kvm_book3s_64.h b/arch/powerpc/include/asm/kvm_book3s_64.h
index 34422be566ce..3d0f3fb9c6b6 100644
--- a/arch/powerpc/include/asm/kvm_book3s_64.h
+++ b/arch/powerpc/include/asm/kvm_book3s_64.h
@@ -202,8 +202,10 @@ static inline unsigned long compute_tlbie_rb(unsigned long v, unsigned long r,
 	return rb;
 }
 
-static inline unsigned long hpte_page_size(unsigned long h, unsigned long l)
+static inline unsigned long __hpte_page_size(unsigned long h, unsigned long l,
+					     bool is_base_size)
 {
+
 	int size, a_psize;
 	/* Look at the 8 bit LP value */
 	unsigned int lp = (l >> LP_SHIFT) & ((1 << LP_BITS) - 1);
@@ -218,14 +220,27 @@ static inline unsigned long hpte_page_size(unsigned long h, unsigned long l)
 				continue;
 
 			a_psize = __hpte_actual_psize(lp, size);
-			if (a_psize != -1)
+			if (a_psize != -1) {
+				if (is_base_size)
+					return 1ul << mmu_psize_defs[size].shift;
 				return 1ul << mmu_psize_defs[a_psize].shift;
+			}
 		}
 
 	}
 	return 0;
 }
 
+static inline unsigned long hpte_page_size(unsigned long h, unsigned long l)
+{
+	return __hpte_page_size(h, l, 0);
+}
+
+static inline unsigned long hpte_base_page_size(unsigned long h, unsigned long l)
+{
+	return __hpte_page_size(h, l, 1);
+}
+
 static inline unsigned long hpte_rpn(unsigned long ptel, unsigned long psize)
 {
 	return ((ptel & HPTE_R_RPN) & ~(psize - 1)) >> PAGE_SHIFT;
diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c
index f53cf2eae36a..7ff45ed27c65 100644
--- a/arch/powerpc/kvm/book3s_64_mmu_hv.c
+++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c
@@ -1567,7 +1567,7 @@ static ssize_t kvm_htab_write(struct file *file, const char __user *buf,
 			goto out;
 		}
 		if (!rma_setup && is_vrma_hpte(v)) {
-			unsigned long psize = hpte_page_size(v, r);
+			unsigned long psize = hpte_base_page_size(v, r);
 			unsigned long senc = slb_pgsize_encoding(psize);
 			unsigned long lpcr;
 
diff --git a/arch/powerpc/kvm/book3s_hv_rm_mmu.c b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
index 87624ab5ba82..c6aca75b8376 100644
--- a/arch/powerpc/kvm/book3s_hv_rm_mmu.c
+++ b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
@@ -839,7 +839,7 @@ long kvmppc_hv_find_lock_hpte(struct kvm *kvm, gva_t eaddr, unsigned long slb_v,
 		 * to check against the actual page size.
 		 */
 		if ((v & valid) && (v & mask) == val &&
-		    hpte_page_size(v, r) == (1ul << pshift))
+		    hpte_base_page_size(v, r) == (1ul << pshift))
 			/* Return with the HPTE still locked */
 			return (hash << 3) + (i >> 1);
 
-- 
1.9.1
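For an MPSS mapping the two helpers now return different sizes, which is why
the SLB-derived shift must be compared against the base page size. A minimal
sketch (hpte_matches_slb() is illustrative, not part of the patch):

/* Illustrative only: for a 64K-base/16M-actual MPSS HPTE, hpte_page_size()
 * returns 16M while the SLB still encodes the 64K base size, so a
 * comparison against the actual size would always fail for such HPTEs. */
static inline bool hpte_matches_slb(unsigned long v, unsigned long r,
				    unsigned long slb_pshift)
{
	return hpte_base_page_size(v, r) == (1ul << slb_pshift);
}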



Re: [PATCH] KVM: PPC: BOOK3S: HV: Use base page size when comparing against slb value

2014-06-13 Thread Alexander Graf


On 13.06.14 09:23, Aneesh Kumar K.V wrote:

> With guests supporting Multiple page size per segment (MPSS),
> hpte_page_size() returns the actual page size used by the mapping. Add a
> new function that returns the base page size, and use that when comparing
> against the page size calculated from the SLB.


Why? What does this fix? Is this a bug fix, an enhancement? Don't 
describe only what you do, but also why you do it.



Alex





Re: [PATCH] KVM: PPC: BOOK3S: HV: Use base page size when comparing against slb value

2014-06-13 Thread Aneesh Kumar K.V
Alexander Graf ag...@suse.de writes:

> On 13.06.14 09:23, Aneesh Kumar K.V wrote:
>> With guests supporting Multiple page size per segment (MPSS),
>> hpte_page_size() returns the actual page size used by the mapping. Add a
>> new function that returns the base page size, and use that when comparing
>> against the page size calculated from the SLB.
>
> Why? What does this fix? Is this a bug fix, an enhancement? Don't
> describe only what you do, but also why you do it.



This could result in page fault failures (unhandled page faults) because,
even though we have a valid hpte entry mapping a 16MB page, we were
comparing the actual page size against the page size calculated from the
SLB bits, so kvmppc_hv_find_lock_hpte will fail and return -1. I did not
observe a failure in practice; the bug was found during a code audit. That
could be because with THP we have guest RAM backed by hugetlbfs, so we
always find the page in the host linux page table. This results in
do_h_enter always inserting an HPTE_V_VALID entry, hence we might not
really end up calling kvmppc_hv_find_lock_hpte.

-aneesh



RE: [PATCH] KVM: PPC: e500mc: Relax tlb invalidation condition on vcpu schedule

2014-06-13 Thread mihai.cara...@freescale.com
> -----Original Message-----
> From: Alexander Graf [mailto:ag...@suse.de]
> Sent: Thursday, June 12, 2014 8:05 PM
> To: Caraman Mihai Claudiu-B02008
> Cc: kvm-ppc@vger.kernel.org; k...@vger.kernel.org;
> linuxppc-d...@lists.ozlabs.org; Wood Scott-B07421
> Subject: Re: [PATCH] KVM: PPC: e500mc: Relax tlb invalidation condition
> on vcpu schedule
>
> On 06/12/2014 04:00 PM, Mihai Caraman wrote:
>> On vcpu schedule, the condition checked for tlb pollution is too tight.
>> The tlb entries of one vcpu are polluted when a different vcpu from the
>> same partition runs in-between. Relax the current tlb invalidation
>> condition taking into account the lpid.
>>
>> Signed-off-by: Mihai Caraman mihai.caraman at freescale.com
>
> Your mailer is broken? :)
> This really should be an @.
>
> I think this should work. Scott, please ack.

Alex, you were right. I screwed up the patch description by inverting the
'relax' and 'tight' terms :) It should have been more like this:

KVM: PPC: e500mc: Enhance tlb invalidation condition on vcpu schedule

On vcpu schedule, the condition checked for tlb pollution is too loose.
The tlb entries of a vcpu are polluted (vs stale) only when a different vcpu
within the same logical partition runs in-between. Optimize the tlb invalidation
condition taking into account the lpid.

-Mike
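A minimal sketch of the optimized check Mike describes (the per-cpu layout
and the sketch's function name are assumptions for illustration, not the
actual patch; the lpid is assumed to live in vcpu->kvm->arch.lpid):

/* Hedged sketch: track the last vcpu per lpid on each cpu, so that vcpus
 * of *other* partitions no longer force a TLB invalidation on schedule. */
static DEFINE_PER_CPU(struct kvm_vcpu *[KVMPPC_NR_LPIDS], last_vcpu_of_lpid);

static void vcpu_load_tlb_check_sketch(struct kvm_vcpu *vcpu, int cpu)
{
	int lpid = vcpu->kvm->arch.lpid;

	/* Invalidate only if this vcpu migrated to another physical cpu, or
	 * if a different vcpu of the same partition ran here in-between. */
	if (vcpu->arch.oldpir != mfspr(SPRN_PIR) ||
	    __get_cpu_var(last_vcpu_of_lpid)[lpid] != vcpu)
		kvmppc_e500_tlbil_all(to_e500(vcpu));

	__get_cpu_var(last_vcpu_of_lpid)[lpid] = vcpu;
}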


Re: [PATCH] KVM: PPC: BOOK3S: HV: Use base page size when comparing against slb value

2014-06-13 Thread Alexander Graf


On 13.06.14 16:28, Aneesh Kumar K.V wrote:
> Alexander Graf ag...@suse.de writes:
>
>> On 13.06.14 09:23, Aneesh Kumar K.V wrote:
>>> With guests supporting Multiple page size per segment (MPSS),
>>> hpte_page_size() returns the actual page size used by the mapping. Add a
>>> new function that returns the base page size, and use that when comparing
>>> against the page size calculated from the SLB.
>>
>> Why? What does this fix? Is this a bug fix, an enhancement? Don't
>> describe only what you do, but also why you do it.
>
> This could result in page fault failures (unhandled page faults) because,
> even though we have a valid hpte entry mapping a 16MB page, we were
> comparing the actual page size against the page size calculated from the
> SLB bits, so kvmppc_hv_find_lock_hpte will fail and return -1. I did not
> observe a failure in practice; the bug was found during a code audit. That
> could be because with THP we have guest RAM backed by hugetlbfs, so we
> always find the page in the host linux page table. This results in
> do_h_enter always inserting an HPTE_V_VALID entry, hence we might not
> really end up calling kvmppc_hv_find_lock_hpte.


So why do we need to override to base page size for the VRMA region? 
Also I think you want to change the comment above the line in 
find_lock_hpte you're changing.



Alex



Re: [PATCH] KVM: PPC: e500mc: Relax tlb invalidation condition on vcpu schedule

2014-06-13 Thread Alexander Graf


On 13.06.14 16:43, mihai.cara...@freescale.com wrote:
>> -----Original Message-----
>> From: Alexander Graf [mailto:ag...@suse.de]
>> Sent: Thursday, June 12, 2014 8:05 PM
>> To: Caraman Mihai Claudiu-B02008
>> Cc: kvm-ppc@vger.kernel.org; k...@vger.kernel.org;
>> linuxppc-d...@lists.ozlabs.org; Wood Scott-B07421
>> Subject: Re: [PATCH] KVM: PPC: e500mc: Relax tlb invalidation condition
>> on vcpu schedule
>>
>> On 06/12/2014 04:00 PM, Mihai Caraman wrote:
>>> On vcpu schedule, the condition checked for tlb pollution is too tight.
>>> The tlb entries of one vcpu are polluted when a different vcpu from the
>>> same partition runs in-between. Relax the current tlb invalidation
>>> condition taking into account the lpid.
>>>
>>> Signed-off-by: Mihai Caraman mihai.caraman at freescale.com
>>
>> Your mailer is broken? :)
>> This really should be an @.
>>
>> I think this should work. Scott, please ack.
>
> Alex, you were right. I screwed up the patch description by inverting the
> 'relax' and 'tight' terms :) It should have been more like this:
>
> KVM: PPC: e500mc: Enhance tlb invalidation condition on vcpu schedule
>
> On vcpu schedule, the condition checked for tlb pollution is too loose.
> The tlb entries of a vcpu are polluted (vs stale) only when a different vcpu
> within the same logical partition runs in-between. Optimize the tlb
> invalidation condition taking into account the lpid.


Can't we give every vcpu its own lpid? Or don't we trap on global 
invalidates?



Alex



Re: [PATCH] KVM: PPC: BOOK3S: HV: Use base page size when comparing against slb value

2014-06-13 Thread Aneesh Kumar K.V
Alexander Graf ag...@suse.de writes:

> On 13.06.14 16:28, Aneesh Kumar K.V wrote:
>> Alexander Graf ag...@suse.de writes:
>>
>>> On 13.06.14 09:23, Aneesh Kumar K.V wrote:
>>>> With guests supporting Multiple page size per segment (MPSS),
>>>> hpte_page_size() returns the actual page size used by the mapping. Add a
>>>> new function that returns the base page size, and use that when comparing
>>>> against the page size calculated from the SLB.
>>> Why? What does this fix? Is this a bug fix, an enhancement? Don't
>>> describe only what you do, but also why you do it.
>>
>> This could result in page fault failures (unhandled page faults) because,
>> even though we have a valid hpte entry mapping a 16MB page, we were
>> comparing the actual page size against the page size calculated from the
>> SLB bits, so kvmppc_hv_find_lock_hpte will fail and return -1. I did not
>> observe a failure in practice; the bug was found during a code audit. That
>> could be because with THP we have guest RAM backed by hugetlbfs, so we
>> always find the page in the host linux page table. This results in
>> do_h_enter always inserting an HPTE_V_VALID entry, hence we might not
>> really end up calling kvmppc_hv_find_lock_hpte.
>
> So why do we need to override to base page size for the VRMA region?

The SLB encoding should be derived from the base page size.

> Also I think you want to change the comment above the line in
> find_lock_hpte you're changing.


Will do that.

-aneesh



Re: [PATCH] KVM: PPC: e500mc: Relax tlb invalidation condition on vcpu schedule

2014-06-13 Thread Scott Wood
On Fri, 2014-06-13 at 16:55 +0200, Alexander Graf wrote:
> On 13.06.14 16:43, mihai.cara...@freescale.com wrote:
>>> -----Original Message-----
>>> From: Alexander Graf [mailto:ag...@suse.de]
>>> Sent: Thursday, June 12, 2014 8:05 PM
>>> To: Caraman Mihai Claudiu-B02008
>>> Cc: kvm-ppc@vger.kernel.org; k...@vger.kernel.org;
>>> linuxppc-d...@lists.ozlabs.org; Wood Scott-B07421
>>> Subject: Re: [PATCH] KVM: PPC: e500mc: Relax tlb invalidation condition
>>> on vcpu schedule
>>>
>>> On 06/12/2014 04:00 PM, Mihai Caraman wrote:
>>>> On vcpu schedule, the condition checked for tlb pollution is too tight.
>>>> The tlb entries of one vcpu are polluted when a different vcpu from the
>>>> same partition runs in-between. Relax the current tlb invalidation
>>>> condition taking into account the lpid.

Can you quantify the performance improvement from this?  We've had bugs
in this area before, so let's make sure it's worth it before making this
more complicated.

>>>> Signed-off-by: Mihai Caraman mihai.caraman at freescale.com
>>> Your mailer is broken? :)
>>> This really should be an @.
>>>
>>> I think this should work. Scott, please ack.
>> Alex, you were right. I screwed up the patch description by inverting the
>> 'relax' and 'tight' terms :) It should have been more like this:
>>
>> KVM: PPC: e500mc: Enhance tlb invalidation condition on vcpu schedule
>>
>> On vcpu schedule, the condition checked for tlb pollution is too loose.
>> The tlb entries of a vcpu are polluted (vs stale) only when a different vcpu
>> within the same logical partition runs in-between. Optimize the tlb
>> invalidation condition taking into account the lpid.
>
> Can't we give every vcpu its own lpid? Or don't we trap on global
> invalidates?

That would significantly increase the odds of exhausting LPIDs,
especially on large chips like t4240 with similarly large VMs.  If we
were to do that, the LPIDs would need to be dynamically assigned (like
PIDs), and should probably be a separate numberspace per physical core.

-Scott




Re: [PATCH 4/4 v3] KVM: PPC: Bookehv: Get vcpu's last instruction for emulation

2014-06-13 Thread Scott Wood
On Thu, 2014-06-12 at 18:04 +0200, Alexander Graf wrote:
> On 06/02/2014 05:50 PM, Mihai Caraman wrote:
>> On book3e, KVM uses the dedicated load-external-pid (lwepx) instruction
>> to read the guest's last instruction on the exit path. lwepx exceptions
>> (DTLB_MISS, DSI and LRAT), generated by loading a guest address, need to
>> be handled by KVM. These exceptions are generated in a substituted guest
>> translation context (EPLC[EGS] = 1) from host context (MSR[GS] = 0).
>>
>> Currently, KVM hooks only interrupts generated from guest context
>> (MSR[GS] = 1), doing minimal checks on the fast path to avoid host
>> performance degradation. lwepx exceptions originate from host state
>> (MSR[GS] = 0), which implies additional checks in the DO_KVM macro
>> (besides the current MSR[GS] = 1) by looking at the Exception Syndrome
>> Register (ESR[EPID]) and the External PID Load Context Register
>> (EPLC[EGS]). Doing this on each Data TLB miss exception is obviously too
>> intrusive for the host.
>>
>> Read the guest's last instruction from kvmppc_load_last_inst() by
>> searching for the physical address and kmapping it. This addresses the
>> TODO for TLB eviction and execute-but-not-read entries, and allows us to
>> get rid of lwepx until we are able to handle failures.
>>
>> A simple stress benchmark shows a 1% sys performance degradation
>> compared with the previous approach (lwepx without failure handling):
>>
>> time for i in `seq 1 1`; do /bin/echo > /dev/null; done
>>
>> real    0m 8.85s
>> user    0m 4.34s
>> sys     0m 4.48s
>>
>> vs
>>
>> real    0m 8.84s
>> user    0m 4.36s
>> sys     0m 4.44s
>>
>> An alternative solution, to handle lwepx exceptions in KVM, is to
>> temporarily hijack the interrupt vector from the host. Some cores share
>> host IVOR registers between hardware threads, as is the case of FSL
>> e6500, which imposes additional synchronization logic for this solution
>> to work. This optimized solution can be developed later on top of this
>> patch.
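A minimal sketch of the translate-and-kmap read described above
(kvmppc_xlate_pc() is a hypothetical stand-in for the guest-TLB walk the
real patch performs in e500_mmu_host.c; the EMULATE_* returns follow the
existing KVM PPC conventions):

/* Hedged sketch, not the actual patch: translate the guest PC, map the
 * backing host page, and read the instruction without using lwepx. */
static int load_last_inst_sketch(struct kvm_vcpu *vcpu, u32 *instr)
{
	gpa_t gpaddr;
	struct page *page;
	u32 *va;

	/* kvmppc_xlate_pc() is hypothetical: walk the guest TLB for pc */
	if (kvmppc_xlate_pc(vcpu, vcpu->arch.pc, &gpaddr))
		return EMULATE_AGAIN;	/* mapping evicted: re-enter guest */

	page = gfn_to_page(vcpu->kvm, gpaddr >> PAGE_SHIFT);
	if (is_error_page(page))
		return EMULATE_FAIL;

	va = kmap_atomic(page);		/* keep the unaltered addr for kunmap */
	*instr = va[(gpaddr & (PAGE_SIZE - 1)) / sizeof(u32)];
	kunmap_atomic(va);
	kvm_release_page_clean(page);

	return EMULATE_DONE;
}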
 
>> Signed-off-by: Mihai Caraman mihai.cara...@freescale.com
>> ---
>> v3:
>>  - reworked patch description
>>  - use unaltered kmap addr for kunmap
>>  - get last instruction before being preempted
>>
>> v2:
>>  - reworked patch description
>>  - used pr_* functions
>>  - addressed cosmetic feedback
>>
>>  arch/powerpc/kvm/booke.c              | 32 
>>  arch/powerpc/kvm/bookehv_interrupts.S | 37 --
>>  arch/powerpc/kvm/e500_mmu_host.c      | 93 +++
>>  3 files changed, 134 insertions(+), 28 deletions(-)
>>
>> diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c
>> index 34a42b9..4ef52a8 100644
>> --- a/arch/powerpc/kvm/booke.c
>> +++ b/arch/powerpc/kvm/booke.c
>> @@ -880,6 +880,8 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
>>  	int r = RESUME_HOST;
>>  	int s;
>>  	int idx;
>> +	u32 last_inst = KVM_INST_FETCH_FAILED;
>> +	enum emulation_result emulated = EMULATE_DONE;
>>
>>  	/* update before a new last_exit_type is rewritten */
>>  	kvmppc_update_timing_stats(vcpu);
>> @@ -887,6 +889,15 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
>>  	/* restart interrupts if they were meant for the host */
>>  	kvmppc_restart_interrupt(vcpu, exit_nr);
>>
>> +	/*
>> +	 * get last instruction before being preempted
>> +	 * TODO: for e6500 check also BOOKE_INTERRUPT_LRAT_ERROR && ESR_DATA
>> +	 */
>> +	if (exit_nr == BOOKE_INTERRUPT_DATA_STORAGE ||
>> +	    exit_nr == BOOKE_INTERRUPT_DTLB_MISS ||
>> +	    exit_nr == BOOKE_INTERRUPT_HV_PRIV)
 
> Please make this a switch() - that's easier to read.
 
>> +		emulated = kvmppc_get_last_inst(vcpu, false, &last_inst);
>> +
>>  	local_irq_enable();
>>
>>  	trace_kvm_exit(exit_nr, vcpu);
>> @@ -895,6 +906,26 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
>>  	run->exit_reason = KVM_EXIT_UNKNOWN;
>>  	run->ready_for_interrupt_injection = 1;
>>
>> +	switch (emulated) {
>> +	case EMULATE_AGAIN:
>> +		r = RESUME_GUEST;
>> +		goto out;
>> +
>> +	case EMULATE_FAIL:
>> +		pr_debug("%s: emulation at %lx failed (%08x)\n",
>> +			 __func__, vcpu->arch.pc, last_inst);
>> +		/* For debugging, encode the failing instruction and
>> +		 * report it to userspace. */
>> +		run->hw.hardware_exit_reason = ~0ULL << 32;
>> +		run->hw.hardware_exit_reason |= last_inst;
>> +		kvmppc_core_queue_program(vcpu, ESR_PIL);
>> +		r = RESUME_HOST;
>> +		goto out;
>> +
>> +	default:
>> +		break;
>> +	}
>
> I think you can just put this into a function.
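One possible shape for that helper (a sketch following the existing
EMULATE_*/RESUME_* conventions, not the committed code):

/* Hedged sketch: hoist the emulation-result handling out of
 * kvmppc_handle_exit(); the caller would do
 *	r = kvmppc_resume_inst_load(run, vcpu, emulated, last_inst);
 *	goto out;
 * whenever emulated != EMULATE_DONE. */
static int kvmppc_resume_inst_load(struct kvm_run *run, struct kvm_vcpu *vcpu,
				   enum emulation_result emulated, u32 last_inst)
{
	switch (emulated) {
	case EMULATE_AGAIN:
		return RESUME_GUEST;
	case EMULATE_FAIL:
		pr_debug("%s: emulation at %lx failed (%08x)\n",
			 __func__, vcpu->arch.pc, last_inst);
		/* Encode the failing instruction and report it to userspace. */
		run->hw.hardware_exit_reason = ~0ULL << 32;
		run->hw.hardware_exit_reason |= last_inst;
		kvmppc_core_queue_program(vcpu, ESR_PIL);
		return RESUME_HOST;
	default:
		return RESUME_GUEST;
	}
}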
 
> Scott, I think the patch overall looks quite good. Can you please check
> as well and if you agree give it your reviewed-by? Mike, when Scott
> gives you a reviewed-by, please include it for the next version.
>
> Alex

>> +
>>  	switch (exit_nr) {
>>  	case BOOKE_INTERRUPT_MACHINE_CHECK:
>>  		printk("MACHINE CHECK: %lx\n", mfspr(SPRN_MCSR));
>> @@ -1184,6 +1215,7 @@ int