RE: [PATCH 4/4] kvm: powerpc: set cache coherency only for RAM pages

2013-07-30 Thread Bhushan Bharat-R65777


 -----Original Message-----
 From: Benjamin Herrenschmidt [mailto:b...@kernel.crashing.org]
 Sent: Saturday, July 27, 2013 3:57 AM
 To: Bhushan Bharat-R65777
 Cc: Alexander Graf; kvm-ppc@vger.kernel.org; k...@vger.kernel.org; linuxppc-
 d...@lists.ozlabs.org; Wood Scott-B07421
 Subject: Re: [PATCH 4/4] kvm: powerpc: set cache coherency only for RAM pages
 
 On Fri, 2013-07-26 at 15:03 +, Bhushan Bharat-R65777 wrote:
  Will not searching the Linux PTE be overkill?
 
 That's the best approach. Also, we are already searching it to resolve the
 page fault. That does mean we search twice, but on the other hand it also
 means the PTE is hot in the cache.


Below is an early git diff (not a proper cleanup patch), posted to confirm that
this is what we want on PowerPC and to gather early feedback. I also ran some
benchmarks to understand the overhead, if any.

Using kvm_is_mmio_pfn() (what the current patch does):

Real: 0m46.616s + 0m49.517s + 0m49.510s + 0m46.936s + 0m46.889s + 0m46.684s = Avg: 47.692s
User: 0m31.636s + 0m31.816s + 0m31.456s + 0m31.752s + 0m32.028s + 0m31.848s = Avg: 31.756s
Sys:  0m11.596s + 0m11.868s + 0m12.244s + 0m11.672s + 0m11.356s + 0m11.432s = Avg: 11.695s


Using kernel page table search (the changes below):

Real: 0m46.431s + 0m50.269s + 0m46.724s + 0m46.645s + 0m46.670s + 0m50.259s = Avg: 47.833s
User: 0m31.568s + 0m31.816s + 0m31.444s + 0m31.808s + 0m31.312s + 0m31.740s = Avg: 31.614s
Sys:  0m11.516s + 0m12.060s + 0m11.872s + 0m11.476s + 0m12.000s + 0m12.152s = Avg: 11.846s

So on average the page table walk costs about 0.14s (~0.3%) of real time, which
is well within the run-to-run variance (individual runs differ by more than 3s).

--
diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
index 3328353..d6d0dac 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -532,6 +532,7 @@ struct kvm_vcpu_arch {
 	u32 epr;
 	u32 crit_save;
 	struct kvmppc_booke_debug_reg dbg_reg;
+	pgd_t *pgdir;
 #endif
 	gpa_t paddr_accessed;
 	gva_t vaddr_accessed;
diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c
index 17722d8..eb2 100644
--- a/arch/powerpc/kvm/booke.c
+++ b/arch/powerpc/kvm/booke.c
@@ -697,7 +697,7 @@ int kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
 #endif
 
 	kvmppc_fix_ee_before_entry();
-
+	vcpu->arch.pgdir = current->mm->pgd;
 	ret = __kvmppc_vcpu_run(kvm_run, vcpu);
 
/* No need for kvm_guest_exit. It's done in handle_exit.
diff --git a/arch/powerpc/kvm/e500.h b/arch/powerpc/kvm/e500.h
index 4fd9650..fc4b2f6 100644
--- a/arch/powerpc/kvm/e500.h
+++ b/arch/powerpc/kvm/e500.h
@@ -31,11 +31,13 @@ enum vcpu_ftr {
 #define E500_TLB_NUM   2
 
 /* entry is mapped somewhere in host TLB */
-#define E500_TLB_VALID		(1 << 0)
+#define E500_TLB_VALID		(1 << 31)
 /* TLB1 entry is mapped by host TLB1, tracked by bitmaps */
-#define E500_TLB_BITMAP		(1 << 1)
+#define E500_TLB_BITMAP		(1 << 30)
 /* TLB1 entry is mapped by host TLB0 */
-#define E500_TLB_TLB0		(1 << 2)
+#define E500_TLB_TLB0		(1 << 29)
+/* Lower 5 bits have WIMGE value */
+#define E500_TLB_WIMGE_MASK	(0x1f)
 
 struct tlbe_ref {
 	pfn_t pfn;	/* valid only for TLB0, except briefly */
diff --git a/arch/powerpc/kvm/e500_mmu_host.c b/arch/powerpc/kvm/e500_mmu_host.c
index 5cbdc8f..a48c13f 100644
--- a/arch/powerpc/kvm/e500_mmu_host.c
+++ b/arch/powerpc/kvm/e500_mmu_host.c
@@ -40,6 +40,84 @@
 
 static struct kvmppc_e500_tlb_params host_tlb_params[E500_TLB_NUM];
 
+/*
+ * find_linux_pte returns the address of a linux pte for a given
+ * effective address and directory.  If not found, it returns zero.
+ */
+static inline pte_t *find_linux_pte(pgd_t *pgdir, unsigned long ea)
+{
+	pgd_t *pg;
+	pud_t *pu;
+	pmd_t *pm;
+	pte_t *pt = NULL;
+
+	pg = pgdir + pgd_index(ea);
+	if (!pgd_none(*pg)) {
+		pu = pud_offset(pg, ea);
+		if (!pud_none(*pu)) {
+			pm = pmd_offset(pu, ea);
+			if (pmd_present(*pm))
+				pt = pte_offset_kernel(pm, ea);
+		}
+	}
+	return pt;
+}
+
+#ifdef CONFIG_HUGETLB_PAGE
+pte_t *find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea,
+				 unsigned *shift);
+#else
+static inline pte_t *find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea,
+					       unsigned *shift)
+{
+	if (shift)
+		*shift = 0;
+	return find_linux_pte(pgdir, ea);
+}
+#endif /* !CONFIG_HUGETLB_PAGE */
+
+/*
+ * Lock and read a linux PTE.  If it's present and writable, atomically
+ * set dirty and referenced bits and return the PTE, otherwise return 0.
+ */
+static inline pte_t kvmppc_read_update_linux_pte(pte_t *p, int writing)
+{
+	pte_t pte = pte_val(*p);
+
+	if (pte_present(pte)) {
+		pte = pte_mkyoung(pte);
+		if (writing && pte_write(pte))
+			pte = pte_mkdirty(pte);
+	}
+
+	*p = pte;
+
+	return pte;
+}

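To make the truncated diff above easier to follow, here is a compact sketch of
the flow it adds (reconstructed from the quoted copy in the replies below;
error handling elided):

	/* In kvmppc_e500_shadow_map(), after the pfn is resolved: walk the
	 * host page table saved in vcpu->arch.pgdir by kvmppc_vcpu_run(),
	 * and take the WIMGE attributes from the Linux PTE if present. */
	pgdir = vcpu_e500->vcpu.arch.pgdir;
	pte = lookup_linux_pte(pgdir, hva, 1, &tsize_pages);
	if (pte_present(pte))
		wimg = (pte >> PTE_WIMGE_SHIFT) & MAS2_WIMGE_MASK;
	else
		wimg = MAS2_I | MAS2_G;	/* cache-inhibited, guarded */
	/* wimg lands in the low 5 bits of ref->flags and is later copied
	 * into stlbe->mas2 by kvmppc_e500_setup_stlbe(). */
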
Re: [PATCH 4/4] kvm: powerpc: set cache coherency only for RAM pages

2013-07-30 Thread Scott Wood

On 07/30/2013 11:22:54 AM, Bhushan Bharat-R65777 wrote:
diff --git a/arch/powerpc/kvm/e500_mmu_host.c b/arch/powerpc/kvm/e500_mmu_host.c

index 5cbdc8f..a48c13f 100644
--- a/arch/powerpc/kvm/e500_mmu_host.c
+++ b/arch/powerpc/kvm/e500_mmu_host.c
@@ -40,6 +40,84 @@

 static struct kvmppc_e500_tlb_params host_tlb_params[E500_TLB_NUM];

+/*
+ * find_linux_pte returns the address of a linux pte for a given
+ * effective address and directory.  If not found, it returns zero.
+ */
+static inline pte_t *find_linux_pte(pgd_t *pgdir, unsigned long ea)
+{
+	pgd_t *pg;
+	pud_t *pu;
+	pmd_t *pm;
+	pte_t *pt = NULL;
+
+	pg = pgdir + pgd_index(ea);
+	if (!pgd_none(*pg)) {
+		pu = pud_offset(pg, ea);
+		if (!pud_none(*pu)) {
+			pm = pmd_offset(pu, ea);
+			if (pmd_present(*pm))
+				pt = pte_offset_kernel(pm, ea);
+		}
+	}
+	return pt;
+}


How is this specific to KVM or e500?


+#ifdef CONFIG_HUGETLB_PAGE
+pte_t *find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea,
+				 unsigned *shift);
+#else
+static inline pte_t *find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea,
+					       unsigned *shift)
+{
+	if (shift)
+		*shift = 0;
+	return find_linux_pte(pgdir, ea);
+}
+#endif /* !CONFIG_HUGETLB_PAGE */


This is already declared in asm/pgtable.h.  If we need a non-hugepage  
alternative, that should also go in asm/pgtable.h.
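
For reference, a minimal sketch of that suggestion (the placement and the
guards are an assumption, not part of the posted patch):

/* arch/powerpc/include/asm/pgtable.h (assumed placement) */
#ifdef CONFIG_HUGETLB_PAGE
pte_t *find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea,
				 unsigned *shift);
#else
static inline pte_t *find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea,
					       unsigned *shift)
{
	if (shift)
		*shift = 0;
	return find_linux_pte(pgdir, ea);
}
#endif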



+/*
+ * Lock and read a linux PTE.  If it's present and writable, atomically
+ * set dirty and referenced bits and return the PTE, otherwise return 0.
+ */
+static inline pte_t kvmppc_read_update_linux_pte(pte_t *p, int writing)
+{
+	pte_t pte = pte_val(*p);
+
+	if (pte_present(pte)) {
+		pte = pte_mkyoung(pte);
+		if (writing && pte_write(pte))
+			pte = pte_mkdirty(pte);
+	}
+
+	*p = pte;
+
+	return pte;
+}
+
+static pte_t lookup_linux_pte(pgd_t *pgdir, unsigned long hva,
+			      int writing, unsigned long *pte_sizep)
+{
+	pte_t *ptep;
+	unsigned long ps = *pte_sizep;
+	unsigned int shift;
+
+	ptep = find_linux_pte_or_hugepte(pgdir, hva, &shift);
+	if (!ptep)
+		return __pte(0);
+	if (shift)
+		*pte_sizep = 1ul << shift;
+	else
+		*pte_sizep = PAGE_SIZE;
+
+	if (ps > *pte_sizep)
+		return __pte(0);
+	if (!pte_present(*ptep))
+		return __pte(0);
+
+	return kvmppc_read_update_linux_pte(ptep, writing);
+}
+


None of this belongs in this file either.


@@ -326,8 +405,8 @@ static void kvmppc_e500_setup_stlbe(
 
 	/* Force IPROT=0 for all guest mappings. */
 	stlbe->mas1 = MAS1_TSIZE(tsize) | get_tlb_sts(gtlbe) | MAS1_VALID;
-	stlbe->mas2 = (gvaddr & MAS2_EPN) |
-		      e500_shadow_mas2_attrib(gtlbe->mas2, pfn);
+	stlbe->mas2 = (gvaddr & MAS2_EPN) | (ref->flags & E500_TLB_WIMGE_MASK);
+//	e500_shadow_mas2_attrib(gtlbe->mas2, pfn);


MAS2_E and MAS2_G should be safe to come from the guest.

How does this work for TLB1?  One ref corresponds to one guest entry,  
which may correspond to multiple host entries, potentially each with  
different WIM settings.



 	stlbe->mas7_3 = ((u64)pfn << PAGE_SHIFT) |
 			e500_shadow_mas3_attrib(gtlbe->mas7_3, pr);

@@ -346,6 +425,8 @@ static inline int kvmppc_e500_shadow_map(struct kvmppc_vcpu_e500 *vcpu_e500,
 	unsigned long hva;
 	int pfnmap = 0;
 	int tsize = BOOK3E_PAGESZ_4K;
+	pte_t pte;
+	int wimg = 0;

 	/*
 	 * Translate guest physical to true physical, acquiring
@@ -451,6 +532,8 @@ static inline int kvmppc_e500_shadow_map(struct kvmppc_vcpu_e500 *vcpu_e500,
 
 	if (likely(!pfnmap)) {
 		unsigned long tsize_pages = 1 << (tsize + 10 - PAGE_SHIFT);
+		pgd_t *pgdir;
+
 		pfn = gfn_to_pfn_memslot(slot, gfn);
 		if (is_error_noslot_pfn(pfn)) {
 			printk(KERN_ERR "Couldn't get real page for gfn %lx!\n",
@@ -461,9 +544,15 @@ static inline int kvmppc_e500_shadow_map(struct kvmppc_vcpu_e500 *vcpu_e500,
 		/* Align guest and physical address to page map boundaries */
 		pfn &= ~(tsize_pages - 1);
 		gvaddr &= ~((tsize_pages << PAGE_SHIFT) - 1);
+		pgdir = vcpu_e500->vcpu.arch.pgdir;
+		pte = lookup_linux_pte(pgdir, hva, 1, &tsize_pages);
+		if (pte_present(pte))
+			wimg = (pte >> PTE_WIMGE_SHIFT) & MAS2_WIMGE_MASK;
+		else
+			wimg = MAS2_I | MAS2_G;


If the PTE is not present, then we can't map it, right?  So why I+G?

-Scott
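
A minimal sketch of the alternative Scott is hinting at (an assumption, not
code from the thread): treat a non-present Linux PTE as a mapping failure
instead of inventing I+G attributes for it:

	pte = lookup_linux_pte(pgdir, hva, 1, &tsize_pages);
	if (!pte_present(pte)) {
		/* No valid host PTE, so there is nothing to derive WIMGE
		 * from; fail the mapping rather than guess. */
		kvm_release_pfn_clean(pfn);
		return -EINVAL;
	}
	wimg = (pte >> PTE_WIMGE_SHIFT) & MAS2_WIMGE_MASK;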

RE: [PATCH 4/4] kvm: powerpc: set cache coherency only for RAM pages

2013-07-30 Thread Bhushan Bharat-R65777


 -----Original Message-----
 From: Wood Scott-B07421
 Sent: Wednesday, July 31, 2013 12:19 AM
 To: Bhushan Bharat-R65777
 Cc: Benjamin Herrenschmidt; Alexander Graf; kvm-ppc@vger.kernel.org;
 k...@vger.kernel.org; linuxppc-...@lists.ozlabs.org; Wood Scott-B07421
 Subject: Re: [PATCH 4/4] kvm: powerpc: set cache coherency only for RAM pages
 
 On 07/30/2013 11:22:54 AM, Bhushan Bharat-R65777 wrote:
  diff --git a/arch/powerpc/kvm/e500_mmu_host.c b/arch/powerpc/kvm/e500_mmu_host.c
  index 5cbdc8f..a48c13f 100644
  --- a/arch/powerpc/kvm/e500_mmu_host.c
  +++ b/arch/powerpc/kvm/e500_mmu_host.c
  @@ -40,6 +40,84 @@
 
   static struct kvmppc_e500_tlb_params host_tlb_params[E500_TLB_NUM];
 
  +/*
  + * find_linux_pte returns the address of a linux pte for a given
  + * effective address and directory.  If not found, it returns zero.
  + */
  +static inline pte_t *find_linux_pte(pgd_t *pgdir, unsigned long ea)
  +{
  +	pgd_t *pg;
  +	pud_t *pu;
  +	pmd_t *pm;
  +	pte_t *pt = NULL;
  +
  +	pg = pgdir + pgd_index(ea);
  +	if (!pgd_none(*pg)) {
  +		pu = pud_offset(pg, ea);
  +		if (!pud_none(*pu)) {
  +			pm = pmd_offset(pu, ea);
  +			if (pmd_present(*pm))
  +				pt = pte_offset_kernel(pm, ea);
  +		}
  +	}
  +	return pt;
  +}
 
 How is this specific to KVM or e500?
 
  +#ifdef CONFIG_HUGETLB_PAGE
  +pte_t *find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea,
  +				 unsigned *shift);
  +#else
  +static inline pte_t *find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea,
  +					       unsigned *shift)
  +{
  +	if (shift)
  +		*shift = 0;
  +	return find_linux_pte(pgdir, ea);
  +}
  +#endif /* !CONFIG_HUGETLB_PAGE */
 
 This is already declared in asm/pgtable.h.  If we need a non-hugepage
 alternative, that should also go in asm/pgtable.h.
 
  +/*
  + * Lock and read a linux PTE.  If it's present and writable, atomically
  + * set dirty and referenced bits and return the PTE, otherwise return 0.
  + */
  +static inline pte_t kvmppc_read_update_linux_pte(pte_t *p, int writing)
  +{
  +	pte_t pte = pte_val(*p);
  +
  +	if (pte_present(pte)) {
  +		pte = pte_mkyoung(pte);
  +		if (writing && pte_write(pte))
  +			pte = pte_mkdirty(pte);
  +	}
  +
  +	*p = pte;
  +
  +	return pte;
  +}
  +
  +static pte_t lookup_linux_pte(pgd_t *pgdir, unsigned long hva,
  +			      int writing, unsigned long *pte_sizep)
  +{
  +	pte_t *ptep;
  +	unsigned long ps = *pte_sizep;
  +	unsigned int shift;
  +
  +	ptep = find_linux_pte_or_hugepte(pgdir, hva, &shift);
  +	if (!ptep)
  +		return __pte(0);
  +	if (shift)
  +		*pte_sizep = 1ul << shift;
  +	else
  +		*pte_sizep = PAGE_SIZE;
  +
  +	if (ps > *pte_sizep)
  +		return __pte(0);
  +	if (!pte_present(*ptep))
  +		return __pte(0);
  +
  +	return kvmppc_read_update_linux_pte(ptep, writing);
  +}
  +
 
 None of this belongs in this file either.
 
  @@ -326,8 +405,8 @@ static void kvmppc_e500_setup_stlbe(
  
  	/* Force IPROT=0 for all guest mappings. */
  	stlbe->mas1 = MAS1_TSIZE(tsize) | get_tlb_sts(gtlbe) | MAS1_VALID;
  -	stlbe->mas2 = (gvaddr & MAS2_EPN) |
  -		      e500_shadow_mas2_attrib(gtlbe->mas2, pfn);
  +	stlbe->mas2 = (gvaddr & MAS2_EPN) | (ref->flags & E500_TLB_WIMGE_MASK);
  +//	e500_shadow_mas2_attrib(gtlbe->mas2, pfn);
 
 MAS2_E and MAS2_G should be safe to come from the guest.

This is handled when setting WIMGE in ref->flags.
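
One way that combination could look (an assumption, the posted diff does not
show this step): take W, I and M from the host PTE and let E and G pass
through from the guest's MAS2:

	/* Assumed combination step, not in the posted diff. */
	wimg = (wimg & (MAS2_W | MAS2_I | MAS2_M)) |
	       (gtlbe->mas2 & (MAS2_E | MAS2_G));
	ref->flags |= (wimg & E500_TLB_WIMGE_MASK);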

 
 How does this work for TLB1?  One ref corresponds to one guest entry, which 
 may
 correspond to multiple host entries, potentially each with different WIM
 settings.

Yes, one ref corresponds to one guest entry. Here is how this works when one
guest TLB1 entry maps to many host tlb0/tlb1 entries: on guest tlbwe, KVM sets
up the guest TLB entry and then pre-maps one host TLB entry (out of the many);
ref (ref->pfn etc.) points to that pre-mapped entry for the guest entry. When
a later guest TLB miss falls on the same guest TLB entry but demands another
host TLB entry, we overwrite ref (ref->pfn etc.) to point to the new host
mapping for the same guest mapping.
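
An illustrative sketch of that bookkeeping, using the names from the diff
above (not code from the patch itself):

	/* Runs for every host TLB miss that resolves against this guest
	 * entry; the single ref simply tracks the most recent host mapping
	 * and its WIMGE bits. */
	static inline void e500_ref_setup(struct tlbe_ref *ref, pfn_t pfn,
					  unsigned int wimg)
	{
		ref->pfn = pfn;
		ref->flags = E500_TLB_VALID | (wimg & E500_TLB_WIMGE_MASK);
	}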

 
  	stlbe->mas7_3 = ((u64)pfn << PAGE_SHIFT) |
  			e500_shadow_mas3_attrib(gtlbe->mas7_3, pr);
 
  @@ -346,6 +425,8 @@ static inline int kvmppc_e500_shadow_map(struct kvmppc_vcpu_e500 *vcpu_e500,
  	unsigned long hva;
  	int pfnmap = 0;
  	int tsize = BOOK3E_PAGESZ_4K;
  +	pte_t pte;
  +	int wimg = 0;
 
  /*
   * Translate guest physical to true physical, acquiring @@