RE: [PATCH 4/4] kvm: powerpc: set cache coherency only for RAM pages
-Original Message- From: Benjamin Herrenschmidt [mailto:b...@kernel.crashing.org] Sent: Saturday, July 27, 2013 3:57 AM To: Bhushan Bharat-R65777 Cc: Alexander Graf; kvm-ppc@vger.kernel.org; k...@vger.kernel.org; linuxppc- d...@lists.ozlabs.org; Wood Scott-B07421 Subject: Re: [PATCH 4/4] kvm: powerpc: set cache coherency only for RAM pages On Fri, 2013-07-26 at 15:03 +, Bhushan Bharat-R65777 wrote: Will not searching the Linux PTE is a overkill? That's the best approach. Also we are searching it already to resolve the page fault. That does mean we search twice but on the other hand that also means it's hot in the cache. Below is early git diff (not a proper cleanup patch), to be sure that this is what we want on PowerPC and take early feedback. Also I run some benchmark to understand the overhead if any. Using kvm_is_mmio_pfn(); what the current patch does: Real: 0m46.616s + 0m49.517s + 0m49.510s + 0m46.936s + 0m46.889s + 0m46.684s = Avg; 47.692s User: 0m31.636s + 0m31.816s + 0m31.456s + 0m31.752s + 0m32.028s + 0m31.848s = Avg; 31.756s Sys: 0m11.596s + 0m11.868s + 0m12.244s + 0m11.672s + 0m11.356s + 0m11.432s = Avg; 11.695s Using kernel page table search (below changes): Real: 0m46.431s + 0m50.269s + 0m46.724s + 0m46.645s + 0m46.670s + 0m50.259s = Avg; 47.833s User: 0m31.568s + 0m31.816s + 0m31.444s + 0m31.808s + 0m31.312s + 0m31.740s = Avg; 31.614s Sys: 0m11.516s + 0m12.060s + 0m11.872s + 0m11.476s + 0m12.000s + 0m12.152s = Avg; 11.846s -- diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index 3328353..d6d0dac 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -532,6 +532,7 @@ struct kvm_vcpu_arch { u32 epr; u32 crit_save; struct kvmppc_booke_debug_reg dbg_reg; + pgd_t *pgdir; #endif gpa_t paddr_accessed; gva_t vaddr_accessed; diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c index 17722d8..eb2 100644 --- a/arch/powerpc/kvm/booke.c +++ b/arch/powerpc/kvm/booke.c @@ 
-697,7 +697,7 @@ int kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) #endif kvmppc_fix_ee_before_entry(); - + vcpu->arch.pgdir = current->mm->pgd; ret = __kvmppc_vcpu_run(kvm_run, vcpu); /* No need for kvm_guest_exit. It's done in handle_exit. diff --git a/arch/powerpc/kvm/e500.h b/arch/powerpc/kvm/e500.h index 4fd9650..fc4b2f6 100644 --- a/arch/powerpc/kvm/e500.h +++ b/arch/powerpc/kvm/e500.h @@ -31,11 +31,13 @@ enum vcpu_ftr { #define E500_TLB_NUM 2 /* entry is mapped somewhere in host TLB */ -#define E500_TLB_VALID (1 << 0) +#define E500_TLB_VALID (1 << 31) /* TLB1 entry is mapped by host TLB1, tracked by bitmaps */ -#define E500_TLB_BITMAP (1 << 1) +#define E500_TLB_BITMAP (1 << 30) /* TLB1 entry is mapped by host TLB0 */ -#define E500_TLB_TLB0 (1 << 2) +#define E500_TLB_TLB0 (1 << 29) +/* Lower 5 bits have WIMGE value */ +#define E500_TLB_WIMGE_MASK (0x1f) struct tlbe_ref { pfn_t pfn; /* valid only for TLB0, except briefly */
+ */ +static inline pte_t *find_linux_pte(pgd_t *pgdir, unsigned long ea) +{ +pgd_t *pg; +pud_t *pu; +pmd_t *pm; +pte_t *pt = NULL; + +pg = pgdir + pgd_index(ea); +if (!pgd_none(*pg)) { +pu = pud_offset(pg, ea); +if (!pud_none(*pu)) { +pm = pmd_offset(pu, ea); +if (pmd_present(*pm)) +pt = pte_offset_kernel(pm, ea); +} +} +return pt; +} + +#ifdef CONFIG_HUGETLB_PAGE +pte_t *find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea, + unsigned *shift); +#else +static inline pte_t *find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea, + unsigned *shift) +{ +if (shift) +*shift = 0; +return find_linux_pte(pgdir, ea); +} +#endif /* !CONFIG_HUGETLB_PAGE */ + +/* + * Lock and read a linux PTE. If it's present and writable, atomically + * set dirty and referenced bits and return the PTE, otherwise return 0. + */ +static inline pte_t kvmppc_read_update_linux_pte(pte_t *p, int writing) +{ + pte_t pte = pte_val(*p); + + if (pte_present(pte)) { + pte =
Re: [PATCH 4/4] kvm: powerpc: set cache coherency only for RAM pages
On 07/30/2013 11:22:54 AM, Bhushan Bharat-R65777 wrote: diff --git a/arch/powerpc/kvm/e500_mmu_host.c b/arch/powerpc/kvm/e500_mmu_host.c index 5cbdc8f..a48c13f 100644 --- a/arch/powerpc/kvm/e500_mmu_host.c +++ b/arch/powerpc/kvm/e500_mmu_host.c @@ -40,6 +40,84 @@ static struct kvmppc_e500_tlb_params host_tlb_params[E500_TLB_NUM]; +/* + * find_linux_pte returns the address of a linux pte for a given + * effective address and directory. If not found, it returns zero. + */ +static inline pte_t *find_linux_pte(pgd_t *pgdir, unsigned long ea) +{ +pgd_t *pg; +pud_t *pu; +pmd_t *pm; +pte_t *pt = NULL; + +pg = pgdir + pgd_index(ea); +if (!pgd_none(*pg)) { +pu = pud_offset(pg, ea); +if (!pud_none(*pu)) { +pm = pmd_offset(pu, ea); +if (pmd_present(*pm)) +pt = pte_offset_kernel(pm, ea); +} +} +return pt; +} How is this specific to KVM or e500? +#ifdef CONFIG_HUGETLB_PAGE +pte_t *find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea, + unsigned *shift); +#else +static inline pte_t *find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea, + unsigned *shift) +{ +if (shift) +*shift = 0; +return find_linux_pte(pgdir, ea); +} +#endif /* !CONFIG_HUGETLB_PAGE */ This is already declared in asm/pgtable.h. If we need a non-hugepage alternative, that should also go in asm/pgtable.h. +/* + * Lock and read a linux PTE. If it's present and writable, atomically + * set dirty and referenced bits and return the PTE, otherwise return 0. 
+ */ +static inline pte_t kvmppc_read_update_linux_pte(pte_t *p, int writing) +{ + pte_t pte = pte_val(*p); + + if (pte_present(pte)) { + pte = pte_mkyoung(pte); + if (writing && pte_write(pte)) + pte = pte_mkdirty(pte); + } + + *p = pte; + + return pte; +} + +static pte_t lookup_linux_pte(pgd_t *pgdir, unsigned long hva, + int writing, unsigned long *pte_sizep) +{ + pte_t *ptep; + unsigned long ps = *pte_sizep; + unsigned int shift; + + ptep = find_linux_pte_or_hugepte(pgdir, hva, &shift); + if (!ptep) + return __pte(0); + if (shift) + *pte_sizep = 1ul << shift; + else + *pte_sizep = PAGE_SIZE; + + if (ps > *pte_sizep) + return __pte(0); + if (!pte_present(*ptep)) + return __pte(0); + + return kvmppc_read_update_linux_pte(ptep, writing); +} + None of this belongs in this file either. @@ -326,8 +405,8 @@ static void kvmppc_e500_setup_stlbe( /* Force IPROT=0 for all guest mappings. */ stlbe->mas1 = MAS1_TSIZE(tsize) | get_tlb_sts(gtlbe) | MAS1_VALID; - stlbe->mas2 = (gvaddr & MAS2_EPN) | - e500_shadow_mas2_attrib(gtlbe->mas2, pfn); + stlbe->mas2 = (gvaddr & MAS2_EPN) | (ref->flags & E500_TLB_WIMGE_MASK); +// e500_shadow_mas2_attrib(gtlbe->mas2, pfn); MAS2_E and MAS2_G should be safe to come from the guest. How does this work for TLB1? One ref corresponds to one guest entry, which may correspond to multiple host entries, potentially each with different WIM settings. 
stlbe->mas7_3 = ((u64)pfn << PAGE_SHIFT) | e500_shadow_mas3_attrib(gtlbe->mas7_3, pr); @@ -346,6 +425,8 @@ static inline int kvmppc_e500_shadow_map(struct kvmppc_vcpu_e500 *vcpu_e500, unsigned long hva; int pfnmap = 0; int tsize = BOOK3E_PAGESZ_4K; + pte_t pte; + int wimg = 0; /* * Translate guest physical to true physical, acquiring @@ -451,6 +532,8 @@ static inline int kvmppc_e500_shadow_map(struct kvmppc_vcpu_e500 *vcpu_e500, if (likely(!pfnmap)) { unsigned long tsize_pages = 1 << (tsize + 10 - PAGE_SHIFT); + pgd_t *pgdir; + pfn = gfn_to_pfn_memslot(slot, gfn); if (is_error_noslot_pfn(pfn)) { printk(KERN_ERR "Couldn't get real page for gfn %lx!\n", @@ -461,9 +544,15 @@ static inline int kvmppc_e500_shadow_map(struct kvmppc_vcpu_e500 *vcpu_e500, /* Align guest and physical address to page map boundaries */ pfn &= ~(tsize_pages - 1); gvaddr &= ~((tsize_pages << PAGE_SHIFT) - 1); + pgdir = vcpu_e500->vcpu.arch.pgdir; + pte = lookup_linux_pte(pgdir, hva, 1, &tsize_pages); + if (pte_present(pte)) + wimg = (pte >> PTE_WIMGE_SHIFT) & MAS2_WIMGE_MASK; + else + wimg = MAS2_I | MAS2_G; If the PTE is not present, then we can't map it, right? So why I+G? -Scott -- To unsubscribe from
RE: [PATCH 4/4] kvm: powerpc: set cache coherency only for RAM pages
-Original Message- From: Wood Scott-B07421 Sent: Wednesday, July 31, 2013 12:19 AM To: Bhushan Bharat-R65777 Cc: Benjamin Herrenschmidt; Alexander Graf; kvm-ppc@vger.kernel.org; k...@vger.kernel.org; linuxppc-...@lists.ozlabs.org; Wood Scott-B07421 Subject: Re: [PATCH 4/4] kvm: powerpc: set cache coherency only for RAM pages On 07/30/2013 11:22:54 AM, Bhushan Bharat-R65777 wrote: diff --git a/arch/powerpc/kvm/e500_mmu_host.c b/arch/powerpc/kvm/e500_mmu_host.c index 5cbdc8f..a48c13f 100644 --- a/arch/powerpc/kvm/e500_mmu_host.c +++ b/arch/powerpc/kvm/e500_mmu_host.c @@ -40,6 +40,84 @@ static struct kvmppc_e500_tlb_params host_tlb_params[E500_TLB_NUM]; +/* + * find_linux_pte returns the address of a linux pte for a given + * effective address and directory. If not found, it returns zero. + */ +static inline pte_t *find_linux_pte(pgd_t *pgdir, unsigned long ea) { +pgd_t *pg; +pud_t *pu; +pmd_t *pm; +pte_t *pt = NULL; + +pg = pgdir + pgd_index(ea); +if (!pgd_none(*pg)) { +pu = pud_offset(pg, ea); +if (!pud_none(*pu)) { +pm = pmd_offset(pu, ea); +if (pmd_present(*pm)) +pt = pte_offset_kernel(pm, ea); +} +} +return pt; +} How is this specific to KVM or e500? +#ifdef CONFIG_HUGETLB_PAGE +pte_t *find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea, + unsigned *shift); #else static +inline pte_t *find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea, + unsigned *shift) { +if (shift) +*shift = 0; +return find_linux_pte(pgdir, ea); } #endif /* +!CONFIG_HUGETLB_PAGE */ This is already declared in asm/pgtable.h. If we need a non-hugepage alternative, that should also go in asm/pgtable.h. +/* + * Lock and read a linux PTE. If it's present and writable, atomically + * set dirty and referenced bits and return the PTE, otherwise return 0. 
+ */ +static inline pte_t kvmppc_read_update_linux_pte(pte_t *p, int writing) +{ + pte_t pte = pte_val(*p); + + if (pte_present(pte)) { + pte = pte_mkyoung(pte); + if (writing && pte_write(pte)) + pte = pte_mkdirty(pte); + } + + *p = pte; + + return pte; +} + +static pte_t lookup_linux_pte(pgd_t *pgdir, unsigned long hva, + int writing, unsigned long *pte_sizep) { + pte_t *ptep; + unsigned long ps = *pte_sizep; + unsigned int shift; + + ptep = find_linux_pte_or_hugepte(pgdir, hva, &shift); + if (!ptep) + return __pte(0); + if (shift) + *pte_sizep = 1ul << shift; + else + *pte_sizep = PAGE_SIZE; + + if (ps > *pte_sizep) + return __pte(0); + if (!pte_present(*ptep)) + return __pte(0); + + return kvmppc_read_update_linux_pte(ptep, writing); } + None of this belongs in this file either. @@ -326,8 +405,8 @@ static void kvmppc_e500_setup_stlbe( /* Force IPROT=0 for all guest mappings. */ stlbe->mas1 = MAS1_TSIZE(tsize) | get_tlb_sts(gtlbe) | MAS1_VALID; - stlbe->mas2 = (gvaddr & MAS2_EPN) | - e500_shadow_mas2_attrib(gtlbe->mas2, pfn); + stlbe->mas2 = (gvaddr & MAS2_EPN) | (ref->flags & E500_TLB_WIMGE_MASK); +// e500_shadow_mas2_attrib(gtlbe->mas2, pfn); MAS2_E and MAS2_G should be safe to come from the guest. This is handled when setting WIMGE in ref->flags. How does this work for TLB1? One ref corresponds to one guest entry, which may correspond to multiple host entries, potentially each with different WIM settings. Yes, one ref corresponds to one guest entry. To understand how this will work when one guest tlb1 entry may map to many host tlb0/1 entries; on guest tlbwe, KVM sets up one guest tlb entry and then pre-maps one host tlb entry (out of many) and ref (ref->pfn etc) points to this pre-mapped entry for that guest entry. Now a guest TLB miss happens which falls on the same guest tlb entry but demands another host tlb entry. In that flow we change/overwrite ref (ref->pfn etc) to point to the new host mapping for the same guest mapping. 
stlbe->mas7_3 = ((u64)pfn << PAGE_SHIFT) | e500_shadow_mas3_attrib(gtlbe->mas7_3, pr); @@ -346,6 +425,8 @@ static inline int kvmppc_e500_shadow_map(struct kvmppc_vcpu_e500 *vcpu_e500, unsigned long hva; int pfnmap = 0; int tsize = BOOK3E_PAGESZ_4K; + pte_t pte; + int wimg = 0; /* * Translate guest physical to true physical, acquiring @@