Re: [PATCH v5 6/6] arm64: KVM: Implement 4 levels of translation tables for HYP and stage2

Christoffer Dall Tue, 06 May 2014 03:50:32 -0700

On Thu, May 01, 2014 at 11:34:19AM +0900, Jungseok Lee wrote:
> This patch adds 4 levels of translation tables implementation for both
> HYP and stage2.
> 
> Both symmetric and asymmetric configurations for page size and translation
> levels are validated on Fast Models:
> 
>  1) 4KB  + 3 levels guest on 4KB  + 3 levels host
> 
>  2) 4KB  + 4 levels guest on 4KB  + 3 levels host
> 
>  3) 64KB + 2 levels guest on 4KB  + 3 levels host
> 
>  4) 4KB  + 3 levels guest on 4KB  + 4 levels host
> 
>  5) 4KB  + 4 levels guest on 4KB  + 4 levels host
> 
>  6) 64KB + 2 levels guest on 4KB  + 4 levels host
> 
>  7) 4KB  + 3 levels guest on 64KB + 2 levels host
> 
>  8) 4KB  + 4 levels guest on 64KB + 2 levels host
> 
>  9) 64KB + 2 levels guest on 64KB + 2 levels host
> 
> Cc: Marc Zyngier <marc.zyng...@arm.com>
> Cc: Christoffer Dall <christoffer.d...@linaro.org>
> Signed-off-by: Jungseok Lee <jays....@samsung.com>
> Reviewed-by: Sungjinn Chung <sungjinn.ch...@samsung.com>
> ---
>  arch/arm/include/asm/kvm_mmu.h   |   10 +++++
>  arch/arm/kvm/mmu.c               |   88 +++++++++++++++++++++++++++++++++-----
>  arch/arm64/include/asm/kvm_arm.h |   34 ++++++++++++---
>  arch/arm64/include/asm/kvm_mmu.h |   12 ++++++
>  4 files changed, 127 insertions(+), 17 deletions(-)
> 
> diff --git a/arch/arm/include/asm/kvm_mmu.h b/arch/arm/include/asm/kvm_mmu.h
> index 5c7aa3c..31eaaa6 100644
> --- a/arch/arm/include/asm/kvm_mmu.h
> +++ b/arch/arm/include/asm/kvm_mmu.h
> @@ -37,6 +37,11 @@
>   */
>  #define TRAMPOLINE_VA                UL(CONFIG_VECTORS_BASE)
>  
> +/*
> + * MMU_CACHE_MIN_PAGES is the number of stage2 page table translation levels.
> + */
> +#define MMU_CACHE_MIN_PAGES  2
> +


I would prefer this was KVM_MMU_CACHE_MIN_PAGES

>  #ifndef __ASSEMBLY__
>  
>  #include <asm/cacheflush.h>
> @@ -94,6 +99,11 @@ static inline void kvm_clean_pgd(pgd_t *pgd)
>       clean_dcache_area(pgd, PTRS_PER_S2_PGD * sizeof(pgd_t));
>  }
>  
> +static inline void kvm_clean_pmd(pmd_t *pmd)
> +{
> +     clean_dcache_area(pmd, PTRS_PER_PMD * sizeof(pmd_t));
> +}
> +
>  static inline void kvm_clean_pmd_entry(pmd_t *pmd)
>  {
>       clean_pmd_entry(pmd);
> diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c
> index 80bb1e6..3ffbdfb 100644
> --- a/arch/arm/kvm/mmu.c
> +++ b/arch/arm/kvm/mmu.c
> @@ -388,13 +388,44 @@ static int create_hyp_pmd_mappings(pud_t *pud, unsigned 
> long start,
>       return 0;
>  }
>  
> +static int create_hyp_pud_mappings(pgd_t *pgd, unsigned long start,
> +                                unsigned long end, unsigned long pfn,
> +                                pgprot_t prot)
> +{
> +     pud_t *pud;
> +     pmd_t *pmd;
> +     unsigned long addr, next;
> +
> +     addr = start;
> +     do {
> +             pud = pud_offset(pgd, addr);
> +
> +             if (pud_none_or_clear_bad(pud)) {
> +                     pmd = pmd_alloc_one(NULL, addr);
> +                     if (!pmd) {
> +                             kvm_err("Cannot allocate Hyp pmd\n");
> +                             return -ENOMEM;
> +                     }
> +                     pud_populate(NULL, pud, pmd);
> +                     get_page(virt_to_page(pud));
> +                     kvm_flush_dcache_to_poc(pud, sizeof(*pud));
> +             }
> +
> +             next = pud_addr_end(addr, end);
> +
> +             create_hyp_pmd_mappings(pud, addr, next, pfn, prot);
> +             pfn += (next - addr) >> PAGE_SHIFT;
> +     } while (addr = next, addr != end);
> +
> +     return 0;
> +}
> +
>  static int __create_hyp_mappings(pgd_t *pgdp,
>                                unsigned long start, unsigned long end,
>                                unsigned long pfn, pgprot_t prot)
>  {
>       pgd_t *pgd;
>       pud_t *pud;
> -     pmd_t *pmd;
>       unsigned long addr, next;
>       int err = 0;
>  
> @@ -403,22 +434,23 @@ static int __create_hyp_mappings(pgd_t *pgdp,
>       end = PAGE_ALIGN(end);
>       do {
>               pgd = pgdp + pgd_index(addr);
> -             pud = pud_offset(pgd, addr);
>  
> -             if (pud_none_or_clear_bad(pud)) {
> -                     pmd = pmd_alloc_one(NULL, addr);
> -                     if (!pmd) {
> -                             kvm_err("Cannot allocate Hyp pmd\n");
> +             if (pgd_none(*pgd)) {
> +                     pud = pud_alloc_one(NULL, addr);
> +                     if (!pud) {
> +                             kvm_err("Cannot allocate Hyp pud\n");
>                               err = -ENOMEM;
>                               goto out;
>                       }
> -                     pud_populate(NULL, pud, pmd);
> -                     get_page(virt_to_page(pud));
> -                     kvm_flush_dcache_to_poc(pud, sizeof(*pud));
> +                     pgd_populate(NULL, pgd, pud);
> +                     get_page(virt_to_page(pgd));
> +                     kvm_flush_dcache_to_poc(pgd, sizeof(*pgd));
>               }
>  
>               next = pgd_addr_end(addr, end);
> -             err = create_hyp_pmd_mappings(pud, addr, next, pfn, prot);
> +
> +             err = create_hyp_pud_mappings(pgd, addr, next, pfn, prot);
> +

super nit: the whitespace here (the added blank lines around the call)
looks weird

>               if (err)
>                       goto out;
>               pfn += (next - addr) >> PAGE_SHIFT;
> @@ -563,6 +595,24 @@ void kvm_free_stage2_pgd(struct kvm *kvm)
>       kvm->arch.pgd = NULL;
>  }
>  
> +static pud_t *stage2_get_pud(struct kvm *kvm, struct kvm_mmu_memory_cache 
> *cache,
> +                          phys_addr_t addr)
> +{
> +     pgd_t *pgd;
> +     pud_t *pud;
> +
> +     pgd = kvm->arch.pgd + pgd_index(addr);
> +     if (pgd_none(*pgd)) {
> +             if (!cache)
> +                     return NULL;
> +             pud = mmu_memory_cache_alloc(cache);
> +             pgd_populate(NULL, pgd, pud);
> +             get_page(virt_to_page(pgd));
> +     }
> +
> +     return pud_offset(pgd, addr);
> +}
> +
>  static pmd_t *stage2_get_pmd(struct kvm *kvm, struct kvm_mmu_memory_cache 
> *cache,
>                            phys_addr_t addr)
>  {
> @@ -614,9 +664,24 @@ static int stage2_set_pmd_huge(struct kvm *kvm, struct 
> kvm_mmu_memory_cache
>  static int stage2_set_pte(struct kvm *kvm, struct kvm_mmu_memory_cache 
> *cache,
>                         phys_addr_t addr, const pte_t *new_pte, bool iomap)
>  {
> +     pud_t *pud;
>       pmd_t *pmd;
>       pte_t *pte, old_pte;
>  
> +     /* Create stage-2 page table mapping - Level 0 */
> +     pud = stage2_get_pud(kvm, cache, addr);
> +     if (!pud)
> +             return 0;
> +
> +     if (pud_none(*pud)) {
> +             if (!cache)
> +                     return 0;
> +             pmd = mmu_memory_cache_alloc(cache);
> +             kvm_clean_pmd(pmd);
> +             pud_populate(NULL, pud, pmd);
> +             get_page(virt_to_page(pud));
> +     }
> +

Now we are doing this work twice, here and in stage2_get_pmd().  Can you
not simply call stage2_get_pud() from stage2_get_pmd() and get rid of
this code?

>       /* Create stage-2 page table mapping - Level 1 */
>       pmd = stage2_get_pmd(kvm, cache, addr);
>       if (!pmd) {
> @@ -675,7 +740,8 @@ int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t 
> guest_ipa,
>       for (addr = guest_ipa; addr < end; addr += PAGE_SIZE) {
>               pte_t pte = pfn_pte(pfn, PAGE_S2_DEVICE);
>  
> -             ret = mmu_topup_memory_cache(&cache, 2, 2);
> +             ret = mmu_topup_memory_cache(&cache, MMU_CACHE_MIN_PAGES,
> +                                          MMU_CACHE_MIN_PAGES);
>               if (ret)
>                       goto out;
>               spin_lock(&kvm->mmu_lock);
> diff --git a/arch/arm64/include/asm/kvm_arm.h 
> b/arch/arm64/include/asm/kvm_arm.h
> index 3d69030..29c9c25 100644
> --- a/arch/arm64/include/asm/kvm_arm.h
> +++ b/arch/arm64/include/asm/kvm_arm.h
> @@ -117,9 +117,10 @@
>  #define VTCR_EL2_IRGN0_MASK  (3 << 8)
>  #define VTCR_EL2_IRGN0_WBWA  (1 << 8)
>  #define VTCR_EL2_SL0_MASK    (3 << 6)
> +#define VTCR_EL2_SL0_LVL0    (2 << 6)
>  #define VTCR_EL2_SL0_LVL1    (1 << 6)
>  #define VTCR_EL2_T0SZ_MASK   0x3f
> -#define VTCR_EL2_T0SZ_40B    24
> +#define VTCR_EL2_T0SZ(bits)  (64 - (bits))
>  
>  #ifdef CONFIG_ARM64_64K_PAGES
>  /*
> @@ -129,11 +130,14 @@
>   * 64kB pages (TG0 = 1)
>   * 2 level page tables (SL = 1)
>   */
> +#define VTTBR_OUTPUT_BITS    40

This is confusing, because the PS field is populated at runtime in
arch/arm64/kvm/hyp-init.S and depends on the PARange field in
ID_AA64MMFR0_EL1.

So the existing code actually looks wrong to me in that we will generate
a Stage-2 translation fault for all IPAs > 2^40 on systems with >40 bits
physical address space.  (Which will also be the case after this patch).

>  #define VTCR_EL2_FLAGS               (VTCR_EL2_TG0_64K | VTCR_EL2_SH0_INNER 
> | \
>                                VTCR_EL2_ORGN0_WBWA | VTCR_EL2_IRGN0_WBWA | \
> -                              VTCR_EL2_SL0_LVL1 | VTCR_EL2_T0SZ_40B)
> -#define VTTBR_X              (38 - VTCR_EL2_T0SZ_40B)
> +                              VTCR_EL2_SL0_LVL1 | \
> +                              VTCR_EL2_T0SZ(VTTBR_OUTPUT_BITS))
> +#define VTTBR_X              (38 - VTCR_EL2_T0SZ(VTTBR_OUTPUT_BITS))

I spent hours trying to remember how to make sense of these hard-coded
numbers, and failed.

Now that you're adding yet another one, please add an explanation of how
to decode VTTBR_X and why it all adds up as it should.

>  #else
> +#ifndef CONFIG_ARM64_4_LEVELS
>  /*
>   * Stage2 translation configuration:
>   * 40bits output (PS = 2)
> @@ -141,14 +145,32 @@
>   * 4kB pages (TG0 = 0)
>   * 3 level page tables (SL = 1)
>   */
> +#define VTTBR_OUTPUT_BITS    40
>  #define VTCR_EL2_FLAGS               (VTCR_EL2_TG0_4K | VTCR_EL2_SH0_INNER | 
> \
>                                VTCR_EL2_ORGN0_WBWA | VTCR_EL2_IRGN0_WBWA | \
> -                              VTCR_EL2_SL0_LVL1 | VTCR_EL2_T0SZ_40B)
> -#define VTTBR_X              (37 - VTCR_EL2_T0SZ_40B)
> +                              VTCR_EL2_SL0_LVL1 | \
> +                              VTCR_EL2_T0SZ(VTTBR_OUTPUT_BITS))
> +#define VTTBR_X              (37 - VTCR_EL2_T0SZ(VTTBR_OUTPUT_BITS))
> +#else
> +/*
> + * Stage2 translation configuration:
> + * 40bits output (PS = 2)
> + * 48bits input  (T0SZ = 16)
> + * 4kB pages (TG0 = 0)
> + * 4 level page tables (SL = 2)
> + */
> +#define VTTBR_OUTPUT_BITS    48

You're defining output bits to 48, but your comment says 40 bits output
and 48 bits input here.

I suspect the general problem here is that the T0SZ field regulates the
size of the input to the Stage-2 translation, not the output.  Your
comment is therefore correct (except that PS will not be fixed to 2),
and the input will indeed be 48 bits.

> +#define VTCR_EL2_FLAGS               (VTCR_EL2_TG0_4K | VTCR_EL2_SH0_INNER | 
> \
> +                              VTCR_EL2_ORGN0_WBWA | VTCR_EL2_IRGN0_WBWA | \
> +                              VTCR_EL2_SL0_LVL0 | \
> +                              VTCR_EL2_T0SZ(VTTBR_OUTPUT_BITS))
> +#define VTTBR_X              (29 - VTCR_EL2_T0SZ(VTTBR_OUTPUT_BITS))

Definitely need the key so I can verify that 29 is correct, or a comment
with a direct reference to the architecture document stating that this
is the correct formula.

> +#endif
>  #endif
>  
>  #define VTTBR_BADDR_SHIFT (VTTBR_X - 1)
> -#define VTTBR_BADDR_MASK  (((1LLU << (40 - VTTBR_X)) - 1) << 
> VTTBR_BADDR_SHIFT)
> +#define VTTBR_BADDR_SIZE  (1LLU << (VTTBR_OUTPUT_BITS - VTTBR_X))
> +#define VTTBR_BADDR_MASK  ((VTTBR_BADDR_SIZE - 1) << VTTBR_BADDR_SHIFT)
>  #define VTTBR_VMID_SHIFT  (48LLU)
>  #define VTTBR_VMID_MASK        (0xffLLU << VTTBR_VMID_SHIFT)
>  
> diff --git a/arch/arm64/include/asm/kvm_mmu.h 
> b/arch/arm64/include/asm/kvm_mmu.h
> index 7d29847..72bf9ab 100644
> --- a/arch/arm64/include/asm/kvm_mmu.h
> +++ b/arch/arm64/include/asm/kvm_mmu.h
> @@ -41,6 +41,17 @@
>   */
>  #define TRAMPOLINE_VA                (HYP_PAGE_OFFSET_MASK & PAGE_MASK)
>  
> +/*
> + * MMU_CACHE_MIN_PAGES is the number of stage2 page table translation levels.
> + */
> +#ifdef CONFIG_ARM64_2_LEVELS
> +#define MMU_CACHE_MIN_PAGES  1
> +#elif defined(CONFIG_ARM64_3_LEVELS)
> +#define MMU_CACHE_MIN_PAGES  2
> +#else
> +#define MMU_CACHE_MIN_PAGES  3
> +#endif
> +
>  #ifdef __ASSEMBLY__
>  
>  /*
> @@ -107,6 +118,7 @@ static inline bool kvm_is_write_fault(unsigned long esr)
>  }
>  
>  static inline void kvm_clean_pgd(pgd_t *pgd) {}
> +static inline void kvm_clean_pmd(pmd_t *pmd) {}
>  static inline void kvm_clean_pmd_entry(pmd_t *pmd) {}
>  static inline void kvm_clean_pte(pte_t *pte) {}
>  static inline void kvm_clean_pte_entry(pte_t *pte) {}
> -- 
> 1.7.10.4
> 
> 

Thanks,
-Christoffer
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH v5 6/6] arm64: KVM: Implement 4 levels of translation tables for HYP and stage2

Reply via email to