[PATCH] KVM: selftests: Tweak time measurement flag in kvm_page_table_test

2021-04-19 Thread Yanan Wang
Use the CLOCK_MONOTONIC flag to get time in kvm_page_table_test.c,
since that's what all the other kvm selftests currently do. This also
keeps the measurements consistent with timespec_elapsed() in test_util.c.
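
For reference, the measurement pattern the selftests use boils down to the
standalone sketch below. The timespec_sub() helper is only a stand-in for
timespec_elapsed() in test_util.c; everything else is standard POSIX.

#include <stdio.h>
#include <time.h>

/* Minimal stand-in for timespec_elapsed() in test_util.c. */
static struct timespec timespec_sub(struct timespec end, struct timespec start)
{
	struct timespec diff = {
		.tv_sec = end.tv_sec - start.tv_sec,
		.tv_nsec = end.tv_nsec - start.tv_nsec,
	};

	if (diff.tv_nsec < 0) {
		diff.tv_sec--;
		diff.tv_nsec += 1000000000L;
	}
	return diff;
}

int main(void)
{
	struct timespec start, end, diff;

	clock_gettime(CLOCK_MONOTONIC, &start);
	/* ... workload under test, e.g. a vcpu run loop ... */
	clock_gettime(CLOCK_MONOTONIC, &end);

	diff = timespec_sub(end, start);
	printf("elapsed: %ld.%.9ld s\n", (long)diff.tv_sec, diff.tv_nsec);
	return 0;
}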

Signed-off-by: Yanan Wang 
---
 tools/testing/selftests/kvm/kvm_page_table_test.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/tools/testing/selftests/kvm/kvm_page_table_test.c 
b/tools/testing/selftests/kvm/kvm_page_table_test.c
index 1c4753fff19e..d7847fba47a8 100644
--- a/tools/testing/selftests/kvm/kvm_page_table_test.c
+++ b/tools/testing/selftests/kvm/kvm_page_table_test.c
@@ -212,7 +212,7 @@ static void *vcpu_worker(void *data)
if (READ_ONCE(host_quit))
return NULL;
 
-   clock_gettime(CLOCK_MONOTONIC_RAW, &start);
+   clock_gettime(CLOCK_MONOTONIC, &start);
ret = _vcpu_run(vm, vcpu_id);
ts_diff = timespec_elapsed(start);
 
@@ -390,7 +390,7 @@ static void run_test(enum vm_guest_mode mode, void *arg)
/* Test the stage of KVM creating mappings */
*current_stage = KVM_CREATE_MAPPINGS;
 
-   clock_gettime(CLOCK_MONOTONIC_RAW, &start);
+   clock_gettime(CLOCK_MONOTONIC, &start);
vcpus_complete_new_stage(*current_stage);
ts_diff = timespec_elapsed(start);
 
@@ -403,7 +403,7 @@ static void run_test(enum vm_guest_mode mode, void *arg)
 
*current_stage = KVM_UPDATE_MAPPINGS;
 
-   clock_gettime(CLOCK_MONOTONIC_RAW, &start);
+   clock_gettime(CLOCK_MONOTONIC, &start);
vcpus_complete_new_stage(*current_stage);
ts_diff = timespec_elapsed(start);
 
@@ -415,7 +415,7 @@ static void run_test(enum vm_guest_mode mode, void *arg)
 
*current_stage = KVM_ADJUST_MAPPINGS;
 
-   clock_gettime(CLOCK_MONOTONIC_RAW, &start);
+   clock_gettime(CLOCK_MONOTONIC, &start);
vcpus_complete_new_stage(*current_stage);
ts_diff = timespec_elapsed(start);
 
-- 
2.23.0



[PATCH v5 6/6] KVM: arm64: Distinguish cases of memcache allocations completely

2021-04-15 Thread Yanan Wang
With a guest translation fault, the memcache pages are not needed if KVM
is only about to install a new leaf entry into the existing page table.
And with a guest permission fault, the memcache pages are also not needed
for a write fault during dirty logging, if KVM is only about to update
the existing leaf entry instead of collapsing a block entry into a table.

By comparing fault_granule and vma_pagesize, cases that require allocations
from memcache and cases that don't can be distinguished completely.
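
To illustrate the comparison, here is a toy, standalone model of the
decision. It assumes 4K pages, so the granules of stage-2 lookup levels
1/2/3 are 1G/2M/4K; the names and the granule arithmetic are illustrative,
not the kernel's code.

#include <stdint.h>
#include <stdio.h>

/* Granule covered by a stage-2 lookup level (levels 1..3, 4K pages). */
static uint64_t granule(int level)
{
	return 1ULL << (12 + 9 * (3 - level));
}

int main(void)
{
	uint64_t vma_pagesize = granule(3);	/* mapping to install: 4K */
	uint64_t fault_granule = granule(1);	/* fault taken at a 1G block */

	/*
	 * New intermediate page tables (and thus memcache pages) are
	 * needed only when the faulting level covers a larger range
	 * than the mapping that is about to be installed.
	 */
	if (fault_granule > vma_pagesize)
		printf("topup memcache: new table levels will be created\n");
	else
		printf("no allocation: a leaf entry is installed or updated in place\n");

	return 0;
}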

Signed-off-by: Yanan Wang 
---
 arch/arm64/kvm/mmu.c | 25 ++++++++++++-------------
 1 file changed, 12 insertions(+), 13 deletions(-)

diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
index aa536392b308..9e35aa5d29f2 100644
--- a/arch/arm64/kvm/mmu.c
+++ b/arch/arm64/kvm/mmu.c
@@ -895,19 +895,6 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, 
phys_addr_t fault_ipa,
gfn = fault_ipa >> PAGE_SHIFT;
mmap_read_unlock(current->mm);
 
-   /*
-* Permission faults just need to update the existing leaf entry,
-* and so normally don't require allocations from the memcache. The
-* only exception to this is when dirty logging is enabled at runtime
-* and a write fault needs to collapse a block entry into a table.
-*/
-   if (fault_status != FSC_PERM || (logging_active && write_fault)) {
-   ret = kvm_mmu_topup_memory_cache(memcache,
-kvm_mmu_cache_min_pages(kvm));
-   if (ret)
-   return ret;
-   }
-
mmu_seq = vcpu->kvm->mmu_notifier_seq;
/*
 * Ensure the read of mmu_notifier_seq happens before we call
@@ -970,6 +957,18 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, 
phys_addr_t fault_ipa,
else if (cpus_have_const_cap(ARM64_HAS_CACHE_DIC))
prot |= KVM_PGTABLE_PROT_X;
 
+   /*
+* Allocations from the memcache are required only when the granule of
+* the lookup level at which the guest fault happened exceeds
+* vma_pagesize, which means new page tables will be created in the
+* fault handlers.
+*/
+   if (fault_granule > vma_pagesize) {
+   ret = kvm_mmu_topup_memory_cache(memcache,
+kvm_mmu_cache_min_pages(kvm));
+   if (ret)
+   return ret;
+   }
+
/*
 * Under the premise of getting a FSC_PERM fault, we just need to relax
 * permissions only if vma_pagesize equals fault_granule. Otherwise,
-- 
2.23.0



[PATCH v5 3/6] KVM: arm64: Add mm_ops member for structure stage2_attr_data

2021-04-15 Thread Yanan Wang
Also add an mm_ops member to struct stage2_attr_data, since we will
move I-cache maintenance for guest stage-2 to the permission path and,
as a result, will need mm_ops there for address transformation.

Signed-off-by: Yanan Wang 
---
 arch/arm64/kvm/hyp/pgtable.c | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c
index e3606c9dcec7..b480f6d1171e 100644
--- a/arch/arm64/kvm/hyp/pgtable.c
+++ b/arch/arm64/kvm/hyp/pgtable.c
@@ -869,10 +869,11 @@ int kvm_pgtable_stage2_unmap(struct kvm_pgtable *pgt, u64 
addr, u64 size)
 }
 
 struct stage2_attr_data {
-   kvm_pte_t   attr_set;
-   kvm_pte_t   attr_clr;
-   kvm_pte_t   pte;
-   u32 level;
+   kvm_pte_t   attr_set;
+   kvm_pte_t   attr_clr;
+   kvm_pte_t   pte;
+   u32 level;
+   struct kvm_pgtable_mm_ops   *mm_ops;
 };
 
 static int stage2_attr_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep,
@@ -911,6 +912,7 @@ static int stage2_update_leaf_attrs(struct kvm_pgtable 
*pgt, u64 addr,
struct stage2_attr_data data = {
.attr_set   = attr_set & attr_mask,
.attr_clr   = attr_clr & attr_mask,
+   .mm_ops = pgt->mm_ops,
};
struct kvm_pgtable_walker walker = {
.cb = stage2_attr_walker,
-- 
2.23.0



[PATCH v5 4/6] KVM: arm64: Provide invalidate_icache_range at non-VHE EL2

2021-04-15 Thread Yanan Wang
We want to move I-cache maintenance for the guest to the stage-2
page table code for a performance improvement. Before that can work,
we must first make the function invalidate_icache_range available
to non-VHE EL2 to avoid compile and runtime errors, as pgtable.c is
now linked into the non-VHE EL2 code for pKVM mode.

This patch only introduces the invalidate_icache_range symbol in
nvhe/cache.S, with no real functionality, because no situation has
been found so far where I-cache maintenance is also needed at
non-VHE EL2 in pKVM mode.

Signed-off-by: Yanan Wang 
---
 arch/arm64/kvm/hyp/nvhe/cache.S | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/arch/arm64/kvm/hyp/nvhe/cache.S b/arch/arm64/kvm/hyp/nvhe/cache.S
index 36cef6915428..a125ec9aeed2 100644
--- a/arch/arm64/kvm/hyp/nvhe/cache.S
+++ b/arch/arm64/kvm/hyp/nvhe/cache.S
@@ -11,3 +11,14 @@ SYM_FUNC_START_PI(__flush_dcache_area)
dcache_by_line_op civac, sy, x0, x1, x2, x3
ret
 SYM_FUNC_END_PI(__flush_dcache_area)
+
+/*
+ * invalidate_icache_range(start,end)
+ *
+ * Ensure that the I-cache is invalidated within the specified region.
+ *
+ * - start   - virtual start address of region
+ * - end - virtual end address of region
+ */
+SYM_FUNC_START(invalidate_icache_range)
+SYM_FUNC_END(invalidate_icache_range)
-- 
2.23.0



[PATCH v5 5/6] KVM: arm64: Move I-cache flush to the fault handlers

2021-04-15 Thread Yanan Wang
In this patch, we move the invalidation of the I-cache to the fault
handlers to avoid unnecessary I-cache maintenance. On the map path,
invalidate the I-cache if we are going to create an executable stage-2
mapping for the guest. And on the permission path, invalidate the
I-cache if we are going to add executable permission to an existing
guest stage-2 mapping.

Signed-off-by: Yanan Wang 
---
 arch/arm64/include/asm/kvm_mmu.h | 15 ---------------
 arch/arm64/kvm/hyp/pgtable.c | 35 +++-
 arch/arm64/kvm/mmu.c |  9 +---
 3 files changed, 35 insertions(+), 24 deletions(-)

diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h
index e9b163c5f023..155492fe5b15 100644
--- a/arch/arm64/include/asm/kvm_mmu.h
+++ b/arch/arm64/include/asm/kvm_mmu.h
@@ -187,21 +187,6 @@ static inline bool vcpu_has_cache_enabled(struct kvm_vcpu 
*vcpu)
return (vcpu_read_sys_reg(vcpu, SCTLR_EL1) & 0b101) == 0b101;
 }
 
-static inline void __invalidate_icache_guest_page(kvm_pfn_t pfn,
- unsigned long size)
-{
-   if (icache_is_aliasing()) {
-   /* any kind of VIPT cache */
-   __flush_icache_all();
-   } else if (is_kernel_in_hyp_mode() || !icache_is_vpipt()) {
-   /* PIPT or VPIPT at EL2 (see comment in 
__kvm_tlb_flush_vmid_ipa) */
-   void *va = page_address(pfn_to_page(pfn));
-
-   invalidate_icache_range((unsigned long)va,
-   (unsigned long)va + size);
-   }
-}
-
 void kvm_set_way_flush(struct kvm_vcpu *vcpu);
 void kvm_toggle_cache(struct kvm_vcpu *vcpu, bool was_enabled);
 
diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c
index b480f6d1171e..9f4429d80df0 100644
--- a/arch/arm64/kvm/hyp/pgtable.c
+++ b/arch/arm64/kvm/hyp/pgtable.c
@@ -568,6 +568,26 @@ static bool stage2_pte_cacheable(struct kvm_pgtable *pgt, 
kvm_pte_t pte)
return memattr == KVM_S2_MEMATTR(pgt, NORMAL);
 }
 
+static bool stage2_pte_executable(kvm_pte_t pte)
+{
+   return !(pte & KVM_PTE_LEAF_ATTR_HI_S2_XN);
+}
+
+static void stage2_invalidate_icache(void *addr, u64 size)
+{
+   if (icache_is_aliasing()) {
+   /* Any kind of VIPT cache */
+   __flush_icache_all();
+   } else if (is_kernel_in_hyp_mode() || !icache_is_vpipt()) {
+   /*
+* See comment in __kvm_tlb_flush_vmid_ipa().
+* Invalidate PIPT, or VPIPT at EL2.
+*/
+   invalidate_icache_range((unsigned long)addr,
+   (unsigned long)addr + size);
+   }
+}
+
 static void stage2_put_pte(kvm_pte_t *ptep, struct kvm_s2_mmu *mmu, u64 addr,
   u32 level, struct kvm_pgtable_mm_ops *mm_ops)
 {
@@ -618,6 +638,10 @@ static int stage2_map_walker_try_leaf(u64 addr, u64 end, 
u32 level,
if (stage2_pte_cacheable(pgt, new) && !stage2_has_fwb(pgt))
__flush_dcache_area(mm_ops->phys_to_virt(phys),
granule);
+
+   if (stage2_pte_executable(new))
+   stage2_invalidate_icache(mm_ops->phys_to_virt(phys),
+granule);
}
 
smp_store_release(ptep, new);
@@ -896,8 +920,17 @@ static int stage2_attr_walker(u64 addr, u64 end, u32 
level, kvm_pte_t *ptep,
 * but worst-case the access flag update gets lost and will be
 * set on the next access instead.
 */
-   if (data->pte != pte)
+   if (data->pte != pte) {
+   /*
+* Invalidate the instruction cache before updating
+* if we are going to add the executable permission
+* for the guest stage-2 PTE.
+*/
+   if (!stage2_pte_executable(*ptep) && stage2_pte_executable(pte))
+   stage2_invalidate_icache(kvm_pte_follow(pte, 
data->mm_ops),
+kvm_granule_size(level));
WRITE_ONCE(*ptep, pte);
+   }
 
return 0;
 }
diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
index 86f7dd1c234f..aa536392b308 100644
--- a/arch/arm64/kvm/mmu.c
+++ b/arch/arm64/kvm/mmu.c
@@ -694,11 +694,6 @@ void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm 
*kvm,
kvm_mmu_write_protect_pt_masked(kvm, slot, gfn_offset, mask);
 }
 
-static void invalidate_icache_guest_page(kvm_pfn_t pfn, unsigned long size)
-{
-   __invalidate_icache_guest_page(pfn, size);
-}
-
 static void kvm_send_hwpoison_signal(unsigned long address, short lsb)
 {
send_sig_mceerr(BUS_MCEERR_AR, (void __user *)address, lsb, current);
@@ -967,10 +962,8 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, 
phys_addr_t fault_ipa,
if (writable)
prot |= KVM_P

[PATCH v5 0/6] KVM: arm64: Improve efficiency of stage2 page table

2021-04-15 Thread Yanan Wang
Hi,

This series makes some efficiency improvements to the guest stage-2 page
table code, and there are some test results to quantify the benefit.
The code has been re-arranged based on the latest kvmarm/next tree.

Descriptions:
We currently uniformly perform CMOs of D-cache and I-cache in function
user_mem_abort before calling the fault handlers. If we get concurrent
guest faults (e.g. translation faults, permission faults) or some really
unnecessary guest faults caused by BBM, CMOs for the first vcpu are
necessary while the later ones are not.

By moving CMOs to the fault handlers, we can easily identify conditions
where they are really needed and avoid the unnecessary ones. Performing
CMOs is a time-consuming process, especially when flushing a block
range, so this solution reduces much of KVM's load and improves the
efficiency of the stage-2 page table code.

In this series, patches #1, #3 and #4 prepare for the relocation of the
CMOs (adapting to the latest stage-2 page table framework). Patches #2
and #5 move the CMOs of D-cache and I-cache to the fault handlers, and
patch #6 introduces a new way to distinguish cases of memcache allocations.

The following results, from v3, represent the benefit introduced by the
movement of the CMOs; they were obtained with [1] (kvm/selftest), which
I posted recently.
[1] https://lore.kernel.org/lkml/20210302125751.19080-1-wangyana...@huawei.com/

When there are multiple vcpus concurrently accessing the same memory
region, we can test the execution time of KVM creating new mappings,
updating the permissions of old mappings from RO to RW, and the time
of re-creating the blocks after they have been split.

hardware platform: HiSilicon Kunpeng920 Server
host kernel: Linux mainline v5.12-rc2

cmdline: ./kvm_page_table_test -m 4 -s anonymous -b 1G -v 80
   (80 vcpus, 1G memory, page mappings(normal 4K))
KVM_CREATE_MAPPINGS: before 104.35s -> after  90.42s  +13.35%
KVM_UPDATE_MAPPINGS: before  78.64s -> after  75.45s  + 4.06%

cmdline: ./kvm_page_table_test -m 4 -s anonymous_thp -b 20G -v 40
   (40 vcpus, 20G memory, block mappings(THP 2M))
KVM_CREATE_MAPPINGS: before  15.66s -> after   6.92s  +55.80%
KVM_UPDATE_MAPPINGS: before 178.80s -> after 123.35s  +31.00%
KVM_REBUILD_BLOCKS:  before 187.34s -> after 131.76s  +30.65%

cmdline: ./kvm_page_table_test -m 4 -s anonymous_hugetlb_1gb -b 20G -v 40
   (40 vcpus, 20G memory, block mappings(HUGETLB 1G))
KVM_CREATE_MAPPINGS: before 104.54s -> after   3.70s  +96.46%
KVM_UPDATE_MAPPINGS: before 174.20s -> after 115.94s  +33.44%
KVM_REBUILD_BLOCKS:  before 103.95s -> after   2.96s  +97.15%

---

Changelogs:
v4->v5:
- rebased on the latest kvmarm/next tree to adapt to the new stage-2 page-table code
- v4: 
https://lore.kernel.org/lkml/20210409033652.28316-1-wangyana...@huawei.com/

v3->v4:
- perform D-cache flush if we are not mapping device memory
- rebased on top of mainline v5.12-rc6
- v3: https://lore.kernel.org/lkml/20210326031654.3716-1-wangyana...@huawei.com/

v2->v3:
- drop patch #3 in v2
- retest v3 based on v5.12-rc2
- v2: 
https://lore.kernel.org/lkml/20210310094319.18760-1-wangyana...@huawei.com/

v1->v2:
- rebased on top of mainline v5.12-rc2
- also move CMOs of I-cache to the fault handlers
- retest v2 based on v5.12-rc2
- v1: 
https://lore.kernel.org/lkml/20210208112250.163568-1-wangyana...@huawei.com/

---

Yanan Wang (6):
  KVM: arm64: Introduce KVM_PGTABLE_S2_GUEST stage-2 flag
  KVM: arm64: Move D-cache flush to the fault handlers
  KVM: arm64: Add mm_ops member for structure stage2_attr_data
  KVM: arm64: Provide invalidate_icache_range at non-VHE EL2
  KVM: arm64: Move I-cache flush to the fault handlers
  KVM: arm64: Distinguish cases of memcache allocations completely

 arch/arm64/include/asm/kvm_mmu.h | 31 -------------------------------
 arch/arm64/include/asm/kvm_pgtable.h | 38 ++--
 arch/arm64/kvm/hyp/nvhe/cache.S  | 11 +++++++++++
 arch/arm64/kvm/hyp/pgtable.c | 65 +++-
 arch/arm64/kvm/mmu.c | 51 --
 5 files changed, 107 insertions(+), 89 deletions(-)

-- 
2.23.0



[PATCH v5 2/6] KVM: arm64: Move D-cache flush to the fault handlers

2021-04-15 Thread Yanan Wang
We currently uniformly perform CMOs of D-cache and I-cache in function
user_mem_abort before calling the fault handlers. If we get concurrent
guest faults (e.g. translation faults, permission faults) or some really
unnecessary guest faults caused by BBM, CMOs for the first vcpu are
necessary while the later ones are not.

By moving CMOs to the fault handlers, we can easily identify conditions
where they are really needed and avoid the unnecessary ones. Performing
CMOs is a time-consuming process, especially when flushing a block
range, so this solution reduces much of KVM's load and improves the
efficiency of the page table code.

This patch only moves the clean of D-cache to the map path, and drops
the original APIs for D-cache maintenance in mmu.c/mmu.h, using what we
already have in pgtable.c instead. Changes on the I-side will come in a
later patch.

Signed-off-by: Yanan Wang 
---
 arch/arm64/include/asm/kvm_mmu.h | 16 ----------------
 arch/arm64/kvm/hyp/pgtable.c | 20 ++--
 arch/arm64/kvm/mmu.c | 14 +++---
 3 files changed, 17 insertions(+), 33 deletions(-)

diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h
index 25ed956f9af1..e9b163c5f023 100644
--- a/arch/arm64/include/asm/kvm_mmu.h
+++ b/arch/arm64/include/asm/kvm_mmu.h
@@ -187,22 +187,6 @@ static inline bool vcpu_has_cache_enabled(struct kvm_vcpu 
*vcpu)
return (vcpu_read_sys_reg(vcpu, SCTLR_EL1) & 0b101) == 0b101;
 }
 
-static inline void __clean_dcache_guest_page(kvm_pfn_t pfn, unsigned long size)
-{
-   void *va = page_address(pfn_to_page(pfn));
-
-   /*
-* With FWB, we ensure that the guest always accesses memory using
-* cacheable attributes, and we don't have to clean to PoC when
-* faulting in pages. Furthermore, FWB implies IDC, so cleaning to
-* PoU is not required either in this case.
-*/
-   if (cpus_have_const_cap(ARM64_HAS_STAGE2_FWB))
-   return;
-
-   kvm_flush_dcache_to_poc(va, size);
-}
-
 static inline void __invalidate_icache_guest_page(kvm_pfn_t pfn,
  unsigned long size)
 {
diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c
index c37c1dc4feaf..e3606c9dcec7 100644
--- a/arch/arm64/kvm/hyp/pgtable.c
+++ b/arch/arm64/kvm/hyp/pgtable.c
@@ -562,6 +562,12 @@ static bool stage2_pte_is_counted(kvm_pte_t pte)
return !!pte;
 }
 
+static bool stage2_pte_cacheable(struct kvm_pgtable *pgt, kvm_pte_t pte)
+{
+   u64 memattr = pte & KVM_PTE_LEAF_ATTR_LO_S2_MEMATTR;
+   return memattr == KVM_S2_MEMATTR(pgt, NORMAL);
+}
+
 static void stage2_put_pte(kvm_pte_t *ptep, struct kvm_s2_mmu *mmu, u64 addr,
   u32 level, struct kvm_pgtable_mm_ops *mm_ops)
 {
@@ -583,6 +589,7 @@ static int stage2_map_walker_try_leaf(u64 addr, u64 end, 
u32 level,
 {
kvm_pte_t new, old = *ptep;
u64 granule = kvm_granule_size(level), phys = data->phys;
+   struct kvm_pgtable *pgt = data->mmu->pgt;
struct kvm_pgtable_mm_ops *mm_ops = data->mm_ops;
 
if (!kvm_block_mapping_supported(addr, end, phys, level))
@@ -606,6 +613,13 @@ static int stage2_map_walker_try_leaf(u64 addr, u64 end, 
u32 level,
stage2_put_pte(ptep, data->mmu, addr, level, mm_ops);
}
 
+   /* Perform CMOs before installation of the guest stage-2 PTE */
+   if (pgt->flags & KVM_PGTABLE_S2_GUEST) {
+   if (stage2_pte_cacheable(pgt, new) && !stage2_has_fwb(pgt))
+   __flush_dcache_area(mm_ops->phys_to_virt(phys),
+   granule);
+   }
+
smp_store_release(ptep, new);
if (stage2_pte_is_counted(new))
mm_ops->get_page(ptep);
@@ -798,12 +812,6 @@ int kvm_pgtable_stage2_set_owner(struct kvm_pgtable *pgt, 
u64 addr, u64 size,
return ret;
 }
 
-static bool stage2_pte_cacheable(struct kvm_pgtable *pgt, kvm_pte_t pte)
-{
-   u64 memattr = pte & KVM_PTE_LEAF_ATTR_LO_S2_MEMATTR;
-   return memattr == KVM_S2_MEMATTR(pgt, NORMAL);
-}
-
 static int stage2_unmap_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep,
   enum kvm_pgtable_walk_flags flag,
   void * const arg)
diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
index 2cfcfc5f4e4e..86f7dd1c234f 100644
--- a/arch/arm64/kvm/mmu.c
+++ b/arch/arm64/kvm/mmu.c
@@ -694,11 +694,6 @@ void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm 
*kvm,
kvm_mmu_write_protect_pt_masked(kvm, slot, gfn_offset, mask);
 }
 
-static void clean_dcache_guest_page(kvm_pfn_t pfn, unsigned long size)
-{
-   __clean_dcache_guest_page(pfn, size);
-}
-
 static void invalidate_icache_guest_page(kvm_pfn_t pfn, unsigned long size)
 {
__invalidate_icache_guest_page(pfn, size);
@@ -972,9 +967,6 @@ static int user_mem_abort

[PATCH v5 1/6] KVM: arm64: Introduce KVM_PGTABLE_S2_GUEST stage-2 flag

2021-04-15 Thread Yanan Wang
The stage-2 page table code in pgtable.c is now used for both guest
stage-2 and host stage-2. There may be some issues that differ between
guest S2 page-tables and host S2 page-tables that we should consider,
e.g., whether CMOs are needed when creating a new mapping.

So introduce the KVM_PGTABLE_S2_GUEST flag to determine whether we are
operating on a guest stage-2 table. This flag will be used in a coming
patch, in which we will move the CMOs for the guest to pgtable.c.
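
A minimal model of the intended usage pattern (the flag values mirror the
diff below; the struct and the printfs are just scaffolding so that the
snippet compiles standalone):

#include <stdio.h>

/* Flag values as in the diff below. */
enum kvm_pgtable_stage2_flags {
	KVM_PGTABLE_S2_NOFWB	= 1 << 0,
	KVM_PGTABLE_S2_IDMAP	= 1 << 1,
	KVM_PGTABLE_S2_GUEST	= 1 << 2,
};

struct kvm_pgtable {
	enum kvm_pgtable_stage2_flags flags;
};

/* Walkers can now tell guest stage-2 from host stage-2. */
static void map_walker(struct kvm_pgtable *pgt)
{
	if (pgt->flags & KVM_PGTABLE_S2_GUEST)
		printf("guest stage-2: perform CMOs before installing the PTE\n");
	else
		printf("host stage-2: no guest CMOs needed\n");
}

int main(void)
{
	struct kvm_pgtable guest = { .flags = KVM_PGTABLE_S2_GUEST };
	struct kvm_pgtable host = { .flags = KVM_PGTABLE_S2_IDMAP };

	map_walker(&guest);
	map_walker(&host);
	return 0;
}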

Signed-off-by: Yanan Wang 
---
 arch/arm64/include/asm/kvm_pgtable.h | 38 ++--
 arch/arm64/kvm/mmu.c |  3 ++-
 2 files changed, 26 insertions(+), 15 deletions(-)

diff --git a/arch/arm64/include/asm/kvm_pgtable.h 
b/arch/arm64/include/asm/kvm_pgtable.h
index c3674c47d48c..a43cbe697b37 100644
--- a/arch/arm64/include/asm/kvm_pgtable.h
+++ b/arch/arm64/include/asm/kvm_pgtable.h
@@ -61,10 +61,12 @@ struct kvm_pgtable_mm_ops {
  * @KVM_PGTABLE_S2_NOFWB:  Don't enforce Normal-WB even if the CPUs have
  * ARM64_HAS_STAGE2_FWB.
  * @KVM_PGTABLE_S2_IDMAP:  Only use identity mappings.
+ * @KVM_PGTABLE_S2_GUEST:  Whether the page-tables are guest stage-2.
  */
 enum kvm_pgtable_stage2_flags {
KVM_PGTABLE_S2_NOFWB= BIT(0),
KVM_PGTABLE_S2_IDMAP= BIT(1),
+   KVM_PGTABLE_S2_GUEST= BIT(2),
 };
 
 /**
@@ -221,12 +223,10 @@ int kvm_pgtable_stage2_init_flags(struct kvm_pgtable 
*pgt, struct kvm_arch *arch
  struct kvm_pgtable_mm_ops *mm_ops,
  enum kvm_pgtable_stage2_flags flags);
 
-#define kvm_pgtable_stage2_init(pgt, arch, mm_ops) \
-   kvm_pgtable_stage2_init_flags(pgt, arch, mm_ops, 0)
-
 /**
  * kvm_pgtable_stage2_destroy() - Destroy an unused guest stage-2 page-table.
- * @pgt:   Page-table structure initialised by kvm_pgtable_stage2_init*().
+ * @pgt:   Page-table structure initialised by function
+ * kvm_pgtable_stage2_init_flags().
  *
  * The page-table is assumed to be unreachable by any hardware walkers prior
  * to freeing and therefore no TLB invalidation is performed.
@@ -235,7 +235,8 @@ void kvm_pgtable_stage2_destroy(struct kvm_pgtable *pgt);
 
 /**
  * kvm_pgtable_stage2_map() - Install a mapping in a guest stage-2 page-table.
- * @pgt:   Page-table structure initialised by kvm_pgtable_stage2_init*().
+ * @pgt:   Page-table structure initialised by function
+ * kvm_pgtable_stage2_init_flags().
  * @addr:  Intermediate physical address at which to place the mapping.
  * @size:  Size of the mapping.
  * @phys:  Physical address of the memory to map.
@@ -268,7 +269,8 @@ int kvm_pgtable_stage2_map(struct kvm_pgtable *pgt, u64 
addr, u64 size,
 /**
  * kvm_pgtable_stage2_set_owner() - Unmap and annotate pages in the IPA space 
to
  * track ownership.
- * @pgt:   Page-table structure initialised by kvm_pgtable_stage2_init*().
+ * @pgt:   Page-table structure initialised by function
+ * kvm_pgtable_stage2_init_flags().
  * @addr:  Base intermediate physical address to annotate.
  * @size:  Size of the annotated range.
  * @mc:Cache of pre-allocated and zeroed memory from which to 
allocate
@@ -287,7 +289,8 @@ int kvm_pgtable_stage2_set_owner(struct kvm_pgtable *pgt, 
u64 addr, u64 size,
 
 /**
  * kvm_pgtable_stage2_unmap() - Remove a mapping from a guest stage-2 
page-table.
- * @pgt:   Page-table structure initialised by kvm_pgtable_stage2_init*().
+ * @pgt:   Page-table structure initialised by function
+ * kvm_pgtable_stage2_init_flags().
  * @addr:  Intermediate physical address from which to remove the mapping.
  * @size:  Size of the mapping.
  *
@@ -307,7 +310,8 @@ int kvm_pgtable_stage2_unmap(struct kvm_pgtable *pgt, u64 
addr, u64 size);
 /**
  * kvm_pgtable_stage2_wrprotect() - Write-protect guest stage-2 address range
  *  without TLB invalidation.
- * @pgt:   Page-table structure initialised by kvm_pgtable_stage2_init*().
+ * @pgt:   Page-table structure initialised by function
+ * kvm_pgtable_stage2_init_flags().
  * @addr:  Intermediate physical address from which to write-protect,
  * @size:  Size of the range.
  *
@@ -324,7 +328,8 @@ int kvm_pgtable_stage2_wrprotect(struct kvm_pgtable *pgt, 
u64 addr, u64 size);
 
 /**
  * kvm_pgtable_stage2_mkyoung() - Set the access flag in a page-table entry.
- * @pgt:   Page-table structure initialised by kvm_pgtable_stage2_init*().
+ * @pgt:   Page-table structure initialised by function
+ * kvm_pgtable_stage2_init_flags().
  * @addr:  Intermediate physical address to identify the page-table entry.
  *
  * The offset of @addr within a page is ignored.
@@ -338,7 +343,8 @@ kvm_pte_t kvm_pgtable_stage2_mkyoung(struct kvm_pgtable 
*pgt

[PATCH v4 1/2] KVM: arm64: Move CMOs from user_mem_abort to the fault handlers

2021-04-08 Thread Yanan Wang
We currently uniformly perform CMOs of D-cache and I-cache in function
user_mem_abort before calling the fault handlers. If we get concurrent
guest faults (e.g. translation faults, permission faults) or some really
unnecessary guest faults caused by BBM, CMOs for the first vcpu are
necessary while the later ones are not.

By moving CMOs to the fault handlers, we can easily identify conditions
where they are really needed and avoid the unnecessary ones. Performing
CMOs is a time-consuming process, especially when flushing a block
range, so this solution reduces much of KVM's load and improves the
efficiency of the page table code.

So let's move both the clean of D-cache and the invalidation of I-cache
to the map path, and move only the invalidation of I-cache to the
permission path. Since the original APIs for CMOs in mmu.c are only
called in function user_mem_abort, we now also move them to pgtable.c.

Signed-off-by: Yanan Wang 
---
 arch/arm64/include/asm/kvm_mmu.h | 31 -------------------------------
 arch/arm64/kvm/hyp/pgtable.c | 68 +---
 arch/arm64/kvm/mmu.c | 23 ++-
 3 files changed, 57 insertions(+), 65 deletions(-)

diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h
index 90873851f677..c31f88306d4e 100644
--- a/arch/arm64/include/asm/kvm_mmu.h
+++ b/arch/arm64/include/asm/kvm_mmu.h
@@ -177,37 +177,6 @@ static inline bool vcpu_has_cache_enabled(struct kvm_vcpu 
*vcpu)
return (vcpu_read_sys_reg(vcpu, SCTLR_EL1) & 0b101) == 0b101;
 }
 
-static inline void __clean_dcache_guest_page(kvm_pfn_t pfn, unsigned long size)
-{
-   void *va = page_address(pfn_to_page(pfn));
-
-   /*
-* With FWB, we ensure that the guest always accesses memory using
-* cacheable attributes, and we don't have to clean to PoC when
-* faulting in pages. Furthermore, FWB implies IDC, so cleaning to
-* PoU is not required either in this case.
-*/
-   if (cpus_have_const_cap(ARM64_HAS_STAGE2_FWB))
-   return;
-
-   kvm_flush_dcache_to_poc(va, size);
-}
-
-static inline void __invalidate_icache_guest_page(kvm_pfn_t pfn,
- unsigned long size)
-{
-   if (icache_is_aliasing()) {
-   /* any kind of VIPT cache */
-   __flush_icache_all();
-   } else if (is_kernel_in_hyp_mode() || !icache_is_vpipt()) {
-   /* PIPT or VPIPT at EL2 (see comment in 
__kvm_tlb_flush_vmid_ipa) */
-   void *va = page_address(pfn_to_page(pfn));
-
-   invalidate_icache_range((unsigned long)va,
-   (unsigned long)va + size);
-   }
-}
-
 void kvm_set_way_flush(struct kvm_vcpu *vcpu);
 void kvm_toggle_cache(struct kvm_vcpu *vcpu, bool was_enabled);
 
diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c
index 4d177ce1d536..0e811c86fd06 100644
--- a/arch/arm64/kvm/hyp/pgtable.c
+++ b/arch/arm64/kvm/hyp/pgtable.c
@@ -464,6 +464,43 @@ static int stage2_map_set_prot_attr(enum kvm_pgtable_prot 
prot,
return 0;
 }
 
+static bool stage2_pte_cacheable(kvm_pte_t pte)
+{
+   u64 memattr = pte & KVM_PTE_LEAF_ATTR_LO_S2_MEMATTR;
+   return memattr == PAGE_S2_MEMATTR(NORMAL);
+}
+
+static bool stage2_pte_executable(kvm_pte_t pte)
+{
+   return !(pte & KVM_PTE_LEAF_ATTR_HI_S2_XN);
+}
+
+static void stage2_flush_dcache(void *addr, u64 size)
+{
+   /*
+* With FWB, we ensure that the guest always accesses memory using
+* cacheable attributes, and we don't have to clean to PoC when
+* faulting in pages. Furthermore, FWB implies IDC, so cleaning to
+* PoU is not required either in this case.
+*/
+   if (cpus_have_const_cap(ARM64_HAS_STAGE2_FWB))
+   return;
+
+   __flush_dcache_area(addr, size);
+}
+
+static void stage2_invalidate_icache(void *addr, u64 size)
+{
+   if (icache_is_aliasing()) {
+   /* Flush any kind of VIPT icache */
+   __flush_icache_all();
+   } else if (is_kernel_in_hyp_mode() || !icache_is_vpipt()) {
+   /* PIPT or VPIPT at EL2 */
+   invalidate_icache_range((unsigned long)addr,
+   (unsigned long)addr + size);
+   }
+}
+
 static int stage2_map_walker_try_leaf(u64 addr, u64 end, u32 level,
  kvm_pte_t *ptep,
  struct stage2_map_data *data)
@@ -495,6 +532,13 @@ static int stage2_map_walker_try_leaf(u64 addr, u64 end, 
u32 level,
put_page(page);
}
 
+   /* Perform CMOs before installation of the new PTE */
+   if (stage2_pte_cacheable(new))
+   stage2_flush_dcache(__va(phys), granule);
+
+   if (stage2_pte_executable(new))
+   stage2_invalidate_icache(__va(phys), granule);
+
smp_store_release(ptep, new);
get_page(page

[PATCH v4 0/2] KVM: arm64: Improve efficiency of stage2 page table

2021-04-08 Thread Yanan Wang
Hi,

This series makes some efficiency improvements to the stage2 page table
code, and there are some test results to quantify the benefit of each patch.

Changelogs:
v3->v4:
- perform D-cache flush if we are not mapping device memory
- rebased on top of mainline v5.12-rc6
- v3: https://lore.kernel.org/lkml/20210326031654.3716-1-wangyana...@huawei.com/

v2->v3:
- drop patch #3 in v2
- retest v3 based on v5.12-rc2
- v2: 
https://lore.kernel.org/lkml/20210310094319.18760-1-wangyana...@huawei.com/

v1->v2:
- rebased on top of mainline v5.12-rc2
- also move CMOs of I-cache to the fault handlers
- retest v2 based on v5.12-rc2
- v1: 
https://lore.kernel.org/lkml/20210208112250.163568-1-wangyana...@huawei.com/

About this v4 series:
Patch #1:
We currently uniformly perform CMOs of D-cache and I-cache in function
user_mem_abort before calling the fault handlers. If we get concurrent
guest faults (e.g. translation faults, permission faults) or some really
unnecessary guest faults caused by BBM, CMOs for the first vcpu are
necessary while the later ones are not.

By moving CMOs to the fault handlers, we can easily identify conditions
where they are really needed and avoid the unnecessary ones. Performing
CMOs is a time-consuming process, especially when flushing a block
range, so this solution reduces much of KVM's load and improves the
efficiency of the page table code.

So let's move both the clean of D-cache and the invalidation of I-cache
to the map path, and move only the invalidation of I-cache to the
permission path. Since the original APIs for CMOs in mmu.c are only
called in function user_mem_abort, we now also move them to pgtable.c.

After this patch, in function stage2_map_walker_try_leaf (map path),
we flush the D-cache if we are not mapping device memory and invalidate
the I-cache if we are adding executable permission. And in function
stage2_attr_walker (permission path), we invalidate the I-cache if we
are adding executable permission. The logic is consistent with the
current code in user_mem_abort (without this patch).

The following results represent the benefit of patch #1 alone, and they
were obtained with [1] (kvm/selftest), which I posted recently.
[1] https://lore.kernel.org/lkml/20210302125751.19080-1-wangyana...@huawei.com/

When there are multiple vcpus concurrently accessing the same memory region,
we can test the execution time of KVM creating new mappings, updating the
permissions of old mappings from RO to RW, and rebuilding the blocks after
they have been split.

hardware platform: HiSilicon Kunpeng920 Server
host kernel: Linux mainline v5.12-rc2

cmdline: ./kvm_page_table_test -m 4 -s anonymous -b 1G -v 80
   (80 vcpus, 1G memory, page mappings(normal 4K))
KVM_CREATE_MAPPINGS: before 104.35s -> after  90.42s  +13.35%
KVM_UPDATE_MAPPINGS: before  78.64s -> after  75.45s  + 4.06%

cmdline: ./kvm_page_table_test -m 4 -s anonymous_thp -b 20G -v 40
   (40 vcpus, 20G memory, block mappings(THP 2M))
KVM_CREATE_MAPPINGS: before  15.66s -> after   6.92s  +55.80%
KVM_UPDATE_MAPPINGS: before 178.80s -> after 123.35s  +31.00%
KVM_REBUILD_BLOCKS:  before 187.34s -> after 131.76s  +30.65%

cmdline: ./kvm_page_table_test -m 4 -s anonymous_hugetlb_1gb -b 20G -v 40
   (40 vcpus, 20G memory, block mappings(HUGETLB 1G))
KVM_CREATE_MAPPINGS: before 104.54s -> after   3.70s  +96.46%
KVM_UPDATE_MAPPINGS: before 174.20s -> after 115.94s  +33.44%
KVM_REBUILD_BLOCKS:  before 103.95s -> after   2.96s  +97.15%

Patch #2:
A new method to distinguish cases of memcache allocations is introduced.
By comparing fault_granule and vma_pagesize, cases that require allocations
from memcache and cases that don't can be distinguished completely.

Yanan Wang (2):
  KVM: arm64: Move CMOs from user_mem_abort to the fault handlers
  KVM: arm64: Distinguish cases of memcache allocations completely

 arch/arm64/include/asm/kvm_mmu.h | 31 -------------------------------
 arch/arm64/kvm/hyp/pgtable.c | 68 +---
 arch/arm64/kvm/mmu.c | 48 --
 3 files changed, 69 insertions(+), 78 deletions(-)

-- 
2.19.1



[PATCH v4 2/2] KVM: arm64: Distinguish cases of memcache allocations completely

2021-04-08 Thread Yanan Wang
With a guest translation fault, the memcache pages are not needed if KVM
is only about to install a new leaf entry into the existing page table.
And with a guest permission fault, the memcache pages are also not needed
for a write fault during dirty logging, if KVM is only about to update
the existing leaf entry instead of collapsing a block entry into a table.

By comparing fault_granule and vma_pagesize, cases that require allocations
from memcache and cases that don't can be distinguished completely.

Signed-off-by: Yanan Wang 
---
 arch/arm64/kvm/mmu.c | 25 ++++++++++++-------------
 1 file changed, 12 insertions(+), 13 deletions(-)

diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
index 1eec9f63bc6f..05af40dc60c1 100644
--- a/arch/arm64/kvm/mmu.c
+++ b/arch/arm64/kvm/mmu.c
@@ -810,19 +810,6 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, 
phys_addr_t fault_ipa,
gfn = fault_ipa >> PAGE_SHIFT;
mmap_read_unlock(current->mm);
 
-   /*
-* Permission faults just need to update the existing leaf entry,
-* and so normally don't require allocations from the memcache. The
-* only exception to this is when dirty logging is enabled at runtime
-* and a write fault needs to collapse a block entry into a table.
-*/
-   if (fault_status != FSC_PERM || (logging_active && write_fault)) {
-   ret = kvm_mmu_topup_memory_cache(memcache,
-kvm_mmu_cache_min_pages(kvm));
-   if (ret)
-   return ret;
-   }
-
mmu_seq = vcpu->kvm->mmu_notifier_seq;
/*
 * Ensure the read of mmu_notifier_seq happens before we call
@@ -880,6 +867,18 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, 
phys_addr_t fault_ipa,
else if (cpus_have_const_cap(ARM64_HAS_CACHE_DIC))
prot |= KVM_PGTABLE_PROT_X;
 
+   /*
+* Allocations from the memcache are required only when the granule of
+* the lookup level at which the guest fault happened exceeds
+* vma_pagesize, which means new page tables will be created in the
+* fault handlers.
+*/
+   if (fault_granule > vma_pagesize) {
+   ret = kvm_mmu_topup_memory_cache(memcache,
+kvm_mmu_cache_min_pages(kvm));
+   if (ret)
+   return ret;
+   }
+
/*
 * Under the premise of getting a FSC_PERM fault, we just need to relax
 * permissions only if vma_pagesize equals fault_granule. Otherwise,
-- 
2.19.1



[PATCH v6 08/10] KVM: selftests: List all hugetlb src types specified with page sizes

2021-03-30 Thread Yanan Wang
With VM_MEM_SRC_ANONYMOUS_HUGETLB, we currently can only use system
default hugetlb pages to back the testing guest memory. In order to
add flexibility, now list all the known hugetlb backing src types with
different page sizes, so that we can specify use of hugetlb pages of the
exact granularity that we want. And as all the known hugetlb page sizes
are listed, it's appropriate for all architectures.

Besides, the helper get_backing_src_pagesz() is added to get the
granularity of different backing src types(anonumous, thp, hugetlb).
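
The selection mechanism these src types map onto is mmap() with
MAP_HUGETLB plus a MAP_HUGE_* size flag. A standalone sketch of the
pattern, assuming 2M hugepages have been pre-allocated on the system
(e.g. via /sys/kernel/mm/hugepages/hugepages-2048kB/nr_hugepages):

#define _GNU_SOURCE
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <linux/mman.h>		/* MAP_HUGE_2MB and friends */

int main(void)
{
	size_t len = 2UL << 20;

	/* Explicitly request 2M hugetlb pages, not the system default. */
	void *mem = mmap(NULL, len, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB | MAP_HUGE_2MB,
			 -1, 0);
	if (mem == MAP_FAILED) {
		perror("mmap");
		return 1;
	}

	memset(mem, 0, len);	/* fault the hugepage in */
	munmap(mem, len);
	return 0;
}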

Suggested-by: Ben Gardon 
Signed-off-by: Yanan Wang 
Reviewed-by: Andrew Jones 
---
 .../testing/selftests/kvm/include/test_util.h |  18 ++-
 tools/testing/selftests/kvm/lib/kvm_util.c|   2 +-
 tools/testing/selftests/kvm/lib/test_util.c   | 109 --
 3 files changed, 116 insertions(+), 13 deletions(-)

diff --git a/tools/testing/selftests/kvm/include/test_util.h 
b/tools/testing/selftests/kvm/include/test_util.h
index e087174eefe5..fade3130eb01 100644
--- a/tools/testing/selftests/kvm/include/test_util.h
+++ b/tools/testing/selftests/kvm/include/test_util.h
@@ -71,16 +71,32 @@ enum vm_mem_backing_src_type {
VM_MEM_SRC_ANONYMOUS,
VM_MEM_SRC_ANONYMOUS_THP,
VM_MEM_SRC_ANONYMOUS_HUGETLB,
+   VM_MEM_SRC_ANONYMOUS_HUGETLB_16KB,
+   VM_MEM_SRC_ANONYMOUS_HUGETLB_64KB,
+   VM_MEM_SRC_ANONYMOUS_HUGETLB_512KB,
+   VM_MEM_SRC_ANONYMOUS_HUGETLB_1MB,
+   VM_MEM_SRC_ANONYMOUS_HUGETLB_2MB,
+   VM_MEM_SRC_ANONYMOUS_HUGETLB_8MB,
+   VM_MEM_SRC_ANONYMOUS_HUGETLB_16MB,
+   VM_MEM_SRC_ANONYMOUS_HUGETLB_32MB,
+   VM_MEM_SRC_ANONYMOUS_HUGETLB_256MB,
+   VM_MEM_SRC_ANONYMOUS_HUGETLB_512MB,
+   VM_MEM_SRC_ANONYMOUS_HUGETLB_1GB,
+   VM_MEM_SRC_ANONYMOUS_HUGETLB_2GB,
+   VM_MEM_SRC_ANONYMOUS_HUGETLB_16GB,
+   NUM_SRC_TYPES,
 };
 
 struct vm_mem_backing_src_alias {
const char *name;
-   enum vm_mem_backing_src_type type;
+   uint32_t flag;
 };
 
 bool thp_configured(void);
 size_t get_trans_hugepagesz(void);
 size_t get_def_hugetlb_pagesz(void);
+const struct vm_mem_backing_src_alias *vm_mem_backing_src_alias(uint32_t i);
+size_t get_backing_src_pagesz(uint32_t i);
 void backing_src_help(void);
 enum vm_mem_backing_src_type parse_backing_src_type(const char *type_name);
 
diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c 
b/tools/testing/selftests/kvm/lib/kvm_util.c
index 2ea837fe03af..3506174c2053 100644
--- a/tools/testing/selftests/kvm/lib/kvm_util.c
+++ b/tools/testing/selftests/kvm/lib/kvm_util.c
@@ -759,7 +759,7 @@ void vm_userspace_mem_region_add(struct kvm_vm *vm,
region->mmap_start = mmap(NULL, region->mmap_size,
  PROT_READ | PROT_WRITE,
  MAP_PRIVATE | MAP_ANONYMOUS
- | (src_type == VM_MEM_SRC_ANONYMOUS_HUGETLB ? 
MAP_HUGETLB : 0),
+ | vm_mem_backing_src_alias(src_type)->flag,
  -1, 0);
TEST_ASSERT(region->mmap_start != MAP_FAILED,
"test_malloc failed, mmap_start: %p errno: %i",
diff --git a/tools/testing/selftests/kvm/lib/test_util.c 
b/tools/testing/selftests/kvm/lib/test_util.c
index 665724ccab97..205408bffa38 100644
--- a/tools/testing/selftests/kvm/lib/test_util.c
+++ b/tools/testing/selftests/kvm/lib/test_util.c
@@ -11,6 +11,7 @@
 #include 
 #include 
 #include 
+#include 
 #include "linux/kernel.h"
 
 #include "test_util.h"
@@ -112,12 +113,6 @@ void print_skip(const char *fmt, ...)
puts(", skipping test");
 }
 
-const struct vm_mem_backing_src_alias backing_src_aliases[] = {
-   {"anonymous", VM_MEM_SRC_ANONYMOUS,},
-   {"anonymous_thp", VM_MEM_SRC_ANONYMOUS_THP,},
-   {"anonymous_hugetlb", VM_MEM_SRC_ANONYMOUS_HUGETLB,},
-};
-
 bool thp_configured(void)
 {
int ret;
@@ -171,22 +166,114 @@ size_t get_def_hugetlb_pagesz(void)
return 0;
 }
 
+const struct vm_mem_backing_src_alias *vm_mem_backing_src_alias(uint32_t i)
+{
+   static const struct vm_mem_backing_src_alias aliases[] = {
+   [VM_MEM_SRC_ANONYMOUS] = {
+   .name = "anonymous",
+   .flag = 0,
+   },
+   [VM_MEM_SRC_ANONYMOUS_THP] = {
+   .name = "anonymous_thp",
+   .flag = 0,
+   },
+   [VM_MEM_SRC_ANONYMOUS_HUGETLB] = {
+   .name = "anonymous_hugetlb",
+   .flag = MAP_HUGETLB,
+   },
+   [VM_MEM_SRC_ANONYMOUS_HUGETLB_16KB] = {
+   .name = "anonymous_hugetlb_16kb",
+   .flag = MAP_HUGETLB | MAP_HUGE_16KB,
+   },
+   [VM_MEM_SRC_ANONYMOUS_HUGETLB_

[PATCH v6 10/10] KVM: selftests: Add a test for kvm page table code

2021-03-30 Thread Yanan Wang
This test serves as a performance tester and a bug reproducer for
kvm page table code (GPA->HPA mappings), so it gives guidance for
people trying to make some improvements to kvm.

The function guest_code() can cover the conditions where a single vcpu or
multiple vcpus access guest pages within the same memory region, in three
VM stages (before dirty logging, during dirty logging, after dirty logging).
Besides, the backing src memory type (ANONYMOUS/THP/HUGETLB) of the tested
memory region can be specified by users, which means users can choose to
have normal page mappings or block mappings created in the test.

If ANONYMOUS memory is specified, kvm will create normal page mappings
for the tested memory region before dirty logging, and update attributes
of the page mappings from RO to RW during dirty logging. If THP/HUGETLB
memory is specified, kvm will create block mappings for the tested memory
region before dirty logging, split the block mappings into normal page
mappings during dirty logging, and coalesce the page mappings back into
block mappings after dirty logging is stopped.

So in summary, as a performance tester, this test can present the
performance of kvm creating/updating normal page mappings, or the
performance of kvm creating/splitting/recovering block mappings,
through execution time.
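
Structurally, the test is a set of vcpu worker threads that sweep the same
region once per stage, synchronized so that every stage starts together. A
host-only model of that shape (pthreads instead of vcpus, no KVM involved;
the sizes and worker count are arbitrary):

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

#define NR_WORKERS	4
#define MEM_SIZE	(16UL << 20)
#define PAGE_SIZE	4096UL

static char *mem;
static pthread_barrier_t barrier;

/* Each worker touches every page of the shared region once per stage. */
static void *worker(void *arg)
{
	for (int stage = 0; stage < 3; stage++) {
		pthread_barrier_wait(&barrier);	/* all enter the stage together */
		for (unsigned long off = 0; off < MEM_SIZE; off += PAGE_SIZE)
			mem[off] = stage;	/* write fault on first touch */
	}
	return NULL;
}

int main(void)
{
	pthread_t threads[NR_WORKERS];

	mem = malloc(MEM_SIZE);
	pthread_barrier_init(&barrier, NULL, NR_WORKERS);

	for (int i = 0; i < NR_WORKERS; i++)
		pthread_create(&threads[i], NULL, worker, NULL);
	for (int i = 0; i < NR_WORKERS; i++)
		pthread_join(threads[i], NULL);

	printf("all stages done\n");
	free(mem);
	return 0;
}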

When we need to coalesce the page mappings back into block mappings after
dirty logging is stopped, we must first invalidate *all* the TLB entries
for the page mappings right before installation of the block entry,
because a TLB conflict abort error could occur if the TLB entries are not
fully invalidated. We have hit this TLB conflict twice on an aarch64
software implementation and fixed it. As this test can simulate the
transition of a VM with block mappings from dirty logging enabled to
dirty logging stopped, it can also reproduce this TLB conflict abort due
to inadequate TLB invalidation when coalescing tables.
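
The required ordering is the architecture's break-before-make rule. A toy
model of the sequence (the tlbi_all() stub stands in for a full stage-2
invalidation such as TLBI VMALLS12E1IS; none of this is the kernel code):

#include <stdio.h>
#include <stdint.h>

static uint64_t pte;	/* toy stand-in for a stage-2 table/block entry */

static void tlbi_all(void)
{
	/* Stands in for a full stage-2 TLB invalidation for the VM. */
	printf("TLBI: all stage-2 TLB entries invalidated\n");
}

/* Break-before-make: never let old and new entries coexist in the TLB. */
static void coalesce_to_block(uint64_t block_entry)
{
	pte = 0;		/* 1. break: zap the table entry */
	tlbi_all();		/* 2. invalidate *all* TLB entries for the range */
	pte = block_entry;	/* 3. make: install the block entry */
}

int main(void)
{
	coalesce_to_block(0xdead000ULL);
	printf("installed: %#llx\n", (unsigned long long)pte);
	return 0;
}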

Signed-off-by: Yanan Wang 
Reviewed-by: Ben Gardon 
Reviewed-by: Andrew Jones 
---
 tools/testing/selftests/kvm/.gitignore|   1 +
 tools/testing/selftests/kvm/Makefile  |   3 +
 .../selftests/kvm/kvm_page_table_test.c   | 506 ++
 3 files changed, 510 insertions(+)
 create mode 100644 tools/testing/selftests/kvm/kvm_page_table_test.c

diff --git a/tools/testing/selftests/kvm/.gitignore 
b/tools/testing/selftests/kvm/.gitignore
index 32b87cc77c8e..137ab7273be6 100644
--- a/tools/testing/selftests/kvm/.gitignore
+++ b/tools/testing/selftests/kvm/.gitignore
@@ -35,6 +35,7 @@
 /dirty_log_perf_test
 /hardware_disable_test
 /kvm_create_max_vcpus
+/kvm_page_table_test
 /memslot_modification_stress_test
 /set_memory_region_test
 /steal_time
diff --git a/tools/testing/selftests/kvm/Makefile 
b/tools/testing/selftests/kvm/Makefile
index a6d61f451f88..75dc57db36b4 100644
--- a/tools/testing/selftests/kvm/Makefile
+++ b/tools/testing/selftests/kvm/Makefile
@@ -69,6 +69,7 @@ TEST_GEN_PROGS_x86_64 += dirty_log_test
 TEST_GEN_PROGS_x86_64 += dirty_log_perf_test
 TEST_GEN_PROGS_x86_64 += hardware_disable_test
 TEST_GEN_PROGS_x86_64 += kvm_create_max_vcpus
+TEST_GEN_PROGS_x86_64 += kvm_page_table_test
 TEST_GEN_PROGS_x86_64 += memslot_modification_stress_test
 TEST_GEN_PROGS_x86_64 += set_memory_region_test
 TEST_GEN_PROGS_x86_64 += steal_time
@@ -79,6 +80,7 @@ TEST_GEN_PROGS_aarch64 += demand_paging_test
 TEST_GEN_PROGS_aarch64 += dirty_log_test
 TEST_GEN_PROGS_aarch64 += dirty_log_perf_test
 TEST_GEN_PROGS_aarch64 += kvm_create_max_vcpus
+TEST_GEN_PROGS_aarch64 += kvm_page_table_test
 TEST_GEN_PROGS_aarch64 += set_memory_region_test
 TEST_GEN_PROGS_aarch64 += steal_time
 
@@ -88,6 +90,7 @@ TEST_GEN_PROGS_s390x += s390x/sync_regs_test
 TEST_GEN_PROGS_s390x += demand_paging_test
 TEST_GEN_PROGS_s390x += dirty_log_test
 TEST_GEN_PROGS_s390x += kvm_create_max_vcpus
+TEST_GEN_PROGS_s390x += kvm_page_table_test
 TEST_GEN_PROGS_s390x += set_memory_region_test
 
 TEST_GEN_PROGS += $(TEST_GEN_PROGS_$(UNAME_M))
diff --git a/tools/testing/selftests/kvm/kvm_page_table_test.c 
b/tools/testing/selftests/kvm/kvm_page_table_test.c
new file mode 100644
index ..1c4753fff19e
--- /dev/null
+++ b/tools/testing/selftests/kvm/kvm_page_table_test.c
@@ -0,0 +1,506 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * KVM page table test
+ *
+ * Copyright (C) 2021, Huawei, Inc.
+ *
+ * Make sure that THP has been enabled or enough HUGETLB pages with specific
+ * page size have been pre-allocated on your system, if you are planning to
+ * use hugepages to back the guest memory for testing.
+ */
+
+#define _GNU_SOURCE /* for program_invocation_name */
+
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include "test_util.h"
+#include "kvm_util.h"
+#include "processor.h"
+#include "guest_modes.h"
+
+#define TEST_MEM_SLOT_INDEX 1
+
+/* Default size(1GB) of the memory for testing */
+#define DEFAULT_TEST_MEM_SIZE  (1 << 30)

[PATCH v6 05/10] KVM: selftests: Make a generic helper to get vm guest mode strings

2021-03-30 Thread Yanan Wang
For generality and conciseness, make an API which can be used in all
kvm libs and selftests to get vm guest mode strings. And the index i
is checked in the API in case of possible faults.

Suggested-by: Sean Christopherson 
Signed-off-by: Yanan Wang 
Reviewed-by: Ben Gardon 
Reviewed-by: Andrew Jones 
---
 .../testing/selftests/kvm/include/kvm_util.h  |  4 +--
 tools/testing/selftests/kvm/lib/kvm_util.c| 29 ---
 2 files changed, 19 insertions(+), 14 deletions(-)

diff --git a/tools/testing/selftests/kvm/include/kvm_util.h 
b/tools/testing/selftests/kvm/include/kvm_util.h
index 2d7eb6989e83..f52a7492f47f 100644
--- a/tools/testing/selftests/kvm/include/kvm_util.h
+++ b/tools/testing/selftests/kvm/include/kvm_util.h
@@ -68,9 +68,6 @@ enum vm_guest_mode {
 #define MIN_PAGE_SIZE  (1U << MIN_PAGE_SHIFT)
 #define PTES_PER_MIN_PAGE  ptes_per_page(MIN_PAGE_SIZE)
 
-#define vm_guest_mode_string(m) vm_guest_mode_string[m]
-extern const char * const vm_guest_mode_string[];
-
 struct vm_guest_mode_params {
unsigned int pa_bits;
unsigned int va_bits;
@@ -84,6 +81,7 @@ int vm_enable_cap(struct kvm_vm *vm, struct kvm_enable_cap 
*cap);
 int vcpu_enable_cap(struct kvm_vm *vm, uint32_t vcpu_id,
struct kvm_enable_cap *cap);
 void vm_enable_dirty_ring(struct kvm_vm *vm, uint32_t ring_size);
+const char *vm_guest_mode_string(uint32_t i);
 
 struct kvm_vm *vm_create(enum vm_guest_mode mode, uint64_t phy_pages, int 
perm);
 void kvm_vm_free(struct kvm_vm *vmp);
diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c 
b/tools/testing/selftests/kvm/lib/kvm_util.c
index e5fbf16f725b..2ea837fe03af 100644
--- a/tools/testing/selftests/kvm/lib/kvm_util.c
+++ b/tools/testing/selftests/kvm/lib/kvm_util.c
@@ -143,17 +143,24 @@ static void vm_open(struct kvm_vm *vm, int perm)
"rc: %i errno: %i", vm->fd, errno);
 }
 
-const char * const vm_guest_mode_string[] = {
-   "PA-bits:52,  VA-bits:48,  4K pages",
-   "PA-bits:52,  VA-bits:48, 64K pages",
-   "PA-bits:48,  VA-bits:48,  4K pages",
-   "PA-bits:48,  VA-bits:48, 64K pages",
-   "PA-bits:40,  VA-bits:48,  4K pages",
-   "PA-bits:40,  VA-bits:48, 64K pages",
-   "PA-bits:ANY, VA-bits:48,  4K pages",
-};
-_Static_assert(sizeof(vm_guest_mode_string)/sizeof(char *) == NUM_VM_MODES,
-  "Missing new mode strings?");
+const char *vm_guest_mode_string(uint32_t i)
+{
+   static const char * const strings[] = {
+   [VM_MODE_P52V48_4K] = "PA-bits:52,  VA-bits:48,  4K pages",
+   [VM_MODE_P52V48_64K]= "PA-bits:52,  VA-bits:48, 64K pages",
+   [VM_MODE_P48V48_4K] = "PA-bits:48,  VA-bits:48,  4K pages",
+   [VM_MODE_P48V48_64K]= "PA-bits:48,  VA-bits:48, 64K pages",
+   [VM_MODE_P40V48_4K] = "PA-bits:40,  VA-bits:48,  4K pages",
+   [VM_MODE_P40V48_64K]= "PA-bits:40,  VA-bits:48, 64K pages",
+   [VM_MODE_PXXV48_4K] = "PA-bits:ANY, VA-bits:48,  4K pages",
+   };
+   _Static_assert(sizeof(strings)/sizeof(char *) == NUM_VM_MODES,
+  "Missing new mode strings?");
+
+   TEST_ASSERT(i < NUM_VM_MODES, "Guest mode ID %d too big", i);
+
+   return strings[i];
+}
 
 const struct vm_guest_mode_params vm_guest_mode_params[] = {
{ 52, 48,  0x1000, 12 },
-- 
2.19.1



[PATCH v6 07/10] KVM: selftests: Add a helper to get system default hugetlb page size

2021-03-30 Thread Yanan Wang
If HUGETLB is configured in the host kernel, then we can learn the system
default hugetlb page size through *cat /proc/meminfo*. Otherwise, if it's
not configured, we will not see any hugetlb page information in
/proc/meminfo. So add a helper to determine whether HUGETLB is configured
and, if so, get the default page size by reading /proc/meminfo.

This helper can be useful when a program wants to use the default hugetlb
pages of the system and doesn't know the default page size.

Signed-off-by: Yanan Wang 
Reviewed-by: Andrew Jones 
---
 .../testing/selftests/kvm/include/test_util.h |  1 +
 tools/testing/selftests/kvm/lib/test_util.c   | 25 +++++++++++++++++++++++++
 2 files changed, 26 insertions(+)

diff --git a/tools/testing/selftests/kvm/include/test_util.h 
b/tools/testing/selftests/kvm/include/test_util.h
index ef24c76ba89a..e087174eefe5 100644
--- a/tools/testing/selftests/kvm/include/test_util.h
+++ b/tools/testing/selftests/kvm/include/test_util.h
@@ -80,6 +80,7 @@ struct vm_mem_backing_src_alias {
 
 bool thp_configured(void);
 size_t get_trans_hugepagesz(void);
+size_t get_def_hugetlb_pagesz(void);
 void backing_src_help(void);
 enum vm_mem_backing_src_type parse_backing_src_type(const char *type_name);
 
diff --git a/tools/testing/selftests/kvm/lib/test_util.c 
b/tools/testing/selftests/kvm/lib/test_util.c
index efc1a7782de0..665724ccab97 100644
--- a/tools/testing/selftests/kvm/lib/test_util.c
+++ b/tools/testing/selftests/kvm/lib/test_util.c
@@ -146,6 +146,31 @@ size_t get_trans_hugepagesz(void)
return size;
 }
 
+size_t get_def_hugetlb_pagesz(void)
+{
+   char buf[64];
+   const char *tag = "Hugepagesize:";
+   FILE *f;
+
+   f = fopen("/proc/meminfo", "r");
+   TEST_ASSERT(f != NULL, "Error in opening /proc/meminfo");
+
+   while (fgets(buf, sizeof(buf), f) != NULL) {
+   if (strstr(buf, tag) == buf) {
+   fclose(f);
+   return strtoull(buf + strlen(tag), NULL, 10) << 10;
+   }
+   }
+
+   if (feof(f))
+   TEST_FAIL("HUGETLB is not configured in host kernel");
+   else
+   TEST_FAIL("Error in reading /proc/meminfo");
+
+   fclose(f);
+   return 0;
+}
+
 void backing_src_help(void)
 {
int i;
-- 
2.19.1



[PATCH v6 09/10] KVM: selftests: Adapt vm_userspace_mem_region_add to new helpers

2021-03-30 Thread Yanan Wang
With VM_MEM_SRC_ANONYMOUS_THP specified in vm_userspace_mem_region_add(),
we have to get the transparent hugepage size for HVA alignment. With the
new helpers, we can use get_backing_src_pagesz() to check whether THP is
configured and then get the exact configured hugepage size.

As different architectures may have different THP page sizes configured,
this gets the accurate THP page size on any platform.
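
The reason THP needs this alignment is that the kernel can only back a
region with transparent hugepages if the VA range is hugepage-aligned. A
standalone sketch of the align-then-madvise pattern (the 2M THP size here
is an assumption for a 4K-page host; the real code obtains it via
get_backing_src_pagesz()):

#define _GNU_SOURCE
#include <stdio.h>
#include <stdint.h>
#include <sys/mman.h>

int main(void)
{
	size_t thp_size = 2UL << 20;	/* assumed hpage_pmd_size */
	size_t len = 64UL << 20;

	/* Over-allocate so the start can be aligned to the THP size. */
	char *raw = mmap(NULL, len + thp_size, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (raw == MAP_FAILED) {
		perror("mmap");
		return 1;
	}

	char *aligned = (char *)(((uintptr_t)raw + thp_size - 1) &
				 ~(uintptr_t)(thp_size - 1));

	/* Hint the kernel to back the region with transparent hugepages. */
	if (madvise(aligned, len, MADV_HUGEPAGE))
		perror("madvise");

	printf("raw %p, aligned %p\n", (void *)raw, (void *)aligned);
	munmap(raw, len + thp_size);
	return 0;
}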

Signed-off-by: Yanan Wang 
Reviewed-by: Ben Gardon 
Reviewed-by: Andrew Jones 
---
 tools/testing/selftests/kvm/lib/kvm_util.c | 28 +++++++++-------------------
 1 file changed, 9 insertions(+), 19 deletions(-)

diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c 
b/tools/testing/selftests/kvm/lib/kvm_util.c
index 3506174c2053..c7a2228deaf3 100644
--- a/tools/testing/selftests/kvm/lib/kvm_util.c
+++ b/tools/testing/selftests/kvm/lib/kvm_util.c
@@ -18,7 +18,6 @@
 #include 
 #include 
 
-#define KVM_UTIL_PGS_PER_HUGEPG 512
 #define KVM_UTIL_MIN_PFN   2
 
 static int vcpu_mmap_sz(void);
@@ -688,7 +687,7 @@ void vm_userspace_mem_region_add(struct kvm_vm *vm,
 {
int ret;
struct userspace_mem_region *region;
-   size_t huge_page_size = KVM_UTIL_PGS_PER_HUGEPG * vm->page_size;
+   size_t backing_src_pagesz = get_backing_src_pagesz(src_type);
size_t alignment;
 
TEST_ASSERT(vm_adjust_num_guest_pages(vm->mode, npages) == npages,
@@ -750,7 +749,7 @@ void vm_userspace_mem_region_add(struct kvm_vm *vm,
 #endif
 
if (src_type == VM_MEM_SRC_ANONYMOUS_THP)
-   alignment = max(huge_page_size, alignment);
+   alignment = max(backing_src_pagesz, alignment);
 
/* Add enough memory to align up if necessary */
if (alignment > 1)
@@ -769,22 +768,13 @@ void vm_userspace_mem_region_add(struct kvm_vm *vm,
region->host_mem = align(region->mmap_start, alignment);
 
/* As needed perform madvise */
-   if (src_type == VM_MEM_SRC_ANONYMOUS || src_type == 
VM_MEM_SRC_ANONYMOUS_THP) {
-   struct stat statbuf;
-
-   ret = stat("/sys/kernel/mm/transparent_hugepage", &statbuf);
-   TEST_ASSERT(ret == 0 || (ret == -1 && errno == ENOENT),
-   "stat /sys/kernel/mm/transparent_hugepage");
-
-   TEST_ASSERT(ret == 0 || src_type != VM_MEM_SRC_ANONYMOUS_THP,
-   "VM_MEM_SRC_ANONYMOUS_THP requires THP to be 
configured in the host kernel");
-
-   if (ret == 0) {
-   ret = madvise(region->host_mem, npages * vm->page_size,
- src_type == VM_MEM_SRC_ANONYMOUS ? 
MADV_NOHUGEPAGE : MADV_HUGEPAGE);
-   TEST_ASSERT(ret == 0, "madvise failed, addr: %p length: 
0x%lx src_type: %x",
-   region->host_mem, npages * vm->page_size, 
src_type);
-   }
+   if ((src_type == VM_MEM_SRC_ANONYMOUS ||
+src_type == VM_MEM_SRC_ANONYMOUS_THP) && thp_configured()) {
+   ret = madvise(region->host_mem, npages * vm->page_size,
+ src_type == VM_MEM_SRC_ANONYMOUS ? 
MADV_NOHUGEPAGE : MADV_HUGEPAGE);
+   TEST_ASSERT(ret == 0, "madvise failed, addr: %p length: 0x%lx 
src_type: %s",
+   region->host_mem, npages * vm->page_size,
+   vm_mem_backing_src_alias(src_type)->name);
}
 
region->unused_phy_pages = sparsebit_alloc();
-- 
2.19.1



[PATCH v6 06/10] KVM: selftests: Add a helper to get system configured THP page size

2021-03-30 Thread Yanan Wang
If we want to have some tests for transparent hugepages, the tests had
better know the system-configured THP hugepage size, which can be used
for all kinds of alignment and for guest memory accesses by the vcpus.
So it makes sense to add a helper to get the transparent hugepage size.

With VM_MEM_SRC_ANONYMOUS_THP specified in vm_userspace_mem_region_add(),
we now stat /sys/kernel/mm/transparent_hugepage to check whether THP is
configured in the host kernel before madvise(). Based on this, we can also
read the file /sys/kernel/mm/transparent_hugepage/hpage_pmd_size to get
the THP hugepage size.

Signed-off-by: Yanan Wang 
Reviewed-by: Ben Gardon 
Reviewed-by: Andrew Jones 
---
 .../testing/selftests/kvm/include/test_util.h |  2 ++
 tools/testing/selftests/kvm/lib/test_util.c   | 29 +++++++++++++++++++++++++++++
 2 files changed, 31 insertions(+)

diff --git a/tools/testing/selftests/kvm/include/test_util.h 
b/tools/testing/selftests/kvm/include/test_util.h
index b7f41399f22c..ef24c76ba89a 100644
--- a/tools/testing/selftests/kvm/include/test_util.h
+++ b/tools/testing/selftests/kvm/include/test_util.h
@@ -78,6 +78,8 @@ struct vm_mem_backing_src_alias {
enum vm_mem_backing_src_type type;
 };
 
+bool thp_configured(void);
+size_t get_trans_hugepagesz(void);
 void backing_src_help(void);
 enum vm_mem_backing_src_type parse_backing_src_type(const char *type_name);
 
diff --git a/tools/testing/selftests/kvm/lib/test_util.c 
b/tools/testing/selftests/kvm/lib/test_util.c
index c7c0627c6842..efc1a7782de0 100644
--- a/tools/testing/selftests/kvm/lib/test_util.c
+++ b/tools/testing/selftests/kvm/lib/test_util.c
@@ -10,6 +10,7 @@
 #include 
 #include 
 #include 
+#include 
 #include "linux/kernel.h"
 
 #include "test_util.h"
@@ -117,6 +118,34 @@ const struct vm_mem_backing_src_alias 
backing_src_aliases[] = {
{"anonymous_hugetlb", VM_MEM_SRC_ANONYMOUS_HUGETLB,},
 };
 
+bool thp_configured(void)
+{
+   int ret;
+   struct stat statbuf;
+
+   ret = stat("/sys/kernel/mm/transparent_hugepage", &statbuf);
+   TEST_ASSERT(ret == 0 || (ret == -1 && errno == ENOENT),
+   "Error in stating /sys/kernel/mm/transparent_hugepage");
+
+   return ret == 0;
+}
+
+size_t get_trans_hugepagesz(void)
+{
+   size_t size;
+   FILE *f;
+
+   TEST_ASSERT(thp_configured(), "THP is not configured in host kernel");
+
+   f = fopen("/sys/kernel/mm/transparent_hugepage/hpage_pmd_size", "r");
+   TEST_ASSERT(f != NULL, "Error in opening 
transparent_hugepage/hpage_pmd_size");
+
+   fscanf(f, "%ld", &size);
+   fclose(f);
+
+   return size;
+}
+
 void backing_src_help(void)
 {
int i;
-- 
2.19.1



[PATCH v6 04/10] KVM: selftests: Print the errno besides error-string in TEST_ASSERT

2021-03-30 Thread Yanan Wang
Printing the errno besides the error string in TEST_ASSERT, in the
format "errno=%d - %s", explicitly indicates that the string is error
information. Besides, the errno is easier to use for debugging than
the error string.

Suggested-by: Andrew Jones 
Signed-off-by: Yanan Wang 
Reviewed-by: Andrew Jones 
---
 tools/testing/selftests/kvm/lib/assert.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tools/testing/selftests/kvm/lib/assert.c 
b/tools/testing/selftests/kvm/lib/assert.c
index 5ebbd0d6b472..71ade6100fd3 100644
--- a/tools/testing/selftests/kvm/lib/assert.c
+++ b/tools/testing/selftests/kvm/lib/assert.c
@@ -71,9 +71,9 @@ test_assert(bool exp, const char *exp_str,
 
fprintf(stderr, " Test Assertion Failure \n"
"  %s:%u: %s\n"
-   "  pid=%d tid=%d - %s\n",
+   "  pid=%d tid=%d errno=%d - %s\n",
file, line, exp_str, getpid(), _gettid(),
-   strerror(errno));
+   errno, strerror(errno));
test_dump_stack();
if (fmt) {
fputs("  ", stderr);
-- 
2.19.1



[PATCH v6 02/10] mm/hugetlb: Add a macro to get HUGETLB page sizes for mmap

2021-03-30 Thread Yanan Wang
We know that if a system supports multiple hugetlb page sizes,
the desired hugetlb page size can be specified in bits [26:31]
of the mmap() flags argument. The value in these 6 bits is the
shift of the requested hugetlb page size.

So add a macro that extracts the page size shift and then calculates
the corresponding hugetlb page size from flags x.

Cc: Ben Gardon 
Cc: Ingo Molnar 
Cc: Adrian Hunter 
Cc: Jiri Olsa 
Cc: Arnaldo Carvalho de Melo 
Cc: Arnd Bergmann 
Cc: Michael Kerrisk 
Cc: Thomas Gleixner 
Suggested-by: Ben Gardon 
Signed-off-by: Yanan Wang 
Reviewed-by: Ben Gardon 
---
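As a worked example (a sketch assuming the MAP_HUGE_* encodings from
<linux/mman.h>): MAP_HUGE_2MB places the shift value 21 in bits [26:31],
so the macro recovers the page size directly from the flags:

   unsigned long flags = MAP_HUGETLB | MAP_HUGE_2MB;
   /* (flags >> MAP_HUGE_SHIFT) & MAP_HUGE_MASK == 21 */
   unsigned long long size = MAP_HUGE_PAGE_SIZE(flags); /* 1ULL << 21, i.e. 2MB */
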
 include/uapi/linux/mman.h   | 2 ++
 tools/include/uapi/linux/mman.h | 2 ++
 2 files changed, 4 insertions(+)

diff --git a/include/uapi/linux/mman.h b/include/uapi/linux/mman.h
index f55bc680b5b0..d72df73b182d 100644
--- a/include/uapi/linux/mman.h
+++ b/include/uapi/linux/mman.h
@@ -41,4 +41,6 @@
 #define MAP_HUGE_2GB   HUGETLB_FLAG_ENCODE_2GB
 #define MAP_HUGE_16GB  HUGETLB_FLAG_ENCODE_16GB
 
+#define MAP_HUGE_PAGE_SIZE(x) (1ULL << ((x >> MAP_HUGE_SHIFT) & MAP_HUGE_MASK))
+
 #endif /* _UAPI_LINUX_MMAN_H */
diff --git a/tools/include/uapi/linux/mman.h b/tools/include/uapi/linux/mman.h
index f55bc680b5b0..d72df73b182d 100644
--- a/tools/include/uapi/linux/mman.h
+++ b/tools/include/uapi/linux/mman.h
@@ -41,4 +41,6 @@
 #define MAP_HUGE_2GB   HUGETLB_FLAG_ENCODE_2GB
 #define MAP_HUGE_16GB  HUGETLB_FLAG_ENCODE_16GB
 
+#define MAP_HUGE_PAGE_SIZE(x) (1ULL << ((x >> MAP_HUGE_SHIFT) & MAP_HUGE_MASK))
+
 #endif /* _UAPI_LINUX_MMAN_H */
-- 
2.19.1



[PATCH v6 01/10] tools/headers: sync headers of asm-generic/hugetlb_encode.h

2021-03-30 Thread Yanan Wang
This patch syncs contents of tools/include/asm-generic/hugetlb_encode.h
and include/uapi/asm-generic/hugetlb_encode.h. Arch powerpc supports 16KB
hugepages and ARM64 supports 32MB/512MB hugepages. The corresponding mmap
flags have already been added in include/uapi/asm-generic/hugetlb_encode.h,
but not tools/include/asm-generic/hugetlb_encode.h.

Cc: Ingo Molnar 
Cc: Adrian Hunter 
Cc: Jiri Olsa 
Cc: Arnaldo Carvalho de Melo 
Signed-off-by: Yanan Wang 
Reviewed-by: Ben Gardon 
---
 tools/include/asm-generic/hugetlb_encode.h | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/tools/include/asm-generic/hugetlb_encode.h 
b/tools/include/asm-generic/hugetlb_encode.h
index e4732d3c2998..4f3d5aaa11f5 100644
--- a/tools/include/asm-generic/hugetlb_encode.h
+++ b/tools/include/asm-generic/hugetlb_encode.h
@@ -20,13 +20,16 @@
 #define HUGETLB_FLAG_ENCODE_SHIFT  26
 #define HUGETLB_FLAG_ENCODE_MASK   0x3f
 
+#define HUGETLB_FLAG_ENCODE_16KB   (14 << HUGETLB_FLAG_ENCODE_SHIFT)
 #define HUGETLB_FLAG_ENCODE_64KB   (16 << HUGETLB_FLAG_ENCODE_SHIFT)
 #define HUGETLB_FLAG_ENCODE_512KB  (19 << HUGETLB_FLAG_ENCODE_SHIFT)
 #define HUGETLB_FLAG_ENCODE_1MB(20 << 
HUGETLB_FLAG_ENCODE_SHIFT)
 #define HUGETLB_FLAG_ENCODE_2MB(21 << 
HUGETLB_FLAG_ENCODE_SHIFT)
 #define HUGETLB_FLAG_ENCODE_8MB(23 << 
HUGETLB_FLAG_ENCODE_SHIFT)
 #define HUGETLB_FLAG_ENCODE_16MB   (24 << HUGETLB_FLAG_ENCODE_SHIFT)
+#define HUGETLB_FLAG_ENCODE_32MB   (25 << HUGETLB_FLAG_ENCODE_SHIFT)
 #define HUGETLB_FLAG_ENCODE_256MB  (28 << HUGETLB_FLAG_ENCODE_SHIFT)
+#define HUGETLB_FLAG_ENCODE_512MB  (29 << HUGETLB_FLAG_ENCODE_SHIFT)
 #define HUGETLB_FLAG_ENCODE_1GB(30 << 
HUGETLB_FLAG_ENCODE_SHIFT)
 #define HUGETLB_FLAG_ENCODE_2GB(31 << 
HUGETLB_FLAG_ENCODE_SHIFT)
 #define HUGETLB_FLAG_ENCODE_16GB   (34 << HUGETLB_FLAG_ENCODE_SHIFT)
-- 
2.19.1



[PATCH v6 03/10] KVM: selftests: Use flag CLOCK_MONOTONIC_RAW for timing

2021-03-30 Thread Yanan Wang
In addition to the functionality of CLOCK_MONOTONIC, the flag
CLOCK_MONOTONIC_RAW can also shield the possible impact of NTP adjustments,
which provides more robustness.

Suggested-by: Vitaly Kuznetsov 
Signed-off-by: Yanan Wang 
Reviewed-by: Ben Gardon 
Reviewed-by: Andrew Jones 
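---

A minimal sketch of the measurement pattern the selftests use after this
change (timespec_elapsed() is the existing helper in test_util.c):

   struct timespec start, ts_diff;

   clock_gettime(CLOCK_MONOTONIC_RAW, &start);
   /* ... the code being timed ... */
   ts_diff = timespec_elapsed(start);
   pr_info("elapsed: %ld.%.9lds\n", ts_diff.tv_sec, ts_diff.tv_nsec);
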
---
 tools/testing/selftests/kvm/demand_paging_test.c  |  8 
 tools/testing/selftests/kvm/dirty_log_perf_test.c | 14 +++---
 tools/testing/selftests/kvm/lib/test_util.c   |  2 +-
 tools/testing/selftests/kvm/steal_time.c  |  4 ++--
 4 files changed, 14 insertions(+), 14 deletions(-)

diff --git a/tools/testing/selftests/kvm/demand_paging_test.c 
b/tools/testing/selftests/kvm/demand_paging_test.c
index 5f7a229c3af1..efbf0c1e9130 100644
--- a/tools/testing/selftests/kvm/demand_paging_test.c
+++ b/tools/testing/selftests/kvm/demand_paging_test.c
@@ -53,7 +53,7 @@ static void *vcpu_worker(void *data)
vcpu_args_set(vm, vcpu_id, 1, vcpu_id);
run = vcpu_state(vm, vcpu_id);
 
-   clock_gettime(CLOCK_MONOTONIC, &start);
+   clock_gettime(CLOCK_MONOTONIC_RAW, &start);
 
/* Let the guest access its memory */
ret = _vcpu_run(vm, vcpu_id);
@@ -86,7 +86,7 @@ static int handle_uffd_page_request(int uffd, uint64_t addr)
copy.len = perf_test_args.host_page_size;
copy.mode = 0;
 
-   clock_gettime(CLOCK_MONOTONIC, &start);
+   clock_gettime(CLOCK_MONOTONIC_RAW, &start);
 
r = ioctl(uffd, UFFDIO_COPY, &copy);
if (r == -1) {
@@ -123,7 +123,7 @@ static void *uffd_handler_thread_fn(void *arg)
struct timespec start;
struct timespec ts_diff;
 
-   clock_gettime(CLOCK_MONOTONIC, &start);
+   clock_gettime(CLOCK_MONOTONIC_RAW, &start);
while (!quit_uffd_thread) {
struct uffd_msg msg;
struct pollfd pollfd[2];
@@ -336,7 +336,7 @@ static void run_test(enum vm_guest_mode mode, void *arg)
 
pr_info("Finished creating vCPUs and starting uffd threads\n");
 
-   clock_gettime(CLOCK_MONOTONIC, &start);
+   clock_gettime(CLOCK_MONOTONIC_RAW, &start);
 
for (vcpu_id = 0; vcpu_id < nr_vcpus; vcpu_id++) {
pthread_create(&vcpu_threads[vcpu_id], NULL, vcpu_worker,
diff --git a/tools/testing/selftests/kvm/dirty_log_perf_test.c 
b/tools/testing/selftests/kvm/dirty_log_perf_test.c
index 04a2641261be..6cff4ccf9525 100644
--- a/tools/testing/selftests/kvm/dirty_log_perf_test.c
+++ b/tools/testing/selftests/kvm/dirty_log_perf_test.c
@@ -50,7 +50,7 @@ static void *vcpu_worker(void *data)
while (!READ_ONCE(host_quit)) {
int current_iteration = READ_ONCE(iteration);
 
-   clock_gettime(CLOCK_MONOTONIC, &start);
+   clock_gettime(CLOCK_MONOTONIC_RAW, &start);
ret = _vcpu_run(vm, vcpu_id);
ts_diff = timespec_elapsed(start);
 
@@ -141,7 +141,7 @@ static void run_test(enum vm_guest_mode mode, void *arg)
iteration = 0;
host_quit = false;
 
-   clock_gettime(CLOCK_MONOTONIC, &start);
+   clock_gettime(CLOCK_MONOTONIC_RAW, &start);
for (vcpu_id = 0; vcpu_id < nr_vcpus; vcpu_id++) {
vcpu_last_completed_iteration[vcpu_id] = -1;
 
@@ -162,7 +162,7 @@ static void run_test(enum vm_guest_mode mode, void *arg)
ts_diff.tv_sec, ts_diff.tv_nsec);
 
/* Enable dirty logging */
-   clock_gettime(CLOCK_MONOTONIC, &start);
+   clock_gettime(CLOCK_MONOTONIC_RAW, &start);
vm_mem_region_set_flags(vm, PERF_TEST_MEM_SLOT_INDEX,
KVM_MEM_LOG_DIRTY_PAGES);
ts_diff = timespec_elapsed(start);
@@ -174,7 +174,7 @@ static void run_test(enum vm_guest_mode mode, void *arg)
 * Incrementing the iteration number will start the vCPUs
 * dirtying memory again.
 */
-   clock_gettime(CLOCK_MONOTONIC, &start);
+   clock_gettime(CLOCK_MONOTONIC_RAW, &start);
iteration++;
 
pr_debug("Starting iteration %d\n", iteration);
@@ -189,7 +189,7 @@ static void run_test(enum vm_guest_mode mode, void *arg)
pr_info("Iteration %d dirty memory time: %ld.%.9lds\n",
iteration, ts_diff.tv_sec, ts_diff.tv_nsec);
 
-   clock_gettime(CLOCK_MONOTONIC, &start);
+   clock_gettime(CLOCK_MONOTONIC_RAW, &start);
kvm_vm_get_dirty_log(vm, PERF_TEST_MEM_SLOT_INDEX, bmap);
 
ts_diff = timespec_elapsed(start);
@@ -199,7 +199,7 @@ static void run_test(enum vm_guest_mode mode, void *arg)
iteration, ts_diff.tv_sec, ts_diff.tv_nsec);
 
if (dirty_log_manual_caps) {
-   clock_gettime(CLOCK_MONOTONIC, &start);
+   clock_gettime

[PATCH v6 00/10] KVM: selftests: some improvement and a new test for kvm page table

2021-03-30 Thread Yanan Wang
Hi,
This v6 series mainly includes two parts.
Rebased on kvm queue branch: 
https://git.kernel.org/pub/scm/virt/kvm/kvm.git/log/?h=queue

In the first part, all the known hugetlb backing src types specified
with different hugepage sizes are listed, so that we can specify use
of a hugetlb source of the exact granularity that we want, instead of
the system default one. And as all the known hugetlb page sizes are
listed, it's appropriate for all architectures. Besides, a helper that
can get the granularity of different backing src types (anonymous/thp/
hugetlb) is added, so that we can use the accurate backing src granularity
for alignment and for guest memory accesses by the vcpus.

In the second part, a new test is added:
This test is added to serve as a performance tester and a bug reproducer
for kvm page table code (GPA->HPA mappings), and it gives guidance to
people trying to make some improvement to kvm. The following explains
what exactly we can do through this test.

The function guest_code() can cover the conditions where a single vcpu or
multiple vcpus access guest pages within the same memory region, in three
VM stages (before dirty logging, during dirty logging, after dirty logging).
Besides, the backing src memory type (ANONYMOUS/THP/HUGETLB) of the tested
memory region can be specified by users, which means normal page mappings
or block mappings can be chosen by users to be created in the test.

If ANONYMOUS memory is specified, kvm will create normal page mappings
for the tested memory region before dirty logging, and update attributes
of the page mappings from RO to RW during dirty logging. If THP/HUGETLB
memory is specified, kvm will create block mappings for the tested memory
region before dirty logging, and split the block mappings into normal page
mappings during dirty logging, and coalesce the page mappings back into
block mappings after dirty logging is stopped.

So in summary, as a performance tester, this test can present the
performance of kvm creating/updating normal page mappings, or the
performance of kvm creating/splitting/recovering block mappings,
through execution time.
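For example, invocations like the following (illustrative; see the test's
option parsing for all supported flags) exercise normal page mappings and
block mappings respectively:

   ./kvm_page_table_test -m 4 -s anonymous -b 1G -v 80
   ./kvm_page_table_test -m 4 -s anonymous_thp -b 20G -v 40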

When we need to coalesce the page mappings back to block mappings after
dirty logging is stopped, we have to first invalidate *all* the TLB
entries for the page mappings right before installation of the block entry,
because a TLB conflict abort error could occur if we can't invalidate the
TLB entries fully. We have hit this TLB conflict twice in the aarch64
software implementation and fixed it. As this test can simulate the process
of a VM with block mappings going from dirty logging enabled to dirty
logging stopped, it can also reproduce this TLB conflict abort due to
inadequate TLB invalidation when coalescing tables.

Links about the TLB conflict abort:
https://lore.kernel.org/lkml/20201201201034.116760-3-wangyana...@huawei.com/

---

Change logs:

v5->v6:
- Address Andrew Jones's comments for v5 series
- Add Andrew Jones's R-b tags in some patches
- Rebased on newest kvm/queue tree
- v5: 
https://lore.kernel.org/lkml/20210323135231.24948-1-wangyana...@huawei.com/

v4->v5:
- Use synchronization(sem_wait) for time measurement
- Add a new patch about TEST_ASSERT(patch 4)
- Address Andrew Jones's comments for v4 series
- Add Andrew Jones's R-b tags in some patches
- v4: 
https://lore.kernel.org/lkml/20210302125751.19080-1-wangyana...@huawei.com/

v3->v4:
- Add a helper to get system default hugetlb page size
- Add tags of Reviewed-by of Ben in the patches
- v3: 
https://lore.kernel.org/lkml/20210301065916.11484-1-wangyana...@huawei.com/

v2->v3:
- Add tags of Suggested-by, Reviewed-by in the patches
- Add a generic macro to get hugetlb page sizes
- Some changes for suggestions about v2 series
- v2: 
https://lore.kernel.org/lkml/20210225055940.18748-1-wangyana...@huawei.com/

v1->v2:
- Add a patch to sync header files
- Add helpers to get granularity of different backing src types
- Some changes for suggestions about v1 series
- v1: 
https://lore.kernel.org/lkml/20210208090841.333724-1-wangyana...@huawei.com/

---

Yanan Wang (10):
  tools headers: sync headers of asm-generic/hugetlb_encode.h
  mm/hugetlb: Add a macro to get HUGETLB page sizes for mmap
  KVM: selftests: Use flag CLOCK_MONOTONIC_RAW for timing
  KVM: selftests: Print the errno besides error-string in TEST_ASSERT
  KVM: selftests: Make a generic helper to get vm guest mode strings
  KVM: selftests: Add a helper to get system configured THP page size
  KVM: selftests: Add a helper to get system default hugetlb page size
  KVM: selftests: List all hugetlb src types specified with page sizes
  KVM: selftests: Adapt vm_userspace_mem_region_add to new helpers
  KVM: selftests: Add a test for kvm page table code

 include/uapi/linux/mman.h |   2 +
 tools/include/asm-generic/hugetlb_encode.h|   3 +
 tools/include/uapi/linux/mman.h   |   2 +
 tools/testing/selftests/kvm/.gitig

[RFC PATCH v3 0/2] KVM: arm64: Improve efficiency of stage2 page table

2021-03-25 Thread Yanan Wang
Hi,

This is a new version of the series [1] that I have posted before. It makes some
efficiency improvement of stage2 page table code and there are some test results
to quantify the benefit of each patch.
[1] v2: 
https://lore.kernel.org/lkml/20210310094319.18760-1-wangyana...@huawei.com/

Although there hasn't been any feedback about v2, the series has changed a
lot after plenty of discussion with Alexandru Elisei. The conclusion drawn
was that CMOs are still needed for the scenario of coalescing tables, and
as a result the benefit of patch #3 in v2 becomes rather small judging from
the test results. So that patch is dropped, and the others, which still
remain meaningful, are kept.

Changelogs:
v2->v3:
- drop patch #3 in v2
- retest v3 based on v5.12-rc2

v1->v2:
- rebased on top of mainline v5.12-rc2
- also move CMOs of I-cache to the fault handlers
- retest v2 based on v5.12-rc2
- v1: 
https://lore.kernel.org/lkml/20210208112250.163568-1-wangyana...@huawei.com/

About this v3 series:
Patch #1:
We currently uniformly perform CMOs of D-cache and I-cache in function
user_mem_abort before calling the fault handlers. If we get concurrent
guest faults (e.g. translation faults, permission faults) or some really
unnecessary guest faults caused by BBM, the CMOs for the first vcpu are
necessary while those for the later ones are not.

By moving CMOs to the fault handlers, we can easily identify conditions
where they are really needed and avoid the unnecessary ones. As performing
CMOs is a time-consuming process, especially when flushing a block range,
this solution reduces much of kvm's load and improves the efficiency of
the page table code.

So let's move both clean of D-cache and invalidation of I-cache to the
map path and move only invalidation of I-cache to the permission path.
Since the original APIs for CMOs in mmu.c are only called in function
user_mem_abort, we now also move them to pgtable.c.

The following results represent the benefit of patch #1 alone, and they
were tested by [2] (kvm/selftest) that I have posted recently.
[2] https://lore.kernel.org/lkml/20210302125751.19080-1-wangyana...@huawei.com/

When there are multiple vcpus concurrently accessing the same memory region,
we can test the execution time of KVM creating new mappings, updating the
permissions of old mappings from RO to RW, and rebuilding the blocks after
they have been split.

hardware platform: HiSilicon Kunpeng920 Server
host kernel: Linux mainline v5.12-rc2

cmdline: ./kvm_page_table_test -m 4 -s anonymous -b 1G -v 80
   (80 vcpus, 1G memory, page mappings(normal 4K))
KVM_CREATE_MAPPINGS: before 104.35s -> after  90.42s  +13.35%
KVM_UPDATE_MAPPINGS: before  78.64s -> after  75.45s  + 4.06%

cmdline: ./kvm_page_table_test -m 4 -s anonymous_thp -b 20G -v 40
   (40 vcpus, 20G memory, block mappings(THP 2M))
KVM_CREATE_MAPPINGS: before  15.66s -> after   6.92s  +55.80%
KVM_UPDATE_MAPPINGS: before 178.80s -> after 123.35s  +31.00%
KVM_REBUILD_BLOCKS:  before 187.34s -> after 131.76s  +30.65%

cmdline: ./kvm_page_table_test -m 4 -s anonymous_hugetlb_1gb -b 20G -v 40
   (40 vcpus, 20G memory, block mappings(HUGETLB 1G))
KVM_CREATE_MAPPINGS: before 104.54s -> after   3.70s  +96.46%
KVM_UPDATE_MAPPINGS: before 174.20s -> after 115.94s  +33.44%
KVM_REBUILD_BLOCKS:  before 103.95s -> after   2.96s  +97.15%

Patch #2:
A new method to distinguish cases of memcache allocations is introduced.
By comparing fault_granule and vma_pagesize, cases that require allocations
from memcache and cases that don't can be distinguished completely.

Yanan Wang (2):
  KVM: arm64: Move CMOs from user_mem_abort to the fault handlers
  KVM: arm64: Distinguish cases of memcache allocations completely

 arch/arm64/include/asm/kvm_mmu.h | 31 ---
 arch/arm64/kvm/hyp/pgtable.c | 68 +---
 arch/arm64/kvm/mmu.c | 48 --
 3 files changed, 69 insertions(+), 78 deletions(-)

-- 
2.19.1



[RFC PATCH v3 2/2] KVM: arm64: Distinguish cases of memcache allocations completely

2021-03-25 Thread Yanan Wang
With a guest translation fault, the memcache pages are not needed if KVM
is only about to install a new leaf entry into the existing page table.
And with a guest permission fault, the memcache pages are also not needed
for a write_fault in dirty-logging time if KVM is only about to update
the existing leaf entry instead of collapsing a block entry into a table.

By comparing fault_granule and vma_pagesize, cases that require allocations
from memcache and cases that don't can be distinguished completely.

Signed-off-by: Yanan Wang 
---
 arch/arm64/kvm/mmu.c | 25 -
 1 file changed, 12 insertions(+), 13 deletions(-)

diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
index 1eec9f63bc6f..05af40dc60c1 100644
--- a/arch/arm64/kvm/mmu.c
+++ b/arch/arm64/kvm/mmu.c
@@ -810,19 +810,6 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, 
phys_addr_t fault_ipa,
gfn = fault_ipa >> PAGE_SHIFT;
mmap_read_unlock(current->mm);
 
-   /*
-* Permission faults just need to update the existing leaf entry,
-* and so normally don't require allocations from the memcache. The
-* only exception to this is when dirty logging is enabled at runtime
-* and a write fault needs to collapse a block entry into a table.
-*/
-   if (fault_status != FSC_PERM || (logging_active && write_fault)) {
-   ret = kvm_mmu_topup_memory_cache(memcache,
-kvm_mmu_cache_min_pages(kvm));
-   if (ret)
-   return ret;
-   }
-
mmu_seq = vcpu->kvm->mmu_notifier_seq;
/*
 * Ensure the read of mmu_notifier_seq happens before we call
@@ -880,6 +867,18 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, 
phys_addr_t fault_ipa,
else if (cpus_have_const_cap(ARM64_HAS_CACHE_DIC))
prot |= KVM_PGTABLE_PROT_X;
 
+   /*
+* Allocations from the memcache are required only when granule of the
+* lookup level where the guest fault happened exceeds vma_pagesize,
+* which means new page tables will be created in the fault handlers.
+*/
+   if (fault_granule > vma_pagesize) {
+   ret = kvm_mmu_topup_memory_cache(memcache,
+kvm_mmu_cache_min_pages(kvm));
+   if (ret)
+   return ret;
+   }
+
/*
 * Under the premise of getting a FSC_PERM fault, we just need to relax
 * permissions only if vma_pagesize equals fault_granule. Otherwise,
-- 
2.19.1



[RFC PATCH v3 1/2] KVM: arm64: Move CMOs from user_mem_abort to the fault handlers

2021-03-25 Thread Yanan Wang
We currently uniformly perform CMOs of D-cache and I-cache in function
user_mem_abort before calling the fault handlers. If we get concurrent
guest faults (e.g. translation faults, permission faults) or some really
unnecessary guest faults caused by BBM, the CMOs for the first vcpu are
necessary while those for the later ones are not.

By moving CMOs to the fault handlers, we can easily identify conditions
where they are really needed and avoid the unnecessary ones. As performing
CMOs is a time-consuming process, especially when flushing a block range,
this solution reduces much of kvm's load and improves the efficiency of
the page table code.

So let's move both clean of D-cache and invalidation of I-cache to the
map path and move only invalidation of I-cache to the permission path.
Since the original APIs for CMOs in mmu.c are only called in function
user_mem_abort, we now also move them to pgtable.c.

Signed-off-by: Yanan Wang 
---
 arch/arm64/include/asm/kvm_mmu.h | 31 ---
 arch/arm64/kvm/hyp/pgtable.c | 68 +---
 arch/arm64/kvm/mmu.c | 23 ++-
 3 files changed, 57 insertions(+), 65 deletions(-)

diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h
index 90873851f677..c31f88306d4e 100644
--- a/arch/arm64/include/asm/kvm_mmu.h
+++ b/arch/arm64/include/asm/kvm_mmu.h
@@ -177,37 +177,6 @@ static inline bool vcpu_has_cache_enabled(struct kvm_vcpu 
*vcpu)
return (vcpu_read_sys_reg(vcpu, SCTLR_EL1) & 0b101) == 0b101;
 }
 
-static inline void __clean_dcache_guest_page(kvm_pfn_t pfn, unsigned long size)
-{
-   void *va = page_address(pfn_to_page(pfn));
-
-   /*
-* With FWB, we ensure that the guest always accesses memory using
-* cacheable attributes, and we don't have to clean to PoC when
-* faulting in pages. Furthermore, FWB implies IDC, so cleaning to
-* PoU is not required either in this case.
-*/
-   if (cpus_have_const_cap(ARM64_HAS_STAGE2_FWB))
-   return;
-
-   kvm_flush_dcache_to_poc(va, size);
-}
-
-static inline void __invalidate_icache_guest_page(kvm_pfn_t pfn,
- unsigned long size)
-{
-   if (icache_is_aliasing()) {
-   /* any kind of VIPT cache */
-   __flush_icache_all();
-   } else if (is_kernel_in_hyp_mode() || !icache_is_vpipt()) {
-   /* PIPT or VPIPT at EL2 (see comment in 
__kvm_tlb_flush_vmid_ipa) */
-   void *va = page_address(pfn_to_page(pfn));
-
-   invalidate_icache_range((unsigned long)va,
-   (unsigned long)va + size);
-   }
-}
-
 void kvm_set_way_flush(struct kvm_vcpu *vcpu);
 void kvm_toggle_cache(struct kvm_vcpu *vcpu, bool was_enabled);
 
diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c
index 4d177ce1d536..829a34eea526 100644
--- a/arch/arm64/kvm/hyp/pgtable.c
+++ b/arch/arm64/kvm/hyp/pgtable.c
@@ -464,6 +464,43 @@ static int stage2_map_set_prot_attr(enum kvm_pgtable_prot 
prot,
return 0;
 }
 
+static bool stage2_pte_cacheable(kvm_pte_t pte)
+{
+   u64 memattr = pte & KVM_PTE_LEAF_ATTR_LO_S2_MEMATTR;
+   return memattr == PAGE_S2_MEMATTR(NORMAL);
+}
+
+static bool stage2_pte_executable(kvm_pte_t pte)
+{
+   return !(pte & KVM_PTE_LEAF_ATTR_HI_S2_XN);
+}
+
+static void stage2_flush_dcache(void *addr, u64 size)
+{
+   /*
+* With FWB, we ensure that the guest always accesses memory using
+* cacheable attributes, and we don't have to clean to PoC when
+* faulting in pages. Furthermore, FWB implies IDC, so cleaning to
+* PoU is not required either in this case.
+*/
+   if (cpus_have_const_cap(ARM64_HAS_STAGE2_FWB))
+   return;
+
+   __flush_dcache_area(addr, size);
+}
+
+static void stage2_invalidate_icache(void *addr, u64 size)
+{
+   if (icache_is_aliasing()) {
+   /* Flush any kind of VIPT icache */
+   __flush_icache_all();
+   } else if (is_kernel_in_hyp_mode() || !icache_is_vpipt()) {
+   /* PIPT or VPIPT at EL2 */
+   invalidate_icache_range((unsigned long)addr,
+   (unsigned long)addr + size);
+   }
+}
+
 static int stage2_map_walker_try_leaf(u64 addr, u64 end, u32 level,
  kvm_pte_t *ptep,
  struct stage2_map_data *data)
@@ -495,6 +532,13 @@ static int stage2_map_walker_try_leaf(u64 addr, u64 end, 
u32 level,
put_page(page);
}
 
+   /* Perform CMOs before installation of the new PTE */
+   if (!kvm_pte_valid(old) || stage2_pte_cacheable(old))
+   stage2_flush_dcache(__va(phys), granule);
+
+   if (stage2_pte_executable(new))
+   stage2_invalidate_icache(__va(phys), granule);
+
smp_store_release(p

[RFC PATCH v5 07/10] KVM: selftests: Add a helper to get system default hugetlb page size

2021-03-23 Thread Yanan Wang
If HUGETLB is configured in the host kernel, then we can learn the system
default hugetlb page size by reading /proc/meminfo. Otherwise, if it's not
configured, no hugetlb page information will show up in /proc/meminfo.
So add a helper to determine whether HUGETLB is configured and then get
the default page size by reading /proc/meminfo.

This helper can be useful when a program wants to use the default hugetlb
pages of the system and doesn't know the default page size.

Signed-off-by: Yanan Wang 
Reviewed-by: Andrew Jones 
---
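As a usage sketch (not part of the patch; mem_size is a hypothetical input),
a test backing guest memory with the default hugetlb pages could round its
mapping length up to this page size:

   size_t hpage_size = get_def_hugetlb_pagesz();
   size_t len = (mem_size + hpage_size - 1) & ~(hpage_size - 1);
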
 .../testing/selftests/kvm/include/test_util.h |  1 +
 tools/testing/selftests/kvm/lib/test_util.c   | 25 +++
 2 files changed, 26 insertions(+)

diff --git a/tools/testing/selftests/kvm/include/test_util.h 
b/tools/testing/selftests/kvm/include/test_util.h
index ef24c76ba89a..e087174eefe5 100644
--- a/tools/testing/selftests/kvm/include/test_util.h
+++ b/tools/testing/selftests/kvm/include/test_util.h
@@ -80,6 +80,7 @@ struct vm_mem_backing_src_alias {
 
 bool thp_configured(void);
 size_t get_trans_hugepagesz(void);
+size_t get_def_hugetlb_pagesz(void);
 void backing_src_help(void);
 enum vm_mem_backing_src_type parse_backing_src_type(const char *type_name);
 
diff --git a/tools/testing/selftests/kvm/lib/test_util.c 
b/tools/testing/selftests/kvm/lib/test_util.c
index efc1a7782de0..665724ccab97 100644
--- a/tools/testing/selftests/kvm/lib/test_util.c
+++ b/tools/testing/selftests/kvm/lib/test_util.c
@@ -146,6 +146,31 @@ size_t get_trans_hugepagesz(void)
return size;
 }
 
+size_t get_def_hugetlb_pagesz(void)
+{
+   char buf[64];
+   const char *tag = "Hugepagesize:";
+   FILE *f;
+
+   f = fopen("/proc/meminfo", "r");
+   TEST_ASSERT(f != NULL, "Error in opening /proc/meminfo");
+
+   while (fgets(buf, sizeof(buf), f) != NULL) {
+   if (strstr(buf, tag) == buf) {
+   fclose(f);
+   return strtoull(buf + strlen(tag), NULL, 10) << 10;
+   }
+   }
+
+   if (feof(f))
+   TEST_FAIL("HUGETLB is not configured in host kernel");
+   else
+   TEST_FAIL("Error in reading /proc/meminfo");
+
+   fclose(f);
+   return 0;
+}
+
 void backing_src_help(void)
 {
int i;
-- 
2.19.1



[RFC PATCH v5 09/10] KVM: selftests: Adapt vm_userspace_mem_region_add to new helpers

2021-03-23 Thread Yanan Wang
With VM_MEM_SRC_ANONYMOUS_THP specified in vm_userspace_mem_region_add(),
we have to get the transparent hugepage size for HVA alignment. With the
new helpers, we can use get_backing_src_pagesz() to check whether THP is
configured and then get the exact configured hugepage size.

As different architectures may have different THP page sizes configured,
this can get the accurate THP page sizes on any platform.

Signed-off-by: Yanan Wang 
Reviewed-by: Ben Gardon 
Reviewed-by: Andrew Jones 
---
 tools/testing/selftests/kvm/lib/kvm_util.c | 28 +++---
 1 file changed, 9 insertions(+), 19 deletions(-)

diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c 
b/tools/testing/selftests/kvm/lib/kvm_util.c
index 3506174c2053..c7a2228deaf3 100644
--- a/tools/testing/selftests/kvm/lib/kvm_util.c
+++ b/tools/testing/selftests/kvm/lib/kvm_util.c
@@ -18,7 +18,6 @@
 #include 
 #include 
 
-#define KVM_UTIL_PGS_PER_HUGEPG 512
 #define KVM_UTIL_MIN_PFN   2
 
 static int vcpu_mmap_sz(void);
@@ -688,7 +687,7 @@ void vm_userspace_mem_region_add(struct kvm_vm *vm,
 {
int ret;
struct userspace_mem_region *region;
-   size_t huge_page_size = KVM_UTIL_PGS_PER_HUGEPG * vm->page_size;
+   size_t backing_src_pagesz = get_backing_src_pagesz(src_type);
size_t alignment;
 
TEST_ASSERT(vm_adjust_num_guest_pages(vm->mode, npages) == npages,
@@ -750,7 +749,7 @@ void vm_userspace_mem_region_add(struct kvm_vm *vm,
 #endif
 
if (src_type == VM_MEM_SRC_ANONYMOUS_THP)
-   alignment = max(huge_page_size, alignment);
+   alignment = max(backing_src_pagesz, alignment);
 
/* Add enough memory to align up if necessary */
if (alignment > 1)
@@ -769,22 +768,13 @@ void vm_userspace_mem_region_add(struct kvm_vm *vm,
region->host_mem = align(region->mmap_start, alignment);
 
/* As needed perform madvise */
-   if (src_type == VM_MEM_SRC_ANONYMOUS || src_type == 
VM_MEM_SRC_ANONYMOUS_THP) {
-   struct stat statbuf;
-
-   ret = stat("/sys/kernel/mm/transparent_hugepage", &statbuf);
-   TEST_ASSERT(ret == 0 || (ret == -1 && errno == ENOENT),
-   "stat /sys/kernel/mm/transparent_hugepage");
-
-   TEST_ASSERT(ret == 0 || src_type != VM_MEM_SRC_ANONYMOUS_THP,
-   "VM_MEM_SRC_ANONYMOUS_THP requires THP to be 
configured in the host kernel");
-
-   if (ret == 0) {
-   ret = madvise(region->host_mem, npages * vm->page_size,
- src_type == VM_MEM_SRC_ANONYMOUS ? 
MADV_NOHUGEPAGE : MADV_HUGEPAGE);
-   TEST_ASSERT(ret == 0, "madvise failed, addr: %p length: 
0x%lx src_type: %x",
-   region->host_mem, npages * vm->page_size, 
src_type);
-   }
+   if ((src_type == VM_MEM_SRC_ANONYMOUS ||
+src_type == VM_MEM_SRC_ANONYMOUS_THP) && thp_configured()) {
+   ret = madvise(region->host_mem, npages * vm->page_size,
+ src_type == VM_MEM_SRC_ANONYMOUS ? 
MADV_NOHUGEPAGE : MADV_HUGEPAGE);
+   TEST_ASSERT(ret == 0, "madvise failed, addr: %p length: 0x%lx 
src_type: %s",
+   region->host_mem, npages * vm->page_size,
+   vm_mem_backing_src_alias(src_type)->name);
}
 
region->unused_phy_pages = sparsebit_alloc();
-- 
2.19.1



[RFC PATCH v5 10/10] KVM: selftests: Add a test for kvm page table code

2021-03-23 Thread Yanan Wang
This test serves as a performance tester and a bug reproducer for
kvm page table code (GPA->HPA mappings), so it gives guidance to
people trying to make some improvement to kvm.

The function guest_code() can cover the conditions where a single vcpu or
multiple vcpus access guest pages within the same memory region, in three
VM stages (before dirty logging, during dirty logging, after dirty logging).
Besides, the backing src memory type (ANONYMOUS/THP/HUGETLB) of the tested
memory region can be specified by users, which means normal page mappings
or block mappings can be chosen by users to be created in the test.

If ANONYMOUS memory is specified, kvm will create normal page mappings
for the tested memory region before dirty logging, and update attributes
of the page mappings from RO to RW during dirty logging. If THP/HUGETLB
memory is specified, kvm will create block mappings for the tested memory
region before dirty logging, and split the block mappings into normal page
mappings during dirty logging, and coalesce the page mappings back into
block mappings after dirty logging is stopped.

So in summary, as a performance tester, this test can present the
performance of kvm creating/updating normal page mappings, or the
performance of kvm creating/splitting/recovering block mappings,
through execution time.

When we need to coalesce the page mappings back to block mappings after
dirty logging is stopped, we have to first invalidate *all* the TLB
entries for the page mappings right before installation of the block entry,
because a TLB conflict abort error could occur if we can't invalidate the
TLB entries fully. We have hit this TLB conflict twice in the aarch64
software implementation and fixed it. As this test can simulate the process
of a VM with block mappings going from dirty logging enabled to dirty
logging stopped, it can also reproduce this TLB conflict abort due to
inadequate TLB invalidation when coalescing tables.

Signed-off-by: Yanan Wang 
Reviewed-by: Ben Gardon 
---
 tools/testing/selftests/kvm/.gitignore|   1 +
 tools/testing/selftests/kvm/Makefile  |   3 +
 .../selftests/kvm/kvm_page_table_test.c   | 512 ++
 3 files changed, 516 insertions(+)
 create mode 100644 tools/testing/selftests/kvm/kvm_page_table_test.c

diff --git a/tools/testing/selftests/kvm/.gitignore 
b/tools/testing/selftests/kvm/.gitignore
index 32b87cc77c8e..137ab7273be6 100644
--- a/tools/testing/selftests/kvm/.gitignore
+++ b/tools/testing/selftests/kvm/.gitignore
@@ -35,6 +35,7 @@
 /dirty_log_perf_test
 /hardware_disable_test
 /kvm_create_max_vcpus
+/kvm_page_table_test
 /memslot_modification_stress_test
 /set_memory_region_test
 /steal_time
diff --git a/tools/testing/selftests/kvm/Makefile 
b/tools/testing/selftests/kvm/Makefile
index a6d61f451f88..75dc57db36b4 100644
--- a/tools/testing/selftests/kvm/Makefile
+++ b/tools/testing/selftests/kvm/Makefile
@@ -69,6 +69,7 @@ TEST_GEN_PROGS_x86_64 += dirty_log_test
 TEST_GEN_PROGS_x86_64 += dirty_log_perf_test
 TEST_GEN_PROGS_x86_64 += hardware_disable_test
 TEST_GEN_PROGS_x86_64 += kvm_create_max_vcpus
+TEST_GEN_PROGS_x86_64 += kvm_page_table_test
 TEST_GEN_PROGS_x86_64 += memslot_modification_stress_test
 TEST_GEN_PROGS_x86_64 += set_memory_region_test
 TEST_GEN_PROGS_x86_64 += steal_time
@@ -79,6 +80,7 @@ TEST_GEN_PROGS_aarch64 += demand_paging_test
 TEST_GEN_PROGS_aarch64 += dirty_log_test
 TEST_GEN_PROGS_aarch64 += dirty_log_perf_test
 TEST_GEN_PROGS_aarch64 += kvm_create_max_vcpus
+TEST_GEN_PROGS_aarch64 += kvm_page_table_test
 TEST_GEN_PROGS_aarch64 += set_memory_region_test
 TEST_GEN_PROGS_aarch64 += steal_time
 
@@ -88,6 +90,7 @@ TEST_GEN_PROGS_s390x += s390x/sync_regs_test
 TEST_GEN_PROGS_s390x += demand_paging_test
 TEST_GEN_PROGS_s390x += dirty_log_test
 TEST_GEN_PROGS_s390x += kvm_create_max_vcpus
+TEST_GEN_PROGS_s390x += kvm_page_table_test
 TEST_GEN_PROGS_s390x += set_memory_region_test
 
 TEST_GEN_PROGS += $(TEST_GEN_PROGS_$(UNAME_M))
diff --git a/tools/testing/selftests/kvm/kvm_page_table_test.c 
b/tools/testing/selftests/kvm/kvm_page_table_test.c
new file mode 100644
index ..bbd5c489d61f
--- /dev/null
+++ b/tools/testing/selftests/kvm/kvm_page_table_test.c
@@ -0,0 +1,512 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * KVM page table test
+ *
+ * Copyright (C) 2021, Huawei, Inc.
+ *
+ * Make sure that THP has been enabled or enough HUGETLB pages with specific
+ * page size have been pre-allocated on your system, if you are planning to
+ * use hugepages to back the guest memory for testing.
+ */
+
+#define _GNU_SOURCE /* for program_invocation_name */
+
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include "test_util.h"
+#include "kvm_util.h"
+#include "processor.h"
+#include "guest_modes.h"
+
+#define TEST_MEM_SLOT_INDEX 1
+
+/* Default size(1GB) of the memory for testing */
+#define DEFAULT_TEST_MEM_SIZE  (1 << 30)
+
+/

[RFC PATCH v5 08/10] KVM: selftests: List all hugetlb src types specified with page sizes

2021-03-23 Thread Yanan Wang
With VM_MEM_SRC_ANONYMOUS_HUGETLB, we currently can only use system
default hugetlb pages to back the testing guest memory. In order to
add flexibility, now list all the known hugetlb backing src types with
different page sizes, so that we can specify use of hugetlb pages of the
exact granularity that we want. And as all the known hugetlb page sizes
are listed, it's appropriate for all architectures.

Besides, the helper get_backing_src_pagesz() is added to get the
granularity of different backing src types (anonymous, thp, hugetlb).

Suggested-by: Ben Gardon 
Signed-off-by: Yanan Wang 
---
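For illustration (a sketch built on the helpers introduced here; the
backing src name is an example), a command-line string can be turned
into mmap() flags like so:

   enum vm_mem_backing_src_type t = parse_backing_src_type("anonymous_hugetlb_1gb");
   uint32_t flags = MAP_PRIVATE | MAP_ANONYMOUS |
                    vm_mem_backing_src_alias(t)->flag;
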
 .../testing/selftests/kvm/include/test_util.h |  18 ++-
 tools/testing/selftests/kvm/lib/kvm_util.c|   2 +-
 tools/testing/selftests/kvm/lib/test_util.c   | 109 --
 3 files changed, 116 insertions(+), 13 deletions(-)

diff --git a/tools/testing/selftests/kvm/include/test_util.h 
b/tools/testing/selftests/kvm/include/test_util.h
index e087174eefe5..fade3130eb01 100644
--- a/tools/testing/selftests/kvm/include/test_util.h
+++ b/tools/testing/selftests/kvm/include/test_util.h
@@ -71,16 +71,32 @@ enum vm_mem_backing_src_type {
VM_MEM_SRC_ANONYMOUS,
VM_MEM_SRC_ANONYMOUS_THP,
VM_MEM_SRC_ANONYMOUS_HUGETLB,
+   VM_MEM_SRC_ANONYMOUS_HUGETLB_16KB,
+   VM_MEM_SRC_ANONYMOUS_HUGETLB_64KB,
+   VM_MEM_SRC_ANONYMOUS_HUGETLB_512KB,
+   VM_MEM_SRC_ANONYMOUS_HUGETLB_1MB,
+   VM_MEM_SRC_ANONYMOUS_HUGETLB_2MB,
+   VM_MEM_SRC_ANONYMOUS_HUGETLB_8MB,
+   VM_MEM_SRC_ANONYMOUS_HUGETLB_16MB,
+   VM_MEM_SRC_ANONYMOUS_HUGETLB_32MB,
+   VM_MEM_SRC_ANONYMOUS_HUGETLB_256MB,
+   VM_MEM_SRC_ANONYMOUS_HUGETLB_512MB,
+   VM_MEM_SRC_ANONYMOUS_HUGETLB_1GB,
+   VM_MEM_SRC_ANONYMOUS_HUGETLB_2GB,
+   VM_MEM_SRC_ANONYMOUS_HUGETLB_16GB,
+   NUM_SRC_TYPES,
 };
 
 struct vm_mem_backing_src_alias {
const char *name;
-   enum vm_mem_backing_src_type type;
+   uint32_t flag;
 };
 
 bool thp_configured(void);
 size_t get_trans_hugepagesz(void);
 size_t get_def_hugetlb_pagesz(void);
+const struct vm_mem_backing_src_alias *vm_mem_backing_src_alias(uint32_t i);
+size_t get_backing_src_pagesz(uint32_t i);
 void backing_src_help(void);
 enum vm_mem_backing_src_type parse_backing_src_type(const char *type_name);
 
diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c 
b/tools/testing/selftests/kvm/lib/kvm_util.c
index 2ea837fe03af..3506174c2053 100644
--- a/tools/testing/selftests/kvm/lib/kvm_util.c
+++ b/tools/testing/selftests/kvm/lib/kvm_util.c
@@ -759,7 +759,7 @@ void vm_userspace_mem_region_add(struct kvm_vm *vm,
region->mmap_start = mmap(NULL, region->mmap_size,
  PROT_READ | PROT_WRITE,
  MAP_PRIVATE | MAP_ANONYMOUS
- | (src_type == VM_MEM_SRC_ANONYMOUS_HUGETLB ? 
MAP_HUGETLB : 0),
+ | vm_mem_backing_src_alias(src_type)->flag,
  -1, 0);
TEST_ASSERT(region->mmap_start != MAP_FAILED,
"test_malloc failed, mmap_start: %p errno: %i",
diff --git a/tools/testing/selftests/kvm/lib/test_util.c 
b/tools/testing/selftests/kvm/lib/test_util.c
index 665724ccab97..205408bffa38 100644
--- a/tools/testing/selftests/kvm/lib/test_util.c
+++ b/tools/testing/selftests/kvm/lib/test_util.c
@@ -11,6 +11,7 @@
 #include 
 #include 
 #include 
+#include 
 #include "linux/kernel.h"
 
 #include "test_util.h"
@@ -112,12 +113,6 @@ void print_skip(const char *fmt, ...)
puts(", skipping test");
 }
 
-const struct vm_mem_backing_src_alias backing_src_aliases[] = {
-   {"anonymous", VM_MEM_SRC_ANONYMOUS,},
-   {"anonymous_thp", VM_MEM_SRC_ANONYMOUS_THP,},
-   {"anonymous_hugetlb", VM_MEM_SRC_ANONYMOUS_HUGETLB,},
-};
-
 bool thp_configured(void)
 {
int ret;
@@ -171,22 +166,114 @@ size_t get_def_hugetlb_pagesz(void)
return 0;
 }
 
+const struct vm_mem_backing_src_alias *vm_mem_backing_src_alias(uint32_t i)
+{
+   static const struct vm_mem_backing_src_alias aliases[] = {
+   [VM_MEM_SRC_ANONYMOUS] = {
+   .name = "anonymous",
+   .flag = 0,
+   },
+   [VM_MEM_SRC_ANONYMOUS_THP] = {
+   .name = "anonymous_thp",
+   .flag = 0,
+   },
+   [VM_MEM_SRC_ANONYMOUS_HUGETLB] = {
+   .name = "anonymous_hugetlb",
+   .flag = MAP_HUGETLB,
+   },
+   [VM_MEM_SRC_ANONYMOUS_HUGETLB_16KB] = {
+   .name = "anonymous_hugetlb_16kb",
+   .flag = MAP_HUGETLB | MAP_HUGE_16KB,
+   },
+   [VM_MEM_SRC_ANONYMOUS_HUGETLB_64KB] = {
+   .name = "anonymous_hugetlb_64kb",

[RFC PATCH v5 06/10] KVM: selftests: Add a helper to get system configured THP page size

2021-03-23 Thread Yanan Wang
If we want to have some tests for transparent hugepages, the THP hugepage
size configured on the system should be known to the tests, so that it can
be used for alignment and for guest memory accesses by the vcpus.
So it makes sense to add a helper to get the transparent hugepage size.

With VM_MEM_SRC_ANONYMOUS_THP specified in vm_userspace_mem_region_add(),
we now stat /sys/kernel/mm/transparent_hugepage to check whether THP is
configured in the host kernel before madvise(). Based on this, we can also
read the file /sys/kernel/mm/transparent_hugepage/hpage_pmd_size to get the
THP hugepage size.

Signed-off-by: Yanan Wang 
Reviewed-by: Ben Gardon 
---
 .../testing/selftests/kvm/include/test_util.h |  2 ++
 tools/testing/selftests/kvm/lib/test_util.c   | 29 +++
 2 files changed, 31 insertions(+)

diff --git a/tools/testing/selftests/kvm/include/test_util.h 
b/tools/testing/selftests/kvm/include/test_util.h
index b7f41399f22c..ef24c76ba89a 100644
--- a/tools/testing/selftests/kvm/include/test_util.h
+++ b/tools/testing/selftests/kvm/include/test_util.h
@@ -78,6 +78,8 @@ struct vm_mem_backing_src_alias {
enum vm_mem_backing_src_type type;
 };
 
+bool thp_configured(void);
+size_t get_trans_hugepagesz(void);
 void backing_src_help(void);
 enum vm_mem_backing_src_type parse_backing_src_type(const char *type_name);
 
diff --git a/tools/testing/selftests/kvm/lib/test_util.c 
b/tools/testing/selftests/kvm/lib/test_util.c
index c7c0627c6842..efc1a7782de0 100644
--- a/tools/testing/selftests/kvm/lib/test_util.c
+++ b/tools/testing/selftests/kvm/lib/test_util.c
@@ -10,6 +10,7 @@
 #include 
 #include 
 #include 
+#include 
 #include "linux/kernel.h"
 
 #include "test_util.h"
@@ -117,6 +118,34 @@ const struct vm_mem_backing_src_alias 
backing_src_aliases[] = {
{"anonymous_hugetlb", VM_MEM_SRC_ANONYMOUS_HUGETLB,},
 };
 
+bool thp_configured(void)
+{
+   int ret;
+   struct stat statbuf;
+
+   ret = stat("/sys/kernel/mm/transparent_hugepage", &statbuf);
+   TEST_ASSERT(ret == 0 || (ret == -1 && errno == ENOENT),
+   "Error in stating /sys/kernel/mm/transparent_hugepage");
+
+   return ret == 0;
+}
+
+size_t get_trans_hugepagesz(void)
+{
+   size_t size;
+   int ret;
+   FILE *f;
+
+   TEST_ASSERT(thp_configured(), "THP is not configured in host kernel");
+
+   f = fopen("/sys/kernel/mm/transparent_hugepage/hpage_pmd_size", "r");
+   TEST_ASSERT(f != NULL, "Error in opening transparent_hugepage/hpage_pmd_size");
+
+   /* Check the fscanf() return value to catch a malformed/empty file */
+   ret = fscanf(f, "%zu", &size);
+   TEST_ASSERT(ret == 1, "Error in reading transparent_hugepage/hpage_pmd_size");
+   fclose(f);
+
+   return size;
+}
+
 void backing_src_help(void)
 {
int i;
-- 
2.19.1



[RFC PATCH v5 04/10] KVM: selftests: Print the errno besides error-string in TEST_ASSERT

2021-03-23 Thread Yanan Wang
Printing the errno besides the error-string in TEST_ASSERT, in the format
of "errno=%d - %s", will explicitly indicate that the string is error
information. Besides, the errno is easier to use for debugging than the
error-string.

Suggested-by: Andrew Jones 
Signed-off-by: Yanan Wang 
---
 tools/testing/selftests/kvm/lib/assert.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tools/testing/selftests/kvm/lib/assert.c 
b/tools/testing/selftests/kvm/lib/assert.c
index 5ebbd0d6b472..71ade6100fd3 100644
--- a/tools/testing/selftests/kvm/lib/assert.c
+++ b/tools/testing/selftests/kvm/lib/assert.c
@@ -71,9 +71,9 @@ test_assert(bool exp, const char *exp_str,
 
fprintf(stderr, " Test Assertion Failure \n"
"  %s:%u: %s\n"
-   "  pid=%d tid=%d - %s\n",
+   "  pid=%d tid=%d errno=%d - %s\n",
file, line, exp_str, getpid(), _gettid(),
-   strerror(errno));
+   errno, strerror(errno));
test_dump_stack();
if (fmt) {
fputs("  ", stderr);
-- 
2.19.1



[RFC PATCH v5 02/10] tools headers: Add a macro to get HUGETLB page sizes for mmap

2021-03-23 Thread Yanan Wang
We know that if a system supports multiple hugetlb page sizes,
the desired hugetlb page size can be specified in bits [26:31]
of the mmap() flags argument. The value in these 6 bits is the
shift of the requested hugetlb page size.

So add a macro that extracts the page size shift and then calculates
the corresponding hugetlb page size from flags x.

Cc: Ben Gardon 
Cc: Ingo Molnar 
Cc: Adrian Hunter 
Cc: Jiri Olsa 
Cc: Arnaldo Carvalho de Melo 
Cc: Arnd Bergmann 
Cc: Michael Kerrisk 
Cc: Thomas Gleixner 
Suggested-by: Ben Gardon 
Signed-off-by: Yanan Wang 
Reviewed-by: Ben Gardon 
---
 include/uapi/linux/mman.h   | 2 ++
 tools/include/uapi/linux/mman.h | 2 ++
 2 files changed, 4 insertions(+)

diff --git a/include/uapi/linux/mman.h b/include/uapi/linux/mman.h
index f55bc680b5b0..d72df73b182d 100644
--- a/include/uapi/linux/mman.h
+++ b/include/uapi/linux/mman.h
@@ -41,4 +41,6 @@
 #define MAP_HUGE_2GB   HUGETLB_FLAG_ENCODE_2GB
 #define MAP_HUGE_16GB  HUGETLB_FLAG_ENCODE_16GB
 
+#define MAP_HUGE_PAGE_SIZE(x) (1ULL << ((x >> MAP_HUGE_SHIFT) & MAP_HUGE_MASK))
+
 #endif /* _UAPI_LINUX_MMAN_H */
diff --git a/tools/include/uapi/linux/mman.h b/tools/include/uapi/linux/mman.h
index f55bc680b5b0..d72df73b182d 100644
--- a/tools/include/uapi/linux/mman.h
+++ b/tools/include/uapi/linux/mman.h
@@ -41,4 +41,6 @@
 #define MAP_HUGE_2GB   HUGETLB_FLAG_ENCODE_2GB
 #define MAP_HUGE_16GB  HUGETLB_FLAG_ENCODE_16GB
 
+#define MAP_HUGE_PAGE_SIZE(x) (1ULL << ((x >> MAP_HUGE_SHIFT) & MAP_HUGE_MASK))
+
 #endif /* _UAPI_LINUX_MMAN_H */
-- 
2.19.1



[RFC PATCH v5 03/10] KVM: selftests: Use flag CLOCK_MONOTONIC_RAW for timing

2021-03-23 Thread Yanan Wang
In addition to the functionality of CLOCK_MONOTONIC, the flag
CLOCK_MONOTONIC_RAW can also shield the possible impact of NTP adjustments,
which provides more robustness.

Suggested-by: Vitaly Kuznetsov 
Signed-off-by: Yanan Wang 
Reviewed-by: Ben Gardon 
Reviewed-by: Andrew Jones 
---
---
 tools/testing/selftests/kvm/demand_paging_test.c  |  8 
 tools/testing/selftests/kvm/dirty_log_perf_test.c | 14 +++---
 tools/testing/selftests/kvm/lib/test_util.c   |  2 +-
 tools/testing/selftests/kvm/steal_time.c  |  4 ++--
 4 files changed, 14 insertions(+), 14 deletions(-)

diff --git a/tools/testing/selftests/kvm/demand_paging_test.c 
b/tools/testing/selftests/kvm/demand_paging_test.c
index 5f7a229c3af1..efbf0c1e9130 100644
--- a/tools/testing/selftests/kvm/demand_paging_test.c
+++ b/tools/testing/selftests/kvm/demand_paging_test.c
@@ -53,7 +53,7 @@ static void *vcpu_worker(void *data)
vcpu_args_set(vm, vcpu_id, 1, vcpu_id);
run = vcpu_state(vm, vcpu_id);
 
-   clock_gettime(CLOCK_MONOTONIC, &start);
+   clock_gettime(CLOCK_MONOTONIC_RAW, &start);
 
/* Let the guest access its memory */
ret = _vcpu_run(vm, vcpu_id);
@@ -86,7 +86,7 @@ static int handle_uffd_page_request(int uffd, uint64_t addr)
copy.len = perf_test_args.host_page_size;
copy.mode = 0;
 
-   clock_gettime(CLOCK_MONOTONIC, &start);
+   clock_gettime(CLOCK_MONOTONIC_RAW, &start);
 
r = ioctl(uffd, UFFDIO_COPY, &copy);
if (r == -1) {
@@ -123,7 +123,7 @@ static void *uffd_handler_thread_fn(void *arg)
struct timespec start;
struct timespec ts_diff;
 
-   clock_gettime(CLOCK_MONOTONIC, &start);
+   clock_gettime(CLOCK_MONOTONIC_RAW, &start);
while (!quit_uffd_thread) {
struct uffd_msg msg;
struct pollfd pollfd[2];
@@ -336,7 +336,7 @@ static void run_test(enum vm_guest_mode mode, void *arg)
 
pr_info("Finished creating vCPUs and starting uffd threads\n");
 
-   clock_gettime(CLOCK_MONOTONIC, &start);
+   clock_gettime(CLOCK_MONOTONIC_RAW, &start);
 
for (vcpu_id = 0; vcpu_id < nr_vcpus; vcpu_id++) {
pthread_create(&vcpu_threads[vcpu_id], NULL, vcpu_worker,
diff --git a/tools/testing/selftests/kvm/dirty_log_perf_test.c 
b/tools/testing/selftests/kvm/dirty_log_perf_test.c
index 04a2641261be..6cff4ccf9525 100644
--- a/tools/testing/selftests/kvm/dirty_log_perf_test.c
+++ b/tools/testing/selftests/kvm/dirty_log_perf_test.c
@@ -50,7 +50,7 @@ static void *vcpu_worker(void *data)
while (!READ_ONCE(host_quit)) {
int current_iteration = READ_ONCE(iteration);
 
-   clock_gettime(CLOCK_MONOTONIC, &start);
+   clock_gettime(CLOCK_MONOTONIC_RAW, &start);
ret = _vcpu_run(vm, vcpu_id);
ts_diff = timespec_elapsed(start);
 
@@ -141,7 +141,7 @@ static void run_test(enum vm_guest_mode mode, void *arg)
iteration = 0;
host_quit = false;
 
-   clock_gettime(CLOCK_MONOTONIC, &start);
+   clock_gettime(CLOCK_MONOTONIC_RAW, &start);
for (vcpu_id = 0; vcpu_id < nr_vcpus; vcpu_id++) {
vcpu_last_completed_iteration[vcpu_id] = -1;
 
@@ -162,7 +162,7 @@ static void run_test(enum vm_guest_mode mode, void *arg)
ts_diff.tv_sec, ts_diff.tv_nsec);
 
/* Enable dirty logging */
-   clock_gettime(CLOCK_MONOTONIC, &start);
+   clock_gettime(CLOCK_MONOTONIC_RAW, &start);
vm_mem_region_set_flags(vm, PERF_TEST_MEM_SLOT_INDEX,
KVM_MEM_LOG_DIRTY_PAGES);
ts_diff = timespec_elapsed(start);
@@ -174,7 +174,7 @@ static void run_test(enum vm_guest_mode mode, void *arg)
 * Incrementing the iteration number will start the vCPUs
 * dirtying memory again.
 */
-   clock_gettime(CLOCK_MONOTONIC, &start);
+   clock_gettime(CLOCK_MONOTONIC_RAW, &start);
iteration++;
 
pr_debug("Starting iteration %d\n", iteration);
@@ -189,7 +189,7 @@ static void run_test(enum vm_guest_mode mode, void *arg)
pr_info("Iteration %d dirty memory time: %ld.%.9lds\n",
iteration, ts_diff.tv_sec, ts_diff.tv_nsec);
 
-   clock_gettime(CLOCK_MONOTONIC, &start);
+   clock_gettime(CLOCK_MONOTONIC_RAW, &start);
kvm_vm_get_dirty_log(vm, PERF_TEST_MEM_SLOT_INDEX, bmap);
 
ts_diff = timespec_elapsed(start);
@@ -199,7 +199,7 @@ static void run_test(enum vm_guest_mode mode, void *arg)
iteration, ts_diff.tv_sec, ts_diff.tv_nsec);
 
if (dirty_log_manual_caps) {
-   clock_gettime(CLOCK_MONOTONIC, &start);
+   clock_gettime

[RFC PATCH v5 05/10] KVM: selftests: Make a generic helper to get vm guest mode strings

2021-03-23 Thread Yanan Wang
For generality and conciseness, make an API which can be used in all
kvm libs and selftests to get vm guest mode strings. And the index i
is checked in the API in case of possible faults.

Suggested-by: Sean Christopherson 
Signed-off-by: Yanan Wang 
Reviewed-by: Andrew Jones 
Reviewed-by: Ben Gardon 
---
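A usage sketch: callers now go through the bounds-checked function instead
of indexing an extern array directly, e.g.:

   pr_info("Testing guest mode: %s\n", vm_guest_mode_string(mode));
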
 .../testing/selftests/kvm/include/kvm_util.h  |  4 +--
 tools/testing/selftests/kvm/lib/kvm_util.c| 29 ---
 2 files changed, 19 insertions(+), 14 deletions(-)

diff --git a/tools/testing/selftests/kvm/include/kvm_util.h 
b/tools/testing/selftests/kvm/include/kvm_util.h
index 2d7eb6989e83..f52a7492f47f 100644
--- a/tools/testing/selftests/kvm/include/kvm_util.h
+++ b/tools/testing/selftests/kvm/include/kvm_util.h
@@ -68,9 +68,6 @@ enum vm_guest_mode {
 #define MIN_PAGE_SIZE  (1U << MIN_PAGE_SHIFT)
 #define PTES_PER_MIN_PAGE  ptes_per_page(MIN_PAGE_SIZE)
 
-#define vm_guest_mode_string(m) vm_guest_mode_string[m]
-extern const char * const vm_guest_mode_string[];
-
 struct vm_guest_mode_params {
unsigned int pa_bits;
unsigned int va_bits;
@@ -84,6 +81,7 @@ int vm_enable_cap(struct kvm_vm *vm, struct kvm_enable_cap 
*cap);
 int vcpu_enable_cap(struct kvm_vm *vm, uint32_t vcpu_id,
struct kvm_enable_cap *cap);
 void vm_enable_dirty_ring(struct kvm_vm *vm, uint32_t ring_size);
+const char *vm_guest_mode_string(uint32_t i);
 
 struct kvm_vm *vm_create(enum vm_guest_mode mode, uint64_t phy_pages, int 
perm);
 void kvm_vm_free(struct kvm_vm *vmp);
diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c 
b/tools/testing/selftests/kvm/lib/kvm_util.c
index e5fbf16f725b..2ea837fe03af 100644
--- a/tools/testing/selftests/kvm/lib/kvm_util.c
+++ b/tools/testing/selftests/kvm/lib/kvm_util.c
@@ -143,17 +143,24 @@ static void vm_open(struct kvm_vm *vm, int perm)
"rc: %i errno: %i", vm->fd, errno);
 }
 
-const char * const vm_guest_mode_string[] = {
-   "PA-bits:52,  VA-bits:48,  4K pages",
-   "PA-bits:52,  VA-bits:48, 64K pages",
-   "PA-bits:48,  VA-bits:48,  4K pages",
-   "PA-bits:48,  VA-bits:48, 64K pages",
-   "PA-bits:40,  VA-bits:48,  4K pages",
-   "PA-bits:40,  VA-bits:48, 64K pages",
-   "PA-bits:ANY, VA-bits:48,  4K pages",
-};
-_Static_assert(sizeof(vm_guest_mode_string)/sizeof(char *) == NUM_VM_MODES,
-  "Missing new mode strings?");
+const char *vm_guest_mode_string(uint32_t i)
+{
+   static const char * const strings[] = {
+   [VM_MODE_P52V48_4K] = "PA-bits:52,  VA-bits:48,  4K pages",
+   [VM_MODE_P52V48_64K]= "PA-bits:52,  VA-bits:48, 64K pages",
+   [VM_MODE_P48V48_4K] = "PA-bits:48,  VA-bits:48,  4K pages",
+   [VM_MODE_P48V48_64K]= "PA-bits:48,  VA-bits:48, 64K pages",
+   [VM_MODE_P40V48_4K] = "PA-bits:40,  VA-bits:48,  4K pages",
+   [VM_MODE_P40V48_64K]= "PA-bits:40,  VA-bits:48, 64K pages",
+   [VM_MODE_PXXV48_4K] = "PA-bits:ANY, VA-bits:48,  4K pages",
+   };
+   _Static_assert(sizeof(strings)/sizeof(char *) == NUM_VM_MODES,
+  "Missing new mode strings?");
+
+   TEST_ASSERT(i < NUM_VM_MODES, "Guest mode ID %d too big", i);
+
+   return strings[i];
+}
 
 const struct vm_guest_mode_params vm_guest_mode_params[] = {
{ 52, 48,  0x1000, 12 },
-- 
2.19.1



[RFC PATCH v5 01/10] tools headers: sync headers of asm-generic/hugetlb_encode.h

2021-03-23 Thread Yanan Wang
This patch syncs contents of tools/include/asm-generic/hugetlb_encode.h
and include/uapi/asm-generic/hugetlb_encode.h. Arch powerpc supports 16KB
hugepages and ARM64 supports 32MB/512MB hugepages. The corresponding mmap
flags have already been added in include/uapi/asm-generic/hugetlb_encode.h,
but not tools/include/asm-generic/hugetlb_encode.h.

Cc: Ingo Molnar 
Cc: Adrian Hunter 
Cc: Jiri Olsa 
Cc: Arnaldo Carvalho de Melo 
Signed-off-by: Yanan Wang 
Reviewed-by: Ben Gardon 
---
 tools/include/asm-generic/hugetlb_encode.h | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/tools/include/asm-generic/hugetlb_encode.h 
b/tools/include/asm-generic/hugetlb_encode.h
index e4732d3c2998..4f3d5aaa11f5 100644
--- a/tools/include/asm-generic/hugetlb_encode.h
+++ b/tools/include/asm-generic/hugetlb_encode.h
@@ -20,13 +20,16 @@
 #define HUGETLB_FLAG_ENCODE_SHIFT  26
 #define HUGETLB_FLAG_ENCODE_MASK   0x3f
 
+#define HUGETLB_FLAG_ENCODE_16KB   (14 << HUGETLB_FLAG_ENCODE_SHIFT)
 #define HUGETLB_FLAG_ENCODE_64KB   (16 << HUGETLB_FLAG_ENCODE_SHIFT)
 #define HUGETLB_FLAG_ENCODE_512KB  (19 << HUGETLB_FLAG_ENCODE_SHIFT)
 #define HUGETLB_FLAG_ENCODE_1MB(20 << 
HUGETLB_FLAG_ENCODE_SHIFT)
 #define HUGETLB_FLAG_ENCODE_2MB(21 << 
HUGETLB_FLAG_ENCODE_SHIFT)
 #define HUGETLB_FLAG_ENCODE_8MB(23 << 
HUGETLB_FLAG_ENCODE_SHIFT)
 #define HUGETLB_FLAG_ENCODE_16MB   (24 << HUGETLB_FLAG_ENCODE_SHIFT)
+#define HUGETLB_FLAG_ENCODE_32MB   (25 << HUGETLB_FLAG_ENCODE_SHIFT)
 #define HUGETLB_FLAG_ENCODE_256MB  (28 << HUGETLB_FLAG_ENCODE_SHIFT)
+#define HUGETLB_FLAG_ENCODE_512MB  (29 << HUGETLB_FLAG_ENCODE_SHIFT)
 #define HUGETLB_FLAG_ENCODE_1GB(30 << 
HUGETLB_FLAG_ENCODE_SHIFT)
 #define HUGETLB_FLAG_ENCODE_2GB(31 << 
HUGETLB_FLAG_ENCODE_SHIFT)
 #define HUGETLB_FLAG_ENCODE_16GB   (34 << HUGETLB_FLAG_ENCODE_SHIFT)
-- 
2.19.1



[RFC PATCH v5 00/10] KVM: selftests: some improvement and a new test for kvm page table

2021-03-23 Thread Yanan Wang
Hi,
This v5 series mainly includes two parts.
Based on kvm queue branch: 
https://git.kernel.org/pub/scm/virt/kvm/kvm.git/log/?h=queue

In the first part, all the known hugetlb backing src types specified
with different hugepage sizes are listed, so that we can specify use
of a hugetlb source of the exact granularity that we want, instead of
the system default one. And as all the known hugetlb page sizes are
listed, it's appropriate for all architectures. Besides, a helper that
can get the granularity of different backing src types (anonymous/thp/
hugetlb) is added, so that we can use the accurate backing src granularity
for alignment and for guest memory accesses by the vcpus.

In the second part, a new test is added:
This test is added to serve as a performance tester and a bug reproducer
for kvm page table code (GPA->HPA mappings), and it gives guidance to
people trying to make some improvement to kvm. The following explains
what exactly we can do through this test.

The function guest_code() can cover the conditions where a single vcpu or
multiple vcpus access guest pages within the same memory region, in three
VM stages (before dirty logging, during dirty logging, after dirty logging).
Besides, the backing src memory type (ANONYMOUS/THP/HUGETLB) of the tested
memory region can be specified by users, which means normal page mappings
or block mappings can be chosen by users to be created in the test.

If ANONYMOUS memory is specified, kvm will create normal page mappings
for the tested memory region before dirty logging, and update attributes
of the page mappings from RO to RW during dirty logging. If THP/HUGETLB
memory is specified, kvm will create block mappings for the tested memory
region before dirty logging, and split the block mappings into normal page
mappings during dirty logging, and coalesce the page mappings back into
block mappings after dirty logging is stopped.

So in summary, as a performance tester, this test can present the
performance of kvm creating/updating normal page mappings, or the
performance of kvm creating/splitting/recovering block mappings,
through execution time.

When we need to coalesce the page mappings back to block mappings after
dirty logging is stopped, we have to first invalidate *all* the TLB
entries for the page mappings right before installation of the block entry,
because a TLB conflict abort error could occur if we can't invalidate the
TLB entries fully. We have hit this TLB conflict twice in the aarch64
software implementation and fixed it. As this test can simulate the process
of a VM with block mappings going from dirty logging enabled to dirty
logging stopped, it can also reproduce this TLB conflict abort due to
inadequate TLB invalidation when coalescing tables.

Links about the TLB conflict abort:
https://lore.kernel.org/lkml/20201201201034.116760-3-wangyana...@huawei.com/

---

Change logs:

v4->v5:
- Use synchronization(sem_wait) for time measurement
- Add a new patch about TEST_ASSERT(patch 4)
- Address Andrew Jones's comments for v4 series
- Add Andrew Jones's R-b tags in some patches
- v4: 
https://lore.kernel.org/lkml/20210302125751.19080-1-wangyana...@huawei.com/

v3->v4:
- Add a helper to get system default hugetlb page size
- Add tags of Reviewed-by of Ben in the patches
- v3: 
https://lore.kernel.org/lkml/20210301065916.11484-1-wangyana...@huawei.com/

v2->v3:
- Add tags of Suggested-by, Reviewed-by in the patches
- Add a generic macro to get hugetlb page sizes
- Some changes for suggestions about v2 series
- v2: 
https://lore.kernel.org/lkml/20210225055940.18748-1-wangyana...@huawei.com/

v1->v2:
- Add a patch to sync header files
- Add helpers to get granularity of different backing src types
- Some changes for suggestions about v1 series
- v1: 
https://lore.kernel.org/lkml/20210208090841.333724-1-wangyana...@huawei.com/

---

Yanan Wang (10):
  tools headers: sync headers of asm-generic/hugetlb_encode.h
  tools headers: Add a macro to get HUGETLB page sizes for mmap
  KVM: selftests: Use flag CLOCK_MONOTONIC_RAW for timing
  KVM: selftests: Print the errno besides error-string in TEST_ASSERT
  KVM: selftests: Make a generic helper to get vm guest mode strings
  KVM: selftests: Add a helper to get system configured THP page size
  KVM: selftests: Add a helper to get system default hugetlb page size
  KVM: selftests: List all hugetlb src types specified with page sizes
  KVM: selftests: Adapt vm_userspace_mem_region_add to new helpers
  KVM: selftests: Add a test for kvm page table code

 include/uapi/linux/mman.h |   2 +
 tools/include/asm-generic/hugetlb_encode.h|   3 +
 tools/include/uapi/linux/mman.h   |   2 +
 tools/testing/selftests/kvm/.gitignore|   1 +
 tools/testing/selftests/kvm/Makefile  |   3 +
 .../selftests/kvm/demand_paging_test.c|   8 +-
 .../selftests/kvm/dirty_log_perf_test.c   |  14 +-
 .../testing/selftests/kvm/include/kvm_util.h  

[RFC PATCH v2 2/3] KVM: arm64: Install the block entry before unmapping the page mappings

2021-03-10 Thread Yanan Wang
When KVM needs to coalesce the existing normal page mappings into a block
mapping, we currently perform the following steps in order:
1) invalidate the table entry in the PMD/PUD table
2) flush TLB by VMID
3) unmap the old sub-level tables
4) install the new block entry to the PMD/PUD table

Unmapping the old page mappings in step 3 can take a long time, which
means there is a long window (steps 1, 2, 3) during which the PMD/PUD
table entry is invalid. Other vcpus are therefore very likely to trigger
unnecessary translation faults if they access any page within the block
while the table entry is invalid.

So let's install the block entry first to ensure uninterrupted memory
access for the other vcpus, and then unmap the old page mappings after
installation. This removes most of the window in which the table entry
is invalid, and avoids most of the unnecessary translation faults.

After this patch the steps can be like:
1) invalidate the table entry in the PMD/PUD table
2) flush TLB by VMID
3) install the new block entry to the PMD/PUD table
4) unmap the old sub-level tables

Signed-off-by: Yanan Wang 
---
 arch/arm64/kvm/hyp/pgtable.c | 44 
 1 file changed, 30 insertions(+), 14 deletions(-)

diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c
index 829a34eea526..b40b1f1615c7 100644
--- a/arch/arm64/kvm/hyp/pgtable.c
+++ b/arch/arm64/kvm/hyp/pgtable.c
@@ -434,6 +434,7 @@ struct stage2_map_data {
kvm_pte_t   attr;
 
kvm_pte_t   *anchor;
+   kvm_pte_t   *follow;
 
struct kvm_s2_mmu   *mmu;
struct kvm_mmu_memory_cache *memcache;
@@ -545,6 +546,24 @@ static int stage2_map_walker_try_leaf(u64 addr, u64 end, 
u32 level,
return 0;
 }
 
+static void stage2_coalesce_tables_into_block(u64 addr, u32 level,
+ kvm_pte_t *ptep,
+ struct stage2_map_data *data)
+{
+   u64 granule = kvm_granule_size(level), phys = data->phys;
+   kvm_pte_t new = kvm_init_valid_leaf_pte(phys, data->attr, level);
+
+   kvm_set_invalid_pte(ptep);
+
+   /*
+* Invalidate the whole stage-2, as we may have numerous leaf entries
+* below us which would otherwise need invalidating individually.
+*/
+   kvm_call_hyp(__kvm_tlb_flush_vmid, data->mmu);
+   smp_store_release(ptep, new);
+   data->phys += granule;
+}
+
 static int stage2_map_walk_table_pre(u64 addr, u64 end, u32 level,
 kvm_pte_t *ptep,
 struct stage2_map_data *data)
@@ -555,15 +574,14 @@ static int stage2_map_walk_table_pre(u64 addr, u64 end, 
u32 level,
if (!kvm_block_mapping_supported(addr, end, data->phys, level))
return 0;
 
-   kvm_set_invalid_pte(ptep);
-
/*
-* Invalidate the whole stage-2, as we may have numerous leaf
-* entries below us which would otherwise need invalidating
-* individually.
+* If we need to coalesce existing table entries into a block here,
+* then install the block entry first and the sub-level page mappings
+* will be unmapped later.
 */
-   kvm_call_hyp(__kvm_tlb_flush_vmid, data->mmu);
data->anchor = ptep;
+   data->follow = kvm_pte_follow(*ptep);
+   stage2_coalesce_tables_into_block(addr, level, ptep, data);
return 0;
 }
 
@@ -616,20 +634,18 @@ static int stage2_map_walk_table_post(u64 addr, u64 end, 
u32 level,
  kvm_pte_t *ptep,
  struct stage2_map_data *data)
 {
-   int ret = 0;
-
if (!data->anchor)
return 0;
 
-   free_page((unsigned long)kvm_pte_follow(*ptep));
-   put_page(virt_to_page(ptep));
-
-   if (data->anchor == ptep) {
+   if (data->anchor != ptep) {
+   free_page((unsigned long)kvm_pte_follow(*ptep));
+   put_page(virt_to_page(ptep));
+   } else {
+   free_page((unsigned long)data->follow);
data->anchor = NULL;
-   ret = stage2_map_walk_leaf(addr, end, level, ptep, data);
}
 
-   return ret;
+   return 0;
 }
 
 /*
-- 
2.19.1



[RFC PATCH v2 3/3] KVM: arm64: Distinguish cases of memcache allocations completely

2021-03-10 Thread Yanan Wang
With a guest translation fault, the memcache pages are not needed if KVM
is only about to install a new leaf entry into the existing page table.
And with a guest permission fault, the memcache pages are also not needed
for a write_fault in dirty-logging time if KVM is only about to update
the existing leaf entry instead of collapsing a block entry into a table.

By comparing fault_granule and vma_pagesize, cases that require allocations
from memcache and cases that don't can be distinguished completely.
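As a concrete illustration (a sketch only, using the variables from the
hunk below):

    /*
     * fault_granule == vma_pagesize (e.g. both 4KB, or both 2MB):
     *   only a leaf entry is installed or updated, no memcache pages needed.
     * fault_granule > vma_pagesize (e.g. a fault at a 2MB-granule lookup
     * level while mapping with 4KB pages):
     *   new sub-level page tables must be created, so top up the memcache.
     */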

Signed-off-by: Yanan Wang 
---
 arch/arm64/kvm/mmu.c | 25 -
 1 file changed, 12 insertions(+), 13 deletions(-)

diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
index 1eec9f63bc6f..05af40dc60c1 100644
--- a/arch/arm64/kvm/mmu.c
+++ b/arch/arm64/kvm/mmu.c
@@ -810,19 +810,6 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, 
phys_addr_t fault_ipa,
gfn = fault_ipa >> PAGE_SHIFT;
mmap_read_unlock(current->mm);
 
-   /*
-* Permission faults just need to update the existing leaf entry,
-* and so normally don't require allocations from the memcache. The
-* only exception to this is when dirty logging is enabled at runtime
-* and a write fault needs to collapse a block entry into a table.
-*/
-   if (fault_status != FSC_PERM || (logging_active && write_fault)) {
-   ret = kvm_mmu_topup_memory_cache(memcache,
-kvm_mmu_cache_min_pages(kvm));
-   if (ret)
-   return ret;
-   }
-
mmu_seq = vcpu->kvm->mmu_notifier_seq;
/*
 * Ensure the read of mmu_notifier_seq happens before we call
@@ -880,6 +867,18 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, 
phys_addr_t fault_ipa,
else if (cpus_have_const_cap(ARM64_HAS_CACHE_DIC))
prot |= KVM_PGTABLE_PROT_X;
 
+   /*
+* Allocations from the memcache are required only when granule of the
+* lookup level where the guest fault happened exceeds vma_pagesize,
+* which means new page tables will be created in the fault handlers.
+*/
+   if (fault_granule > vma_pagesize) {
+   ret = kvm_mmu_topup_memory_cache(memcache,
+kvm_mmu_cache_min_pages(kvm));
+   if (ret)
+   return ret;
+   }
+
/*
 * Under the premise of getting a FSC_PERM fault, we just need to relax
 * permissions only if vma_pagesize equals fault_granule. Otherwise,
-- 
2.19.1



[RFC PATCH v2 1/3] KVM: arm64: Move CMOs from user_mem_abort to the fault handlers

2021-03-10 Thread Yanan Wang
We currently uniformly perform CMOs of D-cache and I-cache in function
user_mem_abort() before calling the fault handlers. If we get concurrent
translation faults on the same IPA (page or block), only the first CMO
is necessary; the later ones are not.

By moving CMOs to the fault handlers, we can easily identify conditions
where they are really needed and avoid the unnecessary ones. As
performing CMOs is a time consuming process, especially when flushing a
block range, this solution reduces much of kvm's load and improves
efficiency of the page table code.

So let's move both clean of D-cache and invalidation of I-cache to the
map path and move only invalidation of I-cache to the permission path.
Since the original APIs for CMOs in mmu.c are only called in function
user_mem_abort, we now also move them to pgtable.c.

Signed-off-by: Yanan Wang 
---
 arch/arm64/include/asm/kvm_mmu.h | 31 ---
 arch/arm64/kvm/hyp/pgtable.c | 68 +---
 arch/arm64/kvm/mmu.c | 23 ++-
 3 files changed, 57 insertions(+), 65 deletions(-)

diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h
index 90873851f677..c31f88306d4e 100644
--- a/arch/arm64/include/asm/kvm_mmu.h
+++ b/arch/arm64/include/asm/kvm_mmu.h
@@ -177,37 +177,6 @@ static inline bool vcpu_has_cache_enabled(struct kvm_vcpu 
*vcpu)
return (vcpu_read_sys_reg(vcpu, SCTLR_EL1) & 0b101) == 0b101;
 }
 
-static inline void __clean_dcache_guest_page(kvm_pfn_t pfn, unsigned long size)
-{
-   void *va = page_address(pfn_to_page(pfn));
-
-   /*
-* With FWB, we ensure that the guest always accesses memory using
-* cacheable attributes, and we don't have to clean to PoC when
-* faulting in pages. Furthermore, FWB implies IDC, so cleaning to
-* PoU is not required either in this case.
-*/
-   if (cpus_have_const_cap(ARM64_HAS_STAGE2_FWB))
-   return;
-
-   kvm_flush_dcache_to_poc(va, size);
-}
-
-static inline void __invalidate_icache_guest_page(kvm_pfn_t pfn,
- unsigned long size)
-{
-   if (icache_is_aliasing()) {
-   /* any kind of VIPT cache */
-   __flush_icache_all();
-   } else if (is_kernel_in_hyp_mode() || !icache_is_vpipt()) {
-   /* PIPT or VPIPT at EL2 (see comment in 
__kvm_tlb_flush_vmid_ipa) */
-   void *va = page_address(pfn_to_page(pfn));
-
-   invalidate_icache_range((unsigned long)va,
-   (unsigned long)va + size);
-   }
-}
-
 void kvm_set_way_flush(struct kvm_vcpu *vcpu);
 void kvm_toggle_cache(struct kvm_vcpu *vcpu, bool was_enabled);
 
diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c
index 4d177ce1d536..829a34eea526 100644
--- a/arch/arm64/kvm/hyp/pgtable.c
+++ b/arch/arm64/kvm/hyp/pgtable.c
@@ -464,6 +464,43 @@ static int stage2_map_set_prot_attr(enum kvm_pgtable_prot 
prot,
return 0;
 }
 
+static bool stage2_pte_cacheable(kvm_pte_t pte)
+{
+   u64 memattr = pte & KVM_PTE_LEAF_ATTR_LO_S2_MEMATTR;
+   return memattr == PAGE_S2_MEMATTR(NORMAL);
+}
+
+static bool stage2_pte_executable(kvm_pte_t pte)
+{
+   return !(pte & KVM_PTE_LEAF_ATTR_HI_S2_XN);
+}
+
+static void stage2_flush_dcache(void *addr, u64 size)
+{
+   /*
+* With FWB, we ensure that the guest always accesses memory using
+* cacheable attributes, and we don't have to clean to PoC when
+* faulting in pages. Furthermore, FWB implies IDC, so cleaning to
+* PoU is not required either in this case.
+*/
+   if (cpus_have_const_cap(ARM64_HAS_STAGE2_FWB))
+   return;
+
+   __flush_dcache_area(addr, size);
+}
+
+static void stage2_invalidate_icache(void *addr, u64 size)
+{
+   if (icache_is_aliasing()) {
+   /* Flush any kind of VIPT icache */
+   __flush_icache_all();
+   } else if (is_kernel_in_hyp_mode() || !icache_is_vpipt()) {
+   /* PIPT or VPIPT at EL2 */
+   invalidate_icache_range((unsigned long)addr,
+   (unsigned long)addr + size);
+   }
+}
+
 static int stage2_map_walker_try_leaf(u64 addr, u64 end, u32 level,
  kvm_pte_t *ptep,
  struct stage2_map_data *data)
@@ -495,6 +532,13 @@ static int stage2_map_walker_try_leaf(u64 addr, u64 end, 
u32 level,
put_page(page);
}
 
+   /* Perform CMOs before installation of the new PTE */
+   if (!kvm_pte_valid(old) || stage2_pte_cacheable(old))
+   stage2_flush_dcache(__va(phys), granule);
+
+   if (stage2_pte_executable(new))
+   stage2_invalidate_icache(__va(phys), granule);
+
smp_store_release(ptep, new);
get_page(page);
data->phys +

[RFC PATCH v2 0/3] KVM: arm64: Improve efficiency of stage2 page table

2021-03-10 Thread Yanan Wang
memory, block mappings(HUGETLB 1G))
KVM_REBUILD_BLOCKS: before 2.965s -> after 0.359s  +87.55%

About patch 3:
A new method to distinguish cases of memcache allocations is introduced.
By comparing fault_granule and vma_pagesize, cases that require allocations
from memcache and cases that don't can be distinguished completely.

---

Yanan Wang (3):
  KVM: arm64: Move CMOs from user_mem_abort to the fault handlers
  KVM: arm64: Install the block entry before unmapping the page mappings
  KVM: arm64: Distinguish cases of memcache allocations completely

 arch/arm64/include/asm/kvm_mmu.h |  31 -
 arch/arm64/kvm/hyp/pgtable.c | 112 +++
 arch/arm64/kvm/mmu.c |  48 +
 3 files changed, 99 insertions(+), 92 deletions(-)

-- 
2.19.1



[RFC PATCH v4 7/9] KVM: selftests: List all hugetlb src types specified with page sizes

2021-03-02 Thread Yanan Wang
With VM_MEM_SRC_ANONYMOUS_HUGETLB, we currently can only use system
default hugetlb pages to back the testing guest memory. In order to
add flexibility, now list all the known hugetlb backing src types with
different page sizes, so that we can specify use of hugetlb pages of the
exact granularity that we want. And as all the known hugetlb page sizes
are listed, it's appropriate for all architectures.

Besides, the helper get_backing_src_pagesz() is added to get the
granularity of the different backing src types (anonymous, thp, hugetlb).
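A hedged usage sketch (only the helper and the enum value come from this
patch; the alignment step and the variable are illustrative):

    size_t pagesz = get_backing_src_pagesz(VM_MEM_SRC_ANONYMOUS_HUGETLB_1GB);

    /* e.g. align a guest memory base down to the backing granularity */
    guest_test_mem = guest_test_mem & ~(pagesz - 1);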

Suggested-by: Ben Gardon 
Signed-off-by: Yanan Wang 
---
 .../testing/selftests/kvm/include/test_util.h | 18 +-
 tools/testing/selftests/kvm/lib/kvm_util.c|  2 +-
 tools/testing/selftests/kvm/lib/test_util.c   | 59 +++
 3 files changed, 66 insertions(+), 13 deletions(-)

diff --git a/tools/testing/selftests/kvm/include/test_util.h 
b/tools/testing/selftests/kvm/include/test_util.h
index e087174eefe5..fade3130eb01 100644
--- a/tools/testing/selftests/kvm/include/test_util.h
+++ b/tools/testing/selftests/kvm/include/test_util.h
@@ -71,16 +71,32 @@ enum vm_mem_backing_src_type {
VM_MEM_SRC_ANONYMOUS,
VM_MEM_SRC_ANONYMOUS_THP,
VM_MEM_SRC_ANONYMOUS_HUGETLB,
+   VM_MEM_SRC_ANONYMOUS_HUGETLB_16KB,
+   VM_MEM_SRC_ANONYMOUS_HUGETLB_64KB,
+   VM_MEM_SRC_ANONYMOUS_HUGETLB_512KB,
+   VM_MEM_SRC_ANONYMOUS_HUGETLB_1MB,
+   VM_MEM_SRC_ANONYMOUS_HUGETLB_2MB,
+   VM_MEM_SRC_ANONYMOUS_HUGETLB_8MB,
+   VM_MEM_SRC_ANONYMOUS_HUGETLB_16MB,
+   VM_MEM_SRC_ANONYMOUS_HUGETLB_32MB,
+   VM_MEM_SRC_ANONYMOUS_HUGETLB_256MB,
+   VM_MEM_SRC_ANONYMOUS_HUGETLB_512MB,
+   VM_MEM_SRC_ANONYMOUS_HUGETLB_1GB,
+   VM_MEM_SRC_ANONYMOUS_HUGETLB_2GB,
+   VM_MEM_SRC_ANONYMOUS_HUGETLB_16GB,
+   NUM_SRC_TYPES,
 };
 
 struct vm_mem_backing_src_alias {
const char *name;
-   enum vm_mem_backing_src_type type;
+   uint32_t flag;
 };
 
 bool thp_configured(void);
 size_t get_trans_hugepagesz(void);
 size_t get_def_hugetlb_pagesz(void);
+const struct vm_mem_backing_src_alias *vm_mem_backing_src_alias(uint32_t i);
+size_t get_backing_src_pagesz(uint32_t i);
 void backing_src_help(void);
 enum vm_mem_backing_src_type parse_backing_src_type(const char *type_name);
 
diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c 
b/tools/testing/selftests/kvm/lib/kvm_util.c
index cc22c4ab7d67..b91c8e3a7ee1 100644
--- a/tools/testing/selftests/kvm/lib/kvm_util.c
+++ b/tools/testing/selftests/kvm/lib/kvm_util.c
@@ -757,7 +757,7 @@ void vm_userspace_mem_region_add(struct kvm_vm *vm,
region->mmap_start = mmap(NULL, region->mmap_size,
  PROT_READ | PROT_WRITE,
  MAP_PRIVATE | MAP_ANONYMOUS
- | (src_type == VM_MEM_SRC_ANONYMOUS_HUGETLB ? 
MAP_HUGETLB : 0),
+ | vm_mem_backing_src_alias(src_type)->flag,
  -1, 0);
TEST_ASSERT(region->mmap_start != MAP_FAILED,
"test_malloc failed, mmap_start: %p errno: %i",
diff --git a/tools/testing/selftests/kvm/lib/test_util.c 
b/tools/testing/selftests/kvm/lib/test_util.c
index 80d68dbd72d2..df8a42eff1f8 100644
--- a/tools/testing/selftests/kvm/lib/test_util.c
+++ b/tools/testing/selftests/kvm/lib/test_util.c
@@ -11,6 +11,7 @@
 #include 
 #include 
 #include 
+#include 
 #include "linux/kernel.h"
 
 #include "test_util.h"
@@ -112,12 +113,6 @@ void print_skip(const char *fmt, ...)
puts(", skipping test");
 }
 
-const struct vm_mem_backing_src_alias backing_src_aliases[] = {
-   {"anonymous", VM_MEM_SRC_ANONYMOUS,},
-   {"anonymous_thp", VM_MEM_SRC_ANONYMOUS_THP,},
-   {"anonymous_hugetlb", VM_MEM_SRC_ANONYMOUS_HUGETLB,},
-};
-
 bool thp_configured(void)
 {
int ret;
@@ -180,22 +175,64 @@ size_t get_def_hugetlb_pagesz(void)
return 0;
 }
 
+const struct vm_mem_backing_src_alias *vm_mem_backing_src_alias(uint32_t i)
+{
+   static const struct vm_mem_backing_src_alias aliases[] = {
+   { "anonymous",   0},
+   { "anonymous_thp",   0},
+   { "anonymous_hugetlb",   MAP_HUGETLB  },
+   { "anonymous_hugetlb_16kb",  MAP_HUGETLB | MAP_HUGE_16KB  },
+   { "anonymous_hugetlb_64kb",  MAP_HUGETLB | MAP_HUGE_64KB  },
+   { "anonymous_hugetlb_512kb", MAP_HUGETLB | MAP_HUGE_512KB },
+   { "anonymous_hugetlb_1mb",   MAP_HUGETLB | MAP_HUGE_1MB   },
+   { "anonymous_hugetlb_2mb",   MAP_HUGETLB | MAP_HUGE_2MB   },
+   { "anonymous_hugetlb_8mb",   MAP_HUGETLB | MAP_HUGE_8MB   },
+   { &

[RFC PATCH v4 4/9] KVM: selftests: Make a generic helper to get vm guest mode strings

2021-03-02 Thread Yanan Wang
For generality and conciseness, make an API which can be used in all
kvm libs and selftests to get vm guest mode strings. And the index i
is checked in the API to guard against possible out-of-range faults.
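A minimal usage sketch (pr_info() is the selftests logging helper; mode
is assumed to hold a valid enum vm_guest_mode value):

    pr_info("Testing guest mode: %s\n", vm_guest_mode_string(mode));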

Suggested-by: Sean Christopherson 
Signed-off-by: Yanan Wang 
Reviewed-by: Andrew Jones 
Reviewed-by: Ben Gardon 
---
 .../testing/selftests/kvm/include/kvm_util.h  |  4 +--
 tools/testing/selftests/kvm/lib/kvm_util.c| 29 ---
 2 files changed, 19 insertions(+), 14 deletions(-)

diff --git a/tools/testing/selftests/kvm/include/kvm_util.h 
b/tools/testing/selftests/kvm/include/kvm_util.h
index 2d7eb6989e83..f52a7492f47f 100644
--- a/tools/testing/selftests/kvm/include/kvm_util.h
+++ b/tools/testing/selftests/kvm/include/kvm_util.h
@@ -68,9 +68,6 @@ enum vm_guest_mode {
 #define MIN_PAGE_SIZE  (1U << MIN_PAGE_SHIFT)
 #define PTES_PER_MIN_PAGE  ptes_per_page(MIN_PAGE_SIZE)
 
-#define vm_guest_mode_string(m) vm_guest_mode_string[m]
-extern const char * const vm_guest_mode_string[];
-
 struct vm_guest_mode_params {
unsigned int pa_bits;
unsigned int va_bits;
@@ -84,6 +81,7 @@ int vm_enable_cap(struct kvm_vm *vm, struct kvm_enable_cap 
*cap);
 int vcpu_enable_cap(struct kvm_vm *vm, uint32_t vcpu_id,
struct kvm_enable_cap *cap);
 void vm_enable_dirty_ring(struct kvm_vm *vm, uint32_t ring_size);
+const char *vm_guest_mode_string(uint32_t i);
 
 struct kvm_vm *vm_create(enum vm_guest_mode mode, uint64_t phy_pages, int 
perm);
 void kvm_vm_free(struct kvm_vm *vmp);
diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c 
b/tools/testing/selftests/kvm/lib/kvm_util.c
index d787cb802b4a..cc22c4ab7d67 100644
--- a/tools/testing/selftests/kvm/lib/kvm_util.c
+++ b/tools/testing/selftests/kvm/lib/kvm_util.c
@@ -141,17 +141,24 @@ static void vm_open(struct kvm_vm *vm, int perm)
"rc: %i errno: %i", vm->fd, errno);
 }
 
-const char * const vm_guest_mode_string[] = {
-   "PA-bits:52,  VA-bits:48,  4K pages",
-   "PA-bits:52,  VA-bits:48, 64K pages",
-   "PA-bits:48,  VA-bits:48,  4K pages",
-   "PA-bits:48,  VA-bits:48, 64K pages",
-   "PA-bits:40,  VA-bits:48,  4K pages",
-   "PA-bits:40,  VA-bits:48, 64K pages",
-   "PA-bits:ANY, VA-bits:48,  4K pages",
-};
-_Static_assert(sizeof(vm_guest_mode_string)/sizeof(char *) == NUM_VM_MODES,
-  "Missing new mode strings?");
+const char *vm_guest_mode_string(uint32_t i)
+{
+   static const char * const strings[] = {
+   [VM_MODE_P52V48_4K] = "PA-bits:52,  VA-bits:48,  4K pages",
+   [VM_MODE_P52V48_64K]= "PA-bits:52,  VA-bits:48, 64K pages",
+   [VM_MODE_P48V48_4K] = "PA-bits:48,  VA-bits:48,  4K pages",
+   [VM_MODE_P48V48_64K]= "PA-bits:48,  VA-bits:48, 64K pages",
+   [VM_MODE_P40V48_4K] = "PA-bits:40,  VA-bits:48,  4K pages",
+   [VM_MODE_P40V48_64K]= "PA-bits:40,  VA-bits:48, 64K pages",
+   [VM_MODE_PXXV48_4K] = "PA-bits:ANY, VA-bits:48,  4K pages",
+   };
+   _Static_assert(sizeof(strings)/sizeof(char *) == NUM_VM_MODES,
+  "Missing new mode strings?");
+
+   TEST_ASSERT(i < NUM_VM_MODES, "Guest mode ID %d too big", i);
+
+   return strings[i];
+}
 
 const struct vm_guest_mode_params vm_guest_mode_params[] = {
{ 52, 48,  0x1000, 12 },
-- 
2.23.0



[RFC PATCH v4 3/9] KVM: selftests: Use flag CLOCK_MONOTONIC_RAW for timing

2021-03-02 Thread Yanan Wang
Unlike CLOCK_MONOTONIC, the CLOCK_MONOTONIC_RAW clock is not subject to
NTP frequency adjustments, so using it makes the time measurements more
robust.
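With this change, the measurement pattern throughout the selftests looks
like the following sketch (do_work() is a placeholder; timespec_elapsed()
is the helper from test_util.c):

    struct timespec start, ts_diff;

    clock_gettime(CLOCK_MONOTONIC_RAW, &start);
    do_work();
    ts_diff = timespec_elapsed(start);
    pr_info("Took %ld.%.9lds\n", ts_diff.tv_sec, ts_diff.tv_nsec);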

Suggested-by: Vitaly Kuznetsov 
Signed-off-by: Yanan Wang 
Reviewed-by: Ben Gardon 
---
 tools/testing/selftests/kvm/demand_paging_test.c  |  8 
 tools/testing/selftests/kvm/dirty_log_perf_test.c | 14 +++---
 tools/testing/selftests/kvm/lib/test_util.c   |  2 +-
 tools/testing/selftests/kvm/steal_time.c  |  4 ++--
 4 files changed, 14 insertions(+), 14 deletions(-)

diff --git a/tools/testing/selftests/kvm/demand_paging_test.c 
b/tools/testing/selftests/kvm/demand_paging_test.c
index 5f7a229c3af1..efbf0c1e9130 100644
--- a/tools/testing/selftests/kvm/demand_paging_test.c
+++ b/tools/testing/selftests/kvm/demand_paging_test.c
@@ -53,7 +53,7 @@ static void *vcpu_worker(void *data)
vcpu_args_set(vm, vcpu_id, 1, vcpu_id);
run = vcpu_state(vm, vcpu_id);
 
-   clock_gettime(CLOCK_MONOTONIC, &start);
+   clock_gettime(CLOCK_MONOTONIC_RAW, &start);
 
/* Let the guest access its memory */
ret = _vcpu_run(vm, vcpu_id);
@@ -86,7 +86,7 @@ static int handle_uffd_page_request(int uffd, uint64_t addr)
copy.len = perf_test_args.host_page_size;
copy.mode = 0;
 
-   clock_gettime(CLOCK_MONOTONIC, &start);
+   clock_gettime(CLOCK_MONOTONIC_RAW, &start);
 
r = ioctl(uffd, UFFDIO_COPY, ©);
if (r == -1) {
@@ -123,7 +123,7 @@ static void *uffd_handler_thread_fn(void *arg)
struct timespec start;
struct timespec ts_diff;
 
-   clock_gettime(CLOCK_MONOTONIC, &start);
+   clock_gettime(CLOCK_MONOTONIC_RAW, &start);
while (!quit_uffd_thread) {
struct uffd_msg msg;
struct pollfd pollfd[2];
@@ -336,7 +336,7 @@ static void run_test(enum vm_guest_mode mode, void *arg)
 
pr_info("Finished creating vCPUs and starting uffd threads\n");
 
-   clock_gettime(CLOCK_MONOTONIC, &start);
+   clock_gettime(CLOCK_MONOTONIC_RAW, &start);
 
for (vcpu_id = 0; vcpu_id < nr_vcpus; vcpu_id++) {
pthread_create(&vcpu_threads[vcpu_id], NULL, vcpu_worker,
diff --git a/tools/testing/selftests/kvm/dirty_log_perf_test.c 
b/tools/testing/selftests/kvm/dirty_log_perf_test.c
index 04a2641261be..6cff4ccf9525 100644
--- a/tools/testing/selftests/kvm/dirty_log_perf_test.c
+++ b/tools/testing/selftests/kvm/dirty_log_perf_test.c
@@ -50,7 +50,7 @@ static void *vcpu_worker(void *data)
while (!READ_ONCE(host_quit)) {
int current_iteration = READ_ONCE(iteration);
 
-   clock_gettime(CLOCK_MONOTONIC, &start);
+   clock_gettime(CLOCK_MONOTONIC_RAW, &start);
ret = _vcpu_run(vm, vcpu_id);
ts_diff = timespec_elapsed(start);
 
@@ -141,7 +141,7 @@ static void run_test(enum vm_guest_mode mode, void *arg)
iteration = 0;
host_quit = false;
 
-   clock_gettime(CLOCK_MONOTONIC, &start);
+   clock_gettime(CLOCK_MONOTONIC_RAW, &start);
for (vcpu_id = 0; vcpu_id < nr_vcpus; vcpu_id++) {
vcpu_last_completed_iteration[vcpu_id] = -1;
 
@@ -162,7 +162,7 @@ static void run_test(enum vm_guest_mode mode, void *arg)
ts_diff.tv_sec, ts_diff.tv_nsec);
 
/* Enable dirty logging */
-   clock_gettime(CLOCK_MONOTONIC, &start);
+   clock_gettime(CLOCK_MONOTONIC_RAW, &start);
vm_mem_region_set_flags(vm, PERF_TEST_MEM_SLOT_INDEX,
KVM_MEM_LOG_DIRTY_PAGES);
ts_diff = timespec_elapsed(start);
@@ -174,7 +174,7 @@ static void run_test(enum vm_guest_mode mode, void *arg)
 * Incrementing the iteration number will start the vCPUs
 * dirtying memory again.
 */
-   clock_gettime(CLOCK_MONOTONIC, &start);
+   clock_gettime(CLOCK_MONOTONIC_RAW, &start);
iteration++;
 
pr_debug("Starting iteration %d\n", iteration);
@@ -189,7 +189,7 @@ static void run_test(enum vm_guest_mode mode, void *arg)
pr_info("Iteration %d dirty memory time: %ld.%.9lds\n",
iteration, ts_diff.tv_sec, ts_diff.tv_nsec);
 
-   clock_gettime(CLOCK_MONOTONIC, &start);
+   clock_gettime(CLOCK_MONOTONIC_RAW, &start);
kvm_vm_get_dirty_log(vm, PERF_TEST_MEM_SLOT_INDEX, bmap);
 
ts_diff = timespec_elapsed(start);
@@ -199,7 +199,7 @@ static void run_test(enum vm_guest_mode mode, void *arg)
iteration, ts_diff.tv_sec, ts_diff.tv_nsec);
 
if (dirty_log_manual_caps) {
-   clock_gettime(CLOCK_MONOTONIC, &start);
+   clock_gettime(CLOCK_MONOTONIC_RAW, &start);
kvm_vm_clear_dirty_log(vm

[RFC PATCH v4 0/9] KVM: selftests: some improvement and a new test for kvm page table

2021-03-02 Thread Yanan Wang
Hi,
This v4 series can mainly include two parts.
Based on kvm queue branch: 
https://git.kernel.org/pub/scm/virt/kvm/kvm.git/log/?h=queue
Links of v1: 
https://lore.kernel.org/lkml/20210208090841.333724-1-wangyana...@huawei.com/
Links of v2: 
https://lore.kernel.org/lkml/20210225055940.18748-1-wangyana...@huawei.com/
Links of v3: 
https://lore.kernel.org/lkml/20210301065916.11484-1-wangyana...@huawei.com/

In the first part, all the known hugetlb backing src types with
different hugepage sizes are listed, so that we can specify use of a
hugetlb source of the exact granularity that we want, instead of the
system default one. And as all the known hugetlb page sizes are
listed, it's appropriate for all architectures. Besides, a helper that
gets the granularity of the different backing src types (anonymous/
thp/hugetlb) is added, so that we can use the accurate backing src
granularity for alignment and for the guest memory accesses of vcpus.

In the second part, a new test is added:
It serves as a performance tester and a bug reproducer for the kvm
page table code (GPA->HPA mappings), so it gives guidance to people
trying to improve kvm. The following explains what exactly we can do
through this test.

The function guest_code() can cover the conditions where a single vcpu or
multiple vcpus access guest pages within the same memory region, in three
VM stages (before dirty logging, during dirty logging, after dirty logging).
Besides, the backing src memory type (ANONYMOUS/THP/HUGETLB) of the tested
memory region can be specified by users, which means users can choose
whether normal page mappings or block mappings are created in the test.

If ANONYMOUS memory is specified, kvm will create normal page mappings
for the tested memory region before dirty logging, and update attributes
of the page mappings from RO to RW during dirty logging. If THP/HUGETLB
memory is specified, kvm will create block mappings for the tested memory
region before dirty logging, split the block mappings into normal page
mappings during dirty logging, and coalesce the page mappings back into
block mappings after dirty logging is stopped.

So in summary, as a performance tester, this test can present the
performance of kvm creating/updating normal page mappings, or the
performance of kvm creating/splitting/recovering block mappings,
through execution time.

When we need to coalesce the page mappings back to block mappings after
dirty logging is stopped, we must first invalidate *all* the TLB entries
for the page mappings right before installation of the block entry,
because a TLB conflict abort error could occur if we fail to invalidate
the TLB entries fully. We have hit this TLB conflict twice on an aarch64
software implementation and fixed it. As this test can simulate the
process of a VM with block mappings going from dirty logging enabled to
dirty logging stopped, it can also reproduce this TLB conflict abort
caused by inadequate TLB invalidation when coalescing tables.

Links about the TLB conflict abort:
https://lore.kernel.org/lkml/20201201201034.116760-3-wangyana...@huawei.com/

---

Change logs:

v3->v4:
- Add a helper to get system default hugetlb page size
- Add tags of Reviewed-by of Ben in the patches

v2->v3:
- Add tags of Suggested-by, Reviewed-by in the patches
- Add a generic macro to get hugetlb page sizes
- Some changes for suggestions about v2 series

v1->v2:
- Add a patch to sync header files
- Add helpers to get granularity of different backing src types
- Some changes for suggestions about v1 series

---

Yanan Wang (9):
  tools headers: sync headers of asm-generic/hugetlb_encode.h
  tools headers: Add a macro to get HUGETLB page sizes for mmap
  KVM: selftests: Use flag CLOCK_MONOTONIC_RAW for timing
  KVM: selftests: Make a generic helper to get vm guest mode strings
  KVM: selftests: Add a helper to get system configured THP page size
  KVM: selftests: Add a helper to get system default hugetlb page size
  KVM: selftests: List all hugetlb src types specified with page sizes
  KVM: selftests: Adapt vm_userspace_mem_region_add to new helpers
  KVM: selftests: Add a test for kvm page table code

 include/uapi/linux/mman.h |   2 +
 tools/include/asm-generic/hugetlb_encode.h|   3 +
 tools/include/uapi/linux/mman.h   |   2 +
 tools/testing/selftests/kvm/Makefile  |   3 +
 .../selftests/kvm/demand_paging_test.c|   8 +-
 .../selftests/kvm/dirty_log_perf_test.c   |  14 +-
 .../testing/selftests/kvm/include/kvm_util.h  |   4 +-
 .../testing/selftests/kvm/include/test_util.h |  21 +-
 .../selftests/kvm/kvm_page_table_test.c   | 476 ++
 tools/testing/selftests/kvm/lib/kvm_util.c|  59 ++-
 tools/testing/selftests/kvm/lib/test_util.c   | 122 -
 tools/testing/selftests/kvm/steal_time.c  |   4 +-
 12 files changed, 659 insertions(+), 59 deletions(-)
 create mode 100644 tools

[RFC PATCH v4 6/9] KVM: selftests: Add a helper to get system default hugetlb page size

2021-03-02 Thread Yanan Wang
If HUGETLB is configured in the host kernel, the system default hugetlb
page size can be found via *cat /proc/meminfo*. Otherwise, /proc/meminfo
contains no hugetlb information at all. So add a helper that determines
whether HUGETLB is configured and then gets the default page size by
reading /proc/meminfo.

This helper can be useful when a program wants to use the default hugetlb
pages of the system and doesn't know the default page size.
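A hedged usage sketch:

    /* e.g. returns 2MB (2097152) on a typical x86_64 host */
    size_t hugepage_size = get_def_hugetlb_pagesz();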

Signed-off-by: Yanan Wang 
---
 .../testing/selftests/kvm/include/test_util.h |  1 +
 tools/testing/selftests/kvm/lib/test_util.c   | 27 +++
 2 files changed, 28 insertions(+)

diff --git a/tools/testing/selftests/kvm/include/test_util.h 
b/tools/testing/selftests/kvm/include/test_util.h
index ef24c76ba89a..e087174eefe5 100644
--- a/tools/testing/selftests/kvm/include/test_util.h
+++ b/tools/testing/selftests/kvm/include/test_util.h
@@ -80,6 +80,7 @@ struct vm_mem_backing_src_alias {
 
 bool thp_configured(void);
 size_t get_trans_hugepagesz(void);
+size_t get_def_hugetlb_pagesz(void);
 void backing_src_help(void);
 enum vm_mem_backing_src_type parse_backing_src_type(const char *type_name);
 
diff --git a/tools/testing/selftests/kvm/lib/test_util.c 
b/tools/testing/selftests/kvm/lib/test_util.c
index f2d133f76c67..80d68dbd72d2 100644
--- a/tools/testing/selftests/kvm/lib/test_util.c
+++ b/tools/testing/selftests/kvm/lib/test_util.c
@@ -153,6 +153,33 @@ size_t get_trans_hugepagesz(void)
return size;
 }
 
+size_t get_def_hugetlb_pagesz(void)
+{
+   char buf[64];
+   const char *tag = "Hugepagesize:";
+   FILE *f;
+
+   f = fopen("/proc/meminfo", "r");
+   TEST_ASSERT(f != NULL, "Error in opening /proc/meminfo: %d", errno);
+
+   while (fgets(buf, sizeof(buf), f) != NULL) {
+   if (strstr(buf, tag) == buf) {
+   fclose(f);
+   return strtoull(buf + strlen(tag), NULL, 10) << 10;
+   }
+   }
+
+   if (feof(f)) {
+   fclose(f);
+   TEST_FAIL("HUGETLB is not configured in host kernel");
+   } else {
+   fclose(f);
+   TEST_FAIL("Error in reading /proc/meminfo: %d", errno);
+   }
+
+   return 0;
+}
+
 void backing_src_help(void)
 {
int i;
-- 
2.23.0



[RFC PATCH v4 2/9] tools headers: Add a macro to get HUGETLB page sizes for mmap

2021-03-02 Thread Yanan Wang
If a system supports multiple hugetlb page sizes, the desired hugetlb
page size can be specified in bits [26:31] of the mmap() flags
argument. The value in these 6 bits is the base-2 log (the shift) of
the corresponding hugetlb page size.

So add a macro that extracts the page size shift from flag x and then
calculates the corresponding hugetlb page size.
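A standalone sketch that checks the arithmetic (the constants are
re-derived here only to make the example self-contained; they match the
uapi definitions):

    #include <stdio.h>

    #define MAP_HUGE_SHIFT 26
    #define MAP_HUGE_MASK  0x3f
    #define MAP_HUGE_2MB   (21 << MAP_HUGE_SHIFT)  /* shift 21: 2^21 = 2MB */
    #define MAP_HUGE_1GB   (30 << MAP_HUGE_SHIFT)  /* shift 30: 2^30 = 1GB */
    #define MAP_HUGE_PAGE_SIZE(x) (1 << ((x >> MAP_HUGE_SHIFT) & MAP_HUGE_MASK))

    int main(void)
    {
            printf("%d\n", MAP_HUGE_PAGE_SIZE(MAP_HUGE_2MB)); /* 2097152 */
            printf("%d\n", MAP_HUGE_PAGE_SIZE(MAP_HUGE_1GB)); /* 1073741824 */
            return 0;
    }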

Cc: Ben Gardon 
Cc: Ingo Molnar 
Cc: Adrian Hunter 
Cc: Jiri Olsa 
Cc: Arnaldo Carvalho de Melo 
Cc: Arnd Bergmann 
Cc: Michael Kerrisk 
Cc: Thomas Gleixner 
Suggested-by: Ben Gardon 
Signed-off-by: Yanan Wang 
Reviewed-by: Ben Gardon 
---
 include/uapi/linux/mman.h   | 2 ++
 tools/include/uapi/linux/mman.h | 2 ++
 2 files changed, 4 insertions(+)

diff --git a/include/uapi/linux/mman.h b/include/uapi/linux/mman.h
index f55bc680b5b0..8bd41128a0ee 100644
--- a/include/uapi/linux/mman.h
+++ b/include/uapi/linux/mman.h
@@ -41,4 +41,6 @@
 #define MAP_HUGE_2GB   HUGETLB_FLAG_ENCODE_2GB
 #define MAP_HUGE_16GB  HUGETLB_FLAG_ENCODE_16GB
 
+#define MAP_HUGE_PAGE_SIZE(x) (1 << ((x >> MAP_HUGE_SHIFT) & MAP_HUGE_MASK))
+
 #endif /* _UAPI_LINUX_MMAN_H */
diff --git a/tools/include/uapi/linux/mman.h b/tools/include/uapi/linux/mman.h
index f55bc680b5b0..8bd41128a0ee 100644
--- a/tools/include/uapi/linux/mman.h
+++ b/tools/include/uapi/linux/mman.h
@@ -41,4 +41,6 @@
 #define MAP_HUGE_2GB   HUGETLB_FLAG_ENCODE_2GB
 #define MAP_HUGE_16GB  HUGETLB_FLAG_ENCODE_16GB
 
+#define MAP_HUGE_PAGE_SIZE(x) (1 << ((x >> MAP_HUGE_SHIFT) & MAP_HUGE_MASK))
+
 #endif /* _UAPI_LINUX_MMAN_H */
-- 
2.23.0



[RFC PATCH v4 1/9] tools headers: sync headers of asm-generic/hugetlb_encode.h

2021-03-02 Thread Yanan Wang
This patch syncs contents of tools/include/asm-generic/hugetlb_encode.h
and include/uapi/asm-generic/hugetlb_encode.h. Arch powerpc supports 16KB
hugepages and ARM64 supports 32MB/512MB hugepages. The corresponding mmap
flags have already been added in include/uapi/asm-generic/hugetlb_encode.h,
but not tools/include/asm-generic/hugetlb_encode.h.
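The encoded value is simply the base-2 log of the page size, so the new
entries can be sanity-checked by hand:

    /* 1 << 14 = 16KB, 1 << 25 = 32MB, 1 << 29 = 512MB,
     * hence the encodings 14, 25 and 29 in the hunk below. */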

Cc: Ingo Molnar 
Cc: Adrian Hunter 
Cc: Jiri Olsa 
Cc: Arnaldo Carvalho de Melo 
Signed-off-by: Yanan Wang 
Reviewed-by: Ben Gardon 
---
 tools/include/asm-generic/hugetlb_encode.h | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/tools/include/asm-generic/hugetlb_encode.h 
b/tools/include/asm-generic/hugetlb_encode.h
index e4732d3c2998..4f3d5aaa11f5 100644
--- a/tools/include/asm-generic/hugetlb_encode.h
+++ b/tools/include/asm-generic/hugetlb_encode.h
@@ -20,13 +20,16 @@
 #define HUGETLB_FLAG_ENCODE_SHIFT  26
 #define HUGETLB_FLAG_ENCODE_MASK   0x3f
 
+#define HUGETLB_FLAG_ENCODE_16KB   (14 << HUGETLB_FLAG_ENCODE_SHIFT)
 #define HUGETLB_FLAG_ENCODE_64KB   (16 << HUGETLB_FLAG_ENCODE_SHIFT)
 #define HUGETLB_FLAG_ENCODE_512KB  (19 << HUGETLB_FLAG_ENCODE_SHIFT)
 #define HUGETLB_FLAG_ENCODE_1MB(20 << 
HUGETLB_FLAG_ENCODE_SHIFT)
 #define HUGETLB_FLAG_ENCODE_2MB(21 << 
HUGETLB_FLAG_ENCODE_SHIFT)
 #define HUGETLB_FLAG_ENCODE_8MB(23 << 
HUGETLB_FLAG_ENCODE_SHIFT)
 #define HUGETLB_FLAG_ENCODE_16MB   (24 << HUGETLB_FLAG_ENCODE_SHIFT)
+#define HUGETLB_FLAG_ENCODE_32MB   (25 << HUGETLB_FLAG_ENCODE_SHIFT)
 #define HUGETLB_FLAG_ENCODE_256MB  (28 << HUGETLB_FLAG_ENCODE_SHIFT)
+#define HUGETLB_FLAG_ENCODE_512MB  (29 << HUGETLB_FLAG_ENCODE_SHIFT)
 #define HUGETLB_FLAG_ENCODE_1GB(30 << 
HUGETLB_FLAG_ENCODE_SHIFT)
 #define HUGETLB_FLAG_ENCODE_2GB(31 << 
HUGETLB_FLAG_ENCODE_SHIFT)
 #define HUGETLB_FLAG_ENCODE_16GB   (34 << HUGETLB_FLAG_ENCODE_SHIFT)
-- 
2.23.0



[RFC PATCH v4 9/9] KVM: selftests: Add a test for kvm page table code

2021-03-02 Thread Yanan Wang
This test serves as a performance tester and a bug reproducer for
the kvm page table code (GPA->HPA mappings), so it gives guidance to
people trying to improve kvm.

The function guest_code() can cover the conditions where a single vcpu or
multiple vcpus access guest pages within the same memory region, in three
VM stages(before dirty logging, during dirty logging, after dirty logging).
Besides, the backing src memory type(ANONYMOUS/THP/HUGETLB) of the tested
memory region can be specified by users, which means normal page mappings
or block mappings can be chosen by users to be created in the test.

If ANONYMOUS memory is specified, kvm will create normal page mappings
for the tested memory region before dirty logging, and update attributes
of the page mappings from RO to RW during dirty logging. If THP/HUGETLB
memory is specified, kvm will create block mappings for the tested memory
region before dirty logging, split the block mappings into normal page
mappings during dirty logging, and coalesce the page mappings back into
block mappings after dirty logging is stopped.

So in summary, as a performance tester, this test can present the
performance of kvm creating/updating normal page mappings, or the
performance of kvm creating/splitting/recovering block mappings,
through execution time.

When we need to coalesce the page mappings back to block mappings after
dirty logging is stopped, we must first invalidate *all* the TLB entries
for the page mappings right before installation of the block entry,
because a TLB conflict abort error could occur if we fail to invalidate
the TLB entries fully. We have hit this TLB conflict twice on an aarch64
software implementation and fixed it. As this test can simulate the
process of a VM with block mappings going from dirty logging enabled to
dirty logging stopped, it can also reproduce this TLB conflict abort
caused by inadequate TLB invalidation when coalescing tables.

Signed-off-by: Yanan Wang 
Reviewed-by: Ben Gardon 
---
 tools/testing/selftests/kvm/Makefile  |   3 +
 .../selftests/kvm/kvm_page_table_test.c   | 476 ++
 2 files changed, 479 insertions(+)
 create mode 100644 tools/testing/selftests/kvm/kvm_page_table_test.c

diff --git a/tools/testing/selftests/kvm/Makefile 
b/tools/testing/selftests/kvm/Makefile
index a6d61f451f88..bac81924166d 100644
--- a/tools/testing/selftests/kvm/Makefile
+++ b/tools/testing/selftests/kvm/Makefile
@@ -67,6 +67,7 @@ TEST_GEN_PROGS_x86_64 += x86_64/xen_vmcall_test
 TEST_GEN_PROGS_x86_64 += demand_paging_test
 TEST_GEN_PROGS_x86_64 += dirty_log_test
 TEST_GEN_PROGS_x86_64 += dirty_log_perf_test
+TEST_GEN_PROGS_x86_64 += kvm_page_table_test
 TEST_GEN_PROGS_x86_64 += hardware_disable_test
 TEST_GEN_PROGS_x86_64 += kvm_create_max_vcpus
 TEST_GEN_PROGS_x86_64 += memslot_modification_stress_test
@@ -78,6 +79,7 @@ TEST_GEN_PROGS_aarch64 += aarch64/get-reg-list-sve
 TEST_GEN_PROGS_aarch64 += demand_paging_test
 TEST_GEN_PROGS_aarch64 += dirty_log_test
 TEST_GEN_PROGS_aarch64 += dirty_log_perf_test
+TEST_GEN_PROGS_aarch64 += kvm_page_table_test
 TEST_GEN_PROGS_aarch64 += kvm_create_max_vcpus
 TEST_GEN_PROGS_aarch64 += set_memory_region_test
 TEST_GEN_PROGS_aarch64 += steal_time
@@ -87,6 +89,7 @@ TEST_GEN_PROGS_s390x += s390x/resets
 TEST_GEN_PROGS_s390x += s390x/sync_regs_test
 TEST_GEN_PROGS_s390x += demand_paging_test
 TEST_GEN_PROGS_s390x += dirty_log_test
+TEST_GEN_PROGS_s390x += kvm_page_table_test
 TEST_GEN_PROGS_s390x += kvm_create_max_vcpus
 TEST_GEN_PROGS_s390x += set_memory_region_test
 
diff --git a/tools/testing/selftests/kvm/kvm_page_table_test.c 
b/tools/testing/selftests/kvm/kvm_page_table_test.c
new file mode 100644
index ..032b49d1483b
--- /dev/null
+++ b/tools/testing/selftests/kvm/kvm_page_table_test.c
@@ -0,0 +1,476 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * KVM page table test
+ *
+ * Copyright (C) 2021, Huawei, Inc.
+ *
+ * Make sure that THP has been enabled or enough HUGETLB pages with specific
+ * page size have been pre-allocated on your system, if you are planning to
+ * use hugepages to back the guest memory for testing.
+ */
+
+#define _GNU_SOURCE /* for program_invocation_name */
+
+#include 
+#include 
+#include 
+#include 
+
+#include "test_util.h"
+#include "kvm_util.h"
+#include "processor.h"
+#include "guest_modes.h"
+
+#define TEST_MEM_SLOT_INDEX 1
+
+/* Default size(1GB) of the memory for testing */
+#define DEFAULT_TEST_MEM_SIZE  (1 << 30)
+
+/* Default guest test virtual memory offset */
+#define DEFAULT_GUEST_TEST_MEM 0xc0000000
+
+/* Number of guest memory accessing types(read/write) */
+#define NUM_ACCESS_TYPES   2
+
+/* Different guest memory accessing stages */
+enum test_stage {
+   KVM_BEFORE_MAPPINGS,
+   KVM_CREATE_MAPPINGS,
+   KVM_UPDATE_MAPPINGS,
+   KVM_ADJUST_MAPPINGS,
+   NUM_TEST_STAGES,
+};
+
+static const char * cons

[RFC PATCH v4 8/9] KVM: selftests: Adapt vm_userspace_mem_region_add to new helpers

2021-03-02 Thread Yanan Wang
With VM_MEM_SRC_ANONYMOUS_THP specified in vm_userspace_mem_region_add(),
we have to get the transparent hugepage size for HVA alignment. With the
new helpers, we can use get_backing_src_pagesz() to check whether THP is
configured and then get the exact configured hugepage size.

As different architectures may have different THP page sizes configured,
this gets the accurate THP page size on any platform.
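A concrete illustration of why the hardcoded constant had to go (an
informal sketch; KVM_UTIL_PGS_PER_HUGEPG is the constant removed in the
hunk below):

    /*
     * Old logic: huge_page_size = 512 * vm->page_size
     *   4KB base pages  -> 2MB   (happens to match 4K-granule THP)
     *   64KB base pages -> 32MB  (wrong: arm64 64K-granule THP is 512MB)
     * New logic: get_backing_src_pagesz() reports the real THP size.
     */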

Signed-off-by: Yanan Wang 
Reviewed-by: Ben Gardon 
---
 tools/testing/selftests/kvm/lib/kvm_util.c | 28 +++---
 1 file changed, 9 insertions(+), 19 deletions(-)

diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c 
b/tools/testing/selftests/kvm/lib/kvm_util.c
index b91c8e3a7ee1..b29402f9f00c 100644
--- a/tools/testing/selftests/kvm/lib/kvm_util.c
+++ b/tools/testing/selftests/kvm/lib/kvm_util.c
@@ -18,7 +18,6 @@
 #include 
 #include 
 
-#define KVM_UTIL_PGS_PER_HUGEPG 512
 #define KVM_UTIL_MIN_PFN   2
 
 /* Aligns x up to the next multiple of size. Size must be a power of 2. */
@@ -686,7 +685,7 @@ void vm_userspace_mem_region_add(struct kvm_vm *vm,
 {
int ret;
struct userspace_mem_region *region;
-   size_t huge_page_size = KVM_UTIL_PGS_PER_HUGEPG * vm->page_size;
+   size_t backing_src_pagesz = get_backing_src_pagesz(src_type);
size_t alignment;
 
TEST_ASSERT(vm_adjust_num_guest_pages(vm->mode, npages) == npages,
@@ -748,7 +747,7 @@ void vm_userspace_mem_region_add(struct kvm_vm *vm,
 #endif
 
if (src_type == VM_MEM_SRC_ANONYMOUS_THP)
-   alignment = max(huge_page_size, alignment);
+   alignment = max(backing_src_pagesz, alignment);
 
/* Add enough memory to align up if necessary */
if (alignment > 1)
@@ -767,22 +766,13 @@ void vm_userspace_mem_region_add(struct kvm_vm *vm,
region->host_mem = align(region->mmap_start, alignment);
 
/* As needed perform madvise */
-   if (src_type == VM_MEM_SRC_ANONYMOUS || src_type == 
VM_MEM_SRC_ANONYMOUS_THP) {
-   struct stat statbuf;
-
-   ret = stat("/sys/kernel/mm/transparent_hugepage", &statbuf);
-   TEST_ASSERT(ret == 0 || (ret == -1 && errno == ENOENT),
-   "stat /sys/kernel/mm/transparent_hugepage");
-
-   TEST_ASSERT(ret == 0 || src_type != VM_MEM_SRC_ANONYMOUS_THP,
-   "VM_MEM_SRC_ANONYMOUS_THP requires THP to be 
configured in the host kernel");
-
-   if (ret == 0) {
-   ret = madvise(region->host_mem, npages * vm->page_size,
- src_type == VM_MEM_SRC_ANONYMOUS ? 
MADV_NOHUGEPAGE : MADV_HUGEPAGE);
-   TEST_ASSERT(ret == 0, "madvise failed, addr: %p length: 
0x%lx src_type: %x",
-   region->host_mem, npages * vm->page_size, 
src_type);
-   }
+   if ((src_type == VM_MEM_SRC_ANONYMOUS ||
+src_type == VM_MEM_SRC_ANONYMOUS_THP) && thp_configured()) {
+   ret = madvise(region->host_mem, npages * vm->page_size,
+ src_type == VM_MEM_SRC_ANONYMOUS ? 
MADV_NOHUGEPAGE : MADV_HUGEPAGE);
+   TEST_ASSERT(ret == 0, "madvise failed, addr: %p length: 0x%lx 
src_type: %s",
+   region->host_mem, npages * vm->page_size,
+   vm_mem_backing_src_alias(src_type)->name);
}
 
region->unused_phy_pages = sparsebit_alloc();
-- 
2.23.0



[RFC PATCH v4 5/9] KVM: selftests: Add a helper to get system configured THP page size

2021-03-02 Thread Yanan Wang
If we want to have some tests about transparent hugepages, the tests
should know the system configured THP hugepage size, which can then be
used for alignment and for the guest memory accesses of vcpus. So it
makes sense to add a helper to get the transparent hugepage size.

With VM_MEM_SRC_ANONYMOUS_THP specified in vm_userspace_mem_region_add(),
we now stat /sys/kernel/mm/transparent_hugepage to check whether THP is
configured in the host kernel before madvise(). Based on this, we can also
read file /sys/kernel/mm/transparent_hugepage/hpage_pmd_size to get THP
hugepage size.
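A hedged usage sketch:

    if (thp_configured())
            printf("THP page size: %zu bytes\n", get_trans_hugepagesz());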

Signed-off-by: Yanan Wang 
Reviewed-by: Ben Gardon 
---
 .../testing/selftests/kvm/include/test_util.h |  2 ++
 tools/testing/selftests/kvm/lib/test_util.c   | 36 +++
 2 files changed, 38 insertions(+)

diff --git a/tools/testing/selftests/kvm/include/test_util.h 
b/tools/testing/selftests/kvm/include/test_util.h
index b7f41399f22c..ef24c76ba89a 100644
--- a/tools/testing/selftests/kvm/include/test_util.h
+++ b/tools/testing/selftests/kvm/include/test_util.h
@@ -78,6 +78,8 @@ struct vm_mem_backing_src_alias {
enum vm_mem_backing_src_type type;
 };
 
+bool thp_configured(void);
+size_t get_trans_hugepagesz(void);
 void backing_src_help(void);
 enum vm_mem_backing_src_type parse_backing_src_type(const char *type_name);
 
diff --git a/tools/testing/selftests/kvm/lib/test_util.c 
b/tools/testing/selftests/kvm/lib/test_util.c
index c7c0627c6842..f2d133f76c67 100644
--- a/tools/testing/selftests/kvm/lib/test_util.c
+++ b/tools/testing/selftests/kvm/lib/test_util.c
@@ -10,6 +10,7 @@
 #include 
 #include 
 #include 
+#include 
 #include "linux/kernel.h"
 
 #include "test_util.h"
@@ -117,6 +118,41 @@ const struct vm_mem_backing_src_alias 
backing_src_aliases[] = {
{"anonymous_hugetlb", VM_MEM_SRC_ANONYMOUS_HUGETLB,},
 };
 
+bool thp_configured(void)
+{
+   int ret;
+   struct stat statbuf;
+
+   ret = stat("/sys/kernel/mm/transparent_hugepage", &statbuf);
+   TEST_ASSERT(ret == 0 || (ret == -1 && errno == ENOENT),
+   "Error in stating /sys/kernel/mm/transparent_hugepage: %d",
+   errno);
+
+   return ret == 0;
+}
+
+size_t get_trans_hugepagesz(void)
+{
+   size_t size;
+   char buf[16];
+   FILE *f;
+
+   TEST_ASSERT(thp_configured(), "THP is not configured in host kernel");
+
+   f = fopen("/sys/kernel/mm/transparent_hugepage/hpage_pmd_size", "r");
+   TEST_ASSERT(f != NULL,
+   "Error in opening transparent_hugepage/hpage_pmd_size: %d",
+   errno);
+
+   if (fread(buf, sizeof(char), sizeof(buf), f) == 0) {
+   fclose(f);
+   TEST_FAIL("Unable to read transparent_hugepage/hpage_pmd_size");
+   }
+
+   size = strtoull(buf, NULL, 10);
+   return size;
+}
+
 void backing_src_help(void)
 {
int i;
-- 
2.23.0



[RFC PATCH v3 4/8] KVM: selftests: Make a generic helper to get vm guest mode strings

2021-02-28 Thread Yanan Wang
For generality and conciseness, make an API which can be used in all
kvm libs and selftests to get vm guest mode strings. And the index i
is checked in the API to guard against possible out-of-range faults.

Reviewed-by: Andrew Jones 
Suggested-by: Sean Christopherson 
Signed-off-by: Yanan Wang 
---
 .../testing/selftests/kvm/include/kvm_util.h  |  4 +--
 tools/testing/selftests/kvm/lib/kvm_util.c| 29 ---
 2 files changed, 19 insertions(+), 14 deletions(-)

diff --git a/tools/testing/selftests/kvm/include/kvm_util.h 
b/tools/testing/selftests/kvm/include/kvm_util.h
index 2d7eb6989e83..f52a7492f47f 100644
--- a/tools/testing/selftests/kvm/include/kvm_util.h
+++ b/tools/testing/selftests/kvm/include/kvm_util.h
@@ -68,9 +68,6 @@ enum vm_guest_mode {
 #define MIN_PAGE_SIZE  (1U << MIN_PAGE_SHIFT)
 #define PTES_PER_MIN_PAGE  ptes_per_page(MIN_PAGE_SIZE)
 
-#define vm_guest_mode_string(m) vm_guest_mode_string[m]
-extern const char * const vm_guest_mode_string[];
-
 struct vm_guest_mode_params {
unsigned int pa_bits;
unsigned int va_bits;
@@ -84,6 +81,7 @@ int vm_enable_cap(struct kvm_vm *vm, struct kvm_enable_cap 
*cap);
 int vcpu_enable_cap(struct kvm_vm *vm, uint32_t vcpu_id,
struct kvm_enable_cap *cap);
 void vm_enable_dirty_ring(struct kvm_vm *vm, uint32_t ring_size);
+const char *vm_guest_mode_string(uint32_t i);
 
 struct kvm_vm *vm_create(enum vm_guest_mode mode, uint64_t phy_pages, int 
perm);
 void kvm_vm_free(struct kvm_vm *vmp);
diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c 
b/tools/testing/selftests/kvm/lib/kvm_util.c
index d787cb802b4a..cc22c4ab7d67 100644
--- a/tools/testing/selftests/kvm/lib/kvm_util.c
+++ b/tools/testing/selftests/kvm/lib/kvm_util.c
@@ -141,17 +141,24 @@ static void vm_open(struct kvm_vm *vm, int perm)
"rc: %i errno: %i", vm->fd, errno);
 }
 
-const char * const vm_guest_mode_string[] = {
-   "PA-bits:52,  VA-bits:48,  4K pages",
-   "PA-bits:52,  VA-bits:48, 64K pages",
-   "PA-bits:48,  VA-bits:48,  4K pages",
-   "PA-bits:48,  VA-bits:48, 64K pages",
-   "PA-bits:40,  VA-bits:48,  4K pages",
-   "PA-bits:40,  VA-bits:48, 64K pages",
-   "PA-bits:ANY, VA-bits:48,  4K pages",
-};
-_Static_assert(sizeof(vm_guest_mode_string)/sizeof(char *) == NUM_VM_MODES,
-  "Missing new mode strings?");
+const char *vm_guest_mode_string(uint32_t i)
+{
+   static const char * const strings[] = {
+   [VM_MODE_P52V48_4K] = "PA-bits:52,  VA-bits:48,  4K pages",
+   [VM_MODE_P52V48_64K]= "PA-bits:52,  VA-bits:48, 64K pages",
+   [VM_MODE_P48V48_4K] = "PA-bits:48,  VA-bits:48,  4K pages",
+   [VM_MODE_P48V48_64K]= "PA-bits:48,  VA-bits:48, 64K pages",
+   [VM_MODE_P40V48_4K] = "PA-bits:40,  VA-bits:48,  4K pages",
+   [VM_MODE_P40V48_64K]= "PA-bits:40,  VA-bits:48, 64K pages",
+   [VM_MODE_PXXV48_4K] = "PA-bits:ANY, VA-bits:48,  4K pages",
+   };
+   _Static_assert(sizeof(strings)/sizeof(char *) == NUM_VM_MODES,
+  "Missing new mode strings?");
+
+   TEST_ASSERT(i < NUM_VM_MODES, "Guest mode ID %d too big", i);
+
+   return strings[i];
+}
 
 const struct vm_guest_mode_params vm_guest_mode_params[] = {
{ 52, 48,  0x1000, 12 },
-- 
2.23.0



[RFC PATCH v3 1/8] tools headers: sync headers of asm-generic/hugetlb_encode.h

2021-02-28 Thread Yanan Wang
This patch syncs contents of tools/include/asm-generic/hugetlb_encode.h
and include/uapi/asm-generic/hugetlb_encode.h. Arch powerpc supports 16KB
hugepages and ARM64 supports 32MB/512MB hugepages. The corresponding mmap
flags have already been added in include/uapi/asm-generic/hugetlb_encode.h,
but not tools/include/asm-generic/hugetlb_encode.h.

Cc: Ingo Molnar 
Cc: Adrian Hunter 
Cc: Jiri Olsa 
Cc: Arnaldo Carvalho de Melo 
Signed-off-by: Yanan Wang 
---
 tools/include/asm-generic/hugetlb_encode.h | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/tools/include/asm-generic/hugetlb_encode.h 
b/tools/include/asm-generic/hugetlb_encode.h
index e4732d3c2998..4f3d5aaa11f5 100644
--- a/tools/include/asm-generic/hugetlb_encode.h
+++ b/tools/include/asm-generic/hugetlb_encode.h
@@ -20,13 +20,16 @@
 #define HUGETLB_FLAG_ENCODE_SHIFT  26
 #define HUGETLB_FLAG_ENCODE_MASK   0x3f
 
+#define HUGETLB_FLAG_ENCODE_16KB   (14 << HUGETLB_FLAG_ENCODE_SHIFT)
 #define HUGETLB_FLAG_ENCODE_64KB   (16 << HUGETLB_FLAG_ENCODE_SHIFT)
 #define HUGETLB_FLAG_ENCODE_512KB  (19 << HUGETLB_FLAG_ENCODE_SHIFT)
 #define HUGETLB_FLAG_ENCODE_1MB(20 << 
HUGETLB_FLAG_ENCODE_SHIFT)
 #define HUGETLB_FLAG_ENCODE_2MB(21 << 
HUGETLB_FLAG_ENCODE_SHIFT)
 #define HUGETLB_FLAG_ENCODE_8MB(23 << 
HUGETLB_FLAG_ENCODE_SHIFT)
 #define HUGETLB_FLAG_ENCODE_16MB   (24 << HUGETLB_FLAG_ENCODE_SHIFT)
+#define HUGETLB_FLAG_ENCODE_32MB   (25 << HUGETLB_FLAG_ENCODE_SHIFT)
 #define HUGETLB_FLAG_ENCODE_256MB  (28 << HUGETLB_FLAG_ENCODE_SHIFT)
+#define HUGETLB_FLAG_ENCODE_512MB  (29 << HUGETLB_FLAG_ENCODE_SHIFT)
 #define HUGETLB_FLAG_ENCODE_1GB(30 << 
HUGETLB_FLAG_ENCODE_SHIFT)
 #define HUGETLB_FLAG_ENCODE_2GB(31 << 
HUGETLB_FLAG_ENCODE_SHIFT)
 #define HUGETLB_FLAG_ENCODE_16GB   (34 << HUGETLB_FLAG_ENCODE_SHIFT)
-- 
2.23.0



[RFC PATCH v3 6/8] KVM: selftests: List all hugetlb src types specified with page sizes

2021-02-28 Thread Yanan Wang
With VM_MEM_SRC_ANONYMOUS_HUGETLB, we currently can only use system
default hugetlb pages to back the testing guest memory. In order to
add flexibility, now list all the known hugetlb backing src types with
different page sizes, so that we can specify use of hugetlb pages of the
exact granularity that we want. And as all the known hugetlb page sizes
are listed, it's appropriate for all architectures.

Besides, the helper get_backing_src_pagesz() is added to get the
granularity of the different backing src types (anonymous, thp, hugetlb).

Suggested-by: Ben Gardon 
Signed-off-by: Yanan Wang 
---
 .../testing/selftests/kvm/include/test_util.h | 19 ++-
 tools/testing/selftests/kvm/lib/kvm_util.c|  2 +-
 tools/testing/selftests/kvm/lib/test_util.c   | 56 +++
 3 files changed, 63 insertions(+), 14 deletions(-)

diff --git a/tools/testing/selftests/kvm/include/test_util.h 
b/tools/testing/selftests/kvm/include/test_util.h
index ef24c76ba89a..be5d08bcdca7 100644
--- a/tools/testing/selftests/kvm/include/test_util.h
+++ b/tools/testing/selftests/kvm/include/test_util.h
@@ -70,16 +70,31 @@ struct timespec timespec_div(struct timespec ts, int 
divisor);
 enum vm_mem_backing_src_type {
VM_MEM_SRC_ANONYMOUS,
VM_MEM_SRC_ANONYMOUS_THP,
-   VM_MEM_SRC_ANONYMOUS_HUGETLB,
+   VM_MEM_SRC_ANONYMOUS_HUGETLB_16KB,
+   VM_MEM_SRC_ANONYMOUS_HUGETLB_64KB,
+   VM_MEM_SRC_ANONYMOUS_HUGETLB_512KB,
+   VM_MEM_SRC_ANONYMOUS_HUGETLB_1MB,
+   VM_MEM_SRC_ANONYMOUS_HUGETLB_2MB,
+   VM_MEM_SRC_ANONYMOUS_HUGETLB_8MB,
+   VM_MEM_SRC_ANONYMOUS_HUGETLB_16MB,
+   VM_MEM_SRC_ANONYMOUS_HUGETLB_32MB,
+   VM_MEM_SRC_ANONYMOUS_HUGETLB_256MB,
+   VM_MEM_SRC_ANONYMOUS_HUGETLB_512MB,
+   VM_MEM_SRC_ANONYMOUS_HUGETLB_1GB,
+   VM_MEM_SRC_ANONYMOUS_HUGETLB_2GB,
+   VM_MEM_SRC_ANONYMOUS_HUGETLB_16GB,
+   NUM_SRC_TYPES,
 };
 
 struct vm_mem_backing_src_alias {
const char *name;
-   enum vm_mem_backing_src_type type;
+   uint32_t flag;
 };
 
 bool thp_configured(void);
 size_t get_trans_hugepagesz(void);
+const struct vm_mem_backing_src_alias *vm_mem_backing_src_alias(uint32_t i);
+size_t get_backing_src_pagesz(uint32_t i);
 void backing_src_help(void);
 enum vm_mem_backing_src_type parse_backing_src_type(const char *type_name);
 
diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c 
b/tools/testing/selftests/kvm/lib/kvm_util.c
index cc22c4ab7d67..b91c8e3a7ee1 100644
--- a/tools/testing/selftests/kvm/lib/kvm_util.c
+++ b/tools/testing/selftests/kvm/lib/kvm_util.c
@@ -757,7 +757,7 @@ void vm_userspace_mem_region_add(struct kvm_vm *vm,
region->mmap_start = mmap(NULL, region->mmap_size,
  PROT_READ | PROT_WRITE,
  MAP_PRIVATE | MAP_ANONYMOUS
- | (src_type == VM_MEM_SRC_ANONYMOUS_HUGETLB ? 
MAP_HUGETLB : 0),
+ | vm_mem_backing_src_alias(src_type)->flag,
  -1, 0);
TEST_ASSERT(region->mmap_start != MAP_FAILED,
"test_malloc failed, mmap_start: %p errno: %i",
diff --git a/tools/testing/selftests/kvm/lib/test_util.c 
b/tools/testing/selftests/kvm/lib/test_util.c
index f2d133f76c67..1f5e7241c80e 100644
--- a/tools/testing/selftests/kvm/lib/test_util.c
+++ b/tools/testing/selftests/kvm/lib/test_util.c
@@ -11,6 +11,7 @@
 #include 
 #include 
 #include 
+#include 
 #include "linux/kernel.h"
 
 #include "test_util.h"
@@ -112,12 +113,6 @@ void print_skip(const char *fmt, ...)
puts(", skipping test");
 }
 
-const struct vm_mem_backing_src_alias backing_src_aliases[] = {
-   {"anonymous", VM_MEM_SRC_ANONYMOUS,},
-   {"anonymous_thp", VM_MEM_SRC_ANONYMOUS_THP,},
-   {"anonymous_hugetlb", VM_MEM_SRC_ANONYMOUS_HUGETLB,},
-};
-
 bool thp_configured(void)
 {
int ret;
@@ -153,22 +148,61 @@ size_t get_trans_hugepagesz(void)
return size;
 }
 
+const struct vm_mem_backing_src_alias *vm_mem_backing_src_alias(uint32_t i)
+{
+   static const struct vm_mem_backing_src_alias aliases[] = {
+   { "anonymous",   0},
+   { "anonymous_thp",   0},
+   { "anonymous_hugetlb_16kb",  MAP_HUGETLB | MAP_HUGE_16KB  },
+   { "anonymous_hugetlb_64kb",  MAP_HUGETLB | MAP_HUGE_64KB  },
+   { "anonymous_hugetlb_512kb", MAP_HUGETLB | MAP_HUGE_512KB },
+   { "anonymous_hugetlb_1mb",   MAP_HUGETLB | MAP_HUGE_1MB   },
+   { "anonymous_hugetlb_2mb",   MAP_HUGETLB | MAP_HUGE_2MB   },
+   { "anonymous_hugetlb_8mb",   MAP_HUGETLB | MAP_HUGE_8MB   },
+   { "anonymous_hugetlb_16mb",  MAP_H

[RFC PATCH v3 8/8] KVM: selftests: Add a test for kvm page table code

2021-02-28 Thread Yanan Wang
This test serves as a performance tester and a bug reproducer for
the kvm page table code (GPA->HPA mappings), so it gives guidance to
people trying to improve kvm.

The function guest_code() can cover the conditions where a single vcpu or
multiple vcpus access guest pages within the same memory region, in three
VM stages(before dirty logging, during dirty logging, after dirty logging).
Besides, the backing src memory type(ANONYMOUS/THP/HUGETLB) of the tested
memory region can be specified by users, which means normal page mappings
or block mappings can be chosen by users to be created in the test.

If ANONYMOUS memory is specified, kvm will create normal page mappings
for the tested memory region before dirty logging, and update attributes
of the page mappings from RO to RW during dirty logging. If THP/HUGETLB
memory is specified, kvm will create block mappings for the tested memory
region before dirty logging, split the block mappings into normal page
mappings during dirty logging, and coalesce the page mappings back into
block mappings after dirty logging is stopped.

So in summary, as a performance tester, this test can present the
performance of kvm creating/updating normal page mappings, or the
performance of kvm creating/splitting/recovering block mappings,
through execution time.

When we need to coalesce the page mappings back to block mappings after
dirty logging is stopped, we have to invalidate *all* the TLB entries for
the page mappings right before installation of the block entry, because
a TLB conflict abort error could occur if we can't invalidate the TLB
entries fully. We have hit this TLB conflict twice on an aarch64 software
implementation and fixed it. As this test can simulate the process of a
VM with block mappings going from dirty logging enabled to dirty logging
stopped, it can also reproduce this TLB conflict abort caused by inadequate
TLB invalidation when coalescing tables.
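
As the test's file header notes, hugetlb-backed runs require hugepages to
be pre-allocated up front. A minimal sketch of doing that from C, assuming
the standard sysfs layout (reserve_hugepages() is a hypothetical helper,
not part of the test):

    #include <stdio.h>

    /* e.g. reserve_hugepages(1048576, 8) reserves 8 x 1GB hugepages */
    static int reserve_hugepages(unsigned long size_kb, unsigned int nr)
    {
        char path[128];
        FILE *f;

        snprintf(path, sizeof(path),
                 "/sys/kernel/mm/hugepages/hugepages-%lukB/nr_hugepages",
                 size_kb);
        f = fopen(path, "w");
        if (!f)
            return -1;
        fprintf(f, "%u\n", nr);
        return fclose(f);
    }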

Signed-off-by: Yanan Wang 
---
 tools/testing/selftests/kvm/Makefile  |   3 +
 .../selftests/kvm/kvm_page_table_test.c   | 476 ++
 2 files changed, 479 insertions(+)
 create mode 100644 tools/testing/selftests/kvm/kvm_page_table_test.c

diff --git a/tools/testing/selftests/kvm/Makefile 
b/tools/testing/selftests/kvm/Makefile
index a6d61f451f88..bac81924166d 100644
--- a/tools/testing/selftests/kvm/Makefile
+++ b/tools/testing/selftests/kvm/Makefile
@@ -67,6 +67,7 @@ TEST_GEN_PROGS_x86_64 += x86_64/xen_vmcall_test
 TEST_GEN_PROGS_x86_64 += demand_paging_test
 TEST_GEN_PROGS_x86_64 += dirty_log_test
 TEST_GEN_PROGS_x86_64 += dirty_log_perf_test
+TEST_GEN_PROGS_x86_64 += kvm_page_table_test
 TEST_GEN_PROGS_x86_64 += hardware_disable_test
 TEST_GEN_PROGS_x86_64 += kvm_create_max_vcpus
 TEST_GEN_PROGS_x86_64 += memslot_modification_stress_test
@@ -78,6 +79,7 @@ TEST_GEN_PROGS_aarch64 += aarch64/get-reg-list-sve
 TEST_GEN_PROGS_aarch64 += demand_paging_test
 TEST_GEN_PROGS_aarch64 += dirty_log_test
 TEST_GEN_PROGS_aarch64 += dirty_log_perf_test
+TEST_GEN_PROGS_aarch64 += kvm_page_table_test
 TEST_GEN_PROGS_aarch64 += kvm_create_max_vcpus
 TEST_GEN_PROGS_aarch64 += set_memory_region_test
 TEST_GEN_PROGS_aarch64 += steal_time
@@ -87,6 +89,7 @@ TEST_GEN_PROGS_s390x += s390x/resets
 TEST_GEN_PROGS_s390x += s390x/sync_regs_test
 TEST_GEN_PROGS_s390x += demand_paging_test
 TEST_GEN_PROGS_s390x += dirty_log_test
+TEST_GEN_PROGS_s390x += kvm_page_table_test
 TEST_GEN_PROGS_s390x += kvm_create_max_vcpus
 TEST_GEN_PROGS_s390x += set_memory_region_test
 
diff --git a/tools/testing/selftests/kvm/kvm_page_table_test.c 
b/tools/testing/selftests/kvm/kvm_page_table_test.c
new file mode 100644
index ..032b49d1483b
--- /dev/null
+++ b/tools/testing/selftests/kvm/kvm_page_table_test.c
@@ -0,0 +1,476 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * KVM page table test
+ *
+ * Copyright (C) 2021, Huawei, Inc.
+ *
+ * Make sure that THP has been enabled or enough HUGETLB pages with specific
+ * page size have been pre-allocated on your system, if you are planning to
+ * use hugepages to back the guest memory for testing.
+ */
+
+#define _GNU_SOURCE /* for program_invocation_name */
+
+#include 
+#include 
+#include 
+#include 
+
+#include "test_util.h"
+#include "kvm_util.h"
+#include "processor.h"
+#include "guest_modes.h"
+
+#define TEST_MEM_SLOT_INDEX 1
+
+/* Default size(1GB) of the memory for testing */
+#define DEFAULT_TEST_MEM_SIZE  (1 << 30)
+
+/* Default guest test virtual memory offset */
+#define DEFAULT_GUEST_TEST_MEM 0xc000
+
+/* Number of guest memory accessing types(read/write) */
+#define NUM_ACCESS_TYPES   2
+
+/* Different guest memory accessing stages */
+enum test_stage {
+   KVM_BEFORE_MAPPINGS,
+   KVM_CREATE_MAPPINGS,
+   KVM_UPDATE_MAPPINGS,
+   KVM_ADJUST_MAPPINGS,
+   NUM_TEST_STAGES,
+};
+
+static const char * cons

[RFC PATCH v3 0/7] KVM: selftests: some improvement and a new test for kvm page table

2021-02-28 Thread Yanan Wang
Hi,
This v3 series can mainly include two parts.
Based on kvm queue branch: 
https://git.kernel.org/pub/scm/virt/kvm/kvm.git/log/?h=queue
Links of v1: 
https://lore.kernel.org/lkml/20210208090841.333724-1-wangyana...@huawei.com/
Links of v2: 
https://lore.kernel.org/lkml/20210225055940.18748-1-wangyana...@huawei.com/

In the first part, all the known hugetlb backing src types specified
with different hugepage sizes are listed, so that we can specify use
of a hugetlb source of the exact granularity that we want, instead of
the system default one. And as all the known hugetlb page sizes are
listed, it's appropriate for all architectures. Besides, a helper that
can get the granularity of different backing src types (anonymous/thp/hugetlb)
is added, so that we can use the accurate backing src granularity for
various kinds of alignment or guest memory accesses by vcpus.

In the second part, a new test is added:
This test is added to serve as a performance tester and a bug reproducer
for kvm page table code (GPA->HPA mappings), and it gives guidance to
people trying to improve kvm. The following explains what exactly we can
do through this test.

The function guest_code() can cover the conditions where a single vcpu or
multiple vcpus access guest pages within the same memory region, in three
VM stages (before dirty logging, during dirty logging, after dirty logging).
Besides, the backing src memory type (ANONYMOUS/THP/HUGETLB) of the tested
memory region can be specified by users, which means users can choose to
have either normal page mappings or block mappings created in the test.

If ANONYMOUS memory is specified, kvm will create normal page mappings
for the tested memory region before dirty logging, and update attributes
of the page mappings from RO to RW during dirty logging. If THP/HUGETLB
memory is specified, kvm will create block mappings for the tested memory
region before dirty logging, and split the block mappings into normal page
mappings during dirty logging, and coalesce the page mappings back into
block mappings after dirty logging is stopped.

So in summary, as a performance tester, this test can present the
performance of kvm creating/updating normal page mappings, or the
performance of kvm creating/splitting/recovering block mappings,
through execution time.

When we need to coalesce the page mappings back to block mappings after
dirty logging is stopped, we have to invalidate *all* the TLB entries for
the page mappings right before installation of the block entry, because
a TLB conflict abort error could occur if we can't invalidate the TLB
entries fully. We have hit this TLB conflict twice on an aarch64 software
implementation and fixed it. As this test can simulate the process of a
VM with block mappings going from dirty logging enabled to dirty logging
stopped, it can also reproduce this TLB conflict abort caused by inadequate
TLB invalidation when coalescing tables.

Links about the TLB conflict abort:
https://lore.kernel.org/lkml/20201201201034.116760-3-wangyana...@huawei.com/

---

Change logs:

v2->v3:
- Add tags of Suggested-by, Reviewed-by in the patches
- Add a generic macro to get hugetlb page sizes
- Some changes for suggestions about v2 series

v1->v2:
- Add a patch to sync header files
- Add helpers to get granularity of different backing src types
- Some changes for suggestions about v1 series

---

Yanan Wang (7):
  tools headers: sync headers of asm-generic/hugetlb_encode.h
  tools headers: Add a macro to get HUGETLB page sizes for mmap
  KVM: selftests: Use flag CLOCK_MONOTONIC_RAW for timing
  KVM: selftests: Make a generic helper to get vm guest mode strings
  KVM: selftests: Add a helper to get system configured THP page size
  KVM: selftests: List all hugetlb src types specified with page sizes
  KVM: selftests: Adapt vm_userspace_mem_region_add to new helpers
  KVM: selftests: Add a test for kvm page table code

 include/uapi/linux/mman.h |   2 +
 tools/include/asm-generic/hugetlb_encode.h|   3 +
 tools/include/uapi/linux/mman.h   |   2 +
 tools/testing/selftests/kvm/Makefile  |   3 +
 .../selftests/kvm/demand_paging_test.c|   8 +-
 .../selftests/kvm/dirty_log_perf_test.c   |  14 +-
 .../testing/selftests/kvm/include/kvm_util.h  |   4 +-
 .../testing/selftests/kvm/include/test_util.h |  21 +-
 .../selftests/kvm/kvm_page_table_test.c   | 476 ++
 tools/testing/selftests/kvm/lib/kvm_util.c|  59 ++-
 tools/testing/selftests/kvm/lib/test_util.c   |  92 +++-
 tools/testing/selftests/kvm/steal_time.c  |   4 +-
 12 files changed, 628 insertions(+), 60 deletions(-)
 create mode 100644 tools/testing/selftests/kvm/kvm_page_table_test.c 

-- 
2.19.1



[RFC PATCH v3 3/8] KVM: selftests: Use flag CLOCK_MONOTONIC_RAW for timing

2021-02-28 Thread Yanan Wang
In addition to providing the function of CLOCK_MONOTONIC, the flag
CLOCK_MONOTONIC_RAW can also shield the possible impact of NTP
adjustments, which provides more robustness.
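
For reference, a minimal sketch of the timing pattern these selftests use
(timespec_elapsed() and pr_info() are the selftests' existing helpers;
do_work() is a placeholder for the code under measurement):

    struct timespec start, ts_diff;

    clock_gettime(CLOCK_MONOTONIC_RAW, &start);
    do_work();
    ts_diff = timespec_elapsed(start);
    pr_info("elapsed: %ld.%.9lds\n", ts_diff.tv_sec, ts_diff.tv_nsec);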

Suggested-by: Vitaly Kuznetsov 
Signed-off-by: Yanan Wang 
---
 tools/testing/selftests/kvm/demand_paging_test.c  |  8 
 tools/testing/selftests/kvm/dirty_log_perf_test.c | 14 +++---
 tools/testing/selftests/kvm/lib/test_util.c   |  2 +-
 tools/testing/selftests/kvm/steal_time.c  |  4 ++--
 4 files changed, 14 insertions(+), 14 deletions(-)

diff --git a/tools/testing/selftests/kvm/demand_paging_test.c 
b/tools/testing/selftests/kvm/demand_paging_test.c
index 5f7a229c3af1..efbf0c1e9130 100644
--- a/tools/testing/selftests/kvm/demand_paging_test.c
+++ b/tools/testing/selftests/kvm/demand_paging_test.c
@@ -53,7 +53,7 @@ static void *vcpu_worker(void *data)
vcpu_args_set(vm, vcpu_id, 1, vcpu_id);
run = vcpu_state(vm, vcpu_id);
 
-   clock_gettime(CLOCK_MONOTONIC, &start);
+   clock_gettime(CLOCK_MONOTONIC_RAW, &start);
 
/* Let the guest access its memory */
ret = _vcpu_run(vm, vcpu_id);
@@ -86,7 +86,7 @@ static int handle_uffd_page_request(int uffd, uint64_t addr)
copy.len = perf_test_args.host_page_size;
copy.mode = 0;
 
-   clock_gettime(CLOCK_MONOTONIC, &start);
+   clock_gettime(CLOCK_MONOTONIC_RAW, &start);
 
r = ioctl(uffd, UFFDIO_COPY, ©);
if (r == -1) {
@@ -123,7 +123,7 @@ static void *uffd_handler_thread_fn(void *arg)
struct timespec start;
struct timespec ts_diff;
 
-   clock_gettime(CLOCK_MONOTONIC, &start);
+   clock_gettime(CLOCK_MONOTONIC_RAW, &start);
while (!quit_uffd_thread) {
struct uffd_msg msg;
struct pollfd pollfd[2];
@@ -336,7 +336,7 @@ static void run_test(enum vm_guest_mode mode, void *arg)
 
pr_info("Finished creating vCPUs and starting uffd threads\n");
 
-   clock_gettime(CLOCK_MONOTONIC, &start);
+   clock_gettime(CLOCK_MONOTONIC_RAW, &start);
 
for (vcpu_id = 0; vcpu_id < nr_vcpus; vcpu_id++) {
pthread_create(&vcpu_threads[vcpu_id], NULL, vcpu_worker,
diff --git a/tools/testing/selftests/kvm/dirty_log_perf_test.c 
b/tools/testing/selftests/kvm/dirty_log_perf_test.c
index 04a2641261be..6cff4ccf9525 100644
--- a/tools/testing/selftests/kvm/dirty_log_perf_test.c
+++ b/tools/testing/selftests/kvm/dirty_log_perf_test.c
@@ -50,7 +50,7 @@ static void *vcpu_worker(void *data)
while (!READ_ONCE(host_quit)) {
int current_iteration = READ_ONCE(iteration);
 
-   clock_gettime(CLOCK_MONOTONIC, &start);
+   clock_gettime(CLOCK_MONOTONIC_RAW, &start);
ret = _vcpu_run(vm, vcpu_id);
ts_diff = timespec_elapsed(start);
 
@@ -141,7 +141,7 @@ static void run_test(enum vm_guest_mode mode, void *arg)
iteration = 0;
host_quit = false;
 
-   clock_gettime(CLOCK_MONOTONIC, &start);
+   clock_gettime(CLOCK_MONOTONIC_RAW, &start);
for (vcpu_id = 0; vcpu_id < nr_vcpus; vcpu_id++) {
vcpu_last_completed_iteration[vcpu_id] = -1;
 
@@ -162,7 +162,7 @@ static void run_test(enum vm_guest_mode mode, void *arg)
ts_diff.tv_sec, ts_diff.tv_nsec);
 
/* Enable dirty logging */
-   clock_gettime(CLOCK_MONOTONIC, &start);
+   clock_gettime(CLOCK_MONOTONIC_RAW, &start);
vm_mem_region_set_flags(vm, PERF_TEST_MEM_SLOT_INDEX,
KVM_MEM_LOG_DIRTY_PAGES);
ts_diff = timespec_elapsed(start);
@@ -174,7 +174,7 @@ static void run_test(enum vm_guest_mode mode, void *arg)
 * Incrementing the iteration number will start the vCPUs
 * dirtying memory again.
 */
-   clock_gettime(CLOCK_MONOTONIC, &start);
+   clock_gettime(CLOCK_MONOTONIC_RAW, &start);
iteration++;
 
pr_debug("Starting iteration %d\n", iteration);
@@ -189,7 +189,7 @@ static void run_test(enum vm_guest_mode mode, void *arg)
pr_info("Iteration %d dirty memory time: %ld.%.9lds\n",
iteration, ts_diff.tv_sec, ts_diff.tv_nsec);
 
-   clock_gettime(CLOCK_MONOTONIC, &start);
+   clock_gettime(CLOCK_MONOTONIC_RAW, &start);
kvm_vm_get_dirty_log(vm, PERF_TEST_MEM_SLOT_INDEX, bmap);
 
ts_diff = timespec_elapsed(start);
@@ -199,7 +199,7 @@ static void run_test(enum vm_guest_mode mode, void *arg)
iteration, ts_diff.tv_sec, ts_diff.tv_nsec);
 
if (dirty_log_manual_caps) {
-   clock_gettime(CLOCK_MONOTONIC, &start);
+   clock_gettime(CLOCK_MONOTONIC_RAW, &start);
kvm_vm_clear_dirty_log(vm, PERF_TEST_MEM_SLOT_INDEX, 

[RFC PATCH v3 7/8] KVM: selftests: Adapt vm_userspace_mem_region_add to new helpers

2021-02-28 Thread Yanan Wang
With VM_MEM_SRC_ANONYMOUS_THP specified in vm_userspace_mem_region_add(),
we have to get the transparent hugepage size for HVA alignment. With the
new helpers, we can use get_backing_src_pagesz() to check whether THP is
configured and then get the exact configured hugepage size.

As different architectures may have different THP page sizes configured,
this can get the accurate THP page size on any platform.

Signed-off-by: Yanan Wang 
---
 tools/testing/selftests/kvm/lib/kvm_util.c | 28 +++---
 1 file changed, 9 insertions(+), 19 deletions(-)

diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c 
b/tools/testing/selftests/kvm/lib/kvm_util.c
index b91c8e3a7ee1..b29402f9f00c 100644
--- a/tools/testing/selftests/kvm/lib/kvm_util.c
+++ b/tools/testing/selftests/kvm/lib/kvm_util.c
@@ -18,7 +18,6 @@
 #include 
 #include 
 
-#define KVM_UTIL_PGS_PER_HUGEPG 512
 #define KVM_UTIL_MIN_PFN   2
 
 /* Aligns x up to the next multiple of size. Size must be a power of 2. */
@@ -686,7 +685,7 @@ void vm_userspace_mem_region_add(struct kvm_vm *vm,
 {
int ret;
struct userspace_mem_region *region;
-   size_t huge_page_size = KVM_UTIL_PGS_PER_HUGEPG * vm->page_size;
+   size_t backing_src_pagesz = get_backing_src_pagesz(src_type);
size_t alignment;
 
TEST_ASSERT(vm_adjust_num_guest_pages(vm->mode, npages) == npages,
@@ -748,7 +747,7 @@ void vm_userspace_mem_region_add(struct kvm_vm *vm,
 #endif
 
if (src_type == VM_MEM_SRC_ANONYMOUS_THP)
-   alignment = max(huge_page_size, alignment);
+   alignment = max(backing_src_pagesz, alignment);
 
/* Add enough memory to align up if necessary */
if (alignment > 1)
@@ -767,22 +766,13 @@ void vm_userspace_mem_region_add(struct kvm_vm *vm,
region->host_mem = align(region->mmap_start, alignment);
 
/* As needed perform madvise */
-   if (src_type == VM_MEM_SRC_ANONYMOUS || src_type == 
VM_MEM_SRC_ANONYMOUS_THP) {
-   struct stat statbuf;
-
-   ret = stat("/sys/kernel/mm/transparent_hugepage", &statbuf);
-   TEST_ASSERT(ret == 0 || (ret == -1 && errno == ENOENT),
-   "stat /sys/kernel/mm/transparent_hugepage");
-
-   TEST_ASSERT(ret == 0 || src_type != VM_MEM_SRC_ANONYMOUS_THP,
-   "VM_MEM_SRC_ANONYMOUS_THP requires THP to be 
configured in the host kernel");
-
-   if (ret == 0) {
-   ret = madvise(region->host_mem, npages * vm->page_size,
- src_type == VM_MEM_SRC_ANONYMOUS ? 
MADV_NOHUGEPAGE : MADV_HUGEPAGE);
-   TEST_ASSERT(ret == 0, "madvise failed, addr: %p length: 
0x%lx src_type: %x",
-   region->host_mem, npages * vm->page_size, 
src_type);
-   }
+   if ((src_type == VM_MEM_SRC_ANONYMOUS ||
+src_type == VM_MEM_SRC_ANONYMOUS_THP) && thp_configured()) {
+   ret = madvise(region->host_mem, npages * vm->page_size,
+ src_type == VM_MEM_SRC_ANONYMOUS ? 
MADV_NOHUGEPAGE : MADV_HUGEPAGE);
+   TEST_ASSERT(ret == 0, "madvise failed, addr: %p length: 0x%lx 
src_type: %s",
+   region->host_mem, npages * vm->page_size,
+   vm_mem_backing_src_alias(src_type)->name);
}
 
region->unused_phy_pages = sparsebit_alloc();
-- 
2.23.0



[RFC PATCH v3 5/8] KVM: selftests: Add a helper to get system configured THP page size

2021-02-28 Thread Yanan Wang
If we want to have some tests about transparent hugepages, the
system-configured THP hugepage size should be known to the tests, as it
can be used for various kinds of alignment or guest memory accesses by
vcpus. So it makes sense to add a helper to get the transparent hugepage
size.

With VM_MEM_SRC_ANONYMOUS_THP specified in vm_userspace_mem_region_add(),
we now stat /sys/kernel/mm/transparent_hugepage to check whether THP is
configured in the host kernel before madvise(). Based on this, we can also
read the file /sys/kernel/mm/transparent_hugepage/hpage_pmd_size to get the
THP hugepage size.
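
A hedged usage sketch of the two helpers (the alignment step mirrors what
a later patch does in vm_userspace_mem_region_add(); max() is the
kernel-style helper already used in the selftests lib):

    size_t alignment = 1;

    if (thp_configured())
        alignment = max(get_trans_hugepagesz(), alignment);
    /* then align the mmap'ed host VA and region size to 'alignment' */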

Reviewed-by: Ben Gardon 
Signed-off-by: Yanan Wang 
---
 .../testing/selftests/kvm/include/test_util.h |  2 ++
 tools/testing/selftests/kvm/lib/test_util.c   | 36 +++
 2 files changed, 38 insertions(+)

diff --git a/tools/testing/selftests/kvm/include/test_util.h 
b/tools/testing/selftests/kvm/include/test_util.h
index b7f41399f22c..ef24c76ba89a 100644
--- a/tools/testing/selftests/kvm/include/test_util.h
+++ b/tools/testing/selftests/kvm/include/test_util.h
@@ -78,6 +78,8 @@ struct vm_mem_backing_src_alias {
enum vm_mem_backing_src_type type;
 };
 
+bool thp_configured(void);
+size_t get_trans_hugepagesz(void);
 void backing_src_help(void);
 enum vm_mem_backing_src_type parse_backing_src_type(const char *type_name);
 
diff --git a/tools/testing/selftests/kvm/lib/test_util.c 
b/tools/testing/selftests/kvm/lib/test_util.c
index c7c0627c6842..f2d133f76c67 100644
--- a/tools/testing/selftests/kvm/lib/test_util.c
+++ b/tools/testing/selftests/kvm/lib/test_util.c
@@ -10,6 +10,7 @@
 #include 
 #include 
 #include 
+#include 
 #include "linux/kernel.h"
 
 #include "test_util.h"
@@ -117,6 +118,41 @@ const struct vm_mem_backing_src_alias 
backing_src_aliases[] = {
{"anonymous_hugetlb", VM_MEM_SRC_ANONYMOUS_HUGETLB,},
 };
 
+bool thp_configured(void)
+{
+   int ret;
+   struct stat statbuf;
+
+   ret = stat("/sys/kernel/mm/transparent_hugepage", &statbuf);
+   TEST_ASSERT(ret == 0 || (ret == -1 && errno == ENOENT),
+   "Error in stating /sys/kernel/mm/transparent_hugepage: %d",
+   errno);
+
+   return ret == 0;
+}
+
+size_t get_trans_hugepagesz(void)
+{
+   size_t size;
+   char buf[16];
+   FILE *f;
+
+   TEST_ASSERT(thp_configured(), "THP is not configured in host kernel");
+
+   f = fopen("/sys/kernel/mm/transparent_hugepage/hpage_pmd_size", "r");
+   TEST_ASSERT(f != NULL,
+   "Error in opening transparent_hugepage/hpage_pmd_size: %d",
+   errno);
+
+   if (fread(buf, sizeof(char), sizeof(buf), f) == 0) {
+   fclose(f);
+   TEST_FAIL("Unable to read transparent_hugepage/hpage_pmd_size");
+   }
+
+   size = strtoull(buf, NULL, 10);
+   return size;
+}
+
 void backing_src_help(void)
 {
int i;
-- 
2.23.0



[RFC PATCH v3 2/8] tools headers: Add a macro to get HUGETLB page sizes for mmap

2021-02-28 Thread Yanan Wang
We know that if a system supports multiple hugetlb page sizes,
the desired hugetlb page size can be specified in bits [26:31]
of the mmap() flag arguments. The value in these 6 bits is the
shift of the corresponding hugetlb page size.

So add a macro that extracts the page size shift from the flag
argument x and then calculates the corresponding hugetlb page size.
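
An illustrative sketch of the macro, assuming the updated header is in the
include path (MAP_HUGE_2MB is an existing encoding carrying shift 21 in
bits [26:31]):

    #include <stdio.h>
    #include <linux/mman.h>

    int main(void)
    {
        /* 1 << 21 == 0x200000 == 2MB */
        printf("%d\n", MAP_HUGE_PAGE_SIZE(MAP_HUGE_2MB));
        return 0;
    }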

Cc: Ben Gardon 
Cc: Ingo Molnar 
Cc: Adrian Hunter 
Cc: Jiri Olsa 
Cc: Arnaldo Carvalho de Melo 
Cc: Arnd Bergmann 
Cc: Michael Kerrisk 
Cc: Thomas Gleixner 
Suggested-by: Ben Gardon 
Signed-off-by: Yanan Wang 
---
 include/uapi/linux/mman.h   | 2 ++
 tools/include/uapi/linux/mman.h | 2 ++
 2 files changed, 4 insertions(+)

diff --git a/include/uapi/linux/mman.h b/include/uapi/linux/mman.h
index f55bc680b5b0..8bd41128a0ee 100644
--- a/include/uapi/linux/mman.h
+++ b/include/uapi/linux/mman.h
@@ -41,4 +41,6 @@
 #define MAP_HUGE_2GB   HUGETLB_FLAG_ENCODE_2GB
 #define MAP_HUGE_16GB  HUGETLB_FLAG_ENCODE_16GB
 
+#define MAP_HUGE_PAGE_SIZE(x) (1 << ((x >> MAP_HUGE_SHIFT) & MAP_HUGE_MASK))
+
 #endif /* _UAPI_LINUX_MMAN_H */
diff --git a/tools/include/uapi/linux/mman.h b/tools/include/uapi/linux/mman.h
index f55bc680b5b0..8bd41128a0ee 100644
--- a/tools/include/uapi/linux/mman.h
+++ b/tools/include/uapi/linux/mman.h
@@ -41,4 +41,6 @@
 #define MAP_HUGE_2GB   HUGETLB_FLAG_ENCODE_2GB
 #define MAP_HUGE_16GB  HUGETLB_FLAG_ENCODE_16GB
 
+#define MAP_HUGE_PAGE_SIZE(x) (1 << ((x >> MAP_HUGE_SHIFT) & MAP_HUGE_MASK))
+
 #endif /* _UAPI_LINUX_MMAN_H */
-- 
2.23.0



[RFC PATCH v2 7/7] KVM: selftests: Add a test for kvm page table code

2021-02-24 Thread Yanan Wang
This test serves as a performance tester and a bug reproducer for
kvm page table code (GPA->HPA mappings), so it gives guidance to
people trying to improve kvm.

The function guest_code() can cover the conditions where a single vcpu or
multiple vcpus access guest pages within the same memory region, in three
VM stages (before dirty logging, during dirty logging, after dirty logging).
Besides, the backing src memory type (ANONYMOUS/THP/HUGETLB) of the tested
memory region can be specified by users, which means users can choose to
have either normal page mappings or block mappings created in the test.

If ANONYMOUS memory is specified, kvm will create normal page mappings
for the tested memory region before dirty logging, and update attributes
of the page mappings from RO to RW during dirty logging. If THP/HUGETLB
memory is specified, kvm will create block mappings for the tested memory
region before dirty logging, and split the block mappings into normal page
mappings during dirty logging, and coalesce the page mappings back into
block mappings after dirty logging is stopped.

So in summary, as a performance tester, this test can present the
performance of kvm creating/updating normal page mappings, or the
performance of kvm creating/splitting/recovering block mappings,
through execution time.

When we need to coalesce the page mappings back to block mappings after
dirty logging is stopped, we have to invalidate *all* the TLB entries for
the page mappings right before installation of the block entry, because
a TLB conflict abort error could occur if we can't invalidate the TLB
entries fully. We have hit this TLB conflict twice on an aarch64 software
implementation and fixed it. As this test can simulate the process of a
VM with block mappings going from dirty logging enabled to dirty logging
stopped, it can also reproduce this TLB conflict abort caused by inadequate
TLB invalidation when coalescing tables.

Signed-off-by: Yanan Wang 
---
 tools/testing/selftests/kvm/Makefile  |   3 +
 .../selftests/kvm/kvm_page_table_test.c   | 476 ++
 2 files changed, 479 insertions(+)
 create mode 100644 tools/testing/selftests/kvm/kvm_page_table_test.c

diff --git a/tools/testing/selftests/kvm/Makefile 
b/tools/testing/selftests/kvm/Makefile
index a6d61f451f88..bac81924166d 100644
--- a/tools/testing/selftests/kvm/Makefile
+++ b/tools/testing/selftests/kvm/Makefile
@@ -67,6 +67,7 @@ TEST_GEN_PROGS_x86_64 += x86_64/xen_vmcall_test
 TEST_GEN_PROGS_x86_64 += demand_paging_test
 TEST_GEN_PROGS_x86_64 += dirty_log_test
 TEST_GEN_PROGS_x86_64 += dirty_log_perf_test
+TEST_GEN_PROGS_x86_64 += kvm_page_table_test
 TEST_GEN_PROGS_x86_64 += hardware_disable_test
 TEST_GEN_PROGS_x86_64 += kvm_create_max_vcpus
 TEST_GEN_PROGS_x86_64 += memslot_modification_stress_test
@@ -78,6 +79,7 @@ TEST_GEN_PROGS_aarch64 += aarch64/get-reg-list-sve
 TEST_GEN_PROGS_aarch64 += demand_paging_test
 TEST_GEN_PROGS_aarch64 += dirty_log_test
 TEST_GEN_PROGS_aarch64 += dirty_log_perf_test
+TEST_GEN_PROGS_aarch64 += kvm_page_table_test
 TEST_GEN_PROGS_aarch64 += kvm_create_max_vcpus
 TEST_GEN_PROGS_aarch64 += set_memory_region_test
 TEST_GEN_PROGS_aarch64 += steal_time
@@ -87,6 +89,7 @@ TEST_GEN_PROGS_s390x += s390x/resets
 TEST_GEN_PROGS_s390x += s390x/sync_regs_test
 TEST_GEN_PROGS_s390x += demand_paging_test
 TEST_GEN_PROGS_s390x += dirty_log_test
+TEST_GEN_PROGS_s390x += kvm_page_table_test
 TEST_GEN_PROGS_s390x += kvm_create_max_vcpus
 TEST_GEN_PROGS_s390x += set_memory_region_test
 
diff --git a/tools/testing/selftests/kvm/kvm_page_table_test.c 
b/tools/testing/selftests/kvm/kvm_page_table_test.c
new file mode 100644
index ..032b49d1483b
--- /dev/null
+++ b/tools/testing/selftests/kvm/kvm_page_table_test.c
@@ -0,0 +1,476 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * KVM page table test
+ *
+ * Copyright (C) 2021, Huawei, Inc.
+ *
+ * Make sure that THP has been enabled or enough HUGETLB pages with specific
+ * page size have been pre-allocated on your system, if you are planning to
+ * use hugepages to back the guest memory for testing.
+ */
+
+#define _GNU_SOURCE /* for program_invocation_name */
+
+#include 
+#include 
+#include 
+#include 
+
+#include "test_util.h"
+#include "kvm_util.h"
+#include "processor.h"
+#include "guest_modes.h"
+
+#define TEST_MEM_SLOT_INDEX 1
+
+/* Default size(1GB) of the memory for testing */
+#define DEFAULT_TEST_MEM_SIZE  (1 << 30)
+
+/* Default guest test virtual memory offset */
+#define DEFAULT_GUEST_TEST_MEM 0xc000
+
+/* Number of guest memory accessing types(read/write) */
+#define NUM_ACCESS_TYPES   2
+
+/* Different guest memory accessing stages */
+enum test_stage {
+   KVM_BEFORE_MAPPINGS,
+   KVM_CREATE_MAPPINGS,
+   KVM_UPDATE_MAPPINGS,
+   KVM_ADJUST_MAPPINGS,
+   NUM_TEST_STAGES,
+};
+
+static const char * cons

[RFC PATCH v2 6/7] KVM: selftests: Adapt vm_userspace_mem_region_add to new helpers

2021-02-24 Thread Yanan Wang
With VM_MEM_SRC_ANONYMOUS_THP specified in vm_userspace_mem_region_add(),
we have to get the transparent hugepage size for HVA alignment. With the
new helpers, we can use get_backing_src_pagesz() to check whether THP is
configured and then get the exact configured hugepage size.

As different architectures may have different THP page sizes configured,
this can get the accurate THP page size on any platform.

Signed-off-by: Yanan Wang 
---
 tools/testing/selftests/kvm/lib/kvm_util.c | 27 +++---
 1 file changed, 8 insertions(+), 19 deletions(-)

diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c 
b/tools/testing/selftests/kvm/lib/kvm_util.c
index b91c8e3a7ee1..0105fbfed036 100644
--- a/tools/testing/selftests/kvm/lib/kvm_util.c
+++ b/tools/testing/selftests/kvm/lib/kvm_util.c
@@ -18,7 +18,6 @@
 #include 
 #include 
 
-#define KVM_UTIL_PGS_PER_HUGEPG 512
 #define KVM_UTIL_MIN_PFN   2
 
 /* Aligns x up to the next multiple of size. Size must be a power of 2. */
@@ -686,7 +685,7 @@ void vm_userspace_mem_region_add(struct kvm_vm *vm,
 {
int ret;
struct userspace_mem_region *region;
-   size_t huge_page_size = KVM_UTIL_PGS_PER_HUGEPG * vm->page_size;
+   size_t backing_src_pagesz = get_backing_src_pagesz(src_type);
size_t alignment;
 
TEST_ASSERT(vm_adjust_num_guest_pages(vm->mode, npages) == npages,
@@ -748,7 +747,7 @@ void vm_userspace_mem_region_add(struct kvm_vm *vm,
 #endif
 
if (src_type == VM_MEM_SRC_ANONYMOUS_THP)
-   alignment = max(huge_page_size, alignment);
+   alignment = max(backing_src_pagesz, alignment);
 
/* Add enough memory to align up if necessary */
if (alignment > 1)
@@ -767,22 +766,12 @@ void vm_userspace_mem_region_add(struct kvm_vm *vm,
region->host_mem = align(region->mmap_start, alignment);
 
/* As needed perform madvise */
-   if (src_type == VM_MEM_SRC_ANONYMOUS || src_type == 
VM_MEM_SRC_ANONYMOUS_THP) {
-   struct stat statbuf;
-
-   ret = stat("/sys/kernel/mm/transparent_hugepage", &statbuf);
-   TEST_ASSERT(ret == 0 || (ret == -1 && errno == ENOENT),
-   "stat /sys/kernel/mm/transparent_hugepage");
-
-   TEST_ASSERT(ret == 0 || src_type != VM_MEM_SRC_ANONYMOUS_THP,
-   "VM_MEM_SRC_ANONYMOUS_THP requires THP to be 
configured in the host kernel");
-
-   if (ret == 0) {
-   ret = madvise(region->host_mem, npages * vm->page_size,
- src_type == VM_MEM_SRC_ANONYMOUS ? 
MADV_NOHUGEPAGE : MADV_HUGEPAGE);
-   TEST_ASSERT(ret == 0, "madvise failed, addr: %p length: 
0x%lx src_type: %x",
-   region->host_mem, npages * vm->page_size, 
src_type);
-   }
+   if (src_type <= VM_MEM_SRC_ANONYMOUS_THP && thp_configured()) {
+   ret = madvise(region->host_mem, npages * vm->page_size,
+ src_type == VM_MEM_SRC_ANONYMOUS ? 
MADV_NOHUGEPAGE : MADV_HUGEPAGE);
+   TEST_ASSERT(ret == 0, "madvise failed, addr: %p length: 0x%lx 
src_type: %s",
+   region->host_mem, npages * vm->page_size,
+   vm_mem_backing_src_alias(src_type)->name);
}
 
region->unused_phy_pages = sparsebit_alloc();
-- 
2.19.1



[RFC PATCH v2 5/7] KVM: selftests: List all hugetlb src types specified with page sizes

2021-02-24 Thread Yanan Wang
With VM_MEM_SRC_ANONYMOUS_HUGETLB, we currently can only use the system
default hugetlb pages to back the testing guest memory. In order to
add flexibility, now list all the known hugetlb backing src types with
different page sizes, so that we can specify use of hugetlb pages of the
exact granularity that we want. And as all the known hugetlb page sizes
are listed, it's appropriate for all architectures.

Besides, the helper get_backing_src_pagesz() is added to get the
granularity of different backing src types (anonymous, thp, hugetlb).
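
A hedged usage sketch of the new helpers (parse_backing_src_type() already
exists in the lib; the round-up step is only illustrative):

    enum vm_mem_backing_src_type src_type;
    size_t size = 1UL << 30;    /* example region size: 1GB */
    size_t pagesz;

    src_type = parse_backing_src_type("anonymous_hugetlb_2mb");
    pagesz = get_backing_src_pagesz(src_type);
    /* round the region size up to the backing source granularity */
    size = (size + pagesz - 1) & ~(pagesz - 1);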

Signed-off-by: Yanan Wang 
---
 .../testing/selftests/kvm/include/test_util.h | 19 ++-
 tools/testing/selftests/kvm/lib/kvm_util.c|  2 +-
 tools/testing/selftests/kvm/lib/test_util.c   | 56 +++
 3 files changed, 63 insertions(+), 14 deletions(-)

diff --git a/tools/testing/selftests/kvm/include/test_util.h 
b/tools/testing/selftests/kvm/include/test_util.h
index ef24c76ba89a..be5d08bcdca7 100644
--- a/tools/testing/selftests/kvm/include/test_util.h
+++ b/tools/testing/selftests/kvm/include/test_util.h
@@ -70,16 +70,31 @@ struct timespec timespec_div(struct timespec ts, int 
divisor);
 enum vm_mem_backing_src_type {
VM_MEM_SRC_ANONYMOUS,
VM_MEM_SRC_ANONYMOUS_THP,
-   VM_MEM_SRC_ANONYMOUS_HUGETLB,
+   VM_MEM_SRC_ANONYMOUS_HUGETLB_16KB,
+   VM_MEM_SRC_ANONYMOUS_HUGETLB_64KB,
+   VM_MEM_SRC_ANONYMOUS_HUGETLB_512KB,
+   VM_MEM_SRC_ANONYMOUS_HUGETLB_1MB,
+   VM_MEM_SRC_ANONYMOUS_HUGETLB_2MB,
+   VM_MEM_SRC_ANONYMOUS_HUGETLB_8MB,
+   VM_MEM_SRC_ANONYMOUS_HUGETLB_16MB,
+   VM_MEM_SRC_ANONYMOUS_HUGETLB_32MB,
+   VM_MEM_SRC_ANONYMOUS_HUGETLB_256MB,
+   VM_MEM_SRC_ANONYMOUS_HUGETLB_512MB,
+   VM_MEM_SRC_ANONYMOUS_HUGETLB_1GB,
+   VM_MEM_SRC_ANONYMOUS_HUGETLB_2GB,
+   VM_MEM_SRC_ANONYMOUS_HUGETLB_16GB,
+   NUM_SRC_TYPES,
 };
 
 struct vm_mem_backing_src_alias {
const char *name;
-   enum vm_mem_backing_src_type type;
+   uint32_t flag;
 };
 
 bool thp_configured(void);
 size_t get_trans_hugepagesz(void);
+const struct vm_mem_backing_src_alias *vm_mem_backing_src_alias(uint32_t i);
+size_t get_backing_src_pagesz(uint32_t i);
 void backing_src_help(void);
 enum vm_mem_backing_src_type parse_backing_src_type(const char *type_name);
 
diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c 
b/tools/testing/selftests/kvm/lib/kvm_util.c
index cc22c4ab7d67..b91c8e3a7ee1 100644
--- a/tools/testing/selftests/kvm/lib/kvm_util.c
+++ b/tools/testing/selftests/kvm/lib/kvm_util.c
@@ -757,7 +757,7 @@ void vm_userspace_mem_region_add(struct kvm_vm *vm,
region->mmap_start = mmap(NULL, region->mmap_size,
  PROT_READ | PROT_WRITE,
  MAP_PRIVATE | MAP_ANONYMOUS
- | (src_type == VM_MEM_SRC_ANONYMOUS_HUGETLB ? 
MAP_HUGETLB : 0),
+ | vm_mem_backing_src_alias(src_type)->flag,
  -1, 0);
TEST_ASSERT(region->mmap_start != MAP_FAILED,
"test_malloc failed, mmap_start: %p errno: %i",
diff --git a/tools/testing/selftests/kvm/lib/test_util.c 
b/tools/testing/selftests/kvm/lib/test_util.c
index f2d133f76c67..6780aa058f35 100644
--- a/tools/testing/selftests/kvm/lib/test_util.c
+++ b/tools/testing/selftests/kvm/lib/test_util.c
@@ -11,6 +11,7 @@
 #include 
 #include 
 #include 
+#include 
 #include "linux/kernel.h"
 
 #include "test_util.h"
@@ -112,12 +113,6 @@ void print_skip(const char *fmt, ...)
puts(", skipping test");
 }
 
-const struct vm_mem_backing_src_alias backing_src_aliases[] = {
-   {"anonymous", VM_MEM_SRC_ANONYMOUS,},
-   {"anonymous_thp", VM_MEM_SRC_ANONYMOUS_THP,},
-   {"anonymous_hugetlb", VM_MEM_SRC_ANONYMOUS_HUGETLB,},
-};
-
 bool thp_configured(void)
 {
int ret;
@@ -153,22 +148,61 @@ size_t get_trans_hugepagesz(void)
return size;
 }
 
+const struct vm_mem_backing_src_alias *vm_mem_backing_src_alias(uint32_t i)
+{
+   static const struct vm_mem_backing_src_alias aliases[] = {
+   { "anonymous",   0},
+   { "anonymous_thp",   0},
+   { "anonymous_hugetlb_16kb",  MAP_HUGETLB | MAP_HUGE_16KB  },
+   { "anonymous_hugetlb_64kb",  MAP_HUGETLB | MAP_HUGE_64KB  },
+   { "anonymous_hugetlb_512kb", MAP_HUGETLB | MAP_HUGE_512KB },
+   { "anonymous_hugetlb_1mb",   MAP_HUGETLB | MAP_HUGE_1MB   },
+   { "anonymous_hugetlb_2mb",   MAP_HUGETLB | MAP_HUGE_2MB   },
+   { "anonymous_hugetlb_8mb",   MAP_HUGETLB | MAP_HUGE_8MB   },
+   { "anonymous_hugetlb_16mb",  MAP_HUGETLB | MAP_HUGE_16MB  },
+   { &

[RFC PATCH v2 2/7] KVM: selftests: Use flag CLOCK_MONOTONIC_RAW for timing

2021-02-24 Thread Yanan Wang
In addition to providing the function of CLOCK_MONOTONIC, the flag
CLOCK_MONOTONIC_RAW can also shield the possible impact of NTP
adjustments, which provides more robustness.

Signed-off-by: Yanan Wang 
---
 tools/testing/selftests/kvm/demand_paging_test.c  |  8 
 tools/testing/selftests/kvm/dirty_log_perf_test.c | 14 +++---
 tools/testing/selftests/kvm/lib/test_util.c   |  2 +-
 tools/testing/selftests/kvm/steal_time.c  |  4 ++--
 4 files changed, 14 insertions(+), 14 deletions(-)

diff --git a/tools/testing/selftests/kvm/demand_paging_test.c 
b/tools/testing/selftests/kvm/demand_paging_test.c
index 5f7a229c3af1..efbf0c1e9130 100644
--- a/tools/testing/selftests/kvm/demand_paging_test.c
+++ b/tools/testing/selftests/kvm/demand_paging_test.c
@@ -53,7 +53,7 @@ static void *vcpu_worker(void *data)
vcpu_args_set(vm, vcpu_id, 1, vcpu_id);
run = vcpu_state(vm, vcpu_id);
 
-   clock_gettime(CLOCK_MONOTONIC, &start);
+   clock_gettime(CLOCK_MONOTONIC_RAW, &start);
 
/* Let the guest access its memory */
ret = _vcpu_run(vm, vcpu_id);
@@ -86,7 +86,7 @@ static int handle_uffd_page_request(int uffd, uint64_t addr)
copy.len = perf_test_args.host_page_size;
copy.mode = 0;
 
-   clock_gettime(CLOCK_MONOTONIC, &start);
+   clock_gettime(CLOCK_MONOTONIC_RAW, &start);
 
r = ioctl(uffd, UFFDIO_COPY, ©);
if (r == -1) {
@@ -123,7 +123,7 @@ static void *uffd_handler_thread_fn(void *arg)
struct timespec start;
struct timespec ts_diff;
 
-   clock_gettime(CLOCK_MONOTONIC, &start);
+   clock_gettime(CLOCK_MONOTONIC_RAW, &start);
while (!quit_uffd_thread) {
struct uffd_msg msg;
struct pollfd pollfd[2];
@@ -336,7 +336,7 @@ static void run_test(enum vm_guest_mode mode, void *arg)
 
pr_info("Finished creating vCPUs and starting uffd threads\n");
 
-   clock_gettime(CLOCK_MONOTONIC, &start);
+   clock_gettime(CLOCK_MONOTONIC_RAW, &start);
 
for (vcpu_id = 0; vcpu_id < nr_vcpus; vcpu_id++) {
pthread_create(&vcpu_threads[vcpu_id], NULL, vcpu_worker,
diff --git a/tools/testing/selftests/kvm/dirty_log_perf_test.c 
b/tools/testing/selftests/kvm/dirty_log_perf_test.c
index 04a2641261be..6cff4ccf9525 100644
--- a/tools/testing/selftests/kvm/dirty_log_perf_test.c
+++ b/tools/testing/selftests/kvm/dirty_log_perf_test.c
@@ -50,7 +50,7 @@ static void *vcpu_worker(void *data)
while (!READ_ONCE(host_quit)) {
int current_iteration = READ_ONCE(iteration);
 
-   clock_gettime(CLOCK_MONOTONIC, &start);
+   clock_gettime(CLOCK_MONOTONIC_RAW, &start);
ret = _vcpu_run(vm, vcpu_id);
ts_diff = timespec_elapsed(start);
 
@@ -141,7 +141,7 @@ static void run_test(enum vm_guest_mode mode, void *arg)
iteration = 0;
host_quit = false;
 
-   clock_gettime(CLOCK_MONOTONIC, &start);
+   clock_gettime(CLOCK_MONOTONIC_RAW, &start);
for (vcpu_id = 0; vcpu_id < nr_vcpus; vcpu_id++) {
vcpu_last_completed_iteration[vcpu_id] = -1;
 
@@ -162,7 +162,7 @@ static void run_test(enum vm_guest_mode mode, void *arg)
ts_diff.tv_sec, ts_diff.tv_nsec);
 
/* Enable dirty logging */
-   clock_gettime(CLOCK_MONOTONIC, &start);
+   clock_gettime(CLOCK_MONOTONIC_RAW, &start);
vm_mem_region_set_flags(vm, PERF_TEST_MEM_SLOT_INDEX,
KVM_MEM_LOG_DIRTY_PAGES);
ts_diff = timespec_elapsed(start);
@@ -174,7 +174,7 @@ static void run_test(enum vm_guest_mode mode, void *arg)
 * Incrementing the iteration number will start the vCPUs
 * dirtying memory again.
 */
-   clock_gettime(CLOCK_MONOTONIC, &start);
+   clock_gettime(CLOCK_MONOTONIC_RAW, &start);
iteration++;
 
pr_debug("Starting iteration %d\n", iteration);
@@ -189,7 +189,7 @@ static void run_test(enum vm_guest_mode mode, void *arg)
pr_info("Iteration %d dirty memory time: %ld.%.9lds\n",
iteration, ts_diff.tv_sec, ts_diff.tv_nsec);
 
-   clock_gettime(CLOCK_MONOTONIC, &start);
+   clock_gettime(CLOCK_MONOTONIC_RAW, &start);
kvm_vm_get_dirty_log(vm, PERF_TEST_MEM_SLOT_INDEX, bmap);
 
ts_diff = timespec_elapsed(start);
@@ -199,7 +199,7 @@ static void run_test(enum vm_guest_mode mode, void *arg)
iteration, ts_diff.tv_sec, ts_diff.tv_nsec);
 
if (dirty_log_manual_caps) {
-   clock_gettime(CLOCK_MONOTONIC, &start);
+   clock_gettime(CLOCK_MONOTONIC_RAW, &start);
kvm_vm_clear_dirty_log(vm, PERF_TEST_MEM_SLOT_INDEX, 
bmap, 0,
 

[RFC PATCH v2 3/7] KVM: selftests: Make a generic helper to get vm guest mode strings

2021-02-24 Thread Yanan Wang
For generality and conciseness, make an API which can be used in all
kvm libs and selftests to get vm guest mode strings. And the index i
is checked in the API to guard against possible out-of-range faults.
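
A minimal usage sketch (NUM_VM_MODES and pr_info() are existing selftest
symbols):

    uint32_t i;

    for (i = 0; i < NUM_VM_MODES; i++)
        pr_info("%s\n", vm_guest_mode_string(i));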

Signed-off-by: Yanan Wang 
---
 .../testing/selftests/kvm/include/kvm_util.h  |  4 +--
 tools/testing/selftests/kvm/lib/kvm_util.c| 29 ---
 2 files changed, 19 insertions(+), 14 deletions(-)

diff --git a/tools/testing/selftests/kvm/include/kvm_util.h 
b/tools/testing/selftests/kvm/include/kvm_util.h
index 2d7eb6989e83..f52a7492f47f 100644
--- a/tools/testing/selftests/kvm/include/kvm_util.h
+++ b/tools/testing/selftests/kvm/include/kvm_util.h
@@ -68,9 +68,6 @@ enum vm_guest_mode {
 #define MIN_PAGE_SIZE  (1U << MIN_PAGE_SHIFT)
 #define PTES_PER_MIN_PAGE  ptes_per_page(MIN_PAGE_SIZE)
 
-#define vm_guest_mode_string(m) vm_guest_mode_string[m]
-extern const char * const vm_guest_mode_string[];
-
 struct vm_guest_mode_params {
unsigned int pa_bits;
unsigned int va_bits;
@@ -84,6 +81,7 @@ int vm_enable_cap(struct kvm_vm *vm, struct kvm_enable_cap 
*cap);
 int vcpu_enable_cap(struct kvm_vm *vm, uint32_t vcpu_id,
struct kvm_enable_cap *cap);
 void vm_enable_dirty_ring(struct kvm_vm *vm, uint32_t ring_size);
+const char *vm_guest_mode_string(uint32_t i);
 
 struct kvm_vm *vm_create(enum vm_guest_mode mode, uint64_t phy_pages, int 
perm);
 void kvm_vm_free(struct kvm_vm *vmp);
diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c 
b/tools/testing/selftests/kvm/lib/kvm_util.c
index d787cb802b4a..cc22c4ab7d67 100644
--- a/tools/testing/selftests/kvm/lib/kvm_util.c
+++ b/tools/testing/selftests/kvm/lib/kvm_util.c
@@ -141,17 +141,24 @@ static void vm_open(struct kvm_vm *vm, int perm)
"rc: %i errno: %i", vm->fd, errno);
 }
 
-const char * const vm_guest_mode_string[] = {
-   "PA-bits:52,  VA-bits:48,  4K pages",
-   "PA-bits:52,  VA-bits:48, 64K pages",
-   "PA-bits:48,  VA-bits:48,  4K pages",
-   "PA-bits:48,  VA-bits:48, 64K pages",
-   "PA-bits:40,  VA-bits:48,  4K pages",
-   "PA-bits:40,  VA-bits:48, 64K pages",
-   "PA-bits:ANY, VA-bits:48,  4K pages",
-};
-_Static_assert(sizeof(vm_guest_mode_string)/sizeof(char *) == NUM_VM_MODES,
-  "Missing new mode strings?");
+const char *vm_guest_mode_string(uint32_t i)
+{
+   static const char * const strings[] = {
+   [VM_MODE_P52V48_4K] = "PA-bits:52,  VA-bits:48,  4K pages",
+   [VM_MODE_P52V48_64K]= "PA-bits:52,  VA-bits:48, 64K pages",
+   [VM_MODE_P48V48_4K] = "PA-bits:48,  VA-bits:48,  4K pages",
+   [VM_MODE_P48V48_64K]= "PA-bits:48,  VA-bits:48, 64K pages",
+   [VM_MODE_P40V48_4K] = "PA-bits:40,  VA-bits:48,  4K pages",
+   [VM_MODE_P40V48_64K]= "PA-bits:40,  VA-bits:48, 64K pages",
+   [VM_MODE_PXXV48_4K] = "PA-bits:ANY, VA-bits:48,  4K pages",
+   };
+   _Static_assert(sizeof(strings)/sizeof(char *) == NUM_VM_MODES,
+  "Missing new mode strings?");
+
+   TEST_ASSERT(i < NUM_VM_MODES, "Guest mode ID %d too big", i);
+
+   return strings[i];
+}
 
 const struct vm_guest_mode_params vm_guest_mode_params[] = {
{ 52, 48,  0x1000, 12 },
-- 
2.19.1



[RFC PATCH v2 1/7] tools include: sync header files of mmap flag encodings about hugetlb

2021-02-24 Thread Yanan Wang
This patch syncs contents of tools/include/asm-generic/hugetlb_encode.h
and include/uapi/asm-generic/hugetlb_encode.h. Arch powerpc supports 16KB
hugepages and ARM64 supports 32MB/512MB hugepages. The corresponding mmap
flags have already been added in include/uapi/asm-generic/hugetlb_encode.h,
but not in tools/include/asm-generic/hugetlb_encode.h.
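
A compile-time sanity sketch of the encodings, assuming the synced header
is in the include path (the value in bits [26:31] is the page-size shift,
so 14 -> 16KB, 25 -> 32MB and 29 -> 512MB):

    #include <asm-generic/hugetlb_encode.h>

    _Static_assert((HUGETLB_FLAG_ENCODE_16KB >> HUGETLB_FLAG_ENCODE_SHIFT) == 14,
                   "16KB hugepage flag encodes shift 14");
    _Static_assert((HUGETLB_FLAG_ENCODE_32MB >> HUGETLB_FLAG_ENCODE_SHIFT) == 25,
                   "32MB hugepage flag encodes shift 25");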

Signed-off-by: Yanan Wang 
---
 tools/include/asm-generic/hugetlb_encode.h | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/tools/include/asm-generic/hugetlb_encode.h 
b/tools/include/asm-generic/hugetlb_encode.h
index e4732d3c2998..4f3d5aaa11f5 100644
--- a/tools/include/asm-generic/hugetlb_encode.h
+++ b/tools/include/asm-generic/hugetlb_encode.h
@@ -20,13 +20,16 @@
 #define HUGETLB_FLAG_ENCODE_SHIFT  26
 #define HUGETLB_FLAG_ENCODE_MASK   0x3f
 
+#define HUGETLB_FLAG_ENCODE_16KB   (14 << HUGETLB_FLAG_ENCODE_SHIFT)
 #define HUGETLB_FLAG_ENCODE_64KB   (16 << HUGETLB_FLAG_ENCODE_SHIFT)
 #define HUGETLB_FLAG_ENCODE_512KB  (19 << HUGETLB_FLAG_ENCODE_SHIFT)
 #define HUGETLB_FLAG_ENCODE_1MB(20 << 
HUGETLB_FLAG_ENCODE_SHIFT)
 #define HUGETLB_FLAG_ENCODE_2MB(21 << 
HUGETLB_FLAG_ENCODE_SHIFT)
 #define HUGETLB_FLAG_ENCODE_8MB(23 << 
HUGETLB_FLAG_ENCODE_SHIFT)
 #define HUGETLB_FLAG_ENCODE_16MB   (24 << HUGETLB_FLAG_ENCODE_SHIFT)
+#define HUGETLB_FLAG_ENCODE_32MB   (25 << HUGETLB_FLAG_ENCODE_SHIFT)
 #define HUGETLB_FLAG_ENCODE_256MB  (28 << HUGETLB_FLAG_ENCODE_SHIFT)
+#define HUGETLB_FLAG_ENCODE_512MB  (29 << HUGETLB_FLAG_ENCODE_SHIFT)
 #define HUGETLB_FLAG_ENCODE_1GB(30 << 
HUGETLB_FLAG_ENCODE_SHIFT)
 #define HUGETLB_FLAG_ENCODE_2GB(31 << 
HUGETLB_FLAG_ENCODE_SHIFT)
 #define HUGETLB_FLAG_ENCODE_16GB   (34 << HUGETLB_FLAG_ENCODE_SHIFT)
-- 
2.19.1



[RFC PATCH v2 4/7] KVM: selftests: Add a helper to get system configured THP page size

2021-02-24 Thread Yanan Wang
If we want to have some tests about transparent hugepages, the
system-configured THP hugepage size should be known to the tests, as it
can be used for various kinds of alignment or guest memory accesses by
vcpus. So it makes sense to add a helper to get the transparent hugepage
size.

With VM_MEM_SRC_ANONYMOUS_THP specified in vm_userspace_mem_region_add(),
we now stat /sys/kernel/mm/transparent_hugepage to check whether THP is
configured in the host kernel before madvise(). Based on this, we can also
read the file /sys/kernel/mm/transparent_hugepage/hpage_pmd_size to get the
THP hugepage size.

Signed-off-by: Yanan Wang 
---
 .../testing/selftests/kvm/include/test_util.h |  2 ++
 tools/testing/selftests/kvm/lib/test_util.c   | 36 +++
 2 files changed, 38 insertions(+)

diff --git a/tools/testing/selftests/kvm/include/test_util.h 
b/tools/testing/selftests/kvm/include/test_util.h
index b7f41399f22c..ef24c76ba89a 100644
--- a/tools/testing/selftests/kvm/include/test_util.h
+++ b/tools/testing/selftests/kvm/include/test_util.h
@@ -78,6 +78,8 @@ struct vm_mem_backing_src_alias {
enum vm_mem_backing_src_type type;
 };
 
+bool thp_configured(void);
+size_t get_trans_hugepagesz(void);
 void backing_src_help(void);
 enum vm_mem_backing_src_type parse_backing_src_type(const char *type_name);
 
diff --git a/tools/testing/selftests/kvm/lib/test_util.c 
b/tools/testing/selftests/kvm/lib/test_util.c
index c7c0627c6842..f2d133f76c67 100644
--- a/tools/testing/selftests/kvm/lib/test_util.c
+++ b/tools/testing/selftests/kvm/lib/test_util.c
@@ -10,6 +10,7 @@
 #include 
 #include 
 #include 
+#include 
 #include "linux/kernel.h"
 
 #include "test_util.h"
@@ -117,6 +118,41 @@ const struct vm_mem_backing_src_alias 
backing_src_aliases[] = {
{"anonymous_hugetlb", VM_MEM_SRC_ANONYMOUS_HUGETLB,},
 };
 
+bool thp_configured(void)
+{
+   int ret;
+   struct stat statbuf;
+
+   ret = stat("/sys/kernel/mm/transparent_hugepage", &statbuf);
+   TEST_ASSERT(ret == 0 || (ret == -1 && errno == ENOENT),
+   "Error in stating /sys/kernel/mm/transparent_hugepage: %d",
+   errno);
+
+   return ret == 0;
+}
+
+size_t get_trans_hugepagesz(void)
+{
+   size_t size;
+   char buf[16];
+   FILE *f;
+
+   TEST_ASSERT(thp_configured(), "THP is not configured in host kernel");
+
+   f = fopen("/sys/kernel/mm/transparent_hugepage/hpage_pmd_size", "r");
+   TEST_ASSERT(f != NULL,
+   "Error in opening transparent_hugepage/hpage_pmd_size: %d",
+   errno);
+
+   if (fread(buf, sizeof(char), sizeof(buf), f) == 0) {
+   fclose(f);
+   TEST_FAIL("Unable to read transparent_hugepage/hpage_pmd_size");
+   }
+
+   size = strtoull(buf, NULL, 10);
+   return size;
+}
+
 void backing_src_help(void)
 {
int i;
-- 
2.19.1



[RFC PATCH v2 0/7] Some improvement and a new test for kvm page table

2021-02-24 Thread Yanan Wang
Hi,
This v2 series can mainly include two parts.
Based on kvm queue branch: 
https://git.kernel.org/pub/scm/virt/kvm/kvm.git/log/?h=queue
Links of v1: 
https://lore.kernel.org/lkml/20210208090841.333724-1-wangyana...@huawei.com/

In the first part, all the known hugetlb backing src types specified
with different hugepage sizes are listed, so that we can specify use
of a hugetlb source of the exact granularity that we want, instead of
the system default one. And as all the known hugetlb page sizes are
listed, it's appropriate for all architectures. Besides, a helper that
can get the granularity of different backing src types (anonymous/thp/hugetlb)
is added, so that we can use the accurate backing src granularity for
various kinds of alignment or guest memory accesses by vcpus.

In the second part, a new test is added:
This test is added to serve as a performance tester and a bug reproducer
for kvm page table code (GPA->HPA mappings), and it gives guidance to
people trying to improve kvm. The following explains what exactly we can
do through this test.

The function guest_code() can cover the conditions where a single vcpu or
multiple vcpus access guest pages within the same memory region, in three
VM stages (before dirty logging, during dirty logging, after dirty logging).
Besides, the backing src memory type (ANONYMOUS/THP/HUGETLB) of the tested
memory region can be specified by users, which means users can choose to
have either normal page mappings or block mappings created in the test.

If ANONYMOUS memory is specified, kvm will create normal page mappings
for the tested memory region before dirty logging, and update attributes
of the page mappings from RO to RW during dirty logging. If THP/HUGETLB
memory is specified, kvm will create block mappings for the tested memory
region before dirty logging, and split the block mappings into normal page
mappings during dirty logging, and coalesce the page mappings back into
block mappings after dirty logging is stopped.

So in summary, as a performance tester, this test can present the
performance of kvm creating/updating normal page mappings, or the
performance of kvm creating/splitting/recovering block mappings,
through execution time.

When we need to coalesce the page mappings back to block mappings after
dirty logging is stopped, we have to invalidate *all* the TLB entries for
the page mappings right before installation of the block entry, because
a TLB conflict abort error could occur if we can't invalidate the TLB
entries fully. We have hit this TLB conflict twice on an aarch64 software
implementation and fixed it. As this test can simulate the process of a
VM with block mappings going from dirty logging enabled to dirty logging
stopped, it can also reproduce this TLB conflict abort caused by inadequate
TLB invalidation when coalescing tables.

Links about the TLB conflict abort:
https://lore.kernel.org/lkml/20201201201034.116760-3-wangyana...@huawei.com/

Yanan Wang (7):
  tools include: sync header files of mmap flag encodings about hugetlb
  KVM: selftests: Use flag CLOCK_MONOTONIC_RAW for timing
  KVM: selftests: Make a generic helper to get vm guest mode strings
  KVM: selftests: Add a helper to get system configured THP page size
  KVM: selftests: List all hugetlb src types specified with page sizes
  KVM: selftests: Adapt vm_userspace_mem_region_add to new helpers
  KVM: selftests: Add a test for kvm page table code

 tools/include/asm-generic/hugetlb_encode.h|   3 +
 tools/testing/selftests/kvm/Makefile  |   3 +
 .../selftests/kvm/demand_paging_test.c|   8 +-
 .../selftests/kvm/dirty_log_perf_test.c   |  14 +-
 .../testing/selftests/kvm/include/kvm_util.h  |   4 +-
 .../testing/selftests/kvm/include/test_util.h |  21 +-
 .../selftests/kvm/kvm_page_table_test.c   | 476 ++
 tools/testing/selftests/kvm/lib/kvm_util.c|  58 +--
 tools/testing/selftests/kvm/lib/test_util.c   |  92 +++-
 tools/testing/selftests/kvm/steal_time.c  |   4 +-
 10 files changed, 623 insertions(+), 60 deletions(-)
 create mode 100644 tools/testing/selftests/kvm/kvm_page_table_test.c

-- 
2.19.1



[RFC PATCH 1/4] KVM: arm64: Move the clean of dcache to the map handler

2021-02-08 Thread Yanan Wang
We currently uniformly clean the dcache in user_mem_abort() before
calling the fault handlers, if we take a translation fault and the pfn
is cacheable. But if there are concurrent translation faults on the same
page or block, cleaning the dcache for the first fault is necessary while
the later ones are not.

By moving the dcache clean into the map handler, we can easily identify
the conditions where CMOs are really needed and avoid the unnecessary
ones. As performing CMOs is a time-consuming process, especially when
flushing a block range, this solution reduces much of kvm's load and
improves the efficiency of creating mappings.
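
Condensed, the CMO policy after this patch becomes (a sketch using names
from the diff below):

    /* Clean the dcache only when actually installing a new cacheable
     * leaf PTE; stage2_flush_dcache() itself is a no-op when FWB
     * already guarantees cacheable guest accesses.
     */
    if (stage2_pte_cacheable(new))
        stage2_flush_dcache(__va(phys), granule);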

Signed-off-by: Yanan Wang 
---
 arch/arm64/include/asm/kvm_mmu.h | 16 --
 arch/arm64/kvm/hyp/pgtable.c | 38 
 arch/arm64/kvm/mmu.c | 14 +++-
 3 files changed, 27 insertions(+), 41 deletions(-)

diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h
index e52d82aeadca..4ec9879e82ed 100644
--- a/arch/arm64/include/asm/kvm_mmu.h
+++ b/arch/arm64/include/asm/kvm_mmu.h
@@ -204,22 +204,6 @@ static inline bool vcpu_has_cache_enabled(struct kvm_vcpu 
*vcpu)
return (vcpu_read_sys_reg(vcpu, SCTLR_EL1) & 0b101) == 0b101;
 }
 
-static inline void __clean_dcache_guest_page(kvm_pfn_t pfn, unsigned long size)
-{
-   void *va = page_address(pfn_to_page(pfn));
-
-   /*
-* With FWB, we ensure that the guest always accesses memory using
-* cacheable attributes, and we don't have to clean to PoC when
-* faulting in pages. Furthermore, FWB implies IDC, so cleaning to
-* PoU is not required either in this case.
-*/
-   if (cpus_have_const_cap(ARM64_HAS_STAGE2_FWB))
-   return;
-
-   kvm_flush_dcache_to_poc(va, size);
-}
-
 static inline void __invalidate_icache_guest_page(kvm_pfn_t pfn,
  unsigned long size)
 {
diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c
index 4d177ce1d536..2f4f87021980 100644
--- a/arch/arm64/kvm/hyp/pgtable.c
+++ b/arch/arm64/kvm/hyp/pgtable.c
@@ -464,6 +464,26 @@ static int stage2_map_set_prot_attr(enum kvm_pgtable_prot 
prot,
return 0;
 }
 
+static bool stage2_pte_cacheable(kvm_pte_t pte)
+{
+   u64 memattr = pte & KVM_PTE_LEAF_ATTR_LO_S2_MEMATTR;
+   return memattr == PAGE_S2_MEMATTR(NORMAL);
+}
+
+static void stage2_flush_dcache(void *addr, u64 size)
+{
+   /*
+* With FWB, we ensure that the guest always accesses memory using
+* cacheable attributes, and we don't have to clean to PoC when
+* faulting in pages. Furthermore, FWB implies IDC, so cleaning to
+* PoU is not required either in this case.
+*/
+   if (cpus_have_const_cap(ARM64_HAS_STAGE2_FWB))
+   return;
+
+   __flush_dcache_area(addr, size);
+}
+
 static int stage2_map_walker_try_leaf(u64 addr, u64 end, u32 level,
  kvm_pte_t *ptep,
  struct stage2_map_data *data)
@@ -495,6 +515,10 @@ static int stage2_map_walker_try_leaf(u64 addr, u64 end, 
u32 level,
put_page(page);
}
 
+   /* Flush data cache before installation of the new PTE */
+   if (stage2_pte_cacheable(new))
+   stage2_flush_dcache(__va(phys), granule);
+
smp_store_release(ptep, new);
get_page(page);
data->phys += granule;
@@ -651,20 +675,6 @@ int kvm_pgtable_stage2_map(struct kvm_pgtable *pgt, u64 
addr, u64 size,
return ret;
 }
 
-static void stage2_flush_dcache(void *addr, u64 size)
-{
-   if (cpus_have_const_cap(ARM64_HAS_STAGE2_FWB))
-   return;
-
-   __flush_dcache_area(addr, size);
-}
-
-static bool stage2_pte_cacheable(kvm_pte_t pte)
-{
-   u64 memattr = pte & KVM_PTE_LEAF_ATTR_LO_S2_MEMATTR;
-   return memattr == PAGE_S2_MEMATTR(NORMAL);
-}
-
 static int stage2_unmap_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep,
   enum kvm_pgtable_walk_flags flag,
   void * const arg)
diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
index 77cb2d28f2a4..d151927a7d62 100644
--- a/arch/arm64/kvm/mmu.c
+++ b/arch/arm64/kvm/mmu.c
@@ -609,11 +609,6 @@ void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm 
*kvm,
kvm_mmu_write_protect_pt_masked(kvm, slot, gfn_offset, mask);
 }
 
-static void clean_dcache_guest_page(kvm_pfn_t pfn, unsigned long size)
-{
-   __clean_dcache_guest_page(pfn, size);
-}
-
 static void invalidate_icache_guest_page(kvm_pfn_t pfn, unsigned long size)
 {
__invalidate_icache_guest_page(pfn, size);
@@ -882,9 +877,6 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, 
phys_addr_t fault_ipa,
if (writable)
prot |= KVM_PGTABLE_PROT_W;
 
-   if (fault_status != FSC_PERM && !device)
-   clean_dcache_guest

[RFC PATCH 4/4] KVM: arm64: Distinguish cases of memcache allocations completely

2021-02-08 Thread Yanan Wang
With a guest translation fault, the memcache pages are not needed if KVM
is only about to install a new leaf entry into the existing page table.
And with a guest permission fault, the memcache pages are also not needed
for a write_fault in dirty-logging time if KVM is only about to update
the existing leaf entry instead of collapsing a block entry into a table.

By comparing fault_granule and vma_pagesize, cases that require allocations
from memcache and cases that don't can be distinguished completely.
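
A worked example of the resulting check (names as in the hunk below): if
dirty logging forces 4KB mappings (vma_pagesize = 4KB) while the write
fault hits a 2MB block entry (fault_granule = 2MB), new page tables must
be created, so the memcache is topped up; if the two sizes are equal, KVM
only touches the existing leaf entry and no allocation is needed:

    if (fault_granule > vma_pagesize)
        ret = kvm_mmu_topup_memory_cache(memcache,
                                         kvm_mmu_cache_min_pages(kvm));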

Signed-off-by: Yanan Wang 
---
 arch/arm64/kvm/mmu.c | 25 -
 1 file changed, 12 insertions(+), 13 deletions(-)

diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
index d151927a7d62..550498a9104e 100644
--- a/arch/arm64/kvm/mmu.c
+++ b/arch/arm64/kvm/mmu.c
@@ -815,19 +815,6 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, 
phys_addr_t fault_ipa,
gfn = fault_ipa >> PAGE_SHIFT;
mmap_read_unlock(current->mm);
 
-   /*
-* Permission faults just need to update the existing leaf entry,
-* and so normally don't require allocations from the memcache. The
-* only exception to this is when dirty logging is enabled at runtime
-* and a write fault needs to collapse a block entry into a table.
-*/
-   if (fault_status != FSC_PERM || (logging_active && write_fault)) {
-   ret = kvm_mmu_topup_memory_cache(memcache,
-kvm_mmu_cache_min_pages(kvm));
-   if (ret)
-   return ret;
-   }
-
mmu_seq = vcpu->kvm->mmu_notifier_seq;
/*
 * Ensure the read of mmu_notifier_seq happens before we call
@@ -887,6 +874,18 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, 
phys_addr_t fault_ipa,
else if (cpus_have_const_cap(ARM64_HAS_CACHE_DIC))
prot |= KVM_PGTABLE_PROT_X;
 
+   /*
+* Allocations from the memcache are required only when granule of the
+* lookup level where the guest fault happened exceeds vma_pagesize,
+* which means new page tables will be created in the fault handlers.
+*/
+   if (fault_granule > vma_pagesize) {
+   ret = kvm_mmu_topup_memory_cache(memcache,
+kvm_mmu_cache_min_pages(kvm));
+   if (ret)
+   return ret;
+   }
+
/*
 * Under the premise of getting a FSC_PERM fault, we just need to relax
 * permissions only if vma_pagesize equals fault_granule. Otherwise,
-- 
2.23.0



[RFC PATCH 3/4] KVM: arm64: Install the block entry before unmapping the page mappings

2021-02-08 Thread Yanan Wang
When KVM needs to coalesce the normal page mappings into a block mapping,
we currently invalidate the old table entry first, followed by invalidation
of the TLB, then unmap the page mappings, and install the block entry at
last.

Unmapping the numerous page mappings will cost a long time, which means
there will be a long window during which the table entry can be found
invalid. If other vCPUs access any guest page within the block range and
find the table entry invalid, they will all exit from the guest with a
translation fault, which is unnecessary. And KVM will make efforts to
handle these faults, especially when performing CMOs by block range.

So let's install the block entry first to ensure uninterrupted memory
access by the other vCPUs, and then unmap the page mappings after
installation. This will reduce most of the window when the table entry is
invalid, and avoid most of the unnecessary translation faults.

Signed-off-by: Yanan Wang 
---
 arch/arm64/kvm/hyp/pgtable.c | 26 --
 1 file changed, 12 insertions(+), 14 deletions(-)

diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c
index 78a560446f80..308c36b9cd21 100644
--- a/arch/arm64/kvm/hyp/pgtable.c
+++ b/arch/arm64/kvm/hyp/pgtable.c
@@ -434,6 +434,7 @@ struct stage2_map_data {
kvm_pte_t   attr;
 
kvm_pte_t   *anchor;
+   kvm_pte_t   *follow;
 
struct kvm_s2_mmu   *mmu;
struct kvm_mmu_memory_cache *memcache;
@@ -553,15 +554,14 @@ static int stage2_map_walk_table_pre(u64 addr, u64 end, 
u32 level,
if (!kvm_block_mapping_supported(addr, end, data->phys, level))
return 0;
 
-   kvm_set_invalid_pte(ptep);
-
/*
-* Invalidate the whole stage-2, as we may have numerous leaf
-* entries below us which would otherwise need invalidating
-* individually.
+* If we need to coalesce existing table entries into a block here,
+* then install the block entry first and the sub-level page mappings
+* will be unmapped later.
 */
-   kvm_call_hyp(__kvm_tlb_flush_vmid, data->mmu);
data->anchor = ptep;
+   data->follow = kvm_pte_follow(*ptep);
+   stage2_coalesce_tables_into_block(addr, level, ptep, data);
return 0;
 }
 
@@ -614,20 +614,18 @@ static int stage2_map_walk_table_post(u64 addr, u64 end, 
u32 level,
  kvm_pte_t *ptep,
  struct stage2_map_data *data)
 {
-   int ret = 0;
-
if (!data->anchor)
return 0;
 
-   free_page((unsigned long)kvm_pte_follow(*ptep));
-   put_page(virt_to_page(ptep));
-
-   if (data->anchor == ptep) {
+   if (data->anchor != ptep) {
+   free_page((unsigned long)kvm_pte_follow(*ptep));
+   put_page(virt_to_page(ptep));
+   } else {
+   free_page((unsigned long)data->follow);
data->anchor = NULL;
-   ret = stage2_map_walk_leaf(addr, end, level, ptep, data);
}
 
-   return ret;
+   return 0;
 }
 
 /*
-- 
2.23.0



[RFC PATCH 0/4] KVM: arm64: Improve efficiency of stage2 page table

2021-02-08 Thread Yanan Wang
Hi,

This series makes some efficiency improvements to the stage-2 page table
code, and there are some test results to present the performance changes,
measured with a kvm selftest [1] that I have posted:
[1]
https://lore.kernel.org/lkml/20210208090841.333724-1-wangyana...@huawei.com/

About patch 1:
We currently uniformly clean the dcache in user_mem_abort() before calling
the fault handlers, if we take a translation fault and the pfn is cacheable.
But if there are concurrent translation faults on the same page or block,
the dcache clean for the first fault is necessary while the others are not.

By moving the dcache clean into the map handler, we can easily identify the
conditions where CMOs are really needed and avoid the unnecessary ones.
Performing CMOs is time consuming, especially when flushing a block range,
so this solution takes much load off KVM and improves the efficiency of
creating mappings.
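
To make the condition concrete, here is a minimal, self-contained model of
the idea (illustrative userspace C; the helper names are made up and this is
not the kernel code): only the walker that actually installs a new leaf
entry performs the CMO, so a concurrent faulter that finds the exact same
mapping already valid skips it.

#include <stdio.h>

typedef unsigned long kvm_pte_t;

#define PTE_VALID (1UL << 0)

/* Illustrative stand-in for clean_dcache_guest_page(). */
static void clean_dcache(unsigned long pa, unsigned long size)
{
        printf("CMO on PA 0x%lx, size 0x%lx\n", pa, size);
}

static void try_install_leaf(kvm_pte_t *ptep, kvm_pte_t new,
                             unsigned long pa, unsigned long granule)
{
        kvm_pte_t old = *ptep;

        /* A racing fault that finds the exact same valid mapping skips the CMO. */
        if ((old & PTE_VALID) && old == new)
                return;

        clean_dcache(pa, granule);      /* only the first mapper pays for this */
        *ptep = new;
}

int main(void)
{
        kvm_pte_t pte = 0;
        kvm_pte_t new = 0x40000000UL | PTE_VALID;

        try_install_leaf(&pte, new, 0x40000000UL, 1UL << 30);  /* cleans */
        try_install_leaf(&pte, new, 0x40000000UL, 1UL << 30);  /* skips  */
        return 0;
}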

Test results:
(1) when 20 vCPUs concurrently access 20G ram (all 1G hugepages):
KVM create block mappings time: 52.83s -> 3.70s
KVM recover block mappings time (after dirty-logging): 52.0s -> 2.87s

(2) when 40 vCPUs concurrently access 20G ram (all 1G hugepages):
KVM create block mappings time: 104.56s -> 3.70s
KVM recover block mappings time (after dirty-logging): 103.93s -> 2.96s

About patch 2, 3:
When KVM needs to coalesce the normal page mappings into a block mapping,
we currently invalidate the old table entry first, followed by invalidation
of the TLB; then we unmap the page mappings, and install the block entry last.

Unmapping the numerous page mappings takes a lot of time, which means the
table entry will be left invalid for a long time before installation of the
block entry, and this will cause many spurious translation faults.

So let's install the block entry first to ensure uninterrupted memory
access by the other vCPUs, and then unmap the page mappings after
installation. This eliminates most of the window during which the table
entry is invalid, and avoids most of the unnecessary translation faults.
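
Schematically, the reordering looks like this (a sketch of the walker
steps, not the actual code; patches 2 and 3 contain the real
implementation):

/*
 * Sketch of the reordering in the table-pre walker:
 *
 *   old order: invalidate table entry -> TLBI -> unmap page mappings
 *              -> install block entry
 *   new order: invalidate table entry -> TLBI -> install block entry
 *              -> free the old sub-level tables afterwards
 *
 * The window with no valid entry shrinks from the whole unmap phase
 * (proportional to the number of sub-level mappings) to almost nothing.
 */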

Test results based on patch 1:
(1) when 20 vCPUs concurrently access 20G ram (all 1G hugepages):
KVM recover block mappings time (after dirty-logging): 2.87s -> 0.30s

(2) when 40 vCPUs concurrently access 20G ram (all 1G hugepages):
KVM recover block mappings time (after dirty-logging): 2.96s -> 0.35s

So combined with patch 1, these patches make a big difference to KVM
creating mappings and recovering block mappings, with not much code change.

About patch 4:
A new method to distinguish cases of memcache allocations is introduced.
By comparing fault_granule and vma_pagesize, cases that require allocations
from memcache and cases that don't can be distinguished completely.
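
As shown in the sketch below (self-contained, illustrative userspace C; the
real check is in patch 4), the whole decision reduces to one comparison:

#include <stdbool.h>
#include <stdio.h>

static bool need_memcache_topup(unsigned long fault_granule,
                                unsigned long vma_pagesize)
{
        /*
         * New page tables are created only when the granule of the
         * faulting lookup level is larger than the mapping about to
         * be installed.
         */
        return fault_granule > vma_pagesize;
}

int main(void)
{
        /* 1G fault granule, 2M mapping: tables must be created -> 1 */
        printf("%d\n", need_memcache_topup(1UL << 30, 1UL << 21));
        /* 4K fault granule, 4K mapping: leaf update only -> 0 */
        printf("%d\n", need_memcache_topup(1UL << 12, 1UL << 12));
        return 0;
}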

---

Details of test results
platform: HiSilicon Kunpeng920 (FWB not supported)
host kernel: Linux mainline (v5.11-rc6)

(1) performance change of patch 1
cmdline: ./kvm_page_table_test -m 4 -t 2 -g 1G -s 20G -v 20
   (20 vcpus, 20G memory, block mappings (granule 1G))
Before patch: KVM_CREATE_MAPPINGS: 52.8338s 52.8327s 52.8336s 52.8255s 52.8303s
After  patch: KVM_CREATE_MAPPINGS:  3.7022s  3.7031s  3.7028s  3.7012s  3.7024s

Before patch: KVM_ADJUST_MAPPINGS: 52.0466s 52.0473s 52.0550s 52.0518s 52.0467s
After  patch: KVM_ADJUST_MAPPINGS:  2.8787s  2.8781s  2.8785s  2.8742s  2.8759s

cmdline: ./kvm_page_table_test -m 4 -t 2 -g 1G -s 20G -v 40
   (40 vcpus, 20G memory, block mappings (granule 1G))
Before patch: KVM_CREATE_MAPPINGS: 104.560s 104.556s 104.554s 104.556s 104.550s
After  patch: KVM_CREATE_MAPPINGS:  3.7011s  3.7103s  3.7005s  3.7024s  3.7106s

Before patch: KVM_ADJUST_MAPPINGS: 103.931s 103.936s 103.927s 103.942s 103.927s
After  patch: KVM_ADJUST_MAPPINGS:  2.9621s  2.9648s  2.9474s  2.9587s  2.9603s

(2) performance change of patch 2, 3(based on patch 1)
cmdline: ./kvm_page_table_test -m 4 -t 2 -g 1G -s 20G -v 1
   (1 vcpu, 20G memory, block mappings (granule 1G))
Before patch: KVM_ADJUST_MAPPINGS: 2.8241s 2.8234s 2.8245s 2.8230s 2.8652s
After  patch: KVM_ADJUST_MAPPINGS: 0.2444s 0.2442s 0.2423s 0.2441s 0.2429s

cmdline: ./kvm_page_table_test -m 4 -t 2 -g 1G -s 20G -v 20
   (20 vcpus, 20G memory, block mappings (granule 1G))
Before patch: KVM_ADJUST_MAPPINGS: 2.8787s 2.8781s 2.8785s 2.8742s 2.8759s
After  patch: KVM_ADJUST_MAPPINGS: 0.3008s 0.3004s 0.2974s 0.2917s 0.2900s

cmdline: ./kvm_page_table_test -m 4 -t 2 -g 1G -s 20G -v 40
   (40 vcpus, 20G memory, block mappings (granule 1G))
Before patch: KVM_ADJUST_MAPPINGS: 2.9621s 2.9648s 2.9474s 2.9587s 2.9603s
After  patch: KVM_ADJUST_MAPPINGS: 0.3541s 0.3694s 0.3656s 0.3693s 0.3687s

---

Yanan Wang (4):
  KVM: arm64: Move the clean of dcache to the map handler
  KVM: arm64: Add an independent API for coalescing tables
  KVM: arm64: Install the block entry before unmapping the page mappings
  KVM: arm64: Distinguish cases of memcache allocations completely

 arch/arm64/include/asm/kvm_mmu.h

[RFC PATCH 2/4] KVM: arm64: Add an independent API for coalescing tables

2021-02-08 Thread Yanan Wang
Process of coalescing page mappings back to a block mapping is different
from normal map path, such as TLB invalidation and CMOs, so here add an
independent API for this case.

Signed-off-by: Yanan Wang 
---
 arch/arm64/kvm/hyp/pgtable.c | 18 ++
 1 file changed, 18 insertions(+)

diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c
index 2f4f87021980..78a560446f80 100644
--- a/arch/arm64/kvm/hyp/pgtable.c
+++ b/arch/arm64/kvm/hyp/pgtable.c
@@ -525,6 +525,24 @@ static int stage2_map_walker_try_leaf(u64 addr, u64 end, 
u32 level,
return 0;
 }
 
+static void stage2_coalesce_tables_into_block(u64 addr, u32 level,
+ kvm_pte_t *ptep,
+ struct stage2_map_data *data)
+{
+   u64 granule = kvm_granule_size(level), phys = data->phys;
+   kvm_pte_t new = kvm_init_valid_leaf_pte(phys, data->attr, level);
+
+   kvm_set_invalid_pte(ptep);
+
+   /*
+* Invalidate the whole stage-2, as we may have numerous leaf entries
+* below us which would otherwise need invalidating individually.
+*/
+   kvm_call_hyp(__kvm_tlb_flush_vmid, data->mmu);
+   smp_store_release(ptep, new);
+   data->phys += granule;
+}
+
 static int stage2_map_walk_table_pre(u64 addr, u64 end, u32 level,
 kvm_pte_t *ptep,
 struct stage2_map_data *data)
-- 
2.23.0



[RFC PATCH 0/2] Add a test for kvm page table code

2021-02-08 Thread Yanan Wang
Hi,

This test is added to serve as a performance tester and a bug reproducer
for kvm page table code (GPA->HPA mappings), and it gives guidance for
people trying to make improvements to kvm.

The following explains what exactly we can do through this test.
An RFC is sent for comments, thanks.

The function guest_code() is designed to cover conditions where a single vcpu
or multiple vcpus access guest pages within the same memory range, in three
VM stages (before dirty-logging, during dirty-logging, after dirty-logging).
Besides, the backing source memory type (ANONYMOUS/THP/HUGETLB) of the tested
memory region can be specified by users, which means normal page mappings or
block mappings can be chosen by users to be created in the test.

If use of ANONYMOUS memory is specified, kvm will create page mappings for the
tested memory region before dirty-logging, and update attributes of the page
mappings from RO to RW during dirty-logging. If use of THP/HUGETLB memory is
specified, kvm will create block mappings for the tested memory region before
dirty-logging, split the block mappings into page mappings during
dirty-logging, and coalesce the page mappings back into block mappings after
dirty-logging is stopped.

So in summary, as a performance tester, this test can present the performance
of kvm creating/updating normal page mappings, or the performance of kvm
creating/splitting/recovering block mappings, through execution time.
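
A rough, self-contained model of the access pattern (illustrative userspace
C; the real guest_code() runs inside the guest and synchronizes with the
host through the test harness):

#include <stdint.h>
#include <stdlib.h>

#define TEST_MEM_SIZE   (1UL << 20)     /* illustrative; the test takes -s */
#define GUEST_PAGE_SIZE (1UL << 12)

static void touch_region(uint8_t *base, int do_write)
{
        unsigned long off;

        for (off = 0; off < TEST_MEM_SIZE; off += GUEST_PAGE_SIZE) {
                if (do_write)
                        base[off] = 1;  /* write: forces map/update */
                else
                        (void)*(volatile uint8_t *)(base + off); /* read only */
        }
}

int main(void)
{
        uint8_t *mem = calloc(1, TEST_MEM_SIZE);

        if (!mem)
                return 1;
        touch_region(mem, 1);   /* stage 1: create the mappings */
        touch_region(mem, 1);   /* stage 2: update mappings under dirty logging */
        touch_region(mem, 0);   /* stage 3: access after dirty logging stops */
        free(mem);
        return 0;
}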

When we need to coalesce the page mappings back to block mappings after dirty
logging is stopped, we have to first invalidate *all* the TLB entries for the
page mappings right before installation of the block entry, because a TLB
conflict abort could occur if we can't invalidate the TLB entries fully. We
have hit this TLB conflict twice on an aarch64 software implementation and
fixed it. As this test can simulate the process of a VM with block mappings
going from dirty-logging enabled to dirty-logging stopped, it can also
reproduce this TLB conflict abort due to inadequate TLB invalidation when
coalescing tables.

Links about the TLB conflict abort:
https://lore.kernel.org/lkml/20201201201034.116760-3-wangyana...@huawei.com/

---

Here are some test examples of this test:
platform: HiSilicon Kunpeng920 (aarch64, FWB not supported)
host kernel: Linux mainline

(1) Based on v5.11-rc6

cmdline: ./kvm_page_table_test -m 4 -t 0 -g 4K -s 1G -v 1
   (1 vcpu, 1G memory, page mappings (granule 4K))
KVM_CREATE_MAPPINGS: 0.8196s 0.8260s 0.8258s 0.8169s 0.8190s
KVM_UPDATE_MAPPINGS: 1.1930s 1.1949s 1.1940s 1.1934s 1.1946s

cmdline: ./kvm_page_table_test -m 4 -t 0 -g 4K -s 1G -v 20
   (20 vcpus, 1G memory, page mappings (granule 4K))
KVM_CREATE_MAPPINGS: 23.4028s 23.8015s 23.6702s 23.9437s 22.1646s
KVM_UPDATE_MAPPINGS: 16.9550s 16.4734s 16.8300s 16.9621s 16.9402s

cmdline: ./kvm_page_table_test -m 4 -t 2 -g 1G -s 20G -v 1
   (1 vcpu, 20G memory, block mappings (granule 1G))
KVM_CREATE_MAPPINGS: 3.7040s 3.7053s 3.7047s 3.7061s 3.7068s
KVM_ADJUST_MAPPINGS: 2.8264s 2.8266s 2.8272s 2.8259s 2.8283s

cmdline: ./kvm_page_table_test -m 4 -t 2 -g 1G -s 20G -v 20
   (20 vcpus, 20G memory, block mappings (granule 1G))
KVM_CREATE_MAPPINGS: 52.8338s 52.8327s 52.8336s 52.8255s 52.8303s
KVM_ADJUST_MAPPINGS: 52.0466s 52.0473s 52.0550s 52.0518s 52.0467s

(2) I have posted a patch series to improve the efficiency of the stage-2
page table code, so here are the performance changes with it applied.

cmdline: ./kvm_page_table_test -m 4 -t 2 -g 1G -s 20G -v 20
   (20 vcpus, 20G memory, block mappings (granule 1G))
Before patch: KVM_CREATE_MAPPINGS: 52.8338s 52.8327s 52.8336s 52.8255s 52.8303s
After  patch: KVM_CREATE_MAPPINGS:  3.7022s  3.7031s  3.7028s  3.7012s  3.7024s

Before patch: KVM_ADJUST_MAPPINGS: 52.0466s 52.0473s 52.0550s 52.0518s 52.0467s
After  patch: KVM_ADJUST_MAPPINGS:  0.3008s  0.3004s  0.2974s  0.2917s  0.2900s

cmdline: ./kvm_page_table_test -m 4 -t 2 -g 1G -s 20G -v 40
   (40 vcpus, 20G memory, block mappings (granule 1G))
Before patch: KVM_CREATE_MAPPINGS: 104.560s 104.556s 104.554s 104.556s 104.550s
After  patch: KVM_CREATE_MAPPINGS:  3.7011s  3.7103s  3.7005s  3.7024s  3.7106s

Before patch: KVM_ADJUST_MAPPINGS: 103.931s 103.936s 103.927s 103.942s 103.927s
After  patch: KVM_ADJUST_MAPPINGS:  0.3541s  0.3694s  0.3656s  0.3693s  0.3687s

---

Yanan Wang (2):
  KVM: selftests: Add a macro to get string of vm_mem_backing_src_type
  KVM: selftests: Add a test for kvm page table code

 tools/testing/selftests/kvm/Makefile  |   3 +
 .../testing/selftests/kvm/include/kvm_util.h  |   3 +
 .../selftests/kvm/kvm_page_table_test.c   | 518 ++
 tools/testing/selftests/kvm/lib/kvm_util.c|   8 +
 4 files changed, 532 insertions(+)
 create mode 100644 tools/testing/selftests/kvm/kvm_page_table_test.c

-- 
2.23.0



[RFC PATCH 1/2] KVM: selftests: Add a macro to get string of vm_mem_backing_src_type

2021-02-08 Thread Yanan Wang
Add a macro to get the string of the backing source memory type, so that
an application can add choices for source types in its help() function,
and users can specify which type to use for testing.

Signed-off-by: Yanan Wang 
---
 tools/testing/selftests/kvm/include/kvm_util.h | 3 +++
 tools/testing/selftests/kvm/lib/kvm_util.c | 8 
 2 files changed, 11 insertions(+)

diff --git a/tools/testing/selftests/kvm/include/kvm_util.h 
b/tools/testing/selftests/kvm/include/kvm_util.h
index 5cbb861525ed..f5fc29dc9ee6 100644
--- a/tools/testing/selftests/kvm/include/kvm_util.h
+++ b/tools/testing/selftests/kvm/include/kvm_util.h
@@ -69,7 +69,9 @@ enum vm_guest_mode {
 #define PTES_PER_MIN_PAGE  ptes_per_page(MIN_PAGE_SIZE)
 
 #define vm_guest_mode_string(m) vm_guest_mode_string[m]
+#define vm_mem_backing_src_type_string(s) vm_mem_backing_src_type_string[s]
 extern const char * const vm_guest_mode_string[];
+extern const char * const vm_mem_backing_src_type_string[];
 
 struct vm_guest_mode_params {
unsigned int pa_bits;
@@ -83,6 +85,7 @@ enum vm_mem_backing_src_type {
VM_MEM_SRC_ANONYMOUS,
VM_MEM_SRC_ANONYMOUS_THP,
VM_MEM_SRC_ANONYMOUS_HUGETLB,
+   NUM_VM_BACKING_SRC_TYPES,
 };
 
 int kvm_check_cap(long cap);
diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c 
b/tools/testing/selftests/kvm/lib/kvm_util.c
index fa5a90e6c6f0..a9b651c7f866 100644
--- a/tools/testing/selftests/kvm/lib/kvm_util.c
+++ b/tools/testing/selftests/kvm/lib/kvm_util.c
@@ -165,6 +165,14 @@ const struct vm_guest_mode_params vm_guest_mode_params[] = 
{
 _Static_assert(sizeof(vm_guest_mode_params)/sizeof(struct 
vm_guest_mode_params) == NUM_VM_MODES,
   "Missing new mode params?");
 
+const char * const vm_mem_backing_src_type_string[] = {
+   "VM_MEM_SRC_ANONYMOUS",
+   "VM_MEM_SRC_ANONYMOUS_THP",
+   "VM_MEM_SRC_ANONYMOUS_HUGETLB",
+};
+_Static_assert(sizeof(vm_mem_backing_src_type_string)/sizeof(char *) == 
NUM_VM_BACKING_SRC_TYPES,
+  "Missing new source type strings?");
+
 /*
  * VM Create
  *
-- 
2.23.0



[RFC PATCH 2/2] KVM: selftests: Add a test for kvm page table code

2021-02-08 Thread Yanan Wang
This test serves as a performance tester and a bug reproducer for
kvm page table code (GPA->HPA mappings), so it gives guidance for
people trying to make improvements to kvm.

The function guest_code() is designed to cover conditions where a single vcpu
or multiple vcpus access guest pages within the same memory range, in three
VM stages (before dirty-logging, during dirty-logging, after dirty-logging).
Besides, the backing source memory type (ANONYMOUS/THP/HUGETLB) of the tested
memory region can be specified by users, which means normal page mappings or
block mappings can be chosen by users to be created in the test.

If use of ANONYMOUS memory is specified, kvm will create page mappings for the
tested memory region before dirty-logging, and update attributes of the page
mappings from RO to RW during dirty-logging. If use of THP/HUGETLB memory is
specified, kvm will create block mappings for the tested memory region before
dirty-logging, split the block mappings into page mappings during
dirty-logging, and coalesce the page mappings back into block mappings after
dirty-logging is stopped.

So in summary, as a performance tester, this test can present the performance
of kvm creating/updating normal page mappings, or the performance of kvm
creating/splitting/recovering block mappings, through execution time.

When we need to coalesce the page mappings back to block mappings after dirty
logging is stopped, we have to first invalidate *all* the TLB entries for the
page mappings right before installation of the block entry, because a TLB
conflict abort could occur if we can't invalidate the TLB entries fully. We
have hit this TLB conflict twice on an aarch64 software implementation and
fixed it. As this test can simulate the process of a VM with block mappings
going from dirty-logging enabled to dirty-logging stopped, it can also
reproduce this TLB conflict abort due to inadequate TLB invalidation when
coalescing tables.

Signed-off-by: Yanan Wang 
---
 tools/testing/selftests/kvm/Makefile  |   3 +
 .../selftests/kvm/kvm_page_table_test.c   | 518 ++
 2 files changed, 521 insertions(+)
 create mode 100644 tools/testing/selftests/kvm/kvm_page_table_test.c

diff --git a/tools/testing/selftests/kvm/Makefile 
b/tools/testing/selftests/kvm/Makefile
index fe41c6a0fa67..697318019bd4 100644
--- a/tools/testing/selftests/kvm/Makefile
+++ b/tools/testing/selftests/kvm/Makefile
@@ -62,6 +62,7 @@ TEST_GEN_PROGS_x86_64 += x86_64/tsc_msrs_test
 TEST_GEN_PROGS_x86_64 += demand_paging_test
 TEST_GEN_PROGS_x86_64 += dirty_log_test
 TEST_GEN_PROGS_x86_64 += dirty_log_perf_test
+TEST_GEN_PROGS_x86_64 += kvm_page_table_test
 TEST_GEN_PROGS_x86_64 += kvm_create_max_vcpus
 TEST_GEN_PROGS_x86_64 += set_memory_region_test
 TEST_GEN_PROGS_x86_64 += steal_time
@@ -71,6 +72,7 @@ TEST_GEN_PROGS_aarch64 += aarch64/get-reg-list-sve
 TEST_GEN_PROGS_aarch64 += demand_paging_test
 TEST_GEN_PROGS_aarch64 += dirty_log_test
 TEST_GEN_PROGS_aarch64 += dirty_log_perf_test
+TEST_GEN_PROGS_aarch64 += kvm_page_table_test
 TEST_GEN_PROGS_aarch64 += kvm_create_max_vcpus
 TEST_GEN_PROGS_aarch64 += set_memory_region_test
 TEST_GEN_PROGS_aarch64 += steal_time
@@ -80,6 +82,7 @@ TEST_GEN_PROGS_s390x += s390x/resets
 TEST_GEN_PROGS_s390x += s390x/sync_regs_test
 TEST_GEN_PROGS_s390x += demand_paging_test
 TEST_GEN_PROGS_s390x += dirty_log_test
+TEST_GEN_PROGS_s390x += kvm_page_table_test
 TEST_GEN_PROGS_s390x += kvm_create_max_vcpus
 TEST_GEN_PROGS_s390x += set_memory_region_test
 
diff --git a/tools/testing/selftests/kvm/kvm_page_table_test.c 
b/tools/testing/selftests/kvm/kvm_page_table_test.c
new file mode 100644
index ..b09c05288937
--- /dev/null
+++ b/tools/testing/selftests/kvm/kvm_page_table_test.c
@@ -0,0 +1,518 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * KVM page table test
+ * Based on dirty_log_test.c
+ * Based on dirty_log_perf_test.c
+ *
+ * Copyright (C) 2018, Red Hat, Inc.
+ * Copyright (C) 2020, Google, Inc.
+ * Copyright (C) 2021, Huawei, Inc.
+ *
+ * Make sure that enough THP/HUGETLB pages have been allocated on systems
+ * to cover the testing memory region before running this program, if you
+ * wish to create block mappings in this test.
+ */
+
+#define _GNU_SOURCE /* for program_invocation_name */
+
+#include 
+#include 
+#include 
+#include 
+
+#include "test_util.h"
+#include "kvm_util.h"
+#include "processor.h"
+#include "guest_modes.h"
+
+#define TEST_MEM_SLOT_INDEX 1
+
+/* Default size(1GB) of the memory for testing */
+#define DEFAULT_TEST_MEM_SIZE  (1 << 30)
+
+/* Default guest test virtual memory offset */
+#define DEFAULT_GUEST_TEST_MEM 0xc000
+
+/* Different memory accessing types for a vcpu */
+enum access_type {
+   ACCESS_TYPE_READ,
+   ACCESS_TYPE_WRITE,
+   NUM_ACCESS_TYPES,
+};
+
+/* Different memory accessing stages for a vcpu */
+enum test_stage {
+ 

[RFC PATCH v1 5/5] KVM: arm64: Adapt page-table code to new handling of coalescing tables

2021-01-26 Thread Yanan Wang
With the new handling of coalescing tables, we can install the block entry
before unmapping the old table mappings. So make the installation in
stage2_map_walk_table_pre(), and elide it from stage2_map_walk_table_post().

Signed-off-by: Yanan Wang 
---
 arch/arm64/kvm/hyp/pgtable.c | 25 -
 1 file changed, 12 insertions(+), 13 deletions(-)

diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c
index ab1c94985ed0..fb755aac4384 100644
--- a/arch/arm64/kvm/hyp/pgtable.c
+++ b/arch/arm64/kvm/hyp/pgtable.c
@@ -436,6 +436,7 @@ struct stage2_map_data {
kvm_pte_t   attr;
 
kvm_pte_t   *anchor;
+   kvm_pte_t   *follow;
 
struct kvm_s2_mmu   *mmu;
struct kvm_mmu_memory_cache *memcache;
@@ -550,13 +551,13 @@ static int stage2_map_walk_table_pre(u64 addr, u64 end, 
u32 level,
kvm_set_invalid_pte(ptep);
 
/*
-* Invalidate the whole stage-2, as we may have numerous leaf
-* entries below us which would otherwise need invalidating
-* individually.
+* If there is an existing table entry and block mapping is needed here,
+* then set the anchor and replace it with a block entry. The sub-level
+* mappings will later be unmapped lazily.
 */
-   kvm_call_hyp(__kvm_tlb_flush_vmid, data->mmu);
data->anchor = ptep;
-   return 0;
+   data->follow = kvm_pte_follow(*ptep);
+   return stage2_coalesce_tables_into_block(addr, level, ptep, data);
 }
 
 static int stage2_map_walk_leaf(u64 addr, u64 end, u32 level, kvm_pte_t *ptep,
@@ -608,20 +609,18 @@ static int stage2_map_walk_table_post(u64 addr, u64 end, 
u32 level,
  kvm_pte_t *ptep,
  struct stage2_map_data *data)
 {
-   int ret = 0;
-
if (!data->anchor)
return 0;
 
-   free_page((unsigned long)kvm_pte_follow(*ptep));
-   put_page(virt_to_page(ptep));
-
-   if (data->anchor == ptep) {
+   if (data->anchor != ptep) {
+   free_page((unsigned long)kvm_pte_follow(*ptep));
+   put_page(virt_to_page(ptep));
+   } else {
+   free_page((unsigned long)data->follow);
data->anchor = NULL;
-   ret = stage2_map_walk_leaf(addr, end, level, ptep, data);
}
 
-   return ret;
+   return 0;
 }
 
 /*
-- 
2.19.1



[RFC PATCH v1 0/5] Enable CPU TTRem feature for stage-2

2021-01-26 Thread Yanan Wang
Hi all,
This series enables the CPU TTRem feature for the stage-2 page table, and an
RFC is sent for comments, thanks.

The ARMv8.4 TTRem feature offers 3 levels of support when changing block
size without changing any other parameters that are listed as requiring use
of break-before-make. I found that we may be able to use this feature to
improve the stage-2 page table, and the following explains what TTRem
exactly does for the improvement.

If migration of a VM with hugepages is canceled midway, KVM will adjust the
stage-2 table mappings back to block mappings. We currently use BBM to replace
the table entry with a block entry. Take adjustment of 1G block mapping as an
example, with BBM procedures, we have to invalidate the old table entry first,
flush TLB and unmap the old table mappings, right before installing the new
block entry.

So there will be a fairly long period when the old table entry is invalid
before installation of the new block entry. If other vCPUs access any guest
page within the 1G range during this period and find the table entry invalid,
they will all exit from the guest with a translation fault. Actually, these
translation faults are not necessary, because the block mapping will be built
later. Besides, KVM will still try to build 1G block mappings for these
spurious translation faults, and will perform cache maintenance operations,
a page table walk, etc.

In summary, the spurious faults are caused by the invalidation in the BBM
procedures. The approaches of TTRem levels 1 and 2 ensure that there will not
be a moment when the old table entry is invalid before installation of the
new block entry. However, the level-2 method can lead to a troublesome TLB
conflict, so we use nT in both the level-1 and level-2 cases to avoid
handling TLB conflict aborts.

For an implementation which meets level 1 or level 2, the CPU has two
responses to choose from when accessing a block table entry with the nT bit
set: firstly, the CPU can generate a translation fault, whose effect is
similar to BBM; secondly, the CPU can use the block entry for translation.
So with the second kind of implementation, the spurious translation faults
described above can be prevented.
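
Schematically, the nT-based replacement used at levels 1 and 2 looks like
this (a sketch only; patch 4 contains the actual code):

/*
 * nT-based block replacement, levels 1 and 2 (sketch):
 *
 *   1. write the new block entry with the nT bit set; depending on the
 *      implementation, translations either fault or use the block entry
 *   2. invalidate the stale TLB entries for the old table mappings
 *   3. rewrite the block entry with the nT bit cleared
 *
 * The entry is never invalid, so no spurious translation faults occur.
 */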

Yanan Wang (5):
  KVM: arm64: Detect the ARMv8.4 TTRem feature
  KVM: arm64: Add an API to get level of TTRem supported by hardware
  KVM: arm64: Support usage of TTRem in guest stage-2 translation
  KVM: arm64: Add handling of coalescing tables into a block mapping
  KVM: arm64: Adapt page-table code to new handling of coalescing tables

 arch/arm64/include/asm/cpucaps.h|  3 +-
 arch/arm64/include/asm/cpufeature.h | 13 ++
 arch/arm64/kernel/cpufeature.c  | 10 +
 arch/arm64/kvm/hyp/pgtable.c| 62 +++--
 4 files changed, 74 insertions(+), 14 deletions(-)

-- 
2.19.1



[RFC PATCH v1 4/5] KVM: arm64: Add handling of coalescing tables into a block mapping

2021-01-26 Thread Yanan Wang
If migration of a VM with hugepages is canceled midway, KVM will adjust
the stage-2 table mappings back to block mappings. We currently use BBM
to replace the table entry with a block entry. Take adjustment of 1G block
mapping as an example, with BBM procedures, we have to invalidate the old
table entry of level 1 first, flush TLB and unmap the old table mappings,
right before installing the new block entry.

So there will be a fairly long period when the table entry of level 1 is
invalid before installation of the block entry. If other vCPUs access any
guest page within the 1G range during this period and find the table
entry invalid, they will all exit from the guest with a translation fault.
Actually, these translation faults are not necessary, because the block
mapping will be built later. Besides, KVM will try to build 1G block
mappings for these translation faults, and will perform cache maintenance
operations, a page table walk, etc.

The approaches of TTRem levels 1 and 2 ensure that there will not be a
moment when the old table entry is invalid before installation of the new
block entry, so no unnecessary translation faults will be caused. But the
level-2 method can lead to a troublesome TLB conflict, so we use nT in both
the level-1 and level-2 cases to avoid handling TLB conflict aborts.

Signed-off-by: Yanan Wang 
---
 arch/arm64/kvm/hyp/pgtable.c | 35 +++
 1 file changed, 35 insertions(+)

diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c
index c8b959e3951b..ab1c94985ed0 100644
--- a/arch/arm64/kvm/hyp/pgtable.c
+++ b/arch/arm64/kvm/hyp/pgtable.c
@@ -49,6 +49,8 @@
 KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W | \
 KVM_PTE_LEAF_ATTR_HI_S2_XN)
 
+#define KVM_PTE_LEAF_BLOCK_S2_NT   BIT(16)
+
 struct kvm_pgtable_walk_data {
struct kvm_pgtable  *pgt;
struct kvm_pgtable_walker   *walker;
@@ -502,6 +504,39 @@ static int stage2_map_walker_try_leaf(u64 addr, u64 end, 
u32 level,
return 0;
 }
 
+static int stage2_coalesce_tables_into_block(u64 addr, u32 level,
+kvm_pte_t *ptep,
+struct stage2_map_data *data)
+{
+   u32 ttrem_level = data->ttrem_level;
+   u64 granule = kvm_granule_size(level), phys = data->phys;
+   kvm_pte_t new = kvm_init_valid_leaf_pte(phys, data->attr, level);
+
+   switch (ttrem_level) {
+   case TTREM_LEVEL0:
+   kvm_set_invalid_pte(ptep);
+
+   /*
+* Invalidate the whole stage-2, as we may have numerous leaf
+* entries below us which would otherwise need invalidating
+* individually.
+*/
+   kvm_call_hyp(__kvm_tlb_flush_vmid, data->mmu);
+   smp_store_release(ptep, new);
+   data->phys += granule;
+   return 0;
+   case TTREM_LEVEL1:
+   case TTREM_LEVEL2:
+   WRITE_ONCE(*ptep, new | KVM_PTE_LEAF_BLOCK_S2_NT);
+   kvm_call_hyp(__kvm_tlb_flush_vmid, data->mmu);
+   WRITE_ONCE(*ptep, new & ~KVM_PTE_LEAF_BLOCK_S2_NT);
+   data->phys += granule;
+   return 0;
+   }
+
+   return -EINVAL;
+}
+
 static int stage2_map_walk_table_pre(u64 addr, u64 end, u32 level,
 kvm_pte_t *ptep,
 struct stage2_map_data *data)
-- 
2.19.1



[RFC PATCH v1 3/5] KVM: arm64: Support usage of TTRem in guest stage-2 translation

2021-01-26 Thread Yanan Wang
As TTRem can be used when coalescing existing table mappings into a block
in guest stage-2 translation, just support usage of it.

Signed-off-by: Yanan Wang 
---
 arch/arm64/kvm/hyp/pgtable.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c
index 4d177ce1d536..c8b959e3951b 100644
--- a/arch/arm64/kvm/hyp/pgtable.c
+++ b/arch/arm64/kvm/hyp/pgtable.c
@@ -437,6 +437,7 @@ struct stage2_map_data {
 
struct kvm_s2_mmu   *mmu;
struct kvm_mmu_memory_cache *memcache;
+   u32 ttrem_level;
 };
 
 static int stage2_map_set_prot_attr(enum kvm_pgtable_prot prot,
@@ -633,6 +634,7 @@ int kvm_pgtable_stage2_map(struct kvm_pgtable *pgt, u64 
addr, u64 size,
.phys   = ALIGN_DOWN(phys, PAGE_SIZE),
.mmu= pgt->mmu,
.memcache   = mc,
+   .ttrem_level= system_support_level_of_ttrem(),
};
struct kvm_pgtable_walker walker = {
.cb = stage2_map_walker,
-- 
2.19.1



[RFC PATCH v1 1/5] arm64: cpufeature: Detect the ARMv8.4 TTRem feature

2021-01-26 Thread Yanan Wang
The ARMv8.4 TTRem feature offers 3 levels of support when changing block
size without changing any other parameters that are listed as requiring
use of break-before-make.

With level 0 supported, software must use break-before-make to avoid the
possible hardware problems. With level 1 supported, besides use of BBM,
software can also make use of the nT block translation entry. With level
2 supported, besides approaches of BBM and nT, software can also directly
change block size, but TLB conflicts possibly occur as a result.

We have found a place where TTRem can be used to improve the performance
in guest stage-2 translation. So detect the TTRem feature here.

Signed-off-by: Yanan Wang 
---
 arch/arm64/include/asm/cpucaps.h |  3 ++-
 arch/arm64/kernel/cpufeature.c   | 10 ++
 2 files changed, 12 insertions(+), 1 deletion(-)

diff --git a/arch/arm64/include/asm/cpucaps.h b/arch/arm64/include/asm/cpucaps.h
index b77d997b173b..e24570ea7444 100644
--- a/arch/arm64/include/asm/cpucaps.h
+++ b/arch/arm64/include/asm/cpucaps.h
@@ -66,7 +66,8 @@
 #define ARM64_WORKAROUND_1508412   58
 #define ARM64_HAS_LDAPR59
 #define ARM64_KVM_PROTECTED_MODE   60
+#define ARM64_HAS_ARMv8_4_TTREM61
 
-#define ARM64_NCAPS61
+#define ARM64_NCAPS62
 
 #endif /* __ASM_CPUCAPS_H */
diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c
index e99eddec0a46..8295dd1d450b 100644
--- a/arch/arm64/kernel/cpufeature.c
+++ b/arch/arm64/kernel/cpufeature.c
@@ -1960,6 +1960,16 @@ static const struct arm64_cpu_capabilities 
arm64_features[] = {
.sign = FTR_UNSIGNED,
.min_field_value = ID_AA64ISAR0_TLB_RANGE,
},
+   {
+   .desc = "ARMv8.4 TTRem",
+   .type = ARM64_CPUCAP_SYSTEM_FEATURE,
+   .capability = ARM64_HAS_ARMv8_4_TTREM,
+   .sys_reg = SYS_ID_AA64MMFR2_EL1,
+   .sign = FTR_UNSIGNED,
+   .field_pos = ID_AA64MMFR2_BBM_SHIFT,
+   .min_field_value = 1,
+   .matches = has_cpuid_feature,
+   },
 #ifdef CONFIG_ARM64_HW_AFDBM
{
/*
-- 
2.19.1



[RFC PATCH v1 2/5] arm64: cpufeature: Add an API to get level of TTRem supported by hardware

2021-01-26 Thread Yanan Wang
The ARMv8.4 architecture offers 3 levels of support when changing
block size without changing any other parameters that are listed
as requiring use of break-before-make. So add an API to get the current
level of TTRem supported by the hardware, so that software can use the
corresponding procedure when changing block size.

Signed-off-by: Yanan Wang 
---
 arch/arm64/include/asm/cpufeature.h | 13 +
 1 file changed, 13 insertions(+)

diff --git a/arch/arm64/include/asm/cpufeature.h 
b/arch/arm64/include/asm/cpufeature.h
index 9a555809b89c..f8ee7d30829b 100644
--- a/arch/arm64/include/asm/cpufeature.h
+++ b/arch/arm64/include/asm/cpufeature.h
@@ -50,6 +50,11 @@ enum ftr_type {
 #define FTR_VISIBLEtrue/* Feature visible to the user space */
 #define FTR_HIDDEN false   /* Feature is hidden from the user */
 
+/* Supported levels of ARMv8.4 TTRem feature */
+#define TTREM_LEVEL0   0
+#define TTREM_LEVEL1   1
+#define TTREM_LEVEL2   2
+
 #define FTR_VISIBLE_IF_IS_ENABLED(config)  \
(IS_ENABLED(config) ? FTR_VISIBLE : FTR_HIDDEN)
 
@@ -739,6 +744,14 @@ static inline bool system_supports_tlb_range(void)
cpus_have_const_cap(ARM64_HAS_TLB_RANGE);
 }
 
+static inline u32 system_support_level_of_ttrem(void)
+{
+   u64 mmfr2 = read_sanitised_ftr_reg(SYS_ID_AA64MMFR2_EL1);
+
+   return cpuid_feature_extract_unsigned_field(mmfr2,
+   ID_AA64MMFR2_BBM_SHIFT);
+}
+
 extern int do_emulate_mrs(struct pt_regs *regs, u32 sys_reg, u32 rt);
 
 static inline u32 id_aa64mmfr0_parange_to_phys_shift(int parange)
-- 
2.19.1



[PATCH 2/2] KVM: arm64: Skip the cache flush when coalescing tables into a block

2021-01-25 Thread Yanan Wang
After dirty-logging is stopped for a VM configured with huge mappings,
KVM will recover the table mappings back to block mappings. As we only
replace the existing page tables with a block entry and the cacheability
has not been changed, the cache maintenance operations can be skipped.

Signed-off-by: Yanan Wang 
---
 arch/arm64/kvm/mmu.c | 12 +---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
index 8e8549ea1d70..37b427dcbc4f 100644
--- a/arch/arm64/kvm/mmu.c
+++ b/arch/arm64/kvm/mmu.c
@@ -744,7 +744,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, 
phys_addr_t fault_ipa,
 {
int ret = 0;
bool write_fault, writable, force_pte = false;
-   bool exec_fault;
+   bool exec_fault, adjust_hugepage;
bool device = false;
unsigned long mmu_seq;
struct kvm *kvm = vcpu->kvm;
@@ -872,12 +872,18 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, 
phys_addr_t fault_ipa,
mark_page_dirty(kvm, gfn);
}
 
-   if (fault_status != FSC_PERM && !device)
+   /*
+* There is no necessity to perform cache maintenance operations if we
+* will only replace the existing table mappings with a block mapping.
+*/
+   adjust_hugepage = fault_granule < vma_pagesize ? true : false;
+   if (fault_status != FSC_PERM && !device && !adjust_hugepage)
clean_dcache_guest_page(pfn, vma_pagesize);
 
if (exec_fault) {
prot |= KVM_PGTABLE_PROT_X;
-   invalidate_icache_guest_page(pfn, vma_pagesize);
+   if (!adjust_hugepage)
+   invalidate_icache_guest_page(pfn, vma_pagesize);
}
 
if (device)
-- 
2.19.1



[PATCH 0/2] Performance improvement about cache flush

2021-01-25 Thread Yanan Wang
Hi,
These two patches are posted to introduce a new method that can distinguish
the cases of allocating memcache more precisely, and to elide some
unnecessary cache flushes.

For patch-1:
With a guest translation fault, we don't really need the memcache pages when
only installing a new entry into the existing page table or replacing the
table entry with a block entry. And with a guest permission fault, we also
don't need the memcache pages for a write_fault during dirty logging if VMs
are not configured with huge mappings. So a new method is introduced to
distinguish the cases of allocating memcache more precisely.

For patch-2:
If migration of a VM with hugepages is canceled midway, KVM will adjust the
stage-2 table mappings back to block mappings. With multiple vCPUs accessing
guest pages within the same 1G range, there could be a number of translation
faults to handle, and KVM will uniformly flush the data cache for the 1G
range before handling the faults. As it takes a long time to flush the data
cache for a 1G range of memory (130ms on Kunpeng 920 servers, for example),
the consequent cache flush for each translation fault will eventually leave
vCPUs stuck for seconds or even cause a soft lockup. I have met both the
stall and the soft lockup on Kunpeng servers with FWB not supported.

When KVM needs to recover the table mappings back to block mappings, as we
only replace the existing page tables with a block entry and the cacheability
has not been changed, the cache maintenance operations can be skipped.
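
A minimal, self-contained model of the skip condition (illustrative
userspace C; the real check is the adjust_hugepage test in patch 2):

#include <stdbool.h>
#include <stdio.h>

static bool can_skip_dcache_clean(unsigned long fault_granule,
                                  unsigned long vma_pagesize)
{
        /*
         * Coalescing tables into a block only replaces existing mappings
         * without changing cacheability, so the dcache clean is not needed.
         */
        return fault_granule < vma_pagesize;
}

int main(void)
{
        /* 4K fault granule, 1G block to install: coalescing -> skip (1) */
        printf("%d\n", can_skip_dcache_clean(1UL << 12, 1UL << 30));
        /* 4K fault granule, 4K page to install: first map -> clean (0) */
        printf("%d\n", can_skip_dcache_clean(1UL << 12, 1UL << 12));
        return 0;
}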

Yanan Wang (2):
  KVM: arm64: Distinguish cases of allocating memcache more precisely
  KVM: arm64: Skip the cache flush when coalescing tables into a block

 arch/arm64/kvm/mmu.c | 37 +
 1 file changed, 21 insertions(+), 16 deletions(-)

-- 
2.19.1



[PATCH 1/2] KVM: arm64: Distinguish cases of allocating memcache more precisely

2021-01-25 Thread Yanan Wang
With a guest translation fault, we don't really need the memcache pages
when only installing a new entry into the existing page table or replacing
the table entry with a block entry. And with a guest permission fault,
we also don't need the memcache pages for a write_fault during dirty
logging if VMs are not configured with huge mappings.

The cases where allocations from memcache are required can be much more
precisely distinguished by comparing fault_granule and vma_pagesize.

Signed-off-by: Yanan Wang 
---
 arch/arm64/kvm/mmu.c | 25 -
 1 file changed, 12 insertions(+), 13 deletions(-)

diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
index 7d2257cc5438..8e8549ea1d70 100644
--- a/arch/arm64/kvm/mmu.c
+++ b/arch/arm64/kvm/mmu.c
@@ -820,19 +820,6 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, 
phys_addr_t fault_ipa,
gfn = fault_ipa >> PAGE_SHIFT;
mmap_read_unlock(current->mm);
 
-   /*
-* Permission faults just need to update the existing leaf entry,
-* and so normally don't require allocations from the memcache. The
-* only exception to this is when dirty logging is enabled at runtime
-* and a write fault needs to collapse a block entry into a table.
-*/
-   if (fault_status != FSC_PERM || (logging_active && write_fault)) {
-   ret = kvm_mmu_topup_memory_cache(memcache,
-kvm_mmu_cache_min_pages(kvm));
-   if (ret)
-   return ret;
-   }
-
mmu_seq = vcpu->kvm->mmu_notifier_seq;
/*
 * Ensure the read of mmu_notifier_seq happens before we call
@@ -898,6 +885,18 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, 
phys_addr_t fault_ipa,
else if (cpus_have_const_cap(ARM64_HAS_CACHE_DIC))
prot |= KVM_PGTABLE_PROT_X;
 
+   /*
+* Allocations from the memcache are required only when granule of the
+* lookup level where a guest fault happened exceeds the vma_pagesize,
+* which means new page tables will be created in the fault handlers.
+*/
+   if (fault_granule > vma_pagesize) {
+   ret = kvm_mmu_topup_memory_cache(memcache,
+kvm_mmu_cache_min_pages(kvm));
+   if (ret)
+   return ret;
+   }
+
/*
 * Under the premise of getting a FSC_PERM fault, we just need to relax
 * permissions only if vma_pagesize equals fault_granule. Otherwise,
-- 
2.19.1



[RFC PATCH v4 0/2] Some optimization for stage-2 translation

2021-01-22 Thread Yanan Wang
Hi, Will, Marc,
Is there any further comment on the v3 series I posted previously?
If it is not fine to you, then I think maybe we should just turn
back to the original solution in v1, where I suggested filtering out
the case of only updating access permissions in the map handler and
handling it right there.

Here are the reasons for my current opinion:
With an errno returned from the map handler for this single case, there
will be one more vcpu exit from the guest, and we also have to consider
the spurious dirty pages. Besides, it seems that the EAGAIN errno has been
chosen specially for this case and cannot be used elsewhere for other
reasons, as we will change this errno to zero at the end of the function.

The v1 solution looks more concise after all, so I have refined the diff
and posted v4 with two patches here, just for comparison.

Which solution would you prefer now? Could you please let me know?

Thanks,
Yanan.

Links:
v1: https://lore.kernel.org/lkml/20201211080115.21460-1-wangyana...@huawei.com
v2: https://lore.kernel.org/lkml/20201216122844.25092-1-wangyana...@huawei.com
v3: https://lore.kernel.org/lkml/20210114121350.123684-1-wangyana...@huawei.com

---

About patch-1:
Procedures of hyp stage-1 map and guest stage-2 map are quite different,
but they are now tied closely by function kvm_set_valid_leaf_pte().
So adjust the relevant code for ease of maintenance in the future.

About patch-2:
(1) During the running time of a VM with a number of vCPUs, if some vCPUs
access the same GPA almost at the same time and the stage-2 mapping of
the GPA has not been built yet, they will all cause translation faults.
The first vCPU builds the mapping, and the following ones end up updating
the valid leaf PTE. Note that these vCPUs might want different access
permissions (RO, RW, RX, RWX, etc.).

(2) It's inevitable that we sometimes will update an existing valid leaf
PTE in the map path, and we always perform break-before-make in this case.
Then more unnecessary translation faults could be caused if the
*break stage* of BBM is caught by other vCPUs.

With (1) and (2), something unsatisfactory could happen: vCPU A causes
a translation fault and builds the mapping with RW permissions, vCPU B
then updates the valid leaf PTE with break-before-make and the permissions
are updated back to RO. Besides, the *break stage* of BBM may trigger more
translation faults. Finally, some useless small loops could occur.

We can make some optimization to solve the above problems: when we need to
update a valid leaf PTE in the translation fault handler, let's filter
out the case where this update only changes access permissions that don't
require break-before-make. If the permissions we want are already there,
don't bother to update. If still more permissions need to be
added, then update the PTE directly without break-before-make.
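
The check can be modeled with a couple of XOR operations over a toy PTE
layout (self-contained, illustrative C; the bit positions below are made up
and are not the real stage-2 PTE layout):

#include <stdbool.h>
#include <stdio.h>

typedef unsigned long kvm_pte_t;

/* Toy permission bits, standing in for S2AP_R/S2AP_W/XN. */
#define PTE_PERM_R      (1UL << 1)
#define PTE_PERM_W      (1UL << 2)
#define PTE_PERM_XN     (1UL << 3)
#define PTE_PERMS       (PTE_PERM_R | PTE_PERM_W | PTE_PERM_XN)

/* Break-before-make is needed only if non-permission bits differ. */
static bool needs_bbm(kvm_pte_t old, kvm_pte_t new)
{
        return (old ^ new) & ~PTE_PERMS;
}

int main(void)
{
        kvm_pte_t old = 0x1000UL | PTE_PERM_R;              /* RO mapping */
        kvm_pte_t rw  = 0x1000UL | PTE_PERM_R | PTE_PERM_W; /* same PA, +W */
        kvm_pte_t mov = 0x2000UL | PTE_PERM_R;              /* different PA */

        printf("perm-only: needs_bbm=%d\n", needs_bbm(old, rw));  /* 0 */
        printf("remap:     needs_bbm=%d\n", needs_bbm(old, mov)); /* 1 */
        return 0;
}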

---

Changelogs

v3->v4:
- Turn back to the original solution in v1 and refine the diff
- Rebased on top of v5.11-rc4

v2->v3:
- Rebased on top of v5.11-rc3
- Refine the commit messages
- Make some adjustment about return value in patch-2 and patch-3

v1->v2:
- Make part of the diff a seperate patch (patch-1)
- Add Will's Signed-off-by for patch-1
- Return an errno when meeting changing permissions case in map path
- Add a new patch (patch-3)

---

Yanan Wang (2):
  KVM: arm64: Adjust partial code of hyp stage-1 map and guest stage-2
map
  KVM: arm64: Filter out the case of only changing permissions from
stage-2 map path

 arch/arm64/include/asm/kvm_pgtable.h |  4 ++
 arch/arm64/kvm/hyp/pgtable.c | 88 +++-
 2 files changed, 63 insertions(+), 29 deletions(-)

-- 
2.19.1



[RFC PATCH v4 1/2] KVM: arm64: Adjust partial code of hyp stage-1 map and guest stage-2 map

2021-01-22 Thread Yanan Wang
Procedures of hyp stage-1 map and guest stage-2 map are quite different,
but they are tied closely by function kvm_set_valid_leaf_pte().
So adjust the relevant code for ease of maintenance in the future.

Signed-off-by: Yanan Wang 
---
 arch/arm64/kvm/hyp/pgtable.c | 54 ++--
 1 file changed, 27 insertions(+), 27 deletions(-)

diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c
index bdf8e55ed308..2878aaf53b3c 100644
--- a/arch/arm64/kvm/hyp/pgtable.c
+++ b/arch/arm64/kvm/hyp/pgtable.c
@@ -170,10 +170,9 @@ static void kvm_set_table_pte(kvm_pte_t *ptep, kvm_pte_t 
*childp)
smp_store_release(ptep, pte);
 }
 
-static bool kvm_set_valid_leaf_pte(kvm_pte_t *ptep, u64 pa, kvm_pte_t attr,
-  u32 level)
+static kvm_pte_t kvm_init_valid_leaf_pte(u64 pa, kvm_pte_t attr, u32 level)
 {
-   kvm_pte_t old = *ptep, pte = kvm_phys_to_pte(pa);
+   kvm_pte_t pte = kvm_phys_to_pte(pa);
u64 type = (level == KVM_PGTABLE_MAX_LEVELS - 1) ? KVM_PTE_TYPE_PAGE :
   KVM_PTE_TYPE_BLOCK;
 
@@ -181,12 +180,7 @@ static bool kvm_set_valid_leaf_pte(kvm_pte_t *ptep, u64 
pa, kvm_pte_t attr,
pte |= FIELD_PREP(KVM_PTE_TYPE, type);
pte |= KVM_PTE_VALID;
 
-   /* Tolerate KVM recreating the exact same mapping. */
-   if (kvm_pte_valid(old))
-   return old == pte;
-
-   smp_store_release(ptep, pte);
-   return true;
+   return pte;
 }
 
 static int kvm_pgtable_visitor_cb(struct kvm_pgtable_walk_data *data, u64 addr,
@@ -341,12 +335,17 @@ static int hyp_map_set_prot_attr(enum kvm_pgtable_prot 
prot,
 static bool hyp_map_walker_try_leaf(u64 addr, u64 end, u32 level,
kvm_pte_t *ptep, struct hyp_map_data *data)
 {
+   kvm_pte_t new, old = *ptep;
u64 granule = kvm_granule_size(level), phys = data->phys;
 
if (!kvm_block_mapping_supported(addr, end, phys, level))
return false;
 
-   WARN_ON(!kvm_set_valid_leaf_pte(ptep, phys, data->attr, level));
+   /* Tolerate KVM recreating the exact same mapping */
+   new = kvm_init_valid_leaf_pte(phys, data->attr, level);
+   if (old != new && !WARN_ON(kvm_pte_valid(old)))
+   smp_store_release(ptep, new);
+
data->phys += granule;
return true;
 }
@@ -465,27 +464,29 @@ static bool stage2_map_walker_try_leaf(u64 addr, u64 end, 
u32 level,
   kvm_pte_t *ptep,
   struct stage2_map_data *data)
 {
+   kvm_pte_t new, old = *ptep;
u64 granule = kvm_granule_size(level), phys = data->phys;
 
if (!kvm_block_mapping_supported(addr, end, phys, level))
return false;
 
-   /*
-* If the PTE was already valid, drop the refcount on the table
-* early, as it will be bumped-up again in stage2_map_walk_leaf().
-* This ensures that the refcount stays constant across a valid to
-* valid PTE update.
-*/
-   if (kvm_pte_valid(*ptep))
-   put_page(virt_to_page(ptep));
+   new = kvm_init_valid_leaf_pte(phys, data->attr, level);
+   if (kvm_pte_valid(old)) {
+   /* Tolerate KVM recreating the exact same mapping */
+   if (old == new)
+   goto out;
 
-   if (kvm_set_valid_leaf_pte(ptep, phys, data->attr, level))
-   goto out;
+   /*
+* There's an existing different valid leaf entry, so perform
+* break-before-make.
+*/
+   kvm_set_invalid_pte(ptep);
+   kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, data->mmu, addr, level);
+   put_page(virt_to_page(ptep));
+   }
 
-   /* There's an existing valid leaf entry, so perform break-before-make */
-   kvm_set_invalid_pte(ptep);
-   kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, data->mmu, addr, level);
-   kvm_set_valid_leaf_pte(ptep, phys, data->attr, level);
+   smp_store_release(ptep, new);
+   get_page(virt_to_page(ptep));
 out:
data->phys += granule;
return true;
@@ -527,7 +528,7 @@ static int stage2_map_walk_leaf(u64 addr, u64 end, u32 
level, kvm_pte_t *ptep,
}
 
if (stage2_map_walker_try_leaf(addr, end, level, ptep, data))
-   goto out_get_page;
+   return 0;
 
if (WARN_ON(level == KVM_PGTABLE_MAX_LEVELS - 1))
return -EINVAL;
@@ -551,9 +552,8 @@ static int stage2_map_walk_leaf(u64 addr, u64 end, u32 
level, kvm_pte_t *ptep,
}
 
kvm_set_table_pte(ptep, childp);
-
-out_get_page:
get_page(page);
+
return 0;
 }
 
-- 
2.19.1



[RFC PATCH v4 2/2] KVM: arm64: Filter out the case of only changing permissions from stage-2 map path

2021-01-22 Thread Yanan Wang
(1) During the running time of a VM with a number of vCPUs, if some vCPUs
access the same GPA almost at the same time and the stage-2 mapping of
the GPA has not been built yet, they will all cause translation faults.
The first vCPU builds the mapping, and the following ones end up updating
the valid leaf PTE. Note that these vCPUs might want different access
permissions (RO, RW, RX, RWX, etc.).

(2) It's inevitable that we sometimes will update an existing valid leaf
PTE in the map path, and we always perform break-before-make in this case.
Then more unnecessary translation faults could be caused if the
*break stage* of BBM is caught by other vCPUs.

With (1) and (2), something unsatisfactory could happen: vCPU A causes
a translation fault and builds the mapping with RW permissions, vCPU B
then updates the valid leaf PTE with break-before-make and the permissions
are updated back to RO. Besides, the *break stage* of BBM may trigger more
translation faults. Finally, some useless small loops could occur.

We can make some optimization to solve the above problems: when we need to
update a valid leaf PTE in the translation fault handler, let's filter
out the case where this update only changes access permissions that don't
require break-before-make. If the permissions we want are already there,
don't bother to update. If still more permissions need to be
added, then update the PTE directly without break-before-make.

Signed-off-by: Yanan Wang 
---
 arch/arm64/include/asm/kvm_pgtable.h |  4 ++
 arch/arm64/kvm/hyp/pgtable.c | 62 +---
 2 files changed, 50 insertions(+), 16 deletions(-)

diff --git a/arch/arm64/include/asm/kvm_pgtable.h 
b/arch/arm64/include/asm/kvm_pgtable.h
index 52ab38db04c7..2bd4e772ca57 100644
--- a/arch/arm64/include/asm/kvm_pgtable.h
+++ b/arch/arm64/include/asm/kvm_pgtable.h
@@ -157,6 +157,10 @@ void kvm_pgtable_stage2_destroy(struct kvm_pgtable *pgt);
  * If device attributes are not explicitly requested in @prot, then the
  * mapping will be normal, cacheable.
  *
+ * When there is an existing valid leaf PTE to be updated in this function,
+ * perform break-before-make only if the parameters to be changed for this
+ * update require it, otherwise the PTE can be updated directly.
+ *
  * Note that this function will both coalesce existing table entries and split
  * existing block mappings, relying on page-faults to fault back areas outside
  * of the new mapping lazily.
diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c
index 2878aaf53b3c..aac1915f9770 100644
--- a/arch/arm64/kvm/hyp/pgtable.c
+++ b/arch/arm64/kvm/hyp/pgtable.c
@@ -45,6 +45,10 @@
 
 #define KVM_PTE_LEAF_ATTR_HI_S2_XN BIT(54)
 
+#define KVM_PTE_LEAF_ATTR_S2_PERMS (KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R | \
+KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W | \
+KVM_PTE_LEAF_ATTR_HI_S2_XN)
+
 struct kvm_pgtable_walk_data {
struct kvm_pgtable  *pgt;
struct kvm_pgtable_walker   *walker;
@@ -460,34 +464,60 @@ static int stage2_map_set_prot_attr(enum kvm_pgtable_prot 
prot,
return 0;
 }
 
+static void stage2_map_update_valid_leaf_pte(u64 addr, u32 level,
+kvm_pte_t *ptep, kvm_pte_t new,
+struct stage2_map_data *data)
+{
+   kvm_pte_t old = *ptep;
+
+   /*
+* It's inevitable that we sometimes end up updating an existing valid
+* leaf PTE on the map path for various reasons, for instance, multiple
+* vcpus accessing the same GPA page all cause translation faults at the
+* same time. So perform break-before-make here only if the parameters
+* to be changed for this update require it, otherwise the PTE can be
+* updated directly.
+*/
+   if ((old ^ new) & (~KVM_PTE_LEAF_ATTR_S2_PERMS)) {
+   kvm_set_invalid_pte(ptep);
+   kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, data->mmu, addr, level);
+   smp_store_release(ptep, new);
+   return;
+   }
+
+   old ^= KVM_PTE_LEAF_ATTR_HI_S2_XN;
+   new ^= KVM_PTE_LEAF_ATTR_HI_S2_XN;
+   new |= old;
+
+   /*
+* Update the valid leaf PTE directly without break-before-make if more
+* permissions need to be added, and skip the update if there have been
+* already the permissions that we want.
+*/
+   if (new != old) {
+   WRITE_ONCE(*ptep, new ^ KVM_PTE_LEAF_ATTR_HI_S2_XN);
+   kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, data->mmu, addr, level);
+   }
+}
+
 static bool stage2_map_walker_try_leaf(u64 addr, u64 end, u32 level,
   kvm_pte_t *ptep,
   struct stage2_map_data *data)
 {
-   kvm_pte_t new, old = *ptep;
+   kvm_pte_t new;
u64 granule = kvm_granule_size(level), phys = data->phys;

[PATCH v3 2/3] KVM: arm64: Filter out the case of only changing permissions from stage-2 map path

2021-01-14 Thread Yanan Wang
(1) During the running time of a VM with a number of vCPUs, if some vCPUs
access the same GPA almost at the same time and the stage-2 mapping of
the GPA has not been built yet, they will all cause translation faults.
The first vCPU builds the mapping, and the following ones end up updating
the valid leaf PTE. Note that these vCPUs might want different access
permissions (RO, RW, RX, RWX, etc.).

(2) It's inevitable that we sometimes will update an existing valid leaf
PTE in the map path, and we perform break-before-make in this case.
Then more unnecessary translation faults could be caused if the
*break stage* of BBM is caught by other vCPUs.

With (1) and (2), something unsatisfactory could happen: vCPU A causes
a translation fault and builds the mapping with RW permissions, vCPU B
then updates the valid leaf PTE with break-before-make and the permissions
are updated back to RO. Besides, the *break stage* of BBM may trigger more
translation faults. Finally, some useless small loops could occur.

We can make some optimization to solve the above problems: when we need to
update a valid leaf PTE in the map path, let's filter out the case where
this update only changes access permissions, and don't update the valid
leaf PTE here in this case. Instead, let the vCPU enter back into the guest,
and it will exit next time to go through the relax_perms path without
break-before-make if it still wants more permissions.

Signed-off-by: Yanan Wang 
---
 arch/arm64/include/asm/kvm_pgtable.h |  5 +
 arch/arm64/kvm/hyp/pgtable.c | 32 ++--
 2 files changed, 26 insertions(+), 11 deletions(-)

diff --git a/arch/arm64/include/asm/kvm_pgtable.h 
b/arch/arm64/include/asm/kvm_pgtable.h
index 52ab38db04c7..8886d43cfb11 100644
--- a/arch/arm64/include/asm/kvm_pgtable.h
+++ b/arch/arm64/include/asm/kvm_pgtable.h
@@ -157,6 +157,11 @@ void kvm_pgtable_stage2_destroy(struct kvm_pgtable *pgt);
  * If device attributes are not explicitly requested in @prot, then the
  * mapping will be normal, cacheable.
  *
+ * Note that the update of a valid leaf PTE in this function will be aborted,
+ * if it's trying to recreate the exact same mapping or only change the access
+ * permissions. Instead, the vCPU will exit one more time from guest if still
+ * needed and then go through the path of relaxing permissions.
+ *
  * Note that this function will both coalesce existing table entries and split
  * existing block mappings, relying on page-faults to fault back areas outside
  * of the new mapping lazily.
diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c
index a11ac874bc2a..4d177ce1d536 100644
--- a/arch/arm64/kvm/hyp/pgtable.c
+++ b/arch/arm64/kvm/hyp/pgtable.c
@@ -45,6 +45,10 @@
 
 #define KVM_PTE_LEAF_ATTR_HI_S2_XN BIT(54)
 
+#define KVM_PTE_LEAF_ATTR_S2_PERMS (KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R | \
+KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W | \
+KVM_PTE_LEAF_ATTR_HI_S2_XN)
+
 struct kvm_pgtable_walk_data {
struct kvm_pgtable  *pgt;
struct kvm_pgtable_walker   *walker;
@@ -460,22 +464,27 @@ static int stage2_map_set_prot_attr(enum kvm_pgtable_prot 
prot,
return 0;
 }
 
-static bool stage2_map_walker_try_leaf(u64 addr, u64 end, u32 level,
-  kvm_pte_t *ptep,
-  struct stage2_map_data *data)
+static int stage2_map_walker_try_leaf(u64 addr, u64 end, u32 level,
+ kvm_pte_t *ptep,
+ struct stage2_map_data *data)
 {
kvm_pte_t new, old = *ptep;
u64 granule = kvm_granule_size(level), phys = data->phys;
struct page *page = virt_to_page(ptep);
 
if (!kvm_block_mapping_supported(addr, end, phys, level))
-   return false;
+   return -E2BIG;
 
new = kvm_init_valid_leaf_pte(phys, data->attr, level);
if (kvm_pte_valid(old)) {
-   /* Tolerate KVM recreating the exact same mapping */
-   if (old == new)
-   goto out;
+   /*
+* Skip updating the PTE if we are trying to recreate the exact
+* same mapping or only change the access permissions. Instead,
+* the vCPU will exit one more time from guest if still needed
+* and then go through the path of relaxing permissions.
+*/
+   if (!((old ^ new) & (~KVM_PTE_LEAF_ATTR_S2_PERMS)))
+   return -EAGAIN;
 
/*
 * There's an existing different valid leaf entry, so perform
@@ -488,9 +497,8 @@ static bool stage2_map_walker_try_leaf(u64 addr, u64 end, 
u32 level,
 
smp_store_release(ptep, new);
get_page(page);
-out:
data->phys += granule;
-   return true;
+   return 0;
 }
 
 st

[PATCH v3 1/3] KVM: arm64: Adjust partial code of hyp stage-1 map and guest stage-2 map

2021-01-14 Thread Yanan Wang
Procedures of hyp stage-1 map and guest stage-2 map are quite different,
but they are tied closely by function kvm_set_valid_leaf_pte().
So adjust the relevant code for ease of maintenance in the future.

Signed-off-by: Will Deacon 
Signed-off-by: Yanan Wang 
---
 arch/arm64/kvm/hyp/pgtable.c | 55 ++--
 1 file changed, 28 insertions(+), 27 deletions(-)

diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c
index bdf8e55ed308..a11ac874bc2a 100644
--- a/arch/arm64/kvm/hyp/pgtable.c
+++ b/arch/arm64/kvm/hyp/pgtable.c
@@ -170,10 +170,9 @@ static void kvm_set_table_pte(kvm_pte_t *ptep, kvm_pte_t 
*childp)
smp_store_release(ptep, pte);
 }
 
-static bool kvm_set_valid_leaf_pte(kvm_pte_t *ptep, u64 pa, kvm_pte_t attr,
-  u32 level)
+static kvm_pte_t kvm_init_valid_leaf_pte(u64 pa, kvm_pte_t attr, u32 level)
 {
-   kvm_pte_t old = *ptep, pte = kvm_phys_to_pte(pa);
+   kvm_pte_t pte = kvm_phys_to_pte(pa);
u64 type = (level == KVM_PGTABLE_MAX_LEVELS - 1) ? KVM_PTE_TYPE_PAGE :
   KVM_PTE_TYPE_BLOCK;
 
@@ -181,12 +180,7 @@ static bool kvm_set_valid_leaf_pte(kvm_pte_t *ptep, u64 pa, kvm_pte_t attr,
pte |= FIELD_PREP(KVM_PTE_TYPE, type);
pte |= KVM_PTE_VALID;
 
-   /* Tolerate KVM recreating the exact same mapping. */
-   if (kvm_pte_valid(old))
-   return old == pte;
-
-   smp_store_release(ptep, pte);
-   return true;
+   return pte;
 }
 
 static int kvm_pgtable_visitor_cb(struct kvm_pgtable_walk_data *data, u64 addr,
@@ -341,12 +335,17 @@ static int hyp_map_set_prot_attr(enum kvm_pgtable_prot prot,
 static bool hyp_map_walker_try_leaf(u64 addr, u64 end, u32 level,
kvm_pte_t *ptep, struct hyp_map_data *data)
 {
+   kvm_pte_t new, old = *ptep;
u64 granule = kvm_granule_size(level), phys = data->phys;
 
if (!kvm_block_mapping_supported(addr, end, phys, level))
return false;
 
-   WARN_ON(!kvm_set_valid_leaf_pte(ptep, phys, data->attr, level));
+   /* Tolerate KVM recreating the exact same mapping */
+   new = kvm_init_valid_leaf_pte(phys, data->attr, level);
+   if (old != new && !WARN_ON(kvm_pte_valid(old)))
+   smp_store_release(ptep, new);
+
data->phys += granule;
return true;
 }
@@ -465,27 +464,30 @@ static bool stage2_map_walker_try_leaf(u64 addr, u64 end, u32 level,
   kvm_pte_t *ptep,
   struct stage2_map_data *data)
 {
+   kvm_pte_t new, old = *ptep;
u64 granule = kvm_granule_size(level), phys = data->phys;
+   struct page *page = virt_to_page(ptep);
 
if (!kvm_block_mapping_supported(addr, end, phys, level))
return false;
 
-   /*
-* If the PTE was already valid, drop the refcount on the table
-* early, as it will be bumped-up again in stage2_map_walk_leaf().
-* This ensures that the refcount stays constant across a valid to
-* valid PTE update.
-*/
-   if (kvm_pte_valid(*ptep))
-   put_page(virt_to_page(ptep));
+   new = kvm_init_valid_leaf_pte(phys, data->attr, level);
+   if (kvm_pte_valid(old)) {
+   /* Tolerate KVM recreating the exact same mapping */
+   if (old == new)
+   goto out;
 
-   if (kvm_set_valid_leaf_pte(ptep, phys, data->attr, level))
-   goto out;
+   /*
+* There's an existing different valid leaf entry, so perform
+* break-before-make.
+*/
+   kvm_set_invalid_pte(ptep);
+   kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, data->mmu, addr, level);
+   put_page(page);
+   }
 
-   /* There's an existing valid leaf entry, so perform break-before-make */
-   kvm_set_invalid_pte(ptep);
-   kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, data->mmu, addr, level);
-   kvm_set_valid_leaf_pte(ptep, phys, data->attr, level);
+   smp_store_release(ptep, new);
+   get_page(page);
 out:
data->phys += granule;
return true;
@@ -527,7 +529,7 @@ static int stage2_map_walk_leaf(u64 addr, u64 end, u32 level, kvm_pte_t *ptep,
}
 
if (stage2_map_walker_try_leaf(addr, end, level, ptep, data))
-   goto out_get_page;
+   return 0;
 
if (WARN_ON(level == KVM_PGTABLE_MAX_LEVELS - 1))
return -EINVAL;
@@ -551,9 +553,8 @@ static int stage2_map_walk_leaf(u64 addr, u64 end, u32 level, kvm_pte_t *ptep,
}
 
kvm_set_table_pte(ptep, childp);
-
-out_get_page:
get_page(page);
+
return 0;
 }
 
-- 
2.19.1



[PATCH v3 0/3] Some optimization for stage-2 translation

2021-01-14 Thread Yanan Wang
Hi,
This patch series (v3) makes some optimizations for stage-2 translation.

About patch-1:
Procedures of hyp stage-1 map and guest stage-2 map are quite different,
but they are now tied closely by function kvm_set_valid_leaf_pte().
So adjust the related code to ease future code maintenance.

About patch-2:
There are now separate map and perms handlers, used independently for mapping
and for relaxing permissions in the new stage-2 page-table infrastructure.
Yet there is still a specific case where we end up changing the access
permissions in the map path, and the current handling of this case can cause
unnecessary PTE updates and translation faults.

To solve the above problem, we can filter out this case from the map path and
abort the PTE update. Instead, let the vCPU re-enter the guest; it will exit
again and go through the relax_perms path next time if still needed.
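
The filter itself is a one-line mask test. A runnable sketch, using the
stage-2 S2AP (bits 7:6) and XN (bit 54) positions from pgtable.c and a PTE
value borrowed from the trace logs further below; everything else here is
illustrative:

#include <stdint.h>
#include <stdio.h>

typedef uint64_t kvm_pte_t;

#define S2AP_R		(1ULL << 6)
#define S2AP_W		(1ULL << 7)
#define S2_XN		(1ULL << 54)
#define S2_PERMS	(S2AP_R | S2AP_W | S2_XN)

/* -1 stands in for -EAGAIN ("abort the update, re-enter the guest"). */
static int try_update(kvm_pte_t old, kvm_pte_t new)
{
	if (!((old ^ new) & ~S2_PERMS))
		return -1;	/* same mapping, or perms-only change */
	return 0;		/* genuinely different: break-before-make */
}

int main(void)
{
	kvm_pte_t rw = 0x40002a7fffc7ffULL;	/* PTE seen in the trace logs */
	kvm_pte_t ro = rw & ~S2AP_W;		/* same mapping, write dropped */
	kvm_pte_t moved = rw ^ (1ULL << 21);	/* different output address */

	printf("recreate same: %d\n", try_update(rw, rw));	/* -1 */
	printf("perms only:    %d\n", try_update(rw, ro));	/* -1 */
	printf("different map: %d\n", try_update(rw, moved));	/* 0 */
	return 0;
}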

About patch-3:
We now set the pfn dirty and mark the page dirty before calling fault
handlers in user_mem_abort(), so we might end up having spurious dirty
pages if update of permissions or mapping has failed. Let's move these
two operations after the fault handlers, and they will be done only if
the fault has been handled successfully.

When an -EAGAIN errno is returned from the map handler, we want the vCPU to
re-enter the guest directly instead of exiting back to userspace, so adjust
the return value at the end of the function.
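
The resulting control flow at the end of user_mem_abort() can be modeled in a
few lines; mark_dirty() and the map result are stand-ins, not kernel APIs:

#include <errno.h>
#include <stdio.h>

static void mark_dirty(void) { printf("page marked dirty\n"); }

static int handle_fault(int map_ret, int writable)
{
	int ret = map_ret;	/* result of the map/relax-perms handler */

	if (writable && !ret)	/* no spurious dirty pages on failure */
		mark_dirty();

	return ret != -EAGAIN ? ret : 0;	/* -EAGAIN: just re-enter guest */
}

int main(void)
{
	printf("%d %d %d\n",
	       handle_fault(0, 1),		/* success: dirty + 0 */
	       handle_fault(-EAGAIN, 1),	/* retry: no dirty, 0 */
	       handle_fault(-ENOMEM, 1));	/* real error propagates */
	return 0;
}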

---

Changelogs

v2->v3:
- Rebased on top of v5.11-rc3
- Refine the commit messages
- Make some adjustment about return value in patch-2 and patch-3
- v2: https://lore.kernel.org/lkml/20201216122844.25092-1-wangyana...@huawei.com/

v1->v2:
- Make part of the diff a separate patch (patch-1)
- Add Will's Signed-off-by for patch-1
- Return an errno when meeting changing permissions case in map path
- Add a new patch (patch-3)
- v1: https://lore.kernel.org/lkml/20201211080115.21460-1-wangyana...@huawei.com/

---

Yanan Wang (3):
  KVM: arm64: Adjust partial code of hyp stage-1 map and guest stage-2
map
  KVM: arm64: Filter out the case of only changing permissions from
stage-2 map path
  KVM: arm64: Mark the page dirty only if the fault is handled
successfully

 arch/arm64/include/asm/kvm_pgtable.h |  5 ++
 arch/arm64/kvm/hyp/pgtable.c | 83 
 arch/arm64/kvm/mmu.c | 13 +++--
 3 files changed, 60 insertions(+), 41 deletions(-)

-- 
2.19.1



[PATCH v3 3/3] KVM: arm64: Mark the page dirty only if the fault is handled successfully

2021-01-14 Thread Yanan Wang
We now set the pfn dirty and mark the page dirty before calling fault
handlers in user_mem_abort(), so we might end up having spurious dirty
pages if update of permissions or mapping has failed. Let's move these
two operations after the fault handlers, and they will be done only if
the fault has been handled successfully.

When an -EAGAIN errno is returned from the map handler, we want the vCPU to
re-enter the guest directly instead of exiting back to userspace, so adjust
the return value at the end of the function.

Signed-off-by: Yanan Wang 
---
 arch/arm64/kvm/mmu.c | 13 -
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
index 7d2257cc5438..77cb2d28f2a4 100644
--- a/arch/arm64/kvm/mmu.c
+++ b/arch/arm64/kvm/mmu.c
@@ -879,11 +879,8 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
if (vma_pagesize == PAGE_SIZE && !force_pte)
vma_pagesize = transparent_hugepage_adjust(memslot, hva,
   &pfn, &fault_ipa);
-   if (writable) {
+   if (writable)
prot |= KVM_PGTABLE_PROT_W;
-   kvm_set_pfn_dirty(pfn);
-   mark_page_dirty(kvm, gfn);
-   }
 
if (fault_status != FSC_PERM && !device)
clean_dcache_guest_page(pfn, vma_pagesize);
@@ -911,11 +908,17 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
 memcache);
}
 
+   /* Mark the page dirty only if the fault is handled successfully */
+   if (writable && !ret) {
+   kvm_set_pfn_dirty(pfn);
+   mark_page_dirty(kvm, gfn);
+   }
+
 out_unlock:
spin_unlock(&kvm->mmu_lock);
kvm_set_pfn_accessed(pfn);
kvm_release_pfn_clean(pfn);
-   return ret;
+   return ret != -EAGAIN ? ret : 0;
 }
 
 /* Resolve the access fault by making the page young again. */
-- 
2.19.1



[PATCH v2 3/3] KVM: arm64: Mark the page dirty only if the fault is handled successfully

2020-12-16 Thread Yanan Wang
We now mark the page dirty and set the bitmap before calling fault handlers
in user_mem_abort(), and we might end up having spurious dirty pages if
update of permissions or mapping has failed.
So, mark the page dirty only if the fault is handled successfully.

Let the vCPU directly enter the guest again rather than returning to
userspace if we were trying to recreate the same mapping or only change
access permissions with BBM, which is not permitted in the mapping path.

Signed-off-by: Yanan Wang 
---
 arch/arm64/kvm/mmu.c | 18 ++
 1 file changed, 14 insertions(+), 4 deletions(-)

diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
index 75814a02d189..72e516a10914 100644
--- a/arch/arm64/kvm/mmu.c
+++ b/arch/arm64/kvm/mmu.c
@@ -879,11 +879,8 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
if (vma_pagesize == PAGE_SIZE && !force_pte)
vma_pagesize = transparent_hugepage_adjust(memslot, hva,
   &pfn, &fault_ipa);
-   if (writable) {
+   if (writable)
prot |= KVM_PGTABLE_PROT_W;
-   kvm_set_pfn_dirty(pfn);
-   mark_page_dirty(kvm, gfn);
-   }
 
if (fault_status != FSC_PERM && !device)
clean_dcache_guest_page(pfn, vma_pagesize);
@@ -911,6 +908,19 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
 memcache);
}
 
+   /* Mark the page dirty only if the fault is handled successfully */
+   if (writable && !ret) {
+   kvm_set_pfn_dirty(pfn);
+   mark_page_dirty(kvm, gfn);
+   }
+
+   /* Let the guest directly enter again if we were trying to recreate the
+* same mapping or only change access permissions with BBM, which is not
+* permitted in the mapping path.
+*/
+   if (ret == -EAGAIN)
+   ret = 0;
+
 out_unlock:
spin_unlock(&kvm->mmu_lock);
kvm_set_pfn_accessed(pfn);
-- 
2.19.1



[PATCH v2 0/3] RFC: Solve several problems in stage 2 translation

2020-12-16 Thread Yanan Wang
Hi, this is the second version, thanks for reading.

PATCH1/3:
Procedures of hyp stage 1 mapping and guest stage 2 mapping are different, but
they are tied closely by function kvm_set_valid_leaf_pte(). So separate them by
rewriting kvm_set_valid_leaf_pte().

PATCH2/3:
To avoid unnecessary updates and small loops, add prejudgement in the translation
fault handler: Skip updating the PTE with break-before-make if we are trying to
recreate the exact same mapping or only change the access permissions. Actually,
change of permissions will be handled through the relax_perms path next time if
necessary.

(1) If there are some vCPUs accessing the same GPA at the same time and the
leaf PTE is not set yet, then they will all cause translation faults, and the
first vCPU holding mmu_lock will set the valid leaf PTE; the others will later
update the old PTE with a new one if they are different.

(2) When changing a leaf entry or a table entry with break-before-make, if
some vCPUs accessing the same GPA happen to catch the moment when the target
PTE is set invalid in a BBM procedure, they will all cause translation faults
and will later update the old PTE with a new one if they are different.

The worst case can be like this: vCPU A causes a translation fault with RW
prot and sets the leaf PTE with RW permissions, and then the next vCPU B with
RO prot updates the PTE back to RO permissions with break-before-make. The
BBM-invalid moment may trigger more unnecessary translation faults, and some
useless small loops might then occur, which could lead to vCPU stuck.
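
Under the stated assumption that each BBM-invalid window immediately
re-triggers a fault on the other vCPU, the ping-pong and the effect of the
filter can be simulated in miniature (purely illustrative, no kernel APIs):

#include <stdint.h>
#include <stdio.h>

#define VALID		(1ULL << 0)
#define S2AP_W		(1ULL << 7)
#define S2_PERMS	((1ULL << 6) | (1ULL << 7) | (1ULL << 54))

int main(void)
{
	uint64_t pte = 0;				/* invalid: first fault installs */
	uint64_t want[2] = { VALID | S2AP_W, VALID };	/* vCPU A: RW, vCPU B: RO */
	int faults = 0, filter = 1;			/* set to 0 to see the ping-pong */

	for (int vcpu = 0; faults < 10; vcpu ^= 1, faults++) {
		uint64_t new = want[vcpu];

		if ((pte & VALID) && filter && !((pte ^ new) & ~S2_PERMS))
			break;	/* -EAGAIN: re-enter the guest, no BBM */
		pte = new;	/* (break-before-make and TLBI elided) */
	}
	printf("faults taken before settling: %d\n", faults + 1);
	return 0;
}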

PATCH3/3:
We now mark the page dirty and set the bitmap before calling fault handlers in
user_mem_abort(), and we might end up having spurious dirty pages if update of
permissions or mapping has failed. So, mark the page dirty only if the fault is
handled successfully.

Let the vCPU directly enter the guest again rather than returning to userspace
if we were trying to recreate the same mapping or only change access
permissions with BBM, which is not permitted in the mapping path.

Changes from v1:
- Make part of the diff as an independent patch (PATCH1/3),
  and add Will's Signed-off-by.
- Return -EPERM when only changing permissions in the mapping path
- Add a new patch (PATCH3/3).

Yanan Wang (3):
  KVM: arm64: Decouple partial code of hyp stage 1 mapping and guest
stage 2 mapping
  KVM: arm64: Add prejudgement for relaxing permissions only case in
stage2 translation fault handler
  KVM: arm64: Mark the page dirty only if the fault is handled
successfully

 arch/arm64/kvm/hyp/pgtable.c | 78 
 arch/arm64/kvm/mmu.c | 18 +++--
 2 files changed, 58 insertions(+), 38 deletions(-)

-- 
2.19.1



[PATCH v2 2/3] KVM: arm64: Add prejudgement for relaxing permissions only case in stage2 translation fault handler

2020-12-16 Thread Yanan Wang
During dirty logging, or just after it has been stopped, and even in normal
running time of a guest configured with huge mappings and many vCPUs,
translation faults by different vCPUs on the same GPA can occur successively
almost at the same time. There are two reasons for it.

(1) If there are some vCPUs accessing the same GPA at the same time and
the leaf PTE is not set yet, then they will all cause translation faults
and the first vCPU holding mmu_lock will set valid leaf PTE, and the
others will later update the old PTE with a new one if they are different.

(2) When changing a leaf entry or a table entry with break-before-make, if
some vCPUs accessing the same GPA happen to catch the moment when the target
PTE is set invalid in a BBM procedure, they will all cause translation faults
and will later update the old PTE with a new one if they are different.

The worst case can be like this: vCPU A causes a translation fault with RW
prot and sets the leaf PTE with RW permissions, and then the next vCPU B
with RO prot updates the PTE back to RO permissions with break-before-make.
The BBM-invalid moment may trigger more unnecessary translation faults;
some useless small loops might then occur, which could lead to vCPU stuck.

To avoid unnecessary updates and small loops, add prejudgement in the
translation fault handler: Skip updating the PTE with break-before-make
if we are trying to recreate the exact same mapping or only change the
access permissions. Actually, change of permissions will be handled
through the relax_perms path next time if necessary.

Signed-off-by: Yanan Wang 
---
 arch/arm64/kvm/hyp/pgtable.c | 28 +++-
 1 file changed, 19 insertions(+), 9 deletions(-)

diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c
index 350f9f810930..8225ced49bad 100644
--- a/arch/arm64/kvm/hyp/pgtable.c
+++ b/arch/arm64/kvm/hyp/pgtable.c
@@ -45,6 +45,10 @@
 
 #define KVM_PTE_LEAF_ATTR_HI_S2_XN BIT(54)
 
+#define KVM_PTE_LEAF_ATTR_S2_PERMS	(KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R | \
+					 KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W | \
+					 KVM_PTE_LEAF_ATTR_HI_S2_XN)
+
 struct kvm_pgtable_walk_data {
struct kvm_pgtable  *pgt;
struct kvm_pgtable_walker   *walker;
@@ -460,7 +464,7 @@ static int stage2_map_set_prot_attr(enum kvm_pgtable_prot prot,
return 0;
 }
 
-static bool stage2_map_walker_try_leaf(u64 addr, u64 end, u32 level,
+static int stage2_map_walker_try_leaf(u64 addr, u64 end, u32 level,
   kvm_pte_t *ptep,
   struct stage2_map_data *data)
 {
@@ -469,13 +473,18 @@ static bool stage2_map_walker_try_leaf(u64 addr, u64 end, u32 level,
struct page *page = virt_to_page(ptep);
 
if (!kvm_block_mapping_supported(addr, end, phys, level))
-   return false;
+   return 1;
 
new = kvm_init_valid_leaf_pte(phys, data->attr, level);
if (kvm_pte_valid(old)) {
-   /* Tolerate KVM recreating the exact same mapping */
-   if (old == new)
-   goto out;
+   /*
+* Skip updating the PTE with break-before-make if we are trying
+* to recreate the exact same mapping or only change the access
+* permissions. Actually, change of permissions will be handled
+* through the relax_perms path next time if necessary.
+*/
+   if (!((old ^ new) & (~KVM_PTE_LEAF_ATTR_S2_PERMS)))
+   return -EAGAIN;
 
/* There's an existing different valid leaf entry, so perform
 * break-before-make.
@@ -487,9 +496,8 @@ static bool stage2_map_walker_try_leaf(u64 addr, u64 end, u32 level,
 
smp_store_release(ptep, new);
get_page(page);
-out:
data->phys += granule;
-   return true;
+   return 0;
 }
 
 static int stage2_map_walk_table_pre(u64 addr, u64 end, u32 level,
@@ -517,6 +525,7 @@ static int stage2_map_walk_table_pre(u64 addr, u64 end, u32 level,
 static int stage2_map_walk_leaf(u64 addr, u64 end, u32 level, kvm_pte_t *ptep,
struct stage2_map_data *data)
 {
+   int ret;
kvm_pte_t *childp, pte = *ptep;
struct page *page = virt_to_page(ptep);
 
@@ -527,8 +536,9 @@ static int stage2_map_walk_leaf(u64 addr, u64 end, u32 level, kvm_pte_t *ptep,
return 0;
}
 
-   if (stage2_map_walker_try_leaf(addr, end, level, ptep, data))
-   return 0;
+   ret = stage2_map_walker_try_leaf(addr, end, level, ptep, data);
+   if (ret <= 0)
+   return ret;
 
if (WARN_ON(level == KVM_PGTABLE_MAX_LEVELS - 1))
return -EINVAL;
-- 
2.19.1



[PATCH v2 1/3] KVM: arm64: Decouple partial code of hyp stage 1 mapping and guest stage 2 mapping

2020-12-16 Thread Yanan Wang
Procedures of hyp stage 1 mapping and guest stage 2 mapping are different,
but they are tied closely by function kvm_set_valid_leaf_pte().
So separate them by rewriting kvm_set_valid_leaf_pte().

Signed-off-by: Will Deacon 
Signed-off-by: Yanan Wang 
---
 arch/arm64/kvm/hyp/pgtable.c | 54 ++--
 1 file changed, 27 insertions(+), 27 deletions(-)

diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c
index bdf8e55ed308..350f9f810930 100644
--- a/arch/arm64/kvm/hyp/pgtable.c
+++ b/arch/arm64/kvm/hyp/pgtable.c
@@ -170,10 +170,9 @@ static void kvm_set_table_pte(kvm_pte_t *ptep, kvm_pte_t *childp)
smp_store_release(ptep, pte);
 }
 
-static bool kvm_set_valid_leaf_pte(kvm_pte_t *ptep, u64 pa, kvm_pte_t attr,
-  u32 level)
+static kvm_pte_t kvm_init_valid_leaf_pte(u64 pa, kvm_pte_t attr, u32 level)
 {
-   kvm_pte_t old = *ptep, pte = kvm_phys_to_pte(pa);
+   kvm_pte_t pte = kvm_phys_to_pte(pa);
u64 type = (level == KVM_PGTABLE_MAX_LEVELS - 1) ? KVM_PTE_TYPE_PAGE :
   KVM_PTE_TYPE_BLOCK;
 
@@ -181,12 +180,7 @@ static bool kvm_set_valid_leaf_pte(kvm_pte_t *ptep, u64 pa, kvm_pte_t attr,
pte |= FIELD_PREP(KVM_PTE_TYPE, type);
pte |= KVM_PTE_VALID;
 
-   /* Tolerate KVM recreating the exact same mapping. */
-   if (kvm_pte_valid(old))
-   return old == pte;
-
-   smp_store_release(ptep, pte);
-   return true;
+   return pte;
 }
 
 static int kvm_pgtable_visitor_cb(struct kvm_pgtable_walk_data *data, u64 addr,
@@ -341,12 +335,17 @@ static int hyp_map_set_prot_attr(enum kvm_pgtable_prot prot,
 static bool hyp_map_walker_try_leaf(u64 addr, u64 end, u32 level,
kvm_pte_t *ptep, struct hyp_map_data *data)
 {
+   kvm_pte_t new, old = *ptep;
u64 granule = kvm_granule_size(level), phys = data->phys;
 
if (!kvm_block_mapping_supported(addr, end, phys, level))
return false;
 
-   WARN_ON(!kvm_set_valid_leaf_pte(ptep, phys, data->attr, level));
+   /* Tolerate KVM recreating the exact same mapping */
+   new = kvm_init_valid_leaf_pte(phys, data->attr, level);
+   if (old != new && !WARN_ON(kvm_pte_valid(old)))
+   smp_store_release(ptep, new);
+
data->phys += granule;
return true;
 }
@@ -465,27 +464,29 @@ static bool stage2_map_walker_try_leaf(u64 addr, u64 end, u32 level,
   kvm_pte_t *ptep,
   struct stage2_map_data *data)
 {
+   kvm_pte_t new, old = *ptep;
u64 granule = kvm_granule_size(level), phys = data->phys;
+   struct page *page = virt_to_page(ptep);
 
if (!kvm_block_mapping_supported(addr, end, phys, level))
return false;
 
-   /*
-* If the PTE was already valid, drop the refcount on the table
-* early, as it will be bumped-up again in stage2_map_walk_leaf().
-* This ensures that the refcount stays constant across a valid to
-* valid PTE update.
-*/
-   if (kvm_pte_valid(*ptep))
-   put_page(virt_to_page(ptep));
+   new = kvm_init_valid_leaf_pte(phys, data->attr, level);
+   if (kvm_pte_valid(old)) {
+   /* Tolerate KVM recreating the exact same mapping */
+   if (old == new)
+   goto out;
 
-   if (kvm_set_valid_leaf_pte(ptep, phys, data->attr, level))
-   goto out;
+   /* There's an existing different valid leaf entry, so perform
+* break-before-make.
+*/
+   kvm_set_invalid_pte(ptep);
+   kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, data->mmu, addr, level);
+   put_page(page);
+   }
 
-   /* There's an existing valid leaf entry, so perform break-before-make */
-   kvm_set_invalid_pte(ptep);
-   kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, data->mmu, addr, level);
-   kvm_set_valid_leaf_pte(ptep, phys, data->attr, level);
+   smp_store_release(ptep, new);
+   get_page(page);
 out:
data->phys += granule;
return true;
@@ -527,7 +528,7 @@ static int stage2_map_walk_leaf(u64 addr, u64 end, u32 level, kvm_pte_t *ptep,
}
 
if (stage2_map_walker_try_leaf(addr, end, level, ptep, data))
-   goto out_get_page;
+   return 0;
 
if (WARN_ON(level == KVM_PGTABLE_MAX_LEVELS - 1))
return -EINVAL;
@@ -551,9 +552,8 @@ static int stage2_map_walk_leaf(u64 addr, u64 end, u32 level, kvm_pte_t *ptep,
}
 
kvm_set_table_pte(ptep, childp);
-
-out_get_page:
get_page(page);
+
return 0;
 }
 
-- 
2.19.1



[RFC PATCH 0/1] Add prejudgement for relaxing permissions only case

2020-12-11 Thread Yanan Wang
Hi folks,

Found that during dirty logging, or just after it has been stopped, and even
in normal running time of a guest configured with huge mappings and many
vCPUs, translation faults by different vCPUs on the same GPA could occur
successively almost at the same time. See below for the trace log; there are
two reasons to explain it.

(1) If there are some vCPUs accessing the same GPA at the same time and the
leaf PTE is not set yet, then they will all cause translation faults, and the
first vCPU holding mmu_lock will set the valid leaf PTE; the others will
later choose whether to update the leaf PTE or not.

(2) When changing a leaf entry or a table entry with break-before-make, if
some vCPUs accessing the same GPA happen to catch the moment when the target
PTE is set invalid in a BBM procedure, they will all cause translation faults
and will later choose whether to update the leaf PTE or not.

The worst case can be like this: some vCPUs cause translation faults on the
same GPA with different prots, and they fight each other by changing back the
access permissions of the PTE with break-before-make. The BBM-invalid moment
might trigger more unnecessary translation faults. As a result, some useless
small loops will occur, which could lead to vCPU stuck. We have occasionally
hit this stuck state in guest migration and at migration-stop time.

To avoid unnecessary updates and small loops, add prejudgement in the
translation fault handler: Skip updating the valid leaf PTE if we are trying
to recreate exactly the same mapping or to reduce access permissions only
(such as RW-->RO). And update the valid leaf PTE without break-before-make if
we are trying to add more permissions only.
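
A sketch of the comparison this implies, runnable in userspace; flipping XN
first makes every permission bit 1-when-allowed, so "adds permissions only"
can be read off a single comparison. The RFC diff further below is truncated
mid-expression, so the exact "<=" form is an assumption:

#include <stdint.h>
#include <stdio.h>

#define S2AP_R	(1ULL << 6)
#define S2AP_W	(1ULL << 7)
#define S2_XN	(1ULL << 54)
#define PERMS	(S2AP_R | S2AP_W | S2_XN)

static int adds_permissions(uint64_t old, uint64_t new)
{
	/* XOR with XN: R/W are 1-when-allowed, XN is 1-when-forbidden. */
	uint64_t old_attr = (old & PERMS) ^ S2_XN;
	uint64_t new_attr = (new & PERMS) ^ S2_XN;

	return new_attr > old_attr;	/* assumed reading of the truncated "<=" */
}

int main(void)
{
	uint64_t ro = S2AP_R | S2_XN;		/* readable, no write, no exec */
	uint64_t rw = S2AP_R | S2AP_W | S2_XN;	/* read-write, no exec */

	printf("RO -> RW adds permissions: %d\n", adds_permissions(ro, rw)); /* 1 */
	printf("RW -> RO adds permissions: %d\n", adds_permissions(rw, ro)); /* 0 */
	return 0;
}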
 

Yanan Wang (1):
  KVM: arm64: Add prejudgement for relaxing permissions only case in
    stage2 translation fault handler

 arch/arm64/kvm/hyp/pgtable.c | 73 +---
 1 file changed, 52 insertions(+), 21 deletions(-)

-- 
2.19.1

Trace log for a guest with 96 vCPUs and huge mappings by 1G.

*
Recreating the same mappings and small loops in dirty-logging period.
*
Recreating the same mappings:
  CPU 94/KVM-8590[094] ...2 82538.821614: user_mem_abort: logging_active 1, vcpu_id 94, f_ipa 0x83fffc000, fault_status 0x4, prot 0x6, vma_pagesize 4096, write_fault 1, exec_fault 0
  CPU 94/KVM-8590[094] ...2 82538.821615: stage2_map_walker_try_leaf_equal: addr 0x83fffc000, level 3, old_pte 0x40002a7fffc7ff, new_pte 0x40002a7fffc7ff
  CPU 55/KVM-8547[055] ...2 82538.821618: user_mem_abort: logging_active 1, vcpu_id 55, f_ipa 0x83fffc000, fault_status 0x4, prot 0x6, vma_pagesize 4096, write_fault 1, exec_fault 0
  CPU 55/KVM-8547[055] ...2 82538.821619: stage2_map_walker_try_leaf_equal: addr 0x83fffc000, level 3, old_pte 0x40002a7fffc7ff, new_pte 0x40002a7fffc7ff
  CPU 78/KVM-8572[078] ...2 82538.821620: user_mem_abort: logging_active 1, vcpu_id 78, f_ipa 0x83fffc000, fault_status 0x4, prot 0x6, vma_pagesize 4096, write_fault 1, exec_fault 0
  CPU 78/KVM-8572[078] ...2 82538.821622: stage2_map_walker_try_leaf_equal: addr 0x83fffc000, level 3, old_pte 0x40002a7fffc7ff, new_pte 0x40002a7fffc7ff
  CPU 59/KVM-8552[059] ...2 82538.821624: user_mem_abort: logging_active 1, vcpu_id 59, f_ipa 0x83fffc000, fault_status 0x4, prot 0x6, vma_pagesize 4096, write_fault 1, exec_fault 0
  CPU 59/KVM-8552[059] ...2 82538.821625: stage2_map_walker_try_leaf_equal: addr 0x83fffc000, level 3, old_pte 0x40002a7fffc7ff, new_pte 0x40002a7fffc7ff
  CPU 57/KVM-8549[057] ...2 82538.821626: user_mem_abort: logging_active 1, vcpu_id 57, f_ipa 0x83fffc000, fault_status 0x4, prot 0x6, vma_pagesize 4096, write_fault 1, exec_fault 0
  CPU 57/KVM-8549[057] ...2 82538.821626: stage2_map_walker_try_leaf_equal: addr 0x83fffc000, level 3, old_pte 0x40002a7fffc7ff, new_

[RFC PATCH] KVM: arm64: Add prejudgement for relaxing permissions only case in stage2 translation fault handler

2020-12-11 Thread Yanan Wang
During dirty logging, or just after it has been stopped, and even in normal
running time of a guest configured with huge mappings and many vCPUs,
translation faults by different vCPUs on the same GPA can occur successively
almost at the same time. There are two reasons for it.

(1) If there are some vCPUs accessing the same GPA at the same time
and the leaf PTE is not set yet, then they will all cause translation
faults, and the first vCPU holding mmu_lock will set the valid leaf PTE;
the others will later choose whether to update the leaf PTE or not.

(2) When changing a leaf entry or a table entry with break-before-make, if
some vCPUs accessing the same GPA happen to catch the moment when the target
PTE is set invalid in a BBM procedure, they will all cause translation
faults and will later choose whether to update the leaf PTE or not.

The worst case can be like this: some vCPUs cause translation faults
on the same GPA with different prots, they will fight each other by
changing back access permissions of the PTE with break-before-make.
And the BBM-invalid moment might trigger more unnecessary translation
faults. As a result, some useless small loops will occur, which could
lead to vCPU stuck.

To avoid unnecessary updates and small loops, add prejudgement in the
translation fault handler: Skip updating the valid leaf PTE if we are
trying to recreate exactly the same mapping or to reduce access
permissions only (such as RW-->RO). And update the valid leaf PTE without
break-before-make if we are trying to add more permissions only.

Signed-off-by: Yanan Wang 
---
 arch/arm64/kvm/hyp/pgtable.c | 73 +---
 1 file changed, 52 insertions(+), 21 deletions(-)

diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c
index 23a01dfcb27a..f8b3248cef1c 100644
--- a/arch/arm64/kvm/hyp/pgtable.c
+++ b/arch/arm64/kvm/hyp/pgtable.c
@@ -45,6 +45,8 @@
 
 #define KVM_PTE_LEAF_ATTR_HI_S2_XN BIT(54)
 
+#define KVM_PTE_LEAF_ATTR_PERMS	(GENMASK(7, 6) | BIT(54))
+
 struct kvm_pgtable_walk_data {
struct kvm_pgtable  *pgt;
struct kvm_pgtable_walker   *walker;
@@ -170,10 +172,9 @@ static void kvm_set_table_pte(kvm_pte_t *ptep, kvm_pte_t *childp)
smp_store_release(ptep, pte);
 }
 
-static bool kvm_set_valid_leaf_pte(kvm_pte_t *ptep, u64 pa, kvm_pte_t attr,
-  u32 level)
+static kvm_pte_t kvm_init_valid_leaf_pte(u64 pa, kvm_pte_t attr, u32 level)
 {
-   kvm_pte_t old = *ptep, pte = kvm_phys_to_pte(pa);
+   kvm_pte_t pte = kvm_phys_to_pte(pa);
u64 type = (level == KVM_PGTABLE_MAX_LEVELS - 1) ? KVM_PTE_TYPE_PAGE :
   KVM_PTE_TYPE_BLOCK;
 
@@ -181,12 +182,7 @@ static bool kvm_set_valid_leaf_pte(kvm_pte_t *ptep, u64 pa, kvm_pte_t attr,
pte |= FIELD_PREP(KVM_PTE_TYPE, type);
pte |= KVM_PTE_VALID;
 
-   /* Tolerate KVM recreating the exact same mapping. */
-   if (kvm_pte_valid(old))
-   return old == pte;
-
-   smp_store_release(ptep, pte);
-   return true;
+   return pte;
 }
 
 static int kvm_pgtable_visitor_cb(struct kvm_pgtable_walk_data *data, u64 addr,
@@ -341,12 +337,17 @@ static int hyp_map_set_prot_attr(enum kvm_pgtable_prot prot,
 static bool hyp_map_walker_try_leaf(u64 addr, u64 end, u32 level,
kvm_pte_t *ptep, struct hyp_map_data *data)
 {
+   kvm_pte_t new, old = *ptep;
u64 granule = kvm_granule_size(level), phys = data->phys;
 
if (!kvm_block_mapping_supported(addr, end, phys, level))
return false;
 
-   WARN_ON(!kvm_set_valid_leaf_pte(ptep, phys, data->attr, level));
+   /* Tolerate KVM recreating the exact same mapping. */
+   new = kvm_init_valid_leaf_pte(phys, data->attr, level);
+   if (old != new && !WARN_ON(kvm_pte_valid(old)))
+   smp_store_release(ptep, new);
+
data->phys += granule;
return true;
 }
@@ -461,25 +462,56 @@ static int stage2_map_set_prot_attr(enum kvm_pgtable_prot prot,
return 0;
 }
 
+static bool stage2_set_valid_leaf_pte_pre(u64 addr, u32 level,
+					  kvm_pte_t *ptep, kvm_pte_t new,
+					  struct stage2_map_data *data)
+{
+   kvm_pte_t old = *ptep, old_attr, new_attr;
+
+   if ((old ^ new) & (~KVM_PTE_LEAF_ATTR_PERMS))
+   return false;
+
+   /*
+* Skip updating if we are trying to recreate exactly the same mapping
+* or to reduce the access permissions only. And update the valid leaf
+* PTE without break-before-make if we are trying to add more access
+* permissions only.
+*/
+   old_attr = (old & KVM_PTE_LEAF_ATTR_PERMS) ^ KVM_PTE_LEAF_ATTR_HI_S2_XN;
+   new_attr = (new & KVM_PTE_LEAF_ATTR_PERMS) ^ KVM_PTE_LEAF_ATTR_HI_S2_XN;
+   if (new_attr <=

[PATCH v2 2/3] KVM: arm64: Fix handling of merging tables into a block entry

2020-12-01 Thread Yanan Wang
In the dirty logging case (logging_active == true), we need to collapse a
block entry into a table if necessary. After dirty logging is canceled,
when merging tables back into a block entry, we should not only free
the non-huge page-table pages but also invalidate all the TLB entries of
non-huge mappings for the block. Without enough TLBI, multiple TLB entries
for the memory in the block will be cached.
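
The arithmetic behind preferring one full-VMID flush over per-IPA
invalidation when a 1G block is re-installed over stale 4K leaves
(back-of-the-envelope only, no kernel calls):

#include <stdio.h>

int main(void)
{
	unsigned long block = 1UL << 30;	/* 1G block being re-installed */
	unsigned long page = 1UL << 12;	/* stale leaf granule: 4K */

	printf("per-IPA TLBIs needed: %lu\n", block / page);	/* 262144 */
	printf("full-VMID flushes:    1\n");
	return 0;
}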

Signed-off-by: Will Deacon 
Signed-off-by: Yanan Wang 
---
 arch/arm64/kvm/hyp/pgtable.c | 8 +++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c
index b232bdd142a6..23a01dfcb27a 100644
--- a/arch/arm64/kvm/hyp/pgtable.c
+++ b/arch/arm64/kvm/hyp/pgtable.c
@@ -496,7 +496,13 @@ static int stage2_map_walk_table_pre(u64 addr, u64 end, u32 level,
return 0;
 
kvm_set_invalid_pte(ptep);
-   kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, data->mmu, addr, 0);
+
+   /*
+* Invalidate the whole stage-2, as we may have numerous leaf
+* entries below us which would otherwise need invalidating
+* individually.
+*/
+   kvm_call_hyp(__kvm_tlb_flush_vmid, data->mmu);
data->anchor = ptep;
return 0;
 }
-- 
2.19.1



[PATCH v2 0/3] Fix several bugs in KVM stage 2 translation

2020-12-01 Thread Yanan Wang
When installing a new pte entry or updating an old valid entry in stage 2
translation, we use get_page()/put_page() to record page_count of the page-table
pages. PATCH 1/3 aims to fix incorrect use of get_page()/put_page() in stage 2,
which might make page-table pages unable to be freed when unmapping a range.
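
A toy model of the invariant PATCH 1/3 restores: a page-table page's
refcount tracks its valid entries, so a valid->valid update must pair one
put with one get and leave the count unchanged (plain counters stand in for
struct page refcounts):

#include <assert.h>
#include <stdio.h>

static int refcount;		/* refcount of the table page */

static void get_page(void) { refcount++; }
static void put_page(void) { refcount--; }

static void install_leaf(int was_valid)
{
	if (was_valid)
		put_page();	/* drop the ref held by the old entry */
	/* ... break-before-make, then publish the new PTE ... */
	get_page();		/* the new entry holds one ref again */
}

int main(void)
{
	install_leaf(0);	/* first map: 0 -> 1 */
	assert(refcount == 1);
	install_leaf(1);	/* valid -> valid update: stays 1 */
	assert(refcount == 1);
	printf("refcount: %d\n", refcount);
	return 0;
}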

When dirty logging of a guest with hugepages is finished, we should merge tables
back into a block entry if adjustment of huge mapping is found necessary.
In addition to installing the block entry, we should not only free the non-huge
page-table pages but also invalidate all the TLB entries of non-huge mappings
for the block. PATCH 2/3 adds enough TLBI when merging tables into a block
entry.

The rewrite of the page-table code and fault handling added two different
handlers for "just relaxing permissions" and "map by stage2 page-table walk",
which is a good improvement. Yet, in function user_mem_abort(), the conditions
under which we choose between these two fault handlers are not strictly
distinguished. This can cause guest errors such as an infinite loop (resulting
in a soft lockup) because the inappropriate fault handler is called. So, a
solution that strictly distinguishes the conditions is introduced in PATCH 3/3.

Changes from v1:
 * In PATCH 1/3, introduce a more concise fix.
 * In PATCH 2/3, using full S2 TLB invalidation when merging tables into
   a block entry.

Yanan Wang (3):
  KVM: arm64: Fix possible memory leak in kvm stage2
  KVM: arm64: Fix handling of merging tables into a block entry
  KVM: arm64: Add usage of stage 2 fault lookup level in
user_mem_abort()

 arch/arm64/include/asm/esr.h |  1 +
 arch/arm64/include/asm/kvm_emulate.h |  5 +
 arch/arm64/kvm/hyp/pgtable.c | 11 ++-
 arch/arm64/kvm/mmu.c | 11 +--
 4 files changed, 25 insertions(+), 3 deletions(-)


-- 
2.19.1



[PATCH v2 3/3] KVM: arm64: Add usage of stage 2 fault lookup level in user_mem_abort()

2020-12-01 Thread Yanan Wang
If we get a FSC_PERM fault, we currently just use (logging_active && writable)
to determine whether to call kvm_pgtable_stage2_map(). There are two more
cases we should consider.

(1) After logging_active is configured back from true to false: when we get
a FSC_PERM fault with write_fault and an adjustment of the hugepage is needed,
we should merge tables back into a block entry. This case is missed by still
calling kvm_pgtable_stage2_relax_perms(), which will lead to an endless loop
and a guest panic due to soft lockup.

(2) We use (FSC_PERM && logging_active && writable) to determine
collapsing a block entry into a table by calling kvm_pgtable_stage2_map().
But sometimes we may only need to relax permissions when trying to write
to a page other than a block.
In this condition, using kvm_pgtable_stage2_relax_perms() will be fine.

The ISS field bits[1:0] in the ESR_EL2 register indicate the stage-2 lookup
level at which a D-abort or I-abort occurred. By comparing the granule of
the fault lookup level with vma_pagesize, we can strictly distinguish the
conditions for calling kvm_pgtable_stage2_relax_perms() or
kvm_pgtable_stage2_map(), and the above two cases will be well considered.
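
A sketch of the new comparison, assuming 4K pages (PAGE_SHIFT = 12) and the
kernel's ARM64_HW_PGTABLE_LEVEL_SHIFT() formula; the ESR value below is
hypothetical, only its bits[1:0] matter here:

#include <stdint.h>
#include <stdio.h>

#define PAGE_SHIFT	12
#define LEVEL_SHIFT(n)	(((PAGE_SHIFT - 3) * (4 - (n))) + 3)
#define ESR_ELx_FSC_LEVEL	0x03

int main(void)
{
	uint64_t esr = 0x93c08007;	/* hypothetical ESR, level 3 in bits[1:0] */
	unsigned int level = esr & ESR_ELx_FSC_LEVEL;
	unsigned long fault_granule = 1UL << LEVEL_SHIFT(level);
	unsigned long vma_pagesize = 4096;

	/* level 3 -> 4K, level 2 -> 2M, level 1 -> 1G */
	printf("fault_granule: %lu\n", fault_granule);

	if (vma_pagesize == fault_granule)
		printf("FSC_PERM: just relax permissions\n");
	else
		printf("FSC_PERM: remap via kvm_pgtable_stage2_map()\n");
	return 0;
}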

Suggested-by: Keqian Zhu 
Signed-off-by: Yanan Wang 
---
 arch/arm64/include/asm/esr.h |  1 +
 arch/arm64/include/asm/kvm_emulate.h |  5 +
 arch/arm64/kvm/mmu.c | 11 +--
 3 files changed, 15 insertions(+), 2 deletions(-)

diff --git a/arch/arm64/include/asm/esr.h b/arch/arm64/include/asm/esr.h
index 22c81f1edda2..85a3e49f92f4 100644
--- a/arch/arm64/include/asm/esr.h
+++ b/arch/arm64/include/asm/esr.h
@@ -104,6 +104,7 @@
 /* Shared ISS fault status code(IFSC/DFSC) for Data/Instruction aborts */
#define ESR_ELx_FSC		(0x3F)
 #define ESR_ELx_FSC_TYPE   (0x3C)
+#define ESR_ELx_FSC_LEVEL  (0x03)
 #define ESR_ELx_FSC_EXTABT (0x10)
 #define ESR_ELx_FSC_SERROR (0x11)
 #define ESR_ELx_FSC_ACCESS (0x08)
diff --git a/arch/arm64/include/asm/kvm_emulate.h b/arch/arm64/include/asm/kvm_emulate.h
index 5ef2669ccd6c..00bc6f1234ba 100644
--- a/arch/arm64/include/asm/kvm_emulate.h
+++ b/arch/arm64/include/asm/kvm_emulate.h
@@ -350,6 +350,11 @@ static __always_inline u8 kvm_vcpu_trap_get_fault_type(const struct kvm_vcpu *vc
return kvm_vcpu_get_esr(vcpu) & ESR_ELx_FSC_TYPE;
 }
 
+static __always_inline u8 kvm_vcpu_trap_get_fault_level(const struct kvm_vcpu *vcpu)
+{
+   return kvm_vcpu_get_esr(vcpu) & ESR_ELx_FSC_LEVEL;
+}
+
 static __always_inline bool kvm_vcpu_abt_issea(const struct kvm_vcpu *vcpu)
 {
switch (kvm_vcpu_trap_get_fault(vcpu)) {
diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
index 1a01da9fdc99..75814a02d189 100644
--- a/arch/arm64/kvm/mmu.c
+++ b/arch/arm64/kvm/mmu.c
@@ -754,10 +754,12 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
gfn_t gfn;
kvm_pfn_t pfn;
bool logging_active = memslot_is_logging(memslot);
-   unsigned long vma_pagesize;
+   unsigned long fault_level = kvm_vcpu_trap_get_fault_level(vcpu);
+   unsigned long vma_pagesize, fault_granule;
enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_R;
struct kvm_pgtable *pgt;
 
+   fault_granule = 1UL << ARM64_HW_PGTABLE_LEVEL_SHIFT(fault_level);
write_fault = kvm_is_write_fault(vcpu);
exec_fault = kvm_vcpu_trap_is_exec_fault(vcpu);
VM_BUG_ON(write_fault && exec_fault);
@@ -896,7 +898,12 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
else if (cpus_have_const_cap(ARM64_HAS_CACHE_DIC))
prot |= KVM_PGTABLE_PROT_X;
 
-   if (fault_status == FSC_PERM && !(logging_active && writable)) {
+   /*
+* Under the premise of getting a FSC_PERM fault, we only need to relax
+* permissions if vma_pagesize equals fault_granule. Otherwise,
+* kvm_pgtable_stage2_map() should be called to change block size.
+*/
+   if (fault_status == FSC_PERM && vma_pagesize == fault_granule) {
ret = kvm_pgtable_stage2_relax_perms(pgt, fault_ipa, prot);
} else {
ret = kvm_pgtable_stage2_map(pgt, fault_ipa, vma_pagesize,
-- 
2.19.1


