[RFC PATCH] vfio: Implement new ioctl VFIO_IOMMU_GET_DIRTY_BITMAP

2018-04-10 Thread Yulei Zhang
Corresponding to the V4 migration patch set for VFIO PCI devices,
this patch implements the new ioctl VFIO_IOMMU_GET_DIRTY_BITMAP
to meet the requirements of vfio-mdev device live migration, which
needs to copy the memory that has been pinned in the IOMMU container
to the target VM in order to restore the mdev device state.
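
For reference, a hypothetical user-space usage sketch (not part of this
patch) could drive the new ioctl against an open VFIO container fd as
shown below. It assumes the uapi additions from this patch are visible
through <linux/vfio.h>, a 4K page size and 64-bit longs; "container_fd"
and the queried address range are made-up example values.

#include <stdio.h>
#include <stdlib.h>
#include <sys/ioctl.h>
#include <linux/types.h>
#include <linux/vfio.h>

static int dump_dirty_pages(int container_fd, __u64 start_addr, __u64 page_nr)
{
	/* mirror the kernel's sizing: (BITS_TO_LONGS(page_nr) + 1) longs */
	size_t bitmap_sz = ((page_nr + 63) / 64 + 1) * sizeof(unsigned long);
	struct vfio_iommu_get_dirty_bitmap *d;
	unsigned long *bitmap;
	__u64 i;

	d = calloc(1, sizeof(*d) + bitmap_sz);
	if (!d)
		return -1;

	d->start_addr = start_addr;	/* iova of the first page to query */
	d->page_nr = page_nr;		/* number of 4K pages to query */

	if (ioctl(container_fd, VFIO_IOMMU_GET_DIRTY_BITMAP, d)) {
		free(d);
		return -1;
	}

	/* the kernel fills in one bit per pinned (dirty) page */
	bitmap = (unsigned long *)d->dirty_bitmap;
	for (i = 0; i < page_nr; i++)
		if (bitmap[i / 64] & (1UL << (i % 64)))
			printf("dirty: iova 0x%llx\n",
			       (unsigned long long)(start_addr + i * 4096));

	free(d);
	return 0;
}

Note that the caller has to reserve room for the extra trailing long the
kernel copies back, hence the "+ 1" in the sizing above.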

Signed-off-by: Yulei Zhang 
---
 drivers/vfio/vfio_iommu_type1.c | 42 +
 include/uapi/linux/vfio.h   | 14 ++
 2 files changed, 56 insertions(+)

diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index 5c212bf..6cd2142 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -41,6 +41,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #define DRIVER_VERSION  "0.2"
 #define DRIVER_AUTHOR   "Alex Williamson "
@@ -1658,6 +1659,23 @@ static int vfio_domains_have_iommu_cache(struct 
vfio_iommu *iommu)
return ret;
 }
 
+static void vfio_dma_update_dirty_bitmap(struct vfio_iommu *iommu,
+   u64 start_addr, u64 npage, void *bitmap)
+{
+   u64 iova = start_addr;
+   struct vfio_dma *dma;
+   int i;
+
+   for (i = 0; i < npage; i++) {
+   dma = vfio_find_dma(iommu, iova, PAGE_SIZE);
+   if (dma)
+   if (vfio_find_vpfn(dma, iova))
+   set_bit(i, bitmap);
+
+   iova += PAGE_SIZE;
+   }
+}
+
 static long vfio_iommu_type1_ioctl(void *iommu_data,
   unsigned int cmd, unsigned long arg)
 {
@@ -1728,6 +1746,30 @@ static long vfio_iommu_type1_ioctl(void *iommu_data,
 
return copy_to_user((void __user *)arg, &unmap, minsz) ?
-EFAULT : 0;
+   } else if (cmd == VFIO_IOMMU_GET_DIRTY_BITMAP) {
+   struct vfio_iommu_get_dirty_bitmap d;
+   unsigned long bitmap_sz;
+   unsigned int *bitmap;
+
+   minsz = offsetofend(struct vfio_iommu_get_dirty_bitmap,
+   page_nr);
+
+   if (copy_from_user(&d, (void __user *)arg, minsz))
+   return -EFAULT;
+
+   bitmap_sz = (BITS_TO_LONGS(d.page_nr) + 1) *
+   sizeof(unsigned long);
+   bitmap = vzalloc(bitmap_sz);
+   vfio_dma_update_dirty_bitmap(iommu, d.start_addr,
+d.page_nr, bitmap);
+
+   if (copy_to_user((void __user *)arg + minsz,
+   bitmap, bitmap_sz)) {
+   vfree(bitmap);
+   return -EFAULT;
+   }
+   vfree(bitmap);
+   return 0;
}
 
return -ENOTTY;
diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
index 1aa7b82..d4fd5af 100644
--- a/include/uapi/linux/vfio.h
+++ b/include/uapi/linux/vfio.h
@@ -665,6 +665,20 @@ struct vfio_iommu_type1_dma_unmap {
 #define VFIO_IOMMU_ENABLE  _IO(VFIO_TYPE, VFIO_BASE + 15)
 #define VFIO_IOMMU_DISABLE _IO(VFIO_TYPE, VFIO_BASE + 16)
 
+/**
+ * VFIO_IOMMU_GET_DIRTY_BITMAP - _IOW(VFIO_TYPE, VFIO_BASE + 17,
+ * struct vfio_iommu_get_dirty_bitmap)
+ *
+ * Return: 0 on success, -errno on failure.
+ */
+struct vfio_iommu_get_dirty_bitmap {
+   __u64  start_addr;
+   __u64  page_nr;
+   __u8   dirty_bitmap[];
+};
+
+#define VFIO_IOMMU_GET_DIRTY_BITMAP _IO(VFIO_TYPE, VFIO_BASE + 17)
+
 /*  Additional API for SPAPR TCE (Server POWERPC) IOMMU  */
 
 /*
-- 
2.7.4



[RFC 1/9] Introduce new fields in kvm_arch/vcpu_arch struct for direct build EPT support

2020-08-05 Thread Yulei Zhang
From: Yulei Zhang 

Add the field global_root_hpa to save the root pointer of the directly
built global EPT, and a per-vCPU flag direct_build_tdp to indicate that
the vCPU is using the global EPT root.

Signed-off-by: Yulei Zhang 
---
 arch/x86/include/asm/kvm_host.h | 5 +
 1 file changed, 5 insertions(+)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 86e2e0272c57..2407b872f493 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -821,6 +821,9 @@ struct kvm_vcpu_arch {
 
/* AMD MSRC001_0015 Hardware Configuration */
u64 msr_hwcr;
+
+   /* vcpu use pre-constructed EPT */
+   bool direct_build_tdp;
 };
 
 struct kvm_lpage_info {
@@ -983,6 +986,8 @@ struct kvm_arch {
 
struct kvm_pmu_event_filter *pmu_event_filter;
struct task_struct *nx_lpage_recovery_thread;
+   /* global root hpa for pre-constructed EPT */
+   hpa_t  global_root_hpa;
 };
 
 struct kvm_vm_stat {
-- 
2.17.1



[RFC 0/9] KVM:x86/mmu:Introduce parallel memory virtualization to boost performance

2020-08-05 Thread Yulei Zhang
From: Yulei Zhang 

Currently KVM memory virtualization relies on mmu_lock to synchronize
memory mapping updates, which forces the vCPUs to work in serialized
mode and slows down execution, especially right after migration when
substantial memory mapping setup is needed; performance gets worse as
the number of vCPUs and the amount of guest memory grow.
  
The idea we present in this patch set is to mitigate the issue with a
pre-constructed memory mapping table. We fast-pin the guest memory and
build up a global memory mapping table according to the guest memslot
changes, then apply it to CR3, so that after the guest starts up all
the vCPUs can update memory concurrently; a performance improvement
is therefore expected.

After testing the initial patches with a memory-dirtying workload, we
have seen positive results even with huge pages enabled. For example,
for a guest with 32 vCPUs and 64G of memory, in 2M/1G huge page mode we
get more than 50% improvement.


Yulei Zhang (9):
  Introduce new fields in kvm_arch/vcpu_arch struct for direct build EPT
support
  Introduce page table population function for direct build EPT feature
  Introduce page table remove function for direct build EPT feature
  Add release function for direct build ept when guest VM exit
  Modify the page fault path to meet the direct build EPT requirement
  Apply the direct build EPT according to the memory slots change
  Add migration support when using direct build EPT
  Introduce kvm module parameter global_tdp to turn on the direct build
EPT mode
  Handle certain mmu exposed functions properly while turn on direct
build EPT mode

 arch/mips/kvm/mips.c|  13 +
 arch/powerpc/kvm/powerpc.c  |  13 +
 arch/s390/kvm/kvm-s390.c|  13 +
 arch/x86/include/asm/kvm_host.h |  13 +-
 arch/x86/kvm/mmu/mmu.c  | 537 ++--
 arch/x86/kvm/svm/svm.c  |   2 +-
 arch/x86/kvm/vmx/vmx.c  |  17 +-
 arch/x86/kvm/x86.c  |  55 ++--
 include/linux/kvm_host.h|   7 +-
 virt/kvm/kvm_main.c |  43 ++-
 10 files changed, 648 insertions(+), 65 deletions(-)

-- 
2.17.1



[RFC 3/9] Introduce page table remove function for direct build EPT feature

2020-08-05 Thread Yulei Zhang
From: Yulei Zhang 

While the guest boots up it will modify the memory slots multiple times,
so add a page table remove function to free the pre-pinned memory
according to the memory slot changes.

Signed-off-by: Yulei Zhang 
---
 arch/x86/kvm/mmu/mmu.c | 56 ++
 1 file changed, 56 insertions(+)

diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 1609012be67d..539974183653 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -6454,6 +6454,62 @@ int kvm_direct_tdp_populate_page_table(struct kvm *kvm, 
struct kvm_memory_slot *
return 0;
 }
 
+static int __kvm_remove_spte(struct kvm *kvm, u64 *addr, gfn_t gfn, int level)
+{
+   int i;
+   int ret = level;
+   bool present = false;
+   kvm_pfn_t pfn;
+   u64 *sptep = (u64 *)__va((*addr) & PT64_BASE_ADDR_MASK);
+   unsigned index = SHADOW_PT_INDEX(gfn << PAGE_SHIFT, level);
+
+   for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
+   if (is_shadow_present_pte(sptep[i])) {
+   if (i == index) {
+   if (!is_last_spte(sptep[i], level)) {
+   ret = __kvm_remove_spte(kvm, &sptep[i], 
gfn, level - 1);
+   if (is_shadow_present_pte(sptep[i]))
+   return ret;
+   } else {
+   pfn = spte_to_pfn(sptep[i]);
+   mmu_spte_clear_track_bits(&sptep[i]);
+   kvm_release_pfn_clean(pfn);
+   if (present)
+   return ret;
+   }
+   } else {
+   if (i > index)
+   return ret;
+   else
+   present = true;
+   }
+   }
+   }
+
+   if (!present) {
+   pfn = spte_to_pfn(*addr);
+   mmu_spte_clear_track_bits(addr);
+   kvm_release_pfn_clean(pfn);
+   }
+   return ret;
+}
+
+void kvm_direct_tdp_remove_page_table(struct kvm *kvm, struct kvm_memory_slot 
*slot)
+{
+   gfn_t gfn = slot->base_gfn;
+   int host_level;
+
+   if (!kvm->arch.global_root_hpa)
+   return;
+
+   for (gfn = slot->base_gfn;
+   gfn < slot->base_gfn + slot->npages;
+   gfn += KVM_PAGES_PER_HPAGE(host_level))
+   host_level = __kvm_remove_spte(kvm, 
&(kvm->arch.global_root_hpa), gfn, PT64_ROOT_4LEVEL);
+
+   kvm_flush_remote_tlbs(kvm);
+}
+
 /*
  * Calculate mmu pages needed for kvm.
  */
-- 
2.17.1



[RFC 2/9] Introduce page table population function for direct build EPT feature

2020-08-05 Thread Yulei Zhang
From: Yulei Zhang 

The page table population function pins the memory and pre-constructs
the EPT based on the given memory slot configuration, so that setting
up the page table no longer relies on the page fault path.

Signed-off-by: Yulei Zhang 
---
 arch/x86/include/asm/kvm_host.h |   2 +-
 arch/x86/kvm/mmu/mmu.c  | 212 +++-
 arch/x86/kvm/svm/svm.c  |   2 +-
 arch/x86/kvm/vmx/vmx.c  |  17 ++-
 include/linux/kvm_host.h|   4 +-
 virt/kvm/kvm_main.c |  30 -
 6 files changed, 250 insertions(+), 17 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 2407b872f493..69c946831ca7 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1152,7 +1152,7 @@ struct kvm_x86_ops {
int (*set_tss_addr)(struct kvm *kvm, unsigned int addr);
int (*set_identity_map_addr)(struct kvm *kvm, u64 ident_addr);
int (*get_tdp_level)(struct kvm_vcpu *vcpu);
-   u64 (*get_mt_mask)(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio);
+   u64 (*get_mt_mask)(struct kvm *kvm, struct kvm_vcpu *vcpu, gfn_t gfn, 
bool is_mmio);
 
void (*load_mmu_pgd)(struct kvm_vcpu *vcpu, unsigned long cr3);
 
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 70cf2c1a1423..1609012be67d 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -235,6 +235,11 @@ struct kvm_shadow_walk_iterator {
({ spte = mmu_spte_get_lockless(_walker.sptep); 1; });  \
 __shadow_walk_next(&(_walker), spte))
 
+#define for_each_direct_build_shadow_entry(_walker, shadow_addr, _addr, level) 
\
+   for (__shadow_walk_init(&(_walker), shadow_addr, _addr, level); 
\
+shadow_walk_okay(&(_walker));  
\
+shadow_walk_next(&(_walker)))
+
 static struct kmem_cache *pte_list_desc_cache;
 static struct kmem_cache *mmu_page_header_cache;
 static struct percpu_counter kvm_total_used_mmu_pages;
@@ -2564,13 +2569,20 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct 
kvm_vcpu *vcpu,
return sp;
 }
 
+static void __shadow_walk_init(struct kvm_shadow_walk_iterator *iterator,
+  hpa_t shadow_addr, u64 addr, int level)
+{
+   iterator->addr = addr;
+   iterator->shadow_addr = shadow_addr;
+   iterator->level = level;
+   iterator->sptep = NULL;
+}
+
 static void shadow_walk_init_using_root(struct kvm_shadow_walk_iterator 
*iterator,
struct kvm_vcpu *vcpu, hpa_t root,
u64 addr)
 {
-   iterator->addr = addr;
-   iterator->shadow_addr = root;
-   iterator->level = vcpu->arch.mmu->shadow_root_level;
+   __shadow_walk_init(iterator, root, addr, 
vcpu->arch.mmu->shadow_root_level);
 
if (iterator->level == PT64_ROOT_4LEVEL &&
vcpu->arch.mmu->root_level < PT64_ROOT_4LEVEL &&
@@ -3037,7 +3049,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
if (level > PT_PAGE_TABLE_LEVEL)
spte |= PT_PAGE_SIZE_MASK;
if (tdp_enabled)
-   spte |= kvm_x86_ops.get_mt_mask(vcpu, gfn,
+   spte |= kvm_x86_ops.get_mt_mask(vcpu->kvm, vcpu, gfn,
kvm_is_mmio_pfn(pfn));
 
if (host_writable)
@@ -6250,6 +6262,198 @@ int kvm_mmu_module_init(void)
return ret;
 }
 
+static int direct_build_tdp_set_spte(struct kvm *kvm, struct kvm_memory_slot 
*slot,
+   u64 *sptep, unsigned pte_access, int level,
+   gfn_t gfn, kvm_pfn_t pfn, bool speculative,
+   bool dirty, bool host_writable)
+{
+   u64 spte = 0;
+   int ret = 0;
+   /*
+* For the EPT case, shadow_present_mask is 0 if hardware
+* supports exec-only page table entries.  In that case,
+* ACC_USER_MASK and shadow_user_mask are used to represent
+* read access.  See FNAME(gpte_access) in paging_tmpl.h.
+*/
+   spte |= shadow_present_mask;
+   if (!speculative)
+   spte |= shadow_accessed_mask;
+
+   if (level > PT_PAGE_TABLE_LEVEL && (pte_access & ACC_EXEC_MASK) &&
+   is_nx_huge_page_enabled()) {
+   pte_access &= ~ACC_EXEC_MASK;
+   }
+
+   if (pte_access & ACC_EXEC_MASK)
+   spte |= shadow_x_mask;
+   else
+   spte |= shadow_nx_mask;
+
+   if (pte_access & ACC_USER_MASK)
+   spte |= shadow_user_mask;
+
+   if (level > PT_PAGE_TABLE_LEVEL)
+   spte |= PT_PAGE_SIZE_MASK;
+
+   if (tdp_enabled)
+   spte |= kvm_x86_ops.get_mt_mask(kvm, NULL, gfn, 
kvm_is_mmio_pfn(pfn));
+
+   if (host_writable)
+   spte |= SPTE_HOST_WRITEABLE;
+   else
+   

[RFC 9/9] Handle certain mmu exposed functions properly while turn on direct build EPT mode

2020-08-05 Thread Yulei Zhang
From: Yulei Zhang 

Signed-off-by: Yulei Zhang 
---
 arch/x86/kvm/mmu/mmu.c | 9 +
 1 file changed, 9 insertions(+)

diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index f963a3b0500f..bad01f66983d 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -1775,6 +1775,9 @@ bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm,
int i;
bool write_protected = false;
 
+   if (kvm->arch.global_root_hpa)
+   return write_protected;
+
for (i = PT_PAGE_TABLE_LEVEL; i <= PT_MAX_HUGEPAGE_LEVEL; ++i) {
rmap_head = __gfn_to_rmap(gfn, i, slot);
write_protected |= __rmap_write_protect(kvm, rmap_head, true);
@@ -5835,6 +5838,9 @@ static void kvm_zap_obsolete_pages(struct kvm *kvm)
  */
 static void kvm_mmu_zap_all_fast(struct kvm *kvm)
 {
+   if (kvm->arch.global_root_hpa)
+   return;
+
lockdep_assert_held(&kvm->slots_lock);
 
spin_lock(&kvm->mmu_lock);
@@ -5897,6 +5903,9 @@ void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, 
gfn_t gfn_end)
struct kvm_memory_slot *memslot;
int i;
 
+   if (kvm->arch.global_root_hpa)
+   return;
+
spin_lock(&kvm->mmu_lock);
for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
slots = __kvm_memslots(kvm, i);
-- 
2.17.1



[RFC 4/9] Add release function for direct build ept when guest VM exit

2020-08-05 Thread Yulei Zhang
From: Yulei Zhang 

Release the pre-pinned memory used by the directly built EPT when the
guest VM exits.

Signed-off-by: Yulei Zhang 
---
 arch/x86/kvm/mmu/mmu.c | 39 ++-
 1 file changed, 30 insertions(+), 9 deletions(-)

diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 539974183653..df703deac928 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -4360,8 +4360,11 @@ static void __kvm_mmu_new_cr3(struct kvm_vcpu *vcpu, 
gpa_t new_cr3,
 
 void kvm_mmu_new_cr3(struct kvm_vcpu *vcpu, gpa_t new_cr3, bool skip_tlb_flush)
 {
-   __kvm_mmu_new_cr3(vcpu, new_cr3, kvm_mmu_calc_root_page_role(vcpu),
- skip_tlb_flush);
+   if (!vcpu->arch.direct_build_tdp)
+   __kvm_mmu_new_cr3(vcpu, new_cr3, 
kvm_mmu_calc_root_page_role(vcpu),
+ skip_tlb_flush);
+   else
+   vcpu->arch.mmu->root_hpa = INVALID_PAGE;
 }
 EXPORT_SYMBOL_GPL(kvm_mmu_new_cr3);
 
@@ -5204,10 +5207,16 @@ EXPORT_SYMBOL_GPL(kvm_mmu_load);
 
 void kvm_mmu_unload(struct kvm_vcpu *vcpu)
 {
-   kvm_mmu_free_roots(vcpu, &vcpu->arch.root_mmu, KVM_MMU_ROOTS_ALL);
-   WARN_ON(VALID_PAGE(vcpu->arch.root_mmu.root_hpa));
-   kvm_mmu_free_roots(vcpu, &vcpu->arch.guest_mmu, KVM_MMU_ROOTS_ALL);
-   WARN_ON(VALID_PAGE(vcpu->arch.guest_mmu.root_hpa));
+
+   if (!vcpu->arch.direct_build_tdp) {
+   kvm_mmu_free_roots(vcpu, &vcpu->arch.root_mmu, 
KVM_MMU_ROOTS_ALL);
+   WARN_ON(VALID_PAGE(vcpu->arch.root_mmu.root_hpa));
+   kvm_mmu_free_roots(vcpu, &vcpu->arch.guest_mmu, 
KVM_MMU_ROOTS_ALL);
+   WARN_ON(VALID_PAGE(vcpu->arch.guest_mmu.root_hpa));
+   }
+
+   vcpu->arch.direct_build_tdp = false;
+   vcpu->arch.mmu->root_hpa = INVALID_PAGE;
 }
 EXPORT_SYMBOL_GPL(kvm_mmu_unload);
 
@@ -6510,6 +6519,14 @@ void kvm_direct_tdp_remove_page_table(struct kvm *kvm, 
struct kvm_memory_slot *s
kvm_flush_remote_tlbs(kvm);
 }
 
+void kvm_direct_tdp_release_global_root(struct kvm *kvm)
+{
+   if (kvm->arch.global_root_hpa)
+   __kvm_walk_global_page(kvm, kvm->arch.global_root_hpa, 
kvm_x86_ops.get_tdp_level(NULL));
+
+   return;
+}
+
 /*
  * Calculate mmu pages needed for kvm.
  */
@@ -6536,9 +6553,13 @@ unsigned long kvm_mmu_calculate_default_mmu_pages(struct 
kvm *kvm)
 
 void kvm_mmu_destroy(struct kvm_vcpu *vcpu)
 {
-   kvm_mmu_unload(vcpu);
-   free_mmu_pages(&vcpu->arch.root_mmu);
-   free_mmu_pages(&vcpu->arch.guest_mmu);
+   if (vcpu->arch.direct_build_tdp) {
+   vcpu->arch.mmu->root_hpa = INVALID_PAGE;
+   } else {
+   kvm_mmu_unload(vcpu);
+   free_mmu_pages(&vcpu->arch.root_mmu);
+   free_mmu_pages(&vcpu->arch.guest_mmu);
+   }
mmu_free_memory_caches(vcpu);
 }
 
-- 
2.17.1



[RFC 7/9] Add migration support when using direct build EPT

2020-08-05 Thread Yulei Zhang
From: Yulei Zhang 

Make migration available in direct build EPT mode whether or not PML
is enabled.

Signed-off-by: Yulei Zhang 
---
 arch/x86/include/asm/kvm_host.h |   2 +
 arch/x86/kvm/mmu/mmu.c  | 153 +++-
 arch/x86/kvm/x86.c  |  44 +
 3 files changed, 178 insertions(+), 21 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 69c946831ca7..7063b9d2cac0 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1329,6 +1329,8 @@ void kvm_mmu_zap_all(struct kvm *kvm);
 void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, u64 gen);
 unsigned long kvm_mmu_calculate_default_mmu_pages(struct kvm *kvm);
 void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned long kvm_nr_mmu_pages);
+void kvm_mmu_slot_direct_build_handle_wp(struct kvm *kvm,
+struct kvm_memory_slot *memslot);
 
 int load_pdptrs(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, unsigned long cr3);
 bool pdptrs_changed(struct kvm_vcpu *vcpu);
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 33252e432c1b..485f7287aad2 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -244,6 +244,8 @@ static struct kmem_cache *pte_list_desc_cache;
 static struct kmem_cache *mmu_page_header_cache;
 static struct percpu_counter kvm_total_used_mmu_pages;
 
+static int __kvm_write_protect_spte(struct kvm *kvm, struct kvm_memory_slot 
*slot,
+   gfn_t gfn, int level);
 static u64 __read_mostly shadow_nx_mask;
 static u64 __read_mostly shadow_x_mask;/* mutual exclusive with 
nx_mask */
 static u64 __read_mostly shadow_user_mask;
@@ -1685,11 +1687,18 @@ static void kvm_mmu_write_protect_pt_masked(struct kvm 
*kvm,
 gfn_t gfn_offset, unsigned long mask)
 {
struct kvm_rmap_head *rmap_head;
+   gfn_t gfn;
 
while (mask) {
-   rmap_head = __gfn_to_rmap(slot->base_gfn + gfn_offset + 
__ffs(mask),
- PT_PAGE_TABLE_LEVEL, slot);
-   __rmap_write_protect(kvm, rmap_head, false);
+   if (kvm->arch.global_root_hpa) {
+   gfn = slot->base_gfn + gfn_offset + __ffs(mask);
+
+   __kvm_write_protect_spte(kvm, slot, gfn, 
PT_PAGE_TABLE_LEVEL);
+   } else {
+   rmap_head = __gfn_to_rmap(slot->base_gfn + gfn_offset + 
__ffs(mask),
+ PT_PAGE_TABLE_LEVEL, slot);
+   __rmap_write_protect(kvm, rmap_head, false);
+   }
 
/* clear the first set bit */
mask &= mask - 1;
@@ -6558,6 +6567,144 @@ void kvm_direct_tdp_release_global_root(struct kvm *kvm)
return;
 }
 
+static int __kvm_write_protect_spte(struct kvm *kvm, struct kvm_memory_slot 
*slot,
+   gfn_t gfn, int level)
+{
+   int ret = 0;
+   /* add write protect on pte, tear down the page table if large page is 
enabled */
+   struct kvm_shadow_walk_iterator iterator;
+   unsigned long i;
+   kvm_pfn_t pfn;
+   struct page *page;
+   u64 *sptep;
+   u64 spte, t_spte;
+
+   for_each_direct_build_shadow_entry(iterator, kvm->arch.global_root_hpa,
+   gfn << PAGE_SHIFT, kvm_x86_ops.get_tdp_level(NULL)) {
+   if (iterator.level == level) {
+   break;
+   }
+   }
+
+   if (level != PT_PAGE_TABLE_LEVEL) {
+   sptep = iterator.sptep;
+
+   page = alloc_page(GFP_KERNEL | __GFP_ZERO);
+   if (!page)
+   return ret;
+
+   t_spte = page_to_phys(page) | PT_PRESENT_MASK | 
PT_WRITABLE_MASK |
+   shadow_user_mask | shadow_x_mask | shadow_accessed_mask;
+
+   for (i = 0; i < KVM_PAGES_PER_HPAGE(level); i++) {
+
+   for_each_direct_build_shadow_entry(iterator, t_spte & 
PT64_BASE_ADDR_MASK,
+   gfn << PAGE_SHIFT, level - 1) {
+   if (iterator.level == PT_PAGE_TABLE_LEVEL) {
+   break;
+   }
+
+   if (!is_shadow_present_pte(*iterator.sptep)) {
+   struct page *page;
+   page = alloc_page(GFP_KERNEL | 
__GFP_ZERO);
+   if (!page) {
+   __kvm_walk_global_page(kvm, 
t_spte & PT64_BASE_ADDR_MASK, level - 1);
+   return ret;
+   }
+   spte = page_to_phys(p

[RFC 6/9] Apply the direct build EPT according to the memory slots change

2020-08-05 Thread Yulei Zhang
From: Yulei Zhang 

Construct the direct build EPT when the guest memory slots have been
changed, and issue an mmu_reload request to update CR3 so that the
guest can use the pre-constructed EPT without page faults.

Signed-off-by: Yulei Zhang 
---
 arch/mips/kvm/mips.c   | 13 +
 arch/powerpc/kvm/powerpc.c | 13 +
 arch/s390/kvm/kvm-s390.c   | 13 +
 arch/x86/kvm/mmu/mmu.c | 33 ++---
 include/linux/kvm_host.h   |  3 +++
 virt/kvm/kvm_main.c| 13 +
 6 files changed, 81 insertions(+), 7 deletions(-)

diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c
index 8f05dd0a0f4e..7e5608769696 100644
--- a/arch/mips/kvm/mips.c
+++ b/arch/mips/kvm/mips.c
@@ -257,6 +257,19 @@ void kvm_arch_commit_memory_region(struct kvm *kvm,
}
 }
 
+int kvm_direct_tdp_populate_page_table(struct kvm *kvm, struct kvm_memory_slot 
*slot)
+{
+   return 0;
+}
+
+void kvm_direct_tdp_remove_page_table(struct kvm *kvm, struct kvm_memory_slot 
*slot)
+{
+}
+
+void kvm_direct_tdp_release_global_root(struct kvm *kvm)
+{
+}
+
 static inline void dump_handler(const char *symbol, void *start, void *end)
 {
u32 *p;
diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index ad2f172c26a6..93066393e09d 100644
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -712,6 +712,19 @@ void kvm_arch_commit_memory_region(struct kvm *kvm,
kvmppc_core_commit_memory_region(kvm, mem, old, new, change);
 }
 
+int kvm_direct_tdp_populate_page_table(struct kvm *kvm, struct kvm_memory_slot 
*slot)
+{
+   return 0;
+}
+
+void kvm_direct_tdp_remove_page_table(struct kvm *kvm, struct kvm_memory_slot 
*slot)
+{
+}
+
+void kvm_direct_tdp_release_global_root(struct kvm *kvm)
+{
+}
+
 void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
   struct kvm_memory_slot *slot)
 {
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index d05bb040fd42..594c38a7cc9f 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -5008,6 +5008,19 @@ void kvm_arch_commit_memory_region(struct kvm *kvm,
return;
 }
 
+int kvm_direct_tdp_populate_page_table(struct kvm *kvm, struct kvm_memory_slot 
*slot)
+{
+   return 0;
+}
+
+void kvm_direct_tdp_remove_page_table(struct kvm *kvm, struct kvm_memory_slot 
*slot)
+{
+}
+
+void kvm_direct_tdp_release_global_root(struct kvm *kvm)
+{
+}
+
 static inline unsigned long nonhyp_mask(int i)
 {
unsigned int nonhyp_fai = (sclp.hmfai << i * 2) >> 30;
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index b59a4502d1f6..33252e432c1b 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -5203,13 +5203,20 @@ int kvm_mmu_load(struct kvm_vcpu *vcpu)
 {
int r;
 
-   r = mmu_topup_memory_caches(vcpu);
-   if (r)
-   goto out;
-   r = mmu_alloc_roots(vcpu);
-   kvm_mmu_sync_roots(vcpu);
-   if (r)
-   goto out;
+   if (vcpu->kvm->arch.global_root_hpa) {
+   vcpu->arch.direct_build_tdp = true;
+   vcpu->arch.mmu->root_hpa = vcpu->kvm->arch.global_root_hpa;
+   }
+
+   if (!vcpu->arch.direct_build_tdp) {
+   r = mmu_topup_memory_caches(vcpu);
+   if (r)
+   goto out;
+   r = mmu_alloc_roots(vcpu);
+   kvm_mmu_sync_roots(vcpu);
+   if (r)
+   goto out;
+   }
kvm_mmu_load_pgd(vcpu);
kvm_x86_ops.tlb_flush(vcpu, true);
 out:
@@ -6438,6 +6445,17 @@ int direct_build_mapping_level(struct kvm *kvm, struct 
kvm_memory_slot *slot, gf
return host_level;
 }
 
+static void kvm_make_direct_build_update(struct kvm *kvm)
+{
+   int i;
+   struct kvm_vcpu *vcpu;
+
+   kvm_for_each_vcpu(i, vcpu, kvm) {
+   kvm_make_request(KVM_REQ_MMU_RELOAD, vcpu);
+   kvm_vcpu_kick(vcpu);
+   }
+}
+
 int kvm_direct_tdp_populate_page_table(struct kvm *kvm, struct kvm_memory_slot 
*slot)
 {
gfn_t gfn;
@@ -6472,6 +6490,7 @@ int kvm_direct_tdp_populate_page_table(struct kvm *kvm, 
struct kvm_memory_slot *
direct_build_tdp_map(kvm, slot, gfn, pfn, host_level);
}
 
+   kvm_make_direct_build_update(kvm);
return 0;
 }
 
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index d1f75ad5038b..767e5c4ed295 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -693,6 +693,9 @@ void kvm_arch_commit_memory_region(struct kvm *kvm,
struct kvm_memory_slot *old,
const struct kvm_memory_slot *new,
enum kvm_mr_change change);
+int kvm_direct_tdp_populate_page_table(struct kvm *kvm, struct kvm_memory_slot 
*slot);
+void kvm_direct_tdp_remove_page_table(struct kvm *kvm, struct kvm_memory_slot 
*slot);
+void kvm_direc

[RFC 5/9] Modify the page fault path to meet the direct build EPT requirement

2020-08-05 Thread Yulei Zhang
From: Yulei Zhang 

Refine the fast page fault code so that it can be used in either
normal EPT mode or direct build EPT mode.

Signed-off-by: Yulei Zhang 
---
 arch/x86/kvm/mmu/mmu.c | 30 +-
 1 file changed, 21 insertions(+), 9 deletions(-)

diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index df703deac928..b59a4502d1f6 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -3471,12 +3471,13 @@ static bool page_fault_can_be_fast(u32 error_code)
  * someone else modified the SPTE from its original value.
  */
 static bool
-fast_pf_fix_direct_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
+fast_pf_fix_direct_spte(struct kvm_vcpu *vcpu, gpa_t gpa,
u64 *sptep, u64 old_spte, u64 new_spte)
 {
gfn_t gfn;
 
-   WARN_ON(!sp->role.direct);
+   WARN_ON(!vcpu->arch.direct_build_tdp &&
+   (!page_header(__pa(sptep))->role.direct));
 
/*
 * Theoretically we could also set dirty bit (and flush TLB) here in
@@ -3498,7 +3499,7 @@ fast_pf_fix_direct_spte(struct kvm_vcpu *vcpu, struct 
kvm_mmu_page *sp,
 * The gfn of direct spte is stable since it is
 * calculated by sp->gfn.
 */
-   gfn = kvm_mmu_page_get_gfn(sp, sptep - sp->spt);
+   gfn = gpa >> PAGE_SHIFT;
kvm_vcpu_mark_page_dirty(vcpu, gfn);
}
 
@@ -3526,10 +3527,10 @@ static bool fast_page_fault(struct kvm_vcpu *vcpu, 
gpa_t cr2_or_gpa,
u32 error_code)
 {
struct kvm_shadow_walk_iterator iterator;
-   struct kvm_mmu_page *sp;
bool fault_handled = false;
u64 spte = 0ull;
uint retry_count = 0;
+   int pte_level = 0;
 
if (!page_fault_can_be_fast(error_code))
return false;
@@ -3539,12 +3540,20 @@ static bool fast_page_fault(struct kvm_vcpu *vcpu, 
gpa_t cr2_or_gpa,
do {
u64 new_spte;
 
-   for_each_shadow_entry_lockless(vcpu, cr2_or_gpa, iterator, spte)
+   for_each_shadow_entry_lockless(vcpu, cr2_or_gpa, iterator, 
spte) {
if (!is_shadow_present_pte(spte))
break;
+   }
+
+   if (iterator.level < PT_PAGE_TABLE_LEVEL)
+   pte_level  = PT_PAGE_TABLE_LEVEL;
+   else
+   pte_level = iterator.level;
+
+   WARN_ON(!vcpu->arch.direct_build_tdp &&
+   (pte_level != 
page_header(__pa(iterator.sptep))->role.level));
 
-   sp = page_header(__pa(iterator.sptep));
-   if (!is_last_spte(spte, sp->role.level))
+   if (!is_last_spte(spte, pte_level))
break;
 
/*
@@ -3587,7 +3596,7 @@ static bool fast_page_fault(struct kvm_vcpu *vcpu, gpa_t 
cr2_or_gpa,
 *
 * See the comments in kvm_arch_commit_memory_region().
 */
-   if (sp->role.level > PT_PAGE_TABLE_LEVEL)
+   if (pte_level > PT_PAGE_TABLE_LEVEL)
break;
}
 
@@ -3601,7 +3610,7 @@ static bool fast_page_fault(struct kvm_vcpu *vcpu, gpa_t 
cr2_or_gpa,
 * since the gfn is not stable for indirect shadow page. See
 * Documentation/virt/kvm/locking.txt to get more detail.
 */
-   fault_handled = fast_pf_fix_direct_spte(vcpu, sp,
+   fault_handled = fast_pf_fix_direct_spte(vcpu, cr2_or_gpa,
iterator.sptep, spte,
new_spte);
if (fault_handled)
@@ -4153,6 +4162,9 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, gpa_t 
gpa, u32 error_code,
if (fast_page_fault(vcpu, gpa, error_code))
return RET_PF_RETRY;
 
+   if (vcpu->arch.direct_build_tdp)
+   return RET_PF_EMULATE;
+
mmu_seq = vcpu->kvm->mmu_notifier_seq;
smp_rmb();
 
-- 
2.17.1



[RFC 8/9] Introduce kvm module parameter global_tdp to turn on the direct build EPT mode

2020-08-05 Thread Yulei Zhang
From: Yulei Zhang 

Currently global_tdp is only supported on Intel x86 systems with EPT
support, and enabling it turns off SMM mode. The mode is controlled by
a kvm module parameter (e.g. kvm.global_tdp=1 on the kernel command line).

Signed-off-by: Yulei Zhang 
---
 arch/x86/include/asm/kvm_host.h |  4 
 arch/x86/kvm/mmu/mmu.c  |  5 -
 arch/x86/kvm/x86.c  | 11 ++-
 3 files changed, 18 insertions(+), 2 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 7063b9d2cac0..a8c219fb33f5 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1368,6 +1368,8 @@ extern u64  kvm_default_tsc_scaling_ratio;
 
 extern u64 kvm_mce_cap_supported;
 
+extern bool global_tdp;
+
 /*
  * EMULTYPE_NO_DECODE - Set when re-emulating an instruction (after completing
  * userspace I/O) to indicate that the emulation context
@@ -1698,6 +1700,8 @@ static inline int kvm_cpu_get_apicid(int mps_cpu)
 #endif
 }
 
+inline bool boot_cpu_is_amd(void);
+
 #define put_smstate(type, buf, offset, val)  \
*(type *)((buf) + (offset) - 0x7e00) = val
 
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 485f7287aad2..f963a3b0500f 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -4630,7 +4630,7 @@ reset_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, struct 
kvm_mmu *context)
 }
 EXPORT_SYMBOL_GPL(reset_shadow_zero_bits_mask);
 
-static inline bool boot_cpu_is_amd(void)
+inline bool boot_cpu_is_amd(void)
 {
WARN_ON_ONCE(!tdp_enabled);
return shadow_x_mask == 0;
@@ -6471,6 +6471,9 @@ int kvm_direct_tdp_populate_page_table(struct kvm *kvm, 
struct kvm_memory_slot *
kvm_pfn_t pfn;
int host_level;
 
+   if (!global_tdp)
+   return 0;
+
if (!kvm->arch.global_root_hpa) {
struct page *page;
WARN_ON(!tdp_enabled);
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 37e11b3588b5..abe838240084 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -162,6 +162,9 @@ module_param(force_emulation_prefix, bool, S_IRUGO);
 int __read_mostly pi_inject_timer = -1;
 module_param(pi_inject_timer, bint, S_IRUGO | S_IWUSR);
 
+bool __read_mostly global_tdp;
+module_param_named(global_tdp, global_tdp, bool, S_IRUGO);
+
 #define KVM_NR_SHARED_MSRS 16
 
 struct kvm_shared_msrs_global {
@@ -3403,7 +3406,10 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long 
ext)
 * fringe case that is not enabled except via specific settings
 * of the module parameters.
 */
-   r = kvm_x86_ops.has_emulated_msr(MSR_IA32_SMBASE);
+   if (global_tdp)
+   r = 0;
+   else
+   r = kvm_x86_ops.has_emulated_msr(MSR_IA32_SMBASE);
break;
case KVM_CAP_VAPIC:
r = !kvm_x86_ops.cpu_has_accelerated_tpr();
@@ -9675,6 +9681,9 @@ int kvm_arch_hardware_setup(void *opaque)
if (r != 0)
return r;
 
+   if ((tdp_enabled == false) || boot_cpu_is_amd())
+   global_tdp = 0;
+
memcpy(&kvm_x86_ops, ops->runtime_ops, sizeof(kvm_x86_ops));
 
if (!kvm_cpu_cap_has(X86_FEATURE_XSAVES))
-- 
2.17.1



Re: [RFC 0/9] KVM:x86/mmu:Introduce parallel memory virtualization to boost performance

2020-08-07 Thread yulei zhang
On Fri, Aug 7, 2020 at 1:04 AM Ben Gardon  wrote:
>
> On Wed, Aug 5, 2020 at 9:53 AM Yulei Zhang  wrote:
> >
> > From: Yulei Zhang 
> >
> > Currently in KVM memory virtulization we relay on mmu_lock to synchronize
> > the memory mapping update, which make vCPUs work in serialize mode and
> > slow down the execution, especially after migration to do substantial
> > memory mapping setup, and performance get worse if increase vCPU numbers
> > and guest memories.
> >
> > The idea we present in this patch set is to mitigate the issue with
> > pre-constructed memory mapping table. We will fast pin the guest memory
> > to build up a global memory mapping table according to the guest memslots
> > changes and apply it to cr3, so that after guest starts up all the vCPUs
> > would be able to update the memory concurrently, thus the performance
> > improvement is expected.
>
> Is a re-implementation of the various MMU functions in this series
> necessary to pre-populate the EPT/NPT? I realize the approach you took
> is probably the fastest way to pre-populate an EPT, but it seems like
> similar pre-population could be achieved with some changes to the PF
> handler's prefault scheme or, from user space by adding a dummy vCPU
> to touch memory before loading the actual guest image.
>
> I think this series is taking a similar approach to the direct MMU RFC
> I sent out a little less than a year ago. (I will send another version
> of that series in the next month.) I'm not sure this level of
> complexity is worth it if you're only interested in EPT pre-population.
> Is pre-population your goal? You mention "parallel memory
> virtualization," does that refer to parallel page fault handling you
> intend to implement in a future series?
>
> There are a number of features I see you've chosen to leave behind in
> this series which might work for your use case, but I think they're
> necessary. These include handling vCPUs with different roles (SMM, VMX
> non root mode, etc.), MMU notifiers (which I realize matter less for
> pinned memory), demand paging through UFFD, fast EPT
> invalidation/teardown and others.
>
Thanks for the feedback. The target scenario for this feature is one
without memory overcommitment, so we can fast-pin the memory and set up
the GPA->HPA mapping table, after which we do not expect page faults
while the vCPUs access memory. We call it "parallel memory virtualization"
because with a pre-populated EPT the vCPUs are able to update memory
in parallel.
Yes, so far we disable SMM etc. We look forward to gathering input from
you experts and refining the implementation.

> >
> > And after test the initial patch with memory dirty pattern workload, we
> > have seen positive results even with huge page enabled. For example,
> > guest with 32 vCPUs and 64G memories, in 2M/1G huge page mode we would get
> > more than 50% improvement.
> >
> >
> > Yulei Zhang (9):
> >   Introduce new fields in kvm_arch/vcpu_arch struct for direct build EPT
> > support
> >   Introduce page table population function for direct build EPT feature
> >   Introduce page table remove function for direct build EPT feature
> >   Add release function for direct build ept when guest VM exit
> >   Modify the page fault path to meet the direct build EPT requirement
> >   Apply the direct build EPT according to the memory slots change
> >   Add migration support when using direct build EPT
> >   Introduce kvm module parameter global_tdp to turn on the direct build
> > EPT mode
> >   Handle certain mmu exposed functions properly while turn on direct
> > build EPT mode
> >
> >  arch/mips/kvm/mips.c|  13 +
> >  arch/powerpc/kvm/powerpc.c  |  13 +
> >  arch/s390/kvm/kvm-s390.c|  13 +
> >  arch/x86/include/asm/kvm_host.h |  13 +-
> >  arch/x86/kvm/mmu/mmu.c  | 537 ++--
> >  arch/x86/kvm/svm/svm.c  |   2 +-
> >  arch/x86/kvm/vmx/vmx.c  |  17 +-
> >  arch/x86/kvm/x86.c  |  55 ++--
> >  include/linux/kvm_host.h|   7 +-
> >  virt/kvm/kvm_main.c |  43 ++-
> >  10 files changed, 648 insertions(+), 65 deletions(-)
> >
> > --
> > 2.17.1
> >


Re: [PATCH 00/35] Enhance memory utilization with DMEMFS

2020-10-09 Thread yulei zhang
Joao, thanks a lot for the feedback. One more thing worth mentioning
is that dmemfs also supports fine-grained memory management, which
makes it more flexible for tenants with different requirements.

On Fri, Oct 9, 2020 at 3:01 AM Joao Martins  wrote:
>
> [adding a couple folks that directly or indirectly work on the subject]
>
> On 10/8/20 8:53 AM, yulei.ker...@gmail.com wrote:
> > From: Yulei Zhang 
> >
> > In current system each physical memory page is assocaited with
> > a page structure which is used to track the usage of this page.
> > But due to the memory usage rapidly growing in cloud environment,
> > we find the resource consuming for page structure storage becomes
> > highly remarkable. So is it an expense that we could spare?
> >
> Happy to see another person working to solve the same problem!
>
> I am really glad to see more folks being interested in solving
> this problem and I hope we can join efforts?
>
> BTW, there is also a second benefit in removing struct page -
> which is carving out memory from the direct map.
>
> > This patchset introduces an idea about how to save the extra
> > memory through a new virtual filesystem -- dmemfs.
> >
> > Dmemfs (Direct Memory filesystem) is device memory or reserved
> > memory based filesystem. This kind of memory is special as it
> > is not managed by kernel and most important it is without 'struct page'.
> > Therefore we can leverage the extra memory from the host system
> > to support more tenants in our cloud service.
> >
> This is like a walk down the memory lane.
>
> About a year ago we followed the same exact idea/motivation to
> have memory outside of the direct map (and removing struct page overhead)
> and started with our own layer/thingie. However we realized that DAX
> is one the subsystems which already gives you direct access to memory
> for free (and is already upstream), plus a couple of things which we
> found more handy.
>
> So we sent an RFC a couple months ago:
>
> https://lore.kernel.org/linux-mm/20200110190313.17144-1-joao.m.mart...@oracle.com/
>
> Since then majority of the work has been in improving DAX[1].
> But now that is done I am going to follow up with the above patchset.
>
> [1]
> https://lore.kernel.org/linux-mm/159625229779.3040297.11363509688097221416.st...@dwillia2-desk3.amr.corp.intel.com/
>
> (Give me a couple of days and I will send you the link to the latest
> patches on a git-tree - would love feedback!)
>
> The struct page removal for DAX would then be small, and ticks the
> same bells and whistles (MCE handling, reserving PAT memtypes, ptrace
> support) that we both do, with a smaller diffstat and it doesn't
> touch KVM (not at least fundamentally).
>
> 15 files changed, 401 insertions(+), 38 deletions(-)
>
> The things needed in core-mm is for handling PMD/PUD PAGE_SPECIAL much
> like we both do. Furthermore there wouldn't be a need for a new vm type,
> consuming an extra page bit (in addition to PAGE_SPECIAL) or new filesystem.
>
> [1]
> https://lore.kernel.org/linux-mm/159625229779.3040297.11363509688097221416.st...@dwillia2-desk3.amr.corp.intel.com/
>
>
> > We uses a kernel boot parameter 'dmem=' to reserve the system
> > memory when the host system boots up, the details can be checked
> > in /Documentation/admin-guide/kernel-parameters.txt.
> >
> > Theoretically for each 4k physical page it can save 64 bytes if
> > we drop the 'struct page', so for guest memory with 320G it can
> > save about 5G physical memory totally.
> >
> Also worth mentioning that if you only care about 'struct page' cost, and not 
> on the
> security boundary, there's also some work on hugetlbfs preallocation of 
> hugepages into
> tricking vmemmap in reusing tail pages.
>
>   
> https://lore.kernel.org/linux-mm/20200915125947.26204-1-songmuc...@bytedance.com/
>
> Going forward that could also make sense for device-dax to avoid so many
> struct pages allocated (which would require its transition to compound
> struct pages like hugetlbfs which we are looking at too). In addition an
> idea  would be perhaps to have a stricter mode in DAX where
> we initialize/use the metadata ('struct page') but remove the underlaying
> PFNs (of the 'struct page') from the direct map having to bear the cost of
> mapping/unmapping on gup/pup.
>
> Joao


Re: [PATCH 22/35] kvm, x86: Distinguish dmemfs page from mmio page

2020-10-09 Thread yulei zhang
Sean and Joao, thanks for the feedback. Probably we can drop this change.

On Fri, Oct 9, 2020 at 6:28 PM Joao Martins  wrote:
>
> On 10/9/20 1:58 AM, Sean Christopherson wrote:
> > On Thu, Oct 08, 2020 at 03:54:12PM +0800, yulei.ker...@gmail.com wrote:
> >> From: Yulei Zhang 
> >>
> >> Dmem page is pfn invalid but not mmio. Support cacheable
> >> dmem page for kvm.
> >>
> >> Signed-off-by: Chen Zhuo 
> >> Signed-off-by: Yulei Zhang 
> >> ---
> >>  arch/x86/kvm/mmu/mmu.c | 5 +++--
> >>  include/linux/dmem.h   | 7 +++
> >>  mm/dmem.c  | 7 +++
> >>  3 files changed, 17 insertions(+), 2 deletions(-)
> >>
> >> diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
> >> index 71aa3da2a0b7..0115c1767063 100644
> >> --- a/arch/x86/kvm/mmu/mmu.c
> >> +++ b/arch/x86/kvm/mmu/mmu.c
> >> @@ -41,6 +41,7 @@
> >>  #include 
> >>  #include 
> >>  #include 
> >> +#include 
> >>
> >>  #include 
> >>  #include 
> >> @@ -2962,9 +2963,9 @@ static bool kvm_is_mmio_pfn(kvm_pfn_t pfn)
> >>   */
> >>  (!pat_enabled() || pat_pfn_immune_to_uc_mtrr(pfn));
> >>
> >> -return !e820__mapped_raw_any(pfn_to_hpa(pfn),
> >> +return (!e820__mapped_raw_any(pfn_to_hpa(pfn),
> >>   pfn_to_hpa(pfn + 1) - 1,
> >> - E820_TYPE_RAM);
> >> + E820_TYPE_RAM)) || (!is_dmem_pfn(pfn));
> >
> > This is wrong.  As is, the logic reads "A PFN is MMIO if it is INVALID &&
> > (!RAM || !DMEM)".  The obvious fix would be to change it to "INVALID &&
> > !RAM && !DMEM", but that begs the question of whether or DMEM is reported
> > as RAM.  I don't see any e820 related changes in the series, i.e. no 
> > evidence
> > that dmem yanks its memory out of the e820 tables, which makes me think this
> > change is unnecessary.
> >
> Even if there would exist e820 changes, e820__mapped_raw_any() checks against
> hardware-provided e820 that we are given before any changes happen i.e. not 
> the one kernel
> has changed (e820_table_firmware). So unless you're having that memory carved 
> from an MMIO
> range (which would be wrong), or the BIOS is misrepresenting its memory 
> map... the
> e820__mapped_raw_any(E820_TYPE_RAM) ought to be enough to cover RAM.
>
> Or at least that has been my experience with similar work.
>
> Joao


Re: [PATCH 08/35] dmem: show some statistic in debugfs

2020-10-09 Thread yulei zhang
Thanks, Randy. I will follow the instructions to modify the patches.

On Fri, Oct 9, 2020 at 4:23 AM Randy Dunlap  wrote:
>
> On 10/8/20 12:53 AM, yulei.ker...@gmail.com wrote:
> > diff --git a/mm/Kconfig b/mm/Kconfig
> > index e1995da11cea..8a67c8933a42 100644
> > --- a/mm/Kconfig
> > +++ b/mm/Kconfig
> > @@ -235,6 +235,15 @@ config DMEM
> > Allow reservation of memory which could be dedicated usage of dmem.
> > It's the basics of dmemfs.
> >
> > +config DMEM_DEBUG_FS
> > + bool "Enable debug information for direct memory"
> > + depends on DMEM && DEBUG_FS
> > + def_bool n
>
> Drop the def_bool line. 'n' is the default anyway and the symbol is
> already of type bool from 2 lines above.
>
> > + help
> > +   This option enables showing various statistics of direct memory
> > +   in debugfs filesystem.
> > +
> > +#
>
>
> --
> ~Randy
>


Re: [PATCH 00/35] Enhance memory utilization with DMEMFS

2020-10-10 Thread yulei zhang
On Fri, Oct 9, 2020 at 7:53 PM Joao Martins  wrote:
>
> On 10/9/20 12:39 PM, yulei zhang wrote:
> > Joao, thanks a lot for the feedback. One more thing needs to mention
> > is that dmemfs also support fine-grained
> > memory management which makes it more flexible for tenants with
> > different requirements.
> >
> So as DAX when it allows to partition a region (starting 5.10). Meaning you 
> have a region
> which you dedicated to userspace. That region can then be partitioning into 
> devices which
> give you access to multiple (possibly discontinuous) extents with at a given 
> page
> granularity (selectable when you create the device), accessed through mmap().
> You can then give that device to a cgroup. Or you can return that memory back 
> to the
> kernel (should you run into OOM situation), or you recreate the same mappings 
> across
> reboot/kexec.
>
> I probably need to read your patches again, but can you extend on the 'dmemfs 
> also support
> fine-grained memory management' to understand what is the gap that you 
> mention?
>

Sure. dmemfs uses a bitmap to track memory usage in the reserved memory
region at a given page-size granularity, and for each user the memory
can be discontiguous as well.
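
For illustration only, a minimal user-space sketch of that kind of
bitmap-based, page-granular tracking could look like the snippet below.
This is not the dmemfs code itself; the region size, the names and the
assumption of 64-bit longs are made up for the example.

#include <stdbool.h>
#include <stddef.h>

#define REGION_PAGES	1024UL		/* pages in the reserved region */

/* one bit per dmem page; assumes 64-bit unsigned long */
static unsigned long region_bitmap[(REGION_PAGES + 63) / 64];

static bool page_busy(size_t idx)
{
	return region_bitmap[idx / 64] & (1UL << (idx % 64));
}

/* hand out one free page; successive calls may return discontiguous pages */
static long region_alloc_page(void)
{
	size_t idx;

	for (idx = 0; idx < REGION_PAGES; idx++) {
		if (!page_busy(idx)) {
			region_bitmap[idx / 64] |= 1UL << (idx % 64);
			return (long)idx;	/* page index inside the region */
		}
	}
	return -1;			/* region exhausted */
}

static void region_free_page(size_t idx)
{
	region_bitmap[idx / 64] &= ~(1UL << (idx % 64));
}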

> > On Fri, Oct 9, 2020 at 3:01 AM Joao Martins  
> > wrote:
> >>
> >> [adding a couple folks that directly or indirectly work on the subject]
> >>
> >> On 10/8/20 8:53 AM, yulei.ker...@gmail.com wrote:
> >>> From: Yulei Zhang 
> >>>
> >>> In current system each physical memory page is assocaited with
> >>> a page structure which is used to track the usage of this page.
> >>> But due to the memory usage rapidly growing in cloud environment,
> >>> we find the resource consuming for page structure storage becomes
> >>> highly remarkable. So is it an expense that we could spare?
> >>>
> >> Happy to see another person working to solve the same problem!
> >>
> >> I am really glad to see more folks being interested in solving
> >> this problem and I hope we can join efforts?
> >>
> >> BTW, there is also a second benefit in removing struct page -
> >> which is carving out memory from the direct map.
> >>
> >>> This patchset introduces an idea about how to save the extra
> >>> memory through a new virtual filesystem -- dmemfs.
> >>>
> >>> Dmemfs (Direct Memory filesystem) is device memory or reserved
> >>> memory based filesystem. This kind of memory is special as it
> >>> is not managed by kernel and most important it is without 'struct page'.
> >>> Therefore we can leverage the extra memory from the host system
> >>> to support more tenants in our cloud service.
> >>>
> >> This is like a walk down the memory lane.
> >>
> >> About a year ago we followed the same exact idea/motivation to
> >> have memory outside of the direct map (and removing struct page overhead)
> >> and started with our own layer/thingie. However we realized that DAX
> >> is one the subsystems which already gives you direct access to memory
> >> for free (and is already upstream), plus a couple of things which we
> >> found more handy.
> >>
> >> So we sent an RFC a couple months ago:
> >>
> >> https://lore.kernel.org/linux-mm/20200110190313.17144-1-joao.m.mart...@oracle.com/
> >>
> >> Since then majority of the work has been in improving DAX[1].
> >> But now that is done I am going to follow up with the above patchset.
> >>
> >> [1]
> >> https://lore.kernel.org/linux-mm/159625229779.3040297.11363509688097221416.st...@dwillia2-desk3.amr.corp.intel.com/
> >>
> >> (Give me a couple of days and I will send you the link to the latest
> >> patches on a git-tree - would love feedback!)
> >>
> >> The struct page removal for DAX would then be small, and ticks the
> >> same bells and whistles (MCE handling, reserving PAT memtypes, ptrace
> >> support) that we both do, with a smaller diffstat and it doesn't
> >> touch KVM (not at least fundamentally).
> >>
> >> 15 files changed, 401 insertions(+), 38 deletions(-)
> >>
> >> The things needed in core-mm is for handling PMD/PUD PAGE_SPECIAL much
> >> like we both do. Furthermore there wouldn't be a need for a new vm type,
> >> consuming an extra page bit (in addition to PAGE_SPECIAL) or new 
> >> filesystem.
> >>
> >> [1]
> >> https://lore.ke

Re: [PATCH 00/35] Enhance memory utilization with DMEMFS

2020-10-12 Thread yulei zhang
On Mon, Oct 12, 2020 at 7:57 PM Zengtao (B)  wrote:
>
>
> > -Original Message-
> > From: yulei.ker...@gmail.com [mailto:yulei.ker...@gmail.com]
> > Sent: Thursday, October 08, 2020 3:54 PM
> > To: a...@linux-foundation.org; naoya.horigu...@nec.com;
> > v...@zeniv.linux.org.uk; pbonz...@redhat.com
> > Cc: linux-fsde...@vger.kernel.org; k...@vger.kernel.org;
> > linux-kernel@vger.kernel.org; xiaoguangrong.e...@gmail.com;
> > kernel...@gmail.com; lihaiwei.ker...@gmail.com; Yulei Zhang
> > Subject: [PATCH 00/35] Enhance memory utilization with DMEMFS
> >
> > From: Yulei Zhang 
> >
> > In current system each physical memory page is assocaited with
> > a page structure which is used to track the usage of this page.
> > But due to the memory usage rapidly growing in cloud environment,
> > we find the resource consuming for page structure storage becomes
> > highly remarkable. So is it an expense that we could spare?
> >
> > This patchset introduces an idea about how to save the extra
> > memory through a new virtual filesystem -- dmemfs.
> >
> > Dmemfs (Direct Memory filesystem) is device memory or reserved
> > memory based filesystem. This kind of memory is special as it
> > is not managed by kernel and most important it is without 'struct page'.
> > Therefore we can leverage the extra memory from the host system
> > to support more tenants in our cloud service.
> >
> > We uses a kernel boot parameter 'dmem=' to reserve the system
> > memory when the host system boots up, the details can be checked
> > in /Documentation/admin-guide/kernel-parameters.txt.
> >
> > Theoretically for each 4k physical page it can save 64 bytes if
> > we drop the 'struct page', so for guest memory with 320G it can
> > save about 5G physical memory totally.
>
> Sounds interesting, but seems your patch only support x86, have you
>  considered aarch64?
>
> Regards
> Zengtao

Thanks, so far we have only verified it on x86 servers; we may extend it
to the ARM platform in the future.


Re: [PATCH 04/35] dmem: let pat recognize dmem

2020-10-13 Thread yulei zhang
On Tue, Oct 13, 2020 at 3:27 PM Paolo Bonzini  wrote:
>
> On 08/10/20 09:53, yulei.ker...@gmail.com wrote:
> > From: Yulei Zhang 
> >
> > x86 pat uses 'struct page' by only checking if it's system ram,
> > however it is not true if dmem is used, let's teach pat to
> > recognize this case if it is ram but it is !pfn_valid()
> >
> > We always use WB for dmem and any attempt to change this
> > behavior will be rejected and WARN_ON is triggered
> >
> > Signed-off-by: Xiao Guangrong 
> > Signed-off-by: Yulei Zhang 
>
> Hooks like these will make it very hard to merge this series.
>
> I like the idea of struct page-backed memory, but this is a lot of code
> and I wonder if it's worth adding all these complications.
>
> One can already use mem= to remove the "struct page" cost for most of
> the host memory, and manage the allocation of the remaining memory in
> userspace with /dev/mem.  What is the advantage of doing this in the kernel?
>
> Paolo
>

Hi Paolo, as far as I know there are a few limitations to playing with
/dev/mem in this case.
1. Access to /dev/mem is restricted due to security requirements, but
usually our virtual machines are unprivileged processes.
2. What we get from /dev/mem is a whole block of memory; since dynamic
VMs running on /dev/mem will fragment the memory, extra logic is needed
to manage allocation and reclaim to avoid wasting memory. dmemfs
supports this and also leverages the kernel TLB management.
3. It needs to support huge pages with different page size granularities.
4. MCE recovery capability is also required.


Re: [RFC V2 0/9] x86/mmu:Introduce parallel memory virtualization to boost performance

2020-09-25 Thread yulei zhang
On Fri, Sep 25, 2020 at 1:14 AM Ben Gardon  wrote:
>
> On Wed, Sep 23, 2020 at 11:28 PM Wanpeng Li  wrote:
> >
> > Any comments? Paolo! :)
>
> Hi, sorry to be so late in replying! I wanted to post the first part
> of the TDP MMU series I've been working on before responding so we
> could discuss the two together, but I haven't been able to get it out
> as fast as I would have liked. (I'll send it ASAP!) I'm hopeful that
> it will ultimately help address some of the page fault handling and
> lock contention issues you're addressing with these patches. I'd also
> be happy to work together to add a prepopulation feature to it. I'll
> put in some more comments inline below.
>

Thanks for the feedback and looking forward to your patchset.

> > On Wed, 9 Sep 2020 at 11:04, Wanpeng Li  wrote:
> > >
> > > Any comments? guys!
> > > On Tue, 1 Sep 2020 at 19:52,  wrote:
> > > >
> > > > From: Yulei Zhang 
> > > >
> > > > Currently in KVM memory virtulization we relay on mmu_lock to
> > > > synchronize the memory mapping update, which make vCPUs work
> > > > in serialize mode and slow down the execution, especially after
> > > > migration to do substantial memory mapping will cause visible
> > > > performance drop, and it can get worse if guest has more vCPU
> > > > numbers and memories.
> > > >
> > > > The idea we present in this patch set is to mitigate the issue
> > > > with pre-constructed memory mapping table. We will fast pin the
> > > > guest memory to build up a global memory mapping table according
> > > > to the guest memslots changes and apply it to cr3, so that after
> > > > guest starts up all the vCPUs would be able to update the memory
> > > > simultaneously without page fault exception, thus the performance
> > > > improvement is expected.
>
> My understanding from this RFC is that your primary goal is to
> eliminate page fault latencies and lock contention arising from the
> first page faults incurred by vCPUs when initially populating the EPT.
> Is that right?
>

That's right.

> I have the impression that the pinning and generally static memory
> mappings are more a convenient simplification than part of a larger
> goal to avoid incurring page faults down the line. Is that correct?
>
> I ask because I didn't fully understand, from our conversation on v1
> of this RFC, why reimplementing the page fault handler and associated
> functions was necessary for the above goals, as I understood them.
> My impression of the prepopulation approach is that, KVM will
> sequentially populate all the EPT entries to map guest memory. I
> understand how this could be optimized to be quite efficient, but I
> don't understand how it would scale better than the existing
> implementation with one vCPU accessing memory.
>

I don't think our goal is simply to eliminate page faults. Our target
scenario is live migration: when the workload resumes on the destination
VM after migrating, it kicks off the vCPUs to build the gfn-to-pfn
mappings, but because of mmu_lock the vCPUs execute sequentially, which
significantly slows down workload execution in the VM and hurts the
end-user experience, especially for memory-sensitive workloads.
Pre-populating the EPT entries solves the problem smoothly, as it allows
the vCPUs to execute in parallel after migration.

> > > >
> > > > We use memory dirty pattern workload to test the initial patch
> > > > set and get positive result even with huge page enabled. For example,
> > > > we create guest with 32 vCPUs and 64G memories, and let the vcpus
> > > > dirty the entire memory region concurrently, as the initial patch
> > > > eliminate the overhead of mmu_lock, in 2M/1G huge page mode we would
> > > > get the job done in about 50% faster.
>
> In this benchmark did you include the time required to pre-populate
> the EPT or just the time required for the vCPUs to dirty memory?
> I ask because I'm curious if your priority is to decrease the total
> end-to-end time, or you just care about the guest experience, and not
> so much the VM startup time.

We compare the time for each vCPU thread to finish the dirtying job.
Yes, pre-populating the page table can take some time, but since each
vCPU thread gains a huge advantage from concurrent dirty writes, even if
we count that in the total time it is still a better result.

> How does this compare to the case where 1 vCPU reads every page of
> memory and then 32 vCPUs concurrently dirty every page?
>

Haven't tried t

Re: [RFC V2 0/9] x86/mmu:Introduce parallel memory virtualization to boost performance

2020-09-28 Thread yulei zhang
On Sat, Sep 26, 2020 at 4:50 AM Paolo Bonzini  wrote:
>
> On 25/09/20 19:30, Ben Gardon wrote:
> > Oh, thank you for explaining that. I didn't realize the goal here was
> > to improve LM performance. I was under the impression that this was to
> > give VMs a better experience on startup for fast scaling or something.
> > In your testing with live migration how has this affected the
> > distribution of time between the phases of live migration? Just for
> > terminology (since I'm not sure how standard it is across the
> > industry) I think of a live migration as consisting of 3 stages:
> > precopy, blackout, and postcopy. In precopy we're tracking the VM's
> > working set via dirty logging and sending the contents of its memory
> > to the target host. In blackout we pause the vCPUs on the source, copy
> > minimal data to the target, and resume the vCPUs on the target. In
> > postcopy we may still have some pages that have not been copied to the
> > target and so request those in response to vCPU page faults via user
> > fault fd or some other mechanism.
> >
> > Does EPT pre-population preclude the use of a postcopy phase?
>
> I think so.
>
> As a quick recap, postcopy migration handles two kinds of
> pages---they can be copied to the destination either in background
> (stuff that was dirty when userspace decided to transition to the
> blackout phase) or on-demand (relayed from KVM to userspace via
> get_user_pages and userfaultfd).  Normally only on-demand pages would be
> served through userfaultfd, while with prepopulation every missing page
> would be faulted in from the kernel through userfaultfd.  In practice
> this would just extend the blackout phase.
>
> Paolo
>

Yep, you are right, the current implementation doesn't support postcopy.
Thanks for the suggestion; we will try to fill the gap with proper EPT
population during the postcopy phase.
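
For context on the mechanism, a minimal userspace sketch (assumptions: addr
and len describe an already-mmap'ed guest RAM region; error handling is
trimmed) of how the on-demand postcopy channel is set up with userfaultfd.
The concern above is that with a fully pre-populated EPT, every
not-yet-copied page would be pulled through this channel during blackout:

#include <fcntl.h>
#include <linux/userfaultfd.h>
#include <string.h>
#include <sys/ioctl.h>
#include <sys/syscall.h>
#include <unistd.h>

/* Hedged sketch: register a guest RAM range for missing-page faults. */
static int register_postcopy_range(void *addr, size_t len)
{
	struct uffdio_api api = { .api = UFFD_API };
	struct uffdio_register reg;
	int uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);

	if (uffd < 0 || ioctl(uffd, UFFDIO_API, &api) < 0)
		return -1;

	memset(&reg, 0, sizeof(reg));
	reg.range.start = (unsigned long)addr;
	reg.range.len = len;
	reg.mode = UFFDIO_REGISTER_MODE_MISSING;

	/*
	 * After this, any access to a missing page (including kernel-side
	 * get_user_pages() issued by KVM) is reported on uffd and has to be
	 * resolved with UFFDIO_COPY by the migration thread.
	 */
	if (ioctl(uffd, UFFDIO_REGISTER, &reg) < 0) {
		close(uffd);
		return -1;
	}
	return uffd;
}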

> > I would
> > expect that to make the blackout phase really long. Has that not been
> > a problem for you?
> >
> > I love the idea of partial EPT pre-population during precopy if you
> > could still handle postcopy and just pre-populate as memory came in.
> >
>


Re: [PATCH 00/35] Enhance memory utilization with DMEMFS

2020-10-20 Thread yulei zhang
On Tue, Oct 20, 2020 at 3:03 AM Joao Martins  wrote:
>
> On 10/19/20 2:37 PM, Paolo Bonzini wrote:
> > On 15/10/20 00:25, Dan Williams wrote:
> >> Now, with recent device-dax extensions, it
> >> also has a coarse grained memory management system for  physical
> >> address-space partitioning and a path for struct-page-less backing for
> >> VMs. What feature gaps remain vs dmemfs, and can those gaps be closed
> >> with incremental improvements to the 2 existing memory-management
> >> systems?
> >
> > If I understand correctly, devm_memremap_pages() on ZONE_DEVICE memory
> > would still create the "struct page" albeit lazily?  KVM then would use
> > the usual get_user_pages() path.
> >
> Correct.
>
> The removal of struct page would be one of the added incremental
> improvements, like a 'map' with 'raw' sysfs attribute for dynamic dax
> regions that wouldn't online/create the struct pages. The remaining
> plumbing (...)
>
> > Looking more closely at the implementation of dmemfs, I don't understand
> > why dmemfs needs VM_DMEM etc. and cannot provide access to mmap-ed
> > memory using remap_pfn_range and VM_PFNMAP, just like /dev/mem.  If it
> > did that KVM would get physical addresses using fixup_user_fault and
> > never need pfn_to_page() or get_user_pages().  I'm not saying that would
> > instantly be an approval, but it would remove a lot of hooks.
> >
>
> (...) is similar to what you describe above. Albeit there's probably no
> need to do a remap_pfn_range at mmap(), as DAX supplies a fault/huge_fault.
> Also, using that means it's limited to a single contiguous PFN chunk.
>
> KVM has the bits to make it work without struct pages, I don't think
> there's a need for new pg/pfn_t/VM_* bits (aside from relying on
> {PFN,PAGE}_SPECIAL) as mentioned at the start of the thread. I'm storing
> my wip here:
>
> https://github.com/jpemartins/linux pageless-dax
>
> Which is based on the first series that had been submitted earlier this year:
>
> 
> https://lore.kernel.org/kvm/20200110190313.17144-1-joao.m.mart...@oracle.com/
>
>   Joao

Just as Joao mentioned, remap_pfn_range() maps a single contiguous PFN
range, which is not what we intend. As for VM_DMEM, I think we can drop it
in the next version and use the existing bits as much as possible to
minimize the modifications.
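
To illustrate the constraint, a hedged sketch of the /dev/mem-style
alternative discussed above. dmem_region_base_pfn() is a hypothetical
helper; the point is that remap_pfn_range() wires the whole VMA to one
contiguous PFN range at mmap() time:

/*
 * Hedged sketch, not dmemfs code: the /dev/mem-style alternative.  The whole
 * VMA is mapped to a single base pfn at mmap() time; remap_pfn_range()
 * itself marks the VMA VM_IO|VM_PFNMAP.  This only works if the backing
 * region is one physically contiguous chunk.  dmem_region_base_pfn() is a
 * hypothetical helper.
 */
static int dmem_mmap_contig(struct file *file, struct vm_area_struct *vma)
{
	unsigned long pfn = dmem_region_base_pfn(file) + vma->vm_pgoff;
	unsigned long size = vma->vm_end - vma->vm_start;

	return remap_pfn_range(vma, vma->vm_start, pfn, size,
			       vma->vm_page_prot);
}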


Re: [PATCH 01/35] fs: introduce dmemfs module

2020-11-11 Thread yulei zhang
On Wed, Nov 11, 2020 at 4:04 AM Al Viro  wrote:
>
> On Thu, Oct 08, 2020 at 03:53:51PM +0800, yulei.ker...@gmail.com wrote:
>
> > +static struct inode *
> > +dmemfs_get_inode(struct super_block *sb, const struct inode *dir, umode_t mode,
> > +  dev_t dev);
>
> WTF is 'dev' for?
>
> > +static int
> > +dmemfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev)
> > +{
> > + struct inode *inode = dmemfs_get_inode(dir->i_sb, dir, mode, dev);
> > + int error = -ENOSPC;
> > +
> > + if (inode) {
> > + d_instantiate(dentry, inode);
> > + dget(dentry);   /* Extra count - pin the dentry in core */
> > + error = 0;
> > + dir->i_mtime = dir->i_ctime = current_time(inode);
> > + }
> > + return error;
> > +}
>
> ... same here, seeing that you only call that thing from the next two
> functions and you do *not* provide ->mknod() as a method (unsurprisingly -
> what would device nodes do there?)
>

Thanks for pointing this out. We may need to support the mknod method;
otherwise the dev argument is redundant and needs to be removed.

> > +static int dmemfs_create(struct inode *dir, struct dentry *dentry,
> > +  umode_t mode, bool excl)
> > +{
> > + return dmemfs_mknod(dir, dentry, mode | S_IFREG, 0);
> > +}
> > +
> > +static int dmemfs_mkdir(struct inode *dir, struct dentry *dentry,
> > + umode_t mode)
> > +{
> > + int retval = dmemfs_mknod(dir, dentry, mode | S_IFDIR, 0);
> > +
> > + if (!retval)
> > + inc_nlink(dir);
> > + return retval;
> > +}
>
> > +int dmemfs_file_mmap(struct file *file, struct vm_area_struct *vma)
> > +{
> > + return 0;
> > +}
> > +
> > +static const struct file_operations dmemfs_file_operations = {
> > + .mmap = dmemfs_file_mmap,
> > +};
>
> Er...  Is that a placeholder for later in the series?  Because as it is,
> it makes no sense whatsoever - "it can be mmapped, but any access to the
> mapped area will segfault".
>

Yes, we separate the full implementation of dmemfs_file_mmap into patch
05/35, which assigns the interfaces that handle the page faults, roughly
along the lines of the sketch below.
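
For readers following along, a hedged sketch of the shape such an mmap
implementation typically takes (this is an assumption about patch 05/35, not
its actual code); dmem_pgoff_to_pfn() is a hypothetical helper:

/*
 * Hedged sketch: mmap() only installs vm_ops, and pages are mapped on
 * demand from the fault handler with vmf_insert_pfn().  dmem_pgoff_to_pfn()
 * is a hypothetical helper resolving the file offset to the backing pfn.
 */
static vm_fault_t dmemfs_fault(struct vm_fault *vmf)
{
	unsigned long pfn = dmem_pgoff_to_pfn(vmf->vma->vm_file, vmf->pgoff);

	return vmf_insert_pfn(vmf->vma, vmf->address, pfn);
}

static const struct vm_operations_struct dmemfs_vm_ops = {
	.fault = dmemfs_fault,
};

int dmemfs_file_mmap(struct file *file, struct vm_area_struct *vma)
{
	/* vmf_insert_pfn() requires a VM_PFNMAP (or VM_MIXEDMAP) vma. */
	vma->vm_flags |= VM_PFNMAP;
	vma->vm_ops = &dmemfs_vm_ops;
	return 0;
}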

> > +struct inode *dmemfs_get_inode(struct super_block *sb,
> > +       const struct inode *dir, umode_t mode, dev_t dev)
> > +{
> > + struct inode *inode = new_inode(sb);
> > +
> > + if (inode) {
> > + inode->i_ino = get_next_ino();
> > + inode_init_owner(inode, dir, mode);
> > + inode->i_mapping->a_ops = &empty_aops;
> > + mapping_set_gfp_mask(inode->i_mapping, GFP_HIGHUSER);
> > + mapping_set_unevictable(inode->i_mapping);
> > + inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode);
> > + switch (mode & S_IFMT) {
> > + default:
> > + init_special_inode(inode, mode, dev);
> > + break;
> > + case S_IFREG:
> > + inode->i_op = &dmemfs_file_inode_operations;
> > + inode->i_fop = &dmemfs_file_operations;
> > + break;
> > + case S_IFDIR:
> > + inode->i_op = &dmemfs_dir_inode_operations;
> > + inode->i_fop = &simple_dir_operations;
> > +
> > + /*
> > +  * directory inodes start off with i_nlink == 2
> > +  * (for "." entry)
> > +  */
> > + inc_nlink(inode);
> > + break;
> > + case S_IFLNK:
> > + inode->i_op = &page_symlink_inode_operations;
> > + break;
>
> Where would symlinks come from?  Or anything other than regular files and
> directories, for that matter...

You are right, so far it just supports regular files and directories.


Re: [PATCH 01/35] fs: introduce dmemfs module

2020-11-12 Thread yulei zhang
On Thu, Nov 12, 2020 at 7:09 AM Al Viro  wrote:
>
> On Wed, Nov 11, 2020 at 04:53:00PM +0800, yulei zhang wrote:
>
> > > ... same here, seeing that you only call that thing from the next two
> > > functions and you do *not* provide ->mknod() as a method (unsurprisingly -
> > > what would device nodes do there?)
> > >
> >
> > Thanks for pointing this out. We may need to support the mknod method;
> > otherwise the dev argument is redundant and needs to be removed.
>
> I'd suggest turning that into (static) __create_file() with
>
> static int dmemfs_create(struct inode *dir, struct dentry *dentry,
>  umode_t mode, bool excl)
> {
> return __create_file(dir, dentry, mode | S_IFREG);
> }
>
> static int dmemfs_mkdir(struct inode *dir, struct dentry *dentry,
>  umode_t mode)
> {
> return __create_file(dir, dentry, mode | S_IFDIR);
> }
>
> (i.e. even inc_nlink() of parent folded into that).
>
> [snip]
>
> > Yes, we seperate the full implementation for dmemfs_file_mmap into
> > patch 05/35, it
> > will assign the interfaces to handle the page fault.
>
> It would be less confusing to move the introduction of ->mmap() to that patch,
> then.

Thanks for the suggestion. We will refactor the patches accordingly.
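
A hedged sketch of how the refactor suggested above might look once the
unused dev_t argument is dropped (the three-argument dmemfs_get_inode() is
an assumed change, not code from the series):

/*
 * Hedged sketch of the discussed refactor: fold the common work, including
 * the parent inc_nlink() for directories, into one static helper and keep
 * dmemfs_create()/dmemfs_mkdir() as thin wrappers, as suggested above.
 */
static int __create_file(struct inode *dir, struct dentry *dentry,
			 umode_t mode)
{
	struct inode *inode = dmemfs_get_inode(dir->i_sb, dir, mode);

	if (!inode)
		return -ENOSPC;

	d_instantiate(dentry, inode);
	dget(dentry);	/* Extra count - pin the dentry in core */
	dir->i_mtime = dir->i_ctime = current_time(inode);
	if (S_ISDIR(mode))
		inc_nlink(dir);
	return 0;
}

static int dmemfs_create(struct inode *dir, struct dentry *dentry,
			 umode_t mode, bool excl)
{
	return __create_file(dir, dentry, mode | S_IFREG);
}

static int dmemfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
{
	return __create_file(dir, dentry, mode | S_IFDIR);
}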


[PATCH RFC] vfio: Implement new Ioctl VFIO_IOMMU_GET_DIRTY_BITMAP

2017-07-30 Thread Yulei Zhang
This patch is to implement the new ioctl VFIO_IOMMU_GET_DIRTY_BITMAP
to fulfill the requirement for vfio-mdev device live migration, which
need copy the memory that has been pinned in iommu container to the
target VM for mdev device status restore.

Signed-off-by: Yulei Zhang 
---
 drivers/vfio/vfio_iommu_type1.c | 38 ++
 include/uapi/linux/vfio.h   | 14 ++
 2 files changed, 52 insertions(+)

diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index 8549cb1..fab9e26 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -41,6 +41,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #define DRIVER_VERSION  "0.2"
 #define DRIVER_AUTHOR   "Alex Williamson "
@@ -1526,6 +1527,23 @@ static int vfio_domains_have_iommu_cache(struct vfio_iommu *iommu)
return ret;
 }
 
+static void vfio_dma_update_dirty_bitmap(struct vfio_iommu *iommu, u64 start_addr,
+ u64 npage, void *bitmap)
+{
+   u64 iova = start_addr;
+   struct vfio_dma *dma;
+   int i;
+
+   for (i = 0; i < npage; i++) {
+   dma = vfio_find_dma(iommu, iova, PAGE_SIZE);
+   if (dma)
+   if (vfio_find_vpfn(dma, iova))
+   set_bit(i, bitmap);
+
+   iova += PAGE_SIZE;
+   }
+}
+
 static long vfio_iommu_type1_ioctl(void *iommu_data,
   unsigned int cmd, unsigned long arg)
 {
@@ -1596,6 +1614,26 @@ static long vfio_iommu_type1_ioctl(void *iommu_data,
 
return copy_to_user((void __user *)arg, &unmap, minsz) ?
-EFAULT : 0;
+   } else if (cmd == VFIO_IOMMU_GET_DIRTY_BITMAP) {
+   struct vfio_iommu_get_dirty_bitmap d;
+   unsigned long bitmap_sz;
+   unsigned *bitmap;
+
+   minsz = offsetofend(struct vfio_iommu_get_dirty_bitmap, page_nr);
+
+   if (copy_from_user(&d, (void __user *)arg, minsz))
+   return -EFAULT;
+
+   bitmap_sz = (BITS_TO_LONGS(d.page_nr) + 1) * sizeof(unsigned long);
+   bitmap = vzalloc(bitmap_sz);
+   vfio_dma_update_dirty_bitmap(iommu, d.start_addr, d.page_nr, bitmap);
+
+   if (copy_to_user((void __user *)arg + minsz, bitmap, bitmap_sz)) {
+   vfree(bitmap);
+   return -EFAULT;
+   }
+   vfree(bitmap);
+   return 0;
}
 
return -ENOTTY;
diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
index 8424afb..ecf5c53 100644
--- a/include/uapi/linux/vfio.h
+++ b/include/uapi/linux/vfio.h
@@ -571,6 +571,20 @@ struct vfio_iommu_type1_dma_unmap {
 #define VFIO_IOMMU_ENABLE  _IO(VFIO_TYPE, VFIO_BASE + 15)
 #define VFIO_IOMMU_DISABLE _IO(VFIO_TYPE, VFIO_BASE + 16)
 
+/**
+ * VFIO_IOMMU_GET_DIRTY_BITMAP - _IOW(VFIO_TYPE, VFIO_BASE + 17,
+ * struct vfio_iommu_get_dirty_bitmap)
+ *
+ * Return: 0 on success, -errno on failure.
+ */
+struct vfio_iommu_get_dirty_bitmap {
+   __u64  start_addr;
+   __u64  page_nr;
+   __u8   dirty_bitmap[];
+};
+
+#define VFIO_IOMMU_GET_DIRTY_BITMAP _IO(VFIO_TYPE, VFIO_BASE + 17)
+
 /*  Additional API for SPAPR TCE (Server POWERPC) IOMMU  */
 
 /*
-- 
2.7.4
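
For completeness, a hedged userspace sketch of how a migration tool might
call the proposed ioctl, assuming the patch above is applied and container_fd
is an already-configured VFIO type1 container. The kernel writes
(BITS_TO_LONGS(page_nr) + 1) * sizeof(unsigned long) bytes of bitmap
immediately after the header, so the buffer is sized with some slack:

/* Hedged usage sketch, not part of the patch. */
#include <linux/vfio.h>
#include <stdint.h>
#include <stdlib.h>
#include <sys/ioctl.h>

static struct vfio_iommu_get_dirty_bitmap *
get_dirty_bitmap(int container_fd, uint64_t iova, uint64_t pages)
{
	/* pages/8 bytes of bitmap plus slack for the kernel's rounding. */
	size_t bitmap_sz = pages / 8 + 16;
	struct vfio_iommu_get_dirty_bitmap *d;

	d = calloc(1, sizeof(*d) + bitmap_sz);
	if (!d)
		return NULL;

	d->start_addr = iova;
	d->page_nr = pages;

	if (ioctl(container_fd, VFIO_IOMMU_GET_DIRTY_BITMAP, d) < 0) {
		free(d);
		return NULL;
	}
	/* d->dirty_bitmap[] now has one bit set per dirty (pinned) page. */
	return d;
}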