The key steps for a private to shared conversion are:

1. Unmap from guest page tables
2. Set pages associated with requested range in memslot to be
   faultable
3. Update kvm->mem_attr_array

The key steps for a shared to private conversion are:

1. Check and disallow set_memory_attributes if any page in the range
   is still mapped or pinned, by
   a. Updating guest_memfd's faultability to prevent future faulting
   b. Returning -EINVAL if any pages are still mapped or pinned.
2. Update kvm->mem_attr_array

Userspace VMM must ensure shared pages are no longer in use before
requesting the conversion, since any faults racing with this call will
get a SIGBUS.

Co-developed-by: Ackerley Tng <ackerley...@google.com>
Signed-off-by: Ackerley Tng <ackerley...@google.com>
Co-developed-by: Vishal Annapurve <vannapu...@google.com>
Signed-off-by: Vishal Annapurve <vannapu...@google.com>

---
 include/linux/kvm_host.h |   1 +
 virt/kvm/guest_memfd.c   | 207 +++++++++++++++++++++++++++++++++++++++
 virt/kvm/kvm_main.c      |  15 +++
 virt/kvm/kvm_mm.h        |   9 ++
 4 files changed, 232 insertions(+)

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 79a6b1a63027..10993cd33e34 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -2476,6 +2476,7 @@ typedef int (*kvm_gmem_populate_cb)(struct kvm *kvm, 
gfn_t gfn, kvm_pfn_t pfn,
 
 long kvm_gmem_populate(struct kvm *kvm, gfn_t gfn, void __user *src, long 
npages,
                       kvm_gmem_populate_cb post_populate, void *opaque);
+
 #endif
 
 #ifdef CONFIG_HAVE_KVM_ARCH_GMEM_INVALIDATE
diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c
index 1d4dfe0660ad..110c4bbb004b 100644
--- a/virt/kvm/guest_memfd.c
+++ b/virt/kvm/guest_memfd.c
@@ -1592,4 +1592,211 @@ long kvm_gmem_populate(struct kvm *kvm, gfn_t 
start_gfn, void __user *src, long
        return ret && !i ? ret : i;
 }
 EXPORT_SYMBOL_GPL(kvm_gmem_populate);
+
+/**
+ * Returns true if no folio in range [@start, @end) of @inode is mapped into
+ * host userspace or possibly DMA-pinned. Takes and releases the mapping's
+ * invalidate_lock in shared mode for the duration of the scan.
+ */
+static bool kvm_gmem_no_mappings_range(struct inode *inode, pgoff_t start, 
pgoff_t end)
+{
+       pgoff_t index;
+       bool checked_indices_unmapped;
+
+       filemap_invalidate_lock_shared(inode->i_mapping);
+
+       /* TODO: replace iteration with filemap_get_folios() for efficiency. */
+       checked_indices_unmapped = true;
+       for (index = start; checked_indices_unmapped && index < end;) {
+               struct folio *folio;
+
+               /* Don't use kvm_gmem_get_folio to avoid allocating */
+               folio = filemap_lock_folio(inode->i_mapping, index);
+               if (IS_ERR(folio)) {
+                       /* Absent folio: cannot be mapped or pinned, skip it. */
+                       ++index;
+                       continue;
+               }
+
+               if (folio_mapped(folio) || folio_maybe_dma_pinned(folio))
+                       checked_indices_unmapped = false;
+               else
+                       index = folio_next_index(folio);
+
+               folio_unlock(folio);
+               folio_put(folio);
+       }
+
+       filemap_invalidate_unlock_shared(inode->i_mapping);
+       return checked_indices_unmapped;
+}
+
<no%C2%ADtag/>+/**
+ * Returns true if pages in gfn range [@start, @end) of memslot @slot have no
+ * userspace mappings; false if any page is mapped/pinned, or if a reference
+ * to the slot's gmem file can no longer be taken.
+ */
+static bool kvm_gmem_no_mappings_slot(struct kvm_memory_slot *slot,
+                                     gfn_t start, gfn_t end)
+{
+       pgoff_t offset_start;
+       pgoff_t offset_end;
+       struct file *file;
+       bool ret;
+
+       offset_start = start - slot->base_gfn + slot->gmem.pgoff;
+       offset_end = end - slot->base_gfn + slot->gmem.pgoff;
+
+       file = kvm_gmem_get_file(slot);
+       if (!file)
+               return false;
+
+       ret = kvm_gmem_no_mappings_range(file_inode(file), offset_start, 
offset_end);
+
+       fput(file);
+
+       return ret;
+}
+
+/**
+ * Returns true if gfns in range [@start, @end) have no host userspace
+ * mappings in any guest_memfd-backed memslot of @kvm, across all address
+ * spaces. Caller must hold kvm->slots_lock.
+ */
+static bool kvm_gmem_no_mappings(struct kvm *kvm, gfn_t start, gfn_t end)
+{
+       int i;
+
+       lockdep_assert_held(&kvm->slots_lock);
+
+       for (i = 0; i < kvm_arch_nr_memslot_as_ids(kvm); i++) {
+               struct kvm_memslot_iter iter;
+               struct kvm_memslots *slots;
+
+               slots = __kvm_memslots(kvm, i);
+               kvm_for_each_memslot_in_gfn_range(&iter, slots, start, end) {
+                       struct kvm_memory_slot *slot;
+                       gfn_t gfn_start;
+                       gfn_t gfn_end;
+
+                       slot = iter.slot;
+                       /* Clamp request to the part overlapping this slot. */
+                       gfn_start = max(start, slot->base_gfn);
+                       gfn_end = min(end, slot->base_gfn + slot->npages);
+
+                       if (iter.slot->flags & KVM_MEM_GUEST_MEMFD &&
+                           !kvm_gmem_no_mappings_slot(iter.slot, gfn_start, 
gfn_end))
+                               return false;
+               }
+       }
+
+       return true;
+}
+
+/**
+ * Set faultability of gfn range [@start, @end) in memslot @slot to
+ * @faultable. Silently does nothing if a reference to the slot's gmem file
+ * can no longer be taken.
+ */
+static void kvm_gmem_set_faultable_slot(struct kvm_memory_slot *slot, gfn_t 
start,
+                                       gfn_t end, bool faultable)
+{
+       pgoff_t start_offset;
+       pgoff_t end_offset;
+       struct file *file;
+
+       file = kvm_gmem_get_file(slot);
+       if (!file)
+               return;
+
+       start_offset = start - slot->base_gfn + slot->gmem.pgoff;
+       end_offset = end - slot->base_gfn + slot->gmem.pgoff;
+
+       WARN_ON(kvm_gmem_set_faultable(file_inode(file), start_offset, 
end_offset,
+                                      faultable));
+
+       fput(file);
+}
+
+/**
+ * Set faultability of gfn range [@start, @end) to @faultable in every
+ * guest_memfd-backed memslot of @kvm, across all address spaces. Caller must
+ * hold kvm->slots_lock.
+ */
+static void kvm_gmem_set_faultable_vm(struct kvm *kvm, gfn_t start, gfn_t end,
+                                     bool faultable)
+{
+       int i;
+
+       lockdep_assert_held(&kvm->slots_lock);
+
+       for (i = 0; i < kvm_arch_nr_memslot_as_ids(kvm); i++) {
+               struct kvm_memslot_iter iter;
+               struct kvm_memslots *slots;
+
+               slots = __kvm_memslots(kvm, i);
+               kvm_for_each_memslot_in_gfn_range(&iter, slots, start, end) {
+                       struct kvm_memory_slot *slot;
+                       gfn_t gfn_start;
+                       gfn_t gfn_end;
+
+                       slot = iter.slot;
+                       /* Clamp request to the part overlapping this slot. */
+                       gfn_start = max(start, slot->base_gfn);
+                       gfn_end = min(end, slot->base_gfn + slot->npages);
+
+                       if (iter.slot->flags & KVM_MEM_GUEST_MEMFD) {
+                               kvm_gmem_set_faultable_slot(slot, gfn_start,
+                                                           gfn_end, faultable);
+                       }
+               }
+       }
+}
+
+/**
+ * Returns 0 if guest_memfd permits setting range [@start, @end) to PRIVATE,
+ * or -EINVAL if any page in the range is still mapped or pinned.
+ *
+ * Faultability is revoked up front so the mapped/pinned check cannot race
+ * with new host userspace faults; it is restored if the request is refused.
+ */
+static int kvm_gmem_should_set_attributes_private(struct kvm *kvm, gfn_t start,
+                                                 gfn_t end)
+{
+       kvm_gmem_set_faultable_vm(kvm, start, end, false);
+
+       if (kvm_gmem_no_mappings(kvm, start, end))
+               return 0;
+
+       kvm_gmem_set_faultable_vm(kvm, start, end, true);
+       return -EINVAL;
+}
+
+/**
+ * Permits setting range [@start, @end) to SHARED; always returns 0.
+ *
+ * Because this allows pages to be faulted in to userspace, this must only be
+ * called after the pages have been invalidated from guest page tables.
+ */
+static int kvm_gmem_should_set_attributes_shared(struct kvm *kvm, gfn_t start,
+                                                gfn_t end)
+{
+       /* Always okay to set shared, hence set range faultable here. */
+       kvm_gmem_set_faultable_vm(kvm, start, end, true);
+
+       return 0;
+}
+
+/**
+ * Returns 0 if guest_memfd permits setting attributes @attrs for range
+ * [@start, @end), or a negative error otherwise. Caller must hold
+ * kvm->slots_lock (asserted by the helpers).
+ *
+ * If memory is faulted in to host userspace and a request was made to set
+ * the memory to PRIVATE, the faulted in pages must not be mapped or pinned
+ * for the request to be permitted.
+ *
+ * Because this may allow pages to be faulted in to userspace when requested
+ * to set attributes to shared, this must only be called after the pages have
+ * been invalidated from guest page tables.
+ */
+int kvm_gmem_should_set_attributes(struct kvm *kvm, gfn_t start, gfn_t end,
+                                  unsigned long attrs)
+{
+       if (attrs & KVM_MEMORY_ATTRIBUTE_PRIVATE)
+               return kvm_gmem_should_set_attributes_private(kvm, start, end);
+       else
+               return kvm_gmem_should_set_attributes_shared(kvm, start, end);
+}
+
 #endif
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 92901656a0d4..1a7bbcc31b7e 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -2524,6 +2524,13 @@ static int kvm_vm_set_mem_attributes(struct kvm *kvm, 
gfn_t start, gfn_t end,
                .on_lock = kvm_mmu_invalidate_end,
                .may_block = true,
        };
+       struct kvm_mmu_notifier_range error_set_range = {
+               .start = start,
+               .end = end,
+               .handler = (void *)kvm_null_fn,
+               .on_lock = kvm_mmu_invalidate_end,
+               .may_block = true,
+       };
        unsigned long i;
        void *entry;
        int r = 0;
@@ -2548,6 +2555,10 @@ static int kvm_vm_set_mem_attributes(struct kvm *kvm, 
gfn_t start, gfn_t end,
 
        kvm_handle_gfn_range(kvm, &pre_set_range);
 
+       r = kvm_gmem_should_set_attributes(kvm, start, end, attributes);
+       if (r)
+               goto err;
+
        for (i = start; i < end; i++) {
                r = xa_err(xa_store(&kvm->mem_attr_array, i, entry,
                                    GFP_KERNEL_ACCOUNT));
@@ -2560,6 +2571,10 @@ static int kvm_vm_set_mem_attributes(struct kvm *kvm, 
gfn_t start, gfn_t end,
        mutex_unlock(&kvm->slots_lock);
 
        return r;
+
+err:
+       kvm_handle_gfn_range(kvm, &error_set_range);
+       goto out_unlock;
 }
 static int kvm_vm_ioctl_set_mem_attributes(struct kvm *kvm,
                                           struct kvm_memory_attributes *attrs)
diff --git a/virt/kvm/kvm_mm.h b/virt/kvm/kvm_mm.h
index 715f19669d01..d8ff2b380d0e 100644
--- a/virt/kvm/kvm_mm.h
+++ b/virt/kvm/kvm_mm.h
@@ -41,6 +41,8 @@ int kvm_gmem_create(struct kvm *kvm, struct 
kvm_create_guest_memfd *args);
 int kvm_gmem_bind(struct kvm *kvm, struct kvm_memory_slot *slot,
                  unsigned int fd, loff_t offset);
 void kvm_gmem_unbind(struct kvm_memory_slot *slot);
+int kvm_gmem_should_set_attributes(struct kvm *kvm, gfn_t start, gfn_t end,
+                                  unsigned long attrs);
 #else
 static inline void kvm_gmem_init(struct module *module)
 {
@@ -59,6 +61,13 @@ static inline void kvm_gmem_unbind(struct kvm_memory_slot 
*slot)
 {
        WARN_ON_ONCE(1);
 }
+
+/* Without CONFIG_KVM_PRIVATE_MEM there is no guest_memfd to veto attribute
+ * changes, so conversions are always permitted.
+ */
+static inline int kvm_gmem_should_set_attributes(struct kvm *kvm, gfn_t start,
+                                                gfn_t end, unsigned long attrs)
+{
+       return 0;
+}
+
 #endif /* CONFIG_KVM_PRIVATE_MEM */
 
 #endif /* __KVM_MM_H__ */
-- 
2.46.0.598.g6f2099f65c-goog


Reply via email to