From: Ackerley Tng <[email protected]>

When converting memory to private in guest_memfd, it is necessary to ensure
that the pages are not currently being accessed by any other part of the
kernel or userspace to avoid any current user writing to guest private
memory.

guest_memfd checks for unexpected refcounts to determine whether a page is
still in use. The only expected refcounts after unmapping the range
requested for conversion are those that are held by guest_memfd itself.

Update the kvm_memory_attributes2 structure to include an error_offset
field. This allows KVM to report the exact offset where a conversion
failed to userspace. If the safety check fails, return -EAGAIN and copy
the error_offset back to userspace so that it can potentially retry the
operation or handle the failure gracefully.

Suggested-by: David Hildenbrand <[email protected]>
Signed-off-by: Ackerley Tng <[email protected]>
Co-developed-by: Vishal Annapurve <[email protected]>
Signed-off-by: Vishal Annapurve <[email protected]>
---
 include/uapi/linux/kvm.h |  3 ++-
 virt/kvm/guest_memfd.c   | 65 ++++++++++++++++++++++++++++++++++++++++++++----
 2 files changed, 62 insertions(+), 6 deletions(-)

diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index e6bbf68a83813..0b55258573d3d 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -1658,7 +1658,8 @@ struct kvm_memory_attributes2 {
        __u64 size;
        __u64 attributes;
        __u64 flags;
-       __u64 reserved[12];
+       __u64 error_offset;
+       __u64 reserved[11];
 };
 
 #define KVM_MEMORY_ATTRIBUTE_PRIVATE           (1ULL << 3)
diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c
index 9a26eca717047..e87a2b72ff802 100644
--- a/virt/kvm/guest_memfd.c
+++ b/virt/kvm/guest_memfd.c
@@ -584,9 +584,42 @@ static int kvm_gmem_mas_preallocate(struct ma_state *mas, u64 attributes,
        return mas_preallocate(mas, xa_mk_value(attributes), GFP_KERNEL);
 }
 
+static bool kvm_gmem_is_safe_for_conversion(struct inode *inode, pgoff_t start,
+                                           size_t nr_pages, pgoff_t *err_index)
+{
+       struct address_space *mapping = inode->i_mapping;
+       const int filemap_get_folios_refcount = 1;
+       pgoff_t last = start + nr_pages - 1;
+       struct folio_batch fbatch;
+       bool safe = true;
+       int i;
+
+       folio_batch_init(&fbatch);
+       while (safe && filemap_get_folios(mapping, &start, last, &fbatch)) {
+
+               for (i = 0; i < folio_batch_count(&fbatch); ++i) {
+                       struct folio *folio = fbatch.folios[i];
+
+                       if (folio_ref_count(folio) !=
+                           folio_nr_pages(folio) + filemap_get_folios_refcount) {
+                               safe = false;
+                               *err_index = folio->index;
+                               break;
+                       }
+               }
+
+               folio_batch_release(&fbatch);
+               cond_resched();
+       }
+
+       return safe;
+}
+
 static int __kvm_gmem_set_attributes(struct inode *inode, pgoff_t start,
-                                    size_t nr_pages, uint64_t attrs)
+                                    size_t nr_pages, uint64_t attrs,
+                                    pgoff_t *err_index)
 {
+       bool to_private = attrs & KVM_MEMORY_ATTRIBUTE_PRIVATE;
        struct address_space *mapping = inode->i_mapping;
        struct gmem_inode *gi = GMEM_I(inode);
        pgoff_t end = start + nr_pages;
@@ -600,8 +633,21 @@ static int __kvm_gmem_set_attributes(struct inode *inode, 
pgoff_t start,
 
        mas_init(&mas, mt, start);
        r = kvm_gmem_mas_preallocate(&mas, attrs, start, nr_pages);
-       if (r)
+       if (r) {
+               *err_index = start;
                goto out;
+       }
+
+       if (to_private) {
+               unmap_mapping_pages(mapping, start, nr_pages, false);
+
+               if (!kvm_gmem_is_safe_for_conversion(inode, start, nr_pages,
+                                                    err_index)) {
+                       mas_destroy(&mas);
+                       r = -EAGAIN;
+                       goto out;
+               }
+       }
 
        /*
         * From this point on guest_memfd has performed necessary
@@ -621,9 +667,10 @@ static long kvm_gmem_set_attributes(struct file *file, void __user *argp)
        struct gmem_file *f = file->private_data;
        struct inode *inode = file_inode(file);
        struct kvm_memory_attributes2 attrs;
+       pgoff_t err_index;
        size_t nr_pages;
        pgoff_t index;
-       int i;
+       int i, r;
 
        if (copy_from_user(&attrs, argp, sizeof(attrs)))
                return -EFAULT;
@@ -647,8 +694,16 @@ static long kvm_gmem_set_attributes(struct file *file, void __user *argp)
 
        nr_pages = attrs.size >> PAGE_SHIFT;
        index = attrs.offset >> PAGE_SHIFT;
-       return __kvm_gmem_set_attributes(inode, index, nr_pages,
-                                        attrs.attributes);
+       r = __kvm_gmem_set_attributes(inode, index, nr_pages, attrs.attributes,
+                                     &err_index);
+       if (r) {
+               attrs.error_offset = ((uint64_t)err_index) << PAGE_SHIFT;
+
+               if (copy_to_user(argp, &attrs, sizeof(attrs)))
+                       return -EFAULT;
+       }
+
+       return r;
 }
 
 static long kvm_gmem_ioctl(struct file *file, unsigned int ioctl,

-- 
2.54.0.545.g6539524ca2-goog



Reply via email to