When using guest_memfd with support for shared memory / in-place
conversion, it is necessary to use the guest_memfd ioctls to handle
conversions instead of KVM ioctls. Implement support for this by looping
through all the sections within a conversion range. Implement everything
in terms of the kvm_convert_memory() loop, which already deals with some
special considerations regarding various holes / region types that might
be encountered.
Also update kvm_set_memory_attributes_*() to use the same common path
when convert-in-place=false. This potentially results in a small change
in behavior due to the additional MMIO checks/skips now being applied in
that case (generally qemu-triggered during setup) rather than only for
kvm_convert_memory() (generally guest-triggered), but this is arguably
safer, and it provides similar behavior between convert-in-place=false
vs. convert-in-place=true, the latter of which *must* skip MMIO holes
because the regions (and associated guest_memfds) themselves track
shared/private state internally and passing the whole conversion range
through to KVM is not an option in that case.
Signed-off-by: Michael Roth <[email protected]>
---
accel/kvm/kvm-all.c | 131 ++++++++++++++++++++++++++++++++++++++------
1 file changed, 114 insertions(+), 17 deletions(-)
diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c
index 62f2e8aa15..fd01435a0f 100644
--- a/accel/kvm/kvm-all.c
+++ b/accel/kvm/kvm-all.c
@@ -1626,14 +1626,78 @@ static int kvm_set_memory_attributes(hwaddr start,
uint64_t size, uint64_t attr)
return r;
}
-int kvm_set_memory_attributes_private(hwaddr start, uint64_t size)
+static int kvm_gmem_ioctl(int guest_memfd, unsigned long type, ...)
{
- return kvm_set_memory_attributes(start, size,
KVM_MEMORY_ATTRIBUTE_PRIVATE);
+ int ret;
+ void *arg;
+ va_list ap;
+
+ va_start(ap, type);
+ arg = va_arg(ap, void *);
+ va_end(ap);
+
+ ret = ioctl(guest_memfd, type, arg);
+ if (ret == -1) {
+ ret = -errno;
+ }
+ return ret;
}
-int kvm_set_memory_attributes_shared(hwaddr start, uint64_t size)
+static int guest_memfd_set_memory_attributes_fd(int guest_memfd, hwaddr offset,
+ uint64_t size, uint64_t attr)
{
- return kvm_set_memory_attributes(start, size, 0);
+ struct kvm_memory_attributes2 attrs;
+ int r;
+
+ assert((attr & kvm_supported_memory_attributes) == attr);
+ attrs.attributes = attr;
+ attrs.offset = offset;
+ attrs.size = size;
+ attrs.flags = 0;
+
+ /*
+ * guest_memfd may need to delay conversion requests due to
+ * the memory being in-use by the kernel. In most cases these
+ * will be transient uses. In some cases, userspace itself may
+ * be the cause of the memory being considered in-use, though
+ * QEMU currently takes steps to avoid this (e.g. via
+ * RamBlockAttributes). On that basis, this code loops
+ * indefinitely with the assumption that only transient cases
+ * will block, and that those will be for relatively short
+ * periods vs. the overall conversion path.
+ * If those assumptions at some point prove false, most likely
+ * this will manifest as guest-side lockups on their conversion
+ * path, which seems like the appropriate way to surface this
+ * situation to the guest owner rather than some hard timeout.
+ */
+ do {
+ r = kvm_gmem_ioctl(guest_memfd, KVM_SET_MEMORY_ATTRIBUTES2, &attrs);
+ } while (r == -EAGAIN);
+
+ if (r) {
+ error_report("failed to set memory (0x%" HWADDR_PRIx "+0x%" PRIx64 ") "
+ "with attr 0x%" PRIx64 " error '%s'",
+ offset, size, attr, strerror(-r));
+ }
+ return r;
+}
+
+static int guest_memfd_set_memory_section_attributes(MemoryRegionSection
*section, uint64_t attr)
+{
+ hwaddr convert_offset, convert_size;
+ MemoryRegion *mr = section->mr;
+ RAMBlock *rb;
+
+ assert(mr);
+ rb = mr->ram_block;
+ assert(rb->guest_memfd);
+ convert_offset = section->offset_within_region;
+ convert_size = int128_get64(section->size);
+
+ return guest_memfd_set_memory_attributes_fd(rb->guest_memfd,
+ convert_offset,
+ convert_size,
+ attr);
}
/* Called with KVMMemoryListener.slots_lock held */
@@ -3447,10 +3511,18 @@ static int kvm_convert_section(MemoryRegionSection
*section, bool to_private)
hwaddr size = int128_get64(section->size);
int ret;
- if (to_private) {
- ret = kvm_set_memory_attributes_private(start, size);
+ if (current_machine->cgs && current_machine->cgs->convert_in_place) {
+ ret = guest_memfd_set_memory_section_attributes(section,
+ to_private ?
KVM_MEMORY_ATTRIBUTE_PRIVATE
+ : 0);
} else {
- ret = kvm_set_memory_attributes_shared(start, size);
+ /*
+ * Without in-place conversion, attribute-tracking is handled by KVM
+ * across all guest memory rather than on a per-section/slot basis.
+ */
+ ret = kvm_set_memory_attributes(start, size,
+ to_private ?
KVM_MEMORY_ATTRIBUTE_PRIVATE
+ : 0);
}
return ret;
@@ -3544,7 +3616,8 @@ static int kvm_post_convert_section(MemoryRegionSection
*section, bool to_privat
return 0;
}
-int kvm_convert_memory(hwaddr start, hwaddr size, bool to_private)
+static int kvm_convert_memory_full(hwaddr start, hwaddr size, bool to_private,
+ bool pre_hooks, bool post_hooks)
{
int ret = -EINVAL;
@@ -3588,10 +3661,12 @@ int kvm_convert_memory(hwaddr start, hwaddr size, bool to_private)
continue;
}
- ret = kvm_pre_convert_section(&section, to_private);
- if (ret) {
- memory_region_unref(section.mr);
- break;
+ if (pre_hooks) {
+ ret = kvm_pre_convert_section(&section, to_private);
+ if (ret) {
+ memory_region_unref(section.mr);
+ break;
+ }
}
ret = kvm_convert_section(&section, to_private);
@@ -3600,13 +3675,15 @@ int kvm_convert_memory(hwaddr start, hwaddr size, bool
to_private)
break;
}
- ret = kvm_post_convert_section(&section, to_private);
- memory_region_unref(section.mr);
-
- if (ret) {
- break;
+ if (post_hooks) {
+ ret = kvm_post_convert_section(&section, to_private);
+ if (ret) {
+ memory_region_unref(section.mr);
+ break;
+ }
}
+ memory_region_unref(section.mr);
size -= section_end - start;
start = section_end;
}
@@ -3614,6 +3691,26 @@ int kvm_convert_memory(hwaddr start, hwaddr size, bool
to_private)
return ret;
}
+int kvm_convert_memory(hwaddr start, hwaddr size, bool to_private)
+{
+ return kvm_convert_memory_full(start, size, to_private, true, true);
+}
+
+static int kvm_convert_memory_attributes(hwaddr start, hwaddr size, bool
to_private)
+{
+ return kvm_convert_memory_full(start, size, to_private, false, false);
+}
+
+int kvm_set_memory_attributes_private(hwaddr start, uint64_t size)
+{
+ return kvm_convert_memory_attributes(start, size,
KVM_MEMORY_ATTRIBUTE_PRIVATE);
+}
+
+int kvm_set_memory_attributes_shared(hwaddr start, uint64_t size)
+{
+ return kvm_convert_memory_attributes(start, size, 0);
+}
+
int kvm_cpu_exec(CPUState *cpu)
{
struct kvm_run *run = cpu->kvm_run;