When using guest_memfd with support for shared memory / in-place conversion, it is necessary to use the guest_memfd ioctls to handle conversions instead of KVM ioctls. Implement support for this by looping through all the sections within a conversion range. Implement everything in terms of the kvm_convert_memory() loop, which already deals with some special considerations regarding various holes / region types that might be encountered.
Also update kvm_set_memory_attributes_*() to use the same common path when convert-in-place=false. This potentially results in a small change in behavior due to the additional MMIO checks/skips now being applied in that case (generally qemu-triggered during setup) rather than only for kvm_convert_memory() (generally guest-triggered), but this is arguably safer, and it provides similar behavior between convert-in-place=false vs. convert-in-place=true, the latter of which *must* skip MMIO holes because the regions (and associated guest_memfds) themselves track shared/private state internally and passing the whole conversion range through to KVM is not an option in that case. Signed-off-by: Michael Roth <[email protected]> --- accel/kvm/kvm-all.c | 131 ++++++++++++++++++++++++++++++++++++++------ 1 file changed, 114 insertions(+), 17 deletions(-) diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c index 62f2e8aa15..fd01435a0f 100644 --- a/accel/kvm/kvm-all.c +++ b/accel/kvm/kvm-all.c @@ -1626,14 +1626,78 @@ static int kvm_set_memory_attributes(hwaddr start, uint64_t size, uint64_t attr) return r; } -int kvm_set_memory_attributes_private(hwaddr start, uint64_t size) +static int kvm_gmem_ioctl(int guest_memfd, unsigned long type, ...) 
{ - return kvm_set_memory_attributes(start, size, KVM_MEMORY_ATTRIBUTE_PRIVATE); + int ret; + void *arg; + va_list ap; + + va_start(ap, type); + arg = va_arg(ap, void *); + va_end(ap); + + ret = ioctl(guest_memfd, type, arg); + if (ret == -1) { + ret = -errno; + } + return ret; } -int kvm_set_memory_attributes_shared(hwaddr start, uint64_t size) +static int guest_memfd_set_memory_attributes_fd(int guest_memfd, hwaddr offset, + uint64_t size, uint64_t attr) { - return kvm_set_memory_attributes(start, size, 0); + struct kvm_memory_attributes2 attrs; + int r; + + assert((attr & kvm_supported_memory_attributes) == attr); + attrs.attributes = attr; + attrs.offset = offset; + attrs.size = size; + attrs.flags = 0; + + /* + * guest_memfd may need to delay conversion requests due to + * the memory being in-use by the kernel. In most cases these + * will be transient uses. In some cases, userspace itself may + * be the cause of the memory being considered in-use, though + * QEMU currently takes steps to avoid this (e.g. via + * RamBlockAttributes). On that basis, this code loops + * indefinitely with the assumption that only transient cases + * will block, and that those will be for relatively short + * periods vs. the overall conversion path. + * If those assumptions at some point prove false, most likely + * this will manifest as guest-side lockups on their conversion + * path, which seems like the appropriate way to surface this + * situation to the guest owner rather than some hard timeout. 
+ */ + do { + r = kvm_gmem_ioctl(guest_memfd, KVM_SET_MEMORY_ATTRIBUTES2, &attrs); + } while (r == -EAGAIN); + + if (r) { + error_report("failed to set memory (0x%" HWADDR_PRIx "+0x%" PRIx64 ") " + "with attr 0x%" PRIx64 " error '%s'", + offset, size, attr, strerror(-r)); + } + return r; +} + +static int guest_memfd_set_memory_section_attributes(MemoryRegionSection *section, uint64_t attr) +{ + hwaddr convert_offset, convert_size; + MemoryRegion *mr = section->mr; + RAMBlock *rb; + + assert(mr); + rb = mr->ram_block; + assert(rb->guest_memfd); + convert_offset = section->offset_within_region; + convert_size = int128_get64(section->size); + + return guest_memfd_set_memory_attributes_fd(rb->guest_memfd, + convert_offset, + convert_size, + attr); } /* Called with KVMMemoryListener.slots_lock held */ @@ -3447,10 +3511,18 @@ static int kvm_convert_section(MemoryRegionSection *section, bool to_private) hwaddr size = int128_get64(section->size); int ret; - if (to_private) { - ret = kvm_set_memory_attributes_private(start, size); + if (current_machine->cgs && current_machine->cgs->convert_in_place) { + ret = guest_memfd_set_memory_section_attributes(section, + to_private ? KVM_MEMORY_ATTRIBUTE_PRIVATE + : 0); } else { - ret = kvm_set_memory_attributes_shared(start, size); + /* + * Without in-place conversion, attribute-tracking is handled by KVM + * across all guest memory rather than on a per-section/slot basis. + */ + ret = kvm_set_memory_attributes(start, size, + to_private ? 
KVM_MEMORY_ATTRIBUTE_PRIVATE + : 0); } return ret; @@ -3544,7 +3616,8 @@ static int kvm_post_convert_section(MemoryRegionSection *section, bool to_privat return 0; } -int kvm_convert_memory(hwaddr start, hwaddr size, bool to_private) +static int kvm_convert_memory_full(hwaddr start, hwaddr size, bool to_private, + bool pre_hooks, bool post_hooks) { int ret = -EINVAL; @@ -3588,10 +3661,12 @@ int kvm_convert_memory(hwaddr start, hwaddr size, bool to_private) continue; } - ret = kvm_pre_convert_section(&section, to_private); - if (ret) { - memory_region_unref(section.mr); - break; + if (pre_hooks) { + ret = kvm_pre_convert_section(&section, to_private); + if (ret) { + memory_region_unref(section.mr); + break; + } } ret = kvm_convert_section(&section, to_private); @@ -3600,13 +3675,15 @@ int kvm_convert_memory(hwaddr start, hwaddr size, bool to_private) break; } - ret = kvm_post_convert_section(&section, to_private); - memory_region_unref(section.mr); - - if (ret) { - break; + if (post_hooks) { + ret = kvm_post_convert_section(&section, to_private); + if (ret) { + memory_region_unref(section.mr); + break; + } } + memory_region_unref(section.mr); size -= section_end - start; start = section_end; } @@ -3614,6 +3691,26 @@ int kvm_convert_memory(hwaddr start, hwaddr size, bool to_private) return ret; } +int kvm_convert_memory(hwaddr start, hwaddr size, bool to_private) +{ + return kvm_convert_memory_full(start, size, to_private, true, true); +} + +static int kvm_convert_memory_attributes(hwaddr start, hwaddr size, bool to_private) +{ + return kvm_convert_memory_full(start, size, to_private, false, false); +} + +int kvm_set_memory_attributes_private(hwaddr start, uint64_t size) +{ + return kvm_convert_memory_attributes(start, size, KVM_MEMORY_ATTRIBUTE_PRIVATE); +} + +int kvm_set_memory_attributes_shared(hwaddr start, uint64_t size) +{ + return kvm_convert_memory_attributes(start, size, 0); +} + int kvm_cpu_exec(CPUState *cpu) { struct kvm_run *run = cpu->kvm_run; -- 2.43.0
