On Thu, Jun 04, 2026 at 03:19:17PM +0200, Gupta, Pankaj wrote:
> 
> > When using guest_memfd with support for shared memory / in-place
> > conversion, it is necessary to use the guest_memfd ioctls to handle
> > conversions instead of KVM ioctls. Implement support for this by looping
> > through all the sections within a conversion range. Implement everything
> > in terms of the kvm_convert_memory() loop, which already deals with some
> > special considerations regarding various holes / region types that might
> > be encountered.
> > 
> > Also update kvm_set_memory_attributes_*() to use the same common path
> > when convert-in-place=false. This potentially results in a small change
> > in behavior due to the additional MMIO checks/skips now being applied in
> > that case (generally qemu-triggered during setup) rather than only for
> > kvm_convert_memory() (generally guest-triggered), but this is arguably
> > safer, and it provides similar behavior between convert-in-place=false
> > vs. convert-in-place=true, the latter of which *must* skip MMIO holes
> > because the regions (and associated guest_memfds) themselves track
> > shared/private state internally and passing the whole conversion range
> > through to KVM is not an option in that case.
> > 
> > Signed-off-by: Michael Roth <[email protected]>
> > ---
> >   accel/kvm/kvm-all.c | 131 ++++++++++++++++++++++++++++++++++++++------
> >   1 file changed, 114 insertions(+), 17 deletions(-)
> > 
> > diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c
> > index 62f2e8aa15..fd01435a0f 100644
> > --- a/accel/kvm/kvm-all.c
> > +++ b/accel/kvm/kvm-all.c
> > @@ -1626,14 +1626,78 @@ static int kvm_set_memory_attributes(hwaddr start, 
> > uint64_t size, uint64_t attr)
> >       return r;
> >   }
> > -int kvm_set_memory_attributes_private(hwaddr start, uint64_t size)
> > +static int kvm_gmem_ioctl(int guest_memfd, unsigned long type, ...)
> >   {
> > -    return kvm_set_memory_attributes(start, size, 
> > KVM_MEMORY_ATTRIBUTE_PRIVATE);
> > +    int ret;
> > +    void *arg;
> > +    va_list ap;
> > +
> > +    va_start(ap, type);
> > +    arg = va_arg(ap, void *);
> > +    va_end(ap);
> > +
> > +    ret = ioctl(guest_memfd, type, arg);
> > +    if (ret == -1) {
> > +        ret = -errno;
> > +    }
> > +    return ret;
> >   }
> > -int kvm_set_memory_attributes_shared(hwaddr start, uint64_t size)
> > +static int guest_memfd_set_memory_attributes_fd(int guest_memfd, hwaddr 
> > offset,
> > +                                                uint64_t size, uint64_t 
> > attr)
> >   {
> > -    return kvm_set_memory_attributes(start, size, 0);
> > +    struct kvm_memory_attributes2 attrs;
> 
> -    struct kvm_memory_attributes2 attrs;
> +    struct kvm_memory_attributes2 attrs = {0};
> 
> Zero-initializing 'attrs' fixed an '-EINVAL' error, which was caused by the
> kernel's 'attrs.reserved' check failing in 'kvm_gmem_set_attributes()'.

Indeed, thanks for the catch!

-Mike

> 
> Thanks,
> 
> Pankaj
> 
> > +    int r;
> > +
> > +    assert((attr & kvm_supported_memory_attributes) == attr);
> > +    attrs.attributes = attr;
> > +    attrs.offset = offset;
> > +    attrs.size = size;
> > +    attrs.flags = 0;
> > +
> > +    /*
> > +     * guest_memfd may need to delay conversion requests due to
> > +     * the memory being in-use by the kernel. In most cases these
> > +     * will be transient uses. In some cases, userspace itself may
> > +     * be the cause of the memory being considered in-use, though
> > +     * QEMU currently takes steps to avoid this (e.g. via
> > +     * RamBlockAttributes). On that basis, this code loops
> > +     * indefinitely with the assumption that only transient cases
> > +     * will block, and that those will be for relatively short
> > +     * periods vs. the overall conversion path.
> > +     * If those assumptions at some point prove false, most likely
> > +     * this will manifest as guest-side lockups on their conversion
> > +     * path, which seems like the appropriate way to surface this
> > +     * situation to the guest owner rather than some hard timeout.
> > +     */
> > +    do {
> > +        r = kvm_gmem_ioctl(guest_memfd, KVM_SET_MEMORY_ATTRIBUTES2, 
> > &attrs);
> > +    } while (r == -EAGAIN);
> > +
> > +    if (r) {
> > +        error_report("failed to set memory (0x%" HWADDR_PRIx "+0x%" PRIx64 
> > ") "
> > +                     "with attr 0x%" PRIx64 " error '%s'",
> > +                     offset, size, attr, strerror(-r));
> > +    }
> > +    return r;
> > +}
> > +
> > +static int guest_memfd_set_memory_section_attributes(MemoryRegionSection 
> > *section, uint64_t attr)
> > +{
> > +    hwaddr convert_offset, convert_size;
> > +    MemoryRegion *mr = section->mr;
> > +    RAMBlock *rb;
> > +
> > +    assert(mr);
> > +    rb = mr->ram_block;
> > +    assert(rb->guest_memfd);
> > +    convert_offset = section->offset_within_region;
> > +    convert_size = int128_get64(section->size);
> > +
> > +    return guest_memfd_set_memory_attributes_fd(rb->guest_memfd,
> > +                                                convert_offset,
> > +                                                convert_size,
> > +                                                attr);
> >   }
> >   /* Called with KVMMemoryListener.slots_lock held */
> > @@ -3447,10 +3511,18 @@ static int kvm_convert_section(MemoryRegionSection 
> > *section, bool to_private)
> >       hwaddr size = int128_get64(section->size);
> >       int ret;
> > -    if (to_private) {
> > -        ret = kvm_set_memory_attributes_private(start, size);
> > +    if (current_machine->cgs && current_machine->cgs->convert_in_place) {
> > +        ret = guest_memfd_set_memory_section_attributes(section,
> > +                                                        to_private ? 
> > KVM_MEMORY_ATTRIBUTE_PRIVATE
> > +                                                                   : 0);
> >       } else {
> > -        ret = kvm_set_memory_attributes_shared(start, size);
> > +        /*
> > +         * Without in-place conversion, attribute-tracking is handled by 
> > KVM
> > +         * across all guest memory rather than on a per-section/slot basis.
> > +         */
> > +        ret = kvm_set_memory_attributes(start, size,
> > +                                        to_private ? 
> > KVM_MEMORY_ATTRIBUTE_PRIVATE
> > +                                                   : 0);
> >       }
> >       return ret;
> > @@ -3544,7 +3616,8 @@ static int 
> > kvm_post_convert_section(MemoryRegionSection *section, bool to_privat
> >       return 0;
> >   }
> > -int kvm_convert_memory(hwaddr start, hwaddr size, bool to_private)
> > +static int kvm_convert_memory_full(hwaddr start, hwaddr size, bool 
> > to_private,
> > +                                   bool pre_hooks, bool post_hooks)
> >   {
> >       int ret = -EINVAL;
> > @@ -3588,10 +3661,12 @@ int kvm_convert_memory(hwaddr start, hwaddr size, 
> > bool to_private)
> >               continue;
> >           }
> > -        ret = kvm_pre_convert_section(&section, to_private);
> > -        if (ret) {
> > -            memory_region_unref(section.mr);
> > -            break;
> > +        if (pre_hooks) {
> > +            ret = kvm_pre_convert_section(&section, to_private);
> > +            if (ret) {
> > +                memory_region_unref(section.mr);
> > +                break;
> > +            }
> >           }
> >           ret = kvm_convert_section(&section, to_private);
> > @@ -3600,13 +3675,15 @@ int kvm_convert_memory(hwaddr start, hwaddr size, 
> > bool to_private)
> >               break;
> >           }
> > -        ret = kvm_post_convert_section(&section, to_private);
> > -        memory_region_unref(section.mr);
> > -
> > -        if (ret) {
> > -            break;
> > +        if (post_hooks) {
> > +            ret = kvm_post_convert_section(&section, to_private);
> > +            if (ret) {
> > +                memory_region_unref(section.mr);
> > +                break;
> > +            }
> >           }
> > +        memory_region_unref(section.mr);
> >           size -= section_end - start;
> >           start = section_end;
> >       }
> > @@ -3614,6 +3691,26 @@ int kvm_convert_memory(hwaddr start, hwaddr size, 
> > bool to_private)
> >       return ret;
> >   }
> > +int kvm_convert_memory(hwaddr start, hwaddr size, bool to_private)
> > +{
> > +    return kvm_convert_memory_full(start, size, to_private, true, true);
> > +}
> > +
> > +static int kvm_convert_memory_attributes(hwaddr start, hwaddr size, bool 
> > to_private)
> > +{
> > +    return kvm_convert_memory_full(start, size, to_private, false, false);
> > +}
> > +
> > +int kvm_set_memory_attributes_private(hwaddr start, uint64_t size)
> > +{
> > +    return kvm_convert_memory_attributes(start, size, 
> > KVM_MEMORY_ATTRIBUTE_PRIVATE);
> > +}
> > +
> > +int kvm_set_memory_attributes_shared(hwaddr start, uint64_t size)
> > +{
> > +    return kvm_convert_memory_attributes(start, size, 0);
> > +}
> > +
> >   int kvm_cpu_exec(CPUState *cpu)
> >   {
> >       struct kvm_run *run = cpu->kvm_run;

Reply via email to