Ackerley Tng <[email protected]> writes:

> Tarun Sahu <[email protected]> writes:
>
>> Introduce core infrastructure to support VM preservation with LUO.
>>
>> First two changes are just refactoring, no functional change, third
>> change introduces a new member in struct kvm.
>> - Move ITOA_MAX_LEN to kvm_mm.h for reuse by upcoming kvm_luo code.
>> - Add a public kvm_create_vm_file() helper wrapping kvm_create_vm()
>>   and anon_inode_getfile() to provide a unified VM file creation API.
>> - Track a weak reference to the backing file in struct kvm under
>>   CONFIG_LIVEUPDATE_GUEST_MEMFD to enable reverse file resolution
>>   without circular lifetime dependencies.
>>
>
> Given the above, I think this should be separate patches.
>
>> Signed-off-by: Tarun Sahu <[email protected]>
>> ---
>>  include/linux/kvm_host.h | 14 +++++++
>>  virt/kvm/kvm_main.c      | 79 +++++++++++++++++++++++++++++-----------
>>  virt/kvm/kvm_mm.h        |  3 ++
>>  3 files changed, 75 insertions(+), 21 deletions(-)
>>
>> diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
>> index 4c14aee1fb06..9111a28637af 100644
>> --- a/include/linux/kvm_host.h
>> +++ b/include/linux/kvm_host.h
>> @@ -874,6 +874,18 @@ struct kvm {
>>  #ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES
>>      /* Protected by slots_lock (for writes) and RCU (for reads) */
>>      struct xarray mem_attr_array;
>> +#endif
>> +#ifdef CONFIG_LIVEUPDATE_GUEST_MEMFD
>> +    /*
>> +     * Weak reference to the VFS file backing this KVM instance. Stored
>> +     * without incrementing the file refcount to prevent a circular lifetime
>> +     * dependency (since file->private_data already pins this struct kvm).
>> +     * Used exclusively to resolve the file pointer back from struct kvm.
>> +     *
>> +     * Written/cleared via rcu_assign_pointer() and read locklessly under
>> +     * RCU (e.g. via get_file_active() to prevent ABA races).
>> +     */
>> +    struct file *vm_file;
>>  #endif
>
> We didn't really talk about this during the calls, but it seems weird to
> preserve a vm_file with pretty much nothing other than the vm type. The
> entire VM is re-created, which means it could potentially be a
> completely different VM?
>
> In some sense it's more flexible since the guest_memfd can be restored
> with some completely different VM, but it seems like it could introduce
> other issues.
>
> I think other KVM folks would probably have more thoughts here.
>
>>      char stats_id[KVM_STATS_NAME_SIZE];
>>  };
>> @@ -1074,7 +1086,9 @@ void kvm_get_kvm(struct kvm *kvm);
>>  bool kvm_get_kvm_safe(struct kvm *kvm);
>>  void kvm_put_kvm(struct kvm *kvm);
>>  bool file_is_kvm(struct file *file);
>> +struct file *kvm_create_vm_file(unsigned long type, const char *fdname);
>>  void kvm_put_kvm_no_destroy(struct kvm *kvm);
>> +void kvm_uevent_notify_vm_create(struct kvm *kvm);
>>
>>  static inline struct kvm_memslots *__kvm_memslots(struct kvm *kvm, int 
>> as_id)
>>  {
>> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
>> index 89489996fbc1..65f0c5fb353e 100644
>> --- a/virt/kvm/kvm_main.c
>> +++ b/virt/kvm/kvm_main.c
>> @@ -67,9 +67,6 @@
>>  #include <linux/kvm_dirty_ring.h>
>>
>>
>> -/* Worst case buffer size needed for holding an integer. */
>> -#define ITOA_MAX_LEN 12
>> -
>>  MODULE_AUTHOR("Qumranet");
>>  MODULE_DESCRIPTION("Kernel-based Virtual Machine (KVM) Hypervisor");
>>  MODULE_LICENSE("GPL");
>> @@ -1349,6 +1346,19 @@ static int kvm_vm_release(struct inode *inode, struct 
>> file *filp)
>>  {
>>      struct kvm *kvm = filp->private_data;
>>
>> +#ifdef CONFIG_LIVEUPDATE_GUEST_MEMFD
>> +    /*
>> +     * Clear the weak reference of the vm file.
>> +     * In case vm file is closed by userspace, but kvm still has
>> +     * other users like vCPUs, clearing this pointer ensures
>> +     * that we don't have a dangling pointer to a closed file.
>> +     *
>> +     * Cleared via rcu_assign_pointer() to ensure proper memory visibility
>> +     * for concurrent lockless readers under RCU.
>> +     */
>> +    rcu_assign_pointer(kvm->vm_file, NULL);
>> +#endif
>> +
>>      kvm_irqfd_release(kvm);
>>
>>      kvm_put_kvm(kvm);
>> @@ -5476,11 +5486,47 @@ bool file_is_kvm(struct file *file)
>>  }
>>  EXPORT_SYMBOL_FOR_KVM_INTERNAL(file_is_kvm);
>>
>> +struct file *kvm_create_vm_file(unsigned long type, const char *fdname)
>> +{
>> +    struct kvm *kvm = kvm_create_vm(type, fdname);
>> +    struct file *file;
>> +
>> +    if (IS_ERR(kvm))
>> +            return ERR_CAST(kvm);
>> +
>> +    file = anon_inode_getfile("kvm-vm", &kvm_vm_fops, kvm, O_RDWR);
>> +    if (IS_ERR(file)) {
>> +            kvm_put_kvm(kvm);
>> +            return file;
>> +    }
>> +
>> +#ifdef CONFIG_LIVEUPDATE_GUEST_MEMFD
>> +    /*
>> +     * Weak reference to the file (without get_file()) to prevent a circular
>> +     * dependency. Safe because the file's release path clears this pointer
>> +     * and drops its reference to the VM.
>> +     *
>> +     * Written via rcu_assign_pointer() because the pointer can be read
>> +     * locklessly under RCU (e.g., in kvm_gmem_luo_preserve() via
>> +     * get_file_active() to prevent lockless ABA races).
>> +     */
>> +    rcu_assign_pointer(kvm->vm_file, file);
>> +#endif
>> +
>> +    /*
>> +     * Don't call kvm_put_kvm anymore at this point; file->f_op is
>> +     * already set, with ->release() being kvm_vm_release().  In error
>> +     * cases it will be called by the final fput(file) and will take
>> +     * care of doing kvm_put_kvm(kvm).
>> +     */
>> +
>> +    return file;
>> +}
>> +
>>  static int kvm_dev_ioctl_create_vm(unsigned long type)
>>  {
>>      char fdname[ITOA_MAX_LEN + 1];
>>      int r, fd;
>> -    struct kvm *kvm;
>>      struct file *file;
>>
>>      fd = get_unused_fd_flags(O_CLOEXEC);
>> @@ -5489,31 +5535,17 @@ static int kvm_dev_ioctl_create_vm(unsigned long 
>> type)
>>
>>      snprintf(fdname, sizeof(fdname), "%d", fd);
>>
>> -    kvm = kvm_create_vm(type, fdname);
>> -    if (IS_ERR(kvm)) {
>> -            r = PTR_ERR(kvm);
>> -            goto put_fd;
>> -    }
>> -
>> -    file = anon_inode_getfile("kvm-vm", &kvm_vm_fops, kvm, O_RDWR);
>> +    file = kvm_create_vm_file(type, fdname);
>>      if (IS_ERR(file)) {
>>              r = PTR_ERR(file);
>> -            goto put_kvm;
>> +            goto put_fd;
>>      }
>>
>> -    /*
>> -     * Don't call kvm_put_kvm anymore at this point; file->f_op is
>> -     * already set, with ->release() being kvm_vm_release().  In error
>> -     * cases it will be called by the final fput(file) and will take
>> -     * care of doing kvm_put_kvm(kvm).
>> -     */
>> -    kvm_uevent_notify_change(KVM_EVENT_CREATE_VM, kvm);
>> +    kvm_uevent_notify_change(KVM_EVENT_CREATE_VM, file->private_data);
>
> Notifying with file->private_data threw me off... I would rather inline
> the rcu_assign_pointer() in this function and have this line read
> notify(..., kvm) like before.

Yes, I will update it to:

     struct kvm *kvm;
     ...
     kvm = file->private_data;
     notify (..., kvm);

About rcu_assign_pointer, I am not sure, IIUC. that only set the
kvm->vm_file pointer, which does not have any relation with
file->private_data. And keeping the rcu_assign_pointer(kvm->vm_file,
file) at the current place (inside kvm_create_vm_file) logically makes
sense. because, kvm_create_vm_file creates the struct kvm and vm_file,
So, all the relation variables should get updated there.

>
>>
>>      fd_install(fd, file);
>>      return fd;
>>
>> -put_kvm:
>> -    kvm_put_kvm(kvm);
>>  put_fd:
>>      put_unused_fd(fd);
>>      return r;
>> @@ -6341,6 +6373,11 @@ static void kvm_uevent_notify_change(unsigned int 
>> type, struct kvm *kvm)
>>      kfree(env);
>>  }
>>
>> +void kvm_uevent_notify_vm_create(struct kvm *kvm)
>> +{
>> +    kvm_uevent_notify_change(KVM_EVENT_CREATE_VM, kvm);
>> +}
>> +
>>  static void kvm_init_debug(void)
>>  {
>>      const struct file_operations *fops;
>> diff --git a/virt/kvm/kvm_mm.h b/virt/kvm/kvm_mm.h
>> index 9fcc5d5b7f8d..7aa1d65c3d46 100644
>> --- a/virt/kvm/kvm_mm.h
>> +++ b/virt/kvm/kvm_mm.h
>> @@ -3,6 +3,9 @@
>>  #ifndef __KVM_MM_H__
>>  #define __KVM_MM_H__ 1
>>
>> +/* Worst case buffer size needed for holding an integer as a string. */
>> +#define ITOA_MAX_LEN 12
>> +
>>  /*
>>   * Architectures can choose whether to use an rwlock or spinlock
>>   * for the mmu_lock.  These macros, for use in common code
>> --
>> 2.54.0.1032.g2f8565e1d1-goog

Reply via email to