On Mon, 4 Nov 2024 12:21:30 +0200
Yishai Hadas <yish...@nvidia.com> wrote:

> Add PRE_COPY support for live migration.
> 
> This functionality may reduce the downtime upon STOP_COPY as of letting
> the target machine to get some 'initial data' from the source once the
> machine is still in its RUNNING state and let it prepares itself
> pre-ahead to get the final STOP_COPY data.
> 
> As the Virtio specification does not support reading partial or
> incremental device contexts. This means that during the PRE_COPY state,
> the vfio-virtio driver reads the full device state.
> 
> As the device state can be changed and the benefit is highest when the
> pre copy data closely matches the final data we read it in a rate
> limiter mode and reporting no data available for some time interval
> after the previous call.
> 
> With PRE_COPY enabled, we observed a downtime reduction of approximately
> 70-75% in various scenarios compared to when PRE_COPY was disabled,
> while keeping the total migration time nearly the same.
> 
> Signed-off-by: Yishai Hadas <yish...@nvidia.com>
> ---
>  drivers/vfio/pci/virtio/common.h  |   4 +
>  drivers/vfio/pci/virtio/migrate.c | 233 +++++++++++++++++++++++++++++-
>  2 files changed, 229 insertions(+), 8 deletions(-)
> 
> diff --git a/drivers/vfio/pci/virtio/common.h 
> b/drivers/vfio/pci/virtio/common.h
> index 3bdfb3ea1174..5704603f0f9d 100644
> --- a/drivers/vfio/pci/virtio/common.h
> +++ b/drivers/vfio/pci/virtio/common.h
> @@ -10,6 +10,8 @@
>  
>  enum virtiovf_migf_state {
>       VIRTIOVF_MIGF_STATE_ERROR = 1,
> +     VIRTIOVF_MIGF_STATE_PRECOPY = 2,
> +     VIRTIOVF_MIGF_STATE_COMPLETE = 3,
>  };
>  
>  enum virtiovf_load_state {
> @@ -57,6 +59,8 @@ struct virtiovf_migration_file {
>       /* synchronize access to the file state */
>       struct mutex lock;
>       loff_t max_pos;
> +     u64 pre_copy_initial_bytes;
> +     struct ratelimit_state pre_copy_rl_state;
>       u64 record_size;
>       u32 record_tag;
>       u8 has_obj_id:1;
> diff --git a/drivers/vfio/pci/virtio/migrate.c 
> b/drivers/vfio/pci/virtio/migrate.c
> index 2a9614c2ef07..cdb252f6fd80 100644
> --- a/drivers/vfio/pci/virtio/migrate.c
> +++ b/drivers/vfio/pci/virtio/migrate.c
...
> @@ -379,9 +432,104 @@ static ssize_t virtiovf_save_read(struct file *filp, 
> char __user *buf, size_t le
>       return done;
>  }
>  
> +static long virtiovf_precopy_ioctl(struct file *filp, unsigned int cmd,
> +                                unsigned long arg)
> +{
> +     struct virtiovf_migration_file *migf = filp->private_data;
> +     struct virtiovf_pci_core_device *virtvdev = migf->virtvdev;
> +     struct vfio_precopy_info info = {};
> +     loff_t *pos = &filp->f_pos;
> +     bool end_of_data = false;
> +     unsigned long minsz;
> +     u32 ctx_size;
> +     int ret;
> +
> +     if (cmd != VFIO_MIG_GET_PRECOPY_INFO)
> +             return -ENOTTY;
> +
> +     minsz = offsetofend(struct vfio_precopy_info, dirty_bytes);
> +     if (copy_from_user(&info, (void __user *)arg, minsz))
> +             return -EFAULT;
> +
> +     if (info.argsz < minsz)
> +             return -EINVAL;
> +
> +     mutex_lock(&virtvdev->state_mutex);
> +     if (virtvdev->mig_state != VFIO_DEVICE_STATE_PRE_COPY &&
> +         virtvdev->mig_state != VFIO_DEVICE_STATE_PRE_COPY_P2P) {
> +             ret = -EINVAL;
> +             goto err_state_unlock;
> +     }
> +
> +     /*
> +      * The virtio specification does not include a PRE_COPY concept.
> +      * Since we can expect the data to remain the same for a certain period,
> +      * we use a rate limiter mechanism before making a call to the device.
> +      */
> +     if (!__ratelimit(&migf->pre_copy_rl_state)) {
> +             /* Reporting no data available */
> +             ret = 0;
> +             goto done;

@ret is not used by the done: goto target.  Don't we need to zero dirty
bytes, or account for initial bytes not being fully read yet?

> +     }
> +
> +     ret = 
> virtio_pci_admin_dev_parts_metadata_get(virtvdev->core_device.pdev,
> +                             VIRTIO_RESOURCE_OBJ_DEV_PARTS, migf->obj_id,
> +                             VIRTIO_ADMIN_CMD_DEV_PARTS_METADATA_TYPE_SIZE,
> +                             &ctx_size);
> +     if (ret)
> +             goto err_state_unlock;
> +
> +     mutex_lock(&migf->lock);
> +     if (migf->state == VIRTIOVF_MIGF_STATE_ERROR) {
> +             ret = -ENODEV;
> +             goto err_migf_unlock;
> +     }
> +
> +     if (migf->pre_copy_initial_bytes > *pos) {
> +             info.initial_bytes = migf->pre_copy_initial_bytes - *pos;
> +     } else {
> +             info.dirty_bytes = migf->max_pos - *pos;
> +             if (!info.dirty_bytes)
> +                     end_of_data = true;
> +             info.dirty_bytes += ctx_size;
> +     }
> +
> +     if (!end_of_data || !ctx_size) {
> +             mutex_unlock(&migf->lock);
> +             goto done;
> +     }
> +
> +     mutex_unlock(&migf->lock);
> +     /*
> +      * We finished transferring the current state and the device has a
> +      * dirty state, read a new state.
> +      */
> +     ret = virtiovf_read_device_context_chunk(migf, ctx_size);
> +     if (ret)
> +             /*
> +              * The machine is running, and context size could be grow, so 
> no reason to mark
> +              * the device state as VIRTIOVF_MIGF_STATE_ERROR.
> +              */
> +             goto err_state_unlock;
> +
> +done:
> +     virtiovf_state_mutex_unlock(virtvdev);
> +     if (copy_to_user((void __user *)arg, &info, minsz))
> +             return -EFAULT;
> +     return 0;
> +
> +err_migf_unlock:
> +     mutex_unlock(&migf->lock);
> +err_state_unlock:
> +     virtiovf_state_mutex_unlock(virtvdev);
> +     return ret;
> +}
> +
...
> @@ -536,6 +719,17 @@ virtiovf_pci_save_device_data(struct 
> virtiovf_pci_core_device *virtvdev)
>       if (ret)
>               goto out_clean;
>  
> +     if (pre_copy) {
> +             migf->pre_copy_initial_bytes = migf->max_pos;
> +             ratelimit_state_init(&migf->pre_copy_rl_state, 1 * HZ, 1);

A comment describing the choice of this heuristic would be useful for
future maintenance, even if the comment is "Arbitrarily rate limit
pre-copy to 1s intervals."  Thanks,

Alex

> +             /* Prevent any rate messages upon its usage */
> +             ratelimit_set_flags(&migf->pre_copy_rl_state,
> +                                 RATELIMIT_MSG_ON_RELEASE);
> +             migf->state = VIRTIOVF_MIGF_STATE_PRECOPY;
> +     } else {
> +             migf->state = VIRTIOVF_MIGF_STATE_COMPLETE;
> +     }
> +
>       return migf;
>  
>  out_clean:


Reply via email to