On Fri, Apr 17, 2026 at 05:08:31PM +0800, JiaJia wrote:
> virtio_device_restore() resets the device and restores the negotiated
> features before calling ->restore(). viortc_freeze() intentionally
> leaves the existing virtqueues in place so the alarm queue can still
> wake the system, but viortc_restore() immediately calls
> viortc_init_vqs() without first deleting those old queues.
>
> If virtqueue reinitialization fails on virtio-pci, the transport error
> path runs vp_del_vqs() against a new vp_dev->vqs array while vdev->vqs
> still holds the old virtqueues. vp_del_vqs() then looks up queue state
> through that new array and can dereference a NULL info pointer in
> vp_del_vq(), crashing the guest kernel during restore.
>
> Delete any old virtqueues before rebuilding them, and clear the cached
> queue pointers so later message requests fail cleanly if restore does
> not complete.
>
> Signed-off-by: JiaJia <[email protected]>
Thanks for identifying the bug and providing a fix! I had a look at the
bugfix and I added some comments below on how I think the fix could be
improved.
> ---
> Runtime evidence from a standard hibernation test_resume run on a guest
> with virtio-rtc-pci (no fault injection):
>
> KASAN null-ptr-deref report in vp_del_vq.isra.0+0x70/0xd0
>
> Call trace:
> vp_del_vq.isra.0+0x70/0xd0
> vp_del_vqs+0xec/0x358
> vp_find_vqs_msix+0x24c/0x6a8
> vp_find_vqs+0x88/0x360
> vp_modern_find_vqs+0x20/0x90
> viortc_init_vqs+0xd0/0x128
> viortc_restore+0x28/0xb8
> virtio_device_restore_priv+0x184/0x288
> virtio_pci_restore+0x44/0x58
>
> drivers/virtio/virtio_rtc_driver.c | 38 ++++++++++++++++++++++++++----
> 1 file changed, 33 insertions(+), 5 deletions(-)
>
> diff --git a/drivers/virtio/virtio_rtc_driver.c
> b/drivers/virtio/virtio_rtc_driver.c
> index a57d5e06e..c1adde27d 100644
> --- a/drivers/virtio/virtio_rtc_driver.c
> +++ b/drivers/virtio/virtio_rtc_driver.c
> @@ -427,6 +427,15 @@ static int viortc_msg_xfer(struct viortc_vq *vq, struct
> viortc_msg *msg,
> sg_init_one(out_sg, msg->req, msg->req_size);
> sg_init_one(in_sg, msg->resp, msg->resp_cap);
>
> + if (!vq->vq) {
> + /*
> + * Keep runtime interfaces in a safe failed state if restore
> + * teardown removed the virtqueues.
> + */
> + viortc_msg_release(msg);
> + return -ENODEV;
> + }
> +
> spin_lock_irqsave(&vq->lock, flags);
>
> ret = virtqueue_add_sgs(vq->vq, sgs, 1, 1, msg, GFP_ATOMIC);
> @@ -478,6 +487,17 @@ static int viortc_msg_xfer(struct viortc_vq *vq, struct
> viortc_msg *msg,
> return 0;
> }
>
> +static void viortc_del_vqs(struct viortc_dev *viortc)
> +{
> + struct virtio_device *vdev = viortc->vdev;
> + unsigned int i;
> +
> + vdev->config->del_vqs(vdev);
> +
> + for (i = 0; i < ARRAY_SIZE(viortc->vqs); i++)
> + viortc->vqs[i].vq = NULL;
> +}
> +
If keeping this (see my comment at the bottom), it should go further
down, e.g. above viortc_probe().
> /*
> * common message handle macros for messages of different types
> */
> @@ -1316,7 +1336,7 @@ static int viortc_probe(struct virtio_device *vdev)
In order to delete virtqueues consistently everywhere, I think it makes
sense to also "goto err_reset_vdev;" if viortc_init_vqs() fails in
viortc_probe().
>
> err_reset_vdev:
> virtio_reset_device(vdev);
> - vdev->config->del_vqs(vdev);
> + viortc_del_vqs(viortc);
>
> return ret;
> }
> @@ -1332,7 +1352,7 @@ static void viortc_remove(struct virtio_device *vdev)
> viortc_clocks_deinit(viortc);
>
> virtio_reset_device(vdev);
> - vdev->config->del_vqs(vdev);
> + viortc_del_vqs(viortc);
> }
>
> static int viortc_freeze(struct virtio_device *dev)
> @@ -1353,9 +1373,11 @@ static int viortc_restore(struct virtio_device *dev)
> bool notify = false;
> int ret;
>
> + viortc_del_vqs(viortc);
> +
> ret = viortc_init_vqs(viortc);
> if (ret)
> - return ret;
> + goto err_del_vqs;
>
> alarm_viortc_vq = &viortc->vqs[VIORTC_ALARMQ];
> alarm_vq = alarm_viortc_vq->vq;
> @@ -1364,16 +1386,22 @@ static int viortc_restore(struct virtio_device *dev)
> ret = viortc_populate_vq(viortc, alarm_viortc_vq,
> VIORTC_ALARMQ_BUF_CAP, false);
> if (ret)
> - return ret;
> + goto err_del_vqs;
>
> notify = virtqueue_kick_prepare(alarm_vq);
> }
>
> virtio_device_ready(dev);
>
> - if (notify && !virtqueue_notify(alarm_vq))
> + if (notify && !virtqueue_notify(alarm_vq)) {
> ret = -EIO;
> + goto err_del_vqs;
> + }
> +
> + return ret;
>
> +err_del_vqs:
> + viortc_del_vqs(viortc);
This looks racy to me, since we can end up here after
virtio_device_ready(), when viortc_cb_alarmq() could run. But after
virtio_device_ready(), deleting the virtqueues does not seem to be
required. So directly returning an error after virtio_device_ready()
should work.
Alternatively, I'd suggest calling the logic from viortc_remove()
instead of just viortc_del_vqs() (factored out as __viortc_remove()).
This would have the benefit of using the same logic to stop the device
in both cases.
This should also make the viortc_del_vqs() wrapper and vq->vq check
unnecessary due to viortc_clocks_deinit() having been called.
The tradeoff would be that self-healing on another restore would not be
possible any more.
Best regards,
Peter
> return ret;
> }
>
> --
> 2.34.1