On 11/30/23 17:25, Stefan Hajnoczi wrote:
> On Wed, Nov 29, 2023 at 11:01:33PM -0800, Chaitanya Kulkarni wrote:
>> Improve block layer request handling by implementing a timeout handler.
>> Current implementation assums that request will never timeout and will
>> be completed by underlaying transport. However, this assumption can
>> cause issues under heavy load especially when dealing with different
>> subsystems and real hardware.
>>
>> To solve this, add a block layer request timeout handler that will
>> complete timed-out requests in the same context if the virtio device
>> has a VIRTIO_CONFIG_S_DRIVER_OK status. If the device has any other
>> status, we'll stop the block layer request queue and proceed with the
>> teardown sequence, allowing applications waiting for I/O to exit
>> gracefully with appropriate error.
>>
>> Also, add two new module parameters that allows user to specify the
>> I/O timeout for the tagset when allocating the disk and a teardown limit
>> for the timed out requeets before we initiate device teardown from the
>> timeout handler. These changes will improve the stability and
>> reliability of our system under request timeout scenario.
>>
>> Signed-off-by: Chaitanya Kulkarni <k...@nvidia.com>
>> ---
>>   drivers/block/virtio_blk.c      | 122 ++++++++++++++++++++++++++++++++
>>   include/uapi/linux/virtio_blk.h |   1 +
>>   2 files changed, 123 insertions(+)
> Hi,
> This looks interesting. There was a discussion about implementing
> ->timeout() in September:
> https://lore.kernel.org/io-uring/20230926145526.GA387951@fedora/

Thanks for pointing that out ...

>> diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c
>> index 4689ac2e0c0e..da26c2bf933b 100644
>> --- a/drivers/block/virtio_blk.c
>> +++ b/drivers/block/virtio_blk.c
>> @@ -16,6 +16,7 @@
>>   #include <linux/blk-mq-virtio.h>
>>   #include <linux/numa.h>
>>   #include <linux/vmalloc.h>
>> +#include <linux/xarray.h>
>>   #include <uapi/linux/virtio_ring.h>
>>   
>>   #define PART_BITS 4
>> @@ -31,6 +32,15 @@
>>   #define VIRTIO_BLK_INLINE_SG_CNT   2
>>   #endif
>>   
>> +static unsigned int io_timeout = 20;
>> +module_param(io_timeout, uint, 0644);
>> +MODULE_PARM_DESC(io_timeout, "timeout in seconds for I/O requests. 
>> Default:20");
>> +
>> +static unsigned int timeout_teardown_limit = 2;
>> +module_param(timeout_teardown_limit, uint, 0644);
>> +MODULE_PARM_DESC(timeout_teardown_limit,
>> +            "request timeout teardown limit for stable dev. Default:2");
>> +
>>   static unsigned int num_request_queues;
>>   module_param(num_request_queues, uint, 0644);
>>   MODULE_PARM_DESC(num_request_queues,
>> @@ -84,6 +94,20 @@ struct virtio_blk {
>>   
>>      /* For zoned device */
>>      unsigned int zone_sectors;
>> +
>> +    /*
>> +     * Block layer Request timeout teardown limit when device is in the
>> +     * stable state, i.e. it has VIRTIO_CONFIG_S_DRIVER_OK value for its
>> +     * config status. Once this limit is reached issue
>> +     * virtblk_teardown_work to teardown the device in the block lyaer
>> +     * request timeout callback.
>> +     */
>> +    atomic_t rq_timeout_count;
>> +    /* avoid tear down race between remove and teardown work */
>> +    struct mutex teardown_mutex;
>> +    /* tear down work to be scheduled from block layer request handler */
>> +    struct work_struct teardown_work;
>> +
>>   };
>>   
>>   struct virtblk_req {
>> @@ -117,6 +141,8 @@ static inline blk_status_t virtblk_result(u8 status)
>>      case VIRTIO_BLK_S_OK:
>>              return BLK_STS_OK;
>>      case VIRTIO_BLK_S_UNSUPP:
>> +    case VIRTIO_BLK_S_TIMEOUT:
>> +            return BLK_STS_TIMEOUT;
>>              return BLK_STS_NOTSUPP;
>>      case VIRTIO_BLK_S_ZONE_OPEN_RESOURCE:
>>              return BLK_STS_ZONE_OPEN_RESOURCE;
>> @@ -926,6 +952,7 @@ static void virtblk_free_disk(struct gendisk *disk)
>>      struct virtio_blk *vblk = disk->private_data;
>>   
>>      ida_free(&vd_index_ida, vblk->index);
>> +    mutex_destroy(&vblk->teardown_mutex);
>>      mutex_destroy(&vblk->vdev_mutex);
>>      kfree(vblk);
>>   }
>> @@ -1287,6 +1314,86 @@ static int virtblk_poll(struct blk_mq_hw_ctx *hctx, 
>> struct io_comp_batch *iob)
>>      return found;
>>   }
>>   
>> +static bool virtblk_cancel_request(struct request *rq, void *data)
>> +{
>> +    struct virtblk_req *vbr = blk_mq_rq_to_pdu(rq);
>> +
>> +    vbr->in_hdr.status = VIRTIO_BLK_S_TIMEOUT;
>> +    if (blk_mq_request_started(rq) && !blk_mq_request_completed(rq))
>> +            blk_mq_complete_request(rq);
>> +
>> +    return true;
>> +}
>> +
>> +static void virtblk_teardown_work(struct work_struct *w)
>> +{
>> +    struct virtio_blk *vblk =
>> +            container_of(w, struct virtio_blk, teardown_work);
>> +    struct request_queue *q = vblk->disk->queue;
>> +    struct virtio_device *vdev = vblk->vdev;
>> +    struct blk_mq_hw_ctx *hctx;
>> +    unsigned long idx;
>> +
>> +    mutex_lock(&vblk->teardown_mutex);
>> +    if (!vblk->vdev)
>> +            goto unlock;
>> +
>> +    blk_mq_quiesce_queue(q);
>> +
>> +    /* Process any outstanding request from device. */
>> +    xa_for_each(&q->hctx_table, idx, hctx)
>> +            virtblk_poll(hctx, NULL);
>> +
>> +    blk_sync_queue(q);
>> +    blk_mq_tagset_busy_iter(&vblk->tag_set, virtblk_cancel_request, vblk);
>> +    blk_mq_tagset_wait_completed_request(&vblk->tag_set);
>> +
>> +    /*
>> +     * Unblock any pending dispatch I/Os before we destroy device. From
>> +     * del_gendisk() -> __blk_mark_disk_dead(disk) will set GD_DEAD flag,
>> +     * that will make sure any new I/O from bio_queue_enter() to fail.
>> +     */
>> +    blk_mq_unquiesce_queue(q);
>> +    del_gendisk(vblk->disk);
>> +    blk_mq_free_tag_set(&vblk->tag_set);
>> +
>> +    mutex_lock(&vblk->vdev_mutex);
>> +    flush_work(&vblk->config_work);
>> +
>> +    virtio_reset_device(vdev);
>> +
>> +    vblk->vdev = NULL;
>> +
>> +    vdev->config->del_vqs(vdev);
>> +    kfree(vblk->vqs);
>> +
>> +    mutex_unlock(&vblk->vdev_mutex);
>> +
>> +    put_disk(vblk->disk);
>> +
>> +unlock:
>> +    mutex_unlock(&vblk->teardown_mutex);
>> +}
>> +
>> +static enum blk_eh_timer_return virtblk_timeout(struct request *req)
>> +{
>> +    struct virtio_blk *vblk = req->mq_hctx->queue->queuedata;
>> +    struct virtio_device *vdev = vblk->vdev;
>> +    bool ok = vdev->config->get_status(vdev) & VIRTIO_CONFIG_S_DRIVER_OK;
> Please check VIRTIO_CONFIG_S_NEEDS_RESET first. When
> VIRTIO_CONFIG_S_NEEDS_RESET is set the device is not ok, regardless of
> whether DRIVER_OK is set. See 2.1.1 Driver Requirements: Device Status
> Field in the VIRTIO specification.

okay will add that to next version ...

>
>> +
>> +    if ((atomic_dec_return(&vblk->rq_timeout_count) != 0) && ok) {
>> +            virtblk_cancel_request(req, NULL);
> The driver cannot abandon the request here because the device still owns
> the request:
>
> 1. I/O buffer memory is corrupted for READ requests and disk contents
>     are corrupted for WRITE requests when the device does finally process
>     the request.
>
> 2. After virtblk_timeout() -> virtblk_cancel_request() ->
>     blk_mq_complete_request(), a freed address is returned from
>     virtblk_done() -> virtqueue_get_buf() -> blk_mq_rq_from_pdu() when
>     the device finally completes the request.
>
> I suggest the following:
>
> (Optional) Add an ABORT/CANCEL request type to the VIRTIO specification
> and use it to safely cancel requests when the device supports it.

I really want to keep this simple without introducing a new command
so we don't have to modify the underlying transport to handle that.

> Otherwise reset the device so that all pending requests are canceled.

will issue reset here and see if that works ...

>> +            return BLK_EH_DONE;
>> +    }
>> +
>> +    dev_err(&vdev->dev, "%s:%s initiating teardown\n", __func__,
>> +            vblk->disk->disk_name);
>> +
>> +    queue_work(virtblk_wq, &vblk->teardown_work);
>> +
>> +    return BLK_EH_RESET_TIMER;
>> +}
>> +
>>   static const struct blk_mq_ops virtio_mq_ops = {
>>      .queue_rq       = virtio_queue_rq,
>>      .queue_rqs      = virtio_queue_rqs,
>> @@ -1294,6 +1401,7 @@ static const struct blk_mq_ops virtio_mq_ops = {
>>      .complete       = virtblk_request_done,
>>      .map_queues     = virtblk_map_queues,
>>      .poll           = virtblk_poll,
>> +    .timeout        = virtblk_timeout,
>>   };
>>   
>>   static unsigned int virtblk_queue_depth;
>> @@ -1365,6 +1473,7 @@ static int virtblk_probe(struct virtio_device *vdev)
>>      memset(&vblk->tag_set, 0, sizeof(vblk->tag_set));
>>      vblk->tag_set.ops = &virtio_mq_ops;
>>      vblk->tag_set.queue_depth = queue_depth;
>> +    vblk->tag_set.timeout = io_timeout * HZ;
>>      vblk->tag_set.numa_node = NUMA_NO_NODE;
>>      vblk->tag_set.flags = BLK_MQ_F_SHOULD_MERGE;
>>      vblk->tag_set.cmd_size =
>> @@ -1387,6 +1496,10 @@ static int virtblk_probe(struct virtio_device *vdev)
>>      }
>>      q = vblk->disk->queue;
>>   
>> +    mutex_init(&vblk->teardown_mutex);
>> +    INIT_WORK(&vblk->teardown_work, virtblk_teardown_work);
>> +    atomic_set(&vblk->rq_timeout_count, timeout_teardown_limit);
>> +
>>      virtblk_name_format("vd", index, vblk->disk->disk_name, DISK_NAME_LEN);
>>   
>>      vblk->disk->major = major;
>> @@ -1598,6 +1711,12 @@ static void virtblk_remove(struct virtio_device *vdev)
>>   {
>>      struct virtio_blk *vblk = vdev->priv;
>>   
>> +    mutex_lock(&vblk->teardown_mutex);
>> +
>> +    /* we did the cleanup in the timeout handler */
>> +    if (!vblk->vdev)
>> +            goto unlock;
>> +
>>      /* Make sure no work handler is accessing the device. */
>>      flush_work(&vblk->config_work);
>>   
>> @@ -1618,6 +1737,9 @@ static void virtblk_remove(struct virtio_device *vdev)
>>      mutex_unlock(&vblk->vdev_mutex);
>>   
>>      put_disk(vblk->disk);
>> +
>> +unlock:
>> +    mutex_unlock(&vblk->teardown_mutex);
>>   }
>>   
>>   #ifdef CONFIG_PM_SLEEP
>> diff --git a/include/uapi/linux/virtio_blk.h 
>> b/include/uapi/linux/virtio_blk.h
>> index 3744e4da1b2a..ed864195ab26 100644
>> --- a/include/uapi/linux/virtio_blk.h
>> +++ b/include/uapi/linux/virtio_blk.h
>> @@ -317,6 +317,7 @@ struct virtio_scsi_inhdr {
>>   #define VIRTIO_BLK_S_OK            0
>>   #define VIRTIO_BLK_S_IOERR 1
>>   #define VIRTIO_BLK_S_UNSUPP        2
>> +#define VIRTIO_BLK_S_TIMEOUT        3
> The structs and constants in this header file come from the VIRTIO
> specification. Anything changed in this file must first be accepted into
> the VIRTIO spec because this is the hardware interface definition.
>
> VIRTIO_BLK_S_TIMEOUT seems to be synthetic value that is purely used by
> software, not the device. Maybe there is no need to update the spec.
> Just avoid using in_hdr.status to signal timeouts and use a separate
> flag/field instead in a block layer or virtio_blk driver request struct.

It is a specific error hence I've added that on the similar lines,
do you have a specific field in mind that you would prefer ?

Thanks for the reply, just got back from time off, looking forward to
send V2 as soon as I get the reply.

-ck

Reply via email to