Re: [Qemu-devel] [PATCH] virtio-blk: add DISCARD support to virtio-blk driver
On Tue, Mar 28, 2017 at 3:15 AM, Liu, Changpengwrote: >> -Original Message- >> From: Stefan Hajnoczi [mailto:stefa...@gmail.com] >> Sent: Tuesday, March 28, 2017 4:20 AM >> To: Liu, Changpeng >> Cc: virtio-...@lists.oasis-open.org; >> virtualizat...@lists.linux-foundation.org; linux- >> ker...@vger.kernel.org; h...@lst.de; qemu-devel@nongnu.org >> Subject: Re: [PATCH] virtio-blk: add DISCARD support to virtio-blk driver >> >> On Tue, Mar 28, 2017 at 04:39:25PM +0800, Changpeng Liu wrote: >> > Currently virtio-blk driver does not provide discard feature flag, so the >> > filesystems which built on top of the block device will not send discard >> > command. This is okay for HDD backend, but it will impact the performance >> > for SSD backend. >> > >> > Add a feature flag VIRTIO_BLK_F_DISCARD and command >> VIRTIO_BLK_T_DISCARD >> > to extend exist virtio-blk protocol. virtio-blk protocol uses a single >> > 8 bytes descriptor containing type,reserved and sector, currently Linux >> > uses the reserved field as IO priority, here we also re-use the reserved >> > field as number of discard sectors. >> > >> > Signed-off-by: Changpeng Liu >> > --- >> > drivers/block/virtio_blk.c | 38 >> > +- >> > include/uapi/linux/virtio_blk.h | 12 ++-- >> > 2 files changed, 39 insertions(+), 11 deletions(-) >> > >> > diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c >> > index 1d4c9f8..550cfe7 100644 >> > --- a/drivers/block/virtio_blk.c >> > +++ b/drivers/block/virtio_blk.c >> > @@ -241,6 +241,9 @@ static int virtio_queue_rq(struct blk_mq_hw_ctx *hctx, >> > case REQ_OP_FLUSH: >> > type = VIRTIO_BLK_T_FLUSH; >> > break; >> > + case REQ_OP_DISCARD: >> > + type = VIRTIO_BLK_T_DISCARD; >> > + break; >> > case REQ_OP_SCSI_IN: >> > case REQ_OP_SCSI_OUT: >> > type = VIRTIO_BLK_T_SCSI_CMD; >> > @@ -256,16 +259,24 @@ static int virtio_queue_rq(struct blk_mq_hw_ctx >> > *hctx, >> > vbr->out_hdr.type = cpu_to_virtio32(vblk->vdev, type); >> > vbr->out_hdr.sector = type ? >> > 0 : cpu_to_virtio64(vblk->vdev, blk_rq_pos(req)); >> > - vbr->out_hdr.ioprio = cpu_to_virtio32(vblk->vdev, req_get_ioprio(req)); >> > + vbr->out_hdr.u.ioprio = cpu_to_virtio32(vblk->vdev, >> > req_get_ioprio(req)); >> > >> > blk_mq_start_request(req); >> > >> > - num = blk_rq_map_sg(hctx->queue, req, vbr->sg); >> > - if (num) { >> > - if (rq_data_dir(req) == WRITE) >> > - vbr->out_hdr.type |= cpu_to_virtio32(vblk->vdev, >> VIRTIO_BLK_T_OUT); >> > - else >> > - vbr->out_hdr.type |= cpu_to_virtio32(vblk->vdev, >> VIRTIO_BLK_T_IN); >> > + if (type == VIRTIO_BLK_T_DISCARD) { >> > + vbr->out_hdr.u.discard_nr_sectors = cpu_to_virtio32(vblk->vdev, >> > + >> blk_rq_sectors(req)); >> > + num = 0; >> > + } else { >> > + num = blk_rq_map_sg(hctx->queue, req, vbr->sg); >> > + if (num) { >> > + if (rq_data_dir(req) == WRITE) >> > + vbr->out_hdr.type |= >> > cpu_to_virtio32(vblk->vdev, >> > + >> VIRTIO_BLK_T_OUT); >> > + else >> > + vbr->out_hdr.type |= >> > cpu_to_virtio32(vblk->vdev, >> > + >> VIRTIO_BLK_T_IN); >> > + } >> > } >> > >> > spin_lock_irqsave(>vqs[qid].lock, flags); >> > @@ -775,6 +786,15 @@ static int virtblk_probe(struct virtio_device *vdev) >> > if (!err && opt_io_size) >> > blk_queue_io_opt(q, blk_size * opt_io_size); >> > >> > + if (virtio_has_feature(vdev, VIRTIO_BLK_F_DISCARD)) { >> > + q->limits.discard_zeroes_data = 0; >> > + q->limits.discard_alignment = blk_size; >> > + q->limits.discard_granularity = blk_size; >> > + blk_queue_max_discard_sectors(q, UINT_MAX); >> > + blk_queue_max_discard_segments(q, 1); >> > + queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, q); >> > + } >> >> Please add configuration space fields for these limits. Looking at the >> virtio-scsi block limits code in QEMU's scsi_disk_emulate_inquiry() I >> can see that the hypervisor has useful values that it wants to >> communicate. They shouldn't be hardcoded to blk_size. > Yes, move discard related parameters to configuration space make sense. >> >> > + >> > virtio_device_ready(vdev); >> > >> > device_add_disk(>dev, vblk->disk); >> > @@ -882,14 +902,14 @@ static int virtblk_restore(struct virtio_device >> > *vdev) >> > VIRTIO_BLK_F_SCSI, >> > #endif >> > VIRTIO_BLK_F_FLUSH, VIRTIO_BLK_F_TOPOLOGY, >> VIRTIO_BLK_F_CONFIG_WCE, >> > - VIRTIO_BLK_F_MQ, >> > + VIRTIO_BLK_F_MQ, VIRTIO_BLK_F_DISCARD, >> > } >> > ; >> > static unsigned int features[] = { >> > VIRTIO_BLK_F_SEG_MAX, VIRTIO_BLK_F_SIZE_MAX, >> VIRTIO_BLK_F_GEOMETRY,
Re: [Qemu-devel] [PATCH] virtio-blk: add DISCARD support to virtio-blk driver
> -Original Message- > From: Stefan Hajnoczi [mailto:stefa...@gmail.com] > Sent: Tuesday, March 28, 2017 4:20 AM > To: Liu, Changpeng> Cc: virtio-...@lists.oasis-open.org; > virtualizat...@lists.linux-foundation.org; linux- > ker...@vger.kernel.org; h...@lst.de; qemu-devel@nongnu.org > Subject: Re: [PATCH] virtio-blk: add DISCARD support to virtio-blk driver > > On Tue, Mar 28, 2017 at 04:39:25PM +0800, Changpeng Liu wrote: > > Currently virtio-blk driver does not provide discard feature flag, so the > > filesystems which built on top of the block device will not send discard > > command. This is okay for HDD backend, but it will impact the performance > > for SSD backend. > > > > Add a feature flag VIRTIO_BLK_F_DISCARD and command > VIRTIO_BLK_T_DISCARD > > to extend exist virtio-blk protocol. virtio-blk protocol uses a single > > 8 bytes descriptor containing type,reserved and sector, currently Linux > > uses the reserved field as IO priority, here we also re-use the reserved > > field as number of discard sectors. > > > > Signed-off-by: Changpeng Liu > > --- > > drivers/block/virtio_blk.c | 38 +- > > include/uapi/linux/virtio_blk.h | 12 ++-- > > 2 files changed, 39 insertions(+), 11 deletions(-) > > > > diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c > > index 1d4c9f8..550cfe7 100644 > > --- a/drivers/block/virtio_blk.c > > +++ b/drivers/block/virtio_blk.c > > @@ -241,6 +241,9 @@ static int virtio_queue_rq(struct blk_mq_hw_ctx *hctx, > > case REQ_OP_FLUSH: > > type = VIRTIO_BLK_T_FLUSH; > > break; > > + case REQ_OP_DISCARD: > > + type = VIRTIO_BLK_T_DISCARD; > > + break; > > case REQ_OP_SCSI_IN: > > case REQ_OP_SCSI_OUT: > > type = VIRTIO_BLK_T_SCSI_CMD; > > @@ -256,16 +259,24 @@ static int virtio_queue_rq(struct blk_mq_hw_ctx *hctx, > > vbr->out_hdr.type = cpu_to_virtio32(vblk->vdev, type); > > vbr->out_hdr.sector = type ? > > 0 : cpu_to_virtio64(vblk->vdev, blk_rq_pos(req)); > > - vbr->out_hdr.ioprio = cpu_to_virtio32(vblk->vdev, req_get_ioprio(req)); > > + vbr->out_hdr.u.ioprio = cpu_to_virtio32(vblk->vdev, > > req_get_ioprio(req)); > > > > blk_mq_start_request(req); > > > > - num = blk_rq_map_sg(hctx->queue, req, vbr->sg); > > - if (num) { > > - if (rq_data_dir(req) == WRITE) > > - vbr->out_hdr.type |= cpu_to_virtio32(vblk->vdev, > VIRTIO_BLK_T_OUT); > > - else > > - vbr->out_hdr.type |= cpu_to_virtio32(vblk->vdev, > VIRTIO_BLK_T_IN); > > + if (type == VIRTIO_BLK_T_DISCARD) { > > + vbr->out_hdr.u.discard_nr_sectors = cpu_to_virtio32(vblk->vdev, > > + > blk_rq_sectors(req)); > > + num = 0; > > + } else { > > + num = blk_rq_map_sg(hctx->queue, req, vbr->sg); > > + if (num) { > > + if (rq_data_dir(req) == WRITE) > > + vbr->out_hdr.type |= cpu_to_virtio32(vblk->vdev, > > + > VIRTIO_BLK_T_OUT); > > + else > > + vbr->out_hdr.type |= cpu_to_virtio32(vblk->vdev, > > + > VIRTIO_BLK_T_IN); > > + } > > } > > > > spin_lock_irqsave(>vqs[qid].lock, flags); > > @@ -775,6 +786,15 @@ static int virtblk_probe(struct virtio_device *vdev) > > if (!err && opt_io_size) > > blk_queue_io_opt(q, blk_size * opt_io_size); > > > > + if (virtio_has_feature(vdev, VIRTIO_BLK_F_DISCARD)) { > > + q->limits.discard_zeroes_data = 0; > > + q->limits.discard_alignment = blk_size; > > + q->limits.discard_granularity = blk_size; > > + blk_queue_max_discard_sectors(q, UINT_MAX); > > + blk_queue_max_discard_segments(q, 1); > > + queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, q); > > + } > > Please add configuration space fields for these limits. Looking at the > virtio-scsi block limits code in QEMU's scsi_disk_emulate_inquiry() I > can see that the hypervisor has useful values that it wants to > communicate. They shouldn't be hardcoded to blk_size. Yes, move discard related parameters to configuration space make sense. > > > + > > virtio_device_ready(vdev); > > > > device_add_disk(>dev, vblk->disk); > > @@ -882,14 +902,14 @@ static int virtblk_restore(struct virtio_device *vdev) > > VIRTIO_BLK_F_SCSI, > > #endif > > VIRTIO_BLK_F_FLUSH, VIRTIO_BLK_F_TOPOLOGY, > VIRTIO_BLK_F_CONFIG_WCE, > > - VIRTIO_BLK_F_MQ, > > + VIRTIO_BLK_F_MQ, VIRTIO_BLK_F_DISCARD, > > } > > ; > > static unsigned int features[] = { > > VIRTIO_BLK_F_SEG_MAX, VIRTIO_BLK_F_SIZE_MAX, > VIRTIO_BLK_F_GEOMETRY, > > VIRTIO_BLK_F_RO, VIRTIO_BLK_F_BLK_SIZE, > > VIRTIO_BLK_F_FLUSH, VIRTIO_BLK_F_TOPOLOGY, > VIRTIO_BLK_F_CONFIG_WCE, > > - VIRTIO_BLK_F_MQ, > > + VIRTIO_BLK_F_MQ, VIRTIO_BLK_F_DISCARD, > > }; > > > > static
Re: [Qemu-devel] [PATCH] virtio-blk: add DISCARD support to virtio-blk driver
> -Original Message- > From: Christoph Hellwig [mailto:h...@lst.de] > Sent: Monday, March 27, 2017 10:56 PM > To: Liu, Changpeng> Cc: virtio-...@lists.oasis-open.org; > virtualizat...@lists.linux-foundation.org; linux- > ker...@vger.kernel.org; h...@lst.de; qemu-devel@nongnu.org > Subject: Re: [PATCH] virtio-blk: add DISCARD support to virtio-blk driver > > On Tue, Mar 28, 2017 at 04:39:25PM +0800, Changpeng Liu wrote: > > Currently virtio-blk driver does not provide discard feature flag, so the > > filesystems which built on top of the block device will not send discard > > command. This is okay for HDD backend, but it will impact the performance > > for SSD backend. > > > > Add a feature flag VIRTIO_BLK_F_DISCARD and command > VIRTIO_BLK_T_DISCARD > > to extend exist virtio-blk protocol. virtio-blk protocol uses a single > > 8 bytes descriptor containing type,reserved and sector, currently Linux > > uses the reserved field as IO priority, here we also re-use the reserved > > field as number of discard sectors. > > Do you have a link to the specification for this feature? At least > virtio-v1.0 does not seem to specify a discard feature. Not yet, patch goes first, there are uses who don't want to migrate from virtio-blk to virtio-scsi, so this feature seems reasonable, the question is that how we should define the specification to support this feature. > > Note that Linux 4.11 and later have support for multi-range discard > ala ATA TRIM, SCSI UNMAP and NVMe deallocate which might be useful > here, too. For support multi-range discard features, the DISCARD command must have data_in buffer, similar with SCSI UNMAP and NVMe DEALLOCATE commands, maybe 16 bytes aligned descriptors are required for each DISCARD command. > > > + q->limits.discard_zeroes_data = 0; > > No need to clear this. Also hopefully this field goes away for 4.12 > > > + blk_queue_max_discard_segments(q, 1); > > No need to set this.
Re: [Qemu-devel] [PATCH] virtio-blk: add DISCARD support to virtio-blk driver
On Tue, Mar 28, 2017 at 04:39:25PM +0800, Changpeng Liu wrote: > Currently virtio-blk driver does not provide discard feature flag, so the > filesystems which built on top of the block device will not send discard > command. This is okay for HDD backend, but it will impact the performance > for SSD backend. > > Add a feature flag VIRTIO_BLK_F_DISCARD and command VIRTIO_BLK_T_DISCARD > to extend exist virtio-blk protocol. virtio-blk protocol uses a single > 8 bytes descriptor containing type,reserved and sector, currently Linux > uses the reserved field as IO priority, here we also re-use the reserved > field as number of discard sectors. > > Signed-off-by: Changpeng Liu> --- > drivers/block/virtio_blk.c | 38 +- > include/uapi/linux/virtio_blk.h | 12 ++-- > 2 files changed, 39 insertions(+), 11 deletions(-) > > diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c > index 1d4c9f8..550cfe7 100644 > --- a/drivers/block/virtio_blk.c > +++ b/drivers/block/virtio_blk.c > @@ -241,6 +241,9 @@ static int virtio_queue_rq(struct blk_mq_hw_ctx *hctx, > case REQ_OP_FLUSH: > type = VIRTIO_BLK_T_FLUSH; > break; > + case REQ_OP_DISCARD: > + type = VIRTIO_BLK_T_DISCARD; > + break; > case REQ_OP_SCSI_IN: > case REQ_OP_SCSI_OUT: > type = VIRTIO_BLK_T_SCSI_CMD; > @@ -256,16 +259,24 @@ static int virtio_queue_rq(struct blk_mq_hw_ctx *hctx, > vbr->out_hdr.type = cpu_to_virtio32(vblk->vdev, type); > vbr->out_hdr.sector = type ? > 0 : cpu_to_virtio64(vblk->vdev, blk_rq_pos(req)); > - vbr->out_hdr.ioprio = cpu_to_virtio32(vblk->vdev, req_get_ioprio(req)); > + vbr->out_hdr.u.ioprio = cpu_to_virtio32(vblk->vdev, > req_get_ioprio(req)); > > blk_mq_start_request(req); > > - num = blk_rq_map_sg(hctx->queue, req, vbr->sg); > - if (num) { > - if (rq_data_dir(req) == WRITE) > - vbr->out_hdr.type |= cpu_to_virtio32(vblk->vdev, > VIRTIO_BLK_T_OUT); > - else > - vbr->out_hdr.type |= cpu_to_virtio32(vblk->vdev, > VIRTIO_BLK_T_IN); > + if (type == VIRTIO_BLK_T_DISCARD) { > + vbr->out_hdr.u.discard_nr_sectors = cpu_to_virtio32(vblk->vdev, > + > blk_rq_sectors(req)); > + num = 0; > + } else { > + num = blk_rq_map_sg(hctx->queue, req, vbr->sg); > + if (num) { > + if (rq_data_dir(req) == WRITE) > + vbr->out_hdr.type |= cpu_to_virtio32(vblk->vdev, > + > VIRTIO_BLK_T_OUT); > + else > + vbr->out_hdr.type |= cpu_to_virtio32(vblk->vdev, > + > VIRTIO_BLK_T_IN); > + } > } > > spin_lock_irqsave(>vqs[qid].lock, flags); > @@ -775,6 +786,15 @@ static int virtblk_probe(struct virtio_device *vdev) > if (!err && opt_io_size) > blk_queue_io_opt(q, blk_size * opt_io_size); > > + if (virtio_has_feature(vdev, VIRTIO_BLK_F_DISCARD)) { > + q->limits.discard_zeroes_data = 0; > + q->limits.discard_alignment = blk_size; > + q->limits.discard_granularity = blk_size; > + blk_queue_max_discard_sectors(q, UINT_MAX); > + blk_queue_max_discard_segments(q, 1); > + queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, q); > + } Please add configuration space fields for these limits. Looking at the virtio-scsi block limits code in QEMU's scsi_disk_emulate_inquiry() I can see that the hypervisor has useful values that it wants to communicate. They shouldn't be hardcoded to blk_size. > + > virtio_device_ready(vdev); > > device_add_disk(>dev, vblk->disk); > @@ -882,14 +902,14 @@ static int virtblk_restore(struct virtio_device *vdev) > VIRTIO_BLK_F_SCSI, > #endif > VIRTIO_BLK_F_FLUSH, VIRTIO_BLK_F_TOPOLOGY, VIRTIO_BLK_F_CONFIG_WCE, > - VIRTIO_BLK_F_MQ, > + VIRTIO_BLK_F_MQ, VIRTIO_BLK_F_DISCARD, > } > ; > static unsigned int features[] = { > VIRTIO_BLK_F_SEG_MAX, VIRTIO_BLK_F_SIZE_MAX, VIRTIO_BLK_F_GEOMETRY, > VIRTIO_BLK_F_RO, VIRTIO_BLK_F_BLK_SIZE, > VIRTIO_BLK_F_FLUSH, VIRTIO_BLK_F_TOPOLOGY, VIRTIO_BLK_F_CONFIG_WCE, > - VIRTIO_BLK_F_MQ, > + VIRTIO_BLK_F_MQ, VIRTIO_BLK_F_DISCARD, > }; > > static struct virtio_driver virtio_blk = { > diff --git a/include/uapi/linux/virtio_blk.h b/include/uapi/linux/virtio_blk.h > index 9ebe4d9..d608649 100644 > --- a/include/uapi/linux/virtio_blk.h > +++ b/include/uapi/linux/virtio_blk.h > @@ -38,6 +38,7 @@ > #define VIRTIO_BLK_F_BLK_SIZE6 /* Block size of disk is > available*/ > #define
Re: [Qemu-devel] [PATCH] virtio-blk: add DISCARD support to virtio-blk driver
On Tue, Mar 28, 2017 at 04:39:25PM +0800, Changpeng Liu wrote: > Currently virtio-blk driver does not provide discard feature flag, so the > filesystems which built on top of the block device will not send discard > command. This is okay for HDD backend, but it will impact the performance > for SSD backend. > > Add a feature flag VIRTIO_BLK_F_DISCARD and command VIRTIO_BLK_T_DISCARD > to extend exist virtio-blk protocol. virtio-blk protocol uses a single > 8 bytes descriptor containing type,reserved and sector, currently Linux > uses the reserved field as IO priority, here we also re-use the reserved > field as number of discard sectors. Do you have a link to the specification for this feature? At least virtio-v1.0 does not seem to specify a discard feature. Note that Linux 4.11 and later have support for multi-range discard ala ATA TRIM, SCSI UNMAP and NVMe deallocate which might be useful here, too. > + q->limits.discard_zeroes_data = 0; No need to clear this. Also hopefully this field goes away for 4.12 > + blk_queue_max_discard_segments(q, 1); No need to set this.