On Wed, May 24, 2023 at 06:34:05PM -0500, Mike Christie wrote:
> The linux block layer requires bios/requests to have lengths with a 512
> byte alignment. Some drivers/layers like dm-crypt and the directi IO code
> will test for it and just fail. Other drivers like SCSI just assume the
> requirement is met and will end up in infinte retry loops. The problem
> for drivers like SCSI is that it uses functions like blk_rq_cur_sectors
> and blk_rq_sectors which divide the request's length by 512. If there's
> lefovers then it just gets dropped. But other code in the block/scsi
> layer may use blk_rq_bytes/blk_rq_cur_bytes and end up thinking there is
> still data left and try to retry the cmd. We can then end up getting
> stuck in retry loops where part of the block/scsi thinks there is data
> left, but other parts think we want to do IOs of zero length.
> 
> Linux will always check for alignment, but windows will not. When
> vhost-scsi then translates the iovec it gets from a windows guest to a
> scatterlist, we can end up with sg items where the sg->length is not
> divisible by 512 due to the misaligned offset:
> 
> sg[0].offset = 255;
> sg[0].length = 3841;
> sg...
> sg[N].offset = 0;
> sg[N].length = 255;
> 
> When the lio backends then convert the SG to bios or other iovecs, we
> end up sending them with the same misaligned values and can hit the
> issues above.
> 
> This just has us drop down to allocating a temp page and copying the data
> when this happens. Because this adds a check that needs to loop over the
> iovec in the main IO path, this patch adds an attribute that can be turned
> on for just windows.
> 
> Signed-off-by: Mike Christie <michael.chris...@oracle.com>

I am assuming this triggers rarely, yes?

If so I would like to find a way to avoid setting an attribute.

I am guessing the main cost is an extra scan of iov.
vhost_scsi_iov_to_sgl already does a scan, how about changing
it to fail if misaligned?
We can then detect the relevant error code and try with a copy.

WDYT?

> ---
>  drivers/vhost/scsi.c | 174 +++++++++++++++++++++++++++++++++++++------
>  1 file changed, 151 insertions(+), 23 deletions(-)
> 
> diff --git a/drivers/vhost/scsi.c b/drivers/vhost/scsi.c
> index bb10fa4bb4f6..dbad8fb579eb 100644
> --- a/drivers/vhost/scsi.c
> +++ b/drivers/vhost/scsi.c
> @@ -25,6 +25,8 @@
>  #include <linux/fs.h>
>  #include <linux/vmalloc.h>
>  #include <linux/miscdevice.h>
> +#include <linux/blk_types.h>
> +#include <linux/bio.h>
>  #include <asm/unaligned.h>
>  #include <scsi/scsi_common.h>
>  #include <scsi/scsi_proto.h>
> @@ -75,6 +77,9 @@ struct vhost_scsi_cmd {
>       u32 tvc_prot_sgl_count;
>       /* Saved unpacked SCSI LUN for vhost_scsi_target_queue_cmd() */
>       u32 tvc_lun;
> +     u32 copied_iov:1;
> +     const void *saved_iter_addr;
> +     struct iov_iter saved_iter;
>       /* Pointer to the SGL formatted memory from virtio-scsi */
>       struct scatterlist *tvc_sgl;
>       struct scatterlist *tvc_prot_sgl;
> @@ -107,6 +112,7 @@ struct vhost_scsi_nexus {
>  struct vhost_scsi_tpg {
>       /* Vhost port target portal group tag for TCM */
>       u16 tport_tpgt;
> +     bool check_io_alignment;
>       /* Used to track number of TPG Port/Lun Links wrt to explict I_T Nexus 
> shutdown */
>       int tv_tpg_port_count;
>       /* Used for vhost_scsi device reference to tpg_nexus, protected by 
> tv_tpg_mutex */
> @@ -328,8 +334,13 @@ static void vhost_scsi_release_cmd_res(struct se_cmd 
> *se_cmd)
>       int i;
>  
>       if (tv_cmd->tvc_sgl_count) {
> -             for (i = 0; i < tv_cmd->tvc_sgl_count; i++)
> -                     put_page(sg_page(&tv_cmd->tvc_sgl[i]));
> +             for (i = 0; i < tv_cmd->tvc_sgl_count; i++) {
> +                     if (tv_cmd->copied_iov)
> +                             __free_page(sg_page(&tv_cmd->tvc_sgl[i]));
> +                     else
> +                             put_page(sg_page(&tv_cmd->tvc_sgl[i]));
> +             }
> +             kfree(tv_cmd->saved_iter_addr);
>       }
>       if (tv_cmd->tvc_prot_sgl_count) {
>               for (i = 0; i < tv_cmd->tvc_prot_sgl_count; i++)
> @@ -502,6 +513,27 @@ static void vhost_scsi_evt_work(struct vhost_work *work)
>       mutex_unlock(&vq->mutex);
>  }
>  
> +static int vhost_scsi_copy_sgl_to_iov(struct vhost_scsi_cmd *cmd)
> +{
> +     struct iov_iter *iter = &cmd->saved_iter;
> +     struct scatterlist *sg = cmd->tvc_sgl;
> +     struct page *page;
> +     size_t len;
> +     int i;
> +
> +     for (i = 0; i < cmd->tvc_sgl_count; i++) {
> +             page = sg_page(&sg[i]);
> +             len = sg[i].length;
> +
> +             if (copy_page_to_iter(page, 0, len, iter) != len) {
> +                     pr_err("Could not copy data. Error %lu\n", len);
> +                     return -1;
> +             }
> +     }
> +
> +     return 0;
> +}
> +
>  /* Fill in status and signal that we are done processing this command
>   *
>   * This is scheduled in the vhost work queue so we are called with the owner
> @@ -525,15 +557,20 @@ static void vhost_scsi_complete_cmd_work(struct 
> vhost_work *work)
>  
>               pr_debug("%s tv_cmd %p resid %u status %#02x\n", __func__,
>                       cmd, se_cmd->residual_count, se_cmd->scsi_status);
> -
>               memset(&v_rsp, 0, sizeof(v_rsp));
> -             v_rsp.resid = cpu_to_vhost32(cmd->tvc_vq, 
> se_cmd->residual_count);
> -             /* TODO is status_qualifier field needed? */
> -             v_rsp.status = se_cmd->scsi_status;
> -             v_rsp.sense_len = cpu_to_vhost32(cmd->tvc_vq,
> -                                              se_cmd->scsi_sense_length);
> -             memcpy(v_rsp.sense, cmd->tvc_sense_buf,
> -                    se_cmd->scsi_sense_length);
> +
> +             if (cmd->saved_iter_addr && vhost_scsi_copy_sgl_to_iov(cmd)) {
> +                     v_rsp.response = VIRTIO_SCSI_S_BAD_TARGET;
> +             } else {
> +                     v_rsp.resid = cpu_to_vhost32(cmd->tvc_vq,
> +                                                  se_cmd->residual_count);
> +                     /* TODO is status_qualifier field needed? */
> +                     v_rsp.status = se_cmd->scsi_status;
> +                     v_rsp.sense_len = cpu_to_vhost32(cmd->tvc_vq,
> +                                                      
> se_cmd->scsi_sense_length);
> +                     memcpy(v_rsp.sense, cmd->tvc_sense_buf,
> +                            se_cmd->scsi_sense_length);
> +             }
>  
>               iov_iter_init(&iov_iter, ITER_DEST, cmd->tvc_resp_iov,
>                             cmd->tvc_in_iovs, sizeof(v_rsp));
> @@ -682,7 +719,52 @@ vhost_scsi_iov_to_sgl(struct vhost_scsi_cmd *cmd, bool 
> write,
>  }
>  
>  static int
> -vhost_scsi_mapal(struct vhost_scsi_cmd *cmd,
> +vhost_scsi_copy_iov_to_sgl(struct vhost_scsi_cmd *cmd, struct iov_iter *iter,
> +                        struct scatterlist *sg, int sg_count)
> +{
> +     size_t len = iov_iter_count(iter);
> +     unsigned int nbytes = 0;
> +     struct page *page;
> +     int i;
> +
> +     if (cmd->tvc_data_direction == DMA_FROM_DEVICE) {
> +             cmd->saved_iter_addr = dup_iter(&cmd->saved_iter, iter,
> +                                             GFP_KERNEL);
> +             if (!cmd->saved_iter_addr)
> +                     return -ENOMEM;
> +     }
> +
> +     for (i = 0; i < sg_count; i++) {
> +             page = alloc_page(GFP_KERNEL);
> +             if (!page) {
> +                     i--;
> +                     goto err;
> +             }
> +
> +             nbytes = min_t(unsigned int, PAGE_SIZE, len);
> +             sg_set_page(&sg[i], page, nbytes, 0);
> +
> +             if (cmd->tvc_data_direction == DMA_TO_DEVICE &&
> +                 copy_page_from_iter(page, 0, nbytes, iter) != nbytes)
> +                     goto err;
> +
> +             len -= nbytes;
> +     }
> +
> +     cmd->copied_iov = 1;
> +     return 0;
> +
> +err:
> +     pr_err("Could not read %u bytes\n", nbytes);
> +
> +     for (; i >= 0; i--)
> +             __free_page(sg_page(&sg[i]));
> +     kfree(cmd->saved_iter_addr);
> +     return -ENOMEM;
> +}
> +
> +static int
> +vhost_scsi_mapal(struct vhost_scsi_tpg *tpg, struct vhost_scsi_cmd *cmd,
>                size_t prot_bytes, struct iov_iter *prot_iter,
>                size_t data_bytes, struct iov_iter *data_iter)
>  {
> @@ -703,10 +785,8 @@ vhost_scsi_mapal(struct vhost_scsi_cmd *cmd,
>               ret = vhost_scsi_iov_to_sgl(cmd, write, prot_iter,
>                                           cmd->tvc_prot_sgl,
>                                           cmd->tvc_prot_sgl_count);
> -             if (ret < 0) {
> -                     cmd->tvc_prot_sgl_count = 0;
> -                     return ret;
> -             }
> +             if (ret < 0)
> +                     goto map_fail;
>       }
>       sgl_count = vhost_scsi_calc_sgls(data_iter, data_bytes,
>                                        VHOST_SCSI_PREALLOC_SGLS);
> @@ -717,14 +797,32 @@ vhost_scsi_mapal(struct vhost_scsi_cmd *cmd,
>       cmd->tvc_sgl_count = sgl_count;
>       pr_debug("%s data_sg %p data_sgl_count %u\n", __func__,
>                 cmd->tvc_sgl, cmd->tvc_sgl_count);
> +     /*
> +      * The block layer requires bios/requests to be a multiple of 512 bytes,
> +      * but Windows can send us vecs that are misaligned. This can result
> +      * in bios and later requests with misaligned sizes if we have to break
> +      * up a cmd into multiple bios.
> +      *
> +      * We currently only break up a command into multiple bios if we hit
> +      * the vec/seg limit, so check if our sgl_count is greater than the max
> +      * and if a vec in the cmd has a misaligned size.
> +      */
> +     if (tpg->check_io_alignment &&
> +         (!iov_iter_is_aligned(data_iter, 0, SECTOR_SIZE - 1) &&
> +          bio_max_segs(sgl_count) != sgl_count))
> +             ret = vhost_scsi_copy_iov_to_sgl(cmd, data_iter, cmd->tvc_sgl,
> +                                              cmd->tvc_sgl_count);
> +     else
> +             ret = vhost_scsi_iov_to_sgl(cmd, write, data_iter,
> +                                         cmd->tvc_sgl, cmd->tvc_sgl_count);
> +     if (ret)
> +             goto map_fail;
>  
> -     ret = vhost_scsi_iov_to_sgl(cmd, write, data_iter,
> -                                 cmd->tvc_sgl, cmd->tvc_sgl_count);
> -     if (ret < 0) {
> -             cmd->tvc_sgl_count = 0;
> -             return ret;
> -     }
>       return 0;
> +
> +map_fail:
> +     cmd->tvc_sgl_count = 0;
> +     return ret;
>  }
>  
>  static int vhost_scsi_to_tcm_attr(int attr)
> @@ -1077,7 +1175,7 @@ vhost_scsi_handle_vq(struct vhost_scsi *vs, struct 
> vhost_virtqueue *vq)
>                        " %d\n", cmd, exp_data_len, prot_bytes, 
> data_direction);
>  
>               if (data_direction != DMA_NONE) {
> -                     if (unlikely(vhost_scsi_mapal(cmd, prot_bytes,
> +                     if (unlikely(vhost_scsi_mapal(tpg, cmd, prot_bytes,
>                                                     &prot_iter, exp_data_len,
>                                                     &data_iter))) {
>                               vq_err(vq, "Failed to map iov to sgl\n");
> @@ -2068,11 +2166,41 @@ static ssize_t 
> vhost_scsi_tpg_attrib_fabric_prot_type_show(
>  
>       return sysfs_emit(page, "%d\n", tpg->tv_fabric_prot_type);
>  }
> -
>  CONFIGFS_ATTR(vhost_scsi_tpg_attrib_, fabric_prot_type);
>  
> +static ssize_t
> +vhost_scsi_tpg_attrib_check_io_alignment_store(struct config_item *item,
> +                                            const char *page, size_t count)
> +{
> +     struct se_portal_group *se_tpg = attrib_to_tpg(item);
> +     struct vhost_scsi_tpg *tpg = container_of(se_tpg, struct vhost_scsi_tpg,
> +                                               se_tpg);
> +     bool val;
> +     int ret;
> +
> +     ret = kstrtobool(page, &val);
> +     if (ret)
> +             return ret;
> +
> +     tpg->check_io_alignment = val;
> +     return count;
> +}
> +
> +static ssize_t
> +vhost_scsi_tpg_attrib_check_io_alignment_show(struct config_item *item,
> +                                           char *page)
> +{
> +     struct se_portal_group *se_tpg = attrib_to_tpg(item);
> +     struct vhost_scsi_tpg *tpg = container_of(se_tpg, struct vhost_scsi_tpg,
> +                                               se_tpg);
> +
> +     return sysfs_emit(page, "%d\n", tpg->check_io_alignment);
> +}
> +CONFIGFS_ATTR(vhost_scsi_tpg_attrib_, check_io_alignment);
> +
>  static struct configfs_attribute *vhost_scsi_tpg_attrib_attrs[] = {
>       &vhost_scsi_tpg_attrib_attr_fabric_prot_type,
> +     &vhost_scsi_tpg_attrib_attr_check_io_alignment,
>       NULL,
>  };
>  
> -- 
> 2.25.1

_______________________________________________
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization

Reply via email to