Re: [PATCH v3 1/3] file-posix:add the tracking of the zones write pointers

Sam Li Thu, 13 Oct 2022 01:21:42 -0700

Damien Le Moal <damien.lem...@opensource.wdc.com> 于2022年10月13日周四 15:30写道：
>
> On 2022/10/13 16:08, Sam Li wrote:
> > Damien Le Moal <damien.lem...@opensource.wdc.com> 于2022年10月13日周四 13:13写道：
> >>
> >> On 10/10/22 11:33, Sam Li wrote:
> >>> Since Linux doesn't have a user API to issue zone append operations to
> >>> zoned devices from user space, the file-posix driver is modified to add
> >>> zone append emulation using regular writes. To do this, the file-posix
> >>> driver tracks the wp location of all zones of the device. It uses an
> >>> array of uint64_t. The most significant bit of each wp location indicates
> >>> if the zone type is conventional zones.
> >>>
> >>> The zones wp can be changed due to the following operations issued:
> >>> - zone reset: change the wp to the start offset of that zone
> >>> - zone finish: change to the end location of that zone
> >>> - write to a zone
> >>> - zone append
> >>>
> >>> Signed-off-by: Sam Li <faithilike...@gmail.com>
> >>> ---
> >>>  block/file-posix.c               | 158 +++++++++++++++++++++++++++++++
> >>>  include/block/block-common.h     |  14 +++
> >>>  include/block/block_int-common.h |   5 +
> >>>  3 files changed, 177 insertions(+)
> >>>
> >>> diff --git a/block/file-posix.c b/block/file-posix.c
> >>> index a9d347292e..17c0b58158 100755
> >>> --- a/block/file-posix.c
> >>> +++ b/block/file-posix.c
> >>> @@ -206,6 +206,7 @@ typedef struct RawPosixAIOData {
> >>>          struct {
> >>>              struct iovec *iov;
> >>>              int niov;
> >>> +            int64_t *append_sector;
> >>
> >> This should be added as part of patch 2. You do not need this to track
> >> the wp of zones in this patch.
> >>
> >>>          } io;
> >>>          struct {
> >>>              uint64_t cmd;
> >>> @@ -226,6 +227,7 @@ typedef struct RawPosixAIOData {
> >>>          struct {
> >>>              unsigned long zone_op;
> >>>              const char *zone_op_name;
> >>> +            bool all;
> >>>          } zone_mgmt;
> >>>      };
> >>>  } RawPosixAIOData;
> >>> @@ -1331,6 +1333,67 @@ static int hdev_get_max_segments(int fd, struct 
> >>> stat *st) {
> >>>  #endif
> >>>  }
> >>>
> >>> +#if defined(CONFIG_BLKZONED)
> >>> +static int get_zones_wp(int64_t offset, int fd, BlockZoneWps *wps,
> >>
> >> Nit: It would seem more natural to have the fd argument first...
> >>
> >>> +                        unsigned int nrz) {
> >>> +    struct blk_zone *blkz;
> >>> +    int64_t rep_size;
> >>> +    int64_t sector = offset >> BDRV_SECTOR_BITS;
> >>> +    int ret, n = 0, i = 0;
> >>> +    rep_size = sizeof(struct blk_zone_report) + nrz * sizeof(struct 
> >>> blk_zone);
> >>> +    g_autofree struct blk_zone_report *rep = NULL;
> >>> +
> >>> +    rep = g_malloc(rep_size);
> >>> +    blkz = (struct blk_zone *)(rep + 1);
> >>> +    while (n < nrz) {
> >>> +        memset(rep, 0, rep_size);
> >>> +        rep->sector = sector;
> >>> +        rep->nr_zones = nrz - n;
> >>> +
> >>> +        do {
> >>> +            ret = ioctl(fd, BLKREPORTZONE, rep);
> >>> +        } while (ret != 0 && errno == EINTR);
> >>> +        if (ret != 0) {
> >>> +            error_report("%d: ioctl BLKREPORTZONE at %" PRId64 " failed 
> >>> %d",
> >>> +                    fd, offset, errno);
> >>> +            return -errno;
> >>> +        }
> >>> +
> >>> +        if (!rep->nr_zones) {
> >>> +            break;
> >>> +        }
> >>> +
> >>> +        for (i = 0; i < rep->nr_zones; i++, n++) {
> >>> +            /*
> >>> +             * The wp tracking cares only about sequential writes 
> >>> required and
> >>> +             * sequential write preferred zones so that the wp can 
> >>> advance to
> >>> +             * the right location.
> >>> +             * Use the most significant bit of the wp location to 
> >>> indicate the
> >>> +             * zone type: 0 for SWR/SWP zones and 1 for conventional 
> >>> zones.
> >>> +             */
> >>> +            if (!(blkz[i].type != BLK_ZONE_TYPE_CONVENTIONAL)) {
> >>
> >> Double negation... This can simply be:
> >>
> >> if (blkz[i].type == BLK_ZONE_TYPE_CONVENTIONAL) {
> >>
> >>> +                wps->wp[i] += 1ULL << 63;
> >>
> >> No need for the += here. This can be "=".
> >>
> >>> +            } else {
> >>> +                wps->wp[i] = blkz[i].wp << BDRV_SECTOR_BITS;
> >>> +            }
> >>> +        }
> >>> +        sector = blkz[i-1].start + blkz[i-1].len;
> >>
> >> spaces missing around the "-" in the "i-1" expressions.
> >>
> >>> +    }
> >>> +
> >>> +    return 0;
> >>> +}
> >>> +
> >>> +static void update_zones_wp(int64_t offset, int fd, BlockZoneWps *wps,
> >>
> >> Same nit as above: fd being the first argument would be a little more
> >> natural in my opinion.
> >>
> >>> +                            unsigned int nrz) {
> >>> +    qemu_mutex_lock(&wps->lock);
> >>> +    if (get_zones_wp(offset, fd, wps, nrz) < 0) {
> >>> +        error_report("report zone wp failed");
> >>> +        return;
> >>
> > >> You are leaking the lock here. Remove the return. Also, given that
> >> get_zones_wp() already prints a message if report fails, I do not think
> >> the message here is useful.
> >>
> >> Also, why is this function void typed ? How can the caller know if the
> >> update succeeded or not ?
> >
> > > An update failure means get_zones_wp() failed, which is reported by
> > > error_report. The error message should say that the update failed, not
> > > that the report failed. Maybe modifying the message suffices for error checking?
> >
> > +    qemu_mutex_lock(&wps->lock);
> > +    if (get_zones_wp(offset, fd, wps, nrz) < 0) {
> > +        error_report("update zone wp failed");
> > +    }
> > +    qemu_mutex_unlock(&wps->lock);
> >
> >
> >>
> >>> +    }
> >>> +    qemu_mutex_unlock(&wps->lock);
> >>> +}
> >>> +#endif
> >>> +
> >>>  static void raw_refresh_limits(BlockDriverState *bs, Error **errp)
> >>>  {
> >>>      BDRVRawState *s = bs->opaque;
> >>> @@ -1414,6 +1477,19 @@ static void raw_refresh_limits(BlockDriverState 
> >>> *bs, Error **errp)
> >>>              error_report("Invalid device capacity %" PRId64 " bytes ", 
> >>> bs->bl.capacity);
> >>>              return;
> >>>          }
> >>> +
> >>> +        ret = get_sysfs_long_val(&st, "physical_block_size");
> >>> +        if (ret >= 0) {
> >>> +            bs->bl.write_granularity = ret;
> >>> +        }
> >>
> >> Why is this change here ? Shouldn't this be part of the previous series
> >> "Add support for zoned device" ?
> >
> > Because only zone append uses write_granularity to check the iovector
> > size alignment. The previous series doesn't use this field.
>
> Then move this to patch 2. This should not be in this patch since you are not
> dealing with zone append yet.
>
> >
> >>
> >>> +
> >>> +        bs->bl.wps = g_malloc(sizeof(BlockZoneWps) + sizeof(int64_t) * 
> >>> ret);
> >>> +        if (get_zones_wp(0, s->fd, bs->bl.wps, ret) < 0){
> >>> +            error_report("report wps failed");
> >>> +            g_free(bs->bl.wps);
> >>> +            return;
> >>> +        }
> >>> +        qemu_mutex_init(&bs->bl.wps->lock);
> >>>      }
> >>>  }
> >>>
> >>> @@ -1651,6 +1727,20 @@ static int handle_aiocb_rw(void *opaque)
> >>>      ssize_t nbytes;
> >>>      char *buf;
> >>>
> >>> +    /*
> >>> +     * The offset of regular writes, append writes is the wp location
> >>> +     * of that zone.
> >>> +     */
> >>> +    if (aiocb->aio_type & QEMU_AIO_WRITE) {
> >>> +        if (aiocb->bs->bl.zone_size > 0) {
> >>> +            BlockZoneWps *wps = aiocb->bs->bl.wps;
> >>> +            qemu_mutex_lock(&wps->lock);
> >>> +            aiocb->aio_offset = wps->wp[aiocb->aio_offset /
> >>> +                                        aiocb->bs->bl.zone_size];
> >>> +            qemu_mutex_unlock(&wps->lock);
> >>> +        }
> >>
> > >> I do not understand this hunk at all. What is this trying to do? Zone
> > >> append support goes into patch 2. You are overwriting the user-specified
> > >> aio offset using the tracked wp value. That could result in a
> > >> successful write even if the user sent an unaligned write command. That
> > >> is bad.
> >
> > Ok, regular writes and append writes got mixed up when I changed the
> > offset to the wp of that zone.
> >
> >>
> >> Here you should only be tracking the write pointer, so increment
> >> wps->wp[index], which you do below.
> >
> > Understood. Will move it to the next patch.
>
> No! You should not change the aio offset for regular writes. Otherwise you may
> hide errors for bad commands from the guest by having them succeed :)
> The aio offset change should be done ONLY for zone append, not for regular writes.
>
> >
> >>
> >>> +    }
> >>> +
> >>>      if (!(aiocb->aio_type & QEMU_AIO_MISALIGNED)) {
> >>>          /*
> >>>           * If there is just a single buffer, and it is properly aligned
> >>> @@ -1725,6 +1815,24 @@ static int handle_aiocb_rw(void *opaque)
> >>>
> >>>  out:
> >>>      if (nbytes == aiocb->aio_nbytes) {
> >>> +#if defined(CONFIG_BLKZONED)
> >>> +        if (aiocb->aio_type & QEMU_AIO_WRITE) {
> >>> +            BlockZoneWps *wps = aiocb->bs->bl.wps;
> >>> +            int index = aiocb->aio_offset / aiocb->bs->bl.zone_size;
> >>> +            if (wps) {
> >>> +                qemu_mutex_lock(&wps->lock);
> >>> +                if (!BDRV_ZT_IS_CONV(wps->wp[index])) {
> >>> +                    uint64_t wend_offset =
> >>> +                            aiocb->aio_offset + aiocb->aio_nbytes;
> >>> +                    /* Advance the wp if needed */
> >>> +                    if (wend_offset > wps->wp[index]){
> >>> +                        wps->wp[index] = wend_offset;
> >>> +                    }
> >>> +                }
> >>> +                qemu_mutex_unlock(&wps->lock);
> >>> +            }
> >>> +        }
> >>> +#endif
> >>>          return 0;
> >>>      } else if (nbytes >= 0 && nbytes < aiocb->aio_nbytes) {
> >>>          if (aiocb->aio_type & QEMU_AIO_WRITE) {
> >>> @@ -1736,6 +1844,12 @@ out:
> >>>          }
> >>>      } else {
> >>>          assert(nbytes < 0);
> >>> +#if defined(CONFIG_BLKZONED)
> >>> +        if (aiocb->aio_type & QEMU_AIO_WRITE) {
> >>> +            update_zones_wp(0, aiocb->aio_fildes, aiocb->bs->bl.wps,
> >>> +                            aiocb->bs->bl.nr_zones);
> >>
> >> You only need to update the target zone of the aio, not all zones.
> >> Updating all zones is actually a bug as wp[] entries for other zones may
> >> be larger than the device reported wp if there are other write aios in
> >> flight. So the last argument must be "1" here.
> >
> > Ok, I understood now.
> >
> >>
> >>> +        }
> >>> +#endif
> >>>          return nbytes;
> >>>      }
> >>>  }
> >>> @@ -2022,12 +2136,17 @@ static int handle_aiocb_zone_report(void *opaque) 
> >>> {
> >>>  #if defined(CONFIG_BLKZONED)
> >>>  static int handle_aiocb_zone_mgmt(void *opaque) {
> >>>      RawPosixAIOData *aiocb = opaque;
> >>> +    BlockDriverState *bs = aiocb->bs;
> >>>      int fd = aiocb->aio_fildes;
> >>>      int64_t sector = aiocb->aio_offset / 512;
> >>>      int64_t nr_sectors = aiocb->aio_nbytes / 512;
> >>> +    uint64_t wend_offset;
> >>>      struct blk_zone_range range;
> >>>      int ret;
> >>>
> >>
> >> Why the blank line here ?
> >
> > For readability, separate it from the execution part.
>
> But the following lines are variable declarations. I personally prefer
> declarations to stay together before the code :)


Ok, will change it.

>
> >
> >>
> >>> +    BlockZoneWps *wps = bs->bl.wps;
> >>> +    int index = aiocb->aio_offset / bs->bl.zone_size;
> >>> +
> >>>      /* Execute the operation */
> >>>      range.sector = sector;
> >>>      range.nr_sectors = nr_sectors;
> >>> @@ -2035,11 +2154,41 @@ static int handle_aiocb_zone_mgmt(void *opaque) {
> >>>          ret = ioctl(fd, aiocb->zone_mgmt.zone_op, &range);
> >>>      } while (ret != 0 && errno == EINTR);
> >>>      if (ret != 0) {
> >>> +        update_zones_wp(0, aiocb->aio_fildes, aiocb->bs->bl.wps,
> >>> +                        aiocb->bs->bl.nr_zones);
> >>
> >> You need only to update the range of zones that was specified for the
> >> management option, not all zones. So you must specify the zone
> >> management aio offset and size/zone_size here.
> >>
> >>>          ret = -errno;
> >>>          error_report("ioctl %s failed %d", aiocb->zone_mgmt.zone_op_name,
> >>>                       ret);
> >>>          return ret;
> >>>      }
> >>> +
> >>> +    qemu_mutex_lock(&wps->lock);
> >>> +    if (!BDRV_ZT_IS_CONV(wps->wp[index])) {
> >>> +         /*
> >>> +         * The zoned device allows the last zone smaller that the zone 
> >>> size.
> >>> +         */
> >>
> >> comment indentation is off.
> >>
> >>> +        if (aiocb->aio_nbytes < bs->bl.zone_size) {
> >>> +            wend_offset = aiocb->aio_offset + aiocb->aio_nbytes;
> >>> +        } else {
> >>> +            wend_offset = aiocb->aio_offset + bs->bl.zone_size;
> >>> +        }
> >>> +
> >>> +        if (aiocb->aio_offset != wps->wp[index] &&
> >>> +            aiocb->zone_mgmt.zone_op == BLKRESETZONE) {
> >>
> >> I do not understand the condition here. Why do you have
> >> "aiocb->aio_offset != wps->wp[index]" ?
> >
> > It is intended for zone state checks. aio_offset (= start byte of that
> > zone) = wp means this zone is empty. Only non-empty zones will be
> > reset.
>
> It is not very natural to use an input from the user (the guest) to check the
> state of something that you (qemu) maintain internally and hide from the user.
> You should do such a test with a small helper like this:
>
> static bool zone_is_empty(bl, index)
> {
>         return bl->wps->wp[index] == index * bl->zone_size;
> }

Ok.

>
> And note that this will ALWAYS return false for conventional zones.
> You are not checking for conventional zones either. Any zone management function
> should fail immediately if addressed to a conventional zone. That is
> missing. You need a:
>
> if (BDRV_ZT_IS_CONV(wps->wp[index]) && "this is not a zone reset all op")
>         return -EIO; /* or similar... */
>
> at the beginning of handle_aiocb_zone_mgmt().

(Just an additional note:)
Though this check belongs in the block layer API patches, it only
becomes possible once wps[] is introduced.

>
> >
> >>
> >>> +            if (aiocb->zone_mgmt.all) {
> >>
> > >> This is the only place where you use the "all" boolean field. For
> >> simplicity, I would drop this field completely and test that
> >> aiocb->aio_offset == 0 && aiocb->aio_nbytes == bs->bl.capacity to detect
> >> a reset all zones operation.
> >
> > Right, the capacity field makes it possible. I'll drop it.
> >
> >>
> >>> +                for (int i = 0; i < bs->bl.nr_zones; ++i) {
> >>> +                    wps->wp[i] = i * bs->bl.zone_size;
> >>
> >> You are not handling conventional zones here. For conventional zones,
> >> you should not change the value. Otherwise, BDRV_ZT_IS_CONV() will
> >> always return false after this.
> >
> > Right, will add a condition line here:
> > + if (! BDRV_ZT_IS_CONV(wps->wp[i]))
>
> You need:
>
> if (BDRV_ZT_IS_CONV(wps->wp[i]))
>     continue;
>
> as the first lines inside the for loop.

Trivial: looks like the same behavior:
 if (! BDRV_ZT_IS_CONV(wps->wp[i])) {
    /* change pointers */
}

if (BDRV_ZT_IS_CONV(wps->wp[i]))
    continue;
/* change pointers */

>
>
> >
> >>
> >>> +                }
> >>> +            } else {
> >>> +                wps->wp[index] = aiocb->aio_offset;
> >>> +            }
> >>> +        } else if (aiocb->aio_offset != wps->wp[index] &&
> >>> +            aiocb->zone_mgmt.zone_op == BLKFINISHZONE) {
> >>
> >> Same comment here. Why do you have "aiocb->aio_offset != wps->wp[index]" ?
> >
> > This should be wend_offset != wps->wp[index]. It means if this zone is
> > full, no need to finish it.
>
> Nope, this does not mean the zone is full. Full condition would be:
>
> wps->wp[index] >= index * bl->zone_size + zone_cap
>
> But you do not have the zone capacity per zone (remember that zone capacity is
> per zone and may differ between zones)... You could add it to the wp array, but
> that will make it larger for not much benefit. Since finishing a zone that is
> already full is a very rare case, optimizing for it is not valuable. So simply
> issue the zone finish operation. It will be a no-op on the host device if the
> zone is already full. No big deal!

I see.

Thanks!

>
> >
> >>
> >>> +            wps->wp[index] = wend_offset;
> >>> +        }
> >>> +    }
> >>> +    qemu_mutex_unlock(&wps->lock);
> >>> +
> >>>      return ret;
> >>>  }
> >>>  #endif
> >>> @@ -2480,6 +2629,12 @@ static void raw_close(BlockDriverState *bs)
> >>>      BDRVRawState *s = bs->opaque;
> >>>
> >>>      if (s->fd >= 0) {
> >>> +#if defined(CONFIG_BLKZONED)
> >>> +        if (bs->bl.wps) {
> >>> +            qemu_mutex_destroy(&bs->bl.wps->lock);
> >>> +            g_free(bs->bl.wps);
> >>> +        }
> >>> +#endif
> >>>          qemu_close(s->fd);
> >>>          s->fd = -1;
> >>>      }
> >>> @@ -3278,6 +3433,7 @@ static int coroutine_fn 
> >>> raw_co_zone_mgmt(BlockDriverState *bs, BlockZoneOp op,
> >>>      int64_t zone_size, zone_size_mask;
> >>>      const char *zone_op_name;
> >>>      unsigned long zone_op;
> >>> +    bool is_all = false;
> >>>
> >>>      zone_size = bs->bl.zone_size;
> >>>      zone_size_mask = zone_size - 1;
> >>> @@ -3314,6 +3470,7 @@ static int coroutine_fn 
> >>> raw_co_zone_mgmt(BlockDriverState *bs, BlockZoneOp op,
> >>>      case BLK_ZO_RESET_ALL:
> >>>          zone_op_name = "BLKRESETZONE";
> >>>          zone_op = BLKRESETZONE;
> >>> +        is_all = true;
> >>>          break;
> >>>      default:
> >>>          g_assert_not_reached();
> >>> @@ -3328,6 +3485,7 @@ static int coroutine_fn 
> >>> raw_co_zone_mgmt(BlockDriverState *bs, BlockZoneOp op,
> >>>          .zone_mgmt  = {
> >>>              .zone_op = zone_op,
> >>>              .zone_op_name = zone_op_name,
> >>> +            .all = is_all,
> >>>          },
> >>>      };
> >>>
> >>> diff --git a/include/block/block-common.h b/include/block/block-common.h
> >>> index 882de6825e..b8b2dba64a 100644
> >>> --- a/include/block/block-common.h
> >>> +++ b/include/block/block-common.h
> >>> @@ -93,6 +93,14 @@ typedef struct BlockZoneDescriptor {
> >>>      BlockZoneCondition cond;
> >>>  } BlockZoneDescriptor;
> >>>
> >>> +/*
> >>> + * Track write pointers of a zone in bytes.
> >>> + */
> >>> +typedef struct BlockZoneWps {
> >>> +    QemuMutex lock;
> >>> +    uint64_t wp[];
> >>> +} BlockZoneWps;
> >>> +
> >>>  typedef struct BlockDriverInfo {
> >>>      /* in bytes, 0 if irrelevant */
> >>>      int cluster_size;
> >>> @@ -206,6 +214,12 @@ typedef enum {
> >>>  #define BDRV_SECTOR_BITS   9
> >>>  #define BDRV_SECTOR_SIZE   (1ULL << BDRV_SECTOR_BITS)
> >>>
> >>> +/*
> >>> + * Get the first most significant bit of wp. If it is zero, then
> >>> + * the zone type is SWR.
> >>> + */
> >>> +#define BDRV_ZT_IS_CONV(wp)    (wp & (1ULL << 63))
> >>> +
> >>>  #define BDRV_REQUEST_MAX_SECTORS MIN_CONST(SIZE_MAX >> BDRV_SECTOR_BITS, 
> >>> \
> >>>                                             INT_MAX >> BDRV_SECTOR_BITS)
> >>>  #define BDRV_REQUEST_MAX_BYTES (BDRV_REQUEST_MAX_SECTORS << 
> >>> BDRV_SECTOR_BITS)
> >>> diff --git a/include/block/block_int-common.h 
> >>> b/include/block/block_int-common.h
> >>> index 37dddc603c..59c2d1316d 100644
> >>> --- a/include/block/block_int-common.h
> >>> +++ b/include/block/block_int-common.h
> >>> @@ -857,6 +857,11 @@ typedef struct BlockLimits {
> >>>
> >>>      /* device capacity expressed in bytes */
> >>>      int64_t capacity;
> >>> +
> >>> +    /* array of write pointers' location of each zone in the zoned 
> >>> device. */
> >>> +    BlockZoneWps *wps;
> >>> +
> >>> +    int64_t write_granularity;
> >>>  } BlockLimits;
> >>>
> >>>  typedef struct BdrvOpBlocker BdrvOpBlocker;
> >>
> >> --
> >> Damien Le Moal
> >> Western Digital Research
> >>
>
> --
> Damien Le Moal
> Western Digital Research
>

Re: [PATCH v3 1/3] file-posix:add the tracking of the zones write pointers

Reply via email to