On Nov 16 20:43, Minwoo Im wrote:
> On 11/12 20:59, Klaus Jensen wrote:
> > From: Klaus Jensen <k.jen...@samsung.com>
> >
> > Add support for reporting the Deallocated or Unwritten Logical Block
> > Error (DULBE).
> >
> > Rely on the block status flags reported by the block layer and consider
> > any block with the BDRV_BLOCK_ZERO flag to be deallocated.
> >
> > Multiple factors affect when a Write Zeroes command results in
> > deallocation of blocks.
> >
> >   * the underlying file system block size
> >   * the blockdev format
> >   * the 'discard' and 'logical_block_size' parameters
> >
> >     format | discard | wz (512B)  wz (4KiB)  wz (64KiB)
> >    -----------------------------------------------------
> >     qcow2    ignore        n          n           y
> >     qcow2    unmap         n          n           y
> >     raw      ignore        n          y           y
> >     raw      unmap         n          y           y
> >
> > So, this works best with an image in raw format and 4KiB LBAs, since
> > holes can then be punched on a per-block basis (this assumes a file
> > system with a 4kb block size, YMMV). A qcow2 image uses a cluster size
> > of 64KiB by default and blocks will only be marked deallocated if a full
> > cluster is zeroed or discarded. However, this *is* consistent with the
> > spec since Write Zeroes "should" deallocate the block if the Deallocate
> > attribute is set and "may" deallocate if the Deallocate attribute is not
> > set. Thus, we always try to deallocate (the BDRV_REQ_MAY_UNMAP flag is
> > always set).
> >
> > Signed-off-by: Klaus Jensen <k.jen...@samsung.com>
> > ---
> >  hw/block/nvme-ns.h    |  4 ++
> >  include/block/nvme.h  |  5 +++
> >  hw/block/nvme-ns.c    |  8 ++--
> >  hw/block/nvme.c       | 91 ++++++++++++++++++++++++++++++++++++++++++-
> >  hw/block/trace-events |  4 ++
> >  5 files changed, 107 insertions(+), 5 deletions(-)
> >
> > diff --git a/hw/block/nvme-ns.h b/hw/block/nvme-ns.h
> > index 83734f4606e1..44bf6271b744 100644
> > --- a/hw/block/nvme-ns.h
> > +++ b/hw/block/nvme-ns.h
> > @@ -31,6 +31,10 @@ typedef struct NvmeNamespace {
> >      NvmeIdNs id_ns;
> >
> >      NvmeNamespaceParams params;
> > +
> > +    struct {
> > +        uint32_t err_rec;
> > +    } features;
> >  } NvmeNamespace;
> >
> >  static inline uint32_t nvme_nsid(NvmeNamespace *ns)
> > diff --git a/include/block/nvme.h b/include/block/nvme.h
> > index 8a46d9cf015f..966c3bb304bd 100644
> > --- a/include/block/nvme.h
> > +++ b/include/block/nvme.h
> > @@ -687,6 +687,7 @@ enum NvmeStatusCodes {
> >      NVME_E2E_REF_ERROR          = 0x0284,
> >      NVME_CMP_FAILURE            = 0x0285,
> >      NVME_ACCESS_DENIED          = 0x0286,
> > +    NVME_DULB                   = 0x0287,
> >      NVME_MORE                   = 0x2000,
> >      NVME_DNR                    = 0x4000,
> >      NVME_NO_COMPLETE            = 0xffff,
> > @@ -903,6 +904,9 @@ enum NvmeIdCtrlLpa {
> >  #define NVME_AEC_NS_ATTR(aec)       ((aec >> 8) & 0x1)
> >  #define NVME_AEC_FW_ACTIVATION(aec) ((aec >> 9) & 0x1)
> >
> > +#define NVME_ERR_REC_TLER(err_rec)  (err_rec & 0xffff)
> > +#define NVME_ERR_REC_DULBE(err_rec) (err_rec & 0x10000)
> > +
> >  enum NvmeFeatureIds {
> >      NVME_ARBITRATION                = 0x1,
> >      NVME_POWER_MANAGEMENT           = 0x2,
> > @@ -1023,6 +1027,7 @@ enum NvmeNsIdentifierType {
> >
> >
> >  #define NVME_ID_NS_NSFEAT_THIN(nsfeat)      ((nsfeat & 0x1))
> > +#define NVME_ID_NS_NSFEAT_DULBE(nsfeat)     ((nsfeat >> 2) & 0x1)
> >  #define NVME_ID_NS_FLBAS_EXTENDED(flbas)    ((flbas >> 4) & 0x1)
> >  #define NVME_ID_NS_FLBAS_INDEX(flbas)       ((flbas & 0xf))
> >  #define NVME_ID_NS_MC_SEPARATE(mc)          ((mc >> 1) & 0x1)
> > diff --git a/hw/block/nvme-ns.c b/hw/block/nvme-ns.c
> > index 31c80cdf5b5f..f1cc734c60f5 100644
> > --- a/hw/block/nvme-ns.c
> > +++ b/hw/block/nvme-ns.c
> > @@ -33,9 +33,7 @@ static void nvme_ns_init(NvmeNamespace *ns)
> >      NvmeIdNs *id_ns = &ns->id_ns;
> >      int lba_index = NVME_ID_NS_FLBAS_INDEX(ns->id_ns.flbas);
> >
> > -    if (blk_get_flags(ns->blkconf.blk) & BDRV_O_UNMAP) {
> > -        ns->id_ns.dlfeat = 0x9;
> > -    }
> > +    ns->id_ns.dlfeat = 0x9;
> >
> >      id_ns->lbaf[lba_index].ds = 31 - clz32(ns->blkconf.logical_block_size);
> >
> > @@ -44,6 +42,9 @@ static void nvme_ns_init(NvmeNamespace *ns)
> >      /* no thin provisioning */
> >      id_ns->ncap = id_ns->nsze;
> >      id_ns->nuse = id_ns->ncap;
> > +
> > +    /* support DULBE */
> > +    id_ns->nsfeat |= 0x4;
> >  }
> >
> >  static int nvme_ns_init_blk(NvmeCtrl *n, NvmeNamespace *ns, Error **errp)
> > @@ -92,6 +93,7 @@ int nvme_ns_setup(NvmeCtrl *n, NvmeNamespace *ns, Error **errp)
> >      }
> >
> >      nvme_ns_init(ns);
> > +
> >      if (nvme_register_namespace(n, ns, errp)) {
> >          return -1;
> >      }
> > diff --git a/hw/block/nvme.c b/hw/block/nvme.c
> > index a96a996ddc0d..8d75a89fd872 100644
> > --- a/hw/block/nvme.c
> > +++ b/hw/block/nvme.c
> > @@ -105,6 +105,7 @@ static const bool nvme_feature_support[NVME_FID_MAX] = {
> >
> >  static const uint32_t nvme_feature_cap[NVME_FID_MAX] = {
> >      [NVME_TEMPERATURE_THRESHOLD]    = NVME_FEAT_CAP_CHANGE,
> > +    [NVME_ERROR_RECOVERY]           = NVME_FEAT_CAP_CHANGE | NVME_FEAT_CAP_NS,
> >      [NVME_VOLATILE_WRITE_CACHE]     = NVME_FEAT_CAP_CHANGE,
> >      [NVME_NUMBER_OF_QUEUES]         = NVME_FEAT_CAP_CHANGE,
> >      [NVME_ASYNCHRONOUS_EVENT_CONF]  = NVME_FEAT_CAP_CHANGE,
> > @@ -878,6 +879,49 @@ static inline uint16_t nvme_check_bounds(NvmeNamespace *ns, uint64_t slba,
> >      return NVME_SUCCESS;
> >  }
> >
> > +static uint16_t nvme_check_dulbe(NvmeNamespace *ns, uint64_t slba,
> > +                                 uint32_t nlb)
> > +{
> > +    BlockDriverState *bs = blk_bs(ns->blkconf.blk);
> > +
> > +    int64_t pnum = 0, bytes = nvme_l2b(ns, nlb);
> > +    int64_t offset = nvme_l2b(ns, slba);
> > +    bool zeroed;
> > +    int ret;
> > +
> > +    Error *local_err = NULL;
> > +
> > +    /*
> > +     * `pnum` holds the number of bytes after offset that shares the same
> > +     * allocation status as the byte at offset. If `pnum` is different from
> > +     * `bytes`, we should check the allocation status of the next range and
> > +     * continue this until all bytes have been checked.
> > +     */
> > +    do {
> > +        bytes -= pnum;
> > +
> > +        ret = bdrv_block_status(bs, offset, bytes, &pnum, NULL, NULL);
> > +        if (ret < 0) {
> > +            error_setg_errno(&local_err, -ret, "unable to get block status");
> > +            error_report_err(local_err);
> > +
> > +            return NVME_INTERNAL_DEV_ERROR;
> > +        }
> > +
> > +        zeroed = !!(ret & BDRV_BLOCK_ZERO);
> > +
> > +        trace_pci_nvme_block_status(offset, bytes, pnum, ret, zeroed);
> > +
> > +        if (zeroed) {
> > +            return NVME_DULB;
> > +        }
> > +
> > +        offset += pnum;
> > +    } while (pnum != bytes);
> > +
> > +    return NVME_SUCCESS;
> > +}
> > +
> >  static void nvme_aio_err(NvmeRequest *req, int ret)
> >  {
> >      uint16_t status = NVME_SUCCESS;
> > @@ -996,6 +1040,15 @@ static uint16_t nvme_rw(NvmeCtrl *n, NvmeRequest *req)
> >          goto invalid;
> >      }
> >
> > +    if (acct == BLOCK_ACCT_READ) {
> > +        if (NVME_ERR_REC_DULBE(ns->features.err_rec)) {
> > +            status = nvme_check_dulbe(ns, slba, nlb);
> > +            if (status) {
> > +                goto invalid;
> > +            }
> > +        }
> > +    }
> > +
> >      status = nvme_map_dptr(n, data_size, req);
> >      if (status) {
> >          goto invalid;
> > @@ -1643,6 +1696,7 @@ static uint16_t nvme_get_feature(NvmeCtrl *n, NvmeRequest *req)
> >      uint8_t fid = NVME_GETSETFEAT_FID(dw10);
> >      NvmeGetFeatureSelect sel = NVME_GETFEAT_SELECT(dw10);
> >      uint16_t iv;
> > +    NvmeNamespace *ns;
> >
> >      static const uint32_t nvme_feature_default[NVME_FID_MAX] = {
> >          [NVME_ARBITRATION] = NVME_ARB_AB_NOLIMIT,
> > @@ -1705,6 +1759,18 @@ static uint16_t nvme_get_feature(NvmeCtrl *n, NvmeRequest *req)
> >          }
> >
> >          return NVME_INVALID_FIELD | NVME_DNR;
> > +    case NVME_ERROR_RECOVERY:
> > +        if (!nvme_nsid_valid(n, nsid)) {
> > +            return NVME_INVALID_NSID | NVME_DNR;
> > +        }
> > +
> > +        ns = nvme_ns(n, nsid);
> > +        if (unlikely(!ns)) {
> > +            return NVME_INVALID_FIELD | NVME_DNR;
> > +        }
> > +
> > +        result = ns->features.err_rec;
> > +        goto out;
> >      case NVME_VOLATILE_WRITE_CACHE:
> >          result = n->features.vwc;
> >          trace_pci_nvme_getfeat_vwcache(result ? "enabled" : "disabled");
> > @@ -1777,7 +1843,7 @@ static uint16_t nvme_set_feature_timestamp(NvmeCtrl *n, NvmeRequest *req)
> >
> >  static uint16_t nvme_set_feature(NvmeCtrl *n, NvmeRequest *req)
> >  {
> > -    NvmeNamespace *ns;
> > +    NvmeNamespace *ns = NULL;
> >
> >      NvmeCmd *cmd = &req->cmd;
> >      uint32_t dw10 = le32_to_cpu(cmd->cdw10);
> > @@ -1785,6 +1851,7 @@ static uint16_t nvme_set_feature(NvmeCtrl *n, NvmeRequest *req)
> >      uint32_t nsid = le32_to_cpu(cmd->nsid);
> >      uint8_t fid = NVME_GETSETFEAT_FID(dw10);
> >      uint8_t save = NVME_SETFEAT_SAVE(dw10);
> > +    int i;
> >
> >      trace_pci_nvme_setfeat(nvme_cid(req), nsid, fid, save, dw11);
> >
> > @@ -1844,11 +1911,31 @@ static uint16_t nvme_set_feature(NvmeCtrl *n, NvmeRequest *req)
> >                                  NVME_LOG_SMART_INFO);
> >          }
> >
> > +        break;
> > +    case NVME_ERROR_RECOVERY:
> > +        if (nsid == NVME_NSID_BROADCAST) {
> > +            for (i = 1; i <= n->num_namespaces; i++) {
> > +                ns = nvme_ns(n, i);
> > +
> > +                if (!ns) {
> > +                    continue;
> > +                }
> > +
> > +                if (NVME_ID_NS_NSFEAT_DULBE(ns->id_ns.nsfeat)) {
> > +                    ns->features.err_rec = dw11;
> > +                }
> > +            }
> > +
> > +            break;
> > +        }
> > +
> > +        assert(ns);
>
> Klaus,
>
> Sorry, but can we have an assert for a failure that might happen due to
> the user's misbehavior?  Why don't we have NVME_INVALID_NSID for this case?
>
Oh, I see; this is admittedly pretty convoluted. The assert catches the case where you add a new namespace-specific feature but forget to give it the NVME_FEAT_CAP_NS capability in the nvme_feature_cap array. If NVME_FEAT_CAP_NS is correctly set for the feature, the code at the beginning of nvme_set_feature already checks that the namespace id given by the host is valid and resolves it to a namespace, so an invalid nsid is rejected there and the assert can only trip on a programming error in the device model, not on host misbehavior.
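
To make that concrete, here is a rough sketch of the validation that runs before the per-feature switch in nvme_set_feature(). This is paraphrased rather than quoted from the tree, so take it as an illustration of the flow, not the literal code:

/*
 * Sketch of the nvme_set_feature() prologue: any feature that has
 * NVME_FEAT_CAP_NS set in nvme_feature_cap[] has its nsid validated and
 * resolved up front, so `ns` is non-NULL by the time the per-feature
 * switch runs with a non-broadcast nsid.
 */
if (nvme_feature_cap[fid] & NVME_FEAT_CAP_NS) {
    if (nsid != NVME_NSID_BROADCAST) {
        if (!nvme_nsid_valid(n, nsid)) {
            return NVME_INVALID_NSID | NVME_DNR;
        }

        ns = nvme_ns(n, nsid);
        if (unlikely(!ns)) {
            return NVME_INVALID_FIELD | NVME_DNR;
        }
    }
}

So with NVME_FEAT_CAP_CHANGE | NVME_FEAT_CAP_NS set for NVME_ERROR_RECOVERY, as the patch does in nvme_feature_cap, a host-supplied invalid nsid is rejected with NVME_INVALID_NSID before the switch, and the only way to reach assert(ns) with ns == NULL is to add another namespace-specific feature case without updating the capability array.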
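
As an aside, since the commit message only describes it in prose: the configuration it recommends (raw image, 4KiB logical blocks, discard enabled) looks roughly like the following. The image file name is a placeholder and the exact option spelling may differ between QEMU versions:

    -drive id=nvm,file=nvm.img,if=none,format=raw,discard=unmap \
    -device nvme,serial=deadbeef \
    -device nvme-ns,drive=nvm,logical_block_size=4096

With that setup a Write Zeroes of a single 4KiB LBA can punch a hole in the underlying file, and once the host sets bit 16 (DULBE) of the Error Recovery feature, reads of such deallocated blocks complete with the Deallocated or Unwritten Logical Block status.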