On 11/12 20:59, Klaus Jensen wrote:
> From: Klaus Jensen <k.jen...@samsung.com>
> 
> Add support for reporting the Deallocated or Unwritten Logical Block
> Error (DULBE).
> 
> Rely on the block status flags reported by the block layer and consider
> any block with the BDRV_BLOCK_ZERO flag to be deallocated.
> 
> Multiple factors affect when a Write Zeroes command result in
> deallocation of blocks.
> 
>   * the underlying file system block size
>   * the blockdev format
>   * the 'discard' and 'logical_block_size' parameters
> 
>      format | discard | wz (512B)  wz (4KiB)  wz (64KiB)
>     -----------------------------------------------------
>       qcow2    ignore   n          n          y
>       qcow2    unmap    n          n          y
>       raw      ignore   n          y          y
>       raw      unmap    n          y          y
> 
> So, this works best with an image in raw format and 4KiB LBAs, since
> holes can then be punched on a per-block basis (this assumes a file
> system with a 4kb block size, YMMV). A qcow2 image, uses a cluster size
> of 64KiB by default and blocks will only be marked deallocated if a full
> cluster is zeroed or discarded. However, this *is* consistent with the
> spec since Write Zeroes "should" deallocate the block if the Deallocate
> attribute is set and "may" deallocate if the Deallocate attribute is not
> set. Thus, we always try to deallocate (the BDRV_REQ_MAY_UNMAP flag is
> always set).
> 
> Signed-off-by: Klaus Jensen <k.jen...@samsung.com>
> ---
>  hw/block/nvme-ns.h    |  4 ++
>  include/block/nvme.h  |  5 +++
>  hw/block/nvme-ns.c    |  8 ++--
>  hw/block/nvme.c       | 91 ++++++++++++++++++++++++++++++++++++++++++-
>  hw/block/trace-events |  4 ++
>  5 files changed, 107 insertions(+), 5 deletions(-)
> 
> diff --git a/hw/block/nvme-ns.h b/hw/block/nvme-ns.h
> index 83734f4606e1..44bf6271b744 100644
> --- a/hw/block/nvme-ns.h
> +++ b/hw/block/nvme-ns.h
> @@ -31,6 +31,10 @@ typedef struct NvmeNamespace {
>      NvmeIdNs     id_ns;
>  
>      NvmeNamespaceParams params;
> +
> +    struct {
> +        uint32_t err_rec;
> +    } features;
>  } NvmeNamespace;
>  
>  static inline uint32_t nvme_nsid(NvmeNamespace *ns)
> diff --git a/include/block/nvme.h b/include/block/nvme.h
> index 8a46d9cf015f..966c3bb304bd 100644
> --- a/include/block/nvme.h
> +++ b/include/block/nvme.h
> @@ -687,6 +687,7 @@ enum NvmeStatusCodes {
>      NVME_E2E_REF_ERROR          = 0x0284,
>      NVME_CMP_FAILURE            = 0x0285,
>      NVME_ACCESS_DENIED          = 0x0286,
> +    NVME_DULB                   = 0x0287,
>      NVME_MORE                   = 0x2000,
>      NVME_DNR                    = 0x4000,
>      NVME_NO_COMPLETE            = 0xffff,
> @@ -903,6 +904,9 @@ enum NvmeIdCtrlLpa {
>  #define NVME_AEC_NS_ATTR(aec)       ((aec >> 8) & 0x1)
>  #define NVME_AEC_FW_ACTIVATION(aec) ((aec >> 9) & 0x1)
>  
> +#define NVME_ERR_REC_TLER(err_rec)  (err_rec & 0xffff)
> +#define NVME_ERR_REC_DULBE(err_rec) (err_rec & 0x10000)
> +
>  enum NvmeFeatureIds {
>      NVME_ARBITRATION                = 0x1,
>      NVME_POWER_MANAGEMENT           = 0x2,
> @@ -1023,6 +1027,7 @@ enum NvmeNsIdentifierType {
>  
>  
>  #define NVME_ID_NS_NSFEAT_THIN(nsfeat)      ((nsfeat & 0x1))
> +#define NVME_ID_NS_NSFEAT_DULBE(nsfeat)     ((nsfeat >> 2) & 0x1)
>  #define NVME_ID_NS_FLBAS_EXTENDED(flbas)    ((flbas >> 4) & 0x1)
>  #define NVME_ID_NS_FLBAS_INDEX(flbas)       ((flbas & 0xf))
>  #define NVME_ID_NS_MC_SEPARATE(mc)          ((mc >> 1) & 0x1)
> diff --git a/hw/block/nvme-ns.c b/hw/block/nvme-ns.c
> index 31c80cdf5b5f..f1cc734c60f5 100644
> --- a/hw/block/nvme-ns.c
> +++ b/hw/block/nvme-ns.c
> @@ -33,9 +33,7 @@ static void nvme_ns_init(NvmeNamespace *ns)
>      NvmeIdNs *id_ns = &ns->id_ns;
>      int lba_index = NVME_ID_NS_FLBAS_INDEX(ns->id_ns.flbas);
>  
> -    if (blk_get_flags(ns->blkconf.blk) & BDRV_O_UNMAP) {
> -        ns->id_ns.dlfeat = 0x9;
> -    }
> +    ns->id_ns.dlfeat = 0x9;
>  
>      id_ns->lbaf[lba_index].ds = 31 - clz32(ns->blkconf.logical_block_size);
>  
> @@ -44,6 +42,9 @@ static void nvme_ns_init(NvmeNamespace *ns)
>      /* no thin provisioning */
>      id_ns->ncap = id_ns->nsze;
>      id_ns->nuse = id_ns->ncap;
> +
> +    /* support DULBE */
> +    id_ns->nsfeat |= 0x4;
>  }
>  
>  static int nvme_ns_init_blk(NvmeCtrl *n, NvmeNamespace *ns, Error **errp)
> @@ -92,6 +93,7 @@ int nvme_ns_setup(NvmeCtrl *n, NvmeNamespace *ns, Error 
> **errp)
>      }
>  
>      nvme_ns_init(ns);
> +
>      if (nvme_register_namespace(n, ns, errp)) {
>          return -1;
>      }
> diff --git a/hw/block/nvme.c b/hw/block/nvme.c
> index a96a996ddc0d..8d75a89fd872 100644
> --- a/hw/block/nvme.c
> +++ b/hw/block/nvme.c
> @@ -105,6 +105,7 @@ static const bool nvme_feature_support[NVME_FID_MAX] = {
>  
>  static const uint32_t nvme_feature_cap[NVME_FID_MAX] = {
>      [NVME_TEMPERATURE_THRESHOLD]    = NVME_FEAT_CAP_CHANGE,
> +    [NVME_ERROR_RECOVERY]           = NVME_FEAT_CAP_CHANGE | 
> NVME_FEAT_CAP_NS,
>      [NVME_VOLATILE_WRITE_CACHE]     = NVME_FEAT_CAP_CHANGE,
>      [NVME_NUMBER_OF_QUEUES]         = NVME_FEAT_CAP_CHANGE,
>      [NVME_ASYNCHRONOUS_EVENT_CONF]  = NVME_FEAT_CAP_CHANGE,
> @@ -878,6 +879,49 @@ static inline uint16_t nvme_check_bounds(NvmeNamespace 
> *ns, uint64_t slba,
>      return NVME_SUCCESS;
>  }
>  
> +static uint16_t nvme_check_dulbe(NvmeNamespace *ns, uint64_t slba,
> +                                 uint32_t nlb)
> +{
> +    BlockDriverState *bs = blk_bs(ns->blkconf.blk);
> +
> +    int64_t pnum = 0, bytes = nvme_l2b(ns, nlb);
> +    int64_t offset = nvme_l2b(ns, slba);
> +    bool zeroed;
> +    int ret;
> +
> +    Error *local_err = NULL;
> +
> +    /*
> +     * `pnum` holds the number of bytes after offset that shares the same
> +     * allocation status as the byte at offset. If `pnum` is different from
> +     * `bytes`, we should check the allocation status of the next range and
> +     * continue this until all bytes have been checked.
> +     */
> +    do {
> +        bytes -= pnum;
> +
> +        ret = bdrv_block_status(bs, offset, bytes, &pnum, NULL, NULL);
> +        if (ret < 0) {
> +            error_setg_errno(&local_err, -ret, "unable to get block status");
> +            error_report_err(local_err);
> +
> +            return NVME_INTERNAL_DEV_ERROR;
> +        }
> +
> +        zeroed = !!(ret & BDRV_BLOCK_ZERO);
> +
> +        trace_pci_nvme_block_status(offset, bytes, pnum, ret, zeroed);
> +
> +        if (zeroed) {
> +            return NVME_DULB;
> +        }
> +
> +        offset += pnum;
> +    } while (pnum != bytes);
> +
> +    return NVME_SUCCESS;
> +}
> +
>  static void nvme_aio_err(NvmeRequest *req, int ret)
>  {
>      uint16_t status = NVME_SUCCESS;
> @@ -996,6 +1040,15 @@ static uint16_t nvme_rw(NvmeCtrl *n, NvmeRequest *req)
>          goto invalid;
>      }
>  
> +    if (acct == BLOCK_ACCT_READ) {
> +        if (NVME_ERR_REC_DULBE(ns->features.err_rec)) {
> +            status = nvme_check_dulbe(ns, slba, nlb);
> +            if (status) {
> +                goto invalid;
> +            }
> +        }
> +    }
> +
>      status = nvme_map_dptr(n, data_size, req);
>      if (status) {
>          goto invalid;
> @@ -1643,6 +1696,7 @@ static uint16_t nvme_get_feature(NvmeCtrl *n, 
> NvmeRequest *req)
>      uint8_t fid = NVME_GETSETFEAT_FID(dw10);
>      NvmeGetFeatureSelect sel = NVME_GETFEAT_SELECT(dw10);
>      uint16_t iv;
> +    NvmeNamespace *ns;
>  
>      static const uint32_t nvme_feature_default[NVME_FID_MAX] = {
>          [NVME_ARBITRATION] = NVME_ARB_AB_NOLIMIT,
> @@ -1705,6 +1759,18 @@ static uint16_t nvme_get_feature(NvmeCtrl *n, 
> NvmeRequest *req)
>          }
>  
>          return NVME_INVALID_FIELD | NVME_DNR;
> +    case NVME_ERROR_RECOVERY:
> +        if (!nvme_nsid_valid(n, nsid)) {
> +            return NVME_INVALID_NSID | NVME_DNR;
> +        }
> +
> +        ns = nvme_ns(n, nsid);
> +        if (unlikely(!ns)) {
> +            return NVME_INVALID_FIELD | NVME_DNR;
> +        }
> +
> +        result = ns->features.err_rec;
> +        goto out;
>      case NVME_VOLATILE_WRITE_CACHE:
>          result = n->features.vwc;
>          trace_pci_nvme_getfeat_vwcache(result ? "enabled" : "disabled");
> @@ -1777,7 +1843,7 @@ static uint16_t nvme_set_feature_timestamp(NvmeCtrl *n, 
> NvmeRequest *req)
>  
>  static uint16_t nvme_set_feature(NvmeCtrl *n, NvmeRequest *req)
>  {
> -    NvmeNamespace *ns;
> +    NvmeNamespace *ns = NULL;
>  
>      NvmeCmd *cmd = &req->cmd;
>      uint32_t dw10 = le32_to_cpu(cmd->cdw10);
> @@ -1785,6 +1851,7 @@ static uint16_t nvme_set_feature(NvmeCtrl *n, 
> NvmeRequest *req)
>      uint32_t nsid = le32_to_cpu(cmd->nsid);
>      uint8_t fid = NVME_GETSETFEAT_FID(dw10);
>      uint8_t save = NVME_SETFEAT_SAVE(dw10);
> +    int i;
>  
>      trace_pci_nvme_setfeat(nvme_cid(req), nsid, fid, save, dw11);
>  
> @@ -1844,11 +1911,31 @@ static uint16_t nvme_set_feature(NvmeCtrl *n, 
> NvmeRequest *req)
>                                 NVME_LOG_SMART_INFO);
>          }
>  
> +        break;
> +    case NVME_ERROR_RECOVERY:
> +        if (nsid == NVME_NSID_BROADCAST) {
> +            for (i = 1; i <= n->num_namespaces; i++) {
> +                ns = nvme_ns(n, i);
> +
> +                if (!ns) {
> +                    continue;
> +                }
> +
> +                if (NVME_ID_NS_NSFEAT_DULBE(ns->id_ns.nsfeat)) {
> +                    ns->features.err_rec = dw11;
> +                }
> +            }
> +
> +            break;
> +        }
> +
> +        assert(ns);

Klaus,

Sorry, but can we have assert for the failure that might happen due to
user's mis-behavior?  Why don't we have NVME_INVALID_NSID for this case?

Reply via email to