On Thu, Oct 22, 2020 at 12:17:36AM +0200, Klaus Jensen wrote:
> +static void nvme_aio_discard_cb(void *opaque, int ret)
> +{
> +    NvmeRequest *req = opaque;
> +    int *discards = req->opaque;
> +
> +    trace_pci_nvme_aio_discard_cb(nvme_cid(req));
> +
> +    if (ret) {
> +        req->status = NVME_INTERNAL_DEV_ERROR;
> +        trace_pci_nvme_err_aio(nvme_cid(req), strerror(ret),
> +                               req->status);
> +    }
> +
> +    if (discards && --(*discards) > 0) {
> +        return;
> +    }
> +
> +    g_free(req->opaque);
> +    req->opaque = NULL;
> +
> +    nvme_enqueue_req_completion(nvme_cq(req), req);
> +}
> +
> +static uint16_t nvme_dsm(NvmeCtrl *n, NvmeRequest *req)
> +{
> +    NvmeNamespace *ns = req->ns;
> +    NvmeDsmCmd *dsm = (NvmeDsmCmd *) &req->cmd;
> +    NvmeDsmRange *range = NULL;
> +    int *discards = NULL;
> +
> +    uint32_t attr = le32_to_cpu(dsm->attributes);
> +    uint32_t nr = (le32_to_cpu(dsm->nr) & 0xff) + 1;
> +
> +    uint16_t status = NVME_SUCCESS;
> +
> +    trace_pci_nvme_dsm(nvme_cid(req), nvme_nsid(ns), nr, attr);
> +
> +    if (attr & NVME_DSMGMT_AD) {
> +        int64_t offset;
> +        size_t len;
> +
> +        range = g_new(NvmeDsmRange, nr);
> +
> +        status = nvme_dma(n, (uint8_t *)range, nr * sizeof(NvmeDsmRange),
> +                          DMA_DIRECTION_TO_DEVICE, req);
> +        if (status) {
> +            goto out;
> +        }
> +
> +        discards = g_new0(int, 1);
> +        req->opaque = discards;
I think you need to initialize discards to 1 so that this function is
holding a reference on it while the asynchronous part is running.
Otherwise, the callback may see 'discards' at 0 and free it while we're
trying to discard the next range. (Rough sketch of what I mean at the
bottom of this mail.)

> +
> +        for (int i = 0; i < nr; i++) {
> +            uint64_t slba = le64_to_cpu(range[i].slba);
> +            uint32_t nlb = le32_to_cpu(range[i].nlb);
> +
> +            if (nvme_check_bounds(n, ns, slba, nlb)) {
> +                trace_pci_nvme_err_invalid_lba_range(slba, nlb,
> +                                                     ns->id_ns.nsze);
> +                continue;
> +            }
> +
> +            trace_pci_nvme_dsm_deallocate(nvme_cid(req), nvme_nsid(ns), slba,
> +                                          nlb);
> +
> +            offset = nvme_l2b(ns, slba);
> +            len = nvme_l2b(ns, nlb);
> +
> +            while (len) {
> +                size_t bytes = MIN(BDRV_REQUEST_MAX_BYTES, len);
> +
> +                blk_aio_pdiscard(ns->blkconf.blk, offset, bytes,
> +                                 nvme_aio_discard_cb, req);
> +
> +                (*discards)++;

The increment ought to be before the aio call so that the _cb can't see
the value before it's accounted for.

> +
> +                offset += bytes;
> +                len -= bytes;
> +            }
> +        }
> +
> +        if (*discards) {
> +            status = NVME_NO_COMPLETE;
> +        } else {
> +            g_free(discards);
> +            req->opaque = NULL;
> +        }
> +    }
> +
> +out:
> +    g_free(range);
> +
> +    return status;
> +}
> +
>  static uint16_t nvme_flush(NvmeCtrl *n, NvmeRequest *req)
>  {
>      block_acct_start(blk_get_stats(req->ns->blkconf.blk), &req->acct, 0,
> @@ -1088,6 +1183,8 @@ static uint16_t nvme_io_cmd(NvmeCtrl *n, NvmeRequest *req)
>      case NVME_CMD_WRITE:
>      case NVME_CMD_READ:
>          return nvme_rw(n, req);
> +    case NVME_CMD_DSM:
> +        return nvme_dsm(n, req);
>      default:
>          trace_pci_nvme_err_invalid_opc(req->cmd.opcode);
>          return NVME_INVALID_OPCODE | NVME_DNR;
> @@ -2810,7 +2907,7 @@ static void nvme_init_ctrl(NvmeCtrl *n, PCIDevice *pci_dev)
>      id->cqes = (0x4 << 4) | 0x4;
>      id->nn = cpu_to_le32(n->num_namespaces);
>      id->oncs = cpu_to_le16(NVME_ONCS_WRITE_ZEROES | NVME_ONCS_TIMESTAMP |
> -                           NVME_ONCS_FEATURES);
> +                           NVME_ONCS_FEATURES | NVME_ONCS_DSM);

I think we should set ID_NS.NPDG and NPDA since there really is a
preferred granularity and alignment (also sketched below).

>
>      id->vwc = 0x1;
>      id->sgls = cpu_to_le32(NVME_CTRL_SGLS_SUPPORT_NO_ALIGN |
> --
> 2.28.0
>
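For the two comments above about 'discards', something along these lines
is what I had in mind -- an untested sketch only, reusing the variables
from your patch, with the per-range setup elided:

        discards = g_new0(int, 1);
        req->opaque = discards;

        /*
         * Hold a reference across submission so a completing aio cannot
         * drop the count to zero and free it while we are still issuing
         * discards for the remaining ranges.
         */
        (*discards)++;

        for (int i = 0; i < nr; i++) {
            /* bounds check and offset/len setup as in your patch */

            while (len) {
                size_t bytes = MIN(BDRV_REQUEST_MAX_BYTES, len);

                /* account for the aio before issuing it */
                (*discards)++;

                blk_aio_pdiscard(ns->blkconf.blk, offset, bytes,
                                 nvme_aio_discard_cb, req);

                offset += bytes;
                len -= bytes;
            }
        }

        /* drop the submission reference */
        if (--(*discards)) {
            status = NVME_NO_COMPLETE;
        } else {
            g_free(discards);
            req->opaque = NULL;
        }

With the extra reference the callback can never observe zero until
nvme_dsm() is done submitting, and the final decrement decides whether
the command completes synchronously or through the callback.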
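As for NPDG/NPDA, roughly something like the below in the namespace
initialization -- again only a sketch; it assumes NvmeIdNs carries the
npdg/npda fields from the spec and that the block backend's
discard_granularity is what we want to advertise:

    uint16_t npdg = ns->blkconf.discard_granularity /
                    ns->blkconf.logical_block_size;

    /* NPDG and NPDA are 0's based values in units of logical blocks */
    id_ns->npdg = cpu_to_le16(npdg - 1);
    id_ns->npda = cpu_to_le16(npdg - 1);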