On Oct 22 23:02, Philippe Mathieu-Daudé wrote: > Hi Klaus, > Hi Philippe,
Thanks for your comments! > On 10/22/20 8:49 PM, Klaus Jensen wrote: > > From: Klaus Jensen <k.jen...@samsung.com> > > > > Add support for the Dataset Management command and the Deallocate > > attribute. Deallocation results in discards being sent to the underlying > > block device. Whether of not the blocks are actually deallocated is > > affected by the same factors as Write Zeroes (see previous commit). > > > > format | discard | dsm (512b) dsm (4kb) dsm (64kb) > > Please use B/KiB units which are unambiguous (kb is for kbits) > (if you queue this yourself, you can fix when applying, no need > to repost). > Thanks, I'll change it. > > ------------------------------------------------------ > > qcow2 ignore n n n > > qcow2 unmap n n y > > raw ignore n n n > > raw unmap n y y > > > > Again, a raw format and 4kb LBAs are preferable. > > > > In order to set the Namespace Preferred Deallocate Granularity and > > Alignment fields (NPDG and NPDA), choose a sane minimum discard > > granularity of 4kb. If we are using a passthru device supporting discard > > at a 512b granularity, user should set the discard_granularity property > > Ditto. > > > explicitly. NPDG and NPDA will also account for the cluster_size of the > > block driver if required (i.e. for QCOW2). > > > > See NVM Express 1.3d, Section 6.7 ("Dataset Management command"). > > > > Signed-off-by: Klaus Jensen <k.jen...@samsung.com> > > --- > > hw/block/nvme.h | 2 + > > include/block/nvme.h | 7 ++- > > hw/block/nvme-ns.c | 36 +++++++++++++-- > > hw/block/nvme.c | 101 ++++++++++++++++++++++++++++++++++++++++++- > > 4 files changed, 140 insertions(+), 6 deletions(-) > > > > diff --git a/hw/block/nvme.h b/hw/block/nvme.h > > index e080a2318a50..574333caa3f9 100644 > > --- a/hw/block/nvme.h > > +++ b/hw/block/nvme.h > > @@ -28,6 +28,7 @@ typedef struct NvmeRequest { > > struct NvmeNamespace *ns; > > BlockAIOCB *aiocb; > > uint16_t status; > > + void *opaque; > > NvmeCqe cqe; > > NvmeCmd cmd; > > BlockAcctCookie acct; > > @@ -60,6 +61,7 @@ static inline const char *nvme_io_opc_str(uint8_t opc) > > case NVME_CMD_WRITE: return "NVME_NVM_CMD_WRITE"; > > case NVME_CMD_READ: return "NVME_NVM_CMD_READ"; > > case NVME_CMD_WRITE_ZEROES: return "NVME_NVM_CMD_WRITE_ZEROES"; > > + case NVME_CMD_DSM: return "NVME_NVM_CMD_DSM"; > > default: return "NVME_NVM_CMD_UNKNOWN"; > > } > > } > > diff --git a/include/block/nvme.h b/include/block/nvme.h > > index 966c3bb304bd..e95ff6ca9b37 100644 > > --- a/include/block/nvme.h > > +++ b/include/block/nvme.h > > @@ -990,7 +990,12 @@ typedef struct QEMU_PACKED NvmeIdNs { > > uint16_t nabspf; > > uint16_t noiob; > > uint8_t nvmcap[16]; > > - uint8_t rsvd64[40]; > > + uint16_t npwg; > > + uint16_t npwa; > > + uint16_t npdg; > > + uint16_t npda; > > + uint16_t nows; > > + uint8_t rsvd74[30]; > > uint8_t nguid[16]; > > uint64_t eui64; > > NvmeLBAF lbaf[16]; > > If you consider "block/nvme.h" shared by 2 different subsystems, > it is better to add the changes in a separate patch. That way > the changes can be acked individually. > Sure. Some other stuff here warrents a v6 I think, so I will split it. > > diff --git a/hw/block/nvme-ns.c b/hw/block/nvme-ns.c > > index f1cc734c60f5..840651db7256 100644 > > --- a/hw/block/nvme-ns.c > > +++ b/hw/block/nvme-ns.c > > @@ -28,10 +28,14 @@ > > #include "nvme.h" > > #include "nvme-ns.h" > > -static void nvme_ns_init(NvmeNamespace *ns) > > +#define MIN_DISCARD_GRANULARITY (4 * KiB) > > + > > +static int nvme_ns_init(NvmeNamespace *ns, Error **errp) > > Hmm the Error* argument could be squashed in "hw/block/nvme: > support multiple namespaces". Else better split patch in dumb > units IMHO (maybe a reviewer's taste). > Yeah, I guess I can squash that in. > > { > > + BlockDriverInfo bdi; > > NvmeIdNs *id_ns = &ns->id_ns; > > int lba_index = NVME_ID_NS_FLBAS_INDEX(ns->id_ns.flbas); > > + int npdg, ret; > > ns->id_ns.dlfeat = 0x9; > > @@ -43,8 +47,25 @@ static void nvme_ns_init(NvmeNamespace *ns) > > id_ns->ncap = id_ns->nsze; > > id_ns->nuse = id_ns->ncap; > > - /* support DULBE */ > > - id_ns->nsfeat |= 0x4; > > + /* support DULBE and I/O optimization fields */ > > + id_ns->nsfeat |= (0x4 | 0x10); > > The comment helps, but isn't needed if you use explicit definitions > for these flags. You already introduced the NVME_ID_NS_NSFEAT_DULBE > and NVME_ID_NS_FLBAS_EXTENDED but they are restricted to extract bits. > This is why I personally prefer the registerfields API (see > "hw/registerfields.h"). > I've been wanting to fix those constants - but the convention that they only extract bits pre-dates the nvme device and is from when the nvme block driver was introduced - I have just been following the precedence by defining them like that. > > + > > + npdg = ns->blkconf.discard_granularity / > > ns->blkconf.logical_block_size; > > + > > + ret = bdrv_get_info(blk_bs(ns->blkconf.blk), &bdi); > > + if (ret < 0) { > > + error_setg_errno(errp, -ret, "could not get block driver info"); > > + return ret; > > + } > > + > > + if (bdi.cluster_size && > > + bdi.cluster_size > ns->blkconf.discard_granularity) { > > + npdg = bdi.cluster_size / ns->blkconf.logical_block_size; > > + } > > + > > + id_ns->npda = id_ns->npdg = npdg - 1; > > + > > + return 0; > > } > > static int nvme_ns_init_blk(NvmeCtrl *n, NvmeNamespace *ns, Error **errp) > > @@ -59,6 +80,11 @@ static int nvme_ns_init_blk(NvmeCtrl *n, NvmeNamespace > > *ns, Error **errp) > > return -1; > > } > > + if (ns->blkconf.discard_granularity == -1) { > > + ns->blkconf.discard_granularity = > > + MAX(ns->blkconf.logical_block_size, MIN_DISCARD_GRANULARITY); > > + } > > + > > ns->size = blk_getlength(ns->blkconf.blk); > > if (ns->size < 0) { > > error_setg_errno(errp, -ns->size, "could not get blockdev size"); > > @@ -92,7 +118,9 @@ int nvme_ns_setup(NvmeCtrl *n, NvmeNamespace *ns, Error > > **errp) > > return -1; > > } > > - nvme_ns_init(ns); > > + if (nvme_ns_init(ns, errp)) { > > + return -1; > > + } > > if (nvme_register_namespace(n, ns, errp)) { > > return -1; > > diff --git a/hw/block/nvme.c b/hw/block/nvme.c > > index 4ab0705f5a92..7acb9e9dc38a 100644 > > --- a/hw/block/nvme.c > > +++ b/hw/block/nvme.c > > @@ -959,6 +959,103 @@ static void nvme_rw_cb(void *opaque, int ret) > > nvme_enqueue_req_completion(nvme_cq(req), req); > > } > > +static void nvme_aio_discard_cb(void *opaque, int ret) > > +{ > > + NvmeRequest *req = opaque; > > + int *discards = req->opaque; > > + > > + trace_pci_nvme_aio_discard_cb(nvme_cid(req)); > > + > > + if (ret) { > > + req->status = NVME_INTERNAL_DEV_ERROR; > > + trace_pci_nvme_err_aio(nvme_cid(req), strerror(ret), > > + req->status); > > + } > > + > > + if (discards && --(*discards) > 0) { > > This line is hard to read. > Yes. Probably too clever for my own good. I'll fix it up. > > + return; > > + } > > + > > + g_free(req->opaque); > > + req->opaque = NULL; > > + > > + nvme_enqueue_req_completion(nvme_cq(req), req); > > +} > > + > > +static uint16_t nvme_dsm(NvmeCtrl *n, NvmeRequest *req) > > +{ > > + NvmeNamespace *ns = req->ns; > > + NvmeDsmCmd *dsm = (NvmeDsmCmd *) &req->cmd; > > + NvmeDsmRange *range = NULL; > > g_autofree? > What sorcery is this?! I think I'll just replace it with a stack allocation like Keith suggested. > > + int *discards = NULL; > > + > > + uint32_t attr = le32_to_cpu(dsm->attributes); > > + uint32_t nr = (le32_to_cpu(dsm->nr) & 0xff) + 1; > > + > > + uint16_t status = NVME_SUCCESS; > > + > > + trace_pci_nvme_dsm(nvme_cid(req), nvme_nsid(ns), nr, attr); > > + > > + if (attr & NVME_DSMGMT_AD) { > > + int64_t offset; > > + size_t len; > > + > > + range = g_new(NvmeDsmRange, nr); > > + > > + status = nvme_dma(n, (uint8_t *)range, nr * sizeof(NvmeDsmRange), > > + DMA_DIRECTION_TO_DEVICE, req); > > + if (status) { > > + goto out; > > + } > > + > > + discards = g_new(int, 1); > > + *discards = 1; > > + req->opaque = discards; > > If opaque is a pointer, why not instead using it as an uintptr_t > discards directly without using the heap? > It was a "keep it simple, stupid" thing to just do the allocation. DSM is typically not on the fast path ;) But there is really no reason not to use that here.
signature.asc
Description: PGP signature