[PATCH 0/2] hw/block/nvme: oncs and write uncorrectable support
From: Klaus Jensen First, add support for toggling optional features through the new `oncs` nvme device parameter. Secondly, add support for the Write Uncorrectable command. Gollu Appalanaidu (2): hw/block/nvme: add oncs device parameter hw/block/nvme: add write uncorrectable command docs/specs/nvme.txt | 3 + hw/block/nvme-ns.h| 2 + hw/block/nvme.h | 8 ++ hw/block/nvme-ns.c| 2 + hw/block/nvme.c | 166 +++--- hw/block/trace-events | 1 + 6 files changed, 140 insertions(+), 42 deletions(-) -- 2.30.0
[PATCH 1/2] hw/block/nvme: add oncs device parameter
From: Gollu Appalanaidu Add the 'oncs' nvme device parameter to allow optional features to be enabled/disabled explicitly. Since most of these are optional commands, make the CSE log pages dynamic to account for the value of ONCS. Signed-off-by: Gollu Appalanaidu Signed-off-by: Klaus Jensen --- hw/block/nvme.h | 7 hw/block/nvme.c | 101 2 files changed, 74 insertions(+), 34 deletions(-) diff --git a/hw/block/nvme.h b/hw/block/nvme.h index cb2b5175f1a1..98082b2dfba3 100644 --- a/hw/block/nvme.h +++ b/hw/block/nvme.h @@ -9,6 +9,7 @@ #define NVME_DEFAULT_ZONE_SIZE (128 * MiB) #define NVME_DEFAULT_MAX_ZA_SIZE (128 * KiB) +#define NVME_MAX_COMMANDS 0x100 typedef struct NvmeParams { char *serial; @@ -22,6 +23,7 @@ typedef struct NvmeParams { bool use_intel_id; uint32_t zasl_bs; bool legacy_cmb; +uint16_t oncs; } NvmeParams; typedef struct NvmeAsyncEvent { @@ -183,6 +185,11 @@ typedef struct NvmeCtrl { NvmeCQueue admin_cq; NvmeIdCtrl id_ctrl; NvmeFeatureVal features; + +struct { +uint32_t nvm[NVME_MAX_COMMANDS]; +uint32_t zoned[NVME_MAX_COMMANDS]; +} iocs; } NvmeCtrl; static inline NvmeNamespace *nvme_ns(NvmeCtrl *n, uint32_t nsid) diff --git a/hw/block/nvme.c b/hw/block/nvme.c index 93345bf3c1fc..e5f725d7 100644 --- a/hw/block/nvme.c +++ b/hw/block/nvme.c @@ -71,6 +71,11 @@ * data size being in effect. By setting this property to 0, users can make * ZASL to be equal to MDTS. This property only affects zoned namespaces. * + * - `oncs` + * This field indicates the optional NVM commands and features supported + * by the controller. To add support for the optional feature, needs to + * set the corresponding support indicated bit. 
+ * * nvme namespace device parameters * * - `subsys` @@ -165,7 +170,7 @@ static const uint32_t nvme_feature_cap[NVME_FID_MAX] = { [NVME_TIMESTAMP]= NVME_FEAT_CAP_CHANGE, }; -static const uint32_t nvme_cse_acs[256] = { +static const uint32_t nvme_cse_acs[NVME_MAX_COMMANDS] = { [NVME_ADM_CMD_DELETE_SQ]= NVME_CMD_EFF_CSUPP, [NVME_ADM_CMD_CREATE_SQ]= NVME_CMD_EFF_CSUPP, [NVME_ADM_CMD_GET_LOG_PAGE] = NVME_CMD_EFF_CSUPP, @@ -178,30 +183,7 @@ static const uint32_t nvme_cse_acs[256] = { [NVME_ADM_CMD_ASYNC_EV_REQ] = NVME_CMD_EFF_CSUPP, }; -static const uint32_t nvme_cse_iocs_none[256]; - -static const uint32_t nvme_cse_iocs_nvm[256] = { -[NVME_CMD_FLUSH]= NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC, -[NVME_CMD_WRITE_ZEROES] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC, -[NVME_CMD_WRITE]= NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC, -[NVME_CMD_READ] = NVME_CMD_EFF_CSUPP, -[NVME_CMD_DSM] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC, -[NVME_CMD_COPY] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC, -[NVME_CMD_COMPARE] = NVME_CMD_EFF_CSUPP, -}; - -static const uint32_t nvme_cse_iocs_zoned[256] = { -[NVME_CMD_FLUSH]= NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC, -[NVME_CMD_WRITE_ZEROES] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC, -[NVME_CMD_WRITE]= NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC, -[NVME_CMD_READ] = NVME_CMD_EFF_CSUPP, -[NVME_CMD_DSM] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC, -[NVME_CMD_COPY] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC, -[NVME_CMD_COMPARE] = NVME_CMD_EFF_CSUPP, -[NVME_CMD_ZONE_APPEND] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC, -[NVME_CMD_ZONE_MGMT_SEND] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC, -[NVME_CMD_ZONE_MGMT_RECV] = NVME_CMD_EFF_CSUPP, -}; +static const uint32_t nvme_cse_iocs_none[NVME_MAX_COMMANDS]; static void nvme_process_sq(void *opaque); @@ -2884,17 +2866,17 @@ static uint16_t nvme_cmd_effects(NvmeCtrl *n, uint8_t csi, uint32_t buf_len, switch (NVME_CC_CSS(n->bar.cc)) { case NVME_CC_CSS_NVM: -src_iocs = nvme_cse_iocs_nvm; +src_iocs = n->iocs.nvm; /* fall through */ case 
NVME_CC_CSS_ADMIN_ONLY: break; case NVME_CC_CSS_CSI: switch (csi) { case NVME_CSI_NVM: -src_iocs = nvme_cse_iocs_nvm; +src_iocs = n->iocs.nvm; break; case NVME_CSI_ZONED: -src_iocs = nvme_cse_iocs_zoned; +src_iocs = n->iocs.zoned; break; } } @@ -3422,6 +3404,10 @@ static uint16_t nvme_get_feature(NvmeCtrl *n, NvmeRequest *req) return NVME_INVALID_FIELD | NVME_DNR; } +if (!(le16_to_cpu(n->id_ctrl.oncs) & NVME_ONCS_FEATURES) && sel) { +return NVME_INVALID_FIELD | NVME_DNR; +} + if (nvme_feature_cap[fid] & NVME_FEAT_CAP_NS) { if (!nvme_nsid_v
[PATCH 2/2] hw/block/nvme: add write uncorrectable command
From: Gollu Appalanaidu Add support for marking blocks invalid with the Write Uncorrectable command. Block status is tracked in a (non-persistent) bitmap that is checked on all reads and written to on all writes. This is potentially expensive, so keep Write Uncorrectable disabled by default. Signed-off-by: Gollu Appalanaidu Signed-off-by: Klaus Jensen --- docs/specs/nvme.txt | 3 ++ hw/block/nvme-ns.h| 2 ++ hw/block/nvme.h | 1 + hw/block/nvme-ns.c| 2 ++ hw/block/nvme.c | 65 +-- hw/block/trace-events | 1 + 6 files changed, 66 insertions(+), 8 deletions(-) diff --git a/docs/specs/nvme.txt b/docs/specs/nvme.txt index 56d393884e7a..88f9cc278d4c 100644 --- a/docs/specs/nvme.txt +++ b/docs/specs/nvme.txt @@ -19,5 +19,8 @@ Known issues * The accounting numbers in the SMART/Health are reset across power cycles +* Marking blocks invalid with the Write Uncorrectable is not persisted across + power cycles. + * Interrupt Coalescing is not supported and is disabled by default in violation of the specification. 
diff --git a/hw/block/nvme-ns.h b/hw/block/nvme-ns.h index 7af6884862b5..15fa422ded03 100644 --- a/hw/block/nvme-ns.h +++ b/hw/block/nvme-ns.h @@ -72,6 +72,8 @@ typedef struct NvmeNamespace { struct { uint32_t err_rec; } features; + +unsigned long *uncorrectable; } NvmeNamespace; static inline uint32_t nvme_nsid(NvmeNamespace *ns) diff --git a/hw/block/nvme.h b/hw/block/nvme.h index 98082b2dfba3..9b8f85b9cf16 100644 --- a/hw/block/nvme.h +++ b/hw/block/nvme.h @@ -68,6 +68,7 @@ static inline const char *nvme_io_opc_str(uint8_t opc) case NVME_CMD_FLUSH:return "NVME_NVM_CMD_FLUSH"; case NVME_CMD_WRITE:return "NVME_NVM_CMD_WRITE"; case NVME_CMD_READ: return "NVME_NVM_CMD_READ"; +case NVME_CMD_WRITE_UNCOR: return "NVME_CMD_WRITE_UNCOR"; case NVME_CMD_COMPARE: return "NVME_NVM_CMD_COMPARE"; case NVME_CMD_WRITE_ZEROES: return "NVME_NVM_CMD_WRITE_ZEROES"; case NVME_CMD_DSM: return "NVME_NVM_CMD_DSM"; diff --git a/hw/block/nvme-ns.c b/hw/block/nvme-ns.c index ade46e2f3739..742bbc4b4b62 100644 --- a/hw/block/nvme-ns.c +++ b/hw/block/nvme-ns.c @@ -72,6 +72,8 @@ static int nvme_ns_init(NvmeNamespace *ns, Error **errp) id_ns->mcl = cpu_to_le32(ns->params.mcl); id_ns->msrc = ns->params.msrc; +ns->uncorrectable = bitmap_new(id_ns->nsze); + return 0; } diff --git a/hw/block/nvme.c b/hw/block/nvme.c index e5f725d7..56048046c193 100644 --- a/hw/block/nvme.c +++ b/hw/block/nvme.c @@ -1112,6 +1112,20 @@ static uint16_t nvme_check_dulbe(NvmeNamespace *ns, uint64_t slba, return NVME_SUCCESS; } +static inline uint16_t nvme_check_uncor(NvmeNamespace *ns, uint64_t slba, +uint32_t nlb) +{ +uint64_t elba = nlb + slba; + +if (ns->uncorrectable) { +if (find_next_bit(ns->uncorrectable, elba, slba) < elba) { +return NVME_UNRECOVERED_READ | NVME_DNR; +} +} + +return NVME_SUCCESS; +} + static void nvme_aio_err(NvmeRequest *req, int ret) { uint16_t status = NVME_SUCCESS; @@ -1423,14 +1437,24 @@ static void nvme_rw_cb(void *opaque, int ret) BlockAcctCookie *acct = &req->acct; BlockAcctStats *stats = 
blk_get_stats(blk); +bool is_write = nvme_is_write(req); + trace_pci_nvme_rw_cb(nvme_cid(req), blk_name(blk)); -if (ns->params.zoned && nvme_is_write(req)) { +if (ns->params.zoned && is_write) { nvme_finalize_zoned_write(ns, req); } if (!ret) { block_acct_done(stats, acct); + +if (is_write) { +NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd; +uint64_t slba = le64_to_cpu(rw->slba); +uint32_t nlb = le16_to_cpu(rw->nlb) + 1; + +bitmap_clear(ns->uncorrectable, slba, nlb); +} } else { block_acct_failed(stats, acct); nvme_aio_err(req, ret); @@ -1521,13 +1545,13 @@ static void nvme_copy_cb(void *opaque, int ret) { NvmeRequest *req = opaque; NvmeNamespace *ns = req->ns; +NvmeCopyCmd *copy = (NvmeCopyCmd *)&req->cmd; +uint64_t sdlba = le64_to_cpu(copy->sdlba); struct nvme_copy_ctx *ctx = req->opaque; trace_pci_nvme_copy_cb(nvme_cid(req)); if (ns->params.zoned) { -NvmeCopyCmd *copy = (NvmeCopyCmd *)&req->cmd; -uint64_t sdlba = le64_to_cpu(copy->sdlba); NvmeZone *zone = nvme_get_zone_by_slba(ns, sdlba); __nvme_advance_zone_wp(ns, zone, ctx->nlb); @@ -1535,6 +1559,7 @@ static void nvme_copy_cb(void *opaque, int ret) if (!ret) { block_acct_done(blk_get_stats(ns->blkconf.blk), &req->acct); +bitmap_clear(ns->uncorrectable, sdlba, ctx->nlb); } else { block_acct_failed(blk_get_stats(ns->blkconf.blk), &req->acct); nvme_aio_err(req, ret); @@ -1953,6 +1978,12 @@ static uint16_t n
Re: [PATCH] hw/block/nvme: add broadcast nsid support flush command
On Feb 10 12:32, Keith Busch wrote: > On Mon, Feb 08, 2021 at 08:08:17PM +0100, Klaus Jensen wrote: > > On Feb 9 03:59, Keith Busch wrote: > > > This whole implementation would be much simpler with the synchronous > > > blk_flush() routine instead of the AIO equivalent. This is not really a > > > performant feature, so I don't think it's critical to get these > > > operations happening in parallel. What do you think? > > > > It would definitely be simpler, but I believe that if there is a lot to > > flush, then we won't just block the nvme device. We are holding the Big > > QEMU Lock and will block most other devices as well. > > Hm, I feel like you may have told me this same explanation for a > different patch. :) Okay, I'm convinced: this is the way. > Is that an Acked-by? ;) And yes, I might have used that argument for Copy, can't remember ;) signature.asc Description: PGP signature
Re: [PATCH] hw/block/nvme: improve invalid zasl value reporting
On 2/9/21 8:39 PM, Dmitry Fomichev wrote: > On Mon, 2021-02-08 at 09:25 +0100, Klaus Jensen wrote: >> From: Klaus Jensen >> >> The Zone Append Size Limit (ZASL) must be at least 4096 bytes, so >> improve the user experience by adding an early parameter check in >> nvme_check_constraints. >> >> When ZASL is still too small due to the host configuring the device for >> an even larger page size, convert the trace point in nvme_start_ctrl to >> an NVME_GUEST_ERR such that this is logged by QEMU instead of only >> traced. >> >> Reported-by: "i...@dantalion.nl" Apparently the reporter signed 'Corne'. >> Cc: Dmitry Fomichev >> Signed-off-by: Klaus Jensen >> --- >> hw/block/nvme.c | 12 ++-- >> 1 file changed, 10 insertions(+), 2 deletions(-) >> >> diff --git a/hw/block/nvme.c b/hw/block/nvme.c >> index c2f0c88fbf39..d96888cd2333 100644 >> --- a/hw/block/nvme.c >> +++ b/hw/block/nvme.c >> @@ -3983,8 +3983,10 @@ static int nvme_start_ctrl(NvmeCtrl *n) >> n->zasl = n->params.mdts; >> } else { >> if (n->params.zasl_bs < n->page_size) { >> -trace_pci_nvme_err_startfail_zasl_too_small(n->params.zasl_bs, >> -n->page_size); >> +NVME_GUEST_ERR(pci_nvme_err_startfail_zasl_too_small, >> + "Zone Append Size Limit (ZASL) of %d bytes is >> too " >> + "small; must be at least %d bytes", >> + n->params.zasl_bs, n->page_size); >> return -1; >> } >> n->zasl = 31 - clz32(n->params.zasl_bs / n->page_size); >> @@ -4503,6 +4505,12 @@ static void nvme_check_constraints(NvmeCtrl *n, Error >> **errp) >> error_setg(errp, "zone append size limit has to be a power of >> 2"); >> return; >> } >> + >> +if (n->params.zasl_bs < 4096) { >> +error_setg(errp, "zone append size limit must be at least " >> + "4096 bytes"); >> +return; >> +} >> } >> } > > The guest error is less confusing than simply a trace. LGTM. Trace events are meant for the developers when debugging, they are usually stripped out in final build. Errors are reported to the user / operator (i.e. incorrect configuration). Regards, Phil.
Re: [PATCH] hw/block/nvme: add broadcast nsid support flush command
On Mon, Feb 08, 2021 at 08:08:17PM +0100, Klaus Jensen wrote: > On Feb 9 03:59, Keith Busch wrote: > > This whole implementation would be much simpler with the synchronous > > blk_flush() routine instead of the AIO equivalent. This is not really a > > performant feature, so I don't think it's critical to get these > > operations happening in parallel. What do you think? > > It would definitely be simpler, but I believe that if there is a lot to > flush, then we won't just block the nvme device. We are holding the Big > QEMU Lock and will block most other devices as well. Hm, I feel like you may have told me this same explanation for a different patch. :) Okay, I'm convinced: this is the way.
Re: [PATCH v5 0/9] block: Add retry for werror=/rerror= mechanism
Kindly ping. Any comments and reviews are welcome :) Thanks, Jiahui On 2021/2/5 18:13, Jiahui Cen wrote: > A VM in the cloud environment may use a virtual disk as the backend storage, > and there are usually filesystems on the virtual block device. When backend > storage is temporarily down, any I/O issued to the virtual block device > will cause an error. For example, an error occurred in ext4 filesystem would > make the filesystem readonly. In production environment, a cloud backend > storage can be soon recovered. For example, an IP-SAN may be down due to > network failure and will be online soon after network is recovered. However, > the error in the filesystem may not be recovered unless a device reattach > or system restart. Thus an I/O retry mechanism is in need to implement a > self-healing system. > > This patch series proposes to extend the werror=/rerror= mechanism to add > a 'retry' feature. It can automatically retry failed I/O requests on error > without sending error back to guest, and guest can get back running smoothly > when I/O is recovered. > > v4->v5: > * Add document for 'retry' in qapi. > * Support werror=/rerror=retry for scsi-disk. > * Pause retry when draining. > > v3->v4: > * Adapt to werror=/rerror= mechanism. > > v2->v3: > * Add a doc to describe I/O hang. > > v1->v2: > * Rebase to fix compile problems. > * Fix incorrect remove of rehandle list. > * Provide rehandle pause interface. 
> > REF: https://lists.gnu.org/archive/html/qemu-devel/2020-10/msg06560.html > > Jiahui Cen (9): > qapi/block-core: Add retry option for error action > block-backend: Introduce retry timer > block-backend: Add device specific retry callback > block-backend: Enable retry action on errors > block-backend: Add timeout support for retry > block: Add error retry param setting > virtio_blk: Add support for retry on errors > scsi-bus: Refactor the code that retries requests > scsi-disk: Add support for retry on errors > > block/block-backend.c | 68 > blockdev.c | 52 +++ > hw/block/block.c | 10 +++ > hw/block/virtio-blk.c | 21 +- > hw/scsi/scsi-bus.c | 16 +++-- > hw/scsi/scsi-disk.c| 16 + > include/hw/block/block.h | 7 +- > include/hw/scsi/scsi.h | 1 + > include/sysemu/block-backend.h | 10 +++ > qapi/block-core.json | 9 ++- > 10 files changed, 199 insertions(+), 11 deletions(-) >
Re: [PATCH] hw/block/nvme: improve invalid zasl value reporting
On Mon, 2021-02-08 at 09:25 +0100, Klaus Jensen wrote: > From: Klaus Jensen > > The Zone Append Size Limit (ZASL) must be at least 4096 bytes, so > improve the user experience by adding an early parameter check in > nvme_check_constraints. > > When ZASL is still too small due to the host configuring the device for > an even larger page size, convert the trace point in nvme_start_ctrl to > an NVME_GUEST_ERR such that this is logged by QEMU instead of only > traced. > > Reported-by: "i...@dantalion.nl" > Cc: Dmitry Fomichev > Signed-off-by: Klaus Jensen > --- > hw/block/nvme.c | 12 ++-- > 1 file changed, 10 insertions(+), 2 deletions(-) > > diff --git a/hw/block/nvme.c b/hw/block/nvme.c > index c2f0c88fbf39..d96888cd2333 100644 > --- a/hw/block/nvme.c > +++ b/hw/block/nvme.c > @@ -3983,8 +3983,10 @@ static int nvme_start_ctrl(NvmeCtrl *n) > n->zasl = n->params.mdts; > } else { > if (n->params.zasl_bs < n->page_size) { > -trace_pci_nvme_err_startfail_zasl_too_small(n->params.zasl_bs, > -n->page_size); > +NVME_GUEST_ERR(pci_nvme_err_startfail_zasl_too_small, > + "Zone Append Size Limit (ZASL) of %d bytes is too > " > + "small; must be at least %d bytes", > + n->params.zasl_bs, n->page_size); > return -1; > } > n->zasl = 31 - clz32(n->params.zasl_bs / n->page_size); > @@ -4503,6 +4505,12 @@ static void nvme_check_constraints(NvmeCtrl *n, Error > **errp) > error_setg(errp, "zone append size limit has to be a power of > 2"); > return; > } > + > +if (n->params.zasl_bs < 4096) { > +error_setg(errp, "zone append size limit must be at least " > + "4096 bytes"); > +return; > +} > } > } The guest error is less confusing than simply a trace. LGTM. Reviewed-by: Dmitry Fomichev > > > >
Re: [PATCH 0/7] qcow2: compressed write cache
09.02.2021 21:41, Denis V. Lunev wrote: On 2/9/21 9:36 PM, Vladimir Sementsov-Ogievskiy wrote: 09.02.2021 19:39, Vladimir Sementsov-Ogievskiy wrote: 09.02.2021 17:47, Max Reitz wrote: On 09.02.21 15:10, Vladimir Sementsov-Ogievskiy wrote: 09.02.2021 16:25, Max Reitz wrote: On 29.01.21 17:50, Vladimir Sementsov-Ogievskiy wrote: Hi all! I know, I have several series waiting for a resend, but I had to switch to another task spawned from our customer's bug. Original problem: we use O_DIRECT for all vm images in our product, it's the policy. The only exclusion is backup target qcow2 image for compressed backup, because compressed backup is extremely slow with O_DIRECT (due to unaligned writes). Customer complains that backup produces a lot of pagecache. So we can either implement some internal cache or use fadvise somehow. Backup has several async workes, which writes simultaneously, so in both ways we have to track host cluster filling (before dropping the cache corresponding to the cluster). So, if we have to track anyway, let's try to implement the cache. I wanted to be excited here, because that sounds like it would be very easy to implement caching. Like, just keep the cluster at free_byte_offset cached until the cluster it points to changes, then flush the cluster. The problem is that chunks are written asynchronously.. That's why this all is not so easy. But then I see like 900 new lines of code, and I’m much less excited... Idea is simple: cache small unaligned write and flush the cluster when filled. Performance result is very good (results in a table is time of compressed backup of 1000M disk filled with ones in seconds): “Filled with ones” really is an edge case, though. Yes, I think, all clusters are compressed to rather small chunks :) --- --- --- backup(old) backup(new) ssd:hdd(direct) 3e+02 4.4 -99% ssd:hdd(cached) 5.7 5.4 -5% --- --- --- So, we have benefit even for cached mode! And the fastest thing is O_DIRECT with new implemented cache. 
So, I suggest to enable the new cache by default (which is done by the series). First, I’m not sure how O_DIRECT really is relevant, because I don’t really see the point for writing compressed images. compressed backup is a point (Perhaps irrelevant, but just to be clear:) I meant the point of using O_DIRECT, which one can decide to not use for backup targets (as you have done already). Second, I find it a bit cheating if you say there is a huge improvement for the no-cache case, when actually, well, you just added a cache. So the no-cache case just became faster because there is a cache now. Still, performance comparison is relevant to show that O_DIRECT as is unusable for compressed backup. (Again, perhaps irrelevant, but:) Yes, but my first point was exactly whether O_DIRECT is even relevant for writing compressed images. Well, I suppose I could follow that if O_DIRECT doesn’t make much sense for compressed images, qemu’s format drivers are free to introduce some caching (because technically the cache.direct option only applies to the protocol driver) for collecting compressed writes. Yes I thought in this way, enabling the cache by default. That conclusion makes both of my complaints kind of moot. *shrug* Third, what is the real-world impact on the page cache? You described that that’s the reason why you need the cache in qemu, because otherwise the page cache is polluted too much. How much is the difference really? (I don’t know how good the compression ratio is for real-world images.) Hm. I don't know the ratio.. Customer reported that most of RAM is polluted by Qemu's cache, and we use O_DIRECT for everything except for target of compressed backup.. Still the pollution may relate to several backups and of course it is simple enough to drop the cache after each backup. But I think that even one backup of 16T disk may pollute RAM enough. Oh, sorry, I just realized I had a brain fart there. 
I was referring to whether this series improves the page cache pollution. But obviously it will if it allows you to re-enable O_DIRECT. Related to that, I remember a long time ago we had some discussion about letting qemu-img convert set a special cache mode for the target image that would make Linux drop everything before the last offset written (i.e., I suppose fadvise() with POSIX_FADV_SEQUENTIAL). You discard that idea based on the fact that implementing a cache in qemu would be simple, but it isn’t, really. What would the impact of POSIX_FADV_SEQUENTIAL be? (One advantage of using that would be that we could reuse it for non-compressed images that are written by backup or qemu-img convert.) The problem is that writes are async. And therefore, not sequential. In theory, yes, but all compressed writes still goes through qcow2_alloc_byt
Re: [PATCH] iotests: Fix unsupported_imgopts for refcount_bits
On 2/9/21 12:27 PM, Max Reitz wrote: > Many _unsupported_imgopts lines for refcount_bits values use something > like "refcount_bits=1[^0-9]" to forbid everything but "refcount_bits=1" > (e.g. "refcount_bits=16" is allowed). > > That does not work when $IMGOPTS does not have any entry past the > refcount_bits option, which now became apparent with the "check" script > rewrite. > > Use \b instead of [^0-9] to check for a word boundary, which is what we > really want. \b is a Linux-ism (that is, glibc supports it, but BSD libc does not). https://mail-index.netbsd.org/tech-userlevel/2012/12/02/msg006954.html > > Signed-off-by: Max Reitz > --- > Reproducible with: > $ ./check -qcow2 -o refcount_bits=1 > (The tests touched here should be skipped) > > I don't know whether \b is portable. I hope it is. > (This is why I CC-ed you, Eric.) No, it's not portable. \> and [[:>:]] are other spellings for the same task, equally non-portable. > > Then again, it appears that nobody ever runs the iotests with > refcount_bits=1 but me, and I do that on Linux. So even if it isn't > portable, it shouldn't be an issue in practice... O:) What exactly is failing? Is it merely a case of our python script running the regex against "${unsupported_imgopts}" instead of "${unsupported_imgsopts} " with an added trailing space to guarantee that we have something to match against? -- Eric Blake, Principal Software Engineer Red Hat, Inc. +1-919-301-3226 Virtualization: qemu.org | libvirt.org
Re: [PATCH 0/7] qcow2: compressed write cache
On 2/9/21 9:36 PM, Vladimir Sementsov-Ogievskiy wrote: > 09.02.2021 19:39, Vladimir Sementsov-Ogievskiy wrote: >> 09.02.2021 17:47, Max Reitz wrote: >>> On 09.02.21 15:10, Vladimir Sementsov-Ogievskiy wrote: 09.02.2021 16:25, Max Reitz wrote: > On 29.01.21 17:50, Vladimir Sementsov-Ogievskiy wrote: >> Hi all! >> >> I know, I have several series waiting for a resend, but I had to >> switch >> to another task spawned from our customer's bug. >> >> Original problem: we use O_DIRECT for all vm images in our >> product, it's >> the policy. The only exclusion is backup target qcow2 image for >> compressed backup, because compressed backup is extremely slow with >> O_DIRECT (due to unaligned writes). Customer complains that backup >> produces a lot of pagecache. >> >> So we can either implement some internal cache or use fadvise >> somehow. >> Backup has several async workes, which writes simultaneously, so >> in both >> ways we have to track host cluster filling (before dropping the >> cache >> corresponding to the cluster). So, if we have to track anyway, >> let's >> try to implement the cache. > > I wanted to be excited here, because that sounds like it would be > very easy to implement caching. Like, just keep the cluster at > free_byte_offset cached until the cluster it points to changes, > then flush the cluster. The problem is that chunks are written asynchronously.. That's why this all is not so easy. > > But then I see like 900 new lines of code, and I’m much less > excited... > >> Idea is simple: cache small unaligned write and flush the cluster >> when >> filled. >> >> Performance result is very good (results in a table is time of >> compressed backup of 1000M disk filled with ones in seconds): > > “Filled with ones” really is an edge case, though. 
Yes, I think, all clusters are compressed to rather small chunks :) > >> --- --- --- >> backup(old) backup(new) >> ssd:hdd(direct) 3e+02 4.4 >> -99% >> ssd:hdd(cached) 5.7 5.4 >> -5% >> --- --- --- >> >> So, we have benefit even for cached mode! And the fastest thing is >> O_DIRECT with new implemented cache. So, I suggest to enable the new >> cache by default (which is done by the series). > > First, I’m not sure how O_DIRECT really is relevant, because I > don’t really see the point for writing compressed images. compressed backup is a point >>> >>> (Perhaps irrelevant, but just to be clear:) I meant the point of >>> using O_DIRECT, which one can decide to not use for backup targets >>> (as you have done already). >>> > Second, I find it a bit cheating if you say there is a huge > improvement for the no-cache case, when actually, well, you just > added a cache. So the no-cache case just became faster because > there is a cache now. Still, performance comparison is relevant to show that O_DIRECT as is unusable for compressed backup. >>> >>> (Again, perhaps irrelevant, but:) Yes, but my first point was >>> exactly whether O_DIRECT is even relevant for writing compressed >>> images. >>> > Well, I suppose I could follow that if O_DIRECT doesn’t make much > sense for compressed images, qemu’s format drivers are free to > introduce some caching (because technically the cache.direct > option only applies to the protocol driver) for collecting > compressed writes. Yes I thought in this way, enabling the cache by default. > That conclusion makes both of my complaints kind of moot. > > *shrug* > > Third, what is the real-world impact on the page cache? You > described that that’s the reason why you need the cache in qemu, > because otherwise the page cache is polluted too much. How much > is the difference really? (I don’t know how good the compression > ratio is for real-world images.) Hm. I don't know the ratio.. 
Customer reported that most of RAM is polluted by Qemu's cache, and we use O_DIRECT for everything except for target of compressed backup.. Still the pollution may relate to several backups and of course it is simple enough to drop the cache after each backup. But I think that even one backup of 16T disk may pollute RAM enough. >>> >>> Oh, sorry, I just realized I had a brain fart there. I was >>> referring to whether this series improves the page cache pollution. >>> But obviously it will if it allows you to re-enable O_DIRECT. >>> > Related to that, I remember a long time ago we had some discussion > about letting qemu-img convert set a special cache mode for the > target im
Re: [PATCH] iotests/210: Fix reference output
09.02.2021 21:19, Max Reitz wrote: Commit 69b55e03f has changed an error message, adjust the reference output to account for it. Fixes: 69b55e03f7e65a36eb954d0b7d4698b258df2708 ("block: refactor bdrv_check_request: add errp") Signed-off-by: Max Reitz Reviewed-by: Vladimir Sementsov-Ogievskiy --- Fun fact: The branch name "fix-210-v1" was already taken for 8ba9c4d9b088d66aebfcb019f61ddc36fba2db88, which was only two months ago. Ah, well. :) Me again :( Hmm. I should definitely start running iotests with -luks, not only my favorite -raw, -qcow2, -nbd. Sorry. --- tests/qemu-iotests/210.out | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/qemu-iotests/210.out b/tests/qemu-iotests/210.out index dc1a3c9786..2e9fc596eb 100644 --- a/tests/qemu-iotests/210.out +++ b/tests/qemu-iotests/210.out @@ -182,7 +182,7 @@ Job failed: The requested file size is too large === Resize image with invalid sizes === {"execute": "block_resize", "arguments": {"node-name": "node1", "size": 9223372036854775296}} -{"error": {"class": "GenericError", "desc": "Required too big image size, it must be not greater than 9223372035781033984"}} +{"error": {"class": "GenericError", "desc": "offset(9223372036854775296) exceeds maximum(9223372035781033984)"}} {"execute": "block_resize", "arguments": {"node-name": "node1", "size": 9223372036854775808}} {"error": {"class": "GenericError", "desc": "Invalid parameter type for 'size', expected: integer"}} {"execute": "block_resize", "arguments": {"node-name": "node1", "size": 18446744073709551104}} -- Best regards, Vladimir
Re: [PATCH 0/7] qcow2: compressed write cache
09.02.2021 19:39, Vladimir Sementsov-Ogievskiy wrote: 09.02.2021 17:47, Max Reitz wrote: On 09.02.21 15:10, Vladimir Sementsov-Ogievskiy wrote: 09.02.2021 16:25, Max Reitz wrote: On 29.01.21 17:50, Vladimir Sementsov-Ogievskiy wrote: Hi all! I know, I have several series waiting for a resend, but I had to switch to another task spawned from our customer's bug. Original problem: we use O_DIRECT for all vm images in our product, it's the policy. The only exclusion is backup target qcow2 image for compressed backup, because compressed backup is extremely slow with O_DIRECT (due to unaligned writes). Customer complains that backup produces a lot of pagecache. So we can either implement some internal cache or use fadvise somehow. Backup has several async workes, which writes simultaneously, so in both ways we have to track host cluster filling (before dropping the cache corresponding to the cluster). So, if we have to track anyway, let's try to implement the cache. I wanted to be excited here, because that sounds like it would be very easy to implement caching. Like, just keep the cluster at free_byte_offset cached until the cluster it points to changes, then flush the cluster. The problem is that chunks are written asynchronously.. That's why this all is not so easy. But then I see like 900 new lines of code, and I’m much less excited... Idea is simple: cache small unaligned write and flush the cluster when filled. Performance result is very good (results in a table is time of compressed backup of 1000M disk filled with ones in seconds): “Filled with ones” really is an edge case, though. Yes, I think, all clusters are compressed to rather small chunks :) --- --- --- backup(old) backup(new) ssd:hdd(direct) 3e+02 4.4 -99% ssd:hdd(cached) 5.7 5.4 -5% --- --- --- So, we have benefit even for cached mode! And the fastest thing is O_DIRECT with new implemented cache. So, I suggest to enable the new cache by default (which is done by the series). 
First, I’m not sure how O_DIRECT really is relevant, because I don’t really see the point for writing compressed images. compressed backup is a point (Perhaps irrelevant, but just to be clear:) I meant the point of using O_DIRECT, which one can decide to not use for backup targets (as you have done already). Second, I find it a bit cheating if you say there is a huge improvement for the no-cache case, when actually, well, you just added a cache. So the no-cache case just became faster because there is a cache now. Still, performance comparison is relevant to show that O_DIRECT as is unusable for compressed backup. (Again, perhaps irrelevant, but:) Yes, but my first point was exactly whether O_DIRECT is even relevant for writing compressed images. Well, I suppose I could follow that if O_DIRECT doesn’t make much sense for compressed images, qemu’s format drivers are free to introduce some caching (because technically the cache.direct option only applies to the protocol driver) for collecting compressed writes. Yes I thought in this way, enabling the cache by default. That conclusion makes both of my complaints kind of moot. *shrug* Third, what is the real-world impact on the page cache? You described that that’s the reason why you need the cache in qemu, because otherwise the page cache is polluted too much. How much is the difference really? (I don’t know how good the compression ratio is for real-world images.) Hm. I don't know the ratio.. Customer reported that most of RAM is polluted by Qemu's cache, and we use O_DIRECT for everything except for target of compressed backup.. Still the pollution may relate to several backups and of course it is simple enough to drop the cache after each backup. But I think that even one backup of 16T disk may pollute RAM enough. Oh, sorry, I just realized I had a brain fart there. I was referring to whether this series improves the page cache pollution. But obviously it will if it allows you to re-enable O_DIRECT. 
Related to that, I remember a long time ago we had some discussion about letting qemu-img convert set a special cache mode for the target image that would make Linux drop everything before the last offset written (i.e., I suppose fadvise() with POSIX_FADV_SEQUENTIAL). You discard that idea based on the fact that implementing a cache in qemu would be simple, but it isn’t, really. What would the impact of POSIX_FADV_SEQUENTIAL be? (One advantage of using that would be that we could reuse it for non-compressed images that are written by backup or qemu-img convert.) The problem is that writes are async. And therefore, not sequential. In theory, yes, but all compressed writes still goes through qcow2_alloc_bytes() right before submitting the write, so I wonder whether in pract
Re: [PATCH] iotests/210: Fix reference output
On 2/9/21 12:19 PM, Max Reitz wrote: > Commit 69b55e03f has changed an error message, adjust the reference > output to account for it. > > Fixes: 69b55e03f7e65a36eb954d0b7d4698b258df2708 >("block: refactor bdrv_check_request: add errp") > Signed-off-by: Max Reitz > --- Reviewed-by: Eric Blake I'm happy to queue this through my tree (since that's where the original came through) if you don't beat me to it. -- Eric Blake, Principal Software Engineer Red Hat, Inc. +1-919-301-3226 Virtualization: qemu.org | libvirt.org
[PATCH] iotests: Fix unsupported_imgopts for refcount_bits
Many _unsupported_imgopts lines for refcount_bits values use something like "refcount_bits=1[^0-9]" to forbid everything but "refcount_bits=1" (e.g. "refcount_bits=16" is allowed). That does not work when $IMGOPTS does not have any entry past the refcount_bits option, which now became apparent with the "check" script rewrite. Use \b instead of [^0-9] to check for a word boundary, which is what we really want. Signed-off-by: Max Reitz --- Reproducible with: $ ./check -qcow2 -o refcount_bits=1 (The tests touched here should be skipped) I don't know whether \b is portable. I hope it is. (This is why I CC-ed you, Eric.) Then again, it appears that nobody ever runs the iotests with refcount_bits=1 but me, and I do that on Linux. So even if it isn't portable, it shouldn't be an issue in practice... O:) --- tests/qemu-iotests/007 | 2 +- tests/qemu-iotests/015 | 2 +- tests/qemu-iotests/029 | 2 +- tests/qemu-iotests/058 | 2 +- tests/qemu-iotests/062 | 2 +- tests/qemu-iotests/066 | 2 +- tests/qemu-iotests/068 | 2 +- tests/qemu-iotests/080 | 2 +- tests/qemu-iotests/103 | 2 +- tests/qemu-iotests/201 | 2 +- tests/qemu-iotests/214 | 2 +- tests/qemu-iotests/217 | 2 +- tests/qemu-iotests/267 | 2 +- tests/qemu-iotests/271 | 3 ++- tests/qemu-iotests/286 | 2 +- 15 files changed, 16 insertions(+), 15 deletions(-) diff --git a/tests/qemu-iotests/007 b/tests/qemu-iotests/007 index 936d3f14fb..a014f50a6b 100755 --- a/tests/qemu-iotests/007 +++ b/tests/qemu-iotests/007 @@ -44,7 +44,7 @@ _supported_proto generic # refcount_bits must be at least 4 so we can create ten internal snapshots # (1 bit supports none, 2 bits support two, 4 bits support 14); # snapshot are generally impossible with external data files -_unsupported_imgopts 'refcount_bits=\(1\|2\)[^0-9]' data_file +_unsupported_imgopts 'refcount_bits=\(1\|2\)\b' data_file echo echo "creating image" diff --git a/tests/qemu-iotests/015 b/tests/qemu-iotests/015 index 40c23235a6..24e28643e4 100755 --- a/tests/qemu-iotests/015 +++ 
b/tests/qemu-iotests/015 @@ -43,7 +43,7 @@ _supported_fmt qcow2 _supported_proto generic # Internal snapshots are (currently) impossible with refcount_bits=1, # and generally impossible with external data files -_unsupported_imgopts 'refcount_bits=1[^0-9]' data_file +_unsupported_imgopts 'refcount_bits=1\b' data_file echo echo "creating image" diff --git a/tests/qemu-iotests/029 b/tests/qemu-iotests/029 index bd71dd2f22..9b345060e5 100755 --- a/tests/qemu-iotests/029 +++ b/tests/qemu-iotests/029 @@ -44,7 +44,7 @@ _supported_fmt qcow2 _supported_proto generic # Internal snapshots are (currently) impossible with refcount_bits=1, # and generally impossible with external data files -_unsupported_imgopts 'refcount_bits=1[^0-9]' data_file +_unsupported_imgopts 'refcount_bits=1\b' data_file offset_size=24 offset_l1_size=36 diff --git a/tests/qemu-iotests/058 b/tests/qemu-iotests/058 index ce35ff4ee0..0b0303fcd5 100755 --- a/tests/qemu-iotests/058 +++ b/tests/qemu-iotests/058 @@ -59,7 +59,7 @@ _supported_os Linux _require_command QEMU_NBD # Internal snapshots are (currently) impossible with refcount_bits=1, # and generally impossible with external data files -_unsupported_imgopts 'refcount_bits=1[^0-9]' data_file +_unsupported_imgopts 'refcount_bits=1\b' data_file nbd_snapshot_img="nbd:unix:$nbd_unix_socket" diff --git a/tests/qemu-iotests/062 b/tests/qemu-iotests/062 index 321252298d..d7307f24ac 100755 --- a/tests/qemu-iotests/062 +++ b/tests/qemu-iotests/062 @@ -42,7 +42,7 @@ trap "_cleanup; exit \$status" 0 1 2 3 15 _supported_fmt qcow2 _supported_proto generic # We need zero clusters and snapshots -_unsupported_imgopts 'compat=0.10' 'refcount_bits=1[^0-9]' data_file +_unsupported_imgopts 'compat=0.10' 'refcount_bits=1\b' data_file IMG_SIZE=64M diff --git a/tests/qemu-iotests/066 b/tests/qemu-iotests/066 index a780ed7ab5..ec9dab2025 100755 --- a/tests/qemu-iotests/066 +++ b/tests/qemu-iotests/066 @@ -43,7 +43,7 @@ _supported_proto generic # We need zero clusters and 
snapshots # (TODO: Consider splitting the snapshot part into a separate test #file, so this one runs with refcount_bits=1 and data_file) -_unsupported_imgopts 'compat=0.10' 'refcount_bits=1[^0-9]' data_file +_unsupported_imgopts 'compat=0.10' 'refcount_bits=1\b' data_file # Intentionally create an unaligned image IMG_SIZE=$((64 * 1024 * 1024 + 512)) diff --git a/tests/qemu-iotests/068 b/tests/qemu-iotests/068 index 03e03508a6..39a04a6ab6 100755 --- a/tests/qemu-iotests/068 +++ b/tests/qemu-iotests/068 @@ -42,7 +42,7 @@ _supported_fmt qcow2 _supported_proto generic # Internal snapshots are (currently) impossible with refcount_bits=1, # and generally impossible with external data files -_unsupported_imgopts 'compat=0.10' 'refcount_bits=1[^0-9]' data_file +_unsupported_imgopts 'compat=0.10' 'refcount_bits=1\b' data_file IMG_SIZE=128K diff --git a/tests/qemu-iotests/080 b/tests/qemu-iotests/080 index 3306500683..a7cf
[PATCH] iotests/210: Fix reference output
Commit 69b55e03f has changed an error message, adjust the reference output to account for it. Fixes: 69b55e03f7e65a36eb954d0b7d4698b258df2708 ("block: refactor bdrv_check_request: add errp") Signed-off-by: Max Reitz --- Fun fact: The branch name "fix-210-v1" was already taken for 8ba9c4d9b088d66aebfcb019f61ddc36fba2db88, which was only two months ago. Ah, well. :) --- tests/qemu-iotests/210.out | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/qemu-iotests/210.out b/tests/qemu-iotests/210.out index dc1a3c9786..2e9fc596eb 100644 --- a/tests/qemu-iotests/210.out +++ b/tests/qemu-iotests/210.out @@ -182,7 +182,7 @@ Job failed: The requested file size is too large === Resize image with invalid sizes === {"execute": "block_resize", "arguments": {"node-name": "node1", "size": 9223372036854775296}} -{"error": {"class": "GenericError", "desc": "Required too big image size, it must be not greater than 9223372035781033984"}} +{"error": {"class": "GenericError", "desc": "offset(9223372036854775296) exceeds maximum(9223372035781033984)"}} {"execute": "block_resize", "arguments": {"node-name": "node1", "size": 9223372036854775808}} {"error": {"class": "GenericError", "desc": "Invalid parameter type for 'size', expected: integer"}} {"execute": "block_resize", "arguments": {"node-name": "node1", "size": 18446744073709551104}} -- 2.29.2
Re: [PATCH v4 0/9] hw/sd: Support block read/write in SPI mode
On 2/9/21 3:32 PM, Bin Meng wrote: > Hi Philippe, > > On Thu, Feb 4, 2021 at 2:02 PM Bin Meng wrote: >> >> On Thu, Jan 28, 2021 at 2:30 PM Bin Meng wrote: >>> >>> From: Bin Meng >>> >>> This includes the previously v3 series [1], and one single patch [2]. >>> >>> Compared to v3, this fixed the following issue in patch [v3,6/6]: >>> - Keep the card state to SSI_SD_CMD instead of SSI_SD_RESPONSE after >>> receiving the STOP_TRAN token per the spec >>> >>> All software tested so far (U-Boot/Linux/VxWorks) do work without >>> the fix, but it is better to conform with the spec. >>> >>> In addition to [2], one more issue was exposed when testing with >>> VxWorks driver related to STOP_TRANSMISSION (CMD12) response. >>> >>> [1] http://patchwork.ozlabs.org/project/qemu-devel/list/?series=226136 >>> [2] >>> http://patchwork.ozlabs.org/project/qemu-devel/patch/1611636214-52427-1-git-send-email-bmeng...@gmail.com/ >>> >>> Changes in v4: >>> - Keep the card state to SSI_SD_CMD instead of SSI_SD_RESPONSE after >>> receiving the STOP_TRAN token per the spec >>> - new patch: fix STOP_TRANSMISSION (CMD12) response >>> - new patch: handle the rest commands with R1b response type >>> >> >> Ping? > > Will a PR be sent soon to include this series so that the SiFive SPI > series can follow? I had it planned for yesterday but had problems with the mails from the list, + the CVE (you fixed) took priority. Missing review is patch #8 "Fix STOP_TRANSMISSION (CMD12) response" for which I don't have test yet.
Re: [RFC PATCH] block/null: Use 'read-zeroes' mode by default
On 2/9/21 6:11 PM, Eric Blake wrote: > On 2/9/21 11:01 AM, Philippe Mathieu-Daudé wrote: >> The null-co driver is meant for (performance) testing. >> By default, read operation does nothing, the provided buffer >> is not filled with zero values and its content is unchanged. >> >> This can confuse security experts. For example, using the default >> null-co driver, buf[] is uninitialized, the blk_pread() call >> succeeds and we then access uninitialized memory: >> >> static int guess_disk_lchs(BlockBackend *blk, >> int *pcylinders, int *pheads, >> int *psectors) >> { >> uint8_t buf[BDRV_SECTOR_SIZE]; >> ... >> >> if (blk_pread(blk, 0, buf, BDRV_SECTOR_SIZE) < 0) { >> return -1; >> } >> /* test msdos magic */ >> if (buf[510] != 0x55 || buf[511] != 0xaa) { >> return -1; >> } >> >> We could audit all the uninitialized buffers and the >> bdrv_co_preadv() handlers, but it is simpler to change the >> default of this testing driver. Performance tests will have >> to adapt and use 'null-co,read-zeroes=on'. > > Wouldn't this rather be read-zeroes=off when doing performance testing? Oops, yes ;) > >> >> Suggested-by: Max Reitz >> Signed-off-by: Philippe Mathieu-Daudé >> --- >> RFC maybe a stricter approach is required? > > Since the null driver is only for testing in the first place, opting in > to speed over security seems like a reasonable tradeoff. But I consider > the patch incomplete without an audit of the iotests that will want to > use explicit read-zeroes=off. Correct. I don't know about each iotest but I can send a patch with explicit option, so review would be trivial. Thanks, Phil.
Re: [RFC PATCH] block/null: Use 'read-zeroes' mode by default
On 2/9/21 11:01 AM, Philippe Mathieu-Daudé wrote: > The null-co driver is meant for (performance) testing. > By default, read operation does nothing, the provided buffer > is not filled with zero values and its content is unchanged. > > This can confuse security experts. For example, using the default > null-co driver, buf[] is uninitialized, the blk_pread() call > succeeds and we then access uninitialized memory: > > static int guess_disk_lchs(BlockBackend *blk, > int *pcylinders, int *pheads, > int *psectors) > { > uint8_t buf[BDRV_SECTOR_SIZE]; > ... > > if (blk_pread(blk, 0, buf, BDRV_SECTOR_SIZE) < 0) { > return -1; > } > /* test msdos magic */ > if (buf[510] != 0x55 || buf[511] != 0xaa) { > return -1; > } > > We could audit all the uninitialized buffers and the > bdrv_co_preadv() handlers, but it is simpler to change the > default of this testing driver. Performance tests will have > to adapt and use 'null-co,read-zeroes=on'. Wouldn't this rather be read-zeroes=off when doing performance testing? > > Suggested-by: Max Reitz > Signed-off-by: Philippe Mathieu-Daudé > --- > RFC maybe a stricter approach is required? Since the null driver is only for testing in the first place, opting in to speed over security seems like a reasonable tradeoff. But I consider the patch incomplete without an audit of the iotests that will want to use explicit read-zeroes=off. 
> --- > block/null.c | 2 +- > 1 file changed, 1 insertion(+), 1 deletion(-) > > diff --git a/block/null.c b/block/null.c > index cc9b1d4ea72..f9658fd70ac 100644 > --- a/block/null.c > +++ b/block/null.c > @@ -93,7 +93,7 @@ static int null_file_open(BlockDriverState *bs, QDict > *options, int flags, > error_setg(errp, "latency-ns is invalid"); > ret = -EINVAL; > } > -s->read_zeroes = qemu_opt_get_bool(opts, NULL_OPT_ZEROES, false); > +s->read_zeroes = qemu_opt_get_bool(opts, NULL_OPT_ZEROES, true); > qemu_opts_del(opts); > bs->supported_write_flags = BDRV_REQ_FUA; > return ret; > -- Eric Blake, Principal Software Engineer Red Hat, Inc. +1-919-301-3226 Virtualization: qemu.org | libvirt.org
Re: [RFC PATCH] block/null: Use 'read-zeroes' mode by default
On 09.02.21 18:01, Philippe Mathieu-Daudé wrote: The null-co driver is meant for (performance) testing. By default, read operation does nothing, the provided buffer is not filled with zero values and its content is unchanged. This can confuse security experts. For example, using the default null-co driver, buf[] is uninitialized, the blk_pread() call succeeds and we then access uninitialized memory: I suppose in practice it’s going to be uninitialized guest memory most of the time, so it isn’t that bad, but yes. Thanks! static int guess_disk_lchs(BlockBackend *blk, int *pcylinders, int *pheads, int *psectors) { uint8_t buf[BDRV_SECTOR_SIZE]; ... if (blk_pread(blk, 0, buf, BDRV_SECTOR_SIZE) < 0) { return -1; } /* test msdos magic */ if (buf[510] != 0x55 || buf[511] != 0xaa) { return -1; } We could audit all the uninitialized buffers and the bdrv_co_preadv() handlers, but it is simpler to change the default of this testing driver. Performance tests will have to adapt and use 'null-co,read-zeroes=on'. Suggested-by: Max Reitz Signed-off-by: Philippe Mathieu-Daudé --- RFC maybe a stricter approach is required? I think this is good. If we do want a stricter approach, we might remove read-zeroes altogether (but I suppose that would require a deprecation period then) and add a new null-unsafe driver or something in its stead (that we can the conditionally compile out, or distributions can choose not to whitelist, or, or, or...). If we just follow through with this patch, I don’t think we need a deprecation period, because this can well be considered a bug fix; and because I don’t know of any use for read-zeroes=false except for some very special performance tests. 
--- block/null.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/block/null.c b/block/null.c index cc9b1d4ea72..f9658fd70ac 100644 --- a/block/null.c +++ b/block/null.c @@ -93,7 +93,7 @@ static int null_file_open(BlockDriverState *bs, QDict *options, int flags, error_setg(errp, "latency-ns is invalid"); ret = -EINVAL; } -s->read_zeroes = qemu_opt_get_bool(opts, NULL_OPT_ZEROES, false); +s->read_zeroes = qemu_opt_get_bool(opts, NULL_OPT_ZEROES, true); qemu_opts_del(opts); bs->supported_write_flags = BDRV_REQ_FUA; return ret; The documentation in qapi/block-core.json has to be changed, too. Are there any iotests (or other tests) that don’t set read-zeroes? Should they continue to use read-zeroes=false? Max
[RFC PATCH] block/null: Use 'read-zeroes' mode by default
The null-co driver is meant for (performance) testing. By default, read operation does nothing, the provided buffer is not filled with zero values and its content is unchanged. This can confuse security experts. For example, using the default null-co driver, buf[] is uninitialized, the blk_pread() call succeeds and we then access uninitialized memory: static int guess_disk_lchs(BlockBackend *blk, int *pcylinders, int *pheads, int *psectors) { uint8_t buf[BDRV_SECTOR_SIZE]; ... if (blk_pread(blk, 0, buf, BDRV_SECTOR_SIZE) < 0) { return -1; } /* test msdos magic */ if (buf[510] != 0x55 || buf[511] != 0xaa) { return -1; } We could audit all the uninitialized buffers and the bdrv_co_preadv() handlers, but it is simpler to change the default of this testing driver. Performance tests will have to adapt and use 'null-co,read-zeroes=on'. Suggested-by: Max Reitz Signed-off-by: Philippe Mathieu-Daudé --- RFC maybe a stricter approach is required? --- block/null.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/block/null.c b/block/null.c index cc9b1d4ea72..f9658fd70ac 100644 --- a/block/null.c +++ b/block/null.c @@ -93,7 +93,7 @@ static int null_file_open(BlockDriverState *bs, QDict *options, int flags, error_setg(errp, "latency-ns is invalid"); ret = -EINVAL; } -s->read_zeroes = qemu_opt_get_bool(opts, NULL_OPT_ZEROES, false); +s->read_zeroes = qemu_opt_get_bool(opts, NULL_OPT_ZEROES, true); qemu_opts_del(opts); bs->supported_write_flags = BDRV_REQ_FUA; return ret; -- 2.26.2
Re: [PATCH 0/7] qcow2: compressed write cache
On 2/9/21 5:47 PM, Max Reitz wrote: > On 09.02.21 15:10, Vladimir Sementsov-Ogievskiy wrote: >> 09.02.2021 16:25, Max Reitz wrote: >>> On 29.01.21 17:50, Vladimir Sementsov-Ogievskiy wrote: Hi all! I know, I have several series waiting for a resend, but I had to switch to another task spawned from our customer's bug. Original problem: we use O_DIRECT for all vm images in our product, it's the policy. The only exclusion is backup target qcow2 image for compressed backup, because compressed backup is extremely slow with O_DIRECT (due to unaligned writes). Customer complains that backup produces a lot of pagecache. So we can either implement some internal cache or use fadvise somehow. Backup has several async workes, which writes simultaneously, so in both ways we have to track host cluster filling (before dropping the cache corresponding to the cluster). So, if we have to track anyway, let's try to implement the cache. >>> >>> I wanted to be excited here, because that sounds like it would be >>> very easy to implement caching. Like, just keep the cluster at >>> free_byte_offset cached until the cluster it points to changes, then >>> flush the cluster. >> >> The problem is that chunks are written asynchronously.. That's why >> this all is not so easy. >> >>> >>> But then I see like 900 new lines of code, and I’m much less excited... >>> Idea is simple: cache small unaligned write and flush the cluster when filled. Performance result is very good (results in a table is time of compressed backup of 1000M disk filled with ones in seconds): >>> >>> “Filled with ones” really is an edge case, though. >> >> Yes, I think, all clusters are compressed to rather small chunks :) >> >>> --- --- --- backup(old) backup(new) ssd:hdd(direct) 3e+02 4.4 -99% ssd:hdd(cached) 5.7 5.4 -5% --- --- --- So, we have benefit even for cached mode! And the fastest thing is O_DIRECT with new implemented cache. So, I suggest to enable the new cache by default (which is done by the series). 
>>> >>> First, I’m not sure how O_DIRECT really is relevant, because I don’t >>> really see the point for writing compressed images. >> >> compressed backup is a point > > (Perhaps irrelevant, but just to be clear:) I meant the point of using > O_DIRECT, which one can decide to not use for backup targets (as you > have done already). > >>> Second, I find it a bit cheating if you say there is a huge >>> improvement for the no-cache case, when actually, well, you just >>> added a cache. So the no-cache case just became faster because >>> there is a cache now. >> >> Still, performance comparison is relevant to show that O_DIRECT as is >> unusable for compressed backup. > > (Again, perhaps irrelevant, but:) Yes, but my first point was exactly > whether O_DIRECT is even relevant for writing compressed images. > >>> Well, I suppose I could follow that if O_DIRECT doesn’t make much >>> sense for compressed images, qemu’s format drivers are free to >>> introduce some caching (because technically the cache.direct option >>> only applies to the protocol driver) for collecting compressed writes. >> >> Yes I thought in this way, enabling the cache by default. >> >>> That conclusion makes both of my complaints kind of moot. >>> >>> *shrug* >>> >>> Third, what is the real-world impact on the page cache? You >>> described that that’s the reason why you need the cache in qemu, >>> because otherwise the page cache is polluted too much. How much is >>> the difference really? (I don’t know how good the compression ratio >>> is for real-world images.) >> >> Hm. I don't know the ratio.. Customer reported that most of RAM is >> polluted by Qemu's cache, and we use O_DIRECT for everything except >> for target of compressed backup.. Still the pollution may relate to >> several backups and of course it is simple enough to drop the cache >> after each backup. But I think that even one backup of 16T disk may >> pollute RAM enough. > > Oh, sorry, I just realized I had a brain fart there. 
I was referring > to whether this series improves the page cache pollution. But > obviously it will if it allows you to re-enable O_DIRECT. > >>> Related to that, I remember a long time ago we had some discussion >>> about letting qemu-img convert set a special cache mode for the >>> target image that would make Linux drop everything before the last >>> offset written (i.e., I suppose fadvise() with >>> POSIX_FADV_SEQUENTIAL). You discard that idea based on the fact >>> that implementing a cache in qemu would be simple, but it isn’t, >>> really. What would the impact of POSIX_FADV_SEQUENTIAL be? (One >>> advantage of using that would be that we could reuse
Re: [PATCH 0/7] qcow2: compressed write cache
09.02.2021 17:47, Max Reitz wrote: On 09.02.21 15:10, Vladimir Sementsov-Ogievskiy wrote: 09.02.2021 16:25, Max Reitz wrote: On 29.01.21 17:50, Vladimir Sementsov-Ogievskiy wrote: Hi all! I know, I have several series waiting for a resend, but I had to switch to another task spawned from our customer's bug. Original problem: we use O_DIRECT for all vm images in our product, it's the policy. The only exclusion is backup target qcow2 image for compressed backup, because compressed backup is extremely slow with O_DIRECT (due to unaligned writes). Customer complains that backup produces a lot of pagecache. So we can either implement some internal cache or use fadvise somehow. Backup has several async workes, which writes simultaneously, so in both ways we have to track host cluster filling (before dropping the cache corresponding to the cluster). So, if we have to track anyway, let's try to implement the cache. I wanted to be excited here, because that sounds like it would be very easy to implement caching. Like, just keep the cluster at free_byte_offset cached until the cluster it points to changes, then flush the cluster. The problem is that chunks are written asynchronously.. That's why this all is not so easy. But then I see like 900 new lines of code, and I’m much less excited... Idea is simple: cache small unaligned write and flush the cluster when filled. Performance result is very good (results in a table is time of compressed backup of 1000M disk filled with ones in seconds): “Filled with ones” really is an edge case, though. Yes, I think, all clusters are compressed to rather small chunks :) --- --- --- backup(old) backup(new) ssd:hdd(direct) 3e+02 4.4 -99% ssd:hdd(cached) 5.7 5.4 -5% --- --- --- So, we have benefit even for cached mode! And the fastest thing is O_DIRECT with new implemented cache. So, I suggest to enable the new cache by default (which is done by the series). 
First, I’m not sure how O_DIRECT really is relevant, because I don’t really see the point for writing compressed images. compressed backup is a point (Perhaps irrelevant, but just to be clear:) I meant the point of using O_DIRECT, which one can decide to not use for backup targets (as you have done already). Second, I find it a bit cheating if you say there is a huge improvement for the no-cache case, when actually, well, you just added a cache. So the no-cache case just became faster because there is a cache now. Still, performance comparison is relevant to show that O_DIRECT as is unusable for compressed backup. (Again, perhaps irrelevant, but:) Yes, but my first point was exactly whether O_DIRECT is even relevant for writing compressed images. Well, I suppose I could follow that if O_DIRECT doesn’t make much sense for compressed images, qemu’s format drivers are free to introduce some caching (because technically the cache.direct option only applies to the protocol driver) for collecting compressed writes. Yes I thought in this way, enabling the cache by default. That conclusion makes both of my complaints kind of moot. *shrug* Third, what is the real-world impact on the page cache? You described that that’s the reason why you need the cache in qemu, because otherwise the page cache is polluted too much. How much is the difference really? (I don’t know how good the compression ratio is for real-world images.) Hm. I don't know the ratio.. Customer reported that most of RAM is polluted by Qemu's cache, and we use O_DIRECT for everything except for target of compressed backup.. Still the pollution may relate to several backups and of course it is simple enough to drop the cache after each backup. But I think that even one backup of 16T disk may pollute RAM enough. Oh, sorry, I just realized I had a brain fart there. I was referring to whether this series improves the page cache pollution. But obviously it will if it allows you to re-enable O_DIRECT. 
Related to that, I remember a long time ago we had some discussion about letting qemu-img convert set a special cache mode for the target image that would make Linux drop everything before the last offset written (i.e., I suppose fadvise() with POSIX_FADV_SEQUENTIAL). You discard that idea based on the fact that implementing a cache in qemu would be simple, but it isn’t, really. What would the impact of POSIX_FADV_SEQUENTIAL be? (One advantage of using that would be that we could reuse it for non-compressed images that are written by backup or qemu-img convert.) The problem is that writes are async. And therefore, not sequential. In theory, yes, but all compressed writes still goes through qcow2_alloc_bytes() right before submitting the write, so I wonder whether in practice the writes aren’t usually sufficiently sequential t
Re: [PULL v3 00/27] Block patches
> On Feb 9, 2021, at 1:03 AM, Thomas Huth wrote: > > On 08/02/2021 21.21, Stefan Hajnoczi wrote: >> On Mon, Feb 08, 2021 at 11:02:57AM +0100, Philippe Mathieu-Daudé wrote: >>> On 2/8/21 10:27 AM, Stefan Hajnoczi wrote: On Sat, Feb 06, 2021 at 05:03:20PM +, Peter Maydell wrote: > On Fri, 5 Feb 2021 at 22:53, Peter Maydell > wrote: >> >> On Fri, 5 Feb 2021 at 16:45, Stefan Hajnoczi wrote: >>> >>> The following changes since commit >>> e2c5093c993ef646e4e28f7aa78429853bcc06ac: >>> >>> iotests: 30: drop from auto group (and effectively from make check) >>> (2021-02-05 15:16:13 +) >>> >>> are available in the Git repository at: >>> >>> https://gitlab.com/stefanha/qemu.git tags/block-pull-request >>> >>> for you to fetch changes up to b07011f375bda3319cf72eee7cb18d310078387b: >>> >>> docs: fix Parallels Image "dirty bitmap" section (2021-02-05 16:36:36 >>> +) >>> >>> >>> Pull request >>> >>> v3: >>> * Replace {0} array initialization with {} to make clang happy [Peter] >>> >>> >> >> >> Fails 'make check' on s390x host: > > I gave this a rerun to check it was reproducible (it is) and realised > I missed what looks like an important line in the log. As usual, > trying to disentangle which lines of a parallel make check correspond > to the failure is pretty tricky, but the lines > Type 'remote-pcihost' is missing its parent 'pcie-host-bridge' > > are probably the proximate causes of the assertion failures. 
> > MALLOC_PERTURB_=${MALLOC_PERTURB_:-$(( ${RANDOM:-0} % 255 + 1))} > QTEST_QEMU_IMG=./qemu-img > G_TEST_DBUS_DAEMON=/home/ubuntu/qemu/tests/dbus-vmstate-daemon.sh > QTEST_QEMU_BINARY=./qemu-system-rx tests/qtest/qos-test --tap -k > PASS 45 qtest-rx/qmp-cmd-test /rx/qmp/query-memory-size-summary > SKIP > MALLOC_PERTURB_=${MALLOC_PERTURB_:-$(( ${RANDOM:-0} % 255 + 1))} > QTEST_QEMU_IMG=./qemu-img > G_TEST_DBUS_DAEMON=/home/ubuntu/qemu/tests/dbus-vmstate-daemon.sh > QTEST_QEMU_BINARY=./qemu-system-s390x tests/qtest/pxe-test --tap -k > PASS 46 qtest-rx/qmp-cmd-test /rx/qmp/query-memory-devices > Type 'remote-pcihost' is missing its parent 'pcie-host-bridge' > PASS 47 qtest-rx/qmp-cmd-test /rx/qmp/query-replay > PASS 48 qtest-rx/qmp-cmd-test /rx/qmp/query-yank > PASS 49 qtest-rx/qmp-cmd-test /rx/qmp/query-name > PASS 50 qtest-rx/qmp-cmd-test /rx/qmp/query-iothreads > PASS 51 qtest-rx/qmp-cmd-test /rx/qmp/query-fdsets > PASS 52 qtest-rx/qmp-cmd-test /rx/qmp/query-command-line-options > PASS 53 qtest-rx/qmp-cmd-test /rx/qmp/query-acpi-ospm-status > PASS 54 qtest-rx/qmp-cmd-test /rx/qmp/object-add-failure-modes > MALLOC_PERTURB_=${MALLOC_PERTURB_:-$(( ${RANDOM:-0} % 255 + 1))} > QTEST_QEMU_IMG=./qemu-img > G_TEST_DBUS_DAEMON=/home/ubuntu/qemu/tests/dbus-vmstate-daemon.sh > QTEST_QEMU_BINARY=./qemu-system-s390x tests/qtest/test-netfilter --tap > -k > Type 'remote-pcihost' is missing its parent 'pcie-host-bridge' > socket_accept failed: Resource temporarily unavailable > socket_accept failed: Resource temporarily unavailable > ** > ERROR:../../tests/qtest/libqtest.c:308:qtest_init_without_qmp_handshake: > assertion failed: (s->fd >= 0 && s->qmp_fd >= 0) > ** > ERROR:../../tests/qtest/libqtest.c:308:qtest_init_without_qmp_handshake: > assertion failed: (s->fd >= 0 && s->qmp_fd >= 0) > ../../tests/qtest/libqtest.c:181: kill_qemu() detected QEMU death from > signal 6 (Aborted) (core dumped) > ../../tests/qtest/libqtest.c:181: kill_qemu() detected QEMU death from > signal 6 
(Aborted) (core dumped) > ERROR qtest-s390x/pxe-test - Bail out! > ERROR:../../tests/qtest/libqtest.c:308:qtest_init_without_qmp_handshake: > assertion failed: (s->fd >= 0 && s->qmp_fd >= 0) > ERROR qtest-s390x/test-netfilter - Bail out! > ERROR:../../tests/qtest/libqtest.c:308:qtest_init_without_qmp_handshake: > assertion failed: (s->fd >= 0 && s->qmp_fd >= 0) > Makefile.mtest:3113: recipe for target 'run-test-387' failed > make: *** [run-test-387] Error 1 > make: *** Waiting for unfinished jobs > Makefile.mtest:3121: recipe for target 'run-test-388' failed Hi Elena and Jag, Please take a look at this QOM failure. I guess remote-pcihost is being built but pcie-host-bridge is missing from the s390x-softmmu target. >>> >>> Fix suggested here: >>> https://www.mail-archive.com/qemu-block@nongnu.org/msg80536.html >>> >>> But beside the fix what would be better is to restrict this feature >>> where it makes sense (we are having hard time building/testing all >>> features, better enable new ones where they are used).
Re: [PATCH v3 1/2] qemu-nbd: Use SOMAXCONN for socket listen() backlog
On 2/9/21 10:08 AM, Richard W.M. Jones wrote: > On Tue, Feb 09, 2021 at 09:27:58AM -0600, Eric Blake wrote: >> Our default of a backlog of 1 connection is rather puny; it gets in >> the way when we are explicitly allowing multiple clients (such as >> qemu-nbd -e N [--shared], or nbd-server-start with its default >> "max-connections":0 for unlimited), but is even a problem when we >> stick to qemu-nbd's default of only 1 active client but use -t >> [--persistent] where a second client can start using the server once >> the first finishes. While the effects are less noticeable on TCP >> sockets (since the client can poll() to learn when the server is ready >> again), it is definitely observable on Unix sockets, where on Unix, a s/where on Unix/where on Linux/ >> client will fail with EAGAIN and no recourse but to sleep an arbitrary >> amount of time before retrying if the server backlog is already full. >> >> Since QMP nbd-server-start is always persistent, it now always >> requests a backlog of SOMAXCONN; meanwhile, qemu-nbd will request >> SOMAXCONN if persistent, otherwise its backlog should be based on the >> expected number of clients. >> >> See https://bugzilla.redhat.com/1925045 for a demonstration of where >> our low backlog prevents libnbd from connecting as many parallel >> clients as it wants. >> >> Reported-by: Richard W.M. Jones >> Signed-off-by: Eric Blake >> CC: qemu-sta...@nongnu.org >> --- >> blockdev-nbd.c | 7 ++- >> qemu-nbd.c | 10 +- >> 2 files changed, 15 insertions(+), 2 deletions(-) >> > > Works fine here, so: > > Tested-by: Richard W.M. Jones > Thanks for testing. > Rich. > -- Eric Blake, Principal Software Engineer Red Hat, Inc. +1-919-301-3226 Virtualization: qemu.org | libvirt.org
Re: [PATCH v3 1/2] qemu-nbd: Use SOMAXCONN for socket listen() backlog
On Tue, Feb 09, 2021 at 09:27:58AM -0600, Eric Blake wrote: > Our default of a backlog of 1 connection is rather puny; it gets in > the way when we are explicitly allowing multiple clients (such as > qemu-nbd -e N [--shared], or nbd-server-start with its default > "max-connections":0 for unlimited), but is even a problem when we > stick to qemu-nbd's default of only 1 active client but use -t > [--persistent] where a second client can start using the server once > the first finishes. While the effects are less noticeable on TCP > sockets (since the client can poll() to learn when the server is ready > again), it is definitely observable on Unix sockets, where on Unix, a > client will fail with EAGAIN and no recourse but to sleep an arbitrary > amount of time before retrying if the server backlog is already full. > > Since QMP nbd-server-start is always persistent, it now always > requests a backlog of SOMAXCONN; meanwhile, qemu-nbd will request > SOMAXCONN if persistent, otherwise its backlog should be based on the > expected number of clients. > > See https://bugzilla.redhat.com/1925045 for a demonstration of where > our low backlog prevents libnbd from connecting as many parallel > clients as it wants. > > Reported-by: Richard W.M. Jones > Signed-off-by: Eric Blake > CC: qemu-sta...@nongnu.org > --- > blockdev-nbd.c | 7 ++- > qemu-nbd.c | 10 +- > 2 files changed, 15 insertions(+), 2 deletions(-) Reviewed-by: Daniel P. Berrangé Regards, Daniel -- |: https://berrange.com -o-https://www.flickr.com/photos/dberrange :| |: https://libvirt.org -o-https://fstop138.berrange.com :| |: https://entangle-photo.org-o-https://www.instagram.com/dberrange :|
Re: [PATCH v3 2/2] qemu-nbd: Permit --shared=0 for unlimited clients
On Tue, Feb 09, 2021 at 09:27:59AM -0600, Eric Blake wrote: > This gives us better feature parity with QMP nbd-server-start, where > max-connections defaults to 0 for unlimited. > > Signed-off-by: Eric Blake > --- > docs/tools/qemu-nbd.rst | 4 ++-- > qemu-nbd.c | 7 +++ > 2 files changed, 5 insertions(+), 6 deletions(-) Reviewed-by: Daniel P. Berrangé Regards, Daniel -- |: https://berrange.com -o-https://www.flickr.com/photos/dberrange :| |: https://libvirt.org -o-https://fstop138.berrange.com :| |: https://entangle-photo.org-o-https://www.instagram.com/dberrange :|
Re: [PATCH v3 1/2] qemu-nbd: Use SOMAXCONN for socket listen() backlog
On Tue, Feb 09, 2021 at 09:27:58AM -0600, Eric Blake wrote: > Our default of a backlog of 1 connection is rather puny; it gets in > the way when we are explicitly allowing multiple clients (such as > qemu-nbd -e N [--shared], or nbd-server-start with its default > "max-connections":0 for unlimited), but is even a problem when we > stick to qemu-nbd's default of only 1 active client but use -t > [--persistent] where a second client can start using the server once > the first finishes. While the effects are less noticeable on TCP > sockets (since the client can poll() to learn when the server is ready > again), it is definitely observable on Unix sockets, where on Unix, a > client will fail with EAGAIN and no recourse but to sleep an arbitrary > amount of time before retrying if the server backlog is already full. > > Since QMP nbd-server-start is always persistent, it now always > requests a backlog of SOMAXCONN; meanwhile, qemu-nbd will request > SOMAXCONN if persistent, otherwise its backlog should be based on the > expected number of clients. > > See https://bugzilla.redhat.com/1925045 for a demonstration of where > our low backlog prevents libnbd from connecting as many parallel > clients as it wants. > > Reported-by: Richard W.M. Jones > Signed-off-by: Eric Blake > CC: qemu-sta...@nongnu.org > --- > blockdev-nbd.c | 7 ++- > qemu-nbd.c | 10 +- > 2 files changed, 15 insertions(+), 2 deletions(-) > > diff --git a/blockdev-nbd.c b/blockdev-nbd.c > index d8443d235b73..b264620b98d8 100644 > --- a/blockdev-nbd.c > +++ b/blockdev-nbd.c > @@ -134,7 +134,12 @@ void nbd_server_start(SocketAddress *addr, const char > *tls_creds, > qio_net_listener_set_name(nbd_server->listener, >"nbd-listener"); > > -if (qio_net_listener_open_sync(nbd_server->listener, addr, 1, errp) < 0) > { > +/* > + * Because this server is persistent, a backlog of SOMAXCONN is > + * better than trying to size it to max_connections. 
> + */ > +if (qio_net_listener_open_sync(nbd_server->listener, addr, SOMAXCONN, > + errp) < 0) { > goto error; > } > > diff --git a/qemu-nbd.c b/qemu-nbd.c > index 608c63e82a25..1a340ea4858d 100644 > --- a/qemu-nbd.c > +++ b/qemu-nbd.c > @@ -964,8 +964,16 @@ int main(int argc, char **argv) > > server = qio_net_listener_new(); > if (socket_activation == 0) { > +int backlog; > + > +if (persistent) { > +backlog = SOMAXCONN; > +} else { > +backlog = MIN(shared, SOMAXCONN); > +} > saddr = nbd_build_socket_address(sockpath, bindto, port); > -if (qio_net_listener_open_sync(server, saddr, 1, &local_err) < 0) { > +if (qio_net_listener_open_sync(server, saddr, backlog, > + &local_err) < 0) { > object_unref(OBJECT(server)); > error_report_err(local_err); > exit(EXIT_FAILURE); Works fine here, so: Tested-by: Richard W.M. Jones Rich. -- Richard Jones, Virtualization Group, Red Hat http://people.redhat.com/~rjones Read my programming and virtualization blog: http://rwmj.wordpress.com Fedora Windows cross-compiler. Compile Windows programs, test, and build Windows installers. Over 100 libraries supported. http://fedoraproject.org/wiki/MinGW
[PATCH v3 2/2] qemu-nbd: Permit --shared=0 for unlimited clients
This gives us better feature parity with QMP nbd-server-start, where max-connections defaults to 0 for unlimited. Signed-off-by: Eric Blake --- docs/tools/qemu-nbd.rst | 4 ++-- qemu-nbd.c | 7 +++ 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/docs/tools/qemu-nbd.rst b/docs/tools/qemu-nbd.rst index fe41336dc550..ee862fa0bc02 100644 --- a/docs/tools/qemu-nbd.rst +++ b/docs/tools/qemu-nbd.rst @@ -136,8 +136,8 @@ driver options if ``--image-opts`` is specified. .. option:: -e, --shared=NUM Allow up to *NUM* clients to share the device (default - ``1``). Safe for readers, but for now, consistency is not - guaranteed between multiple writers. + ``1``), 0 for unlimited. Safe for readers, but for now, + consistency is not guaranteed between multiple writers. .. option:: -t, --persistent diff --git a/qemu-nbd.c b/qemu-nbd.c index 1a340ea4858d..5416509ece18 100644 --- a/qemu-nbd.c +++ b/qemu-nbd.c @@ -328,7 +328,7 @@ static void *nbd_client_thread(void *arg) static int nbd_can_accept(void) { -return state == RUNNING && nb_fds < shared; +return state == RUNNING && (shared == 0 || nb_fds < shared); } static void nbd_update_server_watch(void); @@ -706,8 +706,8 @@ int main(int argc, char **argv) device = optarg; break; case 'e': if (qemu_strtoi(optarg, NULL, 0, &shared) < 0 || -shared < 1) { +shared < 0) { error_report("Invalid shared device number '%s'", optarg); exit(EXIT_FAILURE); } @@ -966,7 +965,7 @@ int main(int argc, char **argv) if (socket_activation == 0) { int backlog; -if (persistent) { +if (persistent || shared == 0) { backlog = SOMAXCONN; } else { backlog = MIN(shared, SOMAXCONN); -- 2.30.0
[PATCH v3 1/2] qemu-nbd: Use SOMAXCONN for socket listen() backlog
Our default of a backlog of 1 connection is rather puny; it gets in the way when we are explicitly allowing multiple clients (such as qemu-nbd -e N [--shared], or nbd-server-start with its default "max-connections":0 for unlimited), but is even a problem when we stick to qemu-nbd's default of only 1 active client but use -t [--persistent] where a second client can start using the server once the first finishes. While the effects are less noticeable on TCP sockets (since the client can poll() to learn when the server is ready again), it is definitely observable on Unix sockets, where on Unix, a client will fail with EAGAIN and no recourse but to sleep an arbitrary amount of time before retrying if the server backlog is already full. Since QMP nbd-server-start is always persistent, it now always requests a backlog of SOMAXCONN; meanwhile, qemu-nbd will request SOMAXCONN if persistent, otherwise its backlog should be based on the expected number of clients. See https://bugzilla.redhat.com/1925045 for a demonstration of where our low backlog prevents libnbd from connecting as many parallel clients as it wants. Reported-by: Richard W.M. Jones Signed-off-by: Eric Blake CC: qemu-sta...@nongnu.org --- blockdev-nbd.c | 7 ++- qemu-nbd.c | 10 +- 2 files changed, 15 insertions(+), 2 deletions(-) diff --git a/blockdev-nbd.c b/blockdev-nbd.c index d8443d235b73..b264620b98d8 100644 --- a/blockdev-nbd.c +++ b/blockdev-nbd.c @@ -134,7 +134,12 @@ void nbd_server_start(SocketAddress *addr, const char *tls_creds, qio_net_listener_set_name(nbd_server->listener, "nbd-listener"); -if (qio_net_listener_open_sync(nbd_server->listener, addr, 1, errp) < 0) { +/* + * Because this server is persistent, a backlog of SOMAXCONN is + * better than trying to size it to max_connections. 
+ */ +if (qio_net_listener_open_sync(nbd_server->listener, addr, SOMAXCONN, + errp) < 0) { goto error; } diff --git a/qemu-nbd.c b/qemu-nbd.c index 608c63e82a25..1a340ea4858d 100644 --- a/qemu-nbd.c +++ b/qemu-nbd.c @@ -964,8 +964,16 @@ int main(int argc, char **argv) server = qio_net_listener_new(); if (socket_activation == 0) { +int backlog; + +if (persistent) { +backlog = SOMAXCONN; +} else { +backlog = MIN(shared, SOMAXCONN); +} saddr = nbd_build_socket_address(sockpath, bindto, port); -if (qio_net_listener_open_sync(server, saddr, 1, &local_err) < 0) { +if (qio_net_listener_open_sync(server, saddr, backlog, + &local_err) < 0) { object_unref(OBJECT(server)); error_report_err(local_err); exit(EXIT_FAILURE); -- 2.30.0
[PATCH v3 0/2] NBD socket backlog
in v2: - also adjust backlog of QMP nbd-server-start [Dan] - tweak qemu-nbd backlog to -e when not persistent [Nir] - allow qemu-nbd -e0 for symmetry with QMP [new patch 2] Eric Blake (2): qemu-nbd: Use SOMAXCONN for socket listen() backlog qemu-nbd: Permit --shared=0 for unlimited clients docs/tools/qemu-nbd.rst | 4 ++-- blockdev-nbd.c | 7 ++- qemu-nbd.c | 15 +++ 3 files changed, 19 insertions(+), 7 deletions(-) -- 2.30.0
Re: [PULL 00/56] emulated nvme patches
On Tue, 9 Feb 2021 at 07:31, Klaus Jensen wrote: > > From: Klaus Jensen > > The following changes since commit 4f799257b323e1238a900fd0c71c2057863e0308: > > Merge remote-tracking branch 'remotes/armbru/tags/pull-qapi-2021-02-08' > into staging (2021-02-08 16:12:21 +) > > are available in the Git repository at: > > git://git.infradead.org/qemu-nvme.git tags/nvme-next-pull-request > > for you to fetch changes up to 3e22762edc74be3e1ecafc361351a9640d114978: > > hw/block/nvme: refactor the logic for zone write checks (2021-02-08 > 21:15:54 +0100) > > > Emulated NVMe device updates > > * deallocate or unwritten logical block error feature (me) > * dataset management command (me) > * compare command (Gollu Appalanaidu) > * namespace types (Niklas Cassel) > * zoned namespaces (Dmitry Fomichev) > * smart critical warning toggle (Zhenwei Pi) > * allow cmb and pmr to coexist (Andrzej Jakowski, me) > * pmr rds/wds support (Naveen Nagar) > * cmb v1.4 logic (Padmakar Kalghatgi) > > And a lot of smaller fixes from Gollu Appalanaidu, Minwoo Im and me. > > Applied, thanks. Please update the changelog at https://wiki.qemu.org/ChangeLog/6.0 for any user-visible changes. -- PMM
Re: [PATCH 0/7] qcow2: compressed write cache
On 09.02.21 15:10, Vladimir Sementsov-Ogievskiy wrote: 09.02.2021 16:25, Max Reitz wrote: On 29.01.21 17:50, Vladimir Sementsov-Ogievskiy wrote: Hi all! I know, I have several series waiting for a resend, but I had to switch to another task spawned from our customer's bug. Original problem: we use O_DIRECT for all vm images in our product, it's the policy. The only exclusion is backup target qcow2 image for compressed backup, because compressed backup is extremely slow with O_DIRECT (due to unaligned writes). Customer complains that backup produces a lot of pagecache. So we can either implement some internal cache or use fadvise somehow. Backup has several async workes, which writes simultaneously, so in both ways we have to track host cluster filling (before dropping the cache corresponding to the cluster). So, if we have to track anyway, let's try to implement the cache. I wanted to be excited here, because that sounds like it would be very easy to implement caching. Like, just keep the cluster at free_byte_offset cached until the cluster it points to changes, then flush the cluster. The problem is that chunks are written asynchronously.. That's why this all is not so easy. But then I see like 900 new lines of code, and I’m much less excited... Idea is simple: cache small unaligned write and flush the cluster when filled. Performance result is very good (results in a table is time of compressed backup of 1000M disk filled with ones in seconds): “Filled with ones” really is an edge case, though. Yes, I think, all clusters are compressed to rather small chunks :) --- --- --- backup(old) backup(new) ssd:hdd(direct) 3e+02 4.4 -99% ssd:hdd(cached) 5.7 5.4 -5% --- --- --- So, we have benefit even for cached mode! And the fastest thing is O_DIRECT with new implemented cache. So, I suggest to enable the new cache by default (which is done by the series). 
First, I’m not sure how O_DIRECT really is relevant, because I don’t really see the point for writing compressed images. compressed backup is a point (Perhaps irrelevant, but just to be clear:) I meant the point of using O_DIRECT, which one can decide to not use for backup targets (as you have done already). Second, I find it a bit cheating if you say there is a huge improvement for the no-cache case, when actually, well, you just added a cache. So the no-cache case just became faster because there is a cache now. Still, performance comparison is relevant to show that O_DIRECT as is unusable for compressed backup. (Again, perhaps irrelevant, but:) Yes, but my first point was exactly whether O_DIRECT is even relevant for writing compressed images. Well, I suppose I could follow that if O_DIRECT doesn’t make much sense for compressed images, qemu’s format drivers are free to introduce some caching (because technically the cache.direct option only applies to the protocol driver) for collecting compressed writes. Yes I thought in this way, enabling the cache by default. That conclusion makes both of my complaints kind of moot. *shrug* Third, what is the real-world impact on the page cache? You described that that’s the reason why you need the cache in qemu, because otherwise the page cache is polluted too much. How much is the difference really? (I don’t know how good the compression ratio is for real-world images.) Hm. I don't know the ratio.. Customer reported that most of RAM is polluted by Qemu's cache, and we use O_DIRECT for everything except for target of compressed backup.. Still the pollution may relate to several backups and of course it is simple enough to drop the cache after each backup. But I think that even one backup of 16T disk may pollute RAM enough. Oh, sorry, I just realized I had a brain fart there. I was referring to whether this series improves the page cache pollution. But obviously it will if it allows you to re-enable O_DIRECT. 
Related to that, I remember a long time ago we had some discussion about letting qemu-img convert set a special cache mode for the target image that would make Linux drop everything before the last offset written (i.e., I suppose fadvise() with POSIX_FADV_SEQUENTIAL). You discard that idea based on the fact that implementing a cache in qemu would be simple, but it isn’t, really. What would the impact of POSIX_FADV_SEQUENTIAL be? (One advantage of using that would be that we could reuse it for non-compressed images that are written by backup or qemu-img convert.) The problem is that writes are async. And therefore, not sequential. In theory, yes, but all compressed writes still goes through qcow2_alloc_bytes() right before submitting the write, so I wonder whether in practice the writes aren’t usually sufficiently sequential to make POSIX_FADV_SEQUENTIAL
Re: [PATCH] hw/sd/sdhci: Do not modify BlockSizeRegister if transaction in progress
On 210209 1745, Bin Meng wrote: > Oops, hitting "send" by mistake ... > > On Tue, Feb 9, 2021 at 5:42 PM Bin Meng wrote: > > > > Hi Philippe, > > > > On Tue, Feb 9, 2021 at 5:38 PM Philippe Mathieu-Daudé > > wrote: > > > > > > On 2/9/21 9:28 AM, Bin Meng wrote: > > > > Hi Philippe, > > > > > > > > On Tue, Feb 9, 2021 at 3:34 AM Philippe Mathieu-Daudé > > > > wrote: > > > >> > > > >> Per the "SD Host Controller Simplified Specification Version 2.00" > > > >> spec. 'Table 2-4 : Block Size Register': > > > >> > > > >> Transfer Block Size [...] can be accessed only if no > > > >> transaction is executing (i.e., after a transaction has stopped). > > > >> Read operations during transfers may return an invalid value, > > > >> and write operations shall be ignored. > > > >> > > > >> Transactions will update 'data_count', so do not modify 'blksize' > > > >> and 'blkcnt' when 'data_count' is used. This fixes: > > > >> > > > >> $ cat << EOF | qemu-system-x86_64 -qtest stdio -monitor none \ > > > >>-nographic -serial none -M pc-q35-5.0 \ > > > >>-device sdhci-pci,sd-spec-version=3 \ > > > >>-device sd-card,drive=mydrive \ > > > >>-drive > > > >> if=sd,index=0,file=null-co://,format=raw,id=mydrive > > > >> outl 0xcf8 0x80001810 > > > >> outl 0xcfc 0xe1068000 > > > >> outl 0xcf8 0x80001814 > > > > > > > > Is this command needed? > > > > > > My guess is this makes the northbridge somehow map the device PCI space. > > > > > > Probably not needed in machines where SDHCI is MMIO mapped. > > > > I think this is not needed. Writing only the CFG_ADDR > > I think this is not needed. Writing only the CFG_ADDR without wring > CFG_DATA does not take any effect. 
> Ran it through scripts/oss-fuzz/minimize_qtest_trace.py , though that's probably not very useful now: cat << EOF | qemu-system-x86_64 -qtest stdio -monitor none \ -nographic -serial none -M pc-q35-5.0 \ -device sdhci-pci,sd-spec-version=3 \ -device sd-card,drive=mydrive \ -drive if=sd,index=0,file=null-co://,format=raw,id=mydrive outl 0xcf8 0x80001810 outl 0xcfc 0xe1068000 outl 0xcf8 0x80001804 outw 0xcfc 0x7 write 0xe106802c 0x1 0x0f write 0xe1068004 0x1 0x20 write 0xe1068005 0x1 0x01 write 0xe1068007 0x1 0x01 write 0xe106800c 0x1 0x33 write 0xe106800e 0x1 0x20 write 0xe106800f 0x1 0x0 write 0xe106800c 0x1 0x0 write 0xe106802a 0x1 0x11 write 0xe1068003 0x1 0x0 write 0xe1068005 0x1 0x00 write 0xe106800c 0x1 0x22 write 0xe106802a 0x1 0x12 write 0xe1068003 0x1 0x10 EOF > > > > > > > > > > > > >> outl 0xcf8 0x80001804 > > > >> outw 0xcfc 0x7 > > > >> outl 0xcf8 0x8000fa20 > > > > > > > > and this one? > > > > > > Ditto. > > > > > > > > > > >> write 0xe106802c 0x1 0x0f > > > >> write 0xe1068004 0xc 0x2801d10101fbff28a384 > > > > > > > > Are these fuzzy data? > > > > > > Yes, I didn't try to understand what this does, as often > > > non-sense operations. But this is what would craft a malicious > > > attacker. 
> > > > > > > > > > >> write 0xe106800c 0x1f > > > >> 0x9dacbbcad9e8f7061524334251606f7e8d9cabbac9d8e7f60514233241505f > > > >> write 0xe1068003 0x28 > > > >> 0x80d000251480d000252280d000253080d000253e80d000254c80d000255a80d000256880d0002576 > > > >> write 0xe1068003 0x1 0xfe > > > >> EOF > > > >> = > > > >> ==2686219==ERROR: AddressSanitizer: heap-buffer-overflow on address > > > >> 0x6153bb00 at pc 0x55ab469f456c bp 0x7ffee71be330 sp 0x7ffee71bdae0 > > > >> WRITE of size 4 at 0x6153bb00 thread T0 > > > >> #0 0x55ab469f456b in __asan_memcpy (qemu-system-i386+0x1cea56b) > > > >> #1 0x55ab483dc396 in stl_he_p include/qemu/bswap.h:353:5 > > > >> #2 0x55ab483af5e4 in stn_he_p include/qemu/bswap.h:546:1 > > > >> #3 0x55ab483aeb4b in flatview_read_continue > > > >> softmmu/physmem.c:2839:13 > > > >> #4 0x55ab483b0705 in flatview_read softmmu/physmem.c:2877:12 > > > >> #5 0x55ab483b028e in address_space_read_full > > > >> softmmu/physmem.c:2890:18 > > > >> #6 0x55ab483b1294 in address_space_rw softmmu/physmem.c:2918:16 > > > >> #7 0x55ab479374a2 in dma_memory_rw_relaxed > > > >> include/sysemu/dma.h:88:12 > > > >> #8 0x55ab47936f50 in dma_memory_rw include/sysemu/dma.h:127:12 > > > >> #9 0x55ab4793665f in dma_memory_read include/sysemu/dma.h:145:12 > > > >> #10 0x55ab4792f176 in sdhci_sdma_transfer_multi_blocks > > > >> hw/sd/sdhci.c:639:13 > > > >> #11 0x55ab4793dc9d in sdhci_write hw/sd/sdhci.c:1129:17 > > > >> #12 0x55ab483f8db8 in memory_region_write_accessor > > > >> softmmu/memory.c:491:5 > > > >> #13 0x55ab483f868a in access_with_adjusted_size > > > >> softmmu/memory.c:552:18 > > > >> #14 0x55ab483f6da5 in memory_region_dispatch_write > > > >> softmmu/memory.c:1501:16 > > > >> #15 0x55ab483c3b11 in flatview_write_continue > > >
Re: [PATCH 2/2] travis: remove travis configuration and all references to Travis CI
On 09/02/2021 14.50, Daniel P. Berrangé wrote: The Travis CI system QEMU has been using has removed the unlimited free usage model, replacing it with a one-time only grant of CI minutes that is not renewed. The QEMU CI jobs quickly exhaust maintainer's free CI credits, leaving them unable to test with Travis. This is not a sustainable situation, so we have no choice but to discontinue use of Travis. GitLab CI is now the primary target, with Cirrus CI filling in some platform gaps where needed. I've currently got a series in flight that moves some of the remaining jobs to gitlab-CI: https://lists.gnu.org/archive/html/qemu-devel/2021-02/msg01924.html Could you please hold this patch 'til my series got merged first? Also I think we could still wait some more weeks with the final removal of the travis-CI either 'til travis-ci.org got shut down completely (and thus we cannot use it for QEMU at all anymore), or until we finally got the s390x and aarch64 runners up and running in the gitlab-CI. Thomas
Re: [PATCH v4 0/9] hw/sd: Support block read/write in SPI mode
Hi Philippe, On Thu, Feb 4, 2021 at 2:02 PM Bin Meng wrote: > > On Thu, Jan 28, 2021 at 2:30 PM Bin Meng wrote: > > > > From: Bin Meng > > > > This includes the previously v3 series [1], and one single patch [2]. > > > > Compared to v3, this fixed the following issue in patch [v3,6/6]: > > - Keep the card state to SSI_SD_CMD instead of SSI_SD_RESPONSE after > > receiving the STOP_TRAN token per the spec > > > > All software tested so far (U-Boot/Linux/VxWorks) do work without > > the fix, but it is better to conform with the spec. > > > > In addition to [2], one more issue was exposed when testing with > > VxWorks driver related to STOP_TRANSMISSION (CMD12) response. > > > > [1] http://patchwork.ozlabs.org/project/qemu-devel/list/?series=226136 > > [2] > > http://patchwork.ozlabs.org/project/qemu-devel/patch/1611636214-52427-1-git-send-email-bmeng...@gmail.com/ > > > > Changes in v4: > > - Keep the card state to SSI_SD_CMD instead of SSI_SD_RESPONSE after > > receiving the STOP_TRAN token per the spec > > - new patch: fix STOP_TRANSMISSION (CMD12) response > > - new patch: handle the rest commands with R1b response type > > > > Ping? Will a PR be sent soon to include this series so that the SiFive SPI series can follow? Regards, Bin
Re: [PATCH 0/7] qcow2: compressed write cache
09.02.2021 16:25, Max Reitz wrote: On 29.01.21 17:50, Vladimir Sementsov-Ogievskiy wrote: Hi all! I know, I have several series waiting for a resend, but I had to switch to another task spawned from our customer's bug. Original problem: we use O_DIRECT for all vm images in our product, it's the policy. The only exclusion is backup target qcow2 image for compressed backup, because compressed backup is extremely slow with O_DIRECT (due to unaligned writes). Customer complains that backup produces a lot of pagecache. So we can either implement some internal cache or use fadvise somehow. Backup has several async workes, which writes simultaneously, so in both ways we have to track host cluster filling (before dropping the cache corresponding to the cluster). So, if we have to track anyway, let's try to implement the cache. I wanted to be excited here, because that sounds like it would be very easy to implement caching. Like, just keep the cluster at free_byte_offset cached until the cluster it points to changes, then flush the cluster. The problem is that chunks are written asynchronously.. That's why this all is not so easy. But then I see like 900 new lines of code, and I’m much less excited... Idea is simple: cache small unaligned write and flush the cluster when filled. Performance result is very good (results in a table is time of compressed backup of 1000M disk filled with ones in seconds): “Filled with ones” really is an edge case, though. Yes, I think, all clusters are compressed to rather small chunks :) --- --- --- backup(old) backup(new) ssd:hdd(direct) 3e+02 4.4 -99% ssd:hdd(cached) 5.7 5.4 -5% --- --- --- So, we have benefit even for cached mode! And the fastest thing is O_DIRECT with new implemented cache. So, I suggest to enable the new cache by default (which is done by the series). First, I’m not sure how O_DIRECT really is relevant, because I don’t really see the point for writing compressed images. 
compressed backup is a point Second, I find it a bit cheating if you say there is a huge improvement for the no-cache case, when actually, well, you just added a cache. So the no-cache case just became faster because there is a cache now. Still, performance comparison is relevant to show that O_DIRECT as is unusable for compressed backup. Well, I suppose I could follow that if O_DIRECT doesn’t make much sense for compressed images, qemu’s format drivers are free to introduce some caching (because technically the cache.direct option only applies to the protocol driver) for collecting compressed writes. Yes I thought in this way, enabling the cache by default. That conclusion makes both of my complaints kind of moot. *shrug* Third, what is the real-world impact on the page cache? You described that that’s the reason why you need the cache in qemu, because otherwise the page cache is polluted too much. How much is the difference really? (I don’t know how good the compression ratio is for real-world images.) Hm. I don't know the ratio.. Customer reported that most of RAM is polluted by Qemu's cache, and we use O_DIRECT for everything except for target of compressed backup.. Still the pollution may relate to several backups and of course it is simple enough to drop the cache after each backup. But I think that even one backup of 16T disk may pollute RAM enough. Related to that, I remember a long time ago we had some discussion about letting qemu-img convert set a special cache mode for the target image that would make Linux drop everything before the last offset written (i.e., I suppose fadvise() with POSIX_FADV_SEQUENTIAL). You discard that idea based on the fact that implementing a cache in qemu would be simple, but it isn’t, really. What would the impact of POSIX_FADV_SEQUENTIAL be? (One advantage of using that would be that we could reuse it for non-compressed images that are written by backup or qemu-img convert.) The problem is that writes are async. 
And therefore, not sequential. So I have to track the writes and wait until the whole cluster is filled. It's simple use fadvise as an option to my cache: instead of caching data and write when cluster is filled we can instead mark cluster POSIX_FADV_DONTNEED. (I don’t remember why that qemu-img discussion died back then.) Fourth, regarding the code, would it be simpler if it were a pure write cache? I.e., on read, everything is flushed, so we don’t have to deal with that. I don’t think there are many valid cases where a compressed image is both written to and read from at the same time. (Just asking, because I’d really want this code to be simpler. I can imagine that reading from the cache is the least bit of complexity, but perhaps...) Hm. I really didn't want to support reads, and do it
Re: [PATCH 2/2] travis: remove travis configuration and all references to Travis CI
On Tue, Feb 09, 2021 at 02:58:46PM +0100, Philippe Mathieu-Daudé wrote: > On 2/9/21 2:50 PM, Daniel P. Berrangé wrote: > > The Travis CI system QEMU has been using has removed the unlimited free > > usage model, replacing it with a one-time only grant of CI minutes that > > is not renewed. The QEMU CI jobs quickly exhaust maintainer's free CI > > credits, leaving them unable to test with Travis. This is not a > > sustainable situation, so we have no choice by to discontinue use of > > Travis. GitLab CI is now the primary target, with Cirrus CI filling > > in some platform gaps where needed. > > > > Signed-off-by: Daniel P. Berrangé > > --- > > .travis.yml| 439 - > > MAINTAINERS| 3 - > > configure | 1 - > > contrib/gitdm/filetypes.txt| 2 +- > > scripts/travis/coverage-summary.sh | 27 -- > > tests/docker/docker.py | 2 +- > > tests/qemu-iotests/079 | 2 +- > > tests/test-util-filemonitor.c | 11 - > > 8 files changed, 3 insertions(+), 484 deletions(-) > > delete mode 100644 .travis.yml > > delete mode 100755 scripts/travis/coverage-summary.sh > ... > > > diff --git a/configure b/configure > > index 7c496d81fc..058a7c7967 100755 > > --- a/configure > > +++ b/configure > > @@ -4872,7 +4872,6 @@ fi > > > > # See if __attribute__((alias)) is supported. > > # This false for Xcode 9, but has been remedied for Xcode 10. > > Not related to this patch, but I don't think Xcode 9 is supported > anymore. > > > -# Unfortunately, travis uses Xcode 9 by default. > > > > attralias=no > > cat > $TMPC << EOF > > > diff --git a/scripts/travis/coverage-summary.sh > > b/scripts/travis/coverage-summary.sh > > deleted file mode 100755 > > index d7086cf9ca..00 > > --- a/scripts/travis/coverage-summary.sh > > +++ /dev/null > > @@ -1,27 +0,0 @@ > > -#!/bin/sh > > -# > > -# Author: Alex Bennée > > -# > > -# Summerise the state of code coverage with gcovr and tweak the output > > -# to be more sane on Travis hosts. 
As we expect to be executed on a > > -# throw away CI instance we do spam temp files all over the shop. You > > -# most likely don't want to execute this script but just call gcovr > > -# directly. See also "make coverage-report" > > -# > > -# This code is licensed under the GPL version 2 or later. See > > -# the COPYING file in the top-level directory. > > - > > -# first generate the coverage report > > -gcovr -p -o raw-report.txt > > - > > -# strip the full-path and line markers > > -sed s@$PWD\/@@ raw-report.txt | sed s/[0-9]\*[,-]//g > simplified.txt > > - > > -# reflow lines that got split > > -awk '/.[ch]$/ { printf("%s", $0); next } 1' simplified.txt > rejoined.txt > > - > > -# columnify > > -column -t rejoined.txt > final.txt > > - > > -# and dump, stripping out 0% coverage > > -grep -v "0%" final.txt > > This script can be run on other CI. > > Keeping scripts/travis/coverage-summary.sh (moved to > scripts/ci/coverage-summary.sh): I notice that the "gcovr" program used here should be able to output an XML document in a format that is supported by GitLab, which can then pretty-display the results. If we do that, perhaps we won't ned this coverage-summary script for post-processing the text output format ? I guess we need to make sure gcovr is actually installed in all our dockerfiles used by gitlab. > Reviewed-by: Philippe Mathieu-Daudé > Regards, Daniel -- |: https://berrange.com -o-https://www.flickr.com/photos/dberrange :| |: https://libvirt.org -o-https://fstop138.berrange.com :| |: https://entangle-photo.org-o-https://www.instagram.com/dberrange :|
Re: [PATCH 2/2] travis: remove travis configuration and all references to Travis CI
On 2/9/21 2:50 PM, Daniel P. Berrangé wrote: > The Travis CI system QEMU has been using has removed the unlimited free > usage model, replacing it with a one-time only grant of CI minutes that > is not renewed. The QEMU CI jobs quickly exhaust maintainer's free CI > credits, leaving them unable to test with Travis. This is not a > sustainable situation, so we have no choice by to discontinue use of > Travis. GitLab CI is now the primary target, with Cirrus CI filling > in some platform gaps where needed. > > Signed-off-by: Daniel P. Berrangé > --- > .travis.yml| 439 - > MAINTAINERS| 3 - > configure | 1 - > contrib/gitdm/filetypes.txt| 2 +- > scripts/travis/coverage-summary.sh | 27 -- > tests/docker/docker.py | 2 +- > tests/qemu-iotests/079 | 2 +- > tests/test-util-filemonitor.c | 11 - > 8 files changed, 3 insertions(+), 484 deletions(-) > delete mode 100644 .travis.yml > delete mode 100755 scripts/travis/coverage-summary.sh ... > diff --git a/configure b/configure > index 7c496d81fc..058a7c7967 100755 > --- a/configure > +++ b/configure > @@ -4872,7 +4872,6 @@ fi > > # See if __attribute__((alias)) is supported. > # This false for Xcode 9, but has been remedied for Xcode 10. Not related to this patch, but I don't think Xcode 9 is supported anymore. > -# Unfortunately, travis uses Xcode 9 by default. > > attralias=no > cat > $TMPC << EOF > diff --git a/scripts/travis/coverage-summary.sh > b/scripts/travis/coverage-summary.sh > deleted file mode 100755 > index d7086cf9ca..00 > --- a/scripts/travis/coverage-summary.sh > +++ /dev/null > @@ -1,27 +0,0 @@ > -#!/bin/sh > -# > -# Author: Alex Bennée > -# > -# Summerise the state of code coverage with gcovr and tweak the output > -# to be more sane on Travis hosts. As we expect to be executed on a > -# throw away CI instance we do spam temp files all over the shop. You > -# most likely don't want to execute this script but just call gcovr > -# directly. 
See also "make coverage-report" > -# > -# This code is licensed under the GPL version 2 or later. See > -# the COPYING file in the top-level directory. > - > -# first generate the coverage report > -gcovr -p -o raw-report.txt > - > -# strip the full-path and line markers > -sed s@$PWD\/@@ raw-report.txt | sed s/[0-9]\*[,-]//g > simplified.txt > - > -# reflow lines that got split > -awk '/.[ch]$/ { printf("%s", $0); next } 1' simplified.txt > rejoined.txt > - > -# columnify > -column -t rejoined.txt > final.txt > - > -# and dump, stripping out 0% coverage > -grep -v "0%" final.txt This script can be run on other CI. Keeping scripts/travis/coverage-summary.sh (moved to scripts/ci/coverage-summary.sh): Reviewed-by: Philippe Mathieu-Daudé
Re: [PATCH 2/2] travis: remove travis configuration and all references to Travis CI
On 2/9/21 3:03 PM, Daniel P. Berrangé wrote: > On Tue, Feb 09, 2021 at 02:58:46PM +0100, Philippe Mathieu-Daudé wrote: >> On 2/9/21 2:50 PM, Daniel P. Berrangé wrote: >>> The Travis CI system QEMU has been using has removed the unlimited free >>> usage model, replacing it with a one-time only grant of CI minutes that >>> is not renewed. The QEMU CI jobs quickly exhaust maintainer's free CI >>> credits, leaving them unable to test with Travis. This is not a >>> sustainable situation, so we have no choice by to discontinue use of >>> Travis. GitLab CI is now the primary target, with Cirrus CI filling >>> in some platform gaps where needed. >>> >>> Signed-off-by: Daniel P. Berrangé >>> --- >>> .travis.yml| 439 - >>> MAINTAINERS| 3 - >>> configure | 1 - >>> contrib/gitdm/filetypes.txt| 2 +- >>> scripts/travis/coverage-summary.sh | 27 -- >>> tests/docker/docker.py | 2 +- >>> tests/qemu-iotests/079 | 2 +- >>> tests/test-util-filemonitor.c | 11 - >>> 8 files changed, 3 insertions(+), 484 deletions(-) >>> delete mode 100644 .travis.yml >>> delete mode 100755 scripts/travis/coverage-summary.sh >> ... >> >>> diff --git a/configure b/configure >>> index 7c496d81fc..058a7c7967 100755 >>> --- a/configure >>> +++ b/configure >>> @@ -4872,7 +4872,6 @@ fi >>> >>> # See if __attribute__((alias)) is supported. >>> # This false for Xcode 9, but has been remedied for Xcode 10. >> >> Not related to this patch, but I don't think Xcode 9 is supported >> anymore. >> >>> -# Unfortunately, travis uses Xcode 9 by default. >>> >>> attralias=no >>> cat > $TMPC << EOF >> >>> diff --git a/scripts/travis/coverage-summary.sh >>> b/scripts/travis/coverage-summary.sh >>> deleted file mode 100755 >>> index d7086cf9ca..00 >>> --- a/scripts/travis/coverage-summary.sh >>> +++ /dev/null >>> @@ -1,27 +0,0 @@ >>> -#!/bin/sh >>> -# >>> -# Author: Alex Bennée >>> -# >>> -# Summerise the state of code coverage with gcovr and tweak the output >>> -# to be more sane on Travis hosts. 
As we expect to be executed on a >>> -# throw away CI instance we do spam temp files all over the shop. You >>> -# most likely don't want to execute this script but just call gcovr >>> -# directly. See also "make coverage-report" >>> -# >>> -# This code is licensed under the GPL version 2 or later. See >>> -# the COPYING file in the top-level directory. >>> - >>> -# first generate the coverage report >>> -gcovr -p -o raw-report.txt >>> - >>> -# strip the full-path and line markers >>> -sed s@$PWD\/@@ raw-report.txt | sed s/[0-9]\*[,-]//g > simplified.txt >>> - >>> -# reflow lines that got split >>> -awk '/.[ch]$/ { printf("%s", $0); next } 1' simplified.txt > rejoined.txt >>> - >>> -# columnify >>> -column -t rejoined.txt > final.txt >>> - >>> -# and dump, stripping out 0% coverage >>> -grep -v "0%" final.txt >> >> This script can be run on other CI. >> >> Keeping scripts/travis/coverage-summary.sh (moved to >> scripts/ci/coverage-summary.sh): > > I notice that the "gcovr" program used here should be able to output > an XML document in a format that is supported by GitLab, which can > then pretty-display the results. Good idea. > If we do that, perhaps we won't ned this coverage-summary script > for post-processing the text output format ? This indeed requires further testing. I'd worry about that later. I'll let Alex see how he wants to deal with that, we can still add the script back later. > I guess we need to make sure gcovr is actually installed in all > our dockerfiles used by gitlab. > >> Reviewed-by: Philippe Mathieu-Daudé >> > > Regards, > Daniel >
[PATCH 2/2] travis: remove travis configuration and all references to Travis CI
The Travis CI system QEMU has been using has removed the unlimited free usage model, replacing it with a one-time only grant of CI minutes that is not renewed. The QEMU CI jobs quickly exhaust maintainer's free CI credits, leaving them unable to test with Travis. This is not a sustainable situation, so we have no choice by to discontinue use of Travis. GitLab CI is now the primary target, with Cirrus CI filling in some platform gaps where needed. Signed-off-by: Daniel P. Berrangé --- .travis.yml| 439 - MAINTAINERS| 3 - configure | 1 - contrib/gitdm/filetypes.txt| 2 +- scripts/travis/coverage-summary.sh | 27 -- tests/docker/docker.py | 2 +- tests/qemu-iotests/079 | 2 +- tests/test-util-filemonitor.c | 11 - 8 files changed, 3 insertions(+), 484 deletions(-) delete mode 100644 .travis.yml delete mode 100755 scripts/travis/coverage-summary.sh diff --git a/.travis.yml b/.travis.yml deleted file mode 100644 index 5f1dea873e..00 --- a/.travis.yml +++ /dev/null @@ -1,439 +0,0 @@ -# The current Travis default is a VM based 16.04 Xenial on GCE -# Additional builds with specific requirements for a full VM need to -# be added as additional matrix: entries later on -os: linux -dist: focal -language: c -compiler: - - gcc -cache: - # There is one cache per branch and compiler version. 
- # characteristics of each job are used to identify the cache: - # - OS name (currently only linux) - # - OS distribution (for Linux, bionic or focal) - # - Names and values of visible environment variables set in .travis.yml or Settings panel - timeout: 1200 - ccache: true - pip: true - directories: - - $HOME/avocado/data/cache - - -addons: - apt: -packages: - # Build dependencies - - libaio-dev - - libattr1-dev - - libbrlapi-dev - - libcap-ng-dev - - libgcc-7-dev - - libgnutls28-dev - - libgtk-3-dev - - libiscsi-dev - - liblttng-ust-dev - - libncurses5-dev - - libnfs-dev - - libnss3-dev - - libpixman-1-dev - - libpng-dev - - librados-dev - - libsdl2-dev - - libsdl2-image-dev - - libseccomp-dev - - libspice-protocol-dev - - libspice-server-dev - - libssh-dev - - liburcu-dev - - libusb-1.0-0-dev - - libvdeplug-dev - - libvte-2.91-dev - - libzstd-dev - - ninja-build - - sparse - - uuid-dev - - gcovr - # Tests dependencies - - genisoimage - - -# The channel name "irc.oftc.net#qemu" is encrypted against qemu/qemu -# to prevent IRC notifications from forks. This was created using: -# $ travis encrypt -r "qemu/qemu" "irc.oftc.net#qemu" -notifications: - irc: -channels: - - secure: "F7GDRgjuOo5IUyRLqSkmDL7kvdU4UcH3Lm/W2db2JnDHTGCqgEdaYEYKciyCLZ57vOTsTsOgesN8iUT7hNHBd1KWKjZe9KDTZWppWRYVwAwQMzVeSOsbbU4tRoJ6Pp+3qhH1Z0eGYR9ZgKYAoTumDFgSAYRp4IscKS8jkoedOqM=" -on_success: change -on_failure: always - - -env: - global: -- SRC_DIR=".." 
-- BUILD_DIR="build" -- BASE_CONFIG="--disable-docs --disable-tools" -- TEST_BUILD_CMD="" -- TEST_CMD="make check V=1" -# This is broadly a list of "mainline" softmmu targets which have support across the major distros -- MAIN_SOFTMMU_TARGETS="aarch64-softmmu,mips64-softmmu,ppc64-softmmu,riscv64-softmmu,s390x-softmmu,x86_64-softmmu" -- CCACHE_SLOPPINESS="include_file_ctime,include_file_mtime" -- CCACHE_MAXSIZE=1G -- G_MESSAGES_DEBUG=error - - -git: - # we want to do this ourselves - submodules: false - -# Common first phase for all steps -before_install: - - if command -v ccache ; then ccache --zero-stats ; fi - - export JOBS=$(($(getconf _NPROCESSORS_ONLN) + 1)) - - echo "=== Using ${JOBS} simultaneous jobs ===" - -# Configure step - may be overridden -before_script: - - mkdir -p ${BUILD_DIR} && cd ${BUILD_DIR} - - ${SRC_DIR}/configure ${BASE_CONFIG} ${CONFIG} || { cat config.log meson-logs/meson-log.txt && exit 1; } - -# Main build & test - rarely overridden - controlled by TEST_CMD -script: - - BUILD_RC=0 && make -j${JOBS} || BUILD_RC=$? - - | -if [ "$BUILD_RC" -eq 0 ] && [ -n "$TEST_BUILD_CMD" ]; then -${TEST_BUILD_CMD} || BUILD_RC=$? -else -$(exit $BUILD_RC); -fi - - | -if [ "$BUILD_RC" -eq 0 ] ; then -${TEST_CMD} ; -else -$(exit $BUILD_RC); -fi -after_script: - - df -h - - if command -v ccache ; then ccache --show-stats ; fi - - -jobs: - include: -# --enable-debug implies --enable-debug-tcg, also runs quite a bit slower -- name: "GCC debug (main-softmmu)" - env: -- CONFIG="--enable-debug --target-list=${MAIN_SOFTMMU_TARGETS}" -- CACHE_NAME="${TRAVIS_BRANCH}-linux-gcc-debug" - - -# TCG debug can be run just on its own and is mostly agnostic to user/softmmu distinctions -- name: "GCC debug (user)" -
[PATCH 1/2] tests/docker: remove travis container
The travis container that we have no longer matches what travis currently uses. As all x86 jobs are being moved to GitLab CI too, there is no compelling reason to update the travis container. It is simpler to just remove it. Reviewed-by: Philippe Mathieu-Daudé Reviewed-by: Wainer dos Santos Moschetta Signed-off-by: Daniel P. Berrangé --- docs/devel/testing.rst | 14 tests/docker/Makefile.include | 11 ++ tests/docker/dockerfiles/travis.docker | 17 -- tests/docker/travis| 22 tests/docker/travis.py | 47 -- 5 files changed, 2 insertions(+), 109 deletions(-) delete mode 100644 tests/docker/dockerfiles/travis.docker delete mode 100755 tests/docker/travis delete mode 100755 tests/docker/travis.py diff --git a/docs/devel/testing.rst b/docs/devel/testing.rst index 209f9d8172..00ce16de48 100644 --- a/docs/devel/testing.rst +++ b/docs/devel/testing.rst @@ -357,20 +357,6 @@ source and build it. The full list of tests is printed in the ``make docker`` help. -Tools -- - -There are executables that are created to run in a specific Docker environment. -This makes it easy to write scripts that have heavy or special dependencies, -but are still very easy to use. - -Currently the only tool is ``travis``, which mimics the Travis-CI tests in a -container. It runs in the ``travis`` image: - -.. 
code:: - - make docker-travis@travis - Debugging a Docker test failure --- diff --git a/tests/docker/Makefile.include b/tests/docker/Makefile.include index 93b29ad823..7cab761bf5 100644 --- a/tests/docker/Makefile.include +++ b/tests/docker/Makefile.include @@ -21,8 +21,6 @@ DOCKER_REGISTRY := $(if $(REGISTRY),$(REGISTRY),registry.gitlab.com/qemu-project DOCKER_TESTS := $(notdir $(shell \ find $(SRC_PATH)/tests/docker/ -name 'test-*' -type f)) -DOCKER_TOOLS := travis - ENGINE := auto DOCKER_SCRIPT=$(SRC_PATH)/tests/docker/docker.py --engine $(ENGINE) @@ -126,7 +124,7 @@ ifneq ($(HOST_ARCH),x86_64) DOCKER_PARTIAL_IMAGES += debian-mips-cross debian-mipsel-cross debian-mips64el-cross DOCKER_PARTIAL_IMAGES += debian-ppc64el-cross DOCKER_PARTIAL_IMAGES += debian-s390x-cross -DOCKER_PARTIAL_IMAGES += fedora travis +DOCKER_PARTIAL_IMAGES += fedora endif docker-image-debian-alpha-cross: docker-image-debian10 @@ -147,8 +145,6 @@ docker-image-debian-s390x-cross: docker-image-debian10 docker-image-debian-sh4-cross: docker-image-debian10 docker-image-debian-sparc64-cross: docker-image-debian10 -docker-image-travis: NOUSER=1 - # Specialist build images, sometimes very limited tools docker-image-debian-tricore-cross: docker-image-debian10 docker-image-debian-all-test-cross: docker-image-debian10 @@ -174,7 +170,7 @@ DOCKER_PARTIAL_IMAGES += fedora-i386-cross fedora-cris-cross # Expand all the pre-requistes for each docker image and test combination $(foreach i,$(filter-out $(DOCKER_PARTIAL_IMAGES),$(DOCKER_IMAGES)), \ - $(foreach t,$(DOCKER_TESTS) $(DOCKER_TOOLS), \ + $(foreach t,$(DOCKER_TESTS), \ $(eval .PHONY: docker-$t@$i) \ $(eval docker-$t@$i: docker-image-$i docker-run-$t@$i) \ ) \ @@ -212,9 +208,6 @@ endif @echo 'Available tests:' @echo '$(DOCKER_TESTS)' @echo - @echo 'Available tools:' - @echo '$(DOCKER_TOOLS)' - @echo @echo 'Special variables:' @echo 'TARGET_LIST=a,b,cOverride target list in builds.' 
@echo 'EXTRA_CONFIGURE_OPTS="..."' diff --git a/tests/docker/dockerfiles/travis.docker b/tests/docker/dockerfiles/travis.docker deleted file mode 100644 index cd1435a7e9..00 --- a/tests/docker/dockerfiles/travis.docker +++ /dev/null @@ -1,17 +0,0 @@ -# -# Travis Image - this is broadly the same image that we run our CI -# tests on. -# -FROM travisci/ci-sardonyx:packer-1552557266-f909ac5 -ENV DEBIAN_FRONTEND noninteractive -ENV LANG en_US.UTF-8 -ENV LC_ALL en_US.UTF-8 -RUN sed -i "s/# deb-src/deb-src/" /etc/apt/sources.list -RUN apt-get update -RUN apt-get -y build-dep qemu -RUN apt-get -y install device-tree-compiler python3 python3-yaml dh-autoreconf gdb strace lsof net-tools gcovr ninja-build -# Travis tools require PhantomJS / Neo4j / Maven accessible -# in their PATH (QEMU build won't access them). -ENV PATH /usr/local/phantomjs/bin:/usr/local/phantomjs:/usr/local/neo4j-3.2.7/bin:/usr/local/maven-3.5.2/bin:/usr/local/cmake-3.9.2/bin:/usr/local/clang-5.0.0/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin -ENV FEATURES clang pyyaml docs -USER travis diff --git a/tests/docker/travis b/tests/docker/travis deleted file mode 100755 index 47c03677d6..00 --- a/tests/docker/travis +++ /dev/null @@ -1,22 +0,0 @@ -#!/bin/bash -e -# -# Mimic a travis testing matrix -# -# Copyright (c) 2016 Red Hat Inc. -# -# Authors: -# Fam Zheng -# -# This work is licensed under the terms of t
[PATCH 0/2] travis: remove all use of Travis CI
It is not sustainable to keep supporting Travis CI when our maintainers have exhausted their free CI credit allowance and it isn't easily renewable for most. While there are still some unique scenarios covered by Travis, this is not useful when maintainers can't run the pipelines.
Re: [PATCH 0/7] qcow2: compressed write cache
On 29.01.21 17:50, Vladimir Sementsov-Ogievskiy wrote: Hi all! I know, I have several series waiting for a resend, but I had to switch to another task spawned from our customer's bug. Original problem: we use O_DIRECT for all vm images in our product, it's the policy. The only exclusion is backup target qcow2 image for compressed backup, because compressed backup is extremely slow with O_DIRECT (due to unaligned writes). Customer complains that backup produces a lot of pagecache. So we can either implement some internal cache or use fadvise somehow. Backup has several async workes, which writes simultaneously, so in both ways we have to track host cluster filling (before dropping the cache corresponding to the cluster). So, if we have to track anyway, let's try to implement the cache. I wanted to be excited here, because that sounds like it would be very easy to implement caching. Like, just keep the cluster at free_byte_offset cached until the cluster it points to changes, then flush the cluster. But then I see like 900 new lines of code, and I’m much less excited... Idea is simple: cache small unaligned write and flush the cluster when filled. Performance result is very good (results in a table is time of compressed backup of 1000M disk filled with ones in seconds): “Filled with ones” really is an edge case, though. --- --- --- backup(old) backup(new) ssd:hdd(direct) 3e+024.4 -99% ssd:hdd(cached) 5.7 5.4 -5% --- --- --- So, we have benefit even for cached mode! And the fastest thing is O_DIRECT with new implemented cache. So, I suggest to enable the new cache by default (which is done by the series). First, I’m not sure how O_DIRECT really is relevant, because I don’t really see the point for writing compressed images. Second, I find it a bit cheating if you say there is a huge improvement for the no-cache case, when actually, well, you just added a cache. So the no-cache case just became faster because there is a cache now. 
Well, I suppose I could follow that if O_DIRECT doesn’t make much sense for compressed images, qemu’s format drivers are free to introduce some caching (because technically the cache.direct option only applies to the protocol driver) for collecting compressed writes. That conclusion makes both of my complaints kind of moot. *shrug* Third, what is the real-world impact on the page cache? You described that that’s the reason why you need the cache in qemu, because otherwise the page cache is polluted too much. How much is the difference really? (I don’t know how good the compression ratio is for real-world images.) Related to that, I remember a long time ago we had some discussion about letting qemu-img convert set a special cache mode for the target image that would make Linux drop everything before the last offset written (i.e., I suppose fadvise() with POSIX_FADV_SEQUENTIAL). You discard that idea based on the fact that implementing a cache in qemu would be simple, but it isn’t, really. What would the impact of POSIX_FADV_SEQUENTIAL be? (One advantage of using that would be that we could reuse it for non-compressed images that are written by backup or qemu-img convert.) (I don’t remember why that qemu-img discussion died back then.) Fourth, regarding the code, would it be simpler if it were a pure write cache? I.e., on read, everything is flushed, so we don’t have to deal with that. I don’t think there are many valid cases where a compressed image is both written to and read from at the same time. (Just asking, because I’d really want this code to be simpler. I can imagine that reading from the cache is the least bit of complexity, but perhaps...) Max
[PATCH 1/2] hw/nvme: move nvme emulation out of hw/block
From: Klaus Jensen With the introduction of the nvme-subsystem device we are really cluttering up the hw/block directory. As suggested by Philippe previously, move the nvme emulation to hw/nvme. Suggested-by: Philippe Mathieu-Daudé Signed-off-by: Klaus Jensen --- meson.build | 1 + hw/block/nvme-ns.h| 193 - hw/block/nvme-subsys.h| 32 hw/{block => nvme}/nvme.h | 198 +- hw/nvme/trace.h | 1 + hw/{block/nvme.c => nvme/ctrl.c} | 1 - hw/{block/nvme-ns.c => nvme/ns.c} | 1 - hw/{block/nvme-subsys.c => nvme/subsys.c} | 2 +- MAINTAINERS | 2 +- hw/Kconfig| 1 + hw/block/Kconfig | 5 - hw/block/meson.build | 1 - hw/block/trace-events | 180 hw/meson.build| 1 + hw/nvme/Kconfig | 4 + hw/nvme/meson.build | 1 + hw/nvme/trace-events | 178 +++ 17 files changed, 385 insertions(+), 417 deletions(-) delete mode 100644 hw/block/nvme-ns.h delete mode 100644 hw/block/nvme-subsys.h rename hw/{block => nvme}/nvme.h (55%) create mode 100644 hw/nvme/trace.h rename hw/{block/nvme.c => nvme/ctrl.c} (99%) rename hw/{block/nvme-ns.c => nvme/ns.c} (99%) rename hw/{block/nvme-subsys.c => nvme/subsys.c} (98%) create mode 100644 hw/nvme/Kconfig create mode 100644 hw/nvme/meson.build create mode 100644 hw/nvme/trace-events diff --git a/meson.build b/meson.build index e3386196ba41..255f54918786 100644 --- a/meson.build +++ b/meson.build @@ -1433,6 +1433,7 @@ if have_system 'hw/misc', 'hw/misc/macio', 'hw/net', +'hw/nvme', 'hw/nvram', 'hw/pci', 'hw/pci-host', diff --git a/hw/block/nvme-ns.h b/hw/block/nvme-ns.h deleted file mode 100644 index 7af6884862b5.. --- a/hw/block/nvme-ns.h +++ /dev/null @@ -1,193 +0,0 @@ -/* - * QEMU NVM Express Virtual Namespace - * - * Copyright (c) 2019 CNEX Labs - * Copyright (c) 2020 Samsung Electronics - * - * Authors: - * Klaus Jensen - * - * This work is licensed under the terms of the GNU GPL, version 2. See the - * COPYING file in the top-level directory. 
- * - */ - -#ifndef NVME_NS_H -#define NVME_NS_H - -#define TYPE_NVME_NS "nvme-ns" -#define NVME_NS(obj) \ -OBJECT_CHECK(NvmeNamespace, (obj), TYPE_NVME_NS) - -typedef struct NvmeZone { -NvmeZoneDescr d; -uint64_tw_ptr; -QTAILQ_ENTRY(NvmeZone) entry; -} NvmeZone; - -typedef struct NvmeNamespaceParams { -uint32_t nsid; -QemuUUID uuid; - -uint16_t mssrl; -uint32_t mcl; -uint8_t msrc; - -bool zoned; -bool cross_zone_read; -uint64_t zone_size_bs; -uint64_t zone_cap_bs; -uint32_t max_active_zones; -uint32_t max_open_zones; -uint32_t zd_extension_size; -} NvmeNamespaceParams; - -typedef struct NvmeNamespace { -DeviceState parent_obj; -BlockConfblkconf; -int32_t bootindex; -int64_t size; -NvmeIdNs id_ns; -const uint32_t *iocs; -uint8_t csi; - -NvmeSubsystem *subsys; - -NvmeIdNsZoned *id_ns_zoned; -NvmeZone*zone_array; -QTAILQ_HEAD(, NvmeZone) exp_open_zones; -QTAILQ_HEAD(, NvmeZone) imp_open_zones; -QTAILQ_HEAD(, NvmeZone) closed_zones; -QTAILQ_HEAD(, NvmeZone) full_zones; -uint32_tnum_zones; -uint64_tzone_size; -uint64_tzone_capacity; -uint32_tzone_size_log2; -uint8_t *zd_extensions; -int32_t nr_open_zones; -int32_t nr_active_zones; - -NvmeNamespaceParams params; - -struct { -uint32_t err_rec; -} features; -} NvmeNamespace; - -static inline uint32_t nvme_nsid(NvmeNamespace *ns) -{ -if (ns) { -return ns->params.nsid; -} - -return -1; -} - -static inline bool nvme_ns_shared(NvmeNamespace *ns) -{ -return !!ns->subsys; -} - -static inline NvmeLBAF *nvme_ns_lbaf(NvmeNamespace *ns) -{ -NvmeIdNs *id_ns = &ns->id_ns; -return &id_ns->lbaf[NVME_ID_NS_FLBAS_INDEX(id_ns->flbas)]; -} - -static inline uint8_t nvme_ns_lbads(NvmeNamespace *ns) -{ -return nvme_ns_lbaf(ns)->ds; -} - -/* calculate the number of LBAs that the namespace can accomodate */ -static inline uint64_t nvme_ns_nlbas(NvmeNamespace *ns) -{ -return ns->size >> nvme_ns_lbads(ns); -} - -/* convert an LBA to the equivalent in bytes */ -static inline size_t nvme_l2b(NvmeNamespace *ns, uint64_t lba) -{ -return lba << 
nvme_ns_lbads(ns); -} - -typedef struct NvmeCtrl NvmeCtrl; - -static inline NvmeZoneState nvme_get_zone_state(NvmeZone *zone) -{ -return zone->d.zs >> 4; -} - -static inline void nvme_set_zone_state(NvmeZone *zone, NvmeZoneState state) -{ -zo
[PATCH 2/2] hw/nvme: move device-scoped functions
From: Klaus Jensen Move a bunch of functions that are internal to a device out of the shared header. Signed-off-by: Klaus Jensen --- hw/nvme/nvme.h | 110 + hw/nvme/ctrl.c | 90 +++- hw/nvme/ns.c | 7 +++- 3 files changed, 97 insertions(+), 110 deletions(-) diff --git a/hw/nvme/nvme.h b/hw/nvme/nvme.h index 452a64499b1b..929c6c553ca2 100644 --- a/hw/nvme/nvme.h +++ b/hw/nvme/nvme.h @@ -96,36 +96,13 @@ static inline uint32_t nvme_nsid(NvmeNamespace *ns) return -1; } -static inline bool nvme_ns_shared(NvmeNamespace *ns) -{ -return !!ns->subsys; -} - -static inline NvmeLBAF *nvme_ns_lbaf(NvmeNamespace *ns) -{ -NvmeIdNs *id_ns = &ns->id_ns; -return &id_ns->lbaf[NVME_ID_NS_FLBAS_INDEX(id_ns->flbas)]; -} - static inline uint8_t nvme_ns_lbads(NvmeNamespace *ns) { -return nvme_ns_lbaf(ns)->ds; -} +NvmeLBAF lbaf = ns->id_ns.lbaf[NVME_ID_NS_FLBAS_INDEX(ns->id_ns.flbas)]; -/* calculate the number of LBAs that the namespace can accomodate */ -static inline uint64_t nvme_ns_nlbas(NvmeNamespace *ns) -{ -return ns->size >> nvme_ns_lbads(ns); +return lbaf.ds; } -/* convert an LBA to the equivalent in bytes */ -static inline size_t nvme_l2b(NvmeNamespace *ns, uint64_t lba) -{ -return lba << nvme_ns_lbads(ns); -} - -typedef struct NvmeCtrl NvmeCtrl; - static inline NvmeZoneState nvme_get_zone_state(NvmeZone *zone) { return zone->d.zs >> 4; @@ -136,31 +113,6 @@ static inline void nvme_set_zone_state(NvmeZone *zone, NvmeZoneState state) zone->d.zs = state << 4; } -static inline uint64_t nvme_zone_rd_boundary(NvmeNamespace *ns, NvmeZone *zone) -{ -return zone->d.zslba + ns->zone_size; -} - -static inline uint64_t nvme_zone_wr_boundary(NvmeZone *zone) -{ -return zone->d.zslba + zone->d.zcap; -} - -static inline bool nvme_wp_is_valid(NvmeZone *zone) -{ -uint8_t st = nvme_get_zone_state(zone); - -return st != NVME_ZONE_STATE_FULL && - st != NVME_ZONE_STATE_READ_ONLY && - st != NVME_ZONE_STATE_OFFLINE; -} - -static inline uint8_t *nvme_get_zd_extension(NvmeNamespace *ns, - uint32_t zone_idx) 
-{ -return &ns->zd_extensions[zone_idx * ns->params.zd_extension_size]; -} - static inline void nvme_aor_inc_open(NvmeNamespace *ns) { assert(ns->nr_open_zones >= 0); @@ -203,7 +155,6 @@ void nvme_ns_drain(NvmeNamespace *ns); void nvme_ns_shutdown(NvmeNamespace *ns); void nvme_ns_cleanup(NvmeNamespace *ns); - typedef struct NvmeParams { char *serial; uint32_t num_queues; /* deprecated since 5.1 */ @@ -237,40 +188,6 @@ typedef struct NvmeRequest { QTAILQ_ENTRY(NvmeRequest)entry; } NvmeRequest; -static inline const char *nvme_adm_opc_str(uint8_t opc) -{ -switch (opc) { -case NVME_ADM_CMD_DELETE_SQ:return "NVME_ADM_CMD_DELETE_SQ"; -case NVME_ADM_CMD_CREATE_SQ:return "NVME_ADM_CMD_CREATE_SQ"; -case NVME_ADM_CMD_GET_LOG_PAGE: return "NVME_ADM_CMD_GET_LOG_PAGE"; -case NVME_ADM_CMD_DELETE_CQ:return "NVME_ADM_CMD_DELETE_CQ"; -case NVME_ADM_CMD_CREATE_CQ:return "NVME_ADM_CMD_CREATE_CQ"; -case NVME_ADM_CMD_IDENTIFY: return "NVME_ADM_CMD_IDENTIFY"; -case NVME_ADM_CMD_ABORT:return "NVME_ADM_CMD_ABORT"; -case NVME_ADM_CMD_SET_FEATURES: return "NVME_ADM_CMD_SET_FEATURES"; -case NVME_ADM_CMD_GET_FEATURES: return "NVME_ADM_CMD_GET_FEATURES"; -case NVME_ADM_CMD_ASYNC_EV_REQ: return "NVME_ADM_CMD_ASYNC_EV_REQ"; -default:return "NVME_ADM_CMD_UNKNOWN"; -} -} - -static inline const char *nvme_io_opc_str(uint8_t opc) -{ -switch (opc) { -case NVME_CMD_FLUSH:return "NVME_NVM_CMD_FLUSH"; -case NVME_CMD_WRITE:return "NVME_NVM_CMD_WRITE"; -case NVME_CMD_READ: return "NVME_NVM_CMD_READ"; -case NVME_CMD_COMPARE: return "NVME_NVM_CMD_COMPARE"; -case NVME_CMD_WRITE_ZEROES: return "NVME_NVM_CMD_WRITE_ZEROES"; -case NVME_CMD_DSM: return "NVME_NVM_CMD_DSM"; -case NVME_CMD_COPY: return "NVME_NVM_CMD_COPY"; -case NVME_CMD_ZONE_MGMT_SEND: return "NVME_ZONED_CMD_MGMT_SEND"; -case NVME_CMD_ZONE_MGMT_RECV: return "NVME_ZONED_CMD_MGMT_RECV"; -case NVME_CMD_ZONE_APPEND: return "NVME_ZONED_CMD_ZONE_APPEND"; -default:return "NVME_NVM_CMD_UNKNOWN"; -} -} - typedef struct NvmeSQueue { struct NvmeCtrl *ctrl; 
uint16_tsqid; @@ -379,29 +296,6 @@ typedef struct NvmeCtrl { NvmeFeatureVal features; } NvmeCtrl; -static inline NvmeNamespace *nvme_ns(NvmeCtrl *n, uint32_t nsid) -{ -if (!nsid || nsid > n->num_namespaces) { -return NULL; -} - -return n->namespaces[nsid - 1]; -} - -static inline NvmeCQueue *nvme_cq(NvmeRequest *req) -{ -NvmeSQueue *sq = req->sq; -NvmeCtrl *n = s
Re: [PATCH v2] hw/block/nvme: use locally assigned QEMU IEEE OUI
On 2/9/21 11:45 AM, Klaus Jensen wrote: > From: Gollu Appalanaidu > > Commit 6eb7a071292a ("hw/block/nvme: change controller pci id") changed > the controller to use a Red Hat assigned PCI Device and Vendor ID, but > did not change the IEEE OUI away from the Intel IEEE OUI. > > Fix that and use the locally assigned QEMU IEEE OUI instead if the > `use-intel-id` parameter is not explicitly set. Also reverse the Intel > IEEE OUI bytes. > > Signed-off-by: Gollu Appalanaidu > Signed-off-by: Klaus Jensen > --- > > v2: drop telemetry and add a check on the use_intel_id parameter. > > hw/block/nvme.c | 14 +++--- > 1 file changed, 11 insertions(+), 3 deletions(-) > > diff --git a/hw/block/nvme.c b/hw/block/nvme.c > index c2f0c88fbf39..870e9d8e1c17 100644 > --- a/hw/block/nvme.c > +++ b/hw/block/nvme.c > @@ -4685,9 +4685,17 @@ static void nvme_init_ctrl(NvmeCtrl *n, PCIDevice > *pci_dev) > id->cntlid = cpu_to_le16(n->cntlid); > > id->rab = 6; > -id->ieee[0] = 0x00; > -id->ieee[1] = 0x02; > -id->ieee[2] = 0xb3; > + > +if (n->params.use_intel_id) { > +id->ieee[0] = 0xb3; > +id->ieee[1] = 0x02; > +id->ieee[2] = 0x00; > +} else { > +id->ieee[0] = 0x00; > +id->ieee[1] = 0x54; > +id->ieee[2] = 0x52; > +} Correct. Reviewed-by: Philippe Mathieu-Daudé Ideally we should have definitions and use them here and in qemu_macaddr_default_if_unset() instead of this magic values. > + > id->mdts = n->params.mdts; > id->ver = cpu_to_le32(NVME_SPEC_VER); > id->oacs = cpu_to_le16(0); >
[PATCH 0/2] hw/nvme: move nvme emulation out of hw/block
From: Klaus Jensen With the introduction of the nvme-subsystem device we are really cluttering up the hw/block directory. As suggested by Philippe previously, move the nvme emulation to hw/nvme. Klaus Jensen (2): hw/nvme: move nvme emulation out of hw/block hw/nvme: move device-scoped functions meson.build | 1 + hw/block/nvme-ns.h| 193 hw/block/nvme-subsys.h| 32 hw/{block => nvme}/nvme.h | 206 +++--- hw/nvme/trace.h | 1 + hw/{block/nvme.c => nvme/ctrl.c} | 91 +- hw/{block/nvme-ns.c => nvme/ns.c} | 8 +- hw/{block/nvme-subsys.c => nvme/subsys.c} | 2 +- MAINTAINERS | 2 +- hw/Kconfig| 1 + hw/block/Kconfig | 5 - hw/block/meson.build | 1 - hw/block/trace-events | 180 --- hw/meson.build| 1 + hw/nvme/Kconfig | 4 + hw/nvme/meson.build | 1 + hw/nvme/trace-events | 178 +++ 17 files changed, 431 insertions(+), 476 deletions(-) delete mode 100644 hw/block/nvme-ns.h delete mode 100644 hw/block/nvme-subsys.h rename hw/{block => nvme}/nvme.h (51%) create mode 100644 hw/nvme/trace.h rename hw/{block/nvme.c => nvme/ctrl.c} (97%) rename hw/{block/nvme-ns.c => nvme/ns.c} (98%) rename hw/{block/nvme-subsys.c => nvme/subsys.c} (98%) create mode 100644 hw/nvme/Kconfig create mode 100644 hw/nvme/meson.build create mode 100644 hw/nvme/trace-events -- 2.30.0
Re: [PATCH] hw/sd: sdhci: Do not transfer any data when command fails
On 2/9/21 11:54 AM, Bin Meng wrote: > At the end of sdhci_send_command(), it starts a data transfer if > the command register indicates a data is associated. However the > data transfer should only be initiated when the command execution > has succeeded. > > Cc: qemu-sta...@nongnu.org > Fixes: CVE-2020-17380 > Fixes: CVE-2020-25085 > Reported-by: Alexander Bulekov > Reported-by: Sergej Schumilo (Ruhr-University Bochum) > Reported-by: Cornelius Aschermann (Ruhr-University Bochum) > Reported-by: Simon Wrner (Ruhr-University Bochum) > Buglink: https://bugs.launchpad.net/qemu/+bug/1892960 > Signed-off-by: Bin Meng > --- > > hw/sd/sdhci.c | 4 +++- > 1 file changed, 3 insertions(+), 1 deletion(-) Tested-by: Philippe Mathieu-Daudé
[PATCH] hw/sd: sdhci: Do not transfer any data when command fails
At the end of sdhci_send_command(), it starts a data transfer if the command register indicates a data is associated. However the data transfer should only be initiated when the command execution has succeeded. Cc: qemu-sta...@nongnu.org Fixes: CVE-2020-17380 Fixes: CVE-2020-25085 Reported-by: Alexander Bulekov Reported-by: Sergej Schumilo (Ruhr-University Bochum) Reported-by: Cornelius Aschermann (Ruhr-University Bochum) Reported-by: Simon Wrner (Ruhr-University Bochum) Buglink: https://bugs.launchpad.net/qemu/+bug/1892960 Signed-off-by: Bin Meng --- hw/sd/sdhci.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/hw/sd/sdhci.c b/hw/sd/sdhci.c index 8ffa539..0450110 100644 --- a/hw/sd/sdhci.c +++ b/hw/sd/sdhci.c @@ -326,6 +326,7 @@ static void sdhci_send_command(SDHCIState *s) SDRequest request; uint8_t response[16]; int rlen; +bool cmd_failure = false; s->errintsts = 0; s->acmd12errsts = 0; @@ -349,6 +350,7 @@ static void sdhci_send_command(SDHCIState *s) trace_sdhci_response16(s->rspreg[3], s->rspreg[2], s->rspreg[1], s->rspreg[0]); } else { +cmd_failure = true; trace_sdhci_error("timeout waiting for command response"); if (s->errintstsen & SDHC_EISEN_CMDTIMEOUT) { s->errintsts |= SDHC_EIS_CMDTIMEOUT; @@ -369,7 +371,7 @@ static void sdhci_send_command(SDHCIState *s) sdhci_update_irq(s); -if (s->blksize && (s->cmdreg & SDHC_CMD_DATA_PRESENT)) { +if (!cmd_failure && s->blksize && (s->cmdreg & SDHC_CMD_DATA_PRESENT)) { s->data_count = 0; sdhci_data_transfer(s); } -- 2.7.4
[PATCH v2] hw/block/nvme: use locally assigned QEMU IEEE OUI
From: Gollu Appalanaidu Commit 6eb7a071292a ("hw/block/nvme: change controller pci id") changed the controller to use a Red Hat assigned PCI Device and Vendor ID, but did not change the IEEE OUI away from the Intel IEEE OUI. Fix that and use the locally assigned QEMU IEEE OUI instead if the `use-intel-id` parameter is not explicitly set. Also reverse the Intel IEEE OUI bytes. Signed-off-by: Gollu Appalanaidu Signed-off-by: Klaus Jensen --- v2: drop telemetry and add a check on the use_intel_id parameter. hw/block/nvme.c | 14 +++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/hw/block/nvme.c b/hw/block/nvme.c index c2f0c88fbf39..870e9d8e1c17 100644 --- a/hw/block/nvme.c +++ b/hw/block/nvme.c @@ -4685,9 +4685,17 @@ static void nvme_init_ctrl(NvmeCtrl *n, PCIDevice *pci_dev) id->cntlid = cpu_to_le16(n->cntlid); id->rab = 6; -id->ieee[0] = 0x00; -id->ieee[1] = 0x02; -id->ieee[2] = 0xb3; + +if (n->params.use_intel_id) { +id->ieee[0] = 0xb3; +id->ieee[1] = 0x02; +id->ieee[2] = 0x00; +} else { +id->ieee[0] = 0x00; +id->ieee[1] = 0x54; +id->ieee[2] = 0x52; +} + id->mdts = n->params.mdts; id->ver = cpu_to_le32(NVME_SPEC_VER); id->oacs = cpu_to_le16(0); -- 2.30.0
Re: [PATCH] hw/sd/sdhci: Do not modify BlockSizeRegister if transaction in progress
Oops, hitting "send" by mistake ... On Tue, Feb 9, 2021 at 5:42 PM Bin Meng wrote: > > Hi Philippe, > > On Tue, Feb 9, 2021 at 5:38 PM Philippe Mathieu-Daudé wrote: > > > > On 2/9/21 9:28 AM, Bin Meng wrote: > > > Hi Philippe, > > > > > > On Tue, Feb 9, 2021 at 3:34 AM Philippe Mathieu-Daudé > > > wrote: > > >> > > >> Per the "SD Host Controller Simplified Specification Version 2.00" > > >> spec. 'Table 2-4 : Block Size Register': > > >> > > >> Transfer Block Size [...] can be accessed only if no > > >> transaction is executing (i.e., after a transaction has stopped). > > >> Read operations during transfers may return an invalid value, > > >> and write operations shall be ignored. > > >> > > >> Transactions will update 'data_count', so do not modify 'blksize' > > >> and 'blkcnt' when 'data_count' is used. This fixes: > > >> > > >> $ cat << EOF | qemu-system-x86_64 -qtest stdio -monitor none \ > > >>-nographic -serial none -M pc-q35-5.0 \ > > >>-device sdhci-pci,sd-spec-version=3 \ > > >>-device sd-card,drive=mydrive \ > > >>-drive if=sd,index=0,file=null-co://,format=raw,id=mydrive > > >> outl 0xcf8 0x80001810 > > >> outl 0xcfc 0xe1068000 > > >> outl 0xcf8 0x80001814 > > > > > > Is this command needed? > > > > My guess is this makes the northbridge somehow map the device PCI space. > > > > Probably not needed in machines where SDHCI is MMIO mapped. > > I think this is not needed. Writing only the CFG_ADDR I think this is not needed. Writing only the CFG_ADDR without writing CFG_DATA does not take any effect. > > > > > > > > >> outl 0xcf8 0x80001804 > > >> outw 0xcfc 0x7 > > >> outl 0xcf8 0x8000fa20 > > > > > > and this one? > > > > Ditto. > > > > > > > >> write 0xe106802c 0x1 0x0f > > >> write 0xe1068004 0xc 0x2801d10101fbff28a384 > > > > > > Are these fuzzy data? > > > > Yes, I didn't try to understand what this does, as often > > non-sense operations. But this is what would craft a malicious > > attacker. 
> > > > > > > >> write 0xe106800c 0x1f > > >> 0x9dacbbcad9e8f7061524334251606f7e8d9cabbac9d8e7f60514233241505f > > >> write 0xe1068003 0x28 > > >> 0x80d000251480d000252280d000253080d000253e80d000254c80d000255a80d000256880d0002576 > > >> write 0xe1068003 0x1 0xfe > > >> EOF > > >> = > > >> ==2686219==ERROR: AddressSanitizer: heap-buffer-overflow on address > > >> 0x6153bb00 at pc 0x55ab469f456c bp 0x7ffee71be330 sp 0x7ffee71bdae0 > > >> WRITE of size 4 at 0x6153bb00 thread T0 > > >> #0 0x55ab469f456b in __asan_memcpy (qemu-system-i386+0x1cea56b) > > >> #1 0x55ab483dc396 in stl_he_p include/qemu/bswap.h:353:5 > > >> #2 0x55ab483af5e4 in stn_he_p include/qemu/bswap.h:546:1 > > >> #3 0x55ab483aeb4b in flatview_read_continue > > >> softmmu/physmem.c:2839:13 > > >> #4 0x55ab483b0705 in flatview_read softmmu/physmem.c:2877:12 > > >> #5 0x55ab483b028e in address_space_read_full > > >> softmmu/physmem.c:2890:18 > > >> #6 0x55ab483b1294 in address_space_rw softmmu/physmem.c:2918:16 > > >> #7 0x55ab479374a2 in dma_memory_rw_relaxed > > >> include/sysemu/dma.h:88:12 > > >> #8 0x55ab47936f50 in dma_memory_rw include/sysemu/dma.h:127:12 > > >> #9 0x55ab4793665f in dma_memory_read include/sysemu/dma.h:145:12 > > >> #10 0x55ab4792f176 in sdhci_sdma_transfer_multi_blocks > > >> hw/sd/sdhci.c:639:13 > > >> #11 0x55ab4793dc9d in sdhci_write hw/sd/sdhci.c:1129:17 > > >> #12 0x55ab483f8db8 in memory_region_write_accessor > > >> softmmu/memory.c:491:5 > > >> #13 0x55ab483f868a in access_with_adjusted_size > > >> softmmu/memory.c:552:18 > > >> #14 0x55ab483f6da5 in memory_region_dispatch_write > > >> softmmu/memory.c:1501:16 > > >> #15 0x55ab483c3b11 in flatview_write_continue > > >> softmmu/physmem.c:2774:23 > > >> #16 0x55ab483b0eb6 in flatview_write softmmu/physmem.c:2814:14 > > >> #17 0x55ab483b0a3e in address_space_write softmmu/physmem.c:2906:18 > > >> #18 0x55ab48465c56 in qtest_process_command softmmu/qtest.c:654:9 > > >> > > >> 0x6153bb00 is located 0 bytes to the right of 
512-byte region > > >> [0x6153b900,0x6153bb00) > > >> allocated by thread T0 here: > > >> #0 0x55ab469f58a7 in calloc (qemu-system-i386+0x1ceb8a7) > > >> #1 0x7f21d678f9b0 in g_malloc0 (/lib64/libglib-2.0.so.0+0x589b0) > > >> #2 0x55ab479530ed in sdhci_pci_realize hw/sd/sdhci-pci.c:36:5 > > >> #3 0x55ab476f102a in pci_qdev_realize hw/pci/pci.c:2108:9 > > >> #4 0x55ab48baaad2 in device_set_realized hw/core/qdev.c:761:13 > > >> > > >> SUMMARY: AddressSanitizer: heap-buffer-overflow > > >> (qemu-system-i386+0x1cea56b) in __asan_memcpy > > >> Shadow bytes around the buggy address: > > >> 0x0c2a7710: fa fa fa fa fa fa fa fa fa fa fa fa fa fa fa fa > > >> 0x0c2a7720: 00 00 00 00 00 00
Re: [PATCH] hw/sd/sdhci: Do not modify BlockSizeRegister if transaction in progress
Hi Philippe, On Tue, Feb 9, 2021 at 5:38 PM Philippe Mathieu-Daudé wrote: > > On 2/9/21 9:28 AM, Bin Meng wrote: > > Hi Philippe, > > > > On Tue, Feb 9, 2021 at 3:34 AM Philippe Mathieu-Daudé > > wrote: > >> > >> Per the "SD Host Controller Simplified Specification Version 2.00" > >> spec. 'Table 2-4 : Block Size Register': > >> > >> Transfer Block Size [...] can be accessed only if no > >> transaction is executing (i.e., after a transaction has stopped). > >> Read operations during transfers may return an invalid value, > >> and write operations shall be ignored. > >> > >> Transactions will update 'data_count', so do not modify 'blksize' > >> and 'blkcnt' when 'data_count' is used. This fixes: > >> > >> $ cat << EOF | qemu-system-x86_64 -qtest stdio -monitor none \ > >>-nographic -serial none -M pc-q35-5.0 \ > >>-device sdhci-pci,sd-spec-version=3 \ > >>-device sd-card,drive=mydrive \ > >>-drive if=sd,index=0,file=null-co://,format=raw,id=mydrive > >> outl 0xcf8 0x80001810 > >> outl 0xcfc 0xe1068000 > >> outl 0xcf8 0x80001814 > > > > Is this command needed? > > My guess is this makes the northbridge somehow map the device PCI space. > > Probably not needed in machines where SDHCI is MMIO mapped. I think this is not needed. Writing only the CFG_ADDR > > > > >> outl 0xcf8 0x80001804 > >> outw 0xcfc 0x7 > >> outl 0xcf8 0x8000fa20 > > > > and this one? > > Ditto. > > > > >> write 0xe106802c 0x1 0x0f > >> write 0xe1068004 0xc 0x2801d10101fbff28a384 > > > > Are these fuzzy data? > > Yes, I didn't try to understand what this does, as often > non-sense operations. But this is what would craft a malicious > attacker. 
> > > > >> write 0xe106800c 0x1f > >> 0x9dacbbcad9e8f7061524334251606f7e8d9cabbac9d8e7f60514233241505f > >> write 0xe1068003 0x28 > >> 0x80d000251480d000252280d000253080d000253e80d000254c80d000255a80d000256880d0002576 > >> write 0xe1068003 0x1 0xfe > >> EOF > >> = > >> ==2686219==ERROR: AddressSanitizer: heap-buffer-overflow on address > >> 0x6153bb00 at pc 0x55ab469f456c bp 0x7ffee71be330 sp 0x7ffee71bdae0 > >> WRITE of size 4 at 0x6153bb00 thread T0 > >> #0 0x55ab469f456b in __asan_memcpy (qemu-system-i386+0x1cea56b) > >> #1 0x55ab483dc396 in stl_he_p include/qemu/bswap.h:353:5 > >> #2 0x55ab483af5e4 in stn_he_p include/qemu/bswap.h:546:1 > >> #3 0x55ab483aeb4b in flatview_read_continue softmmu/physmem.c:2839:13 > >> #4 0x55ab483b0705 in flatview_read softmmu/physmem.c:2877:12 > >> #5 0x55ab483b028e in address_space_read_full > >> softmmu/physmem.c:2890:18 > >> #6 0x55ab483b1294 in address_space_rw softmmu/physmem.c:2918:16 > >> #7 0x55ab479374a2 in dma_memory_rw_relaxed include/sysemu/dma.h:88:12 > >> #8 0x55ab47936f50 in dma_memory_rw include/sysemu/dma.h:127:12 > >> #9 0x55ab4793665f in dma_memory_read include/sysemu/dma.h:145:12 > >> #10 0x55ab4792f176 in sdhci_sdma_transfer_multi_blocks > >> hw/sd/sdhci.c:639:13 > >> #11 0x55ab4793dc9d in sdhci_write hw/sd/sdhci.c:1129:17 > >> #12 0x55ab483f8db8 in memory_region_write_accessor > >> softmmu/memory.c:491:5 > >> #13 0x55ab483f868a in access_with_adjusted_size > >> softmmu/memory.c:552:18 > >> #14 0x55ab483f6da5 in memory_region_dispatch_write > >> softmmu/memory.c:1501:16 > >> #15 0x55ab483c3b11 in flatview_write_continue > >> softmmu/physmem.c:2774:23 > >> #16 0x55ab483b0eb6 in flatview_write softmmu/physmem.c:2814:14 > >> #17 0x55ab483b0a3e in address_space_write softmmu/physmem.c:2906:18 > >> #18 0x55ab48465c56 in qtest_process_command softmmu/qtest.c:654:9 > >> > >> 0x6153bb00 is located 0 bytes to the right of 512-byte region > >> [0x6153b900,0x6153bb00) > >> allocated by thread T0 here: > >> #0 
0x55ab469f58a7 in calloc (qemu-system-i386+0x1ceb8a7) > >> #1 0x7f21d678f9b0 in g_malloc0 (/lib64/libglib-2.0.so.0+0x589b0) > >> #2 0x55ab479530ed in sdhci_pci_realize hw/sd/sdhci-pci.c:36:5 > >> #3 0x55ab476f102a in pci_qdev_realize hw/pci/pci.c:2108:9 > >> #4 0x55ab48baaad2 in device_set_realized hw/core/qdev.c:761:13 > >> > >> SUMMARY: AddressSanitizer: heap-buffer-overflow > >> (qemu-system-i386+0x1cea56b) in __asan_memcpy > >> Shadow bytes around the buggy address: > >> 0x0c2a7710: fa fa fa fa fa fa fa fa fa fa fa fa fa fa fa fa > >> 0x0c2a7720: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 > >> 0x0c2a7730: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 > >> 0x0c2a7740: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 > >> 0x0c2a7750: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 > >> =>0x0c2a7760:[fa]fa fa fa fa fa fa fa fa fa fa fa fa fa fa fa > >> 0x0c2a7770: fd fd fd fd fd fd fd fd fd fd fd fd fd fd fd fd > >> 0x0c2a7780: fd
Re: [PATCH] hw/sd/sdhci: Do not modify BlockSizeRegister if transaction in progress
On 2/9/21 9:28 AM, Bin Meng wrote: > Hi Philippe, > > On Tue, Feb 9, 2021 at 3:34 AM Philippe Mathieu-Daudé wrote: >> >> Per the "SD Host Controller Simplified Specification Version 2.00" >> spec. 'Table 2-4 : Block Size Register': >> >> Transfer Block Size [...] can be accessed only if no >> transaction is executing (i.e., after a transaction has stopped). >> Read operations during transfers may return an invalid value, >> and write operations shall be ignored. >> >> Transactions will update 'data_count', so do not modify 'blksize' >> and 'blkcnt' when 'data_count' is used. This fixes: >> >> $ cat << EOF | qemu-system-x86_64 -qtest stdio -monitor none \ >>-nographic -serial none -M pc-q35-5.0 \ >>-device sdhci-pci,sd-spec-version=3 \ >>-device sd-card,drive=mydrive \ >>-drive if=sd,index=0,file=null-co://,format=raw,id=mydrive >> outl 0xcf8 0x80001810 >> outl 0xcfc 0xe1068000 >> outl 0xcf8 0x80001814 > > Is this command needed? My guess is this makes the northbridge somehow map the device PCI space. Probably not needed in machines where SDHCI is MMIO mapped. > >> outl 0xcf8 0x80001804 >> outw 0xcfc 0x7 >> outl 0xcf8 0x8000fa20 > > and this one? Ditto. > >> write 0xe106802c 0x1 0x0f >> write 0xe1068004 0xc 0x2801d10101fbff28a384 > > Are these fuzzy data? Yes, I didn't try to understand what this does, as often non-sense operations. But this is what would craft a malicious attacker. 
> >> write 0xe106800c 0x1f >> 0x9dacbbcad9e8f7061524334251606f7e8d9cabbac9d8e7f60514233241505f >> write 0xe1068003 0x28 >> 0x80d000251480d000252280d000253080d000253e80d000254c80d000255a80d000256880d0002576 >> write 0xe1068003 0x1 0xfe >> EOF >> = >> ==2686219==ERROR: AddressSanitizer: heap-buffer-overflow on address >> 0x6153bb00 at pc 0x55ab469f456c bp 0x7ffee71be330 sp 0x7ffee71bdae0 >> WRITE of size 4 at 0x6153bb00 thread T0 >> #0 0x55ab469f456b in __asan_memcpy (qemu-system-i386+0x1cea56b) >> #1 0x55ab483dc396 in stl_he_p include/qemu/bswap.h:353:5 >> #2 0x55ab483af5e4 in stn_he_p include/qemu/bswap.h:546:1 >> #3 0x55ab483aeb4b in flatview_read_continue softmmu/physmem.c:2839:13 >> #4 0x55ab483b0705 in flatview_read softmmu/physmem.c:2877:12 >> #5 0x55ab483b028e in address_space_read_full softmmu/physmem.c:2890:18 >> #6 0x55ab483b1294 in address_space_rw softmmu/physmem.c:2918:16 >> #7 0x55ab479374a2 in dma_memory_rw_relaxed include/sysemu/dma.h:88:12 >> #8 0x55ab47936f50 in dma_memory_rw include/sysemu/dma.h:127:12 >> #9 0x55ab4793665f in dma_memory_read include/sysemu/dma.h:145:12 >> #10 0x55ab4792f176 in sdhci_sdma_transfer_multi_blocks >> hw/sd/sdhci.c:639:13 >> #11 0x55ab4793dc9d in sdhci_write hw/sd/sdhci.c:1129:17 >> #12 0x55ab483f8db8 in memory_region_write_accessor >> softmmu/memory.c:491:5 >> #13 0x55ab483f868a in access_with_adjusted_size softmmu/memory.c:552:18 >> #14 0x55ab483f6da5 in memory_region_dispatch_write >> softmmu/memory.c:1501:16 >> #15 0x55ab483c3b11 in flatview_write_continue softmmu/physmem.c:2774:23 >> #16 0x55ab483b0eb6 in flatview_write softmmu/physmem.c:2814:14 >> #17 0x55ab483b0a3e in address_space_write softmmu/physmem.c:2906:18 >> #18 0x55ab48465c56 in qtest_process_command softmmu/qtest.c:654:9 >> >> 0x6153bb00 is located 0 bytes to the right of 512-byte region >> [0x6153b900,0x6153bb00) >> allocated by thread T0 here: >> #0 0x55ab469f58a7 in calloc (qemu-system-i386+0x1ceb8a7) >> #1 0x7f21d678f9b0 in g_malloc0 
(/lib64/libglib-2.0.so.0+0x589b0) >> #2 0x55ab479530ed in sdhci_pci_realize hw/sd/sdhci-pci.c:36:5 >> #3 0x55ab476f102a in pci_qdev_realize hw/pci/pci.c:2108:9 >> #4 0x55ab48baaad2 in device_set_realized hw/core/qdev.c:761:13 >> >> SUMMARY: AddressSanitizer: heap-buffer-overflow >> (qemu-system-i386+0x1cea56b) in __asan_memcpy >> Shadow bytes around the buggy address: >> 0x0c2a7710: fa fa fa fa fa fa fa fa fa fa fa fa fa fa fa fa >> 0x0c2a7720: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 >> 0x0c2a7730: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 >> 0x0c2a7740: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 >> 0x0c2a7750: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 >> =>0x0c2a7760:[fa]fa fa fa fa fa fa fa fa fa fa fa fa fa fa fa >> 0x0c2a7770: fd fd fd fd fd fd fd fd fd fd fd fd fd fd fd fd >> 0x0c2a7780: fd fd fd fd fd fd fd fd fd fd fd fd fd fd fd fd >> 0x0c2a7790: fd fd fd fd fd fd fd fd fd fd fd fd fd fd fd fd >> 0x0c2a77a0: fd fd fd fd fd fd fd fd fd fd fd fd fd fd fd fd >> 0x0c2a77b0: fa fa fa fa fa fa fa fa fa fa fa fa fa fa fa fa >> Shadow byte legend (one shadow byte represents 8 application bytes): >> Addressable:
Re: [PATCH 0/9] hw/block: m25p80: Fix the mess of dummy bytes needed for fast read commands
Hello Edgar, On [2021 Feb 08] Mon 16:30:00, Edgar E. Iglesias wrote: >On Mon, Feb 8, 2021 at 3:42 PM Bin Meng wrote: > > On Thu, Jan 21, 2021 at 10:18 PM Francisco Iglesias > wrote: > > > > Hi Bin, > > > > On [2021 Jan 21] Thu 16:59:51, Bin Meng wrote: > > > Hi Francisco, > > > > > > On Thu, Jan 21, 2021 at 4:50 PM Francisco Iglesias > > > wrote: > > > > > > > > Dear Bin, > > > > > > > > On [2021 Jan 20] Wed 22:20:25, Bin Meng wrote: > > > > > Hi Francisco, > > > > > > > > > > On Tue, Jan 19, 2021 at 9:01 PM Francisco Iglesias > > > > > wrote: > > > > > > > > > > > > Hi Bin, > > > > > > > > > > > > On [2021 Jan 18] Mon 20:32:19, Bin Meng wrote: > > > > > > > Hi Francisco, > > > > > > > > > > > > > > On Mon, Jan 18, 2021 at 6:06 PM Francisco Iglesias > > > > > > > wrote: > > > > > > > > > > > > > > > > Hi Bin, > > > > > > > > > > > > > > > > On [2021 Jan 15] Fri 22:38:18, Bin Meng wrote: > > > > > > > > > Hi Francisco, > > > > > > > > > > > > > > > > > > On Fri, Jan 15, 2021 at 8:26 PM Francisco Iglesias > > > > > > > > > wrote: > > > > > > > > > > > > > > > > > > > > Hi Bin, > > > > > > > > > > > > > > > > > > > > On [2021 Jan 15] Fri 10:07:52, Bin Meng wrote: > > > > > > > > > > > Hi Francisco, > > > > > > > > > > > > > > > > > > > > > > On Fri, Jan 15, 2021 at 2:13 AM Francisco Iglesias > > > > > > > > > > > wrote: > > > > > > > > > > > > > > > > > > > > > > > > Hi Bin, > > > > > > > > > > > > > > > > > > > > > > > > On [2021 Jan 14] Thu 23:08:53, Bin Meng wrote: > > > > > > > > > > > > > From: Bin Meng > > > > > > > > > > > > > > > > > > > > > > > > > > The m25p80 model uses s->needed_bytes to > indicate how many follow-up > > > > > > > > > > > > > bytes are expected to be received after it > receives a command. For > > > > > > > > > > > > > example, depending on the address mode, either > 3-byte address or > > > > > > > > > > > > > 4-byte address is needed. 
> > > > > > > > > > > > > > > > > > > > > > > > > > For fast read family commands, some dummy cycles > are required after > > > > > > > > > > > > > sending the address bytes, and the dummy cycles > need to be counted > > > > > > > > > > > > > in s->needed_bytes. This is where the mess > began. > > > > > > > > > > > > > > > > > > > > > > > > > > As the variable name (needed_bytes) indicates, > the unit is in byte. > > > > > > > > > > > > > It is not in bit, or cycle. However for some > reason the model has > > > > > > > > > > > > > been using the number of dummy cycles for > s->needed_bytes. The right > > > > > > > > > > > > > approach is to convert the number of dummy > cycles to bytes based on > > > > > > > > > > > > > the SPI protocol, for example, 6 dummy cycles > for the Fast Read Quad > > > > > > > > > > > > > I/O (EBh) should be converted to 3 bytes per the > formula (6 * 4 / 8). > > > > > > > > > > > > > > > > > > > > > > > > While not being the original implementor I must > assume that above solution was > > > > > > > > > > > > considered but not chosen by the developers due to > it is inaccuracy (it > > > > > > > > > > > > wouldn't be possible to model exacly 6 dummy > cycles, only a multiple of 8, > > > > > > > > > > > > meaning that if the controller is wrongly > programmed to generate 7 the error > > > > > > > > > > > > wouldn't be caught and the controller will still > be considered "correct"). Now > > > > > > > > > > > > that we have this detail in the implementation I'm > in favor of keeping it, this > > > > > > > > > > > > also because the detail is already in use for > catching exactly above error. 
> > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > I found no clue from the commit message that my > proposed solution here > > > > > > > > > > > was ever considered, otherwise all SPI controller > models supporting > > > > > > > > > > > software generation should have been found out > seriously broken long > > > > > > > > > > > time ago! > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > The controllers you are referring to might lack > support for commands requiring > > > > > > > > > > dummy clock cycles but I really hope they work with > the other commands? If so I > > > > > > > > > > > > > > > > > > I am not sure why you view dummy clock cycles as > something special > > > > > > > > > that needs some special support from the SPI controller. > For the case > > > > > > > > > 1 controller, it's nothing special from the controller > perspective, >
Re: [PATCH] hw/sd/sdhci: Do not modify BlockSizeRegister if transaction in progress
On Mon, Feb 8, 2021 at 9:26 PM Philippe Mathieu-Daudé wrote: > > On Mon, Feb 8, 2021 at 8:59 PM Mauro Matteo Cascella > wrote: > > On Mon, Feb 8, 2021 at 8:35 PM Philippe Mathieu-Daudé > > wrote: > > > > > > Per the "SD Host Controller Simplified Specification Version 2.00" > > > spec. 'Table 2-4 : Block Size Register': > > > > > > Transfer Block Size [...] can be accessed only if no > > > transaction is executing (i.e., after a transaction has stopped). > > > Read operations during transfers may return an invalid value, > > > and write operations shall be ignored. > > > > ... > > > > > > Fixes: CVE-2020-17380 > > > Fixes: CVE-2020-25085 > > > Signed-off-by: Philippe Mathieu-Daudé > > > --- > > > Cc: Mauro Matteo Cascella > > > Cc: Alexander Bulekov > > > Cc: Alistair Francis > > > Cc: Prasad J Pandit > > > Cc: Bandan Das > > > > > > RFC because missing Reported-by tags, launchpad/bugzilla links and > > > qtest reproducer. Sending for review meanwhile. > ... > > For the above CVEs: > > Tested-by: Mauro Matteo Cascella > > Thanks Mauro for testing. Do you know what tags I should add for the credits? > > Phil. > I think the credit should go to Alexander for reporting [1] as well as people from Ruhr-University Bochum for CVE-2020-25085 (I don't know about their emails, though): Reported-by: Alexander Bulekov Reported-by: Sergej Schumilo (Ruhr-University Bochum) Reported-by: Cornelius Aschermann (Ruhr-University Bochum) Reported-by: Simon Wörner (Ruhr-University Bochum) [1] https://bugs.launchpad.net/qemu/+bug/1892960 -- Mauro Matteo Cascella Red Hat Product Security PGP-Key ID: BB3410B0
Re: [PATCH] hw/sd/sdhci: Do not modify BlockSizeRegister if transaction in progress
Hi Philippe, On Tue, Feb 9, 2021 at 3:34 AM Philippe Mathieu-Daudé wrote: > > Per the "SD Host Controller Simplified Specification Version 2.00" > spec. 'Table 2-4 : Block Size Register': > > Transfer Block Size [...] can be accessed only if no > transaction is executing (i.e., after a transaction has stopped). > Read operations during transfers may return an invalid value, > and write operations shall be ignored. > > Transactions will update 'data_count', so do not modify 'blksize' > and 'blkcnt' when 'data_count' is used. This fixes: > > $ cat << EOF | qemu-system-x86_64 -qtest stdio -monitor none \ >-nographic -serial none -M pc-q35-5.0 \ >-device sdhci-pci,sd-spec-version=3 \ >-device sd-card,drive=mydrive \ >-drive if=sd,index=0,file=null-co://,format=raw,id=mydrive > outl 0xcf8 0x80001810 > outl 0xcfc 0xe1068000 > outl 0xcf8 0x80001814 Is this command needed? > outl 0xcf8 0x80001804 > outw 0xcfc 0x7 > outl 0xcf8 0x8000fa20 and this one? > write 0xe106802c 0x1 0x0f > write 0xe1068004 0xc 0x2801d10101fbff28a384 Are these fuzzy data? 
> write 0xe106800c 0x1f > 0x9dacbbcad9e8f7061524334251606f7e8d9cabbac9d8e7f60514233241505f > write 0xe1068003 0x28 > 0x80d000251480d000252280d000253080d000253e80d000254c80d000255a80d000256880d0002576 > write 0xe1068003 0x1 0xfe > EOF > = > ==2686219==ERROR: AddressSanitizer: heap-buffer-overflow on address > 0x6153bb00 at pc 0x55ab469f456c bp 0x7ffee71be330 sp 0x7ffee71bdae0 > WRITE of size 4 at 0x6153bb00 thread T0 > #0 0x55ab469f456b in __asan_memcpy (qemu-system-i386+0x1cea56b) > #1 0x55ab483dc396 in stl_he_p include/qemu/bswap.h:353:5 > #2 0x55ab483af5e4 in stn_he_p include/qemu/bswap.h:546:1 > #3 0x55ab483aeb4b in flatview_read_continue softmmu/physmem.c:2839:13 > #4 0x55ab483b0705 in flatview_read softmmu/physmem.c:2877:12 > #5 0x55ab483b028e in address_space_read_full softmmu/physmem.c:2890:18 > #6 0x55ab483b1294 in address_space_rw softmmu/physmem.c:2918:16 > #7 0x55ab479374a2 in dma_memory_rw_relaxed include/sysemu/dma.h:88:12 > #8 0x55ab47936f50 in dma_memory_rw include/sysemu/dma.h:127:12 > #9 0x55ab4793665f in dma_memory_read include/sysemu/dma.h:145:12 > #10 0x55ab4792f176 in sdhci_sdma_transfer_multi_blocks > hw/sd/sdhci.c:639:13 > #11 0x55ab4793dc9d in sdhci_write hw/sd/sdhci.c:1129:17 > #12 0x55ab483f8db8 in memory_region_write_accessor > softmmu/memory.c:491:5 > #13 0x55ab483f868a in access_with_adjusted_size softmmu/memory.c:552:18 > #14 0x55ab483f6da5 in memory_region_dispatch_write > softmmu/memory.c:1501:16 > #15 0x55ab483c3b11 in flatview_write_continue softmmu/physmem.c:2774:23 > #16 0x55ab483b0eb6 in flatview_write softmmu/physmem.c:2814:14 > #17 0x55ab483b0a3e in address_space_write softmmu/physmem.c:2906:18 > #18 0x55ab48465c56 in qtest_process_command softmmu/qtest.c:654:9 > > 0x6153bb00 is located 0 bytes to the right of 512-byte region > [0x6153b900,0x6153bb00) > allocated by thread T0 here: > #0 0x55ab469f58a7 in calloc (qemu-system-i386+0x1ceb8a7) > #1 0x7f21d678f9b0 in g_malloc0 (/lib64/libglib-2.0.so.0+0x589b0) > #2 0x55ab479530ed 
in sdhci_pci_realize hw/sd/sdhci-pci.c:36:5 > #3 0x55ab476f102a in pci_qdev_realize hw/pci/pci.c:2108:9 > #4 0x55ab48baaad2 in device_set_realized hw/core/qdev.c:761:13 > > SUMMARY: AddressSanitizer: heap-buffer-overflow > (qemu-system-i386+0x1cea56b) in __asan_memcpy > Shadow bytes around the buggy address: > 0x0c2a7710: fa fa fa fa fa fa fa fa fa fa fa fa fa fa fa fa > 0x0c2a7720: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 > 0x0c2a7730: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 > 0x0c2a7740: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 > 0x0c2a7750: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 > =>0x0c2a7760:[fa]fa fa fa fa fa fa fa fa fa fa fa fa fa fa fa > 0x0c2a7770: fd fd fd fd fd fd fd fd fd fd fd fd fd fd fd fd > 0x0c2a7780: fd fd fd fd fd fd fd fd fd fd fd fd fd fd fd fd > 0x0c2a7790: fd fd fd fd fd fd fd fd fd fd fd fd fd fd fd fd > 0x0c2a77a0: fd fd fd fd fd fd fd fd fd fd fd fd fd fd fd fd > 0x0c2a77b0: fa fa fa fa fa fa fa fa fa fa fa fa fa fa fa fa > Shadow byte legend (one shadow byte represents 8 application bytes): > Addressable: 00 > Heap left redzone: fa > Freed heap region: fd > ==2686219==ABORTING > > Fixes: CVE-2020-17380 > Fixes: CVE-2020-25085 > Signed-off-by: Philippe Mathieu-Daudé > --- > Cc: Mauro Matteo Cascella > Cc: Alexander Bulekov > Cc: Alistair Francis > Cc: Prasad J Pandit > Cc: Bandan Das > > RFC because missing Reported-by tags, launchpad/bugzilla links and > qtest reproducer. Sendin
Re: [RFC PATCH v2 3/4] block: Support multiple reopening with x-blockdev-reopen
08.02.2021 21:44, Alberto Garcia wrote: Signed-off-by: Alberto Garcia --- qapi/block-core.json | 2 +- include/block/block.h | 1 + block.c| 16 +-- blockdev.c | 85 +- tests/qemu-iotests/155 | 9 ++-- tests/qemu-iotests/165 | 4 +- tests/qemu-iotests/245 | 27 +++- tests/qemu-iotests/248 | 2 +- tests/qemu-iotests/248.out | 2 +- tests/qemu-iotests/298 | 4 +- 10 files changed, 89 insertions(+), 63 deletions(-) diff --git a/qapi/block-core.json b/qapi/block-core.json index c0e7c23331..b9fcf20a81 100644 --- a/qapi/block-core.json +++ b/qapi/block-core.json @@ -4177,7 +4177,7 @@ # Since: 4.0 ## { 'command': 'x-blockdev-reopen', - 'data': 'BlockdevOptions', 'boxed': true } + 'data': { 'options': ['BlockdevOptions'] } } Do we also want to drop x- prefix? ## # @blockdev-del: diff --git a/include/block/block.h b/include/block/block.h index 6dd687a69e..fe4a220da9 100644 --- a/include/block/block.h +++ b/include/block/block.h @@ -372,6 +372,7 @@ BlockDriverState *bdrv_new_open_driver(BlockDriver *drv, const char *node_name, BlockReopenQueue *bdrv_reopen_queue(BlockReopenQueue *bs_queue, BlockDriverState *bs, QDict *options, bool keep_old_opts); +void bdrv_reopen_queue_free(BlockReopenQueue *bs_queue); int bdrv_reopen_multiple(BlockReopenQueue *bs_queue, Error **errp); int bdrv_reopen_set_read_only(BlockDriverState *bs, bool read_only, Error **errp); diff --git a/block.c b/block.c index 19b62da4af..b4fef2308f 100644 --- a/block.c +++ b/block.c @@ -3933,6 +3933,17 @@ BlockReopenQueue *bdrv_reopen_queue(BlockReopenQueue *bs_queue, NULL, 0, keep_old_opts); } +void bdrv_reopen_queue_free(BlockReopenQueue *bs_queue) +{ +if (bs_queue) { +BlockReopenQueueEntry *bs_entry, *next; +QTAILQ_FOREACH_SAFE(bs_entry, bs_queue, entry, next) { +g_free(bs_entry); +} +g_free(bs_queue); +} +} + /* * Reopen multiple BlockDriverStates atomically & transactionally. 
* @@ -4024,10 +4035,7 @@ abort: } cleanup: -QTAILQ_FOREACH_SAFE(bs_entry, bs_queue, entry, next) { -g_free(bs_entry); -} -g_free(bs_queue); +bdrv_reopen_queue_free(bs_queue); this may be a separate patch return ret; } diff --git a/blockdev.c b/blockdev.c index 098a05709d..6b688c0f73 100644 --- a/blockdev.c +++ b/blockdev.c @@ -3528,38 +3528,16 @@ fail: visit_free(v); } -void qmp_x_blockdev_reopen(BlockdevOptions *options, Error **errp) +void qmp_x_blockdev_reopen(BlockdevOptionsList *reopen_list, Error **errp) { -BlockDriverState *bs; -QObject *obj; -Visitor *v = qobject_output_visitor_new(&obj); -BlockReopenQueue *queue; -QDict *qdict; - -/* Check for the selected node name */ -if (!options->has_node_name) { -error_setg(errp, "Node name not specified"); -goto fail; -} - -bs = bdrv_find_node(options->node_name); -if (!bs) { -error_setg(errp, "Cannot find node named '%s'", options->node_name); -goto fail; -} - -/* Put all options in a QDict and flatten it */ -visit_type_BlockdevOptions(v, NULL, &options, &error_abort); -visit_complete(v, &obj); -qdict = qobject_to(QDict, obj); - -qdict_flatten(qdict); - -/* Perform the reopen operation */ +BlockReopenQueue *queue = NULL; +GSList *aio_ctxs = NULL; +GSList *visitors = NULL; +GSList *drained = NULL; BdrvNextIterator it; -GSList *aio_ctxs = NULL, *ctx; BlockDriverState *it_bs; +/* Acquire all AIO contexts */ for (it_bs = bdrv_first(&it); it_bs; it_bs = bdrv_next(&it)) { AioContext *aio_context = bdrv_get_aio_context(it_bs); @@ -3569,19 +3547,50 @@ void qmp_x_blockdev_reopen(BlockdevOptions *options, Error **errp) } } -bdrv_subtree_drained_begin(bs); -queue = bdrv_reopen_queue(NULL, bs, qdict, false); +/* Add each one of the BDS that we want to reopen to the queue */ +for (; reopen_list != NULL; reopen_list = reopen_list->next) { +BlockdevOptions *options = reopen_list->value; +QDict *qdict; +Visitor *v; +BlockDriverState *bs; +QObject *obj; + +/* Check for the selected node name */ +if (!options->has_node_name) { 
+error_setg(errp, "Node name not specified"); +goto fail; +} + +bs = bdrv_find_node(options->node_name); +if (!bs) { +error_setg(errp, "Cannot find node named '%s'", options->node_name); +goto fail; +} + +v = qobject_output_visitor_new(&obj); +visitors = g_slist_prepend(visitors, v); + +/* Put all options in a QDict and flatten it */ +
[PULL 52/56] hw/block/nvme: fix set feature save field check
From: Gollu Appalanaidu Currently, no features are saveable, so the current check is not wrong, but add a check against the feature capabilities to make sure this will not regress if saveable features are added later. Signed-off-by: Gollu Appalanaidu Reviewed-by: Klaus Jensen Reviewed-by: Keith Busch Signed-off-by: Klaus Jensen --- hw/block/nvme.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hw/block/nvme.c b/hw/block/nvme.c index b3d072c8b2bb..c99a3fbf3461 100644 --- a/hw/block/nvme.c +++ b/hw/block/nvme.c @@ -3324,7 +3324,7 @@ static uint16_t nvme_set_feature(NvmeCtrl *n, NvmeRequest *req) trace_pci_nvme_setfeat(nvme_cid(req), nsid, fid, save, dw11); -if (save) { +if (save && !(nvme_feature_cap[fid] & NVME_FEAT_CAP_SAVE)) { return NVME_FID_NOT_SAVEABLE | NVME_DNR; } -- 2.30.0
[PULL 56/56] hw/block/nvme: refactor the logic for zone write checks
From: Klaus Jensen Refactor the zone write check logic such that the most "meaningful" error is returned first. That is, first, if the zone is not writable, return an appropriate status code for that. Then, make sure we are actually writing at the write pointer and finally check that we do not cross the zone write boundary. This aligns with the "priority" of status codes for zone read checks. Also add a couple of additional descriptive trace events and remove an always true assert. Cc: Dmitry Fomichev Tested-by: Niklas Cassel Tested-by: Dmitry Fomichev Reviewed-by: Dmitry Fomichev Reviewed-by: Keith Busch Signed-off-by: Klaus Jensen --- hw/block/nvme.c | 49 --- hw/block/trace-events | 5 + 2 files changed, 28 insertions(+), 26 deletions(-) diff --git a/hw/block/nvme.c b/hw/block/nvme.c index cedb4ad9ffd3..5ce21b7100b3 100644 --- a/hw/block/nvme.c +++ b/hw/block/nvme.c @@ -1161,56 +1161,53 @@ static inline NvmeZone *nvme_get_zone_by_slba(NvmeNamespace *ns, uint64_t slba) static uint16_t nvme_check_zone_state_for_write(NvmeZone *zone) { -uint16_t status; +uint64_t zslba = zone->d.zslba; switch (nvme_get_zone_state(zone)) { case NVME_ZONE_STATE_EMPTY: case NVME_ZONE_STATE_IMPLICITLY_OPEN: case NVME_ZONE_STATE_EXPLICITLY_OPEN: case NVME_ZONE_STATE_CLOSED: -status = NVME_SUCCESS; -break; +return NVME_SUCCESS; case NVME_ZONE_STATE_FULL: -status = NVME_ZONE_FULL; -break; +trace_pci_nvme_err_zone_is_full(zslba); +return NVME_ZONE_FULL; case NVME_ZONE_STATE_OFFLINE: -status = NVME_ZONE_OFFLINE; -break; +trace_pci_nvme_err_zone_is_offline(zslba); +return NVME_ZONE_OFFLINE; case NVME_ZONE_STATE_READ_ONLY: -status = NVME_ZONE_READ_ONLY; -break; +trace_pci_nvme_err_zone_is_read_only(zslba); +return NVME_ZONE_READ_ONLY; default: assert(false); } -return status; +return NVME_INTERNAL_DEV_ERROR; } static uint16_t nvme_check_zone_write(NvmeCtrl *n, NvmeNamespace *ns, NvmeZone *zone, uint64_t slba, uint32_t nlb) { +uint64_t zcap = nvme_zone_wr_boundary(zone); uint16_t status; -if 
(unlikely((slba + nlb) > nvme_zone_wr_boundary(zone))) { -status = NVME_ZONE_BOUNDARY_ERROR; -} else { -status = nvme_check_zone_state_for_write(zone); -} - +status = nvme_check_zone_state_for_write(zone); if (status) { -trace_pci_nvme_err_zone_write_not_ok(slba, nlb, status); -} else { -assert(nvme_wp_is_valid(zone)); - -if (unlikely(slba != zone->w_ptr)) { -trace_pci_nvme_err_write_not_at_wp(slba, zone->d.zslba, - zone->w_ptr); -status = NVME_ZONE_INVALID_WRITE; -} +return status; } -return status; +if (unlikely(slba != zone->w_ptr)) { +trace_pci_nvme_err_write_not_at_wp(slba, zone->d.zslba, zone->w_ptr); +return NVME_ZONE_INVALID_WRITE; +} + +if (unlikely((slba + nlb) > zcap)) { +trace_pci_nvme_err_zone_boundary(slba, nlb, zcap); +return NVME_ZONE_BOUNDARY_ERROR; +} + +return NVME_SUCCESS; } static uint16_t nvme_check_zone_state_for_read(NvmeZone *zone) diff --git a/hw/block/trace-events b/hw/block/trace-events index 87ab6c509045..d32475c3989e 100644 --- a/hw/block/trace-events +++ b/hw/block/trace-events @@ -129,6 +129,11 @@ pci_nvme_err_unaligned_zone_cmd(uint8_t action, uint64_t slba, uint64_t zslba) " pci_nvme_err_invalid_zone_state_transition(uint8_t action, uint64_t slba, uint8_t attrs) "action=0x%"PRIx8", slba=%"PRIu64", attrs=0x%"PRIx32"" pci_nvme_err_write_not_at_wp(uint64_t slba, uint64_t zone, uint64_t wp) "writing at slba=%"PRIu64", zone=%"PRIu64", but wp=%"PRIu64"" pci_nvme_err_append_not_at_start(uint64_t slba, uint64_t zone) "appending at slba=%"PRIu64", but zone=%"PRIu64"" +pci_nvme_err_zone_is_full(uint64_t zslba) "zslba 0x%"PRIx64"" +pci_nvme_err_zone_is_read_only(uint64_t zslba) "zslba 0x%"PRIx64"" +pci_nvme_err_zone_is_offline(uint64_t zslba) "zslba 0x%"PRIx64"" +pci_nvme_err_zone_boundary(uint64_t slba, uint32_t nlb, uint64_t zcap) "lba 0x%"PRIx64" nlb %"PRIu32" zcap 0x%"PRIx64"" +pci_nvme_err_zone_invalid_write(uint64_t slba, uint64_t wp) "lba 0x%"PRIx64" wp 0x%"PRIx64"" pci_nvme_err_zone_write_not_ok(uint64_t slba, uint32_t nlb, uint16_t 
status) "slba=%"PRIu64", nlb=%"PRIu32", status=0x%"PRIx16"" pci_nvme_err_zone_read_not_ok(uint64_t slba, uint32_t nlb, uint16_t status) "slba=%"PRIu64", nlb=%"PRIu32", status=0x%"PRIx16"" pci_nvme_err_append_too_large(uint64_t slba, uint32_t nlb, uint8_t zasl) "slba=%"PRIu64", nlb=%"PRIu32", zasl=%"PRIu8"" -- 2.30.0
[PULL 55/56] hw/block/nvme: fix zone boundary check for append
From: Klaus Jensen When a zone append is processed the controller checks the validity of the write before assigning the LBA to the append command. This causes the boundary check to be wrong. Fix this by checking the write *after* assigning the LBA. Remove the append special case from the nvme_check_zone_write and open code it in nvme_do_write, assigning the slba when basic sanity checks have been performed. Then check the validity of the resulting write like any other write command. In the process, also fix a missing endianness conversion for the zone append ALBA. Reported-by: Niklas Cassel Cc: Dmitry Fomichev Tested-by: Niklas Cassel Tested-by: Dmitry Fomichev Reviewed-by: Dmitry Fomichev Reviewed-by: Keith Busch Signed-off-by: Klaus Jensen --- hw/block/nvme.c | 46 -- 1 file changed, 24 insertions(+), 22 deletions(-) diff --git a/hw/block/nvme.c b/hw/block/nvme.c index e562d7467b3b..cedb4ad9ffd3 100644 --- a/hw/block/nvme.c +++ b/hw/block/nvme.c @@ -1188,7 +1188,7 @@ static uint16_t nvme_check_zone_state_for_write(NvmeZone *zone) static uint16_t nvme_check_zone_write(NvmeCtrl *n, NvmeNamespace *ns, NvmeZone *zone, uint64_t slba, - uint32_t nlb, bool append) + uint32_t nlb) { uint16_t status; @@ -1202,16 +1202,8 @@ static uint16_t nvme_check_zone_write(NvmeCtrl *n, NvmeNamespace *ns, trace_pci_nvme_err_zone_write_not_ok(slba, nlb, status); } else { assert(nvme_wp_is_valid(zone)); -if (append) { -if (unlikely(slba != zone->d.zslba)) { -trace_pci_nvme_err_append_not_at_start(slba, zone->d.zslba); -status = NVME_INVALID_FIELD; -} -if (nvme_l2b(ns, nlb) > (n->page_size << n->zasl)) { -trace_pci_nvme_err_append_too_large(slba, nlb, n->zasl); -status = NVME_INVALID_FIELD; -} -} else if (unlikely(slba != zone->w_ptr)) { + +if (unlikely(slba != zone->w_ptr)) { trace_pci_nvme_err_write_not_at_wp(slba, zone->d.zslba, zone->w_ptr); status = NVME_ZONE_INVALID_WRITE; @@ -1349,10 +1341,9 @@ static void nvme_finalize_zoned_write(NvmeNamespace *ns, NvmeRequest *req, } } -static
uint64_t nvme_advance_zone_wp(NvmeNamespace *ns, NvmeZone *zone, - uint32_t nlb) +static void nvme_advance_zone_wp(NvmeNamespace *ns, NvmeZone *zone, + uint32_t nlb) { -uint64_t result = zone->w_ptr; uint8_t zs; zone->w_ptr += nlb; @@ -1368,8 +1359,6 @@ static uint64_t nvme_advance_zone_wp(NvmeNamespace *ns, NvmeZone *zone, nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_IMPLICITLY_OPEN); } } - -return result; } static inline bool nvme_is_write(NvmeRequest *req) @@ -1747,7 +1736,24 @@ static uint16_t nvme_do_write(NvmeCtrl *n, NvmeRequest *req, bool append, if (ns->params.zoned) { zone = nvme_get_zone_by_slba(ns, slba); -status = nvme_check_zone_write(n, ns, zone, slba, nlb, append); +if (append) { +if (unlikely(slba != zone->d.zslba)) { +trace_pci_nvme_err_append_not_at_start(slba, zone->d.zslba); +status = NVME_INVALID_FIELD; +goto invalid; +} + +if (nvme_l2b(ns, nlb) > (n->page_size << n->zasl)) { +trace_pci_nvme_err_append_too_large(slba, nlb, n->zasl); +status = NVME_INVALID_FIELD; +goto invalid; +} + +slba = zone->w_ptr; +res->slba = cpu_to_le64(slba); +} + +status = nvme_check_zone_write(n, ns, zone, slba, nlb); if (status) { goto invalid; } @@ -1757,11 +1763,7 @@ static uint16_t nvme_do_write(NvmeCtrl *n, NvmeRequest *req, bool append, goto invalid; } -if (append) { -slba = zone->w_ptr; -} - -res->slba = nvme_advance_zone_wp(ns, zone, nlb); +nvme_advance_zone_wp(ns, zone, nlb); } data_offset = nvme_l2b(ns, slba); -- 2.30.0
[PULL 51/56] hw/block/nvme: fix set feature for error recovery
From: Gollu Appalanaidu Only enable DULBE if the namespace supports it. Signed-off-by: Gollu Appalanaidu Reviewed-by: Klaus Jensen Reviewed-by: Keith Busch Signed-off-by: Klaus Jensen --- hw/block/nvme.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/hw/block/nvme.c b/hw/block/nvme.c index 40784bd908fb..b3d072c8b2bb 100644 --- a/hw/block/nvme.c +++ b/hw/block/nvme.c @@ -3396,7 +3396,9 @@ static uint16_t nvme_set_feature(NvmeCtrl *n, NvmeRequest *req) } assert(ns); -ns->features.err_rec = dw11; +if (NVME_ID_NS_NSFEAT_DULBE(ns->id_ns.nsfeat)) { +ns->features.err_rec = dw11; +} break; case NVME_VOLATILE_WRITE_CACHE: for (i = 1; i <= n->num_namespaces; i++) { -- 2.30.0
[PULL 54/56] hw/block/nvme: fix wrong parameter name 'cross_read'
From: Minwoo Im The actual parameter name is 'cross_read' rather than 'cross_zone_read'. Signed-off-by: Minwoo Im Signed-off-by: Klaus Jensen --- hw/block/nvme.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hw/block/nvme.c b/hw/block/nvme.c index 2335739bdb17..e562d7467b3b 100644 --- a/hw/block/nvme.c +++ b/hw/block/nvme.c @@ -81,7 +81,7 @@ * The default value means there is no limit to the number of * concurrently open zones. * - * zoned.cross_zone_read= + * zoned.cross_read= * Setting this property to true enables Read Across Zone Boundaries. */ -- 2.30.0
[PULL 50/56] hw/block/nvme: error if drive less than a zone size
From: Minwoo Im If a user assigns a backing device with less capacity than the size of a single zone, the namespace capacity will be reported as zero and the kernel will silently fail to allocate the namespace. This patch errors out in case that the backing device cannot accommodate at least a single zone. Signed-off-by: Minwoo Im [k.jensen: small fixup in the error and commit message] Signed-off-by: Klaus Jensen --- hw/block/nvme-ns.c | 7 +++ 1 file changed, 7 insertions(+) diff --git a/hw/block/nvme-ns.c b/hw/block/nvme-ns.c index 3f52acb89c95..dfed71a950fa 100644 --- a/hw/block/nvme-ns.c +++ b/hw/block/nvme-ns.c @@ -134,6 +134,13 @@ static int nvme_ns_zoned_check_calc_geometry(NvmeNamespace *ns, Error **errp) ns->num_zones = ns->size / lbasz / ns->zone_size; /* Do a few more sanity checks of ZNS properties */ +if (!ns->num_zones) { +error_setg(errp, + "insufficient drive capacity, must be at least the size " + "of one zone (%"PRIu64"B)", zone_size); +return -1; +} + if (ns->params.max_open_zones > ns->num_zones) { error_setg(errp, "max_open_zones value %u exceeds the number of zones %u", -- 2.30.0
[PULL 49/56] hw/block/nvme: lift cmb restrictions
From: Klaus Jensen The controller now implements v1.4 and we can lift the restrictions on CMB Data Pointer and Command Independent Locations Support (CDPCILS) and CMB Data Pointer Mixed Locations Support (CDPMLS) since the device really does not care about mixed host/cmb pointers in those cases. Reviewed-by: Keith Busch Reviewed-by: Minwoo Im Signed-off-by: Klaus Jensen --- hw/block/nvme.c | 33 ++--- 1 file changed, 2 insertions(+), 31 deletions(-) diff --git a/hw/block/nvme.c b/hw/block/nvme.c index c4c968f5951e..40784bd908fb 100644 --- a/hw/block/nvme.c +++ b/hw/block/nvme.c @@ -509,7 +509,6 @@ static uint16_t nvme_map_prp(NvmeCtrl *n, uint64_t prp1, uint64_t prp2, trans_len = MIN(len, trans_len); int num_prps = (len >> n->page_bits) + 1; uint16_t status; -bool prp_list_in_cmb = false; int ret; QEMUSGList *qsg = &req->qsg; @@ -535,10 +534,6 @@ static uint16_t nvme_map_prp(NvmeCtrl *n, uint64_t prp1, uint64_t prp2, uint32_t nents, prp_trans; int i = 0; -if (nvme_addr_is_cmb(n, prp2)) { -prp_list_in_cmb = true; -} - nents = (len + n->page_size - 1) >> n->page_bits; prp_trans = MIN(n->max_prp_ents, nents) * sizeof(uint64_t); ret = nvme_addr_read(n, prp2, (void *)prp_list, prp_trans); @@ -555,10 +550,6 @@ static uint16_t nvme_map_prp(NvmeCtrl *n, uint64_t prp1, uint64_t prp2, return NVME_INVALID_PRP_OFFSET | NVME_DNR; } -if (prp_list_in_cmb != nvme_addr_is_cmb(n, prp_ent)) { -return NVME_INVALID_USE_OF_CMB | NVME_DNR; -} - i = 0; nents = (len + n->page_size - 1) >> n->page_bits; prp_trans = MIN(n->max_prp_ents, nents) * sizeof(uint64_t); @@ -692,7 +683,6 @@ static uint16_t nvme_map_sgl(NvmeCtrl *n, QEMUSGList *qsg, QEMUIOVector *iov, uint64_t nsgld; uint32_t seg_len; uint16_t status; -bool sgl_in_cmb = false; hwaddr addr; int ret; @@ -714,18 +704,6 @@ static uint16_t nvme_map_sgl(NvmeCtrl *n, QEMUSGList *qsg, QEMUIOVector *iov, goto out; } -/* - * If the segment is located in the CMB, the submission queue of the - * request must also reside there. 
- */ -if (nvme_addr_is_cmb(n, addr)) { -if (!nvme_addr_is_cmb(n, req->sq->dma_addr)) { -return NVME_INVALID_USE_OF_CMB | NVME_DNR; -} - -sgl_in_cmb = true; -} - for (;;) { switch (NVME_SGL_TYPE(sgld->type)) { case NVME_SGL_DESCR_TYPE_SEGMENT: @@ -814,15 +792,6 @@ static uint16_t nvme_map_sgl(NvmeCtrl *n, QEMUSGList *qsg, QEMUIOVector *iov, if (status) { goto unmap; } - -/* - * If the next segment is in the CMB, make sure that the sgl was - * already located there. - */ -if (sgl_in_cmb != nvme_addr_is_cmb(n, addr)) { -status = NVME_INVALID_USE_OF_CMB | NVME_DNR; -goto unmap; -} } out: @@ -3777,6 +3746,8 @@ static int nvme_start_ctrl(NvmeCtrl *n) static void nvme_cmb_enable_regs(NvmeCtrl *n) { +NVME_CMBLOC_SET_CDPCILS(n->bar.cmbloc, 1); +NVME_CMBLOC_SET_CDPMLS(n->bar.cmbloc, 1); NVME_CMBLOC_SET_BIR(n->bar.cmbloc, NVME_CMB_BIR); NVME_CMBSZ_SET_SQS(n->bar.cmbsz, 1); -- 2.30.0
Re: [RFC PATCH v2 1/4] block: Allow changing bs->file on reopen
08.02.2021 21:44, Alberto Garcia wrote: When the x-blockdev-reopen was added it allowed reconfiguring the graph by replacing backing files, but changing the 'file' option was forbidden. Because of this restriction some operations are not possible, notably inserting and removing block filters. This patch adds support for replacing the 'file' option. This is similar to replacing the backing file and the user is likewise responsible for the correctness of the resulting graph, otherwise this can lead to data corruption. Signed-off-by: Alberto Garcia --- include/block/block.h | 1 + block.c| 65 ++ tests/qemu-iotests/245 | 7 +++-- 3 files changed, 70 insertions(+), 3 deletions(-) diff --git a/include/block/block.h b/include/block/block.h index 82271d9ccd..6dd687a69e 100644 --- a/include/block/block.h +++ b/include/block/block.h @@ -196,6 +196,7 @@ typedef struct BDRVReopenState { bool backing_missing; bool replace_backing_bs; /* new_backing_bs is ignored if this is false */ BlockDriverState *old_backing_bs; /* keep pointer for permissions update */ +BlockDriverState *old_file_bs;/* keep pointer for permissions update */ uint64_t perm, shared_perm; QDict *options; QDict *explicit_options; diff --git a/block.c b/block.c index 576b145cbf..19b62da4af 100644 --- a/block.c +++ b/block.c @@ -3978,6 +3978,10 @@ int bdrv_reopen_multiple(BlockReopenQueue *bs_queue, Error **errp) refresh_list = bdrv_topological_dfs(refresh_list, found, state->old_backing_bs); } +if (state->old_file_bs) { +refresh_list = bdrv_topological_dfs(refresh_list, found, +state->old_file_bs); +} } ret = bdrv_list_refresh_perms(refresh_list, bs_queue, &tran, errp); @@ -4196,6 +4200,61 @@ static int bdrv_reopen_parse_backing(BDRVReopenState *reopen_state, return 0; } +static int bdrv_reopen_parse_file(BDRVReopenState *reopen_state, + GSList **tran, + Error **errp) +{ +BlockDriverState *bs = reopen_state->bs; +BlockDriverState *new_file_bs; +QObject *value; +const char *str; + +value = 
qdict_get(reopen_state->options, "file"); +if (value == NULL) { +return 0; +} + +/* The 'file' option only allows strings */ +assert(qobject_type(value) == QTYPE_QSTRING); + +str = qobject_get_try_str(value); +new_file_bs = bdrv_lookup_bs(NULL, str, errp); +if (new_file_bs == NULL) { +return -EINVAL; +} else if (bdrv_recurse_has_child(new_file_bs, bs)) { +error_setg(errp, "Making '%s' a file of '%s' " + "would create a cycle", str, bs->node_name); +return -EINVAL; +} + +assert(bs->file && bs->file->bs); + +/* If 'file' points to the current child then there's nothing to do */ +if (bs->file->bs == new_file_bs) { +return 0; +} + +if (bs->file->frozen) { +error_setg(errp, "Cannot change the 'file' link of '%s' " + "from '%s' to '%s'", bs->node_name, + bs->file->bs->node_name, new_file_bs->node_name); +return -EPERM; +} + +/* Check AioContext compatibility */ +if (!bdrv_reopen_can_attach(bs, bs->file, new_file_bs, errp)) { +return -EINVAL; +} + +/* Store the old file bs because we'll need to refresh its permissions */ +reopen_state->old_file_bs = bs->file->bs; + +/* And finally replace the child */ +bdrv_replace_child(bs->file, new_file_bs, tran); + +return 0; +} The function mostly do the same that bdrv_reopen_parse_backing().. I don't think that they should really differ. Probably it should be one function. At least, they should work absolutely the same way for backing-child based and file-child based filters. And you lose bdrv_is_backing_chain_frozen() check + /* * Prepares a BlockDriverState for reopen. 
All changes are staged in the * 'opaque' field of the BDRVReopenState, which is used and allocated by @@ -4347,6 +4406,12 @@ static int bdrv_reopen_prepare(BDRVReopenState *reopen_state, } qdict_del(reopen_state->options, "backing"); +ret = bdrv_reopen_parse_file(reopen_state, set_backings_tran, errp); +if (ret < 0) { +goto error; +} +qdict_del(reopen_state->options, "file"); + /* Options that are not handled are only okay if they are unchanged * compared to the old state. It is expected that some options are only * used for the initial open, but not reopen (e.g. filename) */ diff --git a/tests/qemu-iotests/245 b/tests/qemu-iotests/245 index e60c8326d3..f9d68b3958 100755 --- a/tests/qemu-iotests/245 +++ b/tests/qemu-iotests/245 @@ -145,8 +145,8 @@ class TestBlockdevReopen(iotests.QMPTestCase): self.reopen(
[PULL 47/56] hw/block/nvme: move cmb logic to v1.4
From: Padmakar Kalghatgi Implement v1.4 logic for configuring the Controller Memory Buffer. By default, the v1.4 scheme will be used (CMB must be explicitly enabled by the host), so drivers that only support v1.3 will not be able to use the CMB anymore. To retain the v1.3 behavior, set the boolean 'legacy-cmb' nvme device parameter. Reviewed-by: Keith Busch Reviewed-by: Minwoo Im Signed-off-by: Padmakar Kalghatgi Signed-off-by: Klaus Jensen --- hw/block/nvme.h | 10 +++- include/block/nvme.h | 107 +- hw/block/nvme.c | 101 +-- hw/block/trace-events | 2 + 4 files changed, 182 insertions(+), 38 deletions(-) diff --git a/hw/block/nvme.h b/hw/block/nvme.h index b7702e937e56..dee6092bd45f 100644 --- a/hw/block/nvme.h +++ b/hw/block/nvme.h @@ -20,6 +20,7 @@ typedef struct NvmeParams { uint8_t mdts; bool use_intel_id; uint32_t zasl_bs; +bool legacy_cmb; } NvmeParams; typedef struct NvmeAsyncEvent { @@ -127,7 +128,6 @@ typedef struct NvmeCtrl { PCIDeviceparent_obj; MemoryRegion bar0; MemoryRegion iomem; -MemoryRegion ctrl_mem; NvmeBar bar; NvmeParams params; NvmeBus bus; @@ -143,7 +143,6 @@ typedef struct NvmeCtrl { uint32_tnum_namespaces; uint32_tmax_q_ents; uint8_t outstanding_aers; -uint8_t *cmbuf; uint32_tirq_status; uint64_thost_timestamp; /* Timestamp sent by the host */ uint64_ttimestamp_set_qemu_clock_ms;/* QEMU clock time */ @@ -151,6 +150,13 @@ typedef struct NvmeCtrl { uint16_ttemperature; uint8_t smart_critical_warning; +struct { +MemoryRegion mem; +uint8_t *buf; +bool cmse; +hwaddr cba; +} cmb; + struct { HostMemoryBackend *dev; bool cmse; diff --git a/include/block/nvme.h b/include/block/nvme.h index 008108bd1af8..2e85b97a6c4e 100644 --- a/include/block/nvme.h +++ b/include/block/nvme.h @@ -15,14 +15,19 @@ typedef struct QEMU_PACKED NvmeBar { uint64_tacq; uint32_tcmbloc; uint32_tcmbsz; -uint8_t padding[3520]; /* not used by QEMU */ +uint32_tbpinfo; +uint32_tbprsel; +uint64_tbpmbl; +uint64_tcmbmsc; +uint32_tcmbsts; +uint8_t rsvd92[3492]; uint32_tpmrcap; 
uint32_tpmrctl; uint32_tpmrsts; uint32_tpmrebs; uint32_tpmrswtp; uint64_tpmrmsc; -uint8_t reserved[484]; +uint8_t css[484]; } NvmeBar; enum NvmeCapShift { @@ -63,6 +68,7 @@ enum NvmeCapMask { #define NVME_CAP_MPSMIN(cap)(((cap) >> CAP_MPSMIN_SHIFT) & CAP_MPSMIN_MASK) #define NVME_CAP_MPSMAX(cap)(((cap) >> CAP_MPSMAX_SHIFT) & CAP_MPSMAX_MASK) #define NVME_CAP_PMRS(cap) (((cap) >> CAP_PMRS_SHIFT) & CAP_PMRS_MASK) +#define NVME_CAP_CMBS(cap) (((cap) >> CAP_CMBS_SHIFT) & CAP_CMBS_MASK) #define NVME_CAP_SET_MQES(cap, val) (cap |= (uint64_t)(val & CAP_MQES_MASK) \ << CAP_MQES_SHIFT) @@ -184,25 +190,64 @@ enum NvmeAqaMask { #define NVME_AQA_ACQS(aqa) ((aqa >> AQA_ACQS_SHIFT) & AQA_ACQS_MASK) enum NvmeCmblocShift { -CMBLOC_BIR_SHIFT = 0, -CMBLOC_OFST_SHIFT = 12, +CMBLOC_BIR_SHIFT = 0, +CMBLOC_CQMMS_SHIFT = 3, +CMBLOC_CQPDS_SHIFT = 4, +CMBLOC_CDPMLS_SHIFT = 5, +CMBLOC_CDPCILS_SHIFT = 6, +CMBLOC_CDMMMS_SHIFT = 7, +CMBLOC_CQDA_SHIFT= 8, +CMBLOC_OFST_SHIFT= 12, }; enum NvmeCmblocMask { -CMBLOC_BIR_MASK = 0x7, -CMBLOC_OFST_MASK = 0xf, +CMBLOC_BIR_MASK = 0x7, +CMBLOC_CQMMS_MASK = 0x1, +CMBLOC_CQPDS_MASK = 0x1, +CMBLOC_CDPMLS_MASK = 0x1, +CMBLOC_CDPCILS_MASK = 0x1, +CMBLOC_CDMMMS_MASK = 0x1, +CMBLOC_CQDA_MASK= 0x1, +CMBLOC_OFST_MASK= 0xf, }; -#define NVME_CMBLOC_BIR(cmbloc) ((cmbloc >> CMBLOC_BIR_SHIFT) & \ - CMBLOC_BIR_MASK) -#define NVME_CMBLOC_OFST(cmbloc)((cmbloc >> CMBLOC_OFST_SHIFT) & \ - CMBLOC_OFST_MASK) +#define NVME_CMBLOC_BIR(cmbloc) \ +((cmbloc >> CMBLOC_BIR_SHIFT) & CMBLOC_BIR_MASK) +#define NVME_CMBLOC_CQMMS(cmbloc) \ +((cmbloc >> CMBLOC_CQMMS_SHIFT) & CMBLOC_CQMMS_MASK) +#define NVME_CMBLOC_CQPDS(cmbloc) \ +((cmbloc >> CMBLOC_CQPDS_SHIFT) & CMBLOC_CQPDS_MASK) +#define NVME_CMBLOC_CDPMLS(cmbloc) \ +((cmbloc >> CMBLOC_CDPMLS_SHIFT) & CMBLOC_CDPMLS_MASK) +#define NVME_CMBLOC_CDPCILS(cmbloc) \ +((cmbloc >> CMBLOC_CDPCILS_SHIFT) & CMBLOC_CDPCILS_MASK) +#define NVME_CMBLOC_CDMMMS(cmbloc) \ +((cmbloc >> CMBLOC_CDMMMS_SHIFT) & CMBLOC_CDMMMS_MASK) +#define 
NVME_CMBLOC_CQDA(cmbloc) \ +((cmbloc >> CMBLOC_CQDA_SHIFT) & CMBLOC_CQDA_MASK) +#define NVME_CMBLOC_OFST(cmbloc) \ +((cmbloc >> CMBLOC_OFST_SHIFT) & CMBLOC_OFST_MASK) -#de
[PULL 43/56] hw/block/nvme: rename PMR/CMB shift/mask fields
From: Klaus Jensen Use the correct field names. Reviewed-by: Minwoo Im Reviewed-by: Keith Busch Signed-off-by: Klaus Jensen --- include/block/nvme.h | 18 +- hw/block/nvme.c | 2 +- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/include/block/nvme.h b/include/block/nvme.h index 151921da21f9..008108bd1af8 100644 --- a/include/block/nvme.h +++ b/include/block/nvme.h @@ -35,8 +35,8 @@ enum NvmeCapShift { CAP_CSS_SHIFT = 37, CAP_MPSMIN_SHIFT = 48, CAP_MPSMAX_SHIFT = 52, -CAP_PMR_SHIFT = 56, -CAP_CMB_SHIFT = 57, +CAP_PMRS_SHIFT = 56, +CAP_CMBS_SHIFT = 57, }; enum NvmeCapMask { @@ -49,8 +49,8 @@ enum NvmeCapMask { CAP_CSS_MASK = 0xff, CAP_MPSMIN_MASK= 0xf, CAP_MPSMAX_MASK= 0xf, -CAP_PMR_MASK = 0x1, -CAP_CMB_MASK = 0x1, +CAP_PMRS_MASK = 0x1, +CAP_CMBS_MASK = 0x1, }; #define NVME_CAP_MQES(cap) (((cap) >> CAP_MQES_SHIFT) & CAP_MQES_MASK) @@ -62,7 +62,7 @@ enum NvmeCapMask { #define NVME_CAP_CSS(cap) (((cap) >> CAP_CSS_SHIFT)& CAP_CSS_MASK) #define NVME_CAP_MPSMIN(cap)(((cap) >> CAP_MPSMIN_SHIFT) & CAP_MPSMIN_MASK) #define NVME_CAP_MPSMAX(cap)(((cap) >> CAP_MPSMAX_SHIFT) & CAP_MPSMAX_MASK) -#define NVME_CAP_PMR(cap) (((cap) >> CAP_PMR_SHIFT)& CAP_PMR_MASK) +#define NVME_CAP_PMRS(cap) (((cap) >> CAP_PMRS_SHIFT) & CAP_PMRS_MASK) #define NVME_CAP_SET_MQES(cap, val) (cap |= (uint64_t)(val & CAP_MQES_MASK) \ << CAP_MQES_SHIFT) @@ -82,10 +82,10 @@ enum NvmeCapMask { << CAP_MPSMIN_SHIFT) #define NVME_CAP_SET_MPSMAX(cap, val) (cap |= (uint64_t)(val & CAP_MPSMAX_MASK)\ << CAP_MPSMAX_SHIFT) -#define NVME_CAP_SET_PMRS(cap, val) (cap |= (uint64_t)(val & CAP_PMR_MASK) \ - << CAP_PMR_SHIFT) -#define NVME_CAP_SET_CMBS(cap, val) (cap |= (uint64_t)(val & CAP_CMB_MASK) \ - << CAP_CMB_SHIFT) +#define NVME_CAP_SET_PMRS(cap, val) (cap |= (uint64_t)(val & CAP_PMRS_MASK) \ + << CAP_PMRS_SHIFT) +#define NVME_CAP_SET_CMBS(cap, val) (cap |= (uint64_t)(val & CAP_CMBS_MASK) \ + << CAP_CMBS_SHIFT) enum NvmeCapCss { NVME_CAP_CSS_NVM= 1 << 0, diff --git a/hw/block/nvme.c b/hw/block/nvme.c 
index 4ce75642f1a4..0057a02402b7 100644 --- a/hw/block/nvme.c +++ b/hw/block/nvme.c @@ -4501,7 +4501,7 @@ static void nvme_set_smart_warning(Object *obj, Visitor *v, const char *name, cap = NVME_SMART_SPARE | NVME_SMART_TEMPERATURE | NVME_SMART_RELIABILITY | NVME_SMART_MEDIA_READ_ONLY | NVME_SMART_FAILED_VOLATILE_MEDIA; -if (NVME_CAP_PMR(n->bar.cap)) { +if (NVME_CAP_PMRS(n->bar.cap)) { cap |= NVME_SMART_PMR_UNRELIABLE; } -- 2.30.0
[PULL 40/56] hw/block/nvme: indicate CMB support through controller capabilities register
From: Andrzej Jakowski This patch sets CMBS bit in controller capabilities register when user configures NVMe driver with CMB support, so capabilities are correctly reported to guest OS. Signed-off-by: Andrzej Jakowski Reviewed-by: Maxim Levitsky Reviewed-by: Minwoo Im Reviewed-by: Keith Busch Signed-off-by: Klaus Jensen --- include/block/nvme.h | 10 +++--- hw/block/nvme.c | 1 + 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/include/block/nvme.h b/include/block/nvme.h index 854fb2abb6f8..151921da21f9 100644 --- a/include/block/nvme.h +++ b/include/block/nvme.h @@ -36,6 +36,7 @@ enum NvmeCapShift { CAP_MPSMIN_SHIFT = 48, CAP_MPSMAX_SHIFT = 52, CAP_PMR_SHIFT = 56, +CAP_CMB_SHIFT = 57, }; enum NvmeCapMask { @@ -49,6 +50,7 @@ enum NvmeCapMask { CAP_MPSMIN_MASK= 0xf, CAP_MPSMAX_MASK= 0xf, CAP_PMR_MASK = 0x1, +CAP_CMB_MASK = 0x1, }; #define NVME_CAP_MQES(cap) (((cap) >> CAP_MQES_SHIFT) & CAP_MQES_MASK) @@ -79,9 +81,11 @@ enum NvmeCapMask { #define NVME_CAP_SET_MPSMIN(cap, val) (cap |= (uint64_t)(val & CAP_MPSMIN_MASK)\ << CAP_MPSMIN_SHIFT) #define NVME_CAP_SET_MPSMAX(cap, val) (cap |= (uint64_t)(val & CAP_MPSMAX_MASK)\ -<< CAP_MPSMAX_SHIFT) -#define NVME_CAP_SET_PMRS(cap, val) (cap |= (uint64_t)(val & CAP_PMR_MASK)\ -<< CAP_PMR_SHIFT) + << CAP_MPSMAX_SHIFT) +#define NVME_CAP_SET_PMRS(cap, val) (cap |= (uint64_t)(val & CAP_PMR_MASK) \ + << CAP_PMR_SHIFT) +#define NVME_CAP_SET_CMBS(cap, val) (cap |= (uint64_t)(val & CAP_CMB_MASK) \ + << CAP_CMB_SHIFT) enum NvmeCapCss { NVME_CAP_CSS_NVM= 1 << 0, diff --git a/hw/block/nvme.c b/hw/block/nvme.c index 2785127037db..5f12ac1200ec 100644 --- a/hw/block/nvme.c +++ b/hw/block/nvme.c @@ -4374,6 +4374,7 @@ static void nvme_init_ctrl(NvmeCtrl *n, PCIDevice *pci_dev) NVME_CAP_SET_CSS(n->bar.cap, NVME_CAP_CSS_CSI_SUPP); NVME_CAP_SET_CSS(n->bar.cap, NVME_CAP_CSS_ADMIN_ONLY); NVME_CAP_SET_MPSMAX(n->bar.cap, 4); +NVME_CAP_SET_CMBS(n->bar.cap, n->params.cmb_size_mb ? 
1 : 0); n->bar.vs = NVME_SPEC_VER; n->bar.intmc = n->bar.intms = 0; -- 2.30.0
[PULL 53/56] hw/block/nvme: align with existing style
From: Gollu Appalanaidu Change status checks to align with the existing style and remove the explicit check against NVME_SUCCESS. Cc: Dmitry Fomichev Signed-off-by: Gollu Appalanaidu Reviewed-by: Klaus Jensen Reviewed-by: Keith Busch Reviewed-by: Dmitry Fomichev Signed-off-by: Klaus Jensen --- hw/block/nvme.c | 20 ++-- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/hw/block/nvme.c b/hw/block/nvme.c index c99a3fbf3461..2335739bdb17 100644 --- a/hw/block/nvme.c +++ b/hw/block/nvme.c @@ -1198,7 +1198,7 @@ static uint16_t nvme_check_zone_write(NvmeCtrl *n, NvmeNamespace *ns, status = nvme_check_zone_state_for_write(zone); } -if (status != NVME_SUCCESS) { +if (status) { trace_pci_nvme_err_zone_write_not_ok(slba, nlb, status); } else { assert(nvme_wp_is_valid(zone)); @@ -1253,7 +1253,7 @@ static uint16_t nvme_check_zone_read(NvmeNamespace *ns, uint64_t slba, uint16_t status; status = nvme_check_zone_state_for_read(zone); -if (status != NVME_SUCCESS) { +if (status) { ; } else if (unlikely(end > bndry)) { if (!ns->params.cross_zone_read) { @@ -1266,7 +1266,7 @@ static uint16_t nvme_check_zone_read(NvmeNamespace *ns, uint64_t slba, do { zone++; status = nvme_check_zone_state_for_read(zone); -if (status != NVME_SUCCESS) { +if (status) { break; } } while (end > nvme_zone_rd_boundary(ns, zone)); @@ -1677,7 +1677,7 @@ static uint16_t nvme_read(NvmeCtrl *n, NvmeRequest *req) if (ns->params.zoned) { status = nvme_check_zone_read(ns, slba, nlb); -if (status != NVME_SUCCESS) { +if (status) { trace_pci_nvme_err_zone_read_not_ok(slba, nlb, status); goto invalid; } @@ -1748,12 +1748,12 @@ static uint16_t nvme_do_write(NvmeCtrl *n, NvmeRequest *req, bool append, zone = nvme_get_zone_by_slba(ns, slba); status = nvme_check_zone_write(n, ns, zone, slba, nlb, append); -if (status != NVME_SUCCESS) { +if (status) { goto invalid; } status = nvme_auto_open_zone(ns, zone); -if (status != NVME_SUCCESS) { +if (status) { goto invalid; } @@ -1852,14 +1852,14 @@ static uint16_t 
nvme_open_zone(NvmeNamespace *ns, NvmeZone *zone, switch (state) { case NVME_ZONE_STATE_EMPTY: status = nvme_aor_check(ns, 1, 0); -if (status != NVME_SUCCESS) { +if (status) { return status; } nvme_aor_inc_active(ns); /* fall through */ case NVME_ZONE_STATE_CLOSED: status = nvme_aor_check(ns, 0, 1); -if (status != NVME_SUCCESS) { +if (status) { if (state == NVME_ZONE_STATE_EMPTY) { nvme_aor_dec_active(ns); } @@ -1972,7 +1972,7 @@ static uint16_t nvme_set_zd_ext(NvmeNamespace *ns, NvmeZone *zone) if (state == NVME_ZONE_STATE_EMPTY) { status = nvme_aor_check(ns, 1, 0); -if (status != NVME_SUCCESS) { +if (status) { return status; } nvme_aor_inc_active(ns); @@ -3301,7 +3301,7 @@ static uint16_t nvme_set_feature_timestamp(NvmeCtrl *n, NvmeRequest *req) ret = nvme_dma(n, (uint8_t *)×tamp, sizeof(timestamp), DMA_DIRECTION_TO_DEVICE, req); -if (ret != NVME_SUCCESS) { +if (ret) { return ret; } -- 2.30.0
[PULL 44/56] hw/block/nvme: remove redundant zeroing of PMR registers
From: Klaus Jensen The controller registers are initially zero. Remove the redundant zeroing. Reviewed-by: Keith Busch Reviewed-by: Minwoo Im Signed-off-by: Klaus Jensen --- hw/block/nvme.c | 34 -- 1 file changed, 34 deletions(-) diff --git a/hw/block/nvme.c b/hw/block/nvme.c index 0057a02402b7..f8dd771925f9 100644 --- a/hw/block/nvme.c +++ b/hw/block/nvme.c @@ -4217,43 +4217,9 @@ static void nvme_init_cmb(NvmeCtrl *n, PCIDevice *pci_dev) static void nvme_init_pmr(NvmeCtrl *n, PCIDevice *pci_dev) { -/* PMR Capabities register */ -n->bar.pmrcap = 0; -NVME_PMRCAP_SET_RDS(n->bar.pmrcap, 0); -NVME_PMRCAP_SET_WDS(n->bar.pmrcap, 0); NVME_PMRCAP_SET_BIR(n->bar.pmrcap, NVME_PMR_BIR); -NVME_PMRCAP_SET_PMRTU(n->bar.pmrcap, 0); /* Turn on bit 1 support */ NVME_PMRCAP_SET_PMRWBM(n->bar.pmrcap, 0x02); -NVME_PMRCAP_SET_PMRTO(n->bar.pmrcap, 0); -NVME_PMRCAP_SET_CMSS(n->bar.pmrcap, 0); - -/* PMR Control register */ -n->bar.pmrctl = 0; -NVME_PMRCTL_SET_EN(n->bar.pmrctl, 0); - -/* PMR Status register */ -n->bar.pmrsts = 0; -NVME_PMRSTS_SET_ERR(n->bar.pmrsts, 0); -NVME_PMRSTS_SET_NRDY(n->bar.pmrsts, 0); -NVME_PMRSTS_SET_HSTS(n->bar.pmrsts, 0); -NVME_PMRSTS_SET_CBAI(n->bar.pmrsts, 0); - -/* PMR Elasticity Buffer Size register */ -n->bar.pmrebs = 0; -NVME_PMREBS_SET_PMRSZU(n->bar.pmrebs, 0); -NVME_PMREBS_SET_RBB(n->bar.pmrebs, 0); -NVME_PMREBS_SET_PMRWBZ(n->bar.pmrebs, 0); - -/* PMR Sustained Write Throughput register */ -n->bar.pmrswtp = 0; -NVME_PMRSWTP_SET_PMRSWTU(n->bar.pmrswtp, 0); -NVME_PMRSWTP_SET_PMRSWTV(n->bar.pmrswtp, 0); - -/* PMR Memory Space Control register */ -n->bar.pmrmsc = 0; -NVME_PMRMSC_SET_CMSE(n->bar.pmrmsc, 0); -NVME_PMRMSC_SET_CBA(n->bar.pmrmsc, 0); pci_register_bar(pci_dev, NVME_PMRCAP_BIR(n->bar.pmrcap), PCI_BASE_ADDRESS_SPACE_MEMORY | -- 2.30.0
[PULL 37/56] hw/block/nvme: trigger async event during injecting smart warning
From: zhenwei pi During smart critical warning injection by setting property from QMP command, also try to trigger asynchronous event. Suggested by Keith, if an event has already been raised, there is no need to enqueue the duplicate event any more. Signed-off-by: zhenwei pi [k.jensen: fix typo in commit message] Signed-off-by: Klaus Jensen --- include/block/nvme.h | 1 + hw/block/nvme.c | 48 +--- 2 files changed, 42 insertions(+), 7 deletions(-) diff --git a/include/block/nvme.h b/include/block/nvme.h index 88af3b42348c..854fb2abb6f8 100644 --- a/include/block/nvme.h +++ b/include/block/nvme.h @@ -784,6 +784,7 @@ typedef struct QEMU_PACKED NvmeSmartLog { uint8_t reserved2[320]; } NvmeSmartLog; +#define NVME_SMART_WARN_MAX 6 enum NvmeSmartWarn { NVME_SMART_SPARE = 1 << 0, NVME_SMART_TEMPERATURE= 1 << 1, diff --git a/hw/block/nvme.c b/hw/block/nvme.c index f0cb7acd7454..09eb1f06e8b1 100644 --- a/hw/block/nvme.c +++ b/hw/block/nvme.c @@ -980,6 +980,35 @@ static void nvme_enqueue_event(NvmeCtrl *n, uint8_t event_type, nvme_process_aers(n); } +static void nvme_smart_event(NvmeCtrl *n, uint8_t event) +{ +uint8_t aer_info; + +/* Ref SPEC */ +if (!(NVME_AEC_SMART(n->features.async_config) & event)) { +return; +} + +switch (event) { +case NVME_SMART_SPARE: +aer_info = NVME_AER_INFO_SMART_SPARE_THRESH; +break; +case NVME_SMART_TEMPERATURE: +aer_info = NVME_AER_INFO_SMART_TEMP_THRESH; +break; +case NVME_SMART_RELIABILITY: +case NVME_SMART_MEDIA_READ_ONLY: +case NVME_SMART_FAILED_VOLATILE_MEDIA: +case NVME_SMART_PMR_UNRELIABLE: +aer_info = NVME_AER_INFO_SMART_RELIABILITY; +break; +default: +return; +} + +nvme_enqueue_event(n, NVME_AER_TYPE_SMART, aer_info, NVME_LOG_SMART_INFO); +} + static void nvme_clear_events(NvmeCtrl *n, uint8_t event_type) { n->aer_mask &= ~(1 << event_type); @@ -3317,12 +3346,9 @@ static uint16_t nvme_set_feature(NvmeCtrl *n, NvmeRequest *req) return NVME_INVALID_FIELD | NVME_DNR; } -if (((n->temperature >= n->features.temp_thresh_hi) || - (n->temperature 
<= n->features.temp_thresh_low)) && -NVME_AEC_SMART(n->features.async_config) & NVME_SMART_TEMPERATURE) { -nvme_enqueue_event(n, NVME_AER_TYPE_SMART, - NVME_AER_INFO_SMART_TEMP_THRESH, - NVME_LOG_SMART_INFO); +if ((n->temperature >= n->features.temp_thresh_hi) || +(n->temperature <= n->features.temp_thresh_low)) { +nvme_smart_event(n, NVME_AER_INFO_SMART_TEMP_THRESH); } break; @@ -4446,7 +4472,7 @@ static void nvme_set_smart_warning(Object *obj, Visitor *v, const char *name, void *opaque, Error **errp) { NvmeCtrl *n = NVME(obj); -uint8_t value, cap = 0; +uint8_t value, old_value, cap = 0, index, event; if (!visit_type_uint8(v, name, &value, errp)) { return; @@ -4464,7 +4490,15 @@ static void nvme_set_smart_warning(Object *obj, Visitor *v, const char *name, return; } +old_value = n->smart_critical_warning; n->smart_critical_warning = value; + +/* only inject new bits of smart critical warning */ +for (index = 0; index < NVME_SMART_WARN_MAX; index++) { +event = 1 << index; +if (value & ~old_value & event) +nvme_smart_event(n, event); +} } static const VMStateDescription nvme_vmstate = { -- 2.30.0
[PULL 35/56] nvme: introduce bit 5 for critical warning
From: zhenwei pi According to NVM Express v1.4, Section 5.14.1.2 ("SMART / Health Information"), introduce bit 5 for "Persistent Memory Region has become read-only or unreliable". Signed-off-by: zhenwei pi [k.jensen: minor brush ups in commit message] Signed-off-by: Klaus Jensen --- include/block/nvme.h | 1 + 1 file changed, 1 insertion(+) diff --git a/include/block/nvme.h b/include/block/nvme.h index 45b2678db1f0..41614c5e12af 100644 --- a/include/block/nvme.h +++ b/include/block/nvme.h @@ -789,6 +789,7 @@ enum NvmeSmartWarn { NVME_SMART_RELIABILITY= 1 << 2, NVME_SMART_MEDIA_READ_ONLY= 1 << 3, NVME_SMART_FAILED_VOLATILE_MEDIA = 1 << 4, +NVME_SMART_PMR_UNRELIABLE = 1 << 5, }; typedef struct NvmeEffectsLog { -- 2.30.0
[PULL 46/56] hw/block/nvme: add PMR RDS/WDS support
From: Naveen Nagar Add support for the PMRMSCL and PMRMSCU MMIO registers. This allows adding RDS/WDS support for PMR as well. Reviewed-by: Keith Busch Signed-off-by: Naveen Nagar Signed-off-by: Klaus Jensen --- hw/block/nvme.h | 6 ++- hw/block/nvme.c | 122 +++- 2 files changed, 106 insertions(+), 22 deletions(-) diff --git a/hw/block/nvme.h b/hw/block/nvme.h index 1cdb360bc549..b7702e937e56 100644 --- a/hw/block/nvme.h +++ b/hw/block/nvme.h @@ -151,7 +151,11 @@ typedef struct NvmeCtrl { uint16_ttemperature; uint8_t smart_critical_warning; -HostMemoryBackend *pmrdev; +struct { +HostMemoryBackend *dev; +bool cmse; +hwaddrcba; +} pmr; uint8_t aer_mask; NvmeRequest **aer_reqs; diff --git a/hw/block/nvme.c b/hw/block/nvme.c index d773796051d6..7f1c8dd7751c 100644 --- a/hw/block/nvme.c +++ b/hw/block/nvme.c @@ -273,6 +273,24 @@ static inline void *nvme_addr_to_cmb(NvmeCtrl *n, hwaddr addr) return &n->cmbuf[addr - n->ctrl_mem.addr]; } +static bool nvme_addr_is_pmr(NvmeCtrl *n, hwaddr addr) +{ +hwaddr hi; + +if (!n->pmr.cmse) { +return false; +} + +hi = n->pmr.cba + int128_get64(n->pmr.dev->mr.size); + +return addr >= n->pmr.cba && addr < hi; +} + +static inline void *nvme_addr_to_pmr(NvmeCtrl *n, hwaddr addr) +{ +return memory_region_get_ram_ptr(&n->pmr.dev->mr) + (addr - n->pmr.cba); +} + static int nvme_addr_read(NvmeCtrl *n, hwaddr addr, void *buf, int size) { hwaddr hi = addr + size - 1; @@ -285,6 +303,11 @@ static int nvme_addr_read(NvmeCtrl *n, hwaddr addr, void *buf, int size) return 0; } +if (nvme_addr_is_pmr(n, addr) && nvme_addr_is_pmr(n, hi)) { +memcpy(buf, nvme_addr_to_pmr(n, addr), size); +return 0; +} + return pci_dma_read(&n->parent_obj, addr, buf, size); } @@ -406,9 +429,27 @@ static uint16_t nvme_map_addr_cmb(NvmeCtrl *n, QEMUIOVector *iov, hwaddr addr, return NVME_SUCCESS; } +static uint16_t nvme_map_addr_pmr(NvmeCtrl *n, QEMUIOVector *iov, hwaddr addr, +size_t len) +{ +if (!len) { +return NVME_SUCCESS; +} + +if (!nvme_addr_is_pmr(n, addr) || 
!nvme_addr_is_pmr(n, addr + len - 1)) { +return NVME_DATA_TRAS_ERROR; +} + +qemu_iovec_add(iov, nvme_addr_to_pmr(n, addr), len); + +return NVME_SUCCESS; +} + static uint16_t nvme_map_addr(NvmeCtrl *n, QEMUSGList *qsg, QEMUIOVector *iov, hwaddr addr, size_t len) { +bool cmb = false, pmr = false; + if (!len) { return NVME_SUCCESS; } @@ -416,6 +457,12 @@ static uint16_t nvme_map_addr(NvmeCtrl *n, QEMUSGList *qsg, QEMUIOVector *iov, trace_pci_nvme_map_addr(addr, len); if (nvme_addr_is_cmb(n, addr)) { +cmb = true; +} else if (nvme_addr_is_pmr(n, addr)) { +pmr = true; +} + +if (cmb || pmr) { if (qsg && qsg->sg) { return NVME_INVALID_USE_OF_CMB | NVME_DNR; } @@ -426,7 +473,11 @@ static uint16_t nvme_map_addr(NvmeCtrl *n, QEMUSGList *qsg, QEMUIOVector *iov, qemu_iovec_init(iov, 1); } -return nvme_map_addr_cmb(n, iov, addr, len); +if (cmb) { +return nvme_map_addr_cmb(n, iov, addr, len); +} else { +return nvme_map_addr_pmr(n, iov, addr, len); +} } if (iov && iov->iov) { @@ -459,7 +510,7 @@ static uint16_t nvme_map_prp(NvmeCtrl *n, uint64_t prp1, uint64_t prp2, trace_pci_nvme_map_prp(trans_len, len, prp1, prp2, num_prps); -if (nvme_addr_is_cmb(n, prp1)) { +if (nvme_addr_is_cmb(n, prp1) || (nvme_addr_is_pmr(n, prp1))) { qemu_iovec_init(iov, num_prps); } else { pci_dma_sglist_init(qsg, &n->parent_obj, num_prps); @@ -3561,8 +3612,8 @@ static void nvme_ctrl_shutdown(NvmeCtrl *n) NvmeNamespace *ns; int i; -if (n->pmrdev) { -memory_region_msync(&n->pmrdev->mr, 0, n->pmrdev->size); +if (n->pmr.dev) { +memory_region_msync(&n->pmr.dev->mr, 0, n->pmr.dev->size); } for (i = 1; i <= n->num_namespaces; i++) { @@ -3851,11 +3902,12 @@ static void nvme_write_bar(NvmeCtrl *n, hwaddr offset, uint64_t data, case 0xE04: /* PMRCTL */ n->bar.pmrctl = data; if (NVME_PMRCTL_EN(data)) { -memory_region_set_enabled(&n->pmrdev->mr, true); +memory_region_set_enabled(&n->pmr.dev->mr, true); n->bar.pmrsts = 0; } else { -memory_region_set_enabled(&n->pmrdev->mr, false); 
+memory_region_set_enabled(&n->pmr.dev->mr, false); NVME_PMRSTS_SET_NRDY(n->bar.pmrsts, 1); +n->pmr.cmse = false; } return; case 0xE08: /* PMRSTS */ @@ -3870,8 +3922,33 @@ static void nvme_write_bar(NvmeCtrl *n, hwaddr offset, uint64_t data, NVME_GUEST_ERR(pci_nvme_ub_mmiowr_pmrswtp
[PULL 41/56] hw/block/nvme: move msix table and pba to BAR 0
From: Klaus Jensen In the interest of supporting both CMB and PMR to be enabled on the same device, move the MSI-X table and pending bit array out of BAR 4 and into BAR 0. This is a simplified version of the patch contributed by Andrzej Jakowski (see [1]). Leaving the CMB at offset 0 removes the need for changes to CMB address mapping code. [1]: https://lore.kernel.org/qemu-devel/20200729220107.37758-3-andrzej.jakow...@linux.intel.com/ Reviewed-by: Minwoo Im Tested-by: Minwoo Im Reviewed-by: Keith Busch Signed-off-by: Klaus Jensen --- hw/block/nvme.h | 1 + hw/block/nvme.c | 23 +-- 2 files changed, 22 insertions(+), 2 deletions(-) diff --git a/hw/block/nvme.h b/hw/block/nvme.h index b0d5b6409d8e..1cdb360bc549 100644 --- a/hw/block/nvme.h +++ b/hw/block/nvme.h @@ -125,6 +125,7 @@ typedef struct NvmeFeatureVal { typedef struct NvmeCtrl { PCIDeviceparent_obj; +MemoryRegion bar0; MemoryRegion iomem; MemoryRegion ctrl_mem; NvmeBar bar; diff --git a/hw/block/nvme.c b/hw/block/nvme.c index 5f12ac1200ec..85d3c43c4f74 100644 --- a/hw/block/nvme.c +++ b/hw/block/nvme.c @@ -4268,6 +4268,8 @@ static void nvme_init_pmr(NvmeCtrl *n, PCIDevice *pci_dev) static int nvme_init_pci(NvmeCtrl *n, PCIDevice *pci_dev, Error **errp) { uint8_t *pci_conf = pci_dev->config; +uint64_t bar_size, msix_table_size, msix_pba_size; +unsigned msix_table_offset, msix_pba_offset; int ret; Error *err = NULL; @@ -4286,11 +4288,28 @@ static int nvme_init_pci(NvmeCtrl *n, PCIDevice *pci_dev, Error **errp) pci_config_set_class(pci_conf, PCI_CLASS_STORAGE_EXPRESS); pcie_endpoint_cap_init(pci_dev, 0x80); +bar_size = QEMU_ALIGN_UP(n->reg_size, 4 * KiB); +msix_table_offset = bar_size; +msix_table_size = PCI_MSIX_ENTRY_SIZE * n->params.msix_qsize; + +bar_size += msix_table_size; +bar_size = QEMU_ALIGN_UP(bar_size, 4 * KiB); +msix_pba_offset = bar_size; +msix_pba_size = QEMU_ALIGN_UP(n->params.msix_qsize, 64) / 8; + +bar_size += msix_pba_size; +bar_size = pow2ceil(bar_size); + +memory_region_init(&n->bar0, 
OBJECT(n), "nvme-bar0", bar_size); memory_region_init_io(&n->iomem, OBJECT(n), &nvme_mmio_ops, n, "nvme", n->reg_size); +memory_region_add_subregion(&n->bar0, 0, &n->iomem); + pci_register_bar(pci_dev, 0, PCI_BASE_ADDRESS_SPACE_MEMORY | - PCI_BASE_ADDRESS_MEM_TYPE_64, &n->iomem); -ret = msix_init_exclusive_bar(pci_dev, n->params.msix_qsize, 4, &err); + PCI_BASE_ADDRESS_MEM_TYPE_64, &n->bar0); +ret = msix_init(pci_dev, n->params.msix_qsize, +&n->bar0, 0, msix_table_offset, +&n->bar0, 0, msix_pba_offset, 0, &err); if (ret < 0) { if (ret == -ENOTSUP) { warn_report_err(err); -- 2.30.0
[PULL 36/56] hw/block/nvme: add smart_critical_warning property
From: zhenwei pi There is a very low probability that hitting physical NVMe disk hardware critical warning case, it's hard to write & test a monitor agent service. For debugging purposes, add a new 'smart_critical_warning' property to emulate this situation. The orignal version of this change is implemented by adding a fixed property which could be initialized by QEMU command line. Suggested by Philippe & Klaus, rework like current version. Test with this patch: 1, change smart_critical_warning property for a running VM: #virsh qemu-monitor-command nvme-upstream '{ "execute": "qom-set", "arguments": { "path": "/machine/peripheral-anon/device[0]", "property": "smart_critical_warning", "value":16 } }' 2, run smartctl in guest #smartctl -H -l error /dev/nvme0n1 === START OF SMART DATA SECTION === SMART overall-health self-assessment test result: FAILED! - volatile memory backup device has failed Reviewed-by: Philippe Mathieu-Daudé Signed-off-by: zhenwei pi Signed-off-by: Klaus Jensen --- hw/block/nvme.h | 1 + include/block/nvme.h | 1 + hw/block/nvme.c | 45 +--- 3 files changed, 44 insertions(+), 3 deletions(-) diff --git a/hw/block/nvme.h b/hw/block/nvme.h index 347c149e7905..b0d5b6409d8e 100644 --- a/hw/block/nvme.h +++ b/hw/block/nvme.h @@ -148,6 +148,7 @@ typedef struct NvmeCtrl { uint64_ttimestamp_set_qemu_clock_ms;/* QEMU clock time */ uint64_tstarttime_ms; uint16_ttemperature; +uint8_t smart_critical_warning; HostMemoryBackend *pmrdev; diff --git a/include/block/nvme.h b/include/block/nvme.h index 41614c5e12af..88af3b42348c 100644 --- a/include/block/nvme.h +++ b/include/block/nvme.h @@ -60,6 +60,7 @@ enum NvmeCapMask { #define NVME_CAP_CSS(cap) (((cap) >> CAP_CSS_SHIFT)& CAP_CSS_MASK) #define NVME_CAP_MPSMIN(cap)(((cap) >> CAP_MPSMIN_SHIFT) & CAP_MPSMIN_MASK) #define NVME_CAP_MPSMAX(cap)(((cap) >> CAP_MPSMAX_SHIFT) & CAP_MPSMAX_MASK) +#define NVME_CAP_PMR(cap) (((cap) >> CAP_PMR_SHIFT)& CAP_PMR_MASK) #define NVME_CAP_SET_MQES(cap, val) (cap |= (uint64_t)(val & 
CAP_MQES_MASK) \ << CAP_MQES_SHIFT) diff --git a/hw/block/nvme.c b/hw/block/nvme.c index 4d73398798f1..f0cb7acd7454 100644 --- a/hw/block/nvme.c +++ b/hw/block/nvme.c @@ -2490,6 +2490,7 @@ static uint16_t nvme_smart_info(NvmeCtrl *n, uint8_t rae, uint32_t buf_len, } trans_len = MIN(sizeof(smart) - off, buf_len); +smart.critical_warning = n->smart_critical_warning; smart.data_units_read[0] = cpu_to_le64(DIV_ROUND_UP(stats.units_read, 1000)); @@ -4432,6 +4433,40 @@ static Property nvme_props[] = { DEFINE_PROP_END_OF_LIST(), }; +static void nvme_get_smart_warning(Object *obj, Visitor *v, const char *name, + void *opaque, Error **errp) +{ +NvmeCtrl *n = NVME(obj); +uint8_t value = n->smart_critical_warning; + +visit_type_uint8(v, name, &value, errp); +} + +static void nvme_set_smart_warning(Object *obj, Visitor *v, const char *name, + void *opaque, Error **errp) +{ +NvmeCtrl *n = NVME(obj); +uint8_t value, cap = 0; + +if (!visit_type_uint8(v, name, &value, errp)) { +return; +} + +cap = NVME_SMART_SPARE | NVME_SMART_TEMPERATURE | NVME_SMART_RELIABILITY + | NVME_SMART_MEDIA_READ_ONLY | NVME_SMART_FAILED_VOLATILE_MEDIA; +if (NVME_CAP_PMR(n->bar.cap)) { +cap |= NVME_SMART_PMR_UNRELIABLE; +} + +if ((value & cap) != value) { +error_setg(errp, "unsupported smart critical warning bits: 0x%x", + value & ~cap); +return; +} + +n->smart_critical_warning = value; +} + static const VMStateDescription nvme_vmstate = { .name = "nvme", .unmigratable = 1, @@ -4455,13 +4490,17 @@ static void nvme_class_init(ObjectClass *oc, void *data) static void nvme_instance_init(Object *obj) { -NvmeCtrl *s = NVME(obj); +NvmeCtrl *n = NVME(obj); -if (s->namespace.blkconf.blk) { -device_add_bootindex_property(obj, &s->namespace.blkconf.bootindex, +if (n->namespace.blkconf.blk) { +device_add_bootindex_property(obj, &n->namespace.blkconf.bootindex, "bootindex", "/namespace@1,0", DEVICE(obj)); } + +object_property_add(obj, "smart_critical_warning", "uint8", +nvme_get_smart_warning, 
+nvme_set_smart_warning, NULL, NULL); } static const TypeInfo nvme_info = { -- 2.30.0
[PULL 42/56] hw/block/nvme: allow cmb and pmr to coexist
From: Klaus Jensen With BAR 4 now free to use, allow PMR and CMB to be enabled simultaneously. Reviewed-by: Minwoo Im Reviewed-by: Keith Busch Signed-off-by: Klaus Jensen --- hw/block/nvme.c | 17 - 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/hw/block/nvme.c b/hw/block/nvme.c index 85d3c43c4f74..4ce75642f1a4 100644 --- a/hw/block/nvme.c +++ b/hw/block/nvme.c @@ -29,14 +29,13 @@ * Note cmb_size_mb denotes size of CMB in MB. CMB is assumed to be at * offset 0 in BAR2 and supports only WDS, RDS and SQS for now. * - * cmb_size_mb= and pmrdev= options are mutually exclusive due to limitation - * in available BAR's. cmb_size_mb= will take precedence over pmrdev= when - * both provided. * Enabling pmr emulation can be achieved by pointing to memory-backend-file. * For example: * -object memory-backend-file,id=,share=on,mem-path=, \ * size= -device nvme,...,pmrdev= * + * The PMR will use BAR 4/5 exclusively. + * * * nvme device parameters * ~~ @@ -109,7 +108,7 @@ #define NVME_DB_SIZE 4 #define NVME_SPEC_VER 0x00010300 #define NVME_CMB_BIR 2 -#define NVME_PMR_BIR 2 +#define NVME_PMR_BIR 4 #define NVME_TEMPERATURE 0x143 #define NVME_TEMPERATURE_WARNING 0x157 #define NVME_TEMPERATURE_CRITICAL 0x175 @@ -4121,7 +4120,7 @@ static void nvme_check_constraints(NvmeCtrl *n, Error **errp) return; } -if (!n->params.cmb_size_mb && n->pmrdev) { +if (n->pmrdev) { if (host_memory_backend_is_mapped(n->pmrdev)) { error_setg(errp, "can't use already busy memdev: %s", object_get_canonical_path_component(OBJECT(n->pmrdev))); @@ -4218,9 +4217,6 @@ static void nvme_init_cmb(NvmeCtrl *n, PCIDevice *pci_dev) static void nvme_init_pmr(NvmeCtrl *n, PCIDevice *pci_dev) { -/* Controller Capabilities register */ -NVME_CAP_SET_PMRS(n->bar.cap, 1); - /* PMR Capabities register */ n->bar.pmrcap = 0; NVME_PMRCAP_SET_RDS(n->bar.pmrcap, 0); @@ -4321,7 +4317,9 @@ static int nvme_init_pci(NvmeCtrl *n, PCIDevice *pci_dev, Error **errp) if (n->params.cmb_size_mb) { nvme_init_cmb(n, pci_dev); -} 
else if (n->pmrdev) { +} + +if (n->pmrdev) { nvme_init_pmr(n, pci_dev); } @@ -4394,6 +4392,7 @@ static void nvme_init_ctrl(NvmeCtrl *n, PCIDevice *pci_dev) NVME_CAP_SET_CSS(n->bar.cap, NVME_CAP_CSS_ADMIN_ONLY); NVME_CAP_SET_MPSMAX(n->bar.cap, 4); NVME_CAP_SET_CMBS(n->bar.cap, n->params.cmb_size_mb ? 1 : 0); +NVME_CAP_SET_PMRS(n->bar.cap, n->pmrdev ? 1 : 0); n->bar.vs = NVME_SPEC_VER; n->bar.intmc = n->bar.intms = 0; -- 2.30.0
[PULL 34/56] hw/block/nvme: fix zone write finalize
From: Klaus Jensen The zone write pointer is unconditionally advanced, even for write faults. Make sure that the zone is always transitioned to Full if the write pointer reaches zone capacity. Cc: Dmitry Fomichev Reviewed-by: Keith Busch Signed-off-by: Klaus Jensen --- hw/block/nvme.c | 10 +- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/hw/block/nvme.c b/hw/block/nvme.c index 30bd70fd5b07..4d73398798f1 100644 --- a/hw/block/nvme.c +++ b/hw/block/nvme.c @@ -1268,10 +1268,13 @@ static void nvme_finalize_zoned_write(NvmeNamespace *ns, NvmeRequest *req, nlb = le16_to_cpu(rw->nlb) + 1; zone = nvme_get_zone_by_slba(ns, slba); +zone->d.wp += nlb; + if (failed) { res->slba = 0; -zone->d.wp += nlb; -} else if (zone->w_ptr == nvme_zone_wr_boundary(zone)) { +} + +if (zone->d.wp == nvme_zone_wr_boundary(zone)) { switch (nvme_get_zone_state(zone)) { case NVME_ZONE_STATE_IMPLICITLY_OPEN: case NVME_ZONE_STATE_EXPLICITLY_OPEN: @@ -1288,9 +1291,6 @@ static void nvme_finalize_zoned_write(NvmeNamespace *ns, NvmeRequest *req, default: assert(false); } -zone->d.wp = zone->w_ptr; -} else { -zone->d.wp += nlb; } } -- 2.30.0
[PULL 32/56] hw/block/nvme: split setup and register for namespace
From: Minwoo Im In NVMe, namespace is being attached to process I/O. We register NVMe namespace to a controller via nvme_register_namespace() during nvme_ns_setup(). This is the main reason for receiving the NvmeCtrl object instance in this function: to map the namespace to a controller. To make the namespace instance more independent, it should be split into two parts: setup and register. This patch splits them into two different parts, and finally nvme_ns_setup() has nothing to do with the NvmeCtrl instance at all. This patch is a preparatory patch to introduce the NVMe subsystem scheme to the existing design, especially for multi-path. In that case, it should be split into two to make the namespace independent from a controller. Signed-off-by: Minwoo Im Signed-off-by: Klaus Jensen --- hw/block/nvme-ns.c | 11 +++ 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/hw/block/nvme-ns.c b/hw/block/nvme-ns.c index 17e876e6bc44..ce79ad4a5319 100644 --- a/hw/block/nvme-ns.c +++ b/hw/block/nvme-ns.c @@ -321,10 +321,6 @@ int nvme_ns_setup(NvmeCtrl *n, NvmeNamespace *ns, Error **errp) nvme_ns_init_zoned(ns, 0); } -if (nvme_register_namespace(n, ns, errp)) { -return -1; -} - return 0; } @@ -362,6 +358,13 @@ static void nvme_ns_realize(DeviceState *dev, Error **errp) "could not setup namespace: "); return; } + +if (nvme_register_namespace(n, ns, errp)) { +error_propagate_prepend(errp, local_err, +"could not register namespace: "); +return; +} + } static Property nvme_ns_props[] = { -- 2.30.0
[PULL 31/56] hw/block/nvme: remove unused argument in nvme_ns_init_blk
From: Minwoo Im Remove the no-longer-used NvmeCtrl argument from nvme_ns_init_blk(). Signed-off-by: Minwoo Im Signed-off-by: Klaus Jensen --- hw/block/nvme-ns.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/hw/block/nvme-ns.c b/hw/block/nvme-ns.c index 7a5a77983798..17e876e6bc44 100644 --- a/hw/block/nvme-ns.c +++ b/hw/block/nvme-ns.c @@ -66,7 +66,7 @@ static int nvme_ns_init(NvmeNamespace *ns, Error **errp) return 0; } -static int nvme_ns_init_blk(NvmeCtrl *n, NvmeNamespace *ns, Error **errp) +static int nvme_ns_init_blk(NvmeNamespace *ns, Error **errp) { bool read_only; @@ -307,7 +307,7 @@ int nvme_ns_setup(NvmeCtrl *n, NvmeNamespace *ns, Error **errp) return -1; } -if (nvme_ns_init_blk(n, ns, errp)) { +if (nvme_ns_init_blk(ns, errp)) { return -1; } -- 2.30.0
[PULL 38/56] hw/block/nvme: add size to mmio read/write trace events
From: Klaus Jensen Add the size of the mmio read/write to the trace event. Reviewed-by: Minwoo Im Reviewed-by: Keith Busch Signed-off-by: Klaus Jensen --- hw/block/nvme.c | 4 ++-- hw/block/trace-events | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/hw/block/nvme.c b/hw/block/nvme.c index 09eb1f06e8b1..2407b6578abc 100644 --- a/hw/block/nvme.c +++ b/hw/block/nvme.c @@ -3878,7 +3878,7 @@ static uint64_t nvme_mmio_read(void *opaque, hwaddr addr, unsigned size) uint8_t *ptr = (uint8_t *)&n->bar; uint64_t val = 0; -trace_pci_nvme_mmio_read(addr); +trace_pci_nvme_mmio_read(addr, size); if (unlikely(addr & (sizeof(uint32_t) - 1))) { NVME_GUEST_ERR(pci_nvme_ub_mmiord_misaligned32, @@ -4042,7 +4042,7 @@ static void nvme_mmio_write(void *opaque, hwaddr addr, uint64_t data, { NvmeCtrl *n = (NvmeCtrl *)opaque; -trace_pci_nvme_mmio_write(addr, data); +trace_pci_nvme_mmio_write(addr, data, size); if (addr < sizeof(n->bar)) { nvme_write_bar(n, addr, data, size); diff --git a/hw/block/trace-events b/hw/block/trace-events index 6d1686e6dc9d..3772502033af 100644 --- a/hw/block/trace-events +++ b/hw/block/trace-events @@ -80,8 +80,8 @@ pci_nvme_enqueue_event_noqueue(int queued) "queued %d" pci_nvme_enqueue_event_masked(uint8_t typ) "type 0x%"PRIx8"" pci_nvme_no_outstanding_aers(void) "ignoring event; no outstanding AERs" pci_nvme_enqueue_req_completion(uint16_t cid, uint16_t cqid, uint16_t status) "cid %"PRIu16" cqid %"PRIu16" status 0x%"PRIx16"" -pci_nvme_mmio_read(uint64_t addr) "addr 0x%"PRIx64"" -pci_nvme_mmio_write(uint64_t addr, uint64_t data) "addr 0x%"PRIx64" data 0x%"PRIx64"" +pci_nvme_mmio_read(uint64_t addr, unsigned size) "addr 0x%"PRIx64" size %d" +pci_nvme_mmio_write(uint64_t addr, uint64_t data, unsigned size) "addr 0x%"PRIx64" data 0x%"PRIx64" size %d" pci_nvme_mmio_doorbell_cq(uint16_t cqid, uint16_t new_head) "cqid %"PRIu16" new_head %"PRIu16"" pci_nvme_mmio_doorbell_sq(uint16_t sqid, uint16_t new_tail) "sqid %"PRIu16" new_tail %"PRIu16"" 
pci_nvme_mmio_intm_set(uint64_t data, uint64_t new_mask) "wrote MMIO, interrupt mask set, data=0x%"PRIx64", new_mask=0x%"PRIx64"" -- 2.30.0
[PULL 48/56] hw/block/nvme: bump to v1.4
From: Klaus Jensen With the new CMB logic in place, bump the implemented specification version to v1.4 by default. This requires adding the setting the CNTRLTYPE field and modifying the VWC field since 0x00 is no longer a valid value for bits 2:1. Reviewed-by: Keith Busch Signed-off-by: Klaus Jensen --- include/block/nvme.h | 3 ++- hw/block/nvme.c | 5 +++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/include/block/nvme.h b/include/block/nvme.h index 2e85b97a6c4e..07cfc929368b 100644 --- a/include/block/nvme.h +++ b/include/block/nvme.h @@ -951,7 +951,8 @@ typedef struct QEMU_PACKED NvmeIdCtrl { uint32_trtd3e; uint32_toaes; uint32_tctratt; -uint8_t rsvd100[12]; +uint8_t rsvd100[11]; +uint8_t cntrltype; uint8_t fguid[16]; uint8_t rsvd128[128]; uint16_toacs; diff --git a/hw/block/nvme.c b/hw/block/nvme.c index 1e13d25b0887..c4c968f5951e 100644 --- a/hw/block/nvme.c +++ b/hw/block/nvme.c @@ -108,7 +108,7 @@ #define NVME_MAX_IOQPAIRS 0x #define NVME_DB_SIZE 4 -#define NVME_SPEC_VER 0x00010300 +#define NVME_SPEC_VER 0x00010400 #define NVME_CMB_BIR 2 #define NVME_PMR_BIR 4 #define NVME_TEMPERATURE 0x143 @@ -4450,6 +4450,7 @@ static void nvme_init_ctrl(NvmeCtrl *n, PCIDevice *pci_dev) id->mdts = n->params.mdts; id->ver = cpu_to_le32(NVME_SPEC_VER); id->oacs = cpu_to_le16(0); +id->cntrltype = 0x1; /* * Because the controller always completes the Abort command immediately, @@ -4478,7 +4479,7 @@ static void nvme_init_ctrl(NvmeCtrl *n, PCIDevice *pci_dev) NVME_ONCS_FEATURES | NVME_ONCS_DSM | NVME_ONCS_COMPARE); -id->vwc = 0x1; +id->vwc = (0x2 << 1) | 0x1; id->sgls = cpu_to_le32(NVME_CTRL_SGLS_SUPPORT_NO_ALIGN | NVME_CTRL_SGLS_BITBUCKET); -- 2.30.0
[PULL 33/56] hw/block/nvme: remove unused argument in nvme_ns_setup
From: Minwoo Im nvme_ns_setup() finally has nothing to do with the NvmeCtrl instance. Signed-off-by: Minwoo Im Signed-off-by: Klaus Jensen --- hw/block/nvme-ns.h | 2 +- hw/block/nvme-ns.c | 4 ++-- hw/block/nvme.c| 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/hw/block/nvme-ns.h b/hw/block/nvme-ns.h index a0baa5f6d44c..293ac990e3f6 100644 --- a/hw/block/nvme-ns.h +++ b/hw/block/nvme-ns.h @@ -174,7 +174,7 @@ static inline void nvme_aor_dec_active(NvmeNamespace *ns) assert(ns->nr_active_zones >= 0); } -int nvme_ns_setup(NvmeCtrl *n, NvmeNamespace *ns, Error **errp); +int nvme_ns_setup(NvmeNamespace *ns, Error **errp); void nvme_ns_drain(NvmeNamespace *ns); void nvme_ns_shutdown(NvmeNamespace *ns); void nvme_ns_cleanup(NvmeNamespace *ns); diff --git a/hw/block/nvme-ns.c b/hw/block/nvme-ns.c index ce79ad4a5319..3f52acb89c95 100644 --- a/hw/block/nvme-ns.c +++ b/hw/block/nvme-ns.c @@ -301,7 +301,7 @@ static int nvme_ns_check_constraints(NvmeNamespace *ns, Error **errp) return 0; } -int nvme_ns_setup(NvmeCtrl *n, NvmeNamespace *ns, Error **errp) +int nvme_ns_setup(NvmeNamespace *ns, Error **errp) { if (nvme_ns_check_constraints(ns, errp)) { return -1; } @@ -353,7 +353,7 @@ static void nvme_ns_realize(DeviceState *dev, Error **errp) NvmeCtrl *n = NVME(s->parent); Error *local_err = NULL; -if (nvme_ns_setup(n, ns, &local_err)) { +if (nvme_ns_setup(ns, &local_err)) { error_propagate_prepend(errp, local_err, "could not setup namespace: "); return; diff --git a/hw/block/nvme.c b/hw/block/nvme.c index 0b002cb2beab..30bd70fd5b07 100644 --- a/hw/block/nvme.c +++ b/hw/block/nvme.c @@ -4377,7 +4377,7 @@ static void nvme_realize(PCIDevice *pci_dev, Error **errp) ns = &n->namespace; ns->params.nsid = 1; -if (nvme_ns_setup(n, ns, errp)) { +if (nvme_ns_setup(ns, errp)) { return; } } -- 2.30.0
[PULL 29/56] hw/block/nvme: remove unused argument in nvme_ns_init_zoned
From: Minwoo Im nvme_ns_init_zoned() has no use for the given NvmeCtrl object. Signed-off-by: Minwoo Im Signed-off-by: Klaus Jensen --- hw/block/nvme-ns.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/hw/block/nvme-ns.c b/hw/block/nvme-ns.c index 9be170abb78d..d35c2925ecb8 100644 --- a/hw/block/nvme-ns.c +++ b/hw/block/nvme-ns.c @@ -205,7 +205,7 @@ static void nvme_ns_zoned_init_state(NvmeNamespace *ns) } } -static void nvme_ns_init_zoned(NvmeCtrl *n, NvmeNamespace *ns, int lba_index) +static void nvme_ns_init_zoned(NvmeNamespace *ns, int lba_index) { NvmeIdNsZoned *id_ns_z; @@ -322,7 +322,7 @@ int nvme_ns_setup(NvmeCtrl *n, NvmeNamespace *ns, Error **errp) if (nvme_ns_zoned_check_calc_geometry(ns, errp) != 0) { return -1; } -nvme_ns_init_zoned(n, ns, 0); +nvme_ns_init_zoned(ns, 0); } if (nvme_register_namespace(n, ns, errp)) { -- 2.30.0
[PULL 45/56] hw/block/nvme: disable PMR at boot up
From: Klaus Jensen The PMR should not be enabled at boot up. Disable the PMR MemoryRegion initially and implement MMIO for PMRCTL, allowing the host to enable the PMR explicitly. Reviewed-by: Keith Busch Signed-off-by: Klaus Jensen --- hw/block/nvme.c | 14 -- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/hw/block/nvme.c b/hw/block/nvme.c index f8dd771925f9..d773796051d6 100644 --- a/hw/block/nvme.c +++ b/hw/block/nvme.c @@ -3848,8 +3848,16 @@ static void nvme_write_bar(NvmeCtrl *n, hwaddr offset, uint64_t data, NVME_GUEST_ERR(pci_nvme_ub_mmiowr_pmrcap_readonly, "invalid write to PMRCAP register, ignored"); return; -case 0xE04: /* TODO PMRCTL */ -break; +case 0xE04: /* PMRCTL */ +n->bar.pmrctl = data; +if (NVME_PMRCTL_EN(data)) { +memory_region_set_enabled(&n->pmrdev->mr, true); +n->bar.pmrsts = 0; +} else { +memory_region_set_enabled(&n->pmrdev->mr, false); +NVME_PMRSTS_SET_NRDY(n->bar.pmrsts, 1); +} +return; case 0xE08: /* PMRSTS */ NVME_GUEST_ERR(pci_nvme_ub_mmiowr_pmrsts_readonly, "invalid write to PMRSTS register, ignored"); @@ -4225,6 +4233,8 @@ static void nvme_init_pmr(NvmeCtrl *n, PCIDevice *pci_dev) PCI_BASE_ADDRESS_SPACE_MEMORY | PCI_BASE_ADDRESS_MEM_TYPE_64 | PCI_BASE_ADDRESS_MEM_PREFETCH, &n->pmrdev->mr); + +memory_region_set_enabled(&n->pmrdev->mr, false); } static int nvme_init_pci(NvmeCtrl *n, PCIDevice *pci_dev, Error **errp) -- 2.30.0
[PULL 30/56] hw/block/nvme: open code for volatile write cache
From: Minwoo Im Volatile Write Cache(VWC) feature is set in nvme_ns_setup() in the initial time. This feature is related to block device backed, but this feature is controlled in controller level via Set/Get Features command. This patch removed dependency between nvme and nvme-ns to manage the VWC flag value. Also, it open coded the Get Features for VWC to check all namespaces attached to the controller, and if false detected, return directly false. Signed-off-by: Minwoo Im [k.jensen: report write cache preset if present on ANY namespace] Signed-off-by: Klaus Jensen --- hw/block/nvme.h| 1 - hw/block/nvme-ns.c | 4 hw/block/nvme.c| 15 --- 3 files changed, 12 insertions(+), 8 deletions(-) diff --git a/hw/block/nvme.h b/hw/block/nvme.h index 65540b650e1d..347c149e7905 100644 --- a/hw/block/nvme.h +++ b/hw/block/nvme.h @@ -121,7 +121,6 @@ typedef struct NvmeFeatureVal { uint16_t temp_thresh_low; }; uint32_tasync_config; -uint32_tvwc; } NvmeFeatureVal; typedef struct NvmeCtrl { diff --git a/hw/block/nvme-ns.c b/hw/block/nvme-ns.c index d35c2925ecb8..7a5a77983798 100644 --- a/hw/block/nvme-ns.c +++ b/hw/block/nvme-ns.c @@ -90,10 +90,6 @@ static int nvme_ns_init_blk(NvmeCtrl *n, NvmeNamespace *ns, Error **errp) return -1; } -if (blk_enable_write_cache(ns->blkconf.blk)) { -n->features.vwc = 0x1; -} - return 0; } diff --git a/hw/block/nvme.c b/hw/block/nvme.c index 35f39ecd9559..0b002cb2beab 100644 --- a/hw/block/nvme.c +++ b/hw/block/nvme.c @@ -3097,6 +3097,7 @@ static uint16_t nvme_get_feature(NvmeCtrl *n, NvmeRequest *req) NvmeGetFeatureSelect sel = NVME_GETFEAT_SELECT(dw10); uint16_t iv; NvmeNamespace *ns; +int i; static const uint32_t nvme_feature_default[NVME_FID_MAX] = { [NVME_ARBITRATION] = NVME_ARB_AB_NOLIMIT, @@ -3172,7 +3173,17 @@ static uint16_t nvme_get_feature(NvmeCtrl *n, NvmeRequest *req) result = ns->features.err_rec; goto out; case NVME_VOLATILE_WRITE_CACHE: -result = n->features.vwc; +for (i = 1; i <= n->num_namespaces; i++) { +ns = nvme_ns(n, i); +if 
(!ns) { +continue; +} + +result = blk_enable_write_cache(ns->blkconf.blk); +if (result) { +break; +} +} trace_pci_nvme_getfeat_vwcache(result ? "enabled" : "disabled"); goto out; case NVME_ASYNCHRONOUS_EVENT_CONF: @@ -3335,8 +3346,6 @@ static uint16_t nvme_set_feature(NvmeCtrl *n, NvmeRequest *req) ns->features.err_rec = dw11; break; case NVME_VOLATILE_WRITE_CACHE: -n->features.vwc = dw11 & 0x1; - for (i = 1; i <= n->num_namespaces; i++) { ns = nvme_ns(n, i); if (!ns) { -- 2.30.0
[PULL 28/56] hw/block/nvme: Correct error status for unaligned ZA
From: Dmitry Fomichev TP 4053 says (in section 2.3.1.1) - ... if a Zone Append command specifies a ZSLBA that is not the lowest logical block address in that zone, then the controller shall abort that command with a status code of Invalid Field In Command. In the code, Zone Invalid Write is returned instead, fix this. Signed-off-by: Dmitry Fomichev Signed-off-by: Klaus Jensen --- hw/block/nvme.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hw/block/nvme.c b/hw/block/nvme.c index c73afdf8054f..35f39ecd9559 100644 --- a/hw/block/nvme.c +++ b/hw/block/nvme.c @@ -1150,7 +1150,7 @@ static uint16_t nvme_check_zone_write(NvmeCtrl *n, NvmeNamespace *ns, if (append) { if (unlikely(slba != zone->d.zslba)) { trace_pci_nvme_err_append_not_at_start(slba, zone->d.zslba); -status = NVME_ZONE_INVALID_WRITE; +status = NVME_INVALID_FIELD; } if (nvme_l2b(ns, nlb) > (n->page_size << n->zasl)) { trace_pci_nvme_err_append_too_large(slba, nlb, n->zasl); -- 2.30.0
[PULL 27/56] hw/block/nvme: remove unnecessary check for append
From: Klaus Jensen nvme_io_cmd already checks if the namespace supports the Zone Append command, so the removed check is dead code. Signed-off-by: Klaus Jensen Tested-by: Dmitry Fomichev Reviewed-by: Dmitry Fomichev --- hw/block/nvme.c | 4 1 file changed, 4 deletions(-) diff --git a/hw/block/nvme.c b/hw/block/nvme.c index 7222eff755ee..c73afdf8054f 100644 --- a/hw/block/nvme.c +++ b/hw/block/nvme.c @@ -1707,10 +1707,6 @@ static uint16_t nvme_do_write(NvmeCtrl *n, NvmeRequest *req, bool append, } res->slba = nvme_advance_zone_wp(ns, zone, nlb); -} else if (append) { -trace_pci_nvme_err_invalid_opc(rw->opcode); -status = NVME_INVALID_OPCODE; -goto invalid; } data_offset = nvme_l2b(ns, slba); -- 2.30.0
[PULL 23/56] hw/block/nvme: merge implicitly/explicitly opened processing masks
From: Klaus Jensen Implicitly and explicitly opended zones are always bulk processed together, so merge the two processing masks. Signed-off-by: Klaus Jensen Tested-by: Dmitry Fomichev Reviewed-by: Dmitry Fomichev --- hw/block/nvme.c | 27 +++ 1 file changed, 11 insertions(+), 16 deletions(-) diff --git a/hw/block/nvme.c b/hw/block/nvme.c index 551878338e5d..a7245a7e05a1 100644 --- a/hw/block/nvme.c +++ b/hw/block/nvme.c @@ -1740,11 +1740,10 @@ typedef uint16_t (*op_handler_t)(NvmeNamespace *, NvmeZone *, enum NvmeZoneProcessingMask { NVME_PROC_CURRENT_ZONE= 0, -NVME_PROC_IMP_OPEN_ZONES = 1 << 0, -NVME_PROC_EXP_OPEN_ZONES = 1 << 1, -NVME_PROC_CLOSED_ZONES= 1 << 2, -NVME_PROC_READ_ONLY_ZONES = 1 << 3, -NVME_PROC_FULL_ZONES = 1 << 4, +NVME_PROC_OPENED_ZONES= 1 << 0, +NVME_PROC_CLOSED_ZONES= 1 << 1, +NVME_PROC_READ_ONLY_ZONES = 1 << 2, +NVME_PROC_FULL_ZONES = 1 << 3, }; static uint16_t nvme_open_zone(NvmeNamespace *ns, NvmeZone *zone, @@ -1885,10 +1884,8 @@ static uint16_t nvme_bulk_proc_zone(NvmeNamespace *ns, NvmeZone *zone, switch (zs) { case NVME_ZONE_STATE_IMPLICITLY_OPEN: -proc_zone = proc_mask & NVME_PROC_IMP_OPEN_ZONES; -break; case NVME_ZONE_STATE_EXPLICITLY_OPEN: -proc_zone = proc_mask & NVME_PROC_EXP_OPEN_ZONES; +proc_zone = proc_mask & NVME_PROC_OPENED_ZONES; break; case NVME_ZONE_STATE_CLOSED: proc_zone = proc_mask & NVME_PROC_CLOSED_ZONES; @@ -1929,15 +1926,14 @@ static uint16_t nvme_do_zone_op(NvmeNamespace *ns, NvmeZone *zone, } } } -if (proc_mask & NVME_PROC_IMP_OPEN_ZONES) { +if (proc_mask & NVME_PROC_OPENED_ZONES) { QTAILQ_FOREACH_SAFE(zone, &ns->imp_open_zones, entry, next) { status = nvme_bulk_proc_zone(ns, zone, proc_mask, op_hndlr); if (status != NVME_SUCCESS) { goto out; } } -} -if (proc_mask & NVME_PROC_EXP_OPEN_ZONES) { + QTAILQ_FOREACH_SAFE(zone, &ns->exp_open_zones, entry, next) { status = nvme_bulk_proc_zone(ns, zone, proc_mask, op_hndlr); if (status != NVME_SUCCESS) { @@ -2012,7 +2008,7 @@ static uint16_t nvme_zone_mgmt_send(NvmeCtrl *n, 
NvmeRequest *req) case NVME_ZONE_ACTION_CLOSE: if (all) { -proc_mask = NVME_PROC_IMP_OPEN_ZONES | NVME_PROC_EXP_OPEN_ZONES; +proc_mask = NVME_PROC_OPENED_ZONES; } trace_pci_nvme_close_zone(slba, zone_idx, all); status = nvme_do_zone_op(ns, zone, proc_mask, nvme_close_zone); @@ -2020,8 +2016,7 @@ static uint16_t nvme_zone_mgmt_send(NvmeCtrl *n, NvmeRequest *req) case NVME_ZONE_ACTION_FINISH: if (all) { -proc_mask = NVME_PROC_IMP_OPEN_ZONES | NVME_PROC_EXP_OPEN_ZONES | -NVME_PROC_CLOSED_ZONES; +proc_mask = NVME_PROC_OPENED_ZONES | NVME_PROC_CLOSED_ZONES; } trace_pci_nvme_finish_zone(slba, zone_idx, all); status = nvme_do_zone_op(ns, zone, proc_mask, nvme_finish_zone); @@ -2029,8 +2024,8 @@ static uint16_t nvme_zone_mgmt_send(NvmeCtrl *n, NvmeRequest *req) case NVME_ZONE_ACTION_RESET: if (all) { -proc_mask = NVME_PROC_IMP_OPEN_ZONES | NVME_PROC_EXP_OPEN_ZONES | -NVME_PROC_CLOSED_ZONES | NVME_PROC_FULL_ZONES; +proc_mask = NVME_PROC_OPENED_ZONES | NVME_PROC_CLOSED_ZONES | +NVME_PROC_FULL_ZONES; } trace_pci_nvme_reset_zone(slba, zone_idx, all); status = nvme_do_zone_op(ns, zone, proc_mask, nvme_reset_zone); -- 2.30.0
[PULL 25/56] hw/block/nvme: zero out zones on reset
From: Klaus Jensen The zoned command set specification states that "All logical blocks in a zone *shall* be marked as deallocated when [the zone is reset]". Since the device guarantees 0x00 to be read from deallocated blocks we have to issue a pwrite_zeroes since we cannot be sure that a discard will do anything. But typically, this will be achieved with an efficient unmap/discard operation. Signed-off-by: Klaus Jensen Tested-by: Dmitry Fomichev Reviewed-by: Dmitry Fomichev --- hw/block/nvme.c | 150 +++--- hw/block/trace-events | 1 + 2 files changed, 113 insertions(+), 38 deletions(-) diff --git a/hw/block/nvme.c b/hw/block/nvme.c index a5cf798bbbaa..7222eff755ee 100644 --- a/hw/block/nvme.c +++ b/hw/block/nvme.c @@ -1371,6 +1371,53 @@ static void nvme_aio_discard_cb(void *opaque, int ret) nvme_enqueue_req_completion(nvme_cq(req), req); } +struct nvme_zone_reset_ctx { +NvmeRequest *req; +NvmeZone*zone; +}; + +static void nvme_aio_zone_reset_cb(void *opaque, int ret) +{ +struct nvme_zone_reset_ctx *ctx = opaque; +NvmeRequest *req = ctx->req; +NvmeNamespace *ns = req->ns; +NvmeZone *zone = ctx->zone; +uintptr_t *resets = (uintptr_t *)&req->opaque; + +g_free(ctx); + +trace_pci_nvme_aio_zone_reset_cb(nvme_cid(req), zone->d.zslba); + +if (!ret) { +switch (nvme_get_zone_state(zone)) { +case NVME_ZONE_STATE_EXPLICITLY_OPEN: +case NVME_ZONE_STATE_IMPLICITLY_OPEN: +nvme_aor_dec_open(ns); +/* fall through */ +case NVME_ZONE_STATE_CLOSED: +nvme_aor_dec_active(ns); +/* fall through */ +case NVME_ZONE_STATE_FULL: +zone->w_ptr = zone->d.zslba; +zone->d.wp = zone->w_ptr; +nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_EMPTY); +/* fall through */ +default: +break; +} +} else { +nvme_aio_err(req, ret); +} + +(*resets)--; + +if (*resets) { +return; +} + +nvme_enqueue_req_completion(nvme_cq(req), req); +} + struct nvme_compare_ctx { QEMUIOVector iov; uint8_t *bounce; @@ -1735,7 +1782,8 @@ static uint16_t nvme_get_mgmt_zone_slba_idx(NvmeNamespace *ns, NvmeCmd *c, return 
NVME_SUCCESS; } -typedef uint16_t (*op_handler_t)(NvmeNamespace *, NvmeZone *, NvmeZoneState); +typedef uint16_t (*op_handler_t)(NvmeNamespace *, NvmeZone *, NvmeZoneState, + NvmeRequest *); enum NvmeZoneProcessingMask { NVME_PROC_CURRENT_ZONE= 0, @@ -1746,7 +1794,7 @@ enum NvmeZoneProcessingMask { }; static uint16_t nvme_open_zone(NvmeNamespace *ns, NvmeZone *zone, - NvmeZoneState state) + NvmeZoneState state, NvmeRequest *req) { uint16_t status; @@ -1779,7 +1827,7 @@ static uint16_t nvme_open_zone(NvmeNamespace *ns, NvmeZone *zone, } static uint16_t nvme_close_zone(NvmeNamespace *ns, NvmeZone *zone, -NvmeZoneState state) +NvmeZoneState state, NvmeRequest *req) { switch (state) { case NVME_ZONE_STATE_EXPLICITLY_OPEN: @@ -1795,7 +1843,7 @@ static uint16_t nvme_close_zone(NvmeNamespace *ns, NvmeZone *zone, } static uint16_t nvme_finish_zone(NvmeNamespace *ns, NvmeZone *zone, - NvmeZoneState state) + NvmeZoneState state, NvmeRequest *req) { switch (state) { case NVME_ZONE_STATE_EXPLICITLY_OPEN: @@ -1818,30 +1866,42 @@ static uint16_t nvme_finish_zone(NvmeNamespace *ns, NvmeZone *zone, } static uint16_t nvme_reset_zone(NvmeNamespace *ns, NvmeZone *zone, -NvmeZoneState state) +NvmeZoneState state, NvmeRequest *req) { +uintptr_t *resets = (uintptr_t *)&req->opaque; +struct nvme_zone_reset_ctx *ctx; + switch (state) { -case NVME_ZONE_STATE_EXPLICITLY_OPEN: -case NVME_ZONE_STATE_IMPLICITLY_OPEN: -nvme_aor_dec_open(ns); -/* fall through */ -case NVME_ZONE_STATE_CLOSED: -nvme_aor_dec_active(ns); -/* fall through */ -case NVME_ZONE_STATE_FULL: -zone->w_ptr = zone->d.zslba; -zone->d.wp = zone->w_ptr; -nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_EMPTY); -/* fall through */ case NVME_ZONE_STATE_EMPTY: return NVME_SUCCESS; +case NVME_ZONE_STATE_EXPLICITLY_OPEN: +case NVME_ZONE_STATE_IMPLICITLY_OPEN: +case NVME_ZONE_STATE_CLOSED: +case NVME_ZONE_STATE_FULL: +break; default: return NVME_ZONE_INVAL_TRANSITION; } + +/* + * The zone reset aio callback needs to know the zone that 
is being reset + * in order to transition the zone on completion. + */ +ctx = g_new(struct nvme_zone_reset_ctx, 1); +ctx->req = req; +ctx->zone = zone; + +
[PULL 39/56] hw/block/nvme: fix 64 bit register hi/lo split writes
From: Klaus Jensen 64 bit registers like ASQ and ACQ should be writable by both a hi/lo 32 bit write combination as well as a plain 64 bit write. The spec does not define ordering on the hi/lo split, but the code currently assumes that the low order bits are written first. Additionally, the code does not consider that another address might already have been written into the register, causing the OR'ing to result in a bad address. Fix this by explicitly overwriting only the low or high order bits for 32 bit writes. Signed-off-by: Klaus Jensen Reviewed-by: Keith Busch --- hw/block/nvme.c | 10 ++ 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/hw/block/nvme.c b/hw/block/nvme.c index 2407b6578abc..2785127037db 100644 --- a/hw/block/nvme.c +++ b/hw/block/nvme.c @@ -3819,19 +3819,21 @@ static void nvme_write_bar(NvmeCtrl *n, hwaddr offset, uint64_t data, trace_pci_nvme_mmio_aqattr(data & 0x); break; case 0x28: /* ASQ */ -n->bar.asq = data; +n->bar.asq = size == 8 ? data : +(n->bar.asq & ~0xULL) | (data & 0x); trace_pci_nvme_mmio_asqaddr(data); break; case 0x2c: /* ASQ hi */ -n->bar.asq |= data << 32; +n->bar.asq = (n->bar.asq & 0x) | (data << 32); trace_pci_nvme_mmio_asqaddr_hi(data, n->bar.asq); break; case 0x30: /* ACQ */ trace_pci_nvme_mmio_acqaddr(data); -n->bar.acq = data; +n->bar.acq = size == 8 ? data : +(n->bar.acq & ~0xULL) | (data & 0x); break; case 0x34: /* ACQ hi */ -n->bar.acq |= data << 32; +n->bar.acq = (n->bar.acq & 0x) | (data << 32); trace_pci_nvme_mmio_acqaddr_hi(data, n->bar.acq); break; case 0x38: /* CMBLOC */ -- 2.30.0
[PULL 20/56] hw/block/nvme: fix for non-msix machines
From: Klaus Jensen Commit 1c0c2163aa08 ("hw/block/nvme: verify msix_init_exclusive_bar() return value") had the unintended effect of breaking support on several platforms not supporting MSI-X. Still check for errors, but only report that MSI-X is unsupported instead of bailing out. Fixes: 1c0c2163aa08 ("hw/block/nvme: verify msix_init_exclusive_bar() return value") Fixes: fbf2e5375e33 ("hw/block/nvme: Verify msix_vector_use() returned value") Reported-by: Guenter Roeck Reviewed-by: Philippe Mathieu-Daudé Signed-off-by: Klaus Jensen --- hw/block/nvme.c | 31 ++- 1 file changed, 22 insertions(+), 9 deletions(-) diff --git a/hw/block/nvme.c b/hw/block/nvme.c index f4f1487afeb1..b0b7abf3312e 100644 --- a/hw/block/nvme.c +++ b/hw/block/nvme.c @@ -2590,7 +2590,9 @@ static void nvme_free_cq(NvmeCQueue *cq, NvmeCtrl *n) { n->cq[cq->cqid] = NULL; timer_free(cq->timer); -msix_vector_unuse(&n->parent_obj, cq->vector); +if (msix_enabled(&n->parent_obj)) { +msix_vector_unuse(&n->parent_obj, cq->vector); +} if (cq->cqid) { g_free(cq); } @@ -2624,8 +2626,10 @@ static void nvme_init_cq(NvmeCQueue *cq, NvmeCtrl *n, uint64_t dma_addr, { int ret; -ret = msix_vector_use(&n->parent_obj, vector); -assert(ret == 0); +if (msix_enabled(&n->parent_obj)) { +ret = msix_vector_use(&n->parent_obj, vector); +assert(ret == 0); +} cq->ctrl = n; cq->cqid = cqid; cq->size = size; @@ -4161,9 +4165,12 @@ static void nvme_init_pmr(NvmeCtrl *n, PCIDevice *pci_dev) PCI_BASE_ADDRESS_MEM_PREFETCH, &n->pmrdev->mr); } -static void nvme_init_pci(NvmeCtrl *n, PCIDevice *pci_dev, Error **errp) +static int nvme_init_pci(NvmeCtrl *n, PCIDevice *pci_dev, Error **errp) { uint8_t *pci_conf = pci_dev->config; +int ret; + +Error *err = NULL; pci_conf[PCI_INTERRUPT_PIN] = 1; pci_config_set_prog_interface(pci_conf, 0x2); @@ -4183,8 +4190,14 @@ static void nvme_init_pci(NvmeCtrl *n, PCIDevice *pci_dev, Error **errp) n->reg_size); pci_register_bar(pci_dev, 0, PCI_BASE_ADDRESS_SPACE_MEMORY | PCI_BASE_ADDRESS_MEM_TYPE_64, 
&n->iomem); -if (msix_init_exclusive_bar(pci_dev, n->params.msix_qsize, 4, errp)) { -return; +ret = msix_init_exclusive_bar(pci_dev, n->params.msix_qsize, 4, &err); +if (ret < 0) { +if (ret == -ENOTSUP) { +warn_report_err(err); +} else { +error_propagate(errp, err); +return ret; +} } if (n->params.cmb_size_mb) { @@ -4192,6 +4205,8 @@ static void nvme_init_pci(NvmeCtrl *n, PCIDevice *pci_dev, Error **errp) } else if (n->pmrdev) { nvme_init_pmr(n, pci_dev); } + +return 0; } static void nvme_init_ctrl(NvmeCtrl *n, PCIDevice *pci_dev) @@ -4280,9 +4295,7 @@ static void nvme_realize(PCIDevice *pci_dev, Error **errp) &pci_dev->qdev, n->parent_obj.qdev.id); nvme_init_state(n); -nvme_init_pci(n, pci_dev, &local_err); -if (local_err) { -error_propagate(errp, local_err); +if (nvme_init_pci(n, pci_dev, errp)) { return; } -- 2.30.0
[PULL 17/56] hw/block/nvme: Introduce max active and open zone limits
From: Dmitry Fomichev Add two module properties, "zoned.max_active" and "zoned.max_open" to control the maximum number of zones that can be active or open. Once these variables are set to non-default values, these limits are checked during I/O and Too Many Active or Too Many Open command status is returned if they are exceeded. Signed-off-by: Hans Holmberg Signed-off-by: Dmitry Fomichev Reviewed-by: Niklas Cassel Reviewed-by: Keith Busch Signed-off-by: Klaus Jensen --- hw/block/nvme-ns.h| 41 +++ hw/block/nvme-ns.c| 31 ++- hw/block/nvme.c | 92 +++ hw/block/trace-events | 2 + 4 files changed, 164 insertions(+), 2 deletions(-) diff --git a/hw/block/nvme-ns.h b/hw/block/nvme-ns.h index 388381dda0df..7e1fd26909ba 100644 --- a/hw/block/nvme-ns.h +++ b/hw/block/nvme-ns.h @@ -33,6 +33,8 @@ typedef struct NvmeNamespaceParams { bool cross_zone_read; uint64_t zone_size_bs; uint64_t zone_cap_bs; +uint32_t max_active_zones; +uint32_t max_open_zones; } NvmeNamespaceParams; typedef struct NvmeNamespace { @@ -54,6 +56,8 @@ typedef struct NvmeNamespace { uint64_tzone_size; uint64_tzone_capacity; uint32_tzone_size_log2; +int32_t nr_open_zones; +int32_t nr_active_zones; NvmeNamespaceParams params; @@ -125,6 +129,43 @@ static inline bool nvme_wp_is_valid(NvmeZone *zone) st != NVME_ZONE_STATE_OFFLINE; } +static inline void nvme_aor_inc_open(NvmeNamespace *ns) +{ +assert(ns->nr_open_zones >= 0); +if (ns->params.max_open_zones) { +ns->nr_open_zones++; +assert(ns->nr_open_zones <= ns->params.max_open_zones); +} +} + +static inline void nvme_aor_dec_open(NvmeNamespace *ns) +{ +if (ns->params.max_open_zones) { +assert(ns->nr_open_zones > 0); +ns->nr_open_zones--; +} +assert(ns->nr_open_zones >= 0); +} + +static inline void nvme_aor_inc_active(NvmeNamespace *ns) +{ +assert(ns->nr_active_zones >= 0); +if (ns->params.max_active_zones) { +ns->nr_active_zones++; +assert(ns->nr_active_zones <= ns->params.max_active_zones); +} +} + +static inline void nvme_aor_dec_active(NvmeNamespace *ns) +{ +if 
(ns->params.max_active_zones) { +assert(ns->nr_active_zones > 0); +ns->nr_active_zones--; +assert(ns->nr_active_zones >= ns->nr_open_zones); +} +assert(ns->nr_active_zones >= 0); +} + int nvme_ns_setup(NvmeCtrl *n, NvmeNamespace *ns, Error **errp); void nvme_ns_drain(NvmeNamespace *ns); void nvme_ns_shutdown(NvmeNamespace *ns); diff --git a/hw/block/nvme-ns.c b/hw/block/nvme-ns.c index d79452c627cf..c55afc1920a3 100644 --- a/hw/block/nvme-ns.c +++ b/hw/block/nvme-ns.c @@ -135,6 +135,21 @@ static int nvme_ns_zoned_check_calc_geometry(NvmeNamespace *ns, Error **errp) ns->zone_size = zone_size / lbasz; ns->zone_capacity = zone_cap / lbasz; ns->num_zones = ns->size / lbasz / ns->zone_size; + +/* Do a few more sanity checks of ZNS properties */ +if (ns->params.max_open_zones > ns->num_zones) { +error_setg(errp, + "max_open_zones value %u exceeds the number of zones %u", + ns->params.max_open_zones, ns->num_zones); +return -1; +} +if (ns->params.max_active_zones > ns->num_zones) { +error_setg(errp, + "max_active_zones value %u exceeds the number of zones %u", + ns->params.max_active_zones, ns->num_zones); +return -1; +} + return 0; } @@ -182,8 +197,8 @@ static void nvme_ns_init_zoned(NvmeCtrl *n, NvmeNamespace *ns, int lba_index) id_ns_z = g_malloc0(sizeof(NvmeIdNsZoned)); /* MAR/MOR are zeroes-based, 0x means no limit */ -id_ns_z->mar = 0x; -id_ns_z->mor = 0x; +id_ns_z->mar = cpu_to_le32(ns->params.max_active_zones - 1); +id_ns_z->mor = cpu_to_le32(ns->params.max_open_zones - 1); id_ns_z->zoc = 0; id_ns_z->ozcs = ns->params.cross_zone_read ? 
0x01 : 0x00; @@ -209,6 +224,7 @@ static void nvme_clear_zone(NvmeNamespace *ns, NvmeZone *zone) trace_pci_nvme_clear_ns_close(state, zone->d.zslba); nvme_set_zone_state(zone, NVME_ZONE_STATE_CLOSED); } +nvme_aor_inc_active(ns); QTAILQ_INSERT_HEAD(&ns->closed_zones, zone, entry); } else { trace_pci_nvme_clear_ns_reset(state, zone->d.zslba); @@ -225,16 +241,23 @@ static void nvme_zoned_ns_shutdown(NvmeNamespace *ns) QTAILQ_FOREACH_SAFE(zone, &ns->closed_zones, entry, next) { QTAILQ_REMOVE(&ns->closed_zones, zone, entry); +nvme_aor_dec_active(ns); nvme_clear_zone(ns, zone); } QTAILQ_FOREACH_SAFE(zone, &ns->imp_open_zones, entry, next) { QTAILQ_REMOVE(&ns->imp_open_zones, zone, entry); +nvme_aor_dec_open(ns); +nvme_aor_dec_active(ns); n
[PULL 18/56] hw/block/nvme: Support Zone Descriptor Extensions
From: Dmitry Fomichev Zone Descriptor Extension is a label that can be assigned to a zone. It can be set to an Empty zone and it stays assigned until the zone is reset. This commit adds a new optional module property, "zoned.descr_ext_size". Its value must be a multiple of 64 bytes. If this value is non-zero, it becomes possible to assign extensions of that size to any Empty zones. The default value for this property is 0, therefore setting extensions is disabled by default. Signed-off-by: Hans Holmberg Signed-off-by: Dmitry Fomichev Reviewed-by: Niklas Cassel Reviewed-by: Keith Busch Signed-off-by: Klaus Jensen --- hw/block/nvme-ns.h| 8 +++ hw/block/nvme-ns.c| 25 ++-- hw/block/nvme.c | 53 +-- hw/block/trace-events | 2 ++ 4 files changed, 84 insertions(+), 4 deletions(-) diff --git a/hw/block/nvme-ns.h b/hw/block/nvme-ns.h index 7e1fd26909ba..f8f3c28c360b 100644 --- a/hw/block/nvme-ns.h +++ b/hw/block/nvme-ns.h @@ -35,6 +35,7 @@ typedef struct NvmeNamespaceParams { uint64_t zone_cap_bs; uint32_t max_active_zones; uint32_t max_open_zones; +uint32_t zd_extension_size; } NvmeNamespaceParams; typedef struct NvmeNamespace { @@ -56,6 +57,7 @@ typedef struct NvmeNamespace { uint64_tzone_size; uint64_tzone_capacity; uint32_tzone_size_log2; +uint8_t *zd_extensions; int32_t nr_open_zones; int32_t nr_active_zones; @@ -129,6 +131,12 @@ static inline bool nvme_wp_is_valid(NvmeZone *zone) st != NVME_ZONE_STATE_OFFLINE; } +static inline uint8_t *nvme_get_zd_extension(NvmeNamespace *ns, + uint32_t zone_idx) +{ +return &ns->zd_extensions[zone_idx * ns->params.zd_extension_size]; +} + static inline void nvme_aor_inc_open(NvmeNamespace *ns) { assert(ns->nr_open_zones >= 0); diff --git a/hw/block/nvme-ns.c b/hw/block/nvme-ns.c index c55afc1920a3..838b15c064f5 100644 --- a/hw/block/nvme-ns.c +++ b/hw/block/nvme-ns.c @@ -150,6 +150,18 @@ static int nvme_ns_zoned_check_calc_geometry(NvmeNamespace *ns, Error **errp) return -1; } +if (ns->params.zd_extension_size) { +if 
(ns->params.zd_extension_size & 0x3f) { +error_setg(errp, +"zone descriptor extension size must be a multiple of 64B"); +return -1; +} +if ((ns->params.zd_extension_size >> 6) > 0xff) { +error_setg(errp, "zone descriptor extension size is too large"); +return -1; +} +} + return 0; } @@ -161,6 +173,10 @@ static void nvme_ns_zoned_init_state(NvmeNamespace *ns) int i; ns->zone_array = g_new0(NvmeZone, ns->num_zones); +if (ns->params.zd_extension_size) { +ns->zd_extensions = g_malloc0(ns->params.zd_extension_size * + ns->num_zones); +} QTAILQ_INIT(&ns->exp_open_zones); QTAILQ_INIT(&ns->imp_open_zones); @@ -203,7 +219,8 @@ static void nvme_ns_init_zoned(NvmeCtrl *n, NvmeNamespace *ns, int lba_index) id_ns_z->ozcs = ns->params.cross_zone_read ? 0x01 : 0x00; id_ns_z->lbafe[lba_index].zsze = cpu_to_le64(ns->zone_size); -id_ns_z->lbafe[lba_index].zdes = 0; +id_ns_z->lbafe[lba_index].zdes = +ns->params.zd_extension_size >> 6; /* Units of 64B */ ns->csi = NVME_CSI_ZONED; ns->id_ns.nsze = cpu_to_le64(ns->num_zones * ns->zone_size); @@ -219,7 +236,8 @@ static void nvme_clear_zone(NvmeNamespace *ns, NvmeZone *zone) zone->w_ptr = zone->d.wp; state = nvme_get_zone_state(zone); -if (zone->d.wp != zone->d.zslba) { +if (zone->d.wp != zone->d.zslba || +(zone->d.za & NVME_ZA_ZD_EXT_VALID)) { if (state != NVME_ZONE_STATE_CLOSED) { trace_pci_nvme_clear_ns_close(state, zone->d.zslba); nvme_set_zone_state(zone, NVME_ZONE_STATE_CLOSED); @@ -315,6 +333,7 @@ void nvme_ns_cleanup(NvmeNamespace *ns) if (ns->params.zoned) { g_free(ns->id_ns_zoned); g_free(ns->zone_array); +g_free(ns->zd_extensions); } } @@ -347,6 +366,8 @@ static Property nvme_ns_props[] = { params.max_active_zones, 0), DEFINE_PROP_UINT32("zoned.max_open", NvmeNamespace, params.max_open_zones, 0), +DEFINE_PROP_UINT32("zoned.descr_ext_size", NvmeNamespace, + params.zd_extension_size, 0), DEFINE_PROP_END_OF_LIST(), }; diff --git a/hw/block/nvme.c b/hw/block/nvme.c index c07dbcd2a809..4bcc7660736b 100644 --- a/hw/block/nvme.c +++ 
b/hw/block/nvme.c @@ -1823,6 +1823,25 @@ static uint16_t nvme_offline_zone(NvmeNamespace *ns, NvmeZone *zone, } } +static uint16_t nvme_set_zd_ext(NvmeNamespace *ns, NvmeZone *zone) +{ +uint16_t status; +uint8_t state = nvme_get_zone_state(zone); + +if (state == NVME_ZONE_STATE_EMPTY) { +sta
[PULL 16/56] hw/block/nvme: Support Zoned Namespace Command Set
From: Dmitry Fomichev The emulation code has been changed to advertise NVM Command Set when "zoned" device property is not set (default) and Zoned Namespace Command Set otherwise. Define values and structures that are needed to support Zoned Namespace Command Set (NVMe TP 4053) in PCI NVMe controller emulator. Define trace events where needed in newly introduced code. In order to improve scalability, all open, closed and full zones are organized in separate linked lists. Consequently, almost all zone operations don't require scanning of the entire zone array (which potentially can be quite large) - it is only necessary to enumerate one or more zone lists. Handlers for three new NVMe commands introduced in Zoned Namespace Command Set specification are added, namely for Zone Management Receive, Zone Management Send and Zone Append. Device initialization code has been extended to create a proper configuration for zoned operation using device properties. Read/Write command handler is modified to only allow writes at the write pointer if the namespace is zoned. For Zone Append command, writes implicitly happen at the write pointer and the starting write pointer value is returned as the result of the command. Write Zeroes handler is modified to add zoned checks that are identical to those done as a part of Write flow. Subsequent commits in this series add ZDE support and checks for active and open zone limits. 
Signed-off-by: Niklas Cassel Signed-off-by: Hans Holmberg Signed-off-by: Ajay Joshi Signed-off-by: Chaitanya Kulkarni Signed-off-by: Matias Bjorling Signed-off-by: Aravind Ramesh Signed-off-by: Shin'ichiro Kawasaki Signed-off-by: Adam Manzanares Signed-off-by: Dmitry Fomichev Reviewed-by: Niklas Cassel Reviewed-by: Keith Busch Signed-off-by: Klaus Jensen --- hw/block/nvme-ns.h| 52 +++ hw/block/nvme.h | 6 + hw/block/nvme-ns.c| 166 + hw/block/nvme.c | 807 +- hw/block/trace-events | 17 + 5 files changed, 1040 insertions(+), 8 deletions(-) diff --git a/hw/block/nvme-ns.h b/hw/block/nvme-ns.h index bdbc98c2ec17..388381dda0df 100644 --- a/hw/block/nvme-ns.h +++ b/hw/block/nvme-ns.h @@ -19,9 +19,20 @@ #define NVME_NS(obj) \ OBJECT_CHECK(NvmeNamespace, (obj), TYPE_NVME_NS) +typedef struct NvmeZone { +NvmeZoneDescr d; +uint64_tw_ptr; +QTAILQ_ENTRY(NvmeZone) entry; +} NvmeZone; + typedef struct NvmeNamespaceParams { uint32_t nsid; QemuUUID uuid; + +bool zoned; +bool cross_zone_read; +uint64_t zone_size_bs; +uint64_t zone_cap_bs; } NvmeNamespaceParams; typedef struct NvmeNamespace { @@ -33,6 +44,17 @@ typedef struct NvmeNamespace { const uint32_t *iocs; uint8_t csi; +NvmeIdNsZoned *id_ns_zoned; +NvmeZone*zone_array; +QTAILQ_HEAD(, NvmeZone) exp_open_zones; +QTAILQ_HEAD(, NvmeZone) imp_open_zones; +QTAILQ_HEAD(, NvmeZone) closed_zones; +QTAILQ_HEAD(, NvmeZone) full_zones; +uint32_tnum_zones; +uint64_tzone_size; +uint64_tzone_capacity; +uint32_tzone_size_log2; + NvmeNamespaceParams params; struct { @@ -74,8 +96,38 @@ static inline size_t nvme_l2b(NvmeNamespace *ns, uint64_t lba) typedef struct NvmeCtrl NvmeCtrl; +static inline enum NvmeZoneState nvme_get_zone_state(NvmeZone *zone) +{ +return zone->d.zs >> 4; +} + +static inline void nvme_set_zone_state(NvmeZone *zone, enum NvmeZoneState state) +{ +zone->d.zs = state << 4; +} + +static inline uint64_t nvme_zone_rd_boundary(NvmeNamespace *ns, NvmeZone *zone) +{ +return zone->d.zslba + ns->zone_size; +} + +static inline uint64_t 
nvme_zone_wr_boundary(NvmeZone *zone) +{ +return zone->d.zslba + zone->d.zcap; +} + +static inline bool nvme_wp_is_valid(NvmeZone *zone) +{ +uint8_t st = nvme_get_zone_state(zone); + +return st != NVME_ZONE_STATE_FULL && + st != NVME_ZONE_STATE_READ_ONLY && + st != NVME_ZONE_STATE_OFFLINE; +} + int nvme_ns_setup(NvmeCtrl *n, NvmeNamespace *ns, Error **errp); void nvme_ns_drain(NvmeNamespace *ns); void nvme_ns_shutdown(NvmeNamespace *ns); +void nvme_ns_cleanup(NvmeNamespace *ns); #endif /* NVME_NS_H */ diff --git a/hw/block/nvme.h b/hw/block/nvme.h index 574333caa3f9..b7fbcca39d9f 100644 --- a/hw/block/nvme.h +++ b/hw/block/nvme.h @@ -6,6 +6,9 @@ #define NVME_MAX_NAMESPACES 256 +#define NVME_DEFAULT_ZONE_SIZE (128 * MiB) +#define NVME_DEFAULT_MAX_ZA_SIZE (128 * KiB) + typedef struct NvmeParams { char *serial; uint32_t num_queues; /* deprecated since 5.1 */ @@ -16,6 +19,7 @@ typedef struct NvmeParams { uint32_t aer_max_queued; uint8_t mdts; bool use_intel_id; +uint32_t zasl_bs; } NvmeParams; typedef struct NvmeAsyncEvent { @@ -149,6 +153,8 @@ typedef struct NvmeCtrl { QTAILQ_HEAD(, NvmeAsyncEvent) aer_queue; int aer_queued; +uint8_t zasl; +
[PULL 26/56] hw/block/nvme: add missing string representations for commands
From: Klaus Jensen Add missing string representations for a couple of new commands. Signed-off-by: Klaus Jensen Tested-by: Dmitry Fomichev Reviewed-by: Dmitry Fomichev --- hw/block/nvme.h | 4 1 file changed, 4 insertions(+) diff --git a/hw/block/nvme.h b/hw/block/nvme.h index b7fbcca39d9f..65540b650e1d 100644 --- a/hw/block/nvme.h +++ b/hw/block/nvme.h @@ -64,8 +64,12 @@ static inline const char *nvme_io_opc_str(uint8_t opc) case NVME_CMD_FLUSH:return "NVME_NVM_CMD_FLUSH"; case NVME_CMD_WRITE:return "NVME_NVM_CMD_WRITE"; case NVME_CMD_READ: return "NVME_NVM_CMD_READ"; +case NVME_CMD_COMPARE: return "NVME_NVM_CMD_COMPARE"; case NVME_CMD_WRITE_ZEROES: return "NVME_NVM_CMD_WRITE_ZEROES"; case NVME_CMD_DSM: return "NVME_NVM_CMD_DSM"; +case NVME_CMD_ZONE_MGMT_SEND: return "NVME_ZONED_CMD_MGMT_SEND"; +case NVME_CMD_ZONE_MGMT_RECV: return "NVME_ZONED_CMD_MGMT_RECV"; +case NVME_CMD_ZONE_APPEND: return "NVME_ZONED_CMD_ZONE_APPEND"; default:return "NVME_NVM_CMD_UNKNOWN"; } } -- 2.30.0
[PULL 15/56] nvme: Make ZNS-related definitions
From: Dmitry Fomichev Define values and structures that are needed to support Zoned Namespace Command Set (NVMe TP 4053). Signed-off-by: Dmitry Fomichev Acked-by: Stefan Hajnoczi Reviewed-by: Keith Busch Signed-off-by: Klaus Jensen --- include/block/nvme.h | 114 ++- 1 file changed, 113 insertions(+), 1 deletion(-) diff --git a/include/block/nvme.h b/include/block/nvme.h index adb5806365a3..9494246f1f59 100644 --- a/include/block/nvme.h +++ b/include/block/nvme.h @@ -489,6 +489,9 @@ enum NvmeIoCommands { NVME_CMD_COMPARE= 0x05, NVME_CMD_WRITE_ZEROES = 0x08, NVME_CMD_DSM= 0x09, +NVME_CMD_ZONE_MGMT_SEND = 0x79, +NVME_CMD_ZONE_MGMT_RECV = 0x7a, +NVME_CMD_ZONE_APPEND= 0x7d, }; typedef struct QEMU_PACKED NvmeDeleteQ { @@ -654,9 +657,13 @@ typedef struct QEMU_PACKED NvmeAerResult { uint8_t resv; } NvmeAerResult; +typedef struct QEMU_PACKED NvmeZonedResult { +uint64_t slba; +} NvmeZonedResult; + typedef struct QEMU_PACKED NvmeCqe { uint32_tresult; -uint32_trsvd; +uint32_tdw1; uint16_tsq_head; uint16_tsq_id; uint16_tcid; @@ -685,6 +692,7 @@ enum NvmeStatusCodes { NVME_INVALID_USE_OF_CMB = 0x0012, NVME_INVALID_PRP_OFFSET = 0x0013, NVME_CMD_SET_CMB_REJECTED = 0x002b, +NVME_INVALID_CMD_SET= 0x002c, NVME_LBA_RANGE = 0x0080, NVME_CAP_EXCEEDED = 0x0081, NVME_NS_NOT_READY = 0x0082, @@ -709,6 +717,14 @@ enum NvmeStatusCodes { NVME_CONFLICTING_ATTRS = 0x0180, NVME_INVALID_PROT_INFO = 0x0181, NVME_WRITE_TO_RO= 0x0182, +NVME_ZONE_BOUNDARY_ERROR= 0x01b8, +NVME_ZONE_FULL = 0x01b9, +NVME_ZONE_READ_ONLY = 0x01ba, +NVME_ZONE_OFFLINE = 0x01bb, +NVME_ZONE_INVALID_WRITE = 0x01bc, +NVME_ZONE_TOO_MANY_ACTIVE = 0x01bd, +NVME_ZONE_TOO_MANY_OPEN = 0x01be, +NVME_ZONE_INVAL_TRANSITION = 0x01bf, NVME_WRITE_FAULT= 0x0280, NVME_UNRECOVERED_READ = 0x0281, NVME_E2E_GUARD_ERROR= 0x0282, @@ -894,6 +910,11 @@ typedef struct QEMU_PACKED NvmeIdCtrl { uint8_t vs[1024]; } NvmeIdCtrl; +typedef struct NvmeIdCtrlZoned { +uint8_t zasl; +uint8_t rsvd1[4095]; +} NvmeIdCtrlZoned; + enum NvmeIdCtrlOacs { 
NVME_OACS_SECURITY = 1 << 0, NVME_OACS_FORMAT= 1 << 1, @@ -1022,6 +1043,12 @@ typedef struct QEMU_PACKED NvmeLBAF { uint8_t rp; } NvmeLBAF; +typedef struct QEMU_PACKED NvmeLBAFE { +uint64_tzsze; +uint8_t zdes; +uint8_t rsvd9[7]; +} NvmeLBAFE; + #define NVME_NSID_BROADCAST 0x typedef struct QEMU_PACKED NvmeIdNs { @@ -1081,10 +1108,24 @@ enum NvmeNsIdentifierType { enum NvmeCsi { NVME_CSI_NVM= 0x00, +NVME_CSI_ZONED = 0x02, }; #define NVME_SET_CSI(vec, csi) (vec |= (uint8_t)(1 << (csi))) +typedef struct QEMU_PACKED NvmeIdNsZoned { +uint16_tzoc; +uint16_tozcs; +uint32_tmar; +uint32_tmor; +uint32_trrl; +uint32_tfrl; +uint8_t rsvd20[2796]; +NvmeLBAFE lbafe[16]; +uint8_t rsvd3072[768]; +uint8_t vs[256]; +} NvmeIdNsZoned; + /*Deallocate Logical Block Features*/ #define NVME_ID_NS_DLFEAT_GUARD_CRC(dlfeat) ((dlfeat) & 0x10) #define NVME_ID_NS_DLFEAT_WRITE_ZEROES(dlfeat)((dlfeat) & 0x08) @@ -1117,10 +1158,76 @@ enum NvmeIdNsDps { DPS_FIRST_EIGHT = 8, }; +enum NvmeZoneAttr { +NVME_ZA_FINISHED_BY_CTLR = 1 << 0, +NVME_ZA_FINISH_RECOMMENDED = 1 << 1, +NVME_ZA_RESET_RECOMMENDED= 1 << 2, +NVME_ZA_ZD_EXT_VALID = 1 << 7, +}; + +typedef struct QEMU_PACKED NvmeZoneReportHeader { +uint64_tnr_zones; +uint8_t rsvd[56]; +} NvmeZoneReportHeader; + +enum NvmeZoneReceiveAction { +NVME_ZONE_REPORT = 0, +NVME_ZONE_REPORT_EXTENDED= 1, +}; + +enum NvmeZoneReportType { +NVME_ZONE_REPORT_ALL = 0, +NVME_ZONE_REPORT_EMPTY = 1, +NVME_ZONE_REPORT_IMPLICITLY_OPEN = 2, +NVME_ZONE_REPORT_EXPLICITLY_OPEN = 3, +NVME_ZONE_REPORT_CLOSED = 4, +NVME_ZONE_REPORT_FULL= 5, +NVME_ZONE_REPORT_READ_ONLY = 6, +NVME_ZONE_REPORT_OFFLINE = 7, +}; + +enum NvmeZoneType { +NVME_ZONE_TYPE_RESERVED = 0x00, +NVME_ZONE_TYPE_SEQ_WRITE = 0x02, +}; + +enum NvmeZoneSendAction { +NVME_ZONE_ACTION_RSD = 0x00, +NVME_ZONE_ACTION_CLOSE = 0x01, +NVME_ZONE_ACTION_FINISH = 0x02, +NVME_ZONE_ACTION_OPEN= 0x03, +NVME_ZONE_ACTION_RESET = 0x04, +NVME_ZONE_ACTION_OFFLINE = 0x05, +NVME_ZONE_ACTION_SET_ZD_EXT = 0x10, +}; + +typedef struct 
QEMU_PACKED NvmeZoneDescr { +uint8_t
[PULL 13/56] hw/block/nvme: Add support for Namespace Types
From: Niklas Cassel Define the structures and constants required to implement Namespace Types support. Namespace Types introduce a new command set, "I/O Command Sets", that allows the host to retrieve the command sets associated with a namespace. Introduce support for the command set and enable detection for the NVM Command Set. The new workflows for identify commands rely heavily on zero-filled identify structs. E.g., certain CNS commands are defined to return a zero-filled identify struct when an inactive namespace NSID is supplied. Add a helper function in order to avoid code duplication when reporting zero-filled identify structures. Signed-off-by: Niklas Cassel Signed-off-by: Dmitry Fomichev Reviewed-by: Keith Busch Signed-off-by: Klaus Jensen --- hw/block/nvme-ns.h| 1 + include/block/nvme.h | 64 ++ hw/block/nvme-ns.c| 2 + hw/block/nvme.c | 188 +++--- hw/block/trace-events | 6 ++ 5 files changed, 217 insertions(+), 44 deletions(-) diff --git a/hw/block/nvme-ns.h b/hw/block/nvme-ns.h index bdeaf1c0de84..bdbc98c2ec17 100644 --- a/hw/block/nvme-ns.h +++ b/hw/block/nvme-ns.h @@ -31,6 +31,7 @@ typedef struct NvmeNamespace { int64_t size; NvmeIdNs id_ns; const uint32_t *iocs; +uint8_t csi; NvmeNamespaceParams params; diff --git a/include/block/nvme.h b/include/block/nvme.h index 397f7ca3b5cb..19347cf69e52 100644 --- a/include/block/nvme.h +++ b/include/block/nvme.h @@ -84,6 +84,7 @@ enum NvmeCapMask { enum NvmeCapCss { NVME_CAP_CSS_NVM= 1 << 0, +NVME_CAP_CSS_CSI_SUPP = 1 << 6, NVME_CAP_CSS_ADMIN_ONLY = 1 << 7, }; @@ -117,9 +118,25 @@ enum NvmeCcMask { enum NvmeCcCss { NVME_CC_CSS_NVM= 0x0, +NVME_CC_CSS_CSI= 0x6, NVME_CC_CSS_ADMIN_ONLY = 0x7, }; +#define NVME_SET_CC_EN(cc, val) \ +(cc |= (uint32_t)((val) & CC_EN_MASK) << CC_EN_SHIFT) +#define NVME_SET_CC_CSS(cc, val)\ +(cc |= (uint32_t)((val) & CC_CSS_MASK) << CC_CSS_SHIFT) +#define NVME_SET_CC_MPS(cc, val)\ +(cc |= (uint32_t)((val) & CC_MPS_MASK) << CC_MPS_SHIFT) +#define NVME_SET_CC_AMS(cc, val)\ +(cc |= 
(uint32_t)((val) & CC_AMS_MASK) << CC_AMS_SHIFT) +#define NVME_SET_CC_SHN(cc, val)\ +(cc |= (uint32_t)((val) & CC_SHN_MASK) << CC_SHN_SHIFT) +#define NVME_SET_CC_IOSQES(cc, val) \ +(cc |= (uint32_t)((val) & CC_IOSQES_MASK) << CC_IOSQES_SHIFT) +#define NVME_SET_CC_IOCQES(cc, val) \ +(cc |= (uint32_t)((val) & CC_IOCQES_MASK) << CC_IOCQES_SHIFT) + enum NvmeCstsShift { CSTS_RDY_SHIFT = 0, CSTS_CFS_SHIFT = 1, @@ -540,8 +557,13 @@ typedef struct QEMU_PACKED NvmeIdentify { uint64_trsvd2[2]; uint64_tprp1; uint64_tprp2; -uint32_tcns; -uint32_trsvd11[5]; +uint8_t cns; +uint8_t rsvd10; +uint16_tctrlid; +uint16_tnvmsetid; +uint8_t rsvd11; +uint8_t csi; +uint32_trsvd12[4]; } NvmeIdentify; typedef struct QEMU_PACKED NvmeRwCmd { @@ -662,6 +684,7 @@ enum NvmeStatusCodes { NVME_SGL_DESCR_TYPE_INVALID = 0x0011, NVME_INVALID_USE_OF_CMB = 0x0012, NVME_INVALID_PRP_OFFSET = 0x0013, +NVME_CMD_SET_CMB_REJECTED = 0x002b, NVME_LBA_RANGE = 0x0080, NVME_CAP_EXCEEDED = 0x0081, NVME_NS_NOT_READY = 0x0082, @@ -789,11 +812,15 @@ typedef struct QEMU_PACKED NvmePSD { #define NVME_IDENTIFY_DATA_SIZE 4096 -enum { -NVME_ID_CNS_NS = 0x0, -NVME_ID_CNS_CTRL = 0x1, -NVME_ID_CNS_NS_ACTIVE_LIST = 0x2, -NVME_ID_CNS_NS_DESCR_LIST = 0x3, +enum NvmeIdCns { +NVME_ID_CNS_NS= 0x00, +NVME_ID_CNS_CTRL = 0x01, +NVME_ID_CNS_NS_ACTIVE_LIST= 0x02, +NVME_ID_CNS_NS_DESCR_LIST = 0x03, +NVME_ID_CNS_CS_NS = 0x05, +NVME_ID_CNS_CS_CTRL = 0x06, +NVME_ID_CNS_CS_NS_ACTIVE_LIST = 0x07, +NVME_ID_CNS_IO_COMMAND_SET= 0x1c, }; typedef struct QEMU_PACKED NvmeIdCtrl { @@ -944,6 +971,7 @@ enum NvmeFeatureIds { NVME_WRITE_ATOMICITY= 0xa, NVME_ASYNCHRONOUS_EVENT_CONF= 0xb, NVME_TIMESTAMP = 0xe, +NVME_COMMAND_SET_PROFILE= 0x19, NVME_SOFTWARE_PROGRESS_MARKER = 0x80, NVME_FID_MAX= 0x100, }; @@ -1033,18 +1061,26 @@ typedef struct QEMU_PACKED NvmeIdNsDescr { uint8_t rsvd2[2]; } NvmeIdNsDescr; -enum { -NVME_NIDT_EUI64_LEN = 8, -NVME_NIDT_NGUID_LEN = 16, -NVME_NIDT_UUID_LEN = 16, +enum NvmeNsIdentifierLength { +NVME_NIDL_EUI64 = 8, 
+NVME_NIDL_NGUID = 16, +NVME_NIDL_UUID = 16, +NVME_NIDL_CSI = 1, }; enum NvmeNsIdentifierType { -NVME_NIDT_EUI64 = 0x1, -NVME_NIDT_NGUID = 0x2, -NVME_NIDT_UUID = 0x3, +NVME_NIDT_EUI64 = 0x01, +NVME_NIDT_NGUID
[PULL 12/56] hw/block/nvme: Add Commands Supported and Effects log
From: Dmitry Fomichev This log page needs to be implemented to allow checking for Zone Append command support in the Zoned Namespace Command Set. This commit adds the code to report this log page for the NVM Command Set only. The parts that are specific to zoned operation will be added later in the series. All incoming admin and I/O commands are now only processed if their corresponding support bits are set in this log. This provides an easy way to control which commands are supported and which are not, depending on the selected CC.CSS. Signed-off-by: Dmitry Fomichev Reviewed-by: Niklas Cassel Reviewed-by: Keith Busch Signed-off-by: Klaus Jensen --- hw/block/nvme-ns.h| 1 + include/block/nvme.h | 19 hw/block/nvme.c | 102 ++ hw/block/trace-events | 1 + 4 files changed, 114 insertions(+), 9 deletions(-) diff --git a/hw/block/nvme-ns.h b/hw/block/nvme-ns.h index aeca810fc7a8..bdeaf1c0de84 100644 --- a/hw/block/nvme-ns.h +++ b/hw/block/nvme-ns.h @@ -30,6 +30,7 @@ typedef struct NvmeNamespace { int32_t bootindex; int64_t size; NvmeIdNs id_ns; +const uint32_t *iocs; NvmeNamespaceParams params; diff --git a/include/block/nvme.h b/include/block/nvme.h index 11ac1c2b7dfb..397f7ca3b5cb 100644 --- a/include/block/nvme.h +++ b/include/block/nvme.h @@ -752,10 +752,27 @@ enum NvmeSmartWarn { NVME_SMART_FAILED_VOLATILE_MEDIA = 1 << 4, }; +typedef struct NvmeEffectsLog { +uint32_tacs[256]; +uint32_tiocs[256]; +uint8_t resv[2048]; +} NvmeEffectsLog; + +enum { +NVME_CMD_EFF_CSUPP = 1 << 0, +NVME_CMD_EFF_LBCC = 1 << 1, +NVME_CMD_EFF_NCC= 1 << 2, +NVME_CMD_EFF_NIC= 1 << 3, +NVME_CMD_EFF_CCC= 1 << 4, +NVME_CMD_EFF_CSE_MASK = 3 << 16, +NVME_CMD_EFF_UUID_SEL = 1 << 19, +}; + enum NvmeLogIdentifier { NVME_LOG_ERROR_INFO = 0x01, NVME_LOG_SMART_INFO = 0x02, NVME_LOG_FW_SLOT_INFO = 0x03, +NVME_LOG_CMD_EFFECTS= 0x05, }; typedef struct QEMU_PACKED NvmePSD { @@ -868,6 +885,7 @@ enum NvmeIdCtrlFrmw { enum NvmeIdCtrlLpa { NVME_LPA_NS_SMART = 1 << 0, +NVME_LPA_CSE = 1 << 1, NVME_LPA_EXTENDED = 1 << 2, }; @@ -1076,6 
+1094,7 @@ static inline void _nvme_check_size(void) QEMU_BUILD_BUG_ON(sizeof(NvmeErrorLog) != 64); QEMU_BUILD_BUG_ON(sizeof(NvmeFwSlotInfoLog) != 512); QEMU_BUILD_BUG_ON(sizeof(NvmeSmartLog) != 512); +QEMU_BUILD_BUG_ON(sizeof(NvmeEffectsLog) != 4096); QEMU_BUILD_BUG_ON(sizeof(NvmeIdCtrl) != 4096); QEMU_BUILD_BUG_ON(sizeof(NvmeIdNs) != 4096); QEMU_BUILD_BUG_ON(sizeof(NvmeSglDescriptor) != 16); diff --git a/hw/block/nvme.c b/hw/block/nvme.c index 4d1ca8c466c5..05e799623c41 100644 --- a/hw/block/nvme.c +++ b/hw/block/nvme.c @@ -112,6 +112,30 @@ static const uint32_t nvme_feature_cap[NVME_FID_MAX] = { [NVME_TIMESTAMP]= NVME_FEAT_CAP_CHANGE, }; +static const uint32_t nvme_cse_acs[256] = { +[NVME_ADM_CMD_DELETE_SQ]= NVME_CMD_EFF_CSUPP, +[NVME_ADM_CMD_CREATE_SQ]= NVME_CMD_EFF_CSUPP, +[NVME_ADM_CMD_GET_LOG_PAGE] = NVME_CMD_EFF_CSUPP, +[NVME_ADM_CMD_DELETE_CQ]= NVME_CMD_EFF_CSUPP, +[NVME_ADM_CMD_CREATE_CQ]= NVME_CMD_EFF_CSUPP, +[NVME_ADM_CMD_IDENTIFY] = NVME_CMD_EFF_CSUPP, +[NVME_ADM_CMD_ABORT]= NVME_CMD_EFF_CSUPP, +[NVME_ADM_CMD_SET_FEATURES] = NVME_CMD_EFF_CSUPP, +[NVME_ADM_CMD_GET_FEATURES] = NVME_CMD_EFF_CSUPP, +[NVME_ADM_CMD_ASYNC_EV_REQ] = NVME_CMD_EFF_CSUPP, +}; + +static const uint32_t nvme_cse_iocs_none[256]; + +static const uint32_t nvme_cse_iocs_nvm[256] = { +[NVME_CMD_FLUSH]= NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC, +[NVME_CMD_WRITE_ZEROES] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC, +[NVME_CMD_WRITE]= NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC, +[NVME_CMD_READ] = NVME_CMD_EFF_CSUPP, +[NVME_CMD_DSM] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC, +[NVME_CMD_COMPARE] = NVME_CMD_EFF_CSUPP, +}; + static void nvme_process_sq(void *opaque); static uint16_t nvme_cid(NvmeRequest *req) @@ -1306,10 +1330,6 @@ static uint16_t nvme_io_cmd(NvmeCtrl *n, NvmeRequest *req) trace_pci_nvme_io_cmd(nvme_cid(req), nsid, nvme_sqid(req), req->cmd.opcode, nvme_io_opc_str(req->cmd.opcode)); -if (NVME_CC_CSS(n->bar.cc) == NVME_CC_CSS_ADMIN_ONLY) { -return NVME_INVALID_OPCODE | NVME_DNR; -} - if 
(!nvme_nsid_valid(n, nsid)) { return NVME_INVALID_NSID | NVME_DNR; } @@ -1319,6 +1339,11 @@ static uint16_t nvme_io_cmd(NvmeCtrl *n, NvmeRequest *req) return NVME_INVALID_FIELD | NVME_DNR; } +if (!(req->ns->iocs[req->cmd.opcode] & NVME_CMD_EFF_CSUPP)) { +trace_pci_nvme_err_invalid_opc(req->cmd.opcode); +return NVME_INVALID_OPCODE